{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999969938373666, "eval_steps": 500, "global_step": 16632, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.04940867, "auxiliary_loss_mlp": 0.02022554, "balance_loss_clip": 2.42054653, "balance_loss_mlp": 1.60770607, "epoch": 6.012325266796934e-05, "flos": 24456507091200.0, "grad_norm": 431.42742291568476, "language_loss": 2.93780541, "learning_rate": 0.0, "loss": 1.9937849, "num_input_tokens_seen": 19155, "router_z_loss_clip": 25.140625, "router_z_loss_mlp": 4.15039062, "step": 1, "time_per_iteration": 18.08106780052185 }, { "auxiliary_loss_clip": 0.03306889, "auxiliary_loss_mlp": 0.01334092, "balance_loss_clip": 1.61681187, "balance_loss_mlp": 1.08403873, "epoch": 0.00012024650533593868, "flos": 20225931246720.0, "grad_norm": 56.60211952861628, "language_loss": 1.85136652, "learning_rate": 4.4628432569317594e-07, "loss": 1.89777637, "num_input_tokens_seen": 36175, "router_z_loss_clip": 16.875, "router_z_loss_mlp": 2.49804688, "step": 2, "time_per_iteration": 2.448704242706299 }, { "auxiliary_loss_clip": 0.03262712, "auxiliary_loss_mlp": 0.01333688, "balance_loss_clip": 1.61833572, "balance_loss_mlp": 1.08916557, "epoch": 0.000180369758003908, "flos": 22309935454080.0, "grad_norm": 231.8354364196141, "language_loss": 1.62016344, "learning_rate": 7.073439208833112e-07, "loss": 1.66612744, "num_input_tokens_seen": 54870, "router_z_loss_clip": 16.453125, "router_z_loss_mlp": 2.44335938, "step": 3, "time_per_iteration": 2.451188325881958 }, { "auxiliary_loss_clip": 0.03285281, "auxiliary_loss_mlp": 0.01353865, "balance_loss_clip": 1.61045289, "balance_loss_mlp": 1.07462955, "epoch": 0.00024049301067187735, "flos": 22414650577920.0, "grad_norm": 75.43225617412423, "language_loss": 1.71785331, "learning_rate": 8.925686513863519e-07, "loss": 1.76424468, "num_input_tokens_seen": 74575, "router_z_loss_clip": 16.75, "router_z_loss_mlp": 2.79296875, "step": 4, "time_per_iteration": 2.549875259399414 }, { "auxiliary_loss_clip": 0.03328519, "auxiliary_loss_mlp": 0.01362364, "balance_loss_clip": 1.61330354, "balance_loss_mlp": 1.09514475, "epoch": 0.0003006162633398467, "flos": 21396978449280.0, "grad_norm": 63.29544907072378, "language_loss": 1.95051217, "learning_rate": 1.0362401141348472e-06, "loss": 1.99742103, "num_input_tokens_seen": 92580, "router_z_loss_clip": 17.15625, "router_z_loss_mlp": 2.66992188, "step": 5, "time_per_iteration": 2.734335422515869 }, { "auxiliary_loss_clip": 0.03290469, "auxiliary_loss_mlp": 0.01366025, "balance_loss_clip": 1.60763097, "balance_loss_mlp": 1.09689856, "epoch": 0.000360739516007816, "flos": 21652375127040.0, "grad_norm": 123.76547633955164, "language_loss": 1.64787006, "learning_rate": 1.153628246576487e-06, "loss": 1.694435, "num_input_tokens_seen": 109705, "router_z_loss_clip": 16.84375, "router_z_loss_mlp": 2.69335938, "step": 6, "time_per_iteration": 2.7639269828796387 }, { "auxiliary_loss_clip": 0.0325183, "auxiliary_loss_mlp": 0.0132199, "balance_loss_clip": 1.60599482, "balance_loss_mlp": 1.06659627, "epoch": 0.0004208627686757854, "flos": 27159742897920.0, "grad_norm": 263.0274420605537, "language_loss": 1.55358648, "learning_rate": 1.2528784983718962e-06, "loss": 1.5993247, "num_input_tokens_seen": 129425, "router_z_loss_clip": 16.46875, "router_z_loss_mlp": 2.55273438, "step": 7, "time_per_iteration": 2.7299726009368896 }, { "auxiliary_loss_clip": 0.03272678, "auxiliary_loss_mlp": 0.01337296, "balance_loss_clip": 1.60518384, "balance_loss_mlp": 1.07923198, "epoch": 0.0004809860213437547, "flos": 31319096135040.0, "grad_norm": 151.48753424455907, "language_loss": 1.77746153, "learning_rate": 1.338852977079528e-06, "loss": 1.82356131, "num_input_tokens_seen": 149210, "router_z_loss_clip": 16.6875, "router_z_loss_mlp": 2.58398438, "step": 8, "time_per_iteration": 2.8052265644073486 }, { "auxiliary_loss_clip": 0.03283289, "auxiliary_loss_mlp": 0.01347131, "balance_loss_clip": 1.6042273, "balance_loss_mlp": 1.08372593, "epoch": 0.000541109274011724, "flos": 32160411463680.0, "grad_norm": 548.4434083033489, "language_loss": 1.51649261, "learning_rate": 1.4146878417666224e-06, "loss": 1.56279671, "num_input_tokens_seen": 169055, "router_z_loss_clip": 16.796875, "router_z_loss_mlp": 2.63476562, "step": 9, "time_per_iteration": 2.7609801292419434 }, { "auxiliary_loss_clip": 0.03260777, "auxiliary_loss_mlp": 0.01341184, "balance_loss_clip": 1.60495639, "balance_loss_mlp": 1.08578992, "epoch": 0.0006012325266796934, "flos": 18916808163840.0, "grad_norm": 54.602955404329194, "language_loss": 1.49180126, "learning_rate": 1.4825244398280232e-06, "loss": 1.53782082, "num_input_tokens_seen": 188045, "router_z_loss_clip": 16.546875, "router_z_loss_mlp": 2.5546875, "step": 10, "time_per_iteration": 2.715550422668457 }, { "auxiliary_loss_clip": 0.03292792, "auxiliary_loss_mlp": 0.01373374, "balance_loss_clip": 1.61044312, "balance_loss_mlp": 1.10768056, "epoch": 0.0006613557793476627, "flos": 20774861867520.0, "grad_norm": 97.40042078981905, "language_loss": 1.4816246, "learning_rate": 1.5438901072051983e-06, "loss": 1.52828634, "num_input_tokens_seen": 207035, "router_z_loss_clip": 16.828125, "router_z_loss_mlp": 2.65820312, "step": 11, "time_per_iteration": 2.695802927017212 }, { "auxiliary_loss_clip": 0.03240874, "auxiliary_loss_mlp": 0.01329048, "balance_loss_clip": 1.60257936, "balance_loss_mlp": 1.07441664, "epoch": 0.000721479032015632, "flos": 16581680997120.0, "grad_norm": 33.74826511093444, "language_loss": 1.46082878, "learning_rate": 1.5999125722696629e-06, "loss": 1.50652802, "num_input_tokens_seen": 223225, "router_z_loss_clip": 16.390625, "router_z_loss_mlp": 2.54492188, "step": 12, "time_per_iteration": 2.7831013202667236 }, { "auxiliary_loss_clip": 0.03259551, "auxiliary_loss_mlp": 0.01320259, "balance_loss_clip": 1.60987043, "balance_loss_mlp": 1.07802606, "epoch": 0.0007816022846836014, "flos": 23805471144960.0, "grad_norm": 52.88327014025935, "language_loss": 1.32444811, "learning_rate": 1.6514482443788434e-06, "loss": 1.37024617, "num_input_tokens_seen": 242570, "router_z_loss_clip": 16.484375, "router_z_loss_mlp": 2.421875, "step": 13, "time_per_iteration": 2.719764471054077 }, { "auxiliary_loss_clip": 0.03254492, "auxiliary_loss_mlp": 0.01332719, "balance_loss_clip": 1.60936332, "balance_loss_mlp": 1.07217491, "epoch": 0.0008417255373515708, "flos": 19172204841600.0, "grad_norm": 1457.4443877763251, "language_loss": 1.37907791, "learning_rate": 1.6991628240650723e-06, "loss": 1.42495012, "num_input_tokens_seen": 261215, "router_z_loss_clip": 16.4765625, "router_z_loss_mlp": 2.60546875, "step": 14, "time_per_iteration": 2.833561420440674 }, { "auxiliary_loss_clip": 0.03223786, "auxiliary_loss_mlp": 0.0133393, "balance_loss_clip": 1.60960174, "balance_loss_mlp": 1.07872701, "epoch": 0.00090184879001954, "flos": 26395564026240.0, "grad_norm": 1348.3446942659677, "language_loss": 1.16258526, "learning_rate": 1.7435840350181584e-06, "loss": 1.20816243, "num_input_tokens_seen": 280035, "router_z_loss_clip": 16.1484375, "router_z_loss_mlp": 2.55078125, "step": 15, "time_per_iteration": 2.720165729522705 }, { "auxiliary_loss_clip": 0.03210462, "auxiliary_loss_mlp": 0.01316266, "balance_loss_clip": 1.59970784, "balance_loss_mlp": 1.07193434, "epoch": 0.0009619720426875094, "flos": 24679500785280.0, "grad_norm": 22.782794260985284, "language_loss": 1.13624287, "learning_rate": 1.7851373027727038e-06, "loss": 1.18151021, "num_input_tokens_seen": 300265, "router_z_loss_clip": 16.109375, "router_z_loss_mlp": 2.44140625, "step": 16, "time_per_iteration": 2.729309558868408 }, { "auxiliary_loss_clip": 0.03191205, "auxiliary_loss_mlp": 0.01315719, "balance_loss_clip": 1.60227561, "balance_loss_mlp": 1.06967103, "epoch": 0.0010220952953554788, "flos": 18624531196800.0, "grad_norm": 920.2190802851648, "language_loss": 1.14784718, "learning_rate": 1.8241705979033208e-06, "loss": 1.19291639, "num_input_tokens_seen": 317375, "router_z_loss_clip": 15.8828125, "router_z_loss_mlp": 2.45898438, "step": 17, "time_per_iteration": 2.7290642261505127 }, { "auxiliary_loss_clip": 0.03138035, "auxiliary_loss_mlp": 0.01302877, "balance_loss_clip": 1.59850764, "balance_loss_mlp": 1.06731999, "epoch": 0.001082218548023448, "flos": 26142537646080.0, "grad_norm": 20.61944900501725, "language_loss": 1.11251807, "learning_rate": 1.860972167459798e-06, "loss": 1.15692711, "num_input_tokens_seen": 337975, "router_z_loss_clip": 15.3828125, "router_z_loss_mlp": 2.35546875, "step": 18, "time_per_iteration": 2.727212905883789 }, { "auxiliary_loss_clip": 0.03155722, "auxiliary_loss_mlp": 0.01321716, "balance_loss_clip": 1.60159111, "balance_loss_mlp": 1.06193519, "epoch": 0.0011423418006914173, "flos": 19609776322560.0, "grad_norm": 110.53388480760488, "language_loss": 1.05804229, "learning_rate": 1.89578346593066e-06, "loss": 1.10281658, "num_input_tokens_seen": 356635, "router_z_loss_clip": 15.5390625, "router_z_loss_mlp": 2.59765625, "step": 19, "time_per_iteration": 4.157699108123779 }, { "auxiliary_loss_clip": 0.03110102, "auxiliary_loss_mlp": 0.01308076, "balance_loss_clip": 1.59889913, "balance_loss_mlp": 1.07595205, "epoch": 0.0012024650533593868, "flos": 17895365107200.0, "grad_norm": 265.8383866319236, "language_loss": 1.21631849, "learning_rate": 1.928808765521199e-06, "loss": 1.26050031, "num_input_tokens_seen": 375625, "router_z_loss_clip": 15.109375, "router_z_loss_mlp": 2.32226562, "step": 20, "time_per_iteration": 4.193562030792236 }, { "auxiliary_loss_clip": 0.03097398, "auxiliary_loss_mlp": 0.01302734, "balance_loss_clip": 1.5930295, "balance_loss_mlp": 1.05668616, "epoch": 0.001262588306027356, "flos": 21252043071360.0, "grad_norm": 1041.224967491002, "language_loss": 1.11708641, "learning_rate": 1.9602224192552076e-06, "loss": 1.16108775, "num_input_tokens_seen": 394350, "router_z_loss_clip": 15.046875, "router_z_loss_mlp": 2.4609375, "step": 21, "time_per_iteration": 2.689336061477661 }, { "auxiliary_loss_clip": 0.03089181, "auxiliary_loss_mlp": 0.01326468, "balance_loss_clip": 1.58431077, "balance_loss_mlp": 1.07984781, "epoch": 0.0013227115586953253, "flos": 26104077158400.0, "grad_norm": 62.80105985588221, "language_loss": 1.1550374, "learning_rate": 1.9901744328983746e-06, "loss": 1.19919395, "num_input_tokens_seen": 413255, "router_z_loss_clip": 15.046875, "router_z_loss_mlp": 2.46289062, "step": 22, "time_per_iteration": 2.823751926422119 }, { "auxiliary_loss_clip": 0.03085493, "auxiliary_loss_mlp": 0.01307047, "balance_loss_clip": 1.58930326, "balance_loss_mlp": 1.07168055, "epoch": 0.0013828348113632948, "flos": 23951376190080.0, "grad_norm": 34.61602067786599, "language_loss": 0.9921748, "learning_rate": 2.018794797290208e-06, "loss": 1.03610015, "num_input_tokens_seen": 433065, "router_z_loss_clip": 14.9765625, "router_z_loss_mlp": 2.3515625, "step": 23, "time_per_iteration": 2.759662628173828 }, { "auxiliary_loss_clip": 0.03078863, "auxiliary_loss_mlp": 0.0128745, "balance_loss_clip": 1.586483, "balance_loss_mlp": 1.04807782, "epoch": 0.001442958064031264, "flos": 15959851724160.0, "grad_norm": 28.24948465353262, "language_loss": 1.16537142, "learning_rate": 2.046196897962839e-06, "loss": 1.20903444, "num_input_tokens_seen": 451175, "router_z_loss_clip": 14.921875, "router_z_loss_mlp": 2.39257812, "step": 24, "time_per_iteration": 2.8151495456695557 }, { "auxiliary_loss_clip": 0.02970239, "auxiliary_loss_mlp": 0.01298334, "balance_loss_clip": 1.57416224, "balance_loss_mlp": 1.06144118, "epoch": 0.0015030813166992333, "flos": 18108350801280.0, "grad_norm": 2130.6784355704317, "language_loss": 1.13982749, "learning_rate": 2.0724802282696944e-06, "loss": 1.18251324, "num_input_tokens_seen": 468775, "router_z_loss_clip": 13.96875, "router_z_loss_mlp": 2.36914062, "step": 25, "time_per_iteration": 2.794750213623047 }, { "auxiliary_loss_clip": 0.0297638, "auxiliary_loss_mlp": 0.0129326, "balance_loss_clip": 1.57925808, "balance_loss_mlp": 1.05064487, "epoch": 0.0015632045693672028, "flos": 22234558763520.0, "grad_norm": 90.52587352155847, "language_loss": 1.13118839, "learning_rate": 2.0977325700720194e-06, "loss": 1.17388487, "num_input_tokens_seen": 488530, "router_z_loss_clip": 13.984375, "router_z_loss_mlp": 2.42773438, "step": 26, "time_per_iteration": 2.72493314743042 }, { "auxiliary_loss_clip": 0.0295081, "auxiliary_loss_mlp": 0.01281096, "balance_loss_clip": 1.57585168, "balance_loss_mlp": 1.04305911, "epoch": 0.001623327822035172, "flos": 23991955580160.0, "grad_norm": 23.692710902198804, "language_loss": 0.99895966, "learning_rate": 2.122031762649933e-06, "loss": 1.0412786, "num_input_tokens_seen": 510495, "router_z_loss_clip": 13.7421875, "router_z_loss_mlp": 2.37695312, "step": 27, "time_per_iteration": 2.828873634338379 }, { "auxiliary_loss_clip": 0.02924742, "auxiliary_loss_mlp": 0.01286247, "balance_loss_clip": 1.56620121, "balance_loss_mlp": 1.06175172, "epoch": 0.0016834510747031415, "flos": 19677647070720.0, "grad_norm": 31.75135772225323, "language_loss": 1.11496043, "learning_rate": 2.1454471497582483e-06, "loss": 1.1570704, "num_input_tokens_seen": 528605, "router_z_loss_clip": 13.59375, "router_z_loss_mlp": 2.25, "step": 28, "time_per_iteration": 2.720031976699829 }, { "auxiliary_loss_clip": 0.0287684, "auxiliary_loss_mlp": 0.01300301, "balance_loss_clip": 1.55965185, "balance_loss_mlp": 1.0716095, "epoch": 0.0017435743273711108, "flos": 20923819568640.0, "grad_norm": 66.86658138440043, "language_loss": 1.08543491, "learning_rate": 2.1680407726407727e-06, "loss": 1.12720633, "num_input_tokens_seen": 548515, "router_z_loss_clip": 13.1484375, "router_z_loss_mlp": 2.28710938, "step": 29, "time_per_iteration": 2.7586584091186523 }, { "auxiliary_loss_clip": 0.02855522, "auxiliary_loss_mlp": 0.01310865, "balance_loss_clip": 1.55363822, "balance_loss_mlp": 1.07368672, "epoch": 0.00180369758003908, "flos": 19528976678400.0, "grad_norm": 62.02926246580285, "language_loss": 1.23949707, "learning_rate": 2.189868360711334e-06, "loss": 1.28116107, "num_input_tokens_seen": 564025, "router_z_loss_clip": 13.0078125, "router_z_loss_mlp": 2.37011719, "step": 30, "time_per_iteration": 2.653543710708618 }, { "auxiliary_loss_clip": 0.02808271, "auxiliary_loss_mlp": 0.01299935, "balance_loss_clip": 1.5520829, "balance_loss_mlp": 1.07725203, "epoch": 0.0018638208327070496, "flos": 27453169100160.0, "grad_norm": 42.371809069278704, "language_loss": 1.09587204, "learning_rate": 2.2109801597326265e-06, "loss": 1.13695407, "num_input_tokens_seen": 583345, "router_z_loss_clip": 12.5625, "router_z_loss_mlp": 2.22558594, "step": 31, "time_per_iteration": 2.7936320304870605 }, { "auxiliary_loss_clip": 0.02761796, "auxiliary_loss_mlp": 0.01328903, "balance_loss_clip": 1.55384374, "balance_loss_mlp": 1.10364532, "epoch": 0.0019239440853750188, "flos": 13589460380160.0, "grad_norm": 21.60605904386998, "language_loss": 1.01039648, "learning_rate": 2.2314216284658796e-06, "loss": 1.05130339, "num_input_tokens_seen": 600010, "router_z_loss_clip": 12.078125, "router_z_loss_mlp": 2.25390625, "step": 32, "time_per_iteration": 2.6848011016845703 }, { "auxiliary_loss_clip": 0.02753326, "auxiliary_loss_mlp": 0.01331473, "balance_loss_clip": 1.54402947, "balance_loss_mlp": 1.10354495, "epoch": 0.001984067338042988, "flos": 11253866336640.0, "grad_norm": 49.09215801277915, "language_loss": 1.02446342, "learning_rate": 2.2512340280885094e-06, "loss": 1.06531143, "num_input_tokens_seen": 616295, "router_z_loss_clip": 12.0859375, "router_z_loss_mlp": 2.28320312, "step": 33, "time_per_iteration": 2.701322317123413 }, { "auxiliary_loss_clip": 0.02702875, "auxiliary_loss_mlp": 0.01312201, "balance_loss_clip": 1.54197383, "balance_loss_mlp": 1.09905469, "epoch": 0.0020441905907109576, "flos": 22386245898240.0, "grad_norm": 65.90723670698493, "language_loss": 0.94838488, "learning_rate": 2.270454923596497e-06, "loss": 0.98853564, "num_input_tokens_seen": 637640, "router_z_loss_clip": 11.6015625, "router_z_loss_mlp": 2.12988281, "step": 34, "time_per_iteration": 2.764930009841919 }, { "auxiliary_loss_clip": 0.02684131, "auxiliary_loss_mlp": 0.01303448, "balance_loss_clip": 1.52230144, "balance_loss_mlp": 1.08610559, "epoch": 0.0021043138433789266, "flos": 49778580337920.0, "grad_norm": 32.86618588314514, "language_loss": 0.82620692, "learning_rate": 2.2891186125067434e-06, "loss": 0.86608279, "num_input_tokens_seen": 659710, "router_z_loss_clip": 11.625, "router_z_loss_mlp": 2.17480469, "step": 35, "time_per_iteration": 2.9288556575775146 }, { "auxiliary_loss_clip": 0.02669394, "auxiliary_loss_mlp": 0.01346577, "balance_loss_clip": 1.53418922, "balance_loss_mlp": 1.12990212, "epoch": 0.002164437096046896, "flos": 20557961591040.0, "grad_norm": 444.95011720707805, "language_loss": 0.93787003, "learning_rate": 2.307256493152974e-06, "loss": 0.97802973, "num_input_tokens_seen": 679670, "router_z_loss_clip": 11.3515625, "router_z_loss_mlp": 2.16796875, "step": 36, "time_per_iteration": 2.775209903717041 }, { "auxiliary_loss_clip": 0.02642179, "auxiliary_loss_mlp": 0.01322472, "balance_loss_clip": 1.53311217, "balance_loss_mlp": 1.10903931, "epoch": 0.0022245603487148656, "flos": 26542295084160.0, "grad_norm": 35.91763941301846, "language_loss": 0.98899949, "learning_rate": 2.3248973825097614e-06, "loss": 1.02864599, "num_input_tokens_seen": 700170, "router_z_loss_clip": 11.0859375, "router_z_loss_mlp": 2.13476562, "step": 37, "time_per_iteration": 2.7308075428009033 }, { "auxiliary_loss_clip": 0.02620757, "auxiliary_loss_mlp": 0.01316861, "balance_loss_clip": 1.53019023, "balance_loss_mlp": 1.12459993, "epoch": 0.0022846836013828346, "flos": 20338188226560.0, "grad_norm": 22.968780722988555, "language_loss": 1.07722783, "learning_rate": 2.3420677916238357e-06, "loss": 1.11660409, "num_input_tokens_seen": 718545, "router_z_loss_clip": 10.8984375, "router_z_loss_mlp": 1.92285156, "step": 38, "time_per_iteration": 2.69674015045166 }, { "auxiliary_loss_clip": 0.02601211, "auxiliary_loss_mlp": 0.01287829, "balance_loss_clip": 1.52835381, "balance_loss_mlp": 1.09060931, "epoch": 0.002344806854050804, "flos": 26247575992320.0, "grad_norm": 99.21884267032314, "language_loss": 0.89673287, "learning_rate": 2.358792165262154e-06, "loss": 0.93562323, "num_input_tokens_seen": 739865, "router_z_loss_clip": 10.75, "router_z_loss_mlp": 1.97363281, "step": 39, "time_per_iteration": 2.737496852874756 }, { "auxiliary_loss_clip": 0.02588993, "auxiliary_loss_mlp": 0.01328981, "balance_loss_clip": 1.52042925, "balance_loss_mlp": 1.11154342, "epoch": 0.0024049301067187736, "flos": 11801539981440.0, "grad_norm": 123.46383223716182, "language_loss": 0.96389353, "learning_rate": 2.3750930912143747e-06, "loss": 1.00307322, "num_input_tokens_seen": 755770, "router_z_loss_clip": 10.6953125, "router_z_loss_mlp": 2.17480469, "step": 40, "time_per_iteration": 2.6452231407165527 }, { "auxiliary_loss_clip": 0.02517774, "auxiliary_loss_mlp": 0.013075, "balance_loss_clip": 1.50534654, "balance_loss_mlp": 1.10551167, "epoch": 0.0024650533593867426, "flos": 20631506688000.0, "grad_norm": 62.80579277283349, "language_loss": 0.99583501, "learning_rate": 2.3909914837471044e-06, "loss": 1.03408778, "num_input_tokens_seen": 773440, "router_z_loss_clip": 10.1328125, "router_z_loss_mlp": 2.02050781, "step": 41, "time_per_iteration": 2.6673805713653564 }, { "auxiliary_loss_clip": 0.02477733, "auxiliary_loss_mlp": 0.01301428, "balance_loss_clip": 1.4929018, "balance_loss_mlp": 1.1058296, "epoch": 0.002525176612054712, "flos": 18406122549120.0, "grad_norm": 90.31424620439675, "language_loss": 1.01721597, "learning_rate": 2.4065067449483835e-06, "loss": 1.05500758, "num_input_tokens_seen": 790455, "router_z_loss_clip": 9.859375, "router_z_loss_mlp": 1.95703125, "step": 42, "time_per_iteration": 2.6653618812561035 }, { "auxiliary_loss_clip": 0.02434214, "auxiliary_loss_mlp": 0.01327007, "balance_loss_clip": 1.48420453, "balance_loss_mlp": 1.12749863, "epoch": 0.0025852998647226816, "flos": 28184023128960.0, "grad_norm": 65.64319758603754, "language_loss": 1.0512743, "learning_rate": 2.4216569070848724e-06, "loss": 1.0888865, "num_input_tokens_seen": 810645, "router_z_loss_clip": 9.5, "router_z_loss_mlp": 1.99414062, "step": 43, "time_per_iteration": 2.7091448307037354 }, { "auxiliary_loss_clip": 0.02462031, "auxiliary_loss_mlp": 0.01331977, "balance_loss_clip": 1.48602533, "balance_loss_mlp": 1.11434817, "epoch": 0.0026454231173906506, "flos": 14283110897280.0, "grad_norm": 47.931902994763774, "language_loss": 1.0010066, "learning_rate": 2.4364587585915504e-06, "loss": 1.03894663, "num_input_tokens_seen": 827470, "router_z_loss_clip": 9.7421875, "router_z_loss_mlp": 2.17675781, "step": 44, "time_per_iteration": 2.6538941860198975 }, { "auxiliary_loss_clip": 0.0243126, "auxiliary_loss_mlp": 0.01321845, "balance_loss_clip": 1.48442781, "balance_loss_mlp": 1.12376654, "epoch": 0.00270554637005862, "flos": 22419211605120.0, "grad_norm": 140.88107798702873, "language_loss": 1.05196345, "learning_rate": 2.450927955901469e-06, "loss": 1.08949459, "num_input_tokens_seen": 847285, "router_z_loss_clip": 9.46875, "router_z_loss_mlp": 1.98046875, "step": 45, "time_per_iteration": 2.715353012084961 }, { "auxiliary_loss_clip": 0.0240242, "auxiliary_loss_mlp": 0.01341412, "balance_loss_clip": 1.48302877, "balance_loss_mlp": 1.15086818, "epoch": 0.0027656696227265896, "flos": 23985778440960.0, "grad_norm": 14.239642536771683, "language_loss": 1.06964588, "learning_rate": 2.465079122983384e-06, "loss": 1.10708416, "num_input_tokens_seen": 867545, "router_z_loss_clip": 9.1875, "router_z_loss_mlp": 1.90722656, "step": 46, "time_per_iteration": 2.7373414039611816 }, { "auxiliary_loss_clip": 0.02392466, "auxiliary_loss_mlp": 0.01309438, "balance_loss_clip": 1.47576904, "balance_loss_mlp": 1.12461567, "epoch": 0.0028257928753945586, "flos": 37669503087360.0, "grad_norm": 485.89104997546167, "language_loss": 0.94949025, "learning_rate": 2.4789259401737868e-06, "loss": 0.98650926, "num_input_tokens_seen": 889915, "router_z_loss_clip": 9.171875, "router_z_loss_mlp": 1.84765625, "step": 47, "time_per_iteration": 2.8968899250030518 }, { "auxiliary_loss_clip": 0.02338982, "auxiliary_loss_mlp": 0.01305856, "balance_loss_clip": 1.46775496, "balance_loss_mlp": 1.13085663, "epoch": 0.002885916128062528, "flos": 22454547609600.0, "grad_norm": 37.85568929195237, "language_loss": 0.92363203, "learning_rate": 2.492481223656015e-06, "loss": 0.96008039, "num_input_tokens_seen": 908975, "router_z_loss_clip": 8.7109375, "router_z_loss_mlp": 1.75195312, "step": 48, "time_per_iteration": 2.716738700866699 }, { "auxiliary_loss_clip": 0.02336327, "auxiliary_loss_mlp": 0.01283174, "balance_loss_clip": 1.4516778, "balance_loss_mlp": 1.09577692, "epoch": 0.0029460393807304976, "flos": 27012796358400.0, "grad_norm": 19.500970315534268, "language_loss": 0.95528328, "learning_rate": 2.5057569967437924e-06, "loss": 0.99147832, "num_input_tokens_seen": 929810, "router_z_loss_clip": 8.84375, "router_z_loss_mlp": 1.87402344, "step": 49, "time_per_iteration": 2.847170829772949 }, { "auxiliary_loss_clip": 0.02329412, "auxiliary_loss_mlp": 0.01258851, "balance_loss_clip": 1.45790052, "balance_loss_mlp": 1.0739336, "epoch": 0.0030061626333984666, "flos": 15851832549120.0, "grad_norm": 5.603780464897849, "language_loss": 0.96002185, "learning_rate": 2.51876455396287e-06, "loss": 0.99590451, "num_input_tokens_seen": 948650, "router_z_loss_clip": 8.734375, "router_z_loss_mlp": 1.84960938, "step": 50, "time_per_iteration": 2.8095386028289795 }, { "auxiliary_loss_clip": 0.02309276, "auxiliary_loss_mlp": 0.01242595, "balance_loss_clip": 1.45265245, "balance_loss_mlp": 1.07083797, "epoch": 0.003066285886066436, "flos": 31827052316160.0, "grad_norm": 28.45143537386336, "language_loss": 0.95147896, "learning_rate": 2.5315145187866316e-06, "loss": 0.98699766, "num_input_tokens_seen": 966455, "router_z_loss_clip": 8.5546875, "router_z_loss_mlp": 1.71679688, "step": 51, "time_per_iteration": 2.8305585384368896 }, { "auxiliary_loss_clip": 0.02256136, "auxiliary_loss_mlp": 0.01225732, "balance_loss_clip": 1.43516695, "balance_loss_mlp": 1.06351173, "epoch": 0.0031264091387344056, "flos": 41427482774400.0, "grad_norm": 94.42222103638578, "language_loss": 1.00930095, "learning_rate": 2.5440168957651953e-06, "loss": 1.04411972, "num_input_tokens_seen": 988110, "router_z_loss_clip": 8.21484375, "router_z_loss_mlp": 1.62207031, "step": 52, "time_per_iteration": 2.8783364295959473 }, { "auxiliary_loss_clip": 0.02247874, "auxiliary_loss_mlp": 0.01242103, "balance_loss_clip": 1.43771672, "balance_loss_mlp": 1.06796217, "epoch": 0.0031865323914023747, "flos": 23440941970560.0, "grad_norm": 63312.633389651266, "language_loss": 0.97376406, "learning_rate": 2.5562811176888872e-06, "loss": 1.00866389, "num_input_tokens_seen": 1008550, "router_z_loss_clip": 8.1015625, "router_z_loss_mlp": 1.74023438, "step": 53, "time_per_iteration": 2.6935160160064697 }, { "auxiliary_loss_clip": 0.02221918, "auxiliary_loss_mlp": 0.01199666, "balance_loss_clip": 1.43675661, "balance_loss_mlp": 1.03992581, "epoch": 0.003246655644070344, "flos": 14429195510400.0, "grad_norm": 247.4179747170599, "language_loss": 0.89060074, "learning_rate": 2.5683160883431093e-06, "loss": 0.92481649, "num_input_tokens_seen": 1026840, "router_z_loss_clip": 7.8671875, "router_z_loss_mlp": 1.59765625, "step": 54, "time_per_iteration": 2.752204418182373 }, { "auxiliary_loss_clip": 0.02200308, "auxiliary_loss_mlp": 0.01194186, "balance_loss_clip": 1.42074502, "balance_loss_mlp": 1.03787887, "epoch": 0.0033067788967383136, "flos": 35918247496320.0, "grad_norm": 101.76188740785923, "language_loss": 0.89212346, "learning_rate": 2.580130221340046e-06, "loss": 0.92606837, "num_input_tokens_seen": 1048875, "router_z_loss_clip": 7.79296875, "router_z_loss_mlp": 1.56445312, "step": 55, "time_per_iteration": 2.822024345397949 }, { "auxiliary_loss_clip": 0.02183447, "auxiliary_loss_mlp": 0.01195923, "balance_loss_clip": 1.40635633, "balance_loss_mlp": 1.02960277, "epoch": 0.003366902149406283, "flos": 22958732862720.0, "grad_norm": 172.27893092354108, "language_loss": 0.9559992, "learning_rate": 2.5917314754514246e-06, "loss": 0.98979294, "num_input_tokens_seen": 1066435, "router_z_loss_clip": 7.7734375, "router_z_loss_mlp": 1.66503906, "step": 56, "time_per_iteration": 2.7651193141937256 }, { "auxiliary_loss_clip": 0.0220836, "auxiliary_loss_mlp": 0.01165465, "balance_loss_clip": 1.41535497, "balance_loss_mlp": 1.01163781, "epoch": 0.003427025402074252, "flos": 26582838560640.0, "grad_norm": 23.064460238577755, "language_loss": 0.98894596, "learning_rate": 2.6031273868139713e-06, "loss": 1.02268422, "num_input_tokens_seen": 1090330, "router_z_loss_clip": 7.92578125, "router_z_loss_mlp": 1.53808594, "step": 57, "time_per_iteration": 2.7371528148651123 }, { "auxiliary_loss_clip": 0.02138088, "auxiliary_loss_mlp": 0.01206659, "balance_loss_clip": 1.40319359, "balance_loss_mlp": 1.05254483, "epoch": 0.0034871486547422216, "flos": 23951196622080.0, "grad_norm": 103.12747954339874, "language_loss": 1.04919195, "learning_rate": 2.614325098333948e-06, "loss": 1.08263934, "num_input_tokens_seen": 1109840, "router_z_loss_clip": 7.34765625, "router_z_loss_mlp": 1.54101562, "step": 58, "time_per_iteration": 2.6921706199645996 }, { "auxiliary_loss_clip": 0.02156405, "auxiliary_loss_mlp": 0.01169474, "balance_loss_clip": 1.41648829, "balance_loss_mlp": 1.01764917, "epoch": 0.003547271907410191, "flos": 21214983214080.0, "grad_norm": 145.14848650609827, "language_loss": 0.94359553, "learning_rate": 2.625331386578098e-06, "loss": 0.97685438, "num_input_tokens_seen": 1128415, "router_z_loss_clip": 7.3984375, "router_z_loss_mlp": 1.51660156, "step": 59, "time_per_iteration": 2.658005952835083 }, { "auxiliary_loss_clip": 0.02177339, "auxiliary_loss_mlp": 0.01133527, "balance_loss_clip": 1.42449474, "balance_loss_mlp": 0.98580259, "epoch": 0.00360739516007816, "flos": 16504903676160.0, "grad_norm": 142.4494646255864, "language_loss": 1.00314593, "learning_rate": 2.63615268640451e-06, "loss": 1.03625464, "num_input_tokens_seen": 1146515, "router_z_loss_clip": 7.5390625, "router_z_loss_mlp": 1.47558594, "step": 60, "time_per_iteration": 2.7726120948791504 }, { "auxiliary_loss_clip": 0.02182476, "auxiliary_loss_mlp": 0.01171676, "balance_loss_clip": 1.4197166, "balance_loss_mlp": 1.02185404, "epoch": 0.0036675184127461296, "flos": 19464805031040.0, "grad_norm": 77.88206981789193, "language_loss": 0.96767461, "learning_rate": 2.6467951135575943e-06, "loss": 1.00121617, "num_input_tokens_seen": 1166330, "router_z_loss_clip": 7.6328125, "router_z_loss_mlp": 1.5, "step": 61, "time_per_iteration": 4.129663944244385 }, { "auxiliary_loss_clip": 0.02184578, "auxiliary_loss_mlp": 0.01115908, "balance_loss_clip": 1.42362571, "balance_loss_mlp": 0.97962797, "epoch": 0.003727641665414099, "flos": 20957323979520.0, "grad_norm": 20.725746417473825, "language_loss": 0.93849456, "learning_rate": 2.657264485425803e-06, "loss": 0.97149944, "num_input_tokens_seen": 1186010, "router_z_loss_clip": 7.6015625, "router_z_loss_mlp": 1.36425781, "step": 62, "time_per_iteration": 4.0925445556640625 }, { "auxiliary_loss_clip": 0.02194393, "auxiliary_loss_mlp": 0.01165293, "balance_loss_clip": 1.42796373, "balance_loss_mlp": 1.00993955, "epoch": 0.003787764918082068, "flos": 18406050721920.0, "grad_norm": 27.073009705191886, "language_loss": 1.01741993, "learning_rate": 2.6675663401385186e-06, "loss": 1.05101681, "num_input_tokens_seen": 1204985, "router_z_loss_clip": 7.6640625, "router_z_loss_mlp": 1.55566406, "step": 63, "time_per_iteration": 2.694244861602783 }, { "auxiliary_loss_clip": 0.02193907, "auxiliary_loss_mlp": 0.01098423, "balance_loss_clip": 1.42856252, "balance_loss_mlp": 0.96071202, "epoch": 0.0038478881707500376, "flos": 12459243962880.0, "grad_norm": 1335.0696375835914, "language_loss": 1.06253374, "learning_rate": 2.677705954159056e-06, "loss": 1.09545708, "num_input_tokens_seen": 1223545, "router_z_loss_clip": 7.65234375, "router_z_loss_mlp": 1.37695312, "step": 64, "time_per_iteration": 2.651221990585327 }, { "auxiliary_loss_clip": 0.02214258, "auxiliary_loss_mlp": 0.01120772, "balance_loss_clip": 1.43472815, "balance_loss_mlp": 0.9811545, "epoch": 0.003908011423418007, "flos": 13553334276480.0, "grad_norm": 173.61352144807333, "language_loss": 0.92714763, "learning_rate": 2.6876883585136904e-06, "loss": 0.96049798, "num_input_tokens_seen": 1241175, "router_z_loss_clip": 7.80078125, "router_z_loss_mlp": 1.39648438, "step": 65, "time_per_iteration": 2.6508736610412598 }, { "auxiliary_loss_clip": 0.02212337, "auxiliary_loss_mlp": 0.01121401, "balance_loss_clip": 1.42848206, "balance_loss_mlp": 0.97749192, "epoch": 0.003968134676085976, "flos": 18333475292160.0, "grad_norm": 37.07558923721963, "language_loss": 0.92252994, "learning_rate": 2.697518353781685e-06, "loss": 0.95586729, "num_input_tokens_seen": 1259315, "router_z_loss_clip": 7.83203125, "router_z_loss_mlp": 1.43945312, "step": 66, "time_per_iteration": 2.714247941970825 }, { "auxiliary_loss_clip": 0.02225378, "auxiliary_loss_mlp": 0.01124971, "balance_loss_clip": 1.43057001, "balance_loss_mlp": 0.97419536, "epoch": 0.004028257928753946, "flos": 20485242506880.0, "grad_norm": 140.09729530731164, "language_loss": 1.02202392, "learning_rate": 2.7072005239581103e-06, "loss": 1.05552721, "num_input_tokens_seen": 1277055, "router_z_loss_clip": 7.953125, "router_z_loss_mlp": 1.5078125, "step": 67, "time_per_iteration": 2.6621944904327393 }, { "auxiliary_loss_clip": 0.02190293, "auxiliary_loss_mlp": 0.0113862, "balance_loss_clip": 1.42736208, "balance_loss_mlp": 1.00233984, "epoch": 0.004088381181421915, "flos": 18843837684480.0, "grad_norm": 22.033752812797886, "language_loss": 1.00362921, "learning_rate": 2.7167392492896727e-06, "loss": 1.0369184, "num_input_tokens_seen": 1294355, "router_z_loss_clip": 7.62890625, "router_z_loss_mlp": 1.36328125, "step": 68, "time_per_iteration": 2.664377450942993 }, { "auxiliary_loss_clip": 0.02195274, "auxiliary_loss_mlp": 0.01110621, "balance_loss_clip": 1.42951298, "balance_loss_mlp": 0.97691542, "epoch": 0.004148504434089885, "flos": 19427817000960.0, "grad_norm": 19.55597657055439, "language_loss": 1.01218116, "learning_rate": 2.7261387181735195e-06, "loss": 1.04524004, "num_input_tokens_seen": 1313525, "router_z_loss_clip": 7.6640625, "router_z_loss_mlp": 1.33789062, "step": 69, "time_per_iteration": 2.678802490234375 }, { "auxiliary_loss_clip": 0.02196015, "auxiliary_loss_mlp": 0.01142339, "balance_loss_clip": 1.42513716, "balance_loss_mlp": 1.00348401, "epoch": 0.004208627686757853, "flos": 20811023884800.0, "grad_norm": 125.86886574365333, "language_loss": 1.05500841, "learning_rate": 2.7354029381999196e-06, "loss": 1.0883919, "num_input_tokens_seen": 1330505, "router_z_loss_clip": 7.70703125, "router_z_loss_mlp": 1.38769531, "step": 70, "time_per_iteration": 2.725579023361206 }, { "auxiliary_loss_clip": 0.02225095, "auxiliary_loss_mlp": 0.01086273, "balance_loss_clip": 1.43085718, "balance_loss_mlp": 0.95094681, "epoch": 0.004268750939425823, "flos": 19098623831040.0, "grad_norm": 50.70332776348157, "language_loss": 1.06066394, "learning_rate": 2.7445357464116983e-06, "loss": 1.09377766, "num_input_tokens_seen": 1349615, "router_z_loss_clip": 7.94140625, "router_z_loss_mlp": 1.35449219, "step": 71, "time_per_iteration": 2.7464962005615234 }, { "auxiliary_loss_clip": 0.02049182, "auxiliary_loss_mlp": 0.00866868, "balance_loss_clip": 1.43269873, "balance_loss_mlp": 0.78904808, "epoch": 0.004328874192093792, "flos": 52439635514880.0, "grad_norm": 2.4793740484044093, "language_loss": 0.65862, "learning_rate": 2.75354081884615e-06, "loss": 0.6877805, "num_input_tokens_seen": 1410275, "router_z_loss_clip": 6.1875, "router_z_loss_mlp": 0.77734375, "step": 72, "time_per_iteration": 3.345369815826416 }, { "auxiliary_loss_clip": 0.02003517, "auxiliary_loss_mlp": 0.00987034, "balance_loss_clip": 1.41772044, "balance_loss_mlp": 0.90921462, "epoch": 0.004388997444761762, "flos": 66473239564800.0, "grad_norm": 95.07444356128, "language_loss": 0.63359886, "learning_rate": 2.7624216794188286e-06, "loss": 0.66350436, "num_input_tokens_seen": 1473020, "router_z_loss_clip": 5.875, "router_z_loss_mlp": 0.77734375, "step": 73, "time_per_iteration": 3.6851634979248047 }, { "auxiliary_loss_clip": 0.02176877, "auxiliary_loss_mlp": 0.01064429, "balance_loss_clip": 1.41189611, "balance_loss_mlp": 0.93730426, "epoch": 0.004449120697429731, "flos": 18952970181120.0, "grad_norm": 44.927423806678945, "language_loss": 0.93187869, "learning_rate": 2.771181708202938e-06, "loss": 0.96429169, "num_input_tokens_seen": 1490385, "router_z_loss_clip": 7.65234375, "router_z_loss_mlp": 1.27148438, "step": 74, "time_per_iteration": 2.992326259613037 }, { "auxiliary_loss_clip": 0.02179771, "auxiliary_loss_mlp": 0.01088197, "balance_loss_clip": 1.41291952, "balance_loss_mlp": 0.95744801, "epoch": 0.004509243950097701, "flos": 21105491581440.0, "grad_norm": 151.1805947336832, "language_loss": 1.03513861, "learning_rate": 2.779824149153005e-06, "loss": 1.06781816, "num_input_tokens_seen": 1509725, "router_z_loss_clip": 7.66015625, "router_z_loss_mlp": 1.30761719, "step": 75, "time_per_iteration": 2.9348511695861816 }, { "auxiliary_loss_clip": 0.02158967, "auxiliary_loss_mlp": 0.01081527, "balance_loss_clip": 1.40469193, "balance_loss_mlp": 0.95502257, "epoch": 0.004569367202765669, "flos": 20698730991360.0, "grad_norm": 25.40560884135099, "language_loss": 0.93477261, "learning_rate": 2.788352117317012e-06, "loss": 0.96717757, "num_input_tokens_seen": 1527245, "router_z_loss_clip": 7.54296875, "router_z_loss_mlp": 1.26513672, "step": 76, "time_per_iteration": 2.6567721366882324 }, { "auxiliary_loss_clip": 0.02172508, "auxiliary_loss_mlp": 0.01069911, "balance_loss_clip": 1.40563083, "balance_loss_mlp": 0.94402647, "epoch": 0.004629490455433639, "flos": 28658474899200.0, "grad_norm": 19.35003661391888, "language_loss": 0.96598023, "learning_rate": 2.796768605577095e-06, "loss": 0.9984045, "num_input_tokens_seen": 1548930, "router_z_loss_clip": 7.671875, "router_z_loss_mlp": 1.25878906, "step": 77, "time_per_iteration": 2.735846996307373 }, { "auxiliary_loss_clip": 0.02160073, "auxiliary_loss_mlp": 0.0107145, "balance_loss_clip": 1.40726519, "balance_loss_mlp": 0.95066726, "epoch": 0.004689613708101608, "flos": 11072409805440.0, "grad_norm": 25.48323626119104, "language_loss": 0.99792445, "learning_rate": 2.80507649095533e-06, "loss": 1.03023958, "num_input_tokens_seen": 1565695, "router_z_loss_clip": 7.53515625, "router_z_loss_mlp": 1.20751953, "step": 78, "time_per_iteration": 2.7138967514038086 }, { "auxiliary_loss_clip": 0.02170112, "auxiliary_loss_mlp": 0.01056534, "balance_loss_clip": 1.40753651, "balance_loss_mlp": 0.93827856, "epoch": 0.004749736960769578, "flos": 21799106184960.0, "grad_norm": 241.3534720633038, "language_loss": 0.89993793, "learning_rate": 2.813278540517843e-06, "loss": 0.93220437, "num_input_tokens_seen": 1582625, "router_z_loss_clip": 7.62109375, "router_z_loss_mlp": 1.18066406, "step": 79, "time_per_iteration": 2.6628527641296387 }, { "auxiliary_loss_clip": 0.0219081, "auxiliary_loss_mlp": 0.01064344, "balance_loss_clip": 1.40085912, "balance_loss_mlp": 0.93617076, "epoch": 0.004809860213437547, "flos": 19792597570560.0, "grad_norm": 91.02662319578228, "language_loss": 0.95361269, "learning_rate": 2.8213774169075505e-06, "loss": 0.98616421, "num_input_tokens_seen": 1601725, "router_z_loss_clip": 7.90234375, "router_z_loss_mlp": 1.28027344, "step": 80, "time_per_iteration": 2.656294822692871 }, { "auxiliary_loss_clip": 0.0217854, "auxiliary_loss_mlp": 0.01046879, "balance_loss_clip": 1.40287113, "balance_loss_mlp": 0.92290121, "epoch": 0.004869983466105517, "flos": 26574327037440.0, "grad_norm": 12.237590315667472, "language_loss": 1.00894713, "learning_rate": 2.829375683533245e-06, "loss": 1.04120135, "num_input_tokens_seen": 1622420, "router_z_loss_clip": 7.75, "router_z_loss_mlp": 1.24023438, "step": 81, "time_per_iteration": 2.7014718055725098 }, { "auxiliary_loss_clip": 0.02199176, "auxiliary_loss_mlp": 0.01068031, "balance_loss_clip": 1.40907454, "balance_loss_mlp": 0.93957126, "epoch": 0.004930106718773485, "flos": 12823378087680.0, "grad_norm": 36.006855457394046, "language_loss": 1.03224838, "learning_rate": 2.8372758094402803e-06, "loss": 1.06492043, "num_input_tokens_seen": 1640715, "router_z_loss_clip": 7.90234375, "router_z_loss_mlp": 1.28417969, "step": 82, "time_per_iteration": 2.7453949451446533 }, { "auxiliary_loss_clip": 0.02184234, "auxiliary_loss_mlp": 0.01075117, "balance_loss_clip": 1.39907908, "balance_loss_mlp": 0.9440825, "epoch": 0.004990229971441455, "flos": 25774919902080.0, "grad_norm": 464.8680436658363, "language_loss": 0.92199713, "learning_rate": 2.84508017388607e-06, "loss": 0.95459062, "num_input_tokens_seen": 1662210, "router_z_loss_clip": 7.8515625, "router_z_loss_mlp": 1.31152344, "step": 83, "time_per_iteration": 2.7285687923431396 }, { "auxiliary_loss_clip": 0.0217107, "auxiliary_loss_mlp": 0.01081099, "balance_loss_clip": 1.40076947, "balance_loss_mlp": 0.95521432, "epoch": 0.005050353224109424, "flos": 17457254922240.0, "grad_norm": 139.8480965057028, "language_loss": 0.97223222, "learning_rate": 2.852791070641559e-06, "loss": 1.00475383, "num_input_tokens_seen": 1681070, "router_z_loss_clip": 7.7109375, "router_z_loss_mlp": 1.25976562, "step": 84, "time_per_iteration": 2.6686079502105713 }, { "auxiliary_loss_clip": 0.02067746, "auxiliary_loss_mlp": 0.01042959, "balance_loss_clip": 1.42078733, "balance_loss_mlp": 0.9632315, "epoch": 0.005110476476777394, "flos": 69805460367360.0, "grad_norm": 1.182275793685502, "language_loss": 0.61750686, "learning_rate": 2.8604107120381682e-06, "loss": 0.64861381, "num_input_tokens_seen": 1747140, "router_z_loss_clip": 6.4375, "router_z_loss_mlp": 0.796875, "step": 85, "time_per_iteration": 3.2537620067596436 }, { "auxiliary_loss_clip": 0.02156857, "auxiliary_loss_mlp": 0.01063804, "balance_loss_clip": 1.39681888, "balance_loss_mlp": 0.94168591, "epoch": 0.005170599729445363, "flos": 24790105739520.0, "grad_norm": 40.025983457987955, "language_loss": 0.96041584, "learning_rate": 2.8679412327780482e-06, "loss": 0.99262238, "num_input_tokens_seen": 1767475, "router_z_loss_clip": 7.59765625, "router_z_loss_mlp": 1.22119141, "step": 86, "time_per_iteration": 2.7150163650512695 }, { "auxiliary_loss_clip": 0.02142156, "auxiliary_loss_mlp": 0.01070653, "balance_loss_clip": 1.38717043, "balance_loss_mlp": 0.94629389, "epoch": 0.005230722982113333, "flos": 23258048895360.0, "grad_norm": 51.90947836306321, "language_loss": 0.8888706, "learning_rate": 2.8753846935240833e-06, "loss": 0.92099869, "num_input_tokens_seen": 1784980, "router_z_loss_clip": 7.55078125, "router_z_loss_mlp": 1.24414062, "step": 87, "time_per_iteration": 2.7187232971191406 }, { "auxiliary_loss_clip": 0.02119578, "auxiliary_loss_mlp": 0.01057995, "balance_loss_clip": 1.37867022, "balance_loss_mlp": 0.94050193, "epoch": 0.005290846234781301, "flos": 16727909264640.0, "grad_norm": 67.43317709442191, "language_loss": 1.00044632, "learning_rate": 2.8827430842847267e-06, "loss": 1.03222203, "num_input_tokens_seen": 1803030, "router_z_loss_clip": 7.41015625, "router_z_loss_mlp": 1.17480469, "step": 88, "time_per_iteration": 2.732900381088257 }, { "auxiliary_loss_clip": 0.02129642, "auxiliary_loss_mlp": 0.01021508, "balance_loss_clip": 1.37711203, "balance_loss_mlp": 0.90749621, "epoch": 0.005350969487449271, "flos": 20886077352960.0, "grad_norm": 1253.0786690730397, "language_loss": 0.91518873, "learning_rate": 2.8900183276075957e-06, "loss": 0.94670022, "num_input_tokens_seen": 1822865, "router_z_loss_clip": 7.52734375, "router_z_loss_mlp": 1.13916016, "step": 89, "time_per_iteration": 2.736537456512451 }, { "auxiliary_loss_clip": 0.02130883, "auxiliary_loss_mlp": 0.01017509, "balance_loss_clip": 1.38667631, "balance_loss_mlp": 0.90893358, "epoch": 0.00541109274011724, "flos": 26209977431040.0, "grad_norm": 20.547820733114815, "language_loss": 0.98645926, "learning_rate": 2.8972122815946455e-06, "loss": 1.01794314, "num_input_tokens_seen": 1842435, "router_z_loss_clip": 7.44140625, "router_z_loss_mlp": 1.08642578, "step": 90, "time_per_iteration": 2.72572922706604 }, { "auxiliary_loss_clip": 0.02070509, "auxiliary_loss_mlp": 0.01016198, "balance_loss_clip": 1.35954988, "balance_loss_mlp": 0.90719283, "epoch": 0.00547121599278521, "flos": 21178569801600.0, "grad_norm": 230.31490285568063, "language_loss": 0.92482209, "learning_rate": 2.90432674275074e-06, "loss": 0.95568907, "num_input_tokens_seen": 1860065, "router_z_loss_clip": 7.109375, "router_z_loss_mlp": 1.09033203, "step": 91, "time_per_iteration": 2.7694270610809326 }, { "auxiliary_loss_clip": 0.02080731, "auxiliary_loss_mlp": 0.01033199, "balance_loss_clip": 1.36302543, "balance_loss_mlp": 0.92138112, "epoch": 0.005531339245453179, "flos": 19718801078400.0, "grad_norm": 6.924376700314359, "language_loss": 0.94420964, "learning_rate": 2.91136344867656e-06, "loss": 0.97534895, "num_input_tokens_seen": 1878135, "router_z_loss_clip": 7.17578125, "router_z_loss_mlp": 1.1171875, "step": 92, "time_per_iteration": 2.7769083976745605 }, { "auxiliary_loss_clip": 0.02088772, "auxiliary_loss_mlp": 0.01048953, "balance_loss_clip": 1.36210716, "balance_loss_mlp": 0.93150806, "epoch": 0.005591462498121149, "flos": 17636089760640.0, "grad_norm": 964.0360410097962, "language_loss": 1.02574992, "learning_rate": 2.918324080615938e-06, "loss": 1.05712712, "num_input_tokens_seen": 1894895, "router_z_loss_clip": 7.26171875, "router_z_loss_mlp": 1.17578125, "step": 93, "time_per_iteration": 2.6762332916259766 }, { "auxiliary_loss_clip": 0.02099035, "auxiliary_loss_mlp": 0.01025731, "balance_loss_clip": 1.36745358, "balance_loss_mlp": 0.9121002, "epoch": 0.005651585750789117, "flos": 20011221699840.0, "grad_norm": 23.0494808323307, "language_loss": 0.94119775, "learning_rate": 2.925210265866963e-06, "loss": 0.97244549, "num_input_tokens_seen": 1913220, "router_z_loss_clip": 7.3203125, "router_z_loss_mlp": 1.13671875, "step": 94, "time_per_iteration": 2.671448230743408 }, { "auxiliary_loss_clip": 0.0182264, "auxiliary_loss_mlp": 0.00722016, "balance_loss_clip": 1.31438065, "balance_loss_mlp": 0.67433184, "epoch": 0.005711709003457087, "flos": 59812957981440.0, "grad_norm": 1.1700782818630062, "language_loss": 0.67012691, "learning_rate": 2.932023580065507e-06, "loss": 0.69557345, "num_input_tokens_seen": 1970970, "router_z_loss_clip": 5.0625, "router_z_loss_mlp": 0.4765625, "step": 95, "time_per_iteration": 3.119431257247925 }, { "auxiliary_loss_clip": 0.02031375, "auxiliary_loss_mlp": 0.00947526, "balance_loss_clip": 1.34114122, "balance_loss_mlp": 0.8508234, "epoch": 0.005771832256125056, "flos": 15559591495680.0, "grad_norm": 77.34193106643143, "language_loss": 0.97181404, "learning_rate": 2.9387655493491906e-06, "loss": 1.00160301, "num_input_tokens_seen": 1988930, "router_z_loss_clip": 6.8984375, "router_z_loss_mlp": 0.96826172, "step": 96, "time_per_iteration": 2.642866611480713 }, { "auxiliary_loss_clip": 0.02009174, "auxiliary_loss_mlp": 0.00924142, "balance_loss_clip": 1.33040726, "balance_loss_mlp": 0.82615167, "epoch": 0.005831955508793026, "flos": 22528380015360.0, "grad_norm": 48.050049411535326, "language_loss": 0.97001088, "learning_rate": 2.9454376524092147e-06, "loss": 0.99934405, "num_input_tokens_seen": 2006285, "router_z_loss_clip": 6.78515625, "router_z_loss_mlp": 0.97998047, "step": 97, "time_per_iteration": 2.7164926528930664 }, { "auxiliary_loss_clip": 0.01991181, "auxiliary_loss_mlp": 0.00910185, "balance_loss_clip": 1.32509971, "balance_loss_mlp": 0.80623412, "epoch": 0.005892078761460995, "flos": 22049834094720.0, "grad_norm": 32.220105580473806, "language_loss": 0.81935883, "learning_rate": 2.952041322436969e-06, "loss": 0.84837246, "num_input_tokens_seen": 2024905, "router_z_loss_clip": 6.6640625, "router_z_loss_mlp": 1.03857422, "step": 98, "time_per_iteration": 2.789259433746338 }, { "auxiliary_loss_clip": 0.01684029, "auxiliary_loss_mlp": 0.0069636, "balance_loss_clip": 1.25979209, "balance_loss_mlp": 0.65306324, "epoch": 0.005952202014128965, "flos": 68539143317760.0, "grad_norm": 1.0758018993447673, "language_loss": 0.64717102, "learning_rate": 2.9585779489718204e-06, "loss": 0.67097485, "num_input_tokens_seen": 2086220, "router_z_loss_clip": 4.25, "router_z_loss_mlp": 0.43359375, "step": 99, "time_per_iteration": 3.288888692855835 }, { "auxiliary_loss_clip": 0.01948164, "auxiliary_loss_mlp": 0.00894639, "balance_loss_clip": 1.30847979, "balance_loss_mlp": 0.79788864, "epoch": 0.006012325266796933, "flos": 22960887678720.0, "grad_norm": 169.1745340797044, "language_loss": 0.98576927, "learning_rate": 2.9650488796560464e-06, "loss": 1.01419723, "num_input_tokens_seen": 2103365, "router_z_loss_clip": 6.39453125, "router_z_loss_mlp": 0.96728516, "step": 100, "time_per_iteration": 2.6935875415802 }, { "auxiliary_loss_clip": 0.01960751, "auxiliary_loss_mlp": 0.00878936, "balance_loss_clip": 1.3112216, "balance_loss_mlp": 0.7778942, "epoch": 0.006072448519464903, "flos": 17347942857600.0, "grad_norm": 47.07254933490482, "language_loss": 0.97072709, "learning_rate": 2.971455421902446e-06, "loss": 0.99912393, "num_input_tokens_seen": 2121995, "router_z_loss_clip": 6.5, "router_z_loss_mlp": 1.00927734, "step": 101, "time_per_iteration": 2.725368022918701 }, { "auxiliary_loss_clip": 0.01931863, "auxiliary_loss_mlp": 0.00826707, "balance_loss_clip": 1.30742383, "balance_loss_mlp": 0.73486876, "epoch": 0.006132571772132872, "flos": 24681116897280.0, "grad_norm": 48.141580874145475, "language_loss": 0.97692084, "learning_rate": 2.9777988444798075e-06, "loss": 1.00450659, "num_input_tokens_seen": 2141815, "router_z_loss_clip": 6.24609375, "router_z_loss_mlp": 0.91796875, "step": 102, "time_per_iteration": 2.732429265975952 }, { "auxiliary_loss_clip": 0.01920405, "auxiliary_loss_mlp": 0.00842023, "balance_loss_clip": 1.30536985, "balance_loss_mlp": 0.74698955, "epoch": 0.006192695024800842, "flos": 21465675210240.0, "grad_norm": 10.538882813098272, "language_loss": 0.9406389, "learning_rate": 2.9840803790210285e-06, "loss": 0.96826321, "num_input_tokens_seen": 2161125, "router_z_loss_clip": 6.16015625, "router_z_loss_mlp": 0.95117188, "step": 103, "time_per_iteration": 4.10395884513855 }, { "auxiliary_loss_clip": 0.01901529, "auxiliary_loss_mlp": 0.00808728, "balance_loss_clip": 1.29738867, "balance_loss_mlp": 0.71998864, "epoch": 0.006252818277468811, "flos": 17420410546560.0, "grad_norm": 97.33011414098947, "language_loss": 0.98254943, "learning_rate": 2.990301221458371e-06, "loss": 1.00965202, "num_input_tokens_seen": 2179510, "router_z_loss_clip": 6.0390625, "router_z_loss_mlp": 0.88671875, "step": 104, "time_per_iteration": 3.9994308948516846 }, { "auxiliary_loss_clip": 0.01880987, "auxiliary_loss_mlp": 0.00803401, "balance_loss_clip": 1.29134119, "balance_loss_mlp": 0.71485209, "epoch": 0.006312941530136781, "flos": 19099557584640.0, "grad_norm": 288.15173888344356, "language_loss": 1.03769183, "learning_rate": 2.9964625333900544e-06, "loss": 1.06453586, "num_input_tokens_seen": 2197870, "router_z_loss_clip": 5.89453125, "router_z_loss_mlp": 0.88525391, "step": 105, "time_per_iteration": 2.80711030960083 }, { "auxiliary_loss_clip": 0.01874455, "auxiliary_loss_mlp": 0.00826528, "balance_loss_clip": 1.28512263, "balance_loss_mlp": 0.72634411, "epoch": 0.006373064782804749, "flos": 24060831909120.0, "grad_norm": 386.94822596291954, "language_loss": 0.97950977, "learning_rate": 3.002565443382063e-06, "loss": 1.00651956, "num_input_tokens_seen": 2217495, "router_z_loss_clip": 5.89453125, "router_z_loss_mlp": 1.00244141, "step": 106, "time_per_iteration": 2.70206880569458 }, { "auxiliary_loss_clip": 0.01857568, "auxiliary_loss_mlp": 0.00825123, "balance_loss_clip": 1.27497029, "balance_loss_mlp": 0.727705, "epoch": 0.006433188035472719, "flos": 18332433797760.0, "grad_norm": 81.25182199587908, "language_loss": 0.92604768, "learning_rate": 3.008611048208843e-06, "loss": 0.95287454, "num_input_tokens_seen": 2236520, "router_z_loss_clip": 5.82421875, "router_z_loss_mlp": 0.97412109, "step": 107, "time_per_iteration": 2.6596474647521973 }, { "auxiliary_loss_clip": 0.01538748, "auxiliary_loss_mlp": 0.00482339, "balance_loss_clip": 1.18675041, "balance_loss_mlp": 0.4346548, "epoch": 0.006493311288140688, "flos": 62562387594240.0, "grad_norm": 0.9782370975553005, "language_loss": 0.64388406, "learning_rate": 3.014600414036285e-06, "loss": 0.66409492, "num_input_tokens_seen": 2300140, "router_z_loss_clip": 3.53125, "router_z_loss_mlp": 0.4765625, "step": 108, "time_per_iteration": 3.2340567111968994 }, { "auxiliary_loss_clip": 0.0181261, "auxiliary_loss_mlp": 0.00778817, "balance_loss_clip": 1.26208007, "balance_loss_mlp": 0.68945777, "epoch": 0.006553434540808658, "flos": 19500141035520.0, "grad_norm": 1347.3978630558404, "language_loss": 1.05323362, "learning_rate": 3.0205345775501937e-06, "loss": 1.07914793, "num_input_tokens_seen": 2317320, "router_z_loss_clip": 5.5078125, "router_z_loss_mlp": 0.89306641, "step": 109, "time_per_iteration": 2.7123146057128906 }, { "auxiliary_loss_clip": 0.01789383, "auxiliary_loss_mlp": 0.00806952, "balance_loss_clip": 1.25275242, "balance_loss_mlp": 0.71292007, "epoch": 0.006613557793476627, "flos": 21105132445440.0, "grad_norm": 28.25374315634908, "language_loss": 0.91203845, "learning_rate": 3.0264145470332218e-06, "loss": 0.93800175, "num_input_tokens_seen": 2337820, "router_z_loss_clip": 5.36328125, "router_z_loss_mlp": 0.93945312, "step": 110, "time_per_iteration": 2.713517189025879 }, { "auxiliary_loss_clip": 0.01780471, "auxiliary_loss_mlp": 0.00778436, "balance_loss_clip": 1.24193454, "balance_loss_mlp": 0.68774199, "epoch": 0.006673681046144597, "flos": 26030747543040.0, "grad_norm": 438.167999056945, "language_loss": 0.8765887, "learning_rate": 3.032241303393073e-06, "loss": 0.90217769, "num_input_tokens_seen": 2358560, "router_z_loss_clip": 5.38671875, "router_z_loss_mlp": 0.90673828, "step": 111, "time_per_iteration": 2.7387893199920654 }, { "auxiliary_loss_clip": 0.01762534, "auxiliary_loss_mlp": 0.00778245, "balance_loss_clip": 1.23906755, "balance_loss_mlp": 0.69146079, "epoch": 0.006733804298812566, "flos": 23147767163520.0, "grad_norm": 193.05633382336595, "language_loss": 1.0101614, "learning_rate": 3.0380158011446e-06, "loss": 1.03556919, "num_input_tokens_seen": 2379005, "router_z_loss_clip": 5.234375, "router_z_loss_mlp": 0.86767578, "step": 112, "time_per_iteration": 2.7015302181243896 }, { "auxiliary_loss_clip": 0.0175362, "auxiliary_loss_mlp": 0.00759923, "balance_loss_clip": 1.23133671, "balance_loss_mlp": 0.67528403, "epoch": 0.006793927551480535, "flos": 11764444210560.0, "grad_norm": 94.36321292513273, "language_loss": 0.86510372, "learning_rate": 3.0437389693482466e-06, "loss": 0.89023912, "num_input_tokens_seen": 2395610, "router_z_loss_clip": 5.22265625, "router_z_loss_mlp": 0.84667969, "step": 113, "time_per_iteration": 2.7100846767425537 }, { "auxiliary_loss_clip": 0.01737425, "auxiliary_loss_mlp": 0.00760596, "balance_loss_clip": 1.2305243, "balance_loss_mlp": 0.67481261, "epoch": 0.006854050804148504, "flos": 19171953446400.0, "grad_norm": 141.05572596946854, "language_loss": 1.01577854, "learning_rate": 3.0494117125071475e-06, "loss": 1.04075873, "num_input_tokens_seen": 2415005, "router_z_loss_clip": 5.07421875, "router_z_loss_mlp": 0.85742188, "step": 114, "time_per_iteration": 2.641103982925415 }, { "auxiliary_loss_clip": 0.01757001, "auxiliary_loss_mlp": 0.00748572, "balance_loss_clip": 1.23863637, "balance_loss_mlp": 0.66398066, "epoch": 0.006914174056816474, "flos": 21981891519360.0, "grad_norm": 102.45540254769315, "language_loss": 1.01735425, "learning_rate": 3.055034911425055e-06, "loss": 1.04241002, "num_input_tokens_seen": 2433965, "router_z_loss_clip": 5.1875, "router_z_loss_mlp": 0.84570312, "step": 115, "time_per_iteration": 2.7194206714630127 }, { "auxiliary_loss_clip": 0.01753099, "auxiliary_loss_mlp": 0.00773998, "balance_loss_clip": 1.23089552, "balance_loss_mlp": 0.68082356, "epoch": 0.006974297309484443, "flos": 16289152634880.0, "grad_norm": 570.2237930641412, "language_loss": 0.90330911, "learning_rate": 3.0606094240271244e-06, "loss": 0.92858005, "num_input_tokens_seen": 2451605, "router_z_loss_clip": 5.2265625, "router_z_loss_mlp": 0.93212891, "step": 116, "time_per_iteration": 2.6253557205200195 }, { "auxiliary_loss_clip": 0.01743625, "auxiliary_loss_mlp": 0.00744717, "balance_loss_clip": 1.23006773, "balance_loss_mlp": 0.65712225, "epoch": 0.007034420562152413, "flos": 26104005331200.0, "grad_norm": 52.22874466357756, "language_loss": 0.96376693, "learning_rate": 3.0661360861454656e-06, "loss": 0.98865038, "num_input_tokens_seen": 2472035, "router_z_loss_clip": 5.1328125, "router_z_loss_mlp": 0.87597656, "step": 117, "time_per_iteration": 2.8993194103240967 }, { "auxiliary_loss_clip": 0.01747987, "auxiliary_loss_mlp": 0.00784745, "balance_loss_clip": 1.23875809, "balance_loss_mlp": 0.69343102, "epoch": 0.007094543814820382, "flos": 14204609723520.0, "grad_norm": 52.47309873150167, "language_loss": 0.93600887, "learning_rate": 3.071615712271274e-06, "loss": 0.96133614, "num_input_tokens_seen": 2489285, "router_z_loss_clip": 5.08984375, "router_z_loss_mlp": 0.9140625, "step": 118, "time_per_iteration": 2.6580498218536377 }, { "auxiliary_loss_clip": 0.01769533, "auxiliary_loss_mlp": 0.00760741, "balance_loss_clip": 1.23823047, "balance_loss_mlp": 0.67033303, "epoch": 0.007154667067488351, "flos": 14976007228800.0, "grad_norm": 69.29585503079377, "language_loss": 1.05577409, "learning_rate": 3.0770490962752172e-06, "loss": 1.08107674, "num_input_tokens_seen": 2506460, "router_z_loss_clip": 5.3125, "router_z_loss_mlp": 0.90429688, "step": 119, "time_per_iteration": 2.6431193351745605 }, { "auxiliary_loss_clip": 0.01785732, "auxiliary_loss_mlp": 0.00832628, "balance_loss_clip": 1.24209511, "balance_loss_mlp": 0.73006034, "epoch": 0.00721479032015632, "flos": 20193288762240.0, "grad_norm": 141.55905065710016, "language_loss": 0.99686277, "learning_rate": 3.082437012097686e-06, "loss": 1.02304626, "num_input_tokens_seen": 2525565, "router_z_loss_clip": 5.43359375, "router_z_loss_mlp": 1.02587891, "step": 120, "time_per_iteration": 2.673266887664795 }, { "auxiliary_loss_clip": 0.01751374, "auxiliary_loss_mlp": 0.0078848, "balance_loss_clip": 1.23402703, "balance_loss_mlp": 0.69654536, "epoch": 0.00727491357282429, "flos": 23147228459520.0, "grad_norm": 77.25770422616483, "language_loss": 0.98269451, "learning_rate": 3.0877802144103967e-06, "loss": 1.00809312, "num_input_tokens_seen": 2546605, "router_z_loss_clip": 5.17578125, "router_z_loss_mlp": 0.91943359, "step": 121, "time_per_iteration": 2.6723721027374268 }, { "auxiliary_loss_clip": 0.01766994, "auxiliary_loss_mlp": 0.00786209, "balance_loss_clip": 1.24295712, "balance_loss_mlp": 0.6934641, "epoch": 0.007335036825492259, "flos": 15521669712000.0, "grad_norm": 36.64401628206922, "language_loss": 0.9861933, "learning_rate": 3.09307943925077e-06, "loss": 1.01172543, "num_input_tokens_seen": 2560730, "router_z_loss_clip": 5.234375, "router_z_loss_mlp": 0.92724609, "step": 122, "time_per_iteration": 2.5889391899108887 }, { "auxiliary_loss_clip": 0.01771978, "auxiliary_loss_mlp": 0.00776311, "balance_loss_clip": 1.24376607, "balance_loss_mlp": 0.68194455, "epoch": 0.007395160078160229, "flos": 24243365848320.0, "grad_norm": 60.670169491101674, "language_loss": 1.00440967, "learning_rate": 3.0983354046304154e-06, "loss": 1.02989244, "num_input_tokens_seen": 2579550, "router_z_loss_clip": 5.28125, "router_z_loss_mlp": 0.94384766, "step": 123, "time_per_iteration": 2.6796882152557373 }, { "auxiliary_loss_clip": 0.01775125, "auxiliary_loss_mlp": 0.00781009, "balance_loss_clip": 1.24194121, "balance_loss_mlp": 0.69408208, "epoch": 0.007455283330828198, "flos": 31759792099200.0, "grad_norm": 35.394390487063916, "language_loss": 0.77560151, "learning_rate": 3.103548811118979e-06, "loss": 0.80116284, "num_input_tokens_seen": 2600390, "router_z_loss_clip": 5.3359375, "router_z_loss_mlp": 0.86962891, "step": 124, "time_per_iteration": 2.713456153869629 }, { "auxiliary_loss_clip": 0.01752454, "auxiliary_loss_mlp": 0.00767223, "balance_loss_clip": 1.24400115, "balance_loss_mlp": 0.67729127, "epoch": 0.007515406583496167, "flos": 26615157822720.0, "grad_norm": 428.15630342270236, "language_loss": 0.96539855, "learning_rate": 3.108720342404542e-06, "loss": 0.99059528, "num_input_tokens_seen": 2620770, "router_z_loss_clip": 5.08984375, "router_z_loss_mlp": 0.89990234, "step": 125, "time_per_iteration": 2.6760177612304688 }, { "auxiliary_loss_clip": 0.017887, "auxiliary_loss_mlp": 0.00797788, "balance_loss_clip": 1.25548112, "balance_loss_mlp": 0.70299309, "epoch": 0.007575529836164136, "flos": 18223696350720.0, "grad_norm": 22.260902466392288, "language_loss": 0.90518093, "learning_rate": 3.1138506658316945e-06, "loss": 0.93104583, "num_input_tokens_seen": 2639900, "router_z_loss_clip": 5.328125, "router_z_loss_mlp": 0.94824219, "step": 126, "time_per_iteration": 2.651451587677002 }, { "auxiliary_loss_clip": 0.01788478, "auxiliary_loss_mlp": 0.00772681, "balance_loss_clip": 1.25122523, "balance_loss_mlp": 0.68513334, "epoch": 0.007635653088832106, "flos": 21580410228480.0, "grad_norm": 167.07985154842163, "language_loss": 0.77010608, "learning_rate": 3.1189404329183404e-06, "loss": 0.79571766, "num_input_tokens_seen": 2657450, "router_z_loss_clip": 5.375, "router_z_loss_mlp": 0.87646484, "step": 127, "time_per_iteration": 2.643202543258667 }, { "auxiliary_loss_clip": 0.01773221, "auxiliary_loss_mlp": 0.00768555, "balance_loss_clip": 1.25460029, "balance_loss_mlp": 0.68124622, "epoch": 0.007695776341500075, "flos": 25375054723200.0, "grad_norm": 53.59254096758381, "language_loss": 0.93846619, "learning_rate": 3.1239902798522317e-06, "loss": 0.96388394, "num_input_tokens_seen": 2678150, "router_z_loss_clip": 5.1875, "router_z_loss_mlp": 0.87353516, "step": 128, "time_per_iteration": 2.684917688369751 }, { "auxiliary_loss_clip": 0.01775106, "auxiliary_loss_mlp": 0.00763329, "balance_loss_clip": 1.24901199, "balance_loss_mlp": 0.67187178, "epoch": 0.007755899594168045, "flos": 22343906741760.0, "grad_norm": 39.02329277609331, "language_loss": 0.90290201, "learning_rate": 3.129000827968184e-06, "loss": 0.92828631, "num_input_tokens_seen": 2698290, "router_z_loss_clip": 5.26171875, "router_z_loss_mlp": 0.91552734, "step": 129, "time_per_iteration": 2.6390843391418457 }, { "auxiliary_loss_clip": 0.01756609, "auxiliary_loss_mlp": 0.00773448, "balance_loss_clip": 1.2491858, "balance_loss_mlp": 0.68041694, "epoch": 0.007816022846836013, "flos": 22638230784000.0, "grad_norm": 12.976926796179098, "language_loss": 1.03236341, "learning_rate": 3.133972684206866e-06, "loss": 1.05766404, "num_input_tokens_seen": 2717630, "router_z_loss_clip": 5.07421875, "router_z_loss_mlp": 0.92919922, "step": 130, "time_per_iteration": 2.7191576957702637 }, { "auxiliary_loss_clip": 0.01750998, "auxiliary_loss_mlp": 0.00757384, "balance_loss_clip": 1.24581182, "balance_loss_mlp": 0.67098141, "epoch": 0.007876146099503984, "flos": 18182901479040.0, "grad_norm": 57.47025478369975, "language_loss": 0.88960469, "learning_rate": 3.138906441556014e-06, "loss": 0.91468859, "num_input_tokens_seen": 2735835, "router_z_loss_clip": 5.0546875, "router_z_loss_mlp": 0.86376953, "step": 131, "time_per_iteration": 2.644083261489868 }, { "auxiliary_loss_clip": 0.01776655, "auxiliary_loss_mlp": 0.00786279, "balance_loss_clip": 1.25448513, "balance_loss_mlp": 0.6949169, "epoch": 0.007936269352171952, "flos": 27119486730240.0, "grad_norm": 154.18035094910812, "language_loss": 0.87372893, "learning_rate": 3.143802679474861e-06, "loss": 0.89935827, "num_input_tokens_seen": 2756335, "router_z_loss_clip": 5.22265625, "router_z_loss_mlp": 0.91357422, "step": 132, "time_per_iteration": 2.750429153442383 }, { "auxiliary_loss_clip": 0.01762673, "auxiliary_loss_mlp": 0.00787466, "balance_loss_clip": 1.24702501, "balance_loss_mlp": 0.69743967, "epoch": 0.007996392604839923, "flos": 19026335710080.0, "grad_norm": 88.40975584435802, "language_loss": 1.02845812, "learning_rate": 3.1486619643025565e-06, "loss": 1.05395961, "num_input_tokens_seen": 2775090, "router_z_loss_clip": 5.15625, "router_z_loss_mlp": 0.90087891, "step": 133, "time_per_iteration": 2.6239280700683594 }, { "auxiliary_loss_clip": 0.01744155, "auxiliary_loss_mlp": 0.00788276, "balance_loss_clip": 1.2458055, "balance_loss_mlp": 0.69848716, "epoch": 0.008056515857507891, "flos": 25484151306240.0, "grad_norm": 95.20562204907687, "language_loss": 0.78226084, "learning_rate": 3.153484849651286e-06, "loss": 0.80758518, "num_input_tokens_seen": 2795320, "router_z_loss_clip": 4.984375, "router_z_loss_mlp": 0.89746094, "step": 134, "time_per_iteration": 2.7143054008483887 }, { "auxiliary_loss_clip": 0.0174332, "auxiliary_loss_mlp": 0.00777515, "balance_loss_clip": 1.24046564, "balance_loss_mlp": 0.68596184, "epoch": 0.00811663911017586, "flos": 20557566541440.0, "grad_norm": 436.21684023090097, "language_loss": 0.97615826, "learning_rate": 3.1582718767847806e-06, "loss": 1.00136662, "num_input_tokens_seen": 2812815, "router_z_loss_clip": 5.02734375, "router_z_loss_mlp": 0.91650391, "step": 135, "time_per_iteration": 2.610452651977539 }, { "auxiliary_loss_clip": 0.01740659, "auxiliary_loss_mlp": 0.00724727, "balance_loss_clip": 1.24308956, "balance_loss_mlp": 0.64170939, "epoch": 0.00817676236284383, "flos": 18799738761600.0, "grad_norm": 1203.2494406193493, "language_loss": 0.95668411, "learning_rate": 3.1630235749828485e-06, "loss": 0.9813379, "num_input_tokens_seen": 2830445, "router_z_loss_clip": 4.98046875, "router_z_loss_mlp": 0.82958984, "step": 136, "time_per_iteration": 2.6823713779449463 }, { "auxiliary_loss_clip": 0.01759191, "auxiliary_loss_mlp": 0.00758185, "balance_loss_clip": 1.24827206, "balance_loss_mlp": 0.67178237, "epoch": 0.008236885615511799, "flos": 23873593288320.0, "grad_norm": 780.2701440131653, "language_loss": 0.92436039, "learning_rate": 3.1677404618925676e-06, "loss": 0.94953412, "num_input_tokens_seen": 2846965, "router_z_loss_clip": 5.1171875, "router_z_loss_mlp": 0.86376953, "step": 137, "time_per_iteration": 2.6557068824768066 }, { "auxiliary_loss_clip": 0.01746173, "auxiliary_loss_mlp": 0.00751694, "balance_loss_clip": 1.24646688, "balance_loss_mlp": 0.66729343, "epoch": 0.00829700886817977, "flos": 24643626076800.0, "grad_norm": 134.19126751062498, "language_loss": 0.95734376, "learning_rate": 3.1724230438666953e-06, "loss": 0.98232239, "num_input_tokens_seen": 2867520, "router_z_loss_clip": 5.0, "router_z_loss_mlp": 0.84375, "step": 138, "time_per_iteration": 2.713535785675049 }, { "auxiliary_loss_clip": 0.01732029, "auxiliary_loss_mlp": 0.00795707, "balance_loss_clip": 1.24104059, "balance_loss_mlp": 0.7008642, "epoch": 0.008357132120847738, "flos": 25262007644160.0, "grad_norm": 598.7182429439498, "language_loss": 0.96529615, "learning_rate": 3.177071816289865e-06, "loss": 0.99057353, "num_input_tokens_seen": 2885675, "router_z_loss_clip": 4.9140625, "router_z_loss_mlp": 0.94824219, "step": 139, "time_per_iteration": 2.7126951217651367 }, { "auxiliary_loss_clip": 0.01754028, "auxiliary_loss_mlp": 0.00786769, "balance_loss_clip": 1.24849916, "balance_loss_mlp": 0.69273639, "epoch": 0.008417255373515706, "flos": 27344898529920.0, "grad_norm": 886.678802051824, "language_loss": 0.95855319, "learning_rate": 3.181687263893095e-06, "loss": 0.98396116, "num_input_tokens_seen": 2905960, "router_z_loss_clip": 5.0625, "router_z_loss_mlp": 0.93994141, "step": 140, "time_per_iteration": 2.7129945755004883 }, { "auxiliary_loss_clip": 0.01739299, "auxiliary_loss_mlp": 0.00790665, "balance_loss_clip": 1.24510455, "balance_loss_mlp": 0.69415271, "epoch": 0.008477378626183677, "flos": 17639070589440.0, "grad_norm": 38137.036250938516, "language_loss": 0.92888939, "learning_rate": 3.186269861057098e-06, "loss": 0.95418906, "num_input_tokens_seen": 2922780, "router_z_loss_clip": 4.9453125, "router_z_loss_mlp": 0.96533203, "step": 141, "time_per_iteration": 2.64109206199646 }, { "auxiliary_loss_clip": 0.01743335, "auxiliary_loss_mlp": 0.00795547, "balance_loss_clip": 1.2420994, "balance_loss_mlp": 0.69979835, "epoch": 0.008537501878851645, "flos": 13881342297600.0, "grad_norm": 164.6775671611334, "language_loss": 0.88635027, "learning_rate": 3.1908200721048745e-06, "loss": 0.91173911, "num_input_tokens_seen": 2938765, "router_z_loss_clip": 5.02734375, "router_z_loss_mlp": 0.95849609, "step": 142, "time_per_iteration": 2.650099039077759 }, { "auxiliary_loss_clip": 0.01565533, "auxiliary_loss_mlp": 0.00581501, "balance_loss_clip": 1.21773124, "balance_loss_mlp": 0.52504373, "epoch": 0.008597625131519616, "flos": 71248101281280.0, "grad_norm": 5.389600543782937, "language_loss": 0.66106755, "learning_rate": 3.195338351584042e-06, "loss": 0.68253791, "num_input_tokens_seen": 3006665, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.56640625, "step": 143, "time_per_iteration": 3.493806838989258 }, { "auxiliary_loss_clip": 0.01715963, "auxiliary_loss_mlp": 0.00759861, "balance_loss_clip": 1.24156356, "balance_loss_mlp": 0.67283833, "epoch": 0.008657748384187584, "flos": 17602836744960.0, "grad_norm": 68.87181438319264, "language_loss": 0.92641509, "learning_rate": 3.1998251445393258e-06, "loss": 0.95117337, "num_input_tokens_seen": 3024335, "router_z_loss_clip": 4.73828125, "router_z_loss_mlp": 0.87011719, "step": 144, "time_per_iteration": 2.8749797344207764 }, { "auxiliary_loss_clip": 0.01706701, "auxiliary_loss_mlp": 0.00772767, "balance_loss_clip": 1.24137139, "balance_loss_mlp": 0.68245435, "epoch": 0.008717871636855555, "flos": 19715317459200.0, "grad_norm": 17.116239848777173, "language_loss": 0.95647281, "learning_rate": 3.204280886775619e-06, "loss": 0.98126757, "num_input_tokens_seen": 3043300, "router_z_loss_clip": 4.64453125, "router_z_loss_mlp": 0.90429688, "step": 145, "time_per_iteration": 4.194197177886963 }, { "auxiliary_loss_clip": 0.01720869, "auxiliary_loss_mlp": 0.00800799, "balance_loss_clip": 1.24171853, "balance_loss_mlp": 0.70237935, "epoch": 0.008777994889523523, "flos": 24717422568960.0, "grad_norm": 98.68573447680528, "language_loss": 0.9223997, "learning_rate": 3.208706005112005e-06, "loss": 0.9476164, "num_input_tokens_seen": 3064610, "router_z_loss_clip": 4.796875, "router_z_loss_mlp": 0.984375, "step": 146, "time_per_iteration": 4.1476662158966064 }, { "auxiliary_loss_clip": 0.01533834, "auxiliary_loss_mlp": 0.00530087, "balance_loss_clip": 1.21718585, "balance_loss_mlp": 0.48507333, "epoch": 0.008838118142191492, "flos": 70132067758080.0, "grad_norm": 0.9153580819035948, "language_loss": 0.59278697, "learning_rate": 3.213100917627104e-06, "loss": 0.61342621, "num_input_tokens_seen": 3130385, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.44921875, "step": 147, "time_per_iteration": 3.3022539615631104 }, { "auxiliary_loss_clip": 0.01673189, "auxiliary_loss_mlp": 0.00775806, "balance_loss_clip": 1.22842979, "balance_loss_mlp": 0.68668514, "epoch": 0.008898241394859462, "flos": 20044797937920.0, "grad_norm": 48.01713386653333, "language_loss": 0.9028042, "learning_rate": 3.2174660338961135e-06, "loss": 0.92729408, "num_input_tokens_seen": 3149760, "router_z_loss_clip": 4.44921875, "router_z_loss_mlp": 0.89111328, "step": 148, "time_per_iteration": 2.662189483642578 }, { "auxiliary_loss_clip": 0.01653112, "auxiliary_loss_mlp": 0.00740834, "balance_loss_clip": 1.21770346, "balance_loss_mlp": 0.65886551, "epoch": 0.008958364647527431, "flos": 10743611685120.0, "grad_norm": 91.81741903217146, "language_loss": 0.96288693, "learning_rate": 3.2218017552198588e-06, "loss": 0.98682642, "num_input_tokens_seen": 3164500, "router_z_loss_clip": 4.35546875, "router_z_loss_mlp": 0.81933594, "step": 149, "time_per_iteration": 2.6241729259490967 }, { "auxiliary_loss_clip": 0.01670923, "auxiliary_loss_mlp": 0.00782092, "balance_loss_clip": 1.22522414, "balance_loss_mlp": 0.69402009, "epoch": 0.009018487900195401, "flos": 29127467802240.0, "grad_norm": 676.3265451256625, "language_loss": 0.99732471, "learning_rate": 3.226108474846181e-06, "loss": 1.02185488, "num_input_tokens_seen": 3182455, "router_z_loss_clip": 4.46875, "router_z_loss_mlp": 0.88037109, "step": 150, "time_per_iteration": 2.6795654296875 }, { "auxiliary_loss_clip": 0.01649499, "auxiliary_loss_mlp": 0.00763406, "balance_loss_clip": 1.21304679, "balance_loss_mlp": 0.68344015, "epoch": 0.00907861115286337, "flos": 32963661354240.0, "grad_norm": 1049.5309711403313, "language_loss": 0.8071866, "learning_rate": 3.2303865781839817e-06, "loss": 0.83131564, "num_input_tokens_seen": 3203995, "router_z_loss_clip": 4.3671875, "router_z_loss_mlp": 0.79980469, "step": 151, "time_per_iteration": 2.789259195327759 }, { "auxiliary_loss_clip": 0.01661436, "auxiliary_loss_mlp": 0.00767244, "balance_loss_clip": 1.21939147, "balance_loss_mlp": 0.68546689, "epoch": 0.009138734405531338, "flos": 21762441377280.0, "grad_norm": 245.5696027946517, "language_loss": 0.97121191, "learning_rate": 3.234636443010188e-06, "loss": 0.99549878, "num_input_tokens_seen": 3222575, "router_z_loss_clip": 4.421875, "router_z_loss_mlp": 0.81835938, "step": 152, "time_per_iteration": 2.8077077865600586 }, { "auxiliary_loss_clip": 0.01641907, "auxiliary_loss_mlp": 0.0075845, "balance_loss_clip": 1.21144855, "balance_loss_mlp": 0.67652923, "epoch": 0.009198857658199309, "flos": 20842517134080.0, "grad_norm": 21.13924437047166, "language_loss": 0.95071375, "learning_rate": 3.238858439669943e-06, "loss": 0.97471732, "num_input_tokens_seen": 3240180, "router_z_loss_clip": 4.3046875, "router_z_loss_mlp": 0.81982422, "step": 153, "time_per_iteration": 2.779595136642456 }, { "auxiliary_loss_clip": 0.01638204, "auxiliary_loss_mlp": 0.00748329, "balance_loss_clip": 1.20797575, "balance_loss_mlp": 0.66631269, "epoch": 0.009258980910867277, "flos": 24827381078400.0, "grad_norm": 12.885582570076016, "language_loss": 0.95361781, "learning_rate": 3.2430529312702712e-06, "loss": 0.97748315, "num_input_tokens_seen": 3259800, "router_z_loss_clip": 4.30273438, "router_z_loss_mlp": 0.8203125, "step": 154, "time_per_iteration": 2.7090158462524414 }, { "auxiliary_loss_clip": 0.0162463, "auxiliary_loss_mlp": 0.00739291, "balance_loss_clip": 1.20600653, "balance_loss_mlp": 0.6628536, "epoch": 0.009319104163535248, "flos": 28767786963840.0, "grad_norm": 80.2292186789532, "language_loss": 0.96439672, "learning_rate": 3.2472202738674737e-06, "loss": 0.98803592, "num_input_tokens_seen": 3280400, "router_z_loss_clip": 4.1875, "router_z_loss_mlp": 0.76416016, "step": 155, "time_per_iteration": 2.713118076324463 }, { "auxiliary_loss_clip": 0.01637668, "auxiliary_loss_mlp": 0.00736488, "balance_loss_clip": 1.20523357, "balance_loss_mlp": 0.65842986, "epoch": 0.009379227416203216, "flos": 16582004219520.0, "grad_norm": 46.789258511663554, "language_loss": 0.96475333, "learning_rate": 3.2513608166485063e-06, "loss": 0.98849487, "num_input_tokens_seen": 3297600, "router_z_loss_clip": 4.3203125, "router_z_loss_mlp": 0.78027344, "step": 156, "time_per_iteration": 2.712151288986206 }, { "auxiliary_loss_clip": 0.01629038, "auxiliary_loss_mlp": 0.00696354, "balance_loss_clip": 1.21185493, "balance_loss_mlp": 0.62482822, "epoch": 0.009439350668871187, "flos": 18329919845760.0, "grad_norm": 66.26567585416505, "language_loss": 1.07964456, "learning_rate": 3.2554749021065498e-06, "loss": 1.10289848, "num_input_tokens_seen": 3313635, "router_z_loss_clip": 4.171875, "router_z_loss_mlp": 0.71484375, "step": 157, "time_per_iteration": 2.6144847869873047 }, { "auxiliary_loss_clip": 0.01609093, "auxiliary_loss_mlp": 0.00750568, "balance_loss_clip": 1.20453691, "balance_loss_mlp": 0.67498958, "epoch": 0.009499473921539155, "flos": 24349912565760.0, "grad_norm": 117.80569634588933, "language_loss": 0.96375167, "learning_rate": 3.2595628662110186e-06, "loss": 0.98734832, "num_input_tokens_seen": 3333735, "router_z_loss_clip": 4.05078125, "router_z_loss_mlp": 0.75634766, "step": 158, "time_per_iteration": 2.7037343978881836 }, { "auxiliary_loss_clip": 0.01609644, "auxiliary_loss_mlp": 0.00731084, "balance_loss_clip": 1.19778621, "balance_loss_mlp": 0.65569592, "epoch": 0.009559597174207124, "flos": 16399326625920.0, "grad_norm": 149.38512302956997, "language_loss": 0.95625806, "learning_rate": 3.2636250385721982e-06, "loss": 0.97966534, "num_input_tokens_seen": 3348800, "router_z_loss_clip": 4.1171875, "router_z_loss_mlp": 0.75390625, "step": 159, "time_per_iteration": 2.5914180278778076 }, { "auxiliary_loss_clip": 0.0159316, "auxiliary_loss_mlp": 0.00751342, "balance_loss_clip": 1.19011819, "balance_loss_mlp": 0.67261571, "epoch": 0.009619720426875094, "flos": 22856890826880.0, "grad_norm": 30.788354582864105, "language_loss": 0.93491191, "learning_rate": 3.2676617426007263e-06, "loss": 0.95835692, "num_input_tokens_seen": 3368595, "router_z_loss_clip": 4.03710938, "router_z_loss_mlp": 0.78710938, "step": 160, "time_per_iteration": 2.71919846534729 }, { "auxiliary_loss_clip": 0.0160286, "auxiliary_loss_mlp": 0.0072706, "balance_loss_clip": 1.19454634, "balance_loss_mlp": 0.6561541, "epoch": 0.009679843679543063, "flos": 19135001329920.0, "grad_norm": 105.50496687828468, "language_loss": 0.99424708, "learning_rate": 3.2716732956621042e-06, "loss": 1.01754618, "num_input_tokens_seen": 3384975, "router_z_loss_clip": 4.08007812, "router_z_loss_mlp": 0.70849609, "step": 161, "time_per_iteration": 2.6637136936187744 }, { "auxiliary_loss_clip": 0.01608946, "auxiliary_loss_mlp": 0.007541, "balance_loss_clip": 1.19606805, "balance_loss_mlp": 0.6782825, "epoch": 0.009739966932211033, "flos": 20302995876480.0, "grad_norm": 104.5269765446991, "language_loss": 1.00560844, "learning_rate": 3.2756600092264203e-06, "loss": 1.02923894, "num_input_tokens_seen": 3404755, "router_z_loss_clip": 4.12890625, "router_z_loss_mlp": 0.7578125, "step": 162, "time_per_iteration": 2.685962438583374 }, { "auxiliary_loss_clip": 0.01442512, "auxiliary_loss_mlp": 0.00651847, "balance_loss_clip": 1.15969419, "balance_loss_mlp": 0.61007589, "epoch": 0.009800090184879002, "flos": 67034234177280.0, "grad_norm": 1.1970724915005706, "language_loss": 0.71661985, "learning_rate": 3.279622189013474e-06, "loss": 0.73756337, "num_input_tokens_seen": 3467210, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.41796875, "step": 163, "time_per_iteration": 3.178384304046631 }, { "auxiliary_loss_clip": 0.01580206, "auxiliary_loss_mlp": 0.00733139, "balance_loss_clip": 1.18762159, "balance_loss_mlp": 0.66385496, "epoch": 0.00986021343754697, "flos": 17164690646400.0, "grad_norm": 647.0366806010128, "language_loss": 0.95580047, "learning_rate": 3.283560135133457e-06, "loss": 0.97893393, "num_input_tokens_seen": 3483220, "router_z_loss_clip": 3.93164062, "router_z_loss_mlp": 0.69238281, "step": 164, "time_per_iteration": 2.750044822692871 }, { "auxiliary_loss_clip": 0.01591246, "auxiliary_loss_mlp": 0.00705772, "balance_loss_clip": 1.1910032, "balance_loss_mlp": 0.63653523, "epoch": 0.00992033669021494, "flos": 17749424148480.0, "grad_norm": 181.3048081787111, "language_loss": 0.97304893, "learning_rate": 3.2874741422233565e-06, "loss": 0.99601912, "num_input_tokens_seen": 3501465, "router_z_loss_clip": 4.00390625, "router_z_loss_mlp": 0.69189453, "step": 165, "time_per_iteration": 2.6562674045562744 }, { "auxiliary_loss_clip": 0.01591912, "auxiliary_loss_mlp": 0.00690902, "balance_loss_clip": 1.19175887, "balance_loss_mlp": 0.6221897, "epoch": 0.00998045994288291, "flos": 25297164080640.0, "grad_norm": 26.034375133662355, "language_loss": 0.8747412, "learning_rate": 3.2913644995792465e-06, "loss": 0.8975693, "num_input_tokens_seen": 3520480, "router_z_loss_clip": 4.00390625, "router_z_loss_mlp": 0.68701172, "step": 166, "time_per_iteration": 2.6884992122650146 }, { "auxiliary_loss_clip": 0.01583873, "auxiliary_loss_mlp": 0.00675401, "balance_loss_clip": 1.18845034, "balance_loss_mlp": 0.60530549, "epoch": 0.01004058319555088, "flos": 32298954220800.0, "grad_norm": 3743.241036658859, "language_loss": 0.99003947, "learning_rate": 3.2952314912845914e-06, "loss": 1.01263213, "num_input_tokens_seen": 3539570, "router_z_loss_clip": 3.95703125, "router_z_loss_mlp": 0.70117188, "step": 167, "time_per_iteration": 2.7227530479431152 }, { "auxiliary_loss_clip": 0.01573326, "auxiliary_loss_mlp": 0.00656226, "balance_loss_clip": 1.18747842, "balance_loss_mlp": 0.59461874, "epoch": 0.010100706448218848, "flos": 11319941404800.0, "grad_norm": 45.28732651477756, "language_loss": 1.00718939, "learning_rate": 3.299075396334735e-06, "loss": 1.02948487, "num_input_tokens_seen": 3555465, "router_z_loss_clip": 3.86328125, "router_z_loss_mlp": 0.61572266, "step": 168, "time_per_iteration": 2.5909481048583984 }, { "auxiliary_loss_clip": 0.01566265, "auxiliary_loss_mlp": 0.00623007, "balance_loss_clip": 1.18383694, "balance_loss_mlp": 0.56445163, "epoch": 0.010160829700886819, "flos": 29719491765120.0, "grad_norm": 16.852707871288015, "language_loss": 0.93939906, "learning_rate": 3.3028964887576868e-06, "loss": 0.96129179, "num_input_tokens_seen": 3578970, "router_z_loss_clip": 3.82617188, "router_z_loss_mlp": 0.5859375, "step": 169, "time_per_iteration": 2.7504444122314453 }, { "auxiliary_loss_clip": 0.01566277, "auxiliary_loss_mlp": 0.006676, "balance_loss_clip": 1.18478215, "balance_loss_mlp": 0.6027028, "epoch": 0.010220952953554787, "flos": 20412343854720.0, "grad_norm": 96.56175336320626, "language_loss": 0.91784328, "learning_rate": 3.306695037731344e-06, "loss": 0.94018209, "num_input_tokens_seen": 3597275, "router_z_loss_clip": 3.8125, "router_z_loss_mlp": 0.64892578, "step": 170, "time_per_iteration": 2.618197202682495 }, { "auxiliary_loss_clip": 0.01591934, "auxiliary_loss_mlp": 0.00661083, "balance_loss_clip": 1.19585764, "balance_loss_mlp": 0.59423083, "epoch": 0.010281076206222756, "flos": 31285124847360.0, "grad_norm": 12.454374482471781, "language_loss": 0.95813894, "learning_rate": 3.3104713076972827e-06, "loss": 0.98066902, "num_input_tokens_seen": 3618905, "router_z_loss_clip": 3.95898438, "router_z_loss_mlp": 0.66894531, "step": 171, "time_per_iteration": 2.744884490966797 }, { "auxiliary_loss_clip": 0.01571606, "auxiliary_loss_mlp": 0.00647152, "balance_loss_clip": 1.19140601, "balance_loss_mlp": 0.58172965, "epoch": 0.010341199458890726, "flos": 21982286568960.0, "grad_norm": 1489.8922747559545, "language_loss": 0.96083951, "learning_rate": 3.314225558471224e-06, "loss": 0.9830271, "num_input_tokens_seen": 3639610, "router_z_loss_clip": 3.80273438, "router_z_loss_mlp": 0.65429688, "step": 172, "time_per_iteration": 2.63980770111084 }, { "auxiliary_loss_clip": 0.01543207, "auxiliary_loss_mlp": 0.00631501, "balance_loss_clip": 1.17982912, "balance_loss_mlp": 0.57056147, "epoch": 0.010401322711558695, "flos": 30810529422720.0, "grad_norm": 39.35570339201625, "language_loss": 0.88731027, "learning_rate": 3.317958045350308e-06, "loss": 0.90905738, "num_input_tokens_seen": 3664030, "router_z_loss_clip": 3.6328125, "router_z_loss_mlp": 0.609375, "step": 173, "time_per_iteration": 2.7362773418426514 }, { "auxiliary_loss_clip": 0.01579008, "auxiliary_loss_mlp": 0.00634998, "balance_loss_clip": 1.19688904, "balance_loss_mlp": 0.57257998, "epoch": 0.010461445964226665, "flos": 24715124098560.0, "grad_norm": 34.04474970822509, "language_loss": 0.90769964, "learning_rate": 3.3216690192172596e-06, "loss": 0.92983967, "num_input_tokens_seen": 3683615, "router_z_loss_clip": 3.82421875, "router_z_loss_mlp": 0.62402344, "step": 174, "time_per_iteration": 2.6625475883483887 }, { "auxiliary_loss_clip": 0.01559468, "auxiliary_loss_mlp": 0.00627325, "balance_loss_clip": 1.18548846, "balance_loss_mlp": 0.5672431, "epoch": 0.010521569216894634, "flos": 27710361457920.0, "grad_norm": 25.326286542282165, "language_loss": 0.79154444, "learning_rate": 3.325358726641591e-06, "loss": 0.81341231, "num_input_tokens_seen": 3704540, "router_z_loss_clip": 3.7421875, "router_z_loss_mlp": 0.60009766, "step": 175, "time_per_iteration": 2.7443044185638428 }, { "auxiliary_loss_clip": 0.01545853, "auxiliary_loss_mlp": 0.00624451, "balance_loss_clip": 1.17718959, "balance_loss_mlp": 0.55869508, "epoch": 0.010581692469562603, "flos": 12458346122880.0, "grad_norm": 58.38761549940081, "language_loss": 1.06906199, "learning_rate": 3.329027409977902e-06, "loss": 1.09076512, "num_input_tokens_seen": 3721320, "router_z_loss_clip": 3.68554688, "router_z_loss_mlp": 0.65722656, "step": 176, "time_per_iteration": 2.642580270767212 }, { "auxiliary_loss_clip": 0.01553325, "auxiliary_loss_mlp": 0.00616911, "balance_loss_clip": 1.18271804, "balance_loss_mlp": 0.55549425, "epoch": 0.010641815722230573, "flos": 19427601519360.0, "grad_norm": 86.77421767160196, "language_loss": 0.85222119, "learning_rate": 3.3326753074614087e-06, "loss": 0.87392354, "num_input_tokens_seen": 3739385, "router_z_loss_clip": 3.70703125, "router_z_loss_mlp": 0.61474609, "step": 177, "time_per_iteration": 2.6845474243164062 }, { "auxiliary_loss_clip": 0.0157588, "auxiliary_loss_mlp": 0.00596528, "balance_loss_clip": 1.18894255, "balance_loss_mlp": 0.53415787, "epoch": 0.010701938974898541, "flos": 18332577452160.0, "grad_norm": 38.932481738464716, "language_loss": 0.89594585, "learning_rate": 3.3363026533007716e-06, "loss": 0.91767001, "num_input_tokens_seen": 3756360, "router_z_loss_clip": 3.87304688, "router_z_loss_mlp": 0.62353516, "step": 178, "time_per_iteration": 2.6692745685577393 }, { "auxiliary_loss_clip": 0.01564698, "auxiliary_loss_mlp": 0.00607416, "balance_loss_clip": 1.18618131, "balance_loss_mlp": 0.54557061, "epoch": 0.010762062227566512, "flos": 19203985399680.0, "grad_norm": 13.056752990531393, "language_loss": 0.92749262, "learning_rate": 3.3399096777683303e-06, "loss": 0.9492138, "num_input_tokens_seen": 3773930, "router_z_loss_clip": 3.7890625, "router_z_loss_mlp": 0.61914062, "step": 179, "time_per_iteration": 2.634744882583618 }, { "auxiliary_loss_clip": 0.01580928, "auxiliary_loss_mlp": 0.00584861, "balance_loss_clip": 1.19354916, "balance_loss_mlp": 0.52168041, "epoch": 0.01082218548023448, "flos": 31425427370880.0, "grad_norm": 268.3947636101802, "language_loss": 0.92995822, "learning_rate": 3.3434966072878213e-06, "loss": 0.95161617, "num_input_tokens_seen": 3793630, "router_z_loss_clip": 3.875, "router_z_loss_mlp": 0.63110352, "step": 180, "time_per_iteration": 2.727526903152466 }, { "auxiliary_loss_clip": 0.01589344, "auxiliary_loss_mlp": 0.00616983, "balance_loss_clip": 1.20378017, "balance_loss_mlp": 0.55179977, "epoch": 0.01088230873290245, "flos": 25046436170880.0, "grad_norm": 12.207737165871551, "language_loss": 0.8537823, "learning_rate": 3.3470636645196674e-06, "loss": 0.87584555, "num_input_tokens_seen": 3813610, "router_z_loss_clip": 3.85742188, "router_z_loss_mlp": 0.65136719, "step": 181, "time_per_iteration": 2.725041151046753 }, { "auxiliary_loss_clip": 0.01594552, "auxiliary_loss_mlp": 0.00587309, "balance_loss_clip": 1.20277941, "balance_loss_mlp": 0.52422357, "epoch": 0.01094243198557042, "flos": 22893411980160.0, "grad_norm": 28.91984465366057, "language_loss": 0.88539469, "learning_rate": 3.3506110684439156e-06, "loss": 0.90721333, "num_input_tokens_seen": 3831390, "router_z_loss_clip": 3.91601562, "router_z_loss_mlp": 0.63085938, "step": 182, "time_per_iteration": 2.7122724056243896 }, { "auxiliary_loss_clip": 0.01611154, "auxiliary_loss_mlp": 0.00574381, "balance_loss_clip": 1.21825707, "balance_loss_mlp": 0.51196289, "epoch": 0.011002555238238388, "flos": 17165049782400.0, "grad_norm": 207.6845951392771, "language_loss": 0.96730793, "learning_rate": 3.3541390344409054e-06, "loss": 0.98916328, "num_input_tokens_seen": 3849705, "router_z_loss_clip": 3.92773438, "router_z_loss_mlp": 0.62451172, "step": 183, "time_per_iteration": 2.6229159832000732 }, { "auxiliary_loss_clip": 0.01602611, "auxiliary_loss_mlp": 0.00606016, "balance_loss_clip": 1.21063662, "balance_loss_mlp": 0.54040277, "epoch": 0.011062678490906358, "flos": 22310150935680.0, "grad_norm": 131.47900804308713, "language_loss": 0.94533432, "learning_rate": 3.357647774369736e-06, "loss": 0.96742058, "num_input_tokens_seen": 3869230, "router_z_loss_clip": 3.921875, "router_z_loss_mlp": 0.65625, "step": 184, "time_per_iteration": 2.7038180828094482 }, { "auxiliary_loss_clip": 0.01613853, "auxiliary_loss_mlp": 0.00538514, "balance_loss_clip": 1.22603488, "balance_loss_mlp": 0.47926664, "epoch": 0.011122801743574327, "flos": 24388373053440.0, "grad_norm": 27.018539465882636, "language_loss": 0.91888285, "learning_rate": 3.3611374966446085e-06, "loss": 0.94040644, "num_input_tokens_seen": 3889735, "router_z_loss_clip": 3.87890625, "router_z_loss_mlp": 0.59301758, "step": 185, "time_per_iteration": 2.6613054275512695 }, { "auxiliary_loss_clip": 0.01622394, "auxiliary_loss_mlp": 0.00608331, "balance_loss_clip": 1.22446799, "balance_loss_mlp": 0.54028678, "epoch": 0.011182924996242297, "flos": 18150258994560.0, "grad_norm": 68.79651855810427, "language_loss": 0.80130154, "learning_rate": 3.3646084063091142e-06, "loss": 0.82360876, "num_input_tokens_seen": 3908855, "router_z_loss_clip": 3.98046875, "router_z_loss_mlp": 0.68066406, "step": 186, "time_per_iteration": 2.6364312171936035 }, { "auxiliary_loss_clip": 0.01612141, "auxiliary_loss_mlp": 0.0058272, "balance_loss_clip": 1.22270823, "balance_loss_mlp": 0.52201867, "epoch": 0.011243048248910266, "flos": 15486800584320.0, "grad_norm": 75.16434513413567, "language_loss": 1.10130453, "learning_rate": 3.3680607051085194e-06, "loss": 1.12325311, "num_input_tokens_seen": 3923865, "router_z_loss_clip": 3.89453125, "router_z_loss_mlp": 0.60644531, "step": 187, "time_per_iteration": 4.127197027206421 }, { "auxiliary_loss_clip": 0.01619047, "auxiliary_loss_mlp": 0.0056733, "balance_loss_clip": 1.23301578, "balance_loss_mlp": 0.50720108, "epoch": 0.011303171501578235, "flos": 40916868986880.0, "grad_norm": 13.68927363712344, "language_loss": 0.81994116, "learning_rate": 3.371494591560139e-06, "loss": 0.84180486, "num_input_tokens_seen": 3946870, "router_z_loss_clip": 3.86328125, "router_z_loss_mlp": 0.60083008, "step": 188, "time_per_iteration": 4.269714832305908 }, { "auxiliary_loss_clip": 0.01461046, "auxiliary_loss_mlp": 0.00506188, "balance_loss_clip": 1.17419195, "balance_loss_mlp": 0.47700557, "epoch": 0.011363294754246205, "flos": 66302697790080.0, "grad_norm": 0.7738953343683723, "language_loss": 0.55504274, "learning_rate": 3.3749102610218297e-06, "loss": 0.57471502, "num_input_tokens_seen": 4010005, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.29101562, "step": 189, "time_per_iteration": 3.2164976596832275 }, { "auxiliary_loss_clip": 0.01620531, "auxiliary_loss_mlp": 0.00570427, "balance_loss_clip": 1.23081672, "balance_loss_mlp": 0.50815183, "epoch": 0.011423418006914174, "flos": 24900279730560.0, "grad_norm": 21.9776498791501, "language_loss": 1.04523301, "learning_rate": 3.3783079057586833e-06, "loss": 1.06714249, "num_input_tokens_seen": 4029035, "router_z_loss_clip": 3.89648438, "router_z_loss_mlp": 0.62353516, "step": 190, "time_per_iteration": 2.6951098442077637 }, { "auxiliary_loss_clip": 0.01595422, "auxiliary_loss_mlp": 0.00588818, "balance_loss_clip": 1.21567011, "balance_loss_mlp": 0.53209782, "epoch": 0.011483541259582144, "flos": 19791879298560.0, "grad_norm": 257.96569814789456, "language_loss": 0.91753006, "learning_rate": 3.3816877150079665e-06, "loss": 0.93937242, "num_input_tokens_seen": 4046995, "router_z_loss_clip": 3.79492188, "router_z_loss_mlp": 0.56738281, "step": 191, "time_per_iteration": 2.6398918628692627 }, { "auxiliary_loss_clip": 0.0159607, "auxiliary_loss_mlp": 0.00563988, "balance_loss_clip": 1.21540308, "balance_loss_mlp": 0.50831723, "epoch": 0.011543664512250112, "flos": 26176939896960.0, "grad_norm": 4.655397784307342, "language_loss": 0.98557943, "learning_rate": 3.385049875042367e-06, "loss": 1.00717998, "num_input_tokens_seen": 4065865, "router_z_loss_clip": 3.80273438, "router_z_loss_mlp": 0.55761719, "step": 192, "time_per_iteration": 2.7107768058776855 }, { "auxiliary_loss_clip": 0.01583962, "auxiliary_loss_mlp": 0.00638442, "balance_loss_clip": 1.20853102, "balance_loss_mlp": 0.57492709, "epoch": 0.011603787764918083, "flos": 23768985905280.0, "grad_norm": 154.5776993070653, "language_loss": 0.950854, "learning_rate": 3.3883945692315938e-06, "loss": 0.97307807, "num_input_tokens_seen": 4085305, "router_z_loss_clip": 3.76171875, "router_z_loss_mlp": 0.63525391, "step": 193, "time_per_iteration": 2.6503746509552 }, { "auxiliary_loss_clip": 0.01576143, "auxiliary_loss_mlp": 0.00598983, "balance_loss_clip": 1.20520329, "balance_loss_mlp": 0.54624504, "epoch": 0.011663911017586051, "flos": 25954688494080.0, "grad_norm": 13.18657964905435, "language_loss": 1.00363481, "learning_rate": 3.3917219781023906e-06, "loss": 1.0253861, "num_input_tokens_seen": 4105185, "router_z_loss_clip": 3.7109375, "router_z_loss_mlp": 0.52758789, "step": 194, "time_per_iteration": 2.678149700164795 }, { "auxiliary_loss_clip": 0.01569893, "auxiliary_loss_mlp": 0.00636595, "balance_loss_clip": 1.19969916, "balance_loss_mlp": 0.58163995, "epoch": 0.01172403427025402, "flos": 17895149625600.0, "grad_norm": 286.0909421657423, "language_loss": 1.01452959, "learning_rate": 3.3950322793970014e-06, "loss": 1.03659451, "num_input_tokens_seen": 4123160, "router_z_loss_clip": 3.69921875, "router_z_loss_mlp": 0.55029297, "step": 195, "time_per_iteration": 2.6224329471588135 }, { "auxiliary_loss_clip": 0.01565051, "auxiliary_loss_mlp": 0.00619708, "balance_loss_clip": 1.20195889, "balance_loss_mlp": 0.56611192, "epoch": 0.01178415752292199, "flos": 17894539094400.0, "grad_norm": 71.11732831761992, "language_loss": 0.955984, "learning_rate": 3.3983256481301445e-06, "loss": 0.97783154, "num_input_tokens_seen": 4140425, "router_z_loss_clip": 3.62890625, "router_z_loss_mlp": 0.53564453, "step": 196, "time_per_iteration": 2.6526803970336914 }, { "auxiliary_loss_clip": 0.01556823, "auxiliary_loss_mlp": 0.0066075, "balance_loss_clip": 1.19599283, "balance_loss_mlp": 0.60758305, "epoch": 0.011844280775589959, "flos": 22893555634560.0, "grad_norm": 64.45095057168676, "language_loss": 1.01164472, "learning_rate": 3.4016022566445335e-06, "loss": 1.03382051, "num_input_tokens_seen": 4159555, "router_z_loss_clip": 3.61328125, "router_z_loss_mlp": 0.53125, "step": 197, "time_per_iteration": 2.6944479942321777 }, { "auxiliary_loss_clip": 0.01547856, "auxiliary_loss_mlp": 0.00655646, "balance_loss_clip": 1.18900299, "balance_loss_mlp": 0.60021365, "epoch": 0.01190440402825793, "flos": 26980333441920.0, "grad_norm": 56.588184045404624, "language_loss": 0.87437308, "learning_rate": 3.4048622746649966e-06, "loss": 0.89640808, "num_input_tokens_seen": 4180480, "router_z_loss_clip": 3.58789062, "router_z_loss_mlp": 0.55517578, "step": 198, "time_per_iteration": 2.770284414291382 }, { "auxiliary_loss_clip": 0.01544437, "auxiliary_loss_mlp": 0.00664084, "balance_loss_clip": 1.19115996, "balance_loss_mlp": 0.61556578, "epoch": 0.011964527280925898, "flos": 20521584092160.0, "grad_norm": 65.43650243893312, "language_loss": 0.94706368, "learning_rate": 3.4081058693512278e-06, "loss": 0.96914881, "num_input_tokens_seen": 4198835, "router_z_loss_clip": 3.53515625, "router_z_loss_mlp": 0.48510742, "step": 199, "time_per_iteration": 2.783076763153076 }, { "auxiliary_loss_clip": 0.0155405, "auxiliary_loss_mlp": 0.00742345, "balance_loss_clip": 1.19207299, "balance_loss_mlp": 0.68479085, "epoch": 0.012024650533593867, "flos": 27745984771200.0, "grad_norm": 18.741226555700443, "language_loss": 0.88611114, "learning_rate": 3.411333205349222e-06, "loss": 0.90907514, "num_input_tokens_seen": 4219335, "router_z_loss_clip": 3.6171875, "router_z_loss_mlp": 0.57617188, "step": 200, "time_per_iteration": 2.698286294937134 }, { "auxiliary_loss_clip": 0.01561633, "auxiliary_loss_mlp": 0.00734543, "balance_loss_clip": 1.1927526, "balance_loss_mlp": 0.67560554, "epoch": 0.012084773786261837, "flos": 10452017076480.0, "grad_norm": 10.506897469222446, "language_loss": 0.98028105, "learning_rate": 3.4145444448414217e-06, "loss": 1.00324273, "num_input_tokens_seen": 4236940, "router_z_loss_clip": 3.68945312, "router_z_loss_mlp": 0.58935547, "step": 201, "time_per_iteration": 2.617231845855713 }, { "auxiliary_loss_clip": 0.01568704, "auxiliary_loss_mlp": 0.00709125, "balance_loss_clip": 1.20021772, "balance_loss_mlp": 0.65486121, "epoch": 0.012144897038929806, "flos": 23105751229440.0, "grad_norm": 58.30983622113242, "language_loss": 0.91112447, "learning_rate": 3.4177397475956223e-06, "loss": 0.9339028, "num_input_tokens_seen": 4256755, "router_z_loss_clip": 3.68554688, "router_z_loss_mlp": 0.54272461, "step": 202, "time_per_iteration": 2.654141426086426 }, { "auxiliary_loss_clip": 0.01563421, "auxiliary_loss_mlp": 0.00745806, "balance_loss_clip": 1.19565558, "balance_loss_mlp": 0.68818057, "epoch": 0.012205020291597776, "flos": 21033203460480.0, "grad_norm": 58.105315983245575, "language_loss": 0.97845, "learning_rate": 3.4209192710126685e-06, "loss": 1.00154233, "num_input_tokens_seen": 4276505, "router_z_loss_clip": 3.67773438, "router_z_loss_mlp": 0.57617188, "step": 203, "time_per_iteration": 2.724932909011841 }, { "auxiliary_loss_clip": 0.01400112, "auxiliary_loss_mlp": 0.00602754, "balance_loss_clip": 1.12785184, "balance_loss_mlp": 0.572999, "epoch": 0.012265143544265745, "flos": 68447785075200.0, "grad_norm": 1.004336381848441, "language_loss": 0.60621649, "learning_rate": 3.4240831701729837e-06, "loss": 0.62624514, "num_input_tokens_seen": 4330965, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.296875, "step": 204, "time_per_iteration": 3.1386635303497314 }, { "auxiliary_loss_clip": 0.0156447, "auxiliary_loss_mlp": 0.00714389, "balance_loss_clip": 1.19143367, "balance_loss_mlp": 0.65609545, "epoch": 0.012325266796933715, "flos": 17019252478080.0, "grad_norm": 1007.4251654515991, "language_loss": 0.99754077, "learning_rate": 3.4272315978819516e-06, "loss": 1.02032936, "num_input_tokens_seen": 4348200, "router_z_loss_clip": 3.72460938, "router_z_loss_mlp": 0.58276367, "step": 205, "time_per_iteration": 2.656306266784668 }, { "auxiliary_loss_clip": 0.01572238, "auxiliary_loss_mlp": 0.00617556, "balance_loss_clip": 1.19592643, "balance_loss_mlp": 0.56238568, "epoch": 0.012385390049601683, "flos": 20190056538240.0, "grad_norm": 7.591315498388621, "language_loss": 0.97027284, "learning_rate": 3.4303647047142043e-06, "loss": 0.99217075, "num_input_tokens_seen": 4365460, "router_z_loss_clip": 3.76757812, "router_z_loss_mlp": 0.55224609, "step": 206, "time_per_iteration": 2.612391233444214 }, { "auxiliary_loss_clip": 0.01566899, "auxiliary_loss_mlp": 0.00642157, "balance_loss_clip": 1.19064474, "balance_loss_mlp": 0.58510369, "epoch": 0.012445513302269652, "flos": 16253134272000.0, "grad_norm": 19.111539624077594, "language_loss": 1.05933738, "learning_rate": 3.43348263905683e-06, "loss": 1.08142793, "num_input_tokens_seen": 4383650, "router_z_loss_clip": 3.76171875, "router_z_loss_mlp": 0.57055664, "step": 207, "time_per_iteration": 2.631415367126465 }, { "auxiliary_loss_clip": 0.0155764, "auxiliary_loss_mlp": 0.00654626, "balance_loss_clip": 1.1859194, "balance_loss_mlp": 0.59206522, "epoch": 0.012505636554937622, "flos": 23769380954880.0, "grad_norm": 39.73522262294019, "language_loss": 0.82715881, "learning_rate": 3.436585547151547e-06, "loss": 0.84928143, "num_input_tokens_seen": 4403765, "router_z_loss_clip": 3.71679688, "router_z_loss_mlp": 0.62548828, "step": 208, "time_per_iteration": 2.6905648708343506 }, { "auxiliary_loss_clip": 0.0155132, "auxiliary_loss_mlp": 0.00652378, "balance_loss_clip": 1.18596292, "balance_loss_mlp": 0.59167635, "epoch": 0.012565759807605591, "flos": 30591546157440.0, "grad_norm": 10.55452011055248, "language_loss": 1.06887817, "learning_rate": 3.4396735731358586e-06, "loss": 1.09091508, "num_input_tokens_seen": 4421935, "router_z_loss_clip": 3.65234375, "router_z_loss_mlp": 0.60742188, "step": 209, "time_per_iteration": 2.761862277984619 }, { "auxiliary_loss_clip": 0.01557561, "auxiliary_loss_mlp": 0.0056308, "balance_loss_clip": 1.18741155, "balance_loss_mlp": 0.50745672, "epoch": 0.012625883060273561, "flos": 40113511355520.0, "grad_norm": 33.648708585269524, "language_loss": 0.94585407, "learning_rate": 3.4427468590832302e-06, "loss": 0.96706045, "num_input_tokens_seen": 4441470, "router_z_loss_clip": 3.70898438, "router_z_loss_mlp": 0.5559082, "step": 210, "time_per_iteration": 2.7711572647094727 }, { "auxiliary_loss_clip": 0.01551024, "auxiliary_loss_mlp": 0.00572182, "balance_loss_clip": 1.18215847, "balance_loss_mlp": 0.51457977, "epoch": 0.01268600631294153, "flos": 27089178629760.0, "grad_norm": 38.57271449529399, "language_loss": 1.0240258, "learning_rate": 3.445805545042314e-06, "loss": 1.04525781, "num_input_tokens_seen": 4459950, "router_z_loss_clip": 3.68945312, "router_z_loss_mlp": 0.57617188, "step": 211, "time_per_iteration": 2.779919385910034 }, { "auxiliary_loss_clip": 0.01563064, "auxiliary_loss_mlp": 0.00592989, "balance_loss_clip": 1.19039619, "balance_loss_mlp": 0.53071451, "epoch": 0.012746129565609499, "flos": 16982767238400.0, "grad_norm": 68.35424893992301, "language_loss": 1.03842223, "learning_rate": 3.448849769075239e-06, "loss": 1.05998278, "num_input_tokens_seen": 4478390, "router_z_loss_clip": 3.73046875, "router_z_loss_mlp": 0.62255859, "step": 212, "time_per_iteration": 2.607290267944336 }, { "auxiliary_loss_clip": 0.01558939, "auxiliary_loss_mlp": 0.0054631, "balance_loss_clip": 1.19377351, "balance_loss_mlp": 0.49190316, "epoch": 0.012806252818277469, "flos": 46533476995200.0, "grad_norm": 9.16904552373241, "language_loss": 0.84267753, "learning_rate": 3.4518796672950093e-06, "loss": 0.86373001, "num_input_tokens_seen": 4501665, "router_z_loss_clip": 3.6484375, "router_z_loss_mlp": 0.54443359, "step": 213, "time_per_iteration": 2.8693034648895264 }, { "auxiliary_loss_clip": 0.01556984, "auxiliary_loss_mlp": 0.0055737, "balance_loss_clip": 1.18931401, "balance_loss_mlp": 0.50422692, "epoch": 0.012866376070945438, "flos": 14388616120320.0, "grad_norm": 234.90236881503495, "language_loss": 0.95823032, "learning_rate": 3.4548953739020187e-06, "loss": 0.97937381, "num_input_tokens_seen": 4519055, "router_z_loss_clip": 3.6796875, "router_z_loss_mlp": 0.53125, "step": 214, "time_per_iteration": 2.6336557865142822 }, { "auxiliary_loss_clip": 0.01559645, "auxiliary_loss_mlp": 0.00549369, "balance_loss_clip": 1.19710743, "balance_loss_mlp": 0.49698812, "epoch": 0.012926499323613408, "flos": 26140813793280.0, "grad_norm": 354.88863794634847, "language_loss": 0.86068559, "learning_rate": 3.4578970212197196e-06, "loss": 0.8817758, "num_input_tokens_seen": 4540870, "router_z_loss_clip": 3.62695312, "router_z_loss_mlp": 0.52416992, "step": 215, "time_per_iteration": 2.6882364749908447 }, { "auxiliary_loss_clip": 0.01588028, "auxiliary_loss_mlp": 0.00634422, "balance_loss_clip": 1.21162534, "balance_loss_mlp": 0.5725044, "epoch": 0.012986622576281377, "flos": 30117202128000.0, "grad_norm": 172.79752638185997, "language_loss": 1.00035024, "learning_rate": 3.460884739729461e-06, "loss": 1.02257466, "num_input_tokens_seen": 4560395, "router_z_loss_clip": 3.76171875, "router_z_loss_mlp": 0.61889648, "step": 216, "time_per_iteration": 2.752753257751465 }, { "auxiliary_loss_clip": 0.01594513, "auxiliary_loss_mlp": 0.00586919, "balance_loss_clip": 1.216097, "balance_loss_mlp": 0.52864909, "epoch": 0.013046745828949347, "flos": 13954025468160.0, "grad_norm": 51.54354903624533, "language_loss": 1.06081915, "learning_rate": 3.463858658104523e-06, "loss": 1.0826335, "num_input_tokens_seen": 4575785, "router_z_loss_clip": 3.78320312, "router_z_loss_mlp": 0.58300781, "step": 217, "time_per_iteration": 2.6360628604888916 }, { "auxiliary_loss_clip": 0.0159155, "auxiliary_loss_mlp": 0.00590419, "balance_loss_clip": 1.21482003, "balance_loss_mlp": 0.53079009, "epoch": 0.013106869081617315, "flos": 17347835116800.0, "grad_norm": 77.50927215667582, "language_loss": 1.0089674, "learning_rate": 3.4668189032433696e-06, "loss": 1.03078699, "num_input_tokens_seen": 4594985, "router_z_loss_clip": 3.765625, "router_z_loss_mlp": 0.59594727, "step": 218, "time_per_iteration": 2.6822335720062256 }, { "auxiliary_loss_clip": 0.01596823, "auxiliary_loss_mlp": 0.00571851, "balance_loss_clip": 1.22095132, "balance_loss_mlp": 0.51525068, "epoch": 0.013166992334285284, "flos": 25884914325120.0, "grad_norm": 237.59621494351723, "language_loss": 0.93932462, "learning_rate": 3.46976560030214e-06, "loss": 0.96101141, "num_input_tokens_seen": 4616125, "router_z_loss_clip": 3.75976562, "router_z_loss_mlp": 0.56591797, "step": 219, "time_per_iteration": 2.6951491832733154 }, { "auxiliary_loss_clip": 0.01627395, "auxiliary_loss_mlp": 0.005767, "balance_loss_clip": 1.24566078, "balance_loss_mlp": 0.52119625, "epoch": 0.013227115586953254, "flos": 31175956437120.0, "grad_norm": 12.711095472783795, "language_loss": 0.94856256, "learning_rate": 3.4726988727263976e-06, "loss": 0.97060347, "num_input_tokens_seen": 4637795, "router_z_loss_clip": 3.81640625, "router_z_loss_mlp": 0.55419922, "step": 220, "time_per_iteration": 2.7985880374908447 }, { "auxiliary_loss_clip": 0.01636169, "auxiliary_loss_mlp": 0.00548354, "balance_loss_clip": 1.25030375, "balance_loss_mlp": 0.4983815, "epoch": 0.013287238839621223, "flos": 20409470766720.0, "grad_norm": 221.0083291963642, "language_loss": 0.93181264, "learning_rate": 3.475618842282164e-06, "loss": 0.95365787, "num_input_tokens_seen": 4656835, "router_z_loss_clip": 3.86132812, "router_z_loss_mlp": 0.5, "step": 221, "time_per_iteration": 2.64241886138916 }, { "auxiliary_loss_clip": 0.01685497, "auxiliary_loss_mlp": 0.00559162, "balance_loss_clip": 1.28239822, "balance_loss_mlp": 0.50141662, "epoch": 0.013347362092289193, "flos": 14137134024960.0, "grad_norm": 23.650828221004197, "language_loss": 1.00632095, "learning_rate": 3.4785256290862486e-06, "loss": 1.02876759, "num_input_tokens_seen": 4673015, "router_z_loss_clip": 4.02929688, "router_z_loss_mlp": 0.57714844, "step": 222, "time_per_iteration": 2.6198606491088867 }, { "auxiliary_loss_clip": 0.01698042, "auxiliary_loss_mlp": 0.00598077, "balance_loss_clip": 1.29351223, "balance_loss_mlp": 0.53737533, "epoch": 0.013407485344957162, "flos": 21797705554560.0, "grad_norm": 58.051248063839516, "language_loss": 1.02626121, "learning_rate": 3.481419351635897e-06, "loss": 1.04922235, "num_input_tokens_seen": 4692355, "router_z_loss_clip": 4.04492188, "router_z_loss_mlp": 0.60693359, "step": 223, "time_per_iteration": 2.6706135272979736 }, { "auxiliary_loss_clip": 0.01721336, "auxiliary_loss_mlp": 0.005916, "balance_loss_clip": 1.30073416, "balance_loss_mlp": 0.53404534, "epoch": 0.013467608597625132, "flos": 18621622195200.0, "grad_norm": 6535.04896122925, "language_loss": 0.98268652, "learning_rate": 3.484300126837776e-06, "loss": 1.00581586, "num_input_tokens_seen": 4710080, "router_z_loss_clip": 4.20898438, "router_z_loss_mlp": 0.57543945, "step": 224, "time_per_iteration": 2.636991262435913 }, { "auxiliary_loss_clip": 0.01760255, "auxiliary_loss_mlp": 0.00595519, "balance_loss_clip": 1.32432592, "balance_loss_mlp": 0.53705883, "epoch": 0.013527731850293101, "flos": 18552314903040.0, "grad_norm": 54.000050666786414, "language_loss": 0.9857409, "learning_rate": 3.487168070036317e-06, "loss": 1.00929856, "num_input_tokens_seen": 4728980, "router_z_loss_clip": 4.359375, "router_z_loss_mlp": 0.58447266, "step": 225, "time_per_iteration": 2.746201515197754 }, { "auxiliary_loss_clip": 0.01767756, "auxiliary_loss_mlp": 0.00577927, "balance_loss_clip": 1.33693051, "balance_loss_mlp": 0.52197039, "epoch": 0.01358785510296107, "flos": 19165381257600.0, "grad_norm": 3167.3129793268704, "language_loss": 1.0526216, "learning_rate": 3.4900232950414224e-06, "loss": 1.07607841, "num_input_tokens_seen": 4747020, "router_z_loss_clip": 4.3046875, "router_z_loss_mlp": 0.55883789, "step": 226, "time_per_iteration": 2.657371759414673 }, { "auxiliary_loss_clip": 0.01810365, "auxiliary_loss_mlp": 0.00585321, "balance_loss_clip": 1.35201335, "balance_loss_mlp": 0.52504897, "epoch": 0.01364797835562904, "flos": 23329941966720.0, "grad_norm": 56.58997924325406, "language_loss": 0.99236029, "learning_rate": 3.4928659141555727e-06, "loss": 1.01631713, "num_input_tokens_seen": 4765000, "router_z_loss_clip": 4.58203125, "router_z_loss_mlp": 0.60302734, "step": 227, "time_per_iteration": 2.727694272994995 }, { "auxiliary_loss_clip": 0.01819124, "auxiliary_loss_mlp": 0.00458225, "balance_loss_clip": 1.41110277, "balance_loss_mlp": 0.42770791, "epoch": 0.013708101608297009, "flos": 70993746097920.0, "grad_norm": 0.9981676246161005, "language_loss": 0.57178611, "learning_rate": 3.4956960382003234e-06, "loss": 0.59455955, "num_input_tokens_seen": 4833210, "router_z_loss_clip": 4.0625, "router_z_loss_mlp": 0.3046875, "step": 228, "time_per_iteration": 3.2424325942993164 }, { "auxiliary_loss_clip": 0.01779778, "auxiliary_loss_mlp": 0.00539834, "balance_loss_clip": 1.34304762, "balance_loss_mlp": 0.48747757, "epoch": 0.013768224860964979, "flos": 16325170997760.0, "grad_norm": 167.36505794293024, "language_loss": 0.96456593, "learning_rate": 3.4985137765422354e-06, "loss": 0.98776203, "num_input_tokens_seen": 4850120, "router_z_loss_clip": 4.37109375, "router_z_loss_mlp": 0.5234375, "step": 229, "time_per_iteration": 5.511472702026367 }, { "auxiliary_loss_clip": 0.01819645, "auxiliary_loss_mlp": 0.00590883, "balance_loss_clip": 1.35915709, "balance_loss_mlp": 0.53187484, "epoch": 0.013828348113632948, "flos": 20193037367040.0, "grad_norm": 222.86560992138374, "language_loss": 0.91976517, "learning_rate": 3.501319237118231e-06, "loss": 0.94387048, "num_input_tokens_seen": 4866215, "router_z_loss_clip": 4.59765625, "router_z_loss_mlp": 0.58959961, "step": 230, "time_per_iteration": 4.105376958847046 }, { "auxiliary_loss_clip": 0.01787017, "auxiliary_loss_mlp": 0.00552044, "balance_loss_clip": 1.34928548, "balance_loss_mlp": 0.49890053, "epoch": 0.013888471366300916, "flos": 20741070147840.0, "grad_norm": 9.947321187751191, "language_loss": 0.96040511, "learning_rate": 3.5041125264604056e-06, "loss": 0.98379576, "num_input_tokens_seen": 4885630, "router_z_loss_clip": 4.37304688, "router_z_loss_mlp": 0.53051758, "step": 231, "time_per_iteration": 2.660130739212036 }, { "auxiliary_loss_clip": 0.01787912, "auxiliary_loss_mlp": 0.00562979, "balance_loss_clip": 1.34769464, "balance_loss_mlp": 0.51026428, "epoch": 0.013948594618968886, "flos": 22090628966400.0, "grad_norm": 24.128470373150417, "language_loss": 0.93142104, "learning_rate": 3.5068937497203002e-06, "loss": 0.95492995, "num_input_tokens_seen": 4905570, "router_z_loss_clip": 4.40234375, "router_z_loss_mlp": 0.52661133, "step": 232, "time_per_iteration": 2.609828233718872 }, { "auxiliary_loss_clip": 0.0180468, "auxiliary_loss_mlp": 0.00570568, "balance_loss_clip": 1.34959757, "balance_loss_mlp": 0.51566029, "epoch": 0.014008717871636855, "flos": 19063108258560.0, "grad_norm": 762.0860104031582, "language_loss": 0.83401281, "learning_rate": 3.509663010692652e-06, "loss": 0.85776532, "num_input_tokens_seen": 4923535, "router_z_loss_clip": 4.55078125, "router_z_loss_mlp": 0.54907227, "step": 233, "time_per_iteration": 2.627774715423584 }, { "auxiliary_loss_clip": 0.01778407, "auxiliary_loss_mlp": 0.00584006, "balance_loss_clip": 1.34249258, "balance_loss_mlp": 0.5257839, "epoch": 0.014068841124304825, "flos": 14530822064640.0, "grad_norm": 67.4954048858811, "language_loss": 0.94033861, "learning_rate": 3.512420411838642e-06, "loss": 0.96396279, "num_input_tokens_seen": 4939200, "router_z_loss_clip": 4.35546875, "router_z_loss_mlp": 0.58251953, "step": 234, "time_per_iteration": 2.5852134227752686 }, { "auxiliary_loss_clip": 0.01774071, "auxiliary_loss_mlp": 0.00575399, "balance_loss_clip": 1.34351325, "balance_loss_mlp": 0.51841748, "epoch": 0.014128964376972794, "flos": 18077396256000.0, "grad_norm": 87.2419566677951, "language_loss": 0.9858191, "learning_rate": 3.515166054308634e-06, "loss": 1.00931382, "num_input_tokens_seen": 4956620, "router_z_loss_clip": 4.30078125, "router_z_loss_mlp": 0.56982422, "step": 235, "time_per_iteration": 2.6216847896575928 }, { "auxiliary_loss_clip": 0.01761737, "auxiliary_loss_mlp": 0.0058501, "balance_loss_clip": 1.33448374, "balance_loss_mlp": 0.52988744, "epoch": 0.014189087629640764, "flos": 25334331678720.0, "grad_norm": 54.99610967094566, "language_loss": 0.93348372, "learning_rate": 3.5179000379644498e-06, "loss": 0.95695126, "num_input_tokens_seen": 4975650, "router_z_loss_clip": 4.2734375, "router_z_loss_mlp": 0.55102539, "step": 236, "time_per_iteration": 2.6516685485839844 }, { "auxiliary_loss_clip": 0.01785741, "auxiliary_loss_mlp": 0.00570218, "balance_loss_clip": 1.34407997, "balance_loss_mlp": 0.51530993, "epoch": 0.014249210882308733, "flos": 36139744713600.0, "grad_norm": 373.833610998033, "language_loss": 0.90477121, "learning_rate": 3.520622461401154e-06, "loss": 0.92833072, "num_input_tokens_seen": 4997415, "router_z_loss_clip": 4.421875, "router_z_loss_mlp": 0.54882812, "step": 237, "time_per_iteration": 2.8423547744750977 }, { "auxiliary_loss_clip": 0.01821179, "auxiliary_loss_mlp": 0.0062562, "balance_loss_clip": 1.358881, "balance_loss_mlp": 0.561867, "epoch": 0.014309334134976702, "flos": 12932977461120.0, "grad_norm": 850.7657494068205, "language_loss": 0.83836877, "learning_rate": 3.5233334219683935e-06, "loss": 0.86283678, "num_input_tokens_seen": 5013905, "router_z_loss_clip": 4.6171875, "router_z_loss_mlp": 0.63769531, "step": 238, "time_per_iteration": 2.6175143718719482 }, { "auxiliary_loss_clip": 0.01860131, "auxiliary_loss_mlp": 0.00578941, "balance_loss_clip": 1.39353371, "balance_loss_mlp": 0.52365196, "epoch": 0.014369457387644672, "flos": 20777519473920.0, "grad_norm": 162.63140767673028, "language_loss": 0.93906862, "learning_rate": 3.526033015791284e-06, "loss": 0.96345931, "num_input_tokens_seen": 5033645, "router_z_loss_clip": 4.66015625, "router_z_loss_mlp": 0.55297852, "step": 239, "time_per_iteration": 2.6669375896453857 }, { "auxiliary_loss_clip": 0.01874818, "auxiliary_loss_mlp": 0.00527436, "balance_loss_clip": 1.40589428, "balance_loss_mlp": 0.47488856, "epoch": 0.01442958064031264, "flos": 25848536826240.0, "grad_norm": 120.14794067570288, "language_loss": 1.00595963, "learning_rate": 3.528721337790862e-06, "loss": 1.02998209, "num_input_tokens_seen": 5052875, "router_z_loss_clip": 4.68359375, "router_z_loss_mlp": 0.52587891, "step": 240, "time_per_iteration": 2.7326700687408447 }, { "auxiliary_loss_clip": 0.01920968, "auxiliary_loss_mlp": 0.00618437, "balance_loss_clip": 1.41369092, "balance_loss_mlp": 0.56016791, "epoch": 0.014489703892980611, "flos": 28219718269440.0, "grad_norm": 15.653623606455913, "language_loss": 0.91664922, "learning_rate": 3.531398481704111e-06, "loss": 0.9420433, "num_input_tokens_seen": 5075005, "router_z_loss_clip": 5.0703125, "router_z_loss_mlp": 0.58276367, "step": 241, "time_per_iteration": 2.798095464706421 }, { "auxiliary_loss_clip": 0.01905582, "auxiliary_loss_mlp": 0.00572836, "balance_loss_clip": 1.41286266, "balance_loss_mlp": 0.51611674, "epoch": 0.01454982714564858, "flos": 22490925108480.0, "grad_norm": 22.887515893572623, "language_loss": 0.95485985, "learning_rate": 3.534064540103573e-06, "loss": 0.979644, "num_input_tokens_seen": 5091875, "router_z_loss_clip": 4.92578125, "router_z_loss_mlp": 0.56713867, "step": 242, "time_per_iteration": 2.724915027618408 }, { "auxiliary_loss_clip": 0.01929853, "auxiliary_loss_mlp": 0.00611535, "balance_loss_clip": 1.41785908, "balance_loss_mlp": 0.54782945, "epoch": 0.014609950398316548, "flos": 21653201139840.0, "grad_norm": 47.95976028230824, "language_loss": 0.94221067, "learning_rate": 3.536719604416555e-06, "loss": 0.96762455, "num_input_tokens_seen": 5111290, "router_z_loss_clip": 5.12109375, "router_z_loss_mlp": 0.63720703, "step": 243, "time_per_iteration": 2.686410665512085 }, { "auxiliary_loss_clip": 0.01948453, "auxiliary_loss_mlp": 0.00593192, "balance_loss_clip": 1.41908062, "balance_loss_mlp": 0.53177518, "epoch": 0.014670073650984519, "flos": 21869993675520.0, "grad_norm": 28.447695152734994, "language_loss": 0.89414114, "learning_rate": 3.5393637649439464e-06, "loss": 0.91955757, "num_input_tokens_seen": 5132265, "router_z_loss_clip": 5.296875, "router_z_loss_mlp": 0.61474609, "step": 244, "time_per_iteration": 2.7025434970855713 }, { "auxiliary_loss_clip": 0.01958994, "auxiliary_loss_mlp": 0.00653424, "balance_loss_clip": 1.4127785, "balance_loss_mlp": 0.58113539, "epoch": 0.014730196903652487, "flos": 23183713699200.0, "grad_norm": 28.765803168613186, "language_loss": 0.87027651, "learning_rate": 3.54199711087864e-06, "loss": 0.89640069, "num_input_tokens_seen": 5148575, "router_z_loss_clip": 5.46484375, "router_z_loss_mlp": 0.72265625, "step": 245, "time_per_iteration": 2.640968084335327 }, { "auxiliary_loss_clip": 0.01960035, "auxiliary_loss_mlp": 0.0060384, "balance_loss_clip": 1.41385865, "balance_loss_mlp": 0.53779823, "epoch": 0.014790320156320457, "flos": 23222605150080.0, "grad_norm": 40.73299354052646, "language_loss": 0.89651424, "learning_rate": 3.5446197303235913e-06, "loss": 0.92215294, "num_input_tokens_seen": 5170415, "router_z_loss_clip": 5.46484375, "router_z_loss_mlp": 0.66064453, "step": 246, "time_per_iteration": 2.9166998863220215 }, { "auxiliary_loss_clip": 0.01948442, "auxiliary_loss_mlp": 0.00586716, "balance_loss_clip": 1.40752637, "balance_loss_mlp": 0.52682549, "epoch": 0.014850443408988426, "flos": 15815490963840.0, "grad_norm": 16.194323766638902, "language_loss": 0.97404552, "learning_rate": 3.5472317103095034e-06, "loss": 0.9993971, "num_input_tokens_seen": 5188565, "router_z_loss_clip": 5.40234375, "router_z_loss_mlp": 0.59912109, "step": 247, "time_per_iteration": 2.6574418544769287 }, { "auxiliary_loss_clip": 0.01997519, "auxiliary_loss_mlp": 0.00591367, "balance_loss_clip": 1.41272092, "balance_loss_mlp": 0.53080893, "epoch": 0.014910566661656396, "flos": 22781657790720.0, "grad_norm": 316.46072931021536, "language_loss": 0.84265119, "learning_rate": 3.549833136812155e-06, "loss": 0.86854005, "num_input_tokens_seen": 5207810, "router_z_loss_clip": 5.84375, "router_z_loss_mlp": 0.60498047, "step": 248, "time_per_iteration": 2.6765387058258057 }, { "auxiliary_loss_clip": 0.01961814, "auxiliary_loss_mlp": 0.00586562, "balance_loss_clip": 1.40723526, "balance_loss_mlp": 0.52123535, "epoch": 0.014970689914324365, "flos": 26865023806080.0, "grad_norm": 16.30719544402728, "language_loss": 0.89288592, "learning_rate": 3.552424094769381e-06, "loss": 0.91836965, "num_input_tokens_seen": 5226210, "router_z_loss_clip": 5.55078125, "router_z_loss_mlp": 0.65283203, "step": 249, "time_per_iteration": 2.6739673614501953 }, { "auxiliary_loss_clip": 0.02000153, "auxiliary_loss_mlp": 0.00554754, "balance_loss_clip": 1.43031454, "balance_loss_mlp": 0.49414763, "epoch": 0.015030813166992334, "flos": 13985662371840.0, "grad_norm": 5.908390935835151, "language_loss": 1.01359701, "learning_rate": 3.5550046680977174e-06, "loss": 1.03914607, "num_input_tokens_seen": 5241660, "router_z_loss_clip": 5.6953125, "router_z_loss_mlp": 0.60693359, "step": 250, "time_per_iteration": 2.5972321033477783 }, { "auxiliary_loss_clip": 0.02018793, "auxiliary_loss_mlp": 0.00592404, "balance_loss_clip": 1.43168259, "balance_loss_mlp": 0.52645773, "epoch": 0.015090936419660304, "flos": 24717817618560.0, "grad_norm": 123.37580980487063, "language_loss": 1.0430038, "learning_rate": 3.5575749397087034e-06, "loss": 1.06911576, "num_input_tokens_seen": 5261090, "router_z_loss_clip": 5.87109375, "router_z_loss_mlp": 0.65966797, "step": 251, "time_per_iteration": 2.7801220417022705 }, { "auxiliary_loss_clip": 0.02040659, "auxiliary_loss_mlp": 0.005622, "balance_loss_clip": 1.44415021, "balance_loss_mlp": 0.50354862, "epoch": 0.015151059672328273, "flos": 25738793798400.0, "grad_norm": 99.75393012587074, "language_loss": 0.91932654, "learning_rate": 3.5601349915248707e-06, "loss": 0.94535518, "num_input_tokens_seen": 5279175, "router_z_loss_clip": 5.95703125, "router_z_loss_mlp": 0.58642578, "step": 252, "time_per_iteration": 2.699920415878296 }, { "auxiliary_loss_clip": 0.02076542, "auxiliary_loss_mlp": 0.00589782, "balance_loss_clip": 1.46503699, "balance_loss_mlp": 0.5281744, "epoch": 0.015211182924996243, "flos": 21871214737920.0, "grad_norm": 96.61481723701019, "language_loss": 1.07489049, "learning_rate": 3.5626849044954064e-06, "loss": 1.10155368, "num_input_tokens_seen": 5296975, "router_z_loss_clip": 6.1171875, "router_z_loss_mlp": 0.61572266, "step": 253, "time_per_iteration": 2.674912929534912 }, { "auxiliary_loss_clip": 0.02030678, "auxiliary_loss_mlp": 0.0035376, "balance_loss_clip": 1.60078955, "balance_loss_mlp": 0.32591292, "epoch": 0.015271306177664212, "flos": 66895080888960.0, "grad_norm": 0.8480610043062556, "language_loss": 0.55287009, "learning_rate": 3.5652247586115167e-06, "loss": 0.57671446, "num_input_tokens_seen": 5358375, "router_z_loss_clip": 4.3125, "router_z_loss_mlp": 0.27929688, "step": 254, "time_per_iteration": 3.1742122173309326 }, { "auxiliary_loss_clip": 0.02122121, "auxiliary_loss_mlp": 0.00585752, "balance_loss_clip": 1.47170329, "balance_loss_mlp": 0.52340591, "epoch": 0.01533142943033218, "flos": 26834069260800.0, "grad_norm": 36.01824518401435, "language_loss": 0.98110729, "learning_rate": 3.567754632921479e-06, "loss": 1.0081861, "num_input_tokens_seen": 5377255, "router_z_loss_clip": 6.49609375, "router_z_loss_mlp": 0.62329102, "step": 255, "time_per_iteration": 2.7435643672943115 }, { "auxiliary_loss_clip": 0.02129079, "auxiliary_loss_mlp": 0.00573858, "balance_loss_clip": 1.47937739, "balance_loss_mlp": 0.50757784, "epoch": 0.01539155268300015, "flos": 20813753318400.0, "grad_norm": 17.751031441267152, "language_loss": 0.92396587, "learning_rate": 3.5702746055454075e-06, "loss": 0.95099521, "num_input_tokens_seen": 5395320, "router_z_loss_clip": 6.5, "router_z_loss_mlp": 0.66357422, "step": 256, "time_per_iteration": 2.686060667037964 }, { "auxiliary_loss_clip": 0.02100683, "auxiliary_loss_mlp": 0.00586784, "balance_loss_clip": 1.46657598, "balance_loss_mlp": 0.51964569, "epoch": 0.01545167593566812, "flos": 15961862885760.0, "grad_norm": 62.954281188432596, "language_loss": 0.81132758, "learning_rate": 3.5727847536897254e-06, "loss": 0.83820218, "num_input_tokens_seen": 5411970, "router_z_loss_clip": 6.34765625, "router_z_loss_mlp": 0.67089844, "step": 257, "time_per_iteration": 2.7117342948913574 }, { "auxiliary_loss_clip": 0.02122183, "auxiliary_loss_mlp": 0.00554315, "balance_loss_clip": 1.48824203, "balance_loss_mlp": 0.49435231, "epoch": 0.01551179918833609, "flos": 22601745544320.0, "grad_norm": 16.64617576633985, "language_loss": 1.01906121, "learning_rate": 3.5752851536613596e-06, "loss": 1.0458262, "num_input_tokens_seen": 5430245, "router_z_loss_clip": 6.34375, "router_z_loss_mlp": 0.59936523, "step": 258, "time_per_iteration": 2.739394426345825 }, { "auxiliary_loss_clip": 0.02117414, "auxiliary_loss_mlp": 0.00598855, "balance_loss_clip": 1.48084521, "balance_loss_mlp": 0.53810585, "epoch": 0.015571922441004058, "flos": 22816706486400.0, "grad_norm": 50.297320485888726, "language_loss": 0.98817581, "learning_rate": 3.577775880881658e-06, "loss": 1.01533842, "num_input_tokens_seen": 5448905, "router_z_loss_clip": 6.36328125, "router_z_loss_mlp": 0.60693359, "step": 259, "time_per_iteration": 2.6861932277679443 }, { "auxiliary_loss_clip": 0.02116685, "auxiliary_loss_mlp": 0.005311, "balance_loss_clip": 1.50014329, "balance_loss_mlp": 0.47435668, "epoch": 0.015632045693672027, "flos": 18947439486720.0, "grad_norm": 22.85837295159286, "language_loss": 1.02622962, "learning_rate": 3.5802570099000424e-06, "loss": 1.05270743, "num_input_tokens_seen": 5466405, "router_z_loss_clip": 6.16015625, "router_z_loss_mlp": 0.56689453, "step": 260, "time_per_iteration": 2.66146183013916 }, { "auxiliary_loss_clip": 0.02144236, "auxiliary_loss_mlp": 0.00540054, "balance_loss_clip": 1.4931252, "balance_loss_mlp": 0.48025876, "epoch": 0.015692168946339995, "flos": 29971728046080.0, "grad_norm": 21.61432442038631, "language_loss": 0.96622723, "learning_rate": 3.5827286144073947e-06, "loss": 0.99307007, "num_input_tokens_seen": 5487055, "router_z_loss_clip": 6.515625, "router_z_loss_mlp": 0.59814453, "step": 261, "time_per_iteration": 2.7040562629699707 }, { "auxiliary_loss_clip": 0.02172117, "auxiliary_loss_mlp": 0.00569892, "balance_loss_clip": 1.50275397, "balance_loss_mlp": 0.50566173, "epoch": 0.015752292199007967, "flos": 19392085946880.0, "grad_norm": 113.0554563532551, "language_loss": 0.72484016, "learning_rate": 3.5851907672491904e-06, "loss": 0.75226027, "num_input_tokens_seen": 5506600, "router_z_loss_clip": 6.6953125, "router_z_loss_mlp": 0.64208984, "step": 262, "time_per_iteration": 2.6863486766815186 }, { "auxiliary_loss_clip": 0.0218678, "auxiliary_loss_mlp": 0.00555706, "balance_loss_clip": 1.5097369, "balance_loss_mlp": 0.49047509, "epoch": 0.015812415451675936, "flos": 20339804338560.0, "grad_norm": 72.52714285890461, "language_loss": 0.77276534, "learning_rate": 3.587643540438383e-06, "loss": 0.80019021, "num_input_tokens_seen": 5524350, "router_z_loss_clip": 6.77734375, "router_z_loss_mlp": 0.65283203, "step": 263, "time_per_iteration": 2.6862237453460693 }, { "auxiliary_loss_clip": 0.02244302, "auxiliary_loss_mlp": 0.00581325, "balance_loss_clip": 1.53684866, "balance_loss_mlp": 0.51881182, "epoch": 0.015872538704343905, "flos": 17525412979200.0, "grad_norm": 155.59638997344967, "language_loss": 0.93721497, "learning_rate": 3.590087005168037e-06, "loss": 0.96547121, "num_input_tokens_seen": 5542145, "router_z_loss_clip": 7.078125, "router_z_loss_mlp": 0.62451172, "step": 264, "time_per_iteration": 2.692227363586426 }, { "auxiliary_loss_clip": 0.02229739, "auxiliary_loss_mlp": 0.00553546, "balance_loss_clip": 1.53102446, "balance_loss_mlp": 0.49174747, "epoch": 0.015932661957011873, "flos": 15260490944640.0, "grad_norm": 89.71012884301727, "language_loss": 1.09384394, "learning_rate": 3.5925212318237344e-06, "loss": 1.12167668, "num_input_tokens_seen": 5557920, "router_z_loss_clip": 6.98828125, "router_z_loss_mlp": 0.61816406, "step": 265, "time_per_iteration": 2.6428778171539307 }, { "auxiliary_loss_clip": 0.02256601, "auxiliary_loss_mlp": 0.00606123, "balance_loss_clip": 1.54505742, "balance_loss_mlp": 0.53884155, "epoch": 0.015992785209679845, "flos": 20302528999680.0, "grad_norm": 202.7442210169537, "language_loss": 0.82801276, "learning_rate": 3.5949462899957323e-06, "loss": 0.85663998, "num_input_tokens_seen": 5576290, "router_z_loss_clip": 7.1171875, "router_z_loss_mlp": 0.67333984, "step": 266, "time_per_iteration": 2.7283992767333984 }, { "auxiliary_loss_clip": 0.02246724, "auxiliary_loss_mlp": 0.00576986, "balance_loss_clip": 1.54379869, "balance_loss_mlp": 0.51060987, "epoch": 0.016052908462347814, "flos": 23362368969600.0, "grad_norm": 219.70753829087977, "language_loss": 0.95850682, "learning_rate": 3.5973622484909068e-06, "loss": 0.98674393, "num_input_tokens_seen": 5595205, "router_z_loss_clip": 7.02734375, "router_z_loss_mlp": 0.66357422, "step": 267, "time_per_iteration": 2.6664535999298096 }, { "auxiliary_loss_clip": 0.0225273, "auxiliary_loss_mlp": 0.00598825, "balance_loss_clip": 1.53619194, "balance_loss_mlp": 0.53340256, "epoch": 0.016113031715015783, "flos": 21286588976640.0, "grad_norm": 49.50453103677667, "language_loss": 0.93735957, "learning_rate": 3.599769175344462e-06, "loss": 0.96587509, "num_input_tokens_seen": 5612645, "router_z_loss_clip": 7.1640625, "router_z_loss_mlp": 0.65332031, "step": 268, "time_per_iteration": 2.726308584213257 }, { "auxiliary_loss_clip": 0.0226132, "auxiliary_loss_mlp": 0.00590741, "balance_loss_clip": 1.55828714, "balance_loss_mlp": 0.52460372, "epoch": 0.01617315496768375, "flos": 18914689261440.0, "grad_norm": 54.60269103378473, "language_loss": 0.94503599, "learning_rate": 3.602167137831432e-06, "loss": 0.97355664, "num_input_tokens_seen": 5628345, "router_z_loss_clip": 7.03515625, "router_z_loss_mlp": 0.66162109, "step": 269, "time_per_iteration": 2.632283926010132 }, { "auxiliary_loss_clip": 0.02273287, "auxiliary_loss_mlp": 0.00629812, "balance_loss_clip": 1.54582047, "balance_loss_mlp": 0.55494857, "epoch": 0.01623327822035172, "flos": 16546488647040.0, "grad_norm": 14.245814619000443, "language_loss": 1.02824736, "learning_rate": 3.6045562024779565e-06, "loss": 1.05727839, "num_input_tokens_seen": 5645940, "router_z_loss_clip": 7.26953125, "router_z_loss_mlp": 0.74804688, "step": 270, "time_per_iteration": 2.8481240272521973 }, { "auxiliary_loss_clip": 0.02260972, "auxiliary_loss_mlp": 0.0056089, "balance_loss_clip": 1.5500288, "balance_loss_mlp": 0.5012852, "epoch": 0.016293401473019692, "flos": 23513481486720.0, "grad_norm": 13.429992469819332, "language_loss": 0.94599068, "learning_rate": 3.606936435072361e-06, "loss": 0.97420919, "num_input_tokens_seen": 5665690, "router_z_loss_clip": 7.109375, "router_z_loss_mlp": 0.59619141, "step": 271, "time_per_iteration": 2.796449661254883 }, { "auxiliary_loss_clip": 0.02281675, "auxiliary_loss_mlp": 0.00547963, "balance_loss_clip": 1.55234146, "balance_loss_mlp": 0.48440036, "epoch": 0.01635352472568766, "flos": 29016072748800.0, "grad_norm": 56.236567732993066, "language_loss": 0.88716447, "learning_rate": 3.609307900676025e-06, "loss": 0.91546088, "num_input_tokens_seen": 5683190, "router_z_loss_clip": 7.296875, "router_z_loss_mlp": 0.63500977, "step": 272, "time_per_iteration": 5.5638954639434814 }, { "auxiliary_loss_clip": 0.02276875, "auxiliary_loss_mlp": 0.00587464, "balance_loss_clip": 1.55196035, "balance_loss_mlp": 0.52542782, "epoch": 0.01641364797835563, "flos": 13370513028480.0, "grad_norm": 340.5576221885258, "language_loss": 0.89525419, "learning_rate": 3.611670663634051e-06, "loss": 0.92389762, "num_input_tokens_seen": 5699780, "router_z_loss_clip": 7.25, "router_z_loss_mlp": 0.62060547, "step": 273, "time_per_iteration": 2.626847505569458 }, { "auxiliary_loss_clip": 0.02281522, "auxiliary_loss_mlp": 0.00637369, "balance_loss_clip": 1.54046559, "balance_loss_mlp": 0.56565297, "epoch": 0.016473771231023598, "flos": 18878239935360.0, "grad_norm": 81.95737835237048, "language_loss": 0.99150485, "learning_rate": 3.614024787585744e-06, "loss": 1.02069378, "num_input_tokens_seen": 5716980, "router_z_loss_clip": 7.41015625, "router_z_loss_mlp": 0.71777344, "step": 274, "time_per_iteration": 2.6829895973205566 }, { "auxiliary_loss_clip": 0.02232075, "auxiliary_loss_mlp": 0.00553647, "balance_loss_clip": 1.53371572, "balance_loss_mlp": 0.49170592, "epoch": 0.016533894483691566, "flos": 22601637803520.0, "grad_norm": 57.6556460487762, "language_loss": 0.95856875, "learning_rate": 3.6163703354748927e-06, "loss": 0.986426, "num_input_tokens_seen": 5737780, "router_z_loss_clip": 6.984375, "router_z_loss_mlp": 0.62011719, "step": 275, "time_per_iteration": 2.649658441543579 }, { "auxiliary_loss_clip": 0.02256956, "auxiliary_loss_mlp": 0.00544893, "balance_loss_clip": 1.54077637, "balance_loss_mlp": 0.48285621, "epoch": 0.01659401773635954, "flos": 21507188353920.0, "grad_norm": 74.78714364879785, "language_loss": 0.86479264, "learning_rate": 3.6187073695598707e-06, "loss": 0.89281106, "num_input_tokens_seen": 5758330, "router_z_loss_clip": 7.1640625, "router_z_loss_mlp": 0.62060547, "step": 276, "time_per_iteration": 2.7019007205963135 }, { "auxiliary_loss_clip": 0.02277047, "auxiliary_loss_mlp": 0.00576845, "balance_loss_clip": 1.55702615, "balance_loss_mlp": 0.51528513, "epoch": 0.016654140989027507, "flos": 32850973411200.0, "grad_norm": 28.492665225728416, "language_loss": 0.86281979, "learning_rate": 3.621035951423551e-06, "loss": 0.89135867, "num_input_tokens_seen": 5778340, "router_z_loss_clip": 7.20703125, "router_z_loss_mlp": 0.61572266, "step": 277, "time_per_iteration": 2.753699779510498 }, { "auxiliary_loss_clip": 0.02231439, "auxiliary_loss_mlp": 0.00571895, "balance_loss_clip": 1.53533983, "balance_loss_mlp": 0.50814199, "epoch": 0.016714264241695476, "flos": 12306228024960.0, "grad_norm": 65.16904526154283, "language_loss": 0.86543435, "learning_rate": 3.623356141983041e-06, "loss": 0.89346766, "num_input_tokens_seen": 5794295, "router_z_loss_clip": 6.95703125, "router_z_loss_mlp": 0.63769531, "step": 278, "time_per_iteration": 2.6593353748321533 }, { "auxiliary_loss_clip": 0.02257508, "auxiliary_loss_mlp": 0.00587678, "balance_loss_clip": 1.54377651, "balance_loss_mlp": 0.52521193, "epoch": 0.016774387494363444, "flos": 27123796362240.0, "grad_norm": 2554.167843947644, "language_loss": 0.97359329, "learning_rate": 3.6256680014992486e-06, "loss": 1.00204515, "num_input_tokens_seen": 5814405, "router_z_loss_clip": 7.13671875, "router_z_loss_mlp": 0.625, "step": 279, "time_per_iteration": 2.6829869747161865 }, { "auxiliary_loss_clip": 0.02251308, "auxiliary_loss_mlp": 0.00569188, "balance_loss_clip": 1.53438139, "balance_loss_mlp": 0.50624561, "epoch": 0.016834510747031413, "flos": 20191493082240.0, "grad_norm": 79.23945148130363, "language_loss": 1.01273894, "learning_rate": 3.6279715895862713e-06, "loss": 1.04094398, "num_input_tokens_seen": 5832795, "router_z_loss_clip": 7.16015625, "router_z_loss_mlp": 0.62939453, "step": 280, "time_per_iteration": 2.686588764190674 }, { "auxiliary_loss_clip": 0.0221417, "auxiliary_loss_mlp": 0.00587778, "balance_loss_clip": 1.51844347, "balance_loss_mlp": 0.52464527, "epoch": 0.016894633999699385, "flos": 27274262434560.0, "grad_norm": 18.152716349919157, "language_loss": 0.80668306, "learning_rate": 3.6302669652206183e-06, "loss": 0.83470255, "num_input_tokens_seen": 5855750, "router_z_loss_clip": 6.953125, "router_z_loss_mlp": 0.63085938, "step": 281, "time_per_iteration": 2.7612662315368652 }, { "auxiliary_loss_clip": 0.02253296, "auxiliary_loss_mlp": 0.00595143, "balance_loss_clip": 1.53594601, "balance_loss_mlp": 0.53091317, "epoch": 0.016954757252367354, "flos": 14902964922240.0, "grad_norm": 369.06384780400793, "language_loss": 0.89860511, "learning_rate": 3.632554186750274e-06, "loss": 0.92708945, "num_input_tokens_seen": 5872610, "router_z_loss_clip": 7.17578125, "router_z_loss_mlp": 0.64257812, "step": 282, "time_per_iteration": 2.6820385456085205 }, { "auxiliary_loss_clip": 0.02247081, "auxiliary_loss_mlp": 0.00600348, "balance_loss_clip": 1.5313133, "balance_loss_mlp": 0.53568876, "epoch": 0.017014880505035322, "flos": 21358805270400.0, "grad_norm": 30.047066493164632, "language_loss": 0.84262067, "learning_rate": 3.6348333119035937e-06, "loss": 0.87109494, "num_input_tokens_seen": 5892985, "router_z_loss_clip": 7.1484375, "router_z_loss_mlp": 0.64648438, "step": 283, "time_per_iteration": 2.6617631912231445 }, { "auxiliary_loss_clip": 0.02220001, "auxiliary_loss_mlp": 0.00559302, "balance_loss_clip": 1.53008354, "balance_loss_mlp": 0.50050843, "epoch": 0.01707500375770329, "flos": 35333154858240.0, "grad_norm": 75.78034843239011, "language_loss": 0.9115634, "learning_rate": 3.6371043977980503e-06, "loss": 0.93935645, "num_input_tokens_seen": 5914060, "router_z_loss_clip": 6.90234375, "router_z_loss_mlp": 0.58862305, "step": 284, "time_per_iteration": 2.793515920639038 }, { "auxiliary_loss_clip": 0.02179936, "auxiliary_loss_mlp": 0.0055666, "balance_loss_clip": 1.51460993, "balance_loss_mlp": 0.49562502, "epoch": 0.01713512701037126, "flos": 23582070506880.0, "grad_norm": 187.72072771350332, "language_loss": 1.0542345, "learning_rate": 3.639367500948819e-06, "loss": 1.08160043, "num_input_tokens_seen": 5932860, "router_z_loss_clip": 6.65625, "router_z_loss_mlp": 0.60986328, "step": 285, "time_per_iteration": 2.8574185371398926 }, { "auxiliary_loss_clip": 0.02241883, "auxiliary_loss_mlp": 0.00620696, "balance_loss_clip": 1.53609371, "balance_loss_mlp": 0.55331892, "epoch": 0.01719525026303923, "flos": 27634661544960.0, "grad_norm": 102.56481080203969, "language_loss": 0.99409777, "learning_rate": 3.6416226772772178e-06, "loss": 1.02272356, "num_input_tokens_seen": 5952725, "router_z_loss_clip": 7.0625, "router_z_loss_mlp": 0.67431641, "step": 286, "time_per_iteration": 2.743102788925171 }, { "auxiliary_loss_clip": 0.02215812, "auxiliary_loss_mlp": 0.00563183, "balance_loss_clip": 1.52384841, "balance_loss_mlp": 0.49954891, "epoch": 0.0172553735157072, "flos": 26979722910720.0, "grad_norm": 112.71859709995559, "language_loss": 0.9883315, "learning_rate": 3.643869982119001e-06, "loss": 1.01612139, "num_input_tokens_seen": 5970560, "router_z_loss_clip": 6.91796875, "router_z_loss_mlp": 0.63598633, "step": 287, "time_per_iteration": 2.734180450439453 }, { "auxiliary_loss_clip": 0.02237627, "auxiliary_loss_mlp": 0.00587276, "balance_loss_clip": 1.52910841, "balance_loss_mlp": 0.52161545, "epoch": 0.01731549676837517, "flos": 14056621689600.0, "grad_norm": 499.55486428713584, "language_loss": 1.09135747, "learning_rate": 3.646109470232502e-06, "loss": 1.11960649, "num_input_tokens_seen": 5982980, "router_z_loss_clip": 7.0859375, "router_z_loss_mlp": 0.65649414, "step": 288, "time_per_iteration": 2.6088435649871826 }, { "auxiliary_loss_clip": 0.01981643, "auxiliary_loss_mlp": 0.00282682, "balance_loss_clip": 1.57284904, "balance_loss_mlp": 0.26074716, "epoch": 0.017375620021043137, "flos": 66510694471680.0, "grad_norm": 1.3655305970700842, "language_loss": 0.63465434, "learning_rate": 3.6483411958066417e-06, "loss": 0.65729761, "num_input_tokens_seen": 6049445, "router_z_loss_clip": 4.0625, "router_z_loss_mlp": 0.21972656, "step": 289, "time_per_iteration": 3.3067214488983154 }, { "auxiliary_loss_clip": 0.02184603, "auxiliary_loss_mlp": 0.00590008, "balance_loss_clip": 1.51327443, "balance_loss_mlp": 0.52623117, "epoch": 0.01743574327371111, "flos": 15225154940160.0, "grad_norm": 119.13430869550395, "language_loss": 0.95603848, "learning_rate": 3.6505652124687957e-06, "loss": 0.98378462, "num_input_tokens_seen": 6064150, "router_z_loss_clip": 6.7109375, "router_z_loss_mlp": 0.63745117, "step": 290, "time_per_iteration": 2.6351704597473145 }, { "auxiliary_loss_clip": 0.02219156, "auxiliary_loss_mlp": 0.00565994, "balance_loss_clip": 1.53261089, "balance_loss_mlp": 0.50245506, "epoch": 0.017495866526379078, "flos": 25373869574400.0, "grad_norm": 252.48313055341464, "language_loss": 0.92350888, "learning_rate": 3.6527815732925258e-06, "loss": 0.9513604, "num_input_tokens_seen": 6083920, "router_z_loss_clip": 6.859375, "router_z_loss_mlp": 0.63574219, "step": 291, "time_per_iteration": 2.7097580432891846 }, { "auxiliary_loss_clip": 0.02211272, "auxiliary_loss_mlp": 0.00578071, "balance_loss_clip": 1.53518248, "balance_loss_mlp": 0.51331609, "epoch": 0.017555989779047047, "flos": 26359473836160.0, "grad_norm": 3.557829154290891, "language_loss": 0.77330887, "learning_rate": 3.6549903308051806e-06, "loss": 0.80120236, "num_input_tokens_seen": 6105460, "router_z_loss_clip": 6.7578125, "router_z_loss_mlp": 0.64746094, "step": 292, "time_per_iteration": 2.695697784423828 }, { "auxiliary_loss_clip": 0.02187446, "auxiliary_loss_mlp": 0.00577156, "balance_loss_clip": 1.52475905, "balance_loss_mlp": 0.51268774, "epoch": 0.017616113031715015, "flos": 22338807010560.0, "grad_norm": 297.1329070283204, "language_loss": 0.94681865, "learning_rate": 3.6571915369953646e-06, "loss": 0.97446471, "num_input_tokens_seen": 6122890, "router_z_loss_clip": 6.62109375, "router_z_loss_mlp": 0.64428711, "step": 293, "time_per_iteration": 2.8104803562164307 }, { "auxiliary_loss_clip": 0.0217286, "auxiliary_loss_mlp": 0.00584504, "balance_loss_clip": 1.51650751, "balance_loss_mlp": 0.52401686, "epoch": 0.017676236284382984, "flos": 20156911263360.0, "grad_norm": 66.88611021201943, "language_loss": 0.87539965, "learning_rate": 3.6593852433202797e-06, "loss": 0.90297329, "num_input_tokens_seen": 6142890, "router_z_loss_clip": 6.5625, "router_z_loss_mlp": 0.60473633, "step": 294, "time_per_iteration": 2.6958553791046143 }, { "auxiliary_loss_clip": 0.0214242, "auxiliary_loss_mlp": 0.00605323, "balance_loss_clip": 1.49480438, "balance_loss_mlp": 0.54152197, "epoch": 0.017736359537050956, "flos": 25223331674880.0, "grad_norm": 7.040968336726166, "language_loss": 0.90252101, "learning_rate": 3.6615715007129453e-06, "loss": 0.9299984, "num_input_tokens_seen": 6162030, "router_z_loss_clip": 6.47265625, "router_z_loss_mlp": 0.63818359, "step": 295, "time_per_iteration": 2.6923816204071045 }, { "auxiliary_loss_clip": 0.02124755, "auxiliary_loss_mlp": 0.00598808, "balance_loss_clip": 1.50211668, "balance_loss_mlp": 0.53600812, "epoch": 0.017796482789718925, "flos": 20338798757760.0, "grad_norm": 18.68642373603933, "language_loss": 0.91229928, "learning_rate": 3.6637503595892897e-06, "loss": 0.9395349, "num_input_tokens_seen": 6180540, "router_z_loss_clip": 6.2265625, "router_z_loss_mlp": 0.62841797, "step": 296, "time_per_iteration": 2.672752618789673 }, { "auxiliary_loss_clip": 0.02119006, "auxiliary_loss_mlp": 0.00562705, "balance_loss_clip": 1.49071825, "balance_loss_mlp": 0.5029096, "epoch": 0.017856606042386893, "flos": 22379206832640.0, "grad_norm": 14.923547052602913, "language_loss": 0.93438566, "learning_rate": 3.665921869855132e-06, "loss": 0.9612028, "num_input_tokens_seen": 6199425, "router_z_loss_clip": 6.28125, "router_z_loss_mlp": 0.59863281, "step": 297, "time_per_iteration": 2.656914710998535 }, { "auxiliary_loss_clip": 0.02108881, "auxiliary_loss_mlp": 0.00570442, "balance_loss_clip": 1.49264336, "balance_loss_mlp": 0.50797617, "epoch": 0.017916729295054862, "flos": 20230061310720.0, "grad_norm": 20.269381645749043, "language_loss": 0.95909017, "learning_rate": 3.6680860809130346e-06, "loss": 0.98588336, "num_input_tokens_seen": 6219170, "router_z_loss_clip": 6.16015625, "router_z_loss_mlp": 0.62426758, "step": 298, "time_per_iteration": 2.6483123302459717 }, { "auxiliary_loss_clip": 0.02037936, "auxiliary_loss_mlp": 0.00597463, "balance_loss_clip": 1.46666598, "balance_loss_mlp": 0.53523582, "epoch": 0.01797685254772283, "flos": 19390972625280.0, "grad_norm": 792.2793042870924, "language_loss": 0.94363475, "learning_rate": 3.6702430416690516e-06, "loss": 0.9699887, "num_input_tokens_seen": 6237930, "router_z_loss_clip": 5.71875, "router_z_loss_mlp": 0.62255859, "step": 299, "time_per_iteration": 2.6219515800476074 }, { "auxiliary_loss_clip": 0.02056771, "auxiliary_loss_mlp": 0.00589462, "balance_loss_clip": 1.4606595, "balance_loss_mlp": 0.5253278, "epoch": 0.018036975800390802, "flos": 24426007528320.0, "grad_norm": 13.351104301649913, "language_loss": 0.73285842, "learning_rate": 3.672392800539357e-06, "loss": 0.7593208, "num_input_tokens_seen": 6257170, "router_z_loss_clip": 5.9609375, "router_z_loss_mlp": 0.64111328, "step": 300, "time_per_iteration": 2.7350857257843018 }, { "auxiliary_loss_clip": 0.02068434, "auxiliary_loss_mlp": 0.0058627, "balance_loss_clip": 1.48357356, "balance_loss_mlp": 0.52459145, "epoch": 0.01809709905305877, "flos": 15778933896960.0, "grad_norm": 17.40931694446122, "language_loss": 0.96841168, "learning_rate": 3.6745354054567686e-06, "loss": 0.99495864, "num_input_tokens_seen": 6274780, "router_z_loss_clip": 5.85546875, "router_z_loss_mlp": 0.61645508, "step": 301, "time_per_iteration": 2.616377830505371 }, { "auxiliary_loss_clip": 0.01875618, "auxiliary_loss_mlp": 0.00367908, "balance_loss_clip": 1.50937164, "balance_loss_mlp": 0.34683144, "epoch": 0.01815722230572674, "flos": 67348382526720.0, "grad_norm": 2.52405345772591, "language_loss": 0.61594856, "learning_rate": 3.676670903877158e-06, "loss": 0.63838387, "num_input_tokens_seen": 6340435, "router_z_loss_clip": 3.65625, "router_z_loss_mlp": 0.2109375, "step": 302, "time_per_iteration": 3.3501651287078857 }, { "auxiliary_loss_clip": 0.01991982, "auxiliary_loss_mlp": 0.00620386, "balance_loss_clip": 1.44394743, "balance_loss_mlp": 0.55482101, "epoch": 0.01821734555839471, "flos": 15485615435520.0, "grad_norm": 67.5894417984717, "language_loss": 0.9751035, "learning_rate": 3.6787993427857567e-06, "loss": 1.00122726, "num_input_tokens_seen": 6358160, "router_z_loss_clip": 5.484375, "router_z_loss_mlp": 0.65478516, "step": 303, "time_per_iteration": 2.6427557468414307 }, { "auxiliary_loss_clip": 0.02027375, "auxiliary_loss_mlp": 0.00618955, "balance_loss_clip": 1.46076298, "balance_loss_mlp": 0.55162561, "epoch": 0.018277468811062677, "flos": 24097424889600.0, "grad_norm": 8.740594504733139, "language_loss": 0.87040031, "learning_rate": 3.680920768703364e-06, "loss": 0.89686358, "num_input_tokens_seen": 6378485, "router_z_loss_clip": 5.66796875, "router_z_loss_mlp": 0.67285156, "step": 304, "time_per_iteration": 2.670032024383545 }, { "auxiliary_loss_clip": 0.01957326, "auxiliary_loss_mlp": 0.00578348, "balance_loss_clip": 1.44150114, "balance_loss_mlp": 0.51678872, "epoch": 0.01833759206373065, "flos": 20959335141120.0, "grad_norm": 186.29371339630384, "language_loss": 0.87482071, "learning_rate": 3.6830352276924415e-06, "loss": 0.90017748, "num_input_tokens_seen": 6397845, "router_z_loss_clip": 5.16015625, "router_z_loss_mlp": 0.61523438, "step": 305, "time_per_iteration": 2.8225855827331543 }, { "auxiliary_loss_clip": 0.01950558, "auxiliary_loss_mlp": 0.00587778, "balance_loss_clip": 1.41957462, "balance_loss_mlp": 0.52521718, "epoch": 0.018397715316398618, "flos": 19390757143680.0, "grad_norm": 64.70021577125354, "language_loss": 0.96552229, "learning_rate": 3.685142765363119e-06, "loss": 0.9909057, "num_input_tokens_seen": 6416475, "router_z_loss_clip": 5.31640625, "router_z_loss_mlp": 0.62548828, "step": 306, "time_per_iteration": 2.643157482147217 }, { "auxiliary_loss_clip": 0.01952758, "auxiliary_loss_mlp": 0.00626587, "balance_loss_clip": 1.41488981, "balance_loss_mlp": 0.56183267, "epoch": 0.018457838569066586, "flos": 29132531619840.0, "grad_norm": 532.5524800914161, "language_loss": 0.93367171, "learning_rate": 3.687243426879095e-06, "loss": 0.95946515, "num_input_tokens_seen": 6437520, "router_z_loss_clip": 5.3828125, "router_z_loss_mlp": 0.64746094, "step": 307, "time_per_iteration": 2.7934508323669434 }, { "auxiliary_loss_clip": 0.01943029, "auxiliary_loss_mlp": 0.00627135, "balance_loss_clip": 1.42122304, "balance_loss_mlp": 0.56137931, "epoch": 0.018517961821734555, "flos": 19208654167680.0, "grad_norm": 137.1687420238854, "language_loss": 0.77668875, "learning_rate": 3.6893372569634466e-06, "loss": 0.8023904, "num_input_tokens_seen": 6455680, "router_z_loss_clip": 5.21484375, "router_z_loss_mlp": 0.65771484, "step": 308, "time_per_iteration": 2.6190052032470703 }, { "auxiliary_loss_clip": 0.0192778, "auxiliary_loss_mlp": 0.00633262, "balance_loss_clip": 1.39624834, "balance_loss_mlp": 0.56450224, "epoch": 0.018578085074402523, "flos": 19863018184320.0, "grad_norm": 87.61787371984016, "language_loss": 0.95723361, "learning_rate": 3.6914242999043395e-06, "loss": 0.982844, "num_input_tokens_seen": 6474880, "router_z_loss_clip": 5.31640625, "router_z_loss_mlp": 0.6875, "step": 309, "time_per_iteration": 2.7256393432617188 }, { "auxiliary_loss_clip": 0.01940024, "auxiliary_loss_mlp": 0.00604975, "balance_loss_clip": 1.40905523, "balance_loss_mlp": 0.53588146, "epoch": 0.018638208327070496, "flos": 29606947476480.0, "grad_norm": 141.67762967809463, "language_loss": 0.80049896, "learning_rate": 3.69350459956065e-06, "loss": 0.82594895, "num_input_tokens_seen": 6495945, "router_z_loss_clip": 5.30078125, "router_z_loss_mlp": 0.69091797, "step": 310, "time_per_iteration": 2.706019163131714 }, { "auxiliary_loss_clip": 0.01911132, "auxiliary_loss_mlp": 0.00564405, "balance_loss_clip": 1.40414667, "balance_loss_mlp": 0.50367987, "epoch": 0.018698331579738464, "flos": 45731555907840.0, "grad_norm": 27.169703525625188, "language_loss": 0.80480087, "learning_rate": 3.695578199367497e-06, "loss": 0.82955623, "num_input_tokens_seen": 6519930, "router_z_loss_clip": 5.0703125, "router_z_loss_mlp": 0.60791016, "step": 311, "time_per_iteration": 2.9174864292144775 }, { "auxiliary_loss_clip": 0.01918838, "auxiliary_loss_mlp": 0.00583053, "balance_loss_clip": 1.40000701, "balance_loss_mlp": 0.52180374, "epoch": 0.018758454832406433, "flos": 20483662308480.0, "grad_norm": 67.65161688327657, "language_loss": 0.98480678, "learning_rate": 3.6976451423416825e-06, "loss": 1.00982571, "num_input_tokens_seen": 6535070, "router_z_loss_clip": 5.19140625, "router_z_loss_mlp": 0.61254883, "step": 312, "time_per_iteration": 2.6283090114593506 }, { "auxiliary_loss_clip": 0.01896005, "auxiliary_loss_mlp": 0.00558838, "balance_loss_clip": 1.38950479, "balance_loss_mlp": 0.50161779, "epoch": 0.0188185780850744, "flos": 15777784661760.0, "grad_norm": 127.75495787855547, "language_loss": 0.98651153, "learning_rate": 3.699705471087043e-06, "loss": 1.01105988, "num_input_tokens_seen": 6554135, "router_z_loss_clip": 5.06640625, "router_z_loss_mlp": 0.57226562, "step": 313, "time_per_iteration": 2.671684741973877 }, { "auxiliary_loss_clip": 0.01927028, "auxiliary_loss_mlp": 0.00652108, "balance_loss_clip": 1.39609647, "balance_loss_mlp": 0.579247, "epoch": 0.018878701337742373, "flos": 22455732758400.0, "grad_norm": 79.78003727905573, "language_loss": 0.84097493, "learning_rate": 3.7017592277997256e-06, "loss": 0.86676621, "num_input_tokens_seen": 6572275, "router_z_loss_clip": 5.3125, "router_z_loss_mlp": 0.72851562, "step": 314, "time_per_iteration": 4.141700506210327 }, { "auxiliary_loss_clip": 0.01880948, "auxiliary_loss_mlp": 0.00649402, "balance_loss_clip": 1.37381172, "balance_loss_mlp": 0.58026052, "epoch": 0.018938824590410342, "flos": 30993530238720.0, "grad_norm": 26.186092506709006, "language_loss": 0.98196876, "learning_rate": 3.7038064542733654e-06, "loss": 1.00727224, "num_input_tokens_seen": 6594520, "router_z_loss_clip": 5.06640625, "router_z_loss_mlp": 0.69042969, "step": 315, "time_per_iteration": 4.144528865814209 }, { "auxiliary_loss_clip": 0.01894662, "auxiliary_loss_mlp": 0.00625582, "balance_loss_clip": 1.386235, "balance_loss_mlp": 0.55901575, "epoch": 0.01899894784307831, "flos": 23258910821760.0, "grad_norm": 16.899518832477554, "language_loss": 0.86744428, "learning_rate": 3.7058471919041945e-06, "loss": 0.89264667, "num_input_tokens_seen": 6614245, "router_z_loss_clip": 5.08203125, "router_z_loss_mlp": 0.66601562, "step": 316, "time_per_iteration": 2.637929677963257 }, { "auxiliary_loss_clip": 0.01853121, "auxiliary_loss_mlp": 0.00571831, "balance_loss_clip": 1.37452281, "balance_loss_mlp": 0.51213133, "epoch": 0.01905907109574628, "flos": 17457901367040.0, "grad_norm": 32.999764804712555, "language_loss": 0.94601923, "learning_rate": 3.7078814816960605e-06, "loss": 0.97026873, "num_input_tokens_seen": 6632015, "router_z_loss_clip": 4.7890625, "router_z_loss_mlp": 0.59765625, "step": 317, "time_per_iteration": 2.6296045780181885 }, { "auxiliary_loss_clip": 0.0184487, "auxiliary_loss_mlp": 0.00557567, "balance_loss_clip": 1.36584783, "balance_loss_mlp": 0.49805823, "epoch": 0.019119194348414248, "flos": 14970225139200.0, "grad_norm": 364.02355462429205, "language_loss": 1.00598466, "learning_rate": 3.709909364265374e-06, "loss": 1.03000903, "num_input_tokens_seen": 6649015, "router_z_loss_clip": 4.796875, "router_z_loss_mlp": 0.59521484, "step": 318, "time_per_iteration": 2.598317861557007 }, { "auxiliary_loss_clip": 0.0183217, "auxiliary_loss_mlp": 0.00571069, "balance_loss_clip": 1.35594416, "balance_loss_mlp": 0.51294261, "epoch": 0.01917931760108222, "flos": 25482822503040.0, "grad_norm": 67.82298671830179, "language_loss": 1.01693892, "learning_rate": 3.7119308798459706e-06, "loss": 1.0409714, "num_input_tokens_seen": 6669225, "router_z_loss_clip": 4.765625, "router_z_loss_mlp": 0.58154297, "step": 319, "time_per_iteration": 2.7049930095672607 }, { "auxiliary_loss_clip": 0.01575436, "auxiliary_loss_mlp": 0.00491821, "balance_loss_clip": 1.27333307, "balance_loss_mlp": 0.46454597, "epoch": 0.01923944085375019, "flos": 71556967353600.0, "grad_norm": 0.9469011974605895, "language_loss": 0.59268308, "learning_rate": 3.7139460682939026e-06, "loss": 0.61335564, "num_input_tokens_seen": 6725775, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.2734375, "step": 320, "time_per_iteration": 3.0543596744537354 }, { "auxiliary_loss_clip": 0.01788905, "auxiliary_loss_mlp": 0.00573487, "balance_loss_clip": 1.32543993, "balance_loss_mlp": 0.50692064, "epoch": 0.019299564106418157, "flos": 19682495406720.0, "grad_norm": 170.3814386167926, "language_loss": 0.99715489, "learning_rate": 3.715954969092154e-06, "loss": 1.02077889, "num_input_tokens_seen": 6744170, "router_z_loss_clip": 4.6328125, "router_z_loss_mlp": 0.66503906, "step": 321, "time_per_iteration": 2.679600715637207 }, { "auxiliary_loss_clip": 0.01773872, "auxiliary_loss_mlp": 0.00563566, "balance_loss_clip": 1.31902266, "balance_loss_mlp": 0.50138676, "epoch": 0.019359687359086126, "flos": 24387151991040.0, "grad_norm": 67.16829898159335, "language_loss": 0.90888399, "learning_rate": 3.7179576213552805e-06, "loss": 0.93225837, "num_input_tokens_seen": 6764565, "router_z_loss_clip": 4.546875, "router_z_loss_mlp": 0.62158203, "step": 322, "time_per_iteration": 2.670535087585449 }, { "auxiliary_loss_clip": 0.01764579, "auxiliary_loss_mlp": 0.00511119, "balance_loss_clip": 1.31445599, "balance_loss_mlp": 0.45041743, "epoch": 0.019419810611754094, "flos": 23951376190080.0, "grad_norm": 25.036396611729852, "language_loss": 0.81708264, "learning_rate": 3.719954063833981e-06, "loss": 0.83983958, "num_input_tokens_seen": 6785310, "router_z_loss_clip": 4.5078125, "router_z_loss_mlp": 0.60668945, "step": 323, "time_per_iteration": 2.671778917312622 }, { "auxiliary_loss_clip": 0.01737965, "auxiliary_loss_mlp": 0.00504715, "balance_loss_clip": 1.29733872, "balance_loss_mlp": 0.44277409, "epoch": 0.019479933864422067, "flos": 22160223567360.0, "grad_norm": 11.807415918632262, "language_loss": 0.9827792, "learning_rate": 3.721944334919596e-06, "loss": 1.00520599, "num_input_tokens_seen": 6803290, "router_z_loss_clip": 4.40820312, "router_z_loss_mlp": 0.61914062, "step": 324, "time_per_iteration": 2.702497959136963 }, { "auxiliary_loss_clip": 0.01731413, "auxiliary_loss_mlp": 0.00508918, "balance_loss_clip": 1.29354072, "balance_loss_mlp": 0.44790635, "epoch": 0.019540057117090035, "flos": 22236821320320.0, "grad_norm": 96.16596751443934, "language_loss": 0.74214327, "learning_rate": 3.7239284726485375e-06, "loss": 0.76454657, "num_input_tokens_seen": 6822570, "router_z_loss_clip": 4.37695312, "router_z_loss_mlp": 0.60986328, "step": 325, "time_per_iteration": 2.651331901550293 }, { "auxiliary_loss_clip": 0.01681344, "auxiliary_loss_mlp": 0.00460343, "balance_loss_clip": 1.27323341, "balance_loss_mlp": 0.40333682, "epoch": 0.019600180369758004, "flos": 23076771932160.0, "grad_norm": 26.16831652607451, "language_loss": 0.82556951, "learning_rate": 3.72590651470665e-06, "loss": 0.84698641, "num_input_tokens_seen": 6841910, "router_z_loss_clip": 4.0859375, "router_z_loss_mlp": 0.56958008, "step": 326, "time_per_iteration": 2.687748908996582 }, { "auxiliary_loss_clip": 0.01674731, "auxiliary_loss_mlp": 0.00442401, "balance_loss_clip": 1.26869357, "balance_loss_mlp": 0.38801777, "epoch": 0.019660303622425972, "flos": 25410857604480.0, "grad_norm": 20.897883022641754, "language_loss": 0.86422735, "learning_rate": 3.727878498433505e-06, "loss": 0.88539869, "num_input_tokens_seen": 6862480, "router_z_loss_clip": 4.0625, "router_z_loss_mlp": 0.54394531, "step": 327, "time_per_iteration": 2.688828468322754 }, { "auxiliary_loss_clip": 0.01680565, "auxiliary_loss_mlp": 0.00484639, "balance_loss_clip": 1.2719723, "balance_loss_mlp": 0.4284679, "epoch": 0.01972042687509394, "flos": 23657519024640.0, "grad_norm": 12.996775435534014, "language_loss": 0.87821972, "learning_rate": 3.7298444608266328e-06, "loss": 0.89987171, "num_input_tokens_seen": 6882015, "router_z_loss_clip": 4.08203125, "router_z_loss_mlp": 0.56201172, "step": 328, "time_per_iteration": 2.696183681488037 }, { "auxiliary_loss_clip": 0.0166814, "auxiliary_loss_mlp": 0.00520772, "balance_loss_clip": 1.26261473, "balance_loss_mlp": 0.45897388, "epoch": 0.019780550127761913, "flos": 18223480869120.0, "grad_norm": 514.2828161640098, "language_loss": 1.05165446, "learning_rate": 3.731804438545683e-06, "loss": 1.07354355, "num_input_tokens_seen": 6899785, "router_z_loss_clip": 4.04882812, "router_z_loss_mlp": 0.61767578, "step": 329, "time_per_iteration": 2.700099468231201 }, { "auxiliary_loss_clip": 0.01646876, "auxiliary_loss_mlp": 0.0050209, "balance_loss_clip": 1.24925864, "balance_loss_mlp": 0.44401163, "epoch": 0.01984067338042988, "flos": 22418780641920.0, "grad_norm": 65.24107477841241, "language_loss": 0.84408891, "learning_rate": 3.7337584679165324e-06, "loss": 0.86557865, "num_input_tokens_seen": 6918575, "router_z_loss_clip": 3.984375, "router_z_loss_mlp": 0.58007812, "step": 330, "time_per_iteration": 2.716856002807617 }, { "auxiliary_loss_clip": 0.01649218, "auxiliary_loss_mlp": 0.00550871, "balance_loss_clip": 1.2538352, "balance_loss_mlp": 0.49150488, "epoch": 0.01990079663309785, "flos": 17055199013760.0, "grad_norm": 12.52852625095614, "language_loss": 1.03892481, "learning_rate": 3.7357065849353186e-06, "loss": 1.0609256, "num_input_tokens_seen": 6936965, "router_z_loss_clip": 3.95703125, "router_z_loss_mlp": 0.59375, "step": 331, "time_per_iteration": 2.6292076110839844 }, { "auxiliary_loss_clip": 0.01608305, "auxiliary_loss_mlp": 0.00480475, "balance_loss_clip": 1.23974895, "balance_loss_mlp": 0.42866617, "epoch": 0.01996091988576582, "flos": 15961791058560.0, "grad_norm": 19.01419541058959, "language_loss": 1.01349318, "learning_rate": 3.737648825272422e-06, "loss": 1.03438091, "num_input_tokens_seen": 6953475, "router_z_loss_clip": 3.68359375, "router_z_loss_mlp": 0.51879883, "step": 332, "time_per_iteration": 2.7464144229888916 }, { "auxiliary_loss_clip": 0.01618529, "auxiliary_loss_mlp": 0.00512484, "balance_loss_clip": 1.23843884, "balance_loss_mlp": 0.45268869, "epoch": 0.02002104313843379, "flos": 23586451966080.0, "grad_norm": 85.18298770950838, "language_loss": 0.87455326, "learning_rate": 3.739585224276384e-06, "loss": 0.89586341, "num_input_tokens_seen": 6971630, "router_z_loss_clip": 3.8046875, "router_z_loss_mlp": 0.59765625, "step": 333, "time_per_iteration": 2.6837961673736572 }, { "auxiliary_loss_clip": 0.01615747, "auxiliary_loss_mlp": 0.00549613, "balance_loss_clip": 1.23368192, "balance_loss_mlp": 0.49069938, "epoch": 0.02008116639110176, "flos": 34094883352320.0, "grad_norm": 139.65863162406126, "language_loss": 0.89745402, "learning_rate": 3.7415158169777673e-06, "loss": 0.91910768, "num_input_tokens_seen": 6992775, "router_z_loss_clip": 3.81640625, "router_z_loss_mlp": 0.58959961, "step": 334, "time_per_iteration": 2.7973129749298096 }, { "auxiliary_loss_clip": 0.01605423, "auxiliary_loss_mlp": 0.00495666, "balance_loss_clip": 1.23113739, "balance_loss_mlp": 0.43846929, "epoch": 0.020141289643769728, "flos": 19683716469120.0, "grad_norm": 420.27676017331135, "language_loss": 0.89613837, "learning_rate": 3.7434406380929575e-06, "loss": 0.91714931, "num_input_tokens_seen": 7011425, "router_z_loss_clip": 3.74609375, "router_z_loss_mlp": 0.57250977, "step": 335, "time_per_iteration": 2.6402931213378906 }, { "auxiliary_loss_clip": 0.01581489, "auxiliary_loss_mlp": 0.00513036, "balance_loss_clip": 1.21737957, "balance_loss_mlp": 0.45927253, "epoch": 0.020201412896437697, "flos": 20740567357440.0, "grad_norm": 43.20996183303322, "language_loss": 0.98710144, "learning_rate": 3.745359722027911e-06, "loss": 1.00804663, "num_input_tokens_seen": 7029450, "router_z_loss_clip": 3.640625, "router_z_loss_mlp": 0.53833008, "step": 336, "time_per_iteration": 2.637401819229126 }, { "auxiliary_loss_clip": 0.01600632, "auxiliary_loss_mlp": 0.00493784, "balance_loss_clip": 1.22920382, "balance_loss_mlp": 0.43770826, "epoch": 0.020261536149105665, "flos": 20266510636800.0, "grad_norm": 186.48634814785888, "language_loss": 0.94911402, "learning_rate": 3.7472731028818428e-06, "loss": 0.9700582, "num_input_tokens_seen": 7047555, "router_z_loss_clip": 3.71679688, "router_z_loss_mlp": 0.56054688, "step": 337, "time_per_iteration": 2.6537132263183594 }, { "auxiliary_loss_clip": 0.01566339, "auxiliary_loss_mlp": 0.0049742, "balance_loss_clip": 1.21586204, "balance_loss_mlp": 0.44339469, "epoch": 0.020321659401773638, "flos": 25848752307840.0, "grad_norm": 269.21351390933336, "language_loss": 0.95424926, "learning_rate": 3.7491808144508626e-06, "loss": 0.97488683, "num_input_tokens_seen": 7068185, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.54003906, "step": 338, "time_per_iteration": 2.6590757369995117 }, { "auxiliary_loss_clip": 0.01584851, "auxiliary_loss_mlp": 0.00495898, "balance_loss_clip": 1.22041345, "balance_loss_mlp": 0.44230118, "epoch": 0.020381782654441606, "flos": 17495033051520.0, "grad_norm": 166.23521792484578, "language_loss": 0.92984003, "learning_rate": 3.7510828902315576e-06, "loss": 0.95064759, "num_input_tokens_seen": 7085955, "router_z_loss_clip": 3.64257812, "router_z_loss_mlp": 0.53564453, "step": 339, "time_per_iteration": 2.6070399284362793 }, { "auxiliary_loss_clip": 0.01589192, "auxiliary_loss_mlp": 0.00559817, "balance_loss_clip": 1.21699238, "balance_loss_mlp": 0.50190467, "epoch": 0.020441905907109575, "flos": 24243940465920.0, "grad_norm": 538.9376044915294, "language_loss": 0.94868124, "learning_rate": 3.75297936342452e-06, "loss": 0.97017133, "num_input_tokens_seen": 7106345, "router_z_loss_clip": 3.71875, "router_z_loss_mlp": 0.57885742, "step": 340, "time_per_iteration": 2.8064749240875244 }, { "auxiliary_loss_clip": 0.01577495, "auxiliary_loss_mlp": 0.00540694, "balance_loss_clip": 1.21853173, "balance_loss_mlp": 0.48526165, "epoch": 0.020502029159777543, "flos": 22233301787520.0, "grad_norm": 56.72102413440594, "language_loss": 0.94670618, "learning_rate": 3.7548702669378253e-06, "loss": 0.96788812, "num_input_tokens_seen": 7125070, "router_z_loss_clip": 3.58984375, "router_z_loss_mlp": 0.55395508, "step": 341, "time_per_iteration": 2.7288832664489746 }, { "auxiliary_loss_clip": 0.01574016, "auxiliary_loss_mlp": 0.00459179, "balance_loss_clip": 1.21408892, "balance_loss_mlp": 0.4109472, "epoch": 0.020562152412445512, "flos": 23987861429760.0, "grad_norm": 21.715956417489668, "language_loss": 0.92123783, "learning_rate": 3.756755633390458e-06, "loss": 0.94156981, "num_input_tokens_seen": 7144675, "router_z_loss_clip": 3.59960938, "router_z_loss_mlp": 0.48242188, "step": 342, "time_per_iteration": 2.658841848373413 }, { "auxiliary_loss_clip": 0.01555781, "auxiliary_loss_mlp": 0.00505225, "balance_loss_clip": 1.20409107, "balance_loss_mlp": 0.44764715, "epoch": 0.020622275665113484, "flos": 26975305537920.0, "grad_norm": 355.7760497545173, "language_loss": 0.96989119, "learning_rate": 3.7586354951156886e-06, "loss": 0.99050122, "num_input_tokens_seen": 7165505, "router_z_loss_clip": 3.51757812, "router_z_loss_mlp": 0.57519531, "step": 343, "time_per_iteration": 2.7005555629730225 }, { "auxiliary_loss_clip": 0.01548089, "auxiliary_loss_mlp": 0.00499678, "balance_loss_clip": 1.20106864, "balance_loss_mlp": 0.44972908, "epoch": 0.020682398917781453, "flos": 22600704049920.0, "grad_norm": 49.813031470563764, "language_loss": 0.84791827, "learning_rate": 3.7605098841644e-06, "loss": 0.86839592, "num_input_tokens_seen": 7184605, "router_z_loss_clip": 3.47070312, "router_z_loss_mlp": 0.49951172, "step": 344, "time_per_iteration": 2.688504934310913 }, { "auxiliary_loss_clip": 0.01555407, "auxiliary_loss_mlp": 0.00529, "balance_loss_clip": 1.20427155, "balance_loss_mlp": 0.47323391, "epoch": 0.02074252217044942, "flos": 15013605790080.0, "grad_norm": 41.64105873649356, "language_loss": 0.85191011, "learning_rate": 3.7623788323083666e-06, "loss": 0.87275422, "num_input_tokens_seen": 7203065, "router_z_loss_clip": 3.50976562, "router_z_loss_mlp": 0.55786133, "step": 345, "time_per_iteration": 2.8025686740875244 }, { "auxiliary_loss_clip": 0.01531499, "auxiliary_loss_mlp": 0.00493695, "balance_loss_clip": 1.19356287, "balance_loss_mlp": 0.44427055, "epoch": 0.02080264542311739, "flos": 25337958952320.0, "grad_norm": 9.121240579578473, "language_loss": 0.96176076, "learning_rate": 3.7642423710434837e-06, "loss": 0.98201269, "num_input_tokens_seen": 7222995, "router_z_loss_clip": 3.38085938, "router_z_loss_mlp": 0.49438477, "step": 346, "time_per_iteration": 2.728078842163086 }, { "auxiliary_loss_clip": 0.0154885, "auxiliary_loss_mlp": 0.00492269, "balance_loss_clip": 1.20300698, "balance_loss_mlp": 0.43848136, "epoch": 0.02086276867578536, "flos": 24388804016640.0, "grad_norm": 100.64245956440361, "language_loss": 0.89130104, "learning_rate": 3.7661005315929563e-06, "loss": 0.91171217, "num_input_tokens_seen": 7244625, "router_z_loss_clip": 3.45898438, "router_z_loss_mlp": 0.53808594, "step": 347, "time_per_iteration": 2.750784397125244 }, { "auxiliary_loss_clip": 0.01523387, "auxiliary_loss_mlp": 0.00485177, "balance_loss_clip": 1.18697691, "balance_loss_mlp": 0.43458438, "epoch": 0.02092289192845333, "flos": 24462205459200.0, "grad_norm": 31.142157314495158, "language_loss": 0.81494319, "learning_rate": 3.7679533449104354e-06, "loss": 0.83502883, "num_input_tokens_seen": 7263255, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.50561523, "step": 348, "time_per_iteration": 2.719184637069702 }, { "auxiliary_loss_clip": 0.01535608, "auxiliary_loss_mlp": 0.00460714, "balance_loss_clip": 1.18873239, "balance_loss_mlp": 0.40954989, "epoch": 0.0209830151811213, "flos": 17451185523840.0, "grad_norm": 26.211156367775544, "language_loss": 0.87328959, "learning_rate": 3.7698008416831116e-06, "loss": 0.89325279, "num_input_tokens_seen": 7279275, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.51147461, "step": 349, "time_per_iteration": 2.6466236114501953 }, { "auxiliary_loss_clip": 0.01517702, "auxiliary_loss_mlp": 0.00462923, "balance_loss_clip": 1.18648422, "balance_loss_mlp": 0.41664594, "epoch": 0.021043138433789268, "flos": 24573995562240.0, "grad_norm": 11.984612101666762, "language_loss": 0.91260052, "learning_rate": 3.7716430523347664e-06, "loss": 0.93240678, "num_input_tokens_seen": 7300180, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.46240234, "step": 350, "time_per_iteration": 2.699597120285034 }, { "auxiliary_loss_clip": 0.01523505, "auxiliary_loss_mlp": 0.00463708, "balance_loss_clip": 1.18674469, "balance_loss_mlp": 0.41483241, "epoch": 0.021103261686457236, "flos": 24454053072000.0, "grad_norm": 8.956580187560727, "language_loss": 0.86202234, "learning_rate": 3.773480007028776e-06, "loss": 0.88189447, "num_input_tokens_seen": 7317430, "router_z_loss_clip": 3.37304688, "router_z_loss_mlp": 0.48828125, "step": 351, "time_per_iteration": 2.692387819290161 }, { "auxiliary_loss_clip": 0.01513763, "auxiliary_loss_mlp": 0.00426332, "balance_loss_clip": 1.17884672, "balance_loss_mlp": 0.37843406, "epoch": 0.021163384939125205, "flos": 14683083816960.0, "grad_norm": 35.0760203666088, "language_loss": 0.9445309, "learning_rate": 3.775311735671078e-06, "loss": 0.96393192, "num_input_tokens_seen": 7334875, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.47900391, "step": 352, "time_per_iteration": 2.7070248126983643 }, { "auxiliary_loss_clip": 0.01523317, "auxiliary_loss_mlp": 0.0041904, "balance_loss_clip": 1.18905723, "balance_loss_mlp": 0.37056965, "epoch": 0.021223508191793177, "flos": 24493195918080.0, "grad_norm": 31.76226518990824, "language_loss": 0.89309633, "learning_rate": 3.7771382679130878e-06, "loss": 0.91251987, "num_input_tokens_seen": 7355185, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.48461914, "step": 353, "time_per_iteration": 2.72784686088562 }, { "auxiliary_loss_clip": 0.01521784, "auxiliary_loss_mlp": 0.00380775, "balance_loss_clip": 1.18873739, "balance_loss_mlp": 0.3322331, "epoch": 0.021283631444461146, "flos": 24126978804480.0, "grad_norm": 9.925819161780565, "language_loss": 0.88594818, "learning_rate": 3.7789596331545845e-06, "loss": 0.9049738, "num_input_tokens_seen": 7374425, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.48510742, "step": 354, "time_per_iteration": 2.7640459537506104 }, { "auxiliary_loss_clip": 0.01513916, "auxiliary_loss_mlp": 0.00371801, "balance_loss_clip": 1.17788553, "balance_loss_mlp": 0.32271087, "epoch": 0.021343754697129114, "flos": 25192233475200.0, "grad_norm": 29.738870651972434, "language_loss": 0.89208066, "learning_rate": 3.780775860546545e-06, "loss": 0.91093791, "num_input_tokens_seen": 7394175, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.49047852, "step": 355, "time_per_iteration": 2.677489757537842 }, { "auxiliary_loss_clip": 0.01529443, "auxiliary_loss_mlp": 0.00328227, "balance_loss_clip": 1.19072616, "balance_loss_mlp": 0.2762281, "epoch": 0.021403877949797083, "flos": 17274182279040.0, "grad_norm": 98.02544639954667, "language_loss": 0.97622085, "learning_rate": 3.7825869789939474e-06, "loss": 0.99479759, "num_input_tokens_seen": 7412645, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.52001953, "step": 356, "time_per_iteration": 4.057261943817139 }, { "auxiliary_loss_clip": 0.01509145, "auxiliary_loss_mlp": 0.00339883, "balance_loss_clip": 1.17671204, "balance_loss_mlp": 0.2927236, "epoch": 0.021464001202465055, "flos": 30917435276160.0, "grad_norm": 7.057764249280363, "language_loss": 0.88654959, "learning_rate": 3.784393017158528e-06, "loss": 0.90503991, "num_input_tokens_seen": 7432275, "router_z_loss_clip": 3.32421875, "router_z_loss_mlp": 0.47192383, "step": 357, "time_per_iteration": 4.111879348754883 }, { "auxiliary_loss_clip": 0.0151194, "auxiliary_loss_mlp": 0.00317522, "balance_loss_clip": 1.18188429, "balance_loss_mlp": 0.27246064, "epoch": 0.021524124455133024, "flos": 18186385098240.0, "grad_norm": 60.287967632089305, "language_loss": 0.86501372, "learning_rate": 3.786194003461506e-06, "loss": 0.88330829, "num_input_tokens_seen": 7450245, "router_z_loss_clip": 3.29882812, "router_z_loss_mlp": 0.45092773, "step": 358, "time_per_iteration": 2.629321575164795 }, { "auxiliary_loss_clip": 0.01509169, "auxiliary_loss_mlp": 0.00334755, "balance_loss_clip": 1.17580867, "balance_loss_mlp": 0.28547344, "epoch": 0.021584247707800992, "flos": 13805786039040.0, "grad_norm": 14.208377939748068, "language_loss": 0.98343897, "learning_rate": 3.787989966086264e-06, "loss": 1.00187826, "num_input_tokens_seen": 7466845, "router_z_loss_clip": 3.33789062, "router_z_loss_mlp": 0.4921875, "step": 359, "time_per_iteration": 2.6666781902313232 }, { "auxiliary_loss_clip": 0.01527921, "auxiliary_loss_mlp": 0.00339357, "balance_loss_clip": 1.18625307, "balance_loss_mlp": 0.29238832, "epoch": 0.02164437096046896, "flos": 23294713703040.0, "grad_norm": 45.84299647618601, "language_loss": 0.87531233, "learning_rate": 3.789780932980997e-06, "loss": 0.89398509, "num_input_tokens_seen": 7485450, "router_z_loss_clip": 3.41796875, "router_z_loss_mlp": 0.4699707, "step": 360, "time_per_iteration": 2.643038749694824 }, { "auxiliary_loss_clip": 0.01429652, "auxiliary_loss_mlp": 0.00214293, "balance_loss_clip": 1.1649189, "balance_loss_mlp": 0.20017846, "epoch": 0.02170449421313693, "flos": 68899578341760.0, "grad_norm": 0.9529653064056531, "language_loss": 0.64688778, "learning_rate": 3.79156693186132e-06, "loss": 0.66332722, "num_input_tokens_seen": 7553780, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.14160156, "step": 361, "time_per_iteration": 3.26251482963562 }, { "auxiliary_loss_clip": 0.01509172, "auxiliary_loss_mlp": 0.00316629, "balance_loss_clip": 1.17540598, "balance_loss_mlp": 0.26830164, "epoch": 0.0217646174658049, "flos": 25228539146880.0, "grad_norm": 613.6648658632408, "language_loss": 0.9185428, "learning_rate": 3.7933479902128433e-06, "loss": 0.93680078, "num_input_tokens_seen": 7574155, "router_z_loss_clip": 3.33789062, "router_z_loss_mlp": 0.48291016, "step": 362, "time_per_iteration": 2.7215042114257812 }, { "auxiliary_loss_clip": 0.01511226, "auxiliary_loss_mlp": 0.00334452, "balance_loss_clip": 1.17727757, "balance_loss_mlp": 0.28893808, "epoch": 0.02182474071847287, "flos": 22893124671360.0, "grad_norm": 24.13255144442568, "language_loss": 1.0062834, "learning_rate": 3.7951241352937077e-06, "loss": 1.0247401, "num_input_tokens_seen": 7592320, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.45507812, "step": 363, "time_per_iteration": 2.707303285598755 }, { "auxiliary_loss_clip": 0.01520518, "auxiliary_loss_mlp": 0.00311269, "balance_loss_clip": 1.19119895, "balance_loss_mlp": 0.26470545, "epoch": 0.02188486397114084, "flos": 23658991482240.0, "grad_norm": 88.32388624868497, "language_loss": 0.98755634, "learning_rate": 3.7968953941370915e-06, "loss": 1.00587428, "num_input_tokens_seen": 7611185, "router_z_loss_clip": 3.29492188, "router_z_loss_mlp": 0.46557617, "step": 364, "time_per_iteration": 2.67305064201355 }, { "auxiliary_loss_clip": 0.01515798, "auxiliary_loss_mlp": 0.00333592, "balance_loss_clip": 1.18681383, "balance_loss_mlp": 0.28447765, "epoch": 0.021944987223808807, "flos": 21543637680000.0, "grad_norm": 21.01746795026877, "language_loss": 0.8771807, "learning_rate": 3.798661793553676e-06, "loss": 0.89567459, "num_input_tokens_seen": 7631970, "router_z_loss_clip": 3.29492188, "router_z_loss_mlp": 0.49145508, "step": 365, "time_per_iteration": 2.6399991512298584 }, { "auxiliary_loss_clip": 0.01502205, "auxiliary_loss_mlp": 0.00322314, "balance_loss_clip": 1.18171906, "balance_loss_mlp": 0.27765781, "epoch": 0.022005110476476776, "flos": 16070887641600.0, "grad_norm": 38.780389958421054, "language_loss": 0.90803719, "learning_rate": 3.8004233601340808e-06, "loss": 0.92628235, "num_input_tokens_seen": 7649745, "router_z_loss_clip": 3.20507812, "router_z_loss_mlp": 0.4465332, "step": 366, "time_per_iteration": 2.6642067432403564 }, { "auxiliary_loss_clip": 0.01535178, "auxiliary_loss_mlp": 0.00349082, "balance_loss_clip": 1.19863892, "balance_loss_mlp": 0.30285305, "epoch": 0.022065233729144748, "flos": 21433715084160.0, "grad_norm": 38.01086816308675, "language_loss": 0.96822864, "learning_rate": 3.8021801202512694e-06, "loss": 0.98707128, "num_input_tokens_seen": 7668830, "router_z_loss_clip": 3.3671875, "router_z_loss_mlp": 0.4621582, "step": 367, "time_per_iteration": 2.6914007663726807 }, { "auxiliary_loss_clip": 0.01533183, "auxiliary_loss_mlp": 0.00362311, "balance_loss_clip": 1.19617915, "balance_loss_mlp": 0.31343508, "epoch": 0.022125356981812717, "flos": 21543709507200.0, "grad_norm": 43.225859883623606, "language_loss": 0.9389894, "learning_rate": 3.803932100062912e-06, "loss": 0.95794439, "num_input_tokens_seen": 7687240, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.48852539, "step": 368, "time_per_iteration": 2.7055225372314453 }, { "auxiliary_loss_clip": 0.01544261, "auxiliary_loss_mlp": 0.00406056, "balance_loss_clip": 1.19792843, "balance_loss_mlp": 0.35386607, "epoch": 0.022185480234480685, "flos": 20704153944960.0, "grad_norm": 75.62107275637335, "language_loss": 0.89077866, "learning_rate": 3.8056793255137264e-06, "loss": 0.91028184, "num_input_tokens_seen": 7704440, "router_z_loss_clip": 3.46289062, "router_z_loss_mlp": 0.52172852, "step": 369, "time_per_iteration": 2.6808202266693115 }, { "auxiliary_loss_clip": 0.0151256, "auxiliary_loss_mlp": 0.00333646, "balance_loss_clip": 1.18819463, "balance_loss_mlp": 0.28672522, "epoch": 0.022245603487148654, "flos": 25193203142400.0, "grad_norm": 386.12195641403605, "language_loss": 0.92814636, "learning_rate": 3.8074218223377844e-06, "loss": 0.94660842, "num_input_tokens_seen": 7727160, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.46948242, "step": 370, "time_per_iteration": 2.7185535430908203 }, { "auxiliary_loss_clip": 0.01517399, "auxiliary_loss_mlp": 0.00363723, "balance_loss_clip": 1.18639851, "balance_loss_mlp": 0.3199493, "epoch": 0.022305726739816623, "flos": 21395936954880.0, "grad_norm": 18.86696379815136, "language_loss": 0.89356893, "learning_rate": 3.8091596160607834e-06, "loss": 0.91238016, "num_input_tokens_seen": 7747730, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.43774414, "step": 371, "time_per_iteration": 2.6834590435028076 }, { "auxiliary_loss_clip": 0.01527466, "auxiliary_loss_mlp": 0.00353966, "balance_loss_clip": 1.19729149, "balance_loss_mlp": 0.30706865, "epoch": 0.022365849992484595, "flos": 22492146170880.0, "grad_norm": 7334.19949890276, "language_loss": 0.92881465, "learning_rate": 3.8108927320022896e-06, "loss": 0.94762897, "num_input_tokens_seen": 7766765, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.46923828, "step": 372, "time_per_iteration": 2.6618363857269287 }, { "auxiliary_loss_clip": 0.01514259, "auxiliary_loss_mlp": 0.00317182, "balance_loss_clip": 1.19431376, "balance_loss_mlp": 0.27305084, "epoch": 0.022425973245152563, "flos": 17856581397120.0, "grad_norm": 42.379007433544956, "language_loss": 0.90764785, "learning_rate": 3.8126211952779548e-06, "loss": 0.92596233, "num_input_tokens_seen": 7784010, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.44116211, "step": 373, "time_per_iteration": 3.0775949954986572 }, { "auxiliary_loss_clip": 0.01521169, "auxiliary_loss_mlp": 0.00319969, "balance_loss_clip": 1.19598556, "balance_loss_mlp": 0.27476451, "epoch": 0.022486096497820532, "flos": 15483029656320.0, "grad_norm": 12.264853926991645, "language_loss": 0.90925711, "learning_rate": 3.8143450308016952e-06, "loss": 0.92766851, "num_input_tokens_seen": 7801305, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.45214844, "step": 374, "time_per_iteration": 2.8823177814483643 }, { "auxiliary_loss_clip": 0.0151253, "auxiliary_loss_mlp": 0.00329569, "balance_loss_clip": 1.19073188, "balance_loss_mlp": 0.28319687, "epoch": 0.0225462197504885, "flos": 27784157950080.0, "grad_norm": 25.789442353711863, "language_loss": 0.9232924, "learning_rate": 3.8160642632878525e-06, "loss": 0.94171345, "num_input_tokens_seen": 7823965, "router_z_loss_clip": 3.21484375, "router_z_loss_mlp": 0.46362305, "step": 375, "time_per_iteration": 2.708775281906128 }, { "auxiliary_loss_clip": 0.01538233, "auxiliary_loss_mlp": 0.00341524, "balance_loss_clip": 1.21063495, "balance_loss_mlp": 0.2903598, "epoch": 0.02260634300315647, "flos": 19975490645760.0, "grad_norm": 24.57225782089914, "language_loss": 0.96388578, "learning_rate": 3.817778917253314e-06, "loss": 0.98268342, "num_input_tokens_seen": 7842115, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.51196289, "step": 376, "time_per_iteration": 2.6269731521606445 }, { "auxiliary_loss_clip": 0.01549991, "auxiliary_loss_mlp": 0.00361838, "balance_loss_clip": 1.21508563, "balance_loss_mlp": 0.31758755, "epoch": 0.02266646625582444, "flos": 16028189349120.0, "grad_norm": 38.46674291964797, "language_loss": 0.85004634, "learning_rate": 3.8194890170196155e-06, "loss": 0.86916459, "num_input_tokens_seen": 7857830, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.44238281, "step": 377, "time_per_iteration": 2.6011924743652344 }, { "auxiliary_loss_clip": 0.01543211, "auxiliary_loss_mlp": 0.00326384, "balance_loss_clip": 1.21542931, "balance_loss_mlp": 0.28323036, "epoch": 0.02272658950849241, "flos": 20404622430720.0, "grad_norm": 39.127833122263, "language_loss": 1.07027102, "learning_rate": 3.8211945867150055e-06, "loss": 1.08896685, "num_input_tokens_seen": 7875840, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.43164062, "step": 378, "time_per_iteration": 2.6778993606567383 }, { "auxiliary_loss_clip": 0.01360279, "auxiliary_loss_mlp": 0.00309604, "balance_loss_clip": 1.1094451, "balance_loss_mlp": 0.29453617, "epoch": 0.02278671276116038, "flos": 69847332647040.0, "grad_norm": 1.1115497447768135, "language_loss": 0.75309598, "learning_rate": 3.822895650276492e-06, "loss": 0.76979476, "num_input_tokens_seen": 7940190, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.15039062, "step": 379, "time_per_iteration": 3.212510585784912 }, { "auxiliary_loss_clip": 0.01575326, "auxiliary_loss_mlp": 0.00360175, "balance_loss_clip": 1.22719979, "balance_loss_mlp": 0.3130154, "epoch": 0.022846836013828347, "flos": 38508771340800.0, "grad_norm": 5886.55427360622, "language_loss": 0.86944425, "learning_rate": 3.824592231451859e-06, "loss": 0.88879931, "num_input_tokens_seen": 7960840, "router_z_loss_clip": 3.484375, "router_z_loss_mlp": 0.47192383, "step": 380, "time_per_iteration": 2.7725703716278076 }, { "auxiliary_loss_clip": 0.01570244, "auxiliary_loss_mlp": 0.0033501, "balance_loss_clip": 1.23151445, "balance_loss_mlp": 0.28985393, "epoch": 0.02290695926649632, "flos": 20959478795520.0, "grad_norm": 517.2507037819446, "language_loss": 1.06674385, "learning_rate": 3.826284353801652e-06, "loss": 1.08579659, "num_input_tokens_seen": 7975500, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.4519043, "step": 381, "time_per_iteration": 2.6238043308258057 }, { "auxiliary_loss_clip": 0.01562872, "auxiliary_loss_mlp": 0.0034918, "balance_loss_clip": 1.22451746, "balance_loss_mlp": 0.30323696, "epoch": 0.022967082519164288, "flos": 24022407335040.0, "grad_norm": 19.10157888637605, "language_loss": 0.95642018, "learning_rate": 3.827972040701142e-06, "loss": 0.9755407, "num_input_tokens_seen": 7993880, "router_z_loss_clip": 3.3828125, "router_z_loss_mlp": 0.4597168, "step": 382, "time_per_iteration": 2.6926281452178955 }, { "auxiliary_loss_clip": 0.01541195, "auxiliary_loss_mlp": 0.00345423, "balance_loss_clip": 1.21146035, "balance_loss_mlp": 0.30169746, "epoch": 0.023027205771832256, "flos": 20997149184000.0, "grad_norm": 15.047680114401052, "language_loss": 0.94004655, "learning_rate": 3.829655315342268e-06, "loss": 0.95891273, "num_input_tokens_seen": 8012730, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.4375, "step": 383, "time_per_iteration": 2.66292405128479 }, { "auxiliary_loss_clip": 0.01509651, "auxiliary_loss_mlp": 0.00341772, "balance_loss_clip": 1.18782377, "balance_loss_mlp": 0.29971534, "epoch": 0.023087329024500225, "flos": 21360816432000.0, "grad_norm": 41.04131544865794, "language_loss": 0.93791032, "learning_rate": 3.831334200735543e-06, "loss": 0.95642447, "num_input_tokens_seen": 8031275, "router_z_loss_clip": 3.21484375, "router_z_loss_mlp": 0.42041016, "step": 384, "time_per_iteration": 2.729787588119507 }, { "auxiliary_loss_clip": 0.0150381, "auxiliary_loss_mlp": 0.00321713, "balance_loss_clip": 1.1899091, "balance_loss_mlp": 0.28089529, "epoch": 0.023147452277168194, "flos": 21872435800320.0, "grad_norm": 10.47499622636741, "language_loss": 0.96072507, "learning_rate": 3.8330087197119426e-06, "loss": 0.9789803, "num_input_tokens_seen": 8051600, "router_z_loss_clip": 3.13867188, "router_z_loss_mlp": 0.40844727, "step": 385, "time_per_iteration": 2.6384949684143066 }, { "auxiliary_loss_clip": 0.01498257, "auxiliary_loss_mlp": 0.00337882, "balance_loss_clip": 1.1809392, "balance_loss_mlp": 0.29773244, "epoch": 0.023207575529836166, "flos": 18916700423040.0, "grad_norm": 59.67904665395745, "language_loss": 0.76697737, "learning_rate": 3.83467889492477e-06, "loss": 0.78533876, "num_input_tokens_seen": 8070600, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.40136719, "step": 386, "time_per_iteration": 2.6650912761688232 }, { "auxiliary_loss_clip": 0.01492622, "auxiliary_loss_mlp": 0.0036505, "balance_loss_clip": 1.16879439, "balance_loss_mlp": 0.32406634, "epoch": 0.023267698782504134, "flos": 25046005207680.0, "grad_norm": 15.321456637892089, "language_loss": 0.94538677, "learning_rate": 3.836344748851495e-06, "loss": 0.96396351, "num_input_tokens_seen": 8090680, "router_z_loss_clip": 3.24023438, "router_z_loss_mlp": 0.40991211, "step": 387, "time_per_iteration": 2.7699034214019775 }, { "auxiliary_loss_clip": 0.01488843, "auxiliary_loss_mlp": 0.00353733, "balance_loss_clip": 1.16618443, "balance_loss_mlp": 0.31093735, "epoch": 0.023327822035172103, "flos": 28879217930880.0, "grad_norm": 167.72664415642237, "language_loss": 0.90978992, "learning_rate": 3.838006303795566e-06, "loss": 0.92821574, "num_input_tokens_seen": 8114610, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.42773438, "step": 388, "time_per_iteration": 2.7591171264648438 }, { "auxiliary_loss_clip": 0.01488672, "auxiliary_loss_mlp": 0.00386302, "balance_loss_clip": 1.1630466, "balance_loss_mlp": 0.34457856, "epoch": 0.02338794528784007, "flos": 27121533805440.0, "grad_norm": 122.31283401229604, "language_loss": 1.03253531, "learning_rate": 3.839663581888206e-06, "loss": 1.05128515, "num_input_tokens_seen": 8133975, "router_z_loss_clip": 3.2578125, "router_z_loss_mlp": 0.41772461, "step": 389, "time_per_iteration": 2.708311080932617 }, { "auxiliary_loss_clip": 0.0146032, "auxiliary_loss_mlp": 0.00322361, "balance_loss_clip": 1.15428615, "balance_loss_mlp": 0.28290242, "epoch": 0.02344806854050804, "flos": 21322355944320.0, "grad_norm": 77.0883118132372, "language_loss": 0.96493512, "learning_rate": 3.841316605090178e-06, "loss": 0.98276198, "num_input_tokens_seen": 8153570, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.39453125, "step": 390, "time_per_iteration": 2.702151298522949 }, { "auxiliary_loss_clip": 0.01462422, "auxiliary_loss_mlp": 0.00365464, "balance_loss_clip": 1.15242827, "balance_loss_mlp": 0.32564798, "epoch": 0.023508191793176012, "flos": 24789997998720.0, "grad_norm": 10.148867398654644, "language_loss": 1.01097083, "learning_rate": 3.842965395193529e-06, "loss": 1.02924967, "num_input_tokens_seen": 8170075, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.39868164, "step": 391, "time_per_iteration": 2.683629035949707 }, { "auxiliary_loss_clip": 0.01465617, "auxiliary_loss_mlp": 0.00357577, "balance_loss_clip": 1.15055084, "balance_loss_mlp": 0.31821409, "epoch": 0.02356831504584398, "flos": 25995375624960.0, "grad_norm": 62.51213740886254, "language_loss": 0.94825232, "learning_rate": 3.84460997382332e-06, "loss": 0.96648425, "num_input_tokens_seen": 8190420, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.39379883, "step": 392, "time_per_iteration": 2.6983795166015625 }, { "auxiliary_loss_clip": 0.01436542, "auxiliary_loss_mlp": 0.00344158, "balance_loss_clip": 1.14328551, "balance_loss_mlp": 0.30617833, "epoch": 0.02362843829851195, "flos": 19062461813760.0, "grad_norm": 247.14878208640872, "language_loss": 0.98110199, "learning_rate": 3.8462503624393256e-06, "loss": 0.99890906, "num_input_tokens_seen": 8208790, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.37988281, "step": 393, "time_per_iteration": 2.6159582138061523 }, { "auxiliary_loss_clip": 0.01453291, "auxiliary_loss_mlp": 0.00359151, "balance_loss_clip": 1.15109539, "balance_loss_mlp": 0.32143313, "epoch": 0.023688561551179918, "flos": 16071031296000.0, "grad_norm": 43.41067005792844, "language_loss": 0.89193499, "learning_rate": 3.84788658233771e-06, "loss": 0.91005933, "num_input_tokens_seen": 8226885, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.37719727, "step": 394, "time_per_iteration": 2.6035096645355225 }, { "auxiliary_loss_clip": 0.01440423, "auxiliary_loss_mlp": 0.00344552, "balance_loss_clip": 1.13887262, "balance_loss_mlp": 0.30490261, "epoch": 0.023748684803847887, "flos": 21724375939200.0, "grad_norm": 192.5227778711521, "language_loss": 0.93150342, "learning_rate": 3.84951865465269e-06, "loss": 0.94935322, "num_input_tokens_seen": 8246825, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.39672852, "step": 395, "time_per_iteration": 2.679187774658203 }, { "auxiliary_loss_clip": 0.01305619, "auxiliary_loss_mlp": 0.00313634, "balance_loss_clip": 1.06917858, "balance_loss_mlp": 0.29856643, "epoch": 0.02380880805651586, "flos": 61926192881280.0, "grad_norm": 1.4795646620529492, "language_loss": 0.63490987, "learning_rate": 3.851146600358172e-06, "loss": 0.65110242, "num_input_tokens_seen": 8302835, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.15039062, "step": 396, "time_per_iteration": 3.0325188636779785 }, { "auxiliary_loss_clip": 0.01441148, "auxiliary_loss_mlp": 0.00343902, "balance_loss_clip": 1.13871348, "balance_loss_mlp": 0.30630368, "epoch": 0.023868931309183827, "flos": 20266331068800.0, "grad_norm": 23.64073367094155, "language_loss": 0.95396185, "learning_rate": 3.852770440269372e-06, "loss": 0.97181237, "num_input_tokens_seen": 8320745, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.37597656, "step": 397, "time_per_iteration": 2.6354172229766846 }, { "auxiliary_loss_clip": 0.01454733, "auxiliary_loss_mlp": 0.00381303, "balance_loss_clip": 1.14698672, "balance_loss_mlp": 0.34174931, "epoch": 0.023929054561851796, "flos": 21139103733120.0, "grad_norm": 54.2604897699147, "language_loss": 0.95084643, "learning_rate": 3.854390195044404e-06, "loss": 0.96920675, "num_input_tokens_seen": 8339540, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.39575195, "step": 398, "time_per_iteration": 4.0485146045684814 }, { "auxiliary_loss_clip": 0.01452601, "auxiliary_loss_mlp": 0.00386179, "balance_loss_clip": 1.1419481, "balance_loss_mlp": 0.34400269, "epoch": 0.023989177814519765, "flos": 13698521049600.0, "grad_norm": 16.771937191662516, "language_loss": 0.98184246, "learning_rate": 3.856005885185868e-06, "loss": 1.00023031, "num_input_tokens_seen": 8354890, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.42163086, "step": 399, "time_per_iteration": 5.570476055145264 }, { "auxiliary_loss_clip": 0.01426035, "auxiliary_loss_mlp": 0.00319738, "balance_loss_clip": 1.13257658, "balance_loss_mlp": 0.28142446, "epoch": 0.024049301067187733, "flos": 26322018929280.0, "grad_norm": 10.568282220440201, "language_loss": 0.93230581, "learning_rate": 3.857617531042398e-06, "loss": 0.94976354, "num_input_tokens_seen": 8375845, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.38330078, "step": 400, "time_per_iteration": 2.676722764968872 }, { "auxiliary_loss_clip": 0.01431544, "auxiliary_loss_mlp": 0.00318261, "balance_loss_clip": 1.13156176, "balance_loss_mlp": 0.28078112, "epoch": 0.024109424319855705, "flos": 24425432910720.0, "grad_norm": 24.00082735376984, "language_loss": 0.86950696, "learning_rate": 3.8592251528102065e-06, "loss": 0.88700497, "num_input_tokens_seen": 8395240, "router_z_loss_clip": 3.00195312, "router_z_loss_mlp": 0.375, "step": 401, "time_per_iteration": 2.6696531772613525 }, { "auxiliary_loss_clip": 0.01441826, "auxiliary_loss_mlp": 0.00322226, "balance_loss_clip": 1.13702631, "balance_loss_mlp": 0.28198087, "epoch": 0.024169547572523674, "flos": 29604397610880.0, "grad_norm": 3.5523075894526444, "language_loss": 0.9022274, "learning_rate": 3.8608287705345976e-06, "loss": 0.91986793, "num_input_tokens_seen": 8416950, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.40258789, "step": 402, "time_per_iteration": 2.6798934936523438 }, { "auxiliary_loss_clip": 0.01476831, "auxiliary_loss_mlp": 0.0039759, "balance_loss_clip": 1.14419782, "balance_loss_mlp": 0.35171807, "epoch": 0.024229670825191642, "flos": 22601458235520.0, "grad_norm": 20.24102196538997, "language_loss": 1.04564536, "learning_rate": 3.86242840411147e-06, "loss": 1.06438947, "num_input_tokens_seen": 8433660, "router_z_loss_clip": 3.32617188, "router_z_loss_mlp": 0.45849609, "step": 403, "time_per_iteration": 2.6480603218078613 }, { "auxiliary_loss_clip": 0.01468691, "auxiliary_loss_mlp": 0.00362143, "balance_loss_clip": 1.14455867, "balance_loss_mlp": 0.31829736, "epoch": 0.02428979407785961, "flos": 18150258994560.0, "grad_norm": 124.81872321059176, "language_loss": 1.07451332, "learning_rate": 3.864024073288798e-06, "loss": 1.0928216, "num_input_tokens_seen": 8450180, "router_z_loss_clip": 3.24023438, "router_z_loss_mlp": 0.43823242, "step": 404, "time_per_iteration": 2.6177539825439453 }, { "auxiliary_loss_clip": 0.01497079, "auxiliary_loss_mlp": 0.00383814, "balance_loss_clip": 1.16568434, "balance_loss_mlp": 0.33882481, "epoch": 0.024349917330527583, "flos": 15304984917120.0, "grad_norm": 11.914263017202467, "language_loss": 0.98148525, "learning_rate": 3.865615797668091e-06, "loss": 1.00029421, "num_input_tokens_seen": 8467775, "router_z_loss_clip": 3.31640625, "router_z_loss_mlp": 0.44995117, "step": 405, "time_per_iteration": 2.6843278408050537 }, { "auxiliary_loss_clip": 0.01567696, "auxiliary_loss_mlp": 0.00429368, "balance_loss_clip": 1.20863152, "balance_loss_mlp": 0.38144609, "epoch": 0.024410040583195552, "flos": 20773892200320.0, "grad_norm": 24.611002263020453, "language_loss": 1.01854444, "learning_rate": 3.867203596705844e-06, "loss": 1.03851509, "num_input_tokens_seen": 8486765, "router_z_loss_clip": 3.59179688, "router_z_loss_mlp": 0.47900391, "step": 406, "time_per_iteration": 2.648547410964966 }, { "auxiliary_loss_clip": 0.01576812, "auxiliary_loss_mlp": 0.00427859, "balance_loss_clip": 1.21369028, "balance_loss_mlp": 0.37822077, "epoch": 0.02447016383586352, "flos": 21798854789760.0, "grad_norm": 53.63628982535537, "language_loss": 0.95902008, "learning_rate": 3.86878748971496e-06, "loss": 0.97906685, "num_input_tokens_seen": 8506515, "router_z_loss_clip": 3.63085938, "router_z_loss_mlp": 0.49658203, "step": 407, "time_per_iteration": 2.650451183319092 }, { "auxiliary_loss_clip": 0.01601904, "auxiliary_loss_mlp": 0.00436098, "balance_loss_clip": 1.22742486, "balance_loss_mlp": 0.39106107, "epoch": 0.02453028708853149, "flos": 33948116380800.0, "grad_norm": 13.620053661620686, "language_loss": 0.80098253, "learning_rate": 3.8703674958661596e-06, "loss": 0.82136256, "num_input_tokens_seen": 8528035, "router_z_loss_clip": 3.74609375, "router_z_loss_mlp": 0.45019531, "step": 408, "time_per_iteration": 2.7871968746185303 }, { "auxiliary_loss_clip": 0.01658732, "auxiliary_loss_mlp": 0.00453093, "balance_loss_clip": 1.25224447, "balance_loss_mlp": 0.40285772, "epoch": 0.024590410341199458, "flos": 21793000872960.0, "grad_norm": 8.531554986303604, "language_loss": 1.01906967, "learning_rate": 3.871943634189376e-06, "loss": 1.04018784, "num_input_tokens_seen": 8546455, "router_z_loss_clip": 4.06640625, "router_z_loss_mlp": 0.50268555, "step": 409, "time_per_iteration": 2.6576178073883057 }, { "auxiliary_loss_clip": 0.01692906, "auxiliary_loss_mlp": 0.00460758, "balance_loss_clip": 1.26603675, "balance_loss_mlp": 0.41214493, "epoch": 0.02465053359386743, "flos": 35114782124160.0, "grad_norm": 393.06278710061054, "language_loss": 0.89926088, "learning_rate": 3.873515923575128e-06, "loss": 0.92079759, "num_input_tokens_seen": 8568450, "router_z_loss_clip": 4.27734375, "router_z_loss_mlp": 0.48608398, "step": 410, "time_per_iteration": 2.7898566722869873 }, { "auxiliary_loss_clip": 0.01751991, "auxiliary_loss_mlp": 0.00478314, "balance_loss_clip": 1.29831076, "balance_loss_mlp": 0.42681584, "epoch": 0.0247106568465354, "flos": 27451409333760.0, "grad_norm": 48.13319885688254, "language_loss": 0.85088003, "learning_rate": 3.875084382775879e-06, "loss": 0.87318313, "num_input_tokens_seen": 8589340, "router_z_loss_clip": 4.54296875, "router_z_loss_mlp": 0.51586914, "step": 411, "time_per_iteration": 2.679041862487793 }, { "auxiliary_loss_clip": 0.0175643, "auxiliary_loss_mlp": 0.00468497, "balance_loss_clip": 1.29576516, "balance_loss_mlp": 0.41485265, "epoch": 0.024770780099203367, "flos": 20703794808960.0, "grad_norm": 169.38057031166306, "language_loss": 0.96551132, "learning_rate": 3.87664903040738e-06, "loss": 0.9877606, "num_input_tokens_seen": 8607150, "router_z_loss_clip": 4.609375, "router_z_loss_mlp": 0.53637695, "step": 412, "time_per_iteration": 2.7272403240203857 }, { "auxiliary_loss_clip": 0.01536049, "auxiliary_loss_mlp": 0.00260981, "balance_loss_clip": 1.20265138, "balance_loss_mlp": 0.24658008, "epoch": 0.024830903351871336, "flos": 69551859369600.0, "grad_norm": 0.8399841194984358, "language_loss": 0.58072162, "learning_rate": 3.878209884949994e-06, "loss": 0.59869188, "num_input_tokens_seen": 8669865, "router_z_loss_clip": 3.328125, "router_z_loss_mlp": 0.14355469, "step": 413, "time_per_iteration": 3.256260633468628 }, { "auxiliary_loss_clip": 0.01801652, "auxiliary_loss_mlp": 0.00468195, "balance_loss_clip": 1.30706239, "balance_loss_mlp": 0.41047412, "epoch": 0.024891026604539304, "flos": 32270477713920.0, "grad_norm": 15.593118246125881, "language_loss": 0.87347746, "learning_rate": 3.879766964750006e-06, "loss": 0.89617592, "num_input_tokens_seen": 8690235, "router_z_loss_clip": 4.94921875, "router_z_loss_mlp": 0.5769043, "step": 414, "time_per_iteration": 2.769810676574707 }, { "auxiliary_loss_clip": 0.01826005, "auxiliary_loss_mlp": 0.00453289, "balance_loss_clip": 1.32519507, "balance_loss_mlp": 0.40188599, "epoch": 0.024951149857207276, "flos": 18840282238080.0, "grad_norm": 31.937992567950268, "language_loss": 0.88616091, "learning_rate": 3.881320288020917e-06, "loss": 0.90895385, "num_input_tokens_seen": 8706295, "router_z_loss_clip": 5.01171875, "router_z_loss_mlp": 0.51391602, "step": 415, "time_per_iteration": 2.6401093006134033 }, { "auxiliary_loss_clip": 0.0187456, "auxiliary_loss_mlp": 0.00468046, "balance_loss_clip": 1.33668232, "balance_loss_mlp": 0.4147594, "epoch": 0.025011273109875245, "flos": 15377201210880.0, "grad_norm": 44.13028506879389, "language_loss": 1.08054113, "learning_rate": 3.882869872844723e-06, "loss": 1.10396719, "num_input_tokens_seen": 8724200, "router_z_loss_clip": 5.375, "router_z_loss_mlp": 0.53320312, "step": 416, "time_per_iteration": 2.7485015392303467 }, { "auxiliary_loss_clip": 0.01886013, "auxiliary_loss_mlp": 0.00512743, "balance_loss_clip": 1.33771777, "balance_loss_mlp": 0.45371011, "epoch": 0.025071396362543213, "flos": 18915515274240.0, "grad_norm": 24.57796699676221, "language_loss": 0.8260203, "learning_rate": 3.884415737173176e-06, "loss": 0.85000789, "num_input_tokens_seen": 8744170, "router_z_loss_clip": 5.48046875, "router_z_loss_mlp": 0.58959961, "step": 417, "time_per_iteration": 2.7337136268615723 }, { "auxiliary_loss_clip": 0.01886415, "auxiliary_loss_mlp": 0.00462626, "balance_loss_clip": 1.35136271, "balance_loss_mlp": 0.40812334, "epoch": 0.025131519615211182, "flos": 25337958952320.0, "grad_norm": 56.44568362393487, "language_loss": 0.84310174, "learning_rate": 3.8859578988290344e-06, "loss": 0.86659217, "num_input_tokens_seen": 8765120, "router_z_loss_clip": 5.3515625, "router_z_loss_mlp": 0.54516602, "step": 418, "time_per_iteration": 2.6680052280426025 }, { "auxiliary_loss_clip": 0.01905592, "auxiliary_loss_mlp": 0.00448095, "balance_loss_clip": 1.35417175, "balance_loss_mlp": 0.39621529, "epoch": 0.02519164286787915, "flos": 18953149749120.0, "grad_norm": 287.0761373730547, "language_loss": 0.93945718, "learning_rate": 3.887496375507294e-06, "loss": 0.96299404, "num_input_tokens_seen": 8783500, "router_z_loss_clip": 5.51171875, "router_z_loss_mlp": 0.51855469, "step": 419, "time_per_iteration": 2.6204447746276855 }, { "auxiliary_loss_clip": 0.01892131, "auxiliary_loss_mlp": 0.00465204, "balance_loss_clip": 1.34680414, "balance_loss_mlp": 0.41201332, "epoch": 0.025251766120547123, "flos": 17421092904960.0, "grad_norm": 9.219796122688148, "language_loss": 0.81337059, "learning_rate": 3.8890311847764065e-06, "loss": 0.83694398, "num_input_tokens_seen": 8801175, "router_z_loss_clip": 5.453125, "router_z_loss_mlp": 0.53271484, "step": 420, "time_per_iteration": 2.6334424018859863 }, { "auxiliary_loss_clip": 0.01915238, "auxiliary_loss_mlp": 0.0046521, "balance_loss_clip": 1.34768367, "balance_loss_mlp": 0.41476095, "epoch": 0.02531188937321509, "flos": 25045430590080.0, "grad_norm": 15.69342414969331, "language_loss": 0.86411136, "learning_rate": 3.890562344079484e-06, "loss": 0.88791585, "num_input_tokens_seen": 8820215, "router_z_loss_clip": 5.66796875, "router_z_loss_mlp": 0.50439453, "step": 421, "time_per_iteration": 2.757634162902832 }, { "auxiliary_loss_clip": 0.01919809, "auxiliary_loss_mlp": 0.0047349, "balance_loss_clip": 1.35392189, "balance_loss_mlp": 0.42161012, "epoch": 0.02537201262588306, "flos": 30592228515840.0, "grad_norm": 79.02758993556142, "language_loss": 0.90216053, "learning_rate": 3.89208987073549e-06, "loss": 0.92609352, "num_input_tokens_seen": 8839660, "router_z_loss_clip": 5.6640625, "router_z_loss_mlp": 0.51806641, "step": 422, "time_per_iteration": 2.725576877593994 }, { "auxiliary_loss_clip": 0.01934553, "auxiliary_loss_mlp": 0.00454766, "balance_loss_clip": 1.35145044, "balance_loss_mlp": 0.4050315, "epoch": 0.02543213587855103, "flos": 26065365275520.0, "grad_norm": 87.63871128429972, "language_loss": 0.89437306, "learning_rate": 3.893613781940409e-06, "loss": 0.91826624, "num_input_tokens_seen": 8859280, "router_z_loss_clip": 5.83984375, "router_z_loss_mlp": 0.49755859, "step": 423, "time_per_iteration": 2.715895414352417 }, { "auxiliary_loss_clip": 0.01953064, "auxiliary_loss_mlp": 0.00486886, "balance_loss_clip": 1.36693192, "balance_loss_mlp": 0.4337188, "epoch": 0.025492259131218997, "flos": 36022818965760.0, "grad_norm": 2921.2046451277474, "language_loss": 0.79795778, "learning_rate": 3.895134094768415e-06, "loss": 0.8223573, "num_input_tokens_seen": 8880560, "router_z_loss_clip": 5.86328125, "router_z_loss_mlp": 0.53173828, "step": 424, "time_per_iteration": 2.753077268600464 }, { "auxiliary_loss_clip": 0.01960533, "auxiliary_loss_mlp": 0.00488344, "balance_loss_clip": 1.36884391, "balance_loss_mlp": 0.43829975, "epoch": 0.02555238238388697, "flos": 18588045957120.0, "grad_norm": 9.224667483097814, "language_loss": 0.91462213, "learning_rate": 3.896650826173015e-06, "loss": 0.93911093, "num_input_tokens_seen": 8899155, "router_z_loss_clip": 5.9140625, "router_z_loss_mlp": 0.50073242, "step": 425, "time_per_iteration": 2.645948648452759 }, { "auxiliary_loss_clip": 0.01958062, "auxiliary_loss_mlp": 0.00490973, "balance_loss_clip": 1.35766709, "balance_loss_mlp": 0.43673301, "epoch": 0.025612505636554938, "flos": 24243186280320.0, "grad_norm": 63.03664372839104, "language_loss": 0.91528869, "learning_rate": 3.898163992988186e-06, "loss": 0.93977904, "num_input_tokens_seen": 8917890, "router_z_loss_clip": 6.00390625, "router_z_loss_mlp": 0.54272461, "step": 426, "time_per_iteration": 2.6336944103240967 }, { "auxiliary_loss_clip": 0.01670869, "auxiliary_loss_mlp": 0.00431966, "balance_loss_clip": 1.29185009, "balance_loss_mlp": 0.41546726, "epoch": 0.025672628889222907, "flos": 60586941265920.0, "grad_norm": 0.90655101929836, "language_loss": 0.56794798, "learning_rate": 3.899673611929491e-06, "loss": 0.58897638, "num_input_tokens_seen": 8978260, "router_z_loss_clip": 3.78125, "router_z_loss_mlp": 0.16503906, "step": 427, "time_per_iteration": 3.297841787338257 }, { "auxiliary_loss_clip": 0.01969685, "auxiliary_loss_mlp": 0.00493751, "balance_loss_clip": 1.3699851, "balance_loss_mlp": 0.44315836, "epoch": 0.025732752141890875, "flos": 19573255169280.0, "grad_norm": 10.304390367867011, "language_loss": 0.95777792, "learning_rate": 3.901179699595194e-06, "loss": 0.98241228, "num_input_tokens_seen": 8994460, "router_z_loss_clip": 6.0, "router_z_loss_mlp": 0.5065918, "step": 428, "time_per_iteration": 2.6635453701019287 }, { "auxiliary_loss_clip": 0.01925572, "auxiliary_loss_mlp": 0.0048904, "balance_loss_clip": 1.35072351, "balance_loss_mlp": 0.43642142, "epoch": 0.025792875394558847, "flos": 31284262920960.0, "grad_norm": 47.04621176664694, "language_loss": 0.91731411, "learning_rate": 3.902682272467353e-06, "loss": 0.94146025, "num_input_tokens_seen": 9016670, "router_z_loss_clip": 5.75, "router_z_loss_mlp": 0.52661133, "step": 429, "time_per_iteration": 2.734067678451538 }, { "auxiliary_loss_clip": 0.01946465, "auxiliary_loss_mlp": 0.00482067, "balance_loss_clip": 1.35330558, "balance_loss_mlp": 0.42868459, "epoch": 0.025852998647226816, "flos": 32379610210560.0, "grad_norm": 36.55793556515902, "language_loss": 0.93248236, "learning_rate": 3.904181346912895e-06, "loss": 0.95676768, "num_input_tokens_seen": 9039720, "router_z_loss_clip": 5.92578125, "router_z_loss_mlp": 0.53442383, "step": 430, "time_per_iteration": 2.752880573272705 }, { "auxiliary_loss_clip": 0.01915667, "auxiliary_loss_mlp": 0.00437971, "balance_loss_clip": 1.35594988, "balance_loss_mlp": 0.38909552, "epoch": 0.025913121899894784, "flos": 20193288762240.0, "grad_norm": 27.348754325705023, "language_loss": 0.89985824, "learning_rate": 3.905676939184698e-06, "loss": 0.92339456, "num_input_tokens_seen": 9059850, "router_z_loss_clip": 5.59375, "router_z_loss_mlp": 0.48852539, "step": 431, "time_per_iteration": 2.6637299060821533 }, { "auxiliary_loss_clip": 0.01938013, "auxiliary_loss_mlp": 0.00469542, "balance_loss_clip": 1.36681557, "balance_loss_mlp": 0.41837755, "epoch": 0.025973245152562753, "flos": 14720430983040.0, "grad_norm": 25.941064600871343, "language_loss": 0.96615708, "learning_rate": 3.907169065422638e-06, "loss": 0.99023259, "num_input_tokens_seen": 9077590, "router_z_loss_clip": 5.71484375, "router_z_loss_mlp": 0.51220703, "step": 432, "time_per_iteration": 2.6887614727020264 }, { "auxiliary_loss_clip": 0.01942939, "auxiliary_loss_mlp": 0.00422268, "balance_loss_clip": 1.37782359, "balance_loss_mlp": 0.37777928, "epoch": 0.02603336840523072, "flos": 30992991534720.0, "grad_norm": 66.27044034611329, "language_loss": 0.83125114, "learning_rate": 3.908657741654636e-06, "loss": 0.85490316, "num_input_tokens_seen": 9099880, "router_z_loss_clip": 5.65234375, "router_z_loss_mlp": 0.4453125, "step": 433, "time_per_iteration": 2.750136137008667 }, { "auxiliary_loss_clip": 0.0193993, "auxiliary_loss_mlp": 0.00451461, "balance_loss_clip": 1.37188828, "balance_loss_mlp": 0.40315735, "epoch": 0.026093491657898694, "flos": 17674262939520.0, "grad_norm": 99.74953088672527, "language_loss": 0.97187757, "learning_rate": 3.910142983797699e-06, "loss": 0.99579149, "num_input_tokens_seen": 9118620, "router_z_loss_clip": 5.6796875, "router_z_loss_mlp": 0.48291016, "step": 434, "time_per_iteration": 2.850921869277954 }, { "auxiliary_loss_clip": 0.01939192, "auxiliary_loss_mlp": 0.00461281, "balance_loss_clip": 1.37358069, "balance_loss_mlp": 0.41307271, "epoch": 0.026153614910566662, "flos": 17857874286720.0, "grad_norm": 731.863262841361, "language_loss": 0.86682081, "learning_rate": 3.9116248076589305e-06, "loss": 0.89082551, "num_input_tokens_seen": 9135655, "router_z_loss_clip": 5.66015625, "router_z_loss_mlp": 0.48266602, "step": 435, "time_per_iteration": 2.7048747539520264 }, { "auxiliary_loss_clip": 0.01951588, "auxiliary_loss_mlp": 0.00486545, "balance_loss_clip": 1.37643385, "balance_loss_mlp": 0.43275762, "epoch": 0.02621373816323463, "flos": 20011113959040.0, "grad_norm": 55.174905445967404, "language_loss": 0.93517882, "learning_rate": 3.913103228936546e-06, "loss": 0.95956016, "num_input_tokens_seen": 9153520, "router_z_loss_clip": 5.74609375, "router_z_loss_mlp": 0.5378418, "step": 436, "time_per_iteration": 2.700579881668091 }, { "auxiliary_loss_clip": 0.01946465, "auxiliary_loss_mlp": 0.00456365, "balance_loss_clip": 1.38551855, "balance_loss_mlp": 0.40691659, "epoch": 0.0262738614159026, "flos": 19281193683840.0, "grad_norm": 19.81804861277507, "language_loss": 0.82856333, "learning_rate": 3.914578263220868e-06, "loss": 0.85259157, "num_input_tokens_seen": 9170750, "router_z_loss_clip": 5.609375, "router_z_loss_mlp": 0.49462891, "step": 437, "time_per_iteration": 2.6619081497192383 }, { "auxiliary_loss_clip": 0.01938884, "auxiliary_loss_mlp": 0.00480396, "balance_loss_clip": 1.37848711, "balance_loss_mlp": 0.43099555, "epoch": 0.026333984668570568, "flos": 18807208790400.0, "grad_norm": 15.902963620949732, "language_loss": 0.99318653, "learning_rate": 3.916049925995316e-06, "loss": 1.01737928, "num_input_tokens_seen": 9188430, "router_z_loss_clip": 5.6015625, "router_z_loss_mlp": 0.49365234, "step": 438, "time_per_iteration": 2.644674301147461 }, { "auxiliary_loss_clip": 0.01750927, "auxiliary_loss_mlp": 0.00454274, "balance_loss_clip": 1.38240564, "balance_loss_mlp": 0.43701228, "epoch": 0.02639410792123854, "flos": 64572020691840.0, "grad_norm": 1.4720357441998169, "language_loss": 0.62343764, "learning_rate": 3.917518232637377e-06, "loss": 0.64548969, "num_input_tokens_seen": 9255835, "router_z_loss_clip": 3.6875, "router_z_loss_mlp": 0.17285156, "step": 439, "time_per_iteration": 3.3001835346221924 }, { "auxiliary_loss_clip": 0.0194043, "auxiliary_loss_mlp": 0.00489717, "balance_loss_clip": 1.38442683, "balance_loss_mlp": 0.43786147, "epoch": 0.02645423117390651, "flos": 28473462921600.0, "grad_norm": 20.78358588700867, "language_loss": 0.83345759, "learning_rate": 3.918983198419573e-06, "loss": 0.85775912, "num_input_tokens_seen": 9276835, "router_z_loss_clip": 5.55859375, "router_z_loss_mlp": 0.51831055, "step": 440, "time_per_iteration": 4.1120805740356445 }, { "auxiliary_loss_clip": 0.01941359, "auxiliary_loss_mlp": 0.00438517, "balance_loss_clip": 1.39317942, "balance_loss_mlp": 0.3917152, "epoch": 0.026514354426574478, "flos": 18551237495040.0, "grad_norm": 2247.6513005122347, "language_loss": 0.90906537, "learning_rate": 3.920444838510415e-06, "loss": 0.93286413, "num_input_tokens_seen": 9295075, "router_z_loss_clip": 5.484375, "router_z_loss_mlp": 0.46826172, "step": 441, "time_per_iteration": 5.567206382751465 }, { "auxiliary_loss_clip": 0.01954785, "auxiliary_loss_mlp": 0.00451507, "balance_loss_clip": 1.40021086, "balance_loss_mlp": 0.40260708, "epoch": 0.026574477679242446, "flos": 20667812359680.0, "grad_norm": 506.99370759632956, "language_loss": 0.849334, "learning_rate": 3.92190316797534e-06, "loss": 0.87339687, "num_input_tokens_seen": 9314205, "router_z_loss_clip": 5.546875, "router_z_loss_mlp": 0.48925781, "step": 442, "time_per_iteration": 2.626519203186035 }, { "auxiliary_loss_clip": 0.01658803, "auxiliary_loss_mlp": 0.00217584, "balance_loss_clip": 1.33796549, "balance_loss_mlp": 0.2046144, "epoch": 0.026634600931910415, "flos": 57956125340160.0, "grad_norm": 0.9679991114437049, "language_loss": 0.64158452, "learning_rate": 3.92335820177765e-06, "loss": 0.66034842, "num_input_tokens_seen": 9367395, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.12988281, "step": 443, "time_per_iteration": 3.059054374694824 }, { "auxiliary_loss_clip": 0.01909781, "auxiliary_loss_mlp": 0.0044616, "balance_loss_clip": 1.39098167, "balance_loss_mlp": 0.39571083, "epoch": 0.026694724184578387, "flos": 15815131827840.0, "grad_norm": 13.290039460576928, "language_loss": 0.90822178, "learning_rate": 3.924809954779425e-06, "loss": 0.93178117, "num_input_tokens_seen": 9385185, "router_z_loss_clip": 5.18359375, "router_z_loss_mlp": 0.50463867, "step": 444, "time_per_iteration": 2.6563777923583984 }, { "auxiliary_loss_clip": 0.01898828, "auxiliary_loss_mlp": 0.00458561, "balance_loss_clip": 1.38008046, "balance_loss_mlp": 0.40637124, "epoch": 0.026754847437246355, "flos": 23440259612160.0, "grad_norm": 57.76755443032764, "language_loss": 1.01194012, "learning_rate": 3.9262584417424425e-06, "loss": 1.035514, "num_input_tokens_seen": 9403225, "router_z_loss_clip": 5.1875, "router_z_loss_mlp": 0.52270508, "step": 445, "time_per_iteration": 2.692028284072876 }, { "auxiliary_loss_clip": 0.01880676, "auxiliary_loss_mlp": 0.00410256, "balance_loss_clip": 1.38432562, "balance_loss_mlp": 0.36621976, "epoch": 0.026814970689914324, "flos": 17341801632000.0, "grad_norm": 36.98819617157792, "language_loss": 1.01574707, "learning_rate": 3.9277036773290725e-06, "loss": 1.03865635, "num_input_tokens_seen": 9420540, "router_z_loss_clip": 4.96875, "router_z_loss_mlp": 0.44042969, "step": 446, "time_per_iteration": 2.5932154655456543 }, { "auxiliary_loss_clip": 0.01862618, "auxiliary_loss_mlp": 0.00411635, "balance_loss_clip": 1.38006401, "balance_loss_mlp": 0.36590642, "epoch": 0.026875093942582293, "flos": 17894718662400.0, "grad_norm": 23.815238136463286, "language_loss": 0.86685622, "learning_rate": 3.92914567610317e-06, "loss": 0.88959873, "num_input_tokens_seen": 9438840, "router_z_loss_clip": 4.83203125, "router_z_loss_mlp": 0.45727539, "step": 447, "time_per_iteration": 2.7016611099243164 }, { "auxiliary_loss_clip": 0.01872129, "auxiliary_loss_mlp": 0.00452209, "balance_loss_clip": 1.38635755, "balance_loss_mlp": 0.40378606, "epoch": 0.026935217195250265, "flos": 21723980889600.0, "grad_norm": 94.45400929868842, "language_loss": 0.94554722, "learning_rate": 3.930584452530952e-06, "loss": 0.96879065, "num_input_tokens_seen": 9457215, "router_z_loss_clip": 4.85546875, "router_z_loss_mlp": 0.48388672, "step": 448, "time_per_iteration": 2.676861524581909 }, { "auxiliary_loss_clip": 0.01835772, "auxiliary_loss_mlp": 0.00441551, "balance_loss_clip": 1.36642444, "balance_loss_mlp": 0.39689541, "epoch": 0.026995340447918233, "flos": 23622685810560.0, "grad_norm": 7.402532700974451, "language_loss": 0.94567442, "learning_rate": 3.9320200209818755e-06, "loss": 0.96844769, "num_input_tokens_seen": 9475615, "router_z_loss_clip": 4.69140625, "router_z_loss_mlp": 0.44677734, "step": 449, "time_per_iteration": 2.75723934173584 }, { "auxiliary_loss_clip": 0.0182571, "auxiliary_loss_mlp": 0.00420423, "balance_loss_clip": 1.35864973, "balance_loss_mlp": 0.37273955, "epoch": 0.027055463700586202, "flos": 17931275729280.0, "grad_norm": 81.60930322476118, "language_loss": 0.8950932, "learning_rate": 3.933452395729493e-06, "loss": 0.9175545, "num_input_tokens_seen": 9493975, "router_z_loss_clip": 4.67578125, "router_z_loss_mlp": 0.47680664, "step": 450, "time_per_iteration": 2.7558090686798096 }, { "auxiliary_loss_clip": 0.01811312, "auxiliary_loss_mlp": 0.00416045, "balance_loss_clip": 1.360623, "balance_loss_mlp": 0.37315381, "epoch": 0.02711558695325417, "flos": 25118903859840.0, "grad_norm": 42.85678937337225, "language_loss": 0.85984957, "learning_rate": 3.934881590952304e-06, "loss": 0.88212311, "num_input_tokens_seen": 9514810, "router_z_loss_clip": 4.50390625, "router_z_loss_mlp": 0.42895508, "step": 451, "time_per_iteration": 2.6904289722442627 }, { "auxiliary_loss_clip": 0.01789084, "auxiliary_loss_mlp": 0.00435235, "balance_loss_clip": 1.3475492, "balance_loss_mlp": 0.3929871, "epoch": 0.02717571020592214, "flos": 24239559006720.0, "grad_norm": 13.679177810339048, "language_loss": 0.82072639, "learning_rate": 3.936307620734599e-06, "loss": 0.84296966, "num_input_tokens_seen": 9533635, "router_z_loss_clip": 4.41015625, "router_z_loss_mlp": 0.42236328, "step": 452, "time_per_iteration": 2.6572868824005127 }, { "auxiliary_loss_clip": 0.01771109, "auxiliary_loss_mlp": 0.00416559, "balance_loss_clip": 1.33754241, "balance_loss_mlp": 0.37452608, "epoch": 0.02723583345859011, "flos": 25118939773440.0, "grad_norm": 1109.0197702294663, "language_loss": 0.78876501, "learning_rate": 3.937730499067294e-06, "loss": 0.81064165, "num_input_tokens_seen": 9555420, "router_z_loss_clip": 4.34375, "router_z_loss_mlp": 0.42016602, "step": 453, "time_per_iteration": 2.698025703430176 }, { "auxiliary_loss_clip": 0.01780293, "auxiliary_loss_mlp": 0.00448948, "balance_loss_clip": 1.3382268, "balance_loss_mlp": 0.40479296, "epoch": 0.02729595671125808, "flos": 42741597847680.0, "grad_norm": 232.20089385861013, "language_loss": 0.89494789, "learning_rate": 3.939150239848748e-06, "loss": 0.91724026, "num_input_tokens_seen": 9578950, "router_z_loss_clip": 4.41015625, "router_z_loss_mlp": 0.44116211, "step": 454, "time_per_iteration": 2.900243043899536 }, { "auxiliary_loss_clip": 0.0174922, "auxiliary_loss_mlp": 0.00411056, "balance_loss_clip": 1.32154298, "balance_loss_mlp": 0.37045363, "epoch": 0.02735607996392605, "flos": 21430985650560.0, "grad_norm": 23.775145100919225, "language_loss": 0.81177461, "learning_rate": 3.9405668568855866e-06, "loss": 0.8333773, "num_input_tokens_seen": 9598160, "router_z_loss_clip": 4.265625, "router_z_loss_mlp": 0.40600586, "step": 455, "time_per_iteration": 2.6646695137023926 }, { "auxiliary_loss_clip": 0.01747247, "auxiliary_loss_mlp": 0.00433986, "balance_loss_clip": 1.31197917, "balance_loss_mlp": 0.39216718, "epoch": 0.027416203216594017, "flos": 20851280052480.0, "grad_norm": 17.454564835644767, "language_loss": 0.87627399, "learning_rate": 3.941980363893499e-06, "loss": 0.89808631, "num_input_tokens_seen": 9616010, "router_z_loss_clip": 4.35546875, "router_z_loss_mlp": 0.41845703, "step": 456, "time_per_iteration": 2.7151741981506348 }, { "auxiliary_loss_clip": 0.01733935, "auxiliary_loss_mlp": 0.00402339, "balance_loss_clip": 1.31142581, "balance_loss_mlp": 0.36066338, "epoch": 0.027476326469261986, "flos": 13224500242560.0, "grad_norm": 16.493579754209, "language_loss": 0.88958716, "learning_rate": 3.9433907744980384e-06, "loss": 0.91094989, "num_input_tokens_seen": 9634000, "router_z_loss_clip": 4.22265625, "router_z_loss_mlp": 0.41674805, "step": 457, "time_per_iteration": 2.6008777618408203 }, { "auxiliary_loss_clip": 0.01716322, "auxiliary_loss_mlp": 0.00415937, "balance_loss_clip": 1.29540324, "balance_loss_mlp": 0.37080467, "epoch": 0.027536449721929958, "flos": 24024526237440.0, "grad_norm": 40.7736951299237, "language_loss": 1.00295758, "learning_rate": 3.944798102235412e-06, "loss": 1.02428019, "num_input_tokens_seen": 9653455, "router_z_loss_clip": 4.2109375, "router_z_loss_mlp": 0.45141602, "step": 458, "time_per_iteration": 2.703333616256714 }, { "auxiliary_loss_clip": 0.01699359, "auxiliary_loss_mlp": 0.00457321, "balance_loss_clip": 1.276389, "balance_loss_mlp": 0.41233116, "epoch": 0.027596572974597926, "flos": 13006055681280.0, "grad_norm": 92.30125692059008, "language_loss": 0.87570715, "learning_rate": 3.9462023605532545e-06, "loss": 0.89727396, "num_input_tokens_seen": 9669650, "router_z_loss_clip": 4.23046875, "router_z_loss_mlp": 0.44995117, "step": 459, "time_per_iteration": 2.628692150115967 }, { "auxiliary_loss_clip": 0.0169973, "auxiliary_loss_mlp": 0.00442701, "balance_loss_clip": 1.28536344, "balance_loss_mlp": 0.39749649, "epoch": 0.027656696227265895, "flos": 26143076350080.0, "grad_norm": 39.57679013343044, "language_loss": 0.88234496, "learning_rate": 3.947603562811407e-06, "loss": 0.90376925, "num_input_tokens_seen": 9691415, "router_z_loss_clip": 4.15039062, "router_z_loss_mlp": 0.45214844, "step": 460, "time_per_iteration": 2.8593802452087402 }, { "auxiliary_loss_clip": 0.01505137, "auxiliary_loss_mlp": 0.00693502, "balance_loss_clip": 1.21468282, "balance_loss_mlp": 0.66279376, "epoch": 0.027716819479933864, "flos": 60697222997760.0, "grad_norm": 1.554594634949751, "language_loss": 0.73609626, "learning_rate": 3.949001722282675e-06, "loss": 0.75808263, "num_input_tokens_seen": 9755605, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.30664062, "step": 461, "time_per_iteration": 3.1300318241119385 }, { "auxiliary_loss_clip": 0.01675542, "auxiliary_loss_mlp": 0.00465729, "balance_loss_clip": 1.27117693, "balance_loss_mlp": 0.42185968, "epoch": 0.027776942732601832, "flos": 31211938886400.0, "grad_norm": 57.232253487117234, "language_loss": 0.9064694, "learning_rate": 3.950396852153582e-06, "loss": 0.92788208, "num_input_tokens_seen": 9776270, "router_z_loss_clip": 4.0390625, "router_z_loss_mlp": 0.4387207, "step": 462, "time_per_iteration": 2.7181713581085205 }, { "auxiliary_loss_clip": 0.01676767, "auxiliary_loss_mlp": 0.00416642, "balance_loss_clip": 1.263762, "balance_loss_mlp": 0.37191498, "epoch": 0.027837065985269804, "flos": 22674644196480.0, "grad_norm": 24.095409608614425, "language_loss": 0.98577988, "learning_rate": 3.951788965525118e-06, "loss": 1.00671399, "num_input_tokens_seen": 9794465, "router_z_loss_clip": 4.12695312, "router_z_loss_mlp": 0.44702148, "step": 463, "time_per_iteration": 2.648257255554199 }, { "auxiliary_loss_clip": 0.01530792, "auxiliary_loss_mlp": 0.00279071, "balance_loss_clip": 1.24043727, "balance_loss_mlp": 0.25408459, "epoch": 0.027897189237937773, "flos": 62182487399040.0, "grad_norm": 0.8842510867762596, "language_loss": 0.58598363, "learning_rate": 3.953178075413476e-06, "loss": 0.60408223, "num_input_tokens_seen": 9849685, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.24902344, "step": 464, "time_per_iteration": 3.1209614276885986 }, { "auxiliary_loss_clip": 0.01707636, "auxiliary_loss_mlp": 0.00493769, "balance_loss_clip": 1.26912069, "balance_loss_mlp": 0.43936178, "epoch": 0.02795731249060574, "flos": 24493160004480.0, "grad_norm": 137.6766620633682, "language_loss": 0.90081096, "learning_rate": 3.954564194750784e-06, "loss": 0.92282498, "num_input_tokens_seen": 9869505, "router_z_loss_clip": 4.3828125, "router_z_loss_mlp": 0.54321289, "step": 465, "time_per_iteration": 2.689724922180176 }, { "auxiliary_loss_clip": 0.0168916, "auxiliary_loss_mlp": 0.00424534, "balance_loss_clip": 1.26498342, "balance_loss_mlp": 0.37868589, "epoch": 0.02801743574327371, "flos": 23733003456000.0, "grad_norm": 14.990376893959041, "language_loss": 0.84852701, "learning_rate": 3.955947336385828e-06, "loss": 0.86966395, "num_input_tokens_seen": 9890950, "router_z_loss_clip": 4.23828125, "router_z_loss_mlp": 0.45874023, "step": 466, "time_per_iteration": 2.6538822650909424 }, { "auxiliary_loss_clip": 0.01664629, "auxiliary_loss_mlp": 0.00386995, "balance_loss_clip": 1.25750756, "balance_loss_mlp": 0.34038395, "epoch": 0.02807755899594168, "flos": 20629100476800.0, "grad_norm": 482.0439172078701, "language_loss": 0.93490696, "learning_rate": 3.957327513084761e-06, "loss": 0.95542324, "num_input_tokens_seen": 9911265, "router_z_loss_clip": 4.07421875, "router_z_loss_mlp": 0.46582031, "step": 467, "time_per_iteration": 2.6369411945343018 }, { "auxiliary_loss_clip": 0.01674334, "auxiliary_loss_mlp": 0.00395732, "balance_loss_clip": 1.26272964, "balance_loss_mlp": 0.34623629, "epoch": 0.02813768224860965, "flos": 19244564789760.0, "grad_norm": 13.730101772483815, "language_loss": 0.94508934, "learning_rate": 3.958704737531818e-06, "loss": 0.96579003, "num_input_tokens_seen": 9929025, "router_z_loss_clip": 4.11132812, "router_z_loss_mlp": 0.49536133, "step": 468, "time_per_iteration": 2.5876619815826416 }, { "auxiliary_loss_clip": 0.01685533, "auxiliary_loss_mlp": 0.00405385, "balance_loss_clip": 1.26634765, "balance_loss_mlp": 0.35522196, "epoch": 0.02819780550127762, "flos": 20813968800000.0, "grad_norm": 39.239317277857765, "language_loss": 0.99810183, "learning_rate": 3.9600790223300065e-06, "loss": 1.01901102, "num_input_tokens_seen": 9945190, "router_z_loss_clip": 4.19335938, "router_z_loss_mlp": 0.50170898, "step": 469, "time_per_iteration": 2.655141592025757 }, { "auxiliary_loss_clip": 0.01656482, "auxiliary_loss_mlp": 0.0040573, "balance_loss_clip": 1.25377572, "balance_loss_mlp": 0.35587716, "epoch": 0.028257928753945588, "flos": 19974125928960.0, "grad_norm": 18.627156912056076, "language_loss": 0.93777621, "learning_rate": 3.96145038000181e-06, "loss": 0.95839834, "num_input_tokens_seen": 9962820, "router_z_loss_clip": 4.02734375, "router_z_loss_mlp": 0.4987793, "step": 470, "time_per_iteration": 2.61094331741333 }, { "auxiliary_loss_clip": 0.01671436, "auxiliary_loss_mlp": 0.00446841, "balance_loss_clip": 1.26193404, "balance_loss_mlp": 0.39498454, "epoch": 0.028318052006613557, "flos": 20484488321280.0, "grad_norm": 6.028146617309995, "language_loss": 0.98823726, "learning_rate": 3.962818822989861e-06, "loss": 1.00942004, "num_input_tokens_seen": 9982595, "router_z_loss_clip": 4.0859375, "router_z_loss_mlp": 0.51904297, "step": 471, "time_per_iteration": 2.67940616607666 }, { "auxiliary_loss_clip": 0.01677671, "auxiliary_loss_mlp": 0.00471743, "balance_loss_clip": 1.26170433, "balance_loss_mlp": 0.41719291, "epoch": 0.02837817525928153, "flos": 28514832410880.0, "grad_norm": 64.5131300716062, "language_loss": 0.82872355, "learning_rate": 3.964184363657625e-06, "loss": 0.8502177, "num_input_tokens_seen": 10004645, "router_z_loss_clip": 4.15429688, "router_z_loss_mlp": 0.54541016, "step": 472, "time_per_iteration": 2.709202289581299 }, { "auxiliary_loss_clip": 0.01684532, "auxiliary_loss_mlp": 0.00417002, "balance_loss_clip": 1.27155757, "balance_loss_mlp": 0.36679086, "epoch": 0.028438298511949497, "flos": 18551668458240.0, "grad_norm": 8.601339366916669, "language_loss": 0.99594665, "learning_rate": 3.965547014290071e-06, "loss": 1.01696181, "num_input_tokens_seen": 10022555, "router_z_loss_clip": 4.1328125, "router_z_loss_mlp": 0.50219727, "step": 473, "time_per_iteration": 2.66874098777771 }, { "auxiliary_loss_clip": 0.01701227, "auxiliary_loss_mlp": 0.00431521, "balance_loss_clip": 1.27728355, "balance_loss_mlp": 0.37782955, "epoch": 0.028498421764617466, "flos": 16910227722240.0, "grad_norm": 4.791088798316729, "language_loss": 0.96900588, "learning_rate": 3.96690678709433e-06, "loss": 0.99033332, "num_input_tokens_seen": 10041025, "router_z_loss_clip": 4.23242188, "router_z_loss_mlp": 0.53662109, "step": 474, "time_per_iteration": 2.6374316215515137 }, { "auxiliary_loss_clip": 0.01666358, "auxiliary_loss_mlp": 0.00406079, "balance_loss_clip": 1.26403487, "balance_loss_mlp": 0.35505795, "epoch": 0.028558545017285435, "flos": 27778699082880.0, "grad_norm": 35.43302404030814, "language_loss": 0.87442017, "learning_rate": 3.968263694200355e-06, "loss": 0.89514458, "num_input_tokens_seen": 10060775, "router_z_loss_clip": 4.02929688, "router_z_loss_mlp": 0.51025391, "step": 475, "time_per_iteration": 2.6786632537841797 }, { "auxiliary_loss_clip": 0.01384902, "auxiliary_loss_mlp": 0.00193587, "balance_loss_clip": 1.14020324, "balance_loss_mlp": 0.16898173, "epoch": 0.028618668269953403, "flos": 65654367258240.0, "grad_norm": 0.9291700734381153, "language_loss": 0.66371679, "learning_rate": 3.969617747661569e-06, "loss": 0.67950165, "num_input_tokens_seen": 10120225, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.24511719, "step": 476, "time_per_iteration": 3.121316909790039 }, { "auxiliary_loss_clip": 0.01666, "auxiliary_loss_mlp": 0.00431453, "balance_loss_clip": 1.25922418, "balance_loss_mlp": 0.3784287, "epoch": 0.028678791522621375, "flos": 21937074324480.0, "grad_norm": 24.93841940312841, "language_loss": 0.9307071, "learning_rate": 3.970968959455509e-06, "loss": 0.95168161, "num_input_tokens_seen": 10137880, "router_z_loss_clip": 4.06445312, "router_z_loss_mlp": 0.53051758, "step": 477, "time_per_iteration": 2.6416988372802734 }, { "auxiliary_loss_clip": 0.01654292, "auxiliary_loss_mlp": 0.00400377, "balance_loss_clip": 1.25869203, "balance_loss_mlp": 0.34811527, "epoch": 0.028738914775289344, "flos": 24572128055040.0, "grad_norm": 42.15343484209316, "language_loss": 0.91207069, "learning_rate": 3.97231734148446e-06, "loss": 0.93261743, "num_input_tokens_seen": 10156930, "router_z_loss_clip": 3.94921875, "router_z_loss_mlp": 0.52270508, "step": 478, "time_per_iteration": 2.7045342922210693 }, { "auxiliary_loss_clip": 0.01636847, "auxiliary_loss_mlp": 0.0041247, "balance_loss_clip": 1.24128032, "balance_loss_mlp": 0.36056578, "epoch": 0.028799038027957313, "flos": 23257977068160.0, "grad_norm": 10.06608199179809, "language_loss": 0.89379597, "learning_rate": 3.973662905576082e-06, "loss": 0.91428912, "num_input_tokens_seen": 10176295, "router_z_loss_clip": 3.95703125, "router_z_loss_mlp": 0.51904297, "step": 479, "time_per_iteration": 2.6477785110473633 }, { "auxiliary_loss_clip": 0.01622189, "auxiliary_loss_mlp": 0.00375676, "balance_loss_clip": 1.23930502, "balance_loss_mlp": 0.32606089, "epoch": 0.02885916128062528, "flos": 22164102236160.0, "grad_norm": 344.7146441499143, "language_loss": 0.79431993, "learning_rate": 3.975005663484038e-06, "loss": 0.81429863, "num_input_tokens_seen": 10195790, "router_z_loss_clip": 3.83203125, "router_z_loss_mlp": 0.49633789, "step": 480, "time_per_iteration": 2.8323681354522705 }, { "auxiliary_loss_clip": 0.01630188, "auxiliary_loss_mlp": 0.00399362, "balance_loss_clip": 1.24044001, "balance_loss_mlp": 0.34972373, "epoch": 0.02891928453329325, "flos": 22932842135040.0, "grad_norm": 31.78291565060535, "language_loss": 0.93088746, "learning_rate": 3.976345626888605e-06, "loss": 0.95118284, "num_input_tokens_seen": 10218405, "router_z_loss_clip": 3.8984375, "router_z_loss_mlp": 0.49584961, "step": 481, "time_per_iteration": 2.7295796871185303 }, { "auxiliary_loss_clip": 0.01332034, "auxiliary_loss_mlp": 0.00204299, "balance_loss_clip": 1.10088432, "balance_loss_mlp": 0.1837955, "epoch": 0.028979407785961222, "flos": 57432941792640.0, "grad_norm": 0.846702217204551, "language_loss": 0.65514052, "learning_rate": 3.9776828073972864e-06, "loss": 0.67050385, "num_input_tokens_seen": 10271005, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.20507812, "step": 482, "time_per_iteration": 4.35185432434082 }, { "auxiliary_loss_clip": 0.01641228, "auxiliary_loss_mlp": 0.00371041, "balance_loss_clip": 1.24434757, "balance_loss_mlp": 0.32009065, "epoch": 0.02903953103862919, "flos": 16722737706240.0, "grad_norm": 17.037588262614094, "language_loss": 0.90730822, "learning_rate": 3.979017216545415e-06, "loss": 0.92743099, "num_input_tokens_seen": 10288405, "router_z_loss_clip": 3.97070312, "router_z_loss_mlp": 0.50952148, "step": 483, "time_per_iteration": 5.494209051132202 }, { "auxiliary_loss_clip": 0.01629055, "auxiliary_loss_mlp": 0.00407596, "balance_loss_clip": 1.23883271, "balance_loss_mlp": 0.35926831, "epoch": 0.02909965429129716, "flos": 16763640318720.0, "grad_norm": 579.9113753000111, "language_loss": 0.83976912, "learning_rate": 3.980348865796749e-06, "loss": 0.86013561, "num_input_tokens_seen": 10306875, "router_z_loss_clip": 3.90429688, "router_z_loss_mlp": 0.48266602, "step": 484, "time_per_iteration": 4.004236936569214 }, { "auxiliary_loss_clip": 0.01644707, "auxiliary_loss_mlp": 0.00465921, "balance_loss_clip": 1.2430445, "balance_loss_mlp": 0.41413647, "epoch": 0.029159777543965128, "flos": 19785343023360.0, "grad_norm": 8.056651768195435, "language_loss": 0.89565438, "learning_rate": 3.9816777665440615e-06, "loss": 0.91676068, "num_input_tokens_seen": 10323965, "router_z_loss_clip": 4.015625, "router_z_loss_mlp": 0.51782227, "step": 485, "time_per_iteration": 2.711810827255249 }, { "auxiliary_loss_clip": 0.01605974, "auxiliary_loss_mlp": 0.00432714, "balance_loss_clip": 1.2275424, "balance_loss_mlp": 0.38913119, "epoch": 0.029219900796633096, "flos": 19642670202240.0, "grad_norm": 667.9359666331274, "language_loss": 0.94309163, "learning_rate": 3.983003930109732e-06, "loss": 0.96347851, "num_input_tokens_seen": 10342620, "router_z_loss_clip": 3.78515625, "router_z_loss_mlp": 0.43579102, "step": 486, "time_per_iteration": 2.6045475006103516 }, { "auxiliary_loss_clip": 0.01631774, "auxiliary_loss_mlp": 0.00498792, "balance_loss_clip": 1.23758793, "balance_loss_mlp": 0.45029742, "epoch": 0.02928002404930107, "flos": 25885704424320.0, "grad_norm": 30.677630014429024, "language_loss": 0.95339292, "learning_rate": 3.984327367746315e-06, "loss": 0.9746986, "num_input_tokens_seen": 10364610, "router_z_loss_clip": 3.94335938, "router_z_loss_mlp": 0.48486328, "step": 487, "time_per_iteration": 2.7017838954925537 }, { "auxiliary_loss_clip": 0.01623267, "auxiliary_loss_mlp": 0.00505647, "balance_loss_clip": 1.23782516, "balance_loss_mlp": 0.45793906, "epoch": 0.029340147301969037, "flos": 20660234590080.0, "grad_norm": 24.55853370115544, "language_loss": 0.99562252, "learning_rate": 3.985648090637122e-06, "loss": 1.01691175, "num_input_tokens_seen": 10380910, "router_z_loss_clip": 3.84960938, "router_z_loss_mlp": 0.47705078, "step": 488, "time_per_iteration": 2.6169066429138184 }, { "auxiliary_loss_clip": 0.01627008, "auxiliary_loss_mlp": 0.00525776, "balance_loss_clip": 1.24182856, "balance_loss_mlp": 0.47718614, "epoch": 0.029400270554637006, "flos": 24428018689920.0, "grad_norm": 49.252005004550455, "language_loss": 0.94390881, "learning_rate": 3.986966109896785e-06, "loss": 0.9654367, "num_input_tokens_seen": 10400665, "router_z_loss_clip": 3.8515625, "router_z_loss_mlp": 0.4855957, "step": 489, "time_per_iteration": 2.713409423828125 }, { "auxiliary_loss_clip": 0.01613674, "auxiliary_loss_mlp": 0.00566241, "balance_loss_clip": 1.23198676, "balance_loss_mlp": 0.5197736, "epoch": 0.029460393807304974, "flos": 20120892900480.0, "grad_norm": 129.26095784973577, "language_loss": 0.93325692, "learning_rate": 3.988281436571815e-06, "loss": 0.95505607, "num_input_tokens_seen": 10420150, "router_z_loss_clip": 3.81640625, "router_z_loss_mlp": 0.46484375, "step": 490, "time_per_iteration": 2.748716354370117 }, { "auxiliary_loss_clip": 0.01636393, "auxiliary_loss_mlp": 0.00630094, "balance_loss_clip": 1.23939967, "balance_loss_mlp": 0.57943046, "epoch": 0.029520517059972943, "flos": 17675914965120.0, "grad_norm": 8.470335412807401, "language_loss": 0.98680246, "learning_rate": 3.989594081641164e-06, "loss": 1.00946736, "num_input_tokens_seen": 10438210, "router_z_loss_clip": 3.9765625, "router_z_loss_mlp": 0.50708008, "step": 491, "time_per_iteration": 2.6535732746124268 }, { "auxiliary_loss_clip": 0.01616857, "auxiliary_loss_mlp": 0.00611852, "balance_loss_clip": 1.24136138, "balance_loss_mlp": 0.56660008, "epoch": 0.029580640312640915, "flos": 18953185662720.0, "grad_norm": 66.00535074055533, "language_loss": 0.90708905, "learning_rate": 3.9909040560167675e-06, "loss": 0.92937613, "num_input_tokens_seen": 10455125, "router_z_loss_clip": 3.7578125, "router_z_loss_mlp": 0.45239258, "step": 492, "time_per_iteration": 2.7165191173553467 }, { "auxiliary_loss_clip": 0.01632885, "auxiliary_loss_mlp": 0.00582145, "balance_loss_clip": 1.25056148, "balance_loss_mlp": 0.53481865, "epoch": 0.029640763565308884, "flos": 18726121837440.0, "grad_norm": 615.7254573905122, "language_loss": 0.92586887, "learning_rate": 3.992211370544093e-06, "loss": 0.94801921, "num_input_tokens_seen": 10470990, "router_z_loss_clip": 3.8203125, "router_z_loss_mlp": 0.47338867, "step": 493, "time_per_iteration": 2.641636371612549 }, { "auxiliary_loss_clip": 0.01631096, "auxiliary_loss_mlp": 0.00634959, "balance_loss_clip": 1.24258494, "balance_loss_mlp": 0.58586878, "epoch": 0.029700886817976852, "flos": 20595308757120.0, "grad_norm": 22.316228197752555, "language_loss": 0.94347453, "learning_rate": 3.99351603600268e-06, "loss": 0.96613508, "num_input_tokens_seen": 10490685, "router_z_loss_clip": 3.88476562, "router_z_loss_mlp": 0.4909668, "step": 494, "time_per_iteration": 2.7028331756591797 }, { "auxiliary_loss_clip": 0.0164094, "auxiliary_loss_mlp": 0.0064609, "balance_loss_clip": 1.25168967, "balance_loss_mlp": 0.59730959, "epoch": 0.02976101007064482, "flos": 22236857233920.0, "grad_norm": 32.501966079315324, "language_loss": 0.93185163, "learning_rate": 3.994818063106668e-06, "loss": 0.95472193, "num_input_tokens_seen": 10509435, "router_z_loss_clip": 3.89257812, "router_z_loss_mlp": 0.48779297, "step": 495, "time_per_iteration": 2.653486967086792 }, { "auxiliary_loss_clip": 0.01600817, "auxiliary_loss_mlp": 0.00588808, "balance_loss_clip": 1.2354176, "balance_loss_mlp": 0.54472458, "epoch": 0.029821133323312793, "flos": 23732644320000.0, "grad_norm": 166.1558215812027, "language_loss": 0.68286473, "learning_rate": 3.99611746250533e-06, "loss": 0.70476091, "num_input_tokens_seen": 10530050, "router_z_loss_clip": 3.65234375, "router_z_loss_mlp": 0.44067383, "step": 496, "time_per_iteration": 2.7420854568481445 }, { "auxiliary_loss_clip": 0.01630275, "auxiliary_loss_mlp": 0.0067904, "balance_loss_clip": 1.25510406, "balance_loss_mlp": 0.62451369, "epoch": 0.02988125657598076, "flos": 22419498913920.0, "grad_norm": 31.80347437307548, "language_loss": 0.94580102, "learning_rate": 3.997414244783595e-06, "loss": 0.96889412, "num_input_tokens_seen": 10551370, "router_z_loss_clip": 3.75390625, "router_z_loss_mlp": 0.54492188, "step": 497, "time_per_iteration": 2.6838860511779785 }, { "auxiliary_loss_clip": 0.01639771, "auxiliary_loss_mlp": 0.00706722, "balance_loss_clip": 1.25539446, "balance_loss_mlp": 0.65140939, "epoch": 0.02994137982864873, "flos": 13845108453120.0, "grad_norm": 31.58930142222188, "language_loss": 0.94131684, "learning_rate": 3.998708420462557e-06, "loss": 0.96478176, "num_input_tokens_seen": 10569225, "router_z_loss_clip": 3.84375, "router_z_loss_mlp": 0.55297852, "step": 498, "time_per_iteration": 2.6679627895355225 }, { "auxiliary_loss_clip": 0.01616164, "auxiliary_loss_mlp": 0.00677247, "balance_loss_clip": 1.24491, "balance_loss_mlp": 0.62612963, "epoch": 0.0300015030813167, "flos": 23908354675200.0, "grad_norm": 686.1086590570216, "language_loss": 0.87458235, "learning_rate": 4e-06, "loss": 0.89751637, "num_input_tokens_seen": 10586170, "router_z_loss_clip": 3.71484375, "router_z_loss_mlp": 0.51123047, "step": 499, "time_per_iteration": 2.624300241470337 }, { "auxiliary_loss_clip": 0.01631564, "auxiliary_loss_mlp": 0.0075037, "balance_loss_clip": 1.25528026, "balance_loss_mlp": 0.69765562, "epoch": 0.030061626333984667, "flos": 22016796560640.0, "grad_norm": 23.320261392628876, "language_loss": 0.87525988, "learning_rate": 3.9999999620799e-06, "loss": 0.8990792, "num_input_tokens_seen": 10606205, "router_z_loss_clip": 3.765625, "router_z_loss_mlp": 0.52709961, "step": 500, "time_per_iteration": 2.7644782066345215 }, { "auxiliary_loss_clip": 0.01624909, "auxiliary_loss_mlp": 0.00675242, "balance_loss_clip": 1.25540876, "balance_loss_mlp": 0.62233651, "epoch": 0.03012174958665264, "flos": 23039747988480.0, "grad_norm": 239.55311891655197, "language_loss": 0.95346224, "learning_rate": 3.9999998483196e-06, "loss": 0.97646379, "num_input_tokens_seen": 10625995, "router_z_loss_clip": 3.69140625, "router_z_loss_mlp": 0.52905273, "step": 501, "time_per_iteration": 2.617553472518921 }, { "auxiliary_loss_clip": 0.01643131, "auxiliary_loss_mlp": 0.00787933, "balance_loss_clip": 1.25768328, "balance_loss_mlp": 0.73114157, "epoch": 0.030181872839320608, "flos": 18953257489920.0, "grad_norm": 19.153410822087142, "language_loss": 0.93180466, "learning_rate": 3.9999996587191065e-06, "loss": 0.95611525, "num_input_tokens_seen": 10644105, "router_z_loss_clip": 3.85742188, "router_z_loss_mlp": 0.56762695, "step": 502, "time_per_iteration": 2.6452219486236572 }, { "auxiliary_loss_clip": 0.0162632, "auxiliary_loss_mlp": 0.00664238, "balance_loss_clip": 1.25957489, "balance_loss_mlp": 0.61350268, "epoch": 0.030241996091988577, "flos": 16728017005440.0, "grad_norm": 19.54399061538533, "language_loss": 0.89776182, "learning_rate": 3.999999393278425e-06, "loss": 0.92066741, "num_input_tokens_seen": 10661090, "router_z_loss_clip": 3.66796875, "router_z_loss_mlp": 0.50708008, "step": 503, "time_per_iteration": 2.6938576698303223 }, { "auxiliary_loss_clip": 0.01591859, "auxiliary_loss_mlp": 0.00494039, "balance_loss_clip": 1.24951673, "balance_loss_mlp": 0.45779979, "epoch": 0.030302119344656545, "flos": 28621271387520.0, "grad_norm": 141.58734540020782, "language_loss": 0.94647115, "learning_rate": 3.999999051997567e-06, "loss": 0.9673301, "num_input_tokens_seen": 10682380, "router_z_loss_clip": 3.41992188, "router_z_loss_mlp": 0.36254883, "step": 504, "time_per_iteration": 2.7574057579040527 }, { "auxiliary_loss_clip": 0.01565787, "auxiliary_loss_mlp": 0.00438381, "balance_loss_clip": 1.22881031, "balance_loss_mlp": 0.40006739, "epoch": 0.030362242597324514, "flos": 15669334523520.0, "grad_norm": 31.005432475865618, "language_loss": 0.8530699, "learning_rate": 3.9999986348765425e-06, "loss": 0.87311155, "num_input_tokens_seen": 10699925, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.38305664, "step": 505, "time_per_iteration": 2.617814302444458 }, { "auxiliary_loss_clip": 0.01267881, "auxiliary_loss_mlp": 0.00126276, "balance_loss_clip": 1.04718673, "balance_loss_mlp": 0.11592839, "epoch": 0.030422365849992486, "flos": 72125973676800.0, "grad_norm": 0.9391482765599067, "language_loss": 0.55090737, "learning_rate": 3.999998141915371e-06, "loss": 0.56484896, "num_input_tokens_seen": 10766525, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.10351562, "step": 506, "time_per_iteration": 3.300869941711426 }, { "auxiliary_loss_clip": 0.01514615, "auxiliary_loss_mlp": 0.0030222, "balance_loss_clip": 1.19311118, "balance_loss_mlp": 0.26383421, "epoch": 0.030482489102660455, "flos": 19427817000960.0, "grad_norm": 3.09395363291162, "language_loss": 0.89906734, "learning_rate": 3.999997573114069e-06, "loss": 0.91723567, "num_input_tokens_seen": 10786725, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.3840332, "step": 507, "time_per_iteration": 2.661376714706421 }, { "auxiliary_loss_clip": 0.01530554, "auxiliary_loss_mlp": 0.00305617, "balance_loss_clip": 1.20238423, "balance_loss_mlp": 0.2684471, "epoch": 0.030542612355328423, "flos": 20375822701440.0, "grad_norm": 24.66052323830122, "language_loss": 0.96589637, "learning_rate": 3.999996928472659e-06, "loss": 0.984258, "num_input_tokens_seen": 10805390, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.37182617, "step": 508, "time_per_iteration": 2.6320927143096924 }, { "auxiliary_loss_clip": 0.01529286, "auxiliary_loss_mlp": 0.00319818, "balance_loss_clip": 1.20247126, "balance_loss_mlp": 0.27952483, "epoch": 0.030602735607996392, "flos": 34677354297600.0, "grad_norm": 53.060682858793804, "language_loss": 0.77238065, "learning_rate": 3.999996207991165e-06, "loss": 0.79087174, "num_input_tokens_seen": 10828030, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.40283203, "step": 509, "time_per_iteration": 2.7597920894622803 }, { "auxiliary_loss_clip": 0.01498749, "auxiliary_loss_mlp": 0.0025273, "balance_loss_clip": 1.19064879, "balance_loss_mlp": 0.2159659, "epoch": 0.03066285886066436, "flos": 23658668259840.0, "grad_norm": 14.855713631811353, "language_loss": 0.89747268, "learning_rate": 3.999995411669614e-06, "loss": 0.91498744, "num_input_tokens_seen": 10845240, "router_z_loss_clip": 3.08203125, "router_z_loss_mlp": 0.36767578, "step": 510, "time_per_iteration": 2.6731817722320557 }, { "auxiliary_loss_clip": 0.01486287, "auxiliary_loss_mlp": 0.0029509, "balance_loss_clip": 1.1828804, "balance_loss_mlp": 0.26082876, "epoch": 0.030722982113332332, "flos": 23002975440000.0, "grad_norm": 16.999197705817675, "language_loss": 0.94074953, "learning_rate": 3.999994539508036e-06, "loss": 0.95856327, "num_input_tokens_seen": 10864325, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.34301758, "step": 511, "time_per_iteration": 2.6485049724578857 }, { "auxiliary_loss_clip": 0.01489985, "auxiliary_loss_mlp": 0.00290181, "balance_loss_clip": 1.1758498, "balance_loss_mlp": 0.25303537, "epoch": 0.0307831053660003, "flos": 24750855152640.0, "grad_norm": 63.87178091044093, "language_loss": 0.91400093, "learning_rate": 3.9999935915064655e-06, "loss": 0.93180263, "num_input_tokens_seen": 10883860, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.37182617, "step": 512, "time_per_iteration": 2.652329444885254 }, { "auxiliary_loss_clip": 0.01469858, "auxiliary_loss_mlp": 0.00305389, "balance_loss_clip": 1.16574049, "balance_loss_mlp": 0.26991197, "epoch": 0.03084322861866827, "flos": 26140885620480.0, "grad_norm": 5.467410380538235, "language_loss": 0.95335698, "learning_rate": 3.9999925676649374e-06, "loss": 0.97110951, "num_input_tokens_seen": 10904555, "router_z_loss_clip": 3.04296875, "router_z_loss_mlp": 0.35473633, "step": 513, "time_per_iteration": 2.724252223968506 }, { "auxiliary_loss_clip": 0.01482473, "auxiliary_loss_mlp": 0.00341052, "balance_loss_clip": 1.17050958, "balance_loss_mlp": 0.30261886, "epoch": 0.03090335187133624, "flos": 18771298168320.0, "grad_norm": 9.422891357298525, "language_loss": 0.86297917, "learning_rate": 3.999991467983491e-06, "loss": 0.8812145, "num_input_tokens_seen": 10923700, "router_z_loss_clip": 3.12109375, "router_z_loss_mlp": 0.38378906, "step": 514, "time_per_iteration": 2.605374336242676 }, { "auxiliary_loss_clip": 0.01463462, "auxiliary_loss_mlp": 0.00361657, "balance_loss_clip": 1.15642476, "balance_loss_mlp": 0.32413, "epoch": 0.030963475124004207, "flos": 23221886878080.0, "grad_norm": 553.2286196816439, "language_loss": 0.8670736, "learning_rate": 3.999990292462167e-06, "loss": 0.88532478, "num_input_tokens_seen": 10942730, "router_z_loss_clip": 3.0703125, "router_z_loss_mlp": 0.37524414, "step": 515, "time_per_iteration": 2.7141072750091553 }, { "auxiliary_loss_clip": 0.01470196, "auxiliary_loss_mlp": 0.00373336, "balance_loss_clip": 1.15618575, "balance_loss_mlp": 0.33430719, "epoch": 0.03102359837667218, "flos": 42525595411200.0, "grad_norm": 91.74291686572451, "language_loss": 0.91210866, "learning_rate": 3.999989041101011e-06, "loss": 0.9305439, "num_input_tokens_seen": 10967120, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.39013672, "step": 516, "time_per_iteration": 2.854295492172241 }, { "auxiliary_loss_clip": 0.01444216, "auxiliary_loss_mlp": 0.00365185, "balance_loss_clip": 1.14123392, "balance_loss_mlp": 0.32923099, "epoch": 0.031083721629340148, "flos": 21176953689600.0, "grad_norm": 12.637876151477146, "language_loss": 0.86290765, "learning_rate": 3.999987713900071e-06, "loss": 0.88100165, "num_input_tokens_seen": 10986775, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.35961914, "step": 517, "time_per_iteration": 2.6639604568481445 }, { "auxiliary_loss_clip": 0.01458448, "auxiliary_loss_mlp": 0.00349409, "balance_loss_clip": 1.15857005, "balance_loss_mlp": 0.31464791, "epoch": 0.031143844882008116, "flos": 29716187713920.0, "grad_norm": 6.845053219721069, "language_loss": 0.96038306, "learning_rate": 3.999986310859396e-06, "loss": 0.97846168, "num_input_tokens_seen": 11011360, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.34765625, "step": 518, "time_per_iteration": 2.727872610092163 }, { "auxiliary_loss_clip": 0.0145879, "auxiliary_loss_mlp": 0.0035611, "balance_loss_clip": 1.15677774, "balance_loss_mlp": 0.31965533, "epoch": 0.031203968134676085, "flos": 23112467072640.0, "grad_norm": 9.234371928440192, "language_loss": 0.95085871, "learning_rate": 3.999984831979039e-06, "loss": 0.96900773, "num_input_tokens_seen": 11030150, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.36425781, "step": 519, "time_per_iteration": 2.6519672870635986 }, { "auxiliary_loss_clip": 0.0145317, "auxiliary_loss_mlp": 0.00346983, "balance_loss_clip": 1.14113975, "balance_loss_mlp": 0.31029049, "epoch": 0.03126409138734405, "flos": 20954379064320.0, "grad_norm": 37.613327483904854, "language_loss": 0.9437331, "learning_rate": 3.999983277259057e-06, "loss": 0.96173459, "num_input_tokens_seen": 11049145, "router_z_loss_clip": 3.12109375, "router_z_loss_mlp": 0.36694336, "step": 520, "time_per_iteration": 2.6994664669036865 }, { "auxiliary_loss_clip": 0.01464707, "auxiliary_loss_mlp": 0.00381959, "balance_loss_clip": 1.15429139, "balance_loss_mlp": 0.34400246, "epoch": 0.031324214640012026, "flos": 21650112570240.0, "grad_norm": 109.41931350161167, "language_loss": 0.94820178, "learning_rate": 3.999981646699509e-06, "loss": 0.96666837, "num_input_tokens_seen": 11068835, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.37963867, "step": 521, "time_per_iteration": 2.6520917415618896 }, { "auxiliary_loss_clip": 0.0144627, "auxiliary_loss_mlp": 0.00387856, "balance_loss_clip": 1.14122367, "balance_loss_mlp": 0.35183135, "epoch": 0.03138433789267999, "flos": 23441337020160.0, "grad_norm": 13.76961157632272, "language_loss": 0.77541494, "learning_rate": 3.999979940300456e-06, "loss": 0.79375619, "num_input_tokens_seen": 11088980, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.3605957, "step": 522, "time_per_iteration": 2.7099862098693848 }, { "auxiliary_loss_clip": 0.01447079, "auxiliary_loss_mlp": 0.00376848, "balance_loss_clip": 1.13733935, "balance_loss_mlp": 0.34015507, "epoch": 0.03144446114534796, "flos": 18982164960000.0, "grad_norm": 83.07709269267495, "language_loss": 0.95348203, "learning_rate": 3.999978158061963e-06, "loss": 0.97172129, "num_input_tokens_seen": 11104300, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.36669922, "step": 523, "time_per_iteration": 2.6191391944885254 }, { "auxiliary_loss_clip": 0.01429476, "auxiliary_loss_mlp": 0.00394735, "balance_loss_clip": 1.12157094, "balance_loss_mlp": 0.35763752, "epoch": 0.031504584398015935, "flos": 22637692080000.0, "grad_norm": 19.151827090153482, "language_loss": 1.00478852, "learning_rate": 3.999976299984099e-06, "loss": 1.02303064, "num_input_tokens_seen": 11123335, "router_z_loss_clip": 3.07617188, "router_z_loss_mlp": 0.37109375, "step": 524, "time_per_iteration": 2.753680467605591 }, { "auxiliary_loss_clip": 0.01431105, "auxiliary_loss_mlp": 0.00415275, "balance_loss_clip": 1.12703514, "balance_loss_mlp": 0.37801021, "epoch": 0.0315647076506839, "flos": 25297056339840.0, "grad_norm": 17.7046588126031, "language_loss": 0.90463626, "learning_rate": 3.999974366066933e-06, "loss": 0.92310005, "num_input_tokens_seen": 11140880, "router_z_loss_clip": 3.04296875, "router_z_loss_mlp": 0.37255859, "step": 525, "time_per_iteration": 7.104484558105469 }, { "auxiliary_loss_clip": 0.01411041, "auxiliary_loss_mlp": 0.00396481, "balance_loss_clip": 1.11544585, "balance_loss_mlp": 0.36021784, "epoch": 0.03162483090335187, "flos": 16982839065600.0, "grad_norm": 6.892137231508144, "language_loss": 0.88602221, "learning_rate": 3.999972356310538e-06, "loss": 0.9040975, "num_input_tokens_seen": 11158710, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.36279297, "step": 526, "time_per_iteration": 3.9900999069213867 }, { "auxiliary_loss_clip": 0.01417671, "auxiliary_loss_mlp": 0.00407412, "balance_loss_clip": 1.11478317, "balance_loss_mlp": 0.36986065, "epoch": 0.03168495415601984, "flos": 18734489706240.0, "grad_norm": 32.290830555084426, "language_loss": 0.902354, "learning_rate": 3.999970270714991e-06, "loss": 0.92060483, "num_input_tokens_seen": 11177550, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.37573242, "step": 527, "time_per_iteration": 2.5914225578308105 }, { "auxiliary_loss_clip": 0.01405367, "auxiliary_loss_mlp": 0.00396599, "balance_loss_clip": 1.10656619, "balance_loss_mlp": 0.35926259, "epoch": 0.03174507740868781, "flos": 21214875473280.0, "grad_norm": 23.238630028526792, "language_loss": 1.04777873, "learning_rate": 3.999968109280371e-06, "loss": 1.06579828, "num_input_tokens_seen": 11196230, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.37329102, "step": 528, "time_per_iteration": 2.650545597076416 }, { "auxiliary_loss_clip": 0.01406028, "auxiliary_loss_mlp": 0.0042047, "balance_loss_clip": 1.10637617, "balance_loss_mlp": 0.38482636, "epoch": 0.03180520066135578, "flos": 24787663614720.0, "grad_norm": 10.859910882662396, "language_loss": 0.9004252, "learning_rate": 3.99996587200676e-06, "loss": 0.91869015, "num_input_tokens_seen": 11214935, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.35644531, "step": 529, "time_per_iteration": 2.6545662879943848 }, { "auxiliary_loss_clip": 0.01402594, "auxiliary_loss_mlp": 0.00398653, "balance_loss_clip": 1.11123836, "balance_loss_mlp": 0.36277124, "epoch": 0.03186532391402375, "flos": 24864261367680.0, "grad_norm": 68.68164399603273, "language_loss": 0.98089647, "learning_rate": 3.999963558894243e-06, "loss": 0.998909, "num_input_tokens_seen": 11235310, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.35864258, "step": 530, "time_per_iteration": 2.6588621139526367 }, { "auxiliary_loss_clip": 0.0140758, "auxiliary_loss_mlp": 0.00376178, "balance_loss_clip": 1.10522103, "balance_loss_mlp": 0.33841231, "epoch": 0.03192544716669172, "flos": 21215055041280.0, "grad_norm": 419.9234834610135, "language_loss": 0.83871984, "learning_rate": 3.999961169942907e-06, "loss": 0.85655743, "num_input_tokens_seen": 11254425, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.37744141, "step": 531, "time_per_iteration": 2.664142370223999 }, { "auxiliary_loss_clip": 0.0140912, "auxiliary_loss_mlp": 0.00381424, "balance_loss_clip": 1.11147809, "balance_loss_mlp": 0.3443259, "epoch": 0.03198557041935969, "flos": 24353216616960.0, "grad_norm": 23.678258102373285, "language_loss": 1.0056839, "learning_rate": 3.999958705152843e-06, "loss": 1.02358937, "num_input_tokens_seen": 11274595, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.37109375, "step": 532, "time_per_iteration": 2.653696298599243 }, { "auxiliary_loss_clip": 0.01244597, "auxiliary_loss_mlp": 0.00331241, "balance_loss_clip": 1.0033561, "balance_loss_mlp": 0.31302628, "epoch": 0.032045693672027656, "flos": 61827367587840.0, "grad_norm": 0.7693832825670364, "language_loss": 0.5764575, "learning_rate": 3.9999561645241445e-06, "loss": 0.5922159, "num_input_tokens_seen": 11336705, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.18261719, "step": 533, "time_per_iteration": 3.1847927570343018 }, { "auxiliary_loss_clip": 0.01417275, "auxiliary_loss_mlp": 0.00389014, "balance_loss_clip": 1.11583483, "balance_loss_mlp": 0.35163003, "epoch": 0.03210581692469563, "flos": 28401174800640.0, "grad_norm": 14.938099841623377, "language_loss": 0.95006204, "learning_rate": 3.999953548056907e-06, "loss": 0.96812493, "num_input_tokens_seen": 11356820, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.37426758, "step": 534, "time_per_iteration": 2.6831164360046387 }, { "auxiliary_loss_clip": 0.01431815, "auxiliary_loss_mlp": 0.00385965, "balance_loss_clip": 1.12664342, "balance_loss_mlp": 0.34669715, "epoch": 0.03216594017736359, "flos": 24717709877760.0, "grad_norm": 56.31789877808908, "language_loss": 0.86914629, "learning_rate": 3.999950855751232e-06, "loss": 0.88732409, "num_input_tokens_seen": 11376645, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.39257812, "step": 535, "time_per_iteration": 2.790473699569702 }, { "auxiliary_loss_clip": 0.01428891, "auxiliary_loss_mlp": 0.00374157, "balance_loss_clip": 1.13094544, "balance_loss_mlp": 0.33586666, "epoch": 0.032226063430031565, "flos": 31175453646720.0, "grad_norm": 526.7941884697213, "language_loss": 0.8899107, "learning_rate": 3.999948087607219e-06, "loss": 0.90794122, "num_input_tokens_seen": 11397310, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.3828125, "step": 536, "time_per_iteration": 2.8727195262908936 }, { "auxiliary_loss_clip": 0.01435018, "auxiliary_loss_mlp": 0.00407134, "balance_loss_clip": 1.13346696, "balance_loss_mlp": 0.36536288, "epoch": 0.03228618668269954, "flos": 32198225506560.0, "grad_norm": 5.63621813557084, "language_loss": 0.78065759, "learning_rate": 3.999945243624975e-06, "loss": 0.79907906, "num_input_tokens_seen": 11418475, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.41772461, "step": 537, "time_per_iteration": 2.757281541824341 }, { "auxiliary_loss_clip": 0.01427019, "auxiliary_loss_mlp": 0.00381439, "balance_loss_clip": 1.12819004, "balance_loss_mlp": 0.34446031, "epoch": 0.0323463099353675, "flos": 22670154996480.0, "grad_norm": 328.7442255678031, "language_loss": 0.91313142, "learning_rate": 3.999942323804607e-06, "loss": 0.93121594, "num_input_tokens_seen": 11436630, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.37011719, "step": 538, "time_per_iteration": 2.649658441543579 }, { "auxiliary_loss_clip": 0.01455074, "auxiliary_loss_mlp": 0.00430085, "balance_loss_clip": 1.14582765, "balance_loss_mlp": 0.387909, "epoch": 0.032406433188035474, "flos": 26905172232960.0, "grad_norm": 31.55741572942659, "language_loss": 0.86308122, "learning_rate": 3.999939328146225e-06, "loss": 0.8819328, "num_input_tokens_seen": 11457275, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.42138672, "step": 539, "time_per_iteration": 2.6828184127807617 }, { "auxiliary_loss_clip": 0.01437098, "auxiliary_loss_mlp": 0.00425216, "balance_loss_clip": 1.13273478, "balance_loss_mlp": 0.38272941, "epoch": 0.03246655644070344, "flos": 31503928544640.0, "grad_norm": 23.300193756156954, "language_loss": 0.84225589, "learning_rate": 3.999936256649943e-06, "loss": 0.860879, "num_input_tokens_seen": 11476925, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.42480469, "step": 540, "time_per_iteration": 2.7755157947540283 }, { "auxiliary_loss_clip": 0.01456472, "auxiliary_loss_mlp": 0.00429091, "balance_loss_clip": 1.14925909, "balance_loss_mlp": 0.38414946, "epoch": 0.03252667969337141, "flos": 23218331431680.0, "grad_norm": 176.41188597304046, "language_loss": 0.93958241, "learning_rate": 3.999933109315878e-06, "loss": 0.95843804, "num_input_tokens_seen": 11496830, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.44946289, "step": 541, "time_per_iteration": 2.6530449390411377 }, { "auxiliary_loss_clip": 0.01437866, "auxiliary_loss_mlp": 0.00433644, "balance_loss_clip": 1.1411612, "balance_loss_mlp": 0.39130038, "epoch": 0.032586802946039384, "flos": 14757454926720.0, "grad_norm": 16.364442470543636, "language_loss": 0.96431792, "learning_rate": 3.9999298861441496e-06, "loss": 0.98303306, "num_input_tokens_seen": 11515605, "router_z_loss_clip": 2.97070312, "router_z_loss_mlp": 0.42333984, "step": 542, "time_per_iteration": 2.669996976852417 }, { "auxiliary_loss_clip": 0.01443739, "auxiliary_loss_mlp": 0.00443743, "balance_loss_clip": 1.13566113, "balance_loss_mlp": 0.40256789, "epoch": 0.03264692619870735, "flos": 24280677100800.0, "grad_norm": 60.11754299098067, "language_loss": 0.80248559, "learning_rate": 3.999926587134879e-06, "loss": 0.82136047, "num_input_tokens_seen": 11536230, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.41186523, "step": 543, "time_per_iteration": 2.6673216819763184 }, { "auxiliary_loss_clip": 0.01454557, "auxiliary_loss_mlp": 0.00455767, "balance_loss_clip": 1.1382401, "balance_loss_mlp": 0.41278037, "epoch": 0.03270704945137532, "flos": 22893160584960.0, "grad_norm": 5.319865603142345, "language_loss": 1.01399291, "learning_rate": 3.999923212288192e-06, "loss": 1.03309631, "num_input_tokens_seen": 11554715, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.42993164, "step": 544, "time_per_iteration": 2.6687207221984863 }, { "auxiliary_loss_clip": 0.01463236, "auxiliary_loss_mlp": 0.00509963, "balance_loss_clip": 1.14999413, "balance_loss_mlp": 0.46545005, "epoch": 0.032767172704043286, "flos": 18041018757120.0, "grad_norm": 20.14819768742619, "language_loss": 0.77094567, "learning_rate": 3.999919761604216e-06, "loss": 0.79067767, "num_input_tokens_seen": 11571370, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.44506836, "step": 545, "time_per_iteration": 2.6066088676452637 }, { "auxiliary_loss_clip": 0.01459992, "auxiliary_loss_mlp": 0.00484205, "balance_loss_clip": 1.14656448, "balance_loss_mlp": 0.43993023, "epoch": 0.03282729595671126, "flos": 22528739151360.0, "grad_norm": 34.310423874500415, "language_loss": 1.01545835, "learning_rate": 3.999916235083083e-06, "loss": 1.03490043, "num_input_tokens_seen": 11588560, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.44287109, "step": 546, "time_per_iteration": 2.7529876232147217 }, { "auxiliary_loss_clip": 0.01433524, "auxiliary_loss_mlp": 0.00449381, "balance_loss_clip": 1.12726116, "balance_loss_mlp": 0.40441495, "epoch": 0.03288741920937923, "flos": 20410620001920.0, "grad_norm": 26.707600559384097, "language_loss": 0.91979563, "learning_rate": 3.999912632724925e-06, "loss": 0.93862474, "num_input_tokens_seen": 11605685, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.44995117, "step": 547, "time_per_iteration": 2.655493974685669 }, { "auxiliary_loss_clip": 0.01429994, "auxiliary_loss_mlp": 0.00528379, "balance_loss_clip": 1.1212585, "balance_loss_mlp": 0.47964653, "epoch": 0.032947542462047195, "flos": 20777986350720.0, "grad_norm": 8.923209221035412, "language_loss": 0.89867127, "learning_rate": 3.999908954529881e-06, "loss": 0.91825497, "num_input_tokens_seen": 11626290, "router_z_loss_clip": 3.0859375, "router_z_loss_mlp": 0.48779297, "step": 548, "time_per_iteration": 2.7090330123901367 }, { "auxiliary_loss_clip": 0.01415985, "auxiliary_loss_mlp": 0.00486902, "balance_loss_clip": 1.11845803, "balance_loss_mlp": 0.44250861, "epoch": 0.03300766571471517, "flos": 19901263190400.0, "grad_norm": 312.6066792198882, "language_loss": 0.79911727, "learning_rate": 3.999905200498087e-06, "loss": 0.81814617, "num_input_tokens_seen": 11643950, "router_z_loss_clip": 2.97070312, "router_z_loss_mlp": 0.44335938, "step": 549, "time_per_iteration": 2.674975633621216 }, { "auxiliary_loss_clip": 0.01408982, "auxiliary_loss_mlp": 0.00461059, "balance_loss_clip": 1.11810076, "balance_loss_mlp": 0.41876343, "epoch": 0.03306778896738313, "flos": 17967760968960.0, "grad_norm": 22.136794942723494, "language_loss": 0.92098647, "learning_rate": 3.999901370629689e-06, "loss": 0.93968689, "num_input_tokens_seen": 11662560, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.42333984, "step": 550, "time_per_iteration": 2.6124558448791504 }, { "auxiliary_loss_clip": 0.01398223, "auxiliary_loss_mlp": 0.00475955, "balance_loss_clip": 1.11558187, "balance_loss_mlp": 0.4373554, "epoch": 0.033127912220051105, "flos": 21653380707840.0, "grad_norm": 22.723804039138805, "language_loss": 0.88557947, "learning_rate": 3.99989746492483e-06, "loss": 0.90432125, "num_input_tokens_seen": 11682265, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.38598633, "step": 551, "time_per_iteration": 2.659625291824341 }, { "auxiliary_loss_clip": 0.01416708, "auxiliary_loss_mlp": 0.00489711, "balance_loss_clip": 1.11481047, "balance_loss_mlp": 0.44255143, "epoch": 0.03318803547271908, "flos": 30188376927360.0, "grad_norm": 41.63544371367076, "language_loss": 0.96905786, "learning_rate": 3.999893483383658e-06, "loss": 0.98812199, "num_input_tokens_seen": 11699300, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.47167969, "step": 552, "time_per_iteration": 2.717057943344116 }, { "auxiliary_loss_clip": 0.01399877, "auxiliary_loss_mlp": 0.00468806, "balance_loss_clip": 1.10748768, "balance_loss_mlp": 0.4273209, "epoch": 0.03324815872538704, "flos": 20376038183040.0, "grad_norm": 173.49162987535243, "language_loss": 1.0129168, "learning_rate": 3.999889426006326e-06, "loss": 1.03160357, "num_input_tokens_seen": 11716955, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.41455078, "step": 553, "time_per_iteration": 2.6790990829467773 }, { "auxiliary_loss_clip": 0.01392059, "auxiliary_loss_mlp": 0.00473758, "balance_loss_clip": 1.10543799, "balance_loss_mlp": 0.43277425, "epoch": 0.033308281978055014, "flos": 24494560634880.0, "grad_norm": 134.77969276687043, "language_loss": 0.86991549, "learning_rate": 3.999885292792986e-06, "loss": 0.88857371, "num_input_tokens_seen": 11736130, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.41015625, "step": 554, "time_per_iteration": 2.68129563331604 }, { "auxiliary_loss_clip": 0.01402678, "auxiliary_loss_mlp": 0.00427516, "balance_loss_clip": 1.1220783, "balance_loss_mlp": 0.38524458, "epoch": 0.03336840523072298, "flos": 23400326666880.0, "grad_norm": 13594.393388761804, "language_loss": 0.90083694, "learning_rate": 3.999881083743795e-06, "loss": 0.91913891, "num_input_tokens_seen": 11754425, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.42236328, "step": 555, "time_per_iteration": 2.6592769622802734 }, { "auxiliary_loss_clip": 0.01419177, "auxiliary_loss_mlp": 0.00377343, "balance_loss_clip": 1.12870145, "balance_loss_mlp": 0.33397514, "epoch": 0.03342852848339095, "flos": 30550571717760.0, "grad_norm": 5.415235348014524, "language_loss": 0.99613839, "learning_rate": 3.999876798858914e-06, "loss": 1.01410365, "num_input_tokens_seen": 11772845, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.43334961, "step": 556, "time_per_iteration": 2.6849286556243896 }, { "auxiliary_loss_clip": 0.01420982, "auxiliary_loss_mlp": 0.00403536, "balance_loss_clip": 1.13188338, "balance_loss_mlp": 0.36083519, "epoch": 0.03348865173605892, "flos": 22893304239360.0, "grad_norm": 79.67036984617063, "language_loss": 0.93315685, "learning_rate": 3.999872438138503e-06, "loss": 0.95140201, "num_input_tokens_seen": 11792850, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.42700195, "step": 557, "time_per_iteration": 2.6483161449432373 }, { "auxiliary_loss_clip": 0.01441726, "auxiliary_loss_mlp": 0.00397586, "balance_loss_clip": 1.14553046, "balance_loss_mlp": 0.35540974, "epoch": 0.03354877498872689, "flos": 17676022705920.0, "grad_norm": 229.2299094226992, "language_loss": 1.05163693, "learning_rate": 3.999868001582729e-06, "loss": 1.07003009, "num_input_tokens_seen": 11809670, "router_z_loss_clip": 2.9609375, "router_z_loss_mlp": 0.421875, "step": 558, "time_per_iteration": 2.83170485496521 }, { "auxiliary_loss_clip": 0.01454886, "auxiliary_loss_mlp": 0.00377803, "balance_loss_clip": 1.15662587, "balance_loss_mlp": 0.33965608, "epoch": 0.03360889824139486, "flos": 21652985658240.0, "grad_norm": 37.95872776710402, "language_loss": 0.86306882, "learning_rate": 3.99986348919176e-06, "loss": 0.8813957, "num_input_tokens_seen": 11829665, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.38134766, "step": 559, "time_per_iteration": 2.7128684520721436 }, { "auxiliary_loss_clip": 0.0146213, "auxiliary_loss_mlp": 0.00343619, "balance_loss_clip": 1.17122161, "balance_loss_mlp": 0.30642527, "epoch": 0.033669021494062826, "flos": 21795730306560.0, "grad_norm": 12.106716410882317, "language_loss": 0.94454145, "learning_rate": 3.9998589009657675e-06, "loss": 0.96259892, "num_input_tokens_seen": 11848190, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.37207031, "step": 560, "time_per_iteration": 2.630286455154419 }, { "auxiliary_loss_clip": 0.01480486, "auxiliary_loss_mlp": 0.00343919, "balance_loss_clip": 1.18209958, "balance_loss_mlp": 0.30746448, "epoch": 0.0337291447467308, "flos": 21866222747520.0, "grad_norm": 46.79975559390302, "language_loss": 0.88768727, "learning_rate": 3.999854236904925e-06, "loss": 0.90593135, "num_input_tokens_seen": 11864795, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.36474609, "step": 561, "time_per_iteration": 2.6813862323760986 }, { "auxiliary_loss_clip": 0.01517033, "auxiliary_loss_mlp": 0.00397801, "balance_loss_clip": 1.21353054, "balance_loss_mlp": 0.36055994, "epoch": 0.03378926799939877, "flos": 24245951627520.0, "grad_norm": 46.99898596212948, "language_loss": 0.88649303, "learning_rate": 3.999849497009409e-06, "loss": 0.90564132, "num_input_tokens_seen": 11885275, "router_z_loss_clip": 3.03710938, "router_z_loss_mlp": 0.37207031, "step": 562, "time_per_iteration": 2.7265377044677734 }, { "auxiliary_loss_clip": 0.0152929, "auxiliary_loss_mlp": 0.00408223, "balance_loss_clip": 1.22183669, "balance_loss_mlp": 0.3703618, "epoch": 0.033849391252066735, "flos": 16507812677760.0, "grad_norm": 12.913551012918083, "language_loss": 0.90110111, "learning_rate": 3.999844681279401e-06, "loss": 0.92047614, "num_input_tokens_seen": 11903595, "router_z_loss_clip": 3.07617188, "router_z_loss_mlp": 0.37866211, "step": 563, "time_per_iteration": 2.6296610832214355 }, { "auxiliary_loss_clip": 0.01573137, "auxiliary_loss_mlp": 0.00535282, "balance_loss_clip": 1.25706983, "balance_loss_mlp": 0.49885139, "epoch": 0.03390951450473471, "flos": 15669298609920.0, "grad_norm": 44.335420908147455, "language_loss": 0.99358177, "learning_rate": 3.99983978971508e-06, "loss": 1.01466608, "num_input_tokens_seen": 11917815, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.36401367, "step": 564, "time_per_iteration": 2.635957717895508 }, { "auxiliary_loss_clip": 0.01605492, "auxiliary_loss_mlp": 0.00757579, "balance_loss_clip": 1.26272273, "balance_loss_mlp": 0.7084173, "epoch": 0.03396963775740267, "flos": 22674787850880.0, "grad_norm": 203.50803183057292, "language_loss": 1.01487458, "learning_rate": 3.999834822316635e-06, "loss": 1.03850532, "num_input_tokens_seen": 11936305, "router_z_loss_clip": 3.42578125, "router_z_loss_mlp": 0.49194336, "step": 565, "time_per_iteration": 2.669855833053589 }, { "auxiliary_loss_clip": 0.01622963, "auxiliary_loss_mlp": 0.01001888, "balance_loss_clip": 1.3108145, "balance_loss_mlp": 0.96602994, "epoch": 0.034029761010070644, "flos": 64392683063040.0, "grad_norm": 1.8677573933637512, "language_loss": 0.56518769, "learning_rate": 3.9998297790842535e-06, "loss": 0.59143615, "num_input_tokens_seen": 11998940, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.359375, "step": 566, "time_per_iteration": 3.2568817138671875 }, { "auxiliary_loss_clip": 0.01642218, "auxiliary_loss_mlp": 0.01026052, "balance_loss_clip": 1.28326082, "balance_loss_mlp": 0.96809232, "epoch": 0.034089884262738616, "flos": 25004204755200.0, "grad_norm": 1498.6202311004936, "language_loss": 0.83743101, "learning_rate": 3.999824660018126e-06, "loss": 0.86411369, "num_input_tokens_seen": 12018860, "router_z_loss_clip": 3.59375, "router_z_loss_mlp": 0.57983398, "step": 567, "time_per_iteration": 4.146958827972412 }, { "auxiliary_loss_clip": 0.01649733, "auxiliary_loss_mlp": 0.01071359, "balance_loss_clip": 1.30259621, "balance_loss_mlp": 1.01673698, "epoch": 0.03415000751540658, "flos": 28439096584320.0, "grad_norm": 8.853944548668016, "language_loss": 0.8747673, "learning_rate": 3.999819465118447e-06, "loss": 0.90197819, "num_input_tokens_seen": 12039675, "router_z_loss_clip": 3.47460938, "router_z_loss_mlp": 0.54638672, "step": 568, "time_per_iteration": 5.578441143035889 }, { "auxiliary_loss_clip": 0.01681329, "auxiliary_loss_mlp": 0.01012992, "balance_loss_clip": 1.32591093, "balance_loss_mlp": 0.95677346, "epoch": 0.034210130768074554, "flos": 21468727866240.0, "grad_norm": 43.262484023601175, "language_loss": 0.92702097, "learning_rate": 3.999814194385413e-06, "loss": 0.95396417, "num_input_tokens_seen": 12057680, "router_z_loss_clip": 3.55859375, "router_z_loss_mlp": 0.56176758, "step": 569, "time_per_iteration": 2.6144187450408936 }, { "auxiliary_loss_clip": 0.01702387, "auxiliary_loss_mlp": 0.00967973, "balance_loss_clip": 1.33617926, "balance_loss_mlp": 0.91161126, "epoch": 0.03427025402074252, "flos": 18697501676160.0, "grad_norm": 22.446507801485726, "language_loss": 1.01552176, "learning_rate": 3.9998088478192255e-06, "loss": 1.04222536, "num_input_tokens_seen": 12076135, "router_z_loss_clip": 3.66210938, "router_z_loss_mlp": 0.56347656, "step": 570, "time_per_iteration": 2.666616201400757 }, { "auxiliary_loss_clip": 0.01742455, "auxiliary_loss_mlp": 0.0092479, "balance_loss_clip": 1.35013866, "balance_loss_mlp": 0.86232466, "epoch": 0.03433037727341049, "flos": 20849987162880.0, "grad_norm": 1077.7680718825236, "language_loss": 0.87356269, "learning_rate": 3.9998034254200846e-06, "loss": 0.90023512, "num_input_tokens_seen": 12094785, "router_z_loss_clip": 3.92578125, "router_z_loss_mlp": 0.62451172, "step": 571, "time_per_iteration": 2.7737529277801514 }, { "auxiliary_loss_clip": 0.01761361, "auxiliary_loss_mlp": 0.00986959, "balance_loss_clip": 1.36948776, "balance_loss_mlp": 0.92492282, "epoch": 0.03439050052607846, "flos": 25410282986880.0, "grad_norm": 74.58121092739677, "language_loss": 0.89474499, "learning_rate": 3.999797927188199e-06, "loss": 0.92222822, "num_input_tokens_seen": 12114590, "router_z_loss_clip": 3.921875, "router_z_loss_mlp": 0.62060547, "step": 572, "time_per_iteration": 2.749459981918335 }, { "auxiliary_loss_clip": 0.01805608, "auxiliary_loss_mlp": 0.00967457, "balance_loss_clip": 1.3915472, "balance_loss_mlp": 0.89922154, "epoch": 0.03445062377874643, "flos": 17640147997440.0, "grad_norm": 33.6482832974872, "language_loss": 0.90398079, "learning_rate": 3.999792353123774e-06, "loss": 0.93171138, "num_input_tokens_seen": 12132390, "router_z_loss_clip": 4.13671875, "router_z_loss_mlp": 0.68261719, "step": 573, "time_per_iteration": 2.5988261699676514 }, { "auxiliary_loss_clip": 0.01814002, "auxiliary_loss_mlp": 0.00890432, "balance_loss_clip": 1.39303374, "balance_loss_mlp": 0.82558239, "epoch": 0.0345107470314144, "flos": 16764502245120.0, "grad_norm": 14.868498631335719, "language_loss": 0.85061079, "learning_rate": 3.999786703227023e-06, "loss": 0.87765515, "num_input_tokens_seen": 12149035, "router_z_loss_clip": 4.2109375, "router_z_loss_mlp": 0.64892578, "step": 574, "time_per_iteration": 2.673856735229492 }, { "auxiliary_loss_clip": 0.01833263, "auxiliary_loss_mlp": 0.01003166, "balance_loss_clip": 1.40273416, "balance_loss_mlp": 0.93082976, "epoch": 0.03457087028408237, "flos": 14684448533760.0, "grad_norm": 211.00482096691226, "language_loss": 0.91248941, "learning_rate": 3.9997809774981606e-06, "loss": 0.94085371, "num_input_tokens_seen": 12167530, "router_z_loss_clip": 4.3046875, "router_z_loss_mlp": 0.72314453, "step": 575, "time_per_iteration": 2.610645055770874 }, { "auxiliary_loss_clip": 0.01868642, "auxiliary_loss_mlp": 0.00928575, "balance_loss_clip": 1.42889428, "balance_loss_mlp": 0.86253327, "epoch": 0.03463099353675034, "flos": 20011293527040.0, "grad_norm": 34.77446711763382, "language_loss": 0.8858934, "learning_rate": 3.9997751759374025e-06, "loss": 0.91386557, "num_input_tokens_seen": 12186340, "router_z_loss_clip": 4.40234375, "router_z_loss_mlp": 0.66015625, "step": 576, "time_per_iteration": 2.6080243587493896 }, { "auxiliary_loss_clip": 0.01892458, "auxiliary_loss_mlp": 0.00922645, "balance_loss_clip": 1.45069718, "balance_loss_mlp": 0.85841531, "epoch": 0.03469111678941831, "flos": 25301150490240.0, "grad_norm": 30.295653150372363, "language_loss": 0.91821396, "learning_rate": 3.99976929854497e-06, "loss": 0.946365, "num_input_tokens_seen": 12204090, "router_z_loss_clip": 4.4140625, "router_z_loss_mlp": 0.64257812, "step": 577, "time_per_iteration": 2.6505420207977295 }, { "auxiliary_loss_clip": 0.01917375, "auxiliary_loss_mlp": 0.00815244, "balance_loss_clip": 1.46010435, "balance_loss_mlp": 0.7511574, "epoch": 0.034751240042086275, "flos": 23259413612160.0, "grad_norm": 25.263367315564192, "language_loss": 0.7807588, "learning_rate": 3.9997633453210845e-06, "loss": 0.80808502, "num_input_tokens_seen": 12224850, "router_z_loss_clip": 4.57421875, "router_z_loss_mlp": 0.64160156, "step": 578, "time_per_iteration": 2.6346471309661865 }, { "auxiliary_loss_clip": 0.01964185, "auxiliary_loss_mlp": 0.00846752, "balance_loss_clip": 1.48495173, "balance_loss_mlp": 0.77761054, "epoch": 0.03481136329475425, "flos": 23769237300480.0, "grad_norm": 27.096566289297, "language_loss": 0.82887483, "learning_rate": 3.999757316265973e-06, "loss": 0.85698414, "num_input_tokens_seen": 12244935, "router_z_loss_clip": 4.80078125, "router_z_loss_mlp": 0.69189453, "step": 579, "time_per_iteration": 2.708754777908325 }, { "auxiliary_loss_clip": 0.01980446, "auxiliary_loss_mlp": 0.00766055, "balance_loss_clip": 1.4903897, "balance_loss_mlp": 0.70296997, "epoch": 0.03487148654742222, "flos": 20157521794560.0, "grad_norm": 6.171736537355569, "language_loss": 0.909549, "learning_rate": 3.999751211379863e-06, "loss": 0.93701398, "num_input_tokens_seen": 12262140, "router_z_loss_clip": 4.90234375, "router_z_loss_mlp": 0.63085938, "step": 580, "time_per_iteration": 2.644131660461426 }, { "auxiliary_loss_clip": 0.01994816, "auxiliary_loss_mlp": 0.00809086, "balance_loss_clip": 1.50035203, "balance_loss_mlp": 0.74495143, "epoch": 0.034931609800090184, "flos": 15669585918720.0, "grad_norm": 414.56554159352845, "language_loss": 0.89531243, "learning_rate": 3.999745030662987e-06, "loss": 0.92335141, "num_input_tokens_seen": 12280930, "router_z_loss_clip": 4.9375, "router_z_loss_mlp": 0.64111328, "step": 581, "time_per_iteration": 2.6543874740600586 }, { "auxiliary_loss_clip": 0.02020242, "auxiliary_loss_mlp": 0.00709909, "balance_loss_clip": 1.52640152, "balance_loss_mlp": 0.65287942, "epoch": 0.034991733052758156, "flos": 16362374509440.0, "grad_norm": 6.665103957820782, "language_loss": 0.8334043, "learning_rate": 3.99973877411558e-06, "loss": 0.86070573, "num_input_tokens_seen": 12299125, "router_z_loss_clip": 4.94140625, "router_z_loss_mlp": 0.56982422, "step": 582, "time_per_iteration": 2.6119818687438965 }, { "auxiliary_loss_clip": 0.02019719, "auxiliary_loss_mlp": 0.0073515, "balance_loss_clip": 1.52335572, "balance_loss_mlp": 0.67139739, "epoch": 0.03505185630542612, "flos": 19387309438080.0, "grad_norm": 21.997048664800474, "language_loss": 0.93027049, "learning_rate": 3.999732441737877e-06, "loss": 0.95781922, "num_input_tokens_seen": 12316905, "router_z_loss_clip": 4.96484375, "router_z_loss_mlp": 0.63671875, "step": 583, "time_per_iteration": 2.740124464035034 }, { "auxiliary_loss_clip": 0.02041112, "auxiliary_loss_mlp": 0.00690405, "balance_loss_clip": 1.52354217, "balance_loss_mlp": 0.62865436, "epoch": 0.03511197955809409, "flos": 21323828401920.0, "grad_norm": 296.0604880249817, "language_loss": 0.87412864, "learning_rate": 3.99972603353012e-06, "loss": 0.90144378, "num_input_tokens_seen": 12335070, "router_z_loss_clip": 5.17578125, "router_z_loss_mlp": 0.6171875, "step": 584, "time_per_iteration": 2.624398946762085 }, { "auxiliary_loss_clip": 0.02014235, "auxiliary_loss_mlp": 0.00687322, "balance_loss_clip": 1.51288009, "balance_loss_mlp": 0.62561977, "epoch": 0.035172102810762065, "flos": 14136595320960.0, "grad_norm": 315.6845240989909, "language_loss": 1.01652288, "learning_rate": 3.999719549492551e-06, "loss": 1.04353845, "num_input_tokens_seen": 12350315, "router_z_loss_clip": 5.01953125, "router_z_loss_mlp": 0.6171875, "step": 585, "time_per_iteration": 2.643486261367798 }, { "auxiliary_loss_clip": 0.02008165, "auxiliary_loss_mlp": 0.0070608, "balance_loss_clip": 1.50918734, "balance_loss_mlp": 0.64451993, "epoch": 0.03523222606343003, "flos": 20296890564480.0, "grad_norm": 237.65832751070636, "language_loss": 0.93462563, "learning_rate": 3.9997129896254165e-06, "loss": 0.96176809, "num_input_tokens_seen": 12366030, "router_z_loss_clip": 4.9921875, "router_z_loss_mlp": 0.61523438, "step": 586, "time_per_iteration": 2.6885077953338623 }, { "auxiliary_loss_clip": 0.02020211, "auxiliary_loss_mlp": 0.00691459, "balance_loss_clip": 1.5188297, "balance_loss_mlp": 0.63147265, "epoch": 0.035292349316098, "flos": 20375822701440.0, "grad_norm": 14.504496014331218, "language_loss": 0.82897758, "learning_rate": 3.999706353928965e-06, "loss": 0.85609436, "num_input_tokens_seen": 12384895, "router_z_loss_clip": 5.0078125, "router_z_loss_mlp": 0.60009766, "step": 587, "time_per_iteration": 2.6590113639831543 }, { "auxiliary_loss_clip": 0.02029785, "auxiliary_loss_mlp": 0.00687763, "balance_loss_clip": 1.51845419, "balance_loss_mlp": 0.62467724, "epoch": 0.03535247256876597, "flos": 21468871520640.0, "grad_norm": 14.244863750209356, "language_loss": 0.83517396, "learning_rate": 3.999699642403449e-06, "loss": 0.86234945, "num_input_tokens_seen": 12404980, "router_z_loss_clip": 5.11328125, "router_z_loss_mlp": 0.63110352, "step": 588, "time_per_iteration": 2.626535177230835 }, { "auxiliary_loss_clip": 0.02002217, "auxiliary_loss_mlp": 0.00705739, "balance_loss_clip": 1.50593865, "balance_loss_mlp": 0.63831377, "epoch": 0.03541259582143394, "flos": 23623044946560.0, "grad_norm": 97.69233701336675, "language_loss": 1.02350628, "learning_rate": 3.99969285504912e-06, "loss": 1.05058587, "num_input_tokens_seen": 12423835, "router_z_loss_clip": 4.9609375, "router_z_loss_mlp": 0.67431641, "step": 589, "time_per_iteration": 2.658149480819702 }, { "auxiliary_loss_clip": 0.0200716, "auxiliary_loss_mlp": 0.00654137, "balance_loss_clip": 1.513098, "balance_loss_mlp": 0.59543884, "epoch": 0.03547271907410191, "flos": 33726367768320.0, "grad_norm": 11.777739714064962, "language_loss": 0.90522361, "learning_rate": 3.99968599186624e-06, "loss": 0.93183661, "num_input_tokens_seen": 12443135, "router_z_loss_clip": 4.9375, "router_z_loss_mlp": 0.58691406, "step": 590, "time_per_iteration": 2.708995819091797 }, { "auxiliary_loss_clip": 0.02007494, "auxiliary_loss_mlp": 0.00627235, "balance_loss_clip": 1.51799631, "balance_loss_mlp": 0.57068229, "epoch": 0.03553284232676988, "flos": 21142695093120.0, "grad_norm": 18.3018832348814, "language_loss": 0.93866599, "learning_rate": 3.999679052855065e-06, "loss": 0.96501327, "num_input_tokens_seen": 12462895, "router_z_loss_clip": 4.890625, "router_z_loss_mlp": 0.56591797, "step": 591, "time_per_iteration": 2.6830832958221436 }, { "auxiliary_loss_clip": 0.02018357, "auxiliary_loss_mlp": 0.00635042, "balance_loss_clip": 1.51816225, "balance_loss_mlp": 0.57310134, "epoch": 0.03559296557943785, "flos": 20046593617920.0, "grad_norm": 36.099413574747146, "language_loss": 0.90049314, "learning_rate": 3.999672038015861e-06, "loss": 0.92702723, "num_input_tokens_seen": 12481515, "router_z_loss_clip": 5.00390625, "router_z_loss_mlp": 0.61914062, "step": 592, "time_per_iteration": 2.6144847869873047 }, { "auxiliary_loss_clip": 0.01949497, "auxiliary_loss_mlp": 0.00531676, "balance_loss_clip": 1.50533867, "balance_loss_mlp": 0.50573599, "epoch": 0.035653088832105814, "flos": 60334597244160.0, "grad_norm": 1.2813554423647389, "language_loss": 0.60362959, "learning_rate": 3.999664947348893e-06, "loss": 0.62844121, "num_input_tokens_seen": 12548220, "router_z_loss_clip": 4.4375, "router_z_loss_mlp": 0.25976562, "step": 593, "time_per_iteration": 3.1666927337646484 }, { "auxiliary_loss_clip": 0.01977755, "auxiliary_loss_mlp": 0.00614455, "balance_loss_clip": 1.50727212, "balance_loss_mlp": 0.55656743, "epoch": 0.035713212084773786, "flos": 20113135562880.0, "grad_norm": 50.77649937291338, "language_loss": 0.92572582, "learning_rate": 3.999657780854429e-06, "loss": 0.95164788, "num_input_tokens_seen": 12566105, "router_z_loss_clip": 4.703125, "router_z_loss_mlp": 0.57836914, "step": 594, "time_per_iteration": 2.6271414756774902 }, { "auxiliary_loss_clip": 0.0195604, "auxiliary_loss_mlp": 0.0060608, "balance_loss_clip": 1.48770142, "balance_loss_mlp": 0.5499559, "epoch": 0.03577333533744176, "flos": 26285785084800.0, "grad_norm": 33.92128025014446, "language_loss": 0.90670955, "learning_rate": 3.999650538532742e-06, "loss": 0.93233073, "num_input_tokens_seen": 12586680, "router_z_loss_clip": 4.6796875, "router_z_loss_mlp": 0.56201172, "step": 595, "time_per_iteration": 2.6685123443603516 }, { "auxiliary_loss_clip": 0.01943685, "auxiliary_loss_mlp": 0.00649888, "balance_loss_clip": 1.48400664, "balance_loss_mlp": 0.58828044, "epoch": 0.035833458590109724, "flos": 10889732211840.0, "grad_norm": 9.945465030226796, "language_loss": 1.03141451, "learning_rate": 3.999643220384106e-06, "loss": 1.05735016, "num_input_tokens_seen": 12601605, "router_z_loss_clip": 4.59375, "router_z_loss_mlp": 0.61572266, "step": 596, "time_per_iteration": 2.649545431137085 }, { "auxiliary_loss_clip": 0.01900971, "auxiliary_loss_mlp": 0.00629936, "balance_loss_clip": 1.46156228, "balance_loss_mlp": 0.57331192, "epoch": 0.035893581842777696, "flos": 22090198003200.0, "grad_norm": 62.40770534230728, "language_loss": 0.90476179, "learning_rate": 3.999635826408799e-06, "loss": 0.93007088, "num_input_tokens_seen": 12620365, "router_z_loss_clip": 4.39453125, "router_z_loss_mlp": 0.56616211, "step": 597, "time_per_iteration": 2.6184678077697754 }, { "auxiliary_loss_clip": 0.01856841, "auxiliary_loss_mlp": 0.00689091, "balance_loss_clip": 1.43695235, "balance_loss_mlp": 0.6286279, "epoch": 0.03595370509544566, "flos": 23038347358080.0, "grad_norm": 17.785870669816784, "language_loss": 0.85853088, "learning_rate": 3.999628356607101e-06, "loss": 0.88399023, "num_input_tokens_seen": 12641140, "router_z_loss_clip": 4.1953125, "router_z_loss_mlp": 0.60400391, "step": 598, "time_per_iteration": 2.6845881938934326 }, { "auxiliary_loss_clip": 0.01842241, "auxiliary_loss_mlp": 0.00670352, "balance_loss_clip": 1.44181287, "balance_loss_mlp": 0.61155838, "epoch": 0.03601382834811363, "flos": 20777734955520.0, "grad_norm": 58.81162484315021, "language_loss": 0.86409879, "learning_rate": 3.999620810979295e-06, "loss": 0.88922471, "num_input_tokens_seen": 12661080, "router_z_loss_clip": 4.00390625, "router_z_loss_mlp": 0.58789062, "step": 599, "time_per_iteration": 2.620342254638672 }, { "auxiliary_loss_clip": 0.01849226, "auxiliary_loss_mlp": 0.00716901, "balance_loss_clip": 1.42464352, "balance_loss_mlp": 0.65166974, "epoch": 0.036073951600781605, "flos": 23951627585280.0, "grad_norm": 55.48536596399606, "language_loss": 0.9408803, "learning_rate": 3.999613189525668e-06, "loss": 0.96654159, "num_input_tokens_seen": 12678270, "router_z_loss_clip": 4.24609375, "router_z_loss_mlp": 0.65234375, "step": 600, "time_per_iteration": 2.6679952144622803 }, { "auxiliary_loss_clip": 0.01786637, "auxiliary_loss_mlp": 0.00721602, "balance_loss_clip": 1.38870406, "balance_loss_mlp": 0.65918416, "epoch": 0.03613407485344957, "flos": 18912283050240.0, "grad_norm": 16.97827862683609, "language_loss": 0.8887701, "learning_rate": 3.999605492246508e-06, "loss": 0.91385245, "num_input_tokens_seen": 12697295, "router_z_loss_clip": 3.97851562, "router_z_loss_mlp": 0.62451172, "step": 601, "time_per_iteration": 2.608794689178467 }, { "auxiliary_loss_clip": 0.01790957, "auxiliary_loss_mlp": 0.00692612, "balance_loss_clip": 1.39483416, "balance_loss_mlp": 0.62690437, "epoch": 0.03619419810611754, "flos": 23038526926080.0, "grad_norm": 29.261740903186556, "language_loss": 0.83166671, "learning_rate": 3.999597719142107e-06, "loss": 0.85650235, "num_input_tokens_seen": 12716165, "router_z_loss_clip": 3.95898438, "router_z_loss_mlp": 0.65722656, "step": 602, "time_per_iteration": 2.720123529434204 }, { "auxiliary_loss_clip": 0.01755859, "auxiliary_loss_mlp": 0.00797982, "balance_loss_clip": 1.37093163, "balance_loss_mlp": 0.72898388, "epoch": 0.03625432135878551, "flos": 29457774293760.0, "grad_norm": 7.454777674472082, "language_loss": 0.85395163, "learning_rate": 3.999589870212761e-06, "loss": 0.87949002, "num_input_tokens_seen": 12735475, "router_z_loss_clip": 3.8515625, "router_z_loss_mlp": 0.68994141, "step": 603, "time_per_iteration": 2.6800179481506348 }, { "auxiliary_loss_clip": 0.01755196, "auxiliary_loss_mlp": 0.00750192, "balance_loss_clip": 1.37865901, "balance_loss_mlp": 0.68386394, "epoch": 0.03631444461145348, "flos": 23508525409920.0, "grad_norm": 149.98577430233928, "language_loss": 0.92379153, "learning_rate": 3.9995819454587664e-06, "loss": 0.94884539, "num_input_tokens_seen": 12754540, "router_z_loss_clip": 3.765625, "router_z_loss_mlp": 0.66308594, "step": 604, "time_per_iteration": 2.703775644302368 }, { "auxiliary_loss_clip": 0.01723986, "auxiliary_loss_mlp": 0.00798295, "balance_loss_clip": 1.34745407, "balance_loss_mlp": 0.72705567, "epoch": 0.03637456786412145, "flos": 16618130323200.0, "grad_norm": 108.13593476923434, "language_loss": 0.87444812, "learning_rate": 3.999573944880424e-06, "loss": 0.89967096, "num_input_tokens_seen": 12773050, "router_z_loss_clip": 3.765625, "router_z_loss_mlp": 0.71240234, "step": 605, "time_per_iteration": 2.6238231658935547 }, { "auxiliary_loss_clip": 0.01712726, "auxiliary_loss_mlp": 0.00792757, "balance_loss_clip": 1.3400563, "balance_loss_mlp": 0.72337711, "epoch": 0.03643469111678942, "flos": 15851832549120.0, "grad_norm": 208.49637246258834, "language_loss": 0.9387697, "learning_rate": 3.9995658684780375e-06, "loss": 0.96382451, "num_input_tokens_seen": 12791240, "router_z_loss_clip": 3.72851562, "router_z_loss_mlp": 0.69384766, "step": 606, "time_per_iteration": 2.6952779293060303 }, { "auxiliary_loss_clip": 0.01709396, "auxiliary_loss_mlp": 0.00779323, "balance_loss_clip": 1.33238947, "balance_loss_mlp": 0.70851243, "epoch": 0.03649481436945739, "flos": 23620387340160.0, "grad_norm": 107.25136719834062, "language_loss": 0.88664651, "learning_rate": 3.999557716251912e-06, "loss": 0.91153365, "num_input_tokens_seen": 12812245, "router_z_loss_clip": 3.76953125, "router_z_loss_mlp": 0.70800781, "step": 607, "time_per_iteration": 2.694709539413452 }, { "auxiliary_loss_clip": 0.01673821, "auxiliary_loss_mlp": 0.00794912, "balance_loss_clip": 1.31629944, "balance_loss_mlp": 0.72548413, "epoch": 0.036554937622125354, "flos": 21755581879680.0, "grad_norm": 73.24456522714853, "language_loss": 0.87785721, "learning_rate": 3.999549488202358e-06, "loss": 0.90254462, "num_input_tokens_seen": 12831085, "router_z_loss_clip": 3.57617188, "router_z_loss_mlp": 0.69384766, "step": 608, "time_per_iteration": 2.6421711444854736 }, { "auxiliary_loss_clip": 0.01672513, "auxiliary_loss_mlp": 0.00784633, "balance_loss_clip": 1.31212986, "balance_loss_mlp": 0.71143895, "epoch": 0.036615060874793326, "flos": 17819772935040.0, "grad_norm": 33.54363176602559, "language_loss": 0.89208674, "learning_rate": 3.999541184329688e-06, "loss": 0.91665816, "num_input_tokens_seen": 12849115, "router_z_loss_clip": 3.6015625, "router_z_loss_mlp": 0.73144531, "step": 609, "time_per_iteration": 4.183840990066528 }, { "auxiliary_loss_clip": 0.01670426, "auxiliary_loss_mlp": 0.00764576, "balance_loss_clip": 1.31005907, "balance_loss_mlp": 0.6976279, "epoch": 0.0366751841274613, "flos": 26753808320640.0, "grad_norm": 123.67412266866349, "language_loss": 0.87920654, "learning_rate": 3.999532804634215e-06, "loss": 0.90355659, "num_input_tokens_seen": 12868005, "router_z_loss_clip": 3.6015625, "router_z_loss_mlp": 0.66992188, "step": 610, "time_per_iteration": 5.556333065032959 }, { "auxiliary_loss_clip": 0.01661842, "auxiliary_loss_mlp": 0.00805105, "balance_loss_clip": 1.30855036, "balance_loss_mlp": 0.73062265, "epoch": 0.03673530738012926, "flos": 22196960202240.0, "grad_norm": 3103.4075066069727, "language_loss": 0.92518616, "learning_rate": 3.9995243491162575e-06, "loss": 0.94985569, "num_input_tokens_seen": 12886890, "router_z_loss_clip": 3.52929688, "router_z_loss_mlp": 0.74511719, "step": 611, "time_per_iteration": 2.666883945465088 }, { "auxiliary_loss_clip": 0.01645307, "auxiliary_loss_mlp": 0.00776172, "balance_loss_clip": 1.30125988, "balance_loss_mlp": 0.70955807, "epoch": 0.036795430632797235, "flos": 24681655601280.0, "grad_norm": 45.88738320244028, "language_loss": 0.80365014, "learning_rate": 3.999515817776136e-06, "loss": 0.82786489, "num_input_tokens_seen": 12906130, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.66650391, "step": 612, "time_per_iteration": 2.702878713607788 }, { "auxiliary_loss_clip": 0.01666348, "auxiliary_loss_mlp": 0.00738106, "balance_loss_clip": 1.31442523, "balance_loss_mlp": 0.67320818, "epoch": 0.0368555538854652, "flos": 17748921358080.0, "grad_norm": 31.380879775743193, "language_loss": 0.87318254, "learning_rate": 3.999507210614175e-06, "loss": 0.89722705, "num_input_tokens_seen": 12925260, "router_z_loss_clip": 3.51757812, "router_z_loss_mlp": 0.64892578, "step": 613, "time_per_iteration": 2.6394965648651123 }, { "auxiliary_loss_clip": 0.01655801, "auxiliary_loss_mlp": 0.00760903, "balance_loss_clip": 1.30376768, "balance_loss_mlp": 0.69304866, "epoch": 0.03691567713813317, "flos": 20594554571520.0, "grad_norm": 222.08150868411596, "language_loss": 1.00481391, "learning_rate": 3.9994985276307e-06, "loss": 1.02898097, "num_input_tokens_seen": 12944590, "router_z_loss_clip": 3.52539062, "router_z_loss_mlp": 0.67822266, "step": 614, "time_per_iteration": 2.6039953231811523 }, { "auxiliary_loss_clip": 0.01704064, "auxiliary_loss_mlp": 0.00702621, "balance_loss_clip": 1.33099854, "balance_loss_mlp": 0.63810462, "epoch": 0.036975800390801145, "flos": 33650380546560.0, "grad_norm": 9.233223548431452, "language_loss": 0.80384535, "learning_rate": 3.999489768826041e-06, "loss": 0.82791221, "num_input_tokens_seen": 12964785, "router_z_loss_clip": 3.7265625, "router_z_loss_mlp": 0.64453125, "step": 615, "time_per_iteration": 2.7351934909820557 }, { "auxiliary_loss_clip": 0.01686097, "auxiliary_loss_mlp": 0.00671777, "balance_loss_clip": 1.3214612, "balance_loss_mlp": 0.61078936, "epoch": 0.03703592364346911, "flos": 28293694329600.0, "grad_norm": 30.953645878050597, "language_loss": 0.88484204, "learning_rate": 3.999480934200528e-06, "loss": 0.90842074, "num_input_tokens_seen": 12986705, "router_z_loss_clip": 3.64453125, "router_z_loss_mlp": 0.61035156, "step": 616, "time_per_iteration": 2.6970481872558594 }, { "auxiliary_loss_clip": 0.0171754, "auxiliary_loss_mlp": 0.00619353, "balance_loss_clip": 1.35251343, "balance_loss_mlp": 0.56630504, "epoch": 0.03709604689613708, "flos": 31504215853440.0, "grad_norm": 46.508553013642874, "language_loss": 0.76023853, "learning_rate": 3.999472023754499e-06, "loss": 0.78360748, "num_input_tokens_seen": 13010560, "router_z_loss_clip": 3.65234375, "router_z_loss_mlp": 0.53100586, "step": 617, "time_per_iteration": 2.732340097427368 }, { "auxiliary_loss_clip": 0.01727317, "auxiliary_loss_mlp": 0.00627676, "balance_loss_clip": 1.35073292, "balance_loss_mlp": 0.57312572, "epoch": 0.03715617014880505, "flos": 19609381272960.0, "grad_norm": 10.607218686800634, "language_loss": 0.87950921, "learning_rate": 3.99946303748829e-06, "loss": 0.90305912, "num_input_tokens_seen": 13028935, "router_z_loss_clip": 3.76367188, "router_z_loss_mlp": 0.54541016, "step": 618, "time_per_iteration": 2.7103404998779297 }, { "auxiliary_loss_clip": 0.0174649, "auxiliary_loss_mlp": 0.00582684, "balance_loss_clip": 1.36922455, "balance_loss_mlp": 0.52844357, "epoch": 0.03721629340147302, "flos": 15924192497280.0, "grad_norm": 8.639996354184785, "language_loss": 0.96735764, "learning_rate": 3.999453975402242e-06, "loss": 0.9906494, "num_input_tokens_seen": 13046000, "router_z_loss_clip": 3.77148438, "router_z_loss_mlp": 0.54199219, "step": 619, "time_per_iteration": 2.732637882232666 }, { "auxiliary_loss_clip": 0.01757406, "auxiliary_loss_mlp": 0.00582804, "balance_loss_clip": 1.37296748, "balance_loss_mlp": 0.53008997, "epoch": 0.03727641665414099, "flos": 21104090951040.0, "grad_norm": 119.11059251748637, "language_loss": 1.0224694, "learning_rate": 3.9994448374967e-06, "loss": 1.0458715, "num_input_tokens_seen": 13062995, "router_z_loss_clip": 3.83984375, "router_z_loss_mlp": 0.52709961, "step": 620, "time_per_iteration": 2.612511157989502 }, { "auxiliary_loss_clip": 0.01769647, "auxiliary_loss_mlp": 0.00536909, "balance_loss_clip": 1.38318586, "balance_loss_mlp": 0.48820043, "epoch": 0.037336539906808956, "flos": 24131683486080.0, "grad_norm": 142.34343113389593, "language_loss": 0.84162456, "learning_rate": 3.999435623772008e-06, "loss": 0.86469018, "num_input_tokens_seen": 13084120, "router_z_loss_clip": 3.86328125, "router_z_loss_mlp": 0.48730469, "step": 621, "time_per_iteration": 2.6906991004943848 }, { "auxiliary_loss_clip": 0.0178301, "auxiliary_loss_mlp": 0.00521567, "balance_loss_clip": 1.4010098, "balance_loss_mlp": 0.47529027, "epoch": 0.03739666315947693, "flos": 22346384780160.0, "grad_norm": 6.52740370431409, "language_loss": 0.93270314, "learning_rate": 3.999426334228518e-06, "loss": 0.95574898, "num_input_tokens_seen": 13100035, "router_z_loss_clip": 3.82421875, "router_z_loss_mlp": 0.46313477, "step": 622, "time_per_iteration": 2.644049644470215 }, { "auxiliary_loss_clip": 0.01765069, "auxiliary_loss_mlp": 0.00497296, "balance_loss_clip": 1.38324809, "balance_loss_mlp": 0.4536888, "epoch": 0.0374567864121449, "flos": 20449511452800.0, "grad_norm": 19.174729412505698, "language_loss": 0.96270442, "learning_rate": 3.999416968866581e-06, "loss": 0.98532814, "num_input_tokens_seen": 13118070, "router_z_loss_clip": 3.81445312, "router_z_loss_mlp": 0.43579102, "step": 623, "time_per_iteration": 2.636193037033081 }, { "auxiliary_loss_clip": 0.01802411, "auxiliary_loss_mlp": 0.00486951, "balance_loss_clip": 1.41435432, "balance_loss_mlp": 0.44465533, "epoch": 0.037516909664812866, "flos": 19208043636480.0, "grad_norm": 25.46390177568775, "language_loss": 0.90066588, "learning_rate": 3.999407527686551e-06, "loss": 0.92355943, "num_input_tokens_seen": 13136355, "router_z_loss_clip": 3.8828125, "router_z_loss_mlp": 0.42333984, "step": 624, "time_per_iteration": 2.6942198276519775 }, { "auxiliary_loss_clip": 0.01818944, "auxiliary_loss_mlp": 0.00473283, "balance_loss_clip": 1.42042422, "balance_loss_mlp": 0.43160778, "epoch": 0.03757703291748084, "flos": 35005218664320.0, "grad_norm": 13.668658410713816, "language_loss": 0.75106621, "learning_rate": 3.999398010688788e-06, "loss": 0.77398849, "num_input_tokens_seen": 13155435, "router_z_loss_clip": 3.9765625, "router_z_loss_mlp": 0.41650391, "step": 625, "time_per_iteration": 2.7894694805145264 }, { "auxiliary_loss_clip": 0.01836122, "auxiliary_loss_mlp": 0.0044201, "balance_loss_clip": 1.4400301, "balance_loss_mlp": 0.4016462, "epoch": 0.0376371561701488, "flos": 25483899911040.0, "grad_norm": 20.31428238383766, "language_loss": 0.85251456, "learning_rate": 3.999388417873652e-06, "loss": 0.87529588, "num_input_tokens_seen": 13174295, "router_z_loss_clip": 3.95507812, "router_z_loss_mlp": 0.40356445, "step": 626, "time_per_iteration": 2.7474262714385986 }, { "auxiliary_loss_clip": 0.01879358, "auxiliary_loss_mlp": 0.00442147, "balance_loss_clip": 1.46245313, "balance_loss_mlp": 0.40183067, "epoch": 0.037697279422816775, "flos": 18185630912640.0, "grad_norm": 61.61251114623347, "language_loss": 0.86823785, "learning_rate": 3.999378749241506e-06, "loss": 0.89145291, "num_input_tokens_seen": 13192500, "router_z_loss_clip": 4.171875, "router_z_loss_mlp": 0.40283203, "step": 627, "time_per_iteration": 2.615792751312256 }, { "auxiliary_loss_clip": 0.01890805, "auxiliary_loss_mlp": 0.00479059, "balance_loss_clip": 1.47144294, "balance_loss_mlp": 0.4370015, "epoch": 0.03775740267548475, "flos": 24644272521600.0, "grad_norm": 8.942604614822011, "language_loss": 0.94272935, "learning_rate": 3.999369004792719e-06, "loss": 0.96642804, "num_input_tokens_seen": 13213470, "router_z_loss_clip": 4.1953125, "router_z_loss_mlp": 0.4206543, "step": 628, "time_per_iteration": 2.720594882965088 }, { "auxiliary_loss_clip": 0.0189625, "auxiliary_loss_mlp": 0.00424298, "balance_loss_clip": 1.4767189, "balance_loss_mlp": 0.38784418, "epoch": 0.03781752592815271, "flos": 21288205088640.0, "grad_norm": 9.292827658801787, "language_loss": 0.86653376, "learning_rate": 3.999359184527658e-06, "loss": 0.88973922, "num_input_tokens_seen": 13232365, "router_z_loss_clip": 4.1953125, "router_z_loss_mlp": 0.36450195, "step": 629, "time_per_iteration": 2.5990638732910156 }, { "auxiliary_loss_clip": 0.0192087, "auxiliary_loss_mlp": 0.00424266, "balance_loss_clip": 1.49892771, "balance_loss_mlp": 0.38826507, "epoch": 0.037877649180820684, "flos": 22089623385600.0, "grad_norm": 14.134274765357928, "language_loss": 0.84481788, "learning_rate": 3.999349288446696e-06, "loss": 0.86826921, "num_input_tokens_seen": 13251920, "router_z_loss_clip": 4.2265625, "router_z_loss_mlp": 0.35986328, "step": 630, "time_per_iteration": 2.722088575363159 }, { "auxiliary_loss_clip": 0.01935542, "auxiliary_loss_mlp": 0.00425377, "balance_loss_clip": 1.5198946, "balance_loss_mlp": 0.39013892, "epoch": 0.03793777243348865, "flos": 14501339976960.0, "grad_norm": 30.062075033987398, "language_loss": 1.01996052, "learning_rate": 3.99933931655021e-06, "loss": 1.0435698, "num_input_tokens_seen": 13267440, "router_z_loss_clip": 4.15234375, "router_z_loss_mlp": 0.35253906, "step": 631, "time_per_iteration": 2.595613479614258 }, { "auxiliary_loss_clip": 0.01908467, "auxiliary_loss_mlp": 0.00387605, "balance_loss_clip": 1.51143909, "balance_loss_mlp": 0.35508439, "epoch": 0.03799789568615662, "flos": 21908418249600.0, "grad_norm": 319.81103059709346, "language_loss": 0.97060728, "learning_rate": 3.999329268838575e-06, "loss": 0.99356794, "num_input_tokens_seen": 13287850, "router_z_loss_clip": 3.97265625, "router_z_loss_mlp": 0.32519531, "step": 632, "time_per_iteration": 2.67881441116333 }, { "auxiliary_loss_clip": 0.01943401, "auxiliary_loss_mlp": 0.0038268, "balance_loss_clip": 1.54486537, "balance_loss_mlp": 0.34973049, "epoch": 0.03805801893882459, "flos": 24827021942400.0, "grad_norm": 31.102081817193174, "language_loss": 0.89858079, "learning_rate": 3.999319145312175e-06, "loss": 0.92184156, "num_input_tokens_seen": 13307760, "router_z_loss_clip": 3.99023438, "router_z_loss_mlp": 0.3293457, "step": 633, "time_per_iteration": 2.617716073989868 }, { "auxiliary_loss_clip": 0.01964605, "auxiliary_loss_mlp": 0.00376556, "balance_loss_clip": 1.56341171, "balance_loss_mlp": 0.34019747, "epoch": 0.03811814219149256, "flos": 30482952364800.0, "grad_norm": 11.492085796734942, "language_loss": 0.76998353, "learning_rate": 3.999308945971392e-06, "loss": 0.79339516, "num_input_tokens_seen": 13331230, "router_z_loss_clip": 4.00976562, "router_z_loss_mlp": 0.36352539, "step": 634, "time_per_iteration": 2.7283332347869873 }, { "auxiliary_loss_clip": 0.01774151, "auxiliary_loss_mlp": 0.00314401, "balance_loss_clip": 1.40765691, "balance_loss_mlp": 0.29742548, "epoch": 0.03817826544416053, "flos": 66992577379200.0, "grad_norm": 1.1335412237511113, "language_loss": 0.61858273, "learning_rate": 3.999298670816614e-06, "loss": 0.63946825, "num_input_tokens_seen": 13394760, "router_z_loss_clip": 3.65625, "router_z_loss_mlp": 0.16992188, "step": 635, "time_per_iteration": 3.142587423324585 }, { "auxiliary_loss_clip": 0.01925258, "auxiliary_loss_mlp": 0.00345045, "balance_loss_clip": 1.54288304, "balance_loss_mlp": 0.31097507, "epoch": 0.038238388696828496, "flos": 20485350247680.0, "grad_norm": 27.847161150631294, "language_loss": 0.93111575, "learning_rate": 3.9992883198482294e-06, "loss": 0.95381874, "num_input_tokens_seen": 13412775, "router_z_loss_clip": 3.82226562, "router_z_loss_mlp": 0.34057617, "step": 636, "time_per_iteration": 2.62214994430542 }, { "auxiliary_loss_clip": 0.0197474, "auxiliary_loss_mlp": 0.00392603, "balance_loss_clip": 1.58592582, "balance_loss_mlp": 0.35564813, "epoch": 0.03829851194949647, "flos": 17965893461760.0, "grad_norm": 5.093947573232372, "language_loss": 0.93186462, "learning_rate": 3.999277893066632e-06, "loss": 0.95553803, "num_input_tokens_seen": 13427835, "router_z_loss_clip": 3.88671875, "router_z_loss_mlp": 0.36938477, "step": 637, "time_per_iteration": 2.567892551422119 }, { "auxiliary_loss_clip": 0.01955957, "auxiliary_loss_mlp": 0.00345063, "balance_loss_clip": 1.58456826, "balance_loss_mlp": 0.31127918, "epoch": 0.03835863520216444, "flos": 22456522857600.0, "grad_norm": 17.486262876536127, "language_loss": 0.92828321, "learning_rate": 3.999267390472215e-06, "loss": 0.95129347, "num_input_tokens_seen": 13447295, "router_z_loss_clip": 3.7109375, "router_z_loss_mlp": 0.33813477, "step": 638, "time_per_iteration": 2.673048257827759 }, { "auxiliary_loss_clip": 0.019692, "auxiliary_loss_mlp": 0.00364066, "balance_loss_clip": 1.58900356, "balance_loss_mlp": 0.32837445, "epoch": 0.038418758454832405, "flos": 22164425458560.0, "grad_norm": 14.723795914872971, "language_loss": 0.79829764, "learning_rate": 3.999256812065381e-06, "loss": 0.8216303, "num_input_tokens_seen": 13468455, "router_z_loss_clip": 3.80273438, "router_z_loss_mlp": 0.35693359, "step": 639, "time_per_iteration": 2.820840358734131 }, { "auxiliary_loss_clip": 0.01961481, "auxiliary_loss_mlp": 0.00390971, "balance_loss_clip": 1.58266592, "balance_loss_mlp": 0.3518703, "epoch": 0.03847888170750038, "flos": 22747435107840.0, "grad_norm": 125.71937525632016, "language_loss": 0.94800961, "learning_rate": 3.999246157846526e-06, "loss": 0.97153413, "num_input_tokens_seen": 13489085, "router_z_loss_clip": 3.78515625, "router_z_loss_mlp": 0.39111328, "step": 640, "time_per_iteration": 2.6748476028442383 }, { "auxiliary_loss_clip": 0.01950324, "auxiliary_loss_mlp": 0.00420923, "balance_loss_clip": 1.58119774, "balance_loss_mlp": 0.38377696, "epoch": 0.03853900496016834, "flos": 22711201263360.0, "grad_norm": 18.150582403006332, "language_loss": 0.91801643, "learning_rate": 3.9992354278160574e-06, "loss": 0.94172889, "num_input_tokens_seen": 13509120, "router_z_loss_clip": 3.68554688, "router_z_loss_mlp": 0.37109375, "step": 641, "time_per_iteration": 2.7279531955718994 }, { "auxiliary_loss_clip": 0.01684038, "auxiliary_loss_mlp": 0.00286559, "balance_loss_clip": 1.37494826, "balance_loss_mlp": 0.27025115, "epoch": 0.038599128212836314, "flos": 70399136355840.0, "grad_norm": 0.8857152202062817, "language_loss": 0.64668989, "learning_rate": 3.999224621974381e-06, "loss": 0.6663959, "num_input_tokens_seen": 13562005, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.16308594, "step": 642, "time_per_iteration": 3.111320972442627 }, { "auxiliary_loss_clip": 0.01911382, "auxiliary_loss_mlp": 0.00539412, "balance_loss_clip": 1.55162835, "balance_loss_mlp": 0.50012088, "epoch": 0.03865925146550429, "flos": 23295144666240.0, "grad_norm": 81.3274848875477, "language_loss": 0.86299127, "learning_rate": 3.999213740321906e-06, "loss": 0.88749921, "num_input_tokens_seen": 13582185, "router_z_loss_clip": 3.59960938, "router_z_loss_mlp": 0.39282227, "step": 643, "time_per_iteration": 2.669640064239502 }, { "auxiliary_loss_clip": 0.0193515, "auxiliary_loss_mlp": 0.00547202, "balance_loss_clip": 1.57247365, "balance_loss_mlp": 0.50853086, "epoch": 0.03871937471817225, "flos": 21430446946560.0, "grad_norm": 282.34850671061906, "language_loss": 0.88534284, "learning_rate": 3.999202782859046e-06, "loss": 0.91016638, "num_input_tokens_seen": 13599555, "router_z_loss_clip": 3.62890625, "router_z_loss_mlp": 0.38671875, "step": 644, "time_per_iteration": 2.654322862625122 }, { "auxiliary_loss_clip": 0.01974262, "auxiliary_loss_mlp": 0.00568553, "balance_loss_clip": 1.61236489, "balance_loss_mlp": 0.53023887, "epoch": 0.038779497970840224, "flos": 34277309550720.0, "grad_norm": 375.5361713733268, "language_loss": 0.91128975, "learning_rate": 3.9991917495862165e-06, "loss": 0.93671787, "num_input_tokens_seen": 13621160, "router_z_loss_clip": 3.6171875, "router_z_loss_mlp": 0.38256836, "step": 645, "time_per_iteration": 2.7381434440612793 }, { "auxiliary_loss_clip": 0.01999831, "auxiliary_loss_mlp": 0.00593107, "balance_loss_clip": 1.61336291, "balance_loss_mlp": 0.55248082, "epoch": 0.03883962122350819, "flos": 22748189293440.0, "grad_norm": 3.5665763313601313, "language_loss": 0.88968921, "learning_rate": 3.9991806405038345e-06, "loss": 0.9156186, "num_input_tokens_seen": 13641915, "router_z_loss_clip": 3.8671875, "router_z_loss_mlp": 0.40649414, "step": 646, "time_per_iteration": 2.6984643936157227 }, { "auxiliary_loss_clip": 0.01975735, "auxiliary_loss_mlp": 0.00652024, "balance_loss_clip": 1.58888066, "balance_loss_mlp": 0.60579473, "epoch": 0.03889974447617616, "flos": 21945837242880.0, "grad_norm": 71.70577580328208, "language_loss": 0.88167202, "learning_rate": 3.999169455612323e-06, "loss": 0.90794969, "num_input_tokens_seen": 13661410, "router_z_loss_clip": 3.8671875, "router_z_loss_mlp": 0.46191406, "step": 647, "time_per_iteration": 2.7149605751037598 }, { "auxiliary_loss_clip": 0.01986583, "auxiliary_loss_mlp": 0.0058033, "balance_loss_clip": 1.59928608, "balance_loss_mlp": 0.53834498, "epoch": 0.03895986772884413, "flos": 31504826384640.0, "grad_norm": 233.7051180166742, "language_loss": 0.92008734, "learning_rate": 3.999158194912106e-06, "loss": 0.94575655, "num_input_tokens_seen": 13681705, "router_z_loss_clip": 3.86523438, "router_z_loss_mlp": 0.42016602, "step": 648, "time_per_iteration": 2.7556378841400146 }, { "auxiliary_loss_clip": 0.01996342, "auxiliary_loss_mlp": 0.00608746, "balance_loss_clip": 1.60296428, "balance_loss_mlp": 0.56406641, "epoch": 0.0390199909815121, "flos": 19901011795200.0, "grad_norm": 49.411917250117355, "language_loss": 0.90366793, "learning_rate": 3.9991468584036086e-06, "loss": 0.92971885, "num_input_tokens_seen": 13700400, "router_z_loss_clip": 3.93554688, "router_z_loss_mlp": 0.44726562, "step": 649, "time_per_iteration": 2.6371541023254395 }, { "auxiliary_loss_clip": 0.02046406, "auxiliary_loss_mlp": 0.00562397, "balance_loss_clip": 1.62874031, "balance_loss_mlp": 0.51895672, "epoch": 0.03908011423418007, "flos": 21612478095360.0, "grad_norm": 65.50023558447242, "language_loss": 0.8566522, "learning_rate": 3.999135446087263e-06, "loss": 0.88274014, "num_input_tokens_seen": 13720145, "router_z_loss_clip": 4.17382812, "router_z_loss_mlp": 0.43457031, "step": 650, "time_per_iteration": 2.7042124271392822 }, { "auxiliary_loss_clip": 0.02074033, "auxiliary_loss_mlp": 0.00578286, "balance_loss_clip": 1.63805676, "balance_loss_mlp": 0.53408337, "epoch": 0.039140237486848035, "flos": 18661411486080.0, "grad_norm": 6416.836453017863, "language_loss": 0.85550672, "learning_rate": 3.9991239579635e-06, "loss": 0.88202989, "num_input_tokens_seen": 13737500, "router_z_loss_clip": 4.359375, "router_z_loss_mlp": 0.44213867, "step": 651, "time_per_iteration": 4.092526197433472 }, { "auxiliary_loss_clip": 0.02093141, "auxiliary_loss_mlp": 0.0055894, "balance_loss_clip": 1.64628017, "balance_loss_mlp": 0.51123238, "epoch": 0.03920036073951601, "flos": 18661124177280.0, "grad_norm": 251.2964613630038, "language_loss": 0.96829975, "learning_rate": 3.999112394032757e-06, "loss": 0.99482059, "num_input_tokens_seen": 13754750, "router_z_loss_clip": 4.46875, "router_z_loss_mlp": 0.47729492, "step": 652, "time_per_iteration": 4.000746488571167 }, { "auxiliary_loss_clip": 0.020829, "auxiliary_loss_mlp": 0.00643139, "balance_loss_clip": 1.63088882, "balance_loss_mlp": 0.59152174, "epoch": 0.03926048399218398, "flos": 31354468053120.0, "grad_norm": 44.15598492436149, "language_loss": 0.88257718, "learning_rate": 3.999100754295471e-06, "loss": 0.9098376, "num_input_tokens_seen": 13771990, "router_z_loss_clip": 4.51953125, "router_z_loss_mlp": 0.51635742, "step": 653, "time_per_iteration": 4.055376291275024 }, { "auxiliary_loss_clip": 0.02156269, "auxiliary_loss_mlp": 0.00569803, "balance_loss_clip": 1.6559484, "balance_loss_mlp": 0.5228582, "epoch": 0.039320607244851945, "flos": 29603499770880.0, "grad_norm": 19.97538381348395, "language_loss": 0.94763261, "learning_rate": 3.999089038752085e-06, "loss": 0.97489333, "num_input_tokens_seen": 13792750, "router_z_loss_clip": 5.00390625, "router_z_loss_mlp": 0.46948242, "step": 654, "time_per_iteration": 2.732017993927002 }, { "auxiliary_loss_clip": 0.01760074, "auxiliary_loss_mlp": 0.00357257, "balance_loss_clip": 1.46064818, "balance_loss_mlp": 0.33856481, "epoch": 0.03938073049751992, "flos": 66534609951360.0, "grad_norm": 0.8172297183611896, "language_loss": 0.49776557, "learning_rate": 3.999077247403041e-06, "loss": 0.51893884, "num_input_tokens_seen": 13858570, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.18652344, "step": 655, "time_per_iteration": 3.168562650680542 }, { "auxiliary_loss_clip": 0.0217516, "auxiliary_loss_mlp": 0.00553205, "balance_loss_clip": 1.68573654, "balance_loss_mlp": 0.51083755, "epoch": 0.03944085375018788, "flos": 23367827836800.0, "grad_norm": 87.3725371008475, "language_loss": 0.87139708, "learning_rate": 3.9990653802487886e-06, "loss": 0.89868075, "num_input_tokens_seen": 13876335, "router_z_loss_clip": 4.890625, "router_z_loss_mlp": 0.42333984, "step": 656, "time_per_iteration": 2.665041923522949 }, { "auxiliary_loss_clip": 0.0219256, "auxiliary_loss_mlp": 0.00627665, "balance_loss_clip": 1.66923535, "balance_loss_mlp": 0.57251883, "epoch": 0.039500977002855854, "flos": 18548292579840.0, "grad_norm": 17.533615763992845, "language_loss": 0.864766, "learning_rate": 3.999053437289776e-06, "loss": 0.89296818, "num_input_tokens_seen": 13892640, "router_z_loss_clip": 5.2265625, "router_z_loss_mlp": 0.55175781, "step": 657, "time_per_iteration": 2.5272276401519775 }, { "auxiliary_loss_clip": 0.02198626, "auxiliary_loss_mlp": 0.00574232, "balance_loss_clip": 1.67796814, "balance_loss_mlp": 0.52270973, "epoch": 0.039561100255523826, "flos": 25338174433920.0, "grad_norm": 6.444703610725153, "language_loss": 0.8865844, "learning_rate": 3.999041418526457e-06, "loss": 0.91431296, "num_input_tokens_seen": 13910085, "router_z_loss_clip": 5.203125, "router_z_loss_mlp": 0.515625, "step": 658, "time_per_iteration": 2.656120777130127 }, { "auxiliary_loss_clip": 0.02268436, "auxiliary_loss_mlp": 0.00559187, "balance_loss_clip": 1.71665215, "balance_loss_mlp": 0.51353031, "epoch": 0.03962122350819179, "flos": 18219889509120.0, "grad_norm": 52.69173536888343, "language_loss": 0.98754567, "learning_rate": 3.999029323959287e-06, "loss": 1.01582193, "num_input_tokens_seen": 13928800, "router_z_loss_clip": 5.51171875, "router_z_loss_mlp": 0.45654297, "step": 659, "time_per_iteration": 2.587963342666626 }, { "auxiliary_loss_clip": 0.02405383, "auxiliary_loss_mlp": 0.00527192, "balance_loss_clip": 1.77901661, "balance_loss_mlp": 0.48253635, "epoch": 0.03968134676085976, "flos": 20522230536960.0, "grad_norm": 19.331217708857565, "language_loss": 0.86348087, "learning_rate": 3.999017153588724e-06, "loss": 0.89280665, "num_input_tokens_seen": 13948325, "router_z_loss_clip": 6.2578125, "router_z_loss_mlp": 0.44628906, "step": 660, "time_per_iteration": 2.6425647735595703 }, { "auxiliary_loss_clip": 0.02507957, "auxiliary_loss_mlp": 0.00552722, "balance_loss_clip": 1.80635357, "balance_loss_mlp": 0.50575352, "epoch": 0.03974147001352773, "flos": 22422587483520.0, "grad_norm": 22.779691536193326, "language_loss": 0.88695961, "learning_rate": 3.999004907415231e-06, "loss": 0.91756642, "num_input_tokens_seen": 13969090, "router_z_loss_clip": 7.01953125, "router_z_loss_mlp": 0.46948242, "step": 661, "time_per_iteration": 2.625096082687378 }, { "auxiliary_loss_clip": 0.02546688, "auxiliary_loss_mlp": 0.00585067, "balance_loss_clip": 1.85607219, "balance_loss_mlp": 0.54863662, "epoch": 0.0398015932661957, "flos": 71128769322240.0, "grad_norm": 1.2247738578318161, "language_loss": 0.69276673, "learning_rate": 3.998992585439272e-06, "loss": 0.72408432, "num_input_tokens_seen": 14037555, "router_z_loss_clip": 6.90625, "router_z_loss_mlp": 0.36523438, "step": 662, "time_per_iteration": 3.2324178218841553 }, { "auxiliary_loss_clip": 0.0262099, "auxiliary_loss_mlp": 0.00587252, "balance_loss_clip": 1.81714535, "balance_loss_mlp": 0.53620648, "epoch": 0.03986171651886367, "flos": 16800951571200.0, "grad_norm": 3.426026851534422, "language_loss": 0.89582694, "learning_rate": 3.998980187661314e-06, "loss": 0.92790931, "num_input_tokens_seen": 14055765, "router_z_loss_clip": 8.05078125, "router_z_loss_mlp": 0.50952148, "step": 663, "time_per_iteration": 2.6464409828186035 }, { "auxiliary_loss_clip": 0.02556614, "auxiliary_loss_mlp": 0.00595794, "balance_loss_clip": 1.77946091, "balance_loss_mlp": 0.54629874, "epoch": 0.03992183977153164, "flos": 24535068197760.0, "grad_norm": 25.994257837945693, "language_loss": 0.95952493, "learning_rate": 3.998967714081826e-06, "loss": 0.99104899, "num_input_tokens_seen": 14074195, "router_z_loss_clip": 7.7734375, "router_z_loss_mlp": 0.49462891, "step": 664, "time_per_iteration": 2.6639349460601807 }, { "auxiliary_loss_clip": 0.02449348, "auxiliary_loss_mlp": 0.00512162, "balance_loss_clip": 1.75904465, "balance_loss_mlp": 0.4702003, "epoch": 0.03998196302419961, "flos": 15595897167360.0, "grad_norm": 5.051422469954378, "language_loss": 0.90852118, "learning_rate": 3.998955164701281e-06, "loss": 0.93813622, "num_input_tokens_seen": 14090215, "router_z_loss_clip": 6.9140625, "router_z_loss_mlp": 0.41967773, "step": 665, "time_per_iteration": 2.7399356365203857 }, { "auxiliary_loss_clip": 0.02383126, "auxiliary_loss_mlp": 0.00577178, "balance_loss_clip": 1.7154125, "balance_loss_mlp": 0.52923214, "epoch": 0.04004208627686758, "flos": 25305065072640.0, "grad_norm": 94.92558934825564, "language_loss": 0.87820464, "learning_rate": 3.998942539520158e-06, "loss": 0.90780765, "num_input_tokens_seen": 14112150, "router_z_loss_clip": 6.68359375, "router_z_loss_mlp": 0.47973633, "step": 666, "time_per_iteration": 2.6586272716522217 }, { "auxiliary_loss_clip": 0.02263394, "auxiliary_loss_mlp": 0.00536666, "balance_loss_clip": 1.66534281, "balance_loss_mlp": 0.49472791, "epoch": 0.04010220952953555, "flos": 23475847011840.0, "grad_norm": 73.0874144190832, "language_loss": 0.93097818, "learning_rate": 3.998929838538932e-06, "loss": 0.95897877, "num_input_tokens_seen": 14131475, "router_z_loss_clip": 5.984375, "router_z_loss_mlp": 0.41943359, "step": 667, "time_per_iteration": 2.675675392150879 }, { "auxiliary_loss_clip": 0.02175935, "auxiliary_loss_mlp": 0.0052408, "balance_loss_clip": 1.62860155, "balance_loss_mlp": 0.48393008, "epoch": 0.04016233278220352, "flos": 18617025254400.0, "grad_norm": 15.27651906351232, "language_loss": 0.88229167, "learning_rate": 3.998917061758087e-06, "loss": 0.9092918, "num_input_tokens_seen": 14146165, "router_z_loss_clip": 5.47265625, "router_z_loss_mlp": 0.40112305, "step": 668, "time_per_iteration": 2.5493245124816895 }, { "auxiliary_loss_clip": 0.01805453, "auxiliary_loss_mlp": 0.00210456, "balance_loss_clip": 1.50548339, "balance_loss_mlp": 0.19472075, "epoch": 0.040222456034871484, "flos": 70906194696960.0, "grad_norm": 2.6400101795725646, "language_loss": 0.61212009, "learning_rate": 3.998904209178107e-06, "loss": 0.63227922, "num_input_tokens_seen": 14215005, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.15722656, "step": 669, "time_per_iteration": 3.2700488567352295 }, { "auxiliary_loss_clip": 0.02112235, "auxiliary_loss_mlp": 0.00540982, "balance_loss_clip": 1.59481025, "balance_loss_mlp": 0.49904376, "epoch": 0.040282579287539456, "flos": 23764712186880.0, "grad_norm": 21.999949832904544, "language_loss": 0.93456936, "learning_rate": 3.9988912807994785e-06, "loss": 0.96110153, "num_input_tokens_seen": 14235510, "router_z_loss_clip": 5.171875, "router_z_loss_mlp": 0.41943359, "step": 670, "time_per_iteration": 2.687681198120117 }, { "auxiliary_loss_clip": 0.02034449, "auxiliary_loss_mlp": 0.00530567, "balance_loss_clip": 1.5695318, "balance_loss_mlp": 0.48772344, "epoch": 0.04034270254020743, "flos": 18478518410880.0, "grad_norm": 30.833397007559316, "language_loss": 0.81083345, "learning_rate": 3.998878276622692e-06, "loss": 0.83648366, "num_input_tokens_seen": 14254565, "router_z_loss_clip": 4.64453125, "router_z_loss_mlp": 0.42797852, "step": 671, "time_per_iteration": 2.6715080738067627 }, { "auxiliary_loss_clip": 0.01952451, "auxiliary_loss_mlp": 0.00519614, "balance_loss_clip": 1.52733755, "balance_loss_mlp": 0.47915423, "epoch": 0.040402825792875394, "flos": 17201858244480.0, "grad_norm": 13.706260668472142, "language_loss": 1.00329661, "learning_rate": 3.998865196648242e-06, "loss": 1.02801716, "num_input_tokens_seen": 14271885, "router_z_loss_clip": 4.25390625, "router_z_loss_mlp": 0.4050293, "step": 672, "time_per_iteration": 2.5770363807678223 }, { "auxiliary_loss_clip": 0.01883741, "auxiliary_loss_mlp": 0.00566891, "balance_loss_clip": 1.4855001, "balance_loss_mlp": 0.52242589, "epoch": 0.040462949045543366, "flos": 19172168928000.0, "grad_norm": 4.146258687732022, "language_loss": 0.97702432, "learning_rate": 3.998852040876622e-06, "loss": 1.00153065, "num_input_tokens_seen": 14289670, "router_z_loss_clip": 3.98632812, "router_z_loss_mlp": 0.4453125, "step": 673, "time_per_iteration": 2.6310653686523438 }, { "auxiliary_loss_clip": 0.01820583, "auxiliary_loss_mlp": 0.0055822, "balance_loss_clip": 1.44987381, "balance_loss_mlp": 0.51227641, "epoch": 0.04052307229821133, "flos": 24019821555840.0, "grad_norm": 26.590680117649466, "language_loss": 0.82526392, "learning_rate": 3.998838809308334e-06, "loss": 0.84905195, "num_input_tokens_seen": 14309285, "router_z_loss_clip": 3.70507812, "router_z_loss_mlp": 0.45947266, "step": 674, "time_per_iteration": 2.631953239440918 }, { "auxiliary_loss_clip": 0.01814148, "auxiliary_loss_mlp": 0.00572547, "balance_loss_clip": 1.4414711, "balance_loss_mlp": 0.52603102, "epoch": 0.0405831955508793, "flos": 16436601964800.0, "grad_norm": 15.003727805945285, "language_loss": 0.85965931, "learning_rate": 3.9988255019438766e-06, "loss": 0.88352633, "num_input_tokens_seen": 14328300, "router_z_loss_clip": 3.72851562, "router_z_loss_mlp": 0.46508789, "step": 675, "time_per_iteration": 2.5894391536712646 }, { "auxiliary_loss_clip": 0.01759414, "auxiliary_loss_mlp": 0.00581232, "balance_loss_clip": 1.41543889, "balance_loss_mlp": 0.53171241, "epoch": 0.040643318803547275, "flos": 24279922915200.0, "grad_norm": 68.12186848323748, "language_loss": 0.82364386, "learning_rate": 3.998812118783757e-06, "loss": 0.84705031, "num_input_tokens_seen": 14346395, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.49511719, "step": 676, "time_per_iteration": 2.678849458694458 }, { "auxiliary_loss_clip": 0.01747955, "auxiliary_loss_mlp": 0.00632076, "balance_loss_clip": 1.40351784, "balance_loss_mlp": 0.57991016, "epoch": 0.04070344205621524, "flos": 17712076982400.0, "grad_norm": 46.45720877019674, "language_loss": 0.9308188, "learning_rate": 3.9987986598284804e-06, "loss": 0.95461917, "num_input_tokens_seen": 14364605, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.52148438, "step": 677, "time_per_iteration": 2.7620902061462402 }, { "auxiliary_loss_clip": 0.01738642, "auxiliary_loss_mlp": 0.00608979, "balance_loss_clip": 1.40411758, "balance_loss_mlp": 0.55919719, "epoch": 0.04076356530888321, "flos": 26177658168960.0, "grad_norm": 4.584290896147419, "language_loss": 0.83028448, "learning_rate": 3.998785125078559e-06, "loss": 0.85376072, "num_input_tokens_seen": 14385265, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.49804688, "step": 678, "time_per_iteration": 2.7228546142578125 }, { "auxiliary_loss_clip": 0.01735919, "auxiliary_loss_mlp": 0.00554663, "balance_loss_clip": 1.40054727, "balance_loss_mlp": 0.50917244, "epoch": 0.04082368856155118, "flos": 35773455772800.0, "grad_norm": 10.537941339399541, "language_loss": 0.88295138, "learning_rate": 3.998771514534505e-06, "loss": 0.90585721, "num_input_tokens_seen": 14406090, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.45483398, "step": 679, "time_per_iteration": 2.7209551334381104 }, { "auxiliary_loss_clip": 0.01709327, "auxiliary_loss_mlp": 0.00571791, "balance_loss_clip": 1.39651668, "balance_loss_mlp": 0.5257048, "epoch": 0.04088381181421915, "flos": 28146640049280.0, "grad_norm": 18.583747760518797, "language_loss": 0.84233713, "learning_rate": 3.998757828196835e-06, "loss": 0.86514831, "num_input_tokens_seen": 14425130, "router_z_loss_clip": 3.13085938, "router_z_loss_mlp": 0.46069336, "step": 680, "time_per_iteration": 2.648125410079956 }, { "auxiliary_loss_clip": 0.01696908, "auxiliary_loss_mlp": 0.00593822, "balance_loss_clip": 1.37453663, "balance_loss_mlp": 0.5429436, "epoch": 0.04094393506688712, "flos": 27597673514880.0, "grad_norm": 33.38905964533509, "language_loss": 0.89899087, "learning_rate": 3.9987440660660685e-06, "loss": 0.92189819, "num_input_tokens_seen": 14447355, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.50854492, "step": 681, "time_per_iteration": 2.6421959400177 }, { "auxiliary_loss_clip": 0.01701977, "auxiliary_loss_mlp": 0.00622249, "balance_loss_clip": 1.37764668, "balance_loss_mlp": 0.56884283, "epoch": 0.04100405831955509, "flos": 23112036109440.0, "grad_norm": 116.91096025399212, "language_loss": 0.7770822, "learning_rate": 3.998730228142726e-06, "loss": 0.80032444, "num_input_tokens_seen": 14466790, "router_z_loss_clip": 3.24804688, "router_z_loss_mlp": 0.53466797, "step": 682, "time_per_iteration": 2.660576343536377 }, { "auxiliary_loss_clip": 0.01677625, "auxiliary_loss_mlp": 0.0056175, "balance_loss_clip": 1.37401628, "balance_loss_mlp": 0.51649773, "epoch": 0.04106418157222306, "flos": 20156731695360.0, "grad_norm": 179.7737758796279, "language_loss": 0.8009454, "learning_rate": 3.998716314427333e-06, "loss": 0.82333916, "num_input_tokens_seen": 14485195, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.45263672, "step": 683, "time_per_iteration": 2.609663963317871 }, { "auxiliary_loss_clip": 0.01667256, "auxiliary_loss_mlp": 0.00552528, "balance_loss_clip": 1.36447358, "balance_loss_mlp": 0.50749028, "epoch": 0.041124304824891024, "flos": 17420697855360.0, "grad_norm": 12.520434985300072, "language_loss": 0.88856578, "learning_rate": 3.998702324920417e-06, "loss": 0.91076362, "num_input_tokens_seen": 14503370, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.44970703, "step": 684, "time_per_iteration": 2.6338376998901367 }, { "auxiliary_loss_clip": 0.01663224, "auxiliary_loss_mlp": 0.00630728, "balance_loss_clip": 1.35930037, "balance_loss_mlp": 0.5793013, "epoch": 0.041184428077558996, "flos": 25780163287680.0, "grad_norm": 11.469153130522159, "language_loss": 0.96790755, "learning_rate": 3.9986882596225085e-06, "loss": 0.99084711, "num_input_tokens_seen": 14526415, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.51391602, "step": 685, "time_per_iteration": 2.708998203277588 }, { "auxiliary_loss_clip": 0.01688141, "auxiliary_loss_mlp": 0.00614166, "balance_loss_clip": 1.37076056, "balance_loss_mlp": 0.56207144, "epoch": 0.04124455133022697, "flos": 22964766347520.0, "grad_norm": 14.618648563099569, "language_loss": 0.96074629, "learning_rate": 3.998674118534141e-06, "loss": 0.9837693, "num_input_tokens_seen": 14546595, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.52050781, "step": 686, "time_per_iteration": 2.6661434173583984 }, { "auxiliary_loss_clip": 0.01688666, "auxiliary_loss_mlp": 0.00608278, "balance_loss_clip": 1.36904311, "balance_loss_mlp": 0.55477679, "epoch": 0.04130467458289493, "flos": 21289067015040.0, "grad_norm": 47.15214115047802, "language_loss": 0.78831488, "learning_rate": 3.998659901655851e-06, "loss": 0.8112843, "num_input_tokens_seen": 14566590, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.53466797, "step": 687, "time_per_iteration": 2.7051823139190674 }, { "auxiliary_loss_clip": 0.01662237, "auxiliary_loss_mlp": 0.0057823, "balance_loss_clip": 1.35691285, "balance_loss_mlp": 0.53157121, "epoch": 0.041364797835562905, "flos": 19974233669760.0, "grad_norm": 24.00543725917011, "language_loss": 0.92567748, "learning_rate": 3.998645608988177e-06, "loss": 0.94808209, "num_input_tokens_seen": 14585965, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.46606445, "step": 688, "time_per_iteration": 2.6573781967163086 }, { "auxiliary_loss_clip": 0.0165701, "auxiliary_loss_mlp": 0.00598196, "balance_loss_clip": 1.35321069, "balance_loss_mlp": 0.55156147, "epoch": 0.04142492108823087, "flos": 21906227520000.0, "grad_norm": 13.382936225765608, "language_loss": 0.89131337, "learning_rate": 3.998631240531661e-06, "loss": 0.91386539, "num_input_tokens_seen": 14606015, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.46655273, "step": 689, "time_per_iteration": 2.713397264480591 }, { "auxiliary_loss_clip": 0.01660731, "auxiliary_loss_mlp": 0.00596836, "balance_loss_clip": 1.35020816, "balance_loss_mlp": 0.55249041, "epoch": 0.04148504434089884, "flos": 27639617621760.0, "grad_norm": 1591.2107716678804, "language_loss": 0.75028044, "learning_rate": 3.998616796286848e-06, "loss": 0.77285612, "num_input_tokens_seen": 14629955, "router_z_loss_clip": 3.10351562, "router_z_loss_mlp": 0.44360352, "step": 690, "time_per_iteration": 2.756495952606201 }, { "auxiliary_loss_clip": 0.0166896, "auxiliary_loss_mlp": 0.00660485, "balance_loss_clip": 1.35126138, "balance_loss_mlp": 0.60614944, "epoch": 0.041545167593566815, "flos": 20518387781760.0, "grad_norm": 109.69850579566506, "language_loss": 0.80587423, "learning_rate": 3.998602276254286e-06, "loss": 0.82916868, "num_input_tokens_seen": 14648000, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.54321289, "step": 691, "time_per_iteration": 2.6761515140533447 }, { "auxiliary_loss_clip": 0.01649624, "auxiliary_loss_mlp": 0.00641622, "balance_loss_clip": 1.33713126, "balance_loss_mlp": 0.58945644, "epoch": 0.04160529084623478, "flos": 11868907939200.0, "grad_norm": 73.97961816373382, "language_loss": 0.90069556, "learning_rate": 3.998587680434526e-06, "loss": 0.92360806, "num_input_tokens_seen": 14662235, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.52148438, "step": 692, "time_per_iteration": 2.59477162361145 }, { "auxiliary_loss_clip": 0.01667166, "auxiliary_loss_mlp": 0.00702187, "balance_loss_clip": 1.34054923, "balance_loss_mlp": 0.64062703, "epoch": 0.04166541409890275, "flos": 14828306503680.0, "grad_norm": 81.53478424440388, "language_loss": 0.96929157, "learning_rate": 3.99857300882812e-06, "loss": 0.99298513, "num_input_tokens_seen": 14676065, "router_z_loss_clip": 3.26367188, "router_z_loss_mlp": 0.61572266, "step": 693, "time_per_iteration": 4.127526760101318 }, { "auxiliary_loss_clip": 0.01672766, "auxiliary_loss_mlp": 0.00628943, "balance_loss_clip": 1.34850574, "balance_loss_mlp": 0.57579994, "epoch": 0.04172553735157072, "flos": 25808137004160.0, "grad_norm": 8.49364069899407, "language_loss": 0.89662707, "learning_rate": 3.998558261435626e-06, "loss": 0.91964424, "num_input_tokens_seen": 14694955, "router_z_loss_clip": 3.24023438, "router_z_loss_mlp": 0.53125, "step": 694, "time_per_iteration": 4.069854497909546 }, { "auxiliary_loss_clip": 0.01680149, "auxiliary_loss_mlp": 0.00683672, "balance_loss_clip": 1.34997547, "balance_loss_mlp": 0.62499726, "epoch": 0.04178566060423869, "flos": 24279815174400.0, "grad_norm": 4.905552123487546, "language_loss": 0.9113884, "learning_rate": 3.9985434382576015e-06, "loss": 0.93502653, "num_input_tokens_seen": 14715510, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.5859375, "step": 695, "time_per_iteration": 4.090334177017212 }, { "auxiliary_loss_clip": 0.01656277, "auxiliary_loss_mlp": 0.00651567, "balance_loss_clip": 1.34279442, "balance_loss_mlp": 0.59503824, "epoch": 0.04184578385690666, "flos": 18222008411520.0, "grad_norm": 136.01014630787253, "language_loss": 0.92289627, "learning_rate": 3.99852853929461e-06, "loss": 0.94597471, "num_input_tokens_seen": 14731755, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.56542969, "step": 696, "time_per_iteration": 2.578969717025757 }, { "auxiliary_loss_clip": 0.01673582, "auxiliary_loss_mlp": 0.00670714, "balance_loss_clip": 1.35682821, "balance_loss_mlp": 0.61482871, "epoch": 0.041905907109574626, "flos": 22776342577920.0, "grad_norm": 9.189305971287913, "language_loss": 0.99314475, "learning_rate": 3.998513564547216e-06, "loss": 1.01658773, "num_input_tokens_seen": 14750810, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.55834961, "step": 697, "time_per_iteration": 2.6971144676208496 }, { "auxiliary_loss_clip": 0.01679678, "auxiliary_loss_mlp": 0.00629183, "balance_loss_clip": 1.35825109, "balance_loss_mlp": 0.57556295, "epoch": 0.0419660303622426, "flos": 20156947176960.0, "grad_norm": 25.356556580341532, "language_loss": 0.92936927, "learning_rate": 3.998498514015987e-06, "loss": 0.9524579, "num_input_tokens_seen": 14768435, "router_z_loss_clip": 3.21679688, "router_z_loss_mlp": 0.53564453, "step": 698, "time_per_iteration": 2.6208012104034424 }, { "auxiliary_loss_clip": 0.01712302, "auxiliary_loss_mlp": 0.00591885, "balance_loss_clip": 1.38408935, "balance_loss_mlp": 0.54415369, "epoch": 0.042026153614910564, "flos": 23076376882560.0, "grad_norm": 23.190362305839166, "language_loss": 0.96935213, "learning_rate": 3.998483387701495e-06, "loss": 0.99239409, "num_input_tokens_seen": 14786690, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.4777832, "step": 699, "time_per_iteration": 2.6588175296783447 }, { "auxiliary_loss_clip": 0.01637874, "auxiliary_loss_mlp": 0.00531743, "balance_loss_clip": 1.34434748, "balance_loss_mlp": 0.50217897, "epoch": 0.042086276867578536, "flos": 64495243370880.0, "grad_norm": 0.9196749125804492, "language_loss": 0.68242824, "learning_rate": 3.998468185604312e-06, "loss": 0.70412445, "num_input_tokens_seen": 14853840, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.29492188, "step": 700, "time_per_iteration": 3.1739885807037354 }, { "auxiliary_loss_clip": 0.01813639, "auxiliary_loss_mlp": 0.00546141, "balance_loss_clip": 1.45208645, "balance_loss_mlp": 0.49869582, "epoch": 0.04214640012024651, "flos": 15487016065920.0, "grad_norm": 45.69409073707609, "language_loss": 0.95972288, "learning_rate": 3.998452907725016e-06, "loss": 0.98332071, "num_input_tokens_seen": 14869580, "router_z_loss_clip": 3.61328125, "router_z_loss_mlp": 0.47436523, "step": 701, "time_per_iteration": 2.6863391399383545 }, { "auxiliary_loss_clip": 0.01849197, "auxiliary_loss_mlp": 0.00590157, "balance_loss_clip": 1.47140622, "balance_loss_mlp": 0.54087627, "epoch": 0.04220652337291447, "flos": 23877040993920.0, "grad_norm": 92.53025160928954, "language_loss": 0.73163611, "learning_rate": 3.998437554064184e-06, "loss": 0.75602967, "num_input_tokens_seen": 14891065, "router_z_loss_clip": 3.77734375, "router_z_loss_mlp": 0.49291992, "step": 702, "time_per_iteration": 2.688028573989868 }, { "auxiliary_loss_clip": 0.01725033, "auxiliary_loss_mlp": 0.00353602, "balance_loss_clip": 1.4130336, "balance_loss_mlp": 0.33510089, "epoch": 0.042266646625582445, "flos": 63795451628160.0, "grad_norm": 0.9031037050006142, "language_loss": 0.60757738, "learning_rate": 3.9984221246224006e-06, "loss": 0.62836379, "num_input_tokens_seen": 14954815, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.18457031, "step": 703, "time_per_iteration": 3.1882622241973877 }, { "auxiliary_loss_clip": 0.01709145, "auxiliary_loss_mlp": 0.00248237, "balance_loss_clip": 1.40468955, "balance_loss_mlp": 0.22954522, "epoch": 0.04232676987825041, "flos": 50018863345920.0, "grad_norm": 1.0315781545076872, "language_loss": 0.57778203, "learning_rate": 3.9984066194002494e-06, "loss": 0.59735584, "num_input_tokens_seen": 15003050, "router_z_loss_clip": 3.046875, "router_z_loss_mlp": 0.18652344, "step": 704, "time_per_iteration": 3.0621256828308105 }, { "auxiliary_loss_clip": 0.01926917, "auxiliary_loss_mlp": 0.00576541, "balance_loss_clip": 1.51088285, "balance_loss_mlp": 0.52458984, "epoch": 0.04238689313091838, "flos": 21616105368960.0, "grad_norm": 519.3402419437504, "language_loss": 0.94500351, "learning_rate": 3.998391038398319e-06, "loss": 0.97003806, "num_input_tokens_seen": 15021990, "router_z_loss_clip": 4.1640625, "router_z_loss_mlp": 0.51928711, "step": 705, "time_per_iteration": 2.6108241081237793 }, { "auxiliary_loss_clip": 0.01919147, "auxiliary_loss_mlp": 0.0053508, "balance_loss_clip": 1.50891423, "balance_loss_mlp": 0.48887488, "epoch": 0.042447016383586354, "flos": 19135109070720.0, "grad_norm": 26.816510880888114, "language_loss": 0.78098452, "learning_rate": 3.998375381617201e-06, "loss": 0.80552673, "num_input_tokens_seen": 15040700, "router_z_loss_clip": 4.10351562, "router_z_loss_mlp": 0.4621582, "step": 706, "time_per_iteration": 2.639017343521118 }, { "auxiliary_loss_clip": 0.01917584, "auxiliary_loss_mlp": 0.00536565, "balance_loss_clip": 1.50685763, "balance_loss_mlp": 0.48604369, "epoch": 0.04250713963625432, "flos": 24426007528320.0, "grad_norm": 5.315494052513124, "language_loss": 0.98393273, "learning_rate": 3.9983596490574875e-06, "loss": 1.00847423, "num_input_tokens_seen": 15056725, "router_z_loss_clip": 4.11328125, "router_z_loss_mlp": 0.50561523, "step": 707, "time_per_iteration": 2.6399965286254883 }, { "auxiliary_loss_clip": 0.01915653, "auxiliary_loss_mlp": 0.00545442, "balance_loss_clip": 1.50004673, "balance_loss_mlp": 0.49284708, "epoch": 0.04256726288892229, "flos": 30367391333760.0, "grad_norm": 39.95717743980437, "language_loss": 0.88288164, "learning_rate": 3.998343840719776e-06, "loss": 0.90749264, "num_input_tokens_seen": 15077550, "router_z_loss_clip": 4.15625, "router_z_loss_mlp": 0.52636719, "step": 708, "time_per_iteration": 2.7736308574676514 }, { "auxiliary_loss_clip": 0.01901422, "auxiliary_loss_mlp": 0.0057718, "balance_loss_clip": 1.48729539, "balance_loss_mlp": 0.52315468, "epoch": 0.04262738614159026, "flos": 16362661818240.0, "grad_norm": 58.26147947441614, "language_loss": 0.90113723, "learning_rate": 3.998327956604666e-06, "loss": 0.92592323, "num_input_tokens_seen": 15094955, "router_z_loss_clip": 4.140625, "router_z_loss_mlp": 0.54077148, "step": 709, "time_per_iteration": 2.606072187423706 }, { "auxiliary_loss_clip": 0.0192841, "auxiliary_loss_mlp": 0.00660778, "balance_loss_clip": 1.50481486, "balance_loss_mlp": 0.59883654, "epoch": 0.04268750939425823, "flos": 20412379768320.0, "grad_norm": 34.15601489535805, "language_loss": 0.93343544, "learning_rate": 3.99831199671276e-06, "loss": 0.95932728, "num_input_tokens_seen": 15113395, "router_z_loss_clip": 4.23828125, "router_z_loss_mlp": 0.61914062, "step": 710, "time_per_iteration": 2.6763060092926025 }, { "auxiliary_loss_clip": 0.01893345, "auxiliary_loss_mlp": 0.00702682, "balance_loss_clip": 1.48752081, "balance_loss_mlp": 0.64240992, "epoch": 0.0427476326469262, "flos": 20302959962880.0, "grad_norm": 36.07950193478578, "language_loss": 0.903862, "learning_rate": 3.998295961044662e-06, "loss": 0.92982233, "num_input_tokens_seen": 15132920, "router_z_loss_clip": 4.0546875, "router_z_loss_mlp": 0.60253906, "step": 711, "time_per_iteration": 2.6675117015838623 }, { "auxiliary_loss_clip": 0.01883556, "auxiliary_loss_mlp": 0.00784427, "balance_loss_clip": 1.4822607, "balance_loss_mlp": 0.71514297, "epoch": 0.042807755899594166, "flos": 21650794928640.0, "grad_norm": 61.51999906686752, "language_loss": 0.93069386, "learning_rate": 3.9982798496009804e-06, "loss": 0.95737368, "num_input_tokens_seen": 15153115, "router_z_loss_clip": 4.01367188, "router_z_loss_mlp": 0.69287109, "step": 712, "time_per_iteration": 2.79653263092041 }, { "auxiliary_loss_clip": 0.01904924, "auxiliary_loss_mlp": 0.00809494, "balance_loss_clip": 1.49445748, "balance_loss_mlp": 0.74140179, "epoch": 0.04286787915226214, "flos": 21435007973760.0, "grad_norm": 5.30412501788728, "language_loss": 0.98289996, "learning_rate": 3.998263662382328e-06, "loss": 1.0100441, "num_input_tokens_seen": 15172770, "router_z_loss_clip": 4.1015625, "router_z_loss_mlp": 0.68115234, "step": 713, "time_per_iteration": 2.673685312271118 }, { "auxiliary_loss_clip": 0.01751084, "auxiliary_loss_mlp": 0.00537148, "balance_loss_clip": 1.46883821, "balance_loss_mlp": 0.51588148, "epoch": 0.04292800240493011, "flos": 66397970615040.0, "grad_norm": 0.9727692491409453, "language_loss": 0.64473629, "learning_rate": 3.9982473993893165e-06, "loss": 0.66761863, "num_input_tokens_seen": 15240055, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.21289062, "step": 714, "time_per_iteration": 3.289107322692871 }, { "auxiliary_loss_clip": 0.01852733, "auxiliary_loss_mlp": 0.00737667, "balance_loss_clip": 1.46903765, "balance_loss_mlp": 0.67138678, "epoch": 0.042988125657598075, "flos": 31650264552960.0, "grad_norm": 39.62571548453726, "language_loss": 0.80969656, "learning_rate": 3.998231060622563e-06, "loss": 0.83560061, "num_input_tokens_seen": 15261585, "router_z_loss_clip": 3.8359375, "router_z_loss_mlp": 0.66259766, "step": 715, "time_per_iteration": 2.7501022815704346 }, { "auxiliary_loss_clip": 0.01843783, "auxiliary_loss_mlp": 0.00681385, "balance_loss_clip": 1.45391011, "balance_loss_mlp": 0.61310196, "epoch": 0.04304824891026605, "flos": 33248468292480.0, "grad_norm": 89.95459247620057, "language_loss": 0.78846765, "learning_rate": 3.998214646082688e-06, "loss": 0.81371927, "num_input_tokens_seen": 15281160, "router_z_loss_clip": 3.89453125, "router_z_loss_mlp": 0.68310547, "step": 716, "time_per_iteration": 2.75793194770813 }, { "auxiliary_loss_clip": 0.01787173, "auxiliary_loss_mlp": 0.00623957, "balance_loss_clip": 1.50843167, "balance_loss_mlp": 0.60059208, "epoch": 0.04310837216293401, "flos": 64064782782720.0, "grad_norm": 0.9451730020071218, "language_loss": 0.65513599, "learning_rate": 3.998198155770314e-06, "loss": 0.67924732, "num_input_tokens_seen": 15344505, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.23339844, "step": 717, "time_per_iteration": 3.2478103637695312 }, { "auxiliary_loss_clip": 0.01658094, "auxiliary_loss_mlp": 0.00203555, "balance_loss_clip": 1.41420054, "balance_loss_mlp": 0.1892494, "epoch": 0.043168495415601985, "flos": 61343757849600.0, "grad_norm": 1.0063366495566757, "language_loss": 0.58419108, "learning_rate": 3.998181589686065e-06, "loss": 0.60280752, "num_input_tokens_seen": 15404050, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.14257812, "step": 718, "time_per_iteration": 3.0017330646514893 }, { "auxiliary_loss_clip": 0.01826733, "auxiliary_loss_mlp": 0.00581048, "balance_loss_clip": 1.44255292, "balance_loss_mlp": 0.52408975, "epoch": 0.04322861866826996, "flos": 20704261685760.0, "grad_norm": 4.2708150121173185, "language_loss": 0.97743702, "learning_rate": 3.99816494783057e-06, "loss": 1.00151491, "num_input_tokens_seen": 15424190, "router_z_loss_clip": 3.83789062, "router_z_loss_mlp": 0.57006836, "step": 719, "time_per_iteration": 2.617546796798706 }, { "auxiliary_loss_clip": 0.01849039, "auxiliary_loss_mlp": 0.00618883, "balance_loss_clip": 1.44409192, "balance_loss_mlp": 0.56182963, "epoch": 0.04328874192093792, "flos": 30373352991360.0, "grad_norm": 779.0729005643553, "language_loss": 0.71113551, "learning_rate": 3.99814823020446e-06, "loss": 0.73581475, "num_input_tokens_seen": 15446500, "router_z_loss_clip": 4.04882812, "router_z_loss_mlp": 0.57080078, "step": 720, "time_per_iteration": 2.728274345397949 }, { "auxiliary_loss_clip": 0.01825817, "auxiliary_loss_mlp": 0.00608812, "balance_loss_clip": 1.44484568, "balance_loss_mlp": 0.55070901, "epoch": 0.043348865173605894, "flos": 21944795748480.0, "grad_norm": 34.830416017614795, "language_loss": 0.82678872, "learning_rate": 3.9981314368083684e-06, "loss": 0.85113502, "num_input_tokens_seen": 15465830, "router_z_loss_clip": 3.80859375, "router_z_loss_mlp": 0.58203125, "step": 721, "time_per_iteration": 2.6226534843444824 }, { "auxiliary_loss_clip": 0.01872051, "auxiliary_loss_mlp": 0.00577108, "balance_loss_clip": 1.48478556, "balance_loss_mlp": 0.52894735, "epoch": 0.04340898842627386, "flos": 15264225959040.0, "grad_norm": 25.554006557439678, "language_loss": 0.95385122, "learning_rate": 3.998114567642933e-06, "loss": 0.97834277, "num_input_tokens_seen": 15479985, "router_z_loss_clip": 3.86914062, "router_z_loss_mlp": 0.48120117, "step": 722, "time_per_iteration": 2.646631956100464 }, { "auxiliary_loss_clip": 0.01902472, "auxiliary_loss_mlp": 0.00635194, "balance_loss_clip": 1.50350046, "balance_loss_mlp": 0.57828343, "epoch": 0.04346911167894183, "flos": 27965434913280.0, "grad_norm": 4.6228388333301105, "language_loss": 0.908306, "learning_rate": 3.998097622708792e-06, "loss": 0.93368262, "num_input_tokens_seen": 15501545, "router_z_loss_clip": 3.98632812, "router_z_loss_mlp": 0.56860352, "step": 723, "time_per_iteration": 2.680851936340332 }, { "auxiliary_loss_clip": 0.01910245, "auxiliary_loss_mlp": 0.00576182, "balance_loss_clip": 1.50901055, "balance_loss_mlp": 0.52623308, "epoch": 0.0435292349316098, "flos": 29242202820480.0, "grad_norm": 10.845984846170527, "language_loss": 0.87470865, "learning_rate": 3.99808060200659e-06, "loss": 0.89957285, "num_input_tokens_seen": 15521725, "router_z_loss_clip": 4.015625, "router_z_loss_mlp": 0.49926758, "step": 724, "time_per_iteration": 2.6944401264190674 }, { "auxiliary_loss_clip": 0.01902456, "auxiliary_loss_mlp": 0.00614245, "balance_loss_clip": 1.50722921, "balance_loss_mlp": 0.56043375, "epoch": 0.04358935818427777, "flos": 20558356640640.0, "grad_norm": 11.818354124322987, "language_loss": 0.86182839, "learning_rate": 3.998063505536971e-06, "loss": 0.88699538, "num_input_tokens_seen": 15540910, "router_z_loss_clip": 3.95117188, "router_z_loss_mlp": 0.53833008, "step": 725, "time_per_iteration": 2.588808536529541 }, { "auxiliary_loss_clip": 0.01886803, "auxiliary_loss_mlp": 0.00625011, "balance_loss_clip": 1.48490822, "balance_loss_mlp": 0.57267785, "epoch": 0.04364948143694574, "flos": 14464926564480.0, "grad_norm": 35.68773126051081, "language_loss": 0.9403441, "learning_rate": 3.998046333300584e-06, "loss": 0.96546221, "num_input_tokens_seen": 15558640, "router_z_loss_clip": 4.01953125, "router_z_loss_mlp": 0.52319336, "step": 726, "time_per_iteration": 2.616271734237671 }, { "auxiliary_loss_clip": 0.01676269, "auxiliary_loss_mlp": 0.00515434, "balance_loss_clip": 1.35932374, "balance_loss_mlp": 0.49254543, "epoch": 0.043709604689613706, "flos": 50067268922880.0, "grad_norm": 1.00764493074828, "language_loss": 0.55935264, "learning_rate": 3.998029085298079e-06, "loss": 0.58126968, "num_input_tokens_seen": 15612975, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.22851562, "step": 727, "time_per_iteration": 3.2592782974243164 }, { "auxiliary_loss_clip": 0.01834742, "auxiliary_loss_mlp": 0.00671625, "balance_loss_clip": 1.4574368, "balance_loss_mlp": 0.61478633, "epoch": 0.04376972794228168, "flos": 13991588115840.0, "grad_norm": 80.95313198153251, "language_loss": 0.88401091, "learning_rate": 3.998011761530112e-06, "loss": 0.9090746, "num_input_tokens_seen": 15631070, "router_z_loss_clip": 3.7734375, "router_z_loss_mlp": 0.56860352, "step": 728, "time_per_iteration": 2.6531593799591064 }, { "auxiliary_loss_clip": 0.0180862, "auxiliary_loss_mlp": 0.00643124, "balance_loss_clip": 1.44599414, "balance_loss_mlp": 0.59191215, "epoch": 0.04382985119494965, "flos": 22009901149440.0, "grad_norm": 41.34957982720944, "language_loss": 0.81811553, "learning_rate": 3.997994361997338e-06, "loss": 0.84263301, "num_input_tokens_seen": 15647825, "router_z_loss_clip": 3.62304688, "router_z_loss_mlp": 0.51269531, "step": 729, "time_per_iteration": 2.6272218227386475 }, { "auxiliary_loss_clip": 0.01777496, "auxiliary_loss_mlp": 0.00626124, "balance_loss_clip": 1.42228436, "balance_loss_mlp": 0.56976199, "epoch": 0.043889974447617615, "flos": 24206521472640.0, "grad_norm": 24.594713735323435, "language_loss": 1.01923561, "learning_rate": 3.997976886700417e-06, "loss": 1.04327178, "num_input_tokens_seen": 15668260, "router_z_loss_clip": 3.54882812, "router_z_loss_mlp": 0.56396484, "step": 730, "time_per_iteration": 2.672004461288452 }, { "auxiliary_loss_clip": 0.0180749, "auxiliary_loss_mlp": 0.00605042, "balance_loss_clip": 1.4437654, "balance_loss_mlp": 0.55211353, "epoch": 0.04395009770028559, "flos": 17274541415040.0, "grad_norm": 9.708760576686043, "language_loss": 0.9552213, "learning_rate": 3.997959335640013e-06, "loss": 0.97934663, "num_input_tokens_seen": 15685630, "router_z_loss_clip": 3.63867188, "router_z_loss_mlp": 0.52905273, "step": 731, "time_per_iteration": 2.6576879024505615 }, { "auxiliary_loss_clip": 0.01807565, "auxiliary_loss_mlp": 0.00549159, "balance_loss_clip": 1.44214487, "balance_loss_mlp": 0.5004738, "epoch": 0.04401022095295355, "flos": 12310286261760.0, "grad_norm": 114.69460952735488, "language_loss": 0.99384016, "learning_rate": 3.997941708816791e-06, "loss": 1.01740742, "num_input_tokens_seen": 15698645, "router_z_loss_clip": 3.65429688, "router_z_loss_mlp": 0.48730469, "step": 732, "time_per_iteration": 2.587033271789551 }, { "auxiliary_loss_clip": 0.018012, "auxiliary_loss_mlp": 0.00556574, "balance_loss_clip": 1.43126631, "balance_loss_mlp": 0.50796074, "epoch": 0.044070344205621524, "flos": 20959658363520.0, "grad_norm": 2.9809440524821635, "language_loss": 0.92383182, "learning_rate": 3.997924006231419e-06, "loss": 0.94740951, "num_input_tokens_seen": 15716775, "router_z_loss_clip": 3.6953125, "router_z_loss_mlp": 0.48632812, "step": 733, "time_per_iteration": 2.76142954826355 }, { "auxiliary_loss_clip": 0.01799183, "auxiliary_loss_mlp": 0.00526247, "balance_loss_clip": 1.42680335, "balance_loss_mlp": 0.47608364, "epoch": 0.044130467458289496, "flos": 13845288021120.0, "grad_norm": 49.5211373737798, "language_loss": 0.9657886, "learning_rate": 3.9979062278845685e-06, "loss": 0.98904294, "num_input_tokens_seen": 15733320, "router_z_loss_clip": 3.72460938, "router_z_loss_mlp": 0.50170898, "step": 734, "time_per_iteration": 2.573207139968872 }, { "auxiliary_loss_clip": 0.01795784, "auxiliary_loss_mlp": 0.0048323, "balance_loss_clip": 1.4242177, "balance_loss_mlp": 0.43993297, "epoch": 0.04419059071095746, "flos": 28655063107200.0, "grad_norm": 108.08018719957764, "language_loss": 0.85673702, "learning_rate": 3.9978883737769125e-06, "loss": 0.87952721, "num_input_tokens_seen": 15752705, "router_z_loss_clip": 3.71484375, "router_z_loss_mlp": 0.43286133, "step": 735, "time_per_iteration": 2.7445359230041504 }, { "auxiliary_loss_clip": 0.01791788, "auxiliary_loss_mlp": 0.00516108, "balance_loss_clip": 1.41111648, "balance_loss_mlp": 0.46995038, "epoch": 0.04425071396362543, "flos": 28183304856960.0, "grad_norm": 12.348637740441953, "language_loss": 0.95734715, "learning_rate": 3.9978704439091305e-06, "loss": 0.98042613, "num_input_tokens_seen": 15772800, "router_z_loss_clip": 3.80273438, "router_z_loss_mlp": 0.46142578, "step": 736, "time_per_iteration": 5.5520179271698 }, { "auxiliary_loss_clip": 0.01778103, "auxiliary_loss_mlp": 0.00454534, "balance_loss_clip": 1.40353537, "balance_loss_mlp": 0.41130853, "epoch": 0.0443108372162934, "flos": 23658452778240.0, "grad_norm": 13.812762379932344, "language_loss": 0.90680456, "learning_rate": 3.997852438281901e-06, "loss": 0.92913091, "num_input_tokens_seen": 15793665, "router_z_loss_clip": 3.74609375, "router_z_loss_mlp": 0.43212891, "step": 737, "time_per_iteration": 4.098562717437744 }, { "auxiliary_loss_clip": 0.01806015, "auxiliary_loss_mlp": 0.00471538, "balance_loss_clip": 1.41929293, "balance_loss_mlp": 0.42306709, "epoch": 0.04437096046896137, "flos": 33979861025280.0, "grad_norm": 11.102505927743579, "language_loss": 0.91092336, "learning_rate": 3.997834356895906e-06, "loss": 0.93369889, "num_input_tokens_seen": 15813175, "router_z_loss_clip": 3.86523438, "router_z_loss_mlp": 0.48461914, "step": 738, "time_per_iteration": 4.19480299949646 }, { "auxiliary_loss_clip": 0.01512159, "auxiliary_loss_mlp": 0.00194761, "balance_loss_clip": 1.2966361, "balance_loss_mlp": 0.18121903, "epoch": 0.04443108372162934, "flos": 67397506375680.0, "grad_norm": 0.8852143336296973, "language_loss": 0.59093809, "learning_rate": 3.9978161997518324e-06, "loss": 0.60800731, "num_input_tokens_seen": 15872050, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.13574219, "step": 739, "time_per_iteration": 3.100675106048584 }, { "auxiliary_loss_clip": 0.01827177, "auxiliary_loss_mlp": 0.00446427, "balance_loss_clip": 1.4126308, "balance_loss_mlp": 0.40465599, "epoch": 0.04449120697429731, "flos": 29752672953600.0, "grad_norm": 91.91945997327304, "language_loss": 0.98923743, "learning_rate": 3.997797966850369e-06, "loss": 1.0119735, "num_input_tokens_seen": 15891085, "router_z_loss_clip": 4.14648438, "router_z_loss_mlp": 0.41796875, "step": 740, "time_per_iteration": 2.7332727909088135 }, { "auxiliary_loss_clip": 0.01845553, "auxiliary_loss_mlp": 0.00426983, "balance_loss_clip": 1.4178102, "balance_loss_mlp": 0.38602239, "epoch": 0.04455133022696528, "flos": 36502119072000.0, "grad_norm": 26.68004100042141, "language_loss": 0.78248048, "learning_rate": 3.997779658192205e-06, "loss": 0.80520582, "num_input_tokens_seen": 15914225, "router_z_loss_clip": 4.27734375, "router_z_loss_mlp": 0.40966797, "step": 741, "time_per_iteration": 2.7543768882751465 }, { "auxiliary_loss_clip": 0.01858739, "auxiliary_loss_mlp": 0.00365452, "balance_loss_clip": 1.4193368, "balance_loss_mlp": 0.33071405, "epoch": 0.044611453479633245, "flos": 28803661672320.0, "grad_norm": 68.27173179263607, "language_loss": 0.92020392, "learning_rate": 3.997761273778037e-06, "loss": 0.94244587, "num_input_tokens_seen": 15934540, "router_z_loss_clip": 4.39453125, "router_z_loss_mlp": 0.34765625, "step": 742, "time_per_iteration": 2.7714953422546387 }, { "auxiliary_loss_clip": 0.01891007, "auxiliary_loss_mlp": 0.00397318, "balance_loss_clip": 1.4294064, "balance_loss_mlp": 0.35898063, "epoch": 0.04467157673230122, "flos": 20010970304640.0, "grad_norm": 6.182932563559368, "language_loss": 0.9161191, "learning_rate": 3.997742813608561e-06, "loss": 0.93900234, "num_input_tokens_seen": 15952560, "router_z_loss_clip": 4.6171875, "router_z_loss_mlp": 0.38330078, "step": 743, "time_per_iteration": 2.598271369934082 }, { "auxiliary_loss_clip": 0.01891102, "auxiliary_loss_mlp": 0.00398452, "balance_loss_clip": 1.42897284, "balance_loss_mlp": 0.35930318, "epoch": 0.04473169998496919, "flos": 18004964480640.0, "grad_norm": 6.25588368089373, "language_loss": 0.88164341, "learning_rate": 3.997724277684479e-06, "loss": 0.90453893, "num_input_tokens_seen": 15970620, "router_z_loss_clip": 4.62109375, "router_z_loss_mlp": 0.39160156, "step": 744, "time_per_iteration": 2.652085781097412 }, { "auxiliary_loss_clip": 0.01894667, "auxiliary_loss_mlp": 0.00372913, "balance_loss_clip": 1.4377352, "balance_loss_mlp": 0.33488566, "epoch": 0.044791823237637154, "flos": 20631722169600.0, "grad_norm": 3.970387333103113, "language_loss": 0.92340887, "learning_rate": 3.99770566600649e-06, "loss": 0.94608468, "num_input_tokens_seen": 15987325, "router_z_loss_clip": 4.56640625, "router_z_loss_mlp": 0.38012695, "step": 745, "time_per_iteration": 2.6184685230255127 }, { "auxiliary_loss_clip": 0.0189821, "auxiliary_loss_mlp": 0.00429561, "balance_loss_clip": 1.43217802, "balance_loss_mlp": 0.38838568, "epoch": 0.04485194649030513, "flos": 31176171918720.0, "grad_norm": 127.52672196292924, "language_loss": 0.75506973, "learning_rate": 3.997686978575302e-06, "loss": 0.77834749, "num_input_tokens_seen": 16008310, "router_z_loss_clip": 4.66015625, "router_z_loss_mlp": 0.41162109, "step": 746, "time_per_iteration": 2.7507131099700928 }, { "auxiliary_loss_clip": 0.01898308, "auxiliary_loss_mlp": 0.00427273, "balance_loss_clip": 1.4456991, "balance_loss_mlp": 0.385717, "epoch": 0.04491206974297309, "flos": 26143291831680.0, "grad_norm": 35.49829127682906, "language_loss": 0.76436138, "learning_rate": 3.997668215391625e-06, "loss": 0.78761721, "num_input_tokens_seen": 16029620, "router_z_loss_clip": 4.53125, "router_z_loss_mlp": 0.41577148, "step": 747, "time_per_iteration": 2.664830446243286 }, { "auxiliary_loss_clip": 0.01915546, "auxiliary_loss_mlp": 0.00416157, "balance_loss_clip": 1.4569962, "balance_loss_mlp": 0.37696111, "epoch": 0.044972192995641064, "flos": 20667668705280.0, "grad_norm": 43.726412112338345, "language_loss": 0.72856754, "learning_rate": 3.997649376456168e-06, "loss": 0.75188452, "num_input_tokens_seen": 16049065, "router_z_loss_clip": 4.5859375, "router_z_loss_mlp": 0.39208984, "step": 748, "time_per_iteration": 2.7309587001800537 }, { "auxiliary_loss_clip": 0.01908121, "auxiliary_loss_mlp": 0.0048002, "balance_loss_clip": 1.4469769, "balance_loss_mlp": 0.43319502, "epoch": 0.045032316248309036, "flos": 16106834177280.0, "grad_norm": 53.38791382826946, "language_loss": 0.84428704, "learning_rate": 3.997630461769647e-06, "loss": 0.86816847, "num_input_tokens_seen": 16066765, "router_z_loss_clip": 4.6171875, "router_z_loss_mlp": 0.46826172, "step": 749, "time_per_iteration": 2.636099100112915 }, { "auxiliary_loss_clip": 0.01925955, "auxiliary_loss_mlp": 0.00516219, "balance_loss_clip": 1.45280886, "balance_loss_mlp": 0.47294623, "epoch": 0.045092439500977, "flos": 17858843953920.0, "grad_norm": 13.600231900421955, "language_loss": 0.95595843, "learning_rate": 3.997611471332778e-06, "loss": 0.98038012, "num_input_tokens_seen": 16085980, "router_z_loss_clip": 4.734375, "router_z_loss_mlp": 0.43310547, "step": 750, "time_per_iteration": 2.6272833347320557 }, { "auxiliary_loss_clip": 0.01940517, "auxiliary_loss_mlp": 0.00609479, "balance_loss_clip": 1.45615649, "balance_loss_mlp": 0.55390346, "epoch": 0.04515256275364497, "flos": 24462815990400.0, "grad_norm": 10.478815296779713, "language_loss": 0.81120455, "learning_rate": 3.9975924051462825e-06, "loss": 0.83670449, "num_input_tokens_seen": 16106260, "router_z_loss_clip": 4.84375, "router_z_loss_mlp": 0.55566406, "step": 751, "time_per_iteration": 2.723679304122925 }, { "auxiliary_loss_clip": 0.01970077, "auxiliary_loss_mlp": 0.00623429, "balance_loss_clip": 1.46576881, "balance_loss_mlp": 0.57111967, "epoch": 0.04521268600631294, "flos": 20916385453440.0, "grad_norm": 5.607757877235695, "language_loss": 0.79174054, "learning_rate": 3.997573263210883e-06, "loss": 0.81767559, "num_input_tokens_seen": 16123475, "router_z_loss_clip": 5.04296875, "router_z_loss_mlp": 0.52294922, "step": 752, "time_per_iteration": 2.68918776512146 }, { "auxiliary_loss_clip": 0.02001078, "auxiliary_loss_mlp": 0.00673054, "balance_loss_clip": 1.4692564, "balance_loss_mlp": 0.61898053, "epoch": 0.04527280925898091, "flos": 13371374954880.0, "grad_norm": 85.27299153438244, "language_loss": 1.01283848, "learning_rate": 3.997554045527305e-06, "loss": 1.03957987, "num_input_tokens_seen": 16138335, "router_z_loss_clip": 5.32421875, "router_z_loss_mlp": 0.54077148, "step": 753, "time_per_iteration": 2.6330745220184326 }, { "auxiliary_loss_clip": 0.02060231, "auxiliary_loss_mlp": 0.00822137, "balance_loss_clip": 1.47807848, "balance_loss_mlp": 0.75738233, "epoch": 0.04533293251164888, "flos": 23254565276160.0, "grad_norm": 62.128051058476764, "language_loss": 0.98132205, "learning_rate": 3.997534752096277e-06, "loss": 1.01014578, "num_input_tokens_seen": 16157110, "router_z_loss_clip": 5.82421875, "router_z_loss_mlp": 0.64746094, "step": 754, "time_per_iteration": 2.664883852005005 }, { "auxiliary_loss_clip": 0.02027432, "auxiliary_loss_mlp": 0.00795333, "balance_loss_clip": 1.46249843, "balance_loss_mlp": 0.73668188, "epoch": 0.04539305576431685, "flos": 12422004537600.0, "grad_norm": 66.2395708954885, "language_loss": 0.86686981, "learning_rate": 3.997515382918531e-06, "loss": 0.89509743, "num_input_tokens_seen": 16174155, "router_z_loss_clip": 5.64453125, "router_z_loss_mlp": 0.58642578, "step": 755, "time_per_iteration": 2.632842540740967 }, { "auxiliary_loss_clip": 0.02072408, "auxiliary_loss_mlp": 0.00844809, "balance_loss_clip": 1.47878981, "balance_loss_mlp": 0.77657354, "epoch": 0.04545317901698482, "flos": 16070995382400.0, "grad_norm": 17.939018517329966, "language_loss": 0.85513973, "learning_rate": 3.9974959379948015e-06, "loss": 0.88431191, "num_input_tokens_seen": 16192240, "router_z_loss_clip": 5.93359375, "router_z_loss_mlp": 0.68212891, "step": 756, "time_per_iteration": 2.6389565467834473 }, { "auxiliary_loss_clip": 0.01669742, "auxiliary_loss_mlp": 0.00240112, "balance_loss_clip": 1.39981651, "balance_loss_mlp": 0.22122931, "epoch": 0.045513302269652785, "flos": 66396139021440.0, "grad_norm": 0.8282115449185338, "language_loss": 0.62813628, "learning_rate": 3.997476417325827e-06, "loss": 0.6472348, "num_input_tokens_seen": 16255775, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.18847656, "step": 757, "time_per_iteration": 3.2467265129089355 }, { "auxiliary_loss_clip": 0.02122433, "auxiliary_loss_mlp": 0.00943747, "balance_loss_clip": 1.49097371, "balance_loss_mlp": 0.87460554, "epoch": 0.04557342552232076, "flos": 21471169991040.0, "grad_norm": 72.84049119495762, "language_loss": 0.88665026, "learning_rate": 3.997456820912346e-06, "loss": 0.91731203, "num_input_tokens_seen": 16277015, "router_z_loss_clip": 6.3203125, "router_z_loss_mlp": 0.69042969, "step": 758, "time_per_iteration": 2.7413125038146973 }, { "auxiliary_loss_clip": 0.02090271, "auxiliary_loss_mlp": 0.00849576, "balance_loss_clip": 1.47047448, "balance_loss_mlp": 0.79111612, "epoch": 0.04563354877498873, "flos": 23732680233600.0, "grad_norm": 47.51341628815215, "language_loss": 0.93559581, "learning_rate": 3.997437148755101e-06, "loss": 0.96499431, "num_input_tokens_seen": 16296005, "router_z_loss_clip": 6.20703125, "router_z_loss_mlp": 0.58398438, "step": 759, "time_per_iteration": 2.6904356479644775 }, { "auxiliary_loss_clip": 0.02125923, "auxiliary_loss_mlp": 0.00990581, "balance_loss_clip": 1.48251605, "balance_loss_mlp": 0.91929388, "epoch": 0.045693672027656694, "flos": 25735741142400.0, "grad_norm": 2661.7574243517897, "language_loss": 0.80696011, "learning_rate": 3.9974174008548405e-06, "loss": 0.83812511, "num_input_tokens_seen": 16315300, "router_z_loss_clip": 6.43359375, "router_z_loss_mlp": 0.71240234, "step": 760, "time_per_iteration": 2.6806578636169434 }, { "auxiliary_loss_clip": 0.02134688, "auxiliary_loss_mlp": 0.0099569, "balance_loss_clip": 1.47898078, "balance_loss_mlp": 0.93050623, "epoch": 0.045753795280324666, "flos": 19719016560000.0, "grad_norm": 6.168276617121272, "language_loss": 0.89172745, "learning_rate": 3.9973975772123105e-06, "loss": 0.92303133, "num_input_tokens_seen": 16333820, "router_z_loss_clip": 6.55859375, "router_z_loss_mlp": 0.65209961, "step": 761, "time_per_iteration": 2.616067409515381 }, { "auxiliary_loss_clip": 0.02099262, "auxiliary_loss_mlp": 0.00937008, "balance_loss_clip": 1.46195304, "balance_loss_mlp": 0.86653161, "epoch": 0.04581391853299264, "flos": 23255786338560.0, "grad_norm": 71.07298919669064, "language_loss": 0.85291123, "learning_rate": 3.997377677828266e-06, "loss": 0.88327396, "num_input_tokens_seen": 16355290, "router_z_loss_clip": 6.375, "router_z_loss_mlp": 0.70458984, "step": 762, "time_per_iteration": 2.6804826259613037 }, { "auxiliary_loss_clip": 0.01614589, "auxiliary_loss_mlp": 0.00361888, "balance_loss_clip": 1.32801676, "balance_loss_mlp": 0.33690152, "epoch": 0.0458740417856606, "flos": 64231155601920.0, "grad_norm": 0.9999990608480717, "language_loss": 0.59096426, "learning_rate": 3.9973577027034585e-06, "loss": 0.61072904, "num_input_tokens_seen": 16415995, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.24902344, "step": 763, "time_per_iteration": 3.3696467876434326 }, { "auxiliary_loss_clip": 0.0211924, "auxiliary_loss_mlp": 0.01081746, "balance_loss_clip": 1.44429326, "balance_loss_mlp": 1.00182819, "epoch": 0.045934165038328575, "flos": 20770121272320.0, "grad_norm": 2.8521362321513815, "language_loss": 0.94714499, "learning_rate": 3.9973376518386475e-06, "loss": 0.97915494, "num_input_tokens_seen": 16433120, "router_z_loss_clip": 6.75, "router_z_loss_mlp": 0.79931641, "step": 764, "time_per_iteration": 2.731343984603882 }, { "auxiliary_loss_clip": 0.0207197, "auxiliary_loss_mlp": 0.0103123, "balance_loss_clip": 1.42662525, "balance_loss_mlp": 0.95212245, "epoch": 0.04599428829099654, "flos": 30262891691520.0, "grad_norm": 233.94235822546887, "language_loss": 0.92292035, "learning_rate": 3.997317525234592e-06, "loss": 0.95395231, "num_input_tokens_seen": 16453360, "router_z_loss_clip": 6.453125, "router_z_loss_mlp": 0.79101562, "step": 765, "time_per_iteration": 2.7207884788513184 }, { "auxiliary_loss_clip": 0.02015826, "auxiliary_loss_mlp": 0.00920282, "balance_loss_clip": 1.4108932, "balance_loss_mlp": 0.84308207, "epoch": 0.04605441154366451, "flos": 23038921975680.0, "grad_norm": 39.689741247687344, "language_loss": 0.96385831, "learning_rate": 3.997297322892056e-06, "loss": 0.99321944, "num_input_tokens_seen": 16471160, "router_z_loss_clip": 6.05078125, "router_z_loss_mlp": 0.77246094, "step": 766, "time_per_iteration": 2.663517475128174 }, { "auxiliary_loss_clip": 0.02015754, "auxiliary_loss_mlp": 0.0091493, "balance_loss_clip": 1.40132523, "balance_loss_mlp": 0.84211648, "epoch": 0.046114534796332485, "flos": 22017407091840.0, "grad_norm": 59.146484763623846, "language_loss": 0.89922118, "learning_rate": 3.997277044811806e-06, "loss": 0.92852795, "num_input_tokens_seen": 16488940, "router_z_loss_clip": 6.140625, "router_z_loss_mlp": 0.72802734, "step": 767, "time_per_iteration": 2.595640182495117 }, { "auxiliary_loss_clip": 0.01979128, "auxiliary_loss_mlp": 0.00840297, "balance_loss_clip": 1.38942027, "balance_loss_mlp": 0.76486105, "epoch": 0.04617465804900045, "flos": 29862380067840.0, "grad_norm": 269.4524179954204, "language_loss": 0.93196058, "learning_rate": 3.99725669099461e-06, "loss": 0.96015477, "num_input_tokens_seen": 16509505, "router_z_loss_clip": 5.890625, "router_z_loss_mlp": 0.75439453, "step": 768, "time_per_iteration": 2.720355272293091 }, { "auxiliary_loss_clip": 0.02003907, "auxiliary_loss_mlp": 0.00896657, "balance_loss_clip": 1.39102173, "balance_loss_mlp": 0.82627594, "epoch": 0.04623478130166842, "flos": 25630056351360.0, "grad_norm": 411.67907214764506, "language_loss": 0.81029558, "learning_rate": 3.9972362614412395e-06, "loss": 0.83930117, "num_input_tokens_seen": 16528840, "router_z_loss_clip": 6.1171875, "router_z_loss_mlp": 0.70410156, "step": 769, "time_per_iteration": 2.676751136779785 }, { "auxiliary_loss_clip": 0.01979988, "auxiliary_loss_mlp": 0.00846295, "balance_loss_clip": 1.3919487, "balance_loss_mlp": 0.77605677, "epoch": 0.04629490455433639, "flos": 20449080489600.0, "grad_norm": 5.473746502186435, "language_loss": 0.91380221, "learning_rate": 3.997215756152471e-06, "loss": 0.942065, "num_input_tokens_seen": 16548335, "router_z_loss_clip": 5.875, "router_z_loss_mlp": 0.70166016, "step": 770, "time_per_iteration": 2.698193073272705 }, { "auxiliary_loss_clip": 0.01966438, "auxiliary_loss_mlp": 0.00780114, "balance_loss_clip": 1.38342619, "balance_loss_mlp": 0.70930403, "epoch": 0.04635502780700436, "flos": 23148736830720.0, "grad_norm": 42.688067939268294, "language_loss": 0.95074135, "learning_rate": 3.99719517512908e-06, "loss": 0.97820687, "num_input_tokens_seen": 16567725, "router_z_loss_clip": 5.82421875, "router_z_loss_mlp": 0.70849609, "step": 771, "time_per_iteration": 2.680745840072632 }, { "auxiliary_loss_clip": 0.01972238, "auxiliary_loss_mlp": 0.0085187, "balance_loss_clip": 1.38186872, "balance_loss_mlp": 0.77581489, "epoch": 0.04641515105967233, "flos": 23292020183040.0, "grad_norm": 201.3484719752468, "language_loss": 0.92091644, "learning_rate": 3.997174518371848e-06, "loss": 0.9491576, "num_input_tokens_seen": 16588175, "router_z_loss_clip": 5.91015625, "router_z_loss_mlp": 0.76025391, "step": 772, "time_per_iteration": 2.695600748062134 }, { "auxiliary_loss_clip": 0.01959231, "auxiliary_loss_mlp": 0.00809324, "balance_loss_clip": 1.3804276, "balance_loss_mlp": 0.73832315, "epoch": 0.046475274312340296, "flos": 25115204759040.0, "grad_norm": 35.5093176169444, "language_loss": 0.81066144, "learning_rate": 3.997153785881557e-06, "loss": 0.83834696, "num_input_tokens_seen": 16607735, "router_z_loss_clip": 5.79296875, "router_z_loss_mlp": 0.70996094, "step": 773, "time_per_iteration": 2.762474536895752 }, { "auxiliary_loss_clip": 0.01925295, "auxiliary_loss_mlp": 0.00763685, "balance_loss_clip": 1.37234688, "balance_loss_mlp": 0.69544977, "epoch": 0.04653539756500827, "flos": 25264916645760.0, "grad_norm": 371.81061758176304, "language_loss": 0.84285092, "learning_rate": 3.997132977658996e-06, "loss": 0.86974072, "num_input_tokens_seen": 16627225, "router_z_loss_clip": 5.53515625, "router_z_loss_mlp": 0.68212891, "step": 774, "time_per_iteration": 2.709393262863159 }, { "auxiliary_loss_clip": 0.01930085, "auxiliary_loss_mlp": 0.00758949, "balance_loss_clip": 1.37389922, "balance_loss_mlp": 0.6882813, "epoch": 0.046595520817676234, "flos": 35404150089600.0, "grad_norm": 12.256097133067644, "language_loss": 0.79794049, "learning_rate": 3.997112093704952e-06, "loss": 0.82483083, "num_input_tokens_seen": 16647785, "router_z_loss_clip": 5.56640625, "router_z_loss_mlp": 0.70605469, "step": 775, "time_per_iteration": 2.7908260822296143 }, { "auxiliary_loss_clip": 0.01906264, "auxiliary_loss_mlp": 0.0074636, "balance_loss_clip": 1.36411858, "balance_loss_mlp": 0.67259359, "epoch": 0.046655644070344206, "flos": 18112516778880.0, "grad_norm": 8.192144842907005, "language_loss": 0.82359064, "learning_rate": 3.997091134020217e-06, "loss": 0.85011685, "num_input_tokens_seen": 16667555, "router_z_loss_clip": 5.421875, "router_z_loss_mlp": 0.73730469, "step": 776, "time_per_iteration": 2.7027831077575684 }, { "auxiliary_loss_clip": 0.01873448, "auxiliary_loss_mlp": 0.00725034, "balance_loss_clip": 1.35835588, "balance_loss_mlp": 0.65212566, "epoch": 0.04671576732301218, "flos": 29205286617600.0, "grad_norm": 142.77043301184582, "language_loss": 0.79184151, "learning_rate": 3.997070098605585e-06, "loss": 0.81782627, "num_input_tokens_seen": 16686875, "router_z_loss_clip": 5.15625, "router_z_loss_mlp": 0.72851562, "step": 777, "time_per_iteration": 2.7541096210479736 }, { "auxiliary_loss_clip": 0.01864801, "auxiliary_loss_mlp": 0.00731263, "balance_loss_clip": 1.35666406, "balance_loss_mlp": 0.66126335, "epoch": 0.04677589057568014, "flos": 30478319510400.0, "grad_norm": 149.60500324324997, "language_loss": 0.81369823, "learning_rate": 3.997048987461856e-06, "loss": 0.83965886, "num_input_tokens_seen": 16706420, "router_z_loss_clip": 5.0859375, "router_z_loss_mlp": 0.70019531, "step": 778, "time_per_iteration": 5.677742004394531 }, { "auxiliary_loss_clip": 0.01842519, "auxiliary_loss_mlp": 0.00669039, "balance_loss_clip": 1.34712625, "balance_loss_mlp": 0.60628772, "epoch": 0.046836013828348115, "flos": 20557674282240.0, "grad_norm": 25.75734806832815, "language_loss": 0.85305798, "learning_rate": 3.997027800589829e-06, "loss": 0.87817347, "num_input_tokens_seen": 16726390, "router_z_loss_clip": 4.953125, "router_z_loss_mlp": 0.62744141, "step": 779, "time_per_iteration": 4.102157831192017 }, { "auxiliary_loss_clip": 0.01832032, "auxiliary_loss_mlp": 0.00663451, "balance_loss_clip": 1.34169757, "balance_loss_mlp": 0.59903002, "epoch": 0.04689613708101608, "flos": 25447378757760.0, "grad_norm": 6.332365112167262, "language_loss": 0.82173133, "learning_rate": 3.997006537990308e-06, "loss": 0.84668612, "num_input_tokens_seen": 16748965, "router_z_loss_clip": 4.8984375, "router_z_loss_mlp": 0.64453125, "step": 780, "time_per_iteration": 4.114478826522827 }, { "auxiliary_loss_clip": 0.01798746, "auxiliary_loss_mlp": 0.00671787, "balance_loss_clip": 1.33579683, "balance_loss_mlp": 0.61108589, "epoch": 0.04695626033368405, "flos": 23001395241600.0, "grad_norm": 3.2436311683858876, "language_loss": 0.81979525, "learning_rate": 3.996985199664099e-06, "loss": 0.84450054, "num_input_tokens_seen": 16768620, "router_z_loss_clip": 4.62890625, "router_z_loss_mlp": 0.60693359, "step": 781, "time_per_iteration": 2.6431217193603516 }, { "auxiliary_loss_clip": 0.0179338, "auxiliary_loss_mlp": 0.00730883, "balance_loss_clip": 1.33438683, "balance_loss_mlp": 0.66064465, "epoch": 0.047016383586352024, "flos": 29133357632640.0, "grad_norm": 17.64943166900422, "language_loss": 0.82255447, "learning_rate": 3.99696378561201e-06, "loss": 0.84779716, "num_input_tokens_seen": 16789755, "router_z_loss_clip": 4.58984375, "router_z_loss_mlp": 0.70214844, "step": 782, "time_per_iteration": 2.7442290782928467 }, { "auxiliary_loss_clip": 0.01757459, "auxiliary_loss_mlp": 0.00679999, "balance_loss_clip": 1.32338369, "balance_loss_mlp": 0.61572093, "epoch": 0.04707650683901999, "flos": 14976330451200.0, "grad_norm": 744.582057588403, "language_loss": 0.87187755, "learning_rate": 3.996942295834855e-06, "loss": 0.89625216, "num_input_tokens_seen": 16807585, "router_z_loss_clip": 4.3359375, "router_z_loss_mlp": 0.64306641, "step": 783, "time_per_iteration": 2.5977392196655273 }, { "auxiliary_loss_clip": 0.01726196, "auxiliary_loss_mlp": 0.00683615, "balance_loss_clip": 1.31908238, "balance_loss_mlp": 0.61785901, "epoch": 0.04713663009168796, "flos": 21651118151040.0, "grad_norm": 28.135790300750536, "language_loss": 0.87496144, "learning_rate": 3.996920730333448e-06, "loss": 0.89905953, "num_input_tokens_seen": 16827220, "router_z_loss_clip": 4.07421875, "router_z_loss_mlp": 0.65771484, "step": 784, "time_per_iteration": 2.670236349105835 }, { "auxiliary_loss_clip": 0.01719863, "auxiliary_loss_mlp": 0.00692496, "balance_loss_clip": 1.32386923, "balance_loss_mlp": 0.62683576, "epoch": 0.04719675334435593, "flos": 21325408600320.0, "grad_norm": 76.55283764483013, "language_loss": 0.85465741, "learning_rate": 3.996899089108607e-06, "loss": 0.87878096, "num_input_tokens_seen": 16846230, "router_z_loss_clip": 3.95703125, "router_z_loss_mlp": 0.65673828, "step": 785, "time_per_iteration": 2.726367473602295 }, { "auxiliary_loss_clip": 0.01708243, "auxiliary_loss_mlp": 0.00691236, "balance_loss_clip": 1.32614422, "balance_loss_mlp": 0.62481254, "epoch": 0.0472568765970239, "flos": 17931383470080.0, "grad_norm": 10.686150861979993, "language_loss": 0.97648871, "learning_rate": 3.996877372161152e-06, "loss": 1.00048351, "num_input_tokens_seen": 16865325, "router_z_loss_clip": 3.82226562, "router_z_loss_mlp": 0.6640625, "step": 786, "time_per_iteration": 2.645674705505371 }, { "auxiliary_loss_clip": 0.01697931, "auxiliary_loss_mlp": 0.0070949, "balance_loss_clip": 1.31398618, "balance_loss_mlp": 0.63820326, "epoch": 0.04731699984969187, "flos": 18077324428800.0, "grad_norm": 155.24143350845583, "language_loss": 0.86919188, "learning_rate": 3.9968555794919065e-06, "loss": 0.89326608, "num_input_tokens_seen": 16882930, "router_z_loss_clip": 3.84375, "router_z_loss_mlp": 0.71240234, "step": 787, "time_per_iteration": 2.5903167724609375 }, { "auxiliary_loss_clip": 0.01695468, "auxiliary_loss_mlp": 0.0068902, "balance_loss_clip": 1.32561815, "balance_loss_mlp": 0.62302554, "epoch": 0.047377123102359836, "flos": 23185078416000.0, "grad_norm": 11.712945113928907, "language_loss": 0.88195384, "learning_rate": 3.996833711101698e-06, "loss": 0.90579879, "num_input_tokens_seen": 16900710, "router_z_loss_clip": 3.69726562, "router_z_loss_mlp": 0.65966797, "step": 788, "time_per_iteration": 2.701110601425171 }, { "auxiliary_loss_clip": 0.01670211, "auxiliary_loss_mlp": 0.00657961, "balance_loss_clip": 1.31850886, "balance_loss_mlp": 0.59492296, "epoch": 0.04743724635502781, "flos": 22747794243840.0, "grad_norm": 16.624783614077174, "language_loss": 0.90065479, "learning_rate": 3.996811766991355e-06, "loss": 0.92393649, "num_input_tokens_seen": 16919210, "router_z_loss_clip": 3.515625, "router_z_loss_mlp": 0.63085938, "step": 789, "time_per_iteration": 2.6665971279144287 }, { "auxiliary_loss_clip": 0.01697043, "auxiliary_loss_mlp": 0.00651673, "balance_loss_clip": 1.33579159, "balance_loss_mlp": 0.58987534, "epoch": 0.04749736960769577, "flos": 17238702620160.0, "grad_norm": 26.998405653444987, "language_loss": 0.88663912, "learning_rate": 3.996789747161709e-06, "loss": 0.91012633, "num_input_tokens_seen": 16937125, "router_z_loss_clip": 3.61132812, "router_z_loss_mlp": 0.6171875, "step": 790, "time_per_iteration": 2.8173024654388428 }, { "auxiliary_loss_clip": 0.01684351, "auxiliary_loss_mlp": 0.00671671, "balance_loss_clip": 1.32096493, "balance_loss_mlp": 0.60400748, "epoch": 0.047557492860363745, "flos": 40479261592320.0, "grad_norm": 30.933860354143427, "language_loss": 0.9445321, "learning_rate": 3.996767651613597e-06, "loss": 0.96809232, "num_input_tokens_seen": 16958610, "router_z_loss_clip": 3.63671875, "router_z_loss_mlp": 0.67626953, "step": 791, "time_per_iteration": 2.7913708686828613 }, { "auxiliary_loss_clip": 0.01679159, "auxiliary_loss_mlp": 0.00628467, "balance_loss_clip": 1.31972027, "balance_loss_mlp": 0.57007837, "epoch": 0.04761761611303172, "flos": 18698004466560.0, "grad_norm": 42.835559807477786, "language_loss": 0.94821858, "learning_rate": 3.996745480347854e-06, "loss": 0.97129482, "num_input_tokens_seen": 16977300, "router_z_loss_clip": 3.59179688, "router_z_loss_mlp": 0.58349609, "step": 792, "time_per_iteration": 2.6417407989501953 }, { "auxiliary_loss_clip": 0.01691638, "auxiliary_loss_mlp": 0.00609187, "balance_loss_clip": 1.32146442, "balance_loss_mlp": 0.55339754, "epoch": 0.04767773936569968, "flos": 20921987975040.0, "grad_norm": 47.9325061351851, "language_loss": 0.79132503, "learning_rate": 3.996723233365324e-06, "loss": 0.81433332, "num_input_tokens_seen": 16994950, "router_z_loss_clip": 3.703125, "router_z_loss_mlp": 0.55883789, "step": 793, "time_per_iteration": 2.6506357192993164 }, { "auxiliary_loss_clip": 0.0171687, "auxiliary_loss_mlp": 0.00638299, "balance_loss_clip": 1.3328805, "balance_loss_mlp": 0.57497495, "epoch": 0.047737862618367655, "flos": 23732680233600.0, "grad_norm": 40.27346322317569, "language_loss": 0.93030167, "learning_rate": 3.996700910666847e-06, "loss": 0.95385337, "num_input_tokens_seen": 17014760, "router_z_loss_clip": 3.83984375, "router_z_loss_mlp": 0.63354492, "step": 794, "time_per_iteration": 2.732424259185791 }, { "auxiliary_loss_clip": 0.01747249, "auxiliary_loss_mlp": 0.00610187, "balance_loss_clip": 1.34332323, "balance_loss_mlp": 0.55186933, "epoch": 0.04779798587103562, "flos": 23695764030720.0, "grad_norm": 32.03584557102573, "language_loss": 0.77707183, "learning_rate": 3.996678512253272e-06, "loss": 0.80064619, "num_input_tokens_seen": 17032715, "router_z_loss_clip": 4.04296875, "router_z_loss_mlp": 0.58349609, "step": 795, "time_per_iteration": 2.687696695327759 }, { "auxiliary_loss_clip": 0.01744641, "auxiliary_loss_mlp": 0.0059147, "balance_loss_clip": 1.35113621, "balance_loss_mlp": 0.53663391, "epoch": 0.04785810912370359, "flos": 23183641872000.0, "grad_norm": 11.477493725723159, "language_loss": 0.8668257, "learning_rate": 3.996656038125449e-06, "loss": 0.89018691, "num_input_tokens_seen": 17052215, "router_z_loss_clip": 3.93554688, "router_z_loss_mlp": 0.54882812, "step": 796, "time_per_iteration": 2.651655435562134 }, { "auxiliary_loss_clip": 0.01771284, "auxiliary_loss_mlp": 0.00613483, "balance_loss_clip": 1.36345148, "balance_loss_mlp": 0.55297279, "epoch": 0.047918232376371564, "flos": 18040623707520.0, "grad_norm": 80.39089267269694, "language_loss": 0.89312083, "learning_rate": 3.996633488284228e-06, "loss": 0.91696852, "num_input_tokens_seen": 17069225, "router_z_loss_clip": 4.078125, "router_z_loss_mlp": 0.60546875, "step": 797, "time_per_iteration": 2.743457794189453 }, { "auxiliary_loss_clip": 0.01474906, "auxiliary_loss_mlp": 0.00404169, "balance_loss_clip": 1.24384129, "balance_loss_mlp": 0.384619, "epoch": 0.04797835562903953, "flos": 62442588758400.0, "grad_norm": 0.9467917394107902, "language_loss": 0.64544719, "learning_rate": 3.996610862730465e-06, "loss": 0.66423792, "num_input_tokens_seen": 17126680, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.1953125, "step": 798, "time_per_iteration": 3.0898241996765137 }, { "auxiliary_loss_clip": 0.01860024, "auxiliary_loss_mlp": 0.00636737, "balance_loss_clip": 1.40724587, "balance_loss_mlp": 0.57241195, "epoch": 0.0480384788817075, "flos": 21507296094720.0, "grad_norm": 70.33132455897956, "language_loss": 0.96834505, "learning_rate": 3.996588161465018e-06, "loss": 0.99331272, "num_input_tokens_seen": 17144835, "router_z_loss_clip": 4.53125, "router_z_loss_mlp": 0.64306641, "step": 799, "time_per_iteration": 2.7204036712646484 }, { "auxiliary_loss_clip": 0.01855174, "auxiliary_loss_mlp": 0.00614753, "balance_loss_clip": 1.42659497, "balance_loss_mlp": 0.55634046, "epoch": 0.048098602134375466, "flos": 21726710323200.0, "grad_norm": 128.06904460274455, "language_loss": 0.91925037, "learning_rate": 3.996565384488748e-06, "loss": 0.94394964, "num_input_tokens_seen": 17165030, "router_z_loss_clip": 4.29296875, "router_z_loss_mlp": 0.58422852, "step": 800, "time_per_iteration": 2.6985037326812744 }, { "auxiliary_loss_clip": 0.01872611, "auxiliary_loss_mlp": 0.00600719, "balance_loss_clip": 1.42619777, "balance_loss_mlp": 0.54228282, "epoch": 0.04815872538704344, "flos": 22931082368640.0, "grad_norm": 9.512955791076795, "language_loss": 0.9014957, "learning_rate": 3.996542531802518e-06, "loss": 0.92622894, "num_input_tokens_seen": 17184895, "router_z_loss_clip": 4.46484375, "router_z_loss_mlp": 0.58422852, "step": 801, "time_per_iteration": 2.749375343322754 }, { "auxiliary_loss_clip": 0.01900708, "auxiliary_loss_mlp": 0.00639607, "balance_loss_clip": 1.44764984, "balance_loss_mlp": 0.57637882, "epoch": 0.04821884863971141, "flos": 43174716042240.0, "grad_norm": 41.37458616582597, "language_loss": 0.85626942, "learning_rate": 3.996519603407196e-06, "loss": 0.88167262, "num_input_tokens_seen": 17208225, "router_z_loss_clip": 4.53320312, "router_z_loss_mlp": 0.63183594, "step": 802, "time_per_iteration": 2.8913662433624268 }, { "auxiliary_loss_clip": 0.01872651, "auxiliary_loss_mlp": 0.00637158, "balance_loss_clip": 1.42879701, "balance_loss_mlp": 0.57605088, "epoch": 0.048278971892379376, "flos": 18620006083200.0, "grad_norm": 7.141341785683392, "language_loss": 0.92570174, "learning_rate": 3.996496599303649e-06, "loss": 0.95079982, "num_input_tokens_seen": 17226305, "router_z_loss_clip": 4.4453125, "router_z_loss_mlp": 0.61132812, "step": 803, "time_per_iteration": 2.679373264312744 }, { "auxiliary_loss_clip": 0.0189631, "auxiliary_loss_mlp": 0.00645835, "balance_loss_clip": 1.45019925, "balance_loss_mlp": 0.58255845, "epoch": 0.04833909514504735, "flos": 20230061310720.0, "grad_norm": 57.341681876187224, "language_loss": 0.93902028, "learning_rate": 3.996473519492753e-06, "loss": 0.96444166, "num_input_tokens_seen": 17244545, "router_z_loss_clip": 4.46875, "router_z_loss_mlp": 0.63305664, "step": 804, "time_per_iteration": 2.8157472610473633 }, { "auxiliary_loss_clip": 0.0192525, "auxiliary_loss_mlp": 0.00621779, "balance_loss_clip": 1.46296275, "balance_loss_mlp": 0.56100631, "epoch": 0.04839921839771532, "flos": 24645170361600.0, "grad_norm": 149.79700091360982, "language_loss": 0.9161377, "learning_rate": 3.99645036397538e-06, "loss": 0.94160801, "num_input_tokens_seen": 17265730, "router_z_loss_clip": 4.62109375, "router_z_loss_mlp": 0.60766602, "step": 805, "time_per_iteration": 2.654324769973755 }, { "auxiliary_loss_clip": 0.01919477, "auxiliary_loss_mlp": 0.00615726, "balance_loss_clip": 1.45042217, "balance_loss_mlp": 0.55597806, "epoch": 0.048459341650383285, "flos": 24827452905600.0, "grad_norm": 10.681133091495237, "language_loss": 0.73470742, "learning_rate": 3.9964271327524085e-06, "loss": 0.76005942, "num_input_tokens_seen": 17284820, "router_z_loss_clip": 4.69140625, "router_z_loss_mlp": 0.59667969, "step": 806, "time_per_iteration": 2.680438280105591 }, { "auxiliary_loss_clip": 0.01906259, "auxiliary_loss_mlp": 0.00614998, "balance_loss_clip": 1.44561863, "balance_loss_mlp": 0.56094849, "epoch": 0.04851946490305126, "flos": 22163204396160.0, "grad_norm": 63.87799231482107, "language_loss": 0.82774925, "learning_rate": 3.9964038258247214e-06, "loss": 0.85296178, "num_input_tokens_seen": 17305085, "router_z_loss_clip": 4.6015625, "router_z_loss_mlp": 0.54101562, "step": 807, "time_per_iteration": 2.6045193672180176 }, { "auxiliary_loss_clip": 0.01928777, "auxiliary_loss_mlp": 0.00623396, "balance_loss_clip": 1.46732152, "balance_loss_mlp": 0.56605655, "epoch": 0.04857958815571922, "flos": 19792022952960.0, "grad_norm": 15.171736061469401, "language_loss": 0.92958784, "learning_rate": 3.9963804431932005e-06, "loss": 0.95510966, "num_input_tokens_seen": 17322715, "router_z_loss_clip": 4.61328125, "router_z_loss_mlp": 0.57324219, "step": 808, "time_per_iteration": 2.6949644088745117 }, { "auxiliary_loss_clip": 0.01958761, "auxiliary_loss_mlp": 0.00658412, "balance_loss_clip": 1.48206675, "balance_loss_mlp": 0.59270358, "epoch": 0.048639711408387194, "flos": 18697968552960.0, "grad_norm": 8.13083308779844, "language_loss": 0.96039653, "learning_rate": 3.996356984858732e-06, "loss": 0.98656821, "num_input_tokens_seen": 17341455, "router_z_loss_clip": 4.76171875, "router_z_loss_mlp": 0.65722656, "step": 809, "time_per_iteration": 2.6232762336730957 }, { "auxiliary_loss_clip": 0.01950126, "auxiliary_loss_mlp": 0.00643776, "balance_loss_clip": 1.49260533, "balance_loss_mlp": 0.58259773, "epoch": 0.048699834661055166, "flos": 24863507182080.0, "grad_norm": 6.0065731221634255, "language_loss": 0.91812855, "learning_rate": 3.996333450822208e-06, "loss": 0.9440676, "num_input_tokens_seen": 17360765, "router_z_loss_clip": 4.5703125, "router_z_loss_mlp": 0.61132812, "step": 810, "time_per_iteration": 2.6927523612976074 }, { "auxiliary_loss_clip": 0.0194961, "auxiliary_loss_mlp": 0.00617379, "balance_loss_clip": 1.47997785, "balance_loss_mlp": 0.55884743, "epoch": 0.04875995791372313, "flos": 20704010290560.0, "grad_norm": 139.49603247061447, "language_loss": 0.86343634, "learning_rate": 3.99630984108452e-06, "loss": 0.88910627, "num_input_tokens_seen": 17380625, "router_z_loss_clip": 4.70703125, "router_z_loss_mlp": 0.58447266, "step": 811, "time_per_iteration": 2.6777451038360596 }, { "auxiliary_loss_clip": 0.01938657, "auxiliary_loss_mlp": 0.00619328, "balance_loss_clip": 1.48062634, "balance_loss_mlp": 0.56444359, "epoch": 0.048820081166391104, "flos": 18588297352320.0, "grad_norm": 16.39122673326237, "language_loss": 0.79464114, "learning_rate": 3.9962861556465615e-06, "loss": 0.82022101, "num_input_tokens_seen": 17399355, "router_z_loss_clip": 4.578125, "router_z_loss_mlp": 0.54833984, "step": 812, "time_per_iteration": 2.7148749828338623 }, { "auxiliary_loss_clip": 0.0194951, "auxiliary_loss_mlp": 0.00564321, "balance_loss_clip": 1.49686468, "balance_loss_mlp": 0.51351362, "epoch": 0.04888020441905907, "flos": 22707322594560.0, "grad_norm": 19.37989838640658, "language_loss": 0.95163387, "learning_rate": 3.996262394509233e-06, "loss": 0.97677219, "num_input_tokens_seen": 17418240, "router_z_loss_clip": 4.52734375, "router_z_loss_mlp": 0.50805664, "step": 813, "time_per_iteration": 2.621969699859619 }, { "auxiliary_loss_clip": 0.01967871, "auxiliary_loss_mlp": 0.00558417, "balance_loss_clip": 1.4917593, "balance_loss_mlp": 0.50546443, "epoch": 0.04894032767172704, "flos": 22784351310720.0, "grad_norm": 148.51658978175917, "language_loss": 0.82339329, "learning_rate": 3.9962385576734335e-06, "loss": 0.84865618, "num_input_tokens_seen": 17436250, "router_z_loss_clip": 4.74609375, "router_z_loss_mlp": 0.52978516, "step": 814, "time_per_iteration": 2.6812689304351807 }, { "auxiliary_loss_clip": 0.01992883, "auxiliary_loss_mlp": 0.00622935, "balance_loss_clip": 1.50228739, "balance_loss_mlp": 0.56280619, "epoch": 0.04900045092439501, "flos": 25516147345920.0, "grad_norm": 15.494807207981768, "language_loss": 0.89238703, "learning_rate": 3.9962146451400675e-06, "loss": 0.91854525, "num_input_tokens_seen": 17455750, "router_z_loss_clip": 4.90234375, "router_z_loss_mlp": 0.60107422, "step": 815, "time_per_iteration": 2.7152721881866455 }, { "auxiliary_loss_clip": 0.020353, "auxiliary_loss_mlp": 0.00605897, "balance_loss_clip": 1.52456713, "balance_loss_mlp": 0.54681712, "epoch": 0.04906057417706298, "flos": 25958136199680.0, "grad_norm": 79.49121858297036, "language_loss": 0.98980999, "learning_rate": 3.996190656910043e-06, "loss": 1.016222, "num_input_tokens_seen": 17474995, "router_z_loss_clip": 5.10546875, "router_z_loss_mlp": 0.59082031, "step": 816, "time_per_iteration": 2.7170348167419434 }, { "auxiliary_loss_clip": 0.02029663, "auxiliary_loss_mlp": 0.00545444, "balance_loss_clip": 1.51527441, "balance_loss_mlp": 0.49394536, "epoch": 0.04912069742973095, "flos": 18624638937600.0, "grad_norm": 24.225954841030525, "language_loss": 0.86173624, "learning_rate": 3.996166592984268e-06, "loss": 0.88748729, "num_input_tokens_seen": 17493395, "router_z_loss_clip": 5.1328125, "router_z_loss_mlp": 0.51513672, "step": 817, "time_per_iteration": 2.7165017127990723 }, { "auxiliary_loss_clip": 0.02057423, "auxiliary_loss_mlp": 0.00596363, "balance_loss_clip": 1.54008555, "balance_loss_mlp": 0.53923774, "epoch": 0.049180820682398915, "flos": 23699786353920.0, "grad_norm": 24.612310013554886, "language_loss": 0.89410353, "learning_rate": 3.996142453363656e-06, "loss": 0.92064142, "num_input_tokens_seen": 17514565, "router_z_loss_clip": 5.16796875, "router_z_loss_mlp": 0.5715332, "step": 818, "time_per_iteration": 2.7395379543304443 }, { "auxiliary_loss_clip": 0.02071008, "auxiliary_loss_mlp": 0.00546629, "balance_loss_clip": 1.54373157, "balance_loss_mlp": 0.49324745, "epoch": 0.04924094393506689, "flos": 22420396753920.0, "grad_norm": 6.576655520737829, "language_loss": 0.83518481, "learning_rate": 3.996118238049124e-06, "loss": 0.86136115, "num_input_tokens_seen": 17534590, "router_z_loss_clip": 5.2734375, "router_z_loss_mlp": 0.53369141, "step": 819, "time_per_iteration": 2.7592594623565674 }, { "auxiliary_loss_clip": 0.02070044, "auxiliary_loss_mlp": 0.00575626, "balance_loss_clip": 1.53742218, "balance_loss_mlp": 0.52233922, "epoch": 0.04930106718773486, "flos": 15738246766080.0, "grad_norm": 3.521882075692675, "language_loss": 0.90201759, "learning_rate": 3.996093947041586e-06, "loss": 0.92847431, "num_input_tokens_seen": 17551900, "router_z_loss_clip": 5.3203125, "router_z_loss_mlp": 0.5324707, "step": 820, "time_per_iteration": 2.621433734893799 }, { "auxiliary_loss_clip": 0.02088102, "auxiliary_loss_mlp": 0.00608356, "balance_loss_clip": 1.54700172, "balance_loss_mlp": 0.55073047, "epoch": 0.049361190440402825, "flos": 26250628648320.0, "grad_norm": 21.169884093128427, "language_loss": 0.95639831, "learning_rate": 3.996069580341966e-06, "loss": 0.98336291, "num_input_tokens_seen": 17571485, "router_z_loss_clip": 5.4140625, "router_z_loss_mlp": 0.57641602, "step": 821, "time_per_iteration": 5.5618486404418945 }, { "auxiliary_loss_clip": 0.02095164, "auxiliary_loss_mlp": 0.00538367, "balance_loss_clip": 1.56291771, "balance_loss_mlp": 0.48968166, "epoch": 0.0494213136930708, "flos": 21252366293760.0, "grad_norm": 121.33176196475102, "language_loss": 0.95208395, "learning_rate": 3.996045137951188e-06, "loss": 0.97841918, "num_input_tokens_seen": 17591410, "router_z_loss_clip": 5.31640625, "router_z_loss_mlp": 0.48706055, "step": 822, "time_per_iteration": 2.7759718894958496 }, { "auxiliary_loss_clip": 0.02078032, "auxiliary_loss_mlp": 0.00559783, "balance_loss_clip": 1.55931485, "balance_loss_mlp": 0.50754577, "epoch": 0.04948143694573876, "flos": 27965506740480.0, "grad_norm": 59.7954962767851, "language_loss": 0.74669462, "learning_rate": 3.996020619870178e-06, "loss": 0.77307278, "num_input_tokens_seen": 17612010, "router_z_loss_clip": 5.19140625, "router_z_loss_mlp": 0.52246094, "step": 823, "time_per_iteration": 4.106802701950073 }, { "auxiliary_loss_clip": 0.01947618, "auxiliary_loss_mlp": 0.00231869, "balance_loss_clip": 1.67284644, "balance_loss_mlp": 0.21317743, "epoch": 0.049541560198406734, "flos": 66180995533440.0, "grad_norm": 1.5021650702350493, "language_loss": 0.62632531, "learning_rate": 3.995996026099866e-06, "loss": 0.64812016, "num_input_tokens_seen": 17673430, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.18652344, "step": 824, "time_per_iteration": 3.218663215637207 }, { "auxiliary_loss_clip": 0.02052215, "auxiliary_loss_mlp": 0.00632423, "balance_loss_clip": 1.53640842, "balance_loss_mlp": 0.57191187, "epoch": 0.049601683451074706, "flos": 22892693708160.0, "grad_norm": 4.986448762622307, "language_loss": 0.96080399, "learning_rate": 3.995971356641185e-06, "loss": 0.98765039, "num_input_tokens_seen": 17689545, "router_z_loss_clip": 5.16015625, "router_z_loss_mlp": 0.60522461, "step": 825, "time_per_iteration": 2.6487462520599365 }, { "auxiliary_loss_clip": 0.01990604, "auxiliary_loss_mlp": 0.00628302, "balance_loss_clip": 1.50622475, "balance_loss_mlp": 0.57134342, "epoch": 0.04966180670374267, "flos": 21433643256960.0, "grad_norm": 72.5292786641759, "language_loss": 0.73832178, "learning_rate": 3.9959466114950695e-06, "loss": 0.76451087, "num_input_tokens_seen": 17705965, "router_z_loss_clip": 4.8359375, "router_z_loss_mlp": 0.56982422, "step": 826, "time_per_iteration": 2.6590113639831543 }, { "auxiliary_loss_clip": 0.01982033, "auxiliary_loss_mlp": 0.00665089, "balance_loss_clip": 1.5020566, "balance_loss_mlp": 0.60121661, "epoch": 0.04972192995641064, "flos": 23107367341440.0, "grad_norm": 39.94237820448589, "language_loss": 0.83888173, "learning_rate": 3.995921790662459e-06, "loss": 0.86535299, "num_input_tokens_seen": 17724580, "router_z_loss_clip": 4.796875, "router_z_loss_mlp": 0.63891602, "step": 827, "time_per_iteration": 2.6934103965759277 }, { "auxiliary_loss_clip": 0.01956368, "auxiliary_loss_mlp": 0.0066146, "balance_loss_clip": 1.48178792, "balance_loss_mlp": 0.59997225, "epoch": 0.04978205320907861, "flos": 40406147458560.0, "grad_norm": 18.315037259798938, "language_loss": 0.84420794, "learning_rate": 3.995896894144294e-06, "loss": 0.87038624, "num_input_tokens_seen": 17747755, "router_z_loss_clip": 4.75390625, "router_z_loss_mlp": 0.61474609, "step": 828, "time_per_iteration": 2.8112950325012207 }, { "auxiliary_loss_clip": 0.01919099, "auxiliary_loss_mlp": 0.00663659, "balance_loss_clip": 1.47243714, "balance_loss_mlp": 0.60224187, "epoch": 0.04984217646174658, "flos": 25228539146880.0, "grad_norm": 13800.023793837081, "language_loss": 0.89132357, "learning_rate": 3.995871921941519e-06, "loss": 0.91715109, "num_input_tokens_seen": 17768550, "router_z_loss_clip": 4.4609375, "router_z_loss_mlp": 0.61376953, "step": 829, "time_per_iteration": 2.7020108699798584 }, { "auxiliary_loss_clip": 0.01918511, "auxiliary_loss_mlp": 0.00694476, "balance_loss_clip": 1.46930528, "balance_loss_mlp": 0.62590659, "epoch": 0.04990229971441455, "flos": 15959636242560.0, "grad_norm": 94.3226575017001, "language_loss": 0.82855403, "learning_rate": 3.99584687405508e-06, "loss": 0.85468388, "num_input_tokens_seen": 17786080, "router_z_loss_clip": 4.49609375, "router_z_loss_mlp": 0.68505859, "step": 830, "time_per_iteration": 2.6879682540893555 }, { "auxiliary_loss_clip": 0.01917334, "auxiliary_loss_mlp": 0.00687025, "balance_loss_clip": 1.46498466, "balance_loss_mlp": 0.62417752, "epoch": 0.04996242296708252, "flos": 18405116968320.0, "grad_norm": 4.664498017085166, "language_loss": 0.8347764, "learning_rate": 3.995821750485929e-06, "loss": 0.86082006, "num_input_tokens_seen": 17803635, "router_z_loss_clip": 4.52734375, "router_z_loss_mlp": 0.62890625, "step": 831, "time_per_iteration": 2.5884928703308105 }, { "auxiliary_loss_clip": 0.01870155, "auxiliary_loss_mlp": 0.007262, "balance_loss_clip": 1.42827058, "balance_loss_mlp": 0.66130227, "epoch": 0.05002254621975049, "flos": 17858053854720.0, "grad_norm": 6.031280293274802, "language_loss": 0.9801628, "learning_rate": 3.995796551235016e-06, "loss": 1.0061264, "num_input_tokens_seen": 17822190, "router_z_loss_clip": 4.4140625, "router_z_loss_mlp": 0.64892578, "step": 832, "time_per_iteration": 2.655500888824463 }, { "auxiliary_loss_clip": 0.01856654, "auxiliary_loss_mlp": 0.00713304, "balance_loss_clip": 1.43562102, "balance_loss_mlp": 0.64640379, "epoch": 0.050082669472418455, "flos": 45660273367680.0, "grad_norm": 16.10031813951759, "language_loss": 0.87988597, "learning_rate": 3.9957712763032974e-06, "loss": 0.90558559, "num_input_tokens_seen": 17846915, "router_z_loss_clip": 4.21289062, "router_z_loss_mlp": 0.66943359, "step": 833, "time_per_iteration": 2.8584516048431396 }, { "auxiliary_loss_clip": 0.01828871, "auxiliary_loss_mlp": 0.00660417, "balance_loss_clip": 1.41829872, "balance_loss_mlp": 0.59742665, "epoch": 0.05014279272508643, "flos": 37962067363200.0, "grad_norm": 9.46645668021778, "language_loss": 0.87605166, "learning_rate": 3.995745925691733e-06, "loss": 0.90094447, "num_input_tokens_seen": 17867270, "router_z_loss_clip": 4.10546875, "router_z_loss_mlp": 0.62939453, "step": 834, "time_per_iteration": 2.7947311401367188 }, { "auxiliary_loss_clip": 0.01827164, "auxiliary_loss_mlp": 0.00636996, "balance_loss_clip": 1.4157654, "balance_loss_mlp": 0.58032358, "epoch": 0.0502029159777544, "flos": 20996179516800.0, "grad_norm": 13.522677705756996, "language_loss": 0.97817659, "learning_rate": 3.995720499401282e-06, "loss": 1.00281811, "num_input_tokens_seen": 17884880, "router_z_loss_clip": 4.11328125, "router_z_loss_mlp": 0.56665039, "step": 835, "time_per_iteration": 2.6257901191711426 }, { "auxiliary_loss_clip": 0.01804923, "auxiliary_loss_mlp": 0.00637389, "balance_loss_clip": 1.4009192, "balance_loss_mlp": 0.57787991, "epoch": 0.050263039230422364, "flos": 15888066393600.0, "grad_norm": 23.5666771974087, "language_loss": 0.84499639, "learning_rate": 3.995694997432911e-06, "loss": 0.86941952, "num_input_tokens_seen": 17903695, "router_z_loss_clip": 4.04296875, "router_z_loss_mlp": 0.59521484, "step": 836, "time_per_iteration": 2.7221529483795166 }, { "auxiliary_loss_clip": 0.01775714, "auxiliary_loss_mlp": 0.00590374, "balance_loss_clip": 1.39550686, "balance_loss_mlp": 0.5373494, "epoch": 0.050323162483090336, "flos": 23732752060800.0, "grad_norm": 32.9990596450426, "language_loss": 0.89832258, "learning_rate": 3.9956694197875855e-06, "loss": 0.92198348, "num_input_tokens_seen": 17920745, "router_z_loss_clip": 3.80078125, "router_z_loss_mlp": 0.53027344, "step": 837, "time_per_iteration": 2.7224795818328857 }, { "auxiliary_loss_clip": 0.01822871, "auxiliary_loss_mlp": 0.00625346, "balance_loss_clip": 1.42960024, "balance_loss_mlp": 0.56698078, "epoch": 0.0503832857357583, "flos": 20266223328000.0, "grad_norm": 11.684287960086067, "language_loss": 0.79501355, "learning_rate": 3.995643766466275e-06, "loss": 0.81949568, "num_input_tokens_seen": 17938220, "router_z_loss_clip": 3.93554688, "router_z_loss_mlp": 0.58374023, "step": 838, "time_per_iteration": 2.6432714462280273 }, { "auxiliary_loss_clip": 0.01807311, "auxiliary_loss_mlp": 0.00626187, "balance_loss_clip": 1.39889097, "balance_loss_mlp": 0.56801283, "epoch": 0.05044340898842627, "flos": 17785011548160.0, "grad_norm": 11.531138834147129, "language_loss": 0.87507713, "learning_rate": 3.995618037469953e-06, "loss": 0.89941216, "num_input_tokens_seen": 17957325, "router_z_loss_clip": 4.08984375, "router_z_loss_mlp": 0.58154297, "step": 839, "time_per_iteration": 2.661890745162964 }, { "auxiliary_loss_clip": 0.01789312, "auxiliary_loss_mlp": 0.00593055, "balance_loss_clip": 1.40167117, "balance_loss_mlp": 0.54062659, "epoch": 0.050503532241094246, "flos": 22966526113920.0, "grad_norm": 44.450854048895486, "language_loss": 0.91081661, "learning_rate": 3.995592232799595e-06, "loss": 0.93464029, "num_input_tokens_seen": 17975875, "router_z_loss_clip": 3.88085938, "router_z_loss_mlp": 0.52441406, "step": 840, "time_per_iteration": 2.6979548931121826 }, { "auxiliary_loss_clip": 0.01792389, "auxiliary_loss_mlp": 0.00602572, "balance_loss_clip": 1.39741433, "balance_loss_mlp": 0.54637706, "epoch": 0.05056365549376221, "flos": 22776989022720.0, "grad_norm": 6.837679585794663, "language_loss": 1.00103331, "learning_rate": 3.99556635245618e-06, "loss": 1.02498293, "num_input_tokens_seen": 17994340, "router_z_loss_clip": 3.94726562, "router_z_loss_mlp": 0.56274414, "step": 841, "time_per_iteration": 2.683462619781494 }, { "auxiliary_loss_clip": 0.0179708, "auxiliary_loss_mlp": 0.0056253, "balance_loss_clip": 1.40491152, "balance_loss_mlp": 0.50983924, "epoch": 0.05062377874643018, "flos": 30916968399360.0, "grad_norm": 15.162577944300773, "language_loss": 0.84531176, "learning_rate": 3.995540396440688e-06, "loss": 0.86890781, "num_input_tokens_seen": 18015260, "router_z_loss_clip": 3.921875, "router_z_loss_mlp": 0.52685547, "step": 842, "time_per_iteration": 2.7239091396331787 }, { "auxiliary_loss_clip": 0.01823941, "auxiliary_loss_mlp": 0.0059746, "balance_loss_clip": 1.42272139, "balance_loss_mlp": 0.53787887, "epoch": 0.05068390199909815, "flos": 19647159402240.0, "grad_norm": 15.61960677967312, "language_loss": 0.83642244, "learning_rate": 3.995514364754105e-06, "loss": 0.86063641, "num_input_tokens_seen": 18033960, "router_z_loss_clip": 4.01171875, "router_z_loss_mlp": 0.59521484, "step": 843, "time_per_iteration": 2.6605565547943115 }, { "auxiliary_loss_clip": 0.0180933, "auxiliary_loss_mlp": 0.00575262, "balance_loss_clip": 1.41319823, "balance_loss_mlp": 0.52099806, "epoch": 0.05074402525176612, "flos": 37962103276800.0, "grad_norm": 19.771051366402194, "language_loss": 0.89457417, "learning_rate": 3.995488257397417e-06, "loss": 0.91842014, "num_input_tokens_seen": 18056700, "router_z_loss_clip": 3.95507812, "router_z_loss_mlp": 0.54321289, "step": 844, "time_per_iteration": 2.803661823272705 }, { "auxiliary_loss_clip": 0.0175222, "auxiliary_loss_mlp": 0.0051188, "balance_loss_clip": 1.36945617, "balance_loss_mlp": 0.46402895, "epoch": 0.05080414850443409, "flos": 22054610603520.0, "grad_norm": 27.97210265298818, "language_loss": 0.82759351, "learning_rate": 3.995462074371614e-06, "loss": 0.85023445, "num_input_tokens_seen": 18075815, "router_z_loss_clip": 3.82421875, "router_z_loss_mlp": 0.47851562, "step": 845, "time_per_iteration": 2.65384840965271 }, { "auxiliary_loss_clip": 0.01748123, "auxiliary_loss_mlp": 0.00534968, "balance_loss_clip": 1.37789094, "balance_loss_mlp": 0.48420846, "epoch": 0.05086427175710206, "flos": 20225787592320.0, "grad_norm": 6.728449134179256, "language_loss": 0.94173467, "learning_rate": 3.99543581567769e-06, "loss": 0.96456552, "num_input_tokens_seen": 18095095, "router_z_loss_clip": 3.70507812, "router_z_loss_mlp": 0.50805664, "step": 846, "time_per_iteration": 2.6099023818969727 }, { "auxiliary_loss_clip": 0.01765746, "auxiliary_loss_mlp": 0.00569879, "balance_loss_clip": 1.38826609, "balance_loss_mlp": 0.51795155, "epoch": 0.05092439500977003, "flos": 15159223526400.0, "grad_norm": 289.5224484625523, "language_loss": 0.92622137, "learning_rate": 3.9954094813166394e-06, "loss": 0.94957757, "num_input_tokens_seen": 18112675, "router_z_loss_clip": 3.77539062, "router_z_loss_mlp": 0.51953125, "step": 847, "time_per_iteration": 2.6253128051757812 }, { "auxiliary_loss_clip": 0.01744629, "auxiliary_loss_mlp": 0.00528142, "balance_loss_clip": 1.37170577, "balance_loss_mlp": 0.48012424, "epoch": 0.050984518262437994, "flos": 22055149307520.0, "grad_norm": 7.9634515612335175, "language_loss": 0.88365161, "learning_rate": 3.995383071289462e-06, "loss": 0.90637934, "num_input_tokens_seen": 18130745, "router_z_loss_clip": 3.72851562, "router_z_loss_mlp": 0.48022461, "step": 848, "time_per_iteration": 2.610164165496826 }, { "auxiliary_loss_clip": 0.01768101, "auxiliary_loss_mlp": 0.00542718, "balance_loss_clip": 1.38414812, "balance_loss_mlp": 0.49305579, "epoch": 0.05104464151510597, "flos": 30225329043840.0, "grad_norm": 11.171096467137474, "language_loss": 0.92854655, "learning_rate": 3.995356585597158e-06, "loss": 0.95165467, "num_input_tokens_seen": 18152410, "router_z_loss_clip": 3.83984375, "router_z_loss_mlp": 0.49658203, "step": 849, "time_per_iteration": 2.777496576309204 }, { "auxiliary_loss_clip": 0.01791547, "auxiliary_loss_mlp": 0.00486207, "balance_loss_clip": 1.40575635, "balance_loss_mlp": 0.4432677, "epoch": 0.05110476476777394, "flos": 18332900674560.0, "grad_norm": 10.891672760550325, "language_loss": 0.89826548, "learning_rate": 3.995330024240732e-06, "loss": 0.92104298, "num_input_tokens_seen": 18170870, "router_z_loss_clip": 3.85742188, "router_z_loss_mlp": 0.42944336, "step": 850, "time_per_iteration": 2.6980459690093994 }, { "auxiliary_loss_clip": 0.01804584, "auxiliary_loss_mlp": 0.00527217, "balance_loss_clip": 1.41541338, "balance_loss_mlp": 0.47826961, "epoch": 0.051164888020441904, "flos": 37998732170880.0, "grad_norm": 6.462387989099383, "language_loss": 0.73496753, "learning_rate": 3.995303387221192e-06, "loss": 0.75828552, "num_input_tokens_seen": 18191555, "router_z_loss_clip": 3.89453125, "router_z_loss_mlp": 0.48950195, "step": 851, "time_per_iteration": 2.8701047897338867 }, { "auxiliary_loss_clip": 0.0180674, "auxiliary_loss_mlp": 0.00494134, "balance_loss_clip": 1.41060925, "balance_loss_mlp": 0.44566375, "epoch": 0.051225011273109876, "flos": 23038634666880.0, "grad_norm": 154.8695135520212, "language_loss": 0.90431666, "learning_rate": 3.995276674539547e-06, "loss": 0.92732543, "num_input_tokens_seen": 18208620, "router_z_loss_clip": 3.95703125, "router_z_loss_mlp": 0.484375, "step": 852, "time_per_iteration": 2.6970930099487305 }, { "auxiliary_loss_clip": 0.01821816, "auxiliary_loss_mlp": 0.005125, "balance_loss_clip": 1.41785645, "balance_loss_mlp": 0.46309948, "epoch": 0.05128513452577785, "flos": 18259822454400.0, "grad_norm": 21.959710238248892, "language_loss": 0.8631658, "learning_rate": 3.995249886196811e-06, "loss": 0.88650888, "num_input_tokens_seen": 18226370, "router_z_loss_clip": 4.03515625, "router_z_loss_mlp": 0.49365234, "step": 853, "time_per_iteration": 2.612952709197998 }, { "auxiliary_loss_clip": 0.01807803, "auxiliary_loss_mlp": 0.00519624, "balance_loss_clip": 1.39804935, "balance_loss_mlp": 0.47039077, "epoch": 0.05134525777844581, "flos": 27198957571200.0, "grad_norm": 18.5981467577841, "language_loss": 0.8345623, "learning_rate": 3.995223022193999e-06, "loss": 0.8578366, "num_input_tokens_seen": 18247075, "router_z_loss_clip": 4.10546875, "router_z_loss_mlp": 0.49243164, "step": 854, "time_per_iteration": 2.7002182006835938 }, { "auxiliary_loss_clip": 0.01794944, "auxiliary_loss_mlp": 0.00468083, "balance_loss_clip": 1.3936404, "balance_loss_mlp": 0.42252105, "epoch": 0.051405381031113785, "flos": 28362247436160.0, "grad_norm": 17.102669687057773, "language_loss": 0.86831594, "learning_rate": 3.99519608253213e-06, "loss": 0.89094615, "num_input_tokens_seen": 18265680, "router_z_loss_clip": 4.01171875, "router_z_loss_mlp": 0.45581055, "step": 855, "time_per_iteration": 2.703603506088257 }, { "auxiliary_loss_clip": 0.01416758, "auxiliary_loss_mlp": 0.00376836, "balance_loss_clip": 1.19377446, "balance_loss_mlp": 0.35223138, "epoch": 0.05146550428378175, "flos": 65618169327360.0, "grad_norm": 0.9659037549788141, "language_loss": 0.65370506, "learning_rate": 3.995169067212227e-06, "loss": 0.67164099, "num_input_tokens_seen": 18327015, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24511719, "step": 856, "time_per_iteration": 3.125983476638794 }, { "auxiliary_loss_clip": 0.01838339, "auxiliary_loss_mlp": 0.00484257, "balance_loss_clip": 1.42251277, "balance_loss_mlp": 0.44076979, "epoch": 0.05152562753644972, "flos": 22054861998720.0, "grad_norm": 23.398901697124433, "language_loss": 0.81381327, "learning_rate": 3.9951419762353116e-06, "loss": 0.83703923, "num_input_tokens_seen": 18345235, "router_z_loss_clip": 4.1640625, "router_z_loss_mlp": 0.43530273, "step": 857, "time_per_iteration": 2.6325879096984863 }, { "auxiliary_loss_clip": 0.01847236, "auxiliary_loss_mlp": 0.0048602, "balance_loss_clip": 1.41629648, "balance_loss_mlp": 0.44029117, "epoch": 0.051585750789117694, "flos": 18509544783360.0, "grad_norm": 2.8991958216961797, "language_loss": 0.94211203, "learning_rate": 3.995114809602412e-06, "loss": 0.96544456, "num_input_tokens_seen": 18362350, "router_z_loss_clip": 4.31640625, "router_z_loss_mlp": 0.45776367, "step": 858, "time_per_iteration": 2.654993772506714 }, { "auxiliary_loss_clip": 0.0185559, "auxiliary_loss_mlp": 0.00450773, "balance_loss_clip": 1.42294741, "balance_loss_mlp": 0.40611771, "epoch": 0.05164587404178566, "flos": 23730238108800.0, "grad_norm": 2.8644293644391787, "language_loss": 0.84076244, "learning_rate": 3.9950875673145605e-06, "loss": 0.8638261, "num_input_tokens_seen": 18383390, "router_z_loss_clip": 4.328125, "router_z_loss_mlp": 0.4465332, "step": 859, "time_per_iteration": 2.6572022438049316 }, { "auxiliary_loss_clip": 0.01877652, "auxiliary_loss_mlp": 0.00465733, "balance_loss_clip": 1.42919087, "balance_loss_mlp": 0.42157817, "epoch": 0.05170599729445363, "flos": 16252882876800.0, "grad_norm": 8.120185033276039, "language_loss": 0.9786644, "learning_rate": 3.995060249372788e-06, "loss": 1.00209832, "num_input_tokens_seen": 18399220, "router_z_loss_clip": 4.484375, "router_z_loss_mlp": 0.44140625, "step": 860, "time_per_iteration": 2.6666126251220703 }, { "auxiliary_loss_clip": 0.01816181, "auxiliary_loss_mlp": 0.00453932, "balance_loss_clip": 1.39375699, "balance_loss_mlp": 0.41368699, "epoch": 0.0517661205471216, "flos": 23985922095360.0, "grad_norm": 4.95519167257323, "language_loss": 0.87895405, "learning_rate": 3.99503285577813e-06, "loss": 0.9016552, "num_input_tokens_seen": 18419005, "router_z_loss_clip": 4.22265625, "router_z_loss_mlp": 0.40209961, "step": 861, "time_per_iteration": 2.6688601970672607 }, { "auxiliary_loss_clip": 0.01849814, "auxiliary_loss_mlp": 0.00492742, "balance_loss_clip": 1.42095256, "balance_loss_mlp": 0.44796747, "epoch": 0.05182624379978957, "flos": 29277718392960.0, "grad_norm": 51.98539204076333, "language_loss": 0.84816521, "learning_rate": 3.995005386531627e-06, "loss": 0.87159079, "num_input_tokens_seen": 18440550, "router_z_loss_clip": 4.2890625, "router_z_loss_mlp": 0.44775391, "step": 862, "time_per_iteration": 2.7266600131988525 }, { "auxiliary_loss_clip": 0.01842484, "auxiliary_loss_mlp": 0.0047518, "balance_loss_clip": 1.42311859, "balance_loss_mlp": 0.43462467, "epoch": 0.05188636705245754, "flos": 24170826332160.0, "grad_norm": 13.748884119792233, "language_loss": 0.95299232, "learning_rate": 3.9949778416343195e-06, "loss": 0.97616899, "num_input_tokens_seen": 18461950, "router_z_loss_clip": 4.19921875, "router_z_loss_mlp": 0.40576172, "step": 863, "time_per_iteration": 6.9993977546691895 }, { "auxiliary_loss_clip": 0.01818335, "auxiliary_loss_mlp": 0.00488351, "balance_loss_clip": 1.39554441, "balance_loss_mlp": 0.44514924, "epoch": 0.051946490305125506, "flos": 26760703731840.0, "grad_norm": 79.08470432130815, "language_loss": 0.83019346, "learning_rate": 3.9949502210872525e-06, "loss": 0.85326034, "num_input_tokens_seen": 18480555, "router_z_loss_clip": 4.2265625, "router_z_loss_mlp": 0.43188477, "step": 864, "time_per_iteration": 2.7332234382629395 }, { "auxiliary_loss_clip": 0.01825519, "auxiliary_loss_mlp": 0.004641, "balance_loss_clip": 1.39407909, "balance_loss_mlp": 0.42273426, "epoch": 0.05200661355779348, "flos": 21502519585920.0, "grad_norm": 28.25080088290974, "language_loss": 0.85253006, "learning_rate": 3.994922524891474e-06, "loss": 0.87542629, "num_input_tokens_seen": 18499645, "router_z_loss_clip": 4.31054688, "router_z_loss_mlp": 0.41357422, "step": 865, "time_per_iteration": 4.068542718887329 }, { "auxiliary_loss_clip": 0.01813655, "auxiliary_loss_mlp": 0.00485492, "balance_loss_clip": 1.40094817, "balance_loss_mlp": 0.44205266, "epoch": 0.05206673681046144, "flos": 18114492026880.0, "grad_norm": 94.04546712657232, "language_loss": 0.91081071, "learning_rate": 3.994894753048032e-06, "loss": 0.93380225, "num_input_tokens_seen": 18516810, "router_z_loss_clip": 4.12890625, "router_z_loss_mlp": 0.43383789, "step": 866, "time_per_iteration": 2.5832226276397705 }, { "auxiliary_loss_clip": 0.01752927, "auxiliary_loss_mlp": 0.00432133, "balance_loss_clip": 1.36585259, "balance_loss_mlp": 0.39310366, "epoch": 0.052126860063129415, "flos": 17524191916800.0, "grad_norm": 4.480715756466937, "language_loss": 0.95786309, "learning_rate": 3.9948669055579815e-06, "loss": 0.97971368, "num_input_tokens_seen": 18532510, "router_z_loss_clip": 3.87109375, "router_z_loss_mlp": 0.390625, "step": 867, "time_per_iteration": 2.617755889892578 }, { "auxiliary_loss_clip": 0.01750036, "auxiliary_loss_mlp": 0.00443482, "balance_loss_clip": 1.36745274, "balance_loss_mlp": 0.40547836, "epoch": 0.05218698331579739, "flos": 32598054771840.0, "grad_norm": 17.21715381494275, "language_loss": 0.67033482, "learning_rate": 3.9948389824223785e-06, "loss": 0.69227004, "num_input_tokens_seen": 18557380, "router_z_loss_clip": 3.828125, "router_z_loss_mlp": 0.38012695, "step": 868, "time_per_iteration": 2.694946527481079 }, { "auxiliary_loss_clip": 0.01763915, "auxiliary_loss_mlp": 0.00496021, "balance_loss_clip": 1.37502027, "balance_loss_mlp": 0.4503873, "epoch": 0.05224710656846535, "flos": 22127293774080.0, "grad_norm": 30.101169403263086, "language_loss": 0.89680946, "learning_rate": 3.994810983642281e-06, "loss": 0.91940886, "num_input_tokens_seen": 18575720, "router_z_loss_clip": 3.88476562, "router_z_loss_mlp": 0.45678711, "step": 869, "time_per_iteration": 2.612485647201538 }, { "auxiliary_loss_clip": 0.01739797, "auxiliary_loss_mlp": 0.00519835, "balance_loss_clip": 1.36049187, "balance_loss_mlp": 0.47682473, "epoch": 0.052307229821133325, "flos": 11145092976000.0, "grad_norm": 13.603991159104815, "language_loss": 0.94369733, "learning_rate": 3.994782909218751e-06, "loss": 0.96629357, "num_input_tokens_seen": 18592185, "router_z_loss_clip": 3.79101562, "router_z_loss_mlp": 0.42993164, "step": 870, "time_per_iteration": 2.5729899406433105 }, { "auxiliary_loss_clip": 0.01739533, "auxiliary_loss_mlp": 0.00488883, "balance_loss_clip": 1.36362672, "balance_loss_mlp": 0.44773245, "epoch": 0.05236735307380129, "flos": 19128070005120.0, "grad_norm": 6.646130373083594, "language_loss": 0.86846924, "learning_rate": 3.994754759152854e-06, "loss": 0.89075339, "num_input_tokens_seen": 18609560, "router_z_loss_clip": 3.7578125, "router_z_loss_mlp": 0.41113281, "step": 871, "time_per_iteration": 2.632363796234131 }, { "auxiliary_loss_clip": 0.01733193, "auxiliary_loss_mlp": 0.00452427, "balance_loss_clip": 1.36935544, "balance_loss_mlp": 0.41456571, "epoch": 0.05242747632646926, "flos": 20960663944320.0, "grad_norm": 10.969765354594596, "language_loss": 0.85358477, "learning_rate": 3.994726533445656e-06, "loss": 0.87544096, "num_input_tokens_seen": 18629405, "router_z_loss_clip": 3.640625, "router_z_loss_mlp": 0.37841797, "step": 872, "time_per_iteration": 2.643563747406006 }, { "auxiliary_loss_clip": 0.0138621, "auxiliary_loss_mlp": 0.00408666, "balance_loss_clip": 1.16170096, "balance_loss_mlp": 0.38806689, "epoch": 0.052487599579137234, "flos": 65020542842880.0, "grad_norm": 1.2823183081890244, "language_loss": 0.61694765, "learning_rate": 3.9946982320982274e-06, "loss": 0.6348964, "num_input_tokens_seen": 18681480, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.20605469, "step": 873, "time_per_iteration": 3.039125680923462 }, { "auxiliary_loss_clip": 0.01727478, "auxiliary_loss_mlp": 0.00490882, "balance_loss_clip": 1.35732448, "balance_loss_mlp": 0.44701278, "epoch": 0.0525477228318052, "flos": 23288859786240.0, "grad_norm": 4.480880507769946, "language_loss": 0.94957238, "learning_rate": 3.994669855111643e-06, "loss": 0.97175604, "num_input_tokens_seen": 18700390, "router_z_loss_clip": 3.70507812, "router_z_loss_mlp": 0.43847656, "step": 874, "time_per_iteration": 2.6265294551849365 }, { "auxiliary_loss_clip": 0.01742645, "auxiliary_loss_mlp": 0.00453563, "balance_loss_clip": 1.36618471, "balance_loss_mlp": 0.410815, "epoch": 0.05260784608447317, "flos": 32230221546240.0, "grad_norm": 108.55232001613368, "language_loss": 0.79611647, "learning_rate": 3.994641402486977e-06, "loss": 0.81807858, "num_input_tokens_seen": 18721280, "router_z_loss_clip": 3.76367188, "router_z_loss_mlp": 0.42749023, "step": 875, "time_per_iteration": 2.7108123302459717 }, { "auxiliary_loss_clip": 0.01702986, "auxiliary_loss_mlp": 0.0044744, "balance_loss_clip": 1.34875894, "balance_loss_mlp": 0.40817294, "epoch": 0.052667969337141136, "flos": 24463211040000.0, "grad_norm": 185.84883256414145, "language_loss": 0.98097688, "learning_rate": 3.99461287422531e-06, "loss": 1.0024811, "num_input_tokens_seen": 18741545, "router_z_loss_clip": 3.54296875, "router_z_loss_mlp": 0.39282227, "step": 876, "time_per_iteration": 2.727052927017212 }, { "auxiliary_loss_clip": 0.01362129, "auxiliary_loss_mlp": 0.00228833, "balance_loss_clip": 1.14320755, "balance_loss_mlp": 0.21319227, "epoch": 0.05272809258980911, "flos": 57784329567360.0, "grad_norm": 0.8105898847675201, "language_loss": 0.62917268, "learning_rate": 3.994584270327722e-06, "loss": 0.64508224, "num_input_tokens_seen": 18801400, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.15625, "step": 877, "time_per_iteration": 3.1606452465057373 }, { "auxiliary_loss_clip": 0.01735485, "auxiliary_loss_mlp": 0.00509755, "balance_loss_clip": 1.36859751, "balance_loss_mlp": 0.46498042, "epoch": 0.05278821584247708, "flos": 17420805596160.0, "grad_norm": 53.329454199134176, "language_loss": 0.91574776, "learning_rate": 3.994555590795299e-06, "loss": 0.93820012, "num_input_tokens_seen": 18819670, "router_z_loss_clip": 3.66601562, "router_z_loss_mlp": 0.44750977, "step": 878, "time_per_iteration": 2.655395030975342 }, { "auxiliary_loss_clip": 0.01721791, "auxiliary_loss_mlp": 0.00470242, "balance_loss_clip": 1.35854495, "balance_loss_mlp": 0.42975807, "epoch": 0.052848339095145046, "flos": 26137258346880.0, "grad_norm": 11.289471182359376, "language_loss": 0.90511191, "learning_rate": 3.9945268356291275e-06, "loss": 0.92703223, "num_input_tokens_seen": 18840580, "router_z_loss_clip": 3.63085938, "router_z_loss_mlp": 0.4050293, "step": 879, "time_per_iteration": 2.6622626781463623 }, { "auxiliary_loss_clip": 0.01729424, "auxiliary_loss_mlp": 0.00496735, "balance_loss_clip": 1.37161911, "balance_loss_mlp": 0.45420146, "epoch": 0.05290846234781302, "flos": 16472081623680.0, "grad_norm": 19.814622542722052, "language_loss": 0.90951884, "learning_rate": 3.9944980048302985e-06, "loss": 0.93178046, "num_input_tokens_seen": 18859295, "router_z_loss_clip": 3.58203125, "router_z_loss_mlp": 0.42578125, "step": 880, "time_per_iteration": 2.6624526977539062 }, { "auxiliary_loss_clip": 0.01752044, "auxiliary_loss_mlp": 0.00539682, "balance_loss_clip": 1.37833142, "balance_loss_mlp": 0.49459723, "epoch": 0.05296858560048098, "flos": 19865173000320.0, "grad_norm": 20.64539096850892, "language_loss": 0.9406693, "learning_rate": 3.994469098399906e-06, "loss": 0.96358651, "num_input_tokens_seen": 18877485, "router_z_loss_clip": 3.734375, "router_z_loss_mlp": 0.45092773, "step": 881, "time_per_iteration": 2.631316661834717 }, { "auxiliary_loss_clip": 0.01766423, "auxiliary_loss_mlp": 0.00491335, "balance_loss_clip": 1.3840996, "balance_loss_mlp": 0.44663179, "epoch": 0.053028708853148955, "flos": 24388588535040.0, "grad_norm": 69.89188300332219, "language_loss": 0.93824089, "learning_rate": 3.994440116339046e-06, "loss": 0.96081853, "num_input_tokens_seen": 18898275, "router_z_loss_clip": 3.82226562, "router_z_loss_mlp": 0.44702148, "step": 882, "time_per_iteration": 2.6870999336242676 }, { "auxiliary_loss_clip": 0.01794781, "auxiliary_loss_mlp": 0.00515078, "balance_loss_clip": 1.40209651, "balance_loss_mlp": 0.46613115, "epoch": 0.05308883210581693, "flos": 36393166143360.0, "grad_norm": 5.288844038076498, "language_loss": 0.77087057, "learning_rate": 3.994411058648816e-06, "loss": 0.79396915, "num_input_tokens_seen": 18920665, "router_z_loss_clip": 3.92773438, "router_z_loss_mlp": 0.48901367, "step": 883, "time_per_iteration": 2.8606038093566895 }, { "auxiliary_loss_clip": 0.01806552, "auxiliary_loss_mlp": 0.00488355, "balance_loss_clip": 1.4089551, "balance_loss_mlp": 0.4426502, "epoch": 0.05314895535848489, "flos": 22855095146880.0, "grad_norm": 6.88384230790795, "language_loss": 0.82693535, "learning_rate": 3.994381925330319e-06, "loss": 0.84988439, "num_input_tokens_seen": 18939835, "router_z_loss_clip": 3.97460938, "router_z_loss_mlp": 0.45654297, "step": 884, "time_per_iteration": 2.6392874717712402 }, { "auxiliary_loss_clip": 0.01821359, "auxiliary_loss_mlp": 0.0047535, "balance_loss_clip": 1.42315745, "balance_loss_mlp": 0.43844295, "epoch": 0.053209078611152864, "flos": 12860330204160.0, "grad_norm": 756.2058686700101, "language_loss": 0.92832446, "learning_rate": 3.994352716384659e-06, "loss": 0.95129156, "num_input_tokens_seen": 18958405, "router_z_loss_clip": 3.98046875, "router_z_loss_mlp": 0.36889648, "step": 885, "time_per_iteration": 2.631077289581299 }, { "auxiliary_loss_clip": 0.0185807, "auxiliary_loss_mlp": 0.00483352, "balance_loss_clip": 1.44712305, "balance_loss_mlp": 0.44117612, "epoch": 0.05326920186382083, "flos": 12164596698240.0, "grad_norm": 44.61466463436276, "language_loss": 0.9446938, "learning_rate": 3.994323431812945e-06, "loss": 0.96810806, "num_input_tokens_seen": 18975445, "router_z_loss_clip": 4.11132812, "router_z_loss_mlp": 0.42163086, "step": 886, "time_per_iteration": 2.668527841567993 }, { "auxiliary_loss_clip": 0.01866275, "auxiliary_loss_mlp": 0.0045802, "balance_loss_clip": 1.45486283, "balance_loss_mlp": 0.41529524, "epoch": 0.0533293251164888, "flos": 22704485420160.0, "grad_norm": 48.137273334601375, "language_loss": 0.95188934, "learning_rate": 3.994294071616286e-06, "loss": 0.97513223, "num_input_tokens_seen": 18991930, "router_z_loss_clip": 4.1171875, "router_z_loss_mlp": 0.42700195, "step": 887, "time_per_iteration": 2.6415271759033203 }, { "auxiliary_loss_clip": 0.01882104, "auxiliary_loss_mlp": 0.00460157, "balance_loss_clip": 1.45388722, "balance_loss_mlp": 0.41721737, "epoch": 0.053389448369156774, "flos": 26940939200640.0, "grad_norm": 130.41107995658203, "language_loss": 0.79995942, "learning_rate": 3.994264635795796e-06, "loss": 0.82338202, "num_input_tokens_seen": 19009790, "router_z_loss_clip": 4.28125, "router_z_loss_mlp": 0.4296875, "step": 888, "time_per_iteration": 2.6423799991607666 }, { "auxiliary_loss_clip": 0.01880745, "auxiliary_loss_mlp": 0.00424104, "balance_loss_clip": 1.45791459, "balance_loss_mlp": 0.38626707, "epoch": 0.05344957162182474, "flos": 25556331686400.0, "grad_norm": 40.01266621964378, "language_loss": 0.94601011, "learning_rate": 3.994235124352592e-06, "loss": 0.96905869, "num_input_tokens_seen": 19030170, "router_z_loss_clip": 4.2265625, "router_z_loss_mlp": 0.37841797, "step": 889, "time_per_iteration": 2.7261788845062256 }, { "auxiliary_loss_clip": 0.01904904, "auxiliary_loss_mlp": 0.00392318, "balance_loss_clip": 1.46991825, "balance_loss_mlp": 0.35903525, "epoch": 0.05350969487449271, "flos": 19719591177600.0, "grad_norm": 6.150673468330552, "language_loss": 0.94277948, "learning_rate": 3.994205537287791e-06, "loss": 0.96575171, "num_input_tokens_seen": 19048075, "router_z_loss_clip": 4.35351562, "router_z_loss_mlp": 0.33300781, "step": 890, "time_per_iteration": 2.6315758228302 }, { "auxiliary_loss_clip": 0.01909533, "auxiliary_loss_mlp": 0.00422282, "balance_loss_clip": 1.47044182, "balance_loss_mlp": 0.38272801, "epoch": 0.053569818127160676, "flos": 27016351804800.0, "grad_norm": 7.292798196424232, "language_loss": 1.02258134, "learning_rate": 3.994175874602517e-06, "loss": 1.04589963, "num_input_tokens_seen": 19067465, "router_z_loss_clip": 4.3828125, "router_z_loss_mlp": 0.39550781, "step": 891, "time_per_iteration": 2.730222463607788 }, { "auxiliary_loss_clip": 0.01907218, "auxiliary_loss_mlp": 0.00400269, "balance_loss_clip": 1.45915091, "balance_loss_mlp": 0.3626225, "epoch": 0.05362994137982865, "flos": 13188338225280.0, "grad_norm": 11.449869997346486, "language_loss": 0.77762532, "learning_rate": 3.994146136297893e-06, "loss": 0.80070019, "num_input_tokens_seen": 19085505, "router_z_loss_clip": 4.484375, "router_z_loss_mlp": 0.37670898, "step": 892, "time_per_iteration": 2.5977931022644043 }, { "auxiliary_loss_clip": 0.01936303, "auxiliary_loss_mlp": 0.00386534, "balance_loss_clip": 1.46724665, "balance_loss_mlp": 0.35136753, "epoch": 0.05369006463249662, "flos": 28658008022400.0, "grad_norm": 73.52392366010264, "language_loss": 0.89767838, "learning_rate": 3.994116322375049e-06, "loss": 0.92090678, "num_input_tokens_seen": 19104360, "router_z_loss_clip": 4.6953125, "router_z_loss_mlp": 0.3515625, "step": 893, "time_per_iteration": 2.7308437824249268 }, { "auxiliary_loss_clip": 0.01917745, "auxiliary_loss_mlp": 0.00387407, "balance_loss_clip": 1.45360613, "balance_loss_mlp": 0.35266903, "epoch": 0.053750187885164585, "flos": 28913153304960.0, "grad_norm": 6.6061548550390485, "language_loss": 0.89339435, "learning_rate": 3.994086432835114e-06, "loss": 0.91644579, "num_input_tokens_seen": 19124680, "router_z_loss_clip": 4.64453125, "router_z_loss_mlp": 0.34741211, "step": 894, "time_per_iteration": 2.665363073348999 }, { "auxiliary_loss_clip": 0.01909769, "auxiliary_loss_mlp": 0.00371733, "balance_loss_clip": 1.44957817, "balance_loss_mlp": 0.33363372, "epoch": 0.05381031113783256, "flos": 15158828476800.0, "grad_norm": 22.407071972748415, "language_loss": 0.82978785, "learning_rate": 3.994056467679221e-06, "loss": 0.8526029, "num_input_tokens_seen": 19142895, "router_z_loss_clip": 4.6015625, "router_z_loss_mlp": 0.38085938, "step": 895, "time_per_iteration": 2.7008798122406006 }, { "auxiliary_loss_clip": 0.01996582, "auxiliary_loss_mlp": 0.00398709, "balance_loss_clip": 1.48255277, "balance_loss_mlp": 0.36094368, "epoch": 0.05387043439050053, "flos": 21835232288640.0, "grad_norm": 8.770876148159605, "language_loss": 0.93599904, "learning_rate": 3.9940264269085065e-06, "loss": 0.959952, "num_input_tokens_seen": 19163125, "router_z_loss_clip": 5.1328125, "router_z_loss_mlp": 0.37768555, "step": 896, "time_per_iteration": 2.700834035873413 }, { "auxiliary_loss_clip": 0.02064736, "auxiliary_loss_mlp": 0.00410395, "balance_loss_clip": 1.49287081, "balance_loss_mlp": 0.36957788, "epoch": 0.053930557643168495, "flos": 17310308382720.0, "grad_norm": 24.972660284814282, "language_loss": 0.96285182, "learning_rate": 3.9939963105241115e-06, "loss": 0.98760319, "num_input_tokens_seen": 19179385, "router_z_loss_clip": 5.72265625, "router_z_loss_mlp": 0.40820312, "step": 897, "time_per_iteration": 2.7015769481658936 }, { "auxiliary_loss_clip": 0.02110562, "auxiliary_loss_mlp": 0.0038739, "balance_loss_clip": 1.51168621, "balance_loss_mlp": 0.34750301, "epoch": 0.05399068089583647, "flos": 17348481561600.0, "grad_norm": 29.640415801413177, "language_loss": 0.95662892, "learning_rate": 3.993966118527175e-06, "loss": 0.98160833, "num_input_tokens_seen": 19198725, "router_z_loss_clip": 5.9921875, "router_z_loss_mlp": 0.39892578, "step": 898, "time_per_iteration": 2.636110782623291 }, { "auxiliary_loss_clip": 0.02149158, "auxiliary_loss_mlp": 0.00400164, "balance_loss_clip": 1.51665676, "balance_loss_mlp": 0.36020523, "epoch": 0.05405080414850443, "flos": 17486952491520.0, "grad_norm": 251.76860332533985, "language_loss": 1.01757252, "learning_rate": 3.993935850918845e-06, "loss": 1.04306579, "num_input_tokens_seen": 19212380, "router_z_loss_clip": 6.33203125, "router_z_loss_mlp": 0.3996582, "step": 899, "time_per_iteration": 2.6251611709594727 }, { "auxiliary_loss_clip": 0.02213833, "auxiliary_loss_mlp": 0.00391818, "balance_loss_clip": 1.52035844, "balance_loss_mlp": 0.3556262, "epoch": 0.054110927401172404, "flos": 24496787278080.0, "grad_norm": 31.46391484374107, "language_loss": 0.8095454, "learning_rate": 3.9939055077002665e-06, "loss": 0.83560193, "num_input_tokens_seen": 19232235, "router_z_loss_clip": 6.9296875, "router_z_loss_mlp": 0.36181641, "step": 900, "time_per_iteration": 2.6651110649108887 }, { "auxiliary_loss_clip": 0.0221555, "auxiliary_loss_mlp": 0.00408473, "balance_loss_clip": 1.50631166, "balance_loss_mlp": 0.37149426, "epoch": 0.054171050653840376, "flos": 22930040874240.0, "grad_norm": 5.017049363280182, "language_loss": 0.85126376, "learning_rate": 3.993875088872592e-06, "loss": 0.87750405, "num_input_tokens_seen": 19251460, "router_z_loss_clip": 7.08203125, "router_z_loss_mlp": 0.36987305, "step": 901, "time_per_iteration": 2.6414055824279785 }, { "auxiliary_loss_clip": 0.02124949, "auxiliary_loss_mlp": 0.00400023, "balance_loss_clip": 1.48850584, "balance_loss_mlp": 0.36402139, "epoch": 0.05423117390650834, "flos": 12933192942720.0, "grad_norm": 36.957730548523486, "language_loss": 0.92066169, "learning_rate": 3.9938445944369745e-06, "loss": 0.94591141, "num_input_tokens_seen": 19269060, "router_z_loss_clip": 6.359375, "router_z_loss_mlp": 0.35986328, "step": 902, "time_per_iteration": 2.608783006668091 }, { "auxiliary_loss_clip": 0.0206733, "auxiliary_loss_mlp": 0.00424623, "balance_loss_clip": 1.46440661, "balance_loss_mlp": 0.38475901, "epoch": 0.05429129715917631, "flos": 19901335017600.0, "grad_norm": 3.0087089055216034, "language_loss": 0.93807304, "learning_rate": 3.993814024394569e-06, "loss": 0.96299255, "num_input_tokens_seen": 19288620, "router_z_loss_clip": 6.0234375, "router_z_loss_mlp": 0.39868164, "step": 903, "time_per_iteration": 2.6678402423858643 }, { "auxiliary_loss_clip": 0.01994314, "auxiliary_loss_mlp": 0.00442839, "balance_loss_clip": 1.42761564, "balance_loss_mlp": 0.40440571, "epoch": 0.05435142041184428, "flos": 16908611610240.0, "grad_norm": 15.121796619501287, "language_loss": 0.82869506, "learning_rate": 3.993783378746537e-06, "loss": 0.85306656, "num_input_tokens_seen": 19306615, "router_z_loss_clip": 5.6640625, "router_z_loss_mlp": 0.38427734, "step": 904, "time_per_iteration": 2.6681394577026367 }, { "auxiliary_loss_clip": 0.01966773, "auxiliary_loss_mlp": 0.00508461, "balance_loss_clip": 1.43124723, "balance_loss_mlp": 0.46561676, "epoch": 0.05441154366451225, "flos": 23948323534080.0, "grad_norm": 17.274527042949426, "language_loss": 0.93922746, "learning_rate": 3.993752657494039e-06, "loss": 0.96397984, "num_input_tokens_seen": 19321680, "router_z_loss_clip": 5.35546875, "router_z_loss_mlp": 0.42797852, "step": 905, "time_per_iteration": 4.118563890457153 }, { "auxiliary_loss_clip": 0.01928249, "auxiliary_loss_mlp": 0.00474403, "balance_loss_clip": 1.41293025, "balance_loss_mlp": 0.43573138, "epoch": 0.05447166691718022, "flos": 19975382904960.0, "grad_norm": 24.178947777754704, "language_loss": 0.81103706, "learning_rate": 3.993721860638241e-06, "loss": 0.83506358, "num_input_tokens_seen": 19339760, "router_z_loss_clip": 5.1484375, "router_z_loss_mlp": 0.38696289, "step": 906, "time_per_iteration": 4.057260274887085 }, { "auxiliary_loss_clip": 0.01862781, "auxiliary_loss_mlp": 0.00488076, "balance_loss_clip": 1.36902261, "balance_loss_mlp": 0.44613814, "epoch": 0.05453179016984819, "flos": 24936513575040.0, "grad_norm": 33.82835448654488, "language_loss": 0.95018649, "learning_rate": 3.993690988180309e-06, "loss": 0.9736951, "num_input_tokens_seen": 19359585, "router_z_loss_clip": 4.94140625, "router_z_loss_mlp": 0.41943359, "step": 907, "time_per_iteration": 2.743877410888672 }, { "auxiliary_loss_clip": 0.01830778, "auxiliary_loss_mlp": 0.00532811, "balance_loss_clip": 1.36039281, "balance_loss_mlp": 0.48846492, "epoch": 0.05459191342251616, "flos": 18115102558080.0, "grad_norm": 5.046168128917598, "language_loss": 0.95843905, "learning_rate": 3.9936600401214165e-06, "loss": 0.98207504, "num_input_tokens_seen": 19378590, "router_z_loss_clip": 4.70703125, "router_z_loss_mlp": 0.44335938, "step": 908, "time_per_iteration": 4.0955681800842285 }, { "auxiliary_loss_clip": 0.01820729, "auxiliary_loss_mlp": 0.00586616, "balance_loss_clip": 1.3596282, "balance_loss_mlp": 0.54451144, "epoch": 0.054652036675184125, "flos": 19208295031680.0, "grad_norm": 125.93407706020687, "language_loss": 0.99151742, "learning_rate": 3.9936290164627345e-06, "loss": 1.01559091, "num_input_tokens_seen": 19397910, "router_z_loss_clip": 4.60546875, "router_z_loss_mlp": 0.42114258, "step": 909, "time_per_iteration": 2.6084134578704834 }, { "auxiliary_loss_clip": 0.01817703, "auxiliary_loss_mlp": 0.0057909, "balance_loss_clip": 1.35444474, "balance_loss_mlp": 0.52966583, "epoch": 0.0547121599278521, "flos": 16325745615360.0, "grad_norm": 378.68557798185253, "language_loss": 0.80332208, "learning_rate": 3.99359791720544e-06, "loss": 0.82729006, "num_input_tokens_seen": 19415950, "router_z_loss_clip": 4.640625, "router_z_loss_mlp": 0.49438477, "step": 910, "time_per_iteration": 2.5913472175598145 }, { "auxiliary_loss_clip": 0.0174992, "auxiliary_loss_mlp": 0.00587603, "balance_loss_clip": 1.31948948, "balance_loss_mlp": 0.54483104, "epoch": 0.05477228318052007, "flos": 20339014239360.0, "grad_norm": 13.949952091580005, "language_loss": 0.91533351, "learning_rate": 3.993566742350714e-06, "loss": 0.93870872, "num_input_tokens_seen": 19435275, "router_z_loss_clip": 4.3125, "router_z_loss_mlp": 0.42773438, "step": 911, "time_per_iteration": 2.622007131576538 }, { "auxiliary_loss_clip": 0.01762993, "auxiliary_loss_mlp": 0.00634464, "balance_loss_clip": 1.3304733, "balance_loss_mlp": 0.58716142, "epoch": 0.054832406433188034, "flos": 21973092687360.0, "grad_norm": 80.79802762624257, "language_loss": 0.84448653, "learning_rate": 3.993535491899736e-06, "loss": 0.86846113, "num_input_tokens_seen": 19452090, "router_z_loss_clip": 4.32421875, "router_z_loss_mlp": 0.47290039, "step": 912, "time_per_iteration": 2.661226987838745 }, { "auxiliary_loss_clip": 0.01779401, "auxiliary_loss_mlp": 0.00605116, "balance_loss_clip": 1.34452891, "balance_loss_mlp": 0.56308281, "epoch": 0.054892529685856006, "flos": 16398931576320.0, "grad_norm": 2.954012261646679, "language_loss": 0.90320653, "learning_rate": 3.993504165853694e-06, "loss": 0.92705178, "num_input_tokens_seen": 19470865, "router_z_loss_clip": 4.34765625, "router_z_loss_mlp": 0.42016602, "step": 913, "time_per_iteration": 2.582120656967163 }, { "auxiliary_loss_clip": 0.01761157, "auxiliary_loss_mlp": 0.00637234, "balance_loss_clip": 1.33852255, "balance_loss_mlp": 0.58950281, "epoch": 0.05495265293852397, "flos": 23912341084800.0, "grad_norm": 2.689229029184647, "language_loss": 0.89246714, "learning_rate": 3.993472764213772e-06, "loss": 0.91645104, "num_input_tokens_seen": 19492145, "router_z_loss_clip": 4.2265625, "router_z_loss_mlp": 0.47705078, "step": 914, "time_per_iteration": 2.684616804122925 }, { "auxiliary_loss_clip": 0.01760658, "auxiliary_loss_mlp": 0.00585628, "balance_loss_clip": 1.33318567, "balance_loss_mlp": 0.54199731, "epoch": 0.055012776191191944, "flos": 23586954756480.0, "grad_norm": 9.079867508682248, "language_loss": 0.98997307, "learning_rate": 3.9934412869811655e-06, "loss": 1.01343584, "num_input_tokens_seen": 19511015, "router_z_loss_clip": 4.2734375, "router_z_loss_mlp": 0.43603516, "step": 915, "time_per_iteration": 2.65529727935791 }, { "auxiliary_loss_clip": 0.01769287, "auxiliary_loss_mlp": 0.00549542, "balance_loss_clip": 1.334306, "balance_loss_mlp": 0.5078665, "epoch": 0.055072899443859916, "flos": 17528501548800.0, "grad_norm": 6.024315553424879, "language_loss": 0.95688248, "learning_rate": 3.993409734157064e-06, "loss": 0.98007071, "num_input_tokens_seen": 19529040, "router_z_loss_clip": 4.34765625, "router_z_loss_mlp": 0.41674805, "step": 916, "time_per_iteration": 2.7659976482391357 }, { "auxiliary_loss_clip": 0.01794558, "auxiliary_loss_mlp": 0.00563892, "balance_loss_clip": 1.34648132, "balance_loss_mlp": 0.52097714, "epoch": 0.05513302269652788, "flos": 21687172427520.0, "grad_norm": 11.251816023657971, "language_loss": 0.86911815, "learning_rate": 3.993378105742666e-06, "loss": 0.89270264, "num_input_tokens_seen": 19549540, "router_z_loss_clip": 4.48046875, "router_z_loss_mlp": 0.42895508, "step": 917, "time_per_iteration": 2.6677706241607666 }, { "auxiliary_loss_clip": 0.01794905, "auxiliary_loss_mlp": 0.0054934, "balance_loss_clip": 1.35434282, "balance_loss_mlp": 0.50628161, "epoch": 0.05519314594919585, "flos": 21613340021760.0, "grad_norm": 19.389152824504357, "language_loss": 0.87820232, "learning_rate": 3.9933464017391705e-06, "loss": 0.90164477, "num_input_tokens_seen": 19567570, "router_z_loss_clip": 4.40625, "router_z_loss_mlp": 0.42993164, "step": 918, "time_per_iteration": 2.6696524620056152 }, { "auxiliary_loss_clip": 0.01813262, "auxiliary_loss_mlp": 0.00548841, "balance_loss_clip": 1.35193563, "balance_loss_mlp": 0.50635463, "epoch": 0.05525326920186382, "flos": 21798567480960.0, "grad_norm": 3.101437172717932, "language_loss": 0.94457006, "learning_rate": 3.99331462214778e-06, "loss": 0.96819115, "num_input_tokens_seen": 19585330, "router_z_loss_clip": 4.6171875, "router_z_loss_mlp": 0.42504883, "step": 919, "time_per_iteration": 2.7370235919952393 }, { "auxiliary_loss_clip": 0.01787307, "auxiliary_loss_mlp": 0.0048915, "balance_loss_clip": 1.34047377, "balance_loss_mlp": 0.4493106, "epoch": 0.05531339245453179, "flos": 28439635288320.0, "grad_norm": 15.251987036627693, "language_loss": 0.95477593, "learning_rate": 3.993282766969699e-06, "loss": 0.97754055, "num_input_tokens_seen": 19604970, "router_z_loss_clip": 4.4609375, "router_z_loss_mlp": 0.39892578, "step": 920, "time_per_iteration": 2.7196731567382812 }, { "auxiliary_loss_clip": 0.01793841, "auxiliary_loss_mlp": 0.00475209, "balance_loss_clip": 1.34983087, "balance_loss_mlp": 0.43596563, "epoch": 0.05537351570719976, "flos": 37375143131520.0, "grad_norm": 3.41216067938645, "language_loss": 0.73470169, "learning_rate": 3.993250836206136e-06, "loss": 0.75739223, "num_input_tokens_seen": 19626235, "router_z_loss_clip": 4.44140625, "router_z_loss_mlp": 0.39233398, "step": 921, "time_per_iteration": 2.8482346534729004 }, { "auxiliary_loss_clip": 0.01827091, "auxiliary_loss_mlp": 0.00518619, "balance_loss_clip": 1.37573767, "balance_loss_mlp": 0.4750365, "epoch": 0.05543363895986773, "flos": 20084479488000.0, "grad_norm": 8.503238966940122, "language_loss": 0.79605162, "learning_rate": 3.993218829858301e-06, "loss": 0.81950867, "num_input_tokens_seen": 19644305, "router_z_loss_clip": 4.51171875, "router_z_loss_mlp": 0.43530273, "step": 922, "time_per_iteration": 2.645453929901123 }, { "auxiliary_loss_clip": 0.0178505, "auxiliary_loss_mlp": 0.00494322, "balance_loss_clip": 1.35082281, "balance_loss_mlp": 0.45269442, "epoch": 0.0554937622125357, "flos": 24533200690560.0, "grad_norm": 10.074786988647633, "language_loss": 0.90350109, "learning_rate": 3.993186747927408e-06, "loss": 0.9262948, "num_input_tokens_seen": 19662130, "router_z_loss_clip": 4.34375, "router_z_loss_mlp": 0.41625977, "step": 923, "time_per_iteration": 2.670626640319824 }, { "auxiliary_loss_clip": 0.01773417, "auxiliary_loss_mlp": 0.00451005, "balance_loss_clip": 1.33897734, "balance_loss_mlp": 0.40985391, "epoch": 0.055553885465203665, "flos": 14320063013760.0, "grad_norm": 40.94003758638627, "language_loss": 0.85532564, "learning_rate": 3.993154590414675e-06, "loss": 0.87756985, "num_input_tokens_seen": 19680715, "router_z_loss_clip": 4.3359375, "router_z_loss_mlp": 0.41137695, "step": 924, "time_per_iteration": 2.612720489501953 }, { "auxiliary_loss_clip": 0.01762343, "auxiliary_loss_mlp": 0.00463437, "balance_loss_clip": 1.34371734, "balance_loss_mlp": 0.42259562, "epoch": 0.05561400871787164, "flos": 27381132374400.0, "grad_norm": 5.4109943734867745, "language_loss": 1.07817507, "learning_rate": 3.993122357321319e-06, "loss": 1.10043287, "num_input_tokens_seen": 19700535, "router_z_loss_clip": 4.1875, "router_z_loss_mlp": 0.40795898, "step": 925, "time_per_iteration": 2.7028605937957764 }, { "auxiliary_loss_clip": 0.0175838, "auxiliary_loss_mlp": 0.00461304, "balance_loss_clip": 1.3336395, "balance_loss_mlp": 0.42153588, "epoch": 0.05567413197053961, "flos": 23221096778880.0, "grad_norm": 101.2262599651773, "language_loss": 0.87683904, "learning_rate": 3.993090048648564e-06, "loss": 0.89903587, "num_input_tokens_seen": 19718825, "router_z_loss_clip": 4.25390625, "router_z_loss_mlp": 0.39770508, "step": 926, "time_per_iteration": 2.6274898052215576 }, { "auxiliary_loss_clip": 0.017823, "auxiliary_loss_mlp": 0.00513003, "balance_loss_clip": 1.35315883, "balance_loss_mlp": 0.46877682, "epoch": 0.055734255223207574, "flos": 25264952559360.0, "grad_norm": 18.357884434419617, "language_loss": 0.84876317, "learning_rate": 3.993057664397634e-06, "loss": 0.87171614, "num_input_tokens_seen": 19739080, "router_z_loss_clip": 4.29296875, "router_z_loss_mlp": 0.44238281, "step": 927, "time_per_iteration": 2.7139382362365723 }, { "auxiliary_loss_clip": 0.0153901, "auxiliary_loss_mlp": 0.00597545, "balance_loss_clip": 1.24272084, "balance_loss_mlp": 0.56473869, "epoch": 0.055794378475875546, "flos": 66503116702080.0, "grad_norm": 94.6342139542409, "language_loss": 0.59725136, "learning_rate": 3.9930252045697585e-06, "loss": 0.61861688, "num_input_tokens_seen": 19802960, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.328125, "step": 928, "time_per_iteration": 3.1806087493896484 }, { "auxiliary_loss_clip": 0.01740478, "auxiliary_loss_mlp": 0.00506131, "balance_loss_clip": 1.33095312, "balance_loss_mlp": 0.46519473, "epoch": 0.05585450172854351, "flos": 25337635729920.0, "grad_norm": 9.22723760214616, "language_loss": 1.03655946, "learning_rate": 3.992992669166168e-06, "loss": 1.05902553, "num_input_tokens_seen": 19822765, "router_z_loss_clip": 4.08984375, "router_z_loss_mlp": 0.40942383, "step": 929, "time_per_iteration": 2.7041940689086914 }, { "auxiliary_loss_clip": 0.01738628, "auxiliary_loss_mlp": 0.00496184, "balance_loss_clip": 1.33178711, "balance_loss_mlp": 0.45384097, "epoch": 0.05591462498121148, "flos": 33911738881920.0, "grad_norm": 4.1632716818384985, "language_loss": 0.81096864, "learning_rate": 3.992960058188094e-06, "loss": 0.83331674, "num_input_tokens_seen": 19843590, "router_z_loss_clip": 4.0703125, "router_z_loss_mlp": 0.42333984, "step": 930, "time_per_iteration": 2.8635597229003906 }, { "auxiliary_loss_clip": 0.01727614, "auxiliary_loss_mlp": 0.00441801, "balance_loss_clip": 1.31811261, "balance_loss_mlp": 0.40172291, "epoch": 0.055974748233879455, "flos": 17930880679680.0, "grad_norm": 24.608419918063564, "language_loss": 0.93112612, "learning_rate": 3.992927371636776e-06, "loss": 0.9528203, "num_input_tokens_seen": 19860230, "router_z_loss_clip": 4.09765625, "router_z_loss_mlp": 0.40112305, "step": 931, "time_per_iteration": 2.596482515335083 }, { "auxiliary_loss_clip": 0.01717981, "auxiliary_loss_mlp": 0.00484677, "balance_loss_clip": 1.31293285, "balance_loss_mlp": 0.441333, "epoch": 0.05603487148654742, "flos": 24021976371840.0, "grad_norm": 25.5514173118681, "language_loss": 0.90797722, "learning_rate": 3.9928946095134525e-06, "loss": 0.93000388, "num_input_tokens_seen": 19880795, "router_z_loss_clip": 4.046875, "router_z_loss_mlp": 0.43359375, "step": 932, "time_per_iteration": 2.7318124771118164 }, { "auxiliary_loss_clip": 0.01708789, "auxiliary_loss_mlp": 0.00490028, "balance_loss_clip": 1.31256866, "balance_loss_mlp": 0.4480184, "epoch": 0.05609499473921539, "flos": 17307758517120.0, "grad_norm": 6.287785884327479, "language_loss": 0.83031631, "learning_rate": 3.992861771819365e-06, "loss": 0.85230452, "num_input_tokens_seen": 19897960, "router_z_loss_clip": 3.95898438, "router_z_loss_mlp": 0.42041016, "step": 933, "time_per_iteration": 2.5961861610412598 }, { "auxiliary_loss_clip": 0.01705374, "auxiliary_loss_mlp": 0.0043135, "balance_loss_clip": 1.30995607, "balance_loss_mlp": 0.39332202, "epoch": 0.05615511799188336, "flos": 20994742972800.0, "grad_norm": 4.21534535401605, "language_loss": 0.9413777, "learning_rate": 3.99282885855576e-06, "loss": 0.96274495, "num_input_tokens_seen": 19913315, "router_z_loss_clip": 3.95703125, "router_z_loss_mlp": 0.38061523, "step": 934, "time_per_iteration": 2.6803839206695557 }, { "auxiliary_loss_clip": 0.01692668, "auxiliary_loss_mlp": 0.00399836, "balance_loss_clip": 1.30016255, "balance_loss_mlp": 0.36290526, "epoch": 0.05621524124455133, "flos": 17273535834240.0, "grad_norm": 107.04591177556884, "language_loss": 0.87611997, "learning_rate": 3.992795869723885e-06, "loss": 0.89704496, "num_input_tokens_seen": 19928790, "router_z_loss_clip": 3.93164062, "router_z_loss_mlp": 0.36962891, "step": 935, "time_per_iteration": 2.605409860610962 }, { "auxiliary_loss_clip": 0.0150573, "auxiliary_loss_mlp": 0.00458334, "balance_loss_clip": 1.19996762, "balance_loss_mlp": 0.42724457, "epoch": 0.0562753644972193, "flos": 58719370458240.0, "grad_norm": 0.8607848108686647, "language_loss": 0.6916219, "learning_rate": 3.99276280532499e-06, "loss": 0.71126258, "num_input_tokens_seen": 19988785, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.31054688, "step": 936, "time_per_iteration": 3.047752618789673 }, { "auxiliary_loss_clip": 0.01718602, "auxiliary_loss_mlp": 0.00502104, "balance_loss_clip": 1.31997311, "balance_loss_mlp": 0.46073857, "epoch": 0.05633548774988727, "flos": 17457039440640.0, "grad_norm": 32.678350370678615, "language_loss": 0.85398382, "learning_rate": 3.992729665360331e-06, "loss": 0.8761909, "num_input_tokens_seen": 20007685, "router_z_loss_clip": 3.98632812, "router_z_loss_mlp": 0.4140625, "step": 937, "time_per_iteration": 2.619739294052124 }, { "auxiliary_loss_clip": 0.01490642, "auxiliary_loss_mlp": 0.00270365, "balance_loss_clip": 1.20579696, "balance_loss_mlp": 0.24385297, "epoch": 0.05639561100255524, "flos": 70654928083200.0, "grad_norm": 0.8833337043326056, "language_loss": 0.64388412, "learning_rate": 3.992696449831162e-06, "loss": 0.6614942, "num_input_tokens_seen": 20072750, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.265625, "step": 938, "time_per_iteration": 3.084285259246826 }, { "auxiliary_loss_clip": 0.01749834, "auxiliary_loss_mlp": 0.00514616, "balance_loss_clip": 1.32865465, "balance_loss_mlp": 0.47081828, "epoch": 0.056455734255223204, "flos": 20485996692480.0, "grad_norm": 93.44651891046269, "language_loss": 0.88869172, "learning_rate": 3.992663158738745e-06, "loss": 0.91133618, "num_input_tokens_seen": 20089070, "router_z_loss_clip": 4.2109375, "router_z_loss_mlp": 0.43774414, "step": 939, "time_per_iteration": 2.668912410736084 }, { "auxiliary_loss_clip": 0.01764555, "auxiliary_loss_mlp": 0.00505843, "balance_loss_clip": 1.34084237, "balance_loss_mlp": 0.46211731, "epoch": 0.056515857507891176, "flos": 22053569109120.0, "grad_norm": 5.949069665058613, "language_loss": 0.80140364, "learning_rate": 3.992629792084341e-06, "loss": 0.82410765, "num_input_tokens_seen": 20108790, "router_z_loss_clip": 4.234375, "router_z_loss_mlp": 0.43701172, "step": 940, "time_per_iteration": 2.681264638900757 }, { "auxiliary_loss_clip": 0.01800432, "auxiliary_loss_mlp": 0.0053689, "balance_loss_clip": 1.36484575, "balance_loss_mlp": 0.49426088, "epoch": 0.05657598076055915, "flos": 24025316336640.0, "grad_norm": 14.820073274207592, "language_loss": 0.78170013, "learning_rate": 3.992596349869216e-06, "loss": 0.80507338, "num_input_tokens_seen": 20128455, "router_z_loss_clip": 4.35546875, "router_z_loss_mlp": 0.42626953, "step": 941, "time_per_iteration": 2.6951587200164795 }, { "auxiliary_loss_clip": 0.01815152, "auxiliary_loss_mlp": 0.0054092, "balance_loss_clip": 1.36693358, "balance_loss_mlp": 0.49748033, "epoch": 0.05663610401322711, "flos": 20480609652480.0, "grad_norm": 362.0262727255131, "language_loss": 0.87207568, "learning_rate": 3.992562832094637e-06, "loss": 0.89563638, "num_input_tokens_seen": 20145775, "router_z_loss_clip": 4.48046875, "router_z_loss_mlp": 0.43457031, "step": 942, "time_per_iteration": 2.6325583457946777 }, { "auxiliary_loss_clip": 0.01847493, "auxiliary_loss_mlp": 0.00565238, "balance_loss_clip": 1.36444974, "balance_loss_mlp": 0.52141637, "epoch": 0.056696227265895086, "flos": 21069042255360.0, "grad_norm": 53.36786643870972, "language_loss": 0.96522897, "learning_rate": 3.9925292387618755e-06, "loss": 0.98935628, "num_input_tokens_seen": 20164315, "router_z_loss_clip": 4.828125, "router_z_loss_mlp": 0.43823242, "step": 943, "time_per_iteration": 2.750648021697998 }, { "auxiliary_loss_clip": 0.01907178, "auxiliary_loss_mlp": 0.00547553, "balance_loss_clip": 1.39377451, "balance_loss_mlp": 0.50525713, "epoch": 0.05675635051856306, "flos": 17821317219840.0, "grad_norm": 26.747850149197312, "language_loss": 0.84224367, "learning_rate": 3.992495569872206e-06, "loss": 0.86679101, "num_input_tokens_seen": 20182760, "router_z_loss_clip": 5.12890625, "router_z_loss_mlp": 0.4230957, "step": 944, "time_per_iteration": 2.717059373855591 }, { "auxiliary_loss_clip": 0.01940047, "auxiliary_loss_mlp": 0.00512523, "balance_loss_clip": 1.3940227, "balance_loss_mlp": 0.47318375, "epoch": 0.05681647377123102, "flos": 23114945111040.0, "grad_norm": 1.9456023418220305, "language_loss": 0.86061525, "learning_rate": 3.992461825426906e-06, "loss": 0.88514102, "num_input_tokens_seen": 20203830, "router_z_loss_clip": 5.46484375, "router_z_loss_mlp": 0.39306641, "step": 945, "time_per_iteration": 2.6823153495788574 }, { "auxiliary_loss_clip": 0.01955139, "auxiliary_loss_mlp": 0.00527639, "balance_loss_clip": 1.40367174, "balance_loss_mlp": 0.48694074, "epoch": 0.056876597023898995, "flos": 16070528505600.0, "grad_norm": 11.26918952530046, "language_loss": 0.91534138, "learning_rate": 3.992428005427252e-06, "loss": 0.94016922, "num_input_tokens_seen": 20220365, "router_z_loss_clip": 5.515625, "router_z_loss_mlp": 0.40673828, "step": 946, "time_per_iteration": 2.5735859870910645 }, { "auxiliary_loss_clip": 0.01980308, "auxiliary_loss_mlp": 0.0060074, "balance_loss_clip": 1.40488029, "balance_loss_mlp": 0.55305636, "epoch": 0.05693672027656696, "flos": 16835641130880.0, "grad_norm": 35.46331054096292, "language_loss": 0.86846191, "learning_rate": 3.992394109874529e-06, "loss": 0.89427245, "num_input_tokens_seen": 20238640, "router_z_loss_clip": 5.75, "router_z_loss_mlp": 0.47705078, "step": 947, "time_per_iteration": 4.076129674911499 }, { "auxiliary_loss_clip": 0.01981743, "auxiliary_loss_mlp": 0.00580601, "balance_loss_clip": 1.4105978, "balance_loss_mlp": 0.53503931, "epoch": 0.05699684352923493, "flos": 21389113370880.0, "grad_norm": 11.58105383219656, "language_loss": 0.93177497, "learning_rate": 3.9923601387700225e-06, "loss": 0.95739841, "num_input_tokens_seen": 20251025, "router_z_loss_clip": 5.71484375, "router_z_loss_mlp": 0.45581055, "step": 948, "time_per_iteration": 4.054994106292725 }, { "auxiliary_loss_clip": 0.01983255, "auxiliary_loss_mlp": 0.00587047, "balance_loss_clip": 1.40292501, "balance_loss_mlp": 0.5408653, "epoch": 0.057056966781902904, "flos": 15560309767680.0, "grad_norm": 404.44655829120086, "language_loss": 0.9347614, "learning_rate": 3.992326092115019e-06, "loss": 0.96046448, "num_input_tokens_seen": 20269775, "router_z_loss_clip": 5.80078125, "router_z_loss_mlp": 0.46166992, "step": 949, "time_per_iteration": 2.6716415882110596 }, { "auxiliary_loss_clip": 0.01940038, "auxiliary_loss_mlp": 0.00524098, "balance_loss_clip": 1.3876574, "balance_loss_mlp": 0.48647529, "epoch": 0.05711709003457087, "flos": 19937856170880.0, "grad_norm": 3.980106878883666, "language_loss": 0.84345758, "learning_rate": 3.992291969910811e-06, "loss": 0.86809897, "num_input_tokens_seen": 20287715, "router_z_loss_clip": 5.52734375, "router_z_loss_mlp": 0.3762207, "step": 950, "time_per_iteration": 4.114667177200317 }, { "auxiliary_loss_clip": 0.0189358, "auxiliary_loss_mlp": 0.00569942, "balance_loss_clip": 1.37363362, "balance_loss_mlp": 0.52852905, "epoch": 0.05717721328723884, "flos": 30332701774080.0, "grad_norm": 33.808354044603604, "language_loss": 0.89216077, "learning_rate": 3.992257772158691e-06, "loss": 0.91679597, "num_input_tokens_seen": 20307070, "router_z_loss_clip": 5.19921875, "router_z_loss_mlp": 0.41381836, "step": 951, "time_per_iteration": 2.7260332107543945 }, { "auxiliary_loss_clip": 0.01855162, "auxiliary_loss_mlp": 0.00603096, "balance_loss_clip": 1.35999322, "balance_loss_mlp": 0.55460155, "epoch": 0.05723733653990681, "flos": 23654358627840.0, "grad_norm": 20.368231199540404, "language_loss": 0.93933797, "learning_rate": 3.992223498859958e-06, "loss": 0.96392053, "num_input_tokens_seen": 20324945, "router_z_loss_clip": 4.953125, "router_z_loss_mlp": 0.48510742, "step": 952, "time_per_iteration": 2.6228275299072266 }, { "auxiliary_loss_clip": 0.01818815, "auxiliary_loss_mlp": 0.00616961, "balance_loss_clip": 1.34017062, "balance_loss_mlp": 0.5678463, "epoch": 0.05729745979257478, "flos": 22055759838720.0, "grad_norm": 63.427552374841255, "language_loss": 0.86905164, "learning_rate": 3.9921891500159084e-06, "loss": 0.89340943, "num_input_tokens_seen": 20346135, "router_z_loss_clip": 4.78125, "router_z_loss_mlp": 0.49072266, "step": 953, "time_per_iteration": 2.664486885070801 }, { "auxiliary_loss_clip": 0.01801376, "auxiliary_loss_mlp": 0.00566432, "balance_loss_clip": 1.33606076, "balance_loss_mlp": 0.52220577, "epoch": 0.05735758304524275, "flos": 19604353368960.0, "grad_norm": 18.7892456439907, "language_loss": 0.94109023, "learning_rate": 3.992154725627848e-06, "loss": 0.96476829, "num_input_tokens_seen": 20364450, "router_z_loss_clip": 4.65234375, "router_z_loss_mlp": 0.44262695, "step": 954, "time_per_iteration": 2.6121699810028076 }, { "auxiliary_loss_clip": 0.01803382, "auxiliary_loss_mlp": 0.0057832, "balance_loss_clip": 1.34093094, "balance_loss_mlp": 0.53573799, "epoch": 0.057417706297910716, "flos": 19099018880640.0, "grad_norm": 4.252796450519734, "language_loss": 0.98213321, "learning_rate": 3.9921202256970804e-06, "loss": 1.00595021, "num_input_tokens_seen": 20383500, "router_z_loss_clip": 4.62109375, "router_z_loss_mlp": 0.42578125, "step": 955, "time_per_iteration": 2.6320037841796875 }, { "auxiliary_loss_clip": 0.01771463, "auxiliary_loss_mlp": 0.00559407, "balance_loss_clip": 1.33174849, "balance_loss_mlp": 0.51677746, "epoch": 0.05747782955057869, "flos": 16654507822080.0, "grad_norm": 3.2785943300703804, "language_loss": 0.95770657, "learning_rate": 3.992085650224914e-06, "loss": 0.98101532, "num_input_tokens_seen": 20400295, "router_z_loss_clip": 4.3984375, "router_z_loss_mlp": 0.42651367, "step": 956, "time_per_iteration": 2.5881903171539307 }, { "auxiliary_loss_clip": 0.01781379, "auxiliary_loss_mlp": 0.00571029, "balance_loss_clip": 1.34326708, "balance_loss_mlp": 0.52987814, "epoch": 0.05753795280324665, "flos": 14502058248960.0, "grad_norm": 2.4355064332758043, "language_loss": 0.8229996, "learning_rate": 3.99205099921266e-06, "loss": 0.8465237, "num_input_tokens_seen": 20419085, "router_z_loss_clip": 4.38671875, "router_z_loss_mlp": 0.41162109, "step": 957, "time_per_iteration": 2.6059138774871826 }, { "auxiliary_loss_clip": 0.01737133, "auxiliary_loss_mlp": 0.0056011, "balance_loss_clip": 1.31161427, "balance_loss_mlp": 0.51724207, "epoch": 0.057598076055914625, "flos": 18076318848000.0, "grad_norm": 16.027683514065775, "language_loss": 0.85971862, "learning_rate": 3.992016272661633e-06, "loss": 0.88269103, "num_input_tokens_seen": 20437465, "router_z_loss_clip": 4.25390625, "router_z_loss_mlp": 0.42871094, "step": 958, "time_per_iteration": 2.602760076522827 }, { "auxiliary_loss_clip": 0.01721829, "auxiliary_loss_mlp": 0.00526263, "balance_loss_clip": 1.30836928, "balance_loss_mlp": 0.48787761, "epoch": 0.0576581993085826, "flos": 22124600254080.0, "grad_norm": 2.7762420468421043, "language_loss": 0.94987857, "learning_rate": 3.99198147057315e-06, "loss": 0.97235948, "num_input_tokens_seen": 20456235, "router_z_loss_clip": 4.1328125, "router_z_loss_mlp": 0.3840332, "step": 959, "time_per_iteration": 2.6261825561523438 }, { "auxiliary_loss_clip": 0.01708294, "auxiliary_loss_mlp": 0.00529597, "balance_loss_clip": 1.30080438, "balance_loss_mlp": 0.49147382, "epoch": 0.05771832256125056, "flos": 33181746779520.0, "grad_norm": 6.599927021264802, "language_loss": 0.86910421, "learning_rate": 3.991946592948529e-06, "loss": 0.89148307, "num_input_tokens_seen": 20476825, "router_z_loss_clip": 4.07421875, "router_z_loss_mlp": 0.38110352, "step": 960, "time_per_iteration": 2.7559149265289307 }, { "auxiliary_loss_clip": 0.01662059, "auxiliary_loss_mlp": 0.0053954, "balance_loss_clip": 1.26772141, "balance_loss_mlp": 0.49686259, "epoch": 0.057778445813918534, "flos": 24170143973760.0, "grad_norm": 11.22024783925195, "language_loss": 1.00127459, "learning_rate": 3.991911639789094e-06, "loss": 1.02329051, "num_input_tokens_seen": 20496965, "router_z_loss_clip": 3.94335938, "router_z_loss_mlp": 0.42651367, "step": 961, "time_per_iteration": 2.670612096786499 }, { "auxiliary_loss_clip": 0.01656915, "auxiliary_loss_mlp": 0.00532961, "balance_loss_clip": 1.26074672, "balance_loss_mlp": 0.49033153, "epoch": 0.0578385690665865, "flos": 29643037666560.0, "grad_norm": 15.370856970853373, "language_loss": 0.78111768, "learning_rate": 3.991876611096169e-06, "loss": 0.80301648, "num_input_tokens_seen": 20518035, "router_z_loss_clip": 3.96289062, "router_z_loss_mlp": 0.42602539, "step": 962, "time_per_iteration": 2.74819016456604 }, { "auxiliary_loss_clip": 0.01650034, "auxiliary_loss_mlp": 0.00500616, "balance_loss_clip": 1.24973655, "balance_loss_mlp": 0.46170652, "epoch": 0.05789869231925447, "flos": 20885430908160.0, "grad_norm": 6.66375047375855, "language_loss": 0.95622104, "learning_rate": 3.991841506871084e-06, "loss": 0.97772753, "num_input_tokens_seen": 20534740, "router_z_loss_clip": 4.00585938, "router_z_loss_mlp": 0.38891602, "step": 963, "time_per_iteration": 2.6615824699401855 }, { "auxiliary_loss_clip": 0.01695989, "auxiliary_loss_mlp": 0.00518673, "balance_loss_clip": 1.2750535, "balance_loss_mlp": 0.47921515, "epoch": 0.057958815571922444, "flos": 26031106679040.0, "grad_norm": 26.12666491258496, "language_loss": 0.95553809, "learning_rate": 3.99180632711517e-06, "loss": 0.97768474, "num_input_tokens_seen": 20553485, "router_z_loss_clip": 4.20898438, "router_z_loss_mlp": 0.39477539, "step": 964, "time_per_iteration": 2.769488573074341 }, { "auxiliary_loss_clip": 0.01697784, "auxiliary_loss_mlp": 0.00475586, "balance_loss_clip": 1.2676332, "balance_loss_mlp": 0.43793941, "epoch": 0.05801893882459041, "flos": 18077683564800.0, "grad_norm": 10.22613285020329, "language_loss": 0.86878109, "learning_rate": 3.99177107182976e-06, "loss": 0.89051479, "num_input_tokens_seen": 20572155, "router_z_loss_clip": 4.30664062, "router_z_loss_mlp": 0.37646484, "step": 965, "time_per_iteration": 2.648366689682007 }, { "auxiliary_loss_clip": 0.01722978, "auxiliary_loss_mlp": 0.00475942, "balance_loss_clip": 1.27746153, "balance_loss_mlp": 0.43629289, "epoch": 0.05807906207725838, "flos": 17748885444480.0, "grad_norm": 6.717538864036319, "language_loss": 0.91474187, "learning_rate": 3.99173574101619e-06, "loss": 0.9367311, "num_input_tokens_seen": 20590395, "router_z_loss_clip": 4.4609375, "router_z_loss_mlp": 0.39648438, "step": 966, "time_per_iteration": 2.6468584537506104 }, { "auxiliary_loss_clip": 0.01767904, "auxiliary_loss_mlp": 0.00444916, "balance_loss_clip": 1.3063904, "balance_loss_mlp": 0.40784156, "epoch": 0.058139185329926346, "flos": 18040372312320.0, "grad_norm": 2.276564976747255, "language_loss": 0.85453457, "learning_rate": 3.9917003346758035e-06, "loss": 0.87666285, "num_input_tokens_seen": 20608435, "router_z_loss_clip": 4.61328125, "router_z_loss_mlp": 0.37060547, "step": 967, "time_per_iteration": 2.631924867630005 }, { "auxiliary_loss_clip": 0.01416589, "auxiliary_loss_mlp": 0.00183514, "balance_loss_clip": 1.14752614, "balance_loss_mlp": 0.15356825, "epoch": 0.05819930858259432, "flos": 62363297485440.0, "grad_norm": 1.0172049194584312, "language_loss": 0.57393897, "learning_rate": 3.991664852809939e-06, "loss": 0.58994001, "num_input_tokens_seen": 20668575, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.29882812, "step": 968, "time_per_iteration": 3.1238908767700195 }, { "auxiliary_loss_clip": 0.01737193, "auxiliary_loss_mlp": 0.00442892, "balance_loss_clip": 1.28753257, "balance_loss_mlp": 0.40333885, "epoch": 0.05825943183526229, "flos": 19135360465920.0, "grad_norm": 84.4478735663587, "language_loss": 0.89742708, "learning_rate": 3.991629295419945e-06, "loss": 0.91922796, "num_input_tokens_seen": 20687355, "router_z_loss_clip": 4.4921875, "router_z_loss_mlp": 0.39575195, "step": 969, "time_per_iteration": 2.628114938735962 }, { "auxiliary_loss_clip": 0.01760266, "auxiliary_loss_mlp": 0.00419313, "balance_loss_clip": 1.29520631, "balance_loss_mlp": 0.37842435, "epoch": 0.058319555087930255, "flos": 29022465369600.0, "grad_norm": 49.0195720245925, "language_loss": 0.85831594, "learning_rate": 3.991593662507167e-06, "loss": 0.88011169, "num_input_tokens_seen": 20705710, "router_z_loss_clip": 4.65234375, "router_z_loss_mlp": 0.40893555, "step": 970, "time_per_iteration": 2.6714987754821777 }, { "auxiliary_loss_clip": 0.01749526, "auxiliary_loss_mlp": 0.00430922, "balance_loss_clip": 1.29003549, "balance_loss_mlp": 0.39015213, "epoch": 0.05837967834059823, "flos": 18879999701760.0, "grad_norm": 31.163886959858, "language_loss": 1.02563953, "learning_rate": 3.991557954072958e-06, "loss": 1.04744399, "num_input_tokens_seen": 20722405, "router_z_loss_clip": 4.59765625, "router_z_loss_mlp": 0.40771484, "step": 971, "time_per_iteration": 2.603003978729248 }, { "auxiliary_loss_clip": 0.01708068, "auxiliary_loss_mlp": 0.00412253, "balance_loss_clip": 1.27216148, "balance_loss_mlp": 0.37343812, "epoch": 0.05843980159326619, "flos": 25703062744320.0, "grad_norm": 9.548571456596441, "language_loss": 0.93512803, "learning_rate": 3.991522170118673e-06, "loss": 0.95633131, "num_input_tokens_seen": 20741480, "router_z_loss_clip": 4.35546875, "router_z_loss_mlp": 0.38793945, "step": 972, "time_per_iteration": 2.699547529220581 }, { "auxiliary_loss_clip": 0.01816696, "auxiliary_loss_mlp": 0.00406327, "balance_loss_clip": 1.32280707, "balance_loss_mlp": 0.36860967, "epoch": 0.058499924845934165, "flos": 25552129795200.0, "grad_norm": 58.4696183899547, "language_loss": 0.95317572, "learning_rate": 3.991486310645667e-06, "loss": 0.97540593, "num_input_tokens_seen": 20759685, "router_z_loss_clip": 4.94140625, "router_z_loss_mlp": 0.37719727, "step": 973, "time_per_iteration": 2.6980061531066895 }, { "auxiliary_loss_clip": 0.01772396, "auxiliary_loss_mlp": 0.00413902, "balance_loss_clip": 1.30503571, "balance_loss_mlp": 0.3743009, "epoch": 0.05856004809860214, "flos": 16436171001600.0, "grad_norm": 10.840408437692407, "language_loss": 0.82452202, "learning_rate": 3.991450375655301e-06, "loss": 0.846385, "num_input_tokens_seen": 20778180, "router_z_loss_clip": 4.66796875, "router_z_loss_mlp": 0.39575195, "step": 974, "time_per_iteration": 2.5982847213745117 }, { "auxiliary_loss_clip": 0.01781646, "auxiliary_loss_mlp": 0.0039081, "balance_loss_clip": 1.3084681, "balance_loss_mlp": 0.3534497, "epoch": 0.0586201713512701, "flos": 39458824116480.0, "grad_norm": 17.991482705299635, "language_loss": 0.82773483, "learning_rate": 3.991414365148936e-06, "loss": 0.84945941, "num_input_tokens_seen": 20802705, "router_z_loss_clip": 4.73828125, "router_z_loss_mlp": 0.37353516, "step": 975, "time_per_iteration": 2.8071060180664062 }, { "auxiliary_loss_clip": 0.0178021, "auxiliary_loss_mlp": 0.00444642, "balance_loss_clip": 1.31565917, "balance_loss_mlp": 0.40110654, "epoch": 0.058680294603938074, "flos": 23365170230400.0, "grad_norm": 5.639986831676729, "language_loss": 0.8510282, "learning_rate": 3.99137827912794e-06, "loss": 0.87327671, "num_input_tokens_seen": 20822540, "router_z_loss_clip": 4.6484375, "router_z_loss_mlp": 0.43530273, "step": 976, "time_per_iteration": 2.6275224685668945 }, { "auxiliary_loss_clip": 0.01791208, "auxiliary_loss_mlp": 0.00419549, "balance_loss_clip": 1.31809402, "balance_loss_mlp": 0.38104409, "epoch": 0.05874041785660604, "flos": 32232017226240.0, "grad_norm": 2.468698548563877, "language_loss": 0.92671204, "learning_rate": 3.991342117593679e-06, "loss": 0.94881964, "num_input_tokens_seen": 20844175, "router_z_loss_clip": 4.73046875, "router_z_loss_mlp": 0.38525391, "step": 977, "time_per_iteration": 2.7718665599823 }, { "auxiliary_loss_clip": 0.01753582, "auxiliary_loss_mlp": 0.00412122, "balance_loss_clip": 1.30041885, "balance_loss_mlp": 0.37163812, "epoch": 0.05880054110927401, "flos": 22310043194880.0, "grad_norm": 14.809127019196689, "language_loss": 0.8682735, "learning_rate": 3.991305880547527e-06, "loss": 0.88993055, "num_input_tokens_seen": 20864730, "router_z_loss_clip": 4.52734375, "router_z_loss_mlp": 0.40478516, "step": 978, "time_per_iteration": 2.6365275382995605 }, { "auxiliary_loss_clip": 0.01774499, "auxiliary_loss_mlp": 0.00419181, "balance_loss_clip": 1.30926323, "balance_loss_mlp": 0.37800649, "epoch": 0.05886066436194198, "flos": 27380450016000.0, "grad_norm": 10.21025917633754, "language_loss": 0.8702389, "learning_rate": 3.991269567990855e-06, "loss": 0.89217579, "num_input_tokens_seen": 20885200, "router_z_loss_clip": 4.66015625, "router_z_loss_mlp": 0.41137695, "step": 979, "time_per_iteration": 2.708183526992798 }, { "auxiliary_loss_clip": 0.01398857, "auxiliary_loss_mlp": 0.00181478, "balance_loss_clip": 1.0927906, "balance_loss_mlp": 0.16183192, "epoch": 0.05892078761460995, "flos": 59584493525760.0, "grad_norm": 1.0912572623344694, "language_loss": 0.59258795, "learning_rate": 3.9912331799250415e-06, "loss": 0.60839128, "num_input_tokens_seen": 20940325, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.19628906, "step": 980, "time_per_iteration": 3.055863618850708 }, { "auxiliary_loss_clip": 0.01762895, "auxiliary_loss_mlp": 0.00374049, "balance_loss_clip": 1.30756664, "balance_loss_mlp": 0.3351391, "epoch": 0.05898091086727792, "flos": 15414081500160.0, "grad_norm": 5.982337744485506, "language_loss": 0.94057399, "learning_rate": 3.9911967163514665e-06, "loss": 0.96194345, "num_input_tokens_seen": 20958220, "router_z_loss_clip": 4.55078125, "router_z_loss_mlp": 0.38891602, "step": 981, "time_per_iteration": 2.6406936645507812 }, { "auxiliary_loss_clip": 0.01732501, "auxiliary_loss_mlp": 0.00361432, "balance_loss_clip": 1.28992009, "balance_loss_mlp": 0.32416725, "epoch": 0.059041034119945886, "flos": 23655328295040.0, "grad_norm": 3.499936463769145, "language_loss": 0.85492826, "learning_rate": 3.991160177271513e-06, "loss": 0.87586755, "num_input_tokens_seen": 20978920, "router_z_loss_clip": 4.42578125, "router_z_loss_mlp": 0.37280273, "step": 982, "time_per_iteration": 2.6601898670196533 }, { "auxiliary_loss_clip": 0.01714556, "auxiliary_loss_mlp": 0.00349984, "balance_loss_clip": 1.28025103, "balance_loss_mlp": 0.30945295, "epoch": 0.05910115737261386, "flos": 24754087376640.0, "grad_norm": 31.506532804059844, "language_loss": 0.93357801, "learning_rate": 3.9911235626865654e-06, "loss": 0.95422339, "num_input_tokens_seen": 20999490, "router_z_loss_clip": 4.34375, "router_z_loss_mlp": 0.40527344, "step": 983, "time_per_iteration": 2.6741371154785156 }, { "auxiliary_loss_clip": 0.01645633, "auxiliary_loss_mlp": 0.00340075, "balance_loss_clip": 1.25016356, "balance_loss_mlp": 0.30266684, "epoch": 0.05916128062528183, "flos": 11728749070080.0, "grad_norm": 37.933988772030695, "language_loss": 0.92243147, "learning_rate": 3.9910868725980125e-06, "loss": 0.94228852, "num_input_tokens_seen": 21017865, "router_z_loss_clip": 3.95117188, "router_z_loss_mlp": 0.37402344, "step": 984, "time_per_iteration": 2.619649648666382 }, { "auxiliary_loss_clip": 0.01659682, "auxiliary_loss_mlp": 0.00343236, "balance_loss_clip": 1.25745797, "balance_loss_mlp": 0.307569, "epoch": 0.059221403877949795, "flos": 21902995296000.0, "grad_norm": 782.7973208884571, "language_loss": 0.85744202, "learning_rate": 3.9910501070072465e-06, "loss": 0.87747121, "num_input_tokens_seen": 21035900, "router_z_loss_clip": 4.01367188, "router_z_loss_mlp": 0.35644531, "step": 985, "time_per_iteration": 2.6285126209259033 }, { "auxiliary_loss_clip": 0.01678796, "auxiliary_loss_mlp": 0.00328507, "balance_loss_clip": 1.26990747, "balance_loss_mlp": 0.29305398, "epoch": 0.05928152713061777, "flos": 20514580940160.0, "grad_norm": 3.7945138102849816, "language_loss": 0.97963524, "learning_rate": 3.991013265915661e-06, "loss": 0.99970829, "num_input_tokens_seen": 21053235, "router_z_loss_clip": 4.08789062, "router_z_loss_mlp": 0.35424805, "step": 986, "time_per_iteration": 2.703303813934326 }, { "auxiliary_loss_clip": 0.01665464, "auxiliary_loss_mlp": 0.0032005, "balance_loss_clip": 1.26600242, "balance_loss_mlp": 0.28290477, "epoch": 0.05934165038328574, "flos": 24495135252480.0, "grad_norm": 2.89219661127288, "language_loss": 0.83994293, "learning_rate": 3.9909763493246525e-06, "loss": 0.85979807, "num_input_tokens_seen": 21073090, "router_z_loss_clip": 3.99023438, "router_z_loss_mlp": 0.37182617, "step": 987, "time_per_iteration": 2.682426929473877 }, { "auxiliary_loss_clip": 0.01650861, "auxiliary_loss_mlp": 0.00330608, "balance_loss_clip": 1.25947285, "balance_loss_mlp": 0.29226989, "epoch": 0.059401773635953704, "flos": 38728041914880.0, "grad_norm": 4.113118901959257, "language_loss": 0.80119324, "learning_rate": 3.990939357235621e-06, "loss": 0.82100797, "num_input_tokens_seen": 21094895, "router_z_loss_clip": 3.91015625, "router_z_loss_mlp": 0.38354492, "step": 988, "time_per_iteration": 2.8041887283325195 }, { "auxiliary_loss_clip": 0.01358978, "auxiliary_loss_mlp": 0.00115667, "balance_loss_clip": 1.10455847, "balance_loss_mlp": 0.10050378, "epoch": 0.059461896888621676, "flos": 58023565125120.0, "grad_norm": 1.154805644810666, "language_loss": 0.71360099, "learning_rate": 3.99090228964997e-06, "loss": 0.72834748, "num_input_tokens_seen": 21147555, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.15136719, "step": 989, "time_per_iteration": 2.980240821838379 }, { "auxiliary_loss_clip": 0.01626309, "auxiliary_loss_mlp": 0.00322504, "balance_loss_clip": 1.25046539, "balance_loss_mlp": 0.28540596, "epoch": 0.05952202014128964, "flos": 22127760650880.0, "grad_norm": 16.816813001840316, "language_loss": 0.89332926, "learning_rate": 3.990865146569105e-06, "loss": 0.91281736, "num_input_tokens_seen": 21167845, "router_z_loss_clip": 3.76171875, "router_z_loss_mlp": 0.37109375, "step": 990, "time_per_iteration": 4.209578514099121 }, { "auxiliary_loss_clip": 0.01620381, "auxiliary_loss_mlp": 0.00347667, "balance_loss_clip": 1.24760878, "balance_loss_mlp": 0.30961585, "epoch": 0.059582143393957614, "flos": 20445776438400.0, "grad_norm": 2.225494722717274, "language_loss": 0.91901922, "learning_rate": 3.990827927994434e-06, "loss": 0.93869972, "num_input_tokens_seen": 21185085, "router_z_loss_clip": 3.72460938, "router_z_loss_mlp": 0.38061523, "step": 991, "time_per_iteration": 4.055104970932007 }, { "auxiliary_loss_clip": 0.01647733, "auxiliary_loss_mlp": 0.00378909, "balance_loss_clip": 1.265311, "balance_loss_mlp": 0.34102407, "epoch": 0.059642266646625586, "flos": 20594877793920.0, "grad_norm": 87.50052918852928, "language_loss": 0.85411024, "learning_rate": 3.9907906339273674e-06, "loss": 0.87437665, "num_input_tokens_seen": 21204230, "router_z_loss_clip": 3.83007812, "router_z_loss_mlp": 0.37841797, "step": 992, "time_per_iteration": 2.6310813426971436 }, { "auxiliary_loss_clip": 0.01637106, "auxiliary_loss_mlp": 0.00324015, "balance_loss_clip": 1.26657367, "balance_loss_mlp": 0.28548694, "epoch": 0.05970238989929355, "flos": 19352655792000.0, "grad_norm": 13.423008164917546, "language_loss": 0.85813022, "learning_rate": 3.9907532643693215e-06, "loss": 0.8777414, "num_input_tokens_seen": 21222655, "router_z_loss_clip": 3.70703125, "router_z_loss_mlp": 0.38549805, "step": 993, "time_per_iteration": 4.132798194885254 }, { "auxiliary_loss_clip": 0.0166116, "auxiliary_loss_mlp": 0.0036086, "balance_loss_clip": 1.29145288, "balance_loss_mlp": 0.32609814, "epoch": 0.05976251315196152, "flos": 30264040926720.0, "grad_norm": 2.9596703954082884, "language_loss": 0.87511587, "learning_rate": 3.990715819321712e-06, "loss": 0.89533603, "num_input_tokens_seen": 21242310, "router_z_loss_clip": 3.69335938, "router_z_loss_mlp": 0.34716797, "step": 994, "time_per_iteration": 2.7761309146881104 }, { "auxiliary_loss_clip": 0.01681864, "auxiliary_loss_mlp": 0.0032291, "balance_loss_clip": 1.30787945, "balance_loss_mlp": 0.28299823, "epoch": 0.05982263640462949, "flos": 23185150243200.0, "grad_norm": 16.813937286339197, "language_loss": 0.87366414, "learning_rate": 3.99067829878596e-06, "loss": 0.89371192, "num_input_tokens_seen": 21261410, "router_z_loss_clip": 3.73828125, "router_z_loss_mlp": 0.39941406, "step": 995, "time_per_iteration": 2.6131067276000977 }, { "auxiliary_loss_clip": 0.01706365, "auxiliary_loss_mlp": 0.00338826, "balance_loss_clip": 1.33180904, "balance_loss_mlp": 0.29970169, "epoch": 0.05988275965729746, "flos": 27850879463040.0, "grad_norm": 24.819074816642743, "language_loss": 0.94176435, "learning_rate": 3.990640702763487e-06, "loss": 0.96221626, "num_input_tokens_seen": 21280080, "router_z_loss_clip": 3.74804688, "router_z_loss_mlp": 0.39111328, "step": 996, "time_per_iteration": 2.679871082305908 }, { "auxiliary_loss_clip": 0.01731905, "auxiliary_loss_mlp": 0.00359559, "balance_loss_clip": 1.35744858, "balance_loss_mlp": 0.32031548, "epoch": 0.05994288290996543, "flos": 24680003575680.0, "grad_norm": 21.222392055358505, "language_loss": 0.97970444, "learning_rate": 3.990603031255718e-06, "loss": 1.00061905, "num_input_tokens_seen": 21296765, "router_z_loss_clip": 3.74804688, "router_z_loss_mlp": 0.39233398, "step": 997, "time_per_iteration": 2.6330554485321045 }, { "auxiliary_loss_clip": 0.01633977, "auxiliary_loss_mlp": 0.00150513, "balance_loss_clip": 1.38874149, "balance_loss_mlp": 0.12752967, "epoch": 0.0600030061626334, "flos": 69929568835200.0, "grad_norm": 1.0551008262270325, "language_loss": 0.75483477, "learning_rate": 3.990565284264083e-06, "loss": 0.77267969, "num_input_tokens_seen": 21363345, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.22949219, "step": 998, "time_per_iteration": 3.203890562057495 }, { "auxiliary_loss_clip": 0.01776175, "auxiliary_loss_mlp": 0.00326555, "balance_loss_clip": 1.40509033, "balance_loss_mlp": 0.29024369, "epoch": 0.06006312941530137, "flos": 26540140268160.0, "grad_norm": 106.00589759352613, "language_loss": 0.83826053, "learning_rate": 3.990527461790013e-06, "loss": 0.85928786, "num_input_tokens_seen": 21385290, "router_z_loss_clip": 3.71289062, "router_z_loss_mlp": 0.36303711, "step": 999, "time_per_iteration": 2.685818672180176 }, { "auxiliary_loss_clip": 0.01822776, "auxiliary_loss_mlp": 0.00345271, "balance_loss_clip": 1.41901135, "balance_loss_mlp": 0.30252233, "epoch": 0.060123252667969335, "flos": 27344000689920.0, "grad_norm": 3.2307546122738313, "language_loss": 0.87548482, "learning_rate": 3.990489563834943e-06, "loss": 0.89716524, "num_input_tokens_seen": 21407625, "router_z_loss_clip": 4.03710938, "router_z_loss_mlp": 0.42724609, "step": 1000, "time_per_iteration": 2.6541972160339355 }, { "auxiliary_loss_clip": 0.01842635, "auxiliary_loss_mlp": 0.00370844, "balance_loss_clip": 1.43991327, "balance_loss_mlp": 0.32881033, "epoch": 0.06018337592063731, "flos": 27016710940800.0, "grad_norm": 39.40998100797067, "language_loss": 0.93547332, "learning_rate": 3.990451590400309e-06, "loss": 0.9576081, "num_input_tokens_seen": 21426835, "router_z_loss_clip": 4.02148438, "router_z_loss_mlp": 0.42041016, "step": 1001, "time_per_iteration": 2.6802408695220947 }, { "auxiliary_loss_clip": 0.01827205, "auxiliary_loss_mlp": 0.00325694, "balance_loss_clip": 1.43629336, "balance_loss_mlp": 0.28895375, "epoch": 0.06024349917330528, "flos": 25592960580480.0, "grad_norm": 16.304013767085294, "language_loss": 0.80052364, "learning_rate": 3.990413541487551e-06, "loss": 0.8220526, "num_input_tokens_seen": 21444920, "router_z_loss_clip": 3.91015625, "router_z_loss_mlp": 0.3671875, "step": 1002, "time_per_iteration": 2.637951374053955 }, { "auxiliary_loss_clip": 0.01878831, "auxiliary_loss_mlp": 0.00368687, "balance_loss_clip": 1.45997667, "balance_loss_mlp": 0.32746488, "epoch": 0.060303622425973244, "flos": 26133271937280.0, "grad_norm": 143.03357346808386, "language_loss": 0.84405422, "learning_rate": 3.990375417098112e-06, "loss": 0.86652941, "num_input_tokens_seen": 21463555, "router_z_loss_clip": 4.1953125, "router_z_loss_mlp": 0.41210938, "step": 1003, "time_per_iteration": 2.694754123687744 }, { "auxiliary_loss_clip": 0.01843627, "auxiliary_loss_mlp": 0.00340702, "balance_loss_clip": 1.43888187, "balance_loss_mlp": 0.29781002, "epoch": 0.060363745678641216, "flos": 20377187418240.0, "grad_norm": 25.42022466348677, "language_loss": 0.79460704, "learning_rate": 3.990337217233437e-06, "loss": 0.81645036, "num_input_tokens_seen": 21481990, "router_z_loss_clip": 4.05273438, "router_z_loss_mlp": 0.42895508, "step": 1004, "time_per_iteration": 2.6466543674468994 }, { "auxiliary_loss_clip": 0.01877194, "auxiliary_loss_mlp": 0.00360124, "balance_loss_clip": 1.45792222, "balance_loss_mlp": 0.31840116, "epoch": 0.06042386893130918, "flos": 17749172753280.0, "grad_norm": 8.376134772046267, "language_loss": 0.91994119, "learning_rate": 3.990298941894976e-06, "loss": 0.94231427, "num_input_tokens_seen": 21500385, "router_z_loss_clip": 4.19140625, "router_z_loss_mlp": 0.41748047, "step": 1005, "time_per_iteration": 2.634685754776001 }, { "auxiliary_loss_clip": 0.01538009, "auxiliary_loss_mlp": 0.00219056, "balance_loss_clip": 1.3119669, "balance_loss_mlp": 0.20093589, "epoch": 0.06048399218397715, "flos": 68538496872960.0, "grad_norm": 43.177648572100075, "language_loss": 0.58819586, "learning_rate": 3.9902605910841794e-06, "loss": 0.60576648, "num_input_tokens_seen": 21561040, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.18164062, "step": 1006, "time_per_iteration": 3.2165729999542236 }, { "auxiliary_loss_clip": 0.01868401, "auxiliary_loss_mlp": 0.00360045, "balance_loss_clip": 1.45409334, "balance_loss_mlp": 0.32065862, "epoch": 0.060544115436645125, "flos": 23258515772160.0, "grad_norm": 2.4604411269669657, "language_loss": 0.82523763, "learning_rate": 3.990222164802503e-06, "loss": 0.84752208, "num_input_tokens_seen": 21580655, "router_z_loss_clip": 4.13671875, "router_z_loss_mlp": 0.39404297, "step": 1007, "time_per_iteration": 2.6570353507995605 }, { "auxiliary_loss_clip": 0.01896049, "auxiliary_loss_mlp": 0.00382109, "balance_loss_clip": 1.46747708, "balance_loss_mlp": 0.33752495, "epoch": 0.06060423868931309, "flos": 23878441624320.0, "grad_norm": 4.400931930487963, "language_loss": 0.88007832, "learning_rate": 3.9901836630514006e-06, "loss": 0.90285981, "num_input_tokens_seen": 21599650, "router_z_loss_clip": 4.28515625, "router_z_loss_mlp": 0.44604492, "step": 1008, "time_per_iteration": 2.670351505279541 }, { "auxiliary_loss_clip": 0.01879731, "auxiliary_loss_mlp": 0.00352687, "balance_loss_clip": 1.47013474, "balance_loss_mlp": 0.31420663, "epoch": 0.06066436194198106, "flos": 18728061171840.0, "grad_norm": 6.58875470930951, "language_loss": 0.86297059, "learning_rate": 3.990145085832335e-06, "loss": 0.8852948, "num_input_tokens_seen": 21617550, "router_z_loss_clip": 4.0859375, "router_z_loss_mlp": 0.38476562, "step": 1009, "time_per_iteration": 2.6222121715545654 }, { "auxiliary_loss_clip": 0.01850553, "auxiliary_loss_mlp": 0.00341223, "balance_loss_clip": 1.45428109, "balance_loss_mlp": 0.30124068, "epoch": 0.06072448519464903, "flos": 24640465680000.0, "grad_norm": 279.7912568692669, "language_loss": 0.98131657, "learning_rate": 3.990106433146769e-06, "loss": 1.00323427, "num_input_tokens_seen": 21635865, "router_z_loss_clip": 3.95898438, "router_z_loss_mlp": 0.39990234, "step": 1010, "time_per_iteration": 2.696638822555542 }, { "auxiliary_loss_clip": 0.01873696, "auxiliary_loss_mlp": 0.00431112, "balance_loss_clip": 1.44980359, "balance_loss_mlp": 0.3839286, "epoch": 0.060784608447317, "flos": 17378825575680.0, "grad_norm": 223.97511486786448, "language_loss": 0.81750584, "learning_rate": 3.9900677049961665e-06, "loss": 0.84055394, "num_input_tokens_seen": 21653945, "router_z_loss_clip": 4.24609375, "router_z_loss_mlp": 0.47241211, "step": 1011, "time_per_iteration": 2.600973129272461 }, { "auxiliary_loss_clip": 0.01853479, "auxiliary_loss_mlp": 0.00393421, "balance_loss_clip": 1.45569372, "balance_loss_mlp": 0.35110182, "epoch": 0.06084473169998497, "flos": 23692208584320.0, "grad_norm": 2.6030797472411518, "language_loss": 0.93187892, "learning_rate": 3.990028901381999e-06, "loss": 0.95434785, "num_input_tokens_seen": 21671230, "router_z_loss_clip": 3.9765625, "router_z_loss_mlp": 0.4230957, "step": 1012, "time_per_iteration": 2.7041497230529785 }, { "auxiliary_loss_clip": 0.01877217, "auxiliary_loss_mlp": 0.00416363, "balance_loss_clip": 1.45679975, "balance_loss_mlp": 0.371373, "epoch": 0.06090485495265294, "flos": 23546339452800.0, "grad_norm": 66.06224628510309, "language_loss": 0.83260453, "learning_rate": 3.989990022305734e-06, "loss": 0.85554034, "num_input_tokens_seen": 21691155, "router_z_loss_clip": 4.20898438, "router_z_loss_mlp": 0.44995117, "step": 1013, "time_per_iteration": 2.636848211288452 }, { "auxiliary_loss_clip": 0.01913739, "auxiliary_loss_mlp": 0.00429934, "balance_loss_clip": 1.47620547, "balance_loss_mlp": 0.38034272, "epoch": 0.06096497820532091, "flos": 20339301548160.0, "grad_norm": 45.23706536116578, "language_loss": 0.91789669, "learning_rate": 3.98995106776885e-06, "loss": 0.94133341, "num_input_tokens_seen": 21707405, "router_z_loss_clip": 4.37890625, "router_z_loss_mlp": 0.49560547, "step": 1014, "time_per_iteration": 2.6293182373046875 }, { "auxiliary_loss_clip": 0.01870352, "auxiliary_loss_mlp": 0.00425068, "balance_loss_clip": 1.44842851, "balance_loss_mlp": 0.37728938, "epoch": 0.061025101457988874, "flos": 26939035779840.0, "grad_norm": 6.954144846840946, "language_loss": 0.81825733, "learning_rate": 3.98991203777282e-06, "loss": 0.8412115, "num_input_tokens_seen": 21728090, "router_z_loss_clip": 4.22265625, "router_z_loss_mlp": 0.47729492, "step": 1015, "time_per_iteration": 2.696056842803955 }, { "auxiliary_loss_clip": 0.01909917, "auxiliary_loss_mlp": 0.0036301, "balance_loss_clip": 1.47450376, "balance_loss_mlp": 0.3243621, "epoch": 0.061085224710656846, "flos": 25375054723200.0, "grad_norm": 6.526687359081663, "language_loss": 0.85430336, "learning_rate": 3.9898729323191275e-06, "loss": 0.87703264, "num_input_tokens_seen": 21747950, "router_z_loss_clip": 4.359375, "router_z_loss_mlp": 0.38623047, "step": 1016, "time_per_iteration": 2.6829376220703125 }, { "auxiliary_loss_clip": 0.0187294, "auxiliary_loss_mlp": 0.00393178, "balance_loss_clip": 1.45441532, "balance_loss_mlp": 0.34921318, "epoch": 0.06114534796332482, "flos": 24824759385600.0, "grad_norm": 127.25332078933282, "language_loss": 0.81198764, "learning_rate": 3.989833751409254e-06, "loss": 0.83464879, "num_input_tokens_seen": 21767900, "router_z_loss_clip": 4.1875, "router_z_loss_mlp": 0.43994141, "step": 1017, "time_per_iteration": 2.6310696601867676 }, { "auxiliary_loss_clip": 0.01861398, "auxiliary_loss_mlp": 0.0040916, "balance_loss_clip": 1.44158649, "balance_loss_mlp": 0.36054617, "epoch": 0.061205471215992784, "flos": 20631434860800.0, "grad_norm": 46.068224886081566, "language_loss": 0.93144459, "learning_rate": 3.989794495044685e-06, "loss": 0.9541502, "num_input_tokens_seen": 21787375, "router_z_loss_clip": 4.19921875, "router_z_loss_mlp": 0.48608398, "step": 1018, "time_per_iteration": 2.678844690322876 }, { "auxiliary_loss_clip": 0.01886766, "auxiliary_loss_mlp": 0.00373083, "balance_loss_clip": 1.46006417, "balance_loss_mlp": 0.33028725, "epoch": 0.061265594468660756, "flos": 16508351381760.0, "grad_norm": 374.41224165433374, "language_loss": 0.86680174, "learning_rate": 3.989755163226909e-06, "loss": 0.88940024, "num_input_tokens_seen": 21806275, "router_z_loss_clip": 4.26757812, "router_z_loss_mlp": 0.42773438, "step": 1019, "time_per_iteration": 2.6223413944244385 }, { "auxiliary_loss_clip": 0.01902488, "auxiliary_loss_mlp": 0.00392727, "balance_loss_clip": 1.46695912, "balance_loss_mlp": 0.35210082, "epoch": 0.06132571772132872, "flos": 26246211275520.0, "grad_norm": 99.48803234602866, "language_loss": 0.89968264, "learning_rate": 3.989715755957418e-06, "loss": 0.92263484, "num_input_tokens_seen": 21826430, "router_z_loss_clip": 4.35742188, "router_z_loss_mlp": 0.40673828, "step": 1020, "time_per_iteration": 2.66105318069458 }, { "auxiliary_loss_clip": 0.01898763, "auxiliary_loss_mlp": 0.0038324, "balance_loss_clip": 1.46687329, "balance_loss_mlp": 0.34146923, "epoch": 0.06138584097399669, "flos": 37414788768000.0, "grad_norm": 12.244898198062398, "language_loss": 0.85054487, "learning_rate": 3.989676273237705e-06, "loss": 0.87336487, "num_input_tokens_seen": 21847800, "router_z_loss_clip": 4.31640625, "router_z_loss_mlp": 0.41772461, "step": 1021, "time_per_iteration": 2.730564594268799 }, { "auxiliary_loss_clip": 0.01850455, "auxiliary_loss_mlp": 0.00390287, "balance_loss_clip": 1.44380486, "balance_loss_mlp": 0.35209283, "epoch": 0.061445964226664665, "flos": 17420661941760.0, "grad_norm": 27.189028685072607, "language_loss": 0.94254279, "learning_rate": 3.9896367150692705e-06, "loss": 0.96495026, "num_input_tokens_seen": 21863385, "router_z_loss_clip": 4.06640625, "router_z_loss_mlp": 0.38183594, "step": 1022, "time_per_iteration": 2.6192102432250977 }, { "auxiliary_loss_clip": 0.01880958, "auxiliary_loss_mlp": 0.00383711, "balance_loss_clip": 1.4619596, "balance_loss_mlp": 0.34356105, "epoch": 0.06150608747933263, "flos": 22600021691520.0, "grad_norm": 2.8361091294177605, "language_loss": 0.88909113, "learning_rate": 3.989597081453611e-06, "loss": 0.9117378, "num_input_tokens_seen": 21881880, "router_z_loss_clip": 4.1875, "router_z_loss_mlp": 0.40136719, "step": 1023, "time_per_iteration": 2.633117437362671 }, { "auxiliary_loss_clip": 0.01456976, "auxiliary_loss_mlp": 0.00162268, "balance_loss_clip": 1.24416542, "balance_loss_mlp": 0.14948836, "epoch": 0.0615662107320006, "flos": 56741482005120.0, "grad_norm": 0.9635263638814625, "language_loss": 0.64754468, "learning_rate": 3.989557372392231e-06, "loss": 0.66373712, "num_input_tokens_seen": 21940550, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.12792969, "step": 1024, "time_per_iteration": 3.20097017288208 }, { "auxiliary_loss_clip": 0.01861612, "auxiliary_loss_mlp": 0.00395914, "balance_loss_clip": 1.44995832, "balance_loss_mlp": 0.35376108, "epoch": 0.06162633398466857, "flos": 22564793427840.0, "grad_norm": 177.3400539591709, "language_loss": 0.94409627, "learning_rate": 3.989517587886636e-06, "loss": 0.96667153, "num_input_tokens_seen": 21958390, "router_z_loss_clip": 4.11914062, "router_z_loss_mlp": 0.421875, "step": 1025, "time_per_iteration": 2.659351348876953 }, { "auxiliary_loss_clip": 0.01831959, "auxiliary_loss_mlp": 0.00390447, "balance_loss_clip": 1.43562269, "balance_loss_mlp": 0.34798512, "epoch": 0.06168645723733654, "flos": 25593104234880.0, "grad_norm": 3.5224436118804165, "language_loss": 0.89430487, "learning_rate": 3.989477727938335e-06, "loss": 0.91652894, "num_input_tokens_seen": 21978625, "router_z_loss_clip": 3.96289062, "router_z_loss_mlp": 0.42456055, "step": 1026, "time_per_iteration": 2.6625702381134033 }, { "auxiliary_loss_clip": 0.01808867, "auxiliary_loss_mlp": 0.00408159, "balance_loss_clip": 1.41947174, "balance_loss_mlp": 0.3622635, "epoch": 0.06174658049000451, "flos": 15997917162240.0, "grad_norm": 5.570639554946303, "language_loss": 0.88275379, "learning_rate": 3.989437792548839e-06, "loss": 0.90492404, "num_input_tokens_seen": 21996035, "router_z_loss_clip": 3.89453125, "router_z_loss_mlp": 0.45898438, "step": 1027, "time_per_iteration": 2.693840503692627 }, { "auxiliary_loss_clip": 0.01767877, "auxiliary_loss_mlp": 0.00373479, "balance_loss_clip": 1.40484929, "balance_loss_mlp": 0.33084989, "epoch": 0.06180670374267248, "flos": 11285970117120.0, "grad_norm": 11.56938496606523, "language_loss": 0.90706736, "learning_rate": 3.989397781719663e-06, "loss": 0.92848092, "num_input_tokens_seen": 22011625, "router_z_loss_clip": 3.6328125, "router_z_loss_mlp": 0.42602539, "step": 1028, "time_per_iteration": 2.6674892902374268 }, { "auxiliary_loss_clip": 0.01433993, "auxiliary_loss_mlp": 0.00150846, "balance_loss_clip": 1.22681284, "balance_loss_mlp": 0.13568221, "epoch": 0.06186682699534045, "flos": 65130142216320.0, "grad_norm": 1.0901759473695556, "language_loss": 0.60327333, "learning_rate": 3.989357695452323e-06, "loss": 0.61912173, "num_input_tokens_seen": 22066035, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.15136719, "step": 1029, "time_per_iteration": 3.0580804347991943 }, { "auxiliary_loss_clip": 0.01743541, "auxiliary_loss_mlp": 0.00395852, "balance_loss_clip": 1.38180041, "balance_loss_mlp": 0.35105315, "epoch": 0.061926950248008414, "flos": 21105742976640.0, "grad_norm": 9.723089529918502, "language_loss": 0.88497978, "learning_rate": 3.98931753374834e-06, "loss": 0.90637368, "num_input_tokens_seen": 22085015, "router_z_loss_clip": 3.61523438, "router_z_loss_mlp": 0.44775391, "step": 1030, "time_per_iteration": 2.7045562267303467 }, { "auxiliary_loss_clip": 0.01708122, "auxiliary_loss_mlp": 0.00354881, "balance_loss_clip": 1.36777782, "balance_loss_mlp": 0.31427795, "epoch": 0.061987073500676386, "flos": 17748454481280.0, "grad_norm": 10.52847040221949, "language_loss": 0.8979668, "learning_rate": 3.989277296609237e-06, "loss": 0.91859686, "num_input_tokens_seen": 22102775, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.40576172, "step": 1031, "time_per_iteration": 2.7101552486419678 }, { "auxiliary_loss_clip": 0.01717851, "auxiliary_loss_mlp": 0.00378509, "balance_loss_clip": 1.3781321, "balance_loss_mlp": 0.33869347, "epoch": 0.06204719675334436, "flos": 21836237869440.0, "grad_norm": 27.03847660760682, "language_loss": 0.83419991, "learning_rate": 3.98923698403654e-06, "loss": 0.85516351, "num_input_tokens_seen": 22121680, "router_z_loss_clip": 3.3984375, "router_z_loss_mlp": 0.39868164, "step": 1032, "time_per_iteration": 4.214528799057007 }, { "auxiliary_loss_clip": 0.01659865, "auxiliary_loss_mlp": 0.00332538, "balance_loss_clip": 1.33332968, "balance_loss_mlp": 0.29136324, "epoch": 0.06210732000601232, "flos": 19353697286400.0, "grad_norm": 6.631828807688025, "language_loss": 0.9858889, "learning_rate": 3.989196596031776e-06, "loss": 1.00581288, "num_input_tokens_seen": 22138155, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.41210938, "step": 1033, "time_per_iteration": 4.075666189193726 }, { "auxiliary_loss_clip": 0.01685666, "auxiliary_loss_mlp": 0.00342824, "balance_loss_clip": 1.35468292, "balance_loss_mlp": 0.30405715, "epoch": 0.062167443258680295, "flos": 24749382695040.0, "grad_norm": 51.025577116666796, "language_loss": 0.90720797, "learning_rate": 3.989156132596479e-06, "loss": 0.92749286, "num_input_tokens_seen": 22157420, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.38745117, "step": 1034, "time_per_iteration": 2.6525027751922607 }, { "auxiliary_loss_clip": 0.01659377, "auxiliary_loss_mlp": 0.00339913, "balance_loss_clip": 1.34293532, "balance_loss_mlp": 0.30074096, "epoch": 0.06222756651134827, "flos": 34458478773120.0, "grad_norm": 7.819582584557896, "language_loss": 0.87168229, "learning_rate": 3.989115593732182e-06, "loss": 0.89167523, "num_input_tokens_seen": 22178620, "router_z_loss_clip": 3.1640625, "router_z_loss_mlp": 0.3918457, "step": 1035, "time_per_iteration": 4.274803400039673 }, { "auxiliary_loss_clip": 0.01659602, "auxiliary_loss_mlp": 0.00421645, "balance_loss_clip": 1.33363366, "balance_loss_mlp": 0.37434241, "epoch": 0.06228768976401623, "flos": 25666469763840.0, "grad_norm": 16.870520550119824, "language_loss": 0.85486513, "learning_rate": 3.989074979440421e-06, "loss": 0.87567759, "num_input_tokens_seen": 22197125, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.47290039, "step": 1036, "time_per_iteration": 3.036057472229004 }, { "auxiliary_loss_clip": 0.01658651, "auxiliary_loss_mlp": 0.00396918, "balance_loss_clip": 1.33672905, "balance_loss_mlp": 0.35538575, "epoch": 0.062347813016684205, "flos": 25295619795840.0, "grad_norm": 51.35407173571347, "language_loss": 0.91975152, "learning_rate": 3.989034289722739e-06, "loss": 0.9403072, "num_input_tokens_seen": 22217575, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.41552734, "step": 1037, "time_per_iteration": 2.703125238418579 }, { "auxiliary_loss_clip": 0.01631469, "auxiliary_loss_mlp": 0.00371782, "balance_loss_clip": 1.3175211, "balance_loss_mlp": 0.32874703, "epoch": 0.06240793626935217, "flos": 26907039740160.0, "grad_norm": 102.63033574690957, "language_loss": 0.87647635, "learning_rate": 3.988993524580676e-06, "loss": 0.89650881, "num_input_tokens_seen": 22236840, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.43066406, "step": 1038, "time_per_iteration": 2.6997342109680176 }, { "auxiliary_loss_clip": 0.01626554, "auxiliary_loss_mlp": 0.00396285, "balance_loss_clip": 1.31949532, "balance_loss_mlp": 0.35236877, "epoch": 0.06246805952202014, "flos": 21615782146560.0, "grad_norm": 14.82545027577469, "language_loss": 0.92193919, "learning_rate": 3.98895268401578e-06, "loss": 0.94216758, "num_input_tokens_seen": 22256465, "router_z_loss_clip": 3.0703125, "router_z_loss_mlp": 0.43920898, "step": 1039, "time_per_iteration": 2.607518434524536 }, { "auxiliary_loss_clip": 0.01624446, "auxiliary_loss_mlp": 0.00401954, "balance_loss_clip": 1.31115305, "balance_loss_mlp": 0.35884771, "epoch": 0.0625281827746881, "flos": 19311896833920.0, "grad_norm": 3.2250239979738278, "language_loss": 0.88526273, "learning_rate": 3.9889117680296e-06, "loss": 0.90552664, "num_input_tokens_seen": 22274025, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.43139648, "step": 1040, "time_per_iteration": 2.636939287185669 }, { "auxiliary_loss_clip": 0.01610067, "auxiliary_loss_mlp": 0.0043907, "balance_loss_clip": 1.30093145, "balance_loss_mlp": 0.39317405, "epoch": 0.06258830602735609, "flos": 27745769289600.0, "grad_norm": 24.126318215047956, "language_loss": 0.75930858, "learning_rate": 3.988870776623685e-06, "loss": 0.7798, "num_input_tokens_seen": 22292245, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.45898438, "step": 1041, "time_per_iteration": 2.6684648990631104 }, { "auxiliary_loss_clip": 0.01575003, "auxiliary_loss_mlp": 0.00418086, "balance_loss_clip": 1.2693882, "balance_loss_mlp": 0.37219071, "epoch": 0.06264842928002405, "flos": 23222605150080.0, "grad_norm": 5.778946106556664, "language_loss": 0.87992918, "learning_rate": 3.9888297097995905e-06, "loss": 0.89986014, "num_input_tokens_seen": 22311455, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.45947266, "step": 1042, "time_per_iteration": 2.6777265071868896 }, { "auxiliary_loss_clip": 0.01566022, "auxiliary_loss_mlp": 0.0039418, "balance_loss_clip": 1.26798785, "balance_loss_mlp": 0.35393482, "epoch": 0.06270855253269202, "flos": 38399495189760.0, "grad_norm": 82.22822001469639, "language_loss": 0.82518959, "learning_rate": 3.988788567558874e-06, "loss": 0.84479165, "num_input_tokens_seen": 22333750, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.40258789, "step": 1043, "time_per_iteration": 2.797785997390747 }, { "auxiliary_loss_clip": 0.01551051, "auxiliary_loss_mlp": 0.00408318, "balance_loss_clip": 1.26185131, "balance_loss_mlp": 0.36649978, "epoch": 0.06276867578535998, "flos": 22453542028800.0, "grad_norm": 10.33769875826316, "language_loss": 0.97667933, "learning_rate": 3.988747349903097e-06, "loss": 0.99627304, "num_input_tokens_seen": 22351940, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.41772461, "step": 1044, "time_per_iteration": 2.660820722579956 }, { "auxiliary_loss_clip": 0.0153446, "auxiliary_loss_mlp": 0.00440267, "balance_loss_clip": 1.24242496, "balance_loss_mlp": 0.39742348, "epoch": 0.06282879903802796, "flos": 22930435923840.0, "grad_norm": 5.094887820162873, "language_loss": 0.90001857, "learning_rate": 3.988706056833821e-06, "loss": 0.91976577, "num_input_tokens_seen": 22372085, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.42871094, "step": 1045, "time_per_iteration": 2.6405088901519775 }, { "auxiliary_loss_clip": 0.01538443, "auxiliary_loss_mlp": 0.00411714, "balance_loss_clip": 1.24800789, "balance_loss_mlp": 0.37049109, "epoch": 0.06288892229069593, "flos": 34819237019520.0, "grad_norm": 572.2871477114196, "language_loss": 0.8495332, "learning_rate": 3.9886646883526125e-06, "loss": 0.86903477, "num_input_tokens_seen": 22392020, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.41210938, "step": 1046, "time_per_iteration": 2.743758201599121 }, { "auxiliary_loss_clip": 0.01529178, "auxiliary_loss_mlp": 0.00392568, "balance_loss_clip": 1.24025345, "balance_loss_mlp": 0.35375324, "epoch": 0.06294904554336389, "flos": 19427134642560.0, "grad_norm": 23.076959387051286, "language_loss": 0.85560918, "learning_rate": 3.988623244461039e-06, "loss": 0.87482667, "num_input_tokens_seen": 22411180, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.38793945, "step": 1047, "time_per_iteration": 2.631890296936035 }, { "auxiliary_loss_clip": 0.0153781, "auxiliary_loss_mlp": 0.00432864, "balance_loss_clip": 1.2367475, "balance_loss_mlp": 0.38847029, "epoch": 0.06300916879603187, "flos": 40661867358720.0, "grad_norm": 11.663159420434013, "language_loss": 0.82016432, "learning_rate": 3.988581725160672e-06, "loss": 0.83987111, "num_input_tokens_seen": 22435105, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.4440918, "step": 1048, "time_per_iteration": 2.8371901512145996 }, { "auxiliary_loss_clip": 0.01505333, "auxiliary_loss_mlp": 0.00419765, "balance_loss_clip": 1.21950078, "balance_loss_mlp": 0.3788051, "epoch": 0.06306929204869983, "flos": 23804142341760.0, "grad_norm": 9.289639915963775, "language_loss": 0.86412036, "learning_rate": 3.988540130453087e-06, "loss": 0.88337135, "num_input_tokens_seen": 22452710, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.40991211, "step": 1049, "time_per_iteration": 2.7028040885925293 }, { "auxiliary_loss_clip": 0.0150963, "auxiliary_loss_mlp": 0.00416393, "balance_loss_clip": 1.22100389, "balance_loss_mlp": 0.37357295, "epoch": 0.0631294153013678, "flos": 18915802583040.0, "grad_norm": 355.8530737112155, "language_loss": 0.88772601, "learning_rate": 3.988498460339862e-06, "loss": 0.90698624, "num_input_tokens_seen": 22470175, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.4284668, "step": 1050, "time_per_iteration": 2.6492273807525635 }, { "auxiliary_loss_clip": 0.01499065, "auxiliary_loss_mlp": 0.00414138, "balance_loss_clip": 1.221246, "balance_loss_mlp": 0.37599057, "epoch": 0.06318953855403578, "flos": 24280174310400.0, "grad_norm": 50.78399641545571, "language_loss": 0.83595645, "learning_rate": 3.988456714822575e-06, "loss": 0.85508847, "num_input_tokens_seen": 22490020, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.38110352, "step": 1051, "time_per_iteration": 2.7040462493896484 }, { "auxiliary_loss_clip": 0.0149942, "auxiliary_loss_mlp": 0.0037517, "balance_loss_clip": 1.216043, "balance_loss_mlp": 0.33575985, "epoch": 0.06324966180670374, "flos": 22528918719360.0, "grad_norm": 10.062722000736667, "language_loss": 0.88306195, "learning_rate": 3.98841489390281e-06, "loss": 0.9018079, "num_input_tokens_seen": 22509685, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.39453125, "step": 1052, "time_per_iteration": 2.645591974258423 }, { "auxiliary_loss_clip": 0.0149662, "auxiliary_loss_mlp": 0.00409485, "balance_loss_clip": 1.20810378, "balance_loss_mlp": 0.36862049, "epoch": 0.06330978505937171, "flos": 15778107884160.0, "grad_norm": 129.30674006392036, "language_loss": 0.85990751, "learning_rate": 3.988372997582155e-06, "loss": 0.8789686, "num_input_tokens_seen": 22527905, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.40893555, "step": 1053, "time_per_iteration": 2.6938087940216064 }, { "auxiliary_loss_clip": 0.01483033, "auxiliary_loss_mlp": 0.00400075, "balance_loss_clip": 1.20167947, "balance_loss_mlp": 0.36214286, "epoch": 0.06336990831203967, "flos": 21471098163840.0, "grad_norm": 3.3322601717178655, "language_loss": 0.89846718, "learning_rate": 3.988331025862195e-06, "loss": 0.91729832, "num_input_tokens_seen": 22546335, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.37915039, "step": 1054, "time_per_iteration": 2.6541240215301514 }, { "auxiliary_loss_clip": 0.01479265, "auxiliary_loss_mlp": 0.00384223, "balance_loss_clip": 1.19916058, "balance_loss_mlp": 0.34671992, "epoch": 0.06343003156470765, "flos": 18478877546880.0, "grad_norm": 12.357516295828102, "language_loss": 0.9272716, "learning_rate": 3.9882889787445225e-06, "loss": 0.94590652, "num_input_tokens_seen": 22563885, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.375, "step": 1055, "time_per_iteration": 2.664271116256714 }, { "auxiliary_loss_clip": 0.01505765, "auxiliary_loss_mlp": 0.00393729, "balance_loss_clip": 1.21096802, "balance_loss_mlp": 0.35467586, "epoch": 0.06349015481737562, "flos": 25154886309120.0, "grad_norm": 88.11730059923521, "language_loss": 0.92687345, "learning_rate": 3.988246856230734e-06, "loss": 0.94586837, "num_input_tokens_seen": 22583035, "router_z_loss_clip": 2.94921875, "router_z_loss_mlp": 0.39038086, "step": 1056, "time_per_iteration": 2.6961615085601807 }, { "auxiliary_loss_clip": 0.01518604, "auxiliary_loss_mlp": 0.00475803, "balance_loss_clip": 1.22077286, "balance_loss_mlp": 0.42645082, "epoch": 0.06355027807004358, "flos": 26871775562880.0, "grad_norm": 20.327154981567332, "language_loss": 0.88945347, "learning_rate": 3.988204658322426e-06, "loss": 0.90939754, "num_input_tokens_seen": 22605055, "router_z_loss_clip": 2.9765625, "router_z_loss_mlp": 0.4934082, "step": 1057, "time_per_iteration": 2.7087011337280273 }, { "auxiliary_loss_clip": 0.01497867, "auxiliary_loss_mlp": 0.00364259, "balance_loss_clip": 1.2158916, "balance_loss_mlp": 0.32863915, "epoch": 0.06361040132271156, "flos": 21396691140480.0, "grad_norm": 46.064800770965434, "language_loss": 0.89666426, "learning_rate": 3.988162385021196e-06, "loss": 0.91528553, "num_input_tokens_seen": 22623760, "router_z_loss_clip": 2.8203125, "router_z_loss_mlp": 0.35620117, "step": 1058, "time_per_iteration": 2.6293692588806152 }, { "auxiliary_loss_clip": 0.01519517, "auxiliary_loss_mlp": 0.00388324, "balance_loss_clip": 1.22846603, "balance_loss_mlp": 0.34858, "epoch": 0.06367052457537953, "flos": 25733765894400.0, "grad_norm": 63.72267193090609, "language_loss": 0.94794762, "learning_rate": 3.988120036328651e-06, "loss": 0.96702611, "num_input_tokens_seen": 22643000, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.39746094, "step": 1059, "time_per_iteration": 2.7277355194091797 }, { "auxiliary_loss_clip": 0.01536588, "auxiliary_loss_mlp": 0.00394358, "balance_loss_clip": 1.23985207, "balance_loss_mlp": 0.35354123, "epoch": 0.0637306478280475, "flos": 17631420992640.0, "grad_norm": 4.157186951909997, "language_loss": 0.99300253, "learning_rate": 3.988077612246394e-06, "loss": 1.01231194, "num_input_tokens_seen": 22660460, "router_z_loss_clip": 2.96679688, "router_z_loss_mlp": 0.40820312, "step": 1060, "time_per_iteration": 2.5862419605255127 }, { "auxiliary_loss_clip": 0.01547165, "auxiliary_loss_mlp": 0.00391442, "balance_loss_clip": 1.24869335, "balance_loss_mlp": 0.35200712, "epoch": 0.06379077108071547, "flos": 13662610427520.0, "grad_norm": 5.584768817925065, "language_loss": 0.94537163, "learning_rate": 3.988035112776035e-06, "loss": 0.96475774, "num_input_tokens_seen": 22679270, "router_z_loss_clip": 2.98242188, "router_z_loss_mlp": 0.39453125, "step": 1061, "time_per_iteration": 2.6051387786865234 }, { "auxiliary_loss_clip": 0.01543514, "auxiliary_loss_mlp": 0.00372989, "balance_loss_clip": 1.24258363, "balance_loss_mlp": 0.33520007, "epoch": 0.06385089433338344, "flos": 28478849961600.0, "grad_norm": 8.75335593505827, "language_loss": 0.83288771, "learning_rate": 3.987992537919185e-06, "loss": 0.85205275, "num_input_tokens_seen": 22699330, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.37817383, "step": 1062, "time_per_iteration": 2.7441327571868896 }, { "auxiliary_loss_clip": 0.01557501, "auxiliary_loss_mlp": 0.00391459, "balance_loss_clip": 1.25073242, "balance_loss_mlp": 0.35161975, "epoch": 0.0639110175860514, "flos": 24311057028480.0, "grad_norm": 9.362583243499625, "language_loss": 0.91714388, "learning_rate": 3.987949887677459e-06, "loss": 0.93663347, "num_input_tokens_seen": 22717945, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.3984375, "step": 1063, "time_per_iteration": 2.642925500869751 }, { "auxiliary_loss_clip": 0.01573474, "auxiliary_loss_mlp": 0.00438398, "balance_loss_clip": 1.26112819, "balance_loss_mlp": 0.39202562, "epoch": 0.06397114083871938, "flos": 22090772620800.0, "grad_norm": 9.789677258754391, "language_loss": 0.85331023, "learning_rate": 3.9879071620524744e-06, "loss": 0.87342894, "num_input_tokens_seen": 22736790, "router_z_loss_clip": 3.12304688, "router_z_loss_mlp": 0.46362305, "step": 1064, "time_per_iteration": 2.677478313446045 }, { "auxiliary_loss_clip": 0.01557572, "auxiliary_loss_mlp": 0.0039344, "balance_loss_clip": 1.25735438, "balance_loss_mlp": 0.35433954, "epoch": 0.06403126409138735, "flos": 19572824206080.0, "grad_norm": 146.04552299016308, "language_loss": 0.91371113, "learning_rate": 3.987864361045851e-06, "loss": 0.93322122, "num_input_tokens_seen": 22754745, "router_z_loss_clip": 3.00195312, "router_z_loss_mlp": 0.39086914, "step": 1065, "time_per_iteration": 2.607443332672119 }, { "auxiliary_loss_clip": 0.01552345, "auxiliary_loss_mlp": 0.00403413, "balance_loss_clip": 1.24839139, "balance_loss_mlp": 0.36371672, "epoch": 0.06409138734405531, "flos": 40807413267840.0, "grad_norm": 680.7253694394888, "language_loss": 0.74215496, "learning_rate": 3.987821484659211e-06, "loss": 0.76171255, "num_input_tokens_seen": 22776780, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.3972168, "step": 1066, "time_per_iteration": 2.8125345706939697 }, { "auxiliary_loss_clip": 0.01560783, "auxiliary_loss_mlp": 0.0040573, "balance_loss_clip": 1.25480652, "balance_loss_mlp": 0.36901328, "epoch": 0.06415151059672328, "flos": 20441610460800.0, "grad_norm": 14.980010759529012, "language_loss": 0.98019511, "learning_rate": 3.987778532894181e-06, "loss": 0.99986023, "num_input_tokens_seen": 22793915, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.36694336, "step": 1067, "time_per_iteration": 2.602358102798462 }, { "auxiliary_loss_clip": 0.015529, "auxiliary_loss_mlp": 0.00401857, "balance_loss_clip": 1.25268376, "balance_loss_mlp": 0.36559349, "epoch": 0.06421163384939126, "flos": 18072045129600.0, "grad_norm": 46.00123723636975, "language_loss": 0.89616835, "learning_rate": 3.987735505752391e-06, "loss": 0.91571599, "num_input_tokens_seen": 22812670, "router_z_loss_clip": 3.00195312, "router_z_loss_mlp": 0.36254883, "step": 1068, "time_per_iteration": 2.615971088409424 }, { "auxiliary_loss_clip": 0.01549241, "auxiliary_loss_mlp": 0.0038103, "balance_loss_clip": 1.2496841, "balance_loss_mlp": 0.34569657, "epoch": 0.06427175710205922, "flos": 25119442563840.0, "grad_norm": 29.213615004749805, "language_loss": 0.97126901, "learning_rate": 3.987692403235471e-06, "loss": 0.99057174, "num_input_tokens_seen": 22832440, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.35327148, "step": 1069, "time_per_iteration": 2.694183349609375 }, { "auxiliary_loss_clip": 0.01563775, "auxiliary_loss_mlp": 0.00416793, "balance_loss_clip": 1.25250769, "balance_loss_mlp": 0.37471163, "epoch": 0.06433188035472719, "flos": 17380549428480.0, "grad_norm": 29.67592500752994, "language_loss": 1.02379549, "learning_rate": 3.987649225345056e-06, "loss": 1.04360116, "num_input_tokens_seen": 22845495, "router_z_loss_clip": 3.11523438, "router_z_loss_mlp": 0.42114258, "step": 1070, "time_per_iteration": 2.641355037689209 }, { "auxiliary_loss_clip": 0.01567864, "auxiliary_loss_mlp": 0.00439148, "balance_loss_clip": 1.25787437, "balance_loss_mlp": 0.39785361, "epoch": 0.06439200360739517, "flos": 23546267625600.0, "grad_norm": 53.54771654045865, "language_loss": 0.92173779, "learning_rate": 3.987605972082782e-06, "loss": 0.94180787, "num_input_tokens_seen": 22865390, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.4128418, "step": 1071, "time_per_iteration": 2.8051066398620605 }, { "auxiliary_loss_clip": 0.0155171, "auxiliary_loss_mlp": 0.00379936, "balance_loss_clip": 1.24605525, "balance_loss_mlp": 0.34467351, "epoch": 0.06445212686006313, "flos": 21979772616960.0, "grad_norm": 8.067639148397818, "language_loss": 0.82165742, "learning_rate": 3.987562643450292e-06, "loss": 0.84097385, "num_input_tokens_seen": 22885495, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.35253906, "step": 1072, "time_per_iteration": 2.6741156578063965 }, { "auxiliary_loss_clip": 0.01556527, "auxiliary_loss_mlp": 0.00418543, "balance_loss_clip": 1.24563742, "balance_loss_mlp": 0.38106364, "epoch": 0.0645122501127311, "flos": 25921291824000.0, "grad_norm": 40.777536209979765, "language_loss": 0.8974393, "learning_rate": 3.987519239449226e-06, "loss": 0.91719002, "num_input_tokens_seen": 22904845, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.37475586, "step": 1073, "time_per_iteration": 2.6493942737579346 }, { "auxiliary_loss_clip": 0.01558547, "auxiliary_loss_mlp": 0.00450251, "balance_loss_clip": 1.25615835, "balance_loss_mlp": 0.41057837, "epoch": 0.06457237336539907, "flos": 25626034028160.0, "grad_norm": 87.85068016510577, "language_loss": 0.85579813, "learning_rate": 3.987475760081233e-06, "loss": 0.87588608, "num_input_tokens_seen": 22925940, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.39672852, "step": 1074, "time_per_iteration": 5.51121711730957 }, { "auxiliary_loss_clip": 0.01547752, "auxiliary_loss_mlp": 0.0039766, "balance_loss_clip": 1.24638975, "balance_loss_mlp": 0.36206406, "epoch": 0.06463249661806704, "flos": 19463979018240.0, "grad_norm": 76.75042737376367, "language_loss": 0.86686063, "learning_rate": 3.987432205347958e-06, "loss": 0.88631481, "num_input_tokens_seen": 22944375, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.35595703, "step": 1075, "time_per_iteration": 2.613860607147217 }, { "auxiliary_loss_clip": 0.01546384, "auxiliary_loss_mlp": 0.00396056, "balance_loss_clip": 1.24677181, "balance_loss_mlp": 0.36153337, "epoch": 0.064692619870735, "flos": 24498044254080.0, "grad_norm": 26.22063402936528, "language_loss": 0.96182168, "learning_rate": 3.987388575251055e-06, "loss": 0.98124605, "num_input_tokens_seen": 22959145, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.34545898, "step": 1076, "time_per_iteration": 4.071648120880127 }, { "auxiliary_loss_clip": 0.015302, "auxiliary_loss_mlp": 0.00435003, "balance_loss_clip": 1.2339946, "balance_loss_mlp": 0.39685613, "epoch": 0.06475274312340297, "flos": 17018677860480.0, "grad_norm": 210.6407695580567, "language_loss": 0.88593298, "learning_rate": 3.98734486979218e-06, "loss": 0.90558499, "num_input_tokens_seen": 22978100, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.38110352, "step": 1077, "time_per_iteration": 2.6218690872192383 }, { "auxiliary_loss_clip": 0.01544744, "auxiliary_loss_mlp": 0.0045729, "balance_loss_clip": 1.23875308, "balance_loss_mlp": 0.41334954, "epoch": 0.06481286637607095, "flos": 24572379450240.0, "grad_norm": 12.315166116433852, "language_loss": 0.98389566, "learning_rate": 3.987301088972986e-06, "loss": 1.00391603, "num_input_tokens_seen": 22997285, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.43920898, "step": 1078, "time_per_iteration": 4.145873069763184 }, { "auxiliary_loss_clip": 0.01541123, "auxiliary_loss_mlp": 0.00460134, "balance_loss_clip": 1.22873521, "balance_loss_mlp": 0.42050847, "epoch": 0.06487298962873891, "flos": 21105635235840.0, "grad_norm": 29.602996997084748, "language_loss": 0.86135066, "learning_rate": 3.987257232795137e-06, "loss": 0.88136321, "num_input_tokens_seen": 23016285, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.39648438, "step": 1079, "time_per_iteration": 2.6874167919158936 }, { "auxiliary_loss_clip": 0.01508993, "auxiliary_loss_mlp": 0.00445486, "balance_loss_clip": 1.21067691, "balance_loss_mlp": 0.40698129, "epoch": 0.06493311288140688, "flos": 24608182331520.0, "grad_norm": 43.58859169042818, "language_loss": 0.76935792, "learning_rate": 3.987213301260294e-06, "loss": 0.7889027, "num_input_tokens_seen": 23036420, "router_z_loss_clip": 2.98242188, "router_z_loss_mlp": 0.38500977, "step": 1080, "time_per_iteration": 2.7185263633728027 }, { "auxiliary_loss_clip": 0.01502675, "auxiliary_loss_mlp": 0.00430247, "balance_loss_clip": 1.20162296, "balance_loss_mlp": 0.39097995, "epoch": 0.06499323613407486, "flos": 25337994865920.0, "grad_norm": 242.99721186075243, "language_loss": 0.80292755, "learning_rate": 3.987169294370123e-06, "loss": 0.8222568, "num_input_tokens_seen": 23056945, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.39282227, "step": 1081, "time_per_iteration": 2.688467502593994 }, { "auxiliary_loss_clip": 0.01500189, "auxiliary_loss_mlp": 0.00433031, "balance_loss_clip": 1.20405936, "balance_loss_mlp": 0.39683947, "epoch": 0.06505335938674282, "flos": 20375714960640.0, "grad_norm": 188.8150167244038, "language_loss": 0.92806673, "learning_rate": 3.987125212126294e-06, "loss": 0.9473989, "num_input_tokens_seen": 23074940, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.36206055, "step": 1082, "time_per_iteration": 2.8112616539001465 }, { "auxiliary_loss_clip": 0.01516944, "auxiliary_loss_mlp": 0.00464151, "balance_loss_clip": 1.20826054, "balance_loss_mlp": 0.42493162, "epoch": 0.06511348263941079, "flos": 25337923038720.0, "grad_norm": 11.397109176745477, "language_loss": 0.91180778, "learning_rate": 3.987081054530478e-06, "loss": 0.93161875, "num_input_tokens_seen": 23093420, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.39208984, "step": 1083, "time_per_iteration": 2.6949079036712646 }, { "auxiliary_loss_clip": 0.01517887, "auxiliary_loss_mlp": 0.00469492, "balance_loss_clip": 1.21159422, "balance_loss_mlp": 0.42974758, "epoch": 0.06517360589207877, "flos": 20332801186560.0, "grad_norm": 3.758280459939653, "language_loss": 0.87138993, "learning_rate": 3.987036821584348e-06, "loss": 0.89126366, "num_input_tokens_seen": 23111550, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.39746094, "step": 1084, "time_per_iteration": 2.621683359146118 }, { "auxiliary_loss_clip": 0.01514992, "auxiliary_loss_mlp": 0.00496219, "balance_loss_clip": 1.20840478, "balance_loss_mlp": 0.456236, "epoch": 0.06523372914474673, "flos": 31681650061440.0, "grad_norm": 35.04331138540899, "language_loss": 0.75052845, "learning_rate": 3.986992513289584e-06, "loss": 0.77064061, "num_input_tokens_seen": 23130335, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.3996582, "step": 1085, "time_per_iteration": 2.718093156814575 }, { "auxiliary_loss_clip": 0.01503572, "auxiliary_loss_mlp": 0.00442939, "balance_loss_clip": 1.20146847, "balance_loss_mlp": 0.40765265, "epoch": 0.0652938523974147, "flos": 20778165918720.0, "grad_norm": 14.582494034499238, "language_loss": 0.82788557, "learning_rate": 3.9869481296478645e-06, "loss": 0.84735072, "num_input_tokens_seen": 23152380, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.35253906, "step": 1086, "time_per_iteration": 2.6772780418395996 }, { "auxiliary_loss_clip": 0.01511313, "auxiliary_loss_mlp": 0.00441418, "balance_loss_clip": 1.20503306, "balance_loss_mlp": 0.40532196, "epoch": 0.06535397565008266, "flos": 16690993061760.0, "grad_norm": 36.656422257372256, "language_loss": 0.90638804, "learning_rate": 3.986903670660872e-06, "loss": 0.92591536, "num_input_tokens_seen": 23171630, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.36108398, "step": 1087, "time_per_iteration": 2.585280418395996 }, { "auxiliary_loss_clip": 0.01511202, "auxiliary_loss_mlp": 0.00464397, "balance_loss_clip": 1.20263517, "balance_loss_mlp": 0.42646471, "epoch": 0.06541409890275064, "flos": 26868220116480.0, "grad_norm": 5.206591851545119, "language_loss": 0.85310954, "learning_rate": 3.9868591363302945e-06, "loss": 0.87286556, "num_input_tokens_seen": 23192520, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.37915039, "step": 1088, "time_per_iteration": 2.719813346862793 }, { "auxiliary_loss_clip": 0.01533252, "auxiliary_loss_mlp": 0.00442714, "balance_loss_clip": 1.21480274, "balance_loss_mlp": 0.40690309, "epoch": 0.06547422215541861, "flos": 20521620005760.0, "grad_norm": 31.964932527850305, "language_loss": 0.79606891, "learning_rate": 3.9868145266578186e-06, "loss": 0.81582856, "num_input_tokens_seen": 23210710, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.35791016, "step": 1089, "time_per_iteration": 2.6042380332946777 }, { "auxiliary_loss_clip": 0.01507508, "auxiliary_loss_mlp": 0.00473291, "balance_loss_clip": 1.19788396, "balance_loss_mlp": 0.43831497, "epoch": 0.06553434540808657, "flos": 22016616992640.0, "grad_norm": 78135.55625499117, "language_loss": 0.90796828, "learning_rate": 3.9867698416451366e-06, "loss": 0.92777622, "num_input_tokens_seen": 23230305, "router_z_loss_clip": 3.09570312, "router_z_loss_mlp": 0.34985352, "step": 1090, "time_per_iteration": 2.6337783336639404 }, { "auxiliary_loss_clip": 0.01496927, "auxiliary_loss_mlp": 0.00417726, "balance_loss_clip": 1.18738317, "balance_loss_mlp": 0.38432378, "epoch": 0.06559446866075455, "flos": 24608649208320.0, "grad_norm": 3.7839025941876936, "language_loss": 0.80294907, "learning_rate": 3.9867250812939434e-06, "loss": 0.82209563, "num_input_tokens_seen": 23249015, "router_z_loss_clip": 3.09570312, "router_z_loss_mlp": 0.33398438, "step": 1091, "time_per_iteration": 2.6386218070983887 }, { "auxiliary_loss_clip": 0.01527811, "auxiliary_loss_mlp": 0.00429907, "balance_loss_clip": 1.20664191, "balance_loss_mlp": 0.39340517, "epoch": 0.06565459191342252, "flos": 24274679529600.0, "grad_norm": 85.63250149530734, "language_loss": 0.88802528, "learning_rate": 3.986680245605936e-06, "loss": 0.90760243, "num_input_tokens_seen": 23265105, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.36499023, "step": 1092, "time_per_iteration": 2.7093825340270996 }, { "auxiliary_loss_clip": 0.01530232, "auxiliary_loss_mlp": 0.00431994, "balance_loss_clip": 1.20471907, "balance_loss_mlp": 0.39606392, "epoch": 0.06571471516609048, "flos": 24787124910720.0, "grad_norm": 5.569353587638561, "language_loss": 0.78017324, "learning_rate": 3.986635334582814e-06, "loss": 0.79979551, "num_input_tokens_seen": 23283950, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.35888672, "step": 1093, "time_per_iteration": 2.648963451385498 }, { "auxiliary_loss_clip": 0.01530819, "auxiliary_loss_mlp": 0.00467103, "balance_loss_clip": 1.20848739, "balance_loss_mlp": 0.43205568, "epoch": 0.06577483841875846, "flos": 26214071581440.0, "grad_norm": 10.472997528471563, "language_loss": 0.9358511, "learning_rate": 3.986590348226282e-06, "loss": 0.95583034, "num_input_tokens_seen": 23305005, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.3503418, "step": 1094, "time_per_iteration": 2.7091903686523438 }, { "auxiliary_loss_clip": 0.01542966, "auxiliary_loss_mlp": 0.00427117, "balance_loss_clip": 1.21122992, "balance_loss_mlp": 0.39011449, "epoch": 0.06583496167142643, "flos": 25080802508160.0, "grad_norm": 4.432508296085686, "language_loss": 0.8707093, "learning_rate": 3.986545286538044e-06, "loss": 0.89041013, "num_input_tokens_seen": 23323220, "router_z_loss_clip": 3.31640625, "router_z_loss_mlp": 0.36962891, "step": 1095, "time_per_iteration": 2.7017223834991455 }, { "auxiliary_loss_clip": 0.01545124, "auxiliary_loss_mlp": 0.00380493, "balance_loss_clip": 1.2118094, "balance_loss_mlp": 0.34680438, "epoch": 0.06589508492409439, "flos": 25629804956160.0, "grad_norm": 42.43572473162818, "language_loss": 0.81397593, "learning_rate": 3.986500149519811e-06, "loss": 0.83323205, "num_input_tokens_seen": 23342235, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.33666992, "step": 1096, "time_per_iteration": 2.7311806678771973 }, { "auxiliary_loss_clip": 0.0155012, "auxiliary_loss_mlp": 0.00381489, "balance_loss_clip": 1.2126838, "balance_loss_mlp": 0.34963635, "epoch": 0.06595520817676236, "flos": 23621249266560.0, "grad_norm": 14.825915176056803, "language_loss": 0.83390749, "learning_rate": 3.986454937173292e-06, "loss": 0.85322356, "num_input_tokens_seen": 23363680, "router_z_loss_clip": 3.37304688, "router_z_loss_mlp": 0.31835938, "step": 1097, "time_per_iteration": 2.66382098197937 }, { "auxiliary_loss_clip": 0.015903, "auxiliary_loss_mlp": 0.00422726, "balance_loss_clip": 1.2359128, "balance_loss_mlp": 0.3859143, "epoch": 0.06601533142943034, "flos": 33801708545280.0, "grad_norm": 6.196072147382637, "language_loss": 0.85043728, "learning_rate": 3.986409649500203e-06, "loss": 0.87056744, "num_input_tokens_seen": 23385590, "router_z_loss_clip": 3.54492188, "router_z_loss_mlp": 0.3684082, "step": 1098, "time_per_iteration": 2.7350857257843018 }, { "auxiliary_loss_clip": 0.01579616, "auxiliary_loss_mlp": 0.00408974, "balance_loss_clip": 1.23240757, "balance_loss_mlp": 0.37323463, "epoch": 0.0660754546820983, "flos": 20259184262400.0, "grad_norm": 184.51648028747067, "language_loss": 0.86960495, "learning_rate": 3.986364286502261e-06, "loss": 0.88949084, "num_input_tokens_seen": 23402945, "router_z_loss_clip": 3.4765625, "router_z_loss_mlp": 0.35742188, "step": 1099, "time_per_iteration": 2.6432154178619385 }, { "auxiliary_loss_clip": 0.01578707, "auxiliary_loss_mlp": 0.00394742, "balance_loss_clip": 1.2310977, "balance_loss_mlp": 0.36241212, "epoch": 0.06613557793476627, "flos": 19354164163200.0, "grad_norm": 9.299121306933175, "language_loss": 0.91285431, "learning_rate": 3.986318848181186e-06, "loss": 0.93258882, "num_input_tokens_seen": 23421410, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.32324219, "step": 1100, "time_per_iteration": 2.6162126064300537 }, { "auxiliary_loss_clip": 0.01622129, "auxiliary_loss_mlp": 0.00405581, "balance_loss_clip": 1.25428033, "balance_loss_mlp": 0.36962759, "epoch": 0.06619570118743424, "flos": 13772568936960.0, "grad_norm": 840.4603302899762, "language_loss": 0.81159019, "learning_rate": 3.986273334538702e-06, "loss": 0.83186728, "num_input_tokens_seen": 23438870, "router_z_loss_clip": 3.6796875, "router_z_loss_mlp": 0.35961914, "step": 1101, "time_per_iteration": 2.642754316329956 }, { "auxiliary_loss_clip": 0.01622071, "auxiliary_loss_mlp": 0.0039952, "balance_loss_clip": 1.25738811, "balance_loss_mlp": 0.36368519, "epoch": 0.06625582444010221, "flos": 17857874286720.0, "grad_norm": 17.843400776548947, "language_loss": 0.9459098, "learning_rate": 3.986227745576533e-06, "loss": 0.96612567, "num_input_tokens_seen": 23456975, "router_z_loss_clip": 3.6484375, "router_z_loss_mlp": 0.3581543, "step": 1102, "time_per_iteration": 2.586280345916748 }, { "auxiliary_loss_clip": 0.01640467, "auxiliary_loss_mlp": 0.00397432, "balance_loss_clip": 1.26550233, "balance_loss_mlp": 0.36212233, "epoch": 0.06631594769277017, "flos": 11838707579520.0, "grad_norm": 43.899597911879226, "language_loss": 0.91727132, "learning_rate": 3.98618208129641e-06, "loss": 0.93765026, "num_input_tokens_seen": 23473440, "router_z_loss_clip": 3.75, "router_z_loss_mlp": 0.35327148, "step": 1103, "time_per_iteration": 2.6517715454101562 }, { "auxiliary_loss_clip": 0.01649206, "auxiliary_loss_mlp": 0.00361247, "balance_loss_clip": 1.27190113, "balance_loss_mlp": 0.32705826, "epoch": 0.06637607094543815, "flos": 19793351756160.0, "grad_norm": 6.779525427558469, "language_loss": 0.87057012, "learning_rate": 3.986136341700063e-06, "loss": 0.89067471, "num_input_tokens_seen": 23493880, "router_z_loss_clip": 3.76953125, "router_z_loss_mlp": 0.34204102, "step": 1104, "time_per_iteration": 2.62314510345459 }, { "auxiliary_loss_clip": 0.01649048, "auxiliary_loss_mlp": 0.00358936, "balance_loss_clip": 1.27057528, "balance_loss_mlp": 0.32500857, "epoch": 0.06643619419810612, "flos": 25485659677440.0, "grad_norm": 27.93691968056262, "language_loss": 0.85043311, "learning_rate": 3.986090526789227e-06, "loss": 0.87051296, "num_input_tokens_seen": 23514920, "router_z_loss_clip": 3.77929688, "router_z_loss_mlp": 0.33959961, "step": 1105, "time_per_iteration": 2.6693952083587646 }, { "auxiliary_loss_clip": 0.01653518, "auxiliary_loss_mlp": 0.0031932, "balance_loss_clip": 1.26982379, "balance_loss_mlp": 0.2877053, "epoch": 0.06649631745077408, "flos": 16946533393920.0, "grad_norm": 2.8522276632720027, "language_loss": 1.02287722, "learning_rate": 3.986044636565639e-06, "loss": 1.04260564, "num_input_tokens_seen": 23531635, "router_z_loss_clip": 3.83398438, "router_z_loss_mlp": 0.31640625, "step": 1106, "time_per_iteration": 2.564732789993286 }, { "auxiliary_loss_clip": 0.01658518, "auxiliary_loss_mlp": 0.00340788, "balance_loss_clip": 1.27290928, "balance_loss_mlp": 0.30540663, "epoch": 0.06655644070344206, "flos": 17858592558720.0, "grad_norm": 35.325263204974235, "language_loss": 0.88242, "learning_rate": 3.985998671031039e-06, "loss": 0.90241301, "num_input_tokens_seen": 23551020, "router_z_loss_clip": 3.85546875, "router_z_loss_mlp": 0.35375977, "step": 1107, "time_per_iteration": 2.6067376136779785 }, { "auxiliary_loss_clip": 0.01442802, "auxiliary_loss_mlp": 0.00194826, "balance_loss_clip": 1.15568638, "balance_loss_mlp": 0.17451292, "epoch": 0.06661656395611003, "flos": 61419350021760.0, "grad_norm": 0.8347110073290563, "language_loss": 0.56689417, "learning_rate": 3.9859526301871705e-06, "loss": 0.58327043, "num_input_tokens_seen": 23610675, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.203125, "step": 1108, "time_per_iteration": 3.0873939990997314 }, { "auxiliary_loss_clip": 0.01642278, "auxiliary_loss_mlp": 0.00353747, "balance_loss_clip": 1.2612282, "balance_loss_mlp": 0.3184377, "epoch": 0.066676687208778, "flos": 20662856282880.0, "grad_norm": 54.703890366498094, "language_loss": 0.77887332, "learning_rate": 3.9859065140357795e-06, "loss": 0.79883361, "num_input_tokens_seen": 23628710, "router_z_loss_clip": 3.8125, "router_z_loss_mlp": 0.35302734, "step": 1109, "time_per_iteration": 2.6344878673553467 }, { "auxiliary_loss_clip": 0.01631735, "auxiliary_loss_mlp": 0.00349072, "balance_loss_clip": 1.26157081, "balance_loss_mlp": 0.31526428, "epoch": 0.06673681046144596, "flos": 20923280864640.0, "grad_norm": 3.4030065813555805, "language_loss": 0.84524399, "learning_rate": 3.985860322578614e-06, "loss": 0.86505204, "num_input_tokens_seen": 23649160, "router_z_loss_clip": 3.703125, "router_z_loss_mlp": 0.33764648, "step": 1110, "time_per_iteration": 2.622086763381958 }, { "auxiliary_loss_clip": 0.01622198, "auxiliary_loss_mlp": 0.00337702, "balance_loss_clip": 1.25308657, "balance_loss_mlp": 0.30162913, "epoch": 0.06679693371411394, "flos": 31065818359680.0, "grad_norm": 18.23724618390184, "language_loss": 0.80302697, "learning_rate": 3.985814055817427e-06, "loss": 0.82262599, "num_input_tokens_seen": 23671995, "router_z_loss_clip": 3.6953125, "router_z_loss_mlp": 0.36083984, "step": 1111, "time_per_iteration": 2.709742307662964 }, { "auxiliary_loss_clip": 0.01608889, "auxiliary_loss_mlp": 0.0033618, "balance_loss_clip": 1.24935901, "balance_loss_mlp": 0.30206278, "epoch": 0.0668570569667819, "flos": 21726135705600.0, "grad_norm": 25.85640391990921, "language_loss": 0.86714983, "learning_rate": 3.985767713753971e-06, "loss": 0.88660055, "num_input_tokens_seen": 23690705, "router_z_loss_clip": 3.59570312, "router_z_loss_mlp": 0.34106445, "step": 1112, "time_per_iteration": 2.641934394836426 }, { "auxiliary_loss_clip": 0.01598814, "auxiliary_loss_mlp": 0.00314776, "balance_loss_clip": 1.24116898, "balance_loss_mlp": 0.28189793, "epoch": 0.06691718021944987, "flos": 22747255539840.0, "grad_norm": 104.3728604204608, "language_loss": 0.88341606, "learning_rate": 3.985721296390005e-06, "loss": 0.90255189, "num_input_tokens_seen": 23709990, "router_z_loss_clip": 3.57421875, "router_z_loss_mlp": 0.32861328, "step": 1113, "time_per_iteration": 2.618006706237793 }, { "auxiliary_loss_clip": 0.01571733, "auxiliary_loss_mlp": 0.00315928, "balance_loss_clip": 1.22871113, "balance_loss_mlp": 0.28436148, "epoch": 0.06697730347211785, "flos": 16545626720640.0, "grad_norm": 11.516511632202697, "language_loss": 0.8987301, "learning_rate": 3.985674803727289e-06, "loss": 0.91760671, "num_input_tokens_seen": 23728485, "router_z_loss_clip": 3.4296875, "router_z_loss_mlp": 0.31591797, "step": 1114, "time_per_iteration": 2.6031835079193115 }, { "auxiliary_loss_clip": 0.01472119, "auxiliary_loss_mlp": 0.00238236, "balance_loss_clip": 1.22960997, "balance_loss_mlp": 0.22173743, "epoch": 0.06703742672478581, "flos": 59782326658560.0, "grad_norm": 0.854563349565955, "language_loss": 0.58195043, "learning_rate": 3.985628235767584e-06, "loss": 0.59905398, "num_input_tokens_seen": 23786650, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.16503906, "step": 1115, "time_per_iteration": 3.1172804832458496 }, { "auxiliary_loss_clip": 0.01569155, "auxiliary_loss_mlp": 0.00330821, "balance_loss_clip": 1.22810304, "balance_loss_mlp": 0.2982291, "epoch": 0.06709754997745378, "flos": 16800197385600.0, "grad_norm": 3.9785170330059443, "language_loss": 0.97796261, "learning_rate": 3.985581592512658e-06, "loss": 0.99696237, "num_input_tokens_seen": 23802555, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.32568359, "step": 1116, "time_per_iteration": 5.490621089935303 }, { "auxiliary_loss_clip": 0.0157432, "auxiliary_loss_mlp": 0.00338017, "balance_loss_clip": 1.2318356, "balance_loss_mlp": 0.30175358, "epoch": 0.06715767323012176, "flos": 22123917895680.0, "grad_norm": 74.69648050434692, "language_loss": 0.93750024, "learning_rate": 3.985534873964279e-06, "loss": 0.95662367, "num_input_tokens_seen": 23822945, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.36230469, "step": 1117, "time_per_iteration": 2.6785333156585693 }, { "auxiliary_loss_clip": 0.01386377, "auxiliary_loss_mlp": 0.00197589, "balance_loss_clip": 1.16091776, "balance_loss_mlp": 0.17956434, "epoch": 0.06721779648278972, "flos": 66618100137600.0, "grad_norm": 0.8398369303680734, "language_loss": 0.59745502, "learning_rate": 3.985488080124218e-06, "loss": 0.61329472, "num_input_tokens_seen": 23874075, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.18066406, "step": 1118, "time_per_iteration": 4.494428396224976 }, { "auxiliary_loss_clip": 0.01569084, "auxiliary_loss_mlp": 0.00325188, "balance_loss_clip": 1.23355997, "balance_loss_mlp": 0.28983086, "epoch": 0.06727791973545769, "flos": 22382474970240.0, "grad_norm": 6.547349853400643, "language_loss": 0.94161958, "learning_rate": 3.985441210994251e-06, "loss": 0.96056235, "num_input_tokens_seen": 23889720, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.35351562, "step": 1119, "time_per_iteration": 2.605325937271118 }, { "auxiliary_loss_clip": 0.01551582, "auxiliary_loss_mlp": 0.00319682, "balance_loss_clip": 1.22145998, "balance_loss_mlp": 0.28706586, "epoch": 0.06733804298812565, "flos": 24280210224000.0, "grad_norm": 19.742784783764762, "language_loss": 0.91820306, "learning_rate": 3.9853942665761545e-06, "loss": 0.9369157, "num_input_tokens_seen": 23909385, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.32617188, "step": 1120, "time_per_iteration": 4.112436771392822 }, { "auxiliary_loss_clip": 0.01542665, "auxiliary_loss_mlp": 0.00305823, "balance_loss_clip": 1.21776271, "balance_loss_mlp": 0.27256316, "epoch": 0.06739816624079363, "flos": 15918230839680.0, "grad_norm": 17.317694518391747, "language_loss": 0.84988981, "learning_rate": 3.985347246871708e-06, "loss": 0.86837465, "num_input_tokens_seen": 23926830, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.33227539, "step": 1121, "time_per_iteration": 2.6816940307617188 }, { "auxiliary_loss_clip": 0.0147589, "auxiliary_loss_mlp": 0.00159378, "balance_loss_clip": 1.24019074, "balance_loss_mlp": 0.14221199, "epoch": 0.0674582894934616, "flos": 71398567353600.0, "grad_norm": 0.9420820738565313, "language_loss": 0.58228141, "learning_rate": 3.985300151882694e-06, "loss": 0.59863406, "num_input_tokens_seen": 23992640, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.171875, "step": 1122, "time_per_iteration": 3.301090955734253 }, { "auxiliary_loss_clip": 0.01541368, "auxiliary_loss_mlp": 0.00298738, "balance_loss_clip": 1.21444511, "balance_loss_mlp": 0.26834011, "epoch": 0.06751841274612956, "flos": 25264952559360.0, "grad_norm": 6.7434611826683994, "language_loss": 0.79904532, "learning_rate": 3.985252981610901e-06, "loss": 0.81744641, "num_input_tokens_seen": 24011135, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.30371094, "step": 1123, "time_per_iteration": 2.6947760581970215 }, { "auxiliary_loss_clip": 0.01544659, "auxiliary_loss_mlp": 0.00341201, "balance_loss_clip": 1.21795201, "balance_loss_mlp": 0.30250543, "epoch": 0.06757853599879754, "flos": 23802741711360.0, "grad_norm": 7.983247594063329, "language_loss": 0.86740446, "learning_rate": 3.985205736058114e-06, "loss": 0.88626307, "num_input_tokens_seen": 24030695, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.38696289, "step": 1124, "time_per_iteration": 2.680556535720825 }, { "auxiliary_loss_clip": 0.01531511, "auxiliary_loss_mlp": 0.00284997, "balance_loss_clip": 1.2098726, "balance_loss_mlp": 0.2545509, "epoch": 0.0676386592514655, "flos": 21033742164480.0, "grad_norm": 53.99835329944499, "language_loss": 0.79650009, "learning_rate": 3.985158415226128e-06, "loss": 0.8146652, "num_input_tokens_seen": 24050680, "router_z_loss_clip": 3.21679688, "router_z_loss_mlp": 0.30456543, "step": 1125, "time_per_iteration": 2.797760009765625 }, { "auxiliary_loss_clip": 0.0153063, "auxiliary_loss_mlp": 0.00324624, "balance_loss_clip": 1.21017039, "balance_loss_mlp": 0.28764489, "epoch": 0.06769878250413347, "flos": 25556331686400.0, "grad_norm": 48.68996196504871, "language_loss": 0.89216411, "learning_rate": 3.985111019116736e-06, "loss": 0.91071671, "num_input_tokens_seen": 24067205, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.36987305, "step": 1126, "time_per_iteration": 2.6467361450195312 }, { "auxiliary_loss_clip": 0.0147831, "auxiliary_loss_mlp": 0.00185745, "balance_loss_clip": 1.23904741, "balance_loss_mlp": 0.17325151, "epoch": 0.06775890575680145, "flos": 70655251305600.0, "grad_norm": 0.781343461824998, "language_loss": 0.59978271, "learning_rate": 3.985063547731735e-06, "loss": 0.61642325, "num_input_tokens_seen": 24131320, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.125, "step": 1127, "time_per_iteration": 3.144613027572632 }, { "auxiliary_loss_clip": 0.01546235, "auxiliary_loss_mlp": 0.00348587, "balance_loss_clip": 1.22098255, "balance_loss_mlp": 0.31232375, "epoch": 0.06781902900946941, "flos": 24235500769920.0, "grad_norm": 8.940820543093892, "language_loss": 0.87959445, "learning_rate": 3.985016001072925e-06, "loss": 0.89854264, "num_input_tokens_seen": 24149930, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.36279297, "step": 1128, "time_per_iteration": 2.7251129150390625 }, { "auxiliary_loss_clip": 0.01569021, "auxiliary_loss_mlp": 0.00395362, "balance_loss_clip": 1.23742604, "balance_loss_mlp": 0.35471135, "epoch": 0.06787915226213738, "flos": 22417523665920.0, "grad_norm": 42.37293701502774, "language_loss": 0.81682754, "learning_rate": 3.984968379142109e-06, "loss": 0.83647132, "num_input_tokens_seen": 24169590, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.40637207, "step": 1129, "time_per_iteration": 2.662787914276123 }, { "auxiliary_loss_clip": 0.01562279, "auxiliary_loss_mlp": 0.00410292, "balance_loss_clip": 1.23244727, "balance_loss_mlp": 0.37002271, "epoch": 0.06793927551480534, "flos": 37706922080640.0, "grad_norm": 309.0301604210543, "language_loss": 0.79519355, "learning_rate": 3.984920681941094e-06, "loss": 0.81491929, "num_input_tokens_seen": 24189965, "router_z_loss_clip": 3.29882812, "router_z_loss_mlp": 0.40258789, "step": 1130, "time_per_iteration": 2.780775308609009 }, { "auxiliary_loss_clip": 0.01554943, "auxiliary_loss_mlp": 0.00421782, "balance_loss_clip": 1.22690511, "balance_loss_mlp": 0.38403994, "epoch": 0.06799939876747332, "flos": 20631398947200.0, "grad_norm": 3.0056273591166858, "language_loss": 0.87709939, "learning_rate": 3.984872909471688e-06, "loss": 0.89686668, "num_input_tokens_seen": 24208045, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.37768555, "step": 1131, "time_per_iteration": 2.645106315612793 }, { "auxiliary_loss_clip": 0.01546091, "auxiliary_loss_mlp": 0.00372547, "balance_loss_clip": 1.22206211, "balance_loss_mlp": 0.33873951, "epoch": 0.06805952202014129, "flos": 14864755829760.0, "grad_norm": 6.07236335985529, "language_loss": 0.87335199, "learning_rate": 3.984825061735701e-06, "loss": 0.89253843, "num_input_tokens_seen": 24223805, "router_z_loss_clip": 3.24414062, "router_z_loss_mlp": 0.33764648, "step": 1132, "time_per_iteration": 2.6039018630981445 }, { "auxiliary_loss_clip": 0.01557666, "auxiliary_loss_mlp": 0.003452, "balance_loss_clip": 1.23310101, "balance_loss_mlp": 0.31189311, "epoch": 0.06811964527280925, "flos": 48909434947200.0, "grad_norm": 10.992894049198293, "language_loss": 0.69868863, "learning_rate": 3.9847771387349495e-06, "loss": 0.71771729, "num_input_tokens_seen": 24249475, "router_z_loss_clip": 3.24414062, "router_z_loss_mlp": 0.33239746, "step": 1133, "time_per_iteration": 2.8680946826934814 }, { "auxiliary_loss_clip": 0.01561422, "auxiliary_loss_mlp": 0.00387697, "balance_loss_clip": 1.23093164, "balance_loss_mlp": 0.34819093, "epoch": 0.06817976852547723, "flos": 15377273038080.0, "grad_norm": 20.089213066606074, "language_loss": 0.84641808, "learning_rate": 3.9847291404712506e-06, "loss": 0.86590922, "num_input_tokens_seen": 24267980, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.39526367, "step": 1134, "time_per_iteration": 2.6728973388671875 }, { "auxiliary_loss_clip": 0.01572015, "auxiliary_loss_mlp": 0.00355688, "balance_loss_clip": 1.24514592, "balance_loss_mlp": 0.32269111, "epoch": 0.0682398917781452, "flos": 20155690200960.0, "grad_norm": 2.1750504208071475, "language_loss": 0.93159044, "learning_rate": 3.984681066946423e-06, "loss": 0.95086741, "num_input_tokens_seen": 24286805, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.33007812, "step": 1135, "time_per_iteration": 2.7111313343048096 }, { "auxiliary_loss_clip": 0.01574075, "auxiliary_loss_mlp": 0.00378982, "balance_loss_clip": 1.24319422, "balance_loss_mlp": 0.3397859, "epoch": 0.06830001503081316, "flos": 23440618748160.0, "grad_norm": 4.145828388770281, "language_loss": 0.87704086, "learning_rate": 3.984632918162291e-06, "loss": 0.8965714, "num_input_tokens_seen": 24305855, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.39160156, "step": 1136, "time_per_iteration": 2.651824712753296 }, { "auxiliary_loss_clip": 0.01575999, "auxiliary_loss_mlp": 0.00387799, "balance_loss_clip": 1.24775779, "balance_loss_mlp": 0.34793541, "epoch": 0.06836013828348114, "flos": 34349813153280.0, "grad_norm": 29.465991111605405, "language_loss": 0.9107123, "learning_rate": 3.984584694120679e-06, "loss": 0.93035024, "num_input_tokens_seen": 24326535, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.39868164, "step": 1137, "time_per_iteration": 2.7855048179626465 }, { "auxiliary_loss_clip": 0.01569601, "auxiliary_loss_mlp": 0.00358542, "balance_loss_clip": 1.24267805, "balance_loss_mlp": 0.32266006, "epoch": 0.06842026153614911, "flos": 23148844571520.0, "grad_norm": 12.941904901721681, "language_loss": 0.86551988, "learning_rate": 3.984536394823418e-06, "loss": 0.88480127, "num_input_tokens_seen": 24345810, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.35888672, "step": 1138, "time_per_iteration": 2.6317453384399414 }, { "auxiliary_loss_clip": 0.01582238, "auxiliary_loss_mlp": 0.00371449, "balance_loss_clip": 1.24834633, "balance_loss_mlp": 0.33265841, "epoch": 0.06848038478881707, "flos": 24608972430720.0, "grad_norm": 12.766622163507185, "language_loss": 0.90994763, "learning_rate": 3.984488020272336e-06, "loss": 0.92948449, "num_input_tokens_seen": 24366095, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.38842773, "step": 1139, "time_per_iteration": 2.6876070499420166 }, { "auxiliary_loss_clip": 0.01578922, "auxiliary_loss_mlp": 0.00370103, "balance_loss_clip": 1.2496357, "balance_loss_mlp": 0.33400667, "epoch": 0.06854050804148504, "flos": 40880994278400.0, "grad_norm": 3.755456063678699, "language_loss": 0.81623602, "learning_rate": 3.984439570469271e-06, "loss": 0.83572632, "num_input_tokens_seen": 24388665, "router_z_loss_clip": 3.29296875, "router_z_loss_mlp": 0.36083984, "step": 1140, "time_per_iteration": 2.818798780441284 }, { "auxiliary_loss_clip": 0.01615117, "auxiliary_loss_mlp": 0.00445102, "balance_loss_clip": 1.27004397, "balance_loss_mlp": 0.40027899, "epoch": 0.06860063129415302, "flos": 31686354743040.0, "grad_norm": 27.733424929984626, "language_loss": 0.7764231, "learning_rate": 3.9843910454160574e-06, "loss": 0.79702526, "num_input_tokens_seen": 24407705, "router_z_loss_clip": 3.453125, "router_z_loss_mlp": 0.44848633, "step": 1141, "time_per_iteration": 2.7750937938690186 }, { "auxiliary_loss_clip": 0.01607596, "auxiliary_loss_mlp": 0.00404925, "balance_loss_clip": 1.26090598, "balance_loss_mlp": 0.36165231, "epoch": 0.06866075454682098, "flos": 26542007775360.0, "grad_norm": 9.982567015359463, "language_loss": 0.86510372, "learning_rate": 3.984342445114538e-06, "loss": 0.88522899, "num_input_tokens_seen": 24428390, "router_z_loss_clip": 3.46679688, "router_z_loss_mlp": 0.43286133, "step": 1142, "time_per_iteration": 2.6843442916870117 }, { "auxiliary_loss_clip": 0.01601757, "auxiliary_loss_mlp": 0.0035194, "balance_loss_clip": 1.27338457, "balance_loss_mlp": 0.31782225, "epoch": 0.06872087779948895, "flos": 29789768724480.0, "grad_norm": 101.96065626602888, "language_loss": 0.75338131, "learning_rate": 3.984293769566553e-06, "loss": 0.77291822, "num_input_tokens_seen": 24450810, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.34082031, "step": 1143, "time_per_iteration": 2.709531784057617 }, { "auxiliary_loss_clip": 0.01596759, "auxiliary_loss_mlp": 0.00384561, "balance_loss_clip": 1.26724696, "balance_loss_mlp": 0.34588927, "epoch": 0.06878100105215693, "flos": 26941118768640.0, "grad_norm": 8.603017299665016, "language_loss": 0.80269861, "learning_rate": 3.98424501877395e-06, "loss": 0.82251179, "num_input_tokens_seen": 24469965, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.38647461, "step": 1144, "time_per_iteration": 2.6498889923095703 }, { "auxiliary_loss_clip": 0.01636768, "auxiliary_loss_mlp": 0.00398318, "balance_loss_clip": 1.29371274, "balance_loss_mlp": 0.35637987, "epoch": 0.06884112430482489, "flos": 10670748946560.0, "grad_norm": 3.999698703659843, "language_loss": 1.00703073, "learning_rate": 3.984196192738577e-06, "loss": 1.02738166, "num_input_tokens_seen": 24486370, "router_z_loss_clip": 3.4296875, "router_z_loss_mlp": 0.41918945, "step": 1145, "time_per_iteration": 2.6486732959747314 }, { "auxiliary_loss_clip": 0.01637557, "auxiliary_loss_mlp": 0.00404592, "balance_loss_clip": 1.29367757, "balance_loss_mlp": 0.35883927, "epoch": 0.06890124755749286, "flos": 20193647898240.0, "grad_norm": 7.580126670843302, "language_loss": 0.91712755, "learning_rate": 3.984147291462285e-06, "loss": 0.93754905, "num_input_tokens_seen": 24503780, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.45751953, "step": 1146, "time_per_iteration": 2.632357597351074 }, { "auxiliary_loss_clip": 0.01605423, "auxiliary_loss_mlp": 0.00327038, "balance_loss_clip": 1.28163981, "balance_loss_mlp": 0.290178, "epoch": 0.06896137081016084, "flos": 20449224144000.0, "grad_norm": 12.59067332855627, "language_loss": 0.92195052, "learning_rate": 3.98409831494693e-06, "loss": 0.94127512, "num_input_tokens_seen": 24522320, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.36816406, "step": 1147, "time_per_iteration": 2.6410651206970215 }, { "auxiliary_loss_clip": 0.01635895, "auxiliary_loss_mlp": 0.00355845, "balance_loss_clip": 1.30349255, "balance_loss_mlp": 0.31881875, "epoch": 0.0690214940628288, "flos": 18368703555840.0, "grad_norm": 16.863136346462884, "language_loss": 0.92531264, "learning_rate": 3.984049263194367e-06, "loss": 0.94523001, "num_input_tokens_seen": 24540445, "router_z_loss_clip": 3.328125, "router_z_loss_mlp": 0.37060547, "step": 1148, "time_per_iteration": 2.6643481254577637 }, { "auxiliary_loss_clip": 0.01655751, "auxiliary_loss_mlp": 0.00366252, "balance_loss_clip": 1.31801224, "balance_loss_mlp": 0.32653135, "epoch": 0.06908161731549677, "flos": 20558033418240.0, "grad_norm": 17.20691771101131, "language_loss": 0.76093686, "learning_rate": 3.9840001362064575e-06, "loss": 0.7811569, "num_input_tokens_seen": 24557105, "router_z_loss_clip": 3.38085938, "router_z_loss_mlp": 0.39672852, "step": 1149, "time_per_iteration": 2.622317314147949 }, { "auxiliary_loss_clip": 0.01672064, "auxiliary_loss_mlp": 0.00381476, "balance_loss_clip": 1.32701421, "balance_loss_mlp": 0.33999118, "epoch": 0.06914174056816474, "flos": 27563666313600.0, "grad_norm": 3.3630760386636465, "language_loss": 0.90461552, "learning_rate": 3.983950933985064e-06, "loss": 0.92515093, "num_input_tokens_seen": 24578240, "router_z_loss_clip": 3.453125, "router_z_loss_mlp": 0.41479492, "step": 1150, "time_per_iteration": 2.6836493015289307 }, { "auxiliary_loss_clip": 0.01681058, "auxiliary_loss_mlp": 0.00367535, "balance_loss_clip": 1.34158707, "balance_loss_mlp": 0.32635987, "epoch": 0.06920186382083271, "flos": 15304015249920.0, "grad_norm": 36.22665717035775, "language_loss": 0.90929663, "learning_rate": 3.983901656532052e-06, "loss": 0.92978257, "num_input_tokens_seen": 24593585, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.41162109, "step": 1151, "time_per_iteration": 2.5954506397247314 }, { "auxiliary_loss_clip": 0.01692839, "auxiliary_loss_mlp": 0.00356976, "balance_loss_clip": 1.34626472, "balance_loss_mlp": 0.31830412, "epoch": 0.06926198707350067, "flos": 25191227894400.0, "grad_norm": 11.067330982503133, "language_loss": 0.92167258, "learning_rate": 3.983852303849291e-06, "loss": 0.94217074, "num_input_tokens_seen": 24613110, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.38647461, "step": 1152, "time_per_iteration": 2.64985728263855 }, { "auxiliary_loss_clip": 0.01733544, "auxiliary_loss_mlp": 0.00370901, "balance_loss_clip": 1.37383246, "balance_loss_mlp": 0.32905805, "epoch": 0.06932211032616864, "flos": 13256137146240.0, "grad_norm": 20.563340864918022, "language_loss": 0.97641873, "learning_rate": 3.983802875938651e-06, "loss": 0.99746323, "num_input_tokens_seen": 24628795, "router_z_loss_clip": 3.59765625, "router_z_loss_mlp": 0.41845703, "step": 1153, "time_per_iteration": 2.587677001953125 }, { "auxiliary_loss_clip": 0.01742356, "auxiliary_loss_mlp": 0.00329485, "balance_loss_clip": 1.38553739, "balance_loss_mlp": 0.28981233, "epoch": 0.06938223357883662, "flos": 24827381078400.0, "grad_norm": 10.418496116067969, "language_loss": 0.87752032, "learning_rate": 3.983753372802008e-06, "loss": 0.89823872, "num_input_tokens_seen": 24645480, "router_z_loss_clip": 3.5703125, "router_z_loss_mlp": 0.39648438, "step": 1154, "time_per_iteration": 2.689920425415039 }, { "auxiliary_loss_clip": 0.01747583, "auxiliary_loss_mlp": 0.00328271, "balance_loss_clip": 1.38908732, "balance_loss_mlp": 0.28886086, "epoch": 0.06944235683150458, "flos": 27267977554560.0, "grad_norm": 5.145383504792062, "language_loss": 0.81923866, "learning_rate": 3.983703794441237e-06, "loss": 0.83999717, "num_input_tokens_seen": 24664630, "router_z_loss_clip": 3.5859375, "router_z_loss_mlp": 0.39428711, "step": 1155, "time_per_iteration": 2.6897523403167725 }, { "auxiliary_loss_clip": 0.01787378, "auxiliary_loss_mlp": 0.00352341, "balance_loss_clip": 1.40646243, "balance_loss_mlp": 0.31099969, "epoch": 0.06950248008417255, "flos": 25808065176960.0, "grad_norm": 31.149074774946293, "language_loss": 0.76379859, "learning_rate": 3.98365414085822e-06, "loss": 0.78519571, "num_input_tokens_seen": 24684210, "router_z_loss_clip": 3.80859375, "router_z_loss_mlp": 0.41381836, "step": 1156, "time_per_iteration": 2.728524923324585 }, { "auxiliary_loss_clip": 0.01812423, "auxiliary_loss_mlp": 0.00340964, "balance_loss_clip": 1.42713916, "balance_loss_mlp": 0.29699922, "epoch": 0.06956260333684053, "flos": 22271546793600.0, "grad_norm": 12.713093012304856, "language_loss": 0.80121899, "learning_rate": 3.98360441205484e-06, "loss": 0.82275283, "num_input_tokens_seen": 24702490, "router_z_loss_clip": 3.85351562, "router_z_loss_mlp": 0.43920898, "step": 1157, "time_per_iteration": 2.6542296409606934 }, { "auxiliary_loss_clip": 0.01800264, "auxiliary_loss_mlp": 0.00342536, "balance_loss_clip": 1.41583931, "balance_loss_mlp": 0.30322045, "epoch": 0.0696227265895085, "flos": 29681390413440.0, "grad_norm": 14.809911364438436, "language_loss": 0.79372001, "learning_rate": 3.983554608032982e-06, "loss": 0.815148, "num_input_tokens_seen": 24724340, "router_z_loss_clip": 3.83984375, "router_z_loss_mlp": 0.39331055, "step": 1158, "time_per_iteration": 4.160548210144043 }, { "auxiliary_loss_clip": 0.0183108, "auxiliary_loss_mlp": 0.00333623, "balance_loss_clip": 1.43854368, "balance_loss_mlp": 0.2933062, "epoch": 0.06968284984217646, "flos": 25523545547520.0, "grad_norm": 9.705391720166851, "language_loss": 0.86484182, "learning_rate": 3.983504728794533e-06, "loss": 0.88648885, "num_input_tokens_seen": 24745550, "router_z_loss_clip": 3.921875, "router_z_loss_mlp": 0.40307617, "step": 1159, "time_per_iteration": 2.703179359436035 }, { "auxiliary_loss_clip": 0.01851809, "auxiliary_loss_mlp": 0.00341525, "balance_loss_clip": 1.44974697, "balance_loss_mlp": 0.29846621, "epoch": 0.06974297309484444, "flos": 20698192287360.0, "grad_norm": 6.058266436064423, "language_loss": 0.9052459, "learning_rate": 3.983454774341387e-06, "loss": 0.92717922, "num_input_tokens_seen": 24762575, "router_z_loss_clip": 4.02148438, "router_z_loss_mlp": 0.43066406, "step": 1160, "time_per_iteration": 2.6436238288879395 }, { "auxiliary_loss_clip": 0.01897252, "auxiliary_loss_mlp": 0.00354339, "balance_loss_clip": 1.4797399, "balance_loss_mlp": 0.31352174, "epoch": 0.0698030963475124, "flos": 26505199313280.0, "grad_norm": 22.190599681929076, "language_loss": 0.83230537, "learning_rate": 3.983404744675437e-06, "loss": 0.85482132, "num_input_tokens_seen": 24782605, "router_z_loss_clip": 4.1796875, "router_z_loss_mlp": 0.40820312, "step": 1161, "time_per_iteration": 4.166844606399536 }, { "auxiliary_loss_clip": 0.01923672, "auxiliary_loss_mlp": 0.00350492, "balance_loss_clip": 1.49595666, "balance_loss_mlp": 0.30922163, "epoch": 0.06986321960018037, "flos": 23040430346880.0, "grad_norm": 5.133298835517874, "language_loss": 0.89439559, "learning_rate": 3.9833546397985794e-06, "loss": 0.91713721, "num_input_tokens_seen": 24802910, "router_z_loss_clip": 4.27734375, "router_z_loss_mlp": 0.41259766, "step": 1162, "time_per_iteration": 2.7280614376068115 }, { "auxiliary_loss_clip": 0.01927346, "auxiliary_loss_mlp": 0.00342161, "balance_loss_clip": 1.49551845, "balance_loss_mlp": 0.30055657, "epoch": 0.06992334285284833, "flos": 28584822061440.0, "grad_norm": 24.826763187592892, "language_loss": 0.85314769, "learning_rate": 3.983304459712716e-06, "loss": 0.87584281, "num_input_tokens_seen": 24823305, "router_z_loss_clip": 4.3203125, "router_z_loss_mlp": 0.41625977, "step": 1163, "time_per_iteration": 4.137516975402832 }, { "auxiliary_loss_clip": 0.01976647, "auxiliary_loss_mlp": 0.00386264, "balance_loss_clip": 1.52781284, "balance_loss_mlp": 0.33860368, "epoch": 0.06998346610551631, "flos": 20595344670720.0, "grad_norm": 5.559898458919318, "language_loss": 0.8442781, "learning_rate": 3.983254204419749e-06, "loss": 0.86790717, "num_input_tokens_seen": 24842155, "router_z_loss_clip": 4.4921875, "router_z_loss_mlp": 0.47680664, "step": 1164, "time_per_iteration": 2.6722609996795654 }, { "auxiliary_loss_clip": 0.01991184, "auxiliary_loss_mlp": 0.00358153, "balance_loss_clip": 1.5383904, "balance_loss_mlp": 0.31309199, "epoch": 0.07004358935818428, "flos": 22528810978560.0, "grad_norm": 12.558368149832297, "language_loss": 0.79571867, "learning_rate": 3.983203873921583e-06, "loss": 0.81921202, "num_input_tokens_seen": 24862080, "router_z_loss_clip": 4.5234375, "router_z_loss_mlp": 0.45117188, "step": 1165, "time_per_iteration": 2.684443473815918 }, { "auxiliary_loss_clip": 0.0197024, "auxiliary_loss_mlp": 0.00335613, "balance_loss_clip": 1.5304004, "balance_loss_mlp": 0.29250675, "epoch": 0.07010371261085224, "flos": 28949997680640.0, "grad_norm": 34.75302646478748, "language_loss": 0.86790049, "learning_rate": 3.983153468220128e-06, "loss": 0.89095902, "num_input_tokens_seen": 24886165, "router_z_loss_clip": 4.40234375, "router_z_loss_mlp": 0.4309082, "step": 1166, "time_per_iteration": 2.729116201400757 }, { "auxiliary_loss_clip": 0.02002851, "auxiliary_loss_mlp": 0.00324148, "balance_loss_clip": 1.54772401, "balance_loss_mlp": 0.28314009, "epoch": 0.07016383586352022, "flos": 23659171050240.0, "grad_norm": 8.039851687618254, "language_loss": 0.90736967, "learning_rate": 3.983102987317295e-06, "loss": 0.93063962, "num_input_tokens_seen": 24905775, "router_z_loss_clip": 4.5546875, "router_z_loss_mlp": 0.40991211, "step": 1167, "time_per_iteration": 2.6508960723876953 }, { "auxiliary_loss_clip": 0.02001508, "auxiliary_loss_mlp": 0.00353214, "balance_loss_clip": 1.54645526, "balance_loss_mlp": 0.30901158, "epoch": 0.07022395911618819, "flos": 19792130693760.0, "grad_norm": 42.72310995107356, "language_loss": 0.98066354, "learning_rate": 3.983052431214997e-06, "loss": 1.00421071, "num_input_tokens_seen": 24924295, "router_z_loss_clip": 4.55078125, "router_z_loss_mlp": 0.44262695, "step": 1168, "time_per_iteration": 2.6517162322998047 }, { "auxiliary_loss_clip": 0.02005164, "auxiliary_loss_mlp": 0.00379899, "balance_loss_clip": 1.53919387, "balance_loss_mlp": 0.33133364, "epoch": 0.07028408236885615, "flos": 21689147675520.0, "grad_norm": 11.169331359162113, "language_loss": 0.94846225, "learning_rate": 3.983001799915153e-06, "loss": 0.97231293, "num_input_tokens_seen": 24943210, "router_z_loss_clip": 4.6640625, "router_z_loss_mlp": 0.48583984, "step": 1169, "time_per_iteration": 2.618039846420288 }, { "auxiliary_loss_clip": 0.02036618, "auxiliary_loss_mlp": 0.00376617, "balance_loss_clip": 1.56720948, "balance_loss_mlp": 0.33172256, "epoch": 0.07034420562152413, "flos": 25630271832960.0, "grad_norm": 10.894662129386642, "language_loss": 0.93560767, "learning_rate": 3.982951093419681e-06, "loss": 0.95974004, "num_input_tokens_seen": 24960360, "router_z_loss_clip": 4.6953125, "router_z_loss_mlp": 0.44873047, "step": 1170, "time_per_iteration": 2.6453754901885986 }, { "auxiliary_loss_clip": 0.0203113, "auxiliary_loss_mlp": 0.00332593, "balance_loss_clip": 1.5750525, "balance_loss_mlp": 0.29218054, "epoch": 0.0704043288741921, "flos": 20810449267200.0, "grad_norm": 9.179689398681868, "language_loss": 0.82112861, "learning_rate": 3.982900311730506e-06, "loss": 0.8447659, "num_input_tokens_seen": 24978290, "router_z_loss_clip": 4.5546875, "router_z_loss_mlp": 0.40405273, "step": 1171, "time_per_iteration": 2.606445074081421 }, { "auxiliary_loss_clip": 0.02034779, "auxiliary_loss_mlp": 0.0034623, "balance_loss_clip": 1.57007992, "balance_loss_mlp": 0.30424446, "epoch": 0.07046445212686006, "flos": 25593176062080.0, "grad_norm": 3.0473501297660066, "language_loss": 0.95480716, "learning_rate": 3.9828494548495514e-06, "loss": 0.97861731, "num_input_tokens_seen": 24997055, "router_z_loss_clip": 4.65234375, "router_z_loss_mlp": 0.41992188, "step": 1172, "time_per_iteration": 2.687406063079834 }, { "auxiliary_loss_clip": 0.02022102, "auxiliary_loss_mlp": 0.00368744, "balance_loss_clip": 1.55239677, "balance_loss_mlp": 0.32127506, "epoch": 0.07052457537952803, "flos": 25556978131200.0, "grad_norm": 96.05533046217592, "language_loss": 0.87490821, "learning_rate": 3.982798522778748e-06, "loss": 0.8988167, "num_input_tokens_seen": 25017490, "router_z_loss_clip": 4.69921875, "router_z_loss_mlp": 0.47460938, "step": 1173, "time_per_iteration": 2.7375686168670654 }, { "auxiliary_loss_clip": 0.02046676, "auxiliary_loss_mlp": 0.00380363, "balance_loss_clip": 1.57515335, "balance_loss_mlp": 0.33496839, "epoch": 0.070584698632196, "flos": 17968515154560.0, "grad_norm": 3.0069679790741453, "language_loss": 0.87586689, "learning_rate": 3.9827475155200245e-06, "loss": 0.90013731, "num_input_tokens_seen": 25035660, "router_z_loss_clip": 4.71484375, "router_z_loss_mlp": 0.45361328, "step": 1174, "time_per_iteration": 2.5884928703308105 }, { "auxiliary_loss_clip": 0.02027474, "auxiliary_loss_mlp": 0.00389971, "balance_loss_clip": 1.55896735, "balance_loss_mlp": 0.34257302, "epoch": 0.07064482188486397, "flos": 25370888745600.0, "grad_norm": 3.4081120625283265, "language_loss": 0.89958864, "learning_rate": 3.982696433075317e-06, "loss": 0.92376304, "num_input_tokens_seen": 25054785, "router_z_loss_clip": 4.6796875, "router_z_loss_mlp": 0.47436523, "step": 1175, "time_per_iteration": 2.712364673614502 }, { "auxiliary_loss_clip": 0.02035345, "auxiliary_loss_mlp": 0.00336522, "balance_loss_clip": 1.57401204, "balance_loss_mlp": 0.29045999, "epoch": 0.07070494513753194, "flos": 24899848767360.0, "grad_norm": 217.93869975188045, "language_loss": 0.91556942, "learning_rate": 3.982645275446563e-06, "loss": 0.93928814, "num_input_tokens_seen": 25075180, "router_z_loss_clip": 4.6171875, "router_z_loss_mlp": 0.46044922, "step": 1176, "time_per_iteration": 2.7098910808563232 }, { "auxiliary_loss_clip": 0.02028748, "auxiliary_loss_mlp": 0.00335689, "balance_loss_clip": 1.57773232, "balance_loss_mlp": 0.29017478, "epoch": 0.07076506839019991, "flos": 22338447874560.0, "grad_norm": 5.959343762549416, "language_loss": 0.81497651, "learning_rate": 3.982594042635701e-06, "loss": 0.83862084, "num_input_tokens_seen": 25093035, "router_z_loss_clip": 4.51171875, "router_z_loss_mlp": 0.45458984, "step": 1177, "time_per_iteration": 2.6549010276794434 }, { "auxiliary_loss_clip": 0.02014092, "auxiliary_loss_mlp": 0.00343284, "balance_loss_clip": 1.55788565, "balance_loss_mlp": 0.29881924, "epoch": 0.07082519164286788, "flos": 18660800954880.0, "grad_norm": 3.013587511572665, "language_loss": 0.903732, "learning_rate": 3.982542734644673e-06, "loss": 0.92730576, "num_input_tokens_seen": 25112520, "router_z_loss_clip": 4.5625, "router_z_loss_mlp": 0.44384766, "step": 1178, "time_per_iteration": 2.604640007019043 }, { "auxiliary_loss_clip": 0.02148813, "auxiliary_loss_mlp": 0.00108228, "balance_loss_clip": 1.80609822, "balance_loss_mlp": 0.08905933, "epoch": 0.07088531489553584, "flos": 63654107610240.0, "grad_norm": 0.8940067811555822, "language_loss": 0.63398254, "learning_rate": 3.982491351475427e-06, "loss": 0.65655291, "num_input_tokens_seen": 25177760, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.19140625, "step": 1179, "time_per_iteration": 3.2820568084716797 }, { "auxiliary_loss_clip": 0.01997068, "auxiliary_loss_mlp": 0.00334994, "balance_loss_clip": 1.55009174, "balance_loss_mlp": 0.28771591, "epoch": 0.07094543814820382, "flos": 21572688804480.0, "grad_norm": 23865.90907202281, "language_loss": 0.96034491, "learning_rate": 3.98243989312991e-06, "loss": 0.98366547, "num_input_tokens_seen": 25195260, "router_z_loss_clip": 4.47265625, "router_z_loss_mlp": 0.47265625, "step": 1180, "time_per_iteration": 2.6100945472717285 }, { "auxiliary_loss_clip": 0.0199728, "auxiliary_loss_mlp": 0.00390421, "balance_loss_clip": 1.54114878, "balance_loss_mlp": 0.33990011, "epoch": 0.07100556140087179, "flos": 22089946608000.0, "grad_norm": 9.57608657555911, "language_loss": 0.94702613, "learning_rate": 3.982388359610074e-06, "loss": 0.9709031, "num_input_tokens_seen": 25212740, "router_z_loss_clip": 4.5625, "router_z_loss_mlp": 0.50463867, "step": 1181, "time_per_iteration": 2.6121630668640137 }, { "auxiliary_loss_clip": 0.01973638, "auxiliary_loss_mlp": 0.00340385, "balance_loss_clip": 1.54010332, "balance_loss_mlp": 0.29403669, "epoch": 0.07106568465353975, "flos": 47922286400640.0, "grad_norm": 9.963702778957204, "language_loss": 0.89889759, "learning_rate": 3.9823367509178725e-06, "loss": 0.92203772, "num_input_tokens_seen": 25236420, "router_z_loss_clip": 4.33203125, "router_z_loss_mlp": 0.46337891, "step": 1182, "time_per_iteration": 2.844759464263916 }, { "auxiliary_loss_clip": 0.01928846, "auxiliary_loss_mlp": 0.00300602, "balance_loss_clip": 1.5182929, "balance_loss_mlp": 0.25911701, "epoch": 0.07112580790620772, "flos": 23440798316160.0, "grad_norm": 137.72326808533958, "language_loss": 0.88906991, "learning_rate": 3.982285067055262e-06, "loss": 0.91136438, "num_input_tokens_seen": 25255120, "router_z_loss_clip": 4.10742188, "router_z_loss_mlp": 0.41503906, "step": 1183, "time_per_iteration": 2.7407312393188477 }, { "auxiliary_loss_clip": 0.0193812, "auxiliary_loss_mlp": 0.00355591, "balance_loss_clip": 1.51941478, "balance_loss_mlp": 0.3101486, "epoch": 0.0711859311588757, "flos": 31868888682240.0, "grad_norm": 4.035364526223806, "language_loss": 0.87759435, "learning_rate": 3.982233308024204e-06, "loss": 0.90053153, "num_input_tokens_seen": 25275150, "router_z_loss_clip": 4.18359375, "router_z_loss_mlp": 0.45458984, "step": 1184, "time_per_iteration": 2.797081708908081 }, { "auxiliary_loss_clip": 0.01925962, "auxiliary_loss_mlp": 0.00297677, "balance_loss_clip": 1.5160321, "balance_loss_mlp": 0.25550067, "epoch": 0.07124605441154366, "flos": 19610315026560.0, "grad_norm": 4.100639930432864, "language_loss": 0.83988082, "learning_rate": 3.98218147382666e-06, "loss": 0.86211723, "num_input_tokens_seen": 25293680, "router_z_loss_clip": 4.09179688, "router_z_loss_mlp": 0.421875, "step": 1185, "time_per_iteration": 2.6754298210144043 }, { "auxiliary_loss_clip": 0.01918268, "auxiliary_loss_mlp": 0.00308039, "balance_loss_clip": 1.50033057, "balance_loss_mlp": 0.2645275, "epoch": 0.07130617766421163, "flos": 14684448533760.0, "grad_norm": 133.5844426362042, "language_loss": 0.73247939, "learning_rate": 3.982129564464596e-06, "loss": 0.75474244, "num_input_tokens_seen": 25310050, "router_z_loss_clip": 4.17773438, "router_z_loss_mlp": 0.43481445, "step": 1186, "time_per_iteration": 2.6543211936950684 }, { "auxiliary_loss_clip": 0.01906878, "auxiliary_loss_mlp": 0.00320587, "balance_loss_clip": 1.49768758, "balance_loss_mlp": 0.27688473, "epoch": 0.07136630091687961, "flos": 26067915141120.0, "grad_norm": 9.64702243732301, "language_loss": 0.76121724, "learning_rate": 3.98207757993998e-06, "loss": 0.78349191, "num_input_tokens_seen": 25331020, "router_z_loss_clip": 4.08984375, "router_z_loss_mlp": 0.43701172, "step": 1187, "time_per_iteration": 2.7187769412994385 }, { "auxiliary_loss_clip": 0.0187841, "auxiliary_loss_mlp": 0.0025653, "balance_loss_clip": 1.48427939, "balance_loss_mlp": 0.21409161, "epoch": 0.07142642416954757, "flos": 15669190869120.0, "grad_norm": 21.134611372777158, "language_loss": 0.87557077, "learning_rate": 3.9820255202547845e-06, "loss": 0.89692026, "num_input_tokens_seen": 25347875, "router_z_loss_clip": 3.94140625, "router_z_loss_mlp": 0.42456055, "step": 1188, "time_per_iteration": 2.606985330581665 }, { "auxiliary_loss_clip": 0.01881321, "auxiliary_loss_mlp": 0.00308404, "balance_loss_clip": 1.4825598, "balance_loss_mlp": 0.26541704, "epoch": 0.07148654742221554, "flos": 19755322231680.0, "grad_norm": 24.66476086861671, "language_loss": 0.92398912, "learning_rate": 3.981973385410981e-06, "loss": 0.94588637, "num_input_tokens_seen": 25366715, "router_z_loss_clip": 3.98632812, "router_z_loss_mlp": 0.42993164, "step": 1189, "time_per_iteration": 2.5759904384613037 }, { "auxiliary_loss_clip": 0.0187238, "auxiliary_loss_mlp": 0.00296084, "balance_loss_clip": 1.47655392, "balance_loss_mlp": 0.2529301, "epoch": 0.07154667067488352, "flos": 23471824688640.0, "grad_norm": 52.94855472970312, "language_loss": 0.83732516, "learning_rate": 3.9819211754105494e-06, "loss": 0.85900974, "num_input_tokens_seen": 25385450, "router_z_loss_clip": 3.95703125, "router_z_loss_mlp": 0.43164062, "step": 1190, "time_per_iteration": 2.6727817058563232 }, { "auxiliary_loss_clip": 0.01848568, "auxiliary_loss_mlp": 0.00302119, "balance_loss_clip": 1.45050132, "balance_loss_mlp": 0.25691503, "epoch": 0.07160679392755148, "flos": 18332936588160.0, "grad_norm": 2.524916110046282, "language_loss": 0.83949542, "learning_rate": 3.981868890255468e-06, "loss": 0.86100233, "num_input_tokens_seen": 25403940, "router_z_loss_clip": 3.98046875, "router_z_loss_mlp": 0.45214844, "step": 1191, "time_per_iteration": 2.5905675888061523 }, { "auxiliary_loss_clip": 0.01844228, "auxiliary_loss_mlp": 0.0031116, "balance_loss_clip": 1.44942403, "balance_loss_mlp": 0.27031875, "epoch": 0.07166691718021945, "flos": 17747017937280.0, "grad_norm": 38.086326928987624, "language_loss": 0.8390466, "learning_rate": 3.981816529947719e-06, "loss": 0.86060047, "num_input_tokens_seen": 25420410, "router_z_loss_clip": 3.94921875, "router_z_loss_mlp": 0.40844727, "step": 1192, "time_per_iteration": 2.6470110416412354 }, { "auxiliary_loss_clip": 0.01835179, "auxiliary_loss_mlp": 0.002984, "balance_loss_clip": 1.44467187, "balance_loss_mlp": 0.25846496, "epoch": 0.07172704043288743, "flos": 22451925916800.0, "grad_norm": 43.53075509293043, "language_loss": 0.85164464, "learning_rate": 3.9817640944892896e-06, "loss": 0.87298042, "num_input_tokens_seen": 25439415, "router_z_loss_clip": 3.91015625, "router_z_loss_mlp": 0.39941406, "step": 1193, "time_per_iteration": 2.6253979206085205 }, { "auxiliary_loss_clip": 0.01828407, "auxiliary_loss_mlp": 0.00274522, "balance_loss_clip": 1.44100332, "balance_loss_mlp": 0.23358509, "epoch": 0.07178716368555539, "flos": 23222210100480.0, "grad_norm": 15.554209254832838, "language_loss": 0.92416239, "learning_rate": 3.981711583882166e-06, "loss": 0.94519168, "num_input_tokens_seen": 25458715, "router_z_loss_clip": 3.87304688, "router_z_loss_mlp": 0.40942383, "step": 1194, "time_per_iteration": 2.629704475402832 }, { "auxiliary_loss_clip": 0.01804624, "auxiliary_loss_mlp": 0.00267411, "balance_loss_clip": 1.42643046, "balance_loss_mlp": 0.22690405, "epoch": 0.07184728693822336, "flos": 25150828072320.0, "grad_norm": 2.7532576868593286, "language_loss": 0.87093425, "learning_rate": 3.981658998128341e-06, "loss": 0.89165455, "num_input_tokens_seen": 25477985, "router_z_loss_clip": 3.78125, "router_z_loss_mlp": 0.40478516, "step": 1195, "time_per_iteration": 2.642906904220581 }, { "auxiliary_loss_clip": 0.01794631, "auxiliary_loss_mlp": 0.00280913, "balance_loss_clip": 1.41756892, "balance_loss_mlp": 0.24121594, "epoch": 0.07190741019089132, "flos": 22711237176960.0, "grad_norm": 39.030288273948024, "language_loss": 0.83658838, "learning_rate": 3.981606337229808e-06, "loss": 0.85734379, "num_input_tokens_seen": 25497110, "router_z_loss_clip": 3.76953125, "router_z_loss_mlp": 0.39697266, "step": 1196, "time_per_iteration": 2.6311964988708496 }, { "auxiliary_loss_clip": 0.01786964, "auxiliary_loss_mlp": 0.00307228, "balance_loss_clip": 1.4012084, "balance_loss_mlp": 0.26512301, "epoch": 0.0719675334435593, "flos": 29349791032320.0, "grad_norm": 33.47031376312383, "language_loss": 0.81230927, "learning_rate": 3.9815536011885655e-06, "loss": 0.83325112, "num_input_tokens_seen": 25516555, "router_z_loss_clip": 3.85546875, "router_z_loss_mlp": 0.42114258, "step": 1197, "time_per_iteration": 2.6679258346557617 }, { "auxiliary_loss_clip": 0.01770838, "auxiliary_loss_mlp": 0.00282168, "balance_loss_clip": 1.39186382, "balance_loss_mlp": 0.24559422, "epoch": 0.07202765669622727, "flos": 17639788861440.0, "grad_norm": 9.64301881290966, "language_loss": 0.9137944, "learning_rate": 3.98150079000661e-06, "loss": 0.93432438, "num_input_tokens_seen": 25533895, "router_z_loss_clip": 3.79101562, "router_z_loss_mlp": 0.36572266, "step": 1198, "time_per_iteration": 2.5990424156188965 }, { "auxiliary_loss_clip": 0.01760693, "auxiliary_loss_mlp": 0.00293016, "balance_loss_clip": 1.38390839, "balance_loss_mlp": 0.25505954, "epoch": 0.07208777994889523, "flos": 21434038306560.0, "grad_norm": 39.15808558529027, "language_loss": 0.90738612, "learning_rate": 3.981447903685947e-06, "loss": 0.9279232, "num_input_tokens_seen": 25554195, "router_z_loss_clip": 3.76367188, "router_z_loss_mlp": 0.37963867, "step": 1199, "time_per_iteration": 2.6991679668426514 }, { "auxiliary_loss_clip": 0.01782043, "auxiliary_loss_mlp": 0.0031653, "balance_loss_clip": 1.39591277, "balance_loss_mlp": 0.2761423, "epoch": 0.07214790320156321, "flos": 26940867373440.0, "grad_norm": 81.3456418493951, "language_loss": 0.81673861, "learning_rate": 3.981394942228581e-06, "loss": 0.83772433, "num_input_tokens_seen": 25574155, "router_z_loss_clip": 3.859375, "router_z_loss_mlp": 0.40380859, "step": 1200, "time_per_iteration": 4.170860767364502 }, { "auxiliary_loss_clip": 0.01769445, "auxiliary_loss_mlp": 0.00260656, "balance_loss_clip": 1.39191413, "balance_loss_mlp": 0.22453588, "epoch": 0.07220802645423118, "flos": 23879949995520.0, "grad_norm": 133.14806697535727, "language_loss": 0.88566136, "learning_rate": 3.98134190563652e-06, "loss": 0.90596241, "num_input_tokens_seen": 25592735, "router_z_loss_clip": 3.77539062, "router_z_loss_mlp": 0.36157227, "step": 1201, "time_per_iteration": 2.81274676322937 }, { "auxiliary_loss_clip": 0.01762425, "auxiliary_loss_mlp": 0.00302936, "balance_loss_clip": 1.37606883, "balance_loss_mlp": 0.26490796, "epoch": 0.07226814970689914, "flos": 19243631036160.0, "grad_norm": 16.065357553284617, "language_loss": 0.7752313, "learning_rate": 3.981288793911775e-06, "loss": 0.79588497, "num_input_tokens_seen": 25611510, "router_z_loss_clip": 3.86523438, "router_z_loss_mlp": 0.38012695, "step": 1202, "time_per_iteration": 2.620327949523926 }, { "auxiliary_loss_clip": 0.01778451, "auxiliary_loss_mlp": 0.00303594, "balance_loss_clip": 1.39132261, "balance_loss_mlp": 0.26570928, "epoch": 0.07232827295956712, "flos": 19172025273600.0, "grad_norm": 13.595155062122313, "language_loss": 0.92762685, "learning_rate": 3.98123560705636e-06, "loss": 0.94844735, "num_input_tokens_seen": 25629560, "router_z_loss_clip": 3.87109375, "router_z_loss_mlp": 0.37890625, "step": 1203, "time_per_iteration": 4.1546101570129395 }, { "auxiliary_loss_clip": 0.01781765, "auxiliary_loss_mlp": 0.00329685, "balance_loss_clip": 1.39342701, "balance_loss_mlp": 0.29039347, "epoch": 0.07238839621223508, "flos": 17639752947840.0, "grad_norm": 17532.39370600667, "language_loss": 0.86552793, "learning_rate": 3.981182345072293e-06, "loss": 0.88664246, "num_input_tokens_seen": 25648330, "router_z_loss_clip": 3.88671875, "router_z_loss_mlp": 0.39306641, "step": 1204, "time_per_iteration": 2.6317145824432373 }, { "auxiliary_loss_clip": 0.01758173, "auxiliary_loss_mlp": 0.0031316, "balance_loss_clip": 1.37398362, "balance_loss_mlp": 0.27236617, "epoch": 0.07244851946490305, "flos": 28292401440000.0, "grad_norm": 22.646117281179823, "language_loss": 0.8769682, "learning_rate": 3.981129007961593e-06, "loss": 0.89768147, "num_input_tokens_seen": 25669470, "router_z_loss_clip": 3.84375, "router_z_loss_mlp": 0.40820312, "step": 1205, "time_per_iteration": 4.085329294204712 }, { "auxiliary_loss_clip": 0.01782191, "auxiliary_loss_mlp": 0.00362068, "balance_loss_clip": 1.38668084, "balance_loss_mlp": 0.31638706, "epoch": 0.07250864271757101, "flos": 22564829341440.0, "grad_norm": 2.534414062526998, "language_loss": 0.81857502, "learning_rate": 3.981075595726283e-06, "loss": 0.84001756, "num_input_tokens_seen": 25690470, "router_z_loss_clip": 3.95117188, "router_z_loss_mlp": 0.45678711, "step": 1206, "time_per_iteration": 2.640543222427368 }, { "auxiliary_loss_clip": 0.01776629, "auxiliary_loss_mlp": 0.00315558, "balance_loss_clip": 1.38480473, "balance_loss_mlp": 0.27569401, "epoch": 0.072568765970239, "flos": 21762405463680.0, "grad_norm": 4.860574613242271, "language_loss": 0.82871318, "learning_rate": 3.981022108368387e-06, "loss": 0.84963512, "num_input_tokens_seen": 25709205, "router_z_loss_clip": 3.9140625, "router_z_loss_mlp": 0.39868164, "step": 1207, "time_per_iteration": 2.6494531631469727 }, { "auxiliary_loss_clip": 0.01773117, "auxiliary_loss_mlp": 0.00322019, "balance_loss_clip": 1.38668418, "balance_loss_mlp": 0.28220338, "epoch": 0.07262888922290696, "flos": 25519702792320.0, "grad_norm": 17.485206470674285, "language_loss": 0.86022907, "learning_rate": 3.9809685458899345e-06, "loss": 0.88118052, "num_input_tokens_seen": 25728485, "router_z_loss_clip": 3.859375, "router_z_loss_mlp": 0.39794922, "step": 1208, "time_per_iteration": 2.7001681327819824 }, { "auxiliary_loss_clip": 0.01763255, "auxiliary_loss_mlp": 0.00293591, "balance_loss_clip": 1.37545252, "balance_loss_mlp": 0.25444287, "epoch": 0.07268901247557492, "flos": 21246548290560.0, "grad_norm": 17.789517354924158, "language_loss": 0.84867996, "learning_rate": 3.980914908292955e-06, "loss": 0.86924839, "num_input_tokens_seen": 25747730, "router_z_loss_clip": 3.87890625, "router_z_loss_mlp": 0.39160156, "step": 1209, "time_per_iteration": 2.6945884227752686 }, { "auxiliary_loss_clip": 0.0175133, "auxiliary_loss_mlp": 0.00265919, "balance_loss_clip": 1.36699462, "balance_loss_mlp": 0.22860655, "epoch": 0.0727491357282429, "flos": 25479302970240.0, "grad_norm": 4.116735519190709, "language_loss": 0.88198376, "learning_rate": 3.980861195579486e-06, "loss": 0.90215629, "num_input_tokens_seen": 25768050, "router_z_loss_clip": 3.84570312, "router_z_loss_mlp": 0.37280273, "step": 1210, "time_per_iteration": 2.654256582260132 }, { "auxiliary_loss_clip": 0.0175576, "auxiliary_loss_mlp": 0.00279568, "balance_loss_clip": 1.37871587, "balance_loss_mlp": 0.24323305, "epoch": 0.07280925898091087, "flos": 24462169545600.0, "grad_norm": 12391.021023178184, "language_loss": 0.91680288, "learning_rate": 3.98080740775156e-06, "loss": 0.93715608, "num_input_tokens_seen": 25787985, "router_z_loss_clip": 3.77148438, "router_z_loss_mlp": 0.36303711, "step": 1211, "time_per_iteration": 2.7109155654907227 }, { "auxiliary_loss_clip": 0.01752315, "auxiliary_loss_mlp": 0.00325872, "balance_loss_clip": 1.36802518, "balance_loss_mlp": 0.28455377, "epoch": 0.07286938223357883, "flos": 18288191220480.0, "grad_norm": 84.89166911499264, "language_loss": 0.99883854, "learning_rate": 3.98075354481122e-06, "loss": 1.01962042, "num_input_tokens_seen": 25803620, "router_z_loss_clip": 3.84765625, "router_z_loss_mlp": 0.41333008, "step": 1212, "time_per_iteration": 2.6248910427093506 }, { "auxiliary_loss_clip": 0.01754823, "auxiliary_loss_mlp": 0.00306164, "balance_loss_clip": 1.37429309, "balance_loss_mlp": 0.26730192, "epoch": 0.07292950548624681, "flos": 21214803646080.0, "grad_norm": 96.34855686279157, "language_loss": 0.80296028, "learning_rate": 3.9806996067605055e-06, "loss": 0.82357013, "num_input_tokens_seen": 25823315, "router_z_loss_clip": 3.80859375, "router_z_loss_mlp": 0.38842773, "step": 1213, "time_per_iteration": 2.636127471923828 }, { "auxiliary_loss_clip": 0.01730965, "auxiliary_loss_mlp": 0.00298987, "balance_loss_clip": 1.35703671, "balance_loss_mlp": 0.26064903, "epoch": 0.07298962873891478, "flos": 24642009964800.0, "grad_norm": 4.9462898932934145, "language_loss": 0.91152471, "learning_rate": 3.980645593601465e-06, "loss": 0.93182421, "num_input_tokens_seen": 25842605, "router_z_loss_clip": 3.7421875, "router_z_loss_mlp": 0.38378906, "step": 1214, "time_per_iteration": 2.6319620609283447 }, { "auxiliary_loss_clip": 0.01729426, "auxiliary_loss_mlp": 0.00311594, "balance_loss_clip": 1.35041261, "balance_loss_mlp": 0.27359009, "epoch": 0.07304975199158274, "flos": 27052765217280.0, "grad_norm": 26.378948035232142, "language_loss": 0.92744732, "learning_rate": 3.980591505336144e-06, "loss": 0.9478575, "num_input_tokens_seen": 25863030, "router_z_loss_clip": 3.79296875, "router_z_loss_mlp": 0.37988281, "step": 1215, "time_per_iteration": 2.664077043533325 }, { "auxiliary_loss_clip": 0.0171555, "auxiliary_loss_mlp": 0.00323434, "balance_loss_clip": 1.34287202, "balance_loss_mlp": 0.28199649, "epoch": 0.07310987524425071, "flos": 33549544091520.0, "grad_norm": 280.32436371514217, "language_loss": 0.8826263, "learning_rate": 3.980537341966595e-06, "loss": 0.90301609, "num_input_tokens_seen": 25888015, "router_z_loss_clip": 3.72460938, "router_z_loss_mlp": 0.41430664, "step": 1216, "time_per_iteration": 2.767228603363037 }, { "auxiliary_loss_clip": 0.01692427, "auxiliary_loss_mlp": 0.00284548, "balance_loss_clip": 1.33269238, "balance_loss_mlp": 0.24685387, "epoch": 0.07316999849691869, "flos": 28110944908800.0, "grad_norm": 8.511851482279422, "language_loss": 0.85677612, "learning_rate": 3.980483103494872e-06, "loss": 0.87654591, "num_input_tokens_seen": 25908660, "router_z_loss_clip": 3.59765625, "router_z_loss_mlp": 0.37719727, "step": 1217, "time_per_iteration": 2.6749603748321533 }, { "auxiliary_loss_clip": 0.0168151, "auxiliary_loss_mlp": 0.00283002, "balance_loss_clip": 1.32201362, "balance_loss_mlp": 0.2453322, "epoch": 0.07323012174958665, "flos": 14392602529920.0, "grad_norm": 236.37909431363445, "language_loss": 0.93766129, "learning_rate": 3.98042878992303e-06, "loss": 0.95730639, "num_input_tokens_seen": 25927215, "router_z_loss_clip": 3.59765625, "router_z_loss_mlp": 0.37670898, "step": 1218, "time_per_iteration": 2.6020755767822266 }, { "auxiliary_loss_clip": 0.01671514, "auxiliary_loss_mlp": 0.00294243, "balance_loss_clip": 1.31294298, "balance_loss_mlp": 0.25542864, "epoch": 0.07329024500225462, "flos": 21616428591360.0, "grad_norm": 6.493228085037835, "language_loss": 0.94410551, "learning_rate": 3.9803744012531305e-06, "loss": 0.96376312, "num_input_tokens_seen": 25945500, "router_z_loss_clip": 3.58789062, "router_z_loss_mlp": 0.38818359, "step": 1219, "time_per_iteration": 2.6060001850128174 }, { "auxiliary_loss_clip": 0.01658501, "auxiliary_loss_mlp": 0.00288324, "balance_loss_clip": 1.30809283, "balance_loss_mlp": 0.24996282, "epoch": 0.0733503682549226, "flos": 13224141106560.0, "grad_norm": 116.40844388325014, "language_loss": 0.93935961, "learning_rate": 3.980319937487235e-06, "loss": 0.95882785, "num_input_tokens_seen": 25963105, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.38378906, "step": 1220, "time_per_iteration": 2.634206771850586 }, { "auxiliary_loss_clip": 0.01658668, "auxiliary_loss_mlp": 0.0029836, "balance_loss_clip": 1.30339968, "balance_loss_mlp": 0.25763839, "epoch": 0.07341049150759056, "flos": 20886975192960.0, "grad_norm": 49.95680182421531, "language_loss": 0.8629232, "learning_rate": 3.98026539862741e-06, "loss": 0.8824935, "num_input_tokens_seen": 25981690, "router_z_loss_clip": 3.55273438, "router_z_loss_mlp": 0.40698242, "step": 1221, "time_per_iteration": 2.5993690490722656 }, { "auxiliary_loss_clip": 0.01655561, "auxiliary_loss_mlp": 0.0029513, "balance_loss_clip": 1.30015111, "balance_loss_mlp": 0.25512385, "epoch": 0.07347061476025853, "flos": 15413614623360.0, "grad_norm": 19.204668160790856, "language_loss": 1.00942874, "learning_rate": 3.980210784675722e-06, "loss": 1.02893555, "num_input_tokens_seen": 25999890, "router_z_loss_clip": 3.55273438, "router_z_loss_mlp": 0.40039062, "step": 1222, "time_per_iteration": 2.6222894191741943 }, { "auxiliary_loss_clip": 0.01657953, "auxiliary_loss_mlp": 0.00263511, "balance_loss_clip": 1.29812622, "balance_loss_mlp": 0.22574517, "epoch": 0.0735307380129265, "flos": 11108859131520.0, "grad_norm": 6.166627177130085, "language_loss": 0.99910802, "learning_rate": 3.980156095634242e-06, "loss": 1.01832271, "num_input_tokens_seen": 26016445, "router_z_loss_clip": 3.59765625, "router_z_loss_mlp": 0.37768555, "step": 1223, "time_per_iteration": 2.863445281982422 }, { "auxiliary_loss_clip": 0.01660513, "auxiliary_loss_mlp": 0.00258814, "balance_loss_clip": 1.30219626, "balance_loss_mlp": 0.22057119, "epoch": 0.07359086126559447, "flos": 23732392924800.0, "grad_norm": 85.62252114613987, "language_loss": 0.91951895, "learning_rate": 3.980101331505045e-06, "loss": 0.93871218, "num_input_tokens_seen": 26036080, "router_z_loss_clip": 3.5859375, "router_z_loss_mlp": 0.38256836, "step": 1224, "time_per_iteration": 2.7737972736358643 }, { "auxiliary_loss_clip": 0.01687969, "auxiliary_loss_mlp": 0.00290875, "balance_loss_clip": 1.31576633, "balance_loss_mlp": 0.24841282, "epoch": 0.07365098451826244, "flos": 20993270515200.0, "grad_norm": 2.8043529514526355, "language_loss": 0.90741557, "learning_rate": 3.9800464922902076e-06, "loss": 0.92720401, "num_input_tokens_seen": 26055805, "router_z_loss_clip": 3.71875, "router_z_loss_mlp": 0.42456055, "step": 1225, "time_per_iteration": 2.6662776470184326 }, { "auxiliary_loss_clip": 0.01702346, "auxiliary_loss_mlp": 0.00316492, "balance_loss_clip": 1.32743192, "balance_loss_mlp": 0.27712899, "epoch": 0.0737111077709304, "flos": 19933582452480.0, "grad_norm": 13.040113648083024, "language_loss": 0.96725172, "learning_rate": 3.979991577991808e-06, "loss": 0.98744011, "num_input_tokens_seen": 26073905, "router_z_loss_clip": 3.75, "router_z_loss_mlp": 0.39355469, "step": 1226, "time_per_iteration": 2.6351284980773926 }, { "auxiliary_loss_clip": 0.01723308, "auxiliary_loss_mlp": 0.0026638, "balance_loss_clip": 1.33294439, "balance_loss_mlp": 0.22603986, "epoch": 0.07377123102359838, "flos": 16581537342720.0, "grad_norm": 13.750423025944169, "language_loss": 0.90023571, "learning_rate": 3.97993658861193e-06, "loss": 0.92013264, "num_input_tokens_seen": 26091700, "router_z_loss_clip": 3.90234375, "router_z_loss_mlp": 0.40356445, "step": 1227, "time_per_iteration": 2.5911548137664795 }, { "auxiliary_loss_clip": 0.01722762, "auxiliary_loss_mlp": 0.00251349, "balance_loss_clip": 1.33773184, "balance_loss_mlp": 0.21105593, "epoch": 0.07383135427626634, "flos": 28328563457280.0, "grad_norm": 8.900785442979558, "language_loss": 0.9091475, "learning_rate": 3.9798815241526575e-06, "loss": 0.92888862, "num_input_tokens_seen": 26114105, "router_z_loss_clip": 3.84960938, "router_z_loss_mlp": 0.40283203, "step": 1228, "time_per_iteration": 2.7142958641052246 }, { "auxiliary_loss_clip": 0.01770463, "auxiliary_loss_mlp": 0.00267699, "balance_loss_clip": 1.36320162, "balance_loss_mlp": 0.227001, "epoch": 0.07389147752893431, "flos": 20047168235520.0, "grad_norm": 31.11147435652375, "language_loss": 0.87942225, "learning_rate": 3.97982638461608e-06, "loss": 0.89980388, "num_input_tokens_seen": 26131165, "router_z_loss_clip": 4.06640625, "router_z_loss_mlp": 0.40722656, "step": 1229, "time_per_iteration": 2.6391875743865967 }, { "auxiliary_loss_clip": 0.01792303, "auxiliary_loss_mlp": 0.0030875, "balance_loss_clip": 1.37106538, "balance_loss_mlp": 0.26497626, "epoch": 0.07395160078160229, "flos": 18114132890880.0, "grad_norm": 15.452870379079627, "language_loss": 0.87003779, "learning_rate": 3.979771170004287e-06, "loss": 0.89104831, "num_input_tokens_seen": 26150040, "router_z_loss_clip": 4.2109375, "router_z_loss_mlp": 0.43774414, "step": 1230, "time_per_iteration": 2.607898235321045 }, { "auxiliary_loss_clip": 0.01844256, "auxiliary_loss_mlp": 0.0033389, "balance_loss_clip": 1.40002084, "balance_loss_mlp": 0.290999, "epoch": 0.07401172403427025, "flos": 23586918842880.0, "grad_norm": 2056.2328742958885, "language_loss": 0.8793844, "learning_rate": 3.979715880319372e-06, "loss": 0.90116584, "num_input_tokens_seen": 26169380, "router_z_loss_clip": 4.43359375, "router_z_loss_mlp": 0.42871094, "step": 1231, "time_per_iteration": 2.64737606048584 }, { "auxiliary_loss_clip": 0.01850298, "auxiliary_loss_mlp": 0.00339964, "balance_loss_clip": 1.39627111, "balance_loss_mlp": 0.30024299, "epoch": 0.07407184728693822, "flos": 26359904799360.0, "grad_norm": 30.354822010416864, "language_loss": 1.02926707, "learning_rate": 3.979660515563434e-06, "loss": 1.05116963, "num_input_tokens_seen": 26189420, "router_z_loss_clip": 4.54296875, "router_z_loss_mlp": 0.39746094, "step": 1232, "time_per_iteration": 2.6508588790893555 }, { "auxiliary_loss_clip": 0.01884588, "auxiliary_loss_mlp": 0.00391555, "balance_loss_clip": 1.41314435, "balance_loss_mlp": 0.35119033, "epoch": 0.0741319705396062, "flos": 22200443821440.0, "grad_norm": 32.68244041767896, "language_loss": 0.87297297, "learning_rate": 3.979605075738569e-06, "loss": 0.89573443, "num_input_tokens_seen": 26209300, "router_z_loss_clip": 4.71484375, "router_z_loss_mlp": 0.40380859, "step": 1233, "time_per_iteration": 2.636453628540039 }, { "auxiliary_loss_clip": 0.01924915, "auxiliary_loss_mlp": 0.00457656, "balance_loss_clip": 1.43229795, "balance_loss_mlp": 0.4144786, "epoch": 0.07419209379227416, "flos": 39200482523520.0, "grad_norm": 12.729354984209062, "language_loss": 0.77940959, "learning_rate": 3.979549560846883e-06, "loss": 0.80323529, "num_input_tokens_seen": 26228110, "router_z_loss_clip": 4.92578125, "router_z_loss_mlp": 0.43164062, "step": 1234, "time_per_iteration": 2.7591991424560547 }, { "auxiliary_loss_clip": 0.01948153, "auxiliary_loss_mlp": 0.00442524, "balance_loss_clip": 1.44320703, "balance_loss_mlp": 0.4005383, "epoch": 0.07425221704494213, "flos": 22781657790720.0, "grad_norm": 175.93980104519426, "language_loss": 0.84322345, "learning_rate": 3.979493970890478e-06, "loss": 0.86713016, "num_input_tokens_seen": 26247020, "router_z_loss_clip": 5.0546875, "router_z_loss_mlp": 0.41967773, "step": 1235, "time_per_iteration": 2.6482746601104736 }, { "auxiliary_loss_clip": 0.01983477, "auxiliary_loss_mlp": 0.00442699, "balance_loss_clip": 1.45647144, "balance_loss_mlp": 0.40157139, "epoch": 0.0743123402976101, "flos": 22272983337600.0, "grad_norm": 11.588088671415957, "language_loss": 0.88405347, "learning_rate": 3.979438305871464e-06, "loss": 0.9083153, "num_input_tokens_seen": 26265750, "router_z_loss_clip": 5.265625, "router_z_loss_mlp": 0.41113281, "step": 1236, "time_per_iteration": 2.6108901500701904 }, { "auxiliary_loss_clip": 0.01998804, "auxiliary_loss_mlp": 0.00545399, "balance_loss_clip": 1.45789552, "balance_loss_mlp": 0.49771485, "epoch": 0.07437246355027807, "flos": 29315029645440.0, "grad_norm": 5.045829172421548, "language_loss": 0.84263647, "learning_rate": 3.979382565791951e-06, "loss": 0.86807847, "num_input_tokens_seen": 26287905, "router_z_loss_clip": 5.41015625, "router_z_loss_mlp": 0.47631836, "step": 1237, "time_per_iteration": 2.687476873397827 }, { "auxiliary_loss_clip": 0.02015388, "auxiliary_loss_mlp": 0.00473895, "balance_loss_clip": 1.45919621, "balance_loss_mlp": 0.43064633, "epoch": 0.07443258680294604, "flos": 31944732249600.0, "grad_norm": 9.555855448702177, "language_loss": 0.82000387, "learning_rate": 3.979326750654053e-06, "loss": 0.84489673, "num_input_tokens_seen": 26311795, "router_z_loss_clip": 5.5546875, "router_z_loss_mlp": 0.43237305, "step": 1238, "time_per_iteration": 2.7219672203063965 }, { "auxiliary_loss_clip": 0.02051824, "auxiliary_loss_mlp": 0.00531211, "balance_loss_clip": 1.48416519, "balance_loss_mlp": 0.48476708, "epoch": 0.074492710055614, "flos": 22675290641280.0, "grad_norm": 24.192768947576294, "language_loss": 0.92108226, "learning_rate": 3.9792708604598854e-06, "loss": 0.94691265, "num_input_tokens_seen": 26330330, "router_z_loss_clip": 5.67578125, "router_z_loss_mlp": 0.46386719, "step": 1239, "time_per_iteration": 2.643127918243408 }, { "auxiliary_loss_clip": 0.02057622, "auxiliary_loss_mlp": 0.00512899, "balance_loss_clip": 1.47692084, "balance_loss_mlp": 0.46683636, "epoch": 0.07455283330828198, "flos": 21284901037440.0, "grad_norm": 75.01082286181317, "language_loss": 0.95546561, "learning_rate": 3.979214895211569e-06, "loss": 0.98117089, "num_input_tokens_seen": 26348865, "router_z_loss_clip": 5.80859375, "router_z_loss_mlp": 0.46069336, "step": 1240, "time_per_iteration": 2.5958359241485596 }, { "auxiliary_loss_clip": 0.02087433, "auxiliary_loss_mlp": 0.00565152, "balance_loss_clip": 1.49864912, "balance_loss_mlp": 0.517111, "epoch": 0.07461295656094995, "flos": 24388408967040.0, "grad_norm": 29.334731538265935, "language_loss": 0.95643449, "learning_rate": 3.979158854911225e-06, "loss": 0.98296034, "num_input_tokens_seen": 26368210, "router_z_loss_clip": 5.88671875, "router_z_loss_mlp": 0.47998047, "step": 1241, "time_per_iteration": 2.682462215423584 }, { "auxiliary_loss_clip": 0.02123784, "auxiliary_loss_mlp": 0.00233252, "balance_loss_clip": 1.59085, "balance_loss_mlp": 0.20521365, "epoch": 0.07467307981361791, "flos": 62109660574080.0, "grad_norm": 0.9188999348400004, "language_loss": 0.63106978, "learning_rate": 3.979102739560979e-06, "loss": 0.6546402, "num_input_tokens_seen": 26424890, "router_z_loss_clip": 5.3125, "router_z_loss_mlp": 0.28125, "step": 1242, "time_per_iteration": 4.556946039199829 }, { "auxiliary_loss_clip": 0.02083826, "auxiliary_loss_mlp": 0.00544255, "balance_loss_clip": 1.48711562, "balance_loss_mlp": 0.4930191, "epoch": 0.07473320306628589, "flos": 24863148046080.0, "grad_norm": 16.37931004705845, "language_loss": 0.74394864, "learning_rate": 3.9790465491629595e-06, "loss": 0.77022946, "num_input_tokens_seen": 26446405, "router_z_loss_clip": 5.97265625, "router_z_loss_mlp": 0.51196289, "step": 1243, "time_per_iteration": 4.109212160110474 }, { "auxiliary_loss_clip": 0.02072201, "auxiliary_loss_mlp": 0.00498868, "balance_loss_clip": 1.48642945, "balance_loss_mlp": 0.45609522, "epoch": 0.07479332631895386, "flos": 24897442556160.0, "grad_norm": 4.406390036972256, "language_loss": 0.81601471, "learning_rate": 3.978990283719296e-06, "loss": 0.84172535, "num_input_tokens_seen": 26466070, "router_z_loss_clip": 5.86328125, "router_z_loss_mlp": 0.42797852, "step": 1244, "time_per_iteration": 2.686453342437744 }, { "auxiliary_loss_clip": 0.02100165, "auxiliary_loss_mlp": 0.00469691, "balance_loss_clip": 1.50296664, "balance_loss_mlp": 0.42739576, "epoch": 0.07485344957162182, "flos": 17815247821440.0, "grad_norm": 8.287687264290682, "language_loss": 0.77038962, "learning_rate": 3.978933943232123e-06, "loss": 0.79608822, "num_input_tokens_seen": 26479350, "router_z_loss_clip": 5.96875, "router_z_loss_mlp": 0.42260742, "step": 1245, "time_per_iteration": 2.6932711601257324 }, { "auxiliary_loss_clip": 0.02083214, "auxiliary_loss_mlp": 0.00511406, "balance_loss_clip": 1.501302, "balance_loss_mlp": 0.46632069, "epoch": 0.0749135728242898, "flos": 25010202326400.0, "grad_norm": 5.026412155457402, "language_loss": 0.94894791, "learning_rate": 3.978877527703576e-06, "loss": 0.97489411, "num_input_tokens_seen": 26498255, "router_z_loss_clip": 5.82421875, "router_z_loss_mlp": 0.45092773, "step": 1246, "time_per_iteration": 4.063573360443115 }, { "auxiliary_loss_clip": 0.02061325, "auxiliary_loss_mlp": 0.00519109, "balance_loss_clip": 1.48485541, "balance_loss_mlp": 0.4711864, "epoch": 0.07497369607695777, "flos": 17822071405440.0, "grad_norm": 6.11090591072476, "language_loss": 0.9845742, "learning_rate": 3.9788210371357945e-06, "loss": 1.01037848, "num_input_tokens_seen": 26515375, "router_z_loss_clip": 5.76171875, "router_z_loss_mlp": 0.47949219, "step": 1247, "time_per_iteration": 2.6021323204040527 }, { "auxiliary_loss_clip": 0.0206292, "auxiliary_loss_mlp": 0.00492644, "balance_loss_clip": 1.5020957, "balance_loss_mlp": 0.44648594, "epoch": 0.07503381932962573, "flos": 15121086261120.0, "grad_norm": 4.85421606279229, "language_loss": 0.71860194, "learning_rate": 3.978764471530921e-06, "loss": 0.74415761, "num_input_tokens_seen": 26533595, "router_z_loss_clip": 5.6171875, "router_z_loss_mlp": 0.46191406, "step": 1248, "time_per_iteration": 4.044569492340088 }, { "auxiliary_loss_clip": 0.02053743, "auxiliary_loss_mlp": 0.00479746, "balance_loss_clip": 1.50054634, "balance_loss_mlp": 0.43892905, "epoch": 0.0750939425822937, "flos": 12816734071680.0, "grad_norm": 31.157364854126744, "language_loss": 0.82510203, "learning_rate": 3.978707830891102e-06, "loss": 0.85043693, "num_input_tokens_seen": 26549405, "router_z_loss_clip": 5.53515625, "router_z_loss_mlp": 0.40820312, "step": 1249, "time_per_iteration": 2.6047630310058594 }, { "auxiliary_loss_clip": 0.02062647, "auxiliary_loss_mlp": 0.0048002, "balance_loss_clip": 1.50360537, "balance_loss_mlp": 0.43362403, "epoch": 0.07515406583496168, "flos": 24206844695040.0, "grad_norm": 12.62034212670182, "language_loss": 0.90865469, "learning_rate": 3.978651115218482e-06, "loss": 0.93408138, "num_input_tokens_seen": 26567200, "router_z_loss_clip": 5.5859375, "router_z_loss_mlp": 0.46362305, "step": 1250, "time_per_iteration": 2.6986653804779053 }, { "auxiliary_loss_clip": 0.02021262, "auxiliary_loss_mlp": 0.00482564, "balance_loss_clip": 1.49205637, "balance_loss_mlp": 0.43623954, "epoch": 0.07521418908762964, "flos": 26688164215680.0, "grad_norm": 180.82353292519102, "language_loss": 0.73793209, "learning_rate": 3.978594324515215e-06, "loss": 0.76297039, "num_input_tokens_seen": 26586190, "router_z_loss_clip": 5.2890625, "router_z_loss_mlp": 0.46362305, "step": 1251, "time_per_iteration": 2.650257110595703 }, { "auxiliary_loss_clip": 0.01939101, "auxiliary_loss_mlp": 0.00090447, "balance_loss_clip": 1.57252979, "balance_loss_mlp": 0.07280371, "epoch": 0.0752743123402976, "flos": 59095140589440.0, "grad_norm": 0.9407650897128139, "language_loss": 0.70348328, "learning_rate": 3.9785374587834515e-06, "loss": 0.72377872, "num_input_tokens_seen": 26650710, "router_z_loss_clip": 3.65625, "router_z_loss_mlp": 0.17675781, "step": 1252, "time_per_iteration": 3.1591384410858154 }, { "auxiliary_loss_clip": 0.01985061, "auxiliary_loss_mlp": 0.00428676, "balance_loss_clip": 1.47543526, "balance_loss_mlp": 0.38361502, "epoch": 0.07533443559296558, "flos": 23477032160640.0, "grad_norm": 20.758675179068167, "language_loss": 0.85778308, "learning_rate": 3.97848051802535e-06, "loss": 0.88192046, "num_input_tokens_seen": 26669000, "router_z_loss_clip": 5.09375, "router_z_loss_mlp": 0.45043945, "step": 1253, "time_per_iteration": 2.6204352378845215 }, { "auxiliary_loss_clip": 0.01989333, "auxiliary_loss_mlp": 0.00430266, "balance_loss_clip": 1.48151541, "balance_loss_mlp": 0.38675439, "epoch": 0.07539455884563355, "flos": 20879110114560.0, "grad_norm": 307.95804209728215, "language_loss": 1.04643893, "learning_rate": 3.978423502243069e-06, "loss": 1.07063484, "num_input_tokens_seen": 26683075, "router_z_loss_clip": 5.0859375, "router_z_loss_mlp": 0.43530273, "step": 1254, "time_per_iteration": 2.5903306007385254 }, { "auxiliary_loss_clip": 0.01962063, "auxiliary_loss_mlp": 0.00445455, "balance_loss_clip": 1.46999669, "balance_loss_mlp": 0.3995592, "epoch": 0.07545468209830151, "flos": 27672906551040.0, "grad_norm": 122.40374423241059, "language_loss": 0.93612808, "learning_rate": 3.97836641143877e-06, "loss": 0.96020317, "num_input_tokens_seen": 26701875, "router_z_loss_clip": 4.9140625, "router_z_loss_mlp": 0.45947266, "step": 1255, "time_per_iteration": 2.7531492710113525 }, { "auxiliary_loss_clip": 0.01951354, "auxiliary_loss_mlp": 0.0039811, "balance_loss_clip": 1.46657276, "balance_loss_mlp": 0.35593367, "epoch": 0.0755148053509695, "flos": 14136990370560.0, "grad_norm": 132.4509159294308, "language_loss": 0.86846274, "learning_rate": 3.978309245614618e-06, "loss": 0.8919574, "num_input_tokens_seen": 26719050, "router_z_loss_clip": 4.8515625, "router_z_loss_mlp": 0.42163086, "step": 1256, "time_per_iteration": 2.693870782852173 }, { "auxiliary_loss_clip": 0.01843558, "auxiliary_loss_mlp": 0.00099102, "balance_loss_clip": 1.50573826, "balance_loss_mlp": 0.0853695, "epoch": 0.07557492860363746, "flos": 58235257929600.0, "grad_norm": 0.7789917263767719, "language_loss": 0.57786608, "learning_rate": 3.9782520047727825e-06, "loss": 0.59729266, "num_input_tokens_seen": 26780650, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.13769531, "step": 1257, "time_per_iteration": 3.2493221759796143 }, { "auxiliary_loss_clip": 0.01935169, "auxiliary_loss_mlp": 0.00411377, "balance_loss_clip": 1.4522388, "balance_loss_mlp": 0.36734146, "epoch": 0.07563505185630542, "flos": 24644380262400.0, "grad_norm": 28.439571061385895, "language_loss": 0.96720612, "learning_rate": 3.978194688915432e-06, "loss": 0.99067158, "num_input_tokens_seen": 26798725, "router_z_loss_clip": 4.8359375, "router_z_loss_mlp": 0.44018555, "step": 1258, "time_per_iteration": 2.674708604812622 }, { "auxiliary_loss_clip": 0.01879693, "auxiliary_loss_mlp": 0.00387259, "balance_loss_clip": 1.42166996, "balance_loss_mlp": 0.34834915, "epoch": 0.07569517510897339, "flos": 15522998515200.0, "grad_norm": 33.760609803398125, "language_loss": 0.87107396, "learning_rate": 3.978137298044741e-06, "loss": 0.89374346, "num_input_tokens_seen": 26817005, "router_z_loss_clip": 4.58203125, "router_z_loss_mlp": 0.38891602, "step": 1259, "time_per_iteration": 2.6118059158325195 }, { "auxiliary_loss_clip": 0.018978, "auxiliary_loss_mlp": 0.00384913, "balance_loss_clip": 1.43256068, "balance_loss_mlp": 0.34583625, "epoch": 0.07575529836164137, "flos": 22928532503040.0, "grad_norm": 6.334528540629632, "language_loss": 0.81544089, "learning_rate": 3.978079832162885e-06, "loss": 0.83826804, "num_input_tokens_seen": 26836655, "router_z_loss_clip": 4.65234375, "router_z_loss_mlp": 0.39086914, "step": 1260, "time_per_iteration": 2.6686007976531982 }, { "auxiliary_loss_clip": 0.0186867, "auxiliary_loss_mlp": 0.00409607, "balance_loss_clip": 1.42425132, "balance_loss_mlp": 0.36793119, "epoch": 0.07581542161430933, "flos": 19500428344320.0, "grad_norm": 19282.728988933814, "language_loss": 0.91563696, "learning_rate": 3.978022291272044e-06, "loss": 0.9384197, "num_input_tokens_seen": 26854925, "router_z_loss_clip": 4.44140625, "router_z_loss_mlp": 0.41699219, "step": 1261, "time_per_iteration": 2.629772663116455 }, { "auxiliary_loss_clip": 0.0185738, "auxiliary_loss_mlp": 0.00406583, "balance_loss_clip": 1.41467881, "balance_loss_mlp": 0.36011517, "epoch": 0.0758755448669773, "flos": 24973465691520.0, "grad_norm": 111.3396820062762, "language_loss": 0.87286377, "learning_rate": 3.977964675374399e-06, "loss": 0.89550334, "num_input_tokens_seen": 26876170, "router_z_loss_clip": 4.421875, "router_z_loss_mlp": 0.46459961, "step": 1262, "time_per_iteration": 2.732150077819824 }, { "auxiliary_loss_clip": 0.01815647, "auxiliary_loss_mlp": 0.00428914, "balance_loss_clip": 1.38262391, "balance_loss_mlp": 0.38399595, "epoch": 0.07593566811964528, "flos": 22747973811840.0, "grad_norm": 22.509841609017162, "language_loss": 0.90741086, "learning_rate": 3.977906984472136e-06, "loss": 0.92985654, "num_input_tokens_seen": 26895005, "router_z_loss_clip": 4.328125, "router_z_loss_mlp": 0.44897461, "step": 1263, "time_per_iteration": 2.649317979812622 }, { "auxiliary_loss_clip": 0.01811943, "auxiliary_loss_mlp": 0.00407305, "balance_loss_clip": 1.37618589, "balance_loss_mlp": 0.36374646, "epoch": 0.07599579137231324, "flos": 23112395245440.0, "grad_norm": 15.581912491732353, "language_loss": 0.81774187, "learning_rate": 3.977849218567442e-06, "loss": 0.83993435, "num_input_tokens_seen": 26913930, "router_z_loss_clip": 4.359375, "router_z_loss_mlp": 0.43579102, "step": 1264, "time_per_iteration": 2.665125846862793 }, { "auxiliary_loss_clip": 0.01773768, "auxiliary_loss_mlp": 0.00387493, "balance_loss_clip": 1.35008764, "balance_loss_mlp": 0.34543562, "epoch": 0.07605591462498121, "flos": 14502058248960.0, "grad_norm": 9.018273664800596, "language_loss": 0.92410636, "learning_rate": 3.977791377662507e-06, "loss": 0.94571888, "num_input_tokens_seen": 26931485, "router_z_loss_clip": 4.24609375, "router_z_loss_mlp": 0.42041016, "step": 1265, "time_per_iteration": 2.584578037261963 }, { "auxiliary_loss_clip": 0.01775309, "auxiliary_loss_mlp": 0.00414332, "balance_loss_clip": 1.34737098, "balance_loss_mlp": 0.37148857, "epoch": 0.07611603787764919, "flos": 23514199758720.0, "grad_norm": 207.75085167469493, "language_loss": 0.72564304, "learning_rate": 3.977733461759524e-06, "loss": 0.74753952, "num_input_tokens_seen": 26951670, "router_z_loss_clip": 4.28125, "router_z_loss_mlp": 0.42822266, "step": 1266, "time_per_iteration": 2.727354049682617 }, { "auxiliary_loss_clip": 0.01745726, "auxiliary_loss_mlp": 0.0039794, "balance_loss_clip": 1.32969236, "balance_loss_mlp": 0.35700351, "epoch": 0.07617616113031715, "flos": 21507188353920.0, "grad_norm": 91.4630059630357, "language_loss": 0.9122858, "learning_rate": 3.977675470860691e-06, "loss": 0.93372244, "num_input_tokens_seen": 26970335, "router_z_loss_clip": 4.1640625, "router_z_loss_mlp": 0.40917969, "step": 1267, "time_per_iteration": 2.6626882553100586 }, { "auxiliary_loss_clip": 0.01732228, "auxiliary_loss_mlp": 0.00398932, "balance_loss_clip": 1.32293785, "balance_loss_mlp": 0.36018917, "epoch": 0.07623628438298512, "flos": 14573161221120.0, "grad_norm": 33.55488548490917, "language_loss": 0.79705495, "learning_rate": 3.977617404968205e-06, "loss": 0.81836653, "num_input_tokens_seen": 26986025, "router_z_loss_clip": 4.09179688, "router_z_loss_mlp": 0.38745117, "step": 1268, "time_per_iteration": 2.6123392581939697 }, { "auxiliary_loss_clip": 0.01741528, "auxiliary_loss_mlp": 0.00374701, "balance_loss_clip": 1.32704854, "balance_loss_mlp": 0.33652985, "epoch": 0.07629640763565308, "flos": 14720395069440.0, "grad_norm": 70.11496768498584, "language_loss": 0.89901024, "learning_rate": 3.977559264084269e-06, "loss": 0.92017251, "num_input_tokens_seen": 27004045, "router_z_loss_clip": 4.14453125, "router_z_loss_mlp": 0.3815918, "step": 1269, "time_per_iteration": 2.6489317417144775 }, { "auxiliary_loss_clip": 0.01727575, "auxiliary_loss_mlp": 0.00397976, "balance_loss_clip": 1.3155452, "balance_loss_mlp": 0.35820818, "epoch": 0.07635653088832106, "flos": 14902929008640.0, "grad_norm": 19.570112889270128, "language_loss": 0.97329545, "learning_rate": 3.977501048211088e-06, "loss": 0.99455094, "num_input_tokens_seen": 27022070, "router_z_loss_clip": 4.11328125, "router_z_loss_mlp": 0.3972168, "step": 1270, "time_per_iteration": 2.7451910972595215 }, { "auxiliary_loss_clip": 0.01732058, "auxiliary_loss_mlp": 0.00431138, "balance_loss_clip": 1.31979036, "balance_loss_mlp": 0.38533753, "epoch": 0.07641665414098903, "flos": 26651571235200.0, "grad_norm": 8.923158329237609, "language_loss": 0.78309023, "learning_rate": 3.977442757350869e-06, "loss": 0.80472219, "num_input_tokens_seen": 27041755, "router_z_loss_clip": 4.12109375, "router_z_loss_mlp": 0.45800781, "step": 1271, "time_per_iteration": 2.65997052192688 }, { "auxiliary_loss_clip": 0.01697876, "auxiliary_loss_mlp": 0.00344314, "balance_loss_clip": 1.30313241, "balance_loss_mlp": 0.30919528, "epoch": 0.07647677739365699, "flos": 25192808092800.0, "grad_norm": 4.737226967339089, "language_loss": 0.88084912, "learning_rate": 3.977384391505823e-06, "loss": 0.90127099, "num_input_tokens_seen": 27061540, "router_z_loss_clip": 3.94726562, "router_z_loss_mlp": 0.35107422, "step": 1272, "time_per_iteration": 2.7631027698516846 }, { "auxiliary_loss_clip": 0.01673589, "auxiliary_loss_mlp": 0.00309038, "balance_loss_clip": 1.27907014, "balance_loss_mlp": 0.27580267, "epoch": 0.07653690064632497, "flos": 20558141159040.0, "grad_norm": 4.346671735743657, "language_loss": 0.88363796, "learning_rate": 3.977325950678162e-06, "loss": 0.90346426, "num_input_tokens_seen": 27081395, "router_z_loss_clip": 3.94921875, "router_z_loss_mlp": 0.33251953, "step": 1273, "time_per_iteration": 2.6227657794952393 }, { "auxiliary_loss_clip": 0.01669931, "auxiliary_loss_mlp": 0.00339851, "balance_loss_clip": 1.2711041, "balance_loss_mlp": 0.30153733, "epoch": 0.07659702389899294, "flos": 22269320150400.0, "grad_norm": 5.044524737465733, "language_loss": 0.87016308, "learning_rate": 3.977267434870103e-06, "loss": 0.89026093, "num_input_tokens_seen": 27101175, "router_z_loss_clip": 3.9921875, "router_z_loss_mlp": 0.3828125, "step": 1274, "time_per_iteration": 2.651512861251831 }, { "auxiliary_loss_clip": 0.01646599, "auxiliary_loss_mlp": 0.00305376, "balance_loss_clip": 1.25978887, "balance_loss_mlp": 0.27161574, "epoch": 0.0766571471516609, "flos": 32636120209920.0, "grad_norm": 14.363001335113136, "language_loss": 0.79308766, "learning_rate": 3.977208844083865e-06, "loss": 0.81260741, "num_input_tokens_seen": 27124505, "router_z_loss_clip": 3.87109375, "router_z_loss_mlp": 0.33764648, "step": 1275, "time_per_iteration": 2.74676251411438 }, { "auxiliary_loss_clip": 0.01631204, "auxiliary_loss_mlp": 0.00310552, "balance_loss_clip": 1.23960042, "balance_loss_mlp": 0.27395451, "epoch": 0.07671727040432888, "flos": 15267386355840.0, "grad_norm": 25.13753843577571, "language_loss": 0.88959205, "learning_rate": 3.9771501783216685e-06, "loss": 0.90900958, "num_input_tokens_seen": 27140960, "router_z_loss_clip": 3.91992188, "router_z_loss_mlp": 0.36572266, "step": 1276, "time_per_iteration": 2.6248066425323486 }, { "auxiliary_loss_clip": 0.0161095, "auxiliary_loss_mlp": 0.00308203, "balance_loss_clip": 1.22672296, "balance_loss_mlp": 0.27260691, "epoch": 0.07677739365699685, "flos": 28184094956160.0, "grad_norm": 961.320283048435, "language_loss": 0.69782436, "learning_rate": 3.97709143758574e-06, "loss": 0.71701586, "num_input_tokens_seen": 27160985, "router_z_loss_clip": 3.84179688, "router_z_loss_mlp": 0.35595703, "step": 1277, "time_per_iteration": 2.6630594730377197 }, { "auxiliary_loss_clip": 0.01618516, "auxiliary_loss_mlp": 0.00312157, "balance_loss_clip": 1.23010349, "balance_loss_mlp": 0.27546433, "epoch": 0.07683751690966481, "flos": 18296128126080.0, "grad_norm": 6.618897011855969, "language_loss": 0.83482885, "learning_rate": 3.977032621878305e-06, "loss": 0.85413551, "num_input_tokens_seen": 27178390, "router_z_loss_clip": 3.88476562, "router_z_loss_mlp": 0.3671875, "step": 1278, "time_per_iteration": 2.647878646850586 }, { "auxiliary_loss_clip": 0.01592786, "auxiliary_loss_mlp": 0.0030171, "balance_loss_clip": 1.21005392, "balance_loss_mlp": 0.26766348, "epoch": 0.07689764016233278, "flos": 21981101420160.0, "grad_norm": 28.326049654561977, "language_loss": 0.95236957, "learning_rate": 3.976973731201596e-06, "loss": 0.97131455, "num_input_tokens_seen": 27197505, "router_z_loss_clip": 3.828125, "router_z_loss_mlp": 0.34057617, "step": 1279, "time_per_iteration": 2.647526502609253 }, { "auxiliary_loss_clip": 0.0158287, "auxiliary_loss_mlp": 0.00304841, "balance_loss_clip": 1.19835341, "balance_loss_mlp": 0.2701987, "epoch": 0.07695776341500075, "flos": 22235995307520.0, "grad_norm": 5.0587063161056784, "language_loss": 0.88756126, "learning_rate": 3.976914765557845e-06, "loss": 0.90643835, "num_input_tokens_seen": 27214260, "router_z_loss_clip": 3.84179688, "router_z_loss_mlp": 0.34643555, "step": 1280, "time_per_iteration": 2.614304780960083 }, { "auxiliary_loss_clip": 0.01565731, "auxiliary_loss_mlp": 0.00302362, "balance_loss_clip": 1.188761, "balance_loss_mlp": 0.26910272, "epoch": 0.07701788666766872, "flos": 16143750380160.0, "grad_norm": 3.314127578716393, "language_loss": 0.84418833, "learning_rate": 3.9768557249492875e-06, "loss": 0.86286926, "num_input_tokens_seen": 27232525, "router_z_loss_clip": 3.76757812, "router_z_loss_mlp": 0.33251953, "step": 1281, "time_per_iteration": 2.612200975418091 }, { "auxiliary_loss_clip": 0.01561521, "auxiliary_loss_mlp": 0.0030691, "balance_loss_clip": 1.17886806, "balance_loss_mlp": 0.26800001, "epoch": 0.07707800992033668, "flos": 19463045264640.0, "grad_norm": 12955.784200158134, "language_loss": 0.85814333, "learning_rate": 3.9767966093781634e-06, "loss": 0.87682772, "num_input_tokens_seen": 27249800, "router_z_loss_clip": 3.82617188, "router_z_loss_mlp": 0.38916016, "step": 1282, "time_per_iteration": 2.6317756175994873 }, { "auxiliary_loss_clip": 0.01551849, "auxiliary_loss_mlp": 0.00306627, "balance_loss_clip": 1.17753339, "balance_loss_mlp": 0.27198493, "epoch": 0.07713813317300466, "flos": 18990281433600.0, "grad_norm": 4.915151956411521, "language_loss": 0.88332635, "learning_rate": 3.976737418846713e-06, "loss": 0.90191114, "num_input_tokens_seen": 27268895, "router_z_loss_clip": 3.74804688, "router_z_loss_mlp": 0.34643555, "step": 1283, "time_per_iteration": 2.714460849761963 }, { "auxiliary_loss_clip": 0.0154992, "auxiliary_loss_mlp": 0.00317281, "balance_loss_clip": 1.17429495, "balance_loss_mlp": 0.28073195, "epoch": 0.07719825642567263, "flos": 18113953322880.0, "grad_norm": 43.001294932244875, "language_loss": 0.81510997, "learning_rate": 3.976678153357181e-06, "loss": 0.83378196, "num_input_tokens_seen": 27288180, "router_z_loss_clip": 3.7578125, "router_z_loss_mlp": 0.36547852, "step": 1284, "time_per_iteration": 4.036075830459595 }, { "auxiliary_loss_clip": 0.01540166, "auxiliary_loss_mlp": 0.00298823, "balance_loss_clip": 1.16643167, "balance_loss_mlp": 0.26372784, "epoch": 0.0772583796783406, "flos": 42194426993280.0, "grad_norm": 46.02181373526052, "language_loss": 0.81287575, "learning_rate": 3.976618812911817e-06, "loss": 0.83126563, "num_input_tokens_seen": 27311815, "router_z_loss_clip": 3.74023438, "router_z_loss_mlp": 0.35083008, "step": 1285, "time_per_iteration": 4.1603734493255615 }, { "auxiliary_loss_clip": 0.01522623, "auxiliary_loss_mlp": 0.00290177, "balance_loss_clip": 1.15279865, "balance_loss_mlp": 0.25710821, "epoch": 0.07731850293100857, "flos": 24753692327040.0, "grad_norm": 9.036012574273268, "language_loss": 0.91345906, "learning_rate": 3.9765593975128685e-06, "loss": 0.9315871, "num_input_tokens_seen": 27331890, "router_z_loss_clip": 3.70117188, "router_z_loss_mlp": 0.33056641, "step": 1286, "time_per_iteration": 2.68825101852417 }, { "auxiliary_loss_clip": 0.01538433, "auxiliary_loss_mlp": 0.00314877, "balance_loss_clip": 1.15773606, "balance_loss_mlp": 0.27744561, "epoch": 0.07737862618367654, "flos": 17565884628480.0, "grad_norm": 6.638437210453716, "language_loss": 0.89166927, "learning_rate": 3.97649990716259e-06, "loss": 0.91020238, "num_input_tokens_seen": 27348320, "router_z_loss_clip": 3.80664062, "router_z_loss_mlp": 0.37426758, "step": 1287, "time_per_iteration": 2.6894731521606445 }, { "auxiliary_loss_clip": 0.01508099, "auxiliary_loss_mlp": 0.00232291, "balance_loss_clip": 1.13968885, "balance_loss_mlp": 0.1978398, "epoch": 0.0774387494363445, "flos": 25627147349760.0, "grad_norm": 3.1202048357135714, "language_loss": 0.92406225, "learning_rate": 3.976440341863237e-06, "loss": 0.94146615, "num_input_tokens_seen": 27367670, "router_z_loss_clip": 3.68359375, "router_z_loss_mlp": 0.34448242, "step": 1288, "time_per_iteration": 4.105735778808594 }, { "auxiliary_loss_clip": 0.01530784, "auxiliary_loss_mlp": 0.00261543, "balance_loss_clip": 1.15744758, "balance_loss_mlp": 0.22487432, "epoch": 0.07749887268901248, "flos": 12239865648000.0, "grad_norm": 9.695149628619848, "language_loss": 0.97393548, "learning_rate": 3.976380701617068e-06, "loss": 0.99185878, "num_input_tokens_seen": 27385485, "router_z_loss_clip": 3.73242188, "router_z_loss_mlp": 0.36645508, "step": 1289, "time_per_iteration": 2.660024642944336 }, { "auxiliary_loss_clip": 0.01499395, "auxiliary_loss_mlp": 0.0026983, "balance_loss_clip": 1.13270521, "balance_loss_mlp": 0.23583147, "epoch": 0.07755899594168045, "flos": 25081736261760.0, "grad_norm": 15.763323708444984, "language_loss": 0.91781104, "learning_rate": 3.976320986426344e-06, "loss": 0.9355033, "num_input_tokens_seen": 27405110, "router_z_loss_clip": 3.66601562, "router_z_loss_mlp": 0.34008789, "step": 1290, "time_per_iteration": 4.107405662536621 }, { "auxiliary_loss_clip": 0.01480336, "auxiliary_loss_mlp": 0.00270968, "balance_loss_clip": 1.11924624, "balance_loss_mlp": 0.23832861, "epoch": 0.07761911919434841, "flos": 14246410176000.0, "grad_norm": 15.119856203535962, "language_loss": 1.00472176, "learning_rate": 3.9762611962933315e-06, "loss": 1.02223468, "num_input_tokens_seen": 27422855, "router_z_loss_clip": 3.61328125, "router_z_loss_mlp": 0.32592773, "step": 1291, "time_per_iteration": 2.6115522384643555 }, { "auxiliary_loss_clip": 0.01448861, "auxiliary_loss_mlp": 0.00115257, "balance_loss_clip": 1.14143157, "balance_loss_mlp": 0.09799597, "epoch": 0.07767924244701638, "flos": 67237202954880.0, "grad_norm": 0.873794344290375, "language_loss": 0.65275633, "learning_rate": 3.9762013312202955e-06, "loss": 0.66839755, "num_input_tokens_seen": 27487190, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.17285156, "step": 1292, "time_per_iteration": 3.2827842235565186 }, { "auxiliary_loss_clip": 0.01473027, "auxiliary_loss_mlp": 0.00255291, "balance_loss_clip": 1.11182308, "balance_loss_mlp": 0.22343849, "epoch": 0.07773936569968436, "flos": 28550635292160.0, "grad_norm": 403.31357504539426, "language_loss": 0.94402105, "learning_rate": 3.9761413912095075e-06, "loss": 0.96130419, "num_input_tokens_seen": 27510465, "router_z_loss_clip": 3.6171875, "router_z_loss_mlp": 0.31835938, "step": 1293, "time_per_iteration": 2.6694436073303223 }, { "auxiliary_loss_clip": 0.01478372, "auxiliary_loss_mlp": 0.00318, "balance_loss_clip": 1.11369538, "balance_loss_mlp": 0.28202266, "epoch": 0.07779948895235232, "flos": 27490264871040.0, "grad_norm": 4.516317694182326, "language_loss": 0.92960036, "learning_rate": 3.976081376263239e-06, "loss": 0.94756407, "num_input_tokens_seen": 27528645, "router_z_loss_clip": 3.65039062, "router_z_loss_mlp": 0.35961914, "step": 1294, "time_per_iteration": 2.738605260848999 }, { "auxiliary_loss_clip": 0.01493806, "auxiliary_loss_mlp": 0.00347296, "balance_loss_clip": 1.12264013, "balance_loss_mlp": 0.31005472, "epoch": 0.07785961220502029, "flos": 18223301301120.0, "grad_norm": 12.451545536122813, "language_loss": 0.89866936, "learning_rate": 3.976021286383768e-06, "loss": 0.9170804, "num_input_tokens_seen": 27546165, "router_z_loss_clip": 3.71289062, "router_z_loss_mlp": 0.37255859, "step": 1295, "time_per_iteration": 2.5776052474975586 }, { "auxiliary_loss_clip": 0.01485882, "auxiliary_loss_mlp": 0.00327894, "balance_loss_clip": 1.12294316, "balance_loss_mlp": 0.29520714, "epoch": 0.07791973545768827, "flos": 24608218245120.0, "grad_norm": 64.63187690010908, "language_loss": 0.95203745, "learning_rate": 3.975961121573371e-06, "loss": 0.97017527, "num_input_tokens_seen": 27566520, "router_z_loss_clip": 3.625, "router_z_loss_mlp": 0.3269043, "step": 1296, "time_per_iteration": 2.6654083728790283 }, { "auxiliary_loss_clip": 0.0147747, "auxiliary_loss_mlp": 0.00321517, "balance_loss_clip": 1.11305833, "balance_loss_mlp": 0.28666016, "epoch": 0.07797985871035623, "flos": 14282069402880.0, "grad_norm": 25.87711742882511, "language_loss": 1.04803872, "learning_rate": 3.9759008818343305e-06, "loss": 1.06602848, "num_input_tokens_seen": 27581960, "router_z_loss_clip": 3.640625, "router_z_loss_mlp": 0.34887695, "step": 1297, "time_per_iteration": 2.6322638988494873 }, { "auxiliary_loss_clip": 0.01469857, "auxiliary_loss_mlp": 0.00302587, "balance_loss_clip": 1.11044669, "balance_loss_mlp": 0.27035236, "epoch": 0.0780399819630242, "flos": 26610453141120.0, "grad_norm": 72.34628357240062, "language_loss": 0.84144515, "learning_rate": 3.97584056716893e-06, "loss": 0.8591696, "num_input_tokens_seen": 27601415, "router_z_loss_clip": 3.59570312, "router_z_loss_mlp": 0.32226562, "step": 1298, "time_per_iteration": 2.629021406173706 }, { "auxiliary_loss_clip": 0.01466746, "auxiliary_loss_mlp": 0.00267258, "balance_loss_clip": 1.11132181, "balance_loss_mlp": 0.23750377, "epoch": 0.07810010521569218, "flos": 21834514016640.0, "grad_norm": 6.277747504722253, "language_loss": 0.86946702, "learning_rate": 3.9757801775794575e-06, "loss": 0.88680708, "num_input_tokens_seen": 27621490, "router_z_loss_clip": 3.55859375, "router_z_loss_mlp": 0.29736328, "step": 1299, "time_per_iteration": 2.708911657333374 }, { "auxiliary_loss_clip": 0.01455982, "auxiliary_loss_mlp": 0.00315179, "balance_loss_clip": 1.10262775, "balance_loss_mlp": 0.28455424, "epoch": 0.07816022846836014, "flos": 25081233471360.0, "grad_norm": 2.659201632890322, "language_loss": 0.94243062, "learning_rate": 3.975719713068202e-06, "loss": 0.96014214, "num_input_tokens_seen": 27640600, "router_z_loss_clip": 3.52929688, "router_z_loss_mlp": 0.30651855, "step": 1300, "time_per_iteration": 2.676058769226074 }, { "auxiliary_loss_clip": 0.01466151, "auxiliary_loss_mlp": 0.00307926, "balance_loss_clip": 1.10686314, "balance_loss_mlp": 0.27428484, "epoch": 0.0782203517210281, "flos": 40917515431680.0, "grad_norm": 3.2940222042143663, "language_loss": 0.81058365, "learning_rate": 3.975659173637458e-06, "loss": 0.82832444, "num_input_tokens_seen": 27663070, "router_z_loss_clip": 3.59570312, "router_z_loss_mlp": 0.33642578, "step": 1301, "time_per_iteration": 2.834796905517578 }, { "auxiliary_loss_clip": 0.01454787, "auxiliary_loss_mlp": 0.00300937, "balance_loss_clip": 1.10203028, "balance_loss_mlp": 0.26968068, "epoch": 0.07828047497369607, "flos": 41172014269440.0, "grad_norm": 16.901336868381307, "language_loss": 0.77863818, "learning_rate": 3.97559855928952e-06, "loss": 0.79619545, "num_input_tokens_seen": 27686425, "router_z_loss_clip": 3.52734375, "router_z_loss_mlp": 0.31262207, "step": 1302, "time_per_iteration": 2.8743605613708496 }, { "auxiliary_loss_clip": 0.01434724, "auxiliary_loss_mlp": 0.00276383, "balance_loss_clip": 1.089697, "balance_loss_mlp": 0.24553189, "epoch": 0.07834059822636405, "flos": 23508130360320.0, "grad_norm": 2.7184775746163004, "language_loss": 0.90487152, "learning_rate": 3.9755378700266864e-06, "loss": 0.92198259, "num_input_tokens_seen": 27704900, "router_z_loss_clip": 3.453125, "router_z_loss_mlp": 0.30895996, "step": 1303, "time_per_iteration": 2.6830248832702637 }, { "auxiliary_loss_clip": 0.01426043, "auxiliary_loss_mlp": 0.00314533, "balance_loss_clip": 1.07919967, "balance_loss_mlp": 0.28277561, "epoch": 0.07840072147903202, "flos": 20193899293440.0, "grad_norm": 2.22191932793665, "language_loss": 0.81473172, "learning_rate": 3.9754771058512585e-06, "loss": 0.83213747, "num_input_tokens_seen": 27724890, "router_z_loss_clip": 3.47070312, "router_z_loss_mlp": 0.31762695, "step": 1304, "time_per_iteration": 2.664872884750366 }, { "auxiliary_loss_clip": 0.01410942, "auxiliary_loss_mlp": 0.00329632, "balance_loss_clip": 1.06785464, "balance_loss_mlp": 0.29673055, "epoch": 0.07846084473169998, "flos": 21360816432000.0, "grad_norm": 22.67342063662987, "language_loss": 0.82748127, "learning_rate": 3.975416266765542e-06, "loss": 0.84488702, "num_input_tokens_seen": 27743115, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.32910156, "step": 1305, "time_per_iteration": 2.6626453399658203 }, { "auxiliary_loss_clip": 0.01418873, "auxiliary_loss_mlp": 0.00318543, "balance_loss_clip": 1.07806051, "balance_loss_mlp": 0.28685689, "epoch": 0.07852096798436796, "flos": 25410965345280.0, "grad_norm": 3.3206413140308517, "language_loss": 0.93043554, "learning_rate": 3.975355352771841e-06, "loss": 0.94780964, "num_input_tokens_seen": 27763570, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.31689453, "step": 1306, "time_per_iteration": 2.6679725646972656 }, { "auxiliary_loss_clip": 0.01410053, "auxiliary_loss_mlp": 0.00308313, "balance_loss_clip": 1.07601142, "balance_loss_mlp": 0.27879685, "epoch": 0.07858109123703592, "flos": 24571481610240.0, "grad_norm": 120.56965652071403, "language_loss": 0.96196103, "learning_rate": 3.975294363872468e-06, "loss": 0.97914469, "num_input_tokens_seen": 27780030, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.29492188, "step": 1307, "time_per_iteration": 2.717865467071533 }, { "auxiliary_loss_clip": 0.01408236, "auxiliary_loss_mlp": 0.00322812, "balance_loss_clip": 1.07271171, "balance_loss_mlp": 0.2894097, "epoch": 0.07864121448970389, "flos": 20698874645760.0, "grad_norm": 17.687448734907633, "language_loss": 0.90909654, "learning_rate": 3.975233300069735e-06, "loss": 0.92640698, "num_input_tokens_seen": 27796225, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.33374023, "step": 1308, "time_per_iteration": 2.647608995437622 }, { "auxiliary_loss_clip": 0.01394271, "auxiliary_loss_mlp": 0.00300571, "balance_loss_clip": 1.06297922, "balance_loss_mlp": 0.27044648, "epoch": 0.07870133774237187, "flos": 22966526113920.0, "grad_norm": 8.147203224519316, "language_loss": 0.82811677, "learning_rate": 3.975172161365958e-06, "loss": 0.84506518, "num_input_tokens_seen": 27815975, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.30126953, "step": 1309, "time_per_iteration": 2.6644253730773926 }, { "auxiliary_loss_clip": 0.01407384, "auxiliary_loss_mlp": 0.00330953, "balance_loss_clip": 1.07292914, "balance_loss_mlp": 0.3006376, "epoch": 0.07876146099503983, "flos": 18842832103680.0, "grad_norm": 2.9965015254121434, "language_loss": 0.8733964, "learning_rate": 3.975110947763453e-06, "loss": 0.89077973, "num_input_tokens_seen": 27832255, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.30334473, "step": 1310, "time_per_iteration": 2.6906721591949463 }, { "auxiliary_loss_clip": 0.01395347, "auxiliary_loss_mlp": 0.00312646, "balance_loss_clip": 1.07704675, "balance_loss_mlp": 0.2825579, "epoch": 0.0788215842477078, "flos": 23805794367360.0, "grad_norm": 10.04567384592467, "language_loss": 0.81175137, "learning_rate": 3.9750496592645435e-06, "loss": 0.82883132, "num_input_tokens_seen": 27852180, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.30078125, "step": 1311, "time_per_iteration": 2.6528451442718506 }, { "auxiliary_loss_clip": 0.01400381, "auxiliary_loss_mlp": 0.00336469, "balance_loss_clip": 1.07760525, "balance_loss_mlp": 0.30487901, "epoch": 0.07888170750037576, "flos": 21579907438080.0, "grad_norm": 11.478228168800827, "language_loss": 0.92468333, "learning_rate": 3.974988295871553e-06, "loss": 0.94205177, "num_input_tokens_seen": 27871435, "router_z_loss_clip": 3.22851562, "router_z_loss_mlp": 0.31591797, "step": 1312, "time_per_iteration": 2.6705079078674316 }, { "auxiliary_loss_clip": 0.01385438, "auxiliary_loss_mlp": 0.00316402, "balance_loss_clip": 1.06337905, "balance_loss_mlp": 0.28726742, "epoch": 0.07894183075304374, "flos": 19864849777920.0, "grad_norm": 26.338500635792823, "language_loss": 0.88949043, "learning_rate": 3.9749268575868085e-06, "loss": 0.90650886, "num_input_tokens_seen": 27890625, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.29125977, "step": 1313, "time_per_iteration": 2.6854147911071777 }, { "auxiliary_loss_clip": 0.01390935, "auxiliary_loss_mlp": 0.00331473, "balance_loss_clip": 1.06350064, "balance_loss_mlp": 0.29692662, "epoch": 0.07900195400571171, "flos": 16143463071360.0, "grad_norm": 4.080048582420998, "language_loss": 0.83872008, "learning_rate": 3.97486534441264e-06, "loss": 0.85594422, "num_input_tokens_seen": 27906530, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.34545898, "step": 1314, "time_per_iteration": 2.6966142654418945 }, { "auxiliary_loss_clip": 0.01392291, "auxiliary_loss_mlp": 0.00291491, "balance_loss_clip": 1.06610084, "balance_loss_mlp": 0.25799349, "epoch": 0.07906207725837967, "flos": 23730417676800.0, "grad_norm": 18.15723668597912, "language_loss": 0.85387945, "learning_rate": 3.974803756351379e-06, "loss": 0.87071723, "num_input_tokens_seen": 27926725, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.33520508, "step": 1315, "time_per_iteration": 2.659820079803467 }, { "auxiliary_loss_clip": 0.01390301, "auxiliary_loss_mlp": 0.00296508, "balance_loss_clip": 1.06603003, "balance_loss_mlp": 0.25924331, "epoch": 0.07912220051104765, "flos": 24315905364480.0, "grad_norm": 3.577516834993414, "language_loss": 0.79986447, "learning_rate": 3.974742093405362e-06, "loss": 0.81673259, "num_input_tokens_seen": 27947875, "router_z_loss_clip": 3.24414062, "router_z_loss_mlp": 0.37304688, "step": 1316, "time_per_iteration": 2.769587993621826 }, { "auxiliary_loss_clip": 0.01394711, "auxiliary_loss_mlp": 0.00291121, "balance_loss_clip": 1.0727675, "balance_loss_mlp": 0.25590658, "epoch": 0.07918232376371562, "flos": 18880035615360.0, "grad_norm": 5.184978455922231, "language_loss": 0.78060043, "learning_rate": 3.974680355576927e-06, "loss": 0.79745877, "num_input_tokens_seen": 27965040, "router_z_loss_clip": 3.21679688, "router_z_loss_mlp": 0.35229492, "step": 1317, "time_per_iteration": 2.615846872329712 }, { "auxiliary_loss_clip": 0.01408987, "auxiliary_loss_mlp": 0.00305417, "balance_loss_clip": 1.0773716, "balance_loss_mlp": 0.26691246, "epoch": 0.07924244701638358, "flos": 27376284038400.0, "grad_norm": 176.31192396419831, "language_loss": 0.85277057, "learning_rate": 3.974618542868415e-06, "loss": 0.86991465, "num_input_tokens_seen": 27985330, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.38525391, "step": 1318, "time_per_iteration": 2.681126594543457 }, { "auxiliary_loss_clip": 0.01388766, "auxiliary_loss_mlp": 0.00264503, "balance_loss_clip": 1.0707655, "balance_loss_mlp": 0.2302184, "epoch": 0.07930257026905156, "flos": 25120340403840.0, "grad_norm": 756.8473818355302, "language_loss": 0.97091597, "learning_rate": 3.97455665528217e-06, "loss": 0.98744857, "num_input_tokens_seen": 28007615, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.3425293, "step": 1319, "time_per_iteration": 2.671539783477783 }, { "auxiliary_loss_clip": 0.01382685, "auxiliary_loss_mlp": 0.00233903, "balance_loss_clip": 1.06473112, "balance_loss_mlp": 0.1955649, "epoch": 0.07936269352171953, "flos": 21834478103040.0, "grad_norm": 11.650460679611053, "language_loss": 0.88113636, "learning_rate": 3.974494692820539e-06, "loss": 0.89730227, "num_input_tokens_seen": 28027765, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.38330078, "step": 1320, "time_per_iteration": 2.648799419403076 }, { "auxiliary_loss_clip": 0.01404987, "auxiliary_loss_mlp": 0.00278176, "balance_loss_clip": 1.0882796, "balance_loss_mlp": 0.23855065, "epoch": 0.07942281677438749, "flos": 16939889377920.0, "grad_norm": 260.1573593843893, "language_loss": 0.77232158, "learning_rate": 3.974432655485872e-06, "loss": 0.78915322, "num_input_tokens_seen": 28044225, "router_z_loss_clip": 3.1640625, "router_z_loss_mlp": 0.39648438, "step": 1321, "time_per_iteration": 2.579237937927246 }, { "auxiliary_loss_clip": 0.0139272, "auxiliary_loss_mlp": 0.00241047, "balance_loss_clip": 1.0747391, "balance_loss_mlp": 0.20261356, "epoch": 0.07948294002705546, "flos": 18986941468800.0, "grad_norm": 8.838144935149005, "language_loss": 0.92992288, "learning_rate": 3.9743705432805195e-06, "loss": 0.94626057, "num_input_tokens_seen": 28062915, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.38452148, "step": 1322, "time_per_iteration": 2.669523000717163 }, { "auxiliary_loss_clip": 0.01403796, "auxiliary_loss_mlp": 0.0026753, "balance_loss_clip": 1.08187318, "balance_loss_mlp": 0.22726148, "epoch": 0.07954306327972344, "flos": 21653452535040.0, "grad_norm": 12.326743069990895, "language_loss": 0.98932981, "learning_rate": 3.974308356206838e-06, "loss": 1.0060432, "num_input_tokens_seen": 28082175, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.40283203, "step": 1323, "time_per_iteration": 2.6597750186920166 }, { "auxiliary_loss_clip": 0.01414834, "auxiliary_loss_mlp": 0.00253188, "balance_loss_clip": 1.09591353, "balance_loss_mlp": 0.21253768, "epoch": 0.0796031865323914, "flos": 23220270766080.0, "grad_norm": 2.385538940534498, "language_loss": 0.89007491, "learning_rate": 3.974246094267187e-06, "loss": 0.90675509, "num_input_tokens_seen": 28102645, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.40625, "step": 1324, "time_per_iteration": 2.6459662914276123 }, { "auxiliary_loss_clip": 0.01419459, "auxiliary_loss_mlp": 0.00262011, "balance_loss_clip": 1.092273, "balance_loss_mlp": 0.21993056, "epoch": 0.07966330978505937, "flos": 23294534135040.0, "grad_norm": 3.8664466737200485, "language_loss": 0.86864859, "learning_rate": 3.974183757463925e-06, "loss": 0.88546336, "num_input_tokens_seen": 28122805, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.42089844, "step": 1325, "time_per_iteration": 2.6315460205078125 }, { "auxiliary_loss_clip": 0.01422942, "auxiliary_loss_mlp": 0.00251991, "balance_loss_clip": 1.10237932, "balance_loss_mlp": 0.21412989, "epoch": 0.07972343303772735, "flos": 18363783392640.0, "grad_norm": 11.667800206315418, "language_loss": 0.95391053, "learning_rate": 3.974121345799418e-06, "loss": 0.97065985, "num_input_tokens_seen": 28140530, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.37866211, "step": 1326, "time_per_iteration": 4.018853187561035 }, { "auxiliary_loss_clip": 0.01433172, "auxiliary_loss_mlp": 0.00260051, "balance_loss_clip": 1.1066407, "balance_loss_mlp": 0.21584773, "epoch": 0.07978355629039531, "flos": 21762513204480.0, "grad_norm": 6.975556626986303, "language_loss": 0.90548259, "learning_rate": 3.974058859276032e-06, "loss": 0.92241478, "num_input_tokens_seen": 28159640, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.44213867, "step": 1327, "time_per_iteration": 4.1199305057525635 }, { "auxiliary_loss_clip": 0.01445254, "auxiliary_loss_mlp": 0.00280698, "balance_loss_clip": 1.10936821, "balance_loss_mlp": 0.23797376, "epoch": 0.07984367954306328, "flos": 18551309322240.0, "grad_norm": 5.054045915624102, "language_loss": 0.88045537, "learning_rate": 3.9739962978961354e-06, "loss": 0.89771485, "num_input_tokens_seen": 28177050, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.42724609, "step": 1328, "time_per_iteration": 2.6364290714263916 }, { "auxiliary_loss_clip": 0.0144175, "auxiliary_loss_mlp": 0.00266718, "balance_loss_clip": 1.11200893, "balance_loss_mlp": 0.22623454, "epoch": 0.07990380279573125, "flos": 16904050583040.0, "grad_norm": 14.179001909603684, "language_loss": 0.87453425, "learning_rate": 3.973933661662101e-06, "loss": 0.89161897, "num_input_tokens_seen": 28193245, "router_z_loss_clip": 3.29492188, "router_z_loss_mlp": 0.40478516, "step": 1329, "time_per_iteration": 2.580385446548462 }, { "auxiliary_loss_clip": 0.01431004, "auxiliary_loss_mlp": 0.00262007, "balance_loss_clip": 1.11454558, "balance_loss_mlp": 0.22238135, "epoch": 0.07996392604839922, "flos": 24098358643200.0, "grad_norm": 59.21744736908394, "language_loss": 0.87752497, "learning_rate": 3.973870950576305e-06, "loss": 0.89445502, "num_input_tokens_seen": 28213570, "router_z_loss_clip": 3.1640625, "router_z_loss_mlp": 0.39599609, "step": 1330, "time_per_iteration": 4.077152490615845 }, { "auxiliary_loss_clip": 0.01444203, "auxiliary_loss_mlp": 0.00267575, "balance_loss_clip": 1.11664915, "balance_loss_mlp": 0.23004797, "epoch": 0.08002404930106718, "flos": 14278729438080.0, "grad_norm": 65.28312445244964, "language_loss": 0.96788812, "learning_rate": 3.9738081646411255e-06, "loss": 0.98500586, "num_input_tokens_seen": 28229980, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.37524414, "step": 1331, "time_per_iteration": 2.6203792095184326 }, { "auxiliary_loss_clip": 0.01440922, "auxiliary_loss_mlp": 0.00298727, "balance_loss_clip": 1.11194932, "balance_loss_mlp": 0.26015094, "epoch": 0.08008417255373516, "flos": 40406219285760.0, "grad_norm": 49.18814412866486, "language_loss": 0.81172788, "learning_rate": 3.973745303858942e-06, "loss": 0.82912439, "num_input_tokens_seen": 28253840, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.38598633, "step": 1332, "time_per_iteration": 2.8272128105163574 }, { "auxiliary_loss_clip": 0.01426554, "auxiliary_loss_mlp": 0.00248876, "balance_loss_clip": 1.10953546, "balance_loss_mlp": 0.21194528, "epoch": 0.08014429580640313, "flos": 18478913460480.0, "grad_norm": 32.2903565935653, "language_loss": 0.88781768, "learning_rate": 3.973682368232138e-06, "loss": 0.90457195, "num_input_tokens_seen": 28271675, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.36938477, "step": 1333, "time_per_iteration": 4.1850972175598145 }, { "auxiliary_loss_clip": 0.01436825, "auxiliary_loss_mlp": 0.00237983, "balance_loss_clip": 1.12227488, "balance_loss_mlp": 0.19864425, "epoch": 0.0802044190590711, "flos": 22053461368320.0, "grad_norm": 69.7683104693559, "language_loss": 0.83835208, "learning_rate": 3.9736193577631015e-06, "loss": 0.85510015, "num_input_tokens_seen": 28291850, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.39331055, "step": 1334, "time_per_iteration": 2.6641104221343994 }, { "auxiliary_loss_clip": 0.01456593, "auxiliary_loss_mlp": 0.0027576, "balance_loss_clip": 1.12998819, "balance_loss_mlp": 0.23572937, "epoch": 0.08026454231173906, "flos": 24572128055040.0, "grad_norm": 8.306882742668991, "language_loss": 0.87945241, "learning_rate": 3.973556272454221e-06, "loss": 0.8967759, "num_input_tokens_seen": 28310780, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.39990234, "step": 1335, "time_per_iteration": 2.698497772216797 }, { "auxiliary_loss_clip": 0.02042788, "auxiliary_loss_mlp": 0.00177893, "balance_loss_clip": 1.5445745, "balance_loss_mlp": 0.14775665, "epoch": 0.08032466556440704, "flos": 52581841459200.0, "grad_norm": 0.7601829488643302, "language_loss": 0.56162977, "learning_rate": 3.973493112307889e-06, "loss": 0.58383656, "num_input_tokens_seen": 28369985, "router_z_loss_clip": 5.0, "router_z_loss_mlp": 0.30078125, "step": 1336, "time_per_iteration": 3.205627918243408 }, { "auxiliary_loss_clip": 0.01451414, "auxiliary_loss_mlp": 0.0026925, "balance_loss_clip": 1.13348436, "balance_loss_mlp": 0.23112664, "epoch": 0.080384788817075, "flos": 23842602829440.0, "grad_norm": 5.4611488041055845, "language_loss": 0.76377583, "learning_rate": 3.9734298773265005e-06, "loss": 0.78098238, "num_input_tokens_seen": 28388670, "router_z_loss_clip": 3.17773438, "router_z_loss_mlp": 0.38134766, "step": 1337, "time_per_iteration": 2.701704740524292 }, { "auxiliary_loss_clip": 0.01420685, "auxiliary_loss_mlp": 0.00254874, "balance_loss_clip": 1.11420918, "balance_loss_mlp": 0.22035086, "epoch": 0.08044491206974297, "flos": 25300719527040.0, "grad_norm": 14.069328534570172, "language_loss": 0.95055366, "learning_rate": 3.973366567512453e-06, "loss": 0.96730924, "num_input_tokens_seen": 28411845, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.3449707, "step": 1338, "time_per_iteration": 2.7330636978149414 }, { "auxiliary_loss_clip": 0.01436469, "auxiliary_loss_mlp": 0.00294185, "balance_loss_clip": 1.11614764, "balance_loss_mlp": 0.25618076, "epoch": 0.08050503532241095, "flos": 22376549226240.0, "grad_norm": 5.8244769519333754, "language_loss": 0.94220161, "learning_rate": 3.973303182868147e-06, "loss": 0.95950818, "num_input_tokens_seen": 28427875, "router_z_loss_clip": 3.20507812, "router_z_loss_mlp": 0.37988281, "step": 1339, "time_per_iteration": 2.6214189529418945 }, { "auxiliary_loss_clip": 0.0141966, "auxiliary_loss_mlp": 0.00257269, "balance_loss_clip": 1.11751604, "balance_loss_mlp": 0.22595277, "epoch": 0.08056515857507891, "flos": 18369421827840.0, "grad_norm": 3.3326363771219927, "language_loss": 0.97063345, "learning_rate": 3.973239723395988e-06, "loss": 0.98740268, "num_input_tokens_seen": 28446615, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.31286621, "step": 1340, "time_per_iteration": 2.6067183017730713 }, { "auxiliary_loss_clip": 0.01681847, "auxiliary_loss_mlp": 0.0012862, "balance_loss_clip": 1.373528, "balance_loss_mlp": 0.11145389, "epoch": 0.08062528182774688, "flos": 51348130980480.0, "grad_norm": 0.9098652215587222, "language_loss": 0.65231824, "learning_rate": 3.97317618909838e-06, "loss": 0.67042291, "num_input_tokens_seen": 28505290, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.171875, "step": 1341, "time_per_iteration": 3.0832440853118896 }, { "auxiliary_loss_clip": 0.01434552, "auxiliary_loss_mlp": 0.00300324, "balance_loss_clip": 1.11568832, "balance_loss_mlp": 0.26222479, "epoch": 0.08068540508041486, "flos": 17599712261760.0, "grad_norm": 2.7079291988594205, "language_loss": 0.97051221, "learning_rate": 3.973112579977733e-06, "loss": 0.98786098, "num_input_tokens_seen": 28522735, "router_z_loss_clip": 3.18945312, "router_z_loss_mlp": 0.38110352, "step": 1342, "time_per_iteration": 2.657400608062744 }, { "auxiliary_loss_clip": 0.01426119, "auxiliary_loss_mlp": 0.00297171, "balance_loss_clip": 1.12599373, "balance_loss_mlp": 0.26326814, "epoch": 0.08074552833308282, "flos": 10561185486720.0, "grad_norm": 2.675041880325655, "language_loss": 0.83563262, "learning_rate": 3.973048896036459e-06, "loss": 0.85286546, "num_input_tokens_seen": 28539460, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.33911133, "step": 1343, "time_per_iteration": 2.638948678970337 }, { "auxiliary_loss_clip": 0.01565932, "auxiliary_loss_mlp": 0.00292391, "balance_loss_clip": 1.29619789, "balance_loss_mlp": 0.27760869, "epoch": 0.08080565158575079, "flos": 60840254954880.0, "grad_norm": 0.79597231301682, "language_loss": 0.57763076, "learning_rate": 3.972985137276974e-06, "loss": 0.596214, "num_input_tokens_seen": 28599855, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.14746094, "step": 1344, "time_per_iteration": 3.0435681343078613 }, { "auxiliary_loss_clip": 0.01420147, "auxiliary_loss_mlp": 0.00311191, "balance_loss_clip": 1.1183486, "balance_loss_mlp": 0.28022087, "epoch": 0.08086577483841875, "flos": 18332361970560.0, "grad_norm": 5.039585428395724, "language_loss": 0.97421706, "learning_rate": 3.972921303701695e-06, "loss": 0.99153042, "num_input_tokens_seen": 28617585, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.31005859, "step": 1345, "time_per_iteration": 2.6152117252349854 }, { "auxiliary_loss_clip": 0.01405817, "auxiliary_loss_mlp": 0.00274243, "balance_loss_clip": 1.10885477, "balance_loss_mlp": 0.24331996, "epoch": 0.08092589809108673, "flos": 21543601766400.0, "grad_norm": 5.838819945360532, "language_loss": 0.9270404, "learning_rate": 3.972857395313042e-06, "loss": 0.94384098, "num_input_tokens_seen": 28636355, "router_z_loss_clip": 2.97070312, "router_z_loss_mlp": 0.30932617, "step": 1346, "time_per_iteration": 2.6433396339416504 }, { "auxiliary_loss_clip": 0.01413063, "auxiliary_loss_mlp": 0.00303431, "balance_loss_clip": 1.11475408, "balance_loss_mlp": 0.2691468, "epoch": 0.0809860213437547, "flos": 22128012046080.0, "grad_norm": 77.41764165493458, "language_loss": 0.98336738, "learning_rate": 3.972793412113439e-06, "loss": 1.00053239, "num_input_tokens_seen": 28656260, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.34277344, "step": 1347, "time_per_iteration": 2.744241237640381 }, { "auxiliary_loss_clip": 0.01400941, "auxiliary_loss_mlp": 0.00301265, "balance_loss_clip": 1.10825872, "balance_loss_mlp": 0.26955515, "epoch": 0.08104614459642266, "flos": 21725489260800.0, "grad_norm": 1.6436972827826741, "language_loss": 0.95333678, "learning_rate": 3.972729354105312e-06, "loss": 0.97035885, "num_input_tokens_seen": 28675865, "router_z_loss_clip": 2.92578125, "router_z_loss_mlp": 0.31665039, "step": 1348, "time_per_iteration": 2.7137255668640137 }, { "auxiliary_loss_clip": 0.01407239, "auxiliary_loss_mlp": 0.00324102, "balance_loss_clip": 1.11742091, "balance_loss_mlp": 0.29306, "epoch": 0.08110626784909064, "flos": 23951878980480.0, "grad_norm": 32.97636595817586, "language_loss": 0.82468939, "learning_rate": 3.97266522129109e-06, "loss": 0.84200275, "num_input_tokens_seen": 28696255, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.31054688, "step": 1349, "time_per_iteration": 2.7487282752990723 }, { "auxiliary_loss_clip": 0.01413591, "auxiliary_loss_mlp": 0.00327915, "balance_loss_clip": 1.11875904, "balance_loss_mlp": 0.29332051, "epoch": 0.0811663911017586, "flos": 19025689265280.0, "grad_norm": 10.447957278290486, "language_loss": 0.95505327, "learning_rate": 3.972601013673205e-06, "loss": 0.97246826, "num_input_tokens_seen": 28713905, "router_z_loss_clip": 2.9453125, "router_z_loss_mlp": 0.34594727, "step": 1350, "time_per_iteration": 2.617375135421753 }, { "auxiliary_loss_clip": 0.01421181, "auxiliary_loss_mlp": 0.00315214, "balance_loss_clip": 1.13039172, "balance_loss_mlp": 0.28369457, "epoch": 0.08122651435442657, "flos": 15341290588800.0, "grad_norm": 6.912938196762292, "language_loss": 0.91077328, "learning_rate": 3.972536731254092e-06, "loss": 0.92813718, "num_input_tokens_seen": 28732075, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.31518555, "step": 1351, "time_per_iteration": 2.691523790359497 }, { "auxiliary_loss_clip": 0.01423061, "auxiliary_loss_mlp": 0.00322645, "balance_loss_clip": 1.12792492, "balance_loss_mlp": 0.28518909, "epoch": 0.08128663760709455, "flos": 23221563655680.0, "grad_norm": 32.31038849588147, "language_loss": 0.82267827, "learning_rate": 3.972472374036189e-06, "loss": 0.84013534, "num_input_tokens_seen": 28751150, "router_z_loss_clip": 2.95117188, "router_z_loss_mlp": 0.37451172, "step": 1352, "time_per_iteration": 2.667226791381836 }, { "auxiliary_loss_clip": 0.01454504, "auxiliary_loss_mlp": 0.00352808, "balance_loss_clip": 1.15080142, "balance_loss_mlp": 0.31509066, "epoch": 0.08134676085976252, "flos": 22965628273920.0, "grad_norm": 13.358210995339473, "language_loss": 0.89220464, "learning_rate": 3.972407942021935e-06, "loss": 0.91027784, "num_input_tokens_seen": 28773360, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.37695312, "step": 1353, "time_per_iteration": 2.763735055923462 }, { "auxiliary_loss_clip": 0.0144267, "auxiliary_loss_mlp": 0.00734404, "balance_loss_clip": 1.18967795, "balance_loss_mlp": 0.71266013, "epoch": 0.08140688411243048, "flos": 64322115816960.0, "grad_norm": 0.9461380544002201, "language_loss": 0.59893882, "learning_rate": 3.972343435213775e-06, "loss": 0.62070954, "num_input_tokens_seen": 28833390, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.21777344, "step": 1354, "time_per_iteration": 3.2026994228363037 }, { "auxiliary_loss_clip": 0.01431726, "auxiliary_loss_mlp": 0.00324446, "balance_loss_clip": 1.14842105, "balance_loss_mlp": 0.29149652, "epoch": 0.08146700736509845, "flos": 22491858862080.0, "grad_norm": 8.061344233741078, "language_loss": 0.90319955, "learning_rate": 3.972278853614154e-06, "loss": 0.92076129, "num_input_tokens_seen": 28852430, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.32958984, "step": 1355, "time_per_iteration": 2.636833429336548 }, { "auxiliary_loss_clip": 0.01448948, "auxiliary_loss_mlp": 0.00354532, "balance_loss_clip": 1.16368151, "balance_loss_mlp": 0.31791103, "epoch": 0.08152713061776642, "flos": 20447823513600.0, "grad_norm": 649.2016172846198, "language_loss": 0.80451214, "learning_rate": 3.972214197225521e-06, "loss": 0.82254696, "num_input_tokens_seen": 28870685, "router_z_loss_clip": 2.85546875, "router_z_loss_mlp": 0.36621094, "step": 1356, "time_per_iteration": 2.641563653945923 }, { "auxiliary_loss_clip": 0.01447019, "auxiliary_loss_mlp": 0.00379513, "balance_loss_clip": 1.15043473, "balance_loss_mlp": 0.34029353, "epoch": 0.08158725387043439, "flos": 23550218121600.0, "grad_norm": 1424.8314398088262, "language_loss": 0.76557982, "learning_rate": 3.972149466050329e-06, "loss": 0.78384519, "num_input_tokens_seen": 28889860, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.39257812, "step": 1357, "time_per_iteration": 2.7008700370788574 }, { "auxiliary_loss_clip": 0.01442217, "auxiliary_loss_mlp": 0.00366685, "balance_loss_clip": 1.1464138, "balance_loss_mlp": 0.32894379, "epoch": 0.08164737712310235, "flos": 22017335264640.0, "grad_norm": 21.227015740965125, "language_loss": 0.91807985, "learning_rate": 3.97208466009103e-06, "loss": 0.93616885, "num_input_tokens_seen": 28905865, "router_z_loss_clip": 2.95703125, "router_z_loss_mlp": 0.37744141, "step": 1358, "time_per_iteration": 2.679595470428467 }, { "auxiliary_loss_clip": 0.01453959, "auxiliary_loss_mlp": 0.00418901, "balance_loss_clip": 1.15256476, "balance_loss_mlp": 0.37510327, "epoch": 0.08170750037577033, "flos": 23367827836800.0, "grad_norm": 3.3955513623216826, "language_loss": 1.07820153, "learning_rate": 3.972019779350084e-06, "loss": 1.09693003, "num_input_tokens_seen": 28925250, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.43774414, "step": 1359, "time_per_iteration": 2.6239664554595947 }, { "auxiliary_loss_clip": 0.01461177, "auxiliary_loss_mlp": 0.00408633, "balance_loss_clip": 1.16645861, "balance_loss_mlp": 0.36965173, "epoch": 0.0817676236284383, "flos": 28397978490240.0, "grad_norm": 7.555162744047138, "language_loss": 0.92664993, "learning_rate": 3.971954823829951e-06, "loss": 0.94534802, "num_input_tokens_seen": 28943445, "router_z_loss_clip": 2.94921875, "router_z_loss_mlp": 0.38989258, "step": 1360, "time_per_iteration": 2.6898250579833984 }, { "auxiliary_loss_clip": 0.01481577, "auxiliary_loss_mlp": 0.00424362, "balance_loss_clip": 1.18043351, "balance_loss_mlp": 0.38528496, "epoch": 0.08182774688110626, "flos": 19208905562880.0, "grad_norm": 25.29377455752081, "language_loss": 0.8328377, "learning_rate": 3.971889793533093e-06, "loss": 0.85189712, "num_input_tokens_seen": 28962695, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.390625, "step": 1361, "time_per_iteration": 2.6112399101257324 }, { "auxiliary_loss_clip": 0.01476786, "auxiliary_loss_mlp": 0.00372993, "balance_loss_clip": 1.18250513, "balance_loss_mlp": 0.33274817, "epoch": 0.08188787013377424, "flos": 22784099915520.0, "grad_norm": 5.30383032689686, "language_loss": 0.85654593, "learning_rate": 3.971824688461976e-06, "loss": 0.87504375, "num_input_tokens_seen": 28982120, "router_z_loss_clip": 2.9453125, "router_z_loss_mlp": 0.40209961, "step": 1362, "time_per_iteration": 2.72064208984375 }, { "auxiliary_loss_clip": 0.01501628, "auxiliary_loss_mlp": 0.00371492, "balance_loss_clip": 1.20607579, "balance_loss_mlp": 0.33212864, "epoch": 0.08194799338644221, "flos": 16468095214080.0, "grad_norm": 21.194398991352116, "language_loss": 0.81935596, "learning_rate": 3.971759508619069e-06, "loss": 0.8380872, "num_input_tokens_seen": 28998100, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.39379883, "step": 1363, "time_per_iteration": 2.5952396392822266 }, { "auxiliary_loss_clip": 0.01494139, "auxiliary_loss_mlp": 0.00376529, "balance_loss_clip": 1.19424534, "balance_loss_mlp": 0.33604527, "epoch": 0.08200811663911017, "flos": 23913633974400.0, "grad_norm": 2.70565991622576, "language_loss": 0.84861469, "learning_rate": 3.971694254006844e-06, "loss": 0.86732137, "num_input_tokens_seen": 29017095, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.4050293, "step": 1364, "time_per_iteration": 2.684079170227051 }, { "auxiliary_loss_clip": 0.01512381, "auxiliary_loss_mlp": 0.00372853, "balance_loss_clip": 1.20360422, "balance_loss_mlp": 0.33217889, "epoch": 0.08206823989177814, "flos": 17896550256000.0, "grad_norm": 10.3001611154545, "language_loss": 0.87720716, "learning_rate": 3.971628924627776e-06, "loss": 0.89605945, "num_input_tokens_seen": 29037240, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.40625, "step": 1365, "time_per_iteration": 2.7238385677337646 }, { "auxiliary_loss_clip": 0.01505445, "auxiliary_loss_mlp": 0.0035495, "balance_loss_clip": 1.200032, "balance_loss_mlp": 0.31644556, "epoch": 0.08212836314444612, "flos": 22088186841600.0, "grad_norm": 3.371759335043087, "language_loss": 0.86607736, "learning_rate": 3.97156352048434e-06, "loss": 0.88468134, "num_input_tokens_seen": 29056250, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.38476562, "step": 1366, "time_per_iteration": 2.6997082233428955 }, { "auxiliary_loss_clip": 0.01503157, "auxiliary_loss_mlp": 0.00340469, "balance_loss_clip": 1.18876421, "balance_loss_mlp": 0.29972345, "epoch": 0.08218848639711408, "flos": 17597485618560.0, "grad_norm": 19.524977014807764, "language_loss": 0.89101708, "learning_rate": 3.97149804157902e-06, "loss": 0.90945339, "num_input_tokens_seen": 29073380, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.4074707, "step": 1367, "time_per_iteration": 2.5843591690063477 }, { "auxiliary_loss_clip": 0.01524178, "auxiliary_loss_mlp": 0.00380291, "balance_loss_clip": 1.2023375, "balance_loss_mlp": 0.3390688, "epoch": 0.08224860964978205, "flos": 17857838373120.0, "grad_norm": 3.1394002588550505, "language_loss": 0.92075825, "learning_rate": 3.9714324879142946e-06, "loss": 0.939803, "num_input_tokens_seen": 29091330, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.41186523, "step": 1368, "time_per_iteration": 2.613765239715576 }, { "auxiliary_loss_clip": 0.01503181, "auxiliary_loss_mlp": 0.00313359, "balance_loss_clip": 1.19737434, "balance_loss_mlp": 0.27804875, "epoch": 0.08230873290245003, "flos": 25227533566080.0, "grad_norm": 1319.882525717178, "language_loss": 0.86825335, "learning_rate": 3.971366859492653e-06, "loss": 0.8864187, "num_input_tokens_seen": 29110375, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.35327148, "step": 1369, "time_per_iteration": 5.457366466522217 }, { "auxiliary_loss_clip": 0.01545534, "auxiliary_loss_mlp": 0.00319879, "balance_loss_clip": 1.21872437, "balance_loss_mlp": 0.28435433, "epoch": 0.08236885615511799, "flos": 31759935753600.0, "grad_norm": 7.091798063172654, "language_loss": 0.82685709, "learning_rate": 3.971301156316582e-06, "loss": 0.84551126, "num_input_tokens_seen": 29129395, "router_z_loss_clip": 3.26757812, "router_z_loss_mlp": 0.35522461, "step": 1370, "time_per_iteration": 2.75970721244812 }, { "auxiliary_loss_clip": 0.01550499, "auxiliary_loss_mlp": 0.00354941, "balance_loss_clip": 1.22069907, "balance_loss_mlp": 0.31247887, "epoch": 0.08242897940778596, "flos": 23185832601600.0, "grad_norm": 16.80090765670434, "language_loss": 0.80525839, "learning_rate": 3.971235378388573e-06, "loss": 0.82431281, "num_input_tokens_seen": 29148650, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.42480469, "step": 1371, "time_per_iteration": 2.651773452758789 }, { "auxiliary_loss_clip": 0.01510917, "auxiliary_loss_mlp": 0.00340208, "balance_loss_clip": 1.18743753, "balance_loss_mlp": 0.29962936, "epoch": 0.08248910266045394, "flos": 34491480393600.0, "grad_norm": 17.008377014679116, "language_loss": 0.77367759, "learning_rate": 3.971169525711122e-06, "loss": 0.79218882, "num_input_tokens_seen": 29170785, "router_z_loss_clip": 3.23828125, "router_z_loss_mlp": 0.40576172, "step": 1372, "time_per_iteration": 4.184276580810547 }, { "auxiliary_loss_clip": 0.01536004, "auxiliary_loss_mlp": 0.00381441, "balance_loss_clip": 1.19820213, "balance_loss_mlp": 0.33654669, "epoch": 0.0825492259131219, "flos": 13436228960640.0, "grad_norm": 12.89747305238341, "language_loss": 0.98249853, "learning_rate": 3.9711035982867246e-06, "loss": 1.00167298, "num_input_tokens_seen": 29185210, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.44897461, "step": 1373, "time_per_iteration": 2.591956615447998 }, { "auxiliary_loss_clip": 0.01504488, "auxiliary_loss_mlp": 0.00361955, "balance_loss_clip": 1.17351198, "balance_loss_mlp": 0.321042, "epoch": 0.08260934916578987, "flos": 25812446636160.0, "grad_norm": 16.96708677937049, "language_loss": 0.90338355, "learning_rate": 3.971037596117882e-06, "loss": 0.92204797, "num_input_tokens_seen": 29205210, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.40917969, "step": 1374, "time_per_iteration": 2.6778225898742676 }, { "auxiliary_loss_clip": 0.01286433, "auxiliary_loss_mlp": 0.00312579, "balance_loss_clip": 1.06154203, "balance_loss_mlp": 0.28883225, "epoch": 0.08266947241845783, "flos": 63460009491840.0, "grad_norm": 1.1604104936875914, "language_loss": 0.60793674, "learning_rate": 3.970971519207095e-06, "loss": 0.62392688, "num_input_tokens_seen": 29265350, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.23730469, "step": 1375, "time_per_iteration": 4.589948415756226 }, { "auxiliary_loss_clip": 0.01279185, "auxiliary_loss_mlp": 0.00299179, "balance_loss_clip": 1.05391312, "balance_loss_mlp": 0.27581388, "epoch": 0.08272959567112581, "flos": 69993704568960.0, "grad_norm": 0.9207949082836084, "language_loss": 0.62195438, "learning_rate": 3.970905367556871e-06, "loss": 0.63773805, "num_input_tokens_seen": 29321475, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.23339844, "step": 1376, "time_per_iteration": 3.1141345500946045 }, { "auxiliary_loss_clip": 0.01506086, "auxiliary_loss_mlp": 0.00375788, "balance_loss_clip": 1.17959213, "balance_loss_mlp": 0.33511329, "epoch": 0.08278971892379378, "flos": 20413205781120.0, "grad_norm": 29.75791082188071, "language_loss": 0.87200284, "learning_rate": 3.970839141169718e-06, "loss": 0.89082158, "num_input_tokens_seen": 29341405, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.40673828, "step": 1377, "time_per_iteration": 2.783621311187744 }, { "auxiliary_loss_clip": 0.0149548, "auxiliary_loss_mlp": 0.00343829, "balance_loss_clip": 1.17049563, "balance_loss_mlp": 0.30665916, "epoch": 0.08284984217646174, "flos": 26250233598720.0, "grad_norm": 27.82418002448848, "language_loss": 0.90899485, "learning_rate": 3.970772840048147e-06, "loss": 0.92738795, "num_input_tokens_seen": 29361955, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.37182617, "step": 1378, "time_per_iteration": 2.7616915702819824 }, { "auxiliary_loss_clip": 0.01490664, "auxiliary_loss_mlp": 0.0035293, "balance_loss_clip": 1.16233206, "balance_loss_mlp": 0.31287569, "epoch": 0.08290996542912972, "flos": 27194683852800.0, "grad_norm": 25.91353255953872, "language_loss": 0.93953145, "learning_rate": 3.970706464194672e-06, "loss": 0.95796728, "num_input_tokens_seen": 29382395, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.40087891, "step": 1379, "time_per_iteration": 2.6669485569000244 }, { "auxiliary_loss_clip": 0.0148992, "auxiliary_loss_mlp": 0.00340641, "balance_loss_clip": 1.171103, "balance_loss_mlp": 0.3070004, "epoch": 0.08297008868179769, "flos": 38618191146240.0, "grad_norm": 4.957387895827511, "language_loss": 0.86753297, "learning_rate": 3.970640013611812e-06, "loss": 0.88583863, "num_input_tokens_seen": 29404460, "router_z_loss_clip": 3.18945312, "router_z_loss_mlp": 0.33642578, "step": 1380, "time_per_iteration": 2.7512216567993164 }, { "auxiliary_loss_clip": 0.01476404, "auxiliary_loss_mlp": 0.00359361, "balance_loss_clip": 1.1538167, "balance_loss_mlp": 0.32138044, "epoch": 0.08303021193446565, "flos": 19974736460160.0, "grad_norm": 2.4440670162000764, "language_loss": 0.9310627, "learning_rate": 3.970573488302083e-06, "loss": 0.94942033, "num_input_tokens_seen": 29422675, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.37963867, "step": 1381, "time_per_iteration": 2.583623170852661 }, { "auxiliary_loss_clip": 0.01479991, "auxiliary_loss_mlp": 0.00362033, "balance_loss_clip": 1.15537977, "balance_loss_mlp": 0.32410049, "epoch": 0.08309033518713363, "flos": 13662646341120.0, "grad_norm": 17.340877554445722, "language_loss": 0.9950496, "learning_rate": 3.970506888268011e-06, "loss": 1.01346993, "num_input_tokens_seen": 29439840, "router_z_loss_clip": 3.24414062, "router_z_loss_mlp": 0.37963867, "step": 1382, "time_per_iteration": 2.6213760375976562 }, { "auxiliary_loss_clip": 0.01476485, "auxiliary_loss_mlp": 0.00338399, "balance_loss_clip": 1.1549325, "balance_loss_mlp": 0.30173057, "epoch": 0.0831504584398016, "flos": 17968551068160.0, "grad_norm": 41.384821818849886, "language_loss": 0.82987273, "learning_rate": 3.970440213512121e-06, "loss": 0.84802151, "num_input_tokens_seen": 29457360, "router_z_loss_clip": 3.21484375, "router_z_loss_mlp": 0.36694336, "step": 1383, "time_per_iteration": 2.5799930095672607 }, { "auxiliary_loss_clip": 0.01461771, "auxiliary_loss_mlp": 0.00334515, "balance_loss_clip": 1.14639688, "balance_loss_mlp": 0.29879999, "epoch": 0.08321058169246956, "flos": 22601386408320.0, "grad_norm": 3.64615711127402, "language_loss": 0.91532779, "learning_rate": 3.97037346403694e-06, "loss": 0.9332906, "num_input_tokens_seen": 29477040, "router_z_loss_clip": 3.15234375, "router_z_loss_mlp": 0.35693359, "step": 1384, "time_per_iteration": 2.6343994140625 }, { "auxiliary_loss_clip": 0.0147155, "auxiliary_loss_mlp": 0.00420901, "balance_loss_clip": 1.14142644, "balance_loss_mlp": 0.37803364, "epoch": 0.08327070494513754, "flos": 22850426378880.0, "grad_norm": 11.649242793535327, "language_loss": 0.95609069, "learning_rate": 3.970306639845e-06, "loss": 0.97501516, "num_input_tokens_seen": 29492010, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.42871094, "step": 1385, "time_per_iteration": 2.6666338443756104 }, { "auxiliary_loss_clip": 0.01457364, "auxiliary_loss_mlp": 0.00396145, "balance_loss_clip": 1.13607764, "balance_loss_mlp": 0.35322946, "epoch": 0.0833308281978055, "flos": 22782986593920.0, "grad_norm": 34.959581184296546, "language_loss": 0.7881304, "learning_rate": 3.970239740938835e-06, "loss": 0.80666554, "num_input_tokens_seen": 29511850, "router_z_loss_clip": 3.21484375, "router_z_loss_mlp": 0.4296875, "step": 1386, "time_per_iteration": 2.64005184173584 }, { "auxiliary_loss_clip": 0.014624, "auxiliary_loss_mlp": 0.00374258, "balance_loss_clip": 1.1418376, "balance_loss_mlp": 0.33525279, "epoch": 0.08339095145047347, "flos": 20812604083200.0, "grad_norm": 16.610477528073023, "language_loss": 0.88818771, "learning_rate": 3.97017276732098e-06, "loss": 0.90655428, "num_input_tokens_seen": 29531415, "router_z_loss_clip": 3.20507812, "router_z_loss_mlp": 0.39038086, "step": 1387, "time_per_iteration": 2.710378885269165 }, { "auxiliary_loss_clip": 0.01457109, "auxiliary_loss_mlp": 0.00386503, "balance_loss_clip": 1.13884139, "balance_loss_mlp": 0.34859487, "epoch": 0.08345107470314143, "flos": 18515326872960.0, "grad_norm": 4.152718201045154, "language_loss": 0.84819818, "learning_rate": 3.970105718993978e-06, "loss": 0.86663425, "num_input_tokens_seen": 29549525, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.37915039, "step": 1388, "time_per_iteration": 2.605311870574951 }, { "auxiliary_loss_clip": 0.01443011, "auxiliary_loss_mlp": 0.00357603, "balance_loss_clip": 1.13340139, "balance_loss_mlp": 0.32167298, "epoch": 0.08351119795580941, "flos": 18807567926400.0, "grad_norm": 8.309567451449135, "language_loss": 0.87633842, "learning_rate": 3.970038595960369e-06, "loss": 0.89434457, "num_input_tokens_seen": 29568705, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.359375, "step": 1389, "time_per_iteration": 2.6574862003326416 }, { "auxiliary_loss_clip": 0.0146468, "auxiliary_loss_mlp": 0.00400138, "balance_loss_clip": 1.1459204, "balance_loss_mlp": 0.35874861, "epoch": 0.08357132120847738, "flos": 18441817689600.0, "grad_norm": 59.022413469291216, "language_loss": 0.96220088, "learning_rate": 3.969971398222699e-06, "loss": 0.98084909, "num_input_tokens_seen": 29585855, "router_z_loss_clip": 3.18945312, "router_z_loss_mlp": 0.41381836, "step": 1390, "time_per_iteration": 2.714029550552368 }, { "auxiliary_loss_clip": 0.01470727, "auxiliary_loss_mlp": 0.00374149, "balance_loss_clip": 1.15050161, "balance_loss_mlp": 0.33318818, "epoch": 0.08363144446114534, "flos": 25922333318400.0, "grad_norm": 9.874281619070665, "language_loss": 0.92696488, "learning_rate": 3.969904125783517e-06, "loss": 0.94541365, "num_input_tokens_seen": 29607280, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.40942383, "step": 1391, "time_per_iteration": 2.803018093109131 }, { "auxiliary_loss_clip": 0.01486396, "auxiliary_loss_mlp": 0.00446116, "balance_loss_clip": 1.16030455, "balance_loss_mlp": 0.40105504, "epoch": 0.08369156771381332, "flos": 18041306065920.0, "grad_norm": 42.72536792030382, "language_loss": 0.99160397, "learning_rate": 3.969836778645371e-06, "loss": 1.01092911, "num_input_tokens_seen": 29624130, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.45019531, "step": 1392, "time_per_iteration": 2.6316065788269043 }, { "auxiliary_loss_clip": 0.01484904, "auxiliary_loss_mlp": 0.00403862, "balance_loss_clip": 1.15944028, "balance_loss_mlp": 0.36223447, "epoch": 0.08375169096648129, "flos": 22675111073280.0, "grad_norm": 947.9845413577557, "language_loss": 0.88693172, "learning_rate": 3.969769356810819e-06, "loss": 0.90581942, "num_input_tokens_seen": 29643210, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.41625977, "step": 1393, "time_per_iteration": 2.6955533027648926 }, { "auxiliary_loss_clip": 0.01474758, "auxiliary_loss_mlp": 0.00392328, "balance_loss_clip": 1.15612888, "balance_loss_mlp": 0.35465834, "epoch": 0.08381181421914925, "flos": 26103215232000.0, "grad_norm": 196.96000809576648, "language_loss": 0.91408396, "learning_rate": 3.969701860282415e-06, "loss": 0.93275481, "num_input_tokens_seen": 29663920, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.37670898, "step": 1394, "time_per_iteration": 2.679044008255005 }, { "auxiliary_loss_clip": 0.01473909, "auxiliary_loss_mlp": 0.00401305, "balance_loss_clip": 1.15419221, "balance_loss_mlp": 0.36201355, "epoch": 0.08387193747181723, "flos": 20629782835200.0, "grad_norm": 48.92272220059523, "language_loss": 0.88366318, "learning_rate": 3.969634289062719e-06, "loss": 0.90241534, "num_input_tokens_seen": 29683825, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.39282227, "step": 1395, "time_per_iteration": 2.7173666954040527 }, { "auxiliary_loss_clip": 0.01491892, "auxiliary_loss_mlp": 0.00438842, "balance_loss_clip": 1.16492701, "balance_loss_mlp": 0.39482969, "epoch": 0.0839320607244852, "flos": 13443196199040.0, "grad_norm": 3.486364621210599, "language_loss": 0.91763014, "learning_rate": 3.969566643154293e-06, "loss": 0.93693751, "num_input_tokens_seen": 29698775, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.44018555, "step": 1396, "time_per_iteration": 2.5968189239501953 }, { "auxiliary_loss_clip": 0.01489128, "auxiliary_loss_mlp": 0.00427251, "balance_loss_clip": 1.16639566, "balance_loss_mlp": 0.38605267, "epoch": 0.08399218397715316, "flos": 23477247642240.0, "grad_norm": 3.3466701554884732, "language_loss": 0.83904576, "learning_rate": 3.969498922559703e-06, "loss": 0.85820961, "num_input_tokens_seen": 29719430, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.41210938, "step": 1397, "time_per_iteration": 2.758316993713379 }, { "auxiliary_loss_clip": 0.0150117, "auxiliary_loss_mlp": 0.00421882, "balance_loss_clip": 1.18098354, "balance_loss_mlp": 0.38108853, "epoch": 0.08405230722982113, "flos": 25920717206400.0, "grad_norm": 117.623737339325, "language_loss": 0.85628992, "learning_rate": 3.969431127281516e-06, "loss": 0.87552047, "num_input_tokens_seen": 29739685, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.40771484, "step": 1398, "time_per_iteration": 2.6821324825286865 }, { "auxiliary_loss_clip": 0.01477498, "auxiliary_loss_mlp": 0.00420109, "balance_loss_clip": 1.1636008, "balance_loss_mlp": 0.38141394, "epoch": 0.0841124304824891, "flos": 17967437746560.0, "grad_norm": 307.11899152085994, "language_loss": 1.00626695, "learning_rate": 3.969363257322304e-06, "loss": 1.02524304, "num_input_tokens_seen": 29756165, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.38720703, "step": 1399, "time_per_iteration": 2.6626136302948 }, { "auxiliary_loss_clip": 0.01516433, "auxiliary_loss_mlp": 0.00478644, "balance_loss_clip": 1.17932439, "balance_loss_mlp": 0.43045929, "epoch": 0.08417255373515707, "flos": 25629661301760.0, "grad_norm": 52.06691807792565, "language_loss": 0.89229429, "learning_rate": 3.96929531268464e-06, "loss": 0.91224504, "num_input_tokens_seen": 29776425, "router_z_loss_clip": 3.37304688, "router_z_loss_mlp": 0.48217773, "step": 1400, "time_per_iteration": 2.6726112365722656 }, { "auxiliary_loss_clip": 0.01509501, "auxiliary_loss_mlp": 0.0046302, "balance_loss_clip": 1.17792201, "balance_loss_mlp": 0.418531, "epoch": 0.08423267698782504, "flos": 26249730808320.0, "grad_norm": 3.8812751806786823, "language_loss": 0.92343247, "learning_rate": 3.969227293371099e-06, "loss": 0.94315767, "num_input_tokens_seen": 29796440, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.44458008, "step": 1401, "time_per_iteration": 2.671091318130493 }, { "auxiliary_loss_clip": 0.01480182, "auxiliary_loss_mlp": 0.00463384, "balance_loss_clip": 1.15472519, "balance_loss_mlp": 0.41956222, "epoch": 0.08429280024049302, "flos": 20119707751680.0, "grad_norm": 9.292077422187699, "language_loss": 0.93297279, "learning_rate": 3.969159199384263e-06, "loss": 0.95240831, "num_input_tokens_seen": 29814755, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.43774414, "step": 1402, "time_per_iteration": 2.6246485710144043 }, { "auxiliary_loss_clip": 0.01479151, "auxiliary_loss_mlp": 0.00450614, "balance_loss_clip": 1.15982437, "balance_loss_mlp": 0.41151339, "epoch": 0.08435292349316098, "flos": 42924526836480.0, "grad_norm": 5.146818398454941, "language_loss": 0.94136912, "learning_rate": 3.9690910307267125e-06, "loss": 0.96066672, "num_input_tokens_seen": 29834785, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.390625, "step": 1403, "time_per_iteration": 2.8343868255615234 }, { "auxiliary_loss_clip": 0.01491251, "auxiliary_loss_mlp": 0.0052383, "balance_loss_clip": 1.16804576, "balance_loss_mlp": 0.47676599, "epoch": 0.08441304674582895, "flos": 22857285876480.0, "grad_norm": 1917.7035852091574, "language_loss": 0.87639654, "learning_rate": 3.969022787401033e-06, "loss": 0.89654732, "num_input_tokens_seen": 29854695, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.47070312, "step": 1404, "time_per_iteration": 2.644843578338623 }, { "auxiliary_loss_clip": 0.01491258, "auxiliary_loss_mlp": 0.00554098, "balance_loss_clip": 1.16087317, "balance_loss_mlp": 0.50758219, "epoch": 0.08447316999849692, "flos": 18697501676160.0, "grad_norm": 9.603147835275461, "language_loss": 0.92833388, "learning_rate": 3.968954469409811e-06, "loss": 0.94878745, "num_input_tokens_seen": 29872180, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.46557617, "step": 1405, "time_per_iteration": 2.6111621856689453 }, { "auxiliary_loss_clip": 0.01488433, "auxiliary_loss_mlp": 0.0064152, "balance_loss_clip": 1.16277635, "balance_loss_mlp": 0.59250075, "epoch": 0.08453329325116489, "flos": 25483971738240.0, "grad_norm": 275.78596606624666, "language_loss": 0.85675609, "learning_rate": 3.968886076755639e-06, "loss": 0.87805557, "num_input_tokens_seen": 29893205, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.49023438, "step": 1406, "time_per_iteration": 2.707791805267334 }, { "auxiliary_loss_clip": 0.0150664, "auxiliary_loss_mlp": 0.00721485, "balance_loss_clip": 1.17429471, "balance_loss_mlp": 0.6713928, "epoch": 0.08459341650383286, "flos": 20920048640640.0, "grad_norm": 23.086204912749324, "language_loss": 0.85250294, "learning_rate": 3.96881760944111e-06, "loss": 0.87478423, "num_input_tokens_seen": 29911970, "router_z_loss_clip": 3.32617188, "router_z_loss_mlp": 0.5012207, "step": 1407, "time_per_iteration": 2.610388994216919 }, { "auxiliary_loss_clip": 0.01507452, "auxiliary_loss_mlp": 0.00786237, "balance_loss_clip": 1.17099369, "balance_loss_mlp": 0.73221099, "epoch": 0.08465353975650082, "flos": 13043079624960.0, "grad_norm": 57.22063493213088, "language_loss": 0.98215747, "learning_rate": 3.968749067468819e-06, "loss": 1.00509429, "num_input_tokens_seen": 29929925, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.53979492, "step": 1408, "time_per_iteration": 2.6865179538726807 }, { "auxiliary_loss_clip": 0.01578148, "auxiliary_loss_mlp": 0.00681473, "balance_loss_clip": 1.34453368, "balance_loss_mlp": 0.64237195, "epoch": 0.0847136630091688, "flos": 60877422552960.0, "grad_norm": 0.9466890930103423, "language_loss": 0.62134963, "learning_rate": 3.968680450841368e-06, "loss": 0.64394581, "num_input_tokens_seen": 29985950, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.390625, "step": 1409, "time_per_iteration": 3.2589778900146484 }, { "auxiliary_loss_clip": 0.01512313, "auxiliary_loss_mlp": 0.00888072, "balance_loss_clip": 1.17792308, "balance_loss_mlp": 0.83423698, "epoch": 0.08477378626183676, "flos": 22046530043520.0, "grad_norm": 3.436600939512624, "language_loss": 0.93820202, "learning_rate": 3.968611759561355e-06, "loss": 0.96220589, "num_input_tokens_seen": 30004330, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.53808594, "step": 1410, "time_per_iteration": 2.6580398082733154 }, { "auxiliary_loss_clip": 0.01518692, "auxiliary_loss_mlp": 0.00887568, "balance_loss_clip": 1.16877747, "balance_loss_mlp": 0.82801116, "epoch": 0.08483390951450473, "flos": 16690059308160.0, "grad_norm": 3.9595279578017806, "language_loss": 0.80310565, "learning_rate": 3.968542993631388e-06, "loss": 0.82716823, "num_input_tokens_seen": 30022555, "router_z_loss_clip": 3.49414062, "router_z_loss_mlp": 0.59570312, "step": 1411, "time_per_iteration": 5.423904657363892 }, { "auxiliary_loss_clip": 0.01470354, "auxiliary_loss_mlp": 0.00506167, "balance_loss_clip": 1.24291337, "balance_loss_mlp": 0.46763849, "epoch": 0.08489403276717271, "flos": 51584640082560.0, "grad_norm": 0.9027719414639797, "language_loss": 0.5690847, "learning_rate": 3.968474153054073e-06, "loss": 0.5888499, "num_input_tokens_seen": 30077220, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.38476562, "step": 1412, "time_per_iteration": 3.097105026245117 }, { "auxiliary_loss_clip": 0.01495942, "auxiliary_loss_mlp": 0.00691523, "balance_loss_clip": 1.14618254, "balance_loss_mlp": 0.64026308, "epoch": 0.08495415601984067, "flos": 17092330698240.0, "grad_norm": 148.88458336940633, "language_loss": 0.98071611, "learning_rate": 3.96840523783202e-06, "loss": 1.00259078, "num_input_tokens_seen": 30094600, "router_z_loss_clip": 3.49804688, "router_z_loss_mlp": 0.51245117, "step": 1413, "time_per_iteration": 2.6239163875579834 }, { "auxiliary_loss_clip": 0.01494976, "auxiliary_loss_mlp": 0.00676632, "balance_loss_clip": 1.14531684, "balance_loss_mlp": 0.62468112, "epoch": 0.08501427927250864, "flos": 23148413608320.0, "grad_norm": 5.253088800144623, "language_loss": 0.9449749, "learning_rate": 3.968336247967844e-06, "loss": 0.96669096, "num_input_tokens_seen": 30114475, "router_z_loss_clip": 3.49804688, "router_z_loss_mlp": 0.51928711, "step": 1414, "time_per_iteration": 2.692439079284668 }, { "auxiliary_loss_clip": 0.01492321, "auxiliary_loss_mlp": 0.00628795, "balance_loss_clip": 1.13486242, "balance_loss_mlp": 0.57958543, "epoch": 0.08507440252517662, "flos": 19063467394560.0, "grad_norm": 5.732840028320461, "language_loss": 0.82910991, "learning_rate": 3.96826718346416e-06, "loss": 0.85032105, "num_input_tokens_seen": 30133350, "router_z_loss_clip": 3.57617188, "router_z_loss_mlp": 0.49194336, "step": 1415, "time_per_iteration": 4.076260328292847 }, { "auxiliary_loss_clip": 0.01497348, "auxiliary_loss_mlp": 0.00596743, "balance_loss_clip": 1.12750661, "balance_loss_mlp": 0.54567426, "epoch": 0.08513452577784458, "flos": 60182296600320.0, "grad_norm": 11.99697523076555, "language_loss": 0.76938081, "learning_rate": 3.968198044323587e-06, "loss": 0.79032171, "num_input_tokens_seen": 30159005, "router_z_loss_clip": 3.69921875, "router_z_loss_mlp": 0.51049805, "step": 1416, "time_per_iteration": 2.974874973297119 }, { "auxiliary_loss_clip": 0.01504847, "auxiliary_loss_mlp": 0.00552343, "balance_loss_clip": 1.128811, "balance_loss_mlp": 0.50330061, "epoch": 0.08519464903051255, "flos": 27308485117440.0, "grad_norm": 17.624798077327075, "language_loss": 0.83111322, "learning_rate": 3.968128830548748e-06, "loss": 0.85168517, "num_input_tokens_seen": 30179450, "router_z_loss_clip": 3.7578125, "router_z_loss_mlp": 0.49047852, "step": 1417, "time_per_iteration": 4.14927339553833 }, { "auxiliary_loss_clip": 0.01517473, "auxiliary_loss_mlp": 0.00551271, "balance_loss_clip": 1.13932395, "balance_loss_mlp": 0.50027335, "epoch": 0.08525477228318051, "flos": 20266438809600.0, "grad_norm": 77.08634862179153, "language_loss": 0.92682022, "learning_rate": 3.968059542142265e-06, "loss": 0.94750762, "num_input_tokens_seen": 30197235, "router_z_loss_clip": 3.78320312, "router_z_loss_mlp": 0.51025391, "step": 1418, "time_per_iteration": 2.651712656021118 }, { "auxiliary_loss_clip": 0.01429892, "auxiliary_loss_mlp": 0.01093657, "balance_loss_clip": 1.1568867, "balance_loss_mlp": 1.0429213, "epoch": 0.08531489553584849, "flos": 67615017183360.0, "grad_norm": 1.091315559581165, "language_loss": 0.56647778, "learning_rate": 3.9679901791067685e-06, "loss": 0.59171319, "num_input_tokens_seen": 30257410, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.5078125, "step": 1419, "time_per_iteration": 3.086315393447876 }, { "auxiliary_loss_clip": 0.01502746, "auxiliary_loss_mlp": 0.00511685, "balance_loss_clip": 1.11740446, "balance_loss_mlp": 0.46683809, "epoch": 0.08537501878851646, "flos": 27526965592320.0, "grad_norm": 13.627144618741106, "language_loss": 0.78429788, "learning_rate": 3.967920741444886e-06, "loss": 0.80444217, "num_input_tokens_seen": 30277865, "router_z_loss_clip": 3.85546875, "router_z_loss_mlp": 0.44799805, "step": 1420, "time_per_iteration": 2.7210195064544678 }, { "auxiliary_loss_clip": 0.01533569, "auxiliary_loss_mlp": 0.00467096, "balance_loss_clip": 1.13367939, "balance_loss_mlp": 0.42553943, "epoch": 0.08543514204118442, "flos": 22784243569920.0, "grad_norm": 10.652391466836573, "language_loss": 0.94743413, "learning_rate": 3.967851229159252e-06, "loss": 0.96744078, "num_input_tokens_seen": 30298545, "router_z_loss_clip": 4.0, "router_z_loss_mlp": 0.41552734, "step": 1421, "time_per_iteration": 2.736368179321289 }, { "auxiliary_loss_clip": 0.0144046, "auxiliary_loss_mlp": 0.00572, "balance_loss_clip": 1.15733194, "balance_loss_mlp": 0.54739541, "epoch": 0.0854952652938524, "flos": 60990721027200.0, "grad_norm": 0.8035149817353388, "language_loss": 0.63610911, "learning_rate": 3.967781642252502e-06, "loss": 0.65623379, "num_input_tokens_seen": 30361725, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.24609375, "step": 1422, "time_per_iteration": 3.1159443855285645 }, { "auxiliary_loss_clip": 0.01523658, "auxiliary_loss_mlp": 0.00456332, "balance_loss_clip": 1.13433599, "balance_loss_mlp": 0.41749397, "epoch": 0.08555538854652037, "flos": 28038046256640.0, "grad_norm": 6.82050859032842, "language_loss": 0.89552706, "learning_rate": 3.967711980727276e-06, "loss": 0.91532695, "num_input_tokens_seen": 30382180, "router_z_loss_clip": 3.89453125, "router_z_loss_mlp": 0.38842773, "step": 1423, "time_per_iteration": 2.670178174972534 }, { "auxiliary_loss_clip": 0.01545768, "auxiliary_loss_mlp": 0.00415143, "balance_loss_clip": 1.1478734, "balance_loss_mlp": 0.37961873, "epoch": 0.08561551179918833, "flos": 23509279595520.0, "grad_norm": 41.0583568058291, "language_loss": 0.83026731, "learning_rate": 3.967642244586213e-06, "loss": 0.8498764, "num_input_tokens_seen": 30402980, "router_z_loss_clip": 3.98046875, "router_z_loss_mlp": 0.35571289, "step": 1424, "time_per_iteration": 2.6902871131896973 }, { "auxiliary_loss_clip": 0.01551748, "auxiliary_loss_mlp": 0.00386208, "balance_loss_clip": 1.15483809, "balance_loss_mlp": 0.3531158, "epoch": 0.08567563505185631, "flos": 17926930183680.0, "grad_norm": 12.249714471230456, "language_loss": 0.82475638, "learning_rate": 3.96757243383196e-06, "loss": 0.844136, "num_input_tokens_seen": 30420800, "router_z_loss_clip": 3.97070312, "router_z_loss_mlp": 0.33105469, "step": 1425, "time_per_iteration": 2.5962626934051514 }, { "auxiliary_loss_clip": 0.01546463, "auxiliary_loss_mlp": 0.00400786, "balance_loss_clip": 1.15391159, "balance_loss_mlp": 0.36437944, "epoch": 0.08573575830452428, "flos": 19719519350400.0, "grad_norm": 5.503482996069108, "language_loss": 1.00503898, "learning_rate": 3.9675025484671624e-06, "loss": 1.02451158, "num_input_tokens_seen": 30439620, "router_z_loss_clip": 3.9296875, "router_z_loss_mlp": 0.36425781, "step": 1426, "time_per_iteration": 2.652249574661255 }, { "auxiliary_loss_clip": 0.01546222, "auxiliary_loss_mlp": 0.00444703, "balance_loss_clip": 1.15647817, "balance_loss_mlp": 0.40770048, "epoch": 0.08579588155719224, "flos": 17931563038080.0, "grad_norm": 780.6686840872288, "language_loss": 0.84651613, "learning_rate": 3.967432588494471e-06, "loss": 0.86642545, "num_input_tokens_seen": 30457300, "router_z_loss_clip": 3.90039062, "router_z_loss_mlp": 0.36987305, "step": 1427, "time_per_iteration": 2.6352133750915527 }, { "auxiliary_loss_clip": 0.01558936, "auxiliary_loss_mlp": 0.00612597, "balance_loss_clip": 1.17077947, "balance_loss_mlp": 0.57411641, "epoch": 0.08585600480986022, "flos": 16033324993920.0, "grad_norm": 45.58893189663297, "language_loss": 0.9032799, "learning_rate": 3.96736255391654e-06, "loss": 0.92499518, "num_input_tokens_seen": 30471580, "router_z_loss_clip": 3.88085938, "router_z_loss_mlp": 0.38427734, "step": 1428, "time_per_iteration": 2.6484854221343994 }, { "auxiliary_loss_clip": 0.01571678, "auxiliary_loss_mlp": 0.00627901, "balance_loss_clip": 1.17429042, "balance_loss_mlp": 0.5867734, "epoch": 0.08591612806252819, "flos": 28657433404800.0, "grad_norm": 20.679888670668813, "language_loss": 0.89883065, "learning_rate": 3.967292444736023e-06, "loss": 0.92082649, "num_input_tokens_seen": 30492720, "router_z_loss_clip": 3.97265625, "router_z_loss_mlp": 0.41137695, "step": 1429, "time_per_iteration": 2.6823489665985107 }, { "auxiliary_loss_clip": 0.01568537, "auxiliary_loss_mlp": 0.00821139, "balance_loss_clip": 1.17744303, "balance_loss_mlp": 0.77572048, "epoch": 0.08597625131519615, "flos": 20959119659520.0, "grad_norm": 1910.1090961771852, "language_loss": 0.93997419, "learning_rate": 3.967222260955578e-06, "loss": 0.96387088, "num_input_tokens_seen": 30509535, "router_z_loss_clip": 3.91796875, "router_z_loss_mlp": 0.45410156, "step": 1430, "time_per_iteration": 2.6118879318237305 }, { "auxiliary_loss_clip": 0.01567433, "auxiliary_loss_mlp": 0.00907472, "balance_loss_clip": 1.17795038, "balance_loss_mlp": 0.86214888, "epoch": 0.08603637456786412, "flos": 23256360956160.0, "grad_norm": 13.615381743991968, "language_loss": 0.87828445, "learning_rate": 3.96715200257787e-06, "loss": 0.90303355, "num_input_tokens_seen": 30529490, "router_z_loss_clip": 3.89453125, "router_z_loss_mlp": 0.45336914, "step": 1431, "time_per_iteration": 2.6730010509490967 }, { "auxiliary_loss_clip": 0.01583713, "auxiliary_loss_mlp": 0.00998183, "balance_loss_clip": 1.19190609, "balance_loss_mlp": 0.94837701, "epoch": 0.0860964978205321, "flos": 28694170039680.0, "grad_norm": 43.756851987733626, "language_loss": 0.81969738, "learning_rate": 3.967081669605559e-06, "loss": 0.84551644, "num_input_tokens_seen": 30550205, "router_z_loss_clip": 3.91601562, "router_z_loss_mlp": 0.49755859, "step": 1432, "time_per_iteration": 2.6992886066436768 }, { "auxiliary_loss_clip": 0.01560809, "auxiliary_loss_mlp": 0.01055311, "balance_loss_clip": 1.18237913, "balance_loss_mlp": 1.00424194, "epoch": 0.08615662107320006, "flos": 19318397195520.0, "grad_norm": 5.912445912875783, "language_loss": 0.81196034, "learning_rate": 3.967011262041315e-06, "loss": 0.83812153, "num_input_tokens_seen": 30568830, "router_z_loss_clip": 3.78515625, "router_z_loss_mlp": 0.51074219, "step": 1433, "time_per_iteration": 2.6771671772003174 }, { "auxiliary_loss_clip": 0.01577512, "auxiliary_loss_mlp": 0.01104613, "balance_loss_clip": 1.19040751, "balance_loss_mlp": 1.04791701, "epoch": 0.08621674432586802, "flos": 15851688894720.0, "grad_norm": 342.91953573954845, "language_loss": 0.94460464, "learning_rate": 3.9669407798878065e-06, "loss": 0.97142589, "num_input_tokens_seen": 30585730, "router_z_loss_clip": 3.87109375, "router_z_loss_mlp": 0.56689453, "step": 1434, "time_per_iteration": 2.639725923538208 }, { "auxiliary_loss_clip": 0.01569051, "auxiliary_loss_mlp": 0.01090521, "balance_loss_clip": 1.18526292, "balance_loss_mlp": 1.03656721, "epoch": 0.086276867578536, "flos": 14100648785280.0, "grad_norm": 103.31636671846091, "language_loss": 0.8596074, "learning_rate": 3.966870223147707e-06, "loss": 0.88620311, "num_input_tokens_seen": 30603180, "router_z_loss_clip": 3.83398438, "router_z_loss_mlp": 0.53955078, "step": 1435, "time_per_iteration": 2.7046473026275635 }, { "auxiliary_loss_clip": 0.0154036, "auxiliary_loss_mlp": 0.0144339, "balance_loss_clip": 1.29662931, "balance_loss_mlp": 1.41287231, "epoch": 0.08633699083120397, "flos": 70184857772160.0, "grad_norm": 0.9812531490652936, "language_loss": 0.58532405, "learning_rate": 3.96679959182369e-06, "loss": 0.61516154, "num_input_tokens_seen": 30668895, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.3046875, "step": 1436, "time_per_iteration": 3.2617053985595703 }, { "auxiliary_loss_clip": 0.01562005, "auxiliary_loss_mlp": 0.00987248, "balance_loss_clip": 1.19172621, "balance_loss_mlp": 0.93615472, "epoch": 0.08639711408387193, "flos": 30298874140800.0, "grad_norm": 20.640088491902095, "language_loss": 0.80583286, "learning_rate": 3.966728885918437e-06, "loss": 0.83132541, "num_input_tokens_seen": 30688955, "router_z_loss_clip": 3.70117188, "router_z_loss_mlp": 0.51098633, "step": 1437, "time_per_iteration": 2.741683006286621 }, { "auxiliary_loss_clip": 0.01570457, "auxiliary_loss_mlp": 0.00938521, "balance_loss_clip": 1.19378304, "balance_loss_mlp": 0.88976395, "epoch": 0.08645723733653991, "flos": 20297680663680.0, "grad_norm": 5.532559750722417, "language_loss": 0.78448057, "learning_rate": 3.966658105434627e-06, "loss": 0.80957037, "num_input_tokens_seen": 30706095, "router_z_loss_clip": 3.765625, "router_z_loss_mlp": 0.48730469, "step": 1438, "time_per_iteration": 2.600903034210205 }, { "auxiliary_loss_clip": 0.01566197, "auxiliary_loss_mlp": 0.00919743, "balance_loss_clip": 1.19831443, "balance_loss_mlp": 0.86469233, "epoch": 0.08651736058920788, "flos": 32890583134080.0, "grad_norm": 13.912638113822572, "language_loss": 0.71318328, "learning_rate": 3.966587250374945e-06, "loss": 0.73804265, "num_input_tokens_seen": 30729025, "router_z_loss_clip": 3.68164062, "router_z_loss_mlp": 0.55029297, "step": 1439, "time_per_iteration": 2.822282075881958 }, { "auxiliary_loss_clip": 0.01546718, "auxiliary_loss_mlp": 0.00702491, "balance_loss_clip": 1.17904437, "balance_loss_mlp": 0.65616584, "epoch": 0.08657748384187584, "flos": 22637368857600.0, "grad_norm": 72.2508745101841, "language_loss": 0.94360763, "learning_rate": 3.966516320742077e-06, "loss": 0.96609974, "num_input_tokens_seen": 30746155, "router_z_loss_clip": 3.68164062, "router_z_loss_mlp": 0.46386719, "step": 1440, "time_per_iteration": 2.797118663787842 }, { "auxiliary_loss_clip": 0.01574159, "auxiliary_loss_mlp": 0.00632237, "balance_loss_clip": 1.19959438, "balance_loss_mlp": 0.58345616, "epoch": 0.08663760709454381, "flos": 23658380951040.0, "grad_norm": 44.01009224805087, "language_loss": 0.90897381, "learning_rate": 3.9664453165387124e-06, "loss": 0.93103784, "num_input_tokens_seen": 30761410, "router_z_loss_clip": 3.74414062, "router_z_loss_mlp": 0.48803711, "step": 1441, "time_per_iteration": 2.7048444747924805 }, { "auxiliary_loss_clip": 0.01562712, "auxiliary_loss_mlp": 0.00339397, "balance_loss_clip": 1.32322001, "balance_loss_mlp": 0.3208003, "epoch": 0.08669773034721179, "flos": 62686564911360.0, "grad_norm": 0.8251012677919664, "language_loss": 0.60425186, "learning_rate": 3.966374237767545e-06, "loss": 0.62327296, "num_input_tokens_seen": 30823010, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.18554688, "step": 1442, "time_per_iteration": 3.2543179988861084 }, { "auxiliary_loss_clip": 0.01569746, "auxiliary_loss_mlp": 0.00547648, "balance_loss_clip": 1.2011044, "balance_loss_mlp": 0.50513756, "epoch": 0.08675785359987975, "flos": 20667489137280.0, "grad_norm": 2.473713864904694, "language_loss": 0.90083337, "learning_rate": 3.96630308443127e-06, "loss": 0.92200732, "num_input_tokens_seen": 30841980, "router_z_loss_clip": 3.6875, "router_z_loss_mlp": 0.42504883, "step": 1443, "time_per_iteration": 2.5958163738250732 }, { "auxiliary_loss_clip": 0.0158384, "auxiliary_loss_mlp": 0.00475403, "balance_loss_clip": 1.22192049, "balance_loss_mlp": 0.43525285, "epoch": 0.08681797685254772, "flos": 26941118768640.0, "grad_norm": 15.460596296428054, "language_loss": 0.89236176, "learning_rate": 3.966231856532584e-06, "loss": 0.91295421, "num_input_tokens_seen": 30863280, "router_z_loss_clip": 3.61914062, "router_z_loss_mlp": 0.40136719, "step": 1444, "time_per_iteration": 2.720736026763916 }, { "auxiliary_loss_clip": 0.01576157, "auxiliary_loss_mlp": 0.00517931, "balance_loss_clip": 1.20932555, "balance_loss_mlp": 0.47430074, "epoch": 0.0868781001052157, "flos": 17712831168000.0, "grad_norm": 51.26186637250275, "language_loss": 0.94343007, "learning_rate": 3.966160554074189e-06, "loss": 0.96437091, "num_input_tokens_seen": 30881710, "router_z_loss_clip": 3.66601562, "router_z_loss_mlp": 0.43603516, "step": 1445, "time_per_iteration": 2.586970090866089 }, { "auxiliary_loss_clip": 0.01578771, "auxiliary_loss_mlp": 0.00451015, "balance_loss_clip": 1.21820855, "balance_loss_mlp": 0.41165262, "epoch": 0.08693822335788366, "flos": 19896522595200.0, "grad_norm": 3.941530536675627, "language_loss": 0.91298515, "learning_rate": 3.96608917705879e-06, "loss": 0.93328303, "num_input_tokens_seen": 30900225, "router_z_loss_clip": 3.60742188, "router_z_loss_mlp": 0.39355469, "step": 1446, "time_per_iteration": 2.6339218616485596 }, { "auxiliary_loss_clip": 0.01581026, "auxiliary_loss_mlp": 0.00407267, "balance_loss_clip": 1.32373238, "balance_loss_mlp": 0.3909595, "epoch": 0.08699834661055163, "flos": 67023747406080.0, "grad_norm": 0.7625732369971318, "language_loss": 0.5475384, "learning_rate": 3.966017725489091e-06, "loss": 0.56742132, "num_input_tokens_seen": 30959580, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.16308594, "step": 1447, "time_per_iteration": 3.190419912338257 }, { "auxiliary_loss_clip": 0.01581667, "auxiliary_loss_mlp": 0.00423207, "balance_loss_clip": 1.2257477, "balance_loss_mlp": 0.38227063, "epoch": 0.0870584698632196, "flos": 13480507451520.0, "grad_norm": 25.514377247628772, "language_loss": 0.91820055, "learning_rate": 3.965946199367804e-06, "loss": 0.93824923, "num_input_tokens_seen": 30976775, "router_z_loss_clip": 3.55664062, "router_z_loss_mlp": 0.40942383, "step": 1448, "time_per_iteration": 2.696981191635132 }, { "auxiliary_loss_clip": 0.0157822, "auxiliary_loss_mlp": 0.00456349, "balance_loss_clip": 1.21800101, "balance_loss_mlp": 0.41245613, "epoch": 0.08711859311588757, "flos": 16107013745280.0, "grad_norm": 9.401953666086182, "language_loss": 0.91845506, "learning_rate": 3.965874598697638e-06, "loss": 0.93880081, "num_input_tokens_seen": 30990495, "router_z_loss_clip": 3.60351562, "router_z_loss_mlp": 0.4387207, "step": 1449, "time_per_iteration": 2.571150302886963 }, { "auxiliary_loss_clip": 0.01604003, "auxiliary_loss_mlp": 0.00407474, "balance_loss_clip": 1.25340414, "balance_loss_mlp": 0.3684206, "epoch": 0.08717871636855554, "flos": 38472357928320.0, "grad_norm": 35.20826794401027, "language_loss": 0.78185683, "learning_rate": 3.965802923481313e-06, "loss": 0.80197155, "num_input_tokens_seen": 31014080, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.39038086, "step": 1450, "time_per_iteration": 2.772542715072632 }, { "auxiliary_loss_clip": 0.01581481, "auxiliary_loss_mlp": 0.00406654, "balance_loss_clip": 1.22777724, "balance_loss_mlp": 0.3656458, "epoch": 0.0872388396212235, "flos": 17600574188160.0, "grad_norm": 27.311863243292056, "language_loss": 0.89296091, "learning_rate": 3.965731173721542e-06, "loss": 0.91284221, "num_input_tokens_seen": 31031210, "router_z_loss_clip": 3.53710938, "router_z_loss_mlp": 0.41015625, "step": 1451, "time_per_iteration": 2.7174925804138184 }, { "auxiliary_loss_clip": 0.01579928, "auxiliary_loss_mlp": 0.00345203, "balance_loss_clip": 1.23532534, "balance_loss_mlp": 0.30989337, "epoch": 0.08729896287389148, "flos": 25259385951360.0, "grad_norm": 4.645548795457372, "language_loss": 0.80492789, "learning_rate": 3.965659349421049e-06, "loss": 0.82417917, "num_input_tokens_seen": 31049710, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.35302734, "step": 1452, "time_per_iteration": 2.705221652984619 }, { "auxiliary_loss_clip": 0.0159542, "auxiliary_loss_mlp": 0.00406579, "balance_loss_clip": 1.23951316, "balance_loss_mlp": 0.36325788, "epoch": 0.08735908612655945, "flos": 15632454234240.0, "grad_norm": 7.6860130390421855, "language_loss": 0.91024005, "learning_rate": 3.965587450582556e-06, "loss": 0.93026006, "num_input_tokens_seen": 31066160, "router_z_loss_clip": 3.55664062, "router_z_loss_mlp": 0.43334961, "step": 1453, "time_per_iteration": 5.48798680305481 }, { "auxiliary_loss_clip": 0.01592778, "auxiliary_loss_mlp": 0.00339243, "balance_loss_clip": 1.24938381, "balance_loss_mlp": 0.30491054, "epoch": 0.08741920937922741, "flos": 20339660684160.0, "grad_norm": 4.6883716046616035, "language_loss": 0.79788315, "learning_rate": 3.96551547720879e-06, "loss": 0.8172034, "num_input_tokens_seen": 31085270, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.34326172, "step": 1454, "time_per_iteration": 2.71899676322937 }, { "auxiliary_loss_clip": 0.01458203, "auxiliary_loss_mlp": 0.0021438, "balance_loss_clip": 1.21420407, "balance_loss_mlp": 0.19940762, "epoch": 0.08747933263189539, "flos": 62819795433600.0, "grad_norm": 0.7527579905524507, "language_loss": 0.58253145, "learning_rate": 3.96544342930248e-06, "loss": 0.59925723, "num_input_tokens_seen": 31148445, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.14941406, "step": 1455, "time_per_iteration": 3.1643145084381104 }, { "auxiliary_loss_clip": 0.01583321, "auxiliary_loss_mlp": 0.00348986, "balance_loss_clip": 1.23727179, "balance_loss_mlp": 0.31043416, "epoch": 0.08753945588456336, "flos": 33035877648000.0, "grad_norm": 1.987938260369758, "language_loss": 0.83422053, "learning_rate": 3.965371306866359e-06, "loss": 0.85354364, "num_input_tokens_seen": 31168770, "router_z_loss_clip": 3.45703125, "router_z_loss_mlp": 0.38549805, "step": 1456, "time_per_iteration": 2.7493720054626465 }, { "auxiliary_loss_clip": 0.01595597, "auxiliary_loss_mlp": 0.00382875, "balance_loss_clip": 1.24868667, "balance_loss_mlp": 0.34344065, "epoch": 0.08759957913723132, "flos": 35547182046720.0, "grad_norm": 5.613722847915186, "language_loss": 0.79174274, "learning_rate": 3.96529910990316e-06, "loss": 0.81152743, "num_input_tokens_seen": 31189270, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.39453125, "step": 1457, "time_per_iteration": 4.139661550521851 }, { "auxiliary_loss_clip": 0.01594869, "auxiliary_loss_mlp": 0.00392641, "balance_loss_clip": 1.2512207, "balance_loss_mlp": 0.35325384, "epoch": 0.0876597023898993, "flos": 23911120022400.0, "grad_norm": 6.599538582868752, "language_loss": 0.91490519, "learning_rate": 3.965226838415622e-06, "loss": 0.9347803, "num_input_tokens_seen": 31210385, "router_z_loss_clip": 3.43359375, "router_z_loss_mlp": 0.39379883, "step": 1458, "time_per_iteration": 2.6788337230682373 }, { "auxiliary_loss_clip": 0.01591275, "auxiliary_loss_mlp": 0.00389736, "balance_loss_clip": 1.25290251, "balance_loss_mlp": 0.3504447, "epoch": 0.08771982564256726, "flos": 18114025150080.0, "grad_norm": 6.235376130127792, "language_loss": 0.87044156, "learning_rate": 3.965154492406486e-06, "loss": 0.89025164, "num_input_tokens_seen": 31229745, "router_z_loss_clip": 3.38476562, "router_z_loss_mlp": 0.39282227, "step": 1459, "time_per_iteration": 4.084290027618408 }, { "auxiliary_loss_clip": 0.01589365, "auxiliary_loss_mlp": 0.00425835, "balance_loss_clip": 1.24368417, "balance_loss_mlp": 0.38115507, "epoch": 0.08777994889523523, "flos": 17712005155200.0, "grad_norm": 5.4607513263334715, "language_loss": 0.91788113, "learning_rate": 3.9650820718784945e-06, "loss": 0.9380331, "num_input_tokens_seen": 31248280, "router_z_loss_clip": 3.45898438, "router_z_loss_mlp": 0.44677734, "step": 1460, "time_per_iteration": 2.717517614364624 }, { "auxiliary_loss_clip": 0.01585435, "auxiliary_loss_mlp": 0.00405065, "balance_loss_clip": 1.2388885, "balance_loss_mlp": 0.36610729, "epoch": 0.0878400721479032, "flos": 12819930382080.0, "grad_norm": 9.69070630027994, "language_loss": 0.88599145, "learning_rate": 3.965009576834394e-06, "loss": 0.90589654, "num_input_tokens_seen": 31262190, "router_z_loss_clip": 3.46679688, "router_z_loss_mlp": 0.38964844, "step": 1461, "time_per_iteration": 2.557520866394043 }, { "auxiliary_loss_clip": 0.01598293, "auxiliary_loss_mlp": 0.0040446, "balance_loss_clip": 1.24687982, "balance_loss_mlp": 0.3632139, "epoch": 0.08790019540057117, "flos": 26392690938240.0, "grad_norm": 3.3952066529514253, "language_loss": 0.80208695, "learning_rate": 3.964937007276932e-06, "loss": 0.82211447, "num_input_tokens_seen": 31283690, "router_z_loss_clip": 3.51757812, "router_z_loss_mlp": 0.41235352, "step": 1462, "time_per_iteration": 2.6768417358398438 }, { "auxiliary_loss_clip": 0.01616374, "auxiliary_loss_mlp": 0.00419098, "balance_loss_clip": 1.25791562, "balance_loss_mlp": 0.3736077, "epoch": 0.08796031865323914, "flos": 19134031662720.0, "grad_norm": 11.47304298145174, "language_loss": 0.81623977, "learning_rate": 3.9648643632088634e-06, "loss": 0.83659452, "num_input_tokens_seen": 31302505, "router_z_loss_clip": 3.58398438, "router_z_loss_mlp": 0.45507812, "step": 1463, "time_per_iteration": 2.662208080291748 }, { "auxiliary_loss_clip": 0.01605593, "auxiliary_loss_mlp": 0.00420011, "balance_loss_clip": 1.23618209, "balance_loss_mlp": 0.37337613, "epoch": 0.0880204419059071, "flos": 26064287867520.0, "grad_norm": 7.011016933900526, "language_loss": 0.89989328, "learning_rate": 3.964791644632941e-06, "loss": 0.92014933, "num_input_tokens_seen": 31323070, "router_z_loss_clip": 3.69335938, "router_z_loss_mlp": 0.46704102, "step": 1464, "time_per_iteration": 2.7780041694641113 }, { "auxiliary_loss_clip": 0.01615123, "auxiliary_loss_mlp": 0.00417952, "balance_loss_clip": 1.25952578, "balance_loss_mlp": 0.37372512, "epoch": 0.08808056515857508, "flos": 22377842115840.0, "grad_norm": 3.4124844519847404, "language_loss": 0.85021961, "learning_rate": 3.964718851551923e-06, "loss": 0.87055039, "num_input_tokens_seen": 31341880, "router_z_loss_clip": 3.55664062, "router_z_loss_mlp": 0.44213867, "step": 1465, "time_per_iteration": 2.6788649559020996 }, { "auxiliary_loss_clip": 0.01625608, "auxiliary_loss_mlp": 0.00377846, "balance_loss_clip": 1.2641933, "balance_loss_mlp": 0.33574146, "epoch": 0.08814068841124305, "flos": 23185293897600.0, "grad_norm": 6.35475347199947, "language_loss": 0.90895057, "learning_rate": 3.9646459839685675e-06, "loss": 0.92898506, "num_input_tokens_seen": 31361995, "router_z_loss_clip": 3.61132812, "router_z_loss_mlp": 0.42089844, "step": 1466, "time_per_iteration": 2.6616625785827637 }, { "auxiliary_loss_clip": 0.01624978, "auxiliary_loss_mlp": 0.00377253, "balance_loss_clip": 1.26290488, "balance_loss_mlp": 0.33784205, "epoch": 0.08820081166391101, "flos": 25155281358720.0, "grad_norm": 242.78436003807883, "language_loss": 0.92217487, "learning_rate": 3.964573041885641e-06, "loss": 0.9421972, "num_input_tokens_seen": 31381515, "router_z_loss_clip": 3.62109375, "router_z_loss_mlp": 0.39428711, "step": 1467, "time_per_iteration": 2.6709775924682617 }, { "auxiliary_loss_clip": 0.01629565, "auxiliary_loss_mlp": 0.00347062, "balance_loss_clip": 1.26833236, "balance_loss_mlp": 0.30898675, "epoch": 0.08826093491657899, "flos": 22231685675520.0, "grad_norm": 459.563549449036, "language_loss": 0.82179737, "learning_rate": 3.964500025305907e-06, "loss": 0.84156358, "num_input_tokens_seen": 31400345, "router_z_loss_clip": 3.61328125, "router_z_loss_mlp": 0.38085938, "step": 1468, "time_per_iteration": 2.6433968544006348 }, { "auxiliary_loss_clip": 0.01628283, "auxiliary_loss_mlp": 0.00307358, "balance_loss_clip": 1.26499581, "balance_loss_mlp": 0.27183408, "epoch": 0.08832105816924696, "flos": 22126826897280.0, "grad_norm": 11.19509994202962, "language_loss": 0.8544035, "learning_rate": 3.9644269342321355e-06, "loss": 0.87375993, "num_input_tokens_seen": 31419620, "router_z_loss_clip": 3.6328125, "router_z_loss_mlp": 0.35498047, "step": 1469, "time_per_iteration": 2.625145673751831 }, { "auxiliary_loss_clip": 0.0159723, "auxiliary_loss_mlp": 0.00348327, "balance_loss_clip": 1.23295629, "balance_loss_mlp": 0.31056118, "epoch": 0.08838118142191492, "flos": 17566495159680.0, "grad_norm": 9.565210086393156, "language_loss": 0.86388314, "learning_rate": 3.9643537686670974e-06, "loss": 0.88333869, "num_input_tokens_seen": 31437970, "router_z_loss_clip": 3.64257812, "router_z_loss_mlp": 0.37768555, "step": 1470, "time_per_iteration": 2.5956976413726807 }, { "auxiliary_loss_clip": 0.01605832, "auxiliary_loss_mlp": 0.00362015, "balance_loss_clip": 1.23745584, "balance_loss_mlp": 0.32196051, "epoch": 0.0884413046745829, "flos": 20777196251520.0, "grad_norm": 24.248940037979935, "language_loss": 0.90666354, "learning_rate": 3.964280528613569e-06, "loss": 0.92634195, "num_input_tokens_seen": 31457040, "router_z_loss_clip": 3.68554688, "router_z_loss_mlp": 0.40039062, "step": 1471, "time_per_iteration": 2.734083890914917 }, { "auxiliary_loss_clip": 0.01583128, "auxiliary_loss_mlp": 0.00300949, "balance_loss_clip": 1.22102046, "balance_loss_mlp": 0.2661404, "epoch": 0.08850142792725087, "flos": 22125462180480.0, "grad_norm": 1.865924153659744, "language_loss": 0.8799125, "learning_rate": 3.964207214074324e-06, "loss": 0.89875329, "num_input_tokens_seen": 31477520, "router_z_loss_clip": 3.62109375, "router_z_loss_mlp": 0.34814453, "step": 1472, "time_per_iteration": 2.660503387451172 }, { "auxiliary_loss_clip": 0.01568122, "auxiliary_loss_mlp": 0.00337664, "balance_loss_clip": 1.20431137, "balance_loss_mlp": 0.29701388, "epoch": 0.08856155117991883, "flos": 22418744728320.0, "grad_norm": 995.3046801656517, "language_loss": 0.92719686, "learning_rate": 3.964133825052146e-06, "loss": 0.94625479, "num_input_tokens_seen": 31495575, "router_z_loss_clip": 3.640625, "router_z_loss_mlp": 0.40649414, "step": 1473, "time_per_iteration": 2.6887805461883545 }, { "auxiliary_loss_clip": 0.01569712, "auxiliary_loss_mlp": 0.0033602, "balance_loss_clip": 1.20492148, "balance_loss_mlp": 0.29782534, "epoch": 0.0886216744325868, "flos": 29937002572800.0, "grad_norm": 5.858810318885965, "language_loss": 0.83431828, "learning_rate": 3.964060361549816e-06, "loss": 0.85337555, "num_input_tokens_seen": 31520020, "router_z_loss_clip": 3.6484375, "router_z_loss_mlp": 0.38208008, "step": 1474, "time_per_iteration": 2.7197086811065674 }, { "auxiliary_loss_clip": 0.01561453, "auxiliary_loss_mlp": 0.0034727, "balance_loss_clip": 1.19673407, "balance_loss_mlp": 0.30671513, "epoch": 0.08868179768525478, "flos": 23982833525760.0, "grad_norm": 25.952968433194332, "language_loss": 0.85512388, "learning_rate": 3.963986823570121e-06, "loss": 0.87421119, "num_input_tokens_seen": 31539265, "router_z_loss_clip": 3.64648438, "router_z_loss_mlp": 0.40527344, "step": 1475, "time_per_iteration": 2.6799991130828857 }, { "auxiliary_loss_clip": 0.01567188, "auxiliary_loss_mlp": 0.00342718, "balance_loss_clip": 1.19416332, "balance_loss_mlp": 0.30008855, "epoch": 0.08874192093792274, "flos": 43177553216640.0, "grad_norm": 19.96664790518494, "language_loss": 0.79734194, "learning_rate": 3.963913211115848e-06, "loss": 0.816441, "num_input_tokens_seen": 31563425, "router_z_loss_clip": 3.72851562, "router_z_loss_mlp": 0.42578125, "step": 1476, "time_per_iteration": 2.8424692153930664 }, { "auxiliary_loss_clip": 0.01571038, "auxiliary_loss_mlp": 0.00337963, "balance_loss_clip": 1.20601892, "balance_loss_mlp": 0.29931554, "epoch": 0.0888020441905907, "flos": 32852445868800.0, "grad_norm": 49.864469092350475, "language_loss": 0.81114447, "learning_rate": 3.9638395241897895e-06, "loss": 0.83023453, "num_input_tokens_seen": 31584525, "router_z_loss_clip": 3.6484375, "router_z_loss_mlp": 0.38696289, "step": 1477, "time_per_iteration": 2.837495803833008 }, { "auxiliary_loss_clip": 0.01559138, "auxiliary_loss_mlp": 0.00354624, "balance_loss_clip": 1.18700182, "balance_loss_mlp": 0.31249577, "epoch": 0.08886216744325869, "flos": 23149347361920.0, "grad_norm": 2.130205359540668, "language_loss": 0.94125962, "learning_rate": 3.963765762794739e-06, "loss": 0.96039724, "num_input_tokens_seen": 31603325, "router_z_loss_clip": 3.72265625, "router_z_loss_mlp": 0.42089844, "step": 1478, "time_per_iteration": 2.713108777999878 }, { "auxiliary_loss_clip": 0.01547915, "auxiliary_loss_mlp": 0.00358919, "balance_loss_clip": 1.17697155, "balance_loss_mlp": 0.32163057, "epoch": 0.08892229069592665, "flos": 23331593992320.0, "grad_norm": 2.1952947899098225, "language_loss": 0.82691729, "learning_rate": 3.963691926933495e-06, "loss": 0.84598565, "num_input_tokens_seen": 31624820, "router_z_loss_clip": 3.70898438, "router_z_loss_mlp": 0.37255859, "step": 1479, "time_per_iteration": 2.7240498065948486 }, { "auxiliary_loss_clip": 0.01562655, "auxiliary_loss_mlp": 0.00339286, "balance_loss_clip": 1.19045353, "balance_loss_mlp": 0.29610798, "epoch": 0.08898241394859462, "flos": 26213784272640.0, "grad_norm": 3.9987522432269453, "language_loss": 0.85996449, "learning_rate": 3.9636180166088555e-06, "loss": 0.87898386, "num_input_tokens_seen": 31646080, "router_z_loss_clip": 3.72460938, "router_z_loss_mlp": 0.43188477, "step": 1480, "time_per_iteration": 2.693326711654663 }, { "auxiliary_loss_clip": 0.01577628, "auxiliary_loss_mlp": 0.00339982, "balance_loss_clip": 1.20067549, "balance_loss_mlp": 0.29422992, "epoch": 0.0890425372012626, "flos": 23550613171200.0, "grad_norm": 121.29383422049143, "language_loss": 0.75861752, "learning_rate": 3.963544031823624e-06, "loss": 0.77779365, "num_input_tokens_seen": 31665770, "router_z_loss_clip": 3.76953125, "router_z_loss_mlp": 0.45800781, "step": 1481, "time_per_iteration": 2.693154811859131 }, { "auxiliary_loss_clip": 0.01564301, "auxiliary_loss_mlp": 0.00303104, "balance_loss_clip": 1.20162559, "balance_loss_mlp": 0.26746073, "epoch": 0.08910266045393056, "flos": 23002795872000.0, "grad_norm": 4616.726736567042, "language_loss": 1.02828932, "learning_rate": 3.9634699725806065e-06, "loss": 1.04696345, "num_input_tokens_seen": 31683805, "router_z_loss_clip": 3.63085938, "router_z_loss_mlp": 0.35644531, "step": 1482, "time_per_iteration": 2.661295175552368 }, { "auxiliary_loss_clip": 0.01567796, "auxiliary_loss_mlp": 0.00334588, "balance_loss_clip": 1.19999325, "balance_loss_mlp": 0.29482016, "epoch": 0.08916278370659853, "flos": 31936508035200.0, "grad_norm": 11.300069846144542, "language_loss": 0.85775232, "learning_rate": 3.96339583888261e-06, "loss": 0.87677616, "num_input_tokens_seen": 31704630, "router_z_loss_clip": 3.68359375, "router_z_loss_mlp": 0.39770508, "step": 1483, "time_per_iteration": 2.729128837585449 }, { "auxiliary_loss_clip": 0.01580363, "auxiliary_loss_mlp": 0.00331863, "balance_loss_clip": 1.20794618, "balance_loss_mlp": 0.29002064, "epoch": 0.08922290695926649, "flos": 17530404969600.0, "grad_norm": 7.879587425760029, "language_loss": 0.94488662, "learning_rate": 3.963321630732448e-06, "loss": 0.96400881, "num_input_tokens_seen": 31723255, "router_z_loss_clip": 3.72460938, "router_z_loss_mlp": 0.41870117, "step": 1484, "time_per_iteration": 2.641637086868286 }, { "auxiliary_loss_clip": 0.01574408, "auxiliary_loss_mlp": 0.00310707, "balance_loss_clip": 1.203022, "balance_loss_mlp": 0.27294153, "epoch": 0.08928303021193447, "flos": 32125075459200.0, "grad_norm": 20.467661454623748, "language_loss": 0.86243308, "learning_rate": 3.963247348132932e-06, "loss": 0.8812843, "num_input_tokens_seen": 31747045, "router_z_loss_clip": 3.71289062, "router_z_loss_mlp": 0.37768555, "step": 1485, "time_per_iteration": 2.7051448822021484 }, { "auxiliary_loss_clip": 0.01572866, "auxiliary_loss_mlp": 0.00303224, "balance_loss_clip": 1.20184731, "balance_loss_mlp": 0.26059484, "epoch": 0.08934315346460243, "flos": 22125210785280.0, "grad_norm": 3.355033484221029, "language_loss": 0.88766825, "learning_rate": 3.96317299108688e-06, "loss": 0.90642917, "num_input_tokens_seen": 31766615, "router_z_loss_clip": 3.703125, "router_z_loss_mlp": 0.42651367, "step": 1486, "time_per_iteration": 2.6984832286834717 }, { "auxiliary_loss_clip": 0.01568498, "auxiliary_loss_mlp": 0.00288281, "balance_loss_clip": 1.20885468, "balance_loss_mlp": 0.24929953, "epoch": 0.0894032767172704, "flos": 22565583527040.0, "grad_norm": 1.7244405284250541, "language_loss": 0.81739998, "learning_rate": 3.963098559597111e-06, "loss": 0.83596778, "num_input_tokens_seen": 31785855, "router_z_loss_clip": 3.59570312, "router_z_loss_mlp": 0.38964844, "step": 1487, "time_per_iteration": 2.689114809036255 }, { "auxiliary_loss_clip": 0.01574206, "auxiliary_loss_mlp": 0.00279755, "balance_loss_clip": 1.21371031, "balance_loss_mlp": 0.24089323, "epoch": 0.08946339996993838, "flos": 20193396503040.0, "grad_norm": 4.543520103385851, "language_loss": 0.91147989, "learning_rate": 3.963024053666449e-06, "loss": 0.93001944, "num_input_tokens_seen": 31804210, "router_z_loss_clip": 3.60546875, "router_z_loss_mlp": 0.38891602, "step": 1488, "time_per_iteration": 2.6531174182891846 }, { "auxiliary_loss_clip": 0.01577609, "auxiliary_loss_mlp": 0.00296511, "balance_loss_clip": 1.21665323, "balance_loss_mlp": 0.25869787, "epoch": 0.08952352322260634, "flos": 48360181104000.0, "grad_norm": 2.8942813101283535, "language_loss": 0.8072167, "learning_rate": 3.962949473297718e-06, "loss": 0.82595789, "num_input_tokens_seen": 31826150, "router_z_loss_clip": 3.61132812, "router_z_loss_mlp": 0.37817383, "step": 1489, "time_per_iteration": 2.851170539855957 }, { "auxiliary_loss_clip": 0.01590006, "auxiliary_loss_mlp": 0.00285174, "balance_loss_clip": 1.23171496, "balance_loss_mlp": 0.24483365, "epoch": 0.08958364647527431, "flos": 31793081028480.0, "grad_norm": 7.273568883768564, "language_loss": 0.96688199, "learning_rate": 3.962874818493745e-06, "loss": 0.98563373, "num_input_tokens_seen": 31848060, "router_z_loss_clip": 3.58789062, "router_z_loss_mlp": 0.40332031, "step": 1490, "time_per_iteration": 2.752030372619629 }, { "auxiliary_loss_clip": 0.01582263, "auxiliary_loss_mlp": 0.00312939, "balance_loss_clip": 1.2222966, "balance_loss_mlp": 0.27402878, "epoch": 0.08964376972794229, "flos": 23368186972800.0, "grad_norm": 35.86472995713647, "language_loss": 0.82015389, "learning_rate": 3.9628000892573635e-06, "loss": 0.83910584, "num_input_tokens_seen": 31870040, "router_z_loss_clip": 3.6015625, "router_z_loss_mlp": 0.38916016, "step": 1491, "time_per_iteration": 2.686924934387207 }, { "auxiliary_loss_clip": 0.01615572, "auxiliary_loss_mlp": 0.00271151, "balance_loss_clip": 1.26250625, "balance_loss_mlp": 0.23381504, "epoch": 0.08970389298061025, "flos": 23294785530240.0, "grad_norm": 2.712787447824314, "language_loss": 0.84336603, "learning_rate": 3.9627252855914055e-06, "loss": 0.86223328, "num_input_tokens_seen": 31890400, "router_z_loss_clip": 3.53125, "router_z_loss_mlp": 0.37329102, "step": 1492, "time_per_iteration": 2.6261963844299316 }, { "auxiliary_loss_clip": 0.01616953, "auxiliary_loss_mlp": 0.00314642, "balance_loss_clip": 1.25760388, "balance_loss_mlp": 0.2733717, "epoch": 0.08976401623327822, "flos": 33761703772800.0, "grad_norm": 6.083717042201034, "language_loss": 0.78103268, "learning_rate": 3.962650407498707e-06, "loss": 0.80034858, "num_input_tokens_seen": 31913435, "router_z_loss_clip": 3.58984375, "router_z_loss_mlp": 0.41308594, "step": 1493, "time_per_iteration": 2.780771493911743 }, { "auxiliary_loss_clip": 0.01656158, "auxiliary_loss_mlp": 0.0032934, "balance_loss_clip": 1.30003917, "balance_loss_mlp": 0.28785479, "epoch": 0.08982413948594618, "flos": 23911335504000.0, "grad_norm": 6.488057531062658, "language_loss": 0.9253701, "learning_rate": 3.962575454982109e-06, "loss": 0.94522512, "num_input_tokens_seen": 31932435, "router_z_loss_clip": 3.56445312, "router_z_loss_mlp": 0.41503906, "step": 1494, "time_per_iteration": 2.7523915767669678 }, { "auxiliary_loss_clip": 0.01671075, "auxiliary_loss_mlp": 0.00332138, "balance_loss_clip": 1.30932033, "balance_loss_mlp": 0.29589862, "epoch": 0.08988426273861416, "flos": 16837544551680.0, "grad_norm": 2.6123256035253926, "language_loss": 0.89621484, "learning_rate": 3.962500428044454e-06, "loss": 0.91624701, "num_input_tokens_seen": 31950125, "router_z_loss_clip": 3.61914062, "router_z_loss_mlp": 0.36206055, "step": 1495, "time_per_iteration": 4.064457893371582 }, { "auxiliary_loss_clip": 0.01702574, "auxiliary_loss_mlp": 0.00368731, "balance_loss_clip": 1.32747948, "balance_loss_mlp": 0.32922471, "epoch": 0.08994438599128213, "flos": 14793365548800.0, "grad_norm": 175.7652820081294, "language_loss": 0.81182015, "learning_rate": 3.962425326688585e-06, "loss": 0.83253324, "num_input_tokens_seen": 31968050, "router_z_loss_clip": 3.74804688, "router_z_loss_mlp": 0.39501953, "step": 1496, "time_per_iteration": 4.0558555126190186 }, { "auxiliary_loss_clip": 0.01726287, "auxiliary_loss_mlp": 0.00360627, "balance_loss_clip": 1.34267735, "balance_loss_mlp": 0.32190776, "epoch": 0.09000450924395009, "flos": 17384320356480.0, "grad_norm": 26.57186645173995, "language_loss": 0.8609606, "learning_rate": 3.962350150917351e-06, "loss": 0.88182974, "num_input_tokens_seen": 31985675, "router_z_loss_clip": 3.8359375, "router_z_loss_mlp": 0.38745117, "step": 1497, "time_per_iteration": 2.579988479614258 }, { "auxiliary_loss_clip": 0.01725993, "auxiliary_loss_mlp": 0.00400293, "balance_loss_clip": 1.33496189, "balance_loss_mlp": 0.34950966, "epoch": 0.09006463249661807, "flos": 24280317964800.0, "grad_norm": 52.490545434077504, "language_loss": 0.89416736, "learning_rate": 3.9622749007336035e-06, "loss": 0.91543025, "num_input_tokens_seen": 32005180, "router_z_loss_clip": 3.91015625, "router_z_loss_mlp": 0.50756836, "step": 1498, "time_per_iteration": 2.6758151054382324 }, { "auxiliary_loss_clip": 0.01759891, "auxiliary_loss_mlp": 0.00374046, "balance_loss_clip": 1.37216735, "balance_loss_mlp": 0.33086801, "epoch": 0.09012475574928604, "flos": 13661928069120.0, "grad_norm": 3.952955149616601, "language_loss": 0.87263429, "learning_rate": 3.962199576140195e-06, "loss": 0.89397365, "num_input_tokens_seen": 32022970, "router_z_loss_clip": 3.87304688, "router_z_loss_mlp": 0.43188477, "step": 1499, "time_per_iteration": 4.077389478683472 }, { "auxiliary_loss_clip": 0.01773848, "auxiliary_loss_mlp": 0.00372398, "balance_loss_clip": 1.38102996, "balance_loss_mlp": 0.32902998, "epoch": 0.090184879001954, "flos": 23327751237120.0, "grad_norm": 7.215151261617818, "language_loss": 0.9849751, "learning_rate": 3.962124177139981e-06, "loss": 1.00643754, "num_input_tokens_seen": 32043055, "router_z_loss_clip": 3.92773438, "router_z_loss_mlp": 0.43383789, "step": 1500, "time_per_iteration": 2.6519615650177 }, { "auxiliary_loss_clip": 0.01782728, "auxiliary_loss_mlp": 0.00424648, "balance_loss_clip": 1.36813414, "balance_loss_mlp": 0.37312621, "epoch": 0.09024500225462198, "flos": 23002688131200.0, "grad_norm": 35.5678122782809, "language_loss": 0.8169477, "learning_rate": 3.962048703735822e-06, "loss": 0.83902144, "num_input_tokens_seen": 32061900, "router_z_loss_clip": 4.14453125, "router_z_loss_mlp": 0.51513672, "step": 1501, "time_per_iteration": 2.6926915645599365 }, { "auxiliary_loss_clip": 0.01675956, "auxiliary_loss_mlp": 0.00110863, "balance_loss_clip": 1.38541698, "balance_loss_mlp": 0.09045473, "epoch": 0.09030512550728995, "flos": 62189203242240.0, "grad_norm": 0.8179451123860338, "language_loss": 0.58089972, "learning_rate": 3.96197315593058e-06, "loss": 0.59876794, "num_input_tokens_seen": 32122745, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.20410156, "step": 1502, "time_per_iteration": 4.5393595695495605 }, { "auxiliary_loss_clip": 0.01816271, "auxiliary_loss_mlp": 0.00410913, "balance_loss_clip": 1.39728427, "balance_loss_mlp": 0.36673456, "epoch": 0.09036524875995791, "flos": 38800689171840.0, "grad_norm": 3.3062648926156704, "language_loss": 0.7925148, "learning_rate": 3.961897533727119e-06, "loss": 0.81478655, "num_input_tokens_seen": 32145125, "router_z_loss_clip": 4.19140625, "router_z_loss_mlp": 0.44189453, "step": 1503, "time_per_iteration": 2.825369358062744 }, { "auxiliary_loss_clip": 0.0183539, "auxiliary_loss_mlp": 0.00409794, "balance_loss_clip": 1.40867186, "balance_loss_mlp": 0.36304, "epoch": 0.09042537201262588, "flos": 21690081429120.0, "grad_norm": 4.26176357684905, "language_loss": 0.92152816, "learning_rate": 3.961821837128306e-06, "loss": 0.94398004, "num_input_tokens_seen": 32166255, "router_z_loss_clip": 4.265625, "router_z_loss_mlp": 0.46728516, "step": 1504, "time_per_iteration": 2.6720006465911865 }, { "auxiliary_loss_clip": 0.01830553, "auxiliary_loss_mlp": 0.00450672, "balance_loss_clip": 1.40082777, "balance_loss_mlp": 0.40048489, "epoch": 0.09048549526529386, "flos": 22267021680000.0, "grad_norm": 200.18885387245447, "language_loss": 0.80855888, "learning_rate": 3.961746066137014e-06, "loss": 0.83137119, "num_input_tokens_seen": 32184010, "router_z_loss_clip": 4.296875, "router_z_loss_mlp": 0.50219727, "step": 1505, "time_per_iteration": 2.5953216552734375 }, { "auxiliary_loss_clip": 0.01792472, "auxiliary_loss_mlp": 0.00455161, "balance_loss_clip": 1.37919021, "balance_loss_mlp": 0.40912265, "epoch": 0.09054561851796182, "flos": 14610939350400.0, "grad_norm": 12.295315050612254, "language_loss": 0.91298413, "learning_rate": 3.961670220756114e-06, "loss": 0.93546045, "num_input_tokens_seen": 32201635, "router_z_loss_clip": 4.12890625, "router_z_loss_mlp": 0.46020508, "step": 1506, "time_per_iteration": 2.6337547302246094 }, { "auxiliary_loss_clip": 0.01809916, "auxiliary_loss_mlp": 0.00417883, "balance_loss_clip": 1.39665592, "balance_loss_mlp": 0.3760885, "epoch": 0.09060574177062979, "flos": 27636169916160.0, "grad_norm": 2.468973553449486, "language_loss": 0.84710658, "learning_rate": 3.961594300988482e-06, "loss": 0.86938465, "num_input_tokens_seen": 32221940, "router_z_loss_clip": 4.12304688, "router_z_loss_mlp": 0.41821289, "step": 1507, "time_per_iteration": 2.6579830646514893 }, { "auxiliary_loss_clip": 0.01730123, "auxiliary_loss_mlp": 0.0014326, "balance_loss_clip": 1.43544436, "balance_loss_mlp": 0.12456796, "epoch": 0.09066586502329776, "flos": 66085797513600.0, "grad_norm": 0.7533481289751136, "language_loss": 0.57411951, "learning_rate": 3.961518306836998e-06, "loss": 0.59285331, "num_input_tokens_seen": 32276495, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.18652344, "step": 1508, "time_per_iteration": 3.042431592941284 }, { "auxiliary_loss_clip": 0.01792571, "auxiliary_loss_mlp": 0.00449326, "balance_loss_clip": 1.37518454, "balance_loss_mlp": 0.40724522, "epoch": 0.09072598827596573, "flos": 18916449027840.0, "grad_norm": 10.80070953798028, "language_loss": 0.91211253, "learning_rate": 3.961442238304543e-06, "loss": 0.93453151, "num_input_tokens_seen": 32294130, "router_z_loss_clip": 4.17578125, "router_z_loss_mlp": 0.42138672, "step": 1509, "time_per_iteration": 2.6387927532196045 }, { "auxiliary_loss_clip": 0.01801171, "auxiliary_loss_mlp": 0.00455922, "balance_loss_clip": 1.37930453, "balance_loss_mlp": 0.41069421, "epoch": 0.0907861115286337, "flos": 24821742643200.0, "grad_norm": 34.337139873825734, "language_loss": 0.93186909, "learning_rate": 3.961366095394002e-06, "loss": 0.95444006, "num_input_tokens_seen": 32313555, "router_z_loss_clip": 4.21484375, "router_z_loss_mlp": 0.4519043, "step": 1510, "time_per_iteration": 2.6705751419067383 }, { "auxiliary_loss_clip": 0.01799356, "auxiliary_loss_mlp": 0.00483077, "balance_loss_clip": 1.38159049, "balance_loss_mlp": 0.44013757, "epoch": 0.09084623478130167, "flos": 21652842003840.0, "grad_norm": 4.373573658731089, "language_loss": 0.93755358, "learning_rate": 3.961289878108262e-06, "loss": 0.96037793, "num_input_tokens_seen": 32331430, "router_z_loss_clip": 4.17578125, "router_z_loss_mlp": 0.42895508, "step": 1511, "time_per_iteration": 2.6582579612731934 }, { "auxiliary_loss_clip": 0.01785887, "auxiliary_loss_mlp": 0.0048177, "balance_loss_clip": 1.36791825, "balance_loss_mlp": 0.4371618, "epoch": 0.09090635803396964, "flos": 27639258485760.0, "grad_norm": 2.832251317020543, "language_loss": 0.89805901, "learning_rate": 3.9612135864502135e-06, "loss": 0.92073554, "num_input_tokens_seen": 32353705, "router_z_loss_clip": 4.1796875, "router_z_loss_mlp": 0.44628906, "step": 1512, "time_per_iteration": 2.6895296573638916 }, { "auxiliary_loss_clip": 0.0180561, "auxiliary_loss_mlp": 0.00437455, "balance_loss_clip": 1.39308035, "balance_loss_mlp": 0.394611, "epoch": 0.0909664812866376, "flos": 17669127294720.0, "grad_norm": 186.96223430803812, "language_loss": 0.94236714, "learning_rate": 3.961137220422749e-06, "loss": 0.96479774, "num_input_tokens_seen": 32370520, "router_z_loss_clip": 4.12890625, "router_z_loss_mlp": 0.42797852, "step": 1513, "time_per_iteration": 2.629394769668579 }, { "auxiliary_loss_clip": 0.01779137, "auxiliary_loss_mlp": 0.00497217, "balance_loss_clip": 1.35778809, "balance_loss_mlp": 0.45587537, "epoch": 0.09102660453930557, "flos": 23951448017280.0, "grad_norm": 263.76130801197286, "language_loss": 0.92893934, "learning_rate": 3.961060780028764e-06, "loss": 0.95170289, "num_input_tokens_seen": 32389105, "router_z_loss_clip": 4.21484375, "router_z_loss_mlp": 0.41357422, "step": 1514, "time_per_iteration": 2.731889009475708 }, { "auxiliary_loss_clip": 0.01761006, "auxiliary_loss_mlp": 0.00442167, "balance_loss_clip": 1.35008144, "balance_loss_mlp": 0.40072942, "epoch": 0.09108672779197355, "flos": 25812949426560.0, "grad_norm": 12.330847545217834, "language_loss": 0.95575231, "learning_rate": 3.960984265271159e-06, "loss": 0.97778404, "num_input_tokens_seen": 32408065, "router_z_loss_clip": 4.11132812, "router_z_loss_mlp": 0.41430664, "step": 1515, "time_per_iteration": 2.8291573524475098 }, { "auxiliary_loss_clip": 0.01759858, "auxiliary_loss_mlp": 0.00524765, "balance_loss_clip": 1.34196186, "balance_loss_mlp": 0.47817805, "epoch": 0.09114685104464151, "flos": 29639482220160.0, "grad_norm": 342.3887823075521, "language_loss": 0.92929971, "learning_rate": 3.9609076761528335e-06, "loss": 0.95214593, "num_input_tokens_seen": 32427225, "router_z_loss_clip": 4.17578125, "router_z_loss_mlp": 0.46582031, "step": 1516, "time_per_iteration": 2.7146244049072266 }, { "auxiliary_loss_clip": 0.01728076, "auxiliary_loss_mlp": 0.00471359, "balance_loss_clip": 1.32002854, "balance_loss_mlp": 0.42799073, "epoch": 0.09120697429730948, "flos": 33729635905920.0, "grad_norm": 8.730817973817866, "language_loss": 0.85792905, "learning_rate": 3.960831012676692e-06, "loss": 0.8799234, "num_input_tokens_seen": 32450510, "router_z_loss_clip": 4.08398438, "router_z_loss_mlp": 0.43359375, "step": 1517, "time_per_iteration": 2.7968058586120605 }, { "auxiliary_loss_clip": 0.01734379, "auxiliary_loss_mlp": 0.00537622, "balance_loss_clip": 1.31959426, "balance_loss_mlp": 0.49113011, "epoch": 0.09126709754997746, "flos": 18401381953920.0, "grad_norm": 6.455216979365299, "language_loss": 0.84012085, "learning_rate": 3.960754274845642e-06, "loss": 0.86284089, "num_input_tokens_seen": 32468425, "router_z_loss_clip": 4.15234375, "router_z_loss_mlp": 0.46484375, "step": 1518, "time_per_iteration": 2.6377646923065186 }, { "auxiliary_loss_clip": 0.01702512, "auxiliary_loss_mlp": 0.00512369, "balance_loss_clip": 1.29633868, "balance_loss_mlp": 0.46761751, "epoch": 0.09132722080264542, "flos": 22091957769600.0, "grad_norm": 4.481546695648526, "language_loss": 0.94656217, "learning_rate": 3.960677462662594e-06, "loss": 0.96871096, "num_input_tokens_seen": 32487510, "router_z_loss_clip": 4.05664062, "router_z_loss_mlp": 0.44750977, "step": 1519, "time_per_iteration": 2.7401487827301025 }, { "auxiliary_loss_clip": 0.01709386, "auxiliary_loss_mlp": 0.00514006, "balance_loss_clip": 1.29701912, "balance_loss_mlp": 0.46813428, "epoch": 0.09138734405531339, "flos": 21033131633280.0, "grad_norm": 17.587547218197223, "language_loss": 0.81025362, "learning_rate": 3.96060057613046e-06, "loss": 0.83248752, "num_input_tokens_seen": 32507250, "router_z_loss_clip": 4.125, "router_z_loss_mlp": 0.45898438, "step": 1520, "time_per_iteration": 2.641139268875122 }, { "auxiliary_loss_clip": 0.01675844, "auxiliary_loss_mlp": 0.00533127, "balance_loss_clip": 1.27065396, "balance_loss_mlp": 0.48427501, "epoch": 0.09144746730798137, "flos": 20083940784000.0, "grad_norm": 371.9197341888047, "language_loss": 0.95201653, "learning_rate": 3.960523615252156e-06, "loss": 0.97410619, "num_input_tokens_seen": 32526045, "router_z_loss_clip": 4.05273438, "router_z_loss_mlp": 0.48803711, "step": 1521, "time_per_iteration": 2.713355541229248 }, { "auxiliary_loss_clip": 0.01669803, "auxiliary_loss_mlp": 0.00542446, "balance_loss_clip": 1.2612685, "balance_loss_mlp": 0.49585882, "epoch": 0.09150759056064933, "flos": 22778210085120.0, "grad_norm": 7.659337601814755, "language_loss": 0.9075892, "learning_rate": 3.960446580030599e-06, "loss": 0.92971164, "num_input_tokens_seen": 32546575, "router_z_loss_clip": 4.08789062, "router_z_loss_mlp": 0.46582031, "step": 1522, "time_per_iteration": 2.680223226547241 }, { "auxiliary_loss_clip": 0.01675039, "auxiliary_loss_mlp": 0.0054828, "balance_loss_clip": 1.2633152, "balance_loss_mlp": 0.50238472, "epoch": 0.0915677138133173, "flos": 27564205017600.0, "grad_norm": 40.88806898134373, "language_loss": 0.8713423, "learning_rate": 3.960369470468711e-06, "loss": 0.89357555, "num_input_tokens_seen": 32568795, "router_z_loss_clip": 4.12890625, "router_z_loss_mlp": 0.45874023, "step": 1523, "time_per_iteration": 2.7535557746887207 }, { "auxiliary_loss_clip": 0.01697219, "auxiliary_loss_mlp": 0.00535494, "balance_loss_clip": 1.28180909, "balance_loss_mlp": 0.49057603, "epoch": 0.09162783706598528, "flos": 17674765729920.0, "grad_norm": 180.54293566040218, "language_loss": 0.81970835, "learning_rate": 3.960292286569418e-06, "loss": 0.84203541, "num_input_tokens_seen": 32587010, "router_z_loss_clip": 4.16015625, "router_z_loss_mlp": 0.44946289, "step": 1524, "time_per_iteration": 2.6478545665740967 }, { "auxiliary_loss_clip": 0.01699438, "auxiliary_loss_mlp": 0.00542212, "balance_loss_clip": 1.29479074, "balance_loss_mlp": 0.49681699, "epoch": 0.09168796031865324, "flos": 18478195188480.0, "grad_norm": 47.74393968052814, "language_loss": 0.94465744, "learning_rate": 3.960215028335644e-06, "loss": 0.96707398, "num_input_tokens_seen": 32602375, "router_z_loss_clip": 4.04296875, "router_z_loss_mlp": 0.45410156, "step": 1525, "time_per_iteration": 2.611694574356079 }, { "auxiliary_loss_clip": 0.01682704, "auxiliary_loss_mlp": 0.00602508, "balance_loss_clip": 1.27430856, "balance_loss_mlp": 0.5536803, "epoch": 0.0917480835713212, "flos": 29387605075200.0, "grad_norm": 17.444547939193942, "language_loss": 0.8180362, "learning_rate": 3.96013769577032e-06, "loss": 0.84088832, "num_input_tokens_seen": 32621460, "router_z_loss_clip": 4.078125, "router_z_loss_mlp": 0.48828125, "step": 1526, "time_per_iteration": 2.7148869037628174 }, { "auxiliary_loss_clip": 0.01644479, "auxiliary_loss_mlp": 0.00599545, "balance_loss_clip": 1.23871422, "balance_loss_mlp": 0.54904819, "epoch": 0.09180820682398917, "flos": 19829262378240.0, "grad_norm": 7.337035611845298, "language_loss": 0.8390677, "learning_rate": 3.960060288876378e-06, "loss": 0.86150795, "num_input_tokens_seen": 32640440, "router_z_loss_clip": 4.05859375, "router_z_loss_mlp": 0.50415039, "step": 1527, "time_per_iteration": 2.6500484943389893 }, { "auxiliary_loss_clip": 0.01642783, "auxiliary_loss_mlp": 0.00656761, "balance_loss_clip": 1.23855948, "balance_loss_mlp": 0.60719419, "epoch": 0.09186833007665715, "flos": 23841848643840.0, "grad_norm": 4.703988269676352, "language_loss": 0.87835938, "learning_rate": 3.959982807656753e-06, "loss": 0.90135479, "num_input_tokens_seen": 32660020, "router_z_loss_clip": 4.04296875, "router_z_loss_mlp": 0.49609375, "step": 1528, "time_per_iteration": 2.768157958984375 }, { "auxiliary_loss_clip": 0.01654742, "auxiliary_loss_mlp": 0.00693401, "balance_loss_clip": 1.23855722, "balance_loss_mlp": 0.63660967, "epoch": 0.09192845332932512, "flos": 12932726065920.0, "grad_norm": 32.63815762770739, "language_loss": 0.86390901, "learning_rate": 3.959905252114384e-06, "loss": 0.88739043, "num_input_tokens_seen": 32678170, "router_z_loss_clip": 4.1640625, "router_z_loss_mlp": 0.56835938, "step": 1529, "time_per_iteration": 2.6292948722839355 }, { "auxiliary_loss_clip": 0.01656884, "auxiliary_loss_mlp": 0.0064774, "balance_loss_clip": 1.24080634, "balance_loss_mlp": 0.59390533, "epoch": 0.09198857658199308, "flos": 24568177559040.0, "grad_norm": 88.35197366551327, "language_loss": 0.873698, "learning_rate": 3.959827622252211e-06, "loss": 0.89674419, "num_input_tokens_seen": 32697540, "router_z_loss_clip": 4.15625, "router_z_loss_mlp": 0.53833008, "step": 1530, "time_per_iteration": 2.6976447105407715 }, { "auxiliary_loss_clip": 0.01636188, "auxiliary_loss_mlp": 0.0063498, "balance_loss_clip": 1.2365458, "balance_loss_mlp": 0.58579385, "epoch": 0.09204869983466106, "flos": 20266941600000.0, "grad_norm": 314.41043627810313, "language_loss": 0.90581191, "learning_rate": 3.959749918073179e-06, "loss": 0.92852366, "num_input_tokens_seen": 32716805, "router_z_loss_clip": 3.99804688, "router_z_loss_mlp": 0.4921875, "step": 1531, "time_per_iteration": 2.651965618133545 }, { "auxiliary_loss_clip": 0.01652492, "auxiliary_loss_mlp": 0.00679882, "balance_loss_clip": 1.24188805, "balance_loss_mlp": 0.62969446, "epoch": 0.09210882308732903, "flos": 20885646389760.0, "grad_norm": 62.946539908423745, "language_loss": 0.86835289, "learning_rate": 3.959672139580233e-06, "loss": 0.8916766, "num_input_tokens_seen": 32736385, "router_z_loss_clip": 4.10742188, "router_z_loss_mlp": 0.5012207, "step": 1532, "time_per_iteration": 2.651139736175537 }, { "auxiliary_loss_clip": 0.01633058, "auxiliary_loss_mlp": 0.00613392, "balance_loss_clip": 1.22748733, "balance_loss_mlp": 0.56604242, "epoch": 0.09216894633999699, "flos": 30956326727040.0, "grad_norm": 75.54649267875463, "language_loss": 0.90606219, "learning_rate": 3.9595942867763235e-06, "loss": 0.9285267, "num_input_tokens_seen": 32757140, "router_z_loss_clip": 4.05859375, "router_z_loss_mlp": 0.47338867, "step": 1533, "time_per_iteration": 2.776776075363159 }, { "auxiliary_loss_clip": 0.01655275, "auxiliary_loss_mlp": 0.00661887, "balance_loss_clip": 1.24497104, "balance_loss_mlp": 0.61243916, "epoch": 0.09222906959266497, "flos": 13151565676800.0, "grad_norm": 16.055698880649206, "language_loss": 0.97866738, "learning_rate": 3.959516359664402e-06, "loss": 1.00183892, "num_input_tokens_seen": 32774860, "router_z_loss_clip": 4.10546875, "router_z_loss_mlp": 0.49462891, "step": 1534, "time_per_iteration": 2.681025266647339 }, { "auxiliary_loss_clip": 0.01659167, "auxiliary_loss_mlp": 0.00706021, "balance_loss_clip": 1.24455714, "balance_loss_mlp": 0.65602505, "epoch": 0.09228919284533293, "flos": 25994477784960.0, "grad_norm": 4.398359631916404, "language_loss": 0.83252072, "learning_rate": 3.959438358247424e-06, "loss": 0.85617262, "num_input_tokens_seen": 32795250, "router_z_loss_clip": 4.14453125, "router_z_loss_mlp": 0.5, "step": 1535, "time_per_iteration": 2.6962666511535645 }, { "auxiliary_loss_clip": 0.01639613, "auxiliary_loss_mlp": 0.00653107, "balance_loss_clip": 1.23045468, "balance_loss_mlp": 0.6034447, "epoch": 0.0923493160980009, "flos": 18660800954880.0, "grad_norm": 11.297827703421754, "language_loss": 0.86581314, "learning_rate": 3.959360282528346e-06, "loss": 0.88874042, "num_input_tokens_seen": 32813805, "router_z_loss_clip": 4.08984375, "router_z_loss_mlp": 0.49658203, "step": 1536, "time_per_iteration": 2.664775848388672 }, { "auxiliary_loss_clip": 0.01662177, "auxiliary_loss_mlp": 0.00701084, "balance_loss_clip": 1.24532378, "balance_loss_mlp": 0.65113533, "epoch": 0.09240943935066886, "flos": 21140576190720.0, "grad_norm": 18.306664001246855, "language_loss": 0.95891291, "learning_rate": 3.959282132510131e-06, "loss": 0.9825455, "num_input_tokens_seen": 32830960, "router_z_loss_clip": 4.171875, "router_z_loss_mlp": 0.49926758, "step": 1537, "time_per_iteration": 4.145832777023315 }, { "auxiliary_loss_clip": 0.01658308, "auxiliary_loss_mlp": 0.00634429, "balance_loss_clip": 1.24047887, "balance_loss_mlp": 0.58478999, "epoch": 0.09246956260333684, "flos": 20592435669120.0, "grad_norm": 92.64681478904845, "language_loss": 0.87369573, "learning_rate": 3.959203908195741e-06, "loss": 0.89662313, "num_input_tokens_seen": 32848275, "router_z_loss_clip": 4.17382812, "router_z_loss_mlp": 0.49682617, "step": 1538, "time_per_iteration": 4.041836738586426 }, { "auxiliary_loss_clip": 0.01565601, "auxiliary_loss_mlp": 0.00313331, "balance_loss_clip": 1.25927317, "balance_loss_mlp": 0.28996646, "epoch": 0.09252968585600481, "flos": 67558710614400.0, "grad_norm": 2.2845906255469743, "language_loss": 0.57442313, "learning_rate": 3.959125609588142e-06, "loss": 0.59321243, "num_input_tokens_seen": 32917730, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.23339844, "step": 1539, "time_per_iteration": 3.273526430130005 }, { "auxiliary_loss_clip": 0.01628467, "auxiliary_loss_mlp": 0.00656435, "balance_loss_clip": 1.21253383, "balance_loss_mlp": 0.60770178, "epoch": 0.09258980910867277, "flos": 17383853479680.0, "grad_norm": 8.888998114133742, "language_loss": 0.79205966, "learning_rate": 3.959047236690304e-06, "loss": 0.81490862, "num_input_tokens_seen": 32934910, "router_z_loss_clip": 4.15625, "router_z_loss_mlp": 0.48754883, "step": 1540, "time_per_iteration": 2.6476352214813232 }, { "auxiliary_loss_clip": 0.01629932, "auxiliary_loss_mlp": 0.00602607, "balance_loss_clip": 1.21026552, "balance_loss_mlp": 0.55413616, "epoch": 0.09264993236134075, "flos": 19865927185920.0, "grad_norm": 34.944447123019785, "language_loss": 0.87133998, "learning_rate": 3.958968789505198e-06, "loss": 0.89366537, "num_input_tokens_seen": 32953840, "router_z_loss_clip": 4.19726562, "router_z_loss_mlp": 0.48461914, "step": 1541, "time_per_iteration": 4.154497861862183 }, { "auxiliary_loss_clip": 0.01482631, "auxiliary_loss_mlp": 0.00291397, "balance_loss_clip": 1.19269955, "balance_loss_mlp": 0.2718465, "epoch": 0.09271005561400872, "flos": 62284401262080.0, "grad_norm": 0.9151811691425403, "language_loss": 0.62236893, "learning_rate": 3.9588902680358e-06, "loss": 0.64010918, "num_input_tokens_seen": 33011410, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.1953125, "step": 1542, "time_per_iteration": 3.132094144821167 }, { "auxiliary_loss_clip": 0.01625259, "auxiliary_loss_mlp": 0.00630527, "balance_loss_clip": 1.20626616, "balance_loss_mlp": 0.57893336, "epoch": 0.09277017886667668, "flos": 23329870139520.0, "grad_norm": 8.859439562296323, "language_loss": 0.87736726, "learning_rate": 3.958811672285086e-06, "loss": 0.89992511, "num_input_tokens_seen": 33031675, "router_z_loss_clip": 4.1875, "router_z_loss_mlp": 0.51611328, "step": 1543, "time_per_iteration": 2.6965577602386475 }, { "auxiliary_loss_clip": 0.01606999, "auxiliary_loss_mlp": 0.00647212, "balance_loss_clip": 1.19383538, "balance_loss_mlp": 0.59771681, "epoch": 0.09283030211934466, "flos": 54745169875200.0, "grad_norm": 16.406634187051356, "language_loss": 0.77680957, "learning_rate": 3.958733002256038e-06, "loss": 0.79935169, "num_input_tokens_seen": 33056355, "router_z_loss_clip": 4.13671875, "router_z_loss_mlp": 0.49438477, "step": 1544, "time_per_iteration": 4.575198411941528 }, { "auxiliary_loss_clip": 0.01628789, "auxiliary_loss_mlp": 0.00619671, "balance_loss_clip": 1.20869875, "balance_loss_mlp": 0.56977046, "epoch": 0.09289042537201263, "flos": 30334784762880.0, "grad_norm": 7.921700233245406, "language_loss": 0.81350303, "learning_rate": 3.958654257951637e-06, "loss": 0.83598763, "num_input_tokens_seen": 33079520, "router_z_loss_clip": 4.203125, "router_z_loss_mlp": 0.49853516, "step": 1545, "time_per_iteration": 2.7150626182556152 }, { "auxiliary_loss_clip": 0.01602438, "auxiliary_loss_mlp": 0.00590741, "balance_loss_clip": 1.20135772, "balance_loss_mlp": 0.54203242, "epoch": 0.09295054862468059, "flos": 17746838369280.0, "grad_norm": 71.81283007322989, "language_loss": 0.86549926, "learning_rate": 3.9585754393748706e-06, "loss": 0.88743103, "num_input_tokens_seen": 33096135, "router_z_loss_clip": 4.01757812, "router_z_loss_mlp": 0.48730469, "step": 1546, "time_per_iteration": 2.6454825401306152 }, { "auxiliary_loss_clip": 0.01619346, "auxiliary_loss_mlp": 0.00621272, "balance_loss_clip": 1.2106967, "balance_loss_mlp": 0.5710851, "epoch": 0.09301067187734856, "flos": 23658021815040.0, "grad_norm": 3.593977388948039, "language_loss": 0.89920056, "learning_rate": 3.9584965465287275e-06, "loss": 0.92160666, "num_input_tokens_seen": 33115245, "router_z_loss_clip": 4.0859375, "router_z_loss_mlp": 0.50219727, "step": 1547, "time_per_iteration": 2.631695032119751 }, { "auxiliary_loss_clip": 0.01589727, "auxiliary_loss_mlp": 0.00604264, "balance_loss_clip": 1.1914252, "balance_loss_mlp": 0.55481565, "epoch": 0.09307079513001654, "flos": 27527719777920.0, "grad_norm": 635.9342212026423, "language_loss": 0.76053756, "learning_rate": 3.958417579416199e-06, "loss": 0.78247744, "num_input_tokens_seen": 33136640, "router_z_loss_clip": 3.984375, "router_z_loss_mlp": 0.49487305, "step": 1548, "time_per_iteration": 2.699474811553955 }, { "auxiliary_loss_clip": 0.01607152, "auxiliary_loss_mlp": 0.00569635, "balance_loss_clip": 1.19668519, "balance_loss_mlp": 0.5242399, "epoch": 0.0931309183826845, "flos": 20627340710400.0, "grad_norm": 13.33747818995187, "language_loss": 0.89217889, "learning_rate": 3.9583385380402795e-06, "loss": 0.91394675, "num_input_tokens_seen": 33155060, "router_z_loss_clip": 4.1015625, "router_z_loss_mlp": 0.45410156, "step": 1549, "time_per_iteration": 2.651573896408081 }, { "auxiliary_loss_clip": 0.0159192, "auxiliary_loss_mlp": 0.00550111, "balance_loss_clip": 1.19058537, "balance_loss_mlp": 0.50199813, "epoch": 0.09319104163535247, "flos": 29020921084800.0, "grad_norm": 10.880066456355165, "language_loss": 0.82360852, "learning_rate": 3.958259422403966e-06, "loss": 0.84502882, "num_input_tokens_seen": 33175420, "router_z_loss_clip": 4.01171875, "router_z_loss_mlp": 0.48144531, "step": 1550, "time_per_iteration": 2.749711275100708 }, { "auxiliary_loss_clip": 0.01589303, "auxiliary_loss_mlp": 0.00545529, "balance_loss_clip": 1.19195426, "balance_loss_mlp": 0.49887088, "epoch": 0.09325116488802045, "flos": 25301545539840.0, "grad_norm": 27.730445852207158, "language_loss": 0.90889001, "learning_rate": 3.95818023251026e-06, "loss": 0.93023831, "num_input_tokens_seen": 33194120, "router_z_loss_clip": 3.96679688, "router_z_loss_mlp": 0.46655273, "step": 1551, "time_per_iteration": 2.7047722339630127 }, { "auxiliary_loss_clip": 0.01495563, "auxiliary_loss_mlp": 0.002173, "balance_loss_clip": 1.23523831, "balance_loss_mlp": 0.20204151, "epoch": 0.09331128814068841, "flos": 61536203942400.0, "grad_norm": 0.7452644886119041, "language_loss": 0.61796463, "learning_rate": 3.958100968362163e-06, "loss": 0.63509321, "num_input_tokens_seen": 33261080, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.15234375, "step": 1552, "time_per_iteration": 3.268558979034424 }, { "auxiliary_loss_clip": 0.01513505, "auxiliary_loss_mlp": 0.00211826, "balance_loss_clip": 1.24946463, "balance_loss_mlp": 0.19570903, "epoch": 0.09337141139335638, "flos": 53293700171520.0, "grad_norm": 0.8049983232334689, "language_loss": 0.59318334, "learning_rate": 3.958021629962681e-06, "loss": 0.61043668, "num_input_tokens_seen": 33330235, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.16113281, "step": 1553, "time_per_iteration": 3.2934842109680176 }, { "auxiliary_loss_clip": 0.01600356, "auxiliary_loss_mlp": 0.00551251, "balance_loss_clip": 1.18251085, "balance_loss_mlp": 0.50330538, "epoch": 0.09343153464602436, "flos": 23476852592640.0, "grad_norm": 54.272013237657355, "language_loss": 0.93816429, "learning_rate": 3.957942217314823e-06, "loss": 0.95968044, "num_input_tokens_seen": 33349035, "router_z_loss_clip": 4.17773438, "router_z_loss_mlp": 0.47973633, "step": 1554, "time_per_iteration": 2.648390293121338 }, { "auxiliary_loss_clip": 0.01581516, "auxiliary_loss_mlp": 0.00490826, "balance_loss_clip": 1.17801082, "balance_loss_mlp": 0.44802943, "epoch": 0.09349165789869232, "flos": 19353481804800.0, "grad_norm": 67.96675484368957, "language_loss": 0.86747026, "learning_rate": 3.957862730421599e-06, "loss": 0.88819373, "num_input_tokens_seen": 33368060, "router_z_loss_clip": 4.02929688, "router_z_loss_mlp": 0.42822266, "step": 1555, "time_per_iteration": 2.69736385345459 }, { "auxiliary_loss_clip": 0.01492957, "auxiliary_loss_mlp": 0.00228128, "balance_loss_clip": 1.23023033, "balance_loss_mlp": 0.21181993, "epoch": 0.09355178115136029, "flos": 67502580635520.0, "grad_norm": 0.8555562176413279, "language_loss": 0.60012031, "learning_rate": 3.957783169286024e-06, "loss": 0.61733115, "num_input_tokens_seen": 33430825, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.16308594, "step": 1556, "time_per_iteration": 3.1395955085754395 }, { "auxiliary_loss_clip": 0.01585635, "auxiliary_loss_mlp": 0.00490798, "balance_loss_clip": 1.17867541, "balance_loss_mlp": 0.44788292, "epoch": 0.09361190440402825, "flos": 37341638720640.0, "grad_norm": 182.6493193179392, "language_loss": 0.90195459, "learning_rate": 3.9577035339111155e-06, "loss": 0.92271888, "num_input_tokens_seen": 33454855, "router_z_loss_clip": 4.0703125, "router_z_loss_mlp": 0.42895508, "step": 1557, "time_per_iteration": 2.818901777267456 }, { "auxiliary_loss_clip": 0.01555467, "auxiliary_loss_mlp": 0.00529103, "balance_loss_clip": 1.15301824, "balance_loss_mlp": 0.48127645, "epoch": 0.09367202765669623, "flos": 24899705112960.0, "grad_norm": 61.8930760389845, "language_loss": 0.82229632, "learning_rate": 3.957623824299893e-06, "loss": 0.84314203, "num_input_tokens_seen": 33476000, "router_z_loss_clip": 4.0234375, "router_z_loss_mlp": 0.47802734, "step": 1558, "time_per_iteration": 2.7207813262939453 }, { "auxiliary_loss_clip": 0.01570947, "auxiliary_loss_mlp": 0.00499843, "balance_loss_clip": 1.16724968, "balance_loss_mlp": 0.45373327, "epoch": 0.0937321509093642, "flos": 15705568368000.0, "grad_norm": 20.696220525003472, "language_loss": 0.86067146, "learning_rate": 3.957544040455379e-06, "loss": 0.88137931, "num_input_tokens_seen": 33493845, "router_z_loss_clip": 4.03320312, "router_z_loss_mlp": 0.46118164, "step": 1559, "time_per_iteration": 2.661522150039673 }, { "auxiliary_loss_clip": 0.01564957, "auxiliary_loss_mlp": 0.00517802, "balance_loss_clip": 1.16092741, "balance_loss_mlp": 0.4689739, "epoch": 0.09379227416203216, "flos": 20483698222080.0, "grad_norm": 4.882125126426394, "language_loss": 0.82246792, "learning_rate": 3.957464182380599e-06, "loss": 0.84329557, "num_input_tokens_seen": 33510850, "router_z_loss_clip": 4.04101562, "router_z_loss_mlp": 0.48828125, "step": 1560, "time_per_iteration": 2.708096742630005 }, { "auxiliary_loss_clip": 0.01541136, "auxiliary_loss_mlp": 0.00531468, "balance_loss_clip": 1.14612556, "balance_loss_mlp": 0.48209196, "epoch": 0.09385239741470014, "flos": 24352498344960.0, "grad_norm": 816.4133272574475, "language_loss": 0.85582119, "learning_rate": 3.95738425007858e-06, "loss": 0.87654728, "num_input_tokens_seen": 33530430, "router_z_loss_clip": 3.94921875, "router_z_loss_mlp": 0.49389648, "step": 1561, "time_per_iteration": 2.6832385063171387 }, { "auxiliary_loss_clip": 0.01533228, "auxiliary_loss_mlp": 0.00482349, "balance_loss_clip": 1.14608204, "balance_loss_mlp": 0.438885, "epoch": 0.0939125206673681, "flos": 33291489807360.0, "grad_norm": 3.420960250163326, "language_loss": 0.68902409, "learning_rate": 3.957304243552354e-06, "loss": 0.70917988, "num_input_tokens_seen": 33551975, "router_z_loss_clip": 3.87109375, "router_z_loss_mlp": 0.43457031, "step": 1562, "time_per_iteration": 2.790388584136963 }, { "auxiliary_loss_clip": 0.01506302, "auxiliary_loss_mlp": 0.00506693, "balance_loss_clip": 1.13592994, "balance_loss_mlp": 0.46423072, "epoch": 0.09397264392003607, "flos": 19244923925760.0, "grad_norm": 7.522424022735563, "language_loss": 0.9164809, "learning_rate": 3.957224162804956e-06, "loss": 0.93661082, "num_input_tokens_seen": 33569850, "router_z_loss_clip": 3.703125, "router_z_loss_mlp": 0.42480469, "step": 1563, "time_per_iteration": 2.6451759338378906 }, { "auxiliary_loss_clip": 0.01520821, "auxiliary_loss_mlp": 0.00518274, "balance_loss_clip": 1.13607562, "balance_loss_mlp": 0.47099608, "epoch": 0.09403276717270405, "flos": 19317930318720.0, "grad_norm": 12.675444127368275, "language_loss": 0.82647002, "learning_rate": 3.9571440078394205e-06, "loss": 0.84686095, "num_input_tokens_seen": 33590510, "router_z_loss_clip": 3.84375, "router_z_loss_mlp": 0.47290039, "step": 1564, "time_per_iteration": 2.781106472015381 }, { "auxiliary_loss_clip": 0.0149551, "auxiliary_loss_mlp": 0.00518837, "balance_loss_clip": 1.1335746, "balance_loss_mlp": 0.47706646, "epoch": 0.09409289042537201, "flos": 23583471137280.0, "grad_norm": 144.45693104976684, "language_loss": 0.8541857, "learning_rate": 3.9570637786587895e-06, "loss": 0.87432915, "num_input_tokens_seen": 33608810, "router_z_loss_clip": 3.61914062, "router_z_loss_mlp": 0.41772461, "step": 1565, "time_per_iteration": 2.7218289375305176 }, { "auxiliary_loss_clip": 0.01488816, "auxiliary_loss_mlp": 0.00479026, "balance_loss_clip": 1.12428498, "balance_loss_mlp": 0.43708813, "epoch": 0.09415301367803998, "flos": 20078446003200.0, "grad_norm": 31.19361825357456, "language_loss": 0.84114218, "learning_rate": 3.956983475266103e-06, "loss": 0.86082059, "num_input_tokens_seen": 33627265, "router_z_loss_clip": 3.64453125, "router_z_loss_mlp": 0.41943359, "step": 1566, "time_per_iteration": 2.6999051570892334 }, { "auxiliary_loss_clip": 0.0148521, "auxiliary_loss_mlp": 0.00480907, "balance_loss_clip": 1.12038589, "balance_loss_mlp": 0.4394697, "epoch": 0.09421313693070796, "flos": 21062075016960.0, "grad_norm": 303.8233521217873, "language_loss": 0.8423934, "learning_rate": 3.956903097664407e-06, "loss": 0.86205453, "num_input_tokens_seen": 33644810, "router_z_loss_clip": 3.6484375, "router_z_loss_mlp": 0.4140625, "step": 1567, "time_per_iteration": 2.6433186531066895 }, { "auxiliary_loss_clip": 0.01485648, "auxiliary_loss_mlp": 0.00511538, "balance_loss_clip": 1.12570798, "balance_loss_mlp": 0.46986234, "epoch": 0.09427326018337592, "flos": 24316156759680.0, "grad_norm": 100.51565541178083, "language_loss": 0.88190681, "learning_rate": 3.956822645856749e-06, "loss": 0.90187865, "num_input_tokens_seen": 33665665, "router_z_loss_clip": 3.6015625, "router_z_loss_mlp": 0.41699219, "step": 1568, "time_per_iteration": 2.7152044773101807 }, { "auxiliary_loss_clip": 0.0149411, "auxiliary_loss_mlp": 0.00502308, "balance_loss_clip": 1.12584877, "balance_loss_mlp": 0.45655572, "epoch": 0.09433338343604389, "flos": 20263888944000.0, "grad_norm": 20.483043568742048, "language_loss": 0.81969607, "learning_rate": 3.9567421198461814e-06, "loss": 0.83966023, "num_input_tokens_seen": 33684760, "router_z_loss_clip": 3.6875, "router_z_loss_mlp": 0.45727539, "step": 1569, "time_per_iteration": 2.6175484657287598 }, { "auxiliary_loss_clip": 0.01466198, "auxiliary_loss_mlp": 0.00465416, "balance_loss_clip": 1.11272573, "balance_loss_mlp": 0.42359719, "epoch": 0.09439350668871185, "flos": 12742973493120.0, "grad_norm": 6.487320274238727, "language_loss": 0.9258256, "learning_rate": 3.956661519635756e-06, "loss": 0.94514173, "num_input_tokens_seen": 33700750, "router_z_loss_clip": 3.53515625, "router_z_loss_mlp": 0.41845703, "step": 1570, "time_per_iteration": 2.617433786392212 }, { "auxiliary_loss_clip": 0.0147314, "auxiliary_loss_mlp": 0.00503594, "balance_loss_clip": 1.1193608, "balance_loss_mlp": 0.46020186, "epoch": 0.09445362994137983, "flos": 25962266263680.0, "grad_norm": 39.08927261475672, "language_loss": 0.8249743, "learning_rate": 3.95658084522853e-06, "loss": 0.84474164, "num_input_tokens_seen": 33724430, "router_z_loss_clip": 3.53515625, "router_z_loss_mlp": 0.43359375, "step": 1571, "time_per_iteration": 2.7181520462036133 }, { "auxiliary_loss_clip": 0.01458332, "auxiliary_loss_mlp": 0.00457139, "balance_loss_clip": 1.1175611, "balance_loss_mlp": 0.41718, "epoch": 0.0945137531940478, "flos": 19715353372800.0, "grad_norm": 5.4681301376066545, "language_loss": 0.84105802, "learning_rate": 3.956500096627561e-06, "loss": 0.86021268, "num_input_tokens_seen": 33743455, "router_z_loss_clip": 3.40429688, "router_z_loss_mlp": 0.3996582, "step": 1572, "time_per_iteration": 2.618077278137207 }, { "auxiliary_loss_clip": 0.01482239, "auxiliary_loss_mlp": 0.00513872, "balance_loss_clip": 1.1227057, "balance_loss_mlp": 0.46595007, "epoch": 0.09457387644671576, "flos": 23617047375360.0, "grad_norm": 427.41612822045556, "language_loss": 0.92771113, "learning_rate": 3.956419273835913e-06, "loss": 0.94767225, "num_input_tokens_seen": 33763435, "router_z_loss_clip": 3.59375, "router_z_loss_mlp": 0.47998047, "step": 1573, "time_per_iteration": 2.696810483932495 }, { "auxiliary_loss_clip": 0.01471767, "auxiliary_loss_mlp": 0.00448053, "balance_loss_clip": 1.12313032, "balance_loss_mlp": 0.40342149, "epoch": 0.09463399969938374, "flos": 26907291135360.0, "grad_norm": 3749.8828682398394, "language_loss": 0.87966132, "learning_rate": 3.95633837685665e-06, "loss": 0.89885956, "num_input_tokens_seen": 33784325, "router_z_loss_clip": 3.48242188, "router_z_loss_mlp": 0.4465332, "step": 1574, "time_per_iteration": 2.6723339557647705 }, { "auxiliary_loss_clip": 0.01467599, "auxiliary_loss_mlp": 0.00465044, "balance_loss_clip": 1.11964607, "balance_loss_mlp": 0.42188984, "epoch": 0.0946941229520517, "flos": 23659566099840.0, "grad_norm": 35.68293558082962, "language_loss": 0.86506128, "learning_rate": 3.95625740569284e-06, "loss": 0.88438761, "num_input_tokens_seen": 33802510, "router_z_loss_clip": 3.48046875, "router_z_loss_mlp": 0.43139648, "step": 1575, "time_per_iteration": 2.671769142150879 }, { "auxiliary_loss_clip": 0.0145967, "auxiliary_loss_mlp": 0.00444835, "balance_loss_clip": 1.1150347, "balance_loss_mlp": 0.40304002, "epoch": 0.09475424620471967, "flos": 24134053783680.0, "grad_norm": 117.55989651134243, "language_loss": 0.9370051, "learning_rate": 3.956176360347553e-06, "loss": 0.95605016, "num_input_tokens_seen": 33819980, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.41796875, "step": 1576, "time_per_iteration": 2.7179057598114014 }, { "auxiliary_loss_clip": 0.01411443, "auxiliary_loss_mlp": 0.00172979, "balance_loss_clip": 1.16829157, "balance_loss_mlp": 0.15972319, "epoch": 0.09481436945738765, "flos": 68426168065920.0, "grad_norm": 1.0014544854988967, "language_loss": 0.65732551, "learning_rate": 3.956095240823862e-06, "loss": 0.67316973, "num_input_tokens_seen": 33878925, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.1328125, "step": 1577, "time_per_iteration": 3.1825499534606934 }, { "auxiliary_loss_clip": 0.01456388, "auxiliary_loss_mlp": 0.00414046, "balance_loss_clip": 1.11309326, "balance_loss_mlp": 0.37318096, "epoch": 0.09487449271005562, "flos": 16654076858880.0, "grad_norm": 41.863794204615424, "language_loss": 0.8529796, "learning_rate": 3.956014047124844e-06, "loss": 0.87168396, "num_input_tokens_seen": 33897600, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.40893555, "step": 1578, "time_per_iteration": 2.780531167984009 }, { "auxiliary_loss_clip": 0.01466738, "auxiliary_loss_mlp": 0.00390814, "balance_loss_clip": 1.12128162, "balance_loss_mlp": 0.34684959, "epoch": 0.09493461596272358, "flos": 24275685110400.0, "grad_norm": 21.292668316234014, "language_loss": 0.82940769, "learning_rate": 3.955932779253578e-06, "loss": 0.84798312, "num_input_tokens_seen": 33917365, "router_z_loss_clip": 3.45703125, "router_z_loss_mlp": 0.43969727, "step": 1579, "time_per_iteration": 4.191045761108398 }, { "auxiliary_loss_clip": 0.01464207, "auxiliary_loss_mlp": 0.00423602, "balance_loss_clip": 1.12283206, "balance_loss_mlp": 0.37780175, "epoch": 0.09499473921539155, "flos": 21870173243520.0, "grad_norm": 6.908559414710728, "language_loss": 0.79794967, "learning_rate": 3.955851437213144e-06, "loss": 0.81682771, "num_input_tokens_seen": 33936680, "router_z_loss_clip": 3.41210938, "router_z_loss_mlp": 0.45776367, "step": 1580, "time_per_iteration": 4.097251892089844 }, { "auxiliary_loss_clip": 0.01447281, "auxiliary_loss_mlp": 0.00380162, "balance_loss_clip": 1.11353004, "balance_loss_mlp": 0.3392016, "epoch": 0.09505486246805953, "flos": 33547137880320.0, "grad_norm": 7.320494110706318, "language_loss": 0.83431184, "learning_rate": 3.955770021006627e-06, "loss": 0.85258627, "num_input_tokens_seen": 33960685, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.40991211, "step": 1581, "time_per_iteration": 2.8278415203094482 }, { "auxiliary_loss_clip": 0.01459828, "auxiliary_loss_mlp": 0.00396564, "balance_loss_clip": 1.11631131, "balance_loss_mlp": 0.35484061, "epoch": 0.09511498572072749, "flos": 21215342350080.0, "grad_norm": 5.498188881887209, "language_loss": 0.94164562, "learning_rate": 3.955688530637116e-06, "loss": 0.96020955, "num_input_tokens_seen": 33980015, "router_z_loss_clip": 3.4296875, "router_z_loss_mlp": 0.41748047, "step": 1582, "time_per_iteration": 2.626692056655884 }, { "auxiliary_loss_clip": 0.01465287, "auxiliary_loss_mlp": 0.00400317, "balance_loss_clip": 1.11928856, "balance_loss_mlp": 0.35706824, "epoch": 0.09517510897339546, "flos": 14611262572800.0, "grad_norm": 13.814871522144895, "language_loss": 0.75378948, "learning_rate": 3.955606966107699e-06, "loss": 0.77244556, "num_input_tokens_seen": 33997705, "router_z_loss_clip": 3.46289062, "router_z_loss_mlp": 0.43261719, "step": 1583, "time_per_iteration": 4.055368423461914 }, { "auxiliary_loss_clip": 0.01463242, "auxiliary_loss_mlp": 0.00396962, "balance_loss_clip": 1.12047172, "balance_loss_mlp": 0.35488129, "epoch": 0.09523523222606343, "flos": 27817339138560.0, "grad_norm": 23.22407884464331, "language_loss": 0.78388214, "learning_rate": 3.95552532742147e-06, "loss": 0.80248421, "num_input_tokens_seen": 34017465, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.42089844, "step": 1584, "time_per_iteration": 2.697338581085205 }, { "auxiliary_loss_clip": 0.01452479, "auxiliary_loss_mlp": 0.00339452, "balance_loss_clip": 1.11809778, "balance_loss_mlp": 0.30058995, "epoch": 0.0952953554787314, "flos": 20706272847360.0, "grad_norm": 15.914812722230907, "language_loss": 0.86134779, "learning_rate": 3.955443614581525e-06, "loss": 0.8792671, "num_input_tokens_seen": 34038550, "router_z_loss_clip": 3.34179688, "router_z_loss_mlp": 0.38842773, "step": 1585, "time_per_iteration": 2.69438099861145 }, { "auxiliary_loss_clip": 0.01482448, "auxiliary_loss_mlp": 0.00378674, "balance_loss_clip": 1.12254703, "balance_loss_mlp": 0.33406585, "epoch": 0.09535547873139937, "flos": 24787627701120.0, "grad_norm": 109.99262021094316, "language_loss": 0.7914027, "learning_rate": 3.955361827590961e-06, "loss": 0.81001401, "num_input_tokens_seen": 34058665, "router_z_loss_clip": 3.59765625, "router_z_loss_mlp": 0.44604492, "step": 1586, "time_per_iteration": 4.076338529586792 }, { "auxiliary_loss_clip": 0.01409773, "auxiliary_loss_mlp": 0.00279769, "balance_loss_clip": 1.18431008, "balance_loss_mlp": 0.26431933, "epoch": 0.09541560198406734, "flos": 71912194905600.0, "grad_norm": 0.9105395878185389, "language_loss": 0.54989076, "learning_rate": 3.955279966452883e-06, "loss": 0.56678617, "num_input_tokens_seen": 34109655, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.15429688, "step": 1587, "time_per_iteration": 2.9562792778015137 }, { "auxiliary_loss_clip": 0.01460014, "auxiliary_loss_mlp": 0.00359325, "balance_loss_clip": 1.11448169, "balance_loss_mlp": 0.31631458, "epoch": 0.09547572523673531, "flos": 28982604251520.0, "grad_norm": 10.426169845069333, "language_loss": 0.86589348, "learning_rate": 3.955198031170391e-06, "loss": 0.88408691, "num_input_tokens_seen": 34131115, "router_z_loss_clip": 3.45703125, "router_z_loss_mlp": 0.43017578, "step": 1588, "time_per_iteration": 2.7309257984161377 }, { "auxiliary_loss_clip": 0.014602, "auxiliary_loss_mlp": 0.00339123, "balance_loss_clip": 1.11950755, "balance_loss_mlp": 0.29954541, "epoch": 0.09553584848940327, "flos": 24133910129280.0, "grad_norm": 15.35885723515526, "language_loss": 0.86943209, "learning_rate": 3.955116021746594e-06, "loss": 0.8874253, "num_input_tokens_seen": 34151925, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.39599609, "step": 1589, "time_per_iteration": 2.7566306591033936 }, { "auxiliary_loss_clip": 0.01464313, "auxiliary_loss_mlp": 0.00370477, "balance_loss_clip": 1.11987281, "balance_loss_mlp": 0.32803842, "epoch": 0.09559597174207124, "flos": 42851376789120.0, "grad_norm": 20.92898834993177, "language_loss": 0.70740473, "learning_rate": 3.955033938184601e-06, "loss": 0.72575271, "num_input_tokens_seen": 34175395, "router_z_loss_clip": 3.44140625, "router_z_loss_mlp": 0.42456055, "step": 1590, "time_per_iteration": 2.8768157958984375 }, { "auxiliary_loss_clip": 0.01465584, "auxiliary_loss_mlp": 0.00383055, "balance_loss_clip": 1.12031651, "balance_loss_mlp": 0.34195149, "epoch": 0.09565609499473922, "flos": 32670845683200.0, "grad_norm": 2.0550934775046947, "language_loss": 0.88488436, "learning_rate": 3.954951780487526e-06, "loss": 0.90337068, "num_input_tokens_seen": 34197760, "router_z_loss_clip": 3.44921875, "router_z_loss_mlp": 0.41064453, "step": 1591, "time_per_iteration": 2.800288677215576 }, { "auxiliary_loss_clip": 0.01485254, "auxiliary_loss_mlp": 0.00374049, "balance_loss_clip": 1.12966633, "balance_loss_mlp": 0.32829696, "epoch": 0.09571621824740718, "flos": 18478410670080.0, "grad_norm": 34.44222560297388, "language_loss": 0.859339, "learning_rate": 3.9548695486584835e-06, "loss": 0.87793207, "num_input_tokens_seen": 34215330, "router_z_loss_clip": 3.55273438, "router_z_loss_mlp": 0.45776367, "step": 1592, "time_per_iteration": 2.6514580249786377 }, { "auxiliary_loss_clip": 0.01471372, "auxiliary_loss_mlp": 0.00394593, "balance_loss_clip": 1.12136495, "balance_loss_mlp": 0.35122448, "epoch": 0.09577634150007515, "flos": 29387497334400.0, "grad_norm": 11.410260545778037, "language_loss": 0.80087972, "learning_rate": 3.954787242700592e-06, "loss": 0.81953937, "num_input_tokens_seen": 34237745, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.43359375, "step": 1593, "time_per_iteration": 2.7255873680114746 }, { "auxiliary_loss_clip": 0.01476229, "auxiliary_loss_mlp": 0.00377183, "balance_loss_clip": 1.1287142, "balance_loss_mlp": 0.33433884, "epoch": 0.09583646475274313, "flos": 22747830157440.0, "grad_norm": 10.648045066655213, "language_loss": 0.76151621, "learning_rate": 3.954704862616971e-06, "loss": 0.78005028, "num_input_tokens_seen": 34256565, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.4284668, "step": 1594, "time_per_iteration": 2.6383283138275146 }, { "auxiliary_loss_clip": 0.01470646, "auxiliary_loss_mlp": 0.00354301, "balance_loss_clip": 1.12303591, "balance_loss_mlp": 0.31279218, "epoch": 0.0958965880054111, "flos": 23218367345280.0, "grad_norm": 28.02014346170689, "language_loss": 0.89696717, "learning_rate": 3.954622408410747e-06, "loss": 0.91521668, "num_input_tokens_seen": 34275970, "router_z_loss_clip": 3.4765625, "router_z_loss_mlp": 0.4152832, "step": 1595, "time_per_iteration": 2.680110454559326 }, { "auxiliary_loss_clip": 0.01454898, "auxiliary_loss_mlp": 0.00371408, "balance_loss_clip": 1.11202383, "balance_loss_mlp": 0.32849285, "epoch": 0.09595671125807906, "flos": 21324438933120.0, "grad_norm": 9.600881169712252, "language_loss": 0.93615103, "learning_rate": 3.954539880085045e-06, "loss": 0.95441407, "num_input_tokens_seen": 34295490, "router_z_loss_clip": 3.42773438, "router_z_loss_mlp": 0.42895508, "step": 1596, "time_per_iteration": 2.6818222999572754 }, { "auxiliary_loss_clip": 0.01478502, "auxiliary_loss_mlp": 0.00349746, "balance_loss_clip": 1.13161671, "balance_loss_mlp": 0.31150377, "epoch": 0.09601683451074704, "flos": 39603472185600.0, "grad_norm": 11.433726438916572, "language_loss": 0.74922872, "learning_rate": 3.9544572776429945e-06, "loss": 0.76751119, "num_input_tokens_seen": 34319990, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.38232422, "step": 1597, "time_per_iteration": 2.805889368057251 }, { "auxiliary_loss_clip": 0.01464834, "auxiliary_loss_mlp": 0.00371706, "balance_loss_clip": 1.11700821, "balance_loss_mlp": 0.33229494, "epoch": 0.096076957763415, "flos": 23732716147200.0, "grad_norm": 8.125810718576458, "language_loss": 0.84064847, "learning_rate": 3.954374601087729e-06, "loss": 0.85901386, "num_input_tokens_seen": 34339225, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.39428711, "step": 1598, "time_per_iteration": 2.6935274600982666 }, { "auxiliary_loss_clip": 0.01477652, "auxiliary_loss_mlp": 0.00432739, "balance_loss_clip": 1.12353015, "balance_loss_mlp": 0.39030051, "epoch": 0.09613708101608297, "flos": 34678108483200.0, "grad_norm": 11.647002242074175, "language_loss": 0.74882346, "learning_rate": 3.954291850422382e-06, "loss": 0.76792741, "num_input_tokens_seen": 34361020, "router_z_loss_clip": 3.5390625, "router_z_loss_mlp": 0.42431641, "step": 1599, "time_per_iteration": 2.756392240524292 }, { "auxiliary_loss_clip": 0.01454713, "auxiliary_loss_mlp": 0.00359831, "balance_loss_clip": 1.12457955, "balance_loss_mlp": 0.32044381, "epoch": 0.09619720426875093, "flos": 20740028653440.0, "grad_norm": 30.18276578540757, "language_loss": 0.89875239, "learning_rate": 3.954209025650093e-06, "loss": 0.91689777, "num_input_tokens_seen": 34378630, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.39355469, "step": 1600, "time_per_iteration": 2.6578125953674316 }, { "auxiliary_loss_clip": 0.01432234, "auxiliary_loss_mlp": 0.00368497, "balance_loss_clip": 1.09863353, "balance_loss_mlp": 0.32806119, "epoch": 0.09625732752141891, "flos": 13042720488960.0, "grad_norm": 238.33126297537575, "language_loss": 0.88368344, "learning_rate": 3.954126126774001e-06, "loss": 0.90169072, "num_input_tokens_seen": 34397110, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.40454102, "step": 1601, "time_per_iteration": 2.715336799621582 }, { "auxiliary_loss_clip": 0.01437705, "auxiliary_loss_mlp": 0.0040907, "balance_loss_clip": 1.09881997, "balance_loss_mlp": 0.36510539, "epoch": 0.09631745077408688, "flos": 22273629782400.0, "grad_norm": 330.1765393079444, "language_loss": 0.88512212, "learning_rate": 3.954043153797251e-06, "loss": 0.90358984, "num_input_tokens_seen": 34414165, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.43920898, "step": 1602, "time_per_iteration": 2.699397087097168 }, { "auxiliary_loss_clip": 0.01404657, "auxiliary_loss_mlp": 0.00363434, "balance_loss_clip": 1.08432114, "balance_loss_mlp": 0.32342711, "epoch": 0.09637757402675484, "flos": 24754266944640.0, "grad_norm": 58.49719159649841, "language_loss": 0.72358978, "learning_rate": 3.953960106722989e-06, "loss": 0.74127078, "num_input_tokens_seen": 34434445, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.40014648, "step": 1603, "time_per_iteration": 2.689565896987915 }, { "auxiliary_loss_clip": 0.01414849, "auxiliary_loss_mlp": 0.00388815, "balance_loss_clip": 1.08947968, "balance_loss_mlp": 0.34830758, "epoch": 0.09643769727942282, "flos": 22525758322560.0, "grad_norm": 13.61654815514293, "language_loss": 0.82450479, "learning_rate": 3.953876985554364e-06, "loss": 0.8425414, "num_input_tokens_seen": 34453095, "router_z_loss_clip": 3.2578125, "router_z_loss_mlp": 0.4050293, "step": 1604, "time_per_iteration": 2.721919059753418 }, { "auxiliary_loss_clip": 0.01396028, "auxiliary_loss_mlp": 0.00375856, "balance_loss_clip": 1.08172405, "balance_loss_mlp": 0.3398551, "epoch": 0.09649782053209079, "flos": 30921026636160.0, "grad_norm": 26.34210016506659, "language_loss": 0.84968126, "learning_rate": 3.953793790294527e-06, "loss": 0.86740005, "num_input_tokens_seen": 34473680, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.36035156, "step": 1605, "time_per_iteration": 2.7258007526397705 }, { "auxiliary_loss_clip": 0.01405068, "auxiliary_loss_mlp": 0.00399818, "balance_loss_clip": 1.07626534, "balance_loss_mlp": 0.36076462, "epoch": 0.09655794378475875, "flos": 25337635729920.0, "grad_norm": 7.224346524094109, "language_loss": 0.83657813, "learning_rate": 3.953710520946634e-06, "loss": 0.85462701, "num_input_tokens_seen": 34492610, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.39038086, "step": 1606, "time_per_iteration": 2.7228784561157227 }, { "auxiliary_loss_clip": 0.01381636, "auxiliary_loss_mlp": 0.00376243, "balance_loss_clip": 1.06844008, "balance_loss_mlp": 0.34083745, "epoch": 0.09661806703742673, "flos": 22346061557760.0, "grad_norm": 187.0862912371859, "language_loss": 0.82656562, "learning_rate": 3.953627177513843e-06, "loss": 0.84414434, "num_input_tokens_seen": 34511855, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.35400391, "step": 1607, "time_per_iteration": 2.6270315647125244 }, { "auxiliary_loss_clip": 0.01385431, "auxiliary_loss_mlp": 0.00353894, "balance_loss_clip": 1.06754148, "balance_loss_mlp": 0.31686792, "epoch": 0.0966781902900947, "flos": 17457578144640.0, "grad_norm": 360.39419267827736, "language_loss": 0.94055772, "learning_rate": 3.953543759999312e-06, "loss": 0.95795095, "num_input_tokens_seen": 34528905, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.37036133, "step": 1608, "time_per_iteration": 2.611989736557007 }, { "auxiliary_loss_clip": 0.01388022, "auxiliary_loss_mlp": 0.00394951, "balance_loss_clip": 1.07045162, "balance_loss_mlp": 0.35534942, "epoch": 0.09673831354276266, "flos": 36903995412480.0, "grad_norm": 51.08193485225469, "language_loss": 0.78378534, "learning_rate": 3.953460268406207e-06, "loss": 0.80161512, "num_input_tokens_seen": 34548480, "router_z_loss_clip": 3.17773438, "router_z_loss_mlp": 0.39624023, "step": 1609, "time_per_iteration": 2.739863157272339 }, { "auxiliary_loss_clip": 0.01375351, "auxiliary_loss_mlp": 0.00363272, "balance_loss_clip": 1.0613271, "balance_loss_mlp": 0.32760417, "epoch": 0.09679843679543064, "flos": 20701388597760.0, "grad_norm": 10.563222755965997, "language_loss": 0.92151344, "learning_rate": 3.953376702737693e-06, "loss": 0.93889964, "num_input_tokens_seen": 34565410, "router_z_loss_clip": 3.13867188, "router_z_loss_mlp": 0.35693359, "step": 1610, "time_per_iteration": 2.644197940826416 }, { "auxiliary_loss_clip": 0.0138217, "auxiliary_loss_mlp": 0.00350981, "balance_loss_clip": 1.07012129, "balance_loss_mlp": 0.31483662, "epoch": 0.0968585600480986, "flos": 23514415240320.0, "grad_norm": 39.89982408706452, "language_loss": 0.73902893, "learning_rate": 3.953293062996939e-06, "loss": 0.75636041, "num_input_tokens_seen": 34584840, "router_z_loss_clip": 3.12109375, "router_z_loss_mlp": 0.36132812, "step": 1611, "time_per_iteration": 2.6438217163085938 }, { "auxiliary_loss_clip": 0.01375535, "auxiliary_loss_mlp": 0.0037727, "balance_loss_clip": 1.06224203, "balance_loss_mlp": 0.33869368, "epoch": 0.09691868330076657, "flos": 20121072468480.0, "grad_norm": 10.446907198892651, "language_loss": 0.87943256, "learning_rate": 3.953209349187115e-06, "loss": 0.89696062, "num_input_tokens_seen": 34603360, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.38574219, "step": 1612, "time_per_iteration": 2.6622846126556396 }, { "auxiliary_loss_clip": 0.01378831, "auxiliary_loss_mlp": 0.00355607, "balance_loss_clip": 1.07094121, "balance_loss_mlp": 0.32001123, "epoch": 0.09697880655343454, "flos": 16544692967040.0, "grad_norm": 15.982028498510683, "language_loss": 0.88025498, "learning_rate": 3.953125561311398e-06, "loss": 0.89759934, "num_input_tokens_seen": 34620760, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.35620117, "step": 1613, "time_per_iteration": 2.595209836959839 }, { "auxiliary_loss_clip": 0.0137308, "auxiliary_loss_mlp": 0.00349218, "balance_loss_clip": 1.07069135, "balance_loss_mlp": 0.31152433, "epoch": 0.09703892980610251, "flos": 26104184899200.0, "grad_norm": 11.895974929224586, "language_loss": 0.91014051, "learning_rate": 3.953041699372964e-06, "loss": 0.92736346, "num_input_tokens_seen": 34640695, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.37670898, "step": 1614, "time_per_iteration": 2.6693644523620605 }, { "auxiliary_loss_clip": 0.01294948, "auxiliary_loss_mlp": 0.00129416, "balance_loss_clip": 1.07847786, "balance_loss_mlp": 0.11616013, "epoch": 0.09709905305877048, "flos": 60443622000000.0, "grad_norm": 2.2876741084324825, "language_loss": 0.54809356, "learning_rate": 3.952957763374992e-06, "loss": 0.56233716, "num_input_tokens_seen": 34702395, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.1328125, "step": 1615, "time_per_iteration": 3.119640350341797 }, { "auxiliary_loss_clip": 0.0130366, "auxiliary_loss_mlp": 0.0013799, "balance_loss_clip": 1.08367229, "balance_loss_mlp": 0.12602146, "epoch": 0.09715917631143844, "flos": 57639932893440.0, "grad_norm": 5.855854192864963, "language_loss": 0.58044302, "learning_rate": 3.952873753320666e-06, "loss": 0.5948596, "num_input_tokens_seen": 34768910, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.11962891, "step": 1616, "time_per_iteration": 3.2822763919830322 }, { "auxiliary_loss_clip": 0.01366331, "auxiliary_loss_mlp": 0.00322466, "balance_loss_clip": 1.06608224, "balance_loss_mlp": 0.28713194, "epoch": 0.09721929956410642, "flos": 20558212986240.0, "grad_norm": 4.957679542592365, "language_loss": 0.74371392, "learning_rate": 3.952789669213172e-06, "loss": 0.76060188, "num_input_tokens_seen": 34787680, "router_z_loss_clip": 3.00195312, "router_z_loss_mlp": 0.35302734, "step": 1617, "time_per_iteration": 2.674182653427124 }, { "auxiliary_loss_clip": 0.0137517, "auxiliary_loss_mlp": 0.00344133, "balance_loss_clip": 1.06713128, "balance_loss_mlp": 0.30686823, "epoch": 0.09727942281677439, "flos": 27344359825920.0, "grad_norm": 55.12159380015707, "language_loss": 0.86654687, "learning_rate": 3.952705511055698e-06, "loss": 0.88373989, "num_input_tokens_seen": 34808330, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.37255859, "step": 1618, "time_per_iteration": 2.7114498615264893 }, { "auxiliary_loss_clip": 0.01364984, "auxiliary_loss_mlp": 0.00305408, "balance_loss_clip": 1.06822395, "balance_loss_mlp": 0.27310205, "epoch": 0.09733954606944235, "flos": 24900028335360.0, "grad_norm": 30.072303136014003, "language_loss": 0.97694063, "learning_rate": 3.952621278851435e-06, "loss": 0.99364454, "num_input_tokens_seen": 34830020, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.32324219, "step": 1619, "time_per_iteration": 2.683267593383789 }, { "auxiliary_loss_clip": 0.01368553, "auxiliary_loss_mlp": 0.00304764, "balance_loss_clip": 1.06991768, "balance_loss_mlp": 0.27098033, "epoch": 0.09739966932211033, "flos": 31503928544640.0, "grad_norm": 5.0350542866414365, "language_loss": 0.94946963, "learning_rate": 3.9525369726035784e-06, "loss": 0.96620274, "num_input_tokens_seen": 34850330, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.33764648, "step": 1620, "time_per_iteration": 2.731321096420288 }, { "auxiliary_loss_clip": 0.01381503, "auxiliary_loss_mlp": 0.00333597, "balance_loss_clip": 1.0778439, "balance_loss_mlp": 0.29730994, "epoch": 0.0974597925747783, "flos": 23878764846720.0, "grad_norm": 26.31097156913841, "language_loss": 0.84652954, "learning_rate": 3.952452592315324e-06, "loss": 0.86368048, "num_input_tokens_seen": 34871640, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.36279297, "step": 1621, "time_per_iteration": 4.102027177810669 }, { "auxiliary_loss_clip": 0.01382471, "auxiliary_loss_mlp": 0.00314216, "balance_loss_clip": 1.07660162, "balance_loss_mlp": 0.27633166, "epoch": 0.09751991582744626, "flos": 17019575700480.0, "grad_norm": 11.276804478880166, "language_loss": 0.82661992, "learning_rate": 3.952368137989871e-06, "loss": 0.8435868, "num_input_tokens_seen": 34888100, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.37890625, "step": 1622, "time_per_iteration": 2.6767466068267822 }, { "auxiliary_loss_clip": 0.01394598, "auxiliary_loss_mlp": 0.00319706, "balance_loss_clip": 1.08397174, "balance_loss_mlp": 0.28198802, "epoch": 0.09758003908011423, "flos": 28402826826240.0, "grad_norm": 17.2295521800851, "language_loss": 0.90989459, "learning_rate": 3.9522836096304225e-06, "loss": 0.9270376, "num_input_tokens_seen": 34910485, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.37744141, "step": 1623, "time_per_iteration": 4.288774251937866 }, { "auxiliary_loss_clip": 0.0140463, "auxiliary_loss_mlp": 0.00326162, "balance_loss_clip": 1.09507561, "balance_loss_mlp": 0.29182965, "epoch": 0.09764016233278221, "flos": 18144297336960.0, "grad_norm": 37.17631508535162, "language_loss": 0.89013153, "learning_rate": 3.952199007240184e-06, "loss": 0.90743947, "num_input_tokens_seen": 34928615, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.34326172, "step": 1624, "time_per_iteration": 2.604285478591919 }, { "auxiliary_loss_clip": 0.01400579, "auxiliary_loss_mlp": 0.00311985, "balance_loss_clip": 1.09476292, "balance_loss_mlp": 0.27724695, "epoch": 0.09770028558545017, "flos": 15265842071040.0, "grad_norm": 51.66483367253249, "language_loss": 0.93351501, "learning_rate": 3.952114330822364e-06, "loss": 0.95064062, "num_input_tokens_seen": 34946045, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.34716797, "step": 1625, "time_per_iteration": 2.6437041759490967 }, { "auxiliary_loss_clip": 0.01428175, "auxiliary_loss_mlp": 0.0032931, "balance_loss_clip": 1.11567664, "balance_loss_mlp": 0.28997111, "epoch": 0.09776040883811814, "flos": 23472435219840.0, "grad_norm": 10.913433525491849, "language_loss": 0.93571645, "learning_rate": 3.952029580380172e-06, "loss": 0.9532913, "num_input_tokens_seen": 34962865, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.39331055, "step": 1626, "time_per_iteration": 4.204698801040649 }, { "auxiliary_loss_clip": 0.01441592, "auxiliary_loss_mlp": 0.00308147, "balance_loss_clip": 1.12127328, "balance_loss_mlp": 0.26938018, "epoch": 0.09782053209078612, "flos": 24499480798080.0, "grad_norm": 56.141400213673954, "language_loss": 0.89405572, "learning_rate": 3.9519447559168234e-06, "loss": 0.91155308, "num_input_tokens_seen": 34983505, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.38769531, "step": 1627, "time_per_iteration": 2.7062103748321533 }, { "auxiliary_loss_clip": 0.01451777, "auxiliary_loss_mlp": 0.0032704, "balance_loss_clip": 1.13677704, "balance_loss_mlp": 0.2882497, "epoch": 0.09788065534345408, "flos": 21580158833280.0, "grad_norm": 188.67721871091837, "language_loss": 0.90784442, "learning_rate": 3.951859857435534e-06, "loss": 0.9256326, "num_input_tokens_seen": 35001825, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.38793945, "step": 1628, "time_per_iteration": 4.093711614608765 }, { "auxiliary_loss_clip": 0.01448772, "auxiliary_loss_mlp": 0.00312192, "balance_loss_clip": 1.13319123, "balance_loss_mlp": 0.27635786, "epoch": 0.09794077859612205, "flos": 23842459175040.0, "grad_norm": 390.511768140594, "language_loss": 0.81458747, "learning_rate": 3.951774884939523e-06, "loss": 0.83219713, "num_input_tokens_seen": 35023075, "router_z_loss_clip": 3.15429688, "router_z_loss_mlp": 0.35864258, "step": 1629, "time_per_iteration": 2.682356357574463 }, { "auxiliary_loss_clip": 0.01462408, "auxiliary_loss_mlp": 0.00296461, "balance_loss_clip": 1.1371609, "balance_loss_mlp": 0.25652623, "epoch": 0.09800090184879003, "flos": 23659889322240.0, "grad_norm": 304.26986508420475, "language_loss": 0.84501708, "learning_rate": 3.951689838432013e-06, "loss": 0.86260575, "num_input_tokens_seen": 35043480, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.39941406, "step": 1630, "time_per_iteration": 2.676539659500122 }, { "auxiliary_loss_clip": 0.01482923, "auxiliary_loss_mlp": 0.0033944, "balance_loss_clip": 1.14890969, "balance_loss_mlp": 0.30055392, "epoch": 0.09806102510145799, "flos": 17055773631360.0, "grad_norm": 67.37776359726827, "language_loss": 0.93010575, "learning_rate": 3.951604717916228e-06, "loss": 0.94832933, "num_input_tokens_seen": 35061490, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.38916016, "step": 1631, "time_per_iteration": 2.669018507003784 }, { "auxiliary_loss_clip": 0.01456454, "auxiliary_loss_mlp": 0.00302459, "balance_loss_clip": 1.13027453, "balance_loss_mlp": 0.26660094, "epoch": 0.09812114835412596, "flos": 23878477537920.0, "grad_norm": 5.085698129476178, "language_loss": 0.88871658, "learning_rate": 3.9515195233953975e-06, "loss": 0.90630567, "num_input_tokens_seen": 35079670, "router_z_loss_clip": 3.25976562, "router_z_loss_mlp": 0.35864258, "step": 1632, "time_per_iteration": 2.668764591217041 }, { "auxiliary_loss_clip": 0.01447343, "auxiliary_loss_mlp": 0.00303951, "balance_loss_clip": 1.12104309, "balance_loss_mlp": 0.26697278, "epoch": 0.09818127160679392, "flos": 20595488325120.0, "grad_norm": 5.160438317551523, "language_loss": 0.83777547, "learning_rate": 3.951434254872751e-06, "loss": 0.85528845, "num_input_tokens_seen": 35099205, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.36987305, "step": 1633, "time_per_iteration": 2.747514247894287 }, { "auxiliary_loss_clip": 0.01445719, "auxiliary_loss_mlp": 0.00336213, "balance_loss_clip": 1.11935377, "balance_loss_mlp": 0.29513341, "epoch": 0.0982413948594619, "flos": 15487339288320.0, "grad_norm": 12.407250287034092, "language_loss": 0.80728757, "learning_rate": 3.951348912351521e-06, "loss": 0.82510686, "num_input_tokens_seen": 35115270, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.41088867, "step": 1634, "time_per_iteration": 2.598177909851074 }, { "auxiliary_loss_clip": 0.0144646, "auxiliary_loss_mlp": 0.00291074, "balance_loss_clip": 1.11214113, "balance_loss_mlp": 0.25135383, "epoch": 0.09830151811212987, "flos": 24207958016640.0, "grad_norm": 5.187586179991366, "language_loss": 0.85152233, "learning_rate": 3.951263495834947e-06, "loss": 0.86889756, "num_input_tokens_seen": 35134065, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.3972168, "step": 1635, "time_per_iteration": 2.729297399520874 }, { "auxiliary_loss_clip": 0.01465532, "auxiliary_loss_mlp": 0.00293758, "balance_loss_clip": 1.13040185, "balance_loss_mlp": 0.24857731, "epoch": 0.09836164136479783, "flos": 20594590485120.0, "grad_norm": 22.188453190597773, "language_loss": 0.831532, "learning_rate": 3.951178005326264e-06, "loss": 0.84912479, "num_input_tokens_seen": 35154870, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.45166016, "step": 1636, "time_per_iteration": 2.774968147277832 }, { "auxiliary_loss_clip": 0.01461645, "auxiliary_loss_mlp": 0.00320959, "balance_loss_clip": 1.12905419, "balance_loss_mlp": 0.27692294, "epoch": 0.09842176461746581, "flos": 19934157070080.0, "grad_norm": 4.79021418135691, "language_loss": 0.77162182, "learning_rate": 3.951092440828715e-06, "loss": 0.78944778, "num_input_tokens_seen": 35171850, "router_z_loss_clip": 3.328125, "router_z_loss_mlp": 0.43994141, "step": 1637, "time_per_iteration": 2.685391426086426 }, { "auxiliary_loss_clip": 0.01447096, "auxiliary_loss_mlp": 0.00292509, "balance_loss_clip": 1.11828542, "balance_loss_mlp": 0.25429046, "epoch": 0.09848188787013377, "flos": 21214659991680.0, "grad_norm": 6.175645093288549, "language_loss": 0.84843856, "learning_rate": 3.951006802345545e-06, "loss": 0.86583465, "num_input_tokens_seen": 35188795, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.38208008, "step": 1638, "time_per_iteration": 2.6604514122009277 }, { "auxiliary_loss_clip": 0.01435713, "auxiliary_loss_mlp": 0.00299368, "balance_loss_clip": 1.10743308, "balance_loss_mlp": 0.26391506, "epoch": 0.09854201112280174, "flos": 30154226071680.0, "grad_norm": 6.420272583261749, "language_loss": 0.78366297, "learning_rate": 3.950921089880003e-06, "loss": 0.80101383, "num_input_tokens_seen": 35212100, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.35449219, "step": 1639, "time_per_iteration": 2.8032875061035156 }, { "auxiliary_loss_clip": 0.01445628, "auxiliary_loss_mlp": 0.00292475, "balance_loss_clip": 1.11155391, "balance_loss_mlp": 0.25318322, "epoch": 0.09860213437546972, "flos": 21795730306560.0, "grad_norm": 26.462372245390814, "language_loss": 0.95435041, "learning_rate": 3.950835303435337e-06, "loss": 0.97173142, "num_input_tokens_seen": 35230390, "router_z_loss_clip": 3.34179688, "router_z_loss_mlp": 0.39306641, "step": 1640, "time_per_iteration": 2.7309720516204834 }, { "auxiliary_loss_clip": 0.01427115, "auxiliary_loss_mlp": 0.00284209, "balance_loss_clip": 1.09873629, "balance_loss_mlp": 0.24413131, "epoch": 0.09866225762813768, "flos": 21835555511040.0, "grad_norm": 882.1241222844299, "language_loss": 0.89556414, "learning_rate": 3.950749443014801e-06, "loss": 0.91267741, "num_input_tokens_seen": 35250405, "router_z_loss_clip": 3.28515625, "router_z_loss_mlp": 0.40063477, "step": 1641, "time_per_iteration": 2.6903560161590576 }, { "auxiliary_loss_clip": 0.014426, "auxiliary_loss_mlp": 0.00311693, "balance_loss_clip": 1.11048245, "balance_loss_mlp": 0.26961216, "epoch": 0.09872238088080565, "flos": 17599855916160.0, "grad_norm": 12.390994715658211, "language_loss": 0.96133423, "learning_rate": 3.95066350862165e-06, "loss": 0.97887719, "num_input_tokens_seen": 35262820, "router_z_loss_clip": 3.32226562, "router_z_loss_mlp": 0.42089844, "step": 1642, "time_per_iteration": 2.650209903717041 }, { "auxiliary_loss_clip": 0.01417788, "auxiliary_loss_mlp": 0.00272326, "balance_loss_clip": 1.09144521, "balance_loss_mlp": 0.23248588, "epoch": 0.09878250413347361, "flos": 27636134002560.0, "grad_norm": 11.404019135932062, "language_loss": 0.85627437, "learning_rate": 3.950577500259144e-06, "loss": 0.8731755, "num_input_tokens_seen": 35284490, "router_z_loss_clip": 3.25976562, "router_z_loss_mlp": 0.39868164, "step": 1643, "time_per_iteration": 2.7236762046813965 }, { "auxiliary_loss_clip": 0.0143033, "auxiliary_loss_mlp": 0.00278208, "balance_loss_clip": 1.10411024, "balance_loss_mlp": 0.23836799, "epoch": 0.0988426273861416, "flos": 16544728880640.0, "grad_norm": 2.2323141187370847, "language_loss": 0.90144938, "learning_rate": 3.950491417930543e-06, "loss": 0.91853476, "num_input_tokens_seen": 35302815, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.39794922, "step": 1644, "time_per_iteration": 2.7793643474578857 }, { "auxiliary_loss_clip": 0.01405208, "auxiliary_loss_mlp": 0.00243818, "balance_loss_clip": 1.08328223, "balance_loss_mlp": 0.20431212, "epoch": 0.09890275063880956, "flos": 21215270522880.0, "grad_norm": 154.41265567661176, "language_loss": 0.76154464, "learning_rate": 3.9504052616391124e-06, "loss": 0.77803487, "num_input_tokens_seen": 35321175, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.39501953, "step": 1645, "time_per_iteration": 2.614414691925049 }, { "auxiliary_loss_clip": 0.01417552, "auxiliary_loss_mlp": 0.00101531, "balance_loss_clip": 1.15992928, "balance_loss_mlp": 0.09065877, "epoch": 0.09896287389147752, "flos": 59379372910080.0, "grad_norm": 0.9210233144219142, "language_loss": 0.60677552, "learning_rate": 3.950319031388119e-06, "loss": 0.62196636, "num_input_tokens_seen": 35381740, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.10888672, "step": 1646, "time_per_iteration": 3.06249737739563 }, { "auxiliary_loss_clip": 0.01414766, "auxiliary_loss_mlp": 0.00280914, "balance_loss_clip": 1.08509278, "balance_loss_mlp": 0.23930976, "epoch": 0.0990229971441455, "flos": 29642678530560.0, "grad_norm": 437.3844928219182, "language_loss": 0.80162436, "learning_rate": 3.950232727180833e-06, "loss": 0.8185811, "num_input_tokens_seen": 35403760, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.41625977, "step": 1647, "time_per_iteration": 2.755734443664551 }, { "auxiliary_loss_clip": 0.01417598, "auxiliary_loss_mlp": 0.00268623, "balance_loss_clip": 1.09185374, "balance_loss_mlp": 0.23176306, "epoch": 0.09908312039681347, "flos": 21834873152640.0, "grad_norm": 5.233240041146046, "language_loss": 0.91865826, "learning_rate": 3.950146349020525e-06, "loss": 0.93552041, "num_input_tokens_seen": 35424050, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.36889648, "step": 1648, "time_per_iteration": 2.6644535064697266 }, { "auxiliary_loss_clip": 0.01383242, "auxiliary_loss_mlp": 0.00116059, "balance_loss_clip": 1.14814699, "balance_loss_mlp": 0.10609303, "epoch": 0.09914324364948143, "flos": 57564304807680.0, "grad_norm": 0.7517902942788768, "language_loss": 0.55603468, "learning_rate": 3.950059896910473e-06, "loss": 0.57102764, "num_input_tokens_seen": 35481690, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.09960938, "step": 1649, "time_per_iteration": 3.0834810733795166 }, { "auxiliary_loss_clip": 0.01412644, "auxiliary_loss_mlp": 0.00279201, "balance_loss_clip": 1.09079838, "balance_loss_mlp": 0.24327101, "epoch": 0.09920336690214941, "flos": 34123934476800.0, "grad_norm": 729.2221105261593, "language_loss": 0.97997546, "learning_rate": 3.949973370853954e-06, "loss": 0.99689388, "num_input_tokens_seen": 35498635, "router_z_loss_clip": 3.21484375, "router_z_loss_mlp": 0.359375, "step": 1650, "time_per_iteration": 2.684771776199341 }, { "auxiliary_loss_clip": 0.01398386, "auxiliary_loss_mlp": 0.00121662, "balance_loss_clip": 1.17629623, "balance_loss_mlp": 0.11160024, "epoch": 0.09926349015481738, "flos": 71216428464000.0, "grad_norm": 0.7993799667827964, "language_loss": 0.63431954, "learning_rate": 3.94988677085425e-06, "loss": 0.64951998, "num_input_tokens_seen": 35565720, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.10058594, "step": 1651, "time_per_iteration": 3.2925758361816406 }, { "auxiliary_loss_clip": 0.0140087, "auxiliary_loss_mlp": 0.00262476, "balance_loss_clip": 1.08953261, "balance_loss_mlp": 0.22485337, "epoch": 0.09932361340748534, "flos": 23148700917120.0, "grad_norm": 3.756383771269253, "language_loss": 0.94219649, "learning_rate": 3.949800096914643e-06, "loss": 0.95883, "num_input_tokens_seen": 35586000, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.3762207, "step": 1652, "time_per_iteration": 2.666222095489502 }, { "auxiliary_loss_clip": 0.01425063, "auxiliary_loss_mlp": 0.00310801, "balance_loss_clip": 1.10903072, "balance_loss_mlp": 0.27069882, "epoch": 0.09938373666015332, "flos": 19828651847040.0, "grad_norm": 58.590945464554004, "language_loss": 0.86843073, "learning_rate": 3.949713349038422e-06, "loss": 0.88578939, "num_input_tokens_seen": 35604355, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.40136719, "step": 1653, "time_per_iteration": 2.698317766189575 }, { "auxiliary_loss_clip": 0.01410764, "auxiliary_loss_mlp": 0.00281301, "balance_loss_clip": 1.10168052, "balance_loss_mlp": 0.24634862, "epoch": 0.09944385991282129, "flos": 22090664880000.0, "grad_norm": 2.415372573811635, "language_loss": 0.86359215, "learning_rate": 3.949626527228875e-06, "loss": 0.88051271, "num_input_tokens_seen": 35625495, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.34960938, "step": 1654, "time_per_iteration": 2.663205862045288 }, { "auxiliary_loss_clip": 0.01393019, "auxiliary_loss_mlp": 0.00259011, "balance_loss_clip": 1.10055566, "balance_loss_mlp": 0.22672895, "epoch": 0.09950398316548925, "flos": 19828867328640.0, "grad_norm": 123.67061679825424, "language_loss": 0.86177742, "learning_rate": 3.949539631489295e-06, "loss": 0.87829775, "num_input_tokens_seen": 35645030, "router_z_loss_clip": 2.92382812, "router_z_loss_mlp": 0.32275391, "step": 1655, "time_per_iteration": 2.724569082260132 }, { "auxiliary_loss_clip": 0.01435015, "auxiliary_loss_mlp": 0.00303977, "balance_loss_clip": 1.12446809, "balance_loss_mlp": 0.26373199, "epoch": 0.09956410641815722, "flos": 25003701964800.0, "grad_norm": 15.390113460269587, "language_loss": 0.8770988, "learning_rate": 3.9494526618229765e-06, "loss": 0.89448869, "num_input_tokens_seen": 35664305, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.40258789, "step": 1656, "time_per_iteration": 2.645508050918579 }, { "auxiliary_loss_clip": 0.0142248, "auxiliary_loss_mlp": 0.0029147, "balance_loss_clip": 1.12607574, "balance_loss_mlp": 0.25673288, "epoch": 0.0996242296708252, "flos": 19317714837120.0, "grad_norm": 3.204614845971538, "language_loss": 0.94127679, "learning_rate": 3.949365618233217e-06, "loss": 0.95841628, "num_input_tokens_seen": 35684060, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.34741211, "step": 1657, "time_per_iteration": 2.644312620162964 }, { "auxiliary_loss_clip": 0.01442547, "auxiliary_loss_mlp": 0.00315656, "balance_loss_clip": 1.13442957, "balance_loss_mlp": 0.27636468, "epoch": 0.09968435292349316, "flos": 21871609787520.0, "grad_norm": 9.11912395004817, "language_loss": 0.91288, "learning_rate": 3.9492785007233195e-06, "loss": 0.930462, "num_input_tokens_seen": 35703250, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.39306641, "step": 1658, "time_per_iteration": 2.6545588970184326 }, { "auxiliary_loss_clip": 0.01549162, "auxiliary_loss_mlp": 0.00207984, "balance_loss_clip": 1.34527731, "balance_loss_mlp": 0.19272523, "epoch": 0.09974447617616113, "flos": 65384533313280.0, "grad_norm": 0.9352830289529805, "language_loss": 0.60266578, "learning_rate": 3.949191309296585e-06, "loss": 0.62023729, "num_input_tokens_seen": 35762165, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.15234375, "step": 1659, "time_per_iteration": 3.163952112197876 }, { "auxiliary_loss_clip": 0.01413159, "auxiliary_loss_mlp": 0.00257895, "balance_loss_clip": 1.12765956, "balance_loss_mlp": 0.22270395, "epoch": 0.0998045994288291, "flos": 23659817495040.0, "grad_norm": 107.55994449957241, "language_loss": 0.93383938, "learning_rate": 3.949104043956321e-06, "loss": 0.95054996, "num_input_tokens_seen": 35781520, "router_z_loss_clip": 2.85351562, "router_z_loss_mlp": 0.35131836, "step": 1660, "time_per_iteration": 2.7075650691986084 }, { "auxiliary_loss_clip": 0.01442641, "auxiliary_loss_mlp": 0.00259285, "balance_loss_clip": 1.14798045, "balance_loss_mlp": 0.22168659, "epoch": 0.09986472268149707, "flos": 19609704495360.0, "grad_norm": 41.6734041499813, "language_loss": 0.87648177, "learning_rate": 3.949016704705836e-06, "loss": 0.89350104, "num_input_tokens_seen": 35799565, "router_z_loss_clip": 2.9453125, "router_z_loss_mlp": 0.3762207, "step": 1661, "time_per_iteration": 2.6705164909362793 }, { "auxiliary_loss_clip": 0.01430529, "auxiliary_loss_mlp": 0.00293332, "balance_loss_clip": 1.13189209, "balance_loss_mlp": 0.25530419, "epoch": 0.09992484593416504, "flos": 26213317395840.0, "grad_norm": 426.2581226746184, "language_loss": 0.92841876, "learning_rate": 3.948929291548443e-06, "loss": 0.94565737, "num_input_tokens_seen": 35821085, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.38061523, "step": 1662, "time_per_iteration": 2.758939504623413 }, { "auxiliary_loss_clip": 0.0143714, "auxiliary_loss_mlp": 0.00275315, "balance_loss_clip": 1.14108229, "balance_loss_mlp": 0.23549867, "epoch": 0.09998496918683301, "flos": 17493632421120.0, "grad_norm": 17.1901505029231, "language_loss": 0.9703455, "learning_rate": 3.9488418044874546e-06, "loss": 0.98747009, "num_input_tokens_seen": 35839840, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.39868164, "step": 1663, "time_per_iteration": 2.6607539653778076 }, { "auxiliary_loss_clip": 0.01437242, "auxiliary_loss_mlp": 0.002511, "balance_loss_clip": 1.14134049, "balance_loss_mlp": 0.21312006, "epoch": 0.10004509243950098, "flos": 22784925928320.0, "grad_norm": 157.49998123032466, "language_loss": 0.79141688, "learning_rate": 3.948754243526191e-06, "loss": 0.80830038, "num_input_tokens_seen": 35861545, "router_z_loss_clip": 2.9609375, "router_z_loss_mlp": 0.37963867, "step": 1664, "time_per_iteration": 4.176863670349121 }, { "auxiliary_loss_clip": 0.01426042, "auxiliary_loss_mlp": 0.00242867, "balance_loss_clip": 1.13227177, "balance_loss_mlp": 0.20715141, "epoch": 0.10010521569216894, "flos": 16253385667200.0, "grad_norm": 176.79789016053664, "language_loss": 0.88678372, "learning_rate": 3.94866660866797e-06, "loss": 0.90347278, "num_input_tokens_seen": 35878295, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.35742188, "step": 1665, "time_per_iteration": 4.265687942504883 }, { "auxiliary_loss_clip": 0.01424719, "auxiliary_loss_mlp": 0.00226468, "balance_loss_clip": 1.13280511, "balance_loss_mlp": 0.19087201, "epoch": 0.10016533894483691, "flos": 23402589223680.0, "grad_norm": 25.438376289177135, "language_loss": 0.76097465, "learning_rate": 3.9485788999161165e-06, "loss": 0.77748656, "num_input_tokens_seen": 35898990, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.35571289, "step": 1666, "time_per_iteration": 2.6964516639709473 }, { "auxiliary_loss_clip": 0.0143677, "auxiliary_loss_mlp": 0.00239459, "balance_loss_clip": 1.13127327, "balance_loss_mlp": 0.19852208, "epoch": 0.10022546219750489, "flos": 19354164163200.0, "grad_norm": 6.191282033078129, "language_loss": 0.87066084, "learning_rate": 3.948491117273956e-06, "loss": 0.8874231, "num_input_tokens_seen": 35916225, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.40917969, "step": 1667, "time_per_iteration": 2.668174982070923 }, { "auxiliary_loss_clip": 0.01429857, "auxiliary_loss_mlp": 0.00234921, "balance_loss_clip": 1.13258195, "balance_loss_mlp": 0.19760823, "epoch": 0.10028558545017285, "flos": 27085766837760.0, "grad_norm": 114.51418818226877, "language_loss": 0.8688609, "learning_rate": 3.948403260744817e-06, "loss": 0.88550866, "num_input_tokens_seen": 35934630, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.37304688, "step": 1668, "time_per_iteration": 4.189879894256592 }, { "auxiliary_loss_clip": 0.01434699, "auxiliary_loss_mlp": 0.00227556, "balance_loss_clip": 1.1325829, "balance_loss_mlp": 0.19012442, "epoch": 0.10034570870284082, "flos": 25847136195840.0, "grad_norm": 22.691598928189748, "language_loss": 0.8759082, "learning_rate": 3.948315330332031e-06, "loss": 0.89253074, "num_input_tokens_seen": 35953855, "router_z_loss_clip": 3.01953125, "router_z_loss_mlp": 0.37451172, "step": 1669, "time_per_iteration": 2.672041893005371 }, { "auxiliary_loss_clip": 0.01439864, "auxiliary_loss_mlp": 0.0023979, "balance_loss_clip": 1.13469648, "balance_loss_mlp": 0.20016462, "epoch": 0.1004058319555088, "flos": 26249587153920.0, "grad_norm": 10.14360407598895, "language_loss": 0.95060164, "learning_rate": 3.948227326038933e-06, "loss": 0.96739817, "num_input_tokens_seen": 35974555, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.39624023, "step": 1670, "time_per_iteration": 4.0912933349609375 }, { "auxiliary_loss_clip": 0.01434845, "auxiliary_loss_mlp": 0.00185353, "balance_loss_clip": 1.14061391, "balance_loss_mlp": 0.15278493, "epoch": 0.10046595520817676, "flos": 25374480105600.0, "grad_norm": 17.57773720044362, "language_loss": 0.83600575, "learning_rate": 3.9481392478688586e-06, "loss": 0.85220778, "num_input_tokens_seen": 35996830, "router_z_loss_clip": 2.93945312, "router_z_loss_mlp": 0.32568359, "step": 1671, "time_per_iteration": 2.6996774673461914 }, { "auxiliary_loss_clip": 0.01464616, "auxiliary_loss_mlp": 0.00123662, "balance_loss_clip": 1.27898288, "balance_loss_mlp": 0.11383937, "epoch": 0.10052607846084473, "flos": 67461821677440.0, "grad_norm": 0.8019365244581269, "language_loss": 0.60874546, "learning_rate": 3.948051095825149e-06, "loss": 0.62462819, "num_input_tokens_seen": 36054465, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.09814453, "step": 1672, "time_per_iteration": 3.0950546264648438 }, { "auxiliary_loss_clip": 0.01421163, "auxiliary_loss_mlp": 0.0021351, "balance_loss_clip": 1.11952817, "balance_loss_mlp": 0.17908221, "epoch": 0.10058620171351271, "flos": 21360493209600.0, "grad_norm": 30.17956306927547, "language_loss": 0.85099995, "learning_rate": 3.947962869911147e-06, "loss": 0.86734664, "num_input_tokens_seen": 36073480, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.34448242, "step": 1673, "time_per_iteration": 2.6489686965942383 }, { "auxiliary_loss_clip": 0.01411727, "auxiliary_loss_mlp": 0.00195937, "balance_loss_clip": 1.1093303, "balance_loss_mlp": 0.15965018, "epoch": 0.10064632496618067, "flos": 16800125558400.0, "grad_norm": 20.23488339568771, "language_loss": 0.85137451, "learning_rate": 3.947874570130197e-06, "loss": 0.86745119, "num_input_tokens_seen": 36091830, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.36303711, "step": 1674, "time_per_iteration": 2.609600782394409 }, { "auxiliary_loss_clip": 0.01412876, "auxiliary_loss_mlp": 0.00228919, "balance_loss_clip": 1.10891938, "balance_loss_mlp": 0.19542141, "epoch": 0.10070644821884864, "flos": 23624445576960.0, "grad_norm": 33.302508536933516, "language_loss": 0.88327658, "learning_rate": 3.947786196485649e-06, "loss": 0.89969456, "num_input_tokens_seen": 36111400, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.33496094, "step": 1675, "time_per_iteration": 2.696009874343872 }, { "auxiliary_loss_clip": 0.01419315, "auxiliary_loss_mlp": 0.00206404, "balance_loss_clip": 1.10375834, "balance_loss_mlp": 0.17281038, "epoch": 0.1007665714715166, "flos": 24462564595200.0, "grad_norm": 5.336583009923454, "language_loss": 0.89189374, "learning_rate": 3.947697748980853e-06, "loss": 0.90815091, "num_input_tokens_seen": 36129345, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.33569336, "step": 1676, "time_per_iteration": 2.6659927368164062 }, { "auxiliary_loss_clip": 0.0141732, "auxiliary_loss_mlp": 0.00210008, "balance_loss_clip": 1.10244513, "balance_loss_mlp": 0.17865562, "epoch": 0.10082669472418458, "flos": 16799119977600.0, "grad_norm": 11.763357457203323, "language_loss": 0.92486691, "learning_rate": 3.947609227619163e-06, "loss": 0.94114012, "num_input_tokens_seen": 36146255, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.31347656, "step": 1677, "time_per_iteration": 2.6620006561279297 }, { "auxiliary_loss_clip": 0.01414404, "auxiliary_loss_mlp": 0.00226389, "balance_loss_clip": 1.09996927, "balance_loss_mlp": 0.19236687, "epoch": 0.10088681797685255, "flos": 13553513844480.0, "grad_norm": 2.41861870268386, "language_loss": 0.94003248, "learning_rate": 3.947520632403936e-06, "loss": 0.95644039, "num_input_tokens_seen": 36164050, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.34008789, "step": 1678, "time_per_iteration": 2.5857696533203125 }, { "auxiliary_loss_clip": 0.0141754, "auxiliary_loss_mlp": 0.00240343, "balance_loss_clip": 1.10247672, "balance_loss_mlp": 0.20846653, "epoch": 0.10094694122952051, "flos": 25265706744960.0, "grad_norm": 3.37059595808477, "language_loss": 0.97837138, "learning_rate": 3.947431963338532e-06, "loss": 0.99495029, "num_input_tokens_seen": 36183530, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.31884766, "step": 1679, "time_per_iteration": 2.7101662158966064 }, { "auxiliary_loss_clip": 0.01296312, "auxiliary_loss_mlp": 0.00132391, "balance_loss_clip": 1.10422802, "balance_loss_mlp": 0.11961204, "epoch": 0.10100706448218849, "flos": 69854299885440.0, "grad_norm": 0.8173892981822501, "language_loss": 0.53013223, "learning_rate": 3.947343220426312e-06, "loss": 0.54441923, "num_input_tokens_seen": 36248550, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.12792969, "step": 1680, "time_per_iteration": 3.205596446990967 }, { "auxiliary_loss_clip": 0.01414649, "auxiliary_loss_mlp": 0.00226822, "balance_loss_clip": 1.09729505, "balance_loss_mlp": 0.19377714, "epoch": 0.10106718773485646, "flos": 20007163463040.0, "grad_norm": 4.803159303337, "language_loss": 0.84584606, "learning_rate": 3.947254403670641e-06, "loss": 0.86226082, "num_input_tokens_seen": 36266065, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.33056641, "step": 1681, "time_per_iteration": 2.784868001937866 }, { "auxiliary_loss_clip": 0.0143211, "auxiliary_loss_mlp": 0.00300444, "balance_loss_clip": 1.11296535, "balance_loss_mlp": 0.26489624, "epoch": 0.10112731098752442, "flos": 13479825093120.0, "grad_norm": 133.00649642392773, "language_loss": 1.03123105, "learning_rate": 3.947165513074889e-06, "loss": 1.04855669, "num_input_tokens_seen": 36280960, "router_z_loss_clip": 3.18945312, "router_z_loss_mlp": 0.35571289, "step": 1682, "time_per_iteration": 2.7600817680358887 }, { "auxiliary_loss_clip": 0.01423609, "auxiliary_loss_mlp": 0.00257474, "balance_loss_clip": 1.1057204, "balance_loss_mlp": 0.22459617, "epoch": 0.1011874342401924, "flos": 18515901490560.0, "grad_norm": 3.7287758085842624, "language_loss": 0.93201721, "learning_rate": 3.947076548642425e-06, "loss": 0.94882798, "num_input_tokens_seen": 36299010, "router_z_loss_clip": 3.1796875, "router_z_loss_mlp": 0.32885742, "step": 1683, "time_per_iteration": 2.761664628982544 }, { "auxiliary_loss_clip": 0.01423101, "auxiliary_loss_mlp": 0.00304024, "balance_loss_clip": 1.1126833, "balance_loss_mlp": 0.27311268, "epoch": 0.10124755749286037, "flos": 20702861055360.0, "grad_norm": 5.602867533156251, "language_loss": 0.82046485, "learning_rate": 3.946987510376624e-06, "loss": 0.83773601, "num_input_tokens_seen": 36318400, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.30908203, "step": 1684, "time_per_iteration": 2.8779306411743164 }, { "auxiliary_loss_clip": 0.01329211, "auxiliary_loss_mlp": 0.00130437, "balance_loss_clip": 1.14411998, "balance_loss_mlp": 0.11813499, "epoch": 0.10130768074552833, "flos": 56109456247680.0, "grad_norm": 0.7564894431992994, "language_loss": 0.61162949, "learning_rate": 3.9468983982808615e-06, "loss": 0.62622607, "num_input_tokens_seen": 36381815, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.12304688, "step": 1685, "time_per_iteration": 3.36984920501709 }, { "auxiliary_loss_clip": 0.01437008, "auxiliary_loss_mlp": 0.00321937, "balance_loss_clip": 1.11840606, "balance_loss_mlp": 0.28863022, "epoch": 0.1013678039981963, "flos": 33402346156800.0, "grad_norm": 16.8453613964189, "language_loss": 0.6915592, "learning_rate": 3.946809212358516e-06, "loss": 0.70914865, "num_input_tokens_seen": 36404320, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.33300781, "step": 1686, "time_per_iteration": 2.8667733669281006 }, { "auxiliary_loss_clip": 0.014281, "auxiliary_loss_mlp": 0.00303839, "balance_loss_clip": 1.11823153, "balance_loss_mlp": 0.27313024, "epoch": 0.10142792725086427, "flos": 31905338008320.0, "grad_norm": 87.20751824406162, "language_loss": 0.86326063, "learning_rate": 3.946719952612972e-06, "loss": 0.88057995, "num_input_tokens_seen": 36427510, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.30737305, "step": 1687, "time_per_iteration": 2.754265546798706 }, { "auxiliary_loss_clip": 0.01463459, "auxiliary_loss_mlp": 0.00293834, "balance_loss_clip": 1.1423552, "balance_loss_mlp": 0.26322144, "epoch": 0.10148805050353224, "flos": 28475905046400.0, "grad_norm": 2.3945908658288007, "language_loss": 0.79567862, "learning_rate": 3.94663061904761e-06, "loss": 0.81325155, "num_input_tokens_seen": 36448230, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.3059082, "step": 1688, "time_per_iteration": 2.678931713104248 }, { "auxiliary_loss_clip": 0.01447083, "auxiliary_loss_mlp": 0.00325439, "balance_loss_clip": 1.13314104, "balance_loss_mlp": 0.2938723, "epoch": 0.1015481737562002, "flos": 25148888737920.0, "grad_norm": 14.020804627921958, "language_loss": 0.94305789, "learning_rate": 3.94654121166582e-06, "loss": 0.96078306, "num_input_tokens_seen": 36464395, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.31542969, "step": 1689, "time_per_iteration": 2.7358720302581787 }, { "auxiliary_loss_clip": 0.01461619, "auxiliary_loss_mlp": 0.00299861, "balance_loss_clip": 1.14937055, "balance_loss_mlp": 0.26914099, "epoch": 0.10160829700886818, "flos": 30882781630080.0, "grad_norm": 11.997471921879947, "language_loss": 0.95998096, "learning_rate": 3.946451730470993e-06, "loss": 0.97759575, "num_input_tokens_seen": 36486475, "router_z_loss_clip": 3.12304688, "router_z_loss_mlp": 0.30712891, "step": 1690, "time_per_iteration": 2.723723888397217 }, { "auxiliary_loss_clip": 0.01472717, "auxiliary_loss_mlp": 0.00344132, "balance_loss_clip": 1.15350783, "balance_loss_mlp": 0.31103909, "epoch": 0.10166842026153615, "flos": 20412020632320.0, "grad_norm": 14.412676678804287, "language_loss": 0.90871477, "learning_rate": 3.946362175466521e-06, "loss": 0.92688322, "num_input_tokens_seen": 36505310, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.33081055, "step": 1691, "time_per_iteration": 2.6711792945861816 }, { "auxiliary_loss_clip": 0.01474346, "auxiliary_loss_mlp": 0.00372493, "balance_loss_clip": 1.1555202, "balance_loss_mlp": 0.33620518, "epoch": 0.10172854351420411, "flos": 33476968661760.0, "grad_norm": 5.172169120212253, "language_loss": 0.74186301, "learning_rate": 3.946272546655801e-06, "loss": 0.76033139, "num_input_tokens_seen": 36529820, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.36279297, "step": 1692, "time_per_iteration": 2.734665632247925 }, { "auxiliary_loss_clip": 0.01439562, "auxiliary_loss_mlp": 0.00300195, "balance_loss_clip": 1.13034093, "balance_loss_mlp": 0.2680797, "epoch": 0.1017886667668721, "flos": 23550325862400.0, "grad_norm": 4.570299163859535, "language_loss": 0.84220147, "learning_rate": 3.94618284404223e-06, "loss": 0.85959899, "num_input_tokens_seen": 36549000, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.32104492, "step": 1693, "time_per_iteration": 2.660723924636841 }, { "auxiliary_loss_clip": 0.01444867, "auxiliary_loss_mlp": 0.00344537, "balance_loss_clip": 1.12629056, "balance_loss_mlp": 0.31015658, "epoch": 0.10184879001954006, "flos": 23296078419840.0, "grad_norm": 9.345539190684303, "language_loss": 0.92098272, "learning_rate": 3.9460930676292105e-06, "loss": 0.93887675, "num_input_tokens_seen": 36567515, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.34399414, "step": 1694, "time_per_iteration": 2.640963315963745 }, { "auxiliary_loss_clip": 0.01428722, "auxiliary_loss_mlp": 0.00323056, "balance_loss_clip": 1.11391139, "balance_loss_mlp": 0.28984481, "epoch": 0.10190891327220802, "flos": 18333116156160.0, "grad_norm": 2.939321379974881, "language_loss": 0.8741678, "learning_rate": 3.946003217420147e-06, "loss": 0.89168555, "num_input_tokens_seen": 36586190, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.33227539, "step": 1695, "time_per_iteration": 2.622555732727051 }, { "auxiliary_loss_clip": 0.01415742, "auxiliary_loss_mlp": 0.00298006, "balance_loss_clip": 1.1001513, "balance_loss_mlp": 0.26496127, "epoch": 0.10196903652487599, "flos": 26465374108800.0, "grad_norm": 18.009717742828766, "language_loss": 0.92779464, "learning_rate": 3.945913293418447e-06, "loss": 0.9449321, "num_input_tokens_seen": 36607495, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.33056641, "step": 1696, "time_per_iteration": 2.6884772777557373 }, { "auxiliary_loss_clip": 0.01430817, "auxiliary_loss_mlp": 0.00303471, "balance_loss_clip": 1.11552036, "balance_loss_mlp": 0.27391857, "epoch": 0.10202915977754397, "flos": 21869526798720.0, "grad_norm": 426.3906322388595, "language_loss": 0.88028038, "learning_rate": 3.945823295627519e-06, "loss": 0.89762318, "num_input_tokens_seen": 36628555, "router_z_loss_clip": 3.15234375, "router_z_loss_mlp": 0.29528809, "step": 1697, "time_per_iteration": 2.7094011306762695 }, { "auxiliary_loss_clip": 0.01453221, "auxiliary_loss_mlp": 0.00314835, "balance_loss_clip": 1.12871981, "balance_loss_mlp": 0.28243411, "epoch": 0.10208928303021193, "flos": 22309755886080.0, "grad_norm": 35.08912082176857, "language_loss": 0.89441675, "learning_rate": 3.9457332240507775e-06, "loss": 0.91209733, "num_input_tokens_seen": 36646250, "router_z_loss_clip": 3.2421875, "router_z_loss_mlp": 0.32397461, "step": 1698, "time_per_iteration": 2.6961843967437744 }, { "auxiliary_loss_clip": 0.01463414, "auxiliary_loss_mlp": 0.00335751, "balance_loss_clip": 1.13443375, "balance_loss_mlp": 0.30272973, "epoch": 0.1021494062828799, "flos": 22125569921280.0, "grad_norm": 10.313537078099635, "language_loss": 0.82790053, "learning_rate": 3.945643078691637e-06, "loss": 0.84589213, "num_input_tokens_seen": 36666675, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.33032227, "step": 1699, "time_per_iteration": 2.633845090866089 }, { "auxiliary_loss_clip": 0.01453232, "auxiliary_loss_mlp": 0.00332058, "balance_loss_clip": 1.13075733, "balance_loss_mlp": 0.2996096, "epoch": 0.10220952953554788, "flos": 19646728439040.0, "grad_norm": 1.8595135487559091, "language_loss": 0.86757338, "learning_rate": 3.945552859553516e-06, "loss": 0.88542628, "num_input_tokens_seen": 36685225, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.32446289, "step": 1700, "time_per_iteration": 2.6122031211853027 }, { "auxiliary_loss_clip": 0.01456532, "auxiliary_loss_mlp": 0.00341514, "balance_loss_clip": 1.12906694, "balance_loss_mlp": 0.30634719, "epoch": 0.10226965278821584, "flos": 29787290686080.0, "grad_norm": 2.896818235789939, "language_loss": 0.83932233, "learning_rate": 3.945462566639836e-06, "loss": 0.85730278, "num_input_tokens_seen": 36705985, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.35180664, "step": 1701, "time_per_iteration": 2.697561740875244 }, { "auxiliary_loss_clip": 0.01487409, "auxiliary_loss_mlp": 0.00376085, "balance_loss_clip": 1.1499728, "balance_loss_mlp": 0.33874881, "epoch": 0.10232977604088381, "flos": 27016818681600.0, "grad_norm": 41.79406622165582, "language_loss": 0.85265207, "learning_rate": 3.945372199954019e-06, "loss": 0.87128699, "num_input_tokens_seen": 36725815, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.37304688, "step": 1702, "time_per_iteration": 2.748521327972412 }, { "auxiliary_loss_clip": 0.01485064, "auxiliary_loss_mlp": 0.00346643, "balance_loss_clip": 1.16014636, "balance_loss_mlp": 0.31338388, "epoch": 0.10238989929355179, "flos": 20777519473920.0, "grad_norm": 8.435986546849765, "language_loss": 1.02211738, "learning_rate": 3.945281759499494e-06, "loss": 1.04043436, "num_input_tokens_seen": 36742345, "router_z_loss_clip": 3.24804688, "router_z_loss_mlp": 0.33227539, "step": 1703, "time_per_iteration": 2.657196283340454 }, { "auxiliary_loss_clip": 0.014011, "auxiliary_loss_mlp": 0.00233996, "balance_loss_clip": 1.20626974, "balance_loss_mlp": 0.21263407, "epoch": 0.10245002254621975, "flos": 57698322451200.0, "grad_norm": 0.9166106437099278, "language_loss": 0.55300939, "learning_rate": 3.94519124527969e-06, "loss": 0.56936038, "num_input_tokens_seen": 36798775, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.21386719, "step": 1704, "time_per_iteration": 3.086576461791992 }, { "auxiliary_loss_clip": 0.01487249, "auxiliary_loss_mlp": 0.00404888, "balance_loss_clip": 1.16411316, "balance_loss_mlp": 0.36717057, "epoch": 0.10251014579888772, "flos": 16800125558400.0, "grad_norm": 12.420975702376198, "language_loss": 0.92600834, "learning_rate": 3.945100657298039e-06, "loss": 0.94492972, "num_input_tokens_seen": 36816295, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.37695312, "step": 1705, "time_per_iteration": 2.623239040374756 }, { "auxiliary_loss_clip": 0.01408798, "auxiliary_loss_mlp": 0.00224156, "balance_loss_clip": 1.21146464, "balance_loss_mlp": 0.20374699, "epoch": 0.1025702690515557, "flos": 68565500922240.0, "grad_norm": 0.7625945072221837, "language_loss": 0.60557574, "learning_rate": 3.9450099955579765e-06, "loss": 0.62190527, "num_input_tokens_seen": 36882030, "router_z_loss_clip": 1.96875, "router_z_loss_mlp": 0.20410156, "step": 1706, "time_per_iteration": 4.550852298736572 }, { "auxiliary_loss_clip": 0.01491073, "auxiliary_loss_mlp": 0.00382191, "balance_loss_clip": 1.15817165, "balance_loss_mlp": 0.34618956, "epoch": 0.10263039230422366, "flos": 14866623336960.0, "grad_norm": 6.226344804669652, "language_loss": 0.94599545, "learning_rate": 3.94491926006294e-06, "loss": 0.96472812, "num_input_tokens_seen": 36899245, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.36010742, "step": 1707, "time_per_iteration": 4.080641746520996 }, { "auxiliary_loss_clip": 0.01471034, "auxiliary_loss_mlp": 0.00353264, "balance_loss_clip": 1.15367365, "balance_loss_mlp": 0.32138765, "epoch": 0.10269051555689163, "flos": 25337599816320.0, "grad_norm": 65.92205469271558, "language_loss": 0.78483182, "learning_rate": 3.944828450816369e-06, "loss": 0.80307484, "num_input_tokens_seen": 36920950, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.31860352, "step": 1708, "time_per_iteration": 2.7636330127716064 }, { "auxiliary_loss_clip": 0.01496377, "auxiliary_loss_mlp": 0.0035135, "balance_loss_clip": 1.17134738, "balance_loss_mlp": 0.3171367, "epoch": 0.10275063880955959, "flos": 21068826773760.0, "grad_norm": 16.255664032896288, "language_loss": 0.97282398, "learning_rate": 3.944737567821709e-06, "loss": 0.99130124, "num_input_tokens_seen": 36938900, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.34228516, "step": 1709, "time_per_iteration": 2.669799327850342 }, { "auxiliary_loss_clip": 0.01489755, "auxiliary_loss_mlp": 0.00361515, "balance_loss_clip": 1.17187619, "balance_loss_mlp": 0.3277548, "epoch": 0.10281076206222757, "flos": 30366780802560.0, "grad_norm": 7.491993838787577, "language_loss": 0.94368768, "learning_rate": 3.944646611082406e-06, "loss": 0.9622004, "num_input_tokens_seen": 36957010, "router_z_loss_clip": 3.1796875, "router_z_loss_mlp": 0.33764648, "step": 1710, "time_per_iteration": 4.161514759063721 }, { "auxiliary_loss_clip": 0.01482634, "auxiliary_loss_mlp": 0.00352932, "balance_loss_clip": 1.16162252, "balance_loss_mlp": 0.31859973, "epoch": 0.10287088531489554, "flos": 22418313765120.0, "grad_norm": 4.7032082268272735, "language_loss": 0.84358144, "learning_rate": 3.944555580601908e-06, "loss": 0.86193711, "num_input_tokens_seen": 36977690, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.34326172, "step": 1711, "time_per_iteration": 2.72757887840271 }, { "auxiliary_loss_clip": 0.01469805, "auxiliary_loss_mlp": 0.00363638, "balance_loss_clip": 1.15177464, "balance_loss_mlp": 0.32861468, "epoch": 0.1029310085675635, "flos": 25115994858240.0, "grad_norm": 191.12944304550513, "language_loss": 0.79383457, "learning_rate": 3.944464476383668e-06, "loss": 0.81216896, "num_input_tokens_seen": 36997300, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.3503418, "step": 1712, "time_per_iteration": 4.049677133560181 }, { "auxiliary_loss_clip": 0.01471269, "auxiliary_loss_mlp": 0.00318981, "balance_loss_clip": 1.15653777, "balance_loss_mlp": 0.28498232, "epoch": 0.10299113182023148, "flos": 19865639877120.0, "grad_norm": 3.563271369616205, "language_loss": 0.93426603, "learning_rate": 3.94437329843114e-06, "loss": 0.95216846, "num_input_tokens_seen": 37016110, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.33959961, "step": 1713, "time_per_iteration": 2.741966485977173 }, { "auxiliary_loss_clip": 0.0147867, "auxiliary_loss_mlp": 0.0036055, "balance_loss_clip": 1.15600443, "balance_loss_mlp": 0.32514477, "epoch": 0.10305125507289944, "flos": 20447608032000.0, "grad_norm": 17.664611134235038, "language_loss": 0.7881524, "learning_rate": 3.944282046747782e-06, "loss": 0.80654454, "num_input_tokens_seen": 37036405, "router_z_loss_clip": 3.22851562, "router_z_loss_mlp": 0.35400391, "step": 1714, "time_per_iteration": 2.6604840755462646 }, { "auxiliary_loss_clip": 0.01459778, "auxiliary_loss_mlp": 0.00385334, "balance_loss_clip": 1.13968778, "balance_loss_mlp": 0.34752065, "epoch": 0.10311137832556741, "flos": 26250772302720.0, "grad_norm": 8.74199316690998, "language_loss": 0.97704434, "learning_rate": 3.944190721337053e-06, "loss": 0.99549544, "num_input_tokens_seen": 37057580, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.37792969, "step": 1715, "time_per_iteration": 2.672192335128784 }, { "auxiliary_loss_clip": 0.01463325, "auxiliary_loss_mlp": 0.00331049, "balance_loss_clip": 1.14660215, "balance_loss_mlp": 0.29345083, "epoch": 0.10317150157823539, "flos": 35298932175360.0, "grad_norm": 14.919969256491244, "language_loss": 0.83315682, "learning_rate": 3.944099322202418e-06, "loss": 0.85110056, "num_input_tokens_seen": 37079120, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.37597656, "step": 1716, "time_per_iteration": 2.76080584526062 }, { "auxiliary_loss_clip": 0.01485743, "auxiliary_loss_mlp": 0.00394856, "balance_loss_clip": 1.15672457, "balance_loss_mlp": 0.35441989, "epoch": 0.10323162483090335, "flos": 25739943033600.0, "grad_norm": 7.058526159856095, "language_loss": 0.93469548, "learning_rate": 3.944007849347342e-06, "loss": 0.95350152, "num_input_tokens_seen": 37099710, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.40405273, "step": 1717, "time_per_iteration": 2.6621499061584473 }, { "auxiliary_loss_clip": 0.0146078, "auxiliary_loss_mlp": 0.00400768, "balance_loss_clip": 1.13858485, "balance_loss_mlp": 0.36078477, "epoch": 0.10329174808357132, "flos": 16289870906880.0, "grad_norm": 7.9528746026420665, "language_loss": 0.93365324, "learning_rate": 3.943916302775292e-06, "loss": 0.95226872, "num_input_tokens_seen": 37117775, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.39990234, "step": 1718, "time_per_iteration": 2.7176437377929688 }, { "auxiliary_loss_clip": 0.0146603, "auxiliary_loss_mlp": 0.00398268, "balance_loss_clip": 1.14542365, "balance_loss_mlp": 0.35985917, "epoch": 0.10335187133623928, "flos": 36687166963200.0, "grad_norm": 10.07476906888366, "language_loss": 0.78950119, "learning_rate": 3.943824682489742e-06, "loss": 0.80814421, "num_input_tokens_seen": 37140280, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.38427734, "step": 1719, "time_per_iteration": 2.7405381202697754 }, { "auxiliary_loss_clip": 0.01473594, "auxiliary_loss_mlp": 0.00423049, "balance_loss_clip": 1.14802992, "balance_loss_mlp": 0.38499749, "epoch": 0.10341199458890726, "flos": 14975648092800.0, "grad_norm": 6.912032760363442, "language_loss": 0.98751557, "learning_rate": 3.9437329884941665e-06, "loss": 1.00648189, "num_input_tokens_seen": 37158350, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.38012695, "step": 1720, "time_per_iteration": 2.6464953422546387 }, { "auxiliary_loss_clip": 0.01499929, "auxiliary_loss_mlp": 0.00433222, "balance_loss_clip": 1.16659033, "balance_loss_mlp": 0.39259592, "epoch": 0.10347211784157523, "flos": 21031587348480.0, "grad_norm": 5.3935762883715075, "language_loss": 0.85919183, "learning_rate": 3.943641220792039e-06, "loss": 0.87852335, "num_input_tokens_seen": 37177120, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.40649414, "step": 1721, "time_per_iteration": 2.632091760635376 }, { "auxiliary_loss_clip": 0.0150303, "auxiliary_loss_mlp": 0.00454445, "balance_loss_clip": 1.17036772, "balance_loss_mlp": 0.41172075, "epoch": 0.1035322410942432, "flos": 19792094780160.0, "grad_norm": 26.124846446410498, "language_loss": 0.86820292, "learning_rate": 3.9435493793868434e-06, "loss": 0.88777769, "num_input_tokens_seen": 37195895, "router_z_loss_clip": 3.32617188, "router_z_loss_mlp": 0.42700195, "step": 1722, "time_per_iteration": 2.6710479259490967 }, { "auxiliary_loss_clip": 0.0131594, "auxiliary_loss_mlp": 0.0021099, "balance_loss_clip": 1.12796068, "balance_loss_mlp": 0.19983213, "epoch": 0.10359236434691117, "flos": 52698874947840.0, "grad_norm": 1.0005180675917151, "language_loss": 0.67111003, "learning_rate": 3.943457464282059e-06, "loss": 0.68637931, "num_input_tokens_seen": 37247270, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.11181641, "step": 1723, "time_per_iteration": 2.9467110633850098 }, { "auxiliary_loss_clip": 0.01528722, "auxiliary_loss_mlp": 0.00438229, "balance_loss_clip": 1.19561613, "balance_loss_mlp": 0.3970775, "epoch": 0.10365248759957914, "flos": 18405404277120.0, "grad_norm": 4.770418731197657, "language_loss": 0.8820169, "learning_rate": 3.9433654754811745e-06, "loss": 0.90168643, "num_input_tokens_seen": 37265595, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.41162109, "step": 1724, "time_per_iteration": 2.6420791149139404 }, { "auxiliary_loss_clip": 0.01535596, "auxiliary_loss_mlp": 0.00461173, "balance_loss_clip": 1.19737482, "balance_loss_mlp": 0.42030802, "epoch": 0.1037126108522471, "flos": 47553555335040.0, "grad_norm": 136.54881706307236, "language_loss": 0.81538224, "learning_rate": 3.943273412987676e-06, "loss": 0.83534992, "num_input_tokens_seen": 37286660, "router_z_loss_clip": 3.38085938, "router_z_loss_mlp": 0.40820312, "step": 1725, "time_per_iteration": 2.8589065074920654 }, { "auxiliary_loss_clip": 0.01558511, "auxiliary_loss_mlp": 0.00437395, "balance_loss_clip": 1.22125304, "balance_loss_mlp": 0.39688724, "epoch": 0.10377273410491508, "flos": 22816670572800.0, "grad_norm": 7.2656130321275985, "language_loss": 0.84472722, "learning_rate": 3.943181276805054e-06, "loss": 0.86468625, "num_input_tokens_seen": 37304915, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.40478516, "step": 1726, "time_per_iteration": 2.668335437774658 }, { "auxiliary_loss_clip": 0.01566532, "auxiliary_loss_mlp": 0.00432072, "balance_loss_clip": 1.22737098, "balance_loss_mlp": 0.38860872, "epoch": 0.10383285735758305, "flos": 26138694890880.0, "grad_norm": 4.963619203083205, "language_loss": 0.82461321, "learning_rate": 3.9430890669368035e-06, "loss": 0.84459925, "num_input_tokens_seen": 37325265, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.43432617, "step": 1727, "time_per_iteration": 2.9124534130096436 }, { "auxiliary_loss_clip": 0.01565128, "auxiliary_loss_mlp": 0.003756, "balance_loss_clip": 1.23016262, "balance_loss_mlp": 0.33428231, "epoch": 0.10389298061025101, "flos": 17091791994240.0, "grad_norm": 7.506658546973763, "language_loss": 0.90746099, "learning_rate": 3.942996783386422e-06, "loss": 0.92686826, "num_input_tokens_seen": 37341650, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.41308594, "step": 1728, "time_per_iteration": 2.6007304191589355 }, { "auxiliary_loss_clip": 0.01572954, "auxiliary_loss_mlp": 0.00381895, "balance_loss_clip": 1.23956895, "balance_loss_mlp": 0.34000459, "epoch": 0.10395310386291898, "flos": 20776513893120.0, "grad_norm": 14.486373543647078, "language_loss": 0.77515733, "learning_rate": 3.942904426157406e-06, "loss": 0.79470587, "num_input_tokens_seen": 37360270, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.41918945, "step": 1729, "time_per_iteration": 2.7050485610961914 }, { "auxiliary_loss_clip": 0.01587969, "auxiliary_loss_mlp": 0.00359456, "balance_loss_clip": 1.24423742, "balance_loss_mlp": 0.31384614, "epoch": 0.10401322711558696, "flos": 12820540913280.0, "grad_norm": 53.714932989829535, "language_loss": 0.952214, "learning_rate": 3.9428119952532605e-06, "loss": 0.97168827, "num_input_tokens_seen": 37375225, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.45629883, "step": 1730, "time_per_iteration": 2.567613363265991 }, { "auxiliary_loss_clip": 0.01585584, "auxiliary_loss_mlp": 0.00379136, "balance_loss_clip": 1.24647939, "balance_loss_mlp": 0.33660209, "epoch": 0.10407335036825492, "flos": 23184683366400.0, "grad_norm": 30.78783770511746, "language_loss": 0.83378559, "learning_rate": 3.942719490677489e-06, "loss": 0.85343277, "num_input_tokens_seen": 37395165, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.42529297, "step": 1731, "time_per_iteration": 2.675737142562866 }, { "auxiliary_loss_clip": 0.01592693, "auxiliary_loss_mlp": 0.00349519, "balance_loss_clip": 1.24333, "balance_loss_mlp": 0.30770087, "epoch": 0.10413347362092289, "flos": 26104184899200.0, "grad_norm": 38.85001036042507, "language_loss": 0.90364116, "learning_rate": 3.9426269124336e-06, "loss": 0.92306328, "num_input_tokens_seen": 37414845, "router_z_loss_clip": 3.4921875, "router_z_loss_mlp": 0.41845703, "step": 1732, "time_per_iteration": 2.6506969928741455 }, { "auxiliary_loss_clip": 0.01598534, "auxiliary_loss_mlp": 0.00372415, "balance_loss_clip": 1.24979663, "balance_loss_mlp": 0.32995307, "epoch": 0.10419359687359087, "flos": 12641059630080.0, "grad_norm": 8.244051030442067, "language_loss": 0.92917335, "learning_rate": 3.942534260525104e-06, "loss": 0.94888288, "num_input_tokens_seen": 37432490, "router_z_loss_clip": 3.49023438, "router_z_loss_mlp": 0.42431641, "step": 1733, "time_per_iteration": 2.6240134239196777 }, { "auxiliary_loss_clip": 0.01582947, "auxiliary_loss_mlp": 0.00373633, "balance_loss_clip": 1.23463225, "balance_loss_mlp": 0.32962102, "epoch": 0.10425372012625883, "flos": 12125094716160.0, "grad_norm": 9.398022619894, "language_loss": 0.85588163, "learning_rate": 3.942441534955514e-06, "loss": 0.87544751, "num_input_tokens_seen": 37449435, "router_z_loss_clip": 3.48046875, "router_z_loss_mlp": 0.44018555, "step": 1734, "time_per_iteration": 2.6137728691101074 }, { "auxiliary_loss_clip": 0.015765, "auxiliary_loss_mlp": 0.00342236, "balance_loss_clip": 1.23770082, "balance_loss_mlp": 0.30225328, "epoch": 0.1043138433789268, "flos": 25337563902720.0, "grad_norm": 3.2159614994933547, "language_loss": 0.8199057, "learning_rate": 3.9423487357283465e-06, "loss": 0.83909309, "num_input_tokens_seen": 37469105, "router_z_loss_clip": 3.38476562, "router_z_loss_mlp": 0.3996582, "step": 1735, "time_per_iteration": 2.6779568195343018 }, { "auxiliary_loss_clip": 0.01612614, "auxiliary_loss_mlp": 0.00349545, "balance_loss_clip": 1.25769794, "balance_loss_mlp": 0.30460298, "epoch": 0.10437396663159478, "flos": 29167149352320.0, "grad_norm": 18.007387948388846, "language_loss": 0.84907115, "learning_rate": 3.94225586284712e-06, "loss": 0.8686927, "num_input_tokens_seen": 37490540, "router_z_loss_clip": 3.55078125, "router_z_loss_mlp": 0.44921875, "step": 1736, "time_per_iteration": 2.7234153747558594 }, { "auxiliary_loss_clip": 0.01633939, "auxiliary_loss_mlp": 0.0034326, "balance_loss_clip": 1.28547168, "balance_loss_mlp": 0.29884294, "epoch": 0.10443408988426274, "flos": 25080946162560.0, "grad_norm": 2.0585507570629287, "language_loss": 0.77182263, "learning_rate": 3.942162916315356e-06, "loss": 0.79159462, "num_input_tokens_seen": 37511905, "router_z_loss_clip": 3.48632812, "router_z_loss_mlp": 0.44433594, "step": 1737, "time_per_iteration": 2.644218921661377 }, { "auxiliary_loss_clip": 0.01628862, "auxiliary_loss_mlp": 0.0035896, "balance_loss_clip": 1.27622736, "balance_loss_mlp": 0.31418532, "epoch": 0.1044942131369307, "flos": 26759662237440.0, "grad_norm": 218.03672651605248, "language_loss": 0.9278326, "learning_rate": 3.942069896136581e-06, "loss": 0.94771081, "num_input_tokens_seen": 37533635, "router_z_loss_clip": 3.53125, "router_z_loss_mlp": 0.44799805, "step": 1738, "time_per_iteration": 2.6946380138397217 }, { "auxiliary_loss_clip": 0.01655126, "auxiliary_loss_mlp": 0.0035218, "balance_loss_clip": 1.300475, "balance_loss_mlp": 0.30654699, "epoch": 0.10455433638959867, "flos": 18442571875200.0, "grad_norm": 2.2849147879749356, "language_loss": 0.83059168, "learning_rate": 3.9419768023143196e-06, "loss": 0.85066473, "num_input_tokens_seen": 37552035, "router_z_loss_clip": 3.54882812, "router_z_loss_mlp": 0.45703125, "step": 1739, "time_per_iteration": 2.6532387733459473 }, { "auxiliary_loss_clip": 0.01684166, "auxiliary_loss_mlp": 0.0034767, "balance_loss_clip": 1.3190062, "balance_loss_mlp": 0.30394423, "epoch": 0.10461445964226665, "flos": 23218977876480.0, "grad_norm": 3.3121362405484787, "language_loss": 0.86222488, "learning_rate": 3.941883634852104e-06, "loss": 0.88254321, "num_input_tokens_seen": 37571540, "router_z_loss_clip": 3.65039062, "router_z_loss_mlp": 0.43701172, "step": 1740, "time_per_iteration": 2.716984272003174 }, { "auxiliary_loss_clip": 0.01672187, "auxiliary_loss_mlp": 0.0036453, "balance_loss_clip": 1.31213522, "balance_loss_mlp": 0.32321253, "epoch": 0.10467458289493461, "flos": 24345243797760.0, "grad_norm": 22.97544195510155, "language_loss": 0.93669802, "learning_rate": 3.941790393753467e-06, "loss": 0.95706517, "num_input_tokens_seen": 37588265, "router_z_loss_clip": 3.6015625, "router_z_loss_mlp": 0.41333008, "step": 1741, "time_per_iteration": 2.7056710720062256 }, { "auxiliary_loss_clip": 0.01670901, "auxiliary_loss_mlp": 0.00354977, "balance_loss_clip": 1.31115997, "balance_loss_mlp": 0.30903405, "epoch": 0.10473470614760258, "flos": 21287953693440.0, "grad_norm": 14.175035145142978, "language_loss": 0.84021676, "learning_rate": 3.941697079021942e-06, "loss": 0.86047554, "num_input_tokens_seen": 37606860, "router_z_loss_clip": 3.6015625, "router_z_loss_mlp": 0.45947266, "step": 1742, "time_per_iteration": 2.667635202407837 }, { "auxiliary_loss_clip": 0.01657407, "auxiliary_loss_mlp": 0.00340346, "balance_loss_clip": 1.31108665, "balance_loss_mlp": 0.30014819, "epoch": 0.10479482940027056, "flos": 21687208341120.0, "grad_norm": 2.197745873710425, "language_loss": 0.9459815, "learning_rate": 3.94160369066107e-06, "loss": 0.96595907, "num_input_tokens_seen": 37625210, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.40185547, "step": 1743, "time_per_iteration": 2.6571171283721924 }, { "auxiliary_loss_clip": 0.0165929, "auxiliary_loss_mlp": 0.00347903, "balance_loss_clip": 1.3098222, "balance_loss_mlp": 0.3076342, "epoch": 0.10485495265293852, "flos": 21573694385280.0, "grad_norm": 3.330117996187679, "language_loss": 0.83216488, "learning_rate": 3.941510228674391e-06, "loss": 0.85223681, "num_input_tokens_seen": 37644110, "router_z_loss_clip": 3.4921875, "router_z_loss_mlp": 0.40283203, "step": 1744, "time_per_iteration": 2.693143367767334 }, { "auxiliary_loss_clip": 0.01654802, "auxiliary_loss_mlp": 0.00318819, "balance_loss_clip": 1.31073022, "balance_loss_mlp": 0.28060049, "epoch": 0.10491507590560649, "flos": 37961923708800.0, "grad_norm": 2.157220932673442, "language_loss": 0.8924619, "learning_rate": 3.941416693065451e-06, "loss": 0.91219819, "num_input_tokens_seen": 37665800, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.38232422, "step": 1745, "time_per_iteration": 2.80916166305542 }, { "auxiliary_loss_clip": 0.01638523, "auxiliary_loss_mlp": 0.00362725, "balance_loss_clip": 1.298347, "balance_loss_mlp": 0.32283789, "epoch": 0.10497519915827447, "flos": 26396282298240.0, "grad_norm": 3.2043601876571843, "language_loss": 0.91712874, "learning_rate": 3.941323083837794e-06, "loss": 0.93714118, "num_input_tokens_seen": 37685095, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.39868164, "step": 1746, "time_per_iteration": 2.7198946475982666 }, { "auxiliary_loss_clip": 0.01652269, "auxiliary_loss_mlp": 0.00349176, "balance_loss_clip": 1.31304872, "balance_loss_mlp": 0.30766743, "epoch": 0.10503532241094243, "flos": 40662190581120.0, "grad_norm": 7.493146446017974, "language_loss": 0.7653659, "learning_rate": 3.941229400994971e-06, "loss": 0.78538036, "num_input_tokens_seen": 37707445, "router_z_loss_clip": 3.39257812, "router_z_loss_mlp": 0.4152832, "step": 1747, "time_per_iteration": 2.8258371353149414 }, { "auxiliary_loss_clip": 0.01635086, "auxiliary_loss_mlp": 0.00327726, "balance_loss_clip": 1.29716003, "balance_loss_mlp": 0.28738546, "epoch": 0.1050954456636104, "flos": 29789409588480.0, "grad_norm": 9.619571107583479, "language_loss": 0.93550444, "learning_rate": 3.941135644540535e-06, "loss": 0.9551326, "num_input_tokens_seen": 37728325, "router_z_loss_clip": 3.3828125, "router_z_loss_mlp": 0.40332031, "step": 1748, "time_per_iteration": 2.7179088592529297 }, { "auxiliary_loss_clip": 0.01642907, "auxiliary_loss_mlp": 0.00319631, "balance_loss_clip": 1.29663587, "balance_loss_mlp": 0.27661985, "epoch": 0.10515556891627838, "flos": 23948754497280.0, "grad_norm": 12.593562486728725, "language_loss": 0.79039842, "learning_rate": 3.941041814478041e-06, "loss": 0.81002378, "num_input_tokens_seen": 37748910, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.43017578, "step": 1749, "time_per_iteration": 4.172596454620361 }, { "auxiliary_loss_clip": 0.01645279, "auxiliary_loss_mlp": 0.00304084, "balance_loss_clip": 1.30683625, "balance_loss_mlp": 0.26481614, "epoch": 0.10521569216894634, "flos": 18259606972800.0, "grad_norm": 15.558889682317055, "language_loss": 0.91207302, "learning_rate": 3.940947910811047e-06, "loss": 0.9315666, "num_input_tokens_seen": 37765745, "router_z_loss_clip": 3.38476562, "router_z_loss_mlp": 0.39282227, "step": 1750, "time_per_iteration": 4.028273582458496 }, { "auxiliary_loss_clip": 0.01658375, "auxiliary_loss_mlp": 0.00295728, "balance_loss_clip": 1.31654418, "balance_loss_mlp": 0.25593632, "epoch": 0.10527581542161431, "flos": 15630909949440.0, "grad_norm": 274.7052338540931, "language_loss": 1.02233803, "learning_rate": 3.940853933543114e-06, "loss": 1.04187894, "num_input_tokens_seen": 37780520, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.39794922, "step": 1751, "time_per_iteration": 2.654942512512207 }, { "auxiliary_loss_clip": 0.01701578, "auxiliary_loss_mlp": 0.00292643, "balance_loss_clip": 1.34819889, "balance_loss_mlp": 0.25361401, "epoch": 0.10533593867428227, "flos": 18296559089280.0, "grad_norm": 33.456783425964694, "language_loss": 0.86443907, "learning_rate": 3.940759882677805e-06, "loss": 0.88438129, "num_input_tokens_seen": 37799515, "router_z_loss_clip": 3.53515625, "router_z_loss_mlp": 0.39038086, "step": 1752, "time_per_iteration": 4.137614727020264 }, { "auxiliary_loss_clip": 0.01711838, "auxiliary_loss_mlp": 0.00286283, "balance_loss_clip": 1.35606897, "balance_loss_mlp": 0.24613284, "epoch": 0.10539606192695025, "flos": 29023219555200.0, "grad_norm": 6.345608017836026, "language_loss": 0.83368587, "learning_rate": 3.940665758218686e-06, "loss": 0.85366714, "num_input_tokens_seen": 37818695, "router_z_loss_clip": 3.55664062, "router_z_loss_mlp": 0.40161133, "step": 1753, "time_per_iteration": 2.6869330406188965 }, { "auxiliary_loss_clip": 0.01729438, "auxiliary_loss_mlp": 0.00299034, "balance_loss_clip": 1.35467029, "balance_loss_mlp": 0.25390178, "epoch": 0.10545618517961822, "flos": 19969313506560.0, "grad_norm": 2.9128279787075866, "language_loss": 0.91589999, "learning_rate": 3.940571560169328e-06, "loss": 0.9361847, "num_input_tokens_seen": 37837860, "router_z_loss_clip": 3.74414062, "router_z_loss_mlp": 0.45117188, "step": 1754, "time_per_iteration": 2.6418986320495605 }, { "auxiliary_loss_clip": 0.01741594, "auxiliary_loss_mlp": 0.00288061, "balance_loss_clip": 1.38498521, "balance_loss_mlp": 0.24485929, "epoch": 0.10551630843228618, "flos": 16143427157760.0, "grad_norm": 8.033709684639213, "language_loss": 0.80845946, "learning_rate": 3.940477288533302e-06, "loss": 0.82875597, "num_input_tokens_seen": 37856260, "router_z_loss_clip": 3.56640625, "router_z_loss_mlp": 0.43188477, "step": 1755, "time_per_iteration": 4.052709341049194 }, { "auxiliary_loss_clip": 0.01757224, "auxiliary_loss_mlp": 0.00287052, "balance_loss_clip": 1.3872968, "balance_loss_mlp": 0.24454184, "epoch": 0.10557643168495416, "flos": 23440115957760.0, "grad_norm": 43.002468541496725, "language_loss": 0.85647726, "learning_rate": 3.940382943314182e-06, "loss": 0.87691998, "num_input_tokens_seen": 37876960, "router_z_loss_clip": 3.70507812, "router_z_loss_mlp": 0.42456055, "step": 1756, "time_per_iteration": 2.6470117568969727 }, { "auxiliary_loss_clip": 0.01752411, "auxiliary_loss_mlp": 0.00268172, "balance_loss_clip": 1.38460732, "balance_loss_mlp": 0.22618632, "epoch": 0.10563655493762213, "flos": 21799034357760.0, "grad_norm": 3.976043726062851, "language_loss": 0.86173648, "learning_rate": 3.940288524515547e-06, "loss": 0.88194227, "num_input_tokens_seen": 37897070, "router_z_loss_clip": 3.67773438, "router_z_loss_mlp": 0.41992188, "step": 1757, "time_per_iteration": 2.6458230018615723 }, { "auxiliary_loss_clip": 0.01759251, "auxiliary_loss_mlp": 0.0028966, "balance_loss_clip": 1.37784207, "balance_loss_mlp": 0.24798459, "epoch": 0.10569667819029009, "flos": 53800863275520.0, "grad_norm": 4.871918184727479, "language_loss": 0.85340738, "learning_rate": 3.940194032140976e-06, "loss": 0.87389648, "num_input_tokens_seen": 37923635, "router_z_loss_clip": 3.81445312, "router_z_loss_mlp": 0.41674805, "step": 1758, "time_per_iteration": 2.9245893955230713 }, { "auxiliary_loss_clip": 0.01737635, "auxiliary_loss_mlp": 0.00281741, "balance_loss_clip": 1.36661971, "balance_loss_mlp": 0.24166249, "epoch": 0.10575680144295807, "flos": 22925515760640.0, "grad_norm": 8.717656175096629, "language_loss": 0.99208629, "learning_rate": 3.940099466194054e-06, "loss": 1.01228011, "num_input_tokens_seen": 37942650, "router_z_loss_clip": 3.71289062, "router_z_loss_mlp": 0.40063477, "step": 1759, "time_per_iteration": 2.6256330013275146 }, { "auxiliary_loss_clip": 0.01713552, "auxiliary_loss_mlp": 0.002799, "balance_loss_clip": 1.35401869, "balance_loss_mlp": 0.24120498, "epoch": 0.10581692469562604, "flos": 14136667148160.0, "grad_norm": 26.46173487594867, "language_loss": 0.86705029, "learning_rate": 3.940004826678365e-06, "loss": 0.88698483, "num_input_tokens_seen": 37960660, "router_z_loss_clip": 3.59375, "router_z_loss_mlp": 0.38647461, "step": 1760, "time_per_iteration": 2.6640896797180176 }, { "auxiliary_loss_clip": 0.0170733, "auxiliary_loss_mlp": 0.00288845, "balance_loss_clip": 1.34804416, "balance_loss_mlp": 0.24976845, "epoch": 0.105877047948294, "flos": 25958674903680.0, "grad_norm": 20.080540378596147, "language_loss": 1.00639331, "learning_rate": 3.939910113597498e-06, "loss": 1.02635515, "num_input_tokens_seen": 37978625, "router_z_loss_clip": 3.59179688, "router_z_loss_mlp": 0.390625, "step": 1761, "time_per_iteration": 2.740861415863037 }, { "auxiliary_loss_clip": 0.01682569, "auxiliary_loss_mlp": 0.00319183, "balance_loss_clip": 1.33622932, "balance_loss_mlp": 0.28191769, "epoch": 0.10593717120096197, "flos": 30664768032000.0, "grad_norm": 87.81189016518454, "language_loss": 0.87708509, "learning_rate": 3.9398153269550464e-06, "loss": 0.89710271, "num_input_tokens_seen": 38000005, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.37231445, "step": 1762, "time_per_iteration": 2.761380195617676 }, { "auxiliary_loss_clip": 0.01863446, "auxiliary_loss_mlp": 0.00117205, "balance_loss_clip": 1.47991347, "balance_loss_mlp": 0.10695274, "epoch": 0.10599729445362994, "flos": 66436682497920.0, "grad_norm": 0.8057422092373937, "language_loss": 0.60650402, "learning_rate": 3.939720466754602e-06, "loss": 0.62631053, "num_input_tokens_seen": 38066165, "router_z_loss_clip": 3.84375, "router_z_loss_mlp": 0.10253906, "step": 1763, "time_per_iteration": 3.274625539779663 }, { "auxiliary_loss_clip": 0.01650158, "auxiliary_loss_mlp": 0.00328583, "balance_loss_clip": 1.30935001, "balance_loss_mlp": 0.29301125, "epoch": 0.10605741770629791, "flos": 23948179879680.0, "grad_norm": 9.202227912979748, "language_loss": 0.87587559, "learning_rate": 3.939625532999763e-06, "loss": 0.8956629, "num_input_tokens_seen": 38086150, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.35571289, "step": 1764, "time_per_iteration": 2.697500467300415 }, { "auxiliary_loss_clip": 0.01641061, "auxiliary_loss_mlp": 0.00299009, "balance_loss_clip": 1.30888319, "balance_loss_mlp": 0.26267403, "epoch": 0.10611754095896588, "flos": 19387524919680.0, "grad_norm": 67.13536636283474, "language_loss": 0.85348725, "learning_rate": 3.9395305256941314e-06, "loss": 0.87288797, "num_input_tokens_seen": 38104205, "router_z_loss_clip": 3.32617188, "router_z_loss_mlp": 0.36328125, "step": 1765, "time_per_iteration": 2.682814359664917 }, { "auxiliary_loss_clip": 0.01625933, "auxiliary_loss_mlp": 0.00317842, "balance_loss_clip": 1.29344821, "balance_loss_mlp": 0.28324705, "epoch": 0.10617766421163385, "flos": 22237755073920.0, "grad_norm": 2.6863882634616036, "language_loss": 0.83934575, "learning_rate": 3.939435444841306e-06, "loss": 0.85878348, "num_input_tokens_seen": 38122005, "router_z_loss_clip": 3.32421875, "router_z_loss_mlp": 0.34594727, "step": 1766, "time_per_iteration": 2.7067835330963135 }, { "auxiliary_loss_clip": 0.015995, "auxiliary_loss_mlp": 0.00323429, "balance_loss_clip": 1.28106248, "balance_loss_mlp": 0.28981227, "epoch": 0.10623778746430182, "flos": 28404407024640.0, "grad_norm": 24.726945863101065, "language_loss": 0.82537717, "learning_rate": 3.939340290444895e-06, "loss": 0.8446064, "num_input_tokens_seen": 38143365, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.3359375, "step": 1767, "time_per_iteration": 2.6797306537628174 }, { "auxiliary_loss_clip": 0.01467311, "auxiliary_loss_mlp": 0.00193077, "balance_loss_clip": 1.24390388, "balance_loss_mlp": 0.17762774, "epoch": 0.10629791071696978, "flos": 64234639221120.0, "grad_norm": 1.1153906039358974, "language_loss": 0.57504141, "learning_rate": 3.939245062508506e-06, "loss": 0.59164524, "num_input_tokens_seen": 38210035, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.15429688, "step": 1768, "time_per_iteration": 3.2208104133605957 }, { "auxiliary_loss_clip": 0.01510407, "auxiliary_loss_mlp": 0.00313131, "balance_loss_clip": 1.19781613, "balance_loss_mlp": 0.28133741, "epoch": 0.10635803396963776, "flos": 22747578762240.0, "grad_norm": 23.22499291320026, "language_loss": 0.91387397, "learning_rate": 3.939149761035749e-06, "loss": 0.93210936, "num_input_tokens_seen": 38231230, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.31774902, "step": 1769, "time_per_iteration": 2.7244365215301514 }, { "auxiliary_loss_clip": 0.01481427, "auxiliary_loss_mlp": 0.00310284, "balance_loss_clip": 1.17454815, "balance_loss_mlp": 0.27723953, "epoch": 0.10641815722230573, "flos": 31395586147200.0, "grad_norm": 2.12958368562284, "language_loss": 0.68461412, "learning_rate": 3.9390543860302395e-06, "loss": 0.70253122, "num_input_tokens_seen": 38253890, "router_z_loss_clip": 3.07617188, "router_z_loss_mlp": 0.33007812, "step": 1770, "time_per_iteration": 3.024176836013794 }, { "auxiliary_loss_clip": 0.01387852, "auxiliary_loss_mlp": 0.00100124, "balance_loss_clip": 1.15792489, "balance_loss_mlp": 0.08419771, "epoch": 0.1064782804749737, "flos": 58552527784320.0, "grad_norm": 0.8959836845230595, "language_loss": 0.57012165, "learning_rate": 3.9389589374955925e-06, "loss": 0.58500135, "num_input_tokens_seen": 38304290, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.15917969, "step": 1771, "time_per_iteration": 3.2143800258636475 }, { "auxiliary_loss_clip": 0.01441214, "auxiliary_loss_mlp": 0.00350587, "balance_loss_clip": 1.13185406, "balance_loss_mlp": 0.3165175, "epoch": 0.10653840372764166, "flos": 23987825516160.0, "grad_norm": 9.729089096675889, "language_loss": 0.94739389, "learning_rate": 3.938863415435429e-06, "loss": 0.96531188, "num_input_tokens_seen": 38324725, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.34082031, "step": 1772, "time_per_iteration": 2.7913882732391357 }, { "auxiliary_loss_clip": 0.01434425, "auxiliary_loss_mlp": 0.00356702, "balance_loss_clip": 1.12479115, "balance_loss_mlp": 0.32270345, "epoch": 0.10659852698030964, "flos": 18294655668480.0, "grad_norm": 12.224388743052401, "language_loss": 0.90300584, "learning_rate": 3.93876781985337e-06, "loss": 0.92091703, "num_input_tokens_seen": 38340735, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.34008789, "step": 1773, "time_per_iteration": 2.698160409927368 }, { "auxiliary_loss_clip": 0.01415572, "auxiliary_loss_mlp": 0.00284989, "balance_loss_clip": 1.11411023, "balance_loss_mlp": 0.25769031, "epoch": 0.1066586502329776, "flos": 32160591031680.0, "grad_norm": 2.034992932550508, "language_loss": 0.92722976, "learning_rate": 3.938672150753041e-06, "loss": 0.94423532, "num_input_tokens_seen": 38361315, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.27294922, "step": 1774, "time_per_iteration": 2.7041797637939453 }, { "auxiliary_loss_clip": 0.01411655, "auxiliary_loss_mlp": 0.00299293, "balance_loss_clip": 1.10590029, "balance_loss_mlp": 0.26753521, "epoch": 0.10671877348564557, "flos": 17785155202560.0, "grad_norm": 194.33838225348612, "language_loss": 0.90750575, "learning_rate": 3.9385764081380704e-06, "loss": 0.92461514, "num_input_tokens_seen": 38377425, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.31762695, "step": 1775, "time_per_iteration": 2.5868654251098633 }, { "auxiliary_loss_clip": 0.01304781, "auxiliary_loss_mlp": 0.00088594, "balance_loss_clip": 1.0759201, "balance_loss_mlp": 0.0785807, "epoch": 0.10677889673831355, "flos": 63510177813120.0, "grad_norm": 0.8286941972070695, "language_loss": 0.57709074, "learning_rate": 3.9384805920120876e-06, "loss": 0.59102452, "num_input_tokens_seen": 38440275, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.10009766, "step": 1776, "time_per_iteration": 3.182332992553711 }, { "auxiliary_loss_clip": 0.01401687, "auxiliary_loss_mlp": 0.00324942, "balance_loss_clip": 1.09254098, "balance_loss_mlp": 0.29034728, "epoch": 0.10683901999098151, "flos": 22017694400640.0, "grad_norm": 3.207762622350708, "language_loss": 0.89230108, "learning_rate": 3.938384702378727e-06, "loss": 0.90956736, "num_input_tokens_seen": 38461820, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.34594727, "step": 1777, "time_per_iteration": 2.6986420154571533 }, { "auxiliary_loss_clip": 0.01389856, "auxiliary_loss_mlp": 0.00287281, "balance_loss_clip": 1.08882618, "balance_loss_mlp": 0.25869507, "epoch": 0.10689914324364948, "flos": 25042952551680.0, "grad_norm": 22.125395543454207, "language_loss": 0.93993527, "learning_rate": 3.938288739241625e-06, "loss": 0.95670664, "num_input_tokens_seen": 38482235, "router_z_loss_clip": 3.00585938, "router_z_loss_mlp": 0.28613281, "step": 1778, "time_per_iteration": 2.6574933528900146 }, { "auxiliary_loss_clip": 0.01384805, "auxiliary_loss_mlp": 0.00315935, "balance_loss_clip": 1.07933569, "balance_loss_mlp": 0.28424942, "epoch": 0.10695926649631746, "flos": 16435129507200.0, "grad_norm": 10.754016814933884, "language_loss": 0.93168962, "learning_rate": 3.938192702604417e-06, "loss": 0.94869697, "num_input_tokens_seen": 38500690, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.31677246, "step": 1779, "time_per_iteration": 2.5876429080963135 }, { "auxiliary_loss_clip": 0.01382948, "auxiliary_loss_mlp": 0.00273382, "balance_loss_clip": 1.08044362, "balance_loss_mlp": 0.24454525, "epoch": 0.10701938974898542, "flos": 16979211792000.0, "grad_norm": 3.873722912315816, "language_loss": 0.73245776, "learning_rate": 3.9380965924707495e-06, "loss": 0.74902105, "num_input_tokens_seen": 38518405, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.2878418, "step": 1780, "time_per_iteration": 2.6332502365112305 }, { "auxiliary_loss_clip": 0.01380599, "auxiliary_loss_mlp": 0.00291917, "balance_loss_clip": 1.07162881, "balance_loss_mlp": 0.26025462, "epoch": 0.10707951300165339, "flos": 15888102307200.0, "grad_norm": 38.296689111228126, "language_loss": 1.00835276, "learning_rate": 3.938000408844265e-06, "loss": 1.02507794, "num_input_tokens_seen": 38535060, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.31640625, "step": 1781, "time_per_iteration": 2.5870883464813232 }, { "auxiliary_loss_clip": 0.01385451, "auxiliary_loss_mlp": 0.00295995, "balance_loss_clip": 1.08034861, "balance_loss_mlp": 0.26299787, "epoch": 0.10713963625432135, "flos": 14247164361600.0, "grad_norm": 4.104005449789552, "language_loss": 0.86690247, "learning_rate": 3.9379041517286105e-06, "loss": 0.88371694, "num_input_tokens_seen": 38552855, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.33007812, "step": 1782, "time_per_iteration": 2.6141035556793213 }, { "auxiliary_loss_clip": 0.01386131, "auxiliary_loss_mlp": 0.00293206, "balance_loss_clip": 1.08323598, "balance_loss_mlp": 0.26094785, "epoch": 0.10719975950698933, "flos": 16756780821120.0, "grad_norm": 71.47114223340824, "language_loss": 0.8806839, "learning_rate": 3.937807821127436e-06, "loss": 0.89747733, "num_input_tokens_seen": 38570075, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.3223877, "step": 1783, "time_per_iteration": 2.7039520740509033 }, { "auxiliary_loss_clip": 0.01389, "auxiliary_loss_mlp": 0.00308699, "balance_loss_clip": 1.08492899, "balance_loss_mlp": 0.27548766, "epoch": 0.1072598827596573, "flos": 22710626645760.0, "grad_norm": 48.84198781954945, "language_loss": 0.97773558, "learning_rate": 3.937711417044395e-06, "loss": 0.99471259, "num_input_tokens_seen": 38587970, "router_z_loss_clip": 3.04296875, "router_z_loss_mlp": 0.33227539, "step": 1784, "time_per_iteration": 2.6580519676208496 }, { "auxiliary_loss_clip": 0.0139393, "auxiliary_loss_mlp": 0.00286184, "balance_loss_clip": 1.08680058, "balance_loss_mlp": 0.25378338, "epoch": 0.10732000601232526, "flos": 23258264376960.0, "grad_norm": 24.873085342389736, "language_loss": 1.11884212, "learning_rate": 3.937614939483143e-06, "loss": 1.13564336, "num_input_tokens_seen": 38605840, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.32421875, "step": 1785, "time_per_iteration": 2.6441102027893066 }, { "auxiliary_loss_clip": 0.01383589, "auxiliary_loss_mlp": 0.00252598, "balance_loss_clip": 1.08451617, "balance_loss_mlp": 0.22392821, "epoch": 0.10738012926499324, "flos": 24207060176640.0, "grad_norm": 10.798155680158187, "language_loss": 0.89920998, "learning_rate": 3.937518388447339e-06, "loss": 0.91557193, "num_input_tokens_seen": 38627070, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.28649902, "step": 1786, "time_per_iteration": 2.672955274581909 }, { "auxiliary_loss_clip": 0.01397942, "auxiliary_loss_mlp": 0.00288884, "balance_loss_clip": 1.0907495, "balance_loss_mlp": 0.25529066, "epoch": 0.1074402525176612, "flos": 20923065383040.0, "grad_norm": 83.39734802068014, "language_loss": 0.869241, "learning_rate": 3.937421763940642e-06, "loss": 0.88610923, "num_input_tokens_seen": 38645840, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.33569336, "step": 1787, "time_per_iteration": 2.6503984928131104 }, { "auxiliary_loss_clip": 0.01374866, "auxiliary_loss_mlp": 0.00308939, "balance_loss_clip": 1.07301688, "balance_loss_mlp": 0.27594215, "epoch": 0.10750037577032917, "flos": 16946928443520.0, "grad_norm": 38.795870721172356, "language_loss": 0.92538303, "learning_rate": 3.937325065966719e-06, "loss": 0.94222105, "num_input_tokens_seen": 38664770, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.33007812, "step": 1788, "time_per_iteration": 2.65669322013855 }, { "auxiliary_loss_clip": 0.01390519, "auxiliary_loss_mlp": 0.00271547, "balance_loss_clip": 1.09290147, "balance_loss_mlp": 0.24086252, "epoch": 0.10756049902299715, "flos": 20266546550400.0, "grad_norm": 2.4419012950144388, "language_loss": 0.8723377, "learning_rate": 3.9372282945292335e-06, "loss": 0.88895833, "num_input_tokens_seen": 38683865, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.30664062, "step": 1789, "time_per_iteration": 2.6223630905151367 }, { "auxiliary_loss_clip": 0.01379965, "auxiliary_loss_mlp": 0.00278745, "balance_loss_clip": 1.07419562, "balance_loss_mlp": 0.24627212, "epoch": 0.10762062227566511, "flos": 23586523793280.0, "grad_norm": 3.529426570472208, "language_loss": 0.86947161, "learning_rate": 3.937131449631859e-06, "loss": 0.88605875, "num_input_tokens_seen": 38702485, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.32470703, "step": 1790, "time_per_iteration": 2.689379930496216 }, { "auxiliary_loss_clip": 0.01386043, "auxiliary_loss_mlp": 0.0029117, "balance_loss_clip": 1.07635701, "balance_loss_mlp": 0.25817335, "epoch": 0.10768074552833308, "flos": 24310626065280.0, "grad_norm": 3.5458341488307172, "language_loss": 0.88728356, "learning_rate": 3.9370345312782645e-06, "loss": 0.90405571, "num_input_tokens_seen": 38722475, "router_z_loss_clip": 3.09570312, "router_z_loss_mlp": 0.32983398, "step": 1791, "time_per_iteration": 4.071853160858154 }, { "auxiliary_loss_clip": 0.01375607, "auxiliary_loss_mlp": 0.00256091, "balance_loss_clip": 1.08217645, "balance_loss_mlp": 0.22559699, "epoch": 0.10774086878100106, "flos": 25299965341440.0, "grad_norm": 47.54786146047729, "language_loss": 0.78196716, "learning_rate": 3.936937539472126e-06, "loss": 0.79828411, "num_input_tokens_seen": 38743285, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.30493164, "step": 1792, "time_per_iteration": 4.2336485385894775 }, { "auxiliary_loss_clip": 0.01377868, "auxiliary_loss_mlp": 0.00309846, "balance_loss_clip": 1.06617689, "balance_loss_mlp": 0.27656317, "epoch": 0.10780099203366902, "flos": 22054035985920.0, "grad_norm": 8.47381438303947, "language_loss": 0.85015178, "learning_rate": 3.9368404742171236e-06, "loss": 0.86702889, "num_input_tokens_seen": 38763035, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.33300781, "step": 1793, "time_per_iteration": 2.6233699321746826 }, { "auxiliary_loss_clip": 0.01395847, "auxiliary_loss_mlp": 0.00268225, "balance_loss_clip": 1.10036004, "balance_loss_mlp": 0.23820816, "epoch": 0.10786111528633699, "flos": 22747471021440.0, "grad_norm": 4.701263541175931, "language_loss": 0.90830886, "learning_rate": 3.936743335516936e-06, "loss": 0.92494953, "num_input_tokens_seen": 38784900, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.30004883, "step": 1794, "time_per_iteration": 4.1916680335998535 }, { "auxiliary_loss_clip": 0.01397203, "auxiliary_loss_mlp": 0.00320966, "balance_loss_clip": 1.08993363, "balance_loss_mlp": 0.28520328, "epoch": 0.10792123853900495, "flos": 20851064570880.0, "grad_norm": 96.65656289813673, "language_loss": 0.82907444, "learning_rate": 3.936646123375246e-06, "loss": 0.84625614, "num_input_tokens_seen": 38804695, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.35766602, "step": 1795, "time_per_iteration": 2.669577121734619 }, { "auxiliary_loss_clip": 0.01403106, "auxiliary_loss_mlp": 0.00275299, "balance_loss_clip": 1.10466099, "balance_loss_mlp": 0.24196775, "epoch": 0.10798136179167293, "flos": 17748705876480.0, "grad_norm": 433.13951906292334, "language_loss": 0.95062566, "learning_rate": 3.936548837795741e-06, "loss": 0.96740961, "num_input_tokens_seen": 38822395, "router_z_loss_clip": 2.98242188, "router_z_loss_mlp": 0.33349609, "step": 1796, "time_per_iteration": 2.5828678607940674 }, { "auxiliary_loss_clip": 0.0142448, "auxiliary_loss_mlp": 0.0028301, "balance_loss_clip": 1.12576127, "balance_loss_mlp": 0.24667516, "epoch": 0.1080414850443409, "flos": 13589639948160.0, "grad_norm": 437.7938292108712, "language_loss": 0.89110261, "learning_rate": 3.936451478782111e-06, "loss": 0.9081775, "num_input_tokens_seen": 38839865, "router_z_loss_clip": 2.99023438, "router_z_loss_mlp": 0.36328125, "step": 1797, "time_per_iteration": 4.118465185165405 }, { "auxiliary_loss_clip": 0.01404743, "auxiliary_loss_mlp": 0.00273057, "balance_loss_clip": 1.1114254, "balance_loss_mlp": 0.24286154, "epoch": 0.10810160829700886, "flos": 16253421580800.0, "grad_norm": 2.3054153173628174, "language_loss": 0.89840937, "learning_rate": 3.936354046338046e-06, "loss": 0.91518736, "num_input_tokens_seen": 38857300, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.30212402, "step": 1798, "time_per_iteration": 2.586261034011841 }, { "auxiliary_loss_clip": 0.01429342, "auxiliary_loss_mlp": 0.00280687, "balance_loss_clip": 1.13677311, "balance_loss_mlp": 0.24566326, "epoch": 0.10816173154967684, "flos": 15158002464000.0, "grad_norm": 2595.5447062725198, "language_loss": 0.96062791, "learning_rate": 3.936256540467242e-06, "loss": 0.97772825, "num_input_tokens_seen": 38874960, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.35009766, "step": 1799, "time_per_iteration": 2.6031785011291504 }, { "auxiliary_loss_clip": 0.01442261, "auxiliary_loss_mlp": 0.00258559, "balance_loss_clip": 1.15770769, "balance_loss_mlp": 0.22849467, "epoch": 0.10822185480234481, "flos": 17785334770560.0, "grad_norm": 98.14875183093585, "language_loss": 0.87112027, "learning_rate": 3.9361589611733955e-06, "loss": 0.88812852, "num_input_tokens_seen": 38893610, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.30078125, "step": 1800, "time_per_iteration": 2.6057732105255127 }, { "auxiliary_loss_clip": 0.01467966, "auxiliary_loss_mlp": 0.00315666, "balance_loss_clip": 1.1698432, "balance_loss_mlp": 0.28092852, "epoch": 0.10828197805501277, "flos": 25556654908800.0, "grad_norm": 6.807032902799721, "language_loss": 0.79651642, "learning_rate": 3.9360613084602075e-06, "loss": 0.81435275, "num_input_tokens_seen": 38913485, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.34741211, "step": 1801, "time_per_iteration": 2.6665329933166504 }, { "auxiliary_loss_clip": 0.01472321, "auxiliary_loss_mlp": 0.00328669, "balance_loss_clip": 1.17027164, "balance_loss_mlp": 0.29364514, "epoch": 0.10834210130768075, "flos": 28984435845120.0, "grad_norm": 2.8411098187515536, "language_loss": 0.73854452, "learning_rate": 3.935963582331381e-06, "loss": 0.75655442, "num_input_tokens_seen": 38935650, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.35009766, "step": 1802, "time_per_iteration": 2.7500131130218506 }, { "auxiliary_loss_clip": 0.01474939, "auxiliary_loss_mlp": 0.00314696, "balance_loss_clip": 1.1837461, "balance_loss_mlp": 0.28236601, "epoch": 0.10840222456034872, "flos": 20264212166400.0, "grad_norm": 34.54018930936695, "language_loss": 0.89551222, "learning_rate": 3.935865782790621e-06, "loss": 0.91340864, "num_input_tokens_seen": 38954130, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.32324219, "step": 1803, "time_per_iteration": 2.604130983352661 }, { "auxiliary_loss_clip": 0.01478203, "auxiliary_loss_mlp": 0.00324797, "balance_loss_clip": 1.17036879, "balance_loss_mlp": 0.28839049, "epoch": 0.10846234781301668, "flos": 19863054097920.0, "grad_norm": 11.917340415905127, "language_loss": 0.97469795, "learning_rate": 3.9357679098416365e-06, "loss": 0.99272788, "num_input_tokens_seen": 38972905, "router_z_loss_clip": 3.08203125, "router_z_loss_mlp": 0.36425781, "step": 1804, "time_per_iteration": 2.6387641429901123 }, { "auxiliary_loss_clip": 0.01496578, "auxiliary_loss_mlp": 0.00342665, "balance_loss_clip": 1.18304646, "balance_loss_mlp": 0.30604413, "epoch": 0.10852247106568465, "flos": 26469037296000.0, "grad_norm": 33.78534704626988, "language_loss": 0.84067535, "learning_rate": 3.935669963488139e-06, "loss": 0.8590678, "num_input_tokens_seen": 38993255, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.36645508, "step": 1805, "time_per_iteration": 2.660825729370117 }, { "auxiliary_loss_clip": 0.01508224, "auxiliary_loss_mlp": 0.00353214, "balance_loss_clip": 1.20527029, "balance_loss_mlp": 0.31783253, "epoch": 0.10858259431835263, "flos": 30081506987520.0, "grad_norm": 6.971086073440231, "language_loss": 0.92360806, "learning_rate": 3.935571943733843e-06, "loss": 0.94222248, "num_input_tokens_seen": 39012610, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.35424805, "step": 1806, "time_per_iteration": 2.8033337593078613 }, { "auxiliary_loss_clip": 0.01484505, "auxiliary_loss_mlp": 0.00304765, "balance_loss_clip": 1.19085264, "balance_loss_mlp": 0.26876372, "epoch": 0.10864271757102059, "flos": 19063180085760.0, "grad_norm": 27.975771254139342, "language_loss": 0.89652562, "learning_rate": 3.9354738505824635e-06, "loss": 0.91441834, "num_input_tokens_seen": 39030120, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.35986328, "step": 1807, "time_per_iteration": 2.603841781616211 }, { "auxiliary_loss_clip": 0.01510018, "auxiliary_loss_mlp": 0.00305211, "balance_loss_clip": 1.20350218, "balance_loss_mlp": 0.27025837, "epoch": 0.10870284082368856, "flos": 24715052271360.0, "grad_norm": 7.6441670455931385, "language_loss": 0.8495543, "learning_rate": 3.9353756840377225e-06, "loss": 0.8677066, "num_input_tokens_seen": 39049875, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.34960938, "step": 1808, "time_per_iteration": 2.731492042541504 }, { "auxiliary_loss_clip": 0.0151618, "auxiliary_loss_mlp": 0.00316619, "balance_loss_clip": 1.20883238, "balance_loss_mlp": 0.27937794, "epoch": 0.10876296407635654, "flos": 20627663932800.0, "grad_norm": 24.977472720099854, "language_loss": 0.86508191, "learning_rate": 3.935277444103342e-06, "loss": 0.88340986, "num_input_tokens_seen": 39068935, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.37231445, "step": 1809, "time_per_iteration": 2.6619210243225098 }, { "auxiliary_loss_clip": 0.0153999, "auxiliary_loss_mlp": 0.00361241, "balance_loss_clip": 1.2304461, "balance_loss_mlp": 0.32259336, "epoch": 0.1088230873290245, "flos": 21579835610880.0, "grad_norm": 9.73462218335589, "language_loss": 0.9343105, "learning_rate": 3.935179130783046e-06, "loss": 0.95332277, "num_input_tokens_seen": 39087370, "router_z_loss_clip": 3.09570312, "router_z_loss_mlp": 0.38647461, "step": 1810, "time_per_iteration": 2.6820244789123535 }, { "auxiliary_loss_clip": 0.0154735, "auxiliary_loss_mlp": 0.00326603, "balance_loss_clip": 1.23460555, "balance_loss_mlp": 0.2843551, "epoch": 0.10888321058169247, "flos": 26469037296000.0, "grad_norm": 11.394632188952714, "language_loss": 0.73688769, "learning_rate": 3.935080744080564e-06, "loss": 0.75562727, "num_input_tokens_seen": 39106635, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.42236328, "step": 1811, "time_per_iteration": 2.6823573112487793 }, { "auxiliary_loss_clip": 0.01587271, "auxiliary_loss_mlp": 0.00337602, "balance_loss_clip": 1.27420378, "balance_loss_mlp": 0.29480544, "epoch": 0.10894333383436045, "flos": 25848608653440.0, "grad_norm": 1910.3878602498169, "language_loss": 0.82111073, "learning_rate": 3.934982283999626e-06, "loss": 0.84035939, "num_input_tokens_seen": 39126335, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.42797852, "step": 1812, "time_per_iteration": 2.6844756603240967 }, { "auxiliary_loss_clip": 0.015848, "auxiliary_loss_mlp": 0.00343341, "balance_loss_clip": 1.26620245, "balance_loss_mlp": 0.29975769, "epoch": 0.10900345708702841, "flos": 19537093152000.0, "grad_norm": 3.204041933392418, "language_loss": 0.80224597, "learning_rate": 3.934883750543966e-06, "loss": 0.82152736, "num_input_tokens_seen": 39144820, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.4362793, "step": 1813, "time_per_iteration": 2.653700113296509 }, { "auxiliary_loss_clip": 0.0159923, "auxiliary_loss_mlp": 0.00327981, "balance_loss_clip": 1.27784538, "balance_loss_mlp": 0.2868059, "epoch": 0.10906358033969638, "flos": 23623296341760.0, "grad_norm": 5.225977549526657, "language_loss": 0.88955015, "learning_rate": 3.93478514371732e-06, "loss": 0.90882224, "num_input_tokens_seen": 39165945, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.41162109, "step": 1814, "time_per_iteration": 2.692347526550293 }, { "auxiliary_loss_clip": 0.01623156, "auxiliary_loss_mlp": 0.00359196, "balance_loss_clip": 1.29374838, "balance_loss_mlp": 0.31697163, "epoch": 0.10912370359236434, "flos": 21214731818880.0, "grad_norm": 12.541549781049843, "language_loss": 0.93313444, "learning_rate": 3.934686463523429e-06, "loss": 0.95295799, "num_input_tokens_seen": 39183520, "router_z_loss_clip": 3.29882812, "router_z_loss_mlp": 0.42211914, "step": 1815, "time_per_iteration": 2.6972737312316895 }, { "auxiliary_loss_clip": 0.01623073, "auxiliary_loss_mlp": 0.00365482, "balance_loss_clip": 1.29678762, "balance_loss_mlp": 0.32538027, "epoch": 0.10918382684503232, "flos": 13553190622080.0, "grad_norm": 4.264451448830845, "language_loss": 0.81292301, "learning_rate": 3.9345877099660315e-06, "loss": 0.83280861, "num_input_tokens_seen": 39201190, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.40063477, "step": 1816, "time_per_iteration": 2.660128355026245 }, { "auxiliary_loss_clip": 0.01635908, "auxiliary_loss_mlp": 0.00410412, "balance_loss_clip": 1.29705024, "balance_loss_mlp": 0.36952287, "epoch": 0.10924395009770028, "flos": 27964321591680.0, "grad_norm": 4.613955592607701, "language_loss": 0.8324784, "learning_rate": 3.9344888830488744e-06, "loss": 0.85294157, "num_input_tokens_seen": 39221210, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.40893555, "step": 1817, "time_per_iteration": 2.6948177814483643 }, { "auxiliary_loss_clip": 0.01614317, "auxiliary_loss_mlp": 0.00326715, "balance_loss_clip": 1.28303671, "balance_loss_mlp": 0.28713739, "epoch": 0.10930407335036825, "flos": 25593750679680.0, "grad_norm": 4.241227629910632, "language_loss": 0.73909134, "learning_rate": 3.934389982775706e-06, "loss": 0.75850165, "num_input_tokens_seen": 39242025, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.39575195, "step": 1818, "time_per_iteration": 2.640061855316162 }, { "auxiliary_loss_clip": 0.01623247, "auxiliary_loss_mlp": 0.00391462, "balance_loss_clip": 1.28983951, "balance_loss_mlp": 0.34950054, "epoch": 0.10936419660303623, "flos": 18406194376320.0, "grad_norm": 6.573320252767443, "language_loss": 0.80576319, "learning_rate": 3.934291009150275e-06, "loss": 0.82591027, "num_input_tokens_seen": 39259870, "router_z_loss_clip": 3.33789062, "router_z_loss_mlp": 0.41943359, "step": 1819, "time_per_iteration": 2.6848671436309814 }, { "auxiliary_loss_clip": 0.0162179, "auxiliary_loss_mlp": 0.00359719, "balance_loss_clip": 1.29034734, "balance_loss_mlp": 0.32178682, "epoch": 0.1094243198557042, "flos": 23840052963840.0, "grad_norm": 8.12649100918384, "language_loss": 0.81334555, "learning_rate": 3.934191962176335e-06, "loss": 0.8331607, "num_input_tokens_seen": 39278500, "router_z_loss_clip": 3.31835938, "router_z_loss_mlp": 0.37939453, "step": 1820, "time_per_iteration": 2.6963491439819336 }, { "auxiliary_loss_clip": 0.01629089, "auxiliary_loss_mlp": 0.00406508, "balance_loss_clip": 1.29039383, "balance_loss_mlp": 0.3658815, "epoch": 0.10948444310837216, "flos": 14643940970880.0, "grad_norm": 145.2705431942514, "language_loss": 0.9071455, "learning_rate": 3.934092841857642e-06, "loss": 0.92750144, "num_input_tokens_seen": 39294800, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.40625, "step": 1821, "time_per_iteration": 2.6103506088256836 }, { "auxiliary_loss_clip": 0.01604937, "auxiliary_loss_mlp": 0.00399406, "balance_loss_clip": 1.26750946, "balance_loss_mlp": 0.36240363, "epoch": 0.10954456636104014, "flos": 27818811596160.0, "grad_norm": 12.563522602177912, "language_loss": 0.82315767, "learning_rate": 3.933993648197955e-06, "loss": 0.84320116, "num_input_tokens_seen": 39314625, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.37011719, "step": 1822, "time_per_iteration": 2.676347017288208 }, { "auxiliary_loss_clip": 0.01614525, "auxiliary_loss_mlp": 0.00374395, "balance_loss_clip": 1.27959442, "balance_loss_mlp": 0.33820325, "epoch": 0.1096046896137081, "flos": 33620934372480.0, "grad_norm": 89.37402637073949, "language_loss": 0.85689342, "learning_rate": 3.933894381201034e-06, "loss": 0.87678266, "num_input_tokens_seen": 39336465, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.36206055, "step": 1823, "time_per_iteration": 2.744638681411743 }, { "auxiliary_loss_clip": 0.01609962, "auxiliary_loss_mlp": 0.00367224, "balance_loss_clip": 1.27928019, "balance_loss_mlp": 0.33208162, "epoch": 0.10966481286637607, "flos": 26980010219520.0, "grad_norm": 34.91677701077163, "language_loss": 0.85110998, "learning_rate": 3.933795040870645e-06, "loss": 0.87088192, "num_input_tokens_seen": 39357930, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.3515625, "step": 1824, "time_per_iteration": 2.6609127521514893 }, { "auxiliary_loss_clip": 0.01623299, "auxiliary_loss_mlp": 0.00389844, "balance_loss_clip": 1.28175902, "balance_loss_mlp": 0.35176849, "epoch": 0.10972493611904403, "flos": 23036551678080.0, "grad_norm": 9.102866931322795, "language_loss": 0.95714837, "learning_rate": 3.933695627210554e-06, "loss": 0.97727984, "num_input_tokens_seen": 39376380, "router_z_loss_clip": 3.4140625, "router_z_loss_mlp": 0.38061523, "step": 1825, "time_per_iteration": 2.6535887718200684 }, { "auxiliary_loss_clip": 0.01592187, "auxiliary_loss_mlp": 0.00414589, "balance_loss_clip": 1.24365628, "balance_loss_mlp": 0.37737176, "epoch": 0.10978505937171201, "flos": 38104632443520.0, "grad_norm": 7.218148193288993, "language_loss": 0.81585062, "learning_rate": 3.933596140224532e-06, "loss": 0.83591843, "num_input_tokens_seen": 39399935, "router_z_loss_clip": 3.48242188, "router_z_loss_mlp": 0.37231445, "step": 1826, "time_per_iteration": 2.7624993324279785 }, { "auxiliary_loss_clip": 0.01729088, "auxiliary_loss_mlp": 0.00125958, "balance_loss_clip": 1.39071679, "balance_loss_mlp": 0.11441834, "epoch": 0.10984518262437998, "flos": 59849694616320.0, "grad_norm": 0.8109961898326862, "language_loss": 0.55054653, "learning_rate": 3.93349657991635e-06, "loss": 0.56909692, "num_input_tokens_seen": 39460685, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.11523438, "step": 1827, "time_per_iteration": 3.209993600845337 }, { "auxiliary_loss_clip": 0.01706902, "auxiliary_loss_mlp": 0.00121979, "balance_loss_clip": 1.37636495, "balance_loss_mlp": 0.10900941, "epoch": 0.10990530587704794, "flos": 66719837410560.0, "grad_norm": 1.1199575484257127, "language_loss": 0.55318272, "learning_rate": 3.933396946289784e-06, "loss": 0.57147151, "num_input_tokens_seen": 39524765, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.12988281, "step": 1828, "time_per_iteration": 3.1910574436187744 }, { "auxiliary_loss_clip": 0.01566081, "auxiliary_loss_mlp": 0.0036008, "balance_loss_clip": 1.24322033, "balance_loss_mlp": 0.32651085, "epoch": 0.10996542912971592, "flos": 25447199189760.0, "grad_norm": 4.017678019569364, "language_loss": 0.9279502, "learning_rate": 3.933297239348612e-06, "loss": 0.94721174, "num_input_tokens_seen": 39543640, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.33569336, "step": 1829, "time_per_iteration": 2.816012144088745 }, { "auxiliary_loss_clip": 0.01544855, "auxiliary_loss_mlp": 0.00388557, "balance_loss_clip": 1.22472119, "balance_loss_mlp": 0.35300922, "epoch": 0.11002555238238389, "flos": 44018186186880.0, "grad_norm": 173.20911181698102, "language_loss": 0.94620395, "learning_rate": 3.933197459096614e-06, "loss": 0.96553808, "num_input_tokens_seen": 39567525, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.35546875, "step": 1830, "time_per_iteration": 2.882603883743286 }, { "auxiliary_loss_clip": 0.01646177, "auxiliary_loss_mlp": 0.00142622, "balance_loss_clip": 1.34722471, "balance_loss_mlp": 0.12965249, "epoch": 0.11008567563505185, "flos": 54065133590400.0, "grad_norm": 0.6818301990539664, "language_loss": 0.55377603, "learning_rate": 3.9330976055375756e-06, "loss": 0.57166404, "num_input_tokens_seen": 39628470, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.12988281, "step": 1831, "time_per_iteration": 3.1484909057617188 }, { "auxiliary_loss_clip": 0.01522746, "auxiliary_loss_mlp": 0.00376712, "balance_loss_clip": 1.21257639, "balance_loss_mlp": 0.34071082, "epoch": 0.11014579888771983, "flos": 24243150366720.0, "grad_norm": 71.01357958629484, "language_loss": 0.97846997, "learning_rate": 3.932997678675282e-06, "loss": 0.99746454, "num_input_tokens_seen": 39646670, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.36010742, "step": 1832, "time_per_iteration": 2.7099859714508057 }, { "auxiliary_loss_clip": 0.01588613, "auxiliary_loss_mlp": 0.0030973, "balance_loss_clip": 1.31844795, "balance_loss_mlp": 0.29275453, "epoch": 0.1102059221403878, "flos": 57743965658880.0, "grad_norm": 0.7233894276619168, "language_loss": 0.59755969, "learning_rate": 3.932897678513523e-06, "loss": 0.61654305, "num_input_tokens_seen": 39712915, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.16992188, "step": 1833, "time_per_iteration": 3.1736154556274414 }, { "auxiliary_loss_clip": 0.01517251, "auxiliary_loss_mlp": 0.00395085, "balance_loss_clip": 1.20315528, "balance_loss_mlp": 0.35987055, "epoch": 0.11026604539305576, "flos": 16795923667200.0, "grad_norm": 169.53829158831732, "language_loss": 0.9125731, "learning_rate": 3.93279760505609e-06, "loss": 0.93169641, "num_input_tokens_seen": 39730650, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.35180664, "step": 1834, "time_per_iteration": 5.501774311065674 }, { "auxiliary_loss_clip": 0.01503822, "auxiliary_loss_mlp": 0.00385618, "balance_loss_clip": 1.19490921, "balance_loss_mlp": 0.35214394, "epoch": 0.11032616864572373, "flos": 23988076911360.0, "grad_norm": 143.68551988049956, "language_loss": 0.99950361, "learning_rate": 3.932697458306779e-06, "loss": 1.01839805, "num_input_tokens_seen": 39751065, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.3347168, "step": 1835, "time_per_iteration": 2.7429637908935547 }, { "auxiliary_loss_clip": 0.01502036, "auxiliary_loss_mlp": 0.00376736, "balance_loss_clip": 1.20072079, "balance_loss_mlp": 0.34346437, "epoch": 0.1103862918983917, "flos": 19683141851520.0, "grad_norm": 12.775869127227542, "language_loss": 0.73760641, "learning_rate": 3.932597238269386e-06, "loss": 0.75639415, "num_input_tokens_seen": 39769245, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.33239746, "step": 1836, "time_per_iteration": 2.6511008739471436 }, { "auxiliary_loss_clip": 0.01486623, "auxiliary_loss_mlp": 0.00378279, "balance_loss_clip": 1.18981481, "balance_loss_mlp": 0.34370798, "epoch": 0.11044641515105967, "flos": 32160878340480.0, "grad_norm": 32.27895800765118, "language_loss": 0.78578794, "learning_rate": 3.932496944947711e-06, "loss": 0.80443692, "num_input_tokens_seen": 39790830, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.34594727, "step": 1837, "time_per_iteration": 4.077972650527954 }, { "auxiliary_loss_clip": 0.01505597, "auxiliary_loss_mlp": 0.00408937, "balance_loss_clip": 1.2091229, "balance_loss_mlp": 0.37405628, "epoch": 0.11050653840372764, "flos": 16689233295360.0, "grad_norm": 16.859323590609453, "language_loss": 0.86386251, "learning_rate": 3.93239657834556e-06, "loss": 0.88300782, "num_input_tokens_seen": 39809475, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.34887695, "step": 1838, "time_per_iteration": 2.6017768383026123 }, { "auxiliary_loss_clip": 0.01492602, "auxiliary_loss_mlp": 0.00437073, "balance_loss_clip": 1.20116532, "balance_loss_mlp": 0.39968851, "epoch": 0.11056666165639562, "flos": 21208877902080.0, "grad_norm": 17.785028472712543, "language_loss": 0.78523737, "learning_rate": 3.932296138466736e-06, "loss": 0.80453408, "num_input_tokens_seen": 39826355, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.3737793, "step": 1839, "time_per_iteration": 4.129225730895996 }, { "auxiliary_loss_clip": 0.01512603, "auxiliary_loss_mlp": 0.00497813, "balance_loss_clip": 1.20934319, "balance_loss_mlp": 0.45687687, "epoch": 0.11062678490906358, "flos": 19165488998400.0, "grad_norm": 27.369073127944052, "language_loss": 0.85577941, "learning_rate": 3.93219562531505e-06, "loss": 0.87588352, "num_input_tokens_seen": 39845335, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.40917969, "step": 1840, "time_per_iteration": 2.6889336109161377 }, { "auxiliary_loss_clip": 0.01516334, "auxiliary_loss_mlp": 0.00517685, "balance_loss_clip": 1.22048616, "balance_loss_mlp": 0.476915, "epoch": 0.11068690816173155, "flos": 24895287740160.0, "grad_norm": 322.4156717713936, "language_loss": 0.93081105, "learning_rate": 3.932095038894311e-06, "loss": 0.95115125, "num_input_tokens_seen": 39865065, "router_z_loss_clip": 2.95703125, "router_z_loss_mlp": 0.40771484, "step": 1841, "time_per_iteration": 2.674248695373535 }, { "auxiliary_loss_clip": 0.01517465, "auxiliary_loss_mlp": 0.00519808, "balance_loss_clip": 1.21889591, "balance_loss_mlp": 0.47863328, "epoch": 0.11074703141439952, "flos": 16472368932480.0, "grad_norm": 6.001106881364176, "language_loss": 0.97516751, "learning_rate": 3.931994379208334e-06, "loss": 0.99554026, "num_input_tokens_seen": 39882780, "router_z_loss_clip": 2.98828125, "router_z_loss_mlp": 0.41162109, "step": 1842, "time_per_iteration": 2.6109797954559326 }, { "auxiliary_loss_clip": 0.01519917, "auxiliary_loss_mlp": 0.00578539, "balance_loss_clip": 1.22115874, "balance_loss_mlp": 0.53903341, "epoch": 0.11080715466706749, "flos": 19172420323200.0, "grad_norm": 9.11035780431033, "language_loss": 0.92763209, "learning_rate": 3.931893646260937e-06, "loss": 0.94861674, "num_input_tokens_seen": 39900295, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.39501953, "step": 1843, "time_per_iteration": 2.605214834213257 }, { "auxiliary_loss_clip": 0.01525653, "auxiliary_loss_mlp": 0.00557528, "balance_loss_clip": 1.22174239, "balance_loss_mlp": 0.5154947, "epoch": 0.11086727791973545, "flos": 27704687109120.0, "grad_norm": 13.668610342905762, "language_loss": 0.80902016, "learning_rate": 3.931792840055941e-06, "loss": 0.82985198, "num_input_tokens_seen": 39922075, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.42041016, "step": 1844, "time_per_iteration": 2.67594313621521 }, { "auxiliary_loss_clip": 0.01534229, "auxiliary_loss_mlp": 0.00579375, "balance_loss_clip": 1.22458482, "balance_loss_mlp": 0.53436118, "epoch": 0.11092740117240343, "flos": 18514967736960.0, "grad_norm": 2.993921245963958, "language_loss": 0.82913709, "learning_rate": 3.931691960597165e-06, "loss": 0.85027313, "num_input_tokens_seen": 39940115, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.44995117, "step": 1845, "time_per_iteration": 2.5866506099700928 }, { "auxiliary_loss_clip": 0.01550872, "auxiliary_loss_mlp": 0.00626795, "balance_loss_clip": 1.24162066, "balance_loss_mlp": 0.58001685, "epoch": 0.1109875244250714, "flos": 20522446018560.0, "grad_norm": 81.0829551717652, "language_loss": 0.82336473, "learning_rate": 3.9315910078884375e-06, "loss": 0.84514147, "num_input_tokens_seen": 39959920, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.46801758, "step": 1846, "time_per_iteration": 2.699831247329712 }, { "auxiliary_loss_clip": 0.01581326, "auxiliary_loss_mlp": 0.00646595, "balance_loss_clip": 1.25892127, "balance_loss_mlp": 0.59755254, "epoch": 0.11104764767773936, "flos": 14098601710080.0, "grad_norm": 194.61970163304517, "language_loss": 0.95756525, "learning_rate": 3.931489981933584e-06, "loss": 0.97984439, "num_input_tokens_seen": 39974755, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.49023438, "step": 1847, "time_per_iteration": 2.6291682720184326 }, { "auxiliary_loss_clip": 0.01589729, "auxiliary_loss_mlp": 0.00602754, "balance_loss_clip": 1.26432705, "balance_loss_mlp": 0.55511844, "epoch": 0.11110777093040733, "flos": 20594518657920.0, "grad_norm": 12.755682546890265, "language_loss": 0.83710957, "learning_rate": 3.931388882736438e-06, "loss": 0.85903442, "num_input_tokens_seen": 39993355, "router_z_loss_clip": 3.2578125, "router_z_loss_mlp": 0.47558594, "step": 1848, "time_per_iteration": 2.6921894550323486 }, { "auxiliary_loss_clip": 0.01633343, "auxiliary_loss_mlp": 0.0059155, "balance_loss_clip": 1.30315542, "balance_loss_mlp": 0.54868245, "epoch": 0.11116789418307531, "flos": 21870065502720.0, "grad_norm": 79.53701685844075, "language_loss": 0.83417308, "learning_rate": 3.931287710300832e-06, "loss": 0.85642207, "num_input_tokens_seen": 40012410, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.4284668, "step": 1849, "time_per_iteration": 2.6991817951202393 }, { "auxiliary_loss_clip": 0.01635501, "auxiliary_loss_mlp": 0.00625994, "balance_loss_clip": 1.29677737, "balance_loss_mlp": 0.57783389, "epoch": 0.11122801743574327, "flos": 15523106256000.0, "grad_norm": 46.654569461353205, "language_loss": 0.79757488, "learning_rate": 3.931186464630601e-06, "loss": 0.82018977, "num_input_tokens_seen": 40029315, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.48193359, "step": 1850, "time_per_iteration": 2.6482081413269043 }, { "auxiliary_loss_clip": 0.01672229, "auxiliary_loss_mlp": 0.00675386, "balance_loss_clip": 1.32253122, "balance_loss_mlp": 0.62581885, "epoch": 0.11128814068841124, "flos": 14392279307520.0, "grad_norm": 12.87232261205664, "language_loss": 0.90954936, "learning_rate": 3.931085145729588e-06, "loss": 0.93302548, "num_input_tokens_seen": 40045765, "router_z_loss_clip": 3.49609375, "router_z_loss_mlp": 0.49609375, "step": 1851, "time_per_iteration": 2.6153407096862793 }, { "auxiliary_loss_clip": 0.01671796, "auxiliary_loss_mlp": 0.0060093, "balance_loss_clip": 1.32091832, "balance_loss_mlp": 0.55648881, "epoch": 0.11134826394107922, "flos": 16653933204480.0, "grad_norm": 2.9525943341220704, "language_loss": 0.94772923, "learning_rate": 3.930983753601631e-06, "loss": 0.97045648, "num_input_tokens_seen": 40061660, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.44458008, "step": 1852, "time_per_iteration": 2.740330696105957 }, { "auxiliary_loss_clip": 0.01662207, "auxiliary_loss_mlp": 0.00597708, "balance_loss_clip": 1.31359351, "balance_loss_mlp": 0.55569845, "epoch": 0.11140838719374718, "flos": 16690993061760.0, "grad_norm": 25.41324538280786, "language_loss": 0.78251535, "learning_rate": 3.930882288250578e-06, "loss": 0.80511451, "num_input_tokens_seen": 40080180, "router_z_loss_clip": 3.484375, "router_z_loss_mlp": 0.42016602, "step": 1853, "time_per_iteration": 2.6732494831085205 }, { "auxiliary_loss_clip": 0.01703721, "auxiliary_loss_mlp": 0.00332303, "balance_loss_clip": 1.39928019, "balance_loss_mlp": 0.30426455, "epoch": 0.11146851044641515, "flos": 60976355587200.0, "grad_norm": 0.7919067087660426, "language_loss": 0.5370115, "learning_rate": 3.930780749680273e-06, "loss": 0.55737174, "num_input_tokens_seen": 40138910, "router_z_loss_clip": 3.046875, "router_z_loss_mlp": 0.28125, "step": 1854, "time_per_iteration": 3.13718843460083 }, { "auxiliary_loss_clip": 0.0168083, "auxiliary_loss_mlp": 0.00593079, "balance_loss_clip": 1.30855775, "balance_loss_mlp": 0.54303479, "epoch": 0.11152863369908313, "flos": 22193835719040.0, "grad_norm": 2.8997223117781594, "language_loss": 0.9235673, "learning_rate": 3.9306791378945705e-06, "loss": 0.94630641, "num_input_tokens_seen": 40157745, "router_z_loss_clip": 3.72460938, "router_z_loss_mlp": 0.50073242, "step": 1855, "time_per_iteration": 2.6760520935058594 }, { "auxiliary_loss_clip": 0.0166198, "auxiliary_loss_mlp": 0.00545183, "balance_loss_clip": 1.30192661, "balance_loss_mlp": 0.50021702, "epoch": 0.11158875695175109, "flos": 19537524115200.0, "grad_norm": 2.2107208361359683, "language_loss": 0.88537341, "learning_rate": 3.9305774528973205e-06, "loss": 0.90744501, "num_input_tokens_seen": 40175375, "router_z_loss_clip": 3.6015625, "router_z_loss_mlp": 0.44946289, "step": 1856, "time_per_iteration": 2.663083791732788 }, { "auxiliary_loss_clip": 0.01657056, "auxiliary_loss_mlp": 0.00478138, "balance_loss_clip": 1.2954824, "balance_loss_mlp": 0.43968105, "epoch": 0.11164888020441906, "flos": 25442709989760.0, "grad_norm": 4.748360323084261, "language_loss": 0.88185579, "learning_rate": 3.93047569469238e-06, "loss": 0.90320766, "num_input_tokens_seen": 40195715, "router_z_loss_clip": 3.6171875, "router_z_loss_mlp": 0.38476562, "step": 1857, "time_per_iteration": 2.669299840927124 }, { "auxiliary_loss_clip": 0.01643477, "auxiliary_loss_mlp": 0.00491009, "balance_loss_clip": 1.28665543, "balance_loss_mlp": 0.44775957, "epoch": 0.11170900345708702, "flos": 15632741543040.0, "grad_norm": 11.695428267679812, "language_loss": 0.91223681, "learning_rate": 3.930373863283608e-06, "loss": 0.93358159, "num_input_tokens_seen": 40213975, "router_z_loss_clip": 3.5703125, "router_z_loss_mlp": 0.43261719, "step": 1858, "time_per_iteration": 2.620676040649414 }, { "auxiliary_loss_clip": 0.01676075, "auxiliary_loss_mlp": 0.00500113, "balance_loss_clip": 1.30905843, "balance_loss_mlp": 0.45447946, "epoch": 0.111769126709755, "flos": 23039424766080.0, "grad_norm": 17.893183287751707, "language_loss": 0.99418414, "learning_rate": 3.930271958674866e-06, "loss": 1.01594603, "num_input_tokens_seen": 40233905, "router_z_loss_clip": 3.671875, "router_z_loss_mlp": 0.45629883, "step": 1859, "time_per_iteration": 2.7012901306152344 }, { "auxiliary_loss_clip": 0.01676723, "auxiliary_loss_mlp": 0.00477543, "balance_loss_clip": 1.30480719, "balance_loss_mlp": 0.43074188, "epoch": 0.11182924996242297, "flos": 20850705434880.0, "grad_norm": 31.39701333402126, "language_loss": 0.91769266, "learning_rate": 3.930169980870018e-06, "loss": 0.93923533, "num_input_tokens_seen": 40252810, "router_z_loss_clip": 3.72070312, "router_z_loss_mlp": 0.46826172, "step": 1860, "time_per_iteration": 2.631378650665283 }, { "auxiliary_loss_clip": 0.01689087, "auxiliary_loss_mlp": 0.00436957, "balance_loss_clip": 1.3155818, "balance_loss_mlp": 0.39494798, "epoch": 0.11188937321509093, "flos": 17455315587840.0, "grad_norm": 11.278647419398805, "language_loss": 0.83187759, "learning_rate": 3.930067929872931e-06, "loss": 0.85313809, "num_input_tokens_seen": 40272000, "router_z_loss_clip": 3.73632812, "router_z_loss_mlp": 0.42016602, "step": 1861, "time_per_iteration": 2.6338207721710205 }, { "auxiliary_loss_clip": 0.01708253, "auxiliary_loss_mlp": 0.00446855, "balance_loss_clip": 1.31779063, "balance_loss_mlp": 0.40248567, "epoch": 0.11194949646775891, "flos": 24095916518400.0, "grad_norm": 16.661477955858906, "language_loss": 0.95724958, "learning_rate": 3.929965805687474e-06, "loss": 0.97880065, "num_input_tokens_seen": 40290660, "router_z_loss_clip": 3.90429688, "router_z_loss_mlp": 0.44384766, "step": 1862, "time_per_iteration": 2.657116413116455 }, { "auxiliary_loss_clip": 0.01715665, "auxiliary_loss_mlp": 0.00463638, "balance_loss_clip": 1.30590034, "balance_loss_mlp": 0.41898268, "epoch": 0.11200961972042688, "flos": 25153880728320.0, "grad_norm": 441.5929409415987, "language_loss": 0.94876027, "learning_rate": 3.92986360831752e-06, "loss": 0.97055328, "num_input_tokens_seen": 40307820, "router_z_loss_clip": 4.09960938, "router_z_loss_mlp": 0.4465332, "step": 1863, "time_per_iteration": 2.658836841583252 }, { "auxiliary_loss_clip": 0.01763403, "auxiliary_loss_mlp": 0.00459417, "balance_loss_clip": 1.33859301, "balance_loss_mlp": 0.4129256, "epoch": 0.11206974297309484, "flos": 21288312829440.0, "grad_norm": 41.20300317362616, "language_loss": 0.70874095, "learning_rate": 3.929761337766945e-06, "loss": 0.73096919, "num_input_tokens_seen": 40327430, "router_z_loss_clip": 4.2578125, "router_z_loss_mlp": 0.46508789, "step": 1864, "time_per_iteration": 2.616436243057251 }, { "auxiliary_loss_clip": 0.01775251, "auxiliary_loss_mlp": 0.00577093, "balance_loss_clip": 1.33516932, "balance_loss_mlp": 0.52504623, "epoch": 0.11212986622576282, "flos": 18915982151040.0, "grad_norm": 35.183866966243826, "language_loss": 0.80569696, "learning_rate": 3.929658994039627e-06, "loss": 0.82922041, "num_input_tokens_seen": 40344545, "router_z_loss_clip": 4.3984375, "router_z_loss_mlp": 0.52075195, "step": 1865, "time_per_iteration": 2.7222490310668945 }, { "auxiliary_loss_clip": 0.0178983, "auxiliary_loss_mlp": 0.0056548, "balance_loss_clip": 1.33985639, "balance_loss_mlp": 0.50868839, "epoch": 0.11218998947843078, "flos": 22054754257920.0, "grad_norm": 73.71999564243977, "language_loss": 0.94617701, "learning_rate": 3.929556577139446e-06, "loss": 0.96973008, "num_input_tokens_seen": 40362300, "router_z_loss_clip": 4.49609375, "router_z_loss_mlp": 0.56884766, "step": 1866, "time_per_iteration": 2.740328788757324 }, { "auxiliary_loss_clip": 0.01749259, "auxiliary_loss_mlp": 0.00563376, "balance_loss_clip": 1.32157123, "balance_loss_mlp": 0.50603616, "epoch": 0.11225011273109875, "flos": 24571697091840.0, "grad_norm": 12.458967143221573, "language_loss": 0.8782382, "learning_rate": 3.929454087070286e-06, "loss": 0.90136456, "num_input_tokens_seen": 40384720, "router_z_loss_clip": 4.27734375, "router_z_loss_mlp": 0.57299805, "step": 1867, "time_per_iteration": 2.735369920730591 }, { "auxiliary_loss_clip": 0.01803705, "auxiliary_loss_mlp": 0.00568062, "balance_loss_clip": 1.34728563, "balance_loss_mlp": 0.51391745, "epoch": 0.11231023598376672, "flos": 28438665621120.0, "grad_norm": 19.127531992181996, "language_loss": 0.92490119, "learning_rate": 3.929351523836035e-06, "loss": 0.94861877, "num_input_tokens_seen": 40404000, "router_z_loss_clip": 4.5625, "router_z_loss_mlp": 0.54150391, "step": 1868, "time_per_iteration": 2.736497163772583 }, { "auxiliary_loss_clip": 0.01778644, "auxiliary_loss_mlp": 0.00575402, "balance_loss_clip": 1.34154415, "balance_loss_mlp": 0.52395177, "epoch": 0.1123703592364347, "flos": 14426466076800.0, "grad_norm": 6.254195777842118, "language_loss": 0.75587708, "learning_rate": 3.9292488874405795e-06, "loss": 0.77941757, "num_input_tokens_seen": 40418665, "router_z_loss_clip": 4.3671875, "router_z_loss_mlp": 0.51391602, "step": 1869, "time_per_iteration": 2.5943546295166016 }, { "auxiliary_loss_clip": 0.01761751, "auxiliary_loss_mlp": 0.00573342, "balance_loss_clip": 1.32416689, "balance_loss_mlp": 0.51802856, "epoch": 0.11243048248910266, "flos": 22236282616320.0, "grad_norm": 12.641171635282825, "language_loss": 0.83496541, "learning_rate": 3.929146177887814e-06, "loss": 0.8583163, "num_input_tokens_seen": 40437870, "router_z_loss_clip": 4.37890625, "router_z_loss_mlp": 0.55297852, "step": 1870, "time_per_iteration": 2.7942755222320557 }, { "auxiliary_loss_clip": 0.01755736, "auxiliary_loss_mlp": 0.00585868, "balance_loss_clip": 1.32654965, "balance_loss_mlp": 0.52788508, "epoch": 0.11249060574177062, "flos": 18584167288320.0, "grad_norm": 10.180587496262952, "language_loss": 0.85909432, "learning_rate": 3.929043395181631e-06, "loss": 0.8825103, "num_input_tokens_seen": 40455570, "router_z_loss_clip": 4.296875, "router_z_loss_mlp": 0.58007812, "step": 1871, "time_per_iteration": 2.6427886486053467 }, { "auxiliary_loss_clip": 0.01754863, "auxiliary_loss_mlp": 0.00633823, "balance_loss_clip": 1.32364035, "balance_loss_mlp": 0.57641137, "epoch": 0.1125507289944386, "flos": 22856567604480.0, "grad_norm": 4.382185254390835, "language_loss": 0.88828522, "learning_rate": 3.928940539325929e-06, "loss": 0.91217208, "num_input_tokens_seen": 40473600, "router_z_loss_clip": 4.3125, "router_z_loss_mlp": 0.57348633, "step": 1872, "time_per_iteration": 2.7204928398132324 }, { "auxiliary_loss_clip": 0.01759948, "auxiliary_loss_mlp": 0.00627755, "balance_loss_clip": 1.3266809, "balance_loss_mlp": 0.5666244, "epoch": 0.11261085224710657, "flos": 19676390094720.0, "grad_norm": 18.195923961961665, "language_loss": 0.87984651, "learning_rate": 3.9288376103246095e-06, "loss": 0.9037236, "num_input_tokens_seen": 40490025, "router_z_loss_clip": 4.33203125, "router_z_loss_mlp": 0.61132812, "step": 1873, "time_per_iteration": 2.6613609790802 }, { "auxiliary_loss_clip": 0.0175678, "auxiliary_loss_mlp": 0.00652545, "balance_loss_clip": 1.32274449, "balance_loss_mlp": 0.58898246, "epoch": 0.11267097549977453, "flos": 26063246373120.0, "grad_norm": 113.09618485303261, "language_loss": 0.9674232, "learning_rate": 3.928734608181575e-06, "loss": 0.99151647, "num_input_tokens_seen": 40511580, "router_z_loss_clip": 4.33984375, "router_z_loss_mlp": 0.63623047, "step": 1874, "time_per_iteration": 2.7091379165649414 }, { "auxiliary_loss_clip": 0.01720419, "auxiliary_loss_mlp": 0.00590552, "balance_loss_clip": 1.30049121, "balance_loss_mlp": 0.53426129, "epoch": 0.11273109875244251, "flos": 21068036674560.0, "grad_norm": 4.882780919674207, "language_loss": 0.79103369, "learning_rate": 3.928631532900729e-06, "loss": 0.81414342, "num_input_tokens_seen": 40530155, "router_z_loss_clip": 4.19921875, "router_z_loss_mlp": 0.56298828, "step": 1875, "time_per_iteration": 2.6748838424682617 }, { "auxiliary_loss_clip": 0.01755929, "auxiliary_loss_mlp": 0.00657582, "balance_loss_clip": 1.3427825, "balance_loss_mlp": 0.60045737, "epoch": 0.11279122200511048, "flos": 27088999061760.0, "grad_norm": 118.5655034890092, "language_loss": 0.77524608, "learning_rate": 3.928528384485984e-06, "loss": 0.79938114, "num_input_tokens_seen": 40549500, "router_z_loss_clip": 4.12890625, "router_z_loss_mlp": 0.57202148, "step": 1876, "time_per_iteration": 5.505356550216675 }, { "auxiliary_loss_clip": 0.01728044, "auxiliary_loss_mlp": 0.00613004, "balance_loss_clip": 1.3276279, "balance_loss_mlp": 0.55721432, "epoch": 0.11285134525777844, "flos": 20187901722240.0, "grad_norm": 15.4388596520176, "language_loss": 0.82620615, "learning_rate": 3.9284251629412475e-06, "loss": 0.84961665, "num_input_tokens_seen": 40567475, "router_z_loss_clip": 4.00195312, "router_z_loss_mlp": 0.55834961, "step": 1877, "time_per_iteration": 2.650360584259033 }, { "auxiliary_loss_clip": 0.01717025, "auxiliary_loss_mlp": 0.00674631, "balance_loss_clip": 1.3061887, "balance_loss_mlp": 0.61454976, "epoch": 0.11291146851044641, "flos": 12458453863680.0, "grad_norm": 75.81159603542575, "language_loss": 0.95667845, "learning_rate": 3.928321868270436e-06, "loss": 0.98059499, "num_input_tokens_seen": 40583280, "router_z_loss_clip": 4.1015625, "router_z_loss_mlp": 0.60107422, "step": 1878, "time_per_iteration": 2.794698715209961 }, { "auxiliary_loss_clip": 0.01724233, "auxiliary_loss_mlp": 0.00619817, "balance_loss_clip": 1.32018054, "balance_loss_mlp": 0.56347901, "epoch": 0.11297159176311439, "flos": 23842315520640.0, "grad_norm": 18.391473131070985, "language_loss": 0.90035439, "learning_rate": 3.928218500477466e-06, "loss": 0.92379487, "num_input_tokens_seen": 40603080, "router_z_loss_clip": 4.03320312, "router_z_loss_mlp": 0.56323242, "step": 1879, "time_per_iteration": 4.118160009384155 }, { "auxiliary_loss_clip": 0.01733838, "auxiliary_loss_mlp": 0.00702556, "balance_loss_clip": 1.32535028, "balance_loss_mlp": 0.63765836, "epoch": 0.11303171501578235, "flos": 29930538124800.0, "grad_norm": 64.92923010692152, "language_loss": 0.77249712, "learning_rate": 3.928115059566259e-06, "loss": 0.79686105, "num_input_tokens_seen": 40623255, "router_z_loss_clip": 4.08007812, "router_z_loss_mlp": 0.64868164, "step": 1880, "time_per_iteration": 2.703927516937256 }, { "auxiliary_loss_clip": 0.01743003, "auxiliary_loss_mlp": 0.00640585, "balance_loss_clip": 1.33831167, "balance_loss_mlp": 0.58355534, "epoch": 0.11309183826845032, "flos": 16180558842240.0, "grad_norm": 26.446195002159403, "language_loss": 0.78471828, "learning_rate": 3.928011545540734e-06, "loss": 0.80855417, "num_input_tokens_seen": 40641570, "router_z_loss_clip": 4.046875, "router_z_loss_mlp": 0.57006836, "step": 1881, "time_per_iteration": 4.1128716468811035 }, { "auxiliary_loss_clip": 0.01757191, "auxiliary_loss_mlp": 0.0065549, "balance_loss_clip": 1.33913684, "balance_loss_mlp": 0.59235662, "epoch": 0.1131519615211183, "flos": 12020702814720.0, "grad_norm": 51.601637197549564, "language_loss": 0.81918985, "learning_rate": 3.927907958404819e-06, "loss": 0.84331667, "num_input_tokens_seen": 40658775, "router_z_loss_clip": 4.18359375, "router_z_loss_mlp": 0.63110352, "step": 1882, "time_per_iteration": 2.617844820022583 }, { "auxiliary_loss_clip": 0.01741314, "auxiliary_loss_mlp": 0.00641319, "balance_loss_clip": 1.34026408, "balance_loss_mlp": 0.57809085, "epoch": 0.11321208477378626, "flos": 26250125857920.0, "grad_norm": 8.47375127554542, "language_loss": 0.8819741, "learning_rate": 3.92780429816244e-06, "loss": 0.90580046, "num_input_tokens_seen": 40679555, "router_z_loss_clip": 4.00585938, "router_z_loss_mlp": 0.63183594, "step": 1883, "time_per_iteration": 2.688058376312256 }, { "auxiliary_loss_clip": 0.01720172, "auxiliary_loss_mlp": 0.00604242, "balance_loss_clip": 1.32307911, "balance_loss_mlp": 0.54726005, "epoch": 0.11327220802645423, "flos": 13626376583040.0, "grad_norm": 15.547420083553092, "language_loss": 0.85141528, "learning_rate": 3.927700564817529e-06, "loss": 0.87465948, "num_input_tokens_seen": 40697295, "router_z_loss_clip": 3.97460938, "router_z_loss_mlp": 0.56982422, "step": 1884, "time_per_iteration": 2.6467769145965576 }, { "auxiliary_loss_clip": 0.01853828, "auxiliary_loss_mlp": 0.00428225, "balance_loss_clip": 1.50167513, "balance_loss_mlp": 0.40209466, "epoch": 0.1133323312791222, "flos": 57191802814080.0, "grad_norm": 0.7989833517751878, "language_loss": 0.55413187, "learning_rate": 3.927596758374019e-06, "loss": 0.5769524, "num_input_tokens_seen": 40758095, "router_z_loss_clip": 3.53125, "router_z_loss_mlp": 0.26171875, "step": 1885, "time_per_iteration": 3.01548433303833 }, { "auxiliary_loss_clip": 0.01714426, "auxiliary_loss_mlp": 0.00627603, "balance_loss_clip": 1.32890725, "balance_loss_mlp": 0.57138395, "epoch": 0.11339245453179017, "flos": 24351708245760.0, "grad_norm": 104.7832019248302, "language_loss": 0.96166122, "learning_rate": 3.927492878835848e-06, "loss": 0.98508149, "num_input_tokens_seen": 40777140, "router_z_loss_clip": 3.8515625, "router_z_loss_mlp": 0.56201172, "step": 1886, "time_per_iteration": 2.657252073287964 }, { "auxiliary_loss_clip": 0.0171322, "auxiliary_loss_mlp": 0.00609814, "balance_loss_clip": 1.33694005, "balance_loss_mlp": 0.55609787, "epoch": 0.11345257778445814, "flos": 22670693700480.0, "grad_norm": 23.56868225058732, "language_loss": 0.90837932, "learning_rate": 3.927388926206953e-06, "loss": 0.93160963, "num_input_tokens_seen": 40797505, "router_z_loss_clip": 3.75976562, "router_z_loss_mlp": 0.53735352, "step": 1887, "time_per_iteration": 2.655362129211426 }, { "auxiliary_loss_clip": 0.01700472, "auxiliary_loss_mlp": 0.00644564, "balance_loss_clip": 1.32485676, "balance_loss_mlp": 0.58615154, "epoch": 0.11351270103712612, "flos": 20988242611200.0, "grad_norm": 5.144868233175181, "language_loss": 0.86318523, "learning_rate": 3.927284900491277e-06, "loss": 0.8866356, "num_input_tokens_seen": 40812970, "router_z_loss_clip": 3.75976562, "router_z_loss_mlp": 0.58496094, "step": 1888, "time_per_iteration": 2.6479663848876953 }, { "auxiliary_loss_clip": 0.01691717, "auxiliary_loss_mlp": 0.00617442, "balance_loss_clip": 1.32095528, "balance_loss_mlp": 0.55931532, "epoch": 0.11357282428979408, "flos": 37347923600640.0, "grad_norm": 4.053085303352004, "language_loss": 0.73163694, "learning_rate": 3.927180801692764e-06, "loss": 0.75472856, "num_input_tokens_seen": 40837745, "router_z_loss_clip": 3.70507812, "router_z_loss_mlp": 0.58154297, "step": 1889, "time_per_iteration": 2.842867851257324 }, { "auxiliary_loss_clip": 0.01711575, "auxiliary_loss_mlp": 0.00616312, "balance_loss_clip": 1.34253609, "balance_loss_mlp": 0.55911577, "epoch": 0.11363294754246205, "flos": 21757018423680.0, "grad_norm": 5.719702482591755, "language_loss": 0.8865037, "learning_rate": 3.927076629815362e-06, "loss": 0.90978259, "num_input_tokens_seen": 40856490, "router_z_loss_clip": 3.69140625, "router_z_loss_mlp": 0.57250977, "step": 1890, "time_per_iteration": 2.7731456756591797 }, { "auxiliary_loss_clip": 0.01707824, "auxiliary_loss_mlp": 0.00643546, "balance_loss_clip": 1.33679557, "balance_loss_mlp": 0.58508623, "epoch": 0.11369307079513001, "flos": 22601637803520.0, "grad_norm": 18.007245552310994, "language_loss": 0.7180357, "learning_rate": 3.926972384863022e-06, "loss": 0.74154943, "num_input_tokens_seen": 40874070, "router_z_loss_clip": 3.71289062, "router_z_loss_mlp": 0.5847168, "step": 1891, "time_per_iteration": 2.667884111404419 }, { "auxiliary_loss_clip": 0.01709949, "auxiliary_loss_mlp": 0.00653874, "balance_loss_clip": 1.33724332, "balance_loss_mlp": 0.58935803, "epoch": 0.11375319404779799, "flos": 21944257044480.0, "grad_norm": 82.69689388177135, "language_loss": 0.93715262, "learning_rate": 3.9268680668396956e-06, "loss": 0.96079087, "num_input_tokens_seen": 40892425, "router_z_loss_clip": 3.72851562, "router_z_loss_mlp": 0.64501953, "step": 1892, "time_per_iteration": 2.671290159225464 }, { "auxiliary_loss_clip": 0.01704305, "auxiliary_loss_mlp": 0.00636287, "balance_loss_clip": 1.32284474, "balance_loss_mlp": 0.57000637, "epoch": 0.11381331730046595, "flos": 26395456285440.0, "grad_norm": 5.3375868111432805, "language_loss": 0.80255097, "learning_rate": 3.926763675749339e-06, "loss": 0.82595688, "num_input_tokens_seen": 40912190, "router_z_loss_clip": 3.81640625, "router_z_loss_mlp": 0.66308594, "step": 1893, "time_per_iteration": 2.6738500595092773 }, { "auxiliary_loss_clip": 0.01689084, "auxiliary_loss_mlp": 0.00627885, "balance_loss_clip": 1.3247788, "balance_loss_mlp": 0.56541908, "epoch": 0.11387344055313392, "flos": 23804716959360.0, "grad_norm": 23.366490131309735, "language_loss": 0.85045999, "learning_rate": 3.92665921159591e-06, "loss": 0.87362969, "num_input_tokens_seen": 40928395, "router_z_loss_clip": 3.6484375, "router_z_loss_mlp": 0.625, "step": 1894, "time_per_iteration": 2.6925628185272217 }, { "auxiliary_loss_clip": 0.01688916, "auxiliary_loss_mlp": 0.00613962, "balance_loss_clip": 1.32583427, "balance_loss_mlp": 0.55650365, "epoch": 0.1139335638058019, "flos": 34522865902080.0, "grad_norm": 15.132480094852998, "language_loss": 0.86617565, "learning_rate": 3.926554674383371e-06, "loss": 0.88920438, "num_input_tokens_seen": 40946555, "router_z_loss_clip": 3.63671875, "router_z_loss_mlp": 0.57421875, "step": 1895, "time_per_iteration": 2.763258934020996 }, { "auxiliary_loss_clip": 0.01990852, "auxiliary_loss_mlp": 0.00241221, "balance_loss_clip": 1.64146852, "balance_loss_mlp": 0.22081225, "epoch": 0.11399368705846986, "flos": 70587811520640.0, "grad_norm": 0.7855820347153732, "language_loss": 0.6307497, "learning_rate": 3.926450064115686e-06, "loss": 0.65307045, "num_input_tokens_seen": 41004910, "router_z_loss_clip": 3.5, "router_z_loss_mlp": 0.20410156, "step": 1896, "time_per_iteration": 3.2090044021606445 }, { "auxiliary_loss_clip": 0.01671959, "auxiliary_loss_mlp": 0.0055691, "balance_loss_clip": 1.3175621, "balance_loss_mlp": 0.50212157, "epoch": 0.11405381031113783, "flos": 21324259365120.0, "grad_norm": 9.64710605543728, "language_loss": 0.88512725, "learning_rate": 3.926345380796821e-06, "loss": 0.90741593, "num_input_tokens_seen": 41026385, "router_z_loss_clip": 3.54296875, "router_z_loss_mlp": 0.54833984, "step": 1897, "time_per_iteration": 2.728876829147339 }, { "auxiliary_loss_clip": 0.0168213, "auxiliary_loss_mlp": 0.00602323, "balance_loss_clip": 1.32743144, "balance_loss_mlp": 0.54500711, "epoch": 0.11411393356380581, "flos": 19719627091200.0, "grad_norm": 19.42738256911801, "language_loss": 0.86541563, "learning_rate": 3.9262406244307465e-06, "loss": 0.88826019, "num_input_tokens_seen": 41045315, "router_z_loss_clip": 3.54492188, "router_z_loss_mlp": 0.57324219, "step": 1898, "time_per_iteration": 2.7851269245147705 }, { "auxiliary_loss_clip": 0.0166109, "auxiliary_loss_mlp": 0.00588624, "balance_loss_clip": 1.30676842, "balance_loss_mlp": 0.52892387, "epoch": 0.11417405681647377, "flos": 17530440883200.0, "grad_norm": 12.695465204799858, "language_loss": 0.81557614, "learning_rate": 3.926135795021435e-06, "loss": 0.83807325, "num_input_tokens_seen": 41063390, "router_z_loss_clip": 3.54296875, "router_z_loss_mlp": 0.59643555, "step": 1899, "time_per_iteration": 2.766625165939331 }, { "auxiliary_loss_clip": 0.01928492, "auxiliary_loss_mlp": 0.00254966, "balance_loss_clip": 1.60081649, "balance_loss_mlp": 0.23617911, "epoch": 0.11423418006914174, "flos": 59674666619520.0, "grad_norm": 0.9078883697283056, "language_loss": 0.6346494, "learning_rate": 3.92603089257286e-06, "loss": 0.65648389, "num_input_tokens_seen": 41124180, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.1875, "step": 1900, "time_per_iteration": 3.0836589336395264 }, { "auxiliary_loss_clip": 0.01618102, "auxiliary_loss_mlp": 0.00524626, "balance_loss_clip": 1.28376675, "balance_loss_mlp": 0.47391483, "epoch": 0.1142943033218097, "flos": 22963114321920.0, "grad_norm": 13.493780917059022, "language_loss": 0.82974625, "learning_rate": 3.925925917089001e-06, "loss": 0.85117352, "num_input_tokens_seen": 41143485, "router_z_loss_clip": 3.34179688, "router_z_loss_mlp": 0.50732422, "step": 1901, "time_per_iteration": 2.716153621673584 }, { "auxiliary_loss_clip": 0.0162837, "auxiliary_loss_mlp": 0.00571108, "balance_loss_clip": 1.29270256, "balance_loss_mlp": 0.5137918, "epoch": 0.11435442657447768, "flos": 18256267008000.0, "grad_norm": 31.748997825112514, "language_loss": 0.90583575, "learning_rate": 3.925820868573839e-06, "loss": 0.92783058, "num_input_tokens_seen": 41161695, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.57324219, "step": 1902, "time_per_iteration": 2.646496534347534 }, { "auxiliary_loss_clip": 0.01646353, "auxiliary_loss_mlp": 0.00571692, "balance_loss_clip": 1.29822576, "balance_loss_mlp": 0.51342291, "epoch": 0.11441454982714565, "flos": 24061191045120.0, "grad_norm": 54.892278907975, "language_loss": 0.8338443, "learning_rate": 3.925715747031356e-06, "loss": 0.85602474, "num_input_tokens_seen": 41181715, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.58276367, "step": 1903, "time_per_iteration": 2.696735382080078 }, { "auxiliary_loss_clip": 0.01636497, "auxiliary_loss_mlp": 0.00553512, "balance_loss_clip": 1.29948294, "balance_loss_mlp": 0.50232333, "epoch": 0.11447467307981361, "flos": 25337707557120.0, "grad_norm": 770.788206538944, "language_loss": 0.82237184, "learning_rate": 3.925610552465539e-06, "loss": 0.8442719, "num_input_tokens_seen": 41201770, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.51245117, "step": 1904, "time_per_iteration": 2.6629276275634766 }, { "auxiliary_loss_clip": 0.01649494, "auxiliary_loss_mlp": 0.00581103, "balance_loss_clip": 1.31106985, "balance_loss_mlp": 0.52426434, "epoch": 0.11453479633248159, "flos": 21726063878400.0, "grad_norm": 6.886829512253228, "language_loss": 0.98280287, "learning_rate": 3.9255052848803764e-06, "loss": 1.00510895, "num_input_tokens_seen": 41220590, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.56860352, "step": 1905, "time_per_iteration": 2.6310532093048096 }, { "auxiliary_loss_clip": 0.01637985, "auxiliary_loss_mlp": 0.006069, "balance_loss_clip": 1.28378057, "balance_loss_mlp": 0.54362321, "epoch": 0.11459491958514956, "flos": 12969714096000.0, "grad_norm": 15.94529979519172, "language_loss": 0.85887551, "learning_rate": 3.925399944279861e-06, "loss": 0.88132441, "num_input_tokens_seen": 41237250, "router_z_loss_clip": 3.54296875, "router_z_loss_mlp": 0.6328125, "step": 1906, "time_per_iteration": 2.5976357460021973 }, { "auxiliary_loss_clip": 0.01638647, "auxiliary_loss_mlp": 0.0057, "balance_loss_clip": 1.29434359, "balance_loss_mlp": 0.51478261, "epoch": 0.11465504283781752, "flos": 22711273090560.0, "grad_norm": 14.39447499876485, "language_loss": 0.89750141, "learning_rate": 3.925294530667986e-06, "loss": 0.91958791, "num_input_tokens_seen": 41256680, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.55151367, "step": 1907, "time_per_iteration": 2.6454720497131348 }, { "auxiliary_loss_clip": 0.01652733, "auxiliary_loss_mlp": 0.00576222, "balance_loss_clip": 1.30351698, "balance_loss_mlp": 0.51790476, "epoch": 0.1147151660904855, "flos": 23398387332480.0, "grad_norm": 95.03590844576158, "language_loss": 0.90662968, "learning_rate": 3.92518904404875e-06, "loss": 0.92891932, "num_input_tokens_seen": 41270955, "router_z_loss_clip": 3.49414062, "router_z_loss_mlp": 0.58325195, "step": 1908, "time_per_iteration": 2.6210763454437256 }, { "auxiliary_loss_clip": 0.01822972, "auxiliary_loss_mlp": 0.00216322, "balance_loss_clip": 1.52485347, "balance_loss_mlp": 0.19810694, "epoch": 0.11477528934315347, "flos": 63011843498880.0, "grad_norm": 0.9589290721778688, "language_loss": 0.60801494, "learning_rate": 3.925083484426153e-06, "loss": 0.62840784, "num_input_tokens_seen": 41319180, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.18261719, "step": 1909, "time_per_iteration": 2.8814597129821777 }, { "auxiliary_loss_clip": 0.01658055, "auxiliary_loss_mlp": 0.00563737, "balance_loss_clip": 1.31287205, "balance_loss_mlp": 0.50732672, "epoch": 0.11483541259582143, "flos": 16325601960960.0, "grad_norm": 4.404829145885058, "language_loss": 0.84660721, "learning_rate": 3.924977851804197e-06, "loss": 0.86882514, "num_input_tokens_seen": 41337480, "router_z_loss_clip": 3.45117188, "router_z_loss_mlp": 0.56420898, "step": 1910, "time_per_iteration": 2.6967506408691406 }, { "auxiliary_loss_clip": 0.01657105, "auxiliary_loss_mlp": 0.00565699, "balance_loss_clip": 1.30532002, "balance_loss_mlp": 0.51112461, "epoch": 0.1148955358484894, "flos": 21580410228480.0, "grad_norm": 3.520374691284433, "language_loss": 0.82293338, "learning_rate": 3.9248721461868875e-06, "loss": 0.84516138, "num_input_tokens_seen": 41354650, "router_z_loss_clip": 3.515625, "router_z_loss_mlp": 0.54541016, "step": 1911, "time_per_iteration": 2.7137577533721924 }, { "auxiliary_loss_clip": 0.01654663, "auxiliary_loss_mlp": 0.00522755, "balance_loss_clip": 1.31361187, "balance_loss_mlp": 0.4730683, "epoch": 0.11495565910115738, "flos": 27673696650240.0, "grad_norm": 19.24418539694855, "language_loss": 0.83476162, "learning_rate": 3.9247663675782336e-06, "loss": 0.85653585, "num_input_tokens_seen": 41376935, "router_z_loss_clip": 3.41210938, "router_z_loss_mlp": 0.49707031, "step": 1912, "time_per_iteration": 2.7902281284332275 }, { "auxiliary_loss_clip": 0.01650601, "auxiliary_loss_mlp": 0.00515653, "balance_loss_clip": 1.3058989, "balance_loss_mlp": 0.46303397, "epoch": 0.11501578235382534, "flos": 20632368614400.0, "grad_norm": 13.742813178946793, "language_loss": 0.85280716, "learning_rate": 3.924660515982246e-06, "loss": 0.87446976, "num_input_tokens_seen": 41396105, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.52563477, "step": 1913, "time_per_iteration": 2.72263503074646 }, { "auxiliary_loss_clip": 0.0166418, "auxiliary_loss_mlp": 0.00504654, "balance_loss_clip": 1.31831813, "balance_loss_mlp": 0.45275012, "epoch": 0.1150759056064933, "flos": 19829046896640.0, "grad_norm": 94.04147068244441, "language_loss": 0.75963807, "learning_rate": 3.924554591402939e-06, "loss": 0.78132641, "num_input_tokens_seen": 41415600, "router_z_loss_clip": 3.4609375, "router_z_loss_mlp": 0.51928711, "step": 1914, "time_per_iteration": 2.705019235610962 }, { "auxiliary_loss_clip": 0.01787993, "auxiliary_loss_mlp": 0.00353935, "balance_loss_clip": 1.4968679, "balance_loss_mlp": 0.33056957, "epoch": 0.11513602885916129, "flos": 70045776311040.0, "grad_norm": 1.7776036246784046, "language_loss": 0.60993826, "learning_rate": 3.92444859384433e-06, "loss": 0.63135749, "num_input_tokens_seen": 41478760, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.23339844, "step": 1915, "time_per_iteration": 3.258871555328369 }, { "auxiliary_loss_clip": 0.0165776, "auxiliary_loss_mlp": 0.00604279, "balance_loss_clip": 1.30976987, "balance_loss_mlp": 0.54703414, "epoch": 0.11519615211182925, "flos": 15741730385280.0, "grad_norm": 71.99436300110091, "language_loss": 0.99692947, "learning_rate": 3.924342523310436e-06, "loss": 1.01954985, "num_input_tokens_seen": 41495720, "router_z_loss_clip": 3.48046875, "router_z_loss_mlp": 0.57275391, "step": 1916, "time_per_iteration": 2.666714668273926 }, { "auxiliary_loss_clip": 0.01651976, "auxiliary_loss_mlp": 0.00580742, "balance_loss_clip": 1.29288077, "balance_loss_mlp": 0.52347374, "epoch": 0.11525627536449722, "flos": 20667632791680.0, "grad_norm": 2.32259654164924, "language_loss": 0.78835481, "learning_rate": 3.9242363798052806e-06, "loss": 0.81068206, "num_input_tokens_seen": 41513585, "router_z_loss_clip": 3.59375, "router_z_loss_mlp": 0.57275391, "step": 1917, "time_per_iteration": 2.782750129699707 }, { "auxiliary_loss_clip": 0.01668259, "auxiliary_loss_mlp": 0.00625637, "balance_loss_clip": 1.31655335, "balance_loss_mlp": 0.56827319, "epoch": 0.1153163986171652, "flos": 20303283185280.0, "grad_norm": 217.8253763216717, "language_loss": 0.80325037, "learning_rate": 3.92413016333289e-06, "loss": 0.8261894, "num_input_tokens_seen": 41533390, "router_z_loss_clip": 3.51953125, "router_z_loss_mlp": 0.57348633, "step": 1918, "time_per_iteration": 4.023033380508423 }, { "auxiliary_loss_clip": 0.01673993, "auxiliary_loss_mlp": 0.00568665, "balance_loss_clip": 1.31614339, "balance_loss_mlp": 0.51747644, "epoch": 0.11537652186983316, "flos": 17639321984640.0, "grad_norm": 6.428376963377782, "language_loss": 0.92650265, "learning_rate": 3.92402387389729e-06, "loss": 0.94892919, "num_input_tokens_seen": 41551015, "router_z_loss_clip": 3.578125, "router_z_loss_mlp": 0.51196289, "step": 1919, "time_per_iteration": 4.039808750152588 }, { "auxiliary_loss_clip": 0.01662466, "auxiliary_loss_mlp": 0.00585124, "balance_loss_clip": 1.30723834, "balance_loss_mlp": 0.53171861, "epoch": 0.11543664512250112, "flos": 21069401391360.0, "grad_norm": 5.263475273263496, "language_loss": 0.91403365, "learning_rate": 3.923917511502512e-06, "loss": 0.93650955, "num_input_tokens_seen": 41568055, "router_z_loss_clip": 3.55078125, "router_z_loss_mlp": 0.53393555, "step": 1920, "time_per_iteration": 2.697622537612915 }, { "auxiliary_loss_clip": 0.0167394, "auxiliary_loss_mlp": 0.0060343, "balance_loss_clip": 1.3223418, "balance_loss_mlp": 0.55128783, "epoch": 0.11549676837516909, "flos": 22747542848640.0, "grad_norm": 15.797380629692773, "language_loss": 0.85608256, "learning_rate": 3.923811076152589e-06, "loss": 0.8788563, "num_input_tokens_seen": 41587435, "router_z_loss_clip": 3.51757812, "router_z_loss_mlp": 0.52148438, "step": 1921, "time_per_iteration": 4.066251516342163 }, { "auxiliary_loss_clip": 0.01692994, "auxiliary_loss_mlp": 0.00685517, "balance_loss_clip": 1.31157327, "balance_loss_mlp": 0.62359929, "epoch": 0.11555689162783707, "flos": 19168972617600.0, "grad_norm": 2.4748444860897783, "language_loss": 0.83602208, "learning_rate": 3.923704567851557e-06, "loss": 0.85980713, "num_input_tokens_seen": 41604975, "router_z_loss_clip": 3.81054688, "router_z_loss_mlp": 0.62011719, "step": 1922, "time_per_iteration": 2.646246910095215 }, { "auxiliary_loss_clip": 0.01673025, "auxiliary_loss_mlp": 0.00620342, "balance_loss_clip": 1.31596959, "balance_loss_mlp": 0.56428975, "epoch": 0.11561701488050503, "flos": 24572056227840.0, "grad_norm": 4.752854472641086, "language_loss": 0.88896304, "learning_rate": 3.923597986603456e-06, "loss": 0.91189671, "num_input_tokens_seen": 41626155, "router_z_loss_clip": 3.57226562, "router_z_loss_mlp": 0.56030273, "step": 1923, "time_per_iteration": 4.101696729660034 }, { "auxiliary_loss_clip": 0.0169913, "auxiliary_loss_mlp": 0.0063469, "balance_loss_clip": 1.32676733, "balance_loss_mlp": 0.57398909, "epoch": 0.115677138133173, "flos": 17092546179840.0, "grad_norm": 1679.1180069374766, "language_loss": 0.87661779, "learning_rate": 3.9234913324123264e-06, "loss": 0.89995599, "num_input_tokens_seen": 41644805, "router_z_loss_clip": 3.72460938, "router_z_loss_mlp": 0.60644531, "step": 1924, "time_per_iteration": 2.6217706203460693 }, { "auxiliary_loss_clip": 0.01808058, "auxiliary_loss_mlp": 0.0037216, "balance_loss_clip": 1.52008367, "balance_loss_mlp": 0.34412193, "epoch": 0.11573726138584098, "flos": 62703875266560.0, "grad_norm": 0.9361943685531844, "language_loss": 0.60954827, "learning_rate": 3.923384605282212e-06, "loss": 0.63135046, "num_input_tokens_seen": 41709345, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.28125, "step": 1925, "time_per_iteration": 3.1561315059661865 }, { "auxiliary_loss_clip": 0.01732258, "auxiliary_loss_mlp": 0.00641434, "balance_loss_clip": 1.35215044, "balance_loss_mlp": 0.58233035, "epoch": 0.11579738463850894, "flos": 22601135013120.0, "grad_norm": 8.131277558715853, "language_loss": 0.80813849, "learning_rate": 3.923277805217161e-06, "loss": 0.83187544, "num_input_tokens_seen": 41730210, "router_z_loss_clip": 3.80664062, "router_z_loss_mlp": 0.59106445, "step": 1926, "time_per_iteration": 2.7139699459075928 }, { "auxiliary_loss_clip": 0.01771838, "auxiliary_loss_mlp": 0.00616886, "balance_loss_clip": 1.36981416, "balance_loss_mlp": 0.55573165, "epoch": 0.11585750789117691, "flos": 21726135705600.0, "grad_norm": 21.28348240427446, "language_loss": 0.81719959, "learning_rate": 3.923170932221222e-06, "loss": 0.84108686, "num_input_tokens_seen": 41750270, "router_z_loss_clip": 4.01367188, "router_z_loss_mlp": 0.61181641, "step": 1927, "time_per_iteration": 2.6903817653656006 }, { "auxiliary_loss_clip": 0.01769326, "auxiliary_loss_mlp": 0.00656905, "balance_loss_clip": 1.37348104, "balance_loss_mlp": 0.59553653, "epoch": 0.11591763114384489, "flos": 26287544851200.0, "grad_norm": 9.400642227139691, "language_loss": 0.92323732, "learning_rate": 3.92306398629845e-06, "loss": 0.94749963, "num_input_tokens_seen": 41772975, "router_z_loss_clip": 3.9609375, "router_z_loss_mlp": 0.61303711, "step": 1928, "time_per_iteration": 2.7419395446777344 }, { "auxiliary_loss_clip": 0.01768864, "auxiliary_loss_mlp": 0.0066234, "balance_loss_clip": 1.37058663, "balance_loss_mlp": 0.60063696, "epoch": 0.11597775439651285, "flos": 23000461488000.0, "grad_norm": 2.5664928193416374, "language_loss": 0.83891857, "learning_rate": 3.922956967452898e-06, "loss": 0.86323059, "num_input_tokens_seen": 41791765, "router_z_loss_clip": 3.97851562, "router_z_loss_mlp": 0.61645508, "step": 1929, "time_per_iteration": 2.640810966491699 }, { "auxiliary_loss_clip": 0.01761664, "auxiliary_loss_mlp": 0.00665458, "balance_loss_clip": 1.37132096, "balance_loss_mlp": 0.60935843, "epoch": 0.11603787764918082, "flos": 31941715507200.0, "grad_norm": 3.3939980312742817, "language_loss": 0.82377291, "learning_rate": 3.922849875688626e-06, "loss": 0.8480441, "num_input_tokens_seen": 41815615, "router_z_loss_clip": 3.90039062, "router_z_loss_mlp": 0.56054688, "step": 1930, "time_per_iteration": 2.759760856628418 }, { "auxiliary_loss_clip": 0.01766848, "auxiliary_loss_mlp": 0.0062782, "balance_loss_clip": 1.38046765, "balance_loss_mlp": 0.57188749, "epoch": 0.1160980009018488, "flos": 22271654534400.0, "grad_norm": 7.956732632522512, "language_loss": 0.78631318, "learning_rate": 3.922742711009693e-06, "loss": 0.81025982, "num_input_tokens_seen": 41834810, "router_z_loss_clip": 3.86132812, "router_z_loss_mlp": 0.55957031, "step": 1931, "time_per_iteration": 2.6793019771575928 }, { "auxiliary_loss_clip": 0.01774123, "auxiliary_loss_mlp": 0.00670236, "balance_loss_clip": 1.37664711, "balance_loss_mlp": 0.61091757, "epoch": 0.11615812415451676, "flos": 22783633038720.0, "grad_norm": 8.951706299283444, "language_loss": 0.87210429, "learning_rate": 3.922635473420164e-06, "loss": 0.89654791, "num_input_tokens_seen": 41854975, "router_z_loss_clip": 3.96875, "router_z_loss_mlp": 0.59350586, "step": 1932, "time_per_iteration": 2.69088077545166 }, { "auxiliary_loss_clip": 0.01811042, "auxiliary_loss_mlp": 0.00316513, "balance_loss_clip": 1.48718071, "balance_loss_mlp": 0.29171798, "epoch": 0.11621824740718473, "flos": 67146096107520.0, "grad_norm": 0.7796113695804527, "language_loss": 0.6132291, "learning_rate": 3.922528162924105e-06, "loss": 0.63450462, "num_input_tokens_seen": 41911105, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.24707031, "step": 1933, "time_per_iteration": 3.0618674755096436 }, { "auxiliary_loss_clip": 0.0178179, "auxiliary_loss_mlp": 0.00627433, "balance_loss_clip": 1.37709999, "balance_loss_mlp": 0.57021284, "epoch": 0.11627837065985269, "flos": 20375930442240.0, "grad_norm": 11.83009940899843, "language_loss": 0.92728949, "learning_rate": 3.922420779525586e-06, "loss": 0.95138174, "num_input_tokens_seen": 41931750, "router_z_loss_clip": 4.04882812, "router_z_loss_mlp": 0.57250977, "step": 1934, "time_per_iteration": 2.68870210647583 }, { "auxiliary_loss_clip": 0.01790605, "auxiliary_loss_mlp": 0.00698104, "balance_loss_clip": 1.37114429, "balance_loss_mlp": 0.6312272, "epoch": 0.11633849391252067, "flos": 21725812483200.0, "grad_norm": 934.7222755998406, "language_loss": 0.74774718, "learning_rate": 3.9223133232286776e-06, "loss": 0.77263427, "num_input_tokens_seen": 41949400, "router_z_loss_clip": 4.19726562, "router_z_loss_mlp": 0.66796875, "step": 1935, "time_per_iteration": 2.694038152694702 }, { "auxiliary_loss_clip": 0.01779276, "auxiliary_loss_mlp": 0.00668296, "balance_loss_clip": 1.37529993, "balance_loss_mlp": 0.60788059, "epoch": 0.11639861716518864, "flos": 18805341283200.0, "grad_norm": 67.00456238394733, "language_loss": 0.81070447, "learning_rate": 3.922205794037456e-06, "loss": 0.83518022, "num_input_tokens_seen": 41968100, "router_z_loss_clip": 4.04296875, "router_z_loss_mlp": 0.60424805, "step": 1936, "time_per_iteration": 2.6756670475006104 }, { "auxiliary_loss_clip": 0.01771072, "auxiliary_loss_mlp": 0.0064326, "balance_loss_clip": 1.3626188, "balance_loss_mlp": 0.58556259, "epoch": 0.1164587404178566, "flos": 21214983214080.0, "grad_norm": 7.875149828544808, "language_loss": 0.908077, "learning_rate": 3.922098191955998e-06, "loss": 0.93222022, "num_input_tokens_seen": 41986375, "router_z_loss_clip": 4.08984375, "router_z_loss_mlp": 0.57739258, "step": 1937, "time_per_iteration": 2.7393155097961426 }, { "auxiliary_loss_clip": 0.01757528, "auxiliary_loss_mlp": 0.00644985, "balance_loss_clip": 1.36324298, "balance_loss_mlp": 0.58785945, "epoch": 0.11651886367052458, "flos": 27818632028160.0, "grad_norm": 9.316170604279959, "language_loss": 0.81292868, "learning_rate": 3.921990516988384e-06, "loss": 0.83695382, "num_input_tokens_seen": 42006055, "router_z_loss_clip": 3.9453125, "router_z_loss_mlp": 0.57104492, "step": 1938, "time_per_iteration": 2.712216377258301 }, { "auxiliary_loss_clip": 0.01780941, "auxiliary_loss_mlp": 0.00616709, "balance_loss_clip": 1.37645078, "balance_loss_mlp": 0.55963147, "epoch": 0.11657898692319255, "flos": 22889569224960.0, "grad_norm": 30.76918893819415, "language_loss": 0.8424567, "learning_rate": 3.921882769138696e-06, "loss": 0.86643314, "num_input_tokens_seen": 42024995, "router_z_loss_clip": 4.046875, "router_z_loss_mlp": 0.57006836, "step": 1939, "time_per_iteration": 2.7060909271240234 }, { "auxiliary_loss_clip": 0.0177362, "auxiliary_loss_mlp": 0.00643073, "balance_loss_clip": 1.37638533, "balance_loss_mlp": 0.58795083, "epoch": 0.11663911017586051, "flos": 24315905364480.0, "grad_norm": 132.7882142430819, "language_loss": 0.91907585, "learning_rate": 3.9217749484110215e-06, "loss": 0.94324279, "num_input_tokens_seen": 42042640, "router_z_loss_clip": 3.97265625, "router_z_loss_mlp": 0.55126953, "step": 1940, "time_per_iteration": 2.734760046005249 }, { "auxiliary_loss_clip": 0.0178655, "auxiliary_loss_mlp": 0.00650426, "balance_loss_clip": 1.39379084, "balance_loss_mlp": 0.59470731, "epoch": 0.11669923342852849, "flos": 42340152470400.0, "grad_norm": 3.4921947228825023, "language_loss": 0.81201434, "learning_rate": 3.921667054809449e-06, "loss": 0.83638406, "num_input_tokens_seen": 42067005, "router_z_loss_clip": 3.92578125, "router_z_loss_mlp": 0.55688477, "step": 1941, "time_per_iteration": 2.9949352741241455 }, { "auxiliary_loss_clip": 0.01788201, "auxiliary_loss_mlp": 0.00663008, "balance_loss_clip": 1.38701844, "balance_loss_mlp": 0.60678911, "epoch": 0.11675935668119646, "flos": 14642288945280.0, "grad_norm": 7.965972463687534, "language_loss": 0.94957578, "learning_rate": 3.921559088338068e-06, "loss": 0.97408783, "num_input_tokens_seen": 42082295, "router_z_loss_clip": 4.0078125, "router_z_loss_mlp": 0.56225586, "step": 1942, "time_per_iteration": 2.766167640686035 }, { "auxiliary_loss_clip": 0.01799466, "auxiliary_loss_mlp": 0.00577087, "balance_loss_clip": 1.40223622, "balance_loss_mlp": 0.52873528, "epoch": 0.11681947993386442, "flos": 35116470063360.0, "grad_norm": 3.998693573435623, "language_loss": 0.73761117, "learning_rate": 3.921451049000975e-06, "loss": 0.76137674, "num_input_tokens_seen": 42105295, "router_z_loss_clip": 3.97070312, "router_z_loss_mlp": 0.48339844, "step": 1943, "time_per_iteration": 2.8701040744781494 }, { "auxiliary_loss_clip": 0.01788888, "auxiliary_loss_mlp": 0.00625234, "balance_loss_clip": 1.38636732, "balance_loss_mlp": 0.56827545, "epoch": 0.11687960318653239, "flos": 38983259024640.0, "grad_norm": 3.192010214528672, "language_loss": 0.75866193, "learning_rate": 3.921342936802265e-06, "loss": 0.78280318, "num_input_tokens_seen": 42125520, "router_z_loss_clip": 4.02734375, "router_z_loss_mlp": 0.56982422, "step": 1944, "time_per_iteration": 2.966899871826172 }, { "auxiliary_loss_clip": 0.01776145, "auxiliary_loss_mlp": 0.00598364, "balance_loss_clip": 1.38127017, "balance_loss_mlp": 0.54722303, "epoch": 0.11693972643920036, "flos": 25994980575360.0, "grad_norm": 9.65528747835021, "language_loss": 0.87851322, "learning_rate": 3.921234751746038e-06, "loss": 0.9022584, "num_input_tokens_seen": 42146335, "router_z_loss_clip": 3.94726562, "router_z_loss_mlp": 0.51196289, "step": 1945, "time_per_iteration": 2.672325372695923 }, { "auxiliary_loss_clip": 0.01784969, "auxiliary_loss_mlp": 0.00570737, "balance_loss_clip": 1.38640642, "balance_loss_mlp": 0.52112192, "epoch": 0.11699984969186833, "flos": 27272107618560.0, "grad_norm": 1500.2242084542074, "language_loss": 0.81017005, "learning_rate": 3.9211264938363975e-06, "loss": 0.83372712, "num_input_tokens_seen": 42165320, "router_z_loss_clip": 3.98242188, "router_z_loss_mlp": 0.49658203, "step": 1946, "time_per_iteration": 2.752255916595459 }, { "auxiliary_loss_clip": 0.01782918, "auxiliary_loss_mlp": 0.00582788, "balance_loss_clip": 1.39419913, "balance_loss_mlp": 0.53043175, "epoch": 0.1170599729445363, "flos": 15267853232640.0, "grad_norm": 5.313038079143364, "language_loss": 0.74473661, "learning_rate": 3.921018163077448e-06, "loss": 0.76839364, "num_input_tokens_seen": 42182955, "router_z_loss_clip": 3.89257812, "router_z_loss_mlp": 0.52368164, "step": 1947, "time_per_iteration": 2.5974977016448975 }, { "auxiliary_loss_clip": 0.01817315, "auxiliary_loss_mlp": 0.0063317, "balance_loss_clip": 1.40481591, "balance_loss_mlp": 0.57289803, "epoch": 0.11712009619720427, "flos": 17164439251200.0, "grad_norm": 40.21854387612632, "language_loss": 0.89940149, "learning_rate": 3.920909759473295e-06, "loss": 0.92390633, "num_input_tokens_seen": 42200760, "router_z_loss_clip": 4.125, "router_z_loss_mlp": 0.60302734, "step": 1948, "time_per_iteration": 2.664835214614868 }, { "auxiliary_loss_clip": 0.01747261, "auxiliary_loss_mlp": 0.0027056, "balance_loss_clip": 1.48090696, "balance_loss_mlp": 0.25129548, "epoch": 0.11718021944987224, "flos": 70940991997440.0, "grad_norm": 0.829951883030092, "language_loss": 0.65126789, "learning_rate": 3.920801283028054e-06, "loss": 0.67144614, "num_input_tokens_seen": 42265745, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.19238281, "step": 1949, "time_per_iteration": 3.1453447341918945 }, { "auxiliary_loss_clip": 0.01782267, "auxiliary_loss_mlp": 0.00594337, "balance_loss_clip": 1.3876574, "balance_loss_mlp": 0.54250479, "epoch": 0.1172403427025402, "flos": 27453456408960.0, "grad_norm": 4.495927920913043, "language_loss": 0.76340055, "learning_rate": 3.920692733745835e-06, "loss": 0.7871666, "num_input_tokens_seen": 42286245, "router_z_loss_clip": 3.93945312, "router_z_loss_mlp": 0.51806641, "step": 1950, "time_per_iteration": 2.710724115371704 }, { "auxiliary_loss_clip": 0.01821903, "auxiliary_loss_mlp": 0.00621216, "balance_loss_clip": 1.40860891, "balance_loss_mlp": 0.5634473, "epoch": 0.11730046595520818, "flos": 15668723992320.0, "grad_norm": 6.571310751910141, "language_loss": 0.82690823, "learning_rate": 3.920584111630755e-06, "loss": 0.85133946, "num_input_tokens_seen": 42302710, "router_z_loss_clip": 4.140625, "router_z_loss_mlp": 0.57788086, "step": 1951, "time_per_iteration": 2.595916986465454 }, { "auxiliary_loss_clip": 0.01794216, "auxiliary_loss_mlp": 0.00611017, "balance_loss_clip": 1.39229703, "balance_loss_mlp": 0.55587125, "epoch": 0.11736058920787615, "flos": 25630164092160.0, "grad_norm": 3.6415155275570945, "language_loss": 0.82260293, "learning_rate": 3.9204754166869325e-06, "loss": 0.84665525, "num_input_tokens_seen": 42324115, "router_z_loss_clip": 4.0234375, "router_z_loss_mlp": 0.55200195, "step": 1952, "time_per_iteration": 2.681856632232666 }, { "auxiliary_loss_clip": 0.01810857, "auxiliary_loss_mlp": 0.00564085, "balance_loss_clip": 1.40595686, "balance_loss_mlp": 0.51373065, "epoch": 0.11742071246054411, "flos": 21434289701760.0, "grad_norm": 7.896293010906656, "language_loss": 0.78067684, "learning_rate": 3.920366648918491e-06, "loss": 0.80442631, "num_input_tokens_seen": 42342505, "router_z_loss_clip": 4.046875, "router_z_loss_mlp": 0.50390625, "step": 1953, "time_per_iteration": 2.6890859603881836 }, { "auxiliary_loss_clip": 0.01783727, "auxiliary_loss_mlp": 0.00645554, "balance_loss_clip": 1.37344086, "balance_loss_mlp": 0.58556819, "epoch": 0.11748083571321208, "flos": 15997845335040.0, "grad_norm": 212.67865253860748, "language_loss": 0.86848998, "learning_rate": 3.920257808329552e-06, "loss": 0.89278281, "num_input_tokens_seen": 42360525, "router_z_loss_clip": 4.1015625, "router_z_loss_mlp": 0.59985352, "step": 1954, "time_per_iteration": 2.701418161392212 }, { "auxiliary_loss_clip": 0.01798114, "auxiliary_loss_mlp": 0.00609097, "balance_loss_clip": 1.37913465, "balance_loss_mlp": 0.54965913, "epoch": 0.11754095896588006, "flos": 16180056051840.0, "grad_norm": 23.193245867152633, "language_loss": 0.91114515, "learning_rate": 3.920148894924246e-06, "loss": 0.93521726, "num_input_tokens_seen": 42377045, "router_z_loss_clip": 4.19140625, "router_z_loss_mlp": 0.59448242, "step": 1955, "time_per_iteration": 2.7172789573669434 }, { "auxiliary_loss_clip": 0.01800644, "auxiliary_loss_mlp": 0.00592839, "balance_loss_clip": 1.38858283, "balance_loss_mlp": 0.53440285, "epoch": 0.11760108221854802, "flos": 13261596013440.0, "grad_norm": 96.0705487860443, "language_loss": 0.83777106, "learning_rate": 3.920039908706701e-06, "loss": 0.8617059, "num_input_tokens_seen": 42393960, "router_z_loss_clip": 4.11914062, "router_z_loss_mlp": 0.58398438, "step": 1956, "time_per_iteration": 2.653679847717285 }, { "auxiliary_loss_clip": 0.01785293, "auxiliary_loss_mlp": 0.00565304, "balance_loss_clip": 1.39144897, "balance_loss_mlp": 0.5159272, "epoch": 0.11766120547121599, "flos": 24498439303680.0, "grad_norm": 19.250621891688173, "language_loss": 0.85633063, "learning_rate": 3.91993084968105e-06, "loss": 0.87983656, "num_input_tokens_seen": 42413160, "router_z_loss_clip": 3.94335938, "router_z_loss_mlp": 0.4934082, "step": 1957, "time_per_iteration": 2.8523097038269043 }, { "auxiliary_loss_clip": 0.01795964, "auxiliary_loss_mlp": 0.00626798, "balance_loss_clip": 1.38574207, "balance_loss_mlp": 0.56972086, "epoch": 0.11772132872388397, "flos": 17784005967360.0, "grad_norm": 4.7177334437267, "language_loss": 0.83362293, "learning_rate": 3.919821717851428e-06, "loss": 0.85785055, "num_input_tokens_seen": 42432590, "router_z_loss_clip": 4.10546875, "router_z_loss_mlp": 0.57104492, "step": 1958, "time_per_iteration": 2.676231622695923 }, { "auxiliary_loss_clip": 0.01777601, "auxiliary_loss_mlp": 0.00581681, "balance_loss_clip": 1.37786853, "balance_loss_mlp": 0.5239594, "epoch": 0.11778145197655193, "flos": 13217030213760.0, "grad_norm": 47.26295303034747, "language_loss": 0.84330511, "learning_rate": 3.919712513221976e-06, "loss": 0.86689794, "num_input_tokens_seen": 42450135, "router_z_loss_clip": 4.00195312, "router_z_loss_mlp": 0.57763672, "step": 1959, "time_per_iteration": 2.6990253925323486 }, { "auxiliary_loss_clip": 0.01756819, "auxiliary_loss_mlp": 0.00571793, "balance_loss_clip": 1.36498189, "balance_loss_mlp": 0.51667023, "epoch": 0.1178415752292199, "flos": 20230204965120.0, "grad_norm": 86.44643977548232, "language_loss": 0.74515551, "learning_rate": 3.919603235796832e-06, "loss": 0.76844162, "num_input_tokens_seen": 42470050, "router_z_loss_clip": 3.9140625, "router_z_loss_mlp": 0.55078125, "step": 1960, "time_per_iteration": 4.21378231048584 }, { "auxiliary_loss_clip": 0.0176958, "auxiliary_loss_mlp": 0.00537774, "balance_loss_clip": 1.36212695, "balance_loss_mlp": 0.48310435, "epoch": 0.11790169848188788, "flos": 13040134709760.0, "grad_norm": 53.72672242314129, "language_loss": 0.89169872, "learning_rate": 3.9194938855801406e-06, "loss": 0.91477227, "num_input_tokens_seen": 42484335, "router_z_loss_clip": 4.078125, "router_z_loss_mlp": 0.546875, "step": 1961, "time_per_iteration": 4.1107776165008545 }, { "auxiliary_loss_clip": 0.01739942, "auxiliary_loss_mlp": 0.00524803, "balance_loss_clip": 1.35698009, "balance_loss_mlp": 0.47347173, "epoch": 0.11796182173455584, "flos": 22265728790400.0, "grad_norm": 21.791295169491356, "language_loss": 0.98113585, "learning_rate": 3.919384462576049e-06, "loss": 1.00378335, "num_input_tokens_seen": 42502720, "router_z_loss_clip": 3.828125, "router_z_loss_mlp": 0.51293945, "step": 1962, "time_per_iteration": 2.6620912551879883 }, { "auxiliary_loss_clip": 0.01742221, "auxiliary_loss_mlp": 0.00501832, "balance_loss_clip": 1.35679555, "balance_loss_mlp": 0.45269382, "epoch": 0.1180219449872238, "flos": 10635017892480.0, "grad_norm": 509.2482214006204, "language_loss": 0.95044506, "learning_rate": 3.919274966788707e-06, "loss": 0.97288561, "num_input_tokens_seen": 42519460, "router_z_loss_clip": 3.85546875, "router_z_loss_mlp": 0.4909668, "step": 1963, "time_per_iteration": 4.113672256469727 }, { "auxiliary_loss_clip": 0.01741811, "auxiliary_loss_mlp": 0.00489811, "balance_loss_clip": 1.34877682, "balance_loss_mlp": 0.43676245, "epoch": 0.11808206823989177, "flos": 20923532259840.0, "grad_norm": 74.16154417803935, "language_loss": 0.89193356, "learning_rate": 3.919165398222265e-06, "loss": 0.91424978, "num_input_tokens_seen": 42539420, "router_z_loss_clip": 3.92773438, "router_z_loss_mlp": 0.53076172, "step": 1964, "time_per_iteration": 2.692570209503174 }, { "auxiliary_loss_clip": 0.01760356, "auxiliary_loss_mlp": 0.00456978, "balance_loss_clip": 1.36798882, "balance_loss_mlp": 0.41036725, "epoch": 0.11814219149255975, "flos": 20777770869120.0, "grad_norm": 16.87805373458288, "language_loss": 0.89665926, "learning_rate": 3.919055756880879e-06, "loss": 0.91883266, "num_input_tokens_seen": 42558225, "router_z_loss_clip": 3.91601562, "router_z_loss_mlp": 0.46655273, "step": 1965, "time_per_iteration": 4.160141229629517 }, { "auxiliary_loss_clip": 0.01751525, "auxiliary_loss_mlp": 0.00465822, "balance_loss_clip": 1.35395741, "balance_loss_mlp": 0.4149434, "epoch": 0.11820231474522772, "flos": 48759938542080.0, "grad_norm": 3.890609141706635, "language_loss": 0.79338348, "learning_rate": 3.918946042768707e-06, "loss": 0.81555694, "num_input_tokens_seen": 42580790, "router_z_loss_clip": 3.9765625, "router_z_loss_mlp": 0.50952148, "step": 1966, "time_per_iteration": 2.9449069499969482 }, { "auxiliary_loss_clip": 0.01737613, "auxiliary_loss_mlp": 0.00395164, "balance_loss_clip": 1.34824157, "balance_loss_mlp": 0.35143781, "epoch": 0.11826243799789568, "flos": 16690598012160.0, "grad_norm": 11.591240086599106, "language_loss": 0.79381371, "learning_rate": 3.918836255889908e-06, "loss": 0.8151415, "num_input_tokens_seen": 42597355, "router_z_loss_clip": 3.88867188, "router_z_loss_mlp": 0.43774414, "step": 1967, "time_per_iteration": 2.6649765968322754 }, { "auxiliary_loss_clip": 0.017486, "auxiliary_loss_mlp": 0.00413676, "balance_loss_clip": 1.36228669, "balance_loss_mlp": 0.3640846, "epoch": 0.11832256125056366, "flos": 16909868586240.0, "grad_norm": 17.886787894362595, "language_loss": 0.94988602, "learning_rate": 3.9187263962486456e-06, "loss": 0.97150874, "num_input_tokens_seen": 42616060, "router_z_loss_clip": 3.86132812, "router_z_loss_mlp": 0.49584961, "step": 1968, "time_per_iteration": 2.658308506011963 }, { "auxiliary_loss_clip": 0.01720651, "auxiliary_loss_mlp": 0.00395966, "balance_loss_clip": 1.33987451, "balance_loss_mlp": 0.35171592, "epoch": 0.11838268450323162, "flos": 22820405587200.0, "grad_norm": 3.787328807302539, "language_loss": 0.74344444, "learning_rate": 3.918616463849087e-06, "loss": 0.76461065, "num_input_tokens_seen": 42636285, "router_z_loss_clip": 3.80078125, "router_z_loss_mlp": 0.44287109, "step": 1969, "time_per_iteration": 2.7411253452301025 }, { "auxiliary_loss_clip": 0.01707681, "auxiliary_loss_mlp": 0.0043051, "balance_loss_clip": 1.33684444, "balance_loss_mlp": 0.38199186, "epoch": 0.11844280775589959, "flos": 33545844990720.0, "grad_norm": 3.240812274592002, "language_loss": 0.88753033, "learning_rate": 3.918506458695399e-06, "loss": 0.90891218, "num_input_tokens_seen": 42658320, "router_z_loss_clip": 3.7109375, "router_z_loss_mlp": 0.48510742, "step": 1970, "time_per_iteration": 2.7830018997192383 }, { "auxiliary_loss_clip": 0.01604112, "auxiliary_loss_mlp": 0.00237742, "balance_loss_clip": 1.34109199, "balance_loss_mlp": 0.21981309, "epoch": 0.11850293100856757, "flos": 66350998604160.0, "grad_norm": 0.8751610642813445, "language_loss": 0.66168559, "learning_rate": 3.918396380791754e-06, "loss": 0.68010414, "num_input_tokens_seen": 42721500, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.1796875, "step": 1971, "time_per_iteration": 3.124837875366211 }, { "auxiliary_loss_clip": 0.01663295, "auxiliary_loss_mlp": 0.00389748, "balance_loss_clip": 1.30431843, "balance_loss_mlp": 0.34678534, "epoch": 0.11856305426123553, "flos": 24681045070080.0, "grad_norm": 3.3693958968051314, "language_loss": 0.86832565, "learning_rate": 3.918286230142327e-06, "loss": 0.88885611, "num_input_tokens_seen": 42739825, "router_z_loss_clip": 3.59179688, "router_z_loss_mlp": 0.4296875, "step": 1972, "time_per_iteration": 2.7367348670959473 }, { "auxiliary_loss_clip": 0.01642799, "auxiliary_loss_mlp": 0.00378416, "balance_loss_clip": 1.29048038, "balance_loss_mlp": 0.33452311, "epoch": 0.1186231775139035, "flos": 24280102483200.0, "grad_norm": 5.0315733692862175, "language_loss": 0.78670645, "learning_rate": 3.918176006751292e-06, "loss": 0.80691862, "num_input_tokens_seen": 42758695, "router_z_loss_clip": 3.5234375, "router_z_loss_mlp": 0.43920898, "step": 1973, "time_per_iteration": 2.724130153656006 }, { "auxiliary_loss_clip": 0.01635275, "auxiliary_loss_mlp": 0.00389158, "balance_loss_clip": 1.28355837, "balance_loss_mlp": 0.34674379, "epoch": 0.11868330076657148, "flos": 21757413473280.0, "grad_norm": 1.935074741880182, "language_loss": 0.76805389, "learning_rate": 3.918065710622832e-06, "loss": 0.78829819, "num_input_tokens_seen": 42778510, "router_z_loss_clip": 3.515625, "router_z_loss_mlp": 0.42431641, "step": 1974, "time_per_iteration": 2.70682692527771 }, { "auxiliary_loss_clip": 0.01637969, "auxiliary_loss_mlp": 0.00374947, "balance_loss_clip": 1.27883077, "balance_loss_mlp": 0.3297188, "epoch": 0.11874342401923944, "flos": 17193274894080.0, "grad_norm": 89.76861819338684, "language_loss": 0.85188556, "learning_rate": 3.917955341761128e-06, "loss": 0.8720147, "num_input_tokens_seen": 42793995, "router_z_loss_clip": 3.58984375, "router_z_loss_mlp": 0.45214844, "step": 1975, "time_per_iteration": 2.724034309387207 }, { "auxiliary_loss_clip": 0.01611779, "auxiliary_loss_mlp": 0.00362235, "balance_loss_clip": 1.27519798, "balance_loss_mlp": 0.32001099, "epoch": 0.11880354727190741, "flos": 15229572312960.0, "grad_norm": 2.7779441585997975, "language_loss": 0.81700099, "learning_rate": 3.917844900170364e-06, "loss": 0.83674115, "num_input_tokens_seen": 42809000, "router_z_loss_clip": 3.3671875, "router_z_loss_mlp": 0.42211914, "step": 1976, "time_per_iteration": 2.6696720123291016 }, { "auxiliary_loss_clip": 0.01604672, "auxiliary_loss_mlp": 0.00362759, "balance_loss_clip": 1.26026082, "balance_loss_mlp": 0.32120305, "epoch": 0.11886367052457537, "flos": 27309706179840.0, "grad_norm": 1.973244759994322, "language_loss": 0.79469442, "learning_rate": 3.91773438585473e-06, "loss": 0.81436872, "num_input_tokens_seen": 42831585, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.41552734, "step": 1977, "time_per_iteration": 2.8059372901916504 }, { "auxiliary_loss_clip": 0.01618286, "auxiliary_loss_mlp": 0.00394106, "balance_loss_clip": 1.26154613, "balance_loss_mlp": 0.35006988, "epoch": 0.11892379377724335, "flos": 21798280172160.0, "grad_norm": 38.91924888632274, "language_loss": 0.81887102, "learning_rate": 3.9176237988184165e-06, "loss": 0.83899486, "num_input_tokens_seen": 42848420, "router_z_loss_clip": 3.56640625, "router_z_loss_mlp": 0.44042969, "step": 1978, "time_per_iteration": 2.7566325664520264 }, { "auxiliary_loss_clip": 0.01590761, "auxiliary_loss_mlp": 0.00361329, "balance_loss_clip": 1.25682807, "balance_loss_mlp": 0.32122684, "epoch": 0.11898391702991132, "flos": 13991013498240.0, "grad_norm": 2.7428667915447966, "language_loss": 0.78527105, "learning_rate": 3.917513139065616e-06, "loss": 0.80479193, "num_input_tokens_seen": 42866645, "router_z_loss_clip": 3.33789062, "router_z_loss_mlp": 0.40112305, "step": 1979, "time_per_iteration": 2.702584743499756 }, { "auxiliary_loss_clip": 0.01598727, "auxiliary_loss_mlp": 0.00340386, "balance_loss_clip": 1.25963163, "balance_loss_mlp": 0.30078417, "epoch": 0.11904404028257928, "flos": 32234567091840.0, "grad_norm": 10.219824057816563, "language_loss": 1.03051174, "learning_rate": 3.917402406600525e-06, "loss": 1.04990292, "num_input_tokens_seen": 42888515, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.39599609, "step": 1980, "time_per_iteration": 2.774045944213867 }, { "auxiliary_loss_clip": 0.01597991, "auxiliary_loss_mlp": 0.00368582, "balance_loss_clip": 1.25351739, "balance_loss_mlp": 0.32502288, "epoch": 0.11910416353524726, "flos": 23586272398080.0, "grad_norm": 14.630210447654045, "language_loss": 0.91074049, "learning_rate": 3.917291601427342e-06, "loss": 0.93040621, "num_input_tokens_seen": 42909035, "router_z_loss_clip": 3.4453125, "router_z_loss_mlp": 0.43530273, "step": 1981, "time_per_iteration": 2.661867141723633 }, { "auxiliary_loss_clip": 0.01590517, "auxiliary_loss_mlp": 0.00380912, "balance_loss_clip": 1.25586689, "balance_loss_mlp": 0.33833009, "epoch": 0.11916428678791523, "flos": 25333038789120.0, "grad_norm": 166.23384718823678, "language_loss": 0.91056794, "learning_rate": 3.91718072355027e-06, "loss": 0.93028224, "num_input_tokens_seen": 42927555, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.42578125, "step": 1982, "time_per_iteration": 2.7405333518981934 }, { "auxiliary_loss_clip": 0.01581916, "auxiliary_loss_mlp": 0.00370408, "balance_loss_clip": 1.24882925, "balance_loss_mlp": 0.32978135, "epoch": 0.11922441004058319, "flos": 19788431592960.0, "grad_norm": 6.6037418652459285, "language_loss": 0.88808882, "learning_rate": 3.917069772973513e-06, "loss": 0.90761209, "num_input_tokens_seen": 42945300, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.40625, "step": 1983, "time_per_iteration": 2.659723997116089 }, { "auxiliary_loss_clip": 0.01598207, "auxiliary_loss_mlp": 0.00367266, "balance_loss_clip": 1.25513732, "balance_loss_mlp": 0.32642478, "epoch": 0.11928453329325117, "flos": 21536347219200.0, "grad_norm": 32.39621773485324, "language_loss": 0.88360161, "learning_rate": 3.916958749701277e-06, "loss": 0.9032563, "num_input_tokens_seen": 42961295, "router_z_loss_clip": 3.4296875, "router_z_loss_mlp": 0.40820312, "step": 1984, "time_per_iteration": 2.6931474208831787 }, { "auxiliary_loss_clip": 0.01600806, "auxiliary_loss_mlp": 0.00364828, "balance_loss_clip": 1.25914824, "balance_loss_mlp": 0.32274759, "epoch": 0.11934465654591914, "flos": 20815010294400.0, "grad_norm": 2.7800155379269245, "language_loss": 0.88973498, "learning_rate": 3.9168476537377745e-06, "loss": 0.9093914, "num_input_tokens_seen": 42980330, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.4206543, "step": 1985, "time_per_iteration": 2.6295671463012695 }, { "auxiliary_loss_clip": 0.01592769, "auxiliary_loss_mlp": 0.00356639, "balance_loss_clip": 1.26204228, "balance_loss_mlp": 0.31582189, "epoch": 0.1194047797985871, "flos": 19060486565760.0, "grad_norm": 7.719108086373187, "language_loss": 0.8050521, "learning_rate": 3.916736485087216e-06, "loss": 0.8245461, "num_input_tokens_seen": 42996125, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.40820312, "step": 1986, "time_per_iteration": 2.6566152572631836 }, { "auxiliary_loss_clip": 0.01612723, "auxiliary_loss_mlp": 0.00354126, "balance_loss_clip": 1.27331305, "balance_loss_mlp": 0.31416684, "epoch": 0.11946490305125507, "flos": 27190805184000.0, "grad_norm": 78.34466963959444, "language_loss": 0.81263101, "learning_rate": 3.916625243753819e-06, "loss": 0.83229953, "num_input_tokens_seen": 43014180, "router_z_loss_clip": 3.40039062, "router_z_loss_mlp": 0.39941406, "step": 1987, "time_per_iteration": 2.700974225997925 }, { "auxiliary_loss_clip": 0.0162828, "auxiliary_loss_mlp": 0.0043218, "balance_loss_clip": 1.27401686, "balance_loss_mlp": 0.38547349, "epoch": 0.11952502630392305, "flos": 21140791672320.0, "grad_norm": 15.901285822012056, "language_loss": 0.79763025, "learning_rate": 3.916513929741799e-06, "loss": 0.81823486, "num_input_tokens_seen": 43032120, "router_z_loss_clip": 3.54296875, "router_z_loss_mlp": 0.46704102, "step": 1988, "time_per_iteration": 2.6459853649139404 }, { "auxiliary_loss_clip": 0.01614828, "auxiliary_loss_mlp": 0.00385718, "balance_loss_clip": 1.27737701, "balance_loss_mlp": 0.34084803, "epoch": 0.11958514955659101, "flos": 22124241118080.0, "grad_norm": 37.22381269326038, "language_loss": 0.86806595, "learning_rate": 3.91640254305538e-06, "loss": 0.88807142, "num_input_tokens_seen": 43052215, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.44873047, "step": 1989, "time_per_iteration": 2.714582920074463 }, { "auxiliary_loss_clip": 0.01630511, "auxiliary_loss_mlp": 0.00433331, "balance_loss_clip": 1.28364778, "balance_loss_mlp": 0.38862711, "epoch": 0.11964527280925898, "flos": 17421452040960.0, "grad_norm": 276.91924065757047, "language_loss": 0.84253955, "learning_rate": 3.916291083698784e-06, "loss": 0.86317801, "num_input_tokens_seen": 43069720, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.44702148, "step": 1990, "time_per_iteration": 2.668959379196167 }, { "auxiliary_loss_clip": 0.01580699, "auxiliary_loss_mlp": 0.00089546, "balance_loss_clip": 1.37270463, "balance_loss_mlp": 0.08034267, "epoch": 0.11970539606192696, "flos": 70679741402880.0, "grad_norm": 0.8625000334843518, "language_loss": 0.55352938, "learning_rate": 3.916179551676238e-06, "loss": 0.57023185, "num_input_tokens_seen": 43123130, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.09179688, "step": 1991, "time_per_iteration": 3.134777069091797 }, { "auxiliary_loss_clip": 0.01616759, "auxiliary_loss_mlp": 0.00365401, "balance_loss_clip": 1.2847054, "balance_loss_mlp": 0.32782644, "epoch": 0.11976551931459492, "flos": 21215019127680.0, "grad_norm": 3.2941151598836416, "language_loss": 0.84734547, "learning_rate": 3.916067946991971e-06, "loss": 0.86716706, "num_input_tokens_seen": 43140015, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.37548828, "step": 1992, "time_per_iteration": 2.6415200233459473 }, { "auxiliary_loss_clip": 0.01609997, "auxiliary_loss_mlp": 0.00418577, "balance_loss_clip": 1.27155375, "balance_loss_mlp": 0.3757574, "epoch": 0.11982564256726289, "flos": 25989306226560.0, "grad_norm": 9.454851072798192, "language_loss": 0.84291065, "learning_rate": 3.915956269650216e-06, "loss": 0.86319637, "num_input_tokens_seen": 43160105, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.42797852, "step": 1993, "time_per_iteration": 2.6415982246398926 }, { "auxiliary_loss_clip": 0.01608359, "auxiliary_loss_mlp": 0.00375776, "balance_loss_clip": 1.27148807, "balance_loss_mlp": 0.33782023, "epoch": 0.11988576581993086, "flos": 21650866755840.0, "grad_norm": 6.924494587883622, "language_loss": 0.885979, "learning_rate": 3.915844519655208e-06, "loss": 0.90582037, "num_input_tokens_seen": 43179835, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.37963867, "step": 1994, "time_per_iteration": 2.6363766193389893 }, { "auxiliary_loss_clip": 0.01633313, "auxiliary_loss_mlp": 0.00371836, "balance_loss_clip": 1.29159069, "balance_loss_mlp": 0.33626381, "epoch": 0.11994588907259883, "flos": 17857407409920.0, "grad_norm": 9.56406624343577, "language_loss": 0.96738195, "learning_rate": 3.915732697011183e-06, "loss": 0.98743343, "num_input_tokens_seen": 43197210, "router_z_loss_clip": 3.4140625, "router_z_loss_mlp": 0.35571289, "step": 1995, "time_per_iteration": 2.6397545337677 }, { "auxiliary_loss_clip": 0.01632349, "auxiliary_loss_mlp": 0.00403699, "balance_loss_clip": 1.28681552, "balance_loss_mlp": 0.36142725, "epoch": 0.1200060123252668, "flos": 24462744163200.0, "grad_norm": 13.445292079631123, "language_loss": 0.81188786, "learning_rate": 3.9156208017223825e-06, "loss": 0.83224833, "num_input_tokens_seen": 43215050, "router_z_loss_clip": 3.45117188, "router_z_loss_mlp": 0.42285156, "step": 1996, "time_per_iteration": 2.6725575923919678 }, { "auxiliary_loss_clip": 0.01642375, "auxiliary_loss_mlp": 0.00354913, "balance_loss_clip": 1.3034637, "balance_loss_mlp": 0.31564572, "epoch": 0.12006613557793476, "flos": 18732191235840.0, "grad_norm": 27.23634212712201, "language_loss": 0.92744505, "learning_rate": 3.915508833793048e-06, "loss": 0.94741786, "num_input_tokens_seen": 43233900, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.39257812, "step": 1997, "time_per_iteration": 2.675361394882202 }, { "auxiliary_loss_clip": 0.01673091, "auxiliary_loss_mlp": 0.00367663, "balance_loss_clip": 1.3207078, "balance_loss_mlp": 0.32944405, "epoch": 0.12012625883060274, "flos": 22267739952000.0, "grad_norm": 8.944693923428655, "language_loss": 0.86022878, "learning_rate": 3.915396793227428e-06, "loss": 0.88063639, "num_input_tokens_seen": 43252105, "router_z_loss_clip": 3.52734375, "router_z_loss_mlp": 0.38208008, "step": 1998, "time_per_iteration": 2.713930606842041 }, { "auxiliary_loss_clip": 0.01659711, "auxiliary_loss_mlp": 0.00332687, "balance_loss_clip": 1.31691742, "balance_loss_mlp": 0.29659075, "epoch": 0.1201863820832707, "flos": 21758885930880.0, "grad_norm": 7.689945568161542, "language_loss": 0.78891212, "learning_rate": 3.915284680029769e-06, "loss": 0.8088361, "num_input_tokens_seen": 43270315, "router_z_loss_clip": 3.42773438, "router_z_loss_mlp": 0.36108398, "step": 1999, "time_per_iteration": 2.680948495864868 }, { "auxiliary_loss_clip": 0.01688411, "auxiliary_loss_mlp": 0.00368746, "balance_loss_clip": 1.32791388, "balance_loss_mlp": 0.3265698, "epoch": 0.12024650533593867, "flos": 21907987286400.0, "grad_norm": 12.76163814403361, "language_loss": 0.83535087, "learning_rate": 3.915172494204323e-06, "loss": 0.8559224, "num_input_tokens_seen": 43289935, "router_z_loss_clip": 3.6015625, "router_z_loss_mlp": 0.42211914, "step": 2000, "time_per_iteration": 2.691784620285034 }, { "auxiliary_loss_clip": 0.01704384, "auxiliary_loss_mlp": 0.00332477, "balance_loss_clip": 1.33916473, "balance_loss_mlp": 0.2962369, "epoch": 0.12030662858860665, "flos": 21689219502720.0, "grad_norm": 3.0896033462379697, "language_loss": 0.90799725, "learning_rate": 3.915060235755344e-06, "loss": 0.92836589, "num_input_tokens_seen": 43309325, "router_z_loss_clip": 3.6484375, "router_z_loss_mlp": 0.36254883, "step": 2001, "time_per_iteration": 2.738615036010742 }, { "auxiliary_loss_clip": 0.01699772, "auxiliary_loss_mlp": 0.00348187, "balance_loss_clip": 1.33698761, "balance_loss_mlp": 0.31049299, "epoch": 0.12036675184127461, "flos": 12933228856320.0, "grad_norm": 10.810892707382656, "language_loss": 0.83120286, "learning_rate": 3.91494790468709e-06, "loss": 0.85168248, "num_input_tokens_seen": 43327010, "router_z_loss_clip": 3.62695312, "router_z_loss_mlp": 0.37695312, "step": 2002, "time_per_iteration": 2.651668071746826 }, { "auxiliary_loss_clip": 0.01714319, "auxiliary_loss_mlp": 0.00356142, "balance_loss_clip": 1.33512449, "balance_loss_mlp": 0.31561086, "epoch": 0.12042687509394258, "flos": 20851028657280.0, "grad_norm": 9.159407766200552, "language_loss": 0.85453761, "learning_rate": 3.9148355010038185e-06, "loss": 0.87524223, "num_input_tokens_seen": 43345650, "router_z_loss_clip": 3.79296875, "router_z_loss_mlp": 0.40527344, "step": 2003, "time_per_iteration": 4.183563947677612 }, { "auxiliary_loss_clip": 0.01711321, "auxiliary_loss_mlp": 0.00333584, "balance_loss_clip": 1.34493327, "balance_loss_mlp": 0.29438794, "epoch": 0.12048699834661056, "flos": 23878513451520.0, "grad_norm": 2.975325651086925, "language_loss": 0.7869935, "learning_rate": 3.914723024709793e-06, "loss": 0.80744261, "num_input_tokens_seen": 43365555, "router_z_loss_clip": 3.6640625, "router_z_loss_mlp": 0.39208984, "step": 2004, "time_per_iteration": 4.133337497711182 }, { "auxiliary_loss_clip": 0.01707368, "auxiliary_loss_mlp": 0.00359485, "balance_loss_clip": 1.33640206, "balance_loss_mlp": 0.31647438, "epoch": 0.12054712159927852, "flos": 19756363726080.0, "grad_norm": 31.27541006995268, "language_loss": 0.83329254, "learning_rate": 3.914610475809279e-06, "loss": 0.85396111, "num_input_tokens_seen": 43384990, "router_z_loss_clip": 3.7109375, "router_z_loss_mlp": 0.42993164, "step": 2005, "time_per_iteration": 4.0789501667022705 }, { "auxiliary_loss_clip": 0.01811753, "auxiliary_loss_mlp": 0.00089319, "balance_loss_clip": 1.55455613, "balance_loss_mlp": 0.07839921, "epoch": 0.12060724485194649, "flos": 51672763123200.0, "grad_norm": 4.5589941026928695, "language_loss": 0.58246225, "learning_rate": 3.914497854306543e-06, "loss": 0.60147297, "num_input_tokens_seen": 43436335, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.109375, "step": 2006, "time_per_iteration": 2.910663366317749 }, { "auxiliary_loss_clip": 0.01718538, "auxiliary_loss_mlp": 0.00332277, "balance_loss_clip": 1.35086215, "balance_loss_mlp": 0.29217526, "epoch": 0.12066736810461445, "flos": 18990425088000.0, "grad_norm": 3.1303569453186393, "language_loss": 0.82841963, "learning_rate": 3.9143851602058575e-06, "loss": 0.8489278, "num_input_tokens_seen": 43456495, "router_z_loss_clip": 3.67773438, "router_z_loss_mlp": 0.40112305, "step": 2007, "time_per_iteration": 2.655571460723877 }, { "auxiliary_loss_clip": 0.01738483, "auxiliary_loss_mlp": 0.00359876, "balance_loss_clip": 1.35317898, "balance_loss_mlp": 0.31796181, "epoch": 0.12072749135728243, "flos": 16471973882880.0, "grad_norm": 43.84610844280674, "language_loss": 0.93761486, "learning_rate": 3.914272393511494e-06, "loss": 0.95859843, "num_input_tokens_seen": 43473085, "router_z_loss_clip": 3.85546875, "router_z_loss_mlp": 0.41918945, "step": 2008, "time_per_iteration": 4.191547870635986 }, { "auxiliary_loss_clip": 0.01738813, "auxiliary_loss_mlp": 0.00365951, "balance_loss_clip": 1.36029565, "balance_loss_mlp": 0.32544327, "epoch": 0.1207876146099504, "flos": 18077108947200.0, "grad_norm": 5.6596940104906235, "language_loss": 0.90523648, "learning_rate": 3.91415955422773e-06, "loss": 0.92628407, "num_input_tokens_seen": 43491135, "router_z_loss_clip": 3.78710938, "router_z_loss_mlp": 0.40551758, "step": 2009, "time_per_iteration": 2.6039528846740723 }, { "auxiliary_loss_clip": 0.01734669, "auxiliary_loss_mlp": 0.00378997, "balance_loss_clip": 1.35602605, "balance_loss_mlp": 0.33903801, "epoch": 0.12084773786261836, "flos": 21871573873920.0, "grad_norm": 3.1658567331342407, "language_loss": 0.90652239, "learning_rate": 3.914046642358844e-06, "loss": 0.92765903, "num_input_tokens_seen": 43510440, "router_z_loss_clip": 3.78125, "router_z_loss_mlp": 0.39941406, "step": 2010, "time_per_iteration": 2.673386812210083 }, { "auxiliary_loss_clip": 0.01753014, "auxiliary_loss_mlp": 0.00386663, "balance_loss_clip": 1.36024988, "balance_loss_mlp": 0.34219792, "epoch": 0.12090786111528634, "flos": 18333044328960.0, "grad_norm": 77.3860974396215, "language_loss": 0.88378572, "learning_rate": 3.9139336579091174e-06, "loss": 0.90518254, "num_input_tokens_seen": 43530145, "router_z_loss_clip": 3.93359375, "router_z_loss_mlp": 0.44433594, "step": 2011, "time_per_iteration": 2.6294357776641846 }, { "auxiliary_loss_clip": 0.01754642, "auxiliary_loss_mlp": 0.00358574, "balance_loss_clip": 1.36587548, "balance_loss_mlp": 0.31711259, "epoch": 0.1209679843679543, "flos": 21105850717440.0, "grad_norm": 7084.483233650414, "language_loss": 1.03533351, "learning_rate": 3.913820600882834e-06, "loss": 1.05646563, "num_input_tokens_seen": 43549315, "router_z_loss_clip": 3.890625, "router_z_loss_mlp": 0.41430664, "step": 2012, "time_per_iteration": 2.6867923736572266 }, { "auxiliary_loss_clip": 0.01762264, "auxiliary_loss_mlp": 0.00338564, "balance_loss_clip": 1.37436604, "balance_loss_mlp": 0.2986052, "epoch": 0.12102810762062227, "flos": 29241053585280.0, "grad_norm": 16.482056652401038, "language_loss": 0.85399854, "learning_rate": 3.913707471284283e-06, "loss": 0.87500679, "num_input_tokens_seen": 43569240, "router_z_loss_clip": 3.8828125, "router_z_loss_mlp": 0.39916992, "step": 2013, "time_per_iteration": 2.7254762649536133 }, { "auxiliary_loss_clip": 0.01768462, "auxiliary_loss_mlp": 0.00409403, "balance_loss_clip": 1.36984849, "balance_loss_mlp": 0.3657245, "epoch": 0.12108823087329025, "flos": 17930701111680.0, "grad_norm": 59.586262703705124, "language_loss": 0.84439778, "learning_rate": 3.9135942691177515e-06, "loss": 0.86617637, "num_input_tokens_seen": 43587710, "router_z_loss_clip": 3.99414062, "router_z_loss_mlp": 0.43676758, "step": 2014, "time_per_iteration": 2.67704439163208 }, { "auxiliary_loss_clip": 0.0176095, "auxiliary_loss_mlp": 0.00356461, "balance_loss_clip": 1.37730551, "balance_loss_mlp": 0.31659785, "epoch": 0.12114835412595822, "flos": 22091850028800.0, "grad_norm": 18.208118513666314, "language_loss": 0.93590772, "learning_rate": 3.913480994387535e-06, "loss": 0.95708179, "num_input_tokens_seen": 43606000, "router_z_loss_clip": 3.8359375, "router_z_loss_mlp": 0.39892578, "step": 2015, "time_per_iteration": 2.7172884941101074 }, { "auxiliary_loss_clip": 0.01760855, "auxiliary_loss_mlp": 0.00351671, "balance_loss_clip": 1.37268567, "balance_loss_mlp": 0.31140256, "epoch": 0.12120847737862618, "flos": 20412343854720.0, "grad_norm": 8.942722721751693, "language_loss": 0.76109332, "learning_rate": 3.913367647097926e-06, "loss": 0.78221858, "num_input_tokens_seen": 43624815, "router_z_loss_clip": 3.8828125, "router_z_loss_mlp": 0.40258789, "step": 2016, "time_per_iteration": 2.670454978942871 }, { "auxiliary_loss_clip": 0.01757119, "auxiliary_loss_mlp": 0.00430485, "balance_loss_clip": 1.36272502, "balance_loss_mlp": 0.38375527, "epoch": 0.12126860063129415, "flos": 22309037614080.0, "grad_norm": 13.268745307348686, "language_loss": 0.8977176, "learning_rate": 3.913254227253225e-06, "loss": 0.91959369, "num_input_tokens_seen": 43643960, "router_z_loss_clip": 3.94726562, "router_z_loss_mlp": 0.46728516, "step": 2017, "time_per_iteration": 2.761678695678711 }, { "auxiliary_loss_clip": 0.01762393, "auxiliary_loss_mlp": 0.00389481, "balance_loss_clip": 1.3683424, "balance_loss_mlp": 0.34716183, "epoch": 0.12132872388396213, "flos": 13699275235200.0, "grad_norm": 44.883157597221015, "language_loss": 0.80019444, "learning_rate": 3.913140734857731e-06, "loss": 0.82171321, "num_input_tokens_seen": 43662650, "router_z_loss_clip": 3.94335938, "router_z_loss_mlp": 0.42358398, "step": 2018, "time_per_iteration": 2.623199224472046 }, { "auxiliary_loss_clip": 0.01762792, "auxiliary_loss_mlp": 0.0038846, "balance_loss_clip": 1.37043333, "balance_loss_mlp": 0.34852508, "epoch": 0.12138884713663009, "flos": 26466954307200.0, "grad_norm": 1.7324546925939657, "language_loss": 0.77129614, "learning_rate": 3.91302716991575e-06, "loss": 0.79280865, "num_input_tokens_seen": 43684205, "router_z_loss_clip": 3.92578125, "router_z_loss_mlp": 0.39941406, "step": 2019, "time_per_iteration": 2.7677927017211914 }, { "auxiliary_loss_clip": 0.01778574, "auxiliary_loss_mlp": 0.00428088, "balance_loss_clip": 1.37607646, "balance_loss_mlp": 0.38395691, "epoch": 0.12144897038929806, "flos": 26141603892480.0, "grad_norm": 630.4597621471653, "language_loss": 0.98692352, "learning_rate": 3.912913532431586e-06, "loss": 1.00899017, "num_input_tokens_seen": 43706320, "router_z_loss_clip": 4.0234375, "router_z_loss_mlp": 0.44116211, "step": 2020, "time_per_iteration": 2.6932878494262695 }, { "auxiliary_loss_clip": 0.01774626, "auxiliary_loss_mlp": 0.00395161, "balance_loss_clip": 1.37995887, "balance_loss_mlp": 0.35098228, "epoch": 0.12150909364196603, "flos": 24717530309760.0, "grad_norm": 14.373613196358898, "language_loss": 0.83206069, "learning_rate": 3.912799822409549e-06, "loss": 0.85375857, "num_input_tokens_seen": 43724805, "router_z_loss_clip": 3.9453125, "router_z_loss_mlp": 0.44189453, "step": 2021, "time_per_iteration": 2.735373020172119 }, { "auxiliary_loss_clip": 0.01766796, "auxiliary_loss_mlp": 0.00399005, "balance_loss_clip": 1.37315297, "balance_loss_mlp": 0.35735345, "epoch": 0.121569216894634, "flos": 25186990089600.0, "grad_norm": 57.88323672498597, "language_loss": 0.85934895, "learning_rate": 3.912686039853952e-06, "loss": 0.88100696, "num_input_tokens_seen": 43742320, "router_z_loss_clip": 3.93554688, "router_z_loss_mlp": 0.41650391, "step": 2022, "time_per_iteration": 2.6768858432769775 }, { "auxiliary_loss_clip": 0.01786282, "auxiliary_loss_mlp": 0.00396876, "balance_loss_clip": 1.37968898, "balance_loss_mlp": 0.35448474, "epoch": 0.12162934014730196, "flos": 13444094039040.0, "grad_norm": 131.2990893911295, "language_loss": 0.90355802, "learning_rate": 3.912572184769108e-06, "loss": 0.92538965, "num_input_tokens_seen": 43760665, "router_z_loss_clip": 4.0703125, "router_z_loss_mlp": 0.42382812, "step": 2023, "time_per_iteration": 2.6642773151397705 }, { "auxiliary_loss_clip": 0.01816985, "auxiliary_loss_mlp": 0.00460605, "balance_loss_clip": 1.39537764, "balance_loss_mlp": 0.41599733, "epoch": 0.12168946339996994, "flos": 16946138344320.0, "grad_norm": 53.47701099902678, "language_loss": 0.94140965, "learning_rate": 3.912458257159335e-06, "loss": 0.96418548, "num_input_tokens_seen": 43779020, "router_z_loss_clip": 4.22070312, "router_z_loss_mlp": 0.44604492, "step": 2024, "time_per_iteration": 2.655184030532837 }, { "auxiliary_loss_clip": 0.01787935, "auxiliary_loss_mlp": 0.0041131, "balance_loss_clip": 1.38293242, "balance_loss_mlp": 0.36818033, "epoch": 0.12174958665263791, "flos": 29821585196160.0, "grad_norm": 11.566926015490681, "language_loss": 0.78531778, "learning_rate": 3.912344257028954e-06, "loss": 0.80731022, "num_input_tokens_seen": 43798850, "router_z_loss_clip": 4.046875, "router_z_loss_mlp": 0.4309082, "step": 2025, "time_per_iteration": 2.7235443592071533 }, { "auxiliary_loss_clip": 0.01816719, "auxiliary_loss_mlp": 0.00416955, "balance_loss_clip": 1.40478957, "balance_loss_mlp": 0.37244266, "epoch": 0.12180970990530587, "flos": 24641902224000.0, "grad_norm": 16.851193805552278, "language_loss": 0.81205708, "learning_rate": 3.912230184382286e-06, "loss": 0.83439386, "num_input_tokens_seen": 43820130, "router_z_loss_clip": 4.12109375, "router_z_loss_mlp": 0.4453125, "step": 2026, "time_per_iteration": 2.694554567337036 }, { "auxiliary_loss_clip": 0.01822299, "auxiliary_loss_mlp": 0.00392278, "balance_loss_clip": 1.41021633, "balance_loss_mlp": 0.34948194, "epoch": 0.12186983315797385, "flos": 20521691832960.0, "grad_norm": 3.026219364426958, "language_loss": 0.96441638, "learning_rate": 3.912116039223659e-06, "loss": 0.98656225, "num_input_tokens_seen": 43838485, "router_z_loss_clip": 4.12109375, "router_z_loss_mlp": 0.42822266, "step": 2027, "time_per_iteration": 2.6492908000946045 }, { "auxiliary_loss_clip": 0.01811527, "auxiliary_loss_mlp": 0.00420448, "balance_loss_clip": 1.40301228, "balance_loss_mlp": 0.37514806, "epoch": 0.12192995641064182, "flos": 27818344719360.0, "grad_norm": 37.6614853711704, "language_loss": 0.82814842, "learning_rate": 3.912001821557399e-06, "loss": 0.85046816, "num_input_tokens_seen": 43859080, "router_z_loss_clip": 4.0859375, "router_z_loss_mlp": 0.45288086, "step": 2028, "time_per_iteration": 2.7446517944335938 }, { "auxiliary_loss_clip": 0.01847529, "auxiliary_loss_mlp": 0.00385365, "balance_loss_clip": 1.42352498, "balance_loss_mlp": 0.34035128, "epoch": 0.12199007966330978, "flos": 22017119783040.0, "grad_norm": 2.2147416673985565, "language_loss": 0.8376087, "learning_rate": 3.911887531387839e-06, "loss": 0.85993767, "num_input_tokens_seen": 43879030, "router_z_loss_clip": 4.24023438, "router_z_loss_mlp": 0.45043945, "step": 2029, "time_per_iteration": 2.7855300903320312 }, { "auxiliary_loss_clip": 0.01875314, "auxiliary_loss_mlp": 0.00373294, "balance_loss_clip": 1.44147325, "balance_loss_mlp": 0.32801801, "epoch": 0.12205020291597775, "flos": 23295216493440.0, "grad_norm": 2.790689213094286, "language_loss": 0.85698032, "learning_rate": 3.911773168719313e-06, "loss": 0.87946641, "num_input_tokens_seen": 43898505, "router_z_loss_clip": 4.34179688, "router_z_loss_mlp": 0.45288086, "step": 2030, "time_per_iteration": 2.815398693084717 }, { "auxiliary_loss_clip": 0.01896898, "auxiliary_loss_mlp": 0.00398681, "balance_loss_clip": 1.45480943, "balance_loss_mlp": 0.3450129, "epoch": 0.12211032616864573, "flos": 26031609469440.0, "grad_norm": 60.51771077451318, "language_loss": 0.82735497, "learning_rate": 3.911658733556155e-06, "loss": 0.85031068, "num_input_tokens_seen": 43917945, "router_z_loss_clip": 4.421875, "router_z_loss_mlp": 0.53686523, "step": 2031, "time_per_iteration": 2.6838607788085938 }, { "auxiliary_loss_clip": 0.01886854, "auxiliary_loss_mlp": 0.00386467, "balance_loss_clip": 1.45095205, "balance_loss_mlp": 0.33892685, "epoch": 0.12217044942131369, "flos": 20410943224320.0, "grad_norm": 67.98607540315224, "language_loss": 0.81142521, "learning_rate": 3.911544225902707e-06, "loss": 0.83415842, "num_input_tokens_seen": 43937385, "router_z_loss_clip": 4.3671875, "router_z_loss_mlp": 0.4753418, "step": 2032, "time_per_iteration": 2.63751482963562 }, { "auxiliary_loss_clip": 0.01900101, "auxiliary_loss_mlp": 0.00398489, "balance_loss_clip": 1.45925212, "balance_loss_mlp": 0.3520208, "epoch": 0.12223057267398166, "flos": 22857142222080.0, "grad_norm": 13.507721918288947, "language_loss": 0.94006735, "learning_rate": 3.911429645763311e-06, "loss": 0.96305323, "num_input_tokens_seen": 43958130, "router_z_loss_clip": 4.41015625, "router_z_loss_mlp": 0.46459961, "step": 2033, "time_per_iteration": 2.657071590423584 }, { "auxiliary_loss_clip": 0.01949918, "auxiliary_loss_mlp": 0.00405613, "balance_loss_clip": 1.47692037, "balance_loss_mlp": 0.35235041, "epoch": 0.12229069592664964, "flos": 20047563285120.0, "grad_norm": 8.04409813220032, "language_loss": 0.7276063, "learning_rate": 3.911314993142311e-06, "loss": 0.75116158, "num_input_tokens_seen": 43976800, "router_z_loss_clip": 4.734375, "router_z_loss_mlp": 0.53295898, "step": 2034, "time_per_iteration": 2.6342110633850098 }, { "auxiliary_loss_clip": 0.0191957, "auxiliary_loss_mlp": 0.0037162, "balance_loss_clip": 1.46885216, "balance_loss_mlp": 0.32221979, "epoch": 0.1223508191793176, "flos": 22274240313600.0, "grad_norm": 15.36725132232068, "language_loss": 0.80696404, "learning_rate": 3.911200268044055e-06, "loss": 0.82987589, "num_input_tokens_seen": 43996620, "router_z_loss_clip": 4.5078125, "router_z_loss_mlp": 0.49414062, "step": 2035, "time_per_iteration": 2.6698505878448486 }, { "auxiliary_loss_clip": 0.0196221, "auxiliary_loss_mlp": 0.00408656, "balance_loss_clip": 1.48058319, "balance_loss_mlp": 0.35591739, "epoch": 0.12241094243198557, "flos": 21285978445440.0, "grad_norm": 21.625540642824298, "language_loss": 0.77652425, "learning_rate": 3.911085470472892e-06, "loss": 0.80023289, "num_input_tokens_seen": 44016175, "router_z_loss_clip": 4.8125, "router_z_loss_mlp": 0.52783203, "step": 2036, "time_per_iteration": 2.769523859024048 }, { "auxiliary_loss_clip": 0.01943328, "auxiliary_loss_mlp": 0.00395587, "balance_loss_clip": 1.47608852, "balance_loss_mlp": 0.34246743, "epoch": 0.12247106568465355, "flos": 17382381022080.0, "grad_norm": 12.872081961977104, "language_loss": 0.89003801, "learning_rate": 3.910970600433178e-06, "loss": 0.91342711, "num_input_tokens_seen": 44035060, "router_z_loss_clip": 4.66796875, "router_z_loss_mlp": 0.53076172, "step": 2037, "time_per_iteration": 2.6673941612243652 }, { "auxiliary_loss_clip": 0.01979068, "auxiliary_loss_mlp": 0.00396783, "balance_loss_clip": 1.49738395, "balance_loss_mlp": 0.3431389, "epoch": 0.12253118893732151, "flos": 27045438842880.0, "grad_norm": 3.8415831493524353, "language_loss": 0.89598948, "learning_rate": 3.910855657929267e-06, "loss": 0.91974801, "num_input_tokens_seen": 44053330, "router_z_loss_clip": 4.8203125, "router_z_loss_mlp": 0.53662109, "step": 2038, "time_per_iteration": 2.699010133743286 }, { "auxiliary_loss_clip": 0.02030504, "auxiliary_loss_mlp": 0.00163056, "balance_loss_clip": 1.72166944, "balance_loss_mlp": 0.10545367, "epoch": 0.12259131218998948, "flos": 53861518368000.0, "grad_norm": 0.8127998182082139, "language_loss": 0.58381355, "learning_rate": 3.910740642965518e-06, "loss": 0.60574913, "num_input_tokens_seen": 44107575, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.57421875, "step": 2039, "time_per_iteration": 3.0729787349700928 }, { "auxiliary_loss_clip": 0.02006546, "auxiliary_loss_mlp": 0.00368289, "balance_loss_clip": 1.51543725, "balance_loss_mlp": 0.31135482, "epoch": 0.12265143544265744, "flos": 17891917401600.0, "grad_norm": 29.697478144298458, "language_loss": 0.88885707, "learning_rate": 3.910625555546292e-06, "loss": 0.9126054, "num_input_tokens_seen": 44126075, "router_z_loss_clip": 4.91796875, "router_z_loss_mlp": 0.56884766, "step": 2040, "time_per_iteration": 2.669980764389038 }, { "auxiliary_loss_clip": 0.01964803, "auxiliary_loss_mlp": 0.00368202, "balance_loss_clip": 1.49581778, "balance_loss_mlp": 0.31665611, "epoch": 0.12271155869532542, "flos": 21799932197760.0, "grad_norm": 14.094835995497059, "language_loss": 0.88352859, "learning_rate": 3.910510395675953e-06, "loss": 0.90685868, "num_input_tokens_seen": 44145605, "router_z_loss_clip": 4.69140625, "router_z_loss_mlp": 0.515625, "step": 2041, "time_per_iteration": 2.710817575454712 }, { "auxiliary_loss_clip": 0.02011324, "auxiliary_loss_mlp": 0.00423752, "balance_loss_clip": 1.5201025, "balance_loss_mlp": 0.37025109, "epoch": 0.12277168194799339, "flos": 19828759587840.0, "grad_norm": 30.856014992781862, "language_loss": 0.74871147, "learning_rate": 3.9103951633588694e-06, "loss": 0.77306223, "num_input_tokens_seen": 44164770, "router_z_loss_clip": 4.91015625, "router_z_loss_mlp": 0.53515625, "step": 2042, "time_per_iteration": 2.7029571533203125 }, { "auxiliary_loss_clip": 0.01974424, "auxiliary_loss_mlp": 0.00387545, "balance_loss_clip": 1.49793458, "balance_loss_mlp": 0.33747724, "epoch": 0.12283180520066135, "flos": 23221024951680.0, "grad_norm": 3.81319594633366, "language_loss": 0.86901999, "learning_rate": 3.910279858599409e-06, "loss": 0.8926397, "num_input_tokens_seen": 44184025, "router_z_loss_clip": 4.765625, "router_z_loss_mlp": 0.50097656, "step": 2043, "time_per_iteration": 2.7868995666503906 }, { "auxiliary_loss_clip": 0.0198587, "auxiliary_loss_mlp": 0.0038744, "balance_loss_clip": 1.49904442, "balance_loss_mlp": 0.3337481, "epoch": 0.12289192845332933, "flos": 18588476920320.0, "grad_norm": 8.442433340672633, "language_loss": 0.86138409, "learning_rate": 3.910164481401946e-06, "loss": 0.88511717, "num_input_tokens_seen": 44202950, "router_z_loss_clip": 4.8671875, "router_z_loss_mlp": 0.53686523, "step": 2044, "time_per_iteration": 2.692396879196167 }, { "auxiliary_loss_clip": 0.01987572, "auxiliary_loss_mlp": 0.00394512, "balance_loss_clip": 1.50802231, "balance_loss_mlp": 0.34158298, "epoch": 0.1229520517059973, "flos": 25769532862080.0, "grad_norm": 3.045405250600599, "language_loss": 0.83070517, "learning_rate": 3.910049031770853e-06, "loss": 0.85452604, "num_input_tokens_seen": 44221115, "router_z_loss_clip": 4.796875, "router_z_loss_mlp": 0.52954102, "step": 2045, "time_per_iteration": 4.1421825885772705 }, { "auxiliary_loss_clip": 0.01951534, "auxiliary_loss_mlp": 0.00413587, "balance_loss_clip": 1.48543489, "balance_loss_mlp": 0.36220759, "epoch": 0.12301217495866526, "flos": 20887154760960.0, "grad_norm": 39.237385245264974, "language_loss": 0.76563686, "learning_rate": 3.90993350971051e-06, "loss": 0.78928804, "num_input_tokens_seen": 44240575, "router_z_loss_clip": 4.6640625, "router_z_loss_mlp": 0.51391602, "step": 2046, "time_per_iteration": 4.081751585006714 }, { "auxiliary_loss_clip": 0.01936632, "auxiliary_loss_mlp": 0.00387261, "balance_loss_clip": 1.47501516, "balance_loss_mlp": 0.33969659, "epoch": 0.12307229821133324, "flos": 22378811783040.0, "grad_norm": 3.0855459325776278, "language_loss": 0.79870903, "learning_rate": 3.909817915225297e-06, "loss": 0.82194793, "num_input_tokens_seen": 44257145, "router_z_loss_clip": 4.6171875, "router_z_loss_mlp": 0.47607422, "step": 2047, "time_per_iteration": 4.081492185592651 }, { "auxiliary_loss_clip": 0.01932108, "auxiliary_loss_mlp": 0.00407873, "balance_loss_clip": 1.47784257, "balance_loss_mlp": 0.3556115, "epoch": 0.1231324214640012, "flos": 23367396873600.0, "grad_norm": 4.566384644948009, "language_loss": 0.84182286, "learning_rate": 3.909702248319597e-06, "loss": 0.86522257, "num_input_tokens_seen": 44278035, "router_z_loss_clip": 4.546875, "router_z_loss_mlp": 0.52294922, "step": 2048, "time_per_iteration": 2.784001111984253 }, { "auxiliary_loss_clip": 0.01935365, "auxiliary_loss_mlp": 0.00386043, "balance_loss_clip": 1.49011409, "balance_loss_mlp": 0.33704829, "epoch": 0.12319254471666917, "flos": 23767154311680.0, "grad_norm": 8.607285925133759, "language_loss": 0.92037117, "learning_rate": 3.909586508997797e-06, "loss": 0.94358528, "num_input_tokens_seen": 44296980, "router_z_loss_clip": 4.453125, "router_z_loss_mlp": 0.48950195, "step": 2049, "time_per_iteration": 2.7165257930755615 }, { "auxiliary_loss_clip": 0.01910118, "auxiliary_loss_mlp": 0.00408525, "balance_loss_clip": 1.4636395, "balance_loss_mlp": 0.35924417, "epoch": 0.12325266796933713, "flos": 23550146294400.0, "grad_norm": 13.943077043192451, "language_loss": 0.82887417, "learning_rate": 3.909470697264285e-06, "loss": 0.85206062, "num_input_tokens_seen": 44318005, "router_z_loss_clip": 4.46875, "router_z_loss_mlp": 0.49291992, "step": 2050, "time_per_iteration": 4.099667549133301 }, { "auxiliary_loss_clip": 0.01916274, "auxiliary_loss_mlp": 0.00430178, "balance_loss_clip": 1.46171367, "balance_loss_mlp": 0.3757475, "epoch": 0.12331279122200511, "flos": 24423996366720.0, "grad_norm": 86.64768259303388, "language_loss": 0.88433063, "learning_rate": 3.909354813123452e-06, "loss": 0.90779519, "num_input_tokens_seen": 44335260, "router_z_loss_clip": 4.54296875, "router_z_loss_mlp": 0.54418945, "step": 2051, "time_per_iteration": 2.694953441619873 }, { "auxiliary_loss_clip": 0.0189139, "auxiliary_loss_mlp": 0.00390438, "balance_loss_clip": 1.4580543, "balance_loss_mlp": 0.34087071, "epoch": 0.12337291447467308, "flos": 25484294960640.0, "grad_norm": 5.801912214934079, "language_loss": 0.8567881, "learning_rate": 3.909238856579693e-06, "loss": 0.87960637, "num_input_tokens_seen": 44355315, "router_z_loss_clip": 4.3359375, "router_z_loss_mlp": 0.49536133, "step": 2052, "time_per_iteration": 2.693880558013916 }, { "auxiliary_loss_clip": 0.01882668, "auxiliary_loss_mlp": 0.00391559, "balance_loss_clip": 1.44145727, "balance_loss_mlp": 0.34139621, "epoch": 0.12343303772734104, "flos": 23550002640000.0, "grad_norm": 4.436952559144917, "language_loss": 0.82103848, "learning_rate": 3.909122827637406e-06, "loss": 0.84378076, "num_input_tokens_seen": 44373020, "router_z_loss_clip": 4.41796875, "router_z_loss_mlp": 0.50146484, "step": 2053, "time_per_iteration": 2.6890225410461426 }, { "auxiliary_loss_clip": 0.01855337, "auxiliary_loss_mlp": 0.00418763, "balance_loss_clip": 1.41887665, "balance_loss_mlp": 0.36852807, "epoch": 0.12349316098000902, "flos": 47557074867840.0, "grad_norm": 1.9255231796273566, "language_loss": 0.79452527, "learning_rate": 3.909006726300991e-06, "loss": 0.81726629, "num_input_tokens_seen": 44397525, "router_z_loss_clip": 4.37109375, "router_z_loss_mlp": 0.50219727, "step": 2054, "time_per_iteration": 2.948261260986328 }, { "auxiliary_loss_clip": 0.0184305, "auxiliary_loss_mlp": 0.00388776, "balance_loss_clip": 1.40926933, "balance_loss_mlp": 0.34152189, "epoch": 0.12355328423267699, "flos": 25045969294080.0, "grad_norm": 10.91568054629515, "language_loss": 0.89208633, "learning_rate": 3.908890552574849e-06, "loss": 0.91440463, "num_input_tokens_seen": 44415890, "router_z_loss_clip": 4.34375, "router_z_loss_mlp": 0.47265625, "step": 2055, "time_per_iteration": 2.7117483615875244 }, { "auxiliary_loss_clip": 0.01817763, "auxiliary_loss_mlp": 0.00421369, "balance_loss_clip": 1.3977623, "balance_loss_mlp": 0.37454316, "epoch": 0.12361340748534495, "flos": 27709140395520.0, "grad_norm": 24.173134082624497, "language_loss": 0.84100187, "learning_rate": 3.908774306463384e-06, "loss": 0.86339325, "num_input_tokens_seen": 44436625, "router_z_loss_clip": 4.203125, "router_z_loss_mlp": 0.46826172, "step": 2056, "time_per_iteration": 2.737187623977661 }, { "auxiliary_loss_clip": 0.01819137, "auxiliary_loss_mlp": 0.00405996, "balance_loss_clip": 1.39532304, "balance_loss_mlp": 0.35638082, "epoch": 0.12367353073801293, "flos": 26140598311680.0, "grad_norm": 9.832387690324614, "language_loss": 0.90314275, "learning_rate": 3.908657987971009e-06, "loss": 0.92539406, "num_input_tokens_seen": 44455265, "router_z_loss_clip": 4.24023438, "router_z_loss_mlp": 0.49584961, "step": 2057, "time_per_iteration": 2.700202226638794 }, { "auxiliary_loss_clip": 0.0180609, "auxiliary_loss_mlp": 0.00386127, "balance_loss_clip": 1.39021182, "balance_loss_mlp": 0.33746624, "epoch": 0.1237336539906809, "flos": 25156035544320.0, "grad_norm": 4.584495861410982, "language_loss": 0.83662587, "learning_rate": 3.90854159710213e-06, "loss": 0.85854799, "num_input_tokens_seen": 44475815, "router_z_loss_clip": 4.1484375, "router_z_loss_mlp": 0.48608398, "step": 2058, "time_per_iteration": 2.7029194831848145 }, { "auxiliary_loss_clip": 0.01808693, "auxiliary_loss_mlp": 0.00395209, "balance_loss_clip": 1.39141035, "balance_loss_mlp": 0.34681019, "epoch": 0.12379377724334886, "flos": 15304589867520.0, "grad_norm": 2.7520352996354114, "language_loss": 0.9058882, "learning_rate": 3.9084251338611624e-06, "loss": 0.92792726, "num_input_tokens_seen": 44494045, "router_z_loss_clip": 4.16796875, "router_z_loss_mlp": 0.48388672, "step": 2059, "time_per_iteration": 2.663249969482422 }, { "auxiliary_loss_clip": 0.01781805, "auxiliary_loss_mlp": 0.00391762, "balance_loss_clip": 1.36414659, "balance_loss_mlp": 0.34431642, "epoch": 0.12385390049601683, "flos": 21316717509120.0, "grad_norm": 75.19768123926501, "language_loss": 0.89420092, "learning_rate": 3.908308598252523e-06, "loss": 0.91593659, "num_input_tokens_seen": 44509120, "router_z_loss_clip": 4.19140625, "router_z_loss_mlp": 0.47412109, "step": 2060, "time_per_iteration": 2.666761875152588 }, { "auxiliary_loss_clip": 0.01770446, "auxiliary_loss_mlp": 0.00411806, "balance_loss_clip": 1.35978365, "balance_loss_mlp": 0.36290622, "epoch": 0.1239140237486848, "flos": 15116309752320.0, "grad_norm": 61.43247958533111, "language_loss": 0.921507, "learning_rate": 3.9081919902806306e-06, "loss": 0.94332951, "num_input_tokens_seen": 44525780, "router_z_loss_clip": 4.10546875, "router_z_loss_mlp": 0.48950195, "step": 2061, "time_per_iteration": 2.6435422897338867 }, { "auxiliary_loss_clip": 0.01749322, "auxiliary_loss_mlp": 0.00385184, "balance_loss_clip": 1.35607731, "balance_loss_mlp": 0.34048033, "epoch": 0.12397414700135277, "flos": 21976791788160.0, "grad_norm": 12.732646878342084, "language_loss": 0.90746045, "learning_rate": 3.908075309949906e-06, "loss": 0.92880547, "num_input_tokens_seen": 44543125, "router_z_loss_clip": 3.94140625, "router_z_loss_mlp": 0.44702148, "step": 2062, "time_per_iteration": 2.682983636856079 }, { "auxiliary_loss_clip": 0.01760842, "auxiliary_loss_mlp": 0.00377206, "balance_loss_clip": 1.35988343, "balance_loss_mlp": 0.3310717, "epoch": 0.12403427025402074, "flos": 13400892956160.0, "grad_norm": 4.036626663712904, "language_loss": 0.85537976, "learning_rate": 3.907958557264774e-06, "loss": 0.87676024, "num_input_tokens_seen": 44560275, "router_z_loss_clip": 4.00976562, "router_z_loss_mlp": 0.46142578, "step": 2063, "time_per_iteration": 2.7223594188690186 }, { "auxiliary_loss_clip": 0.01746242, "auxiliary_loss_mlp": 0.00378511, "balance_loss_clip": 1.34462476, "balance_loss_mlp": 0.3311134, "epoch": 0.12409439350668872, "flos": 15304374385920.0, "grad_norm": 10.59341689024465, "language_loss": 0.86875141, "learning_rate": 3.907841732229663e-06, "loss": 0.88999891, "num_input_tokens_seen": 44577640, "router_z_loss_clip": 4.015625, "router_z_loss_mlp": 0.47436523, "step": 2064, "time_per_iteration": 2.6331048011779785 }, { "auxiliary_loss_clip": 0.0173812, "auxiliary_loss_mlp": 0.00403707, "balance_loss_clip": 1.33657312, "balance_loss_mlp": 0.35898, "epoch": 0.12415451675935668, "flos": 25009376313600.0, "grad_norm": 15.176325939830342, "language_loss": 0.97590244, "learning_rate": 3.907724834849002e-06, "loss": 0.99732071, "num_input_tokens_seen": 44594860, "router_z_loss_clip": 4.01367188, "router_z_loss_mlp": 0.44726562, "step": 2065, "time_per_iteration": 2.7307064533233643 }, { "auxiliary_loss_clip": 0.01722509, "auxiliary_loss_mlp": 0.00402492, "balance_loss_clip": 1.32592869, "balance_loss_mlp": 0.35697755, "epoch": 0.12421464001202465, "flos": 23659673840640.0, "grad_norm": 35.06853528406464, "language_loss": 0.86829662, "learning_rate": 3.907607865127225e-06, "loss": 0.88954669, "num_input_tokens_seen": 44614780, "router_z_loss_clip": 3.96484375, "router_z_loss_mlp": 0.45483398, "step": 2066, "time_per_iteration": 2.7077105045318604 }, { "auxiliary_loss_clip": 0.01640537, "auxiliary_loss_mlp": 0.00181686, "balance_loss_clip": 1.38238072, "balance_loss_mlp": 0.16461532, "epoch": 0.12427476326469263, "flos": 65732904345600.0, "grad_norm": 0.873693752439367, "language_loss": 0.63664752, "learning_rate": 3.907490823068766e-06, "loss": 0.65486979, "num_input_tokens_seen": 44671240, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.17089844, "step": 2067, "time_per_iteration": 3.1525774002075195 }, { "auxiliary_loss_clip": 0.01726075, "auxiliary_loss_mlp": 0.00392461, "balance_loss_clip": 1.32444191, "balance_loss_mlp": 0.34801936, "epoch": 0.12433488651736059, "flos": 24535427333760.0, "grad_norm": 31.962669120902927, "language_loss": 1.00577569, "learning_rate": 3.907373708678063e-06, "loss": 1.02696097, "num_input_tokens_seen": 44691050, "router_z_loss_clip": 4.01171875, "router_z_loss_mlp": 0.44433594, "step": 2068, "time_per_iteration": 2.6813771724700928 }, { "auxiliary_loss_clip": 0.01713896, "auxiliary_loss_mlp": 0.00400623, "balance_loss_clip": 1.31890702, "balance_loss_mlp": 0.35806513, "epoch": 0.12439500977002856, "flos": 21031659175680.0, "grad_norm": 4.688460957845058, "language_loss": 0.8693254, "learning_rate": 3.9072565219595596e-06, "loss": 0.89047062, "num_input_tokens_seen": 44709850, "router_z_loss_clip": 3.95117188, "router_z_loss_mlp": 0.42578125, "step": 2069, "time_per_iteration": 2.640092372894287 }, { "auxiliary_loss_clip": 0.0171443, "auxiliary_loss_mlp": 0.00406793, "balance_loss_clip": 1.31456923, "balance_loss_mlp": 0.35715446, "epoch": 0.12445513302269653, "flos": 26830621555200.0, "grad_norm": 6.839066480456285, "language_loss": 0.81538022, "learning_rate": 3.907139262917696e-06, "loss": 0.83659244, "num_input_tokens_seen": 44731475, "router_z_loss_clip": 4.00195312, "router_z_loss_mlp": 0.49633789, "step": 2070, "time_per_iteration": 2.7099668979644775 }, { "auxiliary_loss_clip": 0.01711394, "auxiliary_loss_mlp": 0.00375883, "balance_loss_clip": 1.31867635, "balance_loss_mlp": 0.33056006, "epoch": 0.1245152562753645, "flos": 18368919037440.0, "grad_norm": 9.882925816215396, "language_loss": 0.89791441, "learning_rate": 3.907021931556922e-06, "loss": 0.91878718, "num_input_tokens_seen": 44749685, "router_z_loss_clip": 3.92773438, "router_z_loss_mlp": 0.45336914, "step": 2071, "time_per_iteration": 2.6528475284576416 }, { "auxiliary_loss_clip": 0.01690362, "auxiliary_loss_mlp": 0.00400887, "balance_loss_clip": 1.30400801, "balance_loss_mlp": 0.35601687, "epoch": 0.12457537952803246, "flos": 33107986200960.0, "grad_norm": 19.814249211283823, "language_loss": 0.83038598, "learning_rate": 3.906904527881684e-06, "loss": 0.85129851, "num_input_tokens_seen": 44772165, "router_z_loss_clip": 3.859375, "router_z_loss_mlp": 0.44848633, "step": 2072, "time_per_iteration": 2.7795817852020264 }, { "auxiliary_loss_clip": 0.01687993, "auxiliary_loss_mlp": 0.00389975, "balance_loss_clip": 1.29302573, "balance_loss_mlp": 0.34484261, "epoch": 0.12463550278070043, "flos": 22270217990400.0, "grad_norm": 3.2584239256339265, "language_loss": 0.81820977, "learning_rate": 3.9067870518964355e-06, "loss": 0.8389895, "num_input_tokens_seen": 44790580, "router_z_loss_clip": 3.95117188, "router_z_loss_mlp": 0.45166016, "step": 2073, "time_per_iteration": 2.7300331592559814 }, { "auxiliary_loss_clip": 0.01680669, "auxiliary_loss_mlp": 0.00359832, "balance_loss_clip": 1.29280007, "balance_loss_mlp": 0.3176792, "epoch": 0.12469562603336841, "flos": 14679025580160.0, "grad_norm": 42.49441958192599, "language_loss": 0.94735849, "learning_rate": 3.906669503605631e-06, "loss": 0.96776354, "num_input_tokens_seen": 44806730, "router_z_loss_clip": 3.8828125, "router_z_loss_mlp": 0.42138672, "step": 2074, "time_per_iteration": 2.6676025390625 }, { "auxiliary_loss_clip": 0.01683008, "auxiliary_loss_mlp": 0.00448367, "balance_loss_clip": 1.28231716, "balance_loss_mlp": 0.39798966, "epoch": 0.12475574928603637, "flos": 24644775312000.0, "grad_norm": 48.479385495813354, "language_loss": 0.90834355, "learning_rate": 3.906551883013728e-06, "loss": 0.92965734, "num_input_tokens_seen": 44825550, "router_z_loss_clip": 4.00390625, "router_z_loss_mlp": 0.50366211, "step": 2075, "time_per_iteration": 2.6708788871765137 }, { "auxiliary_loss_clip": 0.01675622, "auxiliary_loss_mlp": 0.00390652, "balance_loss_clip": 1.28606749, "balance_loss_mlp": 0.34232485, "epoch": 0.12481587253870434, "flos": 21762980081280.0, "grad_norm": 22.528077385205204, "language_loss": 0.79129732, "learning_rate": 3.9064341901251865e-06, "loss": 0.8119601, "num_input_tokens_seen": 44844155, "router_z_loss_clip": 3.890625, "router_z_loss_mlp": 0.48339844, "step": 2076, "time_per_iteration": 2.6896884441375732 }, { "auxiliary_loss_clip": 0.01699708, "auxiliary_loss_mlp": 0.00406741, "balance_loss_clip": 1.31020296, "balance_loss_mlp": 0.36332482, "epoch": 0.12487599579137232, "flos": 21432529935360.0, "grad_norm": 2.900477239244331, "language_loss": 0.81243014, "learning_rate": 3.906316424944469e-06, "loss": 0.83349454, "num_input_tokens_seen": 44863780, "router_z_loss_clip": 3.8984375, "router_z_loss_mlp": 0.43408203, "step": 2077, "time_per_iteration": 2.6749234199523926 }, { "auxiliary_loss_clip": 0.01667923, "auxiliary_loss_mlp": 0.00445008, "balance_loss_clip": 1.28414071, "balance_loss_mlp": 0.39680004, "epoch": 0.12493611904404028, "flos": 16107624276480.0, "grad_norm": 140.94893306817133, "language_loss": 0.8956449, "learning_rate": 3.906198587476043e-06, "loss": 0.91677427, "num_input_tokens_seen": 44881480, "router_z_loss_clip": 3.84179688, "router_z_loss_mlp": 0.48193359, "step": 2078, "time_per_iteration": 2.687915086746216 }, { "auxiliary_loss_clip": 0.01685985, "auxiliary_loss_mlp": 0.00423459, "balance_loss_clip": 1.29154086, "balance_loss_mlp": 0.3750366, "epoch": 0.12499624229670825, "flos": 21580266574080.0, "grad_norm": 76.72788183895335, "language_loss": 0.8157469, "learning_rate": 3.906080677724374e-06, "loss": 0.83684134, "num_input_tokens_seen": 44900390, "router_z_loss_clip": 3.94140625, "router_z_loss_mlp": 0.48413086, "step": 2079, "time_per_iteration": 2.7145512104034424 }, { "auxiliary_loss_clip": 0.01711822, "auxiliary_loss_mlp": 0.00444153, "balance_loss_clip": 1.31295455, "balance_loss_mlp": 0.39844859, "epoch": 0.1250563655493762, "flos": 25699040421120.0, "grad_norm": 14.901555346051175, "language_loss": 0.91956753, "learning_rate": 3.905962695693935e-06, "loss": 0.9411273, "num_input_tokens_seen": 44920375, "router_z_loss_clip": 3.98632812, "router_z_loss_mlp": 0.45678711, "step": 2080, "time_per_iteration": 2.723886013031006 }, { "auxiliary_loss_clip": 0.01698945, "auxiliary_loss_mlp": 0.00398199, "balance_loss_clip": 1.30922639, "balance_loss_mlp": 0.35161245, "epoch": 0.12511648880204418, "flos": 16909509450240.0, "grad_norm": 18.279877199647796, "language_loss": 0.91025496, "learning_rate": 3.9058446413892e-06, "loss": 0.93122643, "num_input_tokens_seen": 44938415, "router_z_loss_clip": 3.89648438, "router_z_loss_mlp": 0.46630859, "step": 2081, "time_per_iteration": 2.666071891784668 }, { "auxiliary_loss_clip": 0.01689778, "auxiliary_loss_mlp": 0.00388622, "balance_loss_clip": 1.30481625, "balance_loss_mlp": 0.34658846, "epoch": 0.12517661205471217, "flos": 17567500740480.0, "grad_norm": 3.487910934127648, "language_loss": 0.81309235, "learning_rate": 3.905726514814646e-06, "loss": 0.83387631, "num_input_tokens_seen": 44957135, "router_z_loss_clip": 3.8515625, "router_z_loss_mlp": 0.42016602, "step": 2082, "time_per_iteration": 2.7497057914733887 }, { "auxiliary_loss_clip": 0.01716233, "auxiliary_loss_mlp": 0.00410481, "balance_loss_clip": 1.31175113, "balance_loss_mlp": 0.36415654, "epoch": 0.12523673530738014, "flos": 16033791870720.0, "grad_norm": 12.881752486950456, "language_loss": 0.87881494, "learning_rate": 3.9056083159747495e-06, "loss": 0.90008211, "num_input_tokens_seen": 44974480, "router_z_loss_clip": 4.04492188, "router_z_loss_mlp": 0.46337891, "step": 2083, "time_per_iteration": 2.695235252380371 }, { "auxiliary_loss_clip": 0.0171126, "auxiliary_loss_mlp": 0.00405519, "balance_loss_clip": 1.30931175, "balance_loss_mlp": 0.35833639, "epoch": 0.1252968585600481, "flos": 18807747494400.0, "grad_norm": 4.802378001144312, "language_loss": 0.96305227, "learning_rate": 3.9054900448739966e-06, "loss": 0.98422003, "num_input_tokens_seen": 44990310, "router_z_loss_clip": 4.01953125, "router_z_loss_mlp": 0.47192383, "step": 2084, "time_per_iteration": 2.6263034343719482 }, { "auxiliary_loss_clip": 0.01719845, "auxiliary_loss_mlp": 0.00371309, "balance_loss_clip": 1.31958985, "balance_loss_mlp": 0.32915649, "epoch": 0.12535698181271607, "flos": 27271568914560.0, "grad_norm": 79.96079346506664, "language_loss": 0.86879981, "learning_rate": 3.905371701516869e-06, "loss": 0.88971138, "num_input_tokens_seen": 45010720, "router_z_loss_clip": 3.99804688, "router_z_loss_mlp": 0.42114258, "step": 2085, "time_per_iteration": 2.827605724334717 }, { "auxiliary_loss_clip": 0.01681477, "auxiliary_loss_mlp": 0.0037234, "balance_loss_clip": 1.28363645, "balance_loss_mlp": 0.33056876, "epoch": 0.12541710506538403, "flos": 22054107813120.0, "grad_norm": 219.47050385690088, "language_loss": 0.94437766, "learning_rate": 3.905253285907856e-06, "loss": 0.96491587, "num_input_tokens_seen": 45030360, "router_z_loss_clip": 3.98046875, "router_z_loss_mlp": 0.41772461, "step": 2086, "time_per_iteration": 2.6515567302703857 }, { "auxiliary_loss_clip": 0.01704364, "auxiliary_loss_mlp": 0.00316115, "balance_loss_clip": 1.30843055, "balance_loss_mlp": 0.2761327, "epoch": 0.125477228318052, "flos": 12603173760000.0, "grad_norm": 3.4339998040483146, "language_loss": 0.91749728, "learning_rate": 3.905134798051447e-06, "loss": 0.93770206, "num_input_tokens_seen": 45045085, "router_z_loss_clip": 3.95703125, "router_z_loss_mlp": 0.39990234, "step": 2087, "time_per_iteration": 4.096821546554565 }, { "auxiliary_loss_clip": 0.01718061, "auxiliary_loss_mlp": 0.00351611, "balance_loss_clip": 1.31077933, "balance_loss_mlp": 0.3073608, "epoch": 0.12553735157071996, "flos": 23878549365120.0, "grad_norm": 1.8913959827349356, "language_loss": 0.81052017, "learning_rate": 3.905016237952136e-06, "loss": 0.83121687, "num_input_tokens_seen": 45065145, "router_z_loss_clip": 4.07226562, "router_z_loss_mlp": 0.44238281, "step": 2088, "time_per_iteration": 2.6884617805480957 }, { "auxiliary_loss_clip": 0.0164792, "auxiliary_loss_mlp": 0.00234942, "balance_loss_clip": 1.33214402, "balance_loss_mlp": 0.21911052, "epoch": 0.12559747482338796, "flos": 69920841830400.0, "grad_norm": 0.7667830775052864, "language_loss": 0.61813539, "learning_rate": 3.904897605614418e-06, "loss": 0.63696402, "num_input_tokens_seen": 45126230, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.15820312, "step": 2089, "time_per_iteration": 4.502696990966797 }, { "auxiliary_loss_clip": 0.01706935, "auxiliary_loss_mlp": 0.00355111, "balance_loss_clip": 1.29543042, "balance_loss_mlp": 0.31260142, "epoch": 0.12565759807605592, "flos": 24279563779200.0, "grad_norm": 3.6279139319812046, "language_loss": 0.85941708, "learning_rate": 3.904778901042793e-06, "loss": 0.88003755, "num_input_tokens_seen": 45145545, "router_z_loss_clip": 4.11328125, "router_z_loss_mlp": 0.42553711, "step": 2090, "time_per_iteration": 4.126194953918457 }, { "auxiliary_loss_clip": 0.0161513, "auxiliary_loss_mlp": 0.00176243, "balance_loss_clip": 1.29992962, "balance_loss_mlp": 0.16231941, "epoch": 0.12571772132872389, "flos": 56451180286080.0, "grad_norm": 0.7559432867246049, "language_loss": 0.58621824, "learning_rate": 3.90466012424176e-06, "loss": 0.60413194, "num_input_tokens_seen": 45206845, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.13964844, "step": 2091, "time_per_iteration": 3.111359119415283 }, { "auxiliary_loss_clip": 0.01732001, "auxiliary_loss_mlp": 0.00320606, "balance_loss_clip": 1.31456399, "balance_loss_mlp": 0.27907377, "epoch": 0.12577784458139185, "flos": 41245846675200.0, "grad_norm": 46.657890669289465, "language_loss": 0.71219409, "learning_rate": 3.904541275215825e-06, "loss": 0.7327202, "num_input_tokens_seen": 45228495, "router_z_loss_clip": 4.171875, "router_z_loss_mlp": 0.41552734, "step": 2092, "time_per_iteration": 4.238163232803345 }, { "auxiliary_loss_clip": 0.01739254, "auxiliary_loss_mlp": 0.00362456, "balance_loss_clip": 1.31518829, "balance_loss_mlp": 0.31782418, "epoch": 0.12583796783405982, "flos": 19755501799680.0, "grad_norm": 14.845391907035642, "language_loss": 0.88387609, "learning_rate": 3.904422353969493e-06, "loss": 0.90489316, "num_input_tokens_seen": 45245720, "router_z_loss_clip": 4.23828125, "router_z_loss_mlp": 0.44604492, "step": 2093, "time_per_iteration": 2.8204591274261475 }, { "auxiliary_loss_clip": 0.01711548, "auxiliary_loss_mlp": 0.00320118, "balance_loss_clip": 1.30734444, "balance_loss_mlp": 0.27975422, "epoch": 0.12589809108672778, "flos": 22602104680320.0, "grad_norm": 3.849064650104178, "language_loss": 0.81710386, "learning_rate": 3.904303360507276e-06, "loss": 0.83742052, "num_input_tokens_seen": 45265650, "router_z_loss_clip": 4.0390625, "router_z_loss_mlp": 0.40380859, "step": 2094, "time_per_iteration": 2.697709798812866 }, { "auxiliary_loss_clip": 0.01729273, "auxiliary_loss_mlp": 0.0032501, "balance_loss_clip": 1.32296181, "balance_loss_mlp": 0.28140366, "epoch": 0.12595821433939577, "flos": 45222845541120.0, "grad_norm": 15.759362376272625, "language_loss": 0.83316499, "learning_rate": 3.9041842948336835e-06, "loss": 0.85370785, "num_input_tokens_seen": 45287790, "router_z_loss_clip": 4.06445312, "router_z_loss_mlp": 0.43603516, "step": 2095, "time_per_iteration": 2.8614957332611084 }, { "auxiliary_loss_clip": 0.01730099, "auxiliary_loss_mlp": 0.00338554, "balance_loss_clip": 1.31732702, "balance_loss_mlp": 0.29356402, "epoch": 0.12601833759206374, "flos": 14319811618560.0, "grad_norm": 4.748756204444818, "language_loss": 0.92814052, "learning_rate": 3.904065156953232e-06, "loss": 0.94882703, "num_input_tokens_seen": 45305720, "router_z_loss_clip": 4.12304688, "router_z_loss_mlp": 0.45019531, "step": 2096, "time_per_iteration": 2.6677074432373047 }, { "auxiliary_loss_clip": 0.0172964, "auxiliary_loss_mlp": 0.00320261, "balance_loss_clip": 1.32489789, "balance_loss_mlp": 0.27832347, "epoch": 0.1260784608447317, "flos": 21288241002240.0, "grad_norm": 20.765127297429633, "language_loss": 0.84060937, "learning_rate": 3.903945946870439e-06, "loss": 0.86110842, "num_input_tokens_seen": 45325290, "router_z_loss_clip": 4.04296875, "router_z_loss_mlp": 0.41943359, "step": 2097, "time_per_iteration": 2.663586378097534 }, { "auxiliary_loss_clip": 0.01761472, "auxiliary_loss_mlp": 0.00345266, "balance_loss_clip": 1.34870267, "balance_loss_mlp": 0.30139658, "epoch": 0.12613858409739967, "flos": 26251311006720.0, "grad_norm": 17.188028424074336, "language_loss": 0.93151975, "learning_rate": 3.9038266645898246e-06, "loss": 0.95258719, "num_input_tokens_seen": 45344465, "router_z_loss_clip": 4.13085938, "router_z_loss_mlp": 0.43823242, "step": 2098, "time_per_iteration": 2.7051756381988525 }, { "auxiliary_loss_clip": 0.01762013, "auxiliary_loss_mlp": 0.00348689, "balance_loss_clip": 1.34839749, "balance_loss_mlp": 0.29924107, "epoch": 0.12619870735006763, "flos": 21579979265280.0, "grad_norm": 2.146667909868135, "language_loss": 0.77820039, "learning_rate": 3.903707310115912e-06, "loss": 0.79930747, "num_input_tokens_seen": 45362465, "router_z_loss_clip": 4.13671875, "router_z_loss_mlp": 0.49462891, "step": 2099, "time_per_iteration": 2.7075464725494385 }, { "auxiliary_loss_clip": 0.01756204, "auxiliary_loss_mlp": 0.00350586, "balance_loss_clip": 1.34743357, "balance_loss_mlp": 0.30337888, "epoch": 0.1262588306027356, "flos": 23367037737600.0, "grad_norm": 88.94749039985169, "language_loss": 0.88204587, "learning_rate": 3.903587883453228e-06, "loss": 0.90311372, "num_input_tokens_seen": 45382700, "router_z_loss_clip": 4.09375, "router_z_loss_mlp": 0.47216797, "step": 2100, "time_per_iteration": 2.7222235202789307 }, { "auxiliary_loss_clip": 0.01805004, "auxiliary_loss_mlp": 0.00366676, "balance_loss_clip": 1.38769174, "balance_loss_mlp": 0.31954086, "epoch": 0.12631895385540357, "flos": 23949185460480.0, "grad_norm": 17.015151668349162, "language_loss": 0.8767693, "learning_rate": 3.903468384606302e-06, "loss": 0.89848608, "num_input_tokens_seen": 45401005, "router_z_loss_clip": 4.171875, "router_z_loss_mlp": 0.47119141, "step": 2101, "time_per_iteration": 2.7332987785339355 }, { "auxiliary_loss_clip": 0.0170296, "auxiliary_loss_mlp": 0.0011635, "balance_loss_clip": 1.4436419, "balance_loss_mlp": 0.06923802, "epoch": 0.12637907710807156, "flos": 70282138780800.0, "grad_norm": 0.744726053726032, "language_loss": 0.57040519, "learning_rate": 3.903348813579662e-06, "loss": 0.58859825, "num_input_tokens_seen": 45466555, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.47070312, "step": 2102, "time_per_iteration": 3.211390495300293 }, { "auxiliary_loss_clip": 0.01830239, "auxiliary_loss_mlp": 0.00368598, "balance_loss_clip": 1.40774679, "balance_loss_mlp": 0.31867269, "epoch": 0.12643920036073952, "flos": 18915084311040.0, "grad_norm": 26.38028617236145, "language_loss": 0.99246216, "learning_rate": 3.903229170377845e-06, "loss": 1.01445055, "num_input_tokens_seen": 45485165, "router_z_loss_clip": 4.2265625, "router_z_loss_mlp": 0.4987793, "step": 2103, "time_per_iteration": 2.686946392059326 }, { "auxiliary_loss_clip": 0.01872387, "auxiliary_loss_mlp": 0.00371786, "balance_loss_clip": 1.44541466, "balance_loss_mlp": 0.32374462, "epoch": 0.1264993236134075, "flos": 27782470010880.0, "grad_norm": 3.422017288889648, "language_loss": 0.83287525, "learning_rate": 3.903109455005387e-06, "loss": 0.855317, "num_input_tokens_seen": 45504630, "router_z_loss_clip": 4.26953125, "router_z_loss_mlp": 0.48022461, "step": 2104, "time_per_iteration": 2.7553439140319824 }, { "auxiliary_loss_clip": 0.01899143, "auxiliary_loss_mlp": 0.00408695, "balance_loss_clip": 1.4546032, "balance_loss_mlp": 0.36155921, "epoch": 0.12655944686607545, "flos": 24754697907840.0, "grad_norm": 2.8458516400502267, "language_loss": 0.87583524, "learning_rate": 3.902989667466828e-06, "loss": 0.89891356, "num_input_tokens_seen": 45524885, "router_z_loss_clip": 4.4453125, "router_z_loss_mlp": 0.47119141, "step": 2105, "time_per_iteration": 2.718404769897461 }, { "auxiliary_loss_clip": 0.01904987, "auxiliary_loss_mlp": 0.0042953, "balance_loss_clip": 1.44875479, "balance_loss_mlp": 0.37865168, "epoch": 0.12661957011874342, "flos": 24133048202880.0, "grad_norm": 321.0046116578536, "language_loss": 0.8996309, "learning_rate": 3.90286980776671e-06, "loss": 0.92297608, "num_input_tokens_seen": 45545000, "router_z_loss_clip": 4.56640625, "router_z_loss_mlp": 0.5090332, "step": 2106, "time_per_iteration": 2.9100565910339355 }, { "auxiliary_loss_clip": 0.01871835, "auxiliary_loss_mlp": 0.00398222, "balance_loss_clip": 1.43068004, "balance_loss_mlp": 0.35015723, "epoch": 0.12667969337141138, "flos": 24569614103040.0, "grad_norm": 125.37959747523752, "language_loss": 0.7970767, "learning_rate": 3.902749875909578e-06, "loss": 0.81977725, "num_input_tokens_seen": 45564210, "router_z_loss_clip": 4.41015625, "router_z_loss_mlp": 0.48022461, "step": 2107, "time_per_iteration": 2.691894054412842 }, { "auxiliary_loss_clip": 0.01874767, "auxiliary_loss_mlp": 0.00396223, "balance_loss_clip": 1.43763816, "balance_loss_mlp": 0.35087615, "epoch": 0.12673981662407935, "flos": 22961677777920.0, "grad_norm": 12.734446934443623, "language_loss": 0.85792273, "learning_rate": 3.90262987189998e-06, "loss": 0.8806327, "num_input_tokens_seen": 45583030, "router_z_loss_clip": 4.375, "router_z_loss_mlp": 0.45361328, "step": 2108, "time_per_iteration": 2.7000575065612793 }, { "auxiliary_loss_clip": 0.01882169, "auxiliary_loss_mlp": 0.00443327, "balance_loss_clip": 1.43769467, "balance_loss_mlp": 0.39533347, "epoch": 0.12679993987674734, "flos": 17274864637440.0, "grad_norm": 5.030709777228479, "language_loss": 0.82092148, "learning_rate": 3.902509795742467e-06, "loss": 0.84417641, "num_input_tokens_seen": 45602265, "router_z_loss_clip": 4.4453125, "router_z_loss_mlp": 0.48022461, "step": 2109, "time_per_iteration": 2.6058220863342285 }, { "auxiliary_loss_clip": 0.01919924, "auxiliary_loss_mlp": 0.0043401, "balance_loss_clip": 1.46468365, "balance_loss_mlp": 0.3858729, "epoch": 0.1268600631294153, "flos": 17275080119040.0, "grad_norm": 4.52743301064182, "language_loss": 0.88976061, "learning_rate": 3.902389647441592e-06, "loss": 0.91329998, "num_input_tokens_seen": 45620595, "router_z_loss_clip": 4.5625, "router_z_loss_mlp": 0.48193359, "step": 2110, "time_per_iteration": 2.7262282371520996 }, { "auxiliary_loss_clip": 0.01922748, "auxiliary_loss_mlp": 0.00471652, "balance_loss_clip": 1.46128762, "balance_loss_mlp": 0.42270452, "epoch": 0.12692018638208327, "flos": 24061047390720.0, "grad_norm": 2.6833899331736233, "language_loss": 0.85108054, "learning_rate": 3.90226942700191e-06, "loss": 0.87502456, "num_input_tokens_seen": 45641140, "router_z_loss_clip": 4.6171875, "router_z_loss_mlp": 0.48974609, "step": 2111, "time_per_iteration": 2.6392388343811035 }, { "auxiliary_loss_clip": 0.01926495, "auxiliary_loss_mlp": 0.00437575, "balance_loss_clip": 1.45815408, "balance_loss_mlp": 0.38858032, "epoch": 0.12698030963475124, "flos": 31831900652160.0, "grad_norm": 24.625637092623357, "language_loss": 0.85261023, "learning_rate": 3.902149134427982e-06, "loss": 0.87625092, "num_input_tokens_seen": 45662315, "router_z_loss_clip": 4.6796875, "router_z_loss_mlp": 0.49023438, "step": 2112, "time_per_iteration": 2.756848096847534 }, { "auxiliary_loss_clip": 0.01889376, "auxiliary_loss_mlp": 0.00448293, "balance_loss_clip": 1.44081557, "balance_loss_mlp": 0.40103859, "epoch": 0.1270404328874192, "flos": 25187744275200.0, "grad_norm": 4.145187832131962, "language_loss": 0.91119504, "learning_rate": 3.902028769724367e-06, "loss": 0.93457174, "num_input_tokens_seen": 45680335, "router_z_loss_clip": 4.484375, "router_z_loss_mlp": 0.47265625, "step": 2113, "time_per_iteration": 2.658616781234741 }, { "auxiliary_loss_clip": 0.01949149, "auxiliary_loss_mlp": 0.00460247, "balance_loss_clip": 1.47604668, "balance_loss_mlp": 0.40350366, "epoch": 0.12710055614008717, "flos": 15997342544640.0, "grad_norm": 8.453803976940339, "language_loss": 0.79926491, "learning_rate": 3.9019083328956315e-06, "loss": 0.82335883, "num_input_tokens_seen": 45696240, "router_z_loss_clip": 4.73046875, "router_z_loss_mlp": 0.56738281, "step": 2114, "time_per_iteration": 2.6448779106140137 }, { "auxiliary_loss_clip": 0.01876034, "auxiliary_loss_mlp": 0.00430875, "balance_loss_clip": 1.43540645, "balance_loss_mlp": 0.38166547, "epoch": 0.12716067939275516, "flos": 15085642515840.0, "grad_norm": 16.1134788944923, "language_loss": 0.89567077, "learning_rate": 3.901787823946341e-06, "loss": 0.9187398, "num_input_tokens_seen": 45713695, "router_z_loss_clip": 4.40234375, "router_z_loss_mlp": 0.49169922, "step": 2115, "time_per_iteration": 2.6348748207092285 }, { "auxiliary_loss_clip": 0.01880549, "auxiliary_loss_mlp": 0.0043009, "balance_loss_clip": 1.43923485, "balance_loss_mlp": 0.3832173, "epoch": 0.12722080264542313, "flos": 28366736636160.0, "grad_norm": 53.450882083490065, "language_loss": 0.93583977, "learning_rate": 3.901667242881065e-06, "loss": 0.95894623, "num_input_tokens_seen": 45736655, "router_z_loss_clip": 4.4140625, "router_z_loss_mlp": 0.46923828, "step": 2116, "time_per_iteration": 2.839796543121338 }, { "auxiliary_loss_clip": 0.01851017, "auxiliary_loss_mlp": 0.00450155, "balance_loss_clip": 1.42157745, "balance_loss_mlp": 0.40812194, "epoch": 0.1272809258980911, "flos": 32379897519360.0, "grad_norm": 20.31689017739344, "language_loss": 0.75771403, "learning_rate": 3.9015465897043775e-06, "loss": 0.78072578, "num_input_tokens_seen": 45758195, "router_z_loss_clip": 4.29296875, "router_z_loss_mlp": 0.42016602, "step": 2117, "time_per_iteration": 2.7671260833740234 }, { "auxiliary_loss_clip": 0.01860101, "auxiliary_loss_mlp": 0.00455165, "balance_loss_clip": 1.41694832, "balance_loss_mlp": 0.40791088, "epoch": 0.12734104915075906, "flos": 16034402401920.0, "grad_norm": 17.71580929895811, "language_loss": 0.91157782, "learning_rate": 3.901425864420852e-06, "loss": 0.93473053, "num_input_tokens_seen": 45774280, "router_z_loss_clip": 4.4296875, "router_z_loss_mlp": 0.47241211, "step": 2118, "time_per_iteration": 2.650200366973877 }, { "auxiliary_loss_clip": 0.01864874, "auxiliary_loss_mlp": 0.00419558, "balance_loss_clip": 1.42709148, "balance_loss_mlp": 0.37518784, "epoch": 0.12740117240342702, "flos": 18260325244800.0, "grad_norm": 3.207592072248894, "language_loss": 0.9446193, "learning_rate": 3.901305067035068e-06, "loss": 0.96746367, "num_input_tokens_seen": 45792760, "router_z_loss_clip": 4.37890625, "router_z_loss_mlp": 0.4440918, "step": 2119, "time_per_iteration": 2.678295850753784 }, { "auxiliary_loss_clip": 0.01874503, "auxiliary_loss_mlp": 0.00432211, "balance_loss_clip": 1.43054414, "balance_loss_mlp": 0.38509959, "epoch": 0.127461295656095, "flos": 12121790664960.0, "grad_norm": 913.2025515079109, "language_loss": 0.95669526, "learning_rate": 3.901184197551605e-06, "loss": 0.97976232, "num_input_tokens_seen": 45804300, "router_z_loss_clip": 4.43359375, "router_z_loss_mlp": 0.47119141, "step": 2120, "time_per_iteration": 2.6011974811553955 }, { "auxiliary_loss_clip": 0.01882495, "auxiliary_loss_mlp": 0.00406709, "balance_loss_clip": 1.43211985, "balance_loss_mlp": 0.3616243, "epoch": 0.12752141890876295, "flos": 23149095966720.0, "grad_norm": 3.576438013037645, "language_loss": 0.83597523, "learning_rate": 3.901063255975046e-06, "loss": 0.85886729, "num_input_tokens_seen": 45823780, "router_z_loss_clip": 4.50390625, "router_z_loss_mlp": 0.45092773, "step": 2121, "time_per_iteration": 2.7812507152557373 }, { "auxiliary_loss_clip": 0.01870306, "auxiliary_loss_mlp": 0.00439022, "balance_loss_clip": 1.4280076, "balance_loss_mlp": 0.39217302, "epoch": 0.12758154216143094, "flos": 21615997628160.0, "grad_norm": 66.86927813558697, "language_loss": 0.90969205, "learning_rate": 3.900942242309978e-06, "loss": 0.93278533, "num_input_tokens_seen": 45840495, "router_z_loss_clip": 4.42578125, "router_z_loss_mlp": 0.46850586, "step": 2122, "time_per_iteration": 2.63783860206604 }, { "auxiliary_loss_clip": 0.01878657, "auxiliary_loss_mlp": 0.00442612, "balance_loss_clip": 1.42971826, "balance_loss_mlp": 0.39416558, "epoch": 0.1276416654140989, "flos": 15924874855680.0, "grad_norm": 30.69898237085341, "language_loss": 0.85788035, "learning_rate": 3.90082115656099e-06, "loss": 0.88109297, "num_input_tokens_seen": 45857735, "router_z_loss_clip": 4.48828125, "router_z_loss_mlp": 0.48461914, "step": 2123, "time_per_iteration": 2.8146605491638184 }, { "auxiliary_loss_clip": 0.01878321, "auxiliary_loss_mlp": 0.00412049, "balance_loss_clip": 1.4380126, "balance_loss_mlp": 0.36677346, "epoch": 0.12770178866676687, "flos": 22382690451840.0, "grad_norm": 11.750155678117526, "language_loss": 0.85184264, "learning_rate": 3.900699998732673e-06, "loss": 0.87474638, "num_input_tokens_seen": 45876485, "router_z_loss_clip": 4.40625, "router_z_loss_mlp": 0.45288086, "step": 2124, "time_per_iteration": 2.6366004943847656 }, { "auxiliary_loss_clip": 0.01857431, "auxiliary_loss_mlp": 0.00394344, "balance_loss_clip": 1.41424346, "balance_loss_mlp": 0.35045126, "epoch": 0.12776191191943484, "flos": 21652482867840.0, "grad_norm": 3.7578796059147965, "language_loss": 0.82128447, "learning_rate": 3.900578768829623e-06, "loss": 0.84380221, "num_input_tokens_seen": 45894645, "router_z_loss_clip": 4.42578125, "router_z_loss_mlp": 0.43896484, "step": 2125, "time_per_iteration": 2.688483953475952 }, { "auxiliary_loss_clip": 0.01863992, "auxiliary_loss_mlp": 0.00412159, "balance_loss_clip": 1.42043209, "balance_loss_mlp": 0.36829045, "epoch": 0.1278220351721028, "flos": 25735561574400.0, "grad_norm": 3.1930173913833677, "language_loss": 0.84304464, "learning_rate": 3.900457466856434e-06, "loss": 0.8658061, "num_input_tokens_seen": 45913755, "router_z_loss_clip": 4.44140625, "router_z_loss_mlp": 0.4387207, "step": 2126, "time_per_iteration": 2.6996278762817383 }, { "auxiliary_loss_clip": 0.01862803, "auxiliary_loss_mlp": 0.00419595, "balance_loss_clip": 1.42413998, "balance_loss_mlp": 0.37820518, "epoch": 0.12788215842477077, "flos": 41243224982400.0, "grad_norm": 10.148381240003673, "language_loss": 0.75616562, "learning_rate": 3.9003360928177085e-06, "loss": 0.77898961, "num_input_tokens_seen": 45936095, "router_z_loss_clip": 4.37890625, "router_z_loss_mlp": 0.41381836, "step": 2127, "time_per_iteration": 2.9792075157165527 }, { "auxiliary_loss_clip": 0.01860319, "auxiliary_loss_mlp": 0.00116748, "balance_loss_clip": 1.56499767, "balance_loss_mlp": 0.0864211, "epoch": 0.12794228167743876, "flos": 70877430881280.0, "grad_norm": 0.8212479864869062, "language_loss": 0.62315154, "learning_rate": 3.900214646718047e-06, "loss": 0.64292228, "num_input_tokens_seen": 46004655, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.30273438, "step": 2128, "time_per_iteration": 3.2068824768066406 }, { "auxiliary_loss_clip": 0.01867829, "auxiliary_loss_mlp": 0.00435067, "balance_loss_clip": 1.42545891, "balance_loss_mlp": 0.38752615, "epoch": 0.12800240493010673, "flos": 16289727252480.0, "grad_norm": 691.7688551879252, "language_loss": 0.8594743, "learning_rate": 3.900093128562056e-06, "loss": 0.88250327, "num_input_tokens_seen": 46023610, "router_z_loss_clip": 4.421875, "router_z_loss_mlp": 0.47558594, "step": 2129, "time_per_iteration": 4.040445804595947 }, { "auxiliary_loss_clip": 0.01846811, "auxiliary_loss_mlp": 0.00422801, "balance_loss_clip": 1.40798473, "balance_loss_mlp": 0.37561789, "epoch": 0.1280625281827747, "flos": 20631542601600.0, "grad_norm": 28.7605328028082, "language_loss": 0.87154984, "learning_rate": 3.899971538354343e-06, "loss": 0.89424598, "num_input_tokens_seen": 46041725, "router_z_loss_clip": 4.38085938, "router_z_loss_mlp": 0.47216797, "step": 2130, "time_per_iteration": 2.701592206954956 }, { "auxiliary_loss_clip": 0.0185648, "auxiliary_loss_mlp": 0.00447598, "balance_loss_clip": 1.41309142, "balance_loss_mlp": 0.40303731, "epoch": 0.12812265143544266, "flos": 22638230784000.0, "grad_norm": 5.385552921053677, "language_loss": 0.78741562, "learning_rate": 3.899849876099518e-06, "loss": 0.81045645, "num_input_tokens_seen": 46061095, "router_z_loss_clip": 4.4375, "router_z_loss_mlp": 0.44555664, "step": 2131, "time_per_iteration": 4.095390319824219 }, { "auxiliary_loss_clip": 0.01843978, "auxiliary_loss_mlp": 0.00443391, "balance_loss_clip": 1.41774273, "balance_loss_mlp": 0.39995104, "epoch": 0.12818277468811062, "flos": 34714701463680.0, "grad_norm": 8.136697697140965, "language_loss": 0.78422701, "learning_rate": 3.899728141802197e-06, "loss": 0.80710071, "num_input_tokens_seen": 46082670, "router_z_loss_clip": 4.26171875, "router_z_loss_mlp": 0.43408203, "step": 2132, "time_per_iteration": 4.28913950920105 }, { "auxiliary_loss_clip": 0.01826986, "auxiliary_loss_mlp": 0.00403817, "balance_loss_clip": 1.4112184, "balance_loss_mlp": 0.36195081, "epoch": 0.1282428979407786, "flos": 23112107936640.0, "grad_norm": 3.7219319701229336, "language_loss": 0.86790287, "learning_rate": 3.8996063354669935e-06, "loss": 0.89021087, "num_input_tokens_seen": 46102410, "router_z_loss_clip": 4.16015625, "router_z_loss_mlp": 0.41894531, "step": 2133, "time_per_iteration": 2.645785093307495 }, { "auxiliary_loss_clip": 0.01850686, "auxiliary_loss_mlp": 0.00457326, "balance_loss_clip": 1.41480219, "balance_loss_mlp": 0.41288447, "epoch": 0.12830302119344655, "flos": 20886508316160.0, "grad_norm": 16.447563226023828, "language_loss": 0.8849923, "learning_rate": 3.899484457098528e-06, "loss": 0.90807247, "num_input_tokens_seen": 46121145, "router_z_loss_clip": 4.359375, "router_z_loss_mlp": 0.4440918, "step": 2134, "time_per_iteration": 2.640983819961548 }, { "auxiliary_loss_clip": 0.01830474, "auxiliary_loss_mlp": 0.00444988, "balance_loss_clip": 1.40820003, "balance_loss_mlp": 0.39942592, "epoch": 0.12836314444611455, "flos": 21397768548480.0, "grad_norm": 17.497241013409685, "language_loss": 0.89519072, "learning_rate": 3.899362506701421e-06, "loss": 0.91794538, "num_input_tokens_seen": 46140740, "router_z_loss_clip": 4.23046875, "router_z_loss_mlp": 0.45556641, "step": 2135, "time_per_iteration": 4.031800985336304 }, { "auxiliary_loss_clip": 0.01814895, "auxiliary_loss_mlp": 0.00432271, "balance_loss_clip": 1.40296388, "balance_loss_mlp": 0.38985634, "epoch": 0.1284232676987825, "flos": 13662466773120.0, "grad_norm": 9.990883622697485, "language_loss": 0.84238231, "learning_rate": 3.899240484280298e-06, "loss": 0.86485398, "num_input_tokens_seen": 46156805, "router_z_loss_clip": 4.12109375, "router_z_loss_mlp": 0.42407227, "step": 2136, "time_per_iteration": 2.6686925888061523 }, { "auxiliary_loss_clip": 0.01744958, "auxiliary_loss_mlp": 0.00230887, "balance_loss_clip": 1.50894976, "balance_loss_mlp": 0.21124144, "epoch": 0.12848339095145048, "flos": 59994737735040.0, "grad_norm": 0.9157893838301836, "language_loss": 0.59662038, "learning_rate": 3.899118389839785e-06, "loss": 0.61637884, "num_input_tokens_seen": 46222085, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.19628906, "step": 2137, "time_per_iteration": 3.2991943359375 }, { "auxiliary_loss_clip": 0.01855914, "auxiliary_loss_mlp": 0.00468289, "balance_loss_clip": 1.42923236, "balance_loss_mlp": 0.42418137, "epoch": 0.12854351420411844, "flos": 13881378211200.0, "grad_norm": 2.176059162167412, "language_loss": 0.90502697, "learning_rate": 3.898996223384512e-06, "loss": 0.92826903, "num_input_tokens_seen": 46239970, "router_z_loss_clip": 4.26171875, "router_z_loss_mlp": 0.44116211, "step": 2138, "time_per_iteration": 2.648012399673462 }, { "auxiliary_loss_clip": 0.01872676, "auxiliary_loss_mlp": 0.00497255, "balance_loss_clip": 1.43520129, "balance_loss_mlp": 0.44866493, "epoch": 0.1286036374567864, "flos": 22637943475200.0, "grad_norm": 451.16768205963155, "language_loss": 0.85783231, "learning_rate": 3.898873984919113e-06, "loss": 0.8815316, "num_input_tokens_seen": 46257740, "router_z_loss_clip": 4.375, "router_z_loss_mlp": 0.48632812, "step": 2139, "time_per_iteration": 2.8073408603668213 }, { "auxiliary_loss_clip": 0.01838827, "auxiliary_loss_mlp": 0.00409904, "balance_loss_clip": 1.42780602, "balance_loss_mlp": 0.36839515, "epoch": 0.12866376070945437, "flos": 16324775948160.0, "grad_norm": 111.748254402506, "language_loss": 0.89996672, "learning_rate": 3.8987516744482215e-06, "loss": 0.922454, "num_input_tokens_seen": 46275445, "router_z_loss_clip": 4.10742188, "router_z_loss_mlp": 0.4152832, "step": 2140, "time_per_iteration": 2.643249273300171 }, { "auxiliary_loss_clip": 0.01811667, "auxiliary_loss_mlp": 0.00446993, "balance_loss_clip": 1.40908754, "balance_loss_mlp": 0.40512669, "epoch": 0.12872388396212234, "flos": 11874546374400.0, "grad_norm": 22.491088816815292, "language_loss": 0.92005253, "learning_rate": 3.898629291976476e-06, "loss": 0.94263911, "num_input_tokens_seen": 46291710, "router_z_loss_clip": 4.03125, "router_z_loss_mlp": 0.41894531, "step": 2141, "time_per_iteration": 2.6784236431121826 }, { "auxiliary_loss_clip": 0.01807008, "auxiliary_loss_mlp": 0.00453182, "balance_loss_clip": 1.41078246, "balance_loss_mlp": 0.40819228, "epoch": 0.12878400721479033, "flos": 28366700722560.0, "grad_norm": 18.201279704919, "language_loss": 0.76158154, "learning_rate": 3.898506837508518e-06, "loss": 0.78418344, "num_input_tokens_seen": 46311335, "router_z_loss_clip": 3.95898438, "router_z_loss_mlp": 0.44970703, "step": 2142, "time_per_iteration": 2.7159788608551025 }, { "auxiliary_loss_clip": 0.0182741, "auxiliary_loss_mlp": 0.00475293, "balance_loss_clip": 1.42543662, "balance_loss_mlp": 0.42534417, "epoch": 0.1288441304674583, "flos": 25885632597120.0, "grad_norm": 485.58166862508403, "language_loss": 0.88744438, "learning_rate": 3.89838431104899e-06, "loss": 0.91047144, "num_input_tokens_seen": 46330985, "router_z_loss_clip": 4.01757812, "router_z_loss_mlp": 0.49926758, "step": 2143, "time_per_iteration": 2.7095143795013428 }, { "auxiliary_loss_clip": 0.01820282, "auxiliary_loss_mlp": 0.00464024, "balance_loss_clip": 1.42267621, "balance_loss_mlp": 0.41633987, "epoch": 0.12890425372012626, "flos": 20813789232000.0, "grad_norm": 7.284826436223412, "language_loss": 0.86425102, "learning_rate": 3.898261712602539e-06, "loss": 0.88709414, "num_input_tokens_seen": 46351295, "router_z_loss_clip": 3.97265625, "router_z_loss_mlp": 0.47729492, "step": 2144, "time_per_iteration": 2.66925311088562 }, { "auxiliary_loss_clip": 0.01777986, "auxiliary_loss_mlp": 0.00451539, "balance_loss_clip": 1.39586389, "balance_loss_mlp": 0.40645421, "epoch": 0.12896437697279423, "flos": 22565870835840.0, "grad_norm": 22.000421952332097, "language_loss": 0.84740174, "learning_rate": 3.898139042173813e-06, "loss": 0.86969697, "num_input_tokens_seen": 46368600, "router_z_loss_clip": 3.8203125, "router_z_loss_mlp": 0.45043945, "step": 2145, "time_per_iteration": 2.7036099433898926 }, { "auxiliary_loss_clip": 0.01759627, "auxiliary_loss_mlp": 0.00440991, "balance_loss_clip": 1.38627028, "balance_loss_mlp": 0.39392725, "epoch": 0.1290245002254622, "flos": 17493776075520.0, "grad_norm": 4.715395168394478, "language_loss": 0.89599967, "learning_rate": 3.898016299767465e-06, "loss": 0.91800582, "num_input_tokens_seen": 46387370, "router_z_loss_clip": 3.734375, "router_z_loss_mlp": 0.47094727, "step": 2146, "time_per_iteration": 2.657203197479248 }, { "auxiliary_loss_clip": 0.01734711, "auxiliary_loss_mlp": 0.00446417, "balance_loss_clip": 1.37188435, "balance_loss_mlp": 0.40168983, "epoch": 0.12908462347813016, "flos": 36315957859200.0, "grad_norm": 5.77019151459533, "language_loss": 0.7744388, "learning_rate": 3.897893485388149e-06, "loss": 0.7962501, "num_input_tokens_seen": 46409570, "router_z_loss_clip": 3.62304688, "router_z_loss_mlp": 0.44726562, "step": 2147, "time_per_iteration": 2.821298122406006 }, { "auxiliary_loss_clip": 0.01726245, "auxiliary_loss_mlp": 0.0047571, "balance_loss_clip": 1.36647344, "balance_loss_mlp": 0.4265486, "epoch": 0.12914474673079815, "flos": 22528703237760.0, "grad_norm": 4.762964990354122, "language_loss": 0.77224517, "learning_rate": 3.897770599040521e-06, "loss": 0.79426467, "num_input_tokens_seen": 46429320, "router_z_loss_clip": 3.59960938, "router_z_loss_mlp": 0.49194336, "step": 2148, "time_per_iteration": 2.75254225730896 }, { "auxiliary_loss_clip": 0.01690865, "auxiliary_loss_mlp": 0.00483768, "balance_loss_clip": 1.34275866, "balance_loss_mlp": 0.43582216, "epoch": 0.12920486998346611, "flos": 21471888263040.0, "grad_norm": 4.588498531273309, "language_loss": 0.82972169, "learning_rate": 3.897647640729242e-06, "loss": 0.85146803, "num_input_tokens_seen": 46450155, "router_z_loss_clip": 3.484375, "router_z_loss_mlp": 0.47924805, "step": 2149, "time_per_iteration": 2.6784353256225586 }, { "auxiliary_loss_clip": 0.01695018, "auxiliary_loss_mlp": 0.00450009, "balance_loss_clip": 1.34649622, "balance_loss_mlp": 0.40387499, "epoch": 0.12926499323613408, "flos": 27308556944640.0, "grad_norm": 29.78124015872229, "language_loss": 0.83015358, "learning_rate": 3.897524610458975e-06, "loss": 0.85160387, "num_input_tokens_seen": 46470280, "router_z_loss_clip": 3.48046875, "router_z_loss_mlp": 0.4621582, "step": 2150, "time_per_iteration": 2.727299451828003 }, { "auxiliary_loss_clip": 0.01673302, "auxiliary_loss_mlp": 0.00456217, "balance_loss_clip": 1.32677722, "balance_loss_mlp": 0.41170382, "epoch": 0.12932511648880204, "flos": 22091131756800.0, "grad_norm": 410.62531179703853, "language_loss": 0.77656949, "learning_rate": 3.8974015082343835e-06, "loss": 0.79786462, "num_input_tokens_seen": 46487605, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.44506836, "step": 2151, "time_per_iteration": 2.652137517929077 }, { "auxiliary_loss_clip": 0.01665846, "auxiliary_loss_mlp": 0.00460785, "balance_loss_clip": 1.32561648, "balance_loss_mlp": 0.41584271, "epoch": 0.12938523974147, "flos": 20302780394880.0, "grad_norm": 3.4018003286765963, "language_loss": 0.8965103, "learning_rate": 3.897278334060137e-06, "loss": 0.91777658, "num_input_tokens_seen": 46505100, "router_z_loss_clip": 3.40429688, "router_z_loss_mlp": 0.44970703, "step": 2152, "time_per_iteration": 2.6721339225769043 }, { "auxiliary_loss_clip": 0.01651161, "auxiliary_loss_mlp": 0.00407457, "balance_loss_clip": 1.31799555, "balance_loss_mlp": 0.36528087, "epoch": 0.12944536299413797, "flos": 19499961467520.0, "grad_norm": 5.988853955469221, "language_loss": 0.84988821, "learning_rate": 3.897155087940906e-06, "loss": 0.87047434, "num_input_tokens_seen": 46524020, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.421875, "step": 2153, "time_per_iteration": 2.665348529815674 }, { "auxiliary_loss_clip": 0.01670566, "auxiliary_loss_mlp": 0.004234, "balance_loss_clip": 1.33499193, "balance_loss_mlp": 0.38072309, "epoch": 0.12950548624680594, "flos": 27707919333120.0, "grad_norm": 11.382817267989774, "language_loss": 0.86426175, "learning_rate": 3.897031769881364e-06, "loss": 0.88520145, "num_input_tokens_seen": 46544640, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.42675781, "step": 2154, "time_per_iteration": 2.751298427581787 }, { "auxiliary_loss_clip": 0.01653418, "auxiliary_loss_mlp": 0.00452661, "balance_loss_clip": 1.32102323, "balance_loss_mlp": 0.40943563, "epoch": 0.12956560949947393, "flos": 17565740974080.0, "grad_norm": 4.0826583565111445, "language_loss": 0.89364982, "learning_rate": 3.896908379886188e-06, "loss": 0.91471064, "num_input_tokens_seen": 46561395, "router_z_loss_clip": 3.32617188, "router_z_loss_mlp": 0.43237305, "step": 2155, "time_per_iteration": 2.6321706771850586 }, { "auxiliary_loss_clip": 0.01626097, "auxiliary_loss_mlp": 0.00453152, "balance_loss_clip": 1.2906692, "balance_loss_mlp": 0.41045088, "epoch": 0.1296257327521419, "flos": 20740711011840.0, "grad_norm": 90.27107559925862, "language_loss": 0.84624463, "learning_rate": 3.896784917960055e-06, "loss": 0.86703712, "num_input_tokens_seen": 46579395, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.42675781, "step": 2156, "time_per_iteration": 2.688981533050537 }, { "auxiliary_loss_clip": 0.01628206, "auxiliary_loss_mlp": 0.00427089, "balance_loss_clip": 1.30503094, "balance_loss_mlp": 0.38567519, "epoch": 0.12968585600480986, "flos": 16395735265920.0, "grad_norm": 11.742380929726915, "language_loss": 0.90689915, "learning_rate": 3.896661384107648e-06, "loss": 0.92745209, "num_input_tokens_seen": 46597090, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.4140625, "step": 2157, "time_per_iteration": 2.6291191577911377 }, { "auxiliary_loss_clip": 0.01625416, "auxiliary_loss_mlp": 0.00432718, "balance_loss_clip": 1.29532659, "balance_loss_mlp": 0.38915873, "epoch": 0.12974597925747783, "flos": 28329533124480.0, "grad_norm": 173.26795372566554, "language_loss": 0.86296451, "learning_rate": 3.896537778333651e-06, "loss": 0.88354588, "num_input_tokens_seen": 46617355, "router_z_loss_clip": 3.29882812, "router_z_loss_mlp": 0.43554688, "step": 2158, "time_per_iteration": 2.7158915996551514 }, { "auxiliary_loss_clip": 0.01604328, "auxiliary_loss_mlp": 0.00489566, "balance_loss_clip": 1.27445376, "balance_loss_mlp": 0.44226357, "epoch": 0.1298061025101458, "flos": 9683025782400.0, "grad_norm": 45.33372519514227, "language_loss": 0.81698406, "learning_rate": 3.896414100642752e-06, "loss": 0.83792293, "num_input_tokens_seen": 46633130, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.47314453, "step": 2159, "time_per_iteration": 2.7019472122192383 }, { "auxiliary_loss_clip": 0.01589649, "auxiliary_loss_mlp": 0.00441269, "balance_loss_clip": 1.27203035, "balance_loss_mlp": 0.4010244, "epoch": 0.12986622576281376, "flos": 27709535445120.0, "grad_norm": 32.51788720524774, "language_loss": 0.89362848, "learning_rate": 3.89629035103964e-06, "loss": 0.91393763, "num_input_tokens_seen": 46650575, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.40234375, "step": 2160, "time_per_iteration": 2.7197370529174805 }, { "auxiliary_loss_clip": 0.01598905, "auxiliary_loss_mlp": 0.00423213, "balance_loss_clip": 1.2806952, "balance_loss_mlp": 0.37836659, "epoch": 0.12992634901548175, "flos": 18802719590400.0, "grad_norm": 276.1469797367235, "language_loss": 0.86583245, "learning_rate": 3.896166529529008e-06, "loss": 0.88605356, "num_input_tokens_seen": 46668780, "router_z_loss_clip": 3.1796875, "router_z_loss_mlp": 0.44848633, "step": 2161, "time_per_iteration": 2.71525502204895 }, { "auxiliary_loss_clip": 0.01603232, "auxiliary_loss_mlp": 0.00456297, "balance_loss_clip": 1.28134727, "balance_loss_mlp": 0.41030651, "epoch": 0.12998647226814972, "flos": 29127575543040.0, "grad_norm": 2.8414930949012005, "language_loss": 0.8906157, "learning_rate": 3.896042636115551e-06, "loss": 0.91121101, "num_input_tokens_seen": 46687550, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.45947266, "step": 2162, "time_per_iteration": 2.7103567123413086 }, { "auxiliary_loss_clip": 0.01599689, "auxiliary_loss_mlp": 0.00441556, "balance_loss_clip": 1.27351594, "balance_loss_mlp": 0.39952308, "epoch": 0.13004659552081768, "flos": 19573686132480.0, "grad_norm": 229.5492161885517, "language_loss": 0.79769391, "learning_rate": 3.895918670803968e-06, "loss": 0.81810635, "num_input_tokens_seen": 46706730, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.42041016, "step": 2163, "time_per_iteration": 2.6889004707336426 }, { "auxiliary_loss_clip": 0.01589461, "auxiliary_loss_mlp": 0.00454405, "balance_loss_clip": 1.26793075, "balance_loss_mlp": 0.40917701, "epoch": 0.13010671877348565, "flos": 22490709626880.0, "grad_norm": 37.164932538998485, "language_loss": 0.87828308, "learning_rate": 3.895794633598958e-06, "loss": 0.89872169, "num_input_tokens_seen": 46724250, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.45214844, "step": 2164, "time_per_iteration": 2.6591014862060547 }, { "auxiliary_loss_clip": 0.01583285, "auxiliary_loss_mlp": 0.00479302, "balance_loss_clip": 1.259619, "balance_loss_mlp": 0.4348844, "epoch": 0.1301668420261536, "flos": 23878226142720.0, "grad_norm": 86.437227601016, "language_loss": 0.79446328, "learning_rate": 3.8956705245052256e-06, "loss": 0.81508917, "num_input_tokens_seen": 46744105, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.44458008, "step": 2165, "time_per_iteration": 2.7069926261901855 }, { "auxiliary_loss_clip": 0.01598496, "auxiliary_loss_mlp": 0.00433174, "balance_loss_clip": 1.27131283, "balance_loss_mlp": 0.38673019, "epoch": 0.13022696527882158, "flos": 23150065633920.0, "grad_norm": 230.4066382092491, "language_loss": 0.8238278, "learning_rate": 3.8955463435274765e-06, "loss": 0.84414446, "num_input_tokens_seen": 46764250, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.46411133, "step": 2166, "time_per_iteration": 2.6700475215911865 }, { "auxiliary_loss_clip": 0.01568292, "auxiliary_loss_mlp": 0.00428638, "balance_loss_clip": 1.25126505, "balance_loss_mlp": 0.38805878, "epoch": 0.13028708853148954, "flos": 26908548111360.0, "grad_norm": 188.65731273129157, "language_loss": 0.88631666, "learning_rate": 3.895422090670421e-06, "loss": 0.90628594, "num_input_tokens_seen": 46786865, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.40576172, "step": 2167, "time_per_iteration": 2.806652069091797 }, { "auxiliary_loss_clip": 0.01567749, "auxiliary_loss_mlp": 0.00437976, "balance_loss_clip": 1.25153232, "balance_loss_mlp": 0.39482185, "epoch": 0.13034721178415754, "flos": 21251468453760.0, "grad_norm": 42.159399595956, "language_loss": 0.89860308, "learning_rate": 3.89529776593877e-06, "loss": 0.91866028, "num_input_tokens_seen": 46807030, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.43188477, "step": 2168, "time_per_iteration": 2.6503350734710693 }, { "auxiliary_loss_clip": 0.01552491, "auxiliary_loss_mlp": 0.00415648, "balance_loss_clip": 1.23271501, "balance_loss_mlp": 0.37478325, "epoch": 0.1304073350368255, "flos": 18767239931520.0, "grad_norm": 6.64503790927978, "language_loss": 0.86463511, "learning_rate": 3.8951733693372375e-06, "loss": 0.88431656, "num_input_tokens_seen": 46826280, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.40844727, "step": 2169, "time_per_iteration": 2.680546998977661 }, { "auxiliary_loss_clip": 0.01555653, "auxiliary_loss_mlp": 0.00407948, "balance_loss_clip": 1.23892069, "balance_loss_mlp": 0.36701128, "epoch": 0.13046745828949347, "flos": 28364653647360.0, "grad_norm": 10.30477413057772, "language_loss": 0.75914431, "learning_rate": 3.8950489008705406e-06, "loss": 0.77878034, "num_input_tokens_seen": 46846505, "router_z_loss_clip": 3.16992188, "router_z_loss_mlp": 0.40942383, "step": 2170, "time_per_iteration": 2.702849864959717 }, { "auxiliary_loss_clip": 0.01550119, "auxiliary_loss_mlp": 0.00392287, "balance_loss_clip": 1.23607767, "balance_loss_mlp": 0.35378233, "epoch": 0.13052758154216143, "flos": 29605044055680.0, "grad_norm": 486.2605619014117, "language_loss": 0.72714663, "learning_rate": 3.8949243605434e-06, "loss": 0.74657065, "num_input_tokens_seen": 46867380, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.38500977, "step": 2171, "time_per_iteration": 4.0728068351745605 }, { "auxiliary_loss_clip": 0.01544479, "auxiliary_loss_mlp": 0.00400007, "balance_loss_clip": 1.22711372, "balance_loss_mlp": 0.35988128, "epoch": 0.1305877047948294, "flos": 19390864884480.0, "grad_norm": 26.121974399572995, "language_loss": 0.82225037, "learning_rate": 3.894799748360537e-06, "loss": 0.84169519, "num_input_tokens_seen": 46886810, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.40161133, "step": 2172, "time_per_iteration": 2.6557199954986572 }, { "auxiliary_loss_clip": 0.01541205, "auxiliary_loss_mlp": 0.0041749, "balance_loss_clip": 1.23090601, "balance_loss_mlp": 0.3789134, "epoch": 0.13064782804749736, "flos": 16873527000960.0, "grad_norm": 9.122068002291355, "language_loss": 0.82722831, "learning_rate": 3.894675064326678e-06, "loss": 0.84681523, "num_input_tokens_seen": 46905620, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.38574219, "step": 2173, "time_per_iteration": 2.693657875061035 }, { "auxiliary_loss_clip": 0.01540236, "auxiliary_loss_mlp": 0.0040706, "balance_loss_clip": 1.22635663, "balance_loss_mlp": 0.36574227, "epoch": 0.13070795130016533, "flos": 24499085748480.0, "grad_norm": 7.321632271148574, "language_loss": 0.79413223, "learning_rate": 3.894550308446551e-06, "loss": 0.81360519, "num_input_tokens_seen": 46925120, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.41308594, "step": 2174, "time_per_iteration": 5.572129249572754 }, { "auxiliary_loss_clip": 0.01515071, "auxiliary_loss_mlp": 0.00650605, "balance_loss_clip": 1.27576423, "balance_loss_mlp": 0.6248557, "epoch": 0.13076807455283332, "flos": 71054505953280.0, "grad_norm": 0.8791697768683808, "language_loss": 0.58768988, "learning_rate": 3.894425480724886e-06, "loss": 0.60934663, "num_input_tokens_seen": 46988195, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.2578125, "step": 2175, "time_per_iteration": 3.23691725730896 }, { "auxiliary_loss_clip": 0.01509982, "auxiliary_loss_mlp": 0.00418481, "balance_loss_clip": 1.19759011, "balance_loss_mlp": 0.37845045, "epoch": 0.13082819780550128, "flos": 20264499475200.0, "grad_norm": 2.7047287850236974, "language_loss": 0.85819411, "learning_rate": 3.894300581166417e-06, "loss": 0.87747872, "num_input_tokens_seen": 47004720, "router_z_loss_clip": 3.12109375, "router_z_loss_mlp": 0.40039062, "step": 2176, "time_per_iteration": 2.6656627655029297 }, { "auxiliary_loss_clip": 0.0151674, "auxiliary_loss_mlp": 0.0034344, "balance_loss_clip": 1.21031559, "balance_loss_mlp": 0.3084397, "epoch": 0.13088832105816925, "flos": 34203441231360.0, "grad_norm": 3.263474343284716, "language_loss": 0.80767012, "learning_rate": 3.894175609775881e-06, "loss": 0.82627189, "num_input_tokens_seen": 47024255, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.34985352, "step": 2177, "time_per_iteration": 4.122455835342407 }, { "auxiliary_loss_clip": 0.0152154, "auxiliary_loss_mlp": 0.00341388, "balance_loss_clip": 1.21573722, "balance_loss_mlp": 0.30691272, "epoch": 0.13094844431083721, "flos": 17894970057600.0, "grad_norm": 1.8654335804426165, "language_loss": 0.88926554, "learning_rate": 3.894050566558015e-06, "loss": 0.90789473, "num_input_tokens_seen": 47042465, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.3449707, "step": 2178, "time_per_iteration": 2.6145846843719482 }, { "auxiliary_loss_clip": 0.01492929, "auxiliary_loss_mlp": 0.00361116, "balance_loss_clip": 1.19518447, "balance_loss_mlp": 0.32660457, "epoch": 0.13100856756350518, "flos": 17311313963520.0, "grad_norm": 20.876971709083342, "language_loss": 0.82829499, "learning_rate": 3.893925451517562e-06, "loss": 0.84683549, "num_input_tokens_seen": 47060370, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.34484863, "step": 2179, "time_per_iteration": 2.6292593479156494 }, { "auxiliary_loss_clip": 0.01512644, "auxiliary_loss_mlp": 0.00317075, "balance_loss_clip": 1.21766996, "balance_loss_mlp": 0.28666505, "epoch": 0.13106869081617314, "flos": 22200551562240.0, "grad_norm": 4.43649030559081, "language_loss": 0.90314484, "learning_rate": 3.893800264659266e-06, "loss": 0.92144209, "num_input_tokens_seen": 47081415, "router_z_loss_clip": 2.94921875, "router_z_loss_mlp": 0.30444336, "step": 2180, "time_per_iteration": 2.680825710296631 }, { "auxiliary_loss_clip": 0.01506549, "auxiliary_loss_mlp": 0.00320719, "balance_loss_clip": 1.21493912, "balance_loss_mlp": 0.28927162, "epoch": 0.13112881406884114, "flos": 21763123735680.0, "grad_norm": 125.50890600209725, "language_loss": 0.94848144, "learning_rate": 3.8936750059878746e-06, "loss": 0.96675414, "num_input_tokens_seen": 47099860, "router_z_loss_clip": 2.91601562, "router_z_loss_mlp": 0.31420898, "step": 2181, "time_per_iteration": 2.690551280975342 }, { "auxiliary_loss_clip": 0.01509996, "auxiliary_loss_mlp": 0.0035176, "balance_loss_clip": 1.21411467, "balance_loss_mlp": 0.32189792, "epoch": 0.1311889373215091, "flos": 23331091201920.0, "grad_norm": 54.069024631043035, "language_loss": 0.76681268, "learning_rate": 3.893549675508137e-06, "loss": 0.78543019, "num_input_tokens_seen": 47118540, "router_z_loss_clip": 2.95703125, "router_z_loss_mlp": 0.2989502, "step": 2182, "time_per_iteration": 2.646402359008789 }, { "auxiliary_loss_clip": 0.01499584, "auxiliary_loss_mlp": 0.00367912, "balance_loss_clip": 1.20597744, "balance_loss_mlp": 0.33312693, "epoch": 0.13124906057417707, "flos": 21467363149440.0, "grad_norm": 4.154810934320002, "language_loss": 0.85222089, "learning_rate": 3.893424273224806e-06, "loss": 0.87089586, "num_input_tokens_seen": 47136710, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.34790039, "step": 2183, "time_per_iteration": 2.6701717376708984 }, { "auxiliary_loss_clip": 0.01501266, "auxiliary_loss_mlp": 0.00334045, "balance_loss_clip": 1.2154119, "balance_loss_mlp": 0.30644816, "epoch": 0.13130918382684503, "flos": 23255319461760.0, "grad_norm": 7.481997217838419, "language_loss": 0.91929758, "learning_rate": 3.893298799142636e-06, "loss": 0.93765068, "num_input_tokens_seen": 47157155, "router_z_loss_clip": 2.85546875, "router_z_loss_mlp": 0.27600098, "step": 2184, "time_per_iteration": 2.6474099159240723 }, { "auxiliary_loss_clip": 0.01526164, "auxiliary_loss_mlp": 0.00382464, "balance_loss_clip": 1.23997688, "balance_loss_mlp": 0.35101622, "epoch": 0.131369307079513, "flos": 20850274471680.0, "grad_norm": 3.747045430057401, "language_loss": 0.88129628, "learning_rate": 3.893173253266387e-06, "loss": 0.90038252, "num_input_tokens_seen": 47176820, "router_z_loss_clip": 2.86132812, "router_z_loss_mlp": 0.31445312, "step": 2185, "time_per_iteration": 2.7743515968322754 }, { "auxiliary_loss_clip": 0.01522269, "auxiliary_loss_mlp": 0.00367139, "balance_loss_clip": 1.2372694, "balance_loss_mlp": 0.33676463, "epoch": 0.13142943033218096, "flos": 17858341163520.0, "grad_norm": 14606.935367415379, "language_loss": 0.80836439, "learning_rate": 3.893047635600818e-06, "loss": 0.82725847, "num_input_tokens_seen": 47195855, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.30322266, "step": 2186, "time_per_iteration": 2.6173899173736572 }, { "auxiliary_loss_clip": 0.01525106, "auxiliary_loss_mlp": 0.00395585, "balance_loss_clip": 1.24039841, "balance_loss_mlp": 0.36268288, "epoch": 0.13148955358484893, "flos": 20996035862400.0, "grad_norm": 3.5067819573199586, "language_loss": 0.88195169, "learning_rate": 3.892921946150693e-06, "loss": 0.90115857, "num_input_tokens_seen": 47214535, "router_z_loss_clip": 2.84570312, "router_z_loss_mlp": 0.32910156, "step": 2187, "time_per_iteration": 2.6669094562530518 }, { "auxiliary_loss_clip": 0.01596754, "auxiliary_loss_mlp": 0.0020029, "balance_loss_clip": 1.39462495, "balance_loss_mlp": 0.18693808, "epoch": 0.13154967683751692, "flos": 70172467580160.0, "grad_norm": 0.8368093262268099, "language_loss": 0.587924, "learning_rate": 3.892796184920778e-06, "loss": 0.60589445, "num_input_tokens_seen": 47270300, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.13378906, "step": 2188, "time_per_iteration": 3.1455326080322266 }, { "auxiliary_loss_clip": 0.01507433, "auxiliary_loss_mlp": 0.00396615, "balance_loss_clip": 1.22872186, "balance_loss_mlp": 0.36612105, "epoch": 0.1316098000901849, "flos": 20376145923840.0, "grad_norm": 8.484822081955246, "language_loss": 0.81055248, "learning_rate": 3.892670351915842e-06, "loss": 0.829593, "num_input_tokens_seen": 47290720, "router_z_loss_clip": 2.78710938, "router_z_loss_mlp": 0.30493164, "step": 2189, "time_per_iteration": 2.672227621078491 }, { "auxiliary_loss_clip": 0.01518846, "auxiliary_loss_mlp": 0.00443616, "balance_loss_clip": 1.23531187, "balance_loss_mlp": 0.41116709, "epoch": 0.13166992334285285, "flos": 23221132692480.0, "grad_norm": 2.085468847642438, "language_loss": 0.77342689, "learning_rate": 3.892544447140657e-06, "loss": 0.79305148, "num_input_tokens_seen": 47311820, "router_z_loss_clip": 2.83203125, "router_z_loss_mlp": 0.32421875, "step": 2190, "time_per_iteration": 2.6519052982330322 }, { "auxiliary_loss_clip": 0.01501935, "auxiliary_loss_mlp": 0.00415993, "balance_loss_clip": 1.22276151, "balance_loss_mlp": 0.3846167, "epoch": 0.13173004659552082, "flos": 23330947547520.0, "grad_norm": 80.86204830725327, "language_loss": 0.78894478, "learning_rate": 3.892418470599996e-06, "loss": 0.80812407, "num_input_tokens_seen": 47331605, "router_z_loss_clip": 2.79492188, "router_z_loss_mlp": 0.3137207, "step": 2191, "time_per_iteration": 2.718846082687378 }, { "auxiliary_loss_clip": 0.01510904, "auxiliary_loss_mlp": 0.0042851, "balance_loss_clip": 1.2307024, "balance_loss_mlp": 0.39599019, "epoch": 0.13179016984818878, "flos": 21251504367360.0, "grad_norm": 18.29927495970653, "language_loss": 0.87209249, "learning_rate": 3.892292422298637e-06, "loss": 0.89148664, "num_input_tokens_seen": 47350455, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.32519531, "step": 2192, "time_per_iteration": 2.690819025039673 }, { "auxiliary_loss_clip": 0.01521608, "auxiliary_loss_mlp": 0.00418408, "balance_loss_clip": 1.24085259, "balance_loss_mlp": 0.38662666, "epoch": 0.13185029310085675, "flos": 17778690754560.0, "grad_norm": 60.83550502000043, "language_loss": 0.90697491, "learning_rate": 3.892166302241361e-06, "loss": 0.92637503, "num_input_tokens_seen": 47368225, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.31787109, "step": 2193, "time_per_iteration": 2.601816415786743 }, { "auxiliary_loss_clip": 0.01662666, "auxiliary_loss_mlp": 0.00252662, "balance_loss_clip": 1.46294916, "balance_loss_mlp": 0.23730765, "epoch": 0.1319104163535247, "flos": 69851785933440.0, "grad_norm": 0.8034213597428385, "language_loss": 0.54208124, "learning_rate": 3.8920401104329475e-06, "loss": 0.56123447, "num_input_tokens_seen": 47427125, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.15332031, "step": 2194, "time_per_iteration": 3.0759031772613525 }, { "auxiliary_loss_clip": 0.01508462, "auxiliary_loss_mlp": 0.00405315, "balance_loss_clip": 1.22791409, "balance_loss_mlp": 0.3744399, "epoch": 0.1319705396061927, "flos": 25193095401600.0, "grad_norm": 5.0166519753038274, "language_loss": 0.79507476, "learning_rate": 3.891913846878185e-06, "loss": 0.81421256, "num_input_tokens_seen": 47450275, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.30859375, "step": 2195, "time_per_iteration": 2.657094717025757 }, { "auxiliary_loss_clip": 0.01509431, "auxiliary_loss_mlp": 0.00437768, "balance_loss_clip": 1.22584629, "balance_loss_mlp": 0.40210068, "epoch": 0.13203066285886067, "flos": 20740459616640.0, "grad_norm": 5.540263298466166, "language_loss": 0.84302479, "learning_rate": 3.891787511581859e-06, "loss": 0.86249685, "num_input_tokens_seen": 47469155, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.35644531, "step": 2196, "time_per_iteration": 2.7006309032440186 }, { "auxiliary_loss_clip": 0.01513508, "auxiliary_loss_mlp": 0.00408859, "balance_loss_clip": 1.23032725, "balance_loss_mlp": 0.37629104, "epoch": 0.13209078611152864, "flos": 22054395121920.0, "grad_norm": 73.65476141643053, "language_loss": 0.82924628, "learning_rate": 3.89166110454876e-06, "loss": 0.84846991, "num_input_tokens_seen": 47488405, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.32568359, "step": 2197, "time_per_iteration": 2.615997314453125 }, { "auxiliary_loss_clip": 0.01506355, "auxiliary_loss_mlp": 0.00395182, "balance_loss_clip": 1.22221172, "balance_loss_mlp": 0.36301982, "epoch": 0.1321509093641966, "flos": 16284950743680.0, "grad_norm": 11.998549475256134, "language_loss": 0.86611676, "learning_rate": 3.891534625783685e-06, "loss": 0.88513213, "num_input_tokens_seen": 47505650, "router_z_loss_clip": 2.84570312, "router_z_loss_mlp": 0.32177734, "step": 2198, "time_per_iteration": 2.630207061767578 }, { "auxiliary_loss_clip": 0.01502465, "auxiliary_loss_mlp": 0.00406697, "balance_loss_clip": 1.22183228, "balance_loss_mlp": 0.37393788, "epoch": 0.13221103261686457, "flos": 16983018633600.0, "grad_norm": 2.6909682703335207, "language_loss": 0.9156608, "learning_rate": 3.891408075291425e-06, "loss": 0.93475246, "num_input_tokens_seen": 47521540, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.32739258, "step": 2199, "time_per_iteration": 2.642869710922241 }, { "auxiliary_loss_clip": 0.01488353, "auxiliary_loss_mlp": 0.00374772, "balance_loss_clip": 1.20825672, "balance_loss_mlp": 0.34393251, "epoch": 0.13227115586953253, "flos": 34233605677440.0, "grad_norm": 50.5771509691961, "language_loss": 0.75087154, "learning_rate": 3.8912814530767826e-06, "loss": 0.76950276, "num_input_tokens_seen": 47543625, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.30810547, "step": 2200, "time_per_iteration": 2.721544027328491 }, { "auxiliary_loss_clip": 0.014981, "auxiliary_loss_mlp": 0.0035917, "balance_loss_clip": 1.22060156, "balance_loss_mlp": 0.32633987, "epoch": 0.13233127912220052, "flos": 20704656735360.0, "grad_norm": 6.102791821254887, "language_loss": 0.91429007, "learning_rate": 3.891154759144557e-06, "loss": 0.93286276, "num_input_tokens_seen": 47563740, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.32836914, "step": 2201, "time_per_iteration": 2.64933443069458 }, { "auxiliary_loss_clip": 0.01525206, "auxiliary_loss_mlp": 0.00369605, "balance_loss_clip": 1.23581553, "balance_loss_mlp": 0.33553487, "epoch": 0.1323914023748685, "flos": 25805048434560.0, "grad_norm": 4.14872891737483, "language_loss": 0.93231416, "learning_rate": 3.891027993499554e-06, "loss": 0.95126224, "num_input_tokens_seen": 47582655, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.34082031, "step": 2202, "time_per_iteration": 2.743112087249756 }, { "auxiliary_loss_clip": 0.01533376, "auxiliary_loss_mlp": 0.00343626, "balance_loss_clip": 1.24354792, "balance_loss_mlp": 0.31248829, "epoch": 0.13245152562753645, "flos": 21251540280960.0, "grad_norm": 6.347976136651485, "language_loss": 0.78070605, "learning_rate": 3.89090115614658e-06, "loss": 0.79947609, "num_input_tokens_seen": 47600875, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.31176758, "step": 2203, "time_per_iteration": 2.74862003326416 }, { "auxiliary_loss_clip": 0.01529506, "auxiliary_loss_mlp": 0.0036182, "balance_loss_clip": 1.23740554, "balance_loss_mlp": 0.33087322, "epoch": 0.13251164888020442, "flos": 26610955931520.0, "grad_norm": 24.608412848296513, "language_loss": 0.80658054, "learning_rate": 3.890774247090444e-06, "loss": 0.82549381, "num_input_tokens_seen": 47619250, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.30932617, "step": 2204, "time_per_iteration": 2.7778241634368896 }, { "auxiliary_loss_clip": 0.01536044, "auxiliary_loss_mlp": 0.00355514, "balance_loss_clip": 1.24503493, "balance_loss_mlp": 0.32490075, "epoch": 0.13257177213287238, "flos": 29826541272960.0, "grad_norm": 39.9189360512908, "language_loss": 0.85276902, "learning_rate": 3.89064726633596e-06, "loss": 0.87168461, "num_input_tokens_seen": 47639445, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.30615234, "step": 2205, "time_per_iteration": 2.829871892929077 }, { "auxiliary_loss_clip": 0.01551663, "auxiliary_loss_mlp": 0.00327907, "balance_loss_clip": 1.25769544, "balance_loss_mlp": 0.2971867, "epoch": 0.13263189538554035, "flos": 21288456483840.0, "grad_norm": 6.905555175618172, "language_loss": 0.86851752, "learning_rate": 3.890520213887941e-06, "loss": 0.88731319, "num_input_tokens_seen": 47658740, "router_z_loss_clip": 2.94335938, "router_z_loss_mlp": 0.30737305, "step": 2206, "time_per_iteration": 2.6661744117736816 }, { "auxiliary_loss_clip": 0.01558428, "auxiliary_loss_mlp": 0.00348669, "balance_loss_clip": 1.25526679, "balance_loss_mlp": 0.31533766, "epoch": 0.13269201863820831, "flos": 16874101618560.0, "grad_norm": 13.978254388666263, "language_loss": 0.81164211, "learning_rate": 3.890393089751208e-06, "loss": 0.83071309, "num_input_tokens_seen": 47676880, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.33325195, "step": 2207, "time_per_iteration": 2.6961610317230225 }, { "auxiliary_loss_clip": 0.01582709, "auxiliary_loss_mlp": 0.00321959, "balance_loss_clip": 1.28289127, "balance_loss_mlp": 0.28817505, "epoch": 0.1327521418908763, "flos": 23768914078080.0, "grad_norm": 19.51679921604977, "language_loss": 0.9149633, "learning_rate": 3.890265893930578e-06, "loss": 0.93401003, "num_input_tokens_seen": 47696635, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.33789062, "step": 2208, "time_per_iteration": 2.6587164402008057 }, { "auxiliary_loss_clip": 0.01588153, "auxiliary_loss_mlp": 0.00393216, "balance_loss_clip": 1.29302943, "balance_loss_mlp": 0.35981393, "epoch": 0.13281226514354427, "flos": 26505594362880.0, "grad_norm": 3.943063040618238, "language_loss": 0.9122014, "learning_rate": 3.890138626430876e-06, "loss": 0.93201512, "num_input_tokens_seen": 47717760, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.33374023, "step": 2209, "time_per_iteration": 2.7247605323791504 }, { "auxiliary_loss_clip": 0.0160108, "auxiliary_loss_mlp": 0.00443807, "balance_loss_clip": 1.29567242, "balance_loss_mlp": 0.40756696, "epoch": 0.13287238839621224, "flos": 24498762526080.0, "grad_norm": 2.406358566978983, "language_loss": 0.87716651, "learning_rate": 3.890011287256929e-06, "loss": 0.89761537, "num_input_tokens_seen": 47737685, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.36230469, "step": 2210, "time_per_iteration": 2.7119290828704834 }, { "auxiliary_loss_clip": 0.01811033, "auxiliary_loss_mlp": 0.00525734, "balance_loss_clip": 1.59973061, "balance_loss_mlp": 0.49655119, "epoch": 0.1329325116488802, "flos": 67694344369920.0, "grad_norm": 0.793506187976699, "language_loss": 0.57992315, "learning_rate": 3.889883876413563e-06, "loss": 0.6032908, "num_input_tokens_seen": 47802415, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.29101562, "step": 2211, "time_per_iteration": 3.2393884658813477 }, { "auxiliary_loss_clip": 0.0180064, "auxiliary_loss_mlp": 0.00343597, "balance_loss_clip": 1.58880424, "balance_loss_mlp": 0.3204231, "epoch": 0.13299263490154817, "flos": 72261894741120.0, "grad_norm": 0.7792858491279445, "language_loss": 0.54891938, "learning_rate": 3.889756393905611e-06, "loss": 0.57036167, "num_input_tokens_seen": 47871485, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.23144531, "step": 2212, "time_per_iteration": 3.209986686706543 }, { "auxiliary_loss_clip": 0.01609738, "auxiliary_loss_mlp": 0.00432394, "balance_loss_clip": 1.30029213, "balance_loss_mlp": 0.39768004, "epoch": 0.13305275815421613, "flos": 17931275729280.0, "grad_norm": 47.14730013865663, "language_loss": 0.82769728, "learning_rate": 3.889628839737908e-06, "loss": 0.8481186, "num_input_tokens_seen": 47888315, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.34667969, "step": 2213, "time_per_iteration": 4.035093069076538 }, { "auxiliary_loss_clip": 0.01629546, "auxiliary_loss_mlp": 0.00434839, "balance_loss_clip": 1.31983948, "balance_loss_mlp": 0.39919567, "epoch": 0.13311288140688413, "flos": 22340889999360.0, "grad_norm": 5.876158964549202, "language_loss": 0.84844929, "learning_rate": 3.889501213915291e-06, "loss": 0.86909312, "num_input_tokens_seen": 47906600, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.35668945, "step": 2214, "time_per_iteration": 2.6538705825805664 }, { "auxiliary_loss_clip": 0.01602761, "auxiliary_loss_mlp": 0.0049601, "balance_loss_clip": 1.293365, "balance_loss_mlp": 0.45361924, "epoch": 0.1331730046595521, "flos": 31868888682240.0, "grad_norm": 4.213597187349654, "language_loss": 0.7598083, "learning_rate": 3.889373516442597e-06, "loss": 0.78079605, "num_input_tokens_seen": 47927630, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.42431641, "step": 2215, "time_per_iteration": 2.775247573852539 }, { "auxiliary_loss_clip": 0.01621253, "auxiliary_loss_mlp": 0.0044466, "balance_loss_clip": 1.30882001, "balance_loss_mlp": 0.40632188, "epoch": 0.13323312791222006, "flos": 22566589107840.0, "grad_norm": 2.2264488027549962, "language_loss": 0.86730617, "learning_rate": 3.889245747324671e-06, "loss": 0.88796526, "num_input_tokens_seen": 47947935, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.38330078, "step": 2216, "time_per_iteration": 5.620861768722534 }, { "auxiliary_loss_clip": 0.01625086, "auxiliary_loss_mlp": 0.00440844, "balance_loss_clip": 1.31634903, "balance_loss_mlp": 0.4018144, "epoch": 0.13329325116488802, "flos": 15085319293440.0, "grad_norm": 12.679864657715088, "language_loss": 0.93800384, "learning_rate": 3.889117906566356e-06, "loss": 0.95866317, "num_input_tokens_seen": 47965515, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.39038086, "step": 2217, "time_per_iteration": 2.6848409175872803 }, { "auxiliary_loss_clip": 0.01633584, "auxiliary_loss_mlp": 0.00519945, "balance_loss_clip": 1.31734228, "balance_loss_mlp": 0.48086828, "epoch": 0.133353374417556, "flos": 27453671890560.0, "grad_norm": 12.62786802789947, "language_loss": 0.81359899, "learning_rate": 3.888989994172501e-06, "loss": 0.83513433, "num_input_tokens_seen": 47985675, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.39038086, "step": 2218, "time_per_iteration": 2.78733491897583 }, { "auxiliary_loss_clip": 0.01614286, "auxiliary_loss_mlp": 0.00443358, "balance_loss_clip": 1.30020642, "balance_loss_mlp": 0.40301752, "epoch": 0.13341349767022395, "flos": 24094695456000.0, "grad_norm": 522.514868969117, "language_loss": 0.92187911, "learning_rate": 3.8888620101479565e-06, "loss": 0.94245553, "num_input_tokens_seen": 48004985, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.40332031, "step": 2219, "time_per_iteration": 4.197736024856567 }, { "auxiliary_loss_clip": 0.01635528, "auxiliary_loss_mlp": 0.00401287, "balance_loss_clip": 1.31905437, "balance_loss_mlp": 0.3664782, "epoch": 0.13347362092289192, "flos": 24133335511680.0, "grad_norm": 53.460264860587124, "language_loss": 0.83549392, "learning_rate": 3.888733954497574e-06, "loss": 0.85586202, "num_input_tokens_seen": 48024965, "router_z_loss_clip": 3.1640625, "router_z_loss_mlp": 0.34838867, "step": 2220, "time_per_iteration": 2.6320626735687256 }, { "auxiliary_loss_clip": 0.0161538, "auxiliary_loss_mlp": 0.00451384, "balance_loss_clip": 1.31147313, "balance_loss_mlp": 0.41454858, "epoch": 0.1335337441755599, "flos": 18436538390400.0, "grad_norm": 4.175446198787223, "language_loss": 0.8805027, "learning_rate": 3.888605827226212e-06, "loss": 0.90117037, "num_input_tokens_seen": 48040890, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.36816406, "step": 2221, "time_per_iteration": 2.659736156463623 }, { "auxiliary_loss_clip": 0.01598258, "auxiliary_loss_mlp": 0.00208975, "balance_loss_clip": 1.38396025, "balance_loss_mlp": 0.19657719, "epoch": 0.13359386742822787, "flos": 50611997652480.0, "grad_norm": 0.9865411645579303, "language_loss": 0.69075716, "learning_rate": 3.8884776283387275e-06, "loss": 0.70882952, "num_input_tokens_seen": 48091855, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.12353516, "step": 2222, "time_per_iteration": 2.986023426055908 }, { "auxiliary_loss_clip": 0.01616356, "auxiliary_loss_mlp": 0.00417566, "balance_loss_clip": 1.31336594, "balance_loss_mlp": 0.38285199, "epoch": 0.13365399068089584, "flos": 22778569221120.0, "grad_norm": 4.498523299375341, "language_loss": 0.72695112, "learning_rate": 3.888349357839982e-06, "loss": 0.74729037, "num_input_tokens_seen": 48111350, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.34667969, "step": 2223, "time_per_iteration": 2.713348627090454 }, { "auxiliary_loss_clip": 0.01608336, "auxiliary_loss_mlp": 0.00448269, "balance_loss_clip": 1.30005884, "balance_loss_mlp": 0.40897769, "epoch": 0.1337141139335638, "flos": 12531603911040.0, "grad_norm": 4.455303214597745, "language_loss": 0.89373797, "learning_rate": 3.88822101573484e-06, "loss": 0.91430402, "num_input_tokens_seen": 48129840, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.39306641, "step": 2224, "time_per_iteration": 2.647359609603882 }, { "auxiliary_loss_clip": 0.01605605, "auxiliary_loss_mlp": 0.00445814, "balance_loss_clip": 1.29358292, "balance_loss_mlp": 0.40268457, "epoch": 0.13377423718623177, "flos": 23038957889280.0, "grad_norm": 15.854242732186917, "language_loss": 0.75309408, "learning_rate": 3.888092602028167e-06, "loss": 0.77360821, "num_input_tokens_seen": 48149240, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.43188477, "step": 2225, "time_per_iteration": 2.6947176456451416 }, { "auxiliary_loss_clip": 0.015983, "auxiliary_loss_mlp": 0.00422781, "balance_loss_clip": 1.28761256, "balance_loss_mlp": 0.38601679, "epoch": 0.13383436043889974, "flos": 16216397637120.0, "grad_norm": 2.607003427469622, "language_loss": 0.97944796, "learning_rate": 3.887964116724835e-06, "loss": 0.99965882, "num_input_tokens_seen": 48166330, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.36767578, "step": 2226, "time_per_iteration": 2.6507370471954346 }, { "auxiliary_loss_clip": 0.01610364, "auxiliary_loss_mlp": 0.00429588, "balance_loss_clip": 1.29584742, "balance_loss_mlp": 0.38896164, "epoch": 0.1338944836915677, "flos": 24279671520000.0, "grad_norm": 12.543298083756024, "language_loss": 0.8216064, "learning_rate": 3.887835559829712e-06, "loss": 0.84200585, "num_input_tokens_seen": 48187600, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.40649414, "step": 2227, "time_per_iteration": 2.7082040309906006 }, { "auxiliary_loss_clip": 0.01604913, "auxiliary_loss_mlp": 0.004555, "balance_loss_clip": 1.29562604, "balance_loss_mlp": 0.41210791, "epoch": 0.1339546069442357, "flos": 17598742594560.0, "grad_norm": 2.960823765475233, "language_loss": 0.89635676, "learning_rate": 3.8877069313476764e-06, "loss": 0.91696084, "num_input_tokens_seen": 48204400, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.43383789, "step": 2228, "time_per_iteration": 2.6851024627685547 }, { "auxiliary_loss_clip": 0.01596183, "auxiliary_loss_mlp": 0.00391853, "balance_loss_clip": 1.29018331, "balance_loss_mlp": 0.35437351, "epoch": 0.13401473019690366, "flos": 18990065952000.0, "grad_norm": 19.463068068199583, "language_loss": 0.87158692, "learning_rate": 3.8875782312836054e-06, "loss": 0.89146727, "num_input_tokens_seen": 48222180, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.375, "step": 2229, "time_per_iteration": 2.6916606426239014 }, { "auxiliary_loss_clip": 0.01611457, "auxiliary_loss_mlp": 0.00402227, "balance_loss_clip": 1.29857826, "balance_loss_mlp": 0.3643899, "epoch": 0.13407485344957162, "flos": 26943812288640.0, "grad_norm": 28.615340212951335, "language_loss": 0.8011694, "learning_rate": 3.887449459642378e-06, "loss": 0.82130623, "num_input_tokens_seen": 48243245, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.37841797, "step": 2230, "time_per_iteration": 2.7726480960845947 }, { "auxiliary_loss_clip": 0.016007, "auxiliary_loss_mlp": 0.00371155, "balance_loss_clip": 1.28641117, "balance_loss_mlp": 0.33410469, "epoch": 0.1341349767022396, "flos": 20339373375360.0, "grad_norm": 2.7602174316772317, "language_loss": 0.85824049, "learning_rate": 3.8873206164288785e-06, "loss": 0.87795901, "num_input_tokens_seen": 48262600, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.37060547, "step": 2231, "time_per_iteration": 2.761098623275757 }, { "auxiliary_loss_clip": 0.01602233, "auxiliary_loss_mlp": 0.00458154, "balance_loss_clip": 1.28190243, "balance_loss_mlp": 0.41264009, "epoch": 0.13419509995490755, "flos": 29862020931840.0, "grad_norm": 5.06449609013117, "language_loss": 0.79259902, "learning_rate": 3.887191701647992e-06, "loss": 0.81320292, "num_input_tokens_seen": 48285075, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.45507812, "step": 2232, "time_per_iteration": 2.7455995082855225 }, { "auxiliary_loss_clip": 0.01604641, "auxiliary_loss_mlp": 0.00403894, "balance_loss_clip": 1.29058886, "balance_loss_mlp": 0.36202806, "epoch": 0.13425522320757552, "flos": 26942986275840.0, "grad_norm": 6.0943671304911256, "language_loss": 0.76255077, "learning_rate": 3.8870627153046066e-06, "loss": 0.78263617, "num_input_tokens_seen": 48301285, "router_z_loss_clip": 3.13867188, "router_z_loss_mlp": 0.41894531, "step": 2233, "time_per_iteration": 2.732440710067749 }, { "auxiliary_loss_clip": 0.0160772, "auxiliary_loss_mlp": 0.00399973, "balance_loss_clip": 1.29598355, "balance_loss_mlp": 0.36025292, "epoch": 0.1343153464602435, "flos": 15777281871360.0, "grad_norm": 17.69639398068632, "language_loss": 0.89476693, "learning_rate": 3.886933657403615e-06, "loss": 0.91484392, "num_input_tokens_seen": 48317835, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.3972168, "step": 2234, "time_per_iteration": 2.705817222595215 }, { "auxiliary_loss_clip": 0.01612115, "auxiliary_loss_mlp": 0.00398986, "balance_loss_clip": 1.2938627, "balance_loss_mlp": 0.35955149, "epoch": 0.13437546971291148, "flos": 24314756129280.0, "grad_norm": 7.773827299931583, "language_loss": 0.86939394, "learning_rate": 3.886804527949909e-06, "loss": 0.88950491, "num_input_tokens_seen": 48335670, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.39428711, "step": 2235, "time_per_iteration": 2.6558005809783936 }, { "auxiliary_loss_clip": 0.01592731, "auxiliary_loss_mlp": 0.00388372, "balance_loss_clip": 1.28452063, "balance_loss_mlp": 0.34910437, "epoch": 0.13443559296557944, "flos": 26650673395200.0, "grad_norm": 5.349589835337512, "language_loss": 0.91360945, "learning_rate": 3.8866753269483864e-06, "loss": 0.93342042, "num_input_tokens_seen": 48357805, "router_z_loss_clip": 3.08203125, "router_z_loss_mlp": 0.39233398, "step": 2236, "time_per_iteration": 2.72800350189209 }, { "auxiliary_loss_clip": 0.01608679, "auxiliary_loss_mlp": 0.00436731, "balance_loss_clip": 1.29183078, "balance_loss_mlp": 0.39529377, "epoch": 0.1344957162182474, "flos": 21796197183360.0, "grad_norm": 4.891539076869965, "language_loss": 0.82867354, "learning_rate": 3.886546054403946e-06, "loss": 0.84912765, "num_input_tokens_seen": 48377845, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.41430664, "step": 2237, "time_per_iteration": 2.663424491882324 }, { "auxiliary_loss_clip": 0.01633886, "auxiliary_loss_mlp": 0.00400472, "balance_loss_clip": 1.31834579, "balance_loss_mlp": 0.35700822, "epoch": 0.13455583947091537, "flos": 19865568049920.0, "grad_norm": 7.946838432505047, "language_loss": 0.86774635, "learning_rate": 3.886416710321491e-06, "loss": 0.88808995, "num_input_tokens_seen": 48394735, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.43481445, "step": 2238, "time_per_iteration": 2.6577165126800537 }, { "auxiliary_loss_clip": 0.01610859, "auxiliary_loss_mlp": 0.003817, "balance_loss_clip": 1.3045218, "balance_loss_mlp": 0.34331429, "epoch": 0.13461596272358334, "flos": 30846835094400.0, "grad_norm": 9.395647743910972, "language_loss": 0.73465872, "learning_rate": 3.886287294705924e-06, "loss": 0.75458431, "num_input_tokens_seen": 48414200, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.38378906, "step": 2239, "time_per_iteration": 2.7247626781463623 }, { "auxiliary_loss_clip": 0.0163558, "auxiliary_loss_mlp": 0.00405092, "balance_loss_clip": 1.31769514, "balance_loss_mlp": 0.36623013, "epoch": 0.1346760859762513, "flos": 12494436312960.0, "grad_norm": 10.309963126258848, "language_loss": 0.91402972, "learning_rate": 3.8861578075621555e-06, "loss": 0.93443644, "num_input_tokens_seen": 48431065, "router_z_loss_clip": 3.17773438, "router_z_loss_mlp": 0.38891602, "step": 2240, "time_per_iteration": 2.6023077964782715 }, { "auxiliary_loss_clip": 0.01627036, "auxiliary_loss_mlp": 0.00388854, "balance_loss_clip": 1.31401145, "balance_loss_mlp": 0.34717843, "epoch": 0.1347362092289193, "flos": 21836022387840.0, "grad_norm": 3.0330924478637336, "language_loss": 0.85158211, "learning_rate": 3.886028248895093e-06, "loss": 0.87174094, "num_input_tokens_seen": 48450335, "router_z_loss_clip": 3.13085938, "router_z_loss_mlp": 0.41650391, "step": 2241, "time_per_iteration": 2.6687700748443604 }, { "auxiliary_loss_clip": 0.01634353, "auxiliary_loss_mlp": 0.00368491, "balance_loss_clip": 1.32877016, "balance_loss_mlp": 0.33003414, "epoch": 0.13479633248158726, "flos": 23509459163520.0, "grad_norm": 67.55839539641968, "language_loss": 0.90563482, "learning_rate": 3.88589861870965e-06, "loss": 0.92566323, "num_input_tokens_seen": 48468555, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.38452148, "step": 2242, "time_per_iteration": 2.667794942855835 }, { "auxiliary_loss_clip": 0.01636017, "auxiliary_loss_mlp": 0.00370875, "balance_loss_clip": 1.32142007, "balance_loss_mlp": 0.33067805, "epoch": 0.13485645573425523, "flos": 29344332165120.0, "grad_norm": 6.287714615197704, "language_loss": 0.74839354, "learning_rate": 3.885768917010744e-06, "loss": 0.76846248, "num_input_tokens_seen": 48488515, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.40185547, "step": 2243, "time_per_iteration": 2.7412161827087402 }, { "auxiliary_loss_clip": 0.01617073, "auxiliary_loss_mlp": 0.00345745, "balance_loss_clip": 1.31119215, "balance_loss_mlp": 0.30781221, "epoch": 0.1349165789869232, "flos": 28037112503040.0, "grad_norm": 2.3511482396371775, "language_loss": 0.78273624, "learning_rate": 3.8856391438032895e-06, "loss": 0.80236447, "num_input_tokens_seen": 48510515, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.37915039, "step": 2244, "time_per_iteration": 2.6759209632873535 }, { "auxiliary_loss_clip": 0.01645138, "auxiliary_loss_mlp": 0.00340352, "balance_loss_clip": 1.32900214, "balance_loss_mlp": 0.30294403, "epoch": 0.13497670223959116, "flos": 22853730430080.0, "grad_norm": 4.885967132029266, "language_loss": 0.91612893, "learning_rate": 3.88550929909221e-06, "loss": 0.93598384, "num_input_tokens_seen": 48529940, "router_z_loss_clip": 3.1640625, "router_z_loss_mlp": 0.37402344, "step": 2245, "time_per_iteration": 2.729450225830078 }, { "auxiliary_loss_clip": 0.01643296, "auxiliary_loss_mlp": 0.00342618, "balance_loss_clip": 1.33324146, "balance_loss_mlp": 0.30456656, "epoch": 0.13503682549225912, "flos": 16504580453760.0, "grad_norm": 4.167645119209793, "language_loss": 0.84119469, "learning_rate": 3.88537938288243e-06, "loss": 0.86105382, "num_input_tokens_seen": 48548190, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.38037109, "step": 2246, "time_per_iteration": 2.6621875762939453 }, { "auxiliary_loss_clip": 0.01585746, "auxiliary_loss_mlp": 0.00308068, "balance_loss_clip": 1.39875674, "balance_loss_mlp": 0.29433474, "epoch": 0.1350969487449271, "flos": 70756303242240.0, "grad_norm": 0.7678822874401862, "language_loss": 0.6070075, "learning_rate": 3.885249395178874e-06, "loss": 0.62594569, "num_input_tokens_seen": 48613165, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.13769531, "step": 2247, "time_per_iteration": 3.28216814994812 }, { "auxiliary_loss_clip": 0.0165823, "auxiliary_loss_mlp": 0.00341481, "balance_loss_clip": 1.33784199, "balance_loss_mlp": 0.30247605, "epoch": 0.13515707199759508, "flos": 23075981832960.0, "grad_norm": 30.014423367760006, "language_loss": 0.88795584, "learning_rate": 3.885119335986473e-06, "loss": 0.90795302, "num_input_tokens_seen": 48631705, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.38989258, "step": 2248, "time_per_iteration": 2.7126874923706055 }, { "auxiliary_loss_clip": 0.01647462, "auxiliary_loss_mlp": 0.00331543, "balance_loss_clip": 1.33886743, "balance_loss_mlp": 0.2924189, "epoch": 0.13521719525026304, "flos": 23186371305600.0, "grad_norm": 2.0358391031148915, "language_loss": 0.83031321, "learning_rate": 3.884989205310157e-06, "loss": 0.85010326, "num_input_tokens_seen": 48649740, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.39086914, "step": 2249, "time_per_iteration": 2.7020082473754883 }, { "auxiliary_loss_clip": 0.01674203, "auxiliary_loss_mlp": 0.00310838, "balance_loss_clip": 1.36113513, "balance_loss_mlp": 0.27166533, "epoch": 0.135277318502931, "flos": 24790931752320.0, "grad_norm": 7.520936200690906, "language_loss": 0.88644302, "learning_rate": 3.884859003154862e-06, "loss": 0.90629339, "num_input_tokens_seen": 48671565, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.39160156, "step": 2250, "time_per_iteration": 2.8134584426879883 }, { "auxiliary_loss_clip": 0.01687419, "auxiliary_loss_mlp": 0.00345239, "balance_loss_clip": 1.36754525, "balance_loss_mlp": 0.30682993, "epoch": 0.13533744175559898, "flos": 21908525990400.0, "grad_norm": 3.0033754364104994, "language_loss": 0.91127849, "learning_rate": 3.884728729525524e-06, "loss": 0.9316051, "num_input_tokens_seen": 48690425, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.3840332, "step": 2251, "time_per_iteration": 2.660628080368042 }, { "auxiliary_loss_clip": 0.01678791, "auxiliary_loss_mlp": 0.0031174, "balance_loss_clip": 1.35808086, "balance_loss_mlp": 0.26994541, "epoch": 0.13539756500826694, "flos": 21211643249280.0, "grad_norm": 6.249349685778207, "language_loss": 0.91480994, "learning_rate": 3.884598384427084e-06, "loss": 0.93471527, "num_input_tokens_seen": 48707505, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.41772461, "step": 2252, "time_per_iteration": 2.6970295906066895 }, { "auxiliary_loss_clip": 0.01713993, "auxiliary_loss_mlp": 0.00086872, "balance_loss_clip": 1.49385357, "balance_loss_mlp": 0.06608181, "epoch": 0.1354576882609349, "flos": 63242103634560.0, "grad_norm": 0.767120661851963, "language_loss": 0.61130828, "learning_rate": 3.884467967864485e-06, "loss": 0.62931687, "num_input_tokens_seen": 48775895, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.20800781, "step": 2253, "time_per_iteration": 3.2262625694274902 }, { "auxiliary_loss_clip": 0.01711893, "auxiliary_loss_mlp": 0.00309842, "balance_loss_clip": 1.39152193, "balance_loss_mlp": 0.26811862, "epoch": 0.1355178115136029, "flos": 25483037984640.0, "grad_norm": 2.54632927031721, "language_loss": 0.95238829, "learning_rate": 3.884337479842671e-06, "loss": 0.97260571, "num_input_tokens_seen": 48798370, "router_z_loss_clip": 3.20507812, "router_z_loss_mlp": 0.41723633, "step": 2254, "time_per_iteration": 2.7124781608581543 }, { "auxiliary_loss_clip": 0.0169538, "auxiliary_loss_mlp": 0.00307763, "balance_loss_clip": 1.38067114, "balance_loss_mlp": 0.26673135, "epoch": 0.13557793476627086, "flos": 21616967295360.0, "grad_norm": 4.976946498557635, "language_loss": 0.92510128, "learning_rate": 3.884206920366591e-06, "loss": 0.94513261, "num_input_tokens_seen": 48817955, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.41015625, "step": 2255, "time_per_iteration": 2.7385644912719727 }, { "auxiliary_loss_clip": 0.01721348, "auxiliary_loss_mlp": 0.00284596, "balance_loss_clip": 1.40846407, "balance_loss_mlp": 0.24442284, "epoch": 0.13563805801893883, "flos": 24928253447040.0, "grad_norm": 7.492365996922788, "language_loss": 0.8449896, "learning_rate": 3.884076289441196e-06, "loss": 0.86504906, "num_input_tokens_seen": 48836330, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.40161133, "step": 2256, "time_per_iteration": 4.071101903915405 }, { "auxiliary_loss_clip": 0.01740813, "auxiliary_loss_mlp": 0.00322762, "balance_loss_clip": 1.41182208, "balance_loss_mlp": 0.28182524, "epoch": 0.1356981812716068, "flos": 14750272206720.0, "grad_norm": 6.8632490355233715, "language_loss": 0.89808488, "learning_rate": 3.88394558707144e-06, "loss": 0.91872066, "num_input_tokens_seen": 48851890, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.40917969, "step": 2257, "time_per_iteration": 2.6415019035339355 }, { "auxiliary_loss_clip": 0.01723612, "auxiliary_loss_mlp": 0.00296858, "balance_loss_clip": 1.40039444, "balance_loss_mlp": 0.25461012, "epoch": 0.13575830452427476, "flos": 11108571822720.0, "grad_norm": 31.16406450417965, "language_loss": 0.91930431, "learning_rate": 3.883814813262277e-06, "loss": 0.93950897, "num_input_tokens_seen": 48865510, "router_z_loss_clip": 3.23046875, "router_z_loss_mlp": 0.42260742, "step": 2258, "time_per_iteration": 3.9550912380218506 }, { "auxiliary_loss_clip": 0.01721371, "auxiliary_loss_mlp": 0.00297159, "balance_loss_clip": 1.39894581, "balance_loss_mlp": 0.25662816, "epoch": 0.13581842777694272, "flos": 17960290940160.0, "grad_norm": 21.411228013274496, "language_loss": 0.90668863, "learning_rate": 3.883683968018669e-06, "loss": 0.92687392, "num_input_tokens_seen": 48882360, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.40527344, "step": 2259, "time_per_iteration": 4.002089738845825 }, { "auxiliary_loss_clip": 0.01726892, "auxiliary_loss_mlp": 0.00260637, "balance_loss_clip": 1.41025352, "balance_loss_mlp": 0.22263339, "epoch": 0.1358785510296107, "flos": 22857142222080.0, "grad_norm": 4.335778561342026, "language_loss": 0.81083423, "learning_rate": 3.8835530513455755e-06, "loss": 0.83070952, "num_input_tokens_seen": 48902700, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.37988281, "step": 2260, "time_per_iteration": 2.6961915493011475 }, { "auxiliary_loss_clip": 0.01741186, "auxiliary_loss_mlp": 0.00283703, "balance_loss_clip": 1.42500925, "balance_loss_mlp": 0.24584226, "epoch": 0.13593867428227868, "flos": 25739404329600.0, "grad_norm": 168.10289749968092, "language_loss": 0.83893788, "learning_rate": 3.883422063247961e-06, "loss": 0.85918677, "num_input_tokens_seen": 48922525, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.37866211, "step": 2261, "time_per_iteration": 4.219906330108643 }, { "auxiliary_loss_clip": 0.01730501, "auxiliary_loss_mlp": 0.00280292, "balance_loss_clip": 1.42029619, "balance_loss_mlp": 0.24307534, "epoch": 0.13599879753494665, "flos": 31249214225280.0, "grad_norm": 162.23505727950769, "language_loss": 0.71261948, "learning_rate": 3.883291003730794e-06, "loss": 0.73272741, "num_input_tokens_seen": 48942510, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.37207031, "step": 2262, "time_per_iteration": 2.7387874126434326 }, { "auxiliary_loss_clip": 0.01740068, "auxiliary_loss_mlp": 0.00314197, "balance_loss_clip": 1.42636013, "balance_loss_mlp": 0.27838629, "epoch": 0.1360589207876146, "flos": 23915034604800.0, "grad_norm": 172.32622438737866, "language_loss": 0.93591285, "learning_rate": 3.883159872799043e-06, "loss": 0.95645547, "num_input_tokens_seen": 48962625, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.35839844, "step": 2263, "time_per_iteration": 2.748283863067627 }, { "auxiliary_loss_clip": 0.01739632, "auxiliary_loss_mlp": 0.00299105, "balance_loss_clip": 1.4289521, "balance_loss_mlp": 0.25909835, "epoch": 0.13611904404028258, "flos": 19974197756160.0, "grad_norm": 37.94167393000394, "language_loss": 0.96888369, "learning_rate": 3.8830286704576815e-06, "loss": 0.98927104, "num_input_tokens_seen": 48982525, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.39990234, "step": 2264, "time_per_iteration": 2.6471424102783203 }, { "auxiliary_loss_clip": 0.01733124, "auxiliary_loss_mlp": 0.00318671, "balance_loss_clip": 1.42359829, "balance_loss_mlp": 0.2813589, "epoch": 0.13617916729295054, "flos": 15340644144000.0, "grad_norm": 47.392278766567884, "language_loss": 0.81526411, "learning_rate": 3.882897396711683e-06, "loss": 0.83578217, "num_input_tokens_seen": 48997605, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.37329102, "step": 2265, "time_per_iteration": 2.6827893257141113 }, { "auxiliary_loss_clip": 0.01739002, "auxiliary_loss_mlp": 0.00271807, "balance_loss_clip": 1.43006039, "balance_loss_mlp": 0.23363648, "epoch": 0.1362392905456185, "flos": 27451445247360.0, "grad_norm": 28.544922001778595, "language_loss": 0.75360972, "learning_rate": 3.882766051566027e-06, "loss": 0.77371776, "num_input_tokens_seen": 49018535, "router_z_loss_clip": 3.0859375, "router_z_loss_mlp": 0.3815918, "step": 2266, "time_per_iteration": 2.714566230773926 }, { "auxiliary_loss_clip": 0.0174319, "auxiliary_loss_mlp": 0.00301941, "balance_loss_clip": 1.43010616, "balance_loss_mlp": 0.26038456, "epoch": 0.1362994137982865, "flos": 25009017177600.0, "grad_norm": 8.446508157478693, "language_loss": 0.83757091, "learning_rate": 3.882634635025694e-06, "loss": 0.85802227, "num_input_tokens_seen": 49038865, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.41552734, "step": 2267, "time_per_iteration": 2.7205007076263428 }, { "auxiliary_loss_clip": 0.01744654, "auxiliary_loss_mlp": 0.00289301, "balance_loss_clip": 1.42825198, "balance_loss_mlp": 0.24962792, "epoch": 0.13635953705095447, "flos": 20303031790080.0, "grad_norm": 7.46225905453687, "language_loss": 0.89355028, "learning_rate": 3.882503147095667e-06, "loss": 0.91388983, "num_input_tokens_seen": 49058010, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.39672852, "step": 2268, "time_per_iteration": 2.618335008621216 }, { "auxiliary_loss_clip": 0.01751331, "auxiliary_loss_mlp": 0.002961, "balance_loss_clip": 1.43787932, "balance_loss_mlp": 0.255831, "epoch": 0.13641966030362243, "flos": 31358418549120.0, "grad_norm": 43.41173343281043, "language_loss": 0.85545588, "learning_rate": 3.882371587780931e-06, "loss": 0.87593013, "num_input_tokens_seen": 49080330, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.40258789, "step": 2269, "time_per_iteration": 2.710521697998047 }, { "auxiliary_loss_clip": 0.01735048, "auxiliary_loss_mlp": 0.00295405, "balance_loss_clip": 1.4235003, "balance_loss_mlp": 0.25453985, "epoch": 0.1364797835562904, "flos": 20478095700480.0, "grad_norm": 9.897315906865742, "language_loss": 0.88210857, "learning_rate": 3.882239957086477e-06, "loss": 0.90241307, "num_input_tokens_seen": 49097035, "router_z_loss_clip": 3.11523438, "router_z_loss_mlp": 0.40917969, "step": 2270, "time_per_iteration": 2.6368048191070557 }, { "auxiliary_loss_clip": 0.01742018, "auxiliary_loss_mlp": 0.00328399, "balance_loss_clip": 1.42704451, "balance_loss_mlp": 0.28734294, "epoch": 0.13653990680895836, "flos": 13078343802240.0, "grad_norm": 5.017469089174887, "language_loss": 0.84668469, "learning_rate": 3.882108255017295e-06, "loss": 0.86738884, "num_input_tokens_seen": 49113945, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.41040039, "step": 2271, "time_per_iteration": 2.612729787826538 }, { "auxiliary_loss_clip": 0.01736909, "auxiliary_loss_mlp": 0.00327012, "balance_loss_clip": 1.42554927, "balance_loss_mlp": 0.28540829, "epoch": 0.13660003006162633, "flos": 16946712961920.0, "grad_norm": 3.626413875493014, "language_loss": 0.87713838, "learning_rate": 3.881976481578379e-06, "loss": 0.89777756, "num_input_tokens_seen": 49132855, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.41625977, "step": 2272, "time_per_iteration": 2.616694927215576 }, { "auxiliary_loss_clip": 0.01811654, "auxiliary_loss_mlp": 0.00137848, "balance_loss_clip": 1.53115749, "balance_loss_mlp": 0.12335243, "epoch": 0.1366601533142943, "flos": 68682749892480.0, "grad_norm": 0.727302690083977, "language_loss": 0.60572517, "learning_rate": 3.8818446367747255e-06, "loss": 0.62522018, "num_input_tokens_seen": 49198310, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.14453125, "step": 2273, "time_per_iteration": 3.250225782394409 }, { "auxiliary_loss_clip": 0.01704164, "auxiliary_loss_mlp": 0.00315045, "balance_loss_clip": 1.39727592, "balance_loss_mlp": 0.27160525, "epoch": 0.13672027656696228, "flos": 19244241567360.0, "grad_norm": 11.072227576530562, "language_loss": 0.85491383, "learning_rate": 3.881712720611336e-06, "loss": 0.87510598, "num_input_tokens_seen": 49217250, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.43432617, "step": 2274, "time_per_iteration": 2.640986204147339 }, { "auxiliary_loss_clip": 0.01690059, "auxiliary_loss_mlp": 0.00354835, "balance_loss_clip": 1.38865674, "balance_loss_mlp": 0.31659275, "epoch": 0.13678039981963025, "flos": 24534924543360.0, "grad_norm": 7.537891747889789, "language_loss": 0.85110444, "learning_rate": 3.881580733093211e-06, "loss": 0.87155342, "num_input_tokens_seen": 49236615, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.38232422, "step": 2275, "time_per_iteration": 2.6740224361419678 }, { "auxiliary_loss_clip": 0.01703761, "auxiliary_loss_mlp": 0.00347841, "balance_loss_clip": 1.39837861, "balance_loss_mlp": 0.30766797, "epoch": 0.13684052307229821, "flos": 15669334523520.0, "grad_norm": 176.7944160985356, "language_loss": 0.89733207, "learning_rate": 3.881448674225356e-06, "loss": 0.91784811, "num_input_tokens_seen": 49253935, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.40185547, "step": 2276, "time_per_iteration": 2.7013766765594482 }, { "auxiliary_loss_clip": 0.01677465, "auxiliary_loss_mlp": 0.00348976, "balance_loss_clip": 1.37171948, "balance_loss_mlp": 0.30820638, "epoch": 0.13690064632496618, "flos": 28364689560960.0, "grad_norm": 6.600546384164188, "language_loss": 0.82908988, "learning_rate": 3.881316544012779e-06, "loss": 0.84935427, "num_input_tokens_seen": 49273605, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.4074707, "step": 2277, "time_per_iteration": 2.8744826316833496 }, { "auxiliary_loss_clip": 0.01706424, "auxiliary_loss_mlp": 0.00362518, "balance_loss_clip": 1.39746642, "balance_loss_mlp": 0.32277328, "epoch": 0.13696076957763414, "flos": 23404779953280.0, "grad_norm": 33.597015001327236, "language_loss": 0.87765014, "learning_rate": 3.88118434246049e-06, "loss": 0.89833957, "num_input_tokens_seen": 49291785, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.3972168, "step": 2278, "time_per_iteration": 2.693669557571411 }, { "auxiliary_loss_clip": 0.01694892, "auxiliary_loss_mlp": 0.00327992, "balance_loss_clip": 1.38306856, "balance_loss_mlp": 0.28636366, "epoch": 0.1370208928303021, "flos": 37196595601920.0, "grad_norm": 16.60111835588534, "language_loss": 0.82618892, "learning_rate": 3.881052069573502e-06, "loss": 0.84641773, "num_input_tokens_seen": 49311405, "router_z_loss_clip": 3.12109375, "router_z_loss_mlp": 0.41601562, "step": 2279, "time_per_iteration": 2.772155523300171 }, { "auxiliary_loss_clip": 0.01697846, "auxiliary_loss_mlp": 0.00323736, "balance_loss_clip": 1.38707042, "balance_loss_mlp": 0.2835623, "epoch": 0.13708101608297008, "flos": 26976311118720.0, "grad_norm": 18.74074016764787, "language_loss": 0.83749127, "learning_rate": 3.880919725356831e-06, "loss": 0.85770702, "num_input_tokens_seen": 49331835, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.40185547, "step": 2280, "time_per_iteration": 2.6925134658813477 }, { "auxiliary_loss_clip": 0.01726125, "auxiliary_loss_mlp": 0.00362877, "balance_loss_clip": 1.41533172, "balance_loss_mlp": 0.32318014, "epoch": 0.13714113933563807, "flos": 32556864850560.0, "grad_norm": 6.844636792640624, "language_loss": 0.84932792, "learning_rate": 3.880787309815496e-06, "loss": 0.87021798, "num_input_tokens_seen": 49352290, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.39697266, "step": 2281, "time_per_iteration": 2.758948564529419 }, { "auxiliary_loss_clip": 0.01721064, "auxiliary_loss_mlp": 0.0033469, "balance_loss_clip": 1.4046092, "balance_loss_mlp": 0.295279, "epoch": 0.13720126258830603, "flos": 16101267569280.0, "grad_norm": 6.90506497429327, "language_loss": 0.90604949, "learning_rate": 3.880654822954518e-06, "loss": 0.92660695, "num_input_tokens_seen": 49370285, "router_z_loss_clip": 3.1640625, "router_z_loss_mlp": 0.39428711, "step": 2282, "time_per_iteration": 2.608741283416748 }, { "auxiliary_loss_clip": 0.01727841, "auxiliary_loss_mlp": 0.00373573, "balance_loss_clip": 1.41768861, "balance_loss_mlp": 0.33211195, "epoch": 0.137261385840974, "flos": 18953544798720.0, "grad_norm": 4.16002700806474, "language_loss": 0.78833055, "learning_rate": 3.8805222647789195e-06, "loss": 0.80934465, "num_input_tokens_seen": 49389610, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.41455078, "step": 2283, "time_per_iteration": 2.66733717918396 }, { "auxiliary_loss_clip": 0.0172209, "auxiliary_loss_mlp": 0.00331278, "balance_loss_clip": 1.41356301, "balance_loss_mlp": 0.28991193, "epoch": 0.13732150909364196, "flos": 23295360147840.0, "grad_norm": 27.589505943992513, "language_loss": 0.91841602, "learning_rate": 3.880389635293729e-06, "loss": 0.9389497, "num_input_tokens_seen": 49408390, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.41381836, "step": 2284, "time_per_iteration": 2.655518054962158 }, { "auxiliary_loss_clip": 0.01728103, "auxiliary_loss_mlp": 0.00402843, "balance_loss_clip": 1.40869045, "balance_loss_mlp": 0.35854456, "epoch": 0.13738163234630993, "flos": 29351263489920.0, "grad_norm": 19.65830084499441, "language_loss": 0.82795978, "learning_rate": 3.880256934503974e-06, "loss": 0.84926921, "num_input_tokens_seen": 49427725, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.44311523, "step": 2285, "time_per_iteration": 2.7559046745300293 }, { "auxiliary_loss_clip": 0.01712256, "auxiliary_loss_mlp": 0.00390329, "balance_loss_clip": 1.39832366, "balance_loss_mlp": 0.34896368, "epoch": 0.1374417555989779, "flos": 26651319840000.0, "grad_norm": 13.808260770955913, "language_loss": 0.8201161, "learning_rate": 3.880124162414689e-06, "loss": 0.84114194, "num_input_tokens_seen": 49449000, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.41381836, "step": 2286, "time_per_iteration": 2.68526554107666 }, { "auxiliary_loss_clip": 0.01743714, "auxiliary_loss_mlp": 0.0033834, "balance_loss_clip": 1.42121363, "balance_loss_mlp": 0.29072827, "epoch": 0.1375018788516459, "flos": 28403401443840.0, "grad_norm": 4.880562342215428, "language_loss": 0.94983089, "learning_rate": 3.879991319030908e-06, "loss": 0.97065145, "num_input_tokens_seen": 49468360, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.47607422, "step": 2287, "time_per_iteration": 2.735827922821045 }, { "auxiliary_loss_clip": 0.01752692, "auxiliary_loss_mlp": 0.00360393, "balance_loss_clip": 1.43236089, "balance_loss_mlp": 0.31857491, "epoch": 0.13756200210431385, "flos": 37413783187200.0, "grad_norm": 17.292672356005877, "language_loss": 0.75064814, "learning_rate": 3.879858404357666e-06, "loss": 0.77177894, "num_input_tokens_seen": 49493450, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.41821289, "step": 2288, "time_per_iteration": 2.8927557468414307 }, { "auxiliary_loss_clip": 0.01733083, "auxiliary_loss_mlp": 0.00347856, "balance_loss_clip": 1.41397572, "balance_loss_mlp": 0.30386761, "epoch": 0.13762212535698182, "flos": 22711021695360.0, "grad_norm": 48.78721831489223, "language_loss": 0.97330827, "learning_rate": 3.879725418400005e-06, "loss": 0.99411762, "num_input_tokens_seen": 49511220, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.43969727, "step": 2289, "time_per_iteration": 2.6906795501708984 }, { "auxiliary_loss_clip": 0.01733606, "auxiliary_loss_mlp": 0.00309337, "balance_loss_clip": 1.41860378, "balance_loss_mlp": 0.26508641, "epoch": 0.13768224860964978, "flos": 23952130375680.0, "grad_norm": 7.006691013093668, "language_loss": 0.80991888, "learning_rate": 3.879592361162969e-06, "loss": 0.83034837, "num_input_tokens_seen": 49529820, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.44287109, "step": 2290, "time_per_iteration": 2.699404716491699 }, { "auxiliary_loss_clip": 0.01691771, "auxiliary_loss_mlp": 0.00073373, "balance_loss_clip": 1.44319534, "balance_loss_mlp": 0.03846836, "epoch": 0.13774237186231775, "flos": 63590438753280.0, "grad_norm": 0.7183001898460868, "language_loss": 0.51825833, "learning_rate": 3.8794592326516015e-06, "loss": 0.53590977, "num_input_tokens_seen": 49595325, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.34960938, "step": 2291, "time_per_iteration": 3.2160511016845703 }, { "auxiliary_loss_clip": 0.01722475, "auxiliary_loss_mlp": 0.00328149, "balance_loss_clip": 1.40784371, "balance_loss_mlp": 0.28630635, "epoch": 0.1378024951149857, "flos": 24279456038400.0, "grad_norm": 4.523887314175646, "language_loss": 0.77549624, "learning_rate": 3.879326032870952e-06, "loss": 0.79600251, "num_input_tokens_seen": 49615850, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.41821289, "step": 2292, "time_per_iteration": 2.78210186958313 }, { "auxiliary_loss_clip": 0.01734337, "auxiliary_loss_mlp": 0.00338729, "balance_loss_clip": 1.41377878, "balance_loss_mlp": 0.29576588, "epoch": 0.13786261836765368, "flos": 14021537080320.0, "grad_norm": 6.361987942899286, "language_loss": 0.89421207, "learning_rate": 3.879192761826071e-06, "loss": 0.91494274, "num_input_tokens_seen": 49631860, "router_z_loss_clip": 3.20507812, "router_z_loss_mlp": 0.42944336, "step": 2293, "time_per_iteration": 2.6341216564178467 }, { "auxiliary_loss_clip": 0.01716172, "auxiliary_loss_mlp": 0.00334573, "balance_loss_clip": 1.39678299, "balance_loss_mlp": 0.29129964, "epoch": 0.13792274162032167, "flos": 28878679226880.0, "grad_norm": 4.96587203884443, "language_loss": 0.84903502, "learning_rate": 3.879059419522011e-06, "loss": 0.86954248, "num_input_tokens_seen": 49652145, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.43261719, "step": 2294, "time_per_iteration": 2.7460155487060547 }, { "auxiliary_loss_clip": 0.01709893, "auxiliary_loss_mlp": 0.00340479, "balance_loss_clip": 1.39471698, "balance_loss_mlp": 0.29846984, "epoch": 0.13798286487298964, "flos": 21141150808320.0, "grad_norm": 34.48939666128053, "language_loss": 0.88074201, "learning_rate": 3.878926005963831e-06, "loss": 0.90124577, "num_input_tokens_seen": 49669880, "router_z_loss_clip": 3.15234375, "router_z_loss_mlp": 0.42016602, "step": 2295, "time_per_iteration": 2.6310698986053467 }, { "auxiliary_loss_clip": 0.0172946, "auxiliary_loss_mlp": 0.00354658, "balance_loss_clip": 1.40933859, "balance_loss_mlp": 0.31171906, "epoch": 0.1380429881256576, "flos": 22487477402880.0, "grad_norm": 39.19919527120189, "language_loss": 0.84933639, "learning_rate": 3.878792521156588e-06, "loss": 0.87017763, "num_input_tokens_seen": 49687255, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.42919922, "step": 2296, "time_per_iteration": 2.67309308052063 }, { "auxiliary_loss_clip": 0.01728347, "auxiliary_loss_mlp": 0.00346332, "balance_loss_clip": 1.40845037, "balance_loss_mlp": 0.30310631, "epoch": 0.13810311137832557, "flos": 21393674398080.0, "grad_norm": 2.9252574518194634, "language_loss": 0.83854431, "learning_rate": 3.8786589651053446e-06, "loss": 0.85929108, "num_input_tokens_seen": 49706650, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.43237305, "step": 2297, "time_per_iteration": 2.7222647666931152 }, { "auxiliary_loss_clip": 0.01728057, "auxiliary_loss_mlp": 0.00326588, "balance_loss_clip": 1.40978229, "balance_loss_mlp": 0.28646231, "epoch": 0.13816323463099353, "flos": 25989844930560.0, "grad_norm": 2.53191593005944, "language_loss": 0.76338178, "learning_rate": 3.878525337815164e-06, "loss": 0.78392828, "num_input_tokens_seen": 49725715, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.40136719, "step": 2298, "time_per_iteration": 4.020915746688843 }, { "auxiliary_loss_clip": 0.01725038, "auxiliary_loss_mlp": 0.00366753, "balance_loss_clip": 1.40024281, "balance_loss_mlp": 0.32235941, "epoch": 0.1382233578836615, "flos": 19244313394560.0, "grad_norm": 2.032312426022189, "language_loss": 0.92791325, "learning_rate": 3.878391639291116e-06, "loss": 0.9488312, "num_input_tokens_seen": 49744710, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.44384766, "step": 2299, "time_per_iteration": 2.7385354042053223 }, { "auxiliary_loss_clip": 0.01735572, "auxiliary_loss_mlp": 0.00315968, "balance_loss_clip": 1.41294074, "balance_loss_mlp": 0.27212322, "epoch": 0.1382834811363295, "flos": 25666290195840.0, "grad_norm": 22.603018952368355, "language_loss": 0.83358324, "learning_rate": 3.878257869538267e-06, "loss": 0.85409856, "num_input_tokens_seen": 49764300, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.43847656, "step": 2300, "time_per_iteration": 2.6817822456359863 }, { "auxiliary_loss_clip": 0.01733898, "auxiliary_loss_mlp": 0.00299292, "balance_loss_clip": 1.41570377, "balance_loss_mlp": 0.25637668, "epoch": 0.13834360438899745, "flos": 19784193788160.0, "grad_norm": 12.587972417867086, "language_loss": 0.93122494, "learning_rate": 3.878124028561692e-06, "loss": 0.95155692, "num_input_tokens_seen": 49778380, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.42895508, "step": 2301, "time_per_iteration": 5.430131435394287 }, { "auxiliary_loss_clip": 0.01740478, "auxiliary_loss_mlp": 0.0034479, "balance_loss_clip": 1.41849768, "balance_loss_mlp": 0.30244705, "epoch": 0.13840372764166542, "flos": 26651858544000.0, "grad_norm": 6.872056206854722, "language_loss": 0.92295885, "learning_rate": 3.877990116366466e-06, "loss": 0.94381154, "num_input_tokens_seen": 49797460, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.4230957, "step": 2302, "time_per_iteration": 2.7313973903656006 }, { "auxiliary_loss_clip": 0.01707113, "auxiliary_loss_mlp": 0.00105995, "balance_loss_clip": 1.47101235, "balance_loss_mlp": 0.06689429, "epoch": 0.13846385089433338, "flos": 70510998286080.0, "grad_norm": 0.7617015544967195, "language_loss": 0.65824926, "learning_rate": 3.877856132957667e-06, "loss": 0.67638034, "num_input_tokens_seen": 49868005, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.390625, "step": 2303, "time_per_iteration": 4.653339385986328 }, { "auxiliary_loss_clip": 0.01761544, "auxiliary_loss_mlp": 0.00298012, "balance_loss_clip": 1.4360075, "balance_loss_mlp": 0.25717112, "epoch": 0.13852397414700135, "flos": 17348732956800.0, "grad_norm": 67.54570059512443, "language_loss": 0.83863068, "learning_rate": 3.877722078340374e-06, "loss": 0.85922629, "num_input_tokens_seen": 49885825, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.40844727, "step": 2304, "time_per_iteration": 2.62591552734375 }, { "auxiliary_loss_clip": 0.01771703, "auxiliary_loss_mlp": 0.00303932, "balance_loss_clip": 1.44084692, "balance_loss_mlp": 0.26356822, "epoch": 0.13858409739966931, "flos": 21543781334400.0, "grad_norm": 4.704670060317913, "language_loss": 0.84059846, "learning_rate": 3.877587952519672e-06, "loss": 0.86135483, "num_input_tokens_seen": 49905975, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.40380859, "step": 2305, "time_per_iteration": 2.691819906234741 }, { "auxiliary_loss_clip": 0.01791838, "auxiliary_loss_mlp": 0.0035628, "balance_loss_clip": 1.45667458, "balance_loss_mlp": 0.30964553, "epoch": 0.13864422065233728, "flos": 21579907438080.0, "grad_norm": 11.761922827058488, "language_loss": 0.94843209, "learning_rate": 3.877453755500647e-06, "loss": 0.96991324, "num_input_tokens_seen": 49925800, "router_z_loss_clip": 3.3515625, "router_z_loss_mlp": 0.46655273, "step": 2306, "time_per_iteration": 2.6301190853118896 }, { "auxiliary_loss_clip": 0.01749834, "auxiliary_loss_mlp": 0.00055483, "balance_loss_clip": 1.49697828, "balance_loss_mlp": 0.02553809, "epoch": 0.13870434390500527, "flos": 53371156872960.0, "grad_norm": 0.8480063924496607, "language_loss": 0.5845781, "learning_rate": 3.877319487288387e-06, "loss": 0.60263127, "num_input_tokens_seen": 49977620, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.29882812, "step": 2307, "time_per_iteration": 3.166128635406494 }, { "auxiliary_loss_clip": 0.01861496, "auxiliary_loss_mlp": 0.00375112, "balance_loss_clip": 1.49340355, "balance_loss_mlp": 0.32859692, "epoch": 0.13876446715767324, "flos": 22565906749440.0, "grad_norm": 7.964671322011736, "language_loss": 0.86287397, "learning_rate": 3.877185147887984e-06, "loss": 0.88524008, "num_input_tokens_seen": 49996650, "router_z_loss_clip": 3.68359375, "router_z_loss_mlp": 0.46484375, "step": 2308, "time_per_iteration": 2.6811676025390625 }, { "auxiliary_loss_clip": 0.01891951, "auxiliary_loss_mlp": 0.00356229, "balance_loss_clip": 1.52612126, "balance_loss_mlp": 0.31412396, "epoch": 0.1388245904103412, "flos": 20705231352960.0, "grad_norm": 6.488551298485882, "language_loss": 0.85698849, "learning_rate": 3.877050737304533e-06, "loss": 0.87947023, "num_input_tokens_seen": 50015640, "router_z_loss_clip": 3.65820312, "router_z_loss_mlp": 0.42089844, "step": 2309, "time_per_iteration": 2.6998629570007324 }, { "auxiliary_loss_clip": 0.01925409, "auxiliary_loss_mlp": 0.00392009, "balance_loss_clip": 1.53719926, "balance_loss_mlp": 0.34923679, "epoch": 0.13888471366300917, "flos": 20554729367040.0, "grad_norm": 22.183255974732976, "language_loss": 0.76913357, "learning_rate": 3.876916255543129e-06, "loss": 0.79230779, "num_input_tokens_seen": 50033500, "router_z_loss_clip": 3.88476562, "router_z_loss_mlp": 0.42773438, "step": 2310, "time_per_iteration": 2.660141706466675 }, { "auxiliary_loss_clip": 0.01971124, "auxiliary_loss_mlp": 0.0040137, "balance_loss_clip": 1.56523359, "balance_loss_mlp": 0.35745305, "epoch": 0.13894483691567713, "flos": 13838033473920.0, "grad_norm": 3.571705419098448, "language_loss": 0.91221684, "learning_rate": 3.8767817026088725e-06, "loss": 0.93594176, "num_input_tokens_seen": 50050075, "router_z_loss_clip": 4.05859375, "router_z_loss_mlp": 0.43945312, "step": 2311, "time_per_iteration": 2.5867483615875244 }, { "auxiliary_loss_clip": 0.01966215, "auxiliary_loss_mlp": 0.00388726, "balance_loss_clip": 1.55660105, "balance_loss_mlp": 0.34576306, "epoch": 0.1390049601683451, "flos": 28031186759040.0, "grad_norm": 3.623504820829164, "language_loss": 0.88195312, "learning_rate": 3.876647078506866e-06, "loss": 0.9055025, "num_input_tokens_seen": 50070080, "router_z_loss_clip": 4.09375, "router_z_loss_mlp": 0.4296875, "step": 2312, "time_per_iteration": 2.7510104179382324 }, { "auxiliary_loss_clip": 0.01991648, "auxiliary_loss_mlp": 0.00472129, "balance_loss_clip": 1.57087708, "balance_loss_mlp": 0.4284265, "epoch": 0.13906508342101306, "flos": 26756860976640.0, "grad_norm": 7.888174054819899, "language_loss": 0.93511891, "learning_rate": 3.876512383242215e-06, "loss": 0.95975661, "num_input_tokens_seen": 50090040, "router_z_loss_clip": 4.2109375, "router_z_loss_mlp": 0.43676758, "step": 2313, "time_per_iteration": 2.6738502979278564 }, { "auxiliary_loss_clip": 0.02014479, "auxiliary_loss_mlp": 0.00434077, "balance_loss_clip": 1.58464479, "balance_loss_mlp": 0.38718003, "epoch": 0.13912520667368106, "flos": 24535104111360.0, "grad_norm": 7.594127547465398, "language_loss": 0.88928771, "learning_rate": 3.876377616820024e-06, "loss": 0.9137733, "num_input_tokens_seen": 50110595, "router_z_loss_clip": 4.30078125, "router_z_loss_mlp": 0.46899414, "step": 2314, "time_per_iteration": 2.717693328857422 }, { "auxiliary_loss_clip": 0.02015357, "auxiliary_loss_mlp": 0.00482565, "balance_loss_clip": 1.58206487, "balance_loss_mlp": 0.43631226, "epoch": 0.13918532992634902, "flos": 19383215287680.0, "grad_norm": 31.820317228997382, "language_loss": 0.93154335, "learning_rate": 3.876242779245409e-06, "loss": 0.95652258, "num_input_tokens_seen": 50125430, "router_z_loss_clip": 4.3359375, "router_z_loss_mlp": 0.46240234, "step": 2315, "time_per_iteration": 2.622575521469116 }, { "auxiliary_loss_clip": 0.02032928, "auxiliary_loss_mlp": 0.00531646, "balance_loss_clip": 1.59063399, "balance_loss_mlp": 0.48465356, "epoch": 0.139245453179017, "flos": 21323756574720.0, "grad_norm": 5.6011943662800645, "language_loss": 0.86876619, "learning_rate": 3.876107870523477e-06, "loss": 0.89441192, "num_input_tokens_seen": 50144120, "router_z_loss_clip": 4.42578125, "router_z_loss_mlp": 0.47045898, "step": 2316, "time_per_iteration": 2.6522772312164307 }, { "auxiliary_loss_clip": 0.02063999, "auxiliary_loss_mlp": 0.00553569, "balance_loss_clip": 1.61072147, "balance_loss_mlp": 0.50431168, "epoch": 0.13930557643168495, "flos": 19500607912320.0, "grad_norm": 4.419107648026177, "language_loss": 0.82260925, "learning_rate": 3.875972890659349e-06, "loss": 0.84878492, "num_input_tokens_seen": 50162500, "router_z_loss_clip": 4.53125, "router_z_loss_mlp": 0.49291992, "step": 2317, "time_per_iteration": 2.6538596153259277 }, { "auxiliary_loss_clip": 0.020831, "auxiliary_loss_mlp": 0.00526248, "balance_loss_clip": 1.61954033, "balance_loss_mlp": 0.4777537, "epoch": 0.13936569968435292, "flos": 25410821690880.0, "grad_norm": 39.29366775197261, "language_loss": 0.87803257, "learning_rate": 3.875837839658139e-06, "loss": 0.90412605, "num_input_tokens_seen": 50182415, "router_z_loss_clip": 4.62890625, "router_z_loss_mlp": 0.48510742, "step": 2318, "time_per_iteration": 2.7538936138153076 }, { "auxiliary_loss_clip": 0.01956781, "auxiliary_loss_mlp": 0.00141006, "balance_loss_clip": 1.667202, "balance_loss_mlp": 0.12202819, "epoch": 0.13942582293702088, "flos": 70771063731840.0, "grad_norm": 0.8435574320031615, "language_loss": 0.59259385, "learning_rate": 3.87570271752497e-06, "loss": 0.6135717, "num_input_tokens_seen": 50245160, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.18945312, "step": 2319, "time_per_iteration": 3.2219576835632324 }, { "auxiliary_loss_clip": 0.02041471, "auxiliary_loss_mlp": 0.00471772, "balance_loss_clip": 1.59567428, "balance_loss_mlp": 0.42628184, "epoch": 0.13948594618968888, "flos": 35590885920000.0, "grad_norm": 7.943269784462708, "language_loss": 0.74114347, "learning_rate": 3.875567524264967e-06, "loss": 0.76627588, "num_input_tokens_seen": 50268215, "router_z_loss_clip": 4.45703125, "router_z_loss_mlp": 0.45483398, "step": 2320, "time_per_iteration": 2.7902398109436035 }, { "auxiliary_loss_clip": 0.02064874, "auxiliary_loss_mlp": 0.00494552, "balance_loss_clip": 1.61355841, "balance_loss_mlp": 0.44806001, "epoch": 0.13954606944235684, "flos": 21105204272640.0, "grad_norm": 5.844751817752818, "language_loss": 0.74904084, "learning_rate": 3.875432259883256e-06, "loss": 0.77463508, "num_input_tokens_seen": 50288575, "router_z_loss_clip": 4.51171875, "router_z_loss_mlp": 0.46484375, "step": 2321, "time_per_iteration": 2.663471221923828 }, { "auxiliary_loss_clip": 0.02068644, "auxiliary_loss_mlp": 0.00484691, "balance_loss_clip": 1.60742033, "balance_loss_mlp": 0.43903407, "epoch": 0.1396061926950248, "flos": 25044425009280.0, "grad_norm": 44.97992675602325, "language_loss": 0.92569709, "learning_rate": 3.875296924384965e-06, "loss": 0.95123041, "num_input_tokens_seen": 50308735, "router_z_loss_clip": 4.609375, "router_z_loss_mlp": 0.45678711, "step": 2322, "time_per_iteration": 2.7132582664489746 }, { "auxiliary_loss_clip": 0.02098242, "auxiliary_loss_mlp": 0.0046136, "balance_loss_clip": 1.6317966, "balance_loss_mlp": 0.41484424, "epoch": 0.13966631594769277, "flos": 37634023428480.0, "grad_norm": 17.46903950478155, "language_loss": 0.7180748, "learning_rate": 3.875161517775226e-06, "loss": 0.74367082, "num_input_tokens_seen": 50331025, "router_z_loss_clip": 4.66796875, "router_z_loss_mlp": 0.46533203, "step": 2323, "time_per_iteration": 2.8142175674438477 }, { "auxiliary_loss_clip": 0.0207826, "auxiliary_loss_mlp": 0.00494412, "balance_loss_clip": 1.61026907, "balance_loss_mlp": 0.44625127, "epoch": 0.13972643920036074, "flos": 16690993061760.0, "grad_norm": 33.7020484288853, "language_loss": 0.97969031, "learning_rate": 3.875026040059175e-06, "loss": 1.00541711, "num_input_tokens_seen": 50349725, "router_z_loss_clip": 4.68359375, "router_z_loss_mlp": 0.48120117, "step": 2324, "time_per_iteration": 2.604588031768799 }, { "auxiliary_loss_clip": 0.02089966, "auxiliary_loss_mlp": 0.00419522, "balance_loss_clip": 1.62247849, "balance_loss_mlp": 0.37341198, "epoch": 0.1397865624530287, "flos": 23331055288320.0, "grad_norm": 17.336120014140587, "language_loss": 0.80208284, "learning_rate": 3.8748904912419485e-06, "loss": 0.82717776, "num_input_tokens_seen": 50367965, "router_z_loss_clip": 4.67578125, "router_z_loss_mlp": 0.46118164, "step": 2325, "time_per_iteration": 2.6371679306030273 }, { "auxiliary_loss_clip": 0.02125297, "auxiliary_loss_mlp": 0.00430376, "balance_loss_clip": 1.63803983, "balance_loss_mlp": 0.38421804, "epoch": 0.13984668570569667, "flos": 22778317825920.0, "grad_norm": 46.88244748697015, "language_loss": 0.88785732, "learning_rate": 3.874754871328688e-06, "loss": 0.91341406, "num_input_tokens_seen": 50385605, "router_z_loss_clip": 4.8671875, "router_z_loss_mlp": 0.46118164, "step": 2326, "time_per_iteration": 2.6156606674194336 }, { "auxiliary_loss_clip": 0.02138633, "auxiliary_loss_mlp": 0.0046333, "balance_loss_clip": 1.65406132, "balance_loss_mlp": 0.41888878, "epoch": 0.13990680895836466, "flos": 19464553635840.0, "grad_norm": 2.6135810138222157, "language_loss": 0.92901158, "learning_rate": 3.874619180324534e-06, "loss": 0.95503128, "num_input_tokens_seen": 50403985, "router_z_loss_clip": 4.84765625, "router_z_loss_mlp": 0.44482422, "step": 2327, "time_per_iteration": 2.631580114364624 }, { "auxiliary_loss_clip": 0.02149381, "auxiliary_loss_mlp": 0.00470044, "balance_loss_clip": 1.65263152, "balance_loss_mlp": 0.42390972, "epoch": 0.13996693221103262, "flos": 20303283185280.0, "grad_norm": 23.547553533120677, "language_loss": 0.90065622, "learning_rate": 3.874483418234632e-06, "loss": 0.92685044, "num_input_tokens_seen": 50421590, "router_z_loss_clip": 4.9609375, "router_z_loss_mlp": 0.46118164, "step": 2328, "time_per_iteration": 2.5938303470611572 }, { "auxiliary_loss_clip": 0.02124869, "auxiliary_loss_mlp": 0.00478792, "balance_loss_clip": 1.63643241, "balance_loss_mlp": 0.4330875, "epoch": 0.1400270554637006, "flos": 26617707688320.0, "grad_norm": 25.617212924096126, "language_loss": 0.79347759, "learning_rate": 3.874347585064131e-06, "loss": 0.81951427, "num_input_tokens_seen": 50443945, "router_z_loss_clip": 4.8828125, "router_z_loss_mlp": 0.45703125, "step": 2329, "time_per_iteration": 2.6913211345672607 }, { "auxiliary_loss_clip": 0.02136505, "auxiliary_loss_mlp": 0.00448635, "balance_loss_clip": 1.64422739, "balance_loss_mlp": 0.40159535, "epoch": 0.14008717871636855, "flos": 19391475415680.0, "grad_norm": 6.028791536012523, "language_loss": 0.84277952, "learning_rate": 3.874211680818183e-06, "loss": 0.86863101, "num_input_tokens_seen": 50462065, "router_z_loss_clip": 4.9140625, "router_z_loss_mlp": 0.47070312, "step": 2330, "time_per_iteration": 2.6378204822540283 }, { "auxiliary_loss_clip": 0.02135177, "auxiliary_loss_mlp": 0.00448244, "balance_loss_clip": 1.64123321, "balance_loss_mlp": 0.40363634, "epoch": 0.14014730196903652, "flos": 15304266645120.0, "grad_norm": 124.29133595590609, "language_loss": 0.79233456, "learning_rate": 3.87407570550194e-06, "loss": 0.81816882, "num_input_tokens_seen": 50479565, "router_z_loss_clip": 4.93359375, "router_z_loss_mlp": 0.44604492, "step": 2331, "time_per_iteration": 2.7307207584381104 }, { "auxiliary_loss_clip": 0.02128656, "auxiliary_loss_mlp": 0.0040661, "balance_loss_clip": 1.64735413, "balance_loss_mlp": 0.36581695, "epoch": 0.14020742522170448, "flos": 14939701557120.0, "grad_norm": 8.531332065999765, "language_loss": 0.78101063, "learning_rate": 3.873939659120557e-06, "loss": 0.80636322, "num_input_tokens_seen": 50497305, "router_z_loss_clip": 4.8125, "router_z_loss_mlp": 0.40795898, "step": 2332, "time_per_iteration": 2.6431050300598145 }, { "auxiliary_loss_clip": 0.0205468, "auxiliary_loss_mlp": 0.00108062, "balance_loss_clip": 1.73468852, "balance_loss_mlp": 0.09041937, "epoch": 0.14026754847437245, "flos": 48824580044160.0, "grad_norm": 1.3786048769294796, "language_loss": 0.55892992, "learning_rate": 3.873803541679196e-06, "loss": 0.58055735, "num_input_tokens_seen": 50549735, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.17675781, "step": 2333, "time_per_iteration": 3.010904312133789 }, { "auxiliary_loss_clip": 0.02105501, "auxiliary_loss_mlp": 0.00383396, "balance_loss_clip": 1.6255132, "balance_loss_mlp": 0.34224448, "epoch": 0.14032767172704044, "flos": 25773267876480.0, "grad_norm": 13.395522076625811, "language_loss": 0.87654328, "learning_rate": 3.873667353183016e-06, "loss": 0.90143228, "num_input_tokens_seen": 50570100, "router_z_loss_clip": 4.796875, "router_z_loss_mlp": 0.41162109, "step": 2334, "time_per_iteration": 2.70176100730896 }, { "auxiliary_loss_clip": 0.02094586, "auxiliary_loss_mlp": 0.0041486, "balance_loss_clip": 1.6170584, "balance_loss_mlp": 0.37044269, "epoch": 0.1403877949797084, "flos": 21216312017280.0, "grad_norm": 1.8910572150981582, "language_loss": 0.85997134, "learning_rate": 3.8735310936371825e-06, "loss": 0.88506579, "num_input_tokens_seen": 50589185, "router_z_loss_clip": 4.7734375, "router_z_loss_mlp": 0.44433594, "step": 2335, "time_per_iteration": 2.6413543224334717 }, { "auxiliary_loss_clip": 0.02036472, "auxiliary_loss_mlp": 0.00460885, "balance_loss_clip": 1.58144999, "balance_loss_mlp": 0.41110307, "epoch": 0.14044791823237637, "flos": 22747973811840.0, "grad_norm": 40.659141386231646, "language_loss": 0.88444376, "learning_rate": 3.873394763046862e-06, "loss": 0.90941727, "num_input_tokens_seen": 50609645, "router_z_loss_clip": 4.55078125, "router_z_loss_mlp": 0.49707031, "step": 2336, "time_per_iteration": 2.759342908859253 }, { "auxiliary_loss_clip": 0.02074772, "auxiliary_loss_mlp": 0.00416953, "balance_loss_clip": 1.60831463, "balance_loss_mlp": 0.37055698, "epoch": 0.14050804148504434, "flos": 22964443125120.0, "grad_norm": 8.284414062387494, "language_loss": 0.86074984, "learning_rate": 3.873258361417225e-06, "loss": 0.88566709, "num_input_tokens_seen": 50628385, "router_z_loss_clip": 4.66796875, "router_z_loss_mlp": 0.46411133, "step": 2337, "time_per_iteration": 2.6402621269226074 }, { "auxiliary_loss_clip": 0.02096601, "auxiliary_loss_mlp": 0.00440076, "balance_loss_clip": 1.62700009, "balance_loss_mlp": 0.39248723, "epoch": 0.1405681647377123, "flos": 22200336080640.0, "grad_norm": 50.81895528868738, "language_loss": 0.85825264, "learning_rate": 3.873121888753442e-06, "loss": 0.88361937, "num_input_tokens_seen": 50647260, "router_z_loss_clip": 4.69140625, "router_z_loss_mlp": 0.47631836, "step": 2338, "time_per_iteration": 2.6338813304901123 }, { "auxiliary_loss_clip": 0.02082928, "auxiliary_loss_mlp": 0.00391347, "balance_loss_clip": 1.60980248, "balance_loss_mlp": 0.34614325, "epoch": 0.14062828799038027, "flos": 23732787974400.0, "grad_norm": 36.28339122645138, "language_loss": 0.92307603, "learning_rate": 3.87298534506069e-06, "loss": 0.94781882, "num_input_tokens_seen": 50666130, "router_z_loss_clip": 4.73828125, "router_z_loss_mlp": 0.45214844, "step": 2339, "time_per_iteration": 2.6442949771881104 }, { "auxiliary_loss_clip": 0.02059916, "auxiliary_loss_mlp": 0.00367875, "balance_loss_clip": 1.60308719, "balance_loss_mlp": 0.3268432, "epoch": 0.14068841124304826, "flos": 39202493685120.0, "grad_norm": 9.838141493739467, "language_loss": 0.71504223, "learning_rate": 3.872848730344146e-06, "loss": 0.73932016, "num_input_tokens_seen": 50687440, "router_z_loss_clip": 4.5625, "router_z_loss_mlp": 0.41015625, "step": 2340, "time_per_iteration": 4.135826826095581 }, { "auxiliary_loss_clip": 0.0207862, "auxiliary_loss_mlp": 0.00424127, "balance_loss_clip": 1.6196661, "balance_loss_mlp": 0.37837428, "epoch": 0.14074853449571623, "flos": 20192283181440.0, "grad_norm": 12.995165935408608, "language_loss": 0.86857504, "learning_rate": 3.87271204460899e-06, "loss": 0.89360255, "num_input_tokens_seen": 50704030, "router_z_loss_clip": 4.59375, "router_z_loss_mlp": 0.45727539, "step": 2341, "time_per_iteration": 2.6327874660491943 }, { "auxiliary_loss_clip": 0.0206525, "auxiliary_loss_mlp": 0.00420583, "balance_loss_clip": 1.60622334, "balance_loss_mlp": 0.3744725, "epoch": 0.1408086577483842, "flos": 18405871153920.0, "grad_norm": 87.27164423831798, "language_loss": 0.87307137, "learning_rate": 3.8725752878604066e-06, "loss": 0.89792967, "num_input_tokens_seen": 50723305, "router_z_loss_clip": 4.59375, "router_z_loss_mlp": 0.4609375, "step": 2342, "time_per_iteration": 2.6445467472076416 }, { "auxiliary_loss_clip": 0.02082878, "auxiliary_loss_mlp": 0.00388842, "balance_loss_clip": 1.63145471, "balance_loss_mlp": 0.34575993, "epoch": 0.14086878100105216, "flos": 25264593423360.0, "grad_norm": 3.1388579377937753, "language_loss": 0.84251189, "learning_rate": 3.87243846010358e-06, "loss": 0.8672291, "num_input_tokens_seen": 50743270, "router_z_loss_clip": 4.515625, "router_z_loss_mlp": 0.43066406, "step": 2343, "time_per_iteration": 4.072537183761597 }, { "auxiliary_loss_clip": 0.01670557, "auxiliary_loss_mlp": 0.00229138, "balance_loss_clip": 1.44151235, "balance_loss_mlp": 0.21626341, "epoch": 0.14092890425372012, "flos": 65978388869760.0, "grad_norm": 0.8218595230577952, "language_loss": 0.6127317, "learning_rate": 3.872301561343699e-06, "loss": 0.63172865, "num_input_tokens_seen": 50802710, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.12890625, "step": 2344, "time_per_iteration": 4.485307693481445 }, { "auxiliary_loss_clip": 0.02033222, "auxiliary_loss_mlp": 0.00415453, "balance_loss_clip": 1.59605706, "balance_loss_mlp": 0.3728233, "epoch": 0.1409890275063881, "flos": 23694973931520.0, "grad_norm": 6.724294118531236, "language_loss": 0.70419282, "learning_rate": 3.872164591585956e-06, "loss": 0.72867954, "num_input_tokens_seen": 50822625, "router_z_loss_clip": 4.3671875, "router_z_loss_mlp": 0.42651367, "step": 2345, "time_per_iteration": 2.6330220699310303 }, { "auxiliary_loss_clip": 0.01990281, "auxiliary_loss_mlp": 0.00386722, "balance_loss_clip": 1.55611944, "balance_loss_mlp": 0.34290054, "epoch": 0.14104915075905605, "flos": 23623152687360.0, "grad_norm": 36.090470136190845, "language_loss": 0.83182681, "learning_rate": 3.8720275508355435e-06, "loss": 0.85559678, "num_input_tokens_seen": 50842330, "router_z_loss_clip": 4.33984375, "router_z_loss_mlp": 0.43823242, "step": 2346, "time_per_iteration": 4.011506080627441 }, { "auxiliary_loss_clip": 0.01940948, "auxiliary_loss_mlp": 0.00409171, "balance_loss_clip": 1.53149128, "balance_loss_mlp": 0.36132064, "epoch": 0.14110927401172405, "flos": 20595165102720.0, "grad_norm": 8.792522701452404, "language_loss": 0.83181393, "learning_rate": 3.8718904390976585e-06, "loss": 0.85531509, "num_input_tokens_seen": 50861035, "router_z_loss_clip": 4.09375, "router_z_loss_mlp": 0.47802734, "step": 2347, "time_per_iteration": 2.6255335807800293 }, { "auxiliary_loss_clip": 0.01938959, "auxiliary_loss_mlp": 0.00410601, "balance_loss_clip": 1.53087556, "balance_loss_mlp": 0.36732781, "epoch": 0.141169397264392, "flos": 28548049512960.0, "grad_norm": 540.9514366630341, "language_loss": 0.8413018, "learning_rate": 3.8717532563775e-06, "loss": 0.86479741, "num_input_tokens_seen": 50880105, "router_z_loss_clip": 4.08007812, "router_z_loss_mlp": 0.43286133, "step": 2348, "time_per_iteration": 2.743312358856201 }, { "auxiliary_loss_clip": 0.0192801, "auxiliary_loss_mlp": 0.00411596, "balance_loss_clip": 1.52277517, "balance_loss_mlp": 0.36634457, "epoch": 0.14122952051705998, "flos": 17092258871040.0, "grad_norm": 34.876575289301215, "language_loss": 0.92166328, "learning_rate": 3.871616002680272e-06, "loss": 0.94505942, "num_input_tokens_seen": 50897720, "router_z_loss_clip": 4.05664062, "router_z_loss_mlp": 0.45288086, "step": 2349, "time_per_iteration": 2.641409158706665 }, { "auxiliary_loss_clip": 0.01876492, "auxiliary_loss_mlp": 0.00397231, "balance_loss_clip": 1.49277854, "balance_loss_mlp": 0.35579389, "epoch": 0.14128964376972794, "flos": 28946801370240.0, "grad_norm": 25.23571791026447, "language_loss": 0.93530399, "learning_rate": 3.871478678011177e-06, "loss": 0.95804119, "num_input_tokens_seen": 50918385, "router_z_loss_clip": 3.83789062, "router_z_loss_mlp": 0.41430664, "step": 2350, "time_per_iteration": 2.7005515098571777 }, { "auxiliary_loss_clip": 0.01837938, "auxiliary_loss_mlp": 0.00406417, "balance_loss_clip": 1.47458291, "balance_loss_mlp": 0.36128461, "epoch": 0.1413497670223959, "flos": 18989778643200.0, "grad_norm": 2.69384665636976, "language_loss": 0.87426722, "learning_rate": 3.871341282375423e-06, "loss": 0.89671075, "num_input_tokens_seen": 50938270, "router_z_loss_clip": 3.63476562, "router_z_loss_mlp": 0.45141602, "step": 2351, "time_per_iteration": 2.662097215652466 }, { "auxiliary_loss_clip": 0.01820379, "auxiliary_loss_mlp": 0.0040464, "balance_loss_clip": 1.45459998, "balance_loss_mlp": 0.36472824, "epoch": 0.14140989027506387, "flos": 29862236413440.0, "grad_norm": 4.688304809743304, "language_loss": 0.89524376, "learning_rate": 3.871203815778219e-06, "loss": 0.91749394, "num_input_tokens_seen": 50958155, "router_z_loss_clip": 3.65625, "router_z_loss_mlp": 0.39916992, "step": 2352, "time_per_iteration": 2.7027509212493896 }, { "auxiliary_loss_clip": 0.01471364, "auxiliary_loss_mlp": 0.00259536, "balance_loss_clip": 1.26722121, "balance_loss_mlp": 0.24513583, "epoch": 0.14147001352773186, "flos": 62079532041600.0, "grad_norm": 5.362709869596716, "language_loss": 0.62034833, "learning_rate": 3.87106627822478e-06, "loss": 0.63765734, "num_input_tokens_seen": 51020705, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.14355469, "step": 2353, "time_per_iteration": 3.09131121635437 }, { "auxiliary_loss_clip": 0.01808988, "auxiliary_loss_mlp": 0.00430917, "balance_loss_clip": 1.4499681, "balance_loss_mlp": 0.39071941, "epoch": 0.14153013678039983, "flos": 22017514832640.0, "grad_norm": 5.219364425302211, "language_loss": 0.92351568, "learning_rate": 3.8709286697203196e-06, "loss": 0.94591469, "num_input_tokens_seen": 51039995, "router_z_loss_clip": 3.58984375, "router_z_loss_mlp": 0.40185547, "step": 2354, "time_per_iteration": 2.6497347354888916 }, { "auxiliary_loss_clip": 0.0180633, "auxiliary_loss_mlp": 0.00407942, "balance_loss_clip": 1.4441396, "balance_loss_mlp": 0.36793587, "epoch": 0.1415902600330678, "flos": 19720093968000.0, "grad_norm": 8.222446717949463, "language_loss": 0.79457772, "learning_rate": 3.870790990270057e-06, "loss": 0.81672037, "num_input_tokens_seen": 51059075, "router_z_loss_clip": 3.61914062, "router_z_loss_mlp": 0.40014648, "step": 2355, "time_per_iteration": 2.7837769985198975 }, { "auxiliary_loss_clip": 0.01570487, "auxiliary_loss_mlp": 0.00371067, "balance_loss_clip": 1.35978174, "balance_loss_mlp": 0.35161239, "epoch": 0.14165038328573576, "flos": 65900929190400.0, "grad_norm": 0.6735003149634003, "language_loss": 0.51615012, "learning_rate": 3.870653239879212e-06, "loss": 0.53556567, "num_input_tokens_seen": 51120380, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.19433594, "step": 2356, "time_per_iteration": 3.071575880050659 }, { "auxiliary_loss_clip": 0.0181862, "auxiliary_loss_mlp": 0.00369091, "balance_loss_clip": 1.45944858, "balance_loss_mlp": 0.33008525, "epoch": 0.14171050653840372, "flos": 12130158533760.0, "grad_norm": 10.699405528032358, "language_loss": 0.78852111, "learning_rate": 3.8705154185530095e-06, "loss": 0.81039822, "num_input_tokens_seen": 51136950, "router_z_loss_clip": 3.59375, "router_z_loss_mlp": 0.38989258, "step": 2357, "time_per_iteration": 2.620875835418701 }, { "auxiliary_loss_clip": 0.01803306, "auxiliary_loss_mlp": 0.00374961, "balance_loss_clip": 1.44701469, "balance_loss_mlp": 0.33519322, "epoch": 0.1417706297910717, "flos": 20412487509120.0, "grad_norm": 16.5363197681133, "language_loss": 0.89480424, "learning_rate": 3.870377526296674e-06, "loss": 0.916587, "num_input_tokens_seen": 51155175, "router_z_loss_clip": 3.56054688, "router_z_loss_mlp": 0.39770508, "step": 2358, "time_per_iteration": 2.6388328075408936 }, { "auxiliary_loss_clip": 0.01787049, "auxiliary_loss_mlp": 0.00411493, "balance_loss_clip": 1.42879152, "balance_loss_mlp": 0.36724269, "epoch": 0.14183075304373965, "flos": 22380607463040.0, "grad_norm": 107.35664589963177, "language_loss": 0.76729172, "learning_rate": 3.870239563115436e-06, "loss": 0.7892772, "num_input_tokens_seen": 51174500, "router_z_loss_clip": 3.58203125, "router_z_loss_mlp": 0.44238281, "step": 2359, "time_per_iteration": 2.6506779193878174 }, { "auxiliary_loss_clip": 0.01797141, "auxiliary_loss_mlp": 0.00346592, "balance_loss_clip": 1.43993688, "balance_loss_mlp": 0.30875525, "epoch": 0.14189087629640765, "flos": 21580913018880.0, "grad_norm": 4.3025988163657045, "language_loss": 0.83552408, "learning_rate": 3.870101529014526e-06, "loss": 0.85696149, "num_input_tokens_seen": 51194270, "router_z_loss_clip": 3.57226562, "router_z_loss_mlp": 0.37866211, "step": 2360, "time_per_iteration": 2.6694557666778564 }, { "auxiliary_loss_clip": 0.01780456, "auxiliary_loss_mlp": 0.00393872, "balance_loss_clip": 1.4302392, "balance_loss_mlp": 0.35167181, "epoch": 0.1419509995490756, "flos": 20008564093440.0, "grad_norm": 18.53570245932095, "language_loss": 0.88981366, "learning_rate": 3.869963423999178e-06, "loss": 0.9115569, "num_input_tokens_seen": 51211850, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.42211914, "step": 2361, "time_per_iteration": 2.6207611560821533 }, { "auxiliary_loss_clip": 0.01780153, "auxiliary_loss_mlp": 0.00398142, "balance_loss_clip": 1.43254113, "balance_loss_mlp": 0.3578974, "epoch": 0.14201112280174358, "flos": 31941464112000.0, "grad_norm": 9.995260919987503, "language_loss": 0.81404436, "learning_rate": 3.86982524807463e-06, "loss": 0.83582735, "num_input_tokens_seen": 51233545, "router_z_loss_clip": 3.4765625, "router_z_loss_mlp": 0.40234375, "step": 2362, "time_per_iteration": 2.6999425888061523 }, { "auxiliary_loss_clip": 0.01759006, "auxiliary_loss_mlp": 0.0039743, "balance_loss_clip": 1.41712737, "balance_loss_mlp": 0.3564699, "epoch": 0.14207124605441154, "flos": 41464147582080.0, "grad_norm": 25.58773738037776, "language_loss": 0.79121161, "learning_rate": 3.869687001246122e-06, "loss": 0.81277591, "num_input_tokens_seen": 51257615, "router_z_loss_clip": 3.41796875, "router_z_loss_mlp": 0.40966797, "step": 2363, "time_per_iteration": 2.7948222160339355 }, { "auxiliary_loss_clip": 0.0177647, "auxiliary_loss_mlp": 0.00367624, "balance_loss_clip": 1.43155575, "balance_loss_mlp": 0.32728389, "epoch": 0.1421313693070795, "flos": 31905086613120.0, "grad_norm": 23.199158576795803, "language_loss": 0.78158778, "learning_rate": 3.8695486835188946e-06, "loss": 0.8030287, "num_input_tokens_seen": 51279645, "router_z_loss_clip": 3.45117188, "router_z_loss_mlp": 0.40307617, "step": 2364, "time_per_iteration": 2.7205965518951416 }, { "auxiliary_loss_clip": 0.01757075, "auxiliary_loss_mlp": 0.00378757, "balance_loss_clip": 1.41780233, "balance_loss_mlp": 0.34096786, "epoch": 0.14219149255974747, "flos": 26871165031680.0, "grad_norm": 329.28910810495034, "language_loss": 0.95872331, "learning_rate": 3.869410294898195e-06, "loss": 0.98008168, "num_input_tokens_seen": 51299775, "router_z_loss_clip": 3.39257812, "router_z_loss_mlp": 0.37817383, "step": 2365, "time_per_iteration": 2.690016269683838 }, { "auxiliary_loss_clip": 0.01752763, "auxiliary_loss_mlp": 0.00382856, "balance_loss_clip": 1.40990853, "balance_loss_mlp": 0.34413719, "epoch": 0.14225161581241544, "flos": 27454426076160.0, "grad_norm": 8.404680241420886, "language_loss": 0.73477799, "learning_rate": 3.869271835389268e-06, "loss": 0.75613415, "num_input_tokens_seen": 51319430, "router_z_loss_clip": 3.42382812, "router_z_loss_mlp": 0.38696289, "step": 2366, "time_per_iteration": 2.6327381134033203 }, { "auxiliary_loss_clip": 0.01764242, "auxiliary_loss_mlp": 0.00374986, "balance_loss_clip": 1.42452884, "balance_loss_mlp": 0.33590925, "epoch": 0.14231173906508343, "flos": 10561436881920.0, "grad_norm": 6.434289115218777, "language_loss": 0.87306643, "learning_rate": 3.8691333049973665e-06, "loss": 0.89445865, "num_input_tokens_seen": 51336045, "router_z_loss_clip": 3.3984375, "router_z_loss_mlp": 0.390625, "step": 2367, "time_per_iteration": 2.6381852626800537 }, { "auxiliary_loss_clip": 0.01767899, "auxiliary_loss_mlp": 0.00348111, "balance_loss_clip": 1.42072153, "balance_loss_mlp": 0.30903459, "epoch": 0.1423718623177514, "flos": 28360882719360.0, "grad_norm": 25.832668353102697, "language_loss": 0.88672334, "learning_rate": 3.868994703727742e-06, "loss": 0.90788341, "num_input_tokens_seen": 51357030, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.390625, "step": 2368, "time_per_iteration": 2.6883957386016846 }, { "auxiliary_loss_clip": 0.01774214, "auxiliary_loss_mlp": 0.00378691, "balance_loss_clip": 1.42986274, "balance_loss_mlp": 0.3401868, "epoch": 0.14243198557041936, "flos": 19354235990400.0, "grad_norm": 4.616907820275121, "language_loss": 0.92658049, "learning_rate": 3.868856031585652e-06, "loss": 0.94810957, "num_input_tokens_seen": 51374890, "router_z_loss_clip": 3.4453125, "router_z_loss_mlp": 0.38476562, "step": 2369, "time_per_iteration": 2.7533013820648193 }, { "auxiliary_loss_clip": 0.01756007, "auxiliary_loss_mlp": 0.00365948, "balance_loss_clip": 1.41179323, "balance_loss_mlp": 0.32794419, "epoch": 0.14249210882308733, "flos": 28806857982720.0, "grad_norm": 2.9828542658169948, "language_loss": 0.81392866, "learning_rate": 3.868717288576354e-06, "loss": 0.83514822, "num_input_tokens_seen": 51398100, "router_z_loss_clip": 3.44140625, "router_z_loss_mlp": 0.37988281, "step": 2370, "time_per_iteration": 2.676750421524048 }, { "auxiliary_loss_clip": 0.01751973, "auxiliary_loss_mlp": 0.00366553, "balance_loss_clip": 1.41430831, "balance_loss_mlp": 0.32871571, "epoch": 0.1425522320757553, "flos": 21835016807040.0, "grad_norm": 14.589047947275677, "language_loss": 0.88186145, "learning_rate": 3.868578474705109e-06, "loss": 0.90304673, "num_input_tokens_seen": 51418745, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.37817383, "step": 2371, "time_per_iteration": 2.7274887561798096 }, { "auxiliary_loss_clip": 0.01779486, "auxiliary_loss_mlp": 0.00374675, "balance_loss_clip": 1.43787217, "balance_loss_mlp": 0.3339054, "epoch": 0.14261235532842326, "flos": 17311457617920.0, "grad_norm": 5.262649518158599, "language_loss": 0.88381112, "learning_rate": 3.868439589977181e-06, "loss": 0.90535271, "num_input_tokens_seen": 51437455, "router_z_loss_clip": 3.41796875, "router_z_loss_mlp": 0.40771484, "step": 2372, "time_per_iteration": 2.583505392074585 }, { "auxiliary_loss_clip": 0.01760009, "auxiliary_loss_mlp": 0.00382406, "balance_loss_clip": 1.41863215, "balance_loss_mlp": 0.34325802, "epoch": 0.14267247858109125, "flos": 18806741913600.0, "grad_norm": 48.06618907907713, "language_loss": 0.90282965, "learning_rate": 3.868300634397836e-06, "loss": 0.92425376, "num_input_tokens_seen": 51455710, "router_z_loss_clip": 3.41601562, "router_z_loss_mlp": 0.39135742, "step": 2373, "time_per_iteration": 2.623603105545044 }, { "auxiliary_loss_clip": 0.01784183, "auxiliary_loss_mlp": 0.00367861, "balance_loss_clip": 1.43486762, "balance_loss_mlp": 0.33069167, "epoch": 0.14273260183375922, "flos": 11358904682880.0, "grad_norm": 4.223651037681332, "language_loss": 0.92535776, "learning_rate": 3.8681616079723445e-06, "loss": 0.94687819, "num_input_tokens_seen": 51471270, "router_z_loss_clip": 3.49414062, "router_z_loss_mlp": 0.37158203, "step": 2374, "time_per_iteration": 2.6366302967071533 }, { "auxiliary_loss_clip": 0.01778329, "auxiliary_loss_mlp": 0.00389582, "balance_loss_clip": 1.42927122, "balance_loss_mlp": 0.3485744, "epoch": 0.14279272508642718, "flos": 27567688636800.0, "grad_norm": 3.4394784391948057, "language_loss": 0.84618247, "learning_rate": 3.868022510705977e-06, "loss": 0.86786157, "num_input_tokens_seen": 51492705, "router_z_loss_clip": 3.4921875, "router_z_loss_mlp": 0.41015625, "step": 2375, "time_per_iteration": 2.6902639865875244 }, { "auxiliary_loss_clip": 0.0178427, "auxiliary_loss_mlp": 0.00343937, "balance_loss_clip": 1.43798435, "balance_loss_mlp": 0.30648115, "epoch": 0.14285284833909515, "flos": 16252559654400.0, "grad_norm": 11.33482313462612, "language_loss": 0.84746844, "learning_rate": 3.867883342604009e-06, "loss": 0.86875045, "num_input_tokens_seen": 51510780, "router_z_loss_clip": 3.4609375, "router_z_loss_mlp": 0.37451172, "step": 2376, "time_per_iteration": 2.568498373031616 }, { "auxiliary_loss_clip": 0.01783164, "auxiliary_loss_mlp": 0.00363178, "balance_loss_clip": 1.43492997, "balance_loss_mlp": 0.32600805, "epoch": 0.1429129715917631, "flos": 19755609540480.0, "grad_norm": 7.0043441061109055, "language_loss": 1.00391674, "learning_rate": 3.867744103671717e-06, "loss": 1.02538037, "num_input_tokens_seen": 51531400, "router_z_loss_clip": 3.48242188, "router_z_loss_mlp": 0.37182617, "step": 2377, "time_per_iteration": 2.650938034057617 }, { "auxiliary_loss_clip": 0.01773079, "auxiliary_loss_mlp": 0.00357865, "balance_loss_clip": 1.42280698, "balance_loss_mlp": 0.3164992, "epoch": 0.14297309484443108, "flos": 21137092571520.0, "grad_norm": 282.80613083252666, "language_loss": 0.97854519, "learning_rate": 3.867604793914382e-06, "loss": 0.99985462, "num_input_tokens_seen": 51548215, "router_z_loss_clip": 3.50390625, "router_z_loss_mlp": 0.41381836, "step": 2378, "time_per_iteration": 2.6553452014923096 }, { "auxiliary_loss_clip": 0.0177129, "auxiliary_loss_mlp": 0.00338589, "balance_loss_clip": 1.41915929, "balance_loss_mlp": 0.30039483, "epoch": 0.14303321809709904, "flos": 23586667447680.0, "grad_norm": 3.1218105197579544, "language_loss": 0.80126053, "learning_rate": 3.8674654133372864e-06, "loss": 0.82235932, "num_input_tokens_seen": 51566820, "router_z_loss_clip": 3.51953125, "router_z_loss_mlp": 0.38208008, "step": 2379, "time_per_iteration": 2.663740634918213 }, { "auxiliary_loss_clip": 0.01792043, "auxiliary_loss_mlp": 0.00353282, "balance_loss_clip": 1.43882346, "balance_loss_mlp": 0.31380028, "epoch": 0.14309334134976703, "flos": 15888281875200.0, "grad_norm": 335.0049147347527, "language_loss": 0.84888309, "learning_rate": 3.867325961945714e-06, "loss": 0.87033629, "num_input_tokens_seen": 51585075, "router_z_loss_clip": 3.52929688, "router_z_loss_mlp": 0.39453125, "step": 2380, "time_per_iteration": 2.6369264125823975 }, { "auxiliary_loss_clip": 0.01833814, "auxiliary_loss_mlp": 0.00364252, "balance_loss_clip": 1.46819806, "balance_loss_mlp": 0.32333899, "epoch": 0.143153464602435, "flos": 16325601960960.0, "grad_norm": 10.921680456110401, "language_loss": 0.95441324, "learning_rate": 3.867186439744955e-06, "loss": 0.97639394, "num_input_tokens_seen": 51603185, "router_z_loss_clip": 3.65429688, "router_z_loss_mlp": 0.40917969, "step": 2381, "time_per_iteration": 2.6100270748138428 }, { "auxiliary_loss_clip": 0.01840909, "auxiliary_loss_mlp": 0.00343158, "balance_loss_clip": 1.47377086, "balance_loss_mlp": 0.30236414, "epoch": 0.14321358785510296, "flos": 17092079303040.0, "grad_norm": 213.88943368808603, "language_loss": 0.81535631, "learning_rate": 3.867046846740299e-06, "loss": 0.83719695, "num_input_tokens_seen": 51620880, "router_z_loss_clip": 3.66796875, "router_z_loss_mlp": 0.40771484, "step": 2382, "time_per_iteration": 4.0442914962768555 }, { "auxiliary_loss_clip": 0.01842518, "auxiliary_loss_mlp": 0.00365934, "balance_loss_clip": 1.47344756, "balance_loss_mlp": 0.32611817, "epoch": 0.14327371110777093, "flos": 26322916769280.0, "grad_norm": 39.32595587675109, "language_loss": 0.82171184, "learning_rate": 3.866907182937039e-06, "loss": 0.84379637, "num_input_tokens_seen": 51640170, "router_z_loss_clip": 3.69140625, "router_z_loss_mlp": 0.39794922, "step": 2383, "time_per_iteration": 2.6947667598724365 }, { "auxiliary_loss_clip": 0.01868796, "auxiliary_loss_mlp": 0.00386162, "balance_loss_clip": 1.49248052, "balance_loss_mlp": 0.34110087, "epoch": 0.1433338343604389, "flos": 18076462502400.0, "grad_norm": 32.11274620152352, "language_loss": 0.95956588, "learning_rate": 3.866767448340471e-06, "loss": 0.98211551, "num_input_tokens_seen": 51656580, "router_z_loss_clip": 3.7578125, "router_z_loss_mlp": 0.45043945, "step": 2384, "time_per_iteration": 2.6118345260620117 }, { "auxiliary_loss_clip": 0.01862857, "auxiliary_loss_mlp": 0.00408285, "balance_loss_clip": 1.486871, "balance_loss_mlp": 0.36124513, "epoch": 0.14339395761310686, "flos": 15522783033600.0, "grad_norm": 23.268996123529075, "language_loss": 0.87953472, "learning_rate": 3.866627642955895e-06, "loss": 0.90224612, "num_input_tokens_seen": 51674645, "router_z_loss_clip": 3.7578125, "router_z_loss_mlp": 0.47045898, "step": 2385, "time_per_iteration": 4.064507722854614 }, { "auxiliary_loss_clip": 0.01905547, "auxiliary_loss_mlp": 0.00366629, "balance_loss_clip": 1.51066089, "balance_loss_mlp": 0.32435691, "epoch": 0.14345408086577485, "flos": 28548767784960.0, "grad_norm": 3.0406915829310965, "language_loss": 0.81431013, "learning_rate": 3.866487766788612e-06, "loss": 0.8370319, "num_input_tokens_seen": 51695770, "router_z_loss_clip": 3.9453125, "router_z_loss_mlp": 0.4230957, "step": 2386, "time_per_iteration": 4.154547214508057 }, { "auxiliary_loss_clip": 0.01904887, "auxiliary_loss_mlp": 0.00373229, "balance_loss_clip": 1.52083826, "balance_loss_mlp": 0.32888326, "epoch": 0.14351420411844282, "flos": 20230061310720.0, "grad_norm": 16.363844209280597, "language_loss": 0.84211659, "learning_rate": 3.866347819843925e-06, "loss": 0.86489773, "num_input_tokens_seen": 51714165, "router_z_loss_clip": 3.84179688, "router_z_loss_mlp": 0.44360352, "step": 2387, "time_per_iteration": 2.609616994857788 }, { "auxiliary_loss_clip": 0.01930062, "auxiliary_loss_mlp": 0.00357174, "balance_loss_clip": 1.53863335, "balance_loss_mlp": 0.31452096, "epoch": 0.14357432737111078, "flos": 19865029345920.0, "grad_norm": 39.692086733002924, "language_loss": 0.8902775, "learning_rate": 3.866207802127143e-06, "loss": 0.91314983, "num_input_tokens_seen": 51734440, "router_z_loss_clip": 3.90820312, "router_z_loss_mlp": 0.42651367, "step": 2388, "time_per_iteration": 4.02367901802063 }, { "auxiliary_loss_clip": 0.01956346, "auxiliary_loss_mlp": 0.00405264, "balance_loss_clip": 1.55022836, "balance_loss_mlp": 0.36230069, "epoch": 0.14363445062377875, "flos": 28256814040320.0, "grad_norm": 16.79098415104295, "language_loss": 0.88824296, "learning_rate": 3.866067713643573e-06, "loss": 0.9118591, "num_input_tokens_seen": 51753730, "router_z_loss_clip": 4.0625, "router_z_loss_mlp": 0.42993164, "step": 2389, "time_per_iteration": 2.702955961227417 }, { "auxiliary_loss_clip": 0.0196807, "auxiliary_loss_mlp": 0.00410461, "balance_loss_clip": 1.55828047, "balance_loss_mlp": 0.36370677, "epoch": 0.1436945738764467, "flos": 18186672407040.0, "grad_norm": 34.88833990220932, "language_loss": 0.91085166, "learning_rate": 3.8659275543985285e-06, "loss": 0.93463695, "num_input_tokens_seen": 51771195, "router_z_loss_clip": 4.09570312, "router_z_loss_mlp": 0.4675293, "step": 2390, "time_per_iteration": 2.5904757976531982 }, { "auxiliary_loss_clip": 0.01981039, "auxiliary_loss_mlp": 0.00378743, "balance_loss_clip": 1.55858874, "balance_loss_mlp": 0.33337238, "epoch": 0.14375469712911468, "flos": 27307910499840.0, "grad_norm": 40.63468257236911, "language_loss": 0.80678785, "learning_rate": 3.865787324397324e-06, "loss": 0.83038568, "num_input_tokens_seen": 51792290, "router_z_loss_clip": 4.23046875, "router_z_loss_mlp": 0.45410156, "step": 2391, "time_per_iteration": 2.685671806335449 }, { "auxiliary_loss_clip": 0.01762629, "auxiliary_loss_mlp": 0.00159422, "balance_loss_clip": 1.50587845, "balance_loss_mlp": 0.14435354, "epoch": 0.14381482038178264, "flos": 56891445287040.0, "grad_norm": 0.8952846955429091, "language_loss": 0.61896211, "learning_rate": 3.865647023645277e-06, "loss": 0.63818264, "num_input_tokens_seen": 51843675, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.15039062, "step": 2392, "time_per_iteration": 2.9671523571014404 }, { "auxiliary_loss_clip": 0.01989099, "auxiliary_loss_mlp": 0.00397518, "balance_loss_clip": 1.56482434, "balance_loss_mlp": 0.35183722, "epoch": 0.14387494363445064, "flos": 14282177143680.0, "grad_norm": 11.840894828039522, "language_loss": 0.85670269, "learning_rate": 3.865506652147709e-06, "loss": 0.88056886, "num_input_tokens_seen": 51860285, "router_z_loss_clip": 4.25, "router_z_loss_mlp": 0.45727539, "step": 2393, "time_per_iteration": 2.6457254886627197 }, { "auxiliary_loss_clip": 0.01969779, "auxiliary_loss_mlp": 0.00408238, "balance_loss_clip": 1.55305719, "balance_loss_mlp": 0.36613351, "epoch": 0.1439350668871186, "flos": 26761493831040.0, "grad_norm": 26.748057841034655, "language_loss": 0.82969052, "learning_rate": 3.865366209909941e-06, "loss": 0.85347062, "num_input_tokens_seen": 51880105, "router_z_loss_clip": 4.16992188, "router_z_loss_mlp": 0.42089844, "step": 2394, "time_per_iteration": 2.743086099624634 }, { "auxiliary_loss_clip": 0.01993561, "auxiliary_loss_mlp": 0.00395187, "balance_loss_clip": 1.56672835, "balance_loss_mlp": 0.35038793, "epoch": 0.14399519013978657, "flos": 40700040537600.0, "grad_norm": 4.41345706136014, "language_loss": 0.91500854, "learning_rate": 3.8652256969372994e-06, "loss": 0.93889606, "num_input_tokens_seen": 51905175, "router_z_loss_clip": 4.26757812, "router_z_loss_mlp": 0.44824219, "step": 2395, "time_per_iteration": 2.888425827026367 }, { "auxiliary_loss_clip": 0.01996581, "auxiliary_loss_mlp": 0.00392601, "balance_loss_clip": 1.56460333, "balance_loss_mlp": 0.34708679, "epoch": 0.14405531339245453, "flos": 20557530627840.0, "grad_norm": 2.822293253579182, "language_loss": 0.86897254, "learning_rate": 3.865085113235113e-06, "loss": 0.89286435, "num_input_tokens_seen": 51924490, "router_z_loss_clip": 4.32421875, "router_z_loss_mlp": 0.45556641, "step": 2396, "time_per_iteration": 2.620025873184204 }, { "auxiliary_loss_clip": 0.02013514, "auxiliary_loss_mlp": 0.00379534, "balance_loss_clip": 1.5828526, "balance_loss_mlp": 0.33621341, "epoch": 0.1441154366451225, "flos": 19572931946880.0, "grad_norm": 3.9101165647065943, "language_loss": 0.90137064, "learning_rate": 3.864944458808712e-06, "loss": 0.92530107, "num_input_tokens_seen": 51940490, "router_z_loss_clip": 4.30859375, "router_z_loss_mlp": 0.43310547, "step": 2397, "time_per_iteration": 2.6658072471618652 }, { "auxiliary_loss_clip": 0.0202295, "auxiliary_loss_mlp": 0.00378338, "balance_loss_clip": 1.58125758, "balance_loss_mlp": 0.3357566, "epoch": 0.14417555989779046, "flos": 18515721922560.0, "grad_norm": 1.72657615346728, "language_loss": 0.85516918, "learning_rate": 3.86480373366343e-06, "loss": 0.8791821, "num_input_tokens_seen": 51957910, "router_z_loss_clip": 4.41796875, "router_z_loss_mlp": 0.42602539, "step": 2398, "time_per_iteration": 2.6088039875030518 }, { "auxiliary_loss_clip": 0.01985428, "auxiliary_loss_mlp": 0.0037477, "balance_loss_clip": 1.55962074, "balance_loss_mlp": 0.3313542, "epoch": 0.14423568315045843, "flos": 26031681296640.0, "grad_norm": 5.397865341436297, "language_loss": 0.70474309, "learning_rate": 3.864662937804603e-06, "loss": 0.72834504, "num_input_tokens_seen": 51978010, "router_z_loss_clip": 4.2578125, "router_z_loss_mlp": 0.43432617, "step": 2399, "time_per_iteration": 2.6856799125671387 }, { "auxiliary_loss_clip": 0.02000048, "auxiliary_loss_mlp": 0.0038467, "balance_loss_clip": 1.57548606, "balance_loss_mlp": 0.34242195, "epoch": 0.14429580640312642, "flos": 21288743792640.0, "grad_norm": 7.140133738073261, "language_loss": 0.87982303, "learning_rate": 3.864522071237571e-06, "loss": 0.90367019, "num_input_tokens_seen": 51998515, "router_z_loss_clip": 4.24023438, "router_z_loss_mlp": 0.42236328, "step": 2400, "time_per_iteration": 2.643049955368042 }, { "auxiliary_loss_clip": 0.0196364, "auxiliary_loss_mlp": 0.00343506, "balance_loss_clip": 1.54349065, "balance_loss_mlp": 0.30094877, "epoch": 0.14435592965579438, "flos": 25627865621760.0, "grad_norm": 8.092756271382756, "language_loss": 0.80937696, "learning_rate": 3.864381133967676e-06, "loss": 0.83244836, "num_input_tokens_seen": 52019270, "router_z_loss_clip": 4.20117188, "router_z_loss_mlp": 0.42553711, "step": 2401, "time_per_iteration": 2.6657564640045166 }, { "auxiliary_loss_clip": 0.01953315, "auxiliary_loss_mlp": 0.00328671, "balance_loss_clip": 1.54539299, "balance_loss_mlp": 0.28701976, "epoch": 0.14441605290846235, "flos": 22965053656320.0, "grad_norm": 21.731361269867882, "language_loss": 0.86387753, "learning_rate": 3.86424012600026e-06, "loss": 0.88669741, "num_input_tokens_seen": 52039315, "router_z_loss_clip": 4.08007812, "router_z_loss_mlp": 0.41699219, "step": 2402, "time_per_iteration": 2.780905246734619 }, { "auxiliary_loss_clip": 0.01933597, "auxiliary_loss_mlp": 0.00362055, "balance_loss_clip": 1.53447413, "balance_loss_mlp": 0.31935406, "epoch": 0.14447617616113032, "flos": 17347655548800.0, "grad_norm": 40.011975048933046, "language_loss": 0.89684606, "learning_rate": 3.864099047340673e-06, "loss": 0.91980261, "num_input_tokens_seen": 52056555, "router_z_loss_clip": 3.9921875, "router_z_loss_mlp": 0.42700195, "step": 2403, "time_per_iteration": 2.6401710510253906 }, { "auxiliary_loss_clip": 0.01945862, "auxiliary_loss_mlp": 0.00373547, "balance_loss_clip": 1.5433166, "balance_loss_mlp": 0.33020258, "epoch": 0.14453629941379828, "flos": 24060185464320.0, "grad_norm": 3.275541944880875, "language_loss": 0.75479782, "learning_rate": 3.863957897994262e-06, "loss": 0.77799189, "num_input_tokens_seen": 52075800, "router_z_loss_clip": 4.02539062, "router_z_loss_mlp": 0.43334961, "step": 2404, "time_per_iteration": 2.665656566619873 }, { "auxiliary_loss_clip": 0.01964149, "auxiliary_loss_mlp": 0.00371528, "balance_loss_clip": 1.54704273, "balance_loss_mlp": 0.32804018, "epoch": 0.14459642266646625, "flos": 14429554646400.0, "grad_norm": 10.821297317715707, "language_loss": 0.80543625, "learning_rate": 3.863816677966381e-06, "loss": 0.82879305, "num_input_tokens_seen": 52092585, "router_z_loss_clip": 4.1640625, "router_z_loss_mlp": 0.43481445, "step": 2405, "time_per_iteration": 2.5817453861236572 }, { "auxiliary_loss_clip": 0.0191782, "auxiliary_loss_mlp": 0.00345224, "balance_loss_clip": 1.52187777, "balance_loss_mlp": 0.3051694, "epoch": 0.14465654591913424, "flos": 9867032179200.0, "grad_norm": 11.358528036896125, "language_loss": 0.7975536, "learning_rate": 3.863675387262386e-06, "loss": 0.82018399, "num_input_tokens_seen": 52108990, "router_z_loss_clip": 3.9609375, "router_z_loss_mlp": 0.40014648, "step": 2406, "time_per_iteration": 2.609281301498413 }, { "auxiliary_loss_clip": 0.01913331, "auxiliary_loss_mlp": 0.00412383, "balance_loss_clip": 1.51612663, "balance_loss_mlp": 0.36851397, "epoch": 0.1447166691718022, "flos": 24972926987520.0, "grad_norm": 8.564345447161083, "language_loss": 0.8308816, "learning_rate": 3.8635340258876325e-06, "loss": 0.85413879, "num_input_tokens_seen": 52125385, "router_z_loss_clip": 3.97265625, "router_z_loss_mlp": 0.4387207, "step": 2407, "time_per_iteration": 2.680971384048462 }, { "auxiliary_loss_clip": 0.01873664, "auxiliary_loss_mlp": 0.00346411, "balance_loss_clip": 1.4928422, "balance_loss_mlp": 0.30719098, "epoch": 0.14477679242447017, "flos": 21908023200000.0, "grad_norm": 3.6568476621556907, "language_loss": 0.83740807, "learning_rate": 3.8633925938474826e-06, "loss": 0.85960877, "num_input_tokens_seen": 52144985, "router_z_loss_clip": 3.80664062, "router_z_loss_mlp": 0.39233398, "step": 2408, "time_per_iteration": 2.698587656021118 }, { "auxiliary_loss_clip": 0.01856084, "auxiliary_loss_mlp": 0.00409487, "balance_loss_clip": 1.47945261, "balance_loss_mlp": 0.36673906, "epoch": 0.14483691567713813, "flos": 20740746925440.0, "grad_norm": 61.966759737185065, "language_loss": 0.8914237, "learning_rate": 3.863251091147299e-06, "loss": 0.91407943, "num_input_tokens_seen": 52163885, "router_z_loss_clip": 3.765625, "router_z_loss_mlp": 0.42724609, "step": 2409, "time_per_iteration": 2.6334686279296875 }, { "auxiliary_loss_clip": 0.01851472, "auxiliary_loss_mlp": 0.00375725, "balance_loss_clip": 1.47214603, "balance_loss_mlp": 0.33502728, "epoch": 0.1448970389298061, "flos": 35407705536000.0, "grad_norm": 3.021720198671276, "language_loss": 0.83693218, "learning_rate": 3.863109517792446e-06, "loss": 0.85920417, "num_input_tokens_seen": 52184325, "router_z_loss_clip": 3.79101562, "router_z_loss_mlp": 0.40698242, "step": 2410, "time_per_iteration": 2.7589001655578613 }, { "auxiliary_loss_clip": 0.01827099, "auxiliary_loss_mlp": 0.00370853, "balance_loss_clip": 1.46402705, "balance_loss_mlp": 0.33242032, "epoch": 0.14495716218247406, "flos": 15414368808960.0, "grad_norm": 4.5319304439509205, "language_loss": 0.87651128, "learning_rate": 3.8629678737882945e-06, "loss": 0.89849079, "num_input_tokens_seen": 52202740, "router_z_loss_clip": 3.62890625, "router_z_loss_mlp": 0.38427734, "step": 2411, "time_per_iteration": 2.58463978767395 }, { "auxiliary_loss_clip": 0.01803531, "auxiliary_loss_mlp": 0.0043411, "balance_loss_clip": 1.44635355, "balance_loss_mlp": 0.39398423, "epoch": 0.14501728543514203, "flos": 33693222493440.0, "grad_norm": 13.367023216544364, "language_loss": 0.78341079, "learning_rate": 3.862826159140214e-06, "loss": 0.80578721, "num_input_tokens_seen": 52223100, "router_z_loss_clip": 3.5703125, "router_z_loss_mlp": 0.40161133, "step": 2412, "time_per_iteration": 2.735686779022217 }, { "auxiliary_loss_clip": 0.01787869, "auxiliary_loss_mlp": 0.00405399, "balance_loss_clip": 1.44090044, "balance_loss_mlp": 0.36858737, "epoch": 0.14507740868781002, "flos": 15596112648960.0, "grad_norm": 3.257091823027264, "language_loss": 0.82785404, "learning_rate": 3.862684373853579e-06, "loss": 0.84978676, "num_input_tokens_seen": 52239690, "router_z_loss_clip": 3.47460938, "router_z_loss_mlp": 0.36816406, "step": 2413, "time_per_iteration": 2.574233055114746 }, { "auxiliary_loss_clip": 0.01526728, "auxiliary_loss_mlp": 0.00146514, "balance_loss_clip": 1.3108052, "balance_loss_mlp": 0.13435462, "epoch": 0.145137531940478, "flos": 66675343438080.0, "grad_norm": 1.0579816518967537, "language_loss": 0.58784711, "learning_rate": 3.8625425179337656e-06, "loss": 0.60457945, "num_input_tokens_seen": 52296705, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.12158203, "step": 2414, "time_per_iteration": 3.015807867050171 }, { "auxiliary_loss_clip": 0.01524396, "auxiliary_loss_mlp": 0.00167644, "balance_loss_clip": 1.31194592, "balance_loss_mlp": 0.15724865, "epoch": 0.14519765519314595, "flos": 67521578929920.0, "grad_norm": 0.8388172853507377, "language_loss": 0.61602587, "learning_rate": 3.862400591386154e-06, "loss": 0.63294625, "num_input_tokens_seen": 52361830, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.10400391, "step": 2415, "time_per_iteration": 3.173332929611206 }, { "auxiliary_loss_clip": 0.01687723, "auxiliary_loss_mlp": 0.00404728, "balance_loss_clip": 1.36368835, "balance_loss_mlp": 0.36622381, "epoch": 0.14525777844581392, "flos": 17198913329280.0, "grad_norm": 9.306326006444426, "language_loss": 0.80068541, "learning_rate": 3.8622585942161245e-06, "loss": 0.82160991, "num_input_tokens_seen": 52379420, "router_z_loss_clip": 3.2421875, "router_z_loss_mlp": 0.38549805, "step": 2416, "time_per_iteration": 2.6195569038391113 }, { "auxiliary_loss_clip": 0.01482566, "auxiliary_loss_mlp": 0.00209679, "balance_loss_clip": 1.2708292, "balance_loss_mlp": 0.19699517, "epoch": 0.14531790169848188, "flos": 65404609015680.0, "grad_norm": 0.7281077376760037, "language_loss": 0.60357046, "learning_rate": 3.8621165264290635e-06, "loss": 0.62049294, "num_input_tokens_seen": 52446290, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.12695312, "step": 2417, "time_per_iteration": 3.153162717819214 }, { "auxiliary_loss_clip": 0.01647566, "auxiliary_loss_mlp": 0.00464211, "balance_loss_clip": 1.32781172, "balance_loss_mlp": 0.42432371, "epoch": 0.14537802495114985, "flos": 32562467372160.0, "grad_norm": 17.33098192830113, "language_loss": 0.85923439, "learning_rate": 3.861974388030356e-06, "loss": 0.88035214, "num_input_tokens_seen": 52467295, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.39916992, "step": 2418, "time_per_iteration": 2.748457670211792 }, { "auxiliary_loss_clip": 0.01635654, "auxiliary_loss_mlp": 0.00456069, "balance_loss_clip": 1.32627416, "balance_loss_mlp": 0.42189133, "epoch": 0.1454381482038178, "flos": 20226685432320.0, "grad_norm": 6.879132254755813, "language_loss": 0.77730578, "learning_rate": 3.861832179025394e-06, "loss": 0.79822302, "num_input_tokens_seen": 52487295, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.3416748, "step": 2419, "time_per_iteration": 2.740342140197754 }, { "auxiliary_loss_clip": 0.01639496, "auxiliary_loss_mlp": 0.00444252, "balance_loss_clip": 1.32498145, "balance_loss_mlp": 0.40949056, "epoch": 0.1454982714564858, "flos": 22893124671360.0, "grad_norm": 5.665089792536025, "language_loss": 0.98082501, "learning_rate": 3.861689899419569e-06, "loss": 1.00166249, "num_input_tokens_seen": 52504220, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.34765625, "step": 2420, "time_per_iteration": 2.731821298599243 }, { "auxiliary_loss_clip": 0.01627975, "auxiliary_loss_mlp": 0.00452102, "balance_loss_clip": 1.32084846, "balance_loss_mlp": 0.41712597, "epoch": 0.14555839470915377, "flos": 20229845829120.0, "grad_norm": 6.659441200517906, "language_loss": 0.88514543, "learning_rate": 3.861547549218276e-06, "loss": 0.90594625, "num_input_tokens_seen": 52521900, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.34960938, "step": 2421, "time_per_iteration": 2.6730456352233887 }, { "auxiliary_loss_clip": 0.0160957, "auxiliary_loss_mlp": 0.00461438, "balance_loss_clip": 1.30253482, "balance_loss_mlp": 0.4257943, "epoch": 0.14561851796182174, "flos": 22236282616320.0, "grad_norm": 23.475026272286634, "language_loss": 0.86136413, "learning_rate": 3.861405128426914e-06, "loss": 0.88207418, "num_input_tokens_seen": 52540495, "router_z_loss_clip": 3.06835938, "router_z_loss_mlp": 0.35668945, "step": 2422, "time_per_iteration": 2.7132580280303955 }, { "auxiliary_loss_clip": 0.0144891, "auxiliary_loss_mlp": 0.00131355, "balance_loss_clip": 1.23413551, "balance_loss_mlp": 0.12243789, "epoch": 0.1456786412144897, "flos": 52636786289280.0, "grad_norm": 0.9157392098813985, "language_loss": 0.63366085, "learning_rate": 3.861262637050883e-06, "loss": 0.64946347, "num_input_tokens_seen": 52603305, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.08935547, "step": 2423, "time_per_iteration": 3.1558289527893066 }, { "auxiliary_loss_clip": 0.01608988, "auxiliary_loss_mlp": 0.00549941, "balance_loss_clip": 1.30252528, "balance_loss_mlp": 0.51379699, "epoch": 0.14573876446715767, "flos": 23221671396480.0, "grad_norm": 256.27471738754525, "language_loss": 0.87042552, "learning_rate": 3.861120075095585e-06, "loss": 0.8920148, "num_input_tokens_seen": 52623435, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.36132812, "step": 2424, "time_per_iteration": 4.153766393661499 }, { "auxiliary_loss_clip": 0.01618282, "auxiliary_loss_mlp": 0.0053708, "balance_loss_clip": 1.30931282, "balance_loss_mlp": 0.49776453, "epoch": 0.14579888771982563, "flos": 18114384286080.0, "grad_norm": 18.658564631669613, "language_loss": 0.85264325, "learning_rate": 3.860977442566429e-06, "loss": 0.87419689, "num_input_tokens_seen": 52642255, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.39306641, "step": 2425, "time_per_iteration": 2.613293170928955 }, { "auxiliary_loss_clip": 0.01617538, "auxiliary_loss_mlp": 0.00553011, "balance_loss_clip": 1.30527663, "balance_loss_mlp": 0.51290882, "epoch": 0.14585901097249362, "flos": 23001107932800.0, "grad_norm": 44.84661269710091, "language_loss": 0.9051888, "learning_rate": 3.860834739468821e-06, "loss": 0.92689431, "num_input_tokens_seen": 52658700, "router_z_loss_clip": 3.12304688, "router_z_loss_mlp": 0.40136719, "step": 2426, "time_per_iteration": 2.6549105644226074 }, { "auxiliary_loss_clip": 0.01609023, "auxiliary_loss_mlp": 0.00502745, "balance_loss_clip": 1.29771686, "balance_loss_mlp": 0.46593302, "epoch": 0.1459191342251616, "flos": 21908669644800.0, "grad_norm": 5.006199808961258, "language_loss": 0.94188672, "learning_rate": 3.860691965808173e-06, "loss": 0.96300447, "num_input_tokens_seen": 52678140, "router_z_loss_clip": 3.11523438, "router_z_loss_mlp": 0.36816406, "step": 2427, "time_per_iteration": 4.08456563949585 }, { "auxiliary_loss_clip": 0.01603691, "auxiliary_loss_mlp": 0.00539492, "balance_loss_clip": 1.29076529, "balance_loss_mlp": 0.49929434, "epoch": 0.14597925747782955, "flos": 14975504438400.0, "grad_norm": 55.03266341130837, "language_loss": 0.7804482, "learning_rate": 3.8605491215899e-06, "loss": 0.80188, "num_input_tokens_seen": 52696825, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.40209961, "step": 2428, "time_per_iteration": 4.062969207763672 }, { "auxiliary_loss_clip": 0.01606852, "auxiliary_loss_mlp": 0.00537626, "balance_loss_clip": 1.29211533, "balance_loss_mlp": 0.49940798, "epoch": 0.14603938073049752, "flos": 21068898600960.0, "grad_norm": 25.84155161362217, "language_loss": 0.8835879, "learning_rate": 3.860406206819417e-06, "loss": 0.90503263, "num_input_tokens_seen": 52715125, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.38208008, "step": 2429, "time_per_iteration": 2.6535232067108154 }, { "auxiliary_loss_clip": 0.01600901, "auxiliary_loss_mlp": 0.00510743, "balance_loss_clip": 1.29073954, "balance_loss_mlp": 0.47428912, "epoch": 0.14609950398316549, "flos": 19864777950720.0, "grad_norm": 25.13788080417407, "language_loss": 0.83297211, "learning_rate": 3.860263221502145e-06, "loss": 0.85408854, "num_input_tokens_seen": 52734015, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.36499023, "step": 2430, "time_per_iteration": 4.0280537605285645 }, { "auxiliary_loss_clip": 0.01624085, "auxiliary_loss_mlp": 0.00540022, "balance_loss_clip": 1.30452919, "balance_loss_mlp": 0.50289977, "epoch": 0.14615962723583345, "flos": 22418852469120.0, "grad_norm": 5.418015849039973, "language_loss": 0.91539216, "learning_rate": 3.860120165643504e-06, "loss": 0.93703324, "num_input_tokens_seen": 52753025, "router_z_loss_clip": 3.1953125, "router_z_loss_mlp": 0.37158203, "step": 2431, "time_per_iteration": 2.6182899475097656 }, { "auxiliary_loss_clip": 0.01626289, "auxiliary_loss_mlp": 0.0049305, "balance_loss_clip": 1.29826939, "balance_loss_mlp": 0.45373505, "epoch": 0.14621975048850142, "flos": 22346241125760.0, "grad_norm": 3.5085942427230656, "language_loss": 0.86390316, "learning_rate": 3.859977039248921e-06, "loss": 0.88509655, "num_input_tokens_seen": 52773420, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.39306641, "step": 2432, "time_per_iteration": 2.648545503616333 }, { "auxiliary_loss_clip": 0.01633579, "auxiliary_loss_mlp": 0.00528693, "balance_loss_clip": 1.30801368, "balance_loss_mlp": 0.48975909, "epoch": 0.1462798737411694, "flos": 24389163152640.0, "grad_norm": 9.95977124781716, "language_loss": 0.87824857, "learning_rate": 3.859833842323822e-06, "loss": 0.89987135, "num_input_tokens_seen": 52792870, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.38964844, "step": 2433, "time_per_iteration": 2.7057135105133057 }, { "auxiliary_loss_clip": 0.01626861, "auxiliary_loss_mlp": 0.00482402, "balance_loss_clip": 1.30688667, "balance_loss_mlp": 0.44649625, "epoch": 0.14633999699383737, "flos": 19244672530560.0, "grad_norm": 11.671656497308854, "language_loss": 0.8679235, "learning_rate": 3.859690574873638e-06, "loss": 0.88901615, "num_input_tokens_seen": 52811615, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.35913086, "step": 2434, "time_per_iteration": 2.6057655811309814 }, { "auxiliary_loss_clip": 0.01452155, "auxiliary_loss_mlp": 0.00239953, "balance_loss_clip": 1.22696102, "balance_loss_mlp": 0.22860418, "epoch": 0.14640012024650534, "flos": 62660638270080.0, "grad_norm": 0.9120752209568954, "language_loss": 0.58460754, "learning_rate": 3.8595472369038e-06, "loss": 0.60152858, "num_input_tokens_seen": 52873230, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.11328125, "step": 2435, "time_per_iteration": 3.0956122875213623 }, { "auxiliary_loss_clip": 0.01624844, "auxiliary_loss_mlp": 0.00473727, "balance_loss_clip": 1.30112565, "balance_loss_mlp": 0.43834567, "epoch": 0.1464602434991733, "flos": 12276243146880.0, "grad_norm": 40.346732921935896, "language_loss": 0.94926429, "learning_rate": 3.859403828419744e-06, "loss": 0.97025001, "num_input_tokens_seen": 52889325, "router_z_loss_clip": 3.23828125, "router_z_loss_mlp": 0.35400391, "step": 2436, "time_per_iteration": 2.5820891857147217 }, { "auxiliary_loss_clip": 0.01642271, "auxiliary_loss_mlp": 0.00544176, "balance_loss_clip": 1.31358778, "balance_loss_mlp": 0.5038597, "epoch": 0.14652036675184127, "flos": 20922311197440.0, "grad_norm": 8.045499417585694, "language_loss": 0.80211848, "learning_rate": 3.85926034942691e-06, "loss": 0.82398295, "num_input_tokens_seen": 52909705, "router_z_loss_clip": 3.28515625, "router_z_loss_mlp": 0.40332031, "step": 2437, "time_per_iteration": 2.684666156768799 }, { "auxiliary_loss_clip": 0.01628367, "auxiliary_loss_mlp": 0.00461795, "balance_loss_clip": 1.29916334, "balance_loss_mlp": 0.42250371, "epoch": 0.14658049000450923, "flos": 27703681528320.0, "grad_norm": 20.42496793216016, "language_loss": 0.78786564, "learning_rate": 3.859116799930736e-06, "loss": 0.80876732, "num_input_tokens_seen": 52930300, "router_z_loss_clip": 3.29492188, "router_z_loss_mlp": 0.39306641, "step": 2438, "time_per_iteration": 2.7619826793670654 }, { "auxiliary_loss_clip": 0.01656095, "auxiliary_loss_mlp": 0.00469984, "balance_loss_clip": 1.32182932, "balance_loss_mlp": 0.43329161, "epoch": 0.14664061325717723, "flos": 24936513575040.0, "grad_norm": 3.8551481201298734, "language_loss": 0.80649745, "learning_rate": 3.858973179936668e-06, "loss": 0.82775825, "num_input_tokens_seen": 52949955, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.36669922, "step": 2439, "time_per_iteration": 2.686614990234375 }, { "auxiliary_loss_clip": 0.01635439, "auxiliary_loss_mlp": 0.00415878, "balance_loss_clip": 1.30665374, "balance_loss_mlp": 0.37789762, "epoch": 0.1467007365098452, "flos": 40297661406720.0, "grad_norm": 24.752089340466576, "language_loss": 0.79378921, "learning_rate": 3.85882948945015e-06, "loss": 0.81430233, "num_input_tokens_seen": 52972905, "router_z_loss_clip": 3.28515625, "router_z_loss_mlp": 0.37963867, "step": 2440, "time_per_iteration": 2.790498733520508 }, { "auxiliary_loss_clip": 0.01612895, "auxiliary_loss_mlp": 0.00422945, "balance_loss_clip": 1.28836679, "balance_loss_mlp": 0.38641936, "epoch": 0.14676085976251316, "flos": 26541074021760.0, "grad_norm": 1.4720986998048202, "language_loss": 0.87044477, "learning_rate": 3.85868572847663e-06, "loss": 0.89080316, "num_input_tokens_seen": 52994850, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.36499023, "step": 2441, "time_per_iteration": 2.6785061359405518 }, { "auxiliary_loss_clip": 0.01640292, "auxiliary_loss_mlp": 0.00444358, "balance_loss_clip": 1.304322, "balance_loss_mlp": 0.40551916, "epoch": 0.14682098301518112, "flos": 23550110380800.0, "grad_norm": 36.636410065655106, "language_loss": 0.81636333, "learning_rate": 3.858541897021563e-06, "loss": 0.83720988, "num_input_tokens_seen": 53014740, "router_z_loss_clip": 3.36132812, "router_z_loss_mlp": 0.38818359, "step": 2442, "time_per_iteration": 2.6433072090148926 }, { "auxiliary_loss_clip": 0.01622206, "auxiliary_loss_mlp": 0.00437106, "balance_loss_clip": 1.28583813, "balance_loss_mlp": 0.39824328, "epoch": 0.1468811062678491, "flos": 11651073909120.0, "grad_norm": 5.256031133961432, "language_loss": 0.92289931, "learning_rate": 3.8583979950904e-06, "loss": 0.94349241, "num_input_tokens_seen": 53029780, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.38842773, "step": 2443, "time_per_iteration": 2.632796049118042 }, { "auxiliary_loss_clip": 0.01628511, "auxiliary_loss_mlp": 0.00402201, "balance_loss_clip": 1.29727435, "balance_loss_mlp": 0.36691481, "epoch": 0.14694122952051705, "flos": 23002616304000.0, "grad_norm": 5.436685729397422, "language_loss": 0.90654802, "learning_rate": 3.858254022688599e-06, "loss": 0.92685515, "num_input_tokens_seen": 53048620, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.35302734, "step": 2444, "time_per_iteration": 2.6500275135040283 }, { "auxiliary_loss_clip": 0.01642299, "auxiliary_loss_mlp": 0.00372631, "balance_loss_clip": 1.30441093, "balance_loss_mlp": 0.33684427, "epoch": 0.14700135277318502, "flos": 26502972670080.0, "grad_norm": 28.952639969668795, "language_loss": 0.78250134, "learning_rate": 3.85810997982162e-06, "loss": 0.80265069, "num_input_tokens_seen": 53070055, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.35791016, "step": 2445, "time_per_iteration": 2.680816173553467 }, { "auxiliary_loss_clip": 0.01499433, "auxiliary_loss_mlp": 0.00227165, "balance_loss_clip": 1.28249657, "balance_loss_mlp": 0.21381362, "epoch": 0.147061476025853, "flos": 59449434387840.0, "grad_norm": 0.8314020279283899, "language_loss": 0.62566644, "learning_rate": 3.857965866494923e-06, "loss": 0.64293242, "num_input_tokens_seen": 53126945, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.13378906, "step": 2446, "time_per_iteration": 3.040093421936035 }, { "auxiliary_loss_clip": 0.01615849, "auxiliary_loss_mlp": 0.0036199, "balance_loss_clip": 1.28785634, "balance_loss_mlp": 0.32765758, "epoch": 0.14712159927852098, "flos": 28330897841280.0, "grad_norm": 168.5745105581572, "language_loss": 0.81941146, "learning_rate": 3.857821682713975e-06, "loss": 0.83918983, "num_input_tokens_seen": 53149130, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.34350586, "step": 2447, "time_per_iteration": 2.711287260055542 }, { "auxiliary_loss_clip": 0.01617077, "auxiliary_loss_mlp": 0.00400508, "balance_loss_clip": 1.29264891, "balance_loss_mlp": 0.36338618, "epoch": 0.14718172253118894, "flos": 27089825074560.0, "grad_norm": 38.995425830165296, "language_loss": 0.93901086, "learning_rate": 3.857677428484242e-06, "loss": 0.95918673, "num_input_tokens_seen": 53167120, "router_z_loss_clip": 3.2421875, "router_z_loss_mlp": 0.37133789, "step": 2448, "time_per_iteration": 2.7602267265319824 }, { "auxiliary_loss_clip": 0.01455175, "auxiliary_loss_mlp": 0.00181904, "balance_loss_clip": 1.24937224, "balance_loss_mlp": 0.16941111, "epoch": 0.1472418457838569, "flos": 66706764860160.0, "grad_norm": 0.7453985158857575, "language_loss": 0.56667739, "learning_rate": 3.857533103811195e-06, "loss": 0.58304816, "num_input_tokens_seen": 53227945, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.125, "step": 2449, "time_per_iteration": 3.064450740814209 }, { "auxiliary_loss_clip": 0.01609355, "auxiliary_loss_mlp": 0.00361545, "balance_loss_clip": 1.29101348, "balance_loss_mlp": 0.32573426, "epoch": 0.14730196903652487, "flos": 19573578391680.0, "grad_norm": 4.121024637974723, "language_loss": 0.9166441, "learning_rate": 3.857388708700307e-06, "loss": 0.93635309, "num_input_tokens_seen": 53244615, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.35839844, "step": 2450, "time_per_iteration": 2.6369552612304688 }, { "auxiliary_loss_clip": 0.01600476, "auxiliary_loss_mlp": 0.00374141, "balance_loss_clip": 1.28184557, "balance_loss_mlp": 0.33480185, "epoch": 0.14736209228919284, "flos": 16071031296000.0, "grad_norm": 227.27998717514515, "language_loss": 0.84079111, "learning_rate": 3.857244243157052e-06, "loss": 0.86053729, "num_input_tokens_seen": 53262205, "router_z_loss_clip": 3.18945312, "router_z_loss_mlp": 0.39355469, "step": 2451, "time_per_iteration": 2.6080753803253174 }, { "auxiliary_loss_clip": 0.01584178, "auxiliary_loss_mlp": 0.00370436, "balance_loss_clip": 1.27595162, "balance_loss_mlp": 0.33534086, "epoch": 0.1474222155418608, "flos": 23039460679680.0, "grad_norm": 6.82664270847563, "language_loss": 0.85971248, "learning_rate": 3.85709970718691e-06, "loss": 0.87925863, "num_input_tokens_seen": 53282445, "router_z_loss_clip": 3.08203125, "router_z_loss_mlp": 0.35083008, "step": 2452, "time_per_iteration": 2.626696825027466 }, { "auxiliary_loss_clip": 0.01588116, "auxiliary_loss_mlp": 0.00356505, "balance_loss_clip": 1.27803755, "balance_loss_mlp": 0.32226759, "epoch": 0.1474823387945288, "flos": 17018641946880.0, "grad_norm": 2.4895493554001145, "language_loss": 0.78355277, "learning_rate": 3.856955100795361e-06, "loss": 0.80299896, "num_input_tokens_seen": 53299060, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.3425293, "step": 2453, "time_per_iteration": 2.5891456604003906 }, { "auxiliary_loss_clip": 0.01576921, "auxiliary_loss_mlp": 0.00383691, "balance_loss_clip": 1.25857353, "balance_loss_mlp": 0.34592554, "epoch": 0.14754246204719676, "flos": 17895041884800.0, "grad_norm": 7.82085192995453, "language_loss": 0.84399265, "learning_rate": 3.856810423987889e-06, "loss": 0.86359876, "num_input_tokens_seen": 53315970, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.37792969, "step": 2454, "time_per_iteration": 2.599808931350708 }, { "auxiliary_loss_clip": 0.01560374, "auxiliary_loss_mlp": 0.00341085, "balance_loss_clip": 1.25317168, "balance_loss_mlp": 0.30861247, "epoch": 0.14760258529986472, "flos": 13079097987840.0, "grad_norm": 1.8163254146491146, "language_loss": 0.8873539, "learning_rate": 3.856665676769979e-06, "loss": 0.90636843, "num_input_tokens_seen": 53332940, "router_z_loss_clip": 3.0703125, "router_z_loss_mlp": 0.32470703, "step": 2455, "time_per_iteration": 2.590508222579956 }, { "auxiliary_loss_clip": 0.01573932, "auxiliary_loss_mlp": 0.00369374, "balance_loss_clip": 1.25997353, "balance_loss_mlp": 0.33466068, "epoch": 0.1476627085525327, "flos": 30806399358720.0, "grad_norm": 6.88079383058879, "language_loss": 0.90353501, "learning_rate": 3.85652085914712e-06, "loss": 0.92296803, "num_input_tokens_seen": 53353295, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.34716797, "step": 2456, "time_per_iteration": 2.69640851020813 }, { "auxiliary_loss_clip": 0.01543405, "auxiliary_loss_mlp": 0.00312464, "balance_loss_clip": 1.24504697, "balance_loss_mlp": 0.2808018, "epoch": 0.14772283180520066, "flos": 21689434984320.0, "grad_norm": 9.67383785333568, "language_loss": 0.89389837, "learning_rate": 3.856375971124805e-06, "loss": 0.91245705, "num_input_tokens_seen": 53373410, "router_z_loss_clip": 2.98242188, "router_z_loss_mlp": 0.31689453, "step": 2457, "time_per_iteration": 2.6199755668640137 }, { "auxiliary_loss_clip": 0.01547731, "auxiliary_loss_mlp": 0.00317204, "balance_loss_clip": 1.24855065, "balance_loss_mlp": 0.28504151, "epoch": 0.14778295505786862, "flos": 18770400328320.0, "grad_norm": 6.206287917888965, "language_loss": 0.82055318, "learning_rate": 3.856231012708527e-06, "loss": 0.83920258, "num_input_tokens_seen": 53391430, "router_z_loss_clip": 2.99023438, "router_z_loss_mlp": 0.32128906, "step": 2458, "time_per_iteration": 2.6181480884552 }, { "auxiliary_loss_clip": 0.01557336, "auxiliary_loss_mlp": 0.00344438, "balance_loss_clip": 1.24836731, "balance_loss_mlp": 0.30855626, "epoch": 0.1478430783105366, "flos": 22893555634560.0, "grad_norm": 335.5896793670183, "language_loss": 0.90951377, "learning_rate": 3.856085983903782e-06, "loss": 0.92853153, "num_input_tokens_seen": 53409960, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.35913086, "step": 2459, "time_per_iteration": 2.6861929893493652 }, { "auxiliary_loss_clip": 0.01534149, "auxiliary_loss_mlp": 0.00326284, "balance_loss_clip": 1.23634291, "balance_loss_mlp": 0.29412162, "epoch": 0.14790320156320458, "flos": 15085319293440.0, "grad_norm": 5.018673776356329, "language_loss": 0.80742931, "learning_rate": 3.855940884716071e-06, "loss": 0.82603359, "num_input_tokens_seen": 53426160, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.3215332, "step": 2460, "time_per_iteration": 2.569617509841919 }, { "auxiliary_loss_clip": 0.01566871, "auxiliary_loss_mlp": 0.00331369, "balance_loss_clip": 1.25053966, "balance_loss_mlp": 0.29677412, "epoch": 0.14796332481587254, "flos": 26504768350080.0, "grad_norm": 3.3749445794675026, "language_loss": 0.8735944, "learning_rate": 3.855795715150896e-06, "loss": 0.89257681, "num_input_tokens_seen": 53448530, "router_z_loss_clip": 3.16601562, "router_z_loss_mlp": 0.34594727, "step": 2461, "time_per_iteration": 2.763770341873169 }, { "auxiliary_loss_clip": 0.01548533, "auxiliary_loss_mlp": 0.00322656, "balance_loss_clip": 1.24675655, "balance_loss_mlp": 0.28930128, "epoch": 0.1480234480685405, "flos": 17563191108480.0, "grad_norm": 6.424820318841996, "language_loss": 0.75674993, "learning_rate": 3.855650475213761e-06, "loss": 0.77546185, "num_input_tokens_seen": 53465915, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.33349609, "step": 2462, "time_per_iteration": 2.6681056022644043 }, { "auxiliary_loss_clip": 0.01542891, "auxiliary_loss_mlp": 0.00335617, "balance_loss_clip": 1.24031544, "balance_loss_mlp": 0.30021143, "epoch": 0.14808357132120847, "flos": 53582203232640.0, "grad_norm": 3.2747585938813493, "language_loss": 0.73050928, "learning_rate": 3.8555051649101745e-06, "loss": 0.74929434, "num_input_tokens_seen": 53496055, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.35449219, "step": 2463, "time_per_iteration": 2.9880292415618896 }, { "auxiliary_loss_clip": 0.01545092, "auxiliary_loss_mlp": 0.00317081, "balance_loss_clip": 1.23889279, "balance_loss_mlp": 0.28484654, "epoch": 0.14814369457387644, "flos": 19829190551040.0, "grad_norm": 2.775371980376823, "language_loss": 0.83397096, "learning_rate": 3.855359784245646e-06, "loss": 0.85259271, "num_input_tokens_seen": 53513790, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.32250977, "step": 2464, "time_per_iteration": 2.626296043395996 }, { "auxiliary_loss_clip": 0.01517346, "auxiliary_loss_mlp": 0.00284037, "balance_loss_clip": 1.22637498, "balance_loss_mlp": 0.25502157, "epoch": 0.1482038178265444, "flos": 23914962777600.0, "grad_norm": 2.9901439681316733, "language_loss": 0.86422062, "learning_rate": 3.855214333225688e-06, "loss": 0.88223445, "num_input_tokens_seen": 53533410, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.29016113, "step": 2465, "time_per_iteration": 2.6636416912078857 }, { "auxiliary_loss_clip": 0.01544309, "auxiliary_loss_mlp": 0.00308295, "balance_loss_clip": 1.23659933, "balance_loss_mlp": 0.276299, "epoch": 0.1482639410792124, "flos": 24170503109760.0, "grad_norm": 18.980502148221706, "language_loss": 0.82083958, "learning_rate": 3.855068811855817e-06, "loss": 0.8393656, "num_input_tokens_seen": 53554775, "router_z_loss_clip": 3.07617188, "router_z_loss_mlp": 0.32006836, "step": 2466, "time_per_iteration": 2.6584107875823975 }, { "auxiliary_loss_clip": 0.01325923, "auxiliary_loss_mlp": 0.00074163, "balance_loss_clip": 1.13118994, "balance_loss_mlp": 0.06429276, "epoch": 0.14832406433188036, "flos": 66191051341440.0, "grad_norm": 0.7785810398500818, "language_loss": 0.60094774, "learning_rate": 3.854923220141551e-06, "loss": 0.61494863, "num_input_tokens_seen": 53609675, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.09863281, "step": 2467, "time_per_iteration": 4.47997522354126 }, { "auxiliary_loss_clip": 0.01546701, "auxiliary_loss_mlp": 0.00308748, "balance_loss_clip": 1.24208391, "balance_loss_mlp": 0.27663311, "epoch": 0.14838418758454833, "flos": 25411252654080.0, "grad_norm": 12.46212472057566, "language_loss": 0.96241748, "learning_rate": 3.85477755808841e-06, "loss": 0.98097193, "num_input_tokens_seen": 53626950, "router_z_loss_clip": 3.046875, "router_z_loss_mlp": 0.32128906, "step": 2468, "time_per_iteration": 2.6732184886932373 }, { "auxiliary_loss_clip": 0.01571752, "auxiliary_loss_mlp": 0.00307303, "balance_loss_clip": 1.25737298, "balance_loss_mlp": 0.27418655, "epoch": 0.1484443108372163, "flos": 23289901280640.0, "grad_norm": 9.60751845626421, "language_loss": 0.82821751, "learning_rate": 3.854631825701919e-06, "loss": 0.84700811, "num_input_tokens_seen": 53644200, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.33129883, "step": 2469, "time_per_iteration": 2.609269618988037 }, { "auxiliary_loss_clip": 0.01558435, "auxiliary_loss_mlp": 0.00290291, "balance_loss_clip": 1.24974203, "balance_loss_mlp": 0.25874832, "epoch": 0.14850443408988426, "flos": 14647675985280.0, "grad_norm": 167.46675395117833, "language_loss": 0.81615615, "learning_rate": 3.854486022987603e-06, "loss": 0.83464336, "num_input_tokens_seen": 53659650, "router_z_loss_clip": 3.0859375, "router_z_loss_mlp": 0.31518555, "step": 2470, "time_per_iteration": 5.462946176528931 }, { "auxiliary_loss_clip": 0.01554916, "auxiliary_loss_mlp": 0.00278917, "balance_loss_clip": 1.2519958, "balance_loss_mlp": 0.24684966, "epoch": 0.14856455734255222, "flos": 23548314700800.0, "grad_norm": 6.1487229628716555, "language_loss": 0.77934361, "learning_rate": 3.8543401499509905e-06, "loss": 0.79768199, "num_input_tokens_seen": 53680275, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.3203125, "step": 2471, "time_per_iteration": 2.632833242416382 }, { "auxiliary_loss_clip": 0.01566739, "auxiliary_loss_mlp": 0.00315349, "balance_loss_clip": 1.25171185, "balance_loss_mlp": 0.28228074, "epoch": 0.1486246805952202, "flos": 18077288515200.0, "grad_norm": 190.23979086722719, "language_loss": 0.95600599, "learning_rate": 3.854194206597615e-06, "loss": 0.97482687, "num_input_tokens_seen": 53698270, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.33032227, "step": 2472, "time_per_iteration": 3.9915647506713867 }, { "auxiliary_loss_clip": 0.01569873, "auxiliary_loss_mlp": 0.00292529, "balance_loss_clip": 1.25750101, "balance_loss_mlp": 0.25953197, "epoch": 0.14868480384788818, "flos": 19353625459200.0, "grad_norm": 14.82720353246185, "language_loss": 0.89305174, "learning_rate": 3.854048192933008e-06, "loss": 0.91167581, "num_input_tokens_seen": 53716845, "router_z_loss_clip": 3.12304688, "router_z_loss_mlp": 0.32983398, "step": 2473, "time_per_iteration": 2.5711441040039062 }, { "auxiliary_loss_clip": 0.01558636, "auxiliary_loss_mlp": 0.00289657, "balance_loss_clip": 1.24572551, "balance_loss_mlp": 0.25918737, "epoch": 0.14874492710055615, "flos": 22200192426240.0, "grad_norm": 57.07355075763097, "language_loss": 0.88195264, "learning_rate": 3.853902108962709e-06, "loss": 0.90043557, "num_input_tokens_seen": 53734970, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.30456543, "step": 2474, "time_per_iteration": 2.6235275268554688 }, { "auxiliary_loss_clip": 0.01582245, "auxiliary_loss_mlp": 0.00355248, "balance_loss_clip": 1.26073575, "balance_loss_mlp": 0.31910419, "epoch": 0.1488050503532241, "flos": 21103444506240.0, "grad_norm": 3.6648964368455954, "language_loss": 0.87978381, "learning_rate": 3.853755954692255e-06, "loss": 0.89915872, "num_input_tokens_seen": 53753415, "router_z_loss_clip": 3.21679688, "router_z_loss_mlp": 0.36157227, "step": 2475, "time_per_iteration": 2.5865561962127686 }, { "auxiliary_loss_clip": 0.01585071, "auxiliary_loss_mlp": 0.00304587, "balance_loss_clip": 1.26173913, "balance_loss_mlp": 0.27187639, "epoch": 0.14886517360589208, "flos": 12786569625600.0, "grad_norm": 188.62960135640387, "language_loss": 0.8840704, "learning_rate": 3.85360973012719e-06, "loss": 0.90296698, "num_input_tokens_seen": 53770305, "router_z_loss_clip": 3.234375, "router_z_loss_mlp": 0.32714844, "step": 2476, "time_per_iteration": 2.585855484008789 }, { "auxiliary_loss_clip": 0.01569625, "auxiliary_loss_mlp": 0.00279169, "balance_loss_clip": 1.25515366, "balance_loss_mlp": 0.24846017, "epoch": 0.14892529685856004, "flos": 29022860419200.0, "grad_norm": 12099.088897209576, "language_loss": 0.83214116, "learning_rate": 3.853463435273058e-06, "loss": 0.85062909, "num_input_tokens_seen": 53788895, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.30712891, "step": 2477, "time_per_iteration": 2.6612794399261475 }, { "auxiliary_loss_clip": 0.01364918, "auxiliary_loss_mlp": 0.0008118, "balance_loss_clip": 1.16748261, "balance_loss_mlp": 0.06401383, "epoch": 0.148985420111228, "flos": 61926121054080.0, "grad_norm": 0.7845629868148587, "language_loss": 0.59985578, "learning_rate": 3.853317070135407e-06, "loss": 0.61431676, "num_input_tokens_seen": 53850260, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.171875, "step": 2478, "time_per_iteration": 3.148921489715576 }, { "auxiliary_loss_clip": 0.01588782, "auxiliary_loss_mlp": 0.00287077, "balance_loss_clip": 1.26392519, "balance_loss_mlp": 0.25522476, "epoch": 0.149045543363896, "flos": 23915106432000.0, "grad_norm": 4.428772815430691, "language_loss": 0.78747118, "learning_rate": 3.853170634719787e-06, "loss": 0.80622977, "num_input_tokens_seen": 53867520, "router_z_loss_clip": 3.24804688, "router_z_loss_mlp": 0.31860352, "step": 2479, "time_per_iteration": 2.676405191421509 }, { "auxiliary_loss_clip": 0.01596338, "auxiliary_loss_mlp": 0.00322482, "balance_loss_clip": 1.26893413, "balance_loss_mlp": 0.28910339, "epoch": 0.14910566661656396, "flos": 23654394541440.0, "grad_norm": 3.87379940899696, "language_loss": 0.86007911, "learning_rate": 3.853024129031751e-06, "loss": 0.87926733, "num_input_tokens_seen": 53886620, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.33349609, "step": 2480, "time_per_iteration": 2.623211622238159 }, { "auxiliary_loss_clip": 0.01624366, "auxiliary_loss_mlp": 0.00299422, "balance_loss_clip": 1.28487372, "balance_loss_mlp": 0.26879716, "epoch": 0.14916578986923193, "flos": 20515299212160.0, "grad_norm": 4.406775484455945, "language_loss": 0.9151746, "learning_rate": 3.852877553076854e-06, "loss": 0.93441254, "num_input_tokens_seen": 53902230, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.30651855, "step": 2481, "time_per_iteration": 2.682844400405884 }, { "auxiliary_loss_clip": 0.01625743, "auxiliary_loss_mlp": 0.00307336, "balance_loss_clip": 1.28582084, "balance_loss_mlp": 0.27448177, "epoch": 0.1492259131218999, "flos": 22491822948480.0, "grad_norm": 89.13939956583506, "language_loss": 0.85025942, "learning_rate": 3.8527309068606546e-06, "loss": 0.86959022, "num_input_tokens_seen": 53919475, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.32800293, "step": 2482, "time_per_iteration": 2.6415438652038574 }, { "auxiliary_loss_clip": 0.0164181, "auxiliary_loss_mlp": 0.00276053, "balance_loss_clip": 1.293064, "balance_loss_mlp": 0.24343684, "epoch": 0.14928603637456786, "flos": 23185868515200.0, "grad_norm": 9.643467203083583, "language_loss": 0.89552939, "learning_rate": 3.852584190388713e-06, "loss": 0.91470802, "num_input_tokens_seen": 53939150, "router_z_loss_clip": 3.48632812, "router_z_loss_mlp": 0.32641602, "step": 2483, "time_per_iteration": 2.7075886726379395 }, { "auxiliary_loss_clip": 0.01628216, "auxiliary_loss_mlp": 0.00269768, "balance_loss_clip": 1.28980017, "balance_loss_mlp": 0.24060898, "epoch": 0.14934615962723582, "flos": 21653237053440.0, "grad_norm": 1.5141872351610104, "language_loss": 0.76758659, "learning_rate": 3.852437403666595e-06, "loss": 0.78656644, "num_input_tokens_seen": 53958735, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.29174805, "step": 2484, "time_per_iteration": 2.63010835647583 }, { "auxiliary_loss_clip": 0.01639742, "auxiliary_loss_mlp": 0.00290324, "balance_loss_clip": 1.29734516, "balance_loss_mlp": 0.25685039, "epoch": 0.1494062828799038, "flos": 27010066924800.0, "grad_norm": 3.7297601773064155, "language_loss": 0.91307777, "learning_rate": 3.852290546699863e-06, "loss": 0.93237841, "num_input_tokens_seen": 53975065, "router_z_loss_clip": 3.42773438, "router_z_loss_mlp": 0.33447266, "step": 2485, "time_per_iteration": 2.67417311668396 }, { "auxiliary_loss_clip": 0.0165301, "auxiliary_loss_mlp": 0.0026006, "balance_loss_clip": 1.30327702, "balance_loss_mlp": 0.22763523, "epoch": 0.14946640613257178, "flos": 21214947300480.0, "grad_norm": 52.62297084528538, "language_loss": 0.92717242, "learning_rate": 3.8521436194940894e-06, "loss": 0.94630313, "num_input_tokens_seen": 53993330, "router_z_loss_clip": 3.5, "router_z_loss_mlp": 0.32446289, "step": 2486, "time_per_iteration": 2.597712755203247 }, { "auxiliary_loss_clip": 0.01616514, "auxiliary_loss_mlp": 0.00237, "balance_loss_clip": 1.28329265, "balance_loss_mlp": 0.20840198, "epoch": 0.14952652938523975, "flos": 13370872164480.0, "grad_norm": 125.83621675804288, "language_loss": 0.80113089, "learning_rate": 3.851996622054842e-06, "loss": 0.81966603, "num_input_tokens_seen": 54010515, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.28552246, "step": 2487, "time_per_iteration": 2.593411445617676 }, { "auxiliary_loss_clip": 0.01629921, "auxiliary_loss_mlp": 0.00261316, "balance_loss_clip": 1.29360318, "balance_loss_mlp": 0.22860458, "epoch": 0.1495866526379077, "flos": 35517699959040.0, "grad_norm": 10.607304822189466, "language_loss": 0.78926998, "learning_rate": 3.8518495543877e-06, "loss": 0.80818236, "num_input_tokens_seen": 54031315, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.3269043, "step": 2488, "time_per_iteration": 2.7316718101501465 }, { "auxiliary_loss_clip": 0.01624984, "auxiliary_loss_mlp": 0.00305617, "balance_loss_clip": 1.28619993, "balance_loss_mlp": 0.27149954, "epoch": 0.14964677589057568, "flos": 17632749795840.0, "grad_norm": 7.869198219612591, "language_loss": 0.77973109, "learning_rate": 3.851702416498235e-06, "loss": 0.7990371, "num_input_tokens_seen": 54045965, "router_z_loss_clip": 3.38085938, "router_z_loss_mlp": 0.34106445, "step": 2489, "time_per_iteration": 2.6027779579162598 }, { "auxiliary_loss_clip": 0.01638582, "auxiliary_loss_mlp": 0.00256353, "balance_loss_clip": 1.29780877, "balance_loss_mlp": 0.2238563, "epoch": 0.14970689914324364, "flos": 20185280029440.0, "grad_norm": 366.0743527532123, "language_loss": 0.92903459, "learning_rate": 3.8515552083920295e-06, "loss": 0.94798398, "num_input_tokens_seen": 54059960, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.32495117, "step": 2490, "time_per_iteration": 2.5710806846618652 }, { "auxiliary_loss_clip": 0.01605017, "auxiliary_loss_mlp": 0.00256177, "balance_loss_clip": 1.26925695, "balance_loss_mlp": 0.22208276, "epoch": 0.1497670223959116, "flos": 37228699382400.0, "grad_norm": 3.5425547441319027, "language_loss": 0.8661859, "learning_rate": 3.851407930074666e-06, "loss": 0.88479787, "num_input_tokens_seen": 54079330, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.34057617, "step": 2491, "time_per_iteration": 2.721687078475952 }, { "auxiliary_loss_clip": 0.01601996, "auxiliary_loss_mlp": 0.00267855, "balance_loss_clip": 1.2723, "balance_loss_mlp": 0.23399997, "epoch": 0.1498271456485796, "flos": 24455848752000.0, "grad_norm": 3.304164621645694, "language_loss": 0.96676511, "learning_rate": 3.851260581551727e-06, "loss": 0.98546362, "num_input_tokens_seen": 54097555, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.33862305, "step": 2492, "time_per_iteration": 2.623558282852173 }, { "auxiliary_loss_clip": 0.01598484, "auxiliary_loss_mlp": 0.00268035, "balance_loss_clip": 1.27145219, "balance_loss_mlp": 0.23212895, "epoch": 0.14988726890124757, "flos": 16253601148800.0, "grad_norm": 5.094206394411597, "language_loss": 0.89801371, "learning_rate": 3.851113162828802e-06, "loss": 0.91667891, "num_input_tokens_seen": 54115600, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.35913086, "step": 2493, "time_per_iteration": 2.576873779296875 }, { "auxiliary_loss_clip": 0.01590926, "auxiliary_loss_mlp": 0.00278372, "balance_loss_clip": 1.26681423, "balance_loss_mlp": 0.24120273, "epoch": 0.14994739215391553, "flos": 20666555383680.0, "grad_norm": 37.58936596639462, "language_loss": 0.86877656, "learning_rate": 3.85096567391148e-06, "loss": 0.88746953, "num_input_tokens_seen": 54135220, "router_z_loss_clip": 3.2421875, "router_z_loss_mlp": 0.37207031, "step": 2494, "time_per_iteration": 2.6444461345672607 }, { "auxiliary_loss_clip": 0.01568131, "auxiliary_loss_mlp": 0.00242941, "balance_loss_clip": 1.25324011, "balance_loss_mlp": 0.2084896, "epoch": 0.1500075154065835, "flos": 70652375239680.0, "grad_norm": 104.57574075176639, "language_loss": 0.74512506, "learning_rate": 3.850818114805354e-06, "loss": 0.76323581, "num_input_tokens_seen": 54161065, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.34472656, "step": 2495, "time_per_iteration": 3.054676055908203 }, { "auxiliary_loss_clip": 0.01450753, "auxiliary_loss_mlp": 0.00082425, "balance_loss_clip": 1.24505997, "balance_loss_mlp": 0.06602141, "epoch": 0.15006763865925146, "flos": 68011937447040.0, "grad_norm": 0.8669193022401698, "language_loss": 0.59335506, "learning_rate": 3.850670485516019e-06, "loss": 0.60868686, "num_input_tokens_seen": 54225095, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.1640625, "step": 2496, "time_per_iteration": 3.142096996307373 }, { "auxiliary_loss_clip": 0.01594813, "auxiliary_loss_mlp": 0.00268259, "balance_loss_clip": 1.27118826, "balance_loss_mlp": 0.2326152, "epoch": 0.15012776191191943, "flos": 18916269459840.0, "grad_norm": 8.079476178037334, "language_loss": 0.74926746, "learning_rate": 3.850522786049075e-06, "loss": 0.7678982, "num_input_tokens_seen": 54243750, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.35644531, "step": 2497, "time_per_iteration": 2.665170431137085 }, { "auxiliary_loss_clip": 0.01587707, "auxiliary_loss_mlp": 0.00252835, "balance_loss_clip": 1.26921391, "balance_loss_mlp": 0.22088695, "epoch": 0.1501878851645874, "flos": 23701330638720.0, "grad_norm": 2.0506151961532537, "language_loss": 0.81187433, "learning_rate": 3.850375016410121e-06, "loss": 0.83027977, "num_input_tokens_seen": 54266185, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.31933594, "step": 2498, "time_per_iteration": 2.709078311920166 }, { "auxiliary_loss_clip": 0.01589785, "auxiliary_loss_mlp": 0.00234432, "balance_loss_clip": 1.26903987, "balance_loss_mlp": 0.20176834, "epoch": 0.15024800841725539, "flos": 20412523422720.0, "grad_norm": 29.24965262041153, "language_loss": 0.79991865, "learning_rate": 3.850227176604761e-06, "loss": 0.81816083, "num_input_tokens_seen": 54283940, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.3269043, "step": 2499, "time_per_iteration": 2.5754408836364746 }, { "auxiliary_loss_clip": 0.01570721, "auxiliary_loss_mlp": 0.00242156, "balance_loss_clip": 1.25895286, "balance_loss_mlp": 0.20925438, "epoch": 0.15030813166992335, "flos": 31831002812160.0, "grad_norm": 40.63285611098953, "language_loss": 0.78963661, "learning_rate": 3.850079266638601e-06, "loss": 0.80776536, "num_input_tokens_seen": 54304830, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.32885742, "step": 2500, "time_per_iteration": 2.7095417976379395 }, { "auxiliary_loss_clip": 0.01562254, "auxiliary_loss_mlp": 0.00234233, "balance_loss_clip": 1.25282264, "balance_loss_mlp": 0.20185606, "epoch": 0.15036825492259132, "flos": 35657822914560.0, "grad_norm": 1297.8321854101105, "language_loss": 0.73788387, "learning_rate": 3.849931286517249e-06, "loss": 0.75584877, "num_input_tokens_seen": 54325595, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.32373047, "step": 2501, "time_per_iteration": 2.7395715713500977 }, { "auxiliary_loss_clip": 0.0154255, "auxiliary_loss_mlp": 0.00225286, "balance_loss_clip": 1.23447847, "balance_loss_mlp": 0.19121641, "epoch": 0.15042837817525928, "flos": 18838163335680.0, "grad_norm": 18.088775367675087, "language_loss": 0.92135644, "learning_rate": 3.849783236246318e-06, "loss": 0.93903482, "num_input_tokens_seen": 54342180, "router_z_loss_clip": 3.08203125, "router_z_loss_mlp": 0.34106445, "step": 2502, "time_per_iteration": 2.605713367462158 }, { "auxiliary_loss_clip": 0.01555657, "auxiliary_loss_mlp": 0.00243045, "balance_loss_clip": 1.24765432, "balance_loss_mlp": 0.20921384, "epoch": 0.15048850142792725, "flos": 19535548867200.0, "grad_norm": 16.224585492607996, "language_loss": 0.83266205, "learning_rate": 3.849635115831421e-06, "loss": 0.85064912, "num_input_tokens_seen": 54360255, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.33813477, "step": 2503, "time_per_iteration": 2.6008641719818115 }, { "auxiliary_loss_clip": 0.01542385, "auxiliary_loss_mlp": 0.00216759, "balance_loss_clip": 1.24204433, "balance_loss_mlp": 0.18702865, "epoch": 0.1505486246805952, "flos": 22017550746240.0, "grad_norm": 168.39530036590995, "language_loss": 0.94323653, "learning_rate": 3.849486925278176e-06, "loss": 0.96082795, "num_input_tokens_seen": 54378260, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.29711914, "step": 2504, "time_per_iteration": 2.619610548019409 }, { "auxiliary_loss_clip": 0.01544203, "auxiliary_loss_mlp": 0.00218379, "balance_loss_clip": 1.2394321, "balance_loss_mlp": 0.18569203, "epoch": 0.15060874793326318, "flos": 20743153136640.0, "grad_norm": 9.221524369359267, "language_loss": 0.88750851, "learning_rate": 3.8493386645922e-06, "loss": 0.90513438, "num_input_tokens_seen": 54399745, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.32666016, "step": 2505, "time_per_iteration": 2.6584722995758057 }, { "auxiliary_loss_clip": 0.01536385, "auxiliary_loss_mlp": 0.00230294, "balance_loss_clip": 1.23773146, "balance_loss_mlp": 0.19836934, "epoch": 0.15066887118593117, "flos": 16471902055680.0, "grad_norm": 2.390167478194995, "language_loss": 0.83467561, "learning_rate": 3.849190333779117e-06, "loss": 0.85234237, "num_input_tokens_seen": 54417105, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.31933594, "step": 2506, "time_per_iteration": 2.570402145385742 }, { "auxiliary_loss_clip": 0.01546421, "auxiliary_loss_mlp": 0.00237708, "balance_loss_clip": 1.2358681, "balance_loss_mlp": 0.20494929, "epoch": 0.15072899443859913, "flos": 19859319083520.0, "grad_norm": 3.922555026375878, "language_loss": 0.91204834, "learning_rate": 3.849041932844552e-06, "loss": 0.92988962, "num_input_tokens_seen": 54433920, "router_z_loss_clip": 3.10351562, "router_z_loss_mlp": 0.32763672, "step": 2507, "time_per_iteration": 2.6086461544036865 }, { "auxiliary_loss_clip": 0.01531268, "auxiliary_loss_mlp": 0.00234611, "balance_loss_clip": 1.23072898, "balance_loss_mlp": 0.2025203, "epoch": 0.1507891176912671, "flos": 20776226584320.0, "grad_norm": 14.990724293909658, "language_loss": 0.76193988, "learning_rate": 3.848893461794131e-06, "loss": 0.77959859, "num_input_tokens_seen": 54451540, "router_z_loss_clip": 3.00585938, "router_z_loss_mlp": 0.32104492, "step": 2508, "time_per_iteration": 2.58901309967041 }, { "auxiliary_loss_clip": 0.0154761, "auxiliary_loss_mlp": 0.00239725, "balance_loss_clip": 1.23886132, "balance_loss_mlp": 0.20527393, "epoch": 0.15084924094393506, "flos": 23586631534080.0, "grad_norm": 2.1232111206102813, "language_loss": 0.85818493, "learning_rate": 3.8487449206334845e-06, "loss": 0.87605834, "num_input_tokens_seen": 54470800, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.34472656, "step": 2509, "time_per_iteration": 4.161874294281006 }, { "auxiliary_loss_clip": 0.0155794, "auxiliary_loss_mlp": 0.00237386, "balance_loss_clip": 1.24415421, "balance_loss_mlp": 0.20457938, "epoch": 0.15090936419660303, "flos": 18911313383040.0, "grad_norm": 9.826453223536552, "language_loss": 0.908952, "learning_rate": 3.848596309368246e-06, "loss": 0.92690527, "num_input_tokens_seen": 54486525, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.328125, "step": 2510, "time_per_iteration": 2.6408462524414062 }, { "auxiliary_loss_clip": 0.01570842, "auxiliary_loss_mlp": 0.00240406, "balance_loss_clip": 1.25548065, "balance_loss_mlp": 0.20538205, "epoch": 0.150969487449271, "flos": 17928223073280.0, "grad_norm": 8.359064558418252, "language_loss": 0.83340919, "learning_rate": 3.8484476280040495e-06, "loss": 0.85152161, "num_input_tokens_seen": 54503795, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.3503418, "step": 2511, "time_per_iteration": 2.666086435317993 }, { "auxiliary_loss_clip": 0.01555481, "auxiliary_loss_mlp": 0.00244282, "balance_loss_clip": 1.24898553, "balance_loss_mlp": 0.21319203, "epoch": 0.151029610701939, "flos": 24243078539520.0, "grad_norm": 40.59272651019958, "language_loss": 0.79566783, "learning_rate": 3.848298876546534e-06, "loss": 0.81366545, "num_input_tokens_seen": 54523025, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.31054688, "step": 2512, "time_per_iteration": 5.474567651748657 }, { "auxiliary_loss_clip": 0.01564395, "auxiliary_loss_mlp": 0.00228505, "balance_loss_clip": 1.25920343, "balance_loss_mlp": 0.1968666, "epoch": 0.15108973395460695, "flos": 30262496641920.0, "grad_norm": 28.64030331798118, "language_loss": 0.82314253, "learning_rate": 3.84815005500134e-06, "loss": 0.84107149, "num_input_tokens_seen": 54545025, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.31665039, "step": 2513, "time_per_iteration": 2.7438831329345703 }, { "auxiliary_loss_clip": 0.01434861, "auxiliary_loss_mlp": 0.00242872, "balance_loss_clip": 1.23072898, "balance_loss_mlp": 0.22894879, "epoch": 0.15114985720727492, "flos": 60437624428800.0, "grad_norm": 0.8611693719913415, "language_loss": 0.64273036, "learning_rate": 3.84800116337411e-06, "loss": 0.65950775, "num_input_tokens_seen": 54604545, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.13964844, "step": 2514, "time_per_iteration": 3.084643602371216 }, { "auxiliary_loss_clip": 0.01577903, "auxiliary_loss_mlp": 0.00241994, "balance_loss_clip": 1.26363897, "balance_loss_mlp": 0.2106415, "epoch": 0.15120998045994288, "flos": 20521691832960.0, "grad_norm": 12.463132858349073, "language_loss": 0.79227948, "learning_rate": 3.8478522016704916e-06, "loss": 0.81047845, "num_input_tokens_seen": 54620590, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.3137207, "step": 2515, "time_per_iteration": 4.032611131668091 }, { "auxiliary_loss_clip": 0.01579375, "auxiliary_loss_mlp": 0.00228658, "balance_loss_clip": 1.26846027, "balance_loss_mlp": 0.19644743, "epoch": 0.15127010371261085, "flos": 21178893024000.0, "grad_norm": 257.0006272322974, "language_loss": 0.8435123, "learning_rate": 3.8477031698961325e-06, "loss": 0.86159265, "num_input_tokens_seen": 54640410, "router_z_loss_clip": 3.10351562, "router_z_loss_mlp": 0.32177734, "step": 2516, "time_per_iteration": 2.6606013774871826 }, { "auxiliary_loss_clip": 0.01467249, "auxiliary_loss_mlp": 0.00122547, "balance_loss_clip": 1.25531423, "balance_loss_mlp": 0.11043525, "epoch": 0.1513302269652788, "flos": 65320648974720.0, "grad_norm": 0.7378599727952849, "language_loss": 0.54866964, "learning_rate": 3.8475540680566835e-06, "loss": 0.56456757, "num_input_tokens_seen": 54701430, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.12109375, "step": 2517, "time_per_iteration": 3.124859571456909 }, { "auxiliary_loss_clip": 0.0159449, "auxiliary_loss_mlp": 0.00222864, "balance_loss_clip": 1.27775824, "balance_loss_mlp": 0.18729232, "epoch": 0.15139035021794678, "flos": 19135827342720.0, "grad_norm": 8.328798327243442, "language_loss": 0.85610127, "learning_rate": 3.8474048961577995e-06, "loss": 0.87427479, "num_input_tokens_seen": 54720845, "router_z_loss_clip": 3.16601562, "router_z_loss_mlp": 0.35571289, "step": 2518, "time_per_iteration": 2.622445583343506 }, { "auxiliary_loss_clip": 0.01611652, "auxiliary_loss_mlp": 0.00234718, "balance_loss_clip": 1.2888, "balance_loss_mlp": 0.19759651, "epoch": 0.15145047347061477, "flos": 26578564842240.0, "grad_norm": 8.978993990265556, "language_loss": 0.76849484, "learning_rate": 3.847255654205137e-06, "loss": 0.78695858, "num_input_tokens_seen": 54740495, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.37133789, "step": 2519, "time_per_iteration": 2.699012041091919 }, { "auxiliary_loss_clip": 0.01611398, "auxiliary_loss_mlp": 0.00225545, "balance_loss_clip": 1.29162693, "balance_loss_mlp": 0.19183244, "epoch": 0.15151059672328274, "flos": 20302959962880.0, "grad_norm": 61.45689820302245, "language_loss": 0.87648678, "learning_rate": 3.847106342204354e-06, "loss": 0.89485615, "num_input_tokens_seen": 54758415, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.33740234, "step": 2520, "time_per_iteration": 2.6811656951904297 }, { "auxiliary_loss_clip": 0.0161782, "auxiliary_loss_mlp": 0.0024508, "balance_loss_clip": 1.29340506, "balance_loss_mlp": 0.20822001, "epoch": 0.1515707199759507, "flos": 27228367831680.0, "grad_norm": 30.889466296905333, "language_loss": 0.84168392, "learning_rate": 3.846956960161114e-06, "loss": 0.86031294, "num_input_tokens_seen": 54779355, "router_z_loss_clip": 3.24414062, "router_z_loss_mlp": 0.3684082, "step": 2521, "time_per_iteration": 2.7189838886260986 }, { "auxiliary_loss_clip": 0.01618224, "auxiliary_loss_mlp": 0.00247881, "balance_loss_clip": 1.29107726, "balance_loss_mlp": 0.20844644, "epoch": 0.15163084322861867, "flos": 23587349806080.0, "grad_norm": 18.740129060944867, "language_loss": 0.91395271, "learning_rate": 3.84680750808108e-06, "loss": 0.93261385, "num_input_tokens_seen": 54799465, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.39453125, "step": 2522, "time_per_iteration": 2.6862030029296875 }, { "auxiliary_loss_clip": 0.01440721, "auxiliary_loss_mlp": 0.00124767, "balance_loss_clip": 1.22685742, "balance_loss_mlp": 0.10874529, "epoch": 0.15169096648128663, "flos": 66889622021760.0, "grad_norm": 0.8237152892074878, "language_loss": 0.58199298, "learning_rate": 3.846657985969922e-06, "loss": 0.59764791, "num_input_tokens_seen": 54857665, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.16015625, "step": 2523, "time_per_iteration": 3.0839016437530518 }, { "auxiliary_loss_clip": 0.01625573, "auxiliary_loss_mlp": 0.00209027, "balance_loss_clip": 1.2976073, "balance_loss_mlp": 0.17073706, "epoch": 0.1517510897339546, "flos": 29095435848960.0, "grad_norm": 1.9383839463572936, "language_loss": 0.82129604, "learning_rate": 3.8465083938333066e-06, "loss": 0.83964205, "num_input_tokens_seen": 54879895, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.38305664, "step": 2524, "time_per_iteration": 2.6963484287261963 }, { "auxiliary_loss_clip": 0.01640849, "auxiliary_loss_mlp": 0.00250713, "balance_loss_clip": 1.30471408, "balance_loss_mlp": 0.21237499, "epoch": 0.1518112129866226, "flos": 18406553512320.0, "grad_norm": 22.47027800107615, "language_loss": 0.8259356, "learning_rate": 3.8463587316769085e-06, "loss": 0.84485126, "num_input_tokens_seen": 54898245, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.38305664, "step": 2525, "time_per_iteration": 2.7734124660491943 }, { "auxiliary_loss_clip": 0.01635398, "auxiliary_loss_mlp": 0.00262942, "balance_loss_clip": 1.29702294, "balance_loss_mlp": 0.22250581, "epoch": 0.15187133623929056, "flos": 19425410789760.0, "grad_norm": 6.939908492055646, "language_loss": 0.87264699, "learning_rate": 3.846208999506402e-06, "loss": 0.89163041, "num_input_tokens_seen": 54917060, "router_z_loss_clip": 3.38085938, "router_z_loss_mlp": 0.40405273, "step": 2526, "time_per_iteration": 2.608898639678955 }, { "auxiliary_loss_clip": 0.01630783, "auxiliary_loss_mlp": 0.00238119, "balance_loss_clip": 1.29695153, "balance_loss_mlp": 0.2003772, "epoch": 0.15193145949195852, "flos": 17566207850880.0, "grad_norm": 20.16195856476102, "language_loss": 0.92105526, "learning_rate": 3.846059197327466e-06, "loss": 0.93974435, "num_input_tokens_seen": 54936365, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.37744141, "step": 2527, "time_per_iteration": 2.650644302368164 }, { "auxiliary_loss_clip": 0.01630969, "auxiliary_loss_mlp": 0.00258386, "balance_loss_clip": 1.29788852, "balance_loss_mlp": 0.2212165, "epoch": 0.15199158274462649, "flos": 36176265866880.0, "grad_norm": 141.8763667575267, "language_loss": 0.74725777, "learning_rate": 3.845909325145779e-06, "loss": 0.76615131, "num_input_tokens_seen": 54961365, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.37182617, "step": 2528, "time_per_iteration": 2.7457351684570312 }, { "auxiliary_loss_clip": 0.01622804, "auxiliary_loss_mlp": 0.0023328, "balance_loss_clip": 1.28689814, "balance_loss_mlp": 0.19205755, "epoch": 0.15205170599729445, "flos": 23074042498560.0, "grad_norm": 28.76182727348651, "language_loss": 0.94781882, "learning_rate": 3.845759382967026e-06, "loss": 0.96637964, "num_input_tokens_seen": 54980750, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.41210938, "step": 2529, "time_per_iteration": 2.70662784576416 }, { "auxiliary_loss_clip": 0.01626107, "auxiliary_loss_mlp": 0.00252221, "balance_loss_clip": 1.29531932, "balance_loss_mlp": 0.21173792, "epoch": 0.15211182924996242, "flos": 21908382336000.0, "grad_norm": 18.565547915021952, "language_loss": 0.91219878, "learning_rate": 3.845609370796893e-06, "loss": 0.93098211, "num_input_tokens_seen": 54999675, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.40478516, "step": 2530, "time_per_iteration": 2.6448428630828857 }, { "auxiliary_loss_clip": 0.01616085, "auxiliary_loss_mlp": 0.00252873, "balance_loss_clip": 1.28461266, "balance_loss_mlp": 0.21460673, "epoch": 0.15217195250263038, "flos": 13881521865600.0, "grad_norm": 5.936961852443786, "language_loss": 0.89857721, "learning_rate": 3.845459288641066e-06, "loss": 0.91726685, "num_input_tokens_seen": 55018295, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.3828125, "step": 2531, "time_per_iteration": 2.6350133419036865 }, { "auxiliary_loss_clip": 0.01620508, "auxiliary_loss_mlp": 0.00233963, "balance_loss_clip": 1.2866087, "balance_loss_mlp": 0.19617343, "epoch": 0.15223207575529837, "flos": 24535319592960.0, "grad_norm": 14.184620266236404, "language_loss": 0.87394327, "learning_rate": 3.8453091365052394e-06, "loss": 0.892488, "num_input_tokens_seen": 55037975, "router_z_loss_clip": 3.33789062, "router_z_loss_mlp": 0.37841797, "step": 2532, "time_per_iteration": 2.7181296348571777 }, { "auxiliary_loss_clip": 0.0161876, "auxiliary_loss_mlp": 0.00258474, "balance_loss_clip": 1.28644562, "balance_loss_mlp": 0.21799043, "epoch": 0.15229219900796634, "flos": 25556798563200.0, "grad_norm": 78.57601574113427, "language_loss": 0.93418705, "learning_rate": 3.845158914395105e-06, "loss": 0.95295942, "num_input_tokens_seen": 55057135, "router_z_loss_clip": 3.32421875, "router_z_loss_mlp": 0.40478516, "step": 2533, "time_per_iteration": 2.642825126647949 }, { "auxiliary_loss_clip": 0.01628306, "auxiliary_loss_mlp": 0.0025612, "balance_loss_clip": 1.28857446, "balance_loss_mlp": 0.21585152, "epoch": 0.1523523222606343, "flos": 18217806520320.0, "grad_norm": 101.29676945424707, "language_loss": 0.90375048, "learning_rate": 3.84500862231636e-06, "loss": 0.92259479, "num_input_tokens_seen": 55075525, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.40283203, "step": 2534, "time_per_iteration": 2.6192991733551025 }, { "auxiliary_loss_clip": 0.01640767, "auxiliary_loss_mlp": 0.00253091, "balance_loss_clip": 1.2933526, "balance_loss_mlp": 0.21160659, "epoch": 0.15241244551330227, "flos": 13260087642240.0, "grad_norm": 5.237232206398613, "language_loss": 0.8798399, "learning_rate": 3.844858260274702e-06, "loss": 0.8987785, "num_input_tokens_seen": 55090845, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.41479492, "step": 2535, "time_per_iteration": 2.6173343658447266 }, { "auxiliary_loss_clip": 0.01629169, "auxiliary_loss_mlp": 0.00281706, "balance_loss_clip": 1.28631902, "balance_loss_mlp": 0.23790854, "epoch": 0.15247256876597023, "flos": 19715568854400.0, "grad_norm": 798.6603842417262, "language_loss": 0.86047918, "learning_rate": 3.844707828275835e-06, "loss": 0.87958789, "num_input_tokens_seen": 55108750, "router_z_loss_clip": 3.42773438, "router_z_loss_mlp": 0.43774414, "step": 2536, "time_per_iteration": 2.5932931900024414 }, { "auxiliary_loss_clip": 0.01611211, "auxiliary_loss_mlp": 0.00254777, "balance_loss_clip": 1.2795465, "balance_loss_mlp": 0.21128914, "epoch": 0.1525326920186382, "flos": 20375858615040.0, "grad_norm": 20.9788629796572, "language_loss": 0.8418116, "learning_rate": 3.844557326325461e-06, "loss": 0.86047149, "num_input_tokens_seen": 55126750, "router_z_loss_clip": 3.31640625, "router_z_loss_mlp": 0.43457031, "step": 2537, "time_per_iteration": 2.6311850547790527 }, { "auxiliary_loss_clip": 0.01624565, "auxiliary_loss_mlp": 0.00263381, "balance_loss_clip": 1.28702021, "balance_loss_mlp": 0.21753323, "epoch": 0.15259281527130616, "flos": 13589963170560.0, "grad_norm": 9.886280834137322, "language_loss": 0.87735939, "learning_rate": 3.8444067544292896e-06, "loss": 0.8962388, "num_input_tokens_seen": 55144690, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.45874023, "step": 2538, "time_per_iteration": 2.5921809673309326 }, { "auxiliary_loss_clip": 0.01612354, "auxiliary_loss_mlp": 0.0023165, "balance_loss_clip": 1.27915776, "balance_loss_mlp": 0.18906876, "epoch": 0.15265293852397416, "flos": 22860374446080.0, "grad_norm": 10.75103225888784, "language_loss": 0.95733005, "learning_rate": 3.844256112593029e-06, "loss": 0.97577006, "num_input_tokens_seen": 55166055, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.42602539, "step": 2539, "time_per_iteration": 2.721728801727295 }, { "auxiliary_loss_clip": 0.01635738, "auxiliary_loss_mlp": 0.00252977, "balance_loss_clip": 1.30021596, "balance_loss_mlp": 0.20760572, "epoch": 0.15271306177664212, "flos": 29238108670080.0, "grad_norm": 8.476933983522327, "language_loss": 0.99781525, "learning_rate": 3.844105400822391e-06, "loss": 1.01670241, "num_input_tokens_seen": 55186285, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.45410156, "step": 2540, "time_per_iteration": 2.7217564582824707 }, { "auxiliary_loss_clip": 0.01626099, "auxiliary_loss_mlp": 0.00250871, "balance_loss_clip": 1.28776598, "balance_loss_mlp": 0.21017267, "epoch": 0.1527731850293101, "flos": 31246269310080.0, "grad_norm": 15.218628472577134, "language_loss": 0.816257, "learning_rate": 3.843954619123092e-06, "loss": 0.83502674, "num_input_tokens_seen": 55207915, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.40698242, "step": 2541, "time_per_iteration": 2.7615857124328613 }, { "auxiliary_loss_clip": 0.01608346, "auxiliary_loss_mlp": 0.00273214, "balance_loss_clip": 1.27522755, "balance_loss_mlp": 0.2297266, "epoch": 0.15283330828197805, "flos": 22382079920640.0, "grad_norm": 167.74467643550037, "language_loss": 0.8777352, "learning_rate": 3.84380376750085e-06, "loss": 0.89655077, "num_input_tokens_seen": 55227860, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.43505859, "step": 2542, "time_per_iteration": 2.68774676322937 }, { "auxiliary_loss_clip": 0.01636098, "auxiliary_loss_mlp": 0.00251484, "balance_loss_clip": 1.29186249, "balance_loss_mlp": 0.20446792, "epoch": 0.15289343153464602, "flos": 25520133755520.0, "grad_norm": 2.814139679603823, "language_loss": 0.86786318, "learning_rate": 3.843652845961383e-06, "loss": 0.88673902, "num_input_tokens_seen": 55247330, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.47021484, "step": 2543, "time_per_iteration": 2.6428306102752686 }, { "auxiliary_loss_clip": 0.01614323, "auxiliary_loss_mlp": 0.00215879, "balance_loss_clip": 1.28177977, "balance_loss_mlp": 0.17310674, "epoch": 0.15295355478731398, "flos": 22710016114560.0, "grad_norm": 8.918182586822097, "language_loss": 0.94078326, "learning_rate": 3.843501854510416e-06, "loss": 0.95908529, "num_input_tokens_seen": 55266195, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.42797852, "step": 2544, "time_per_iteration": 2.681633472442627 }, { "auxiliary_loss_clip": 0.01632535, "auxiliary_loss_mlp": 0.00266262, "balance_loss_clip": 1.28883576, "balance_loss_mlp": 0.21583624, "epoch": 0.15301367803998198, "flos": 23251907669760.0, "grad_norm": 2.423267833349333, "language_loss": 0.9081493, "learning_rate": 3.843350793153673e-06, "loss": 0.92713726, "num_input_tokens_seen": 55283305, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.50439453, "step": 2545, "time_per_iteration": 2.6305911540985107 }, { "auxiliary_loss_clip": 0.01629965, "auxiliary_loss_mlp": 0.0023342, "balance_loss_clip": 1.29757571, "balance_loss_mlp": 0.18578373, "epoch": 0.15307380129264994, "flos": 25886279041920.0, "grad_norm": 15.931722474072362, "language_loss": 0.78658283, "learning_rate": 3.843199661896884e-06, "loss": 0.80521667, "num_input_tokens_seen": 55303035, "router_z_loss_clip": 3.32421875, "router_z_loss_mlp": 0.4765625, "step": 2546, "time_per_iteration": 2.7246267795562744 }, { "auxiliary_loss_clip": 0.01616977, "auxiliary_loss_mlp": 0.00267973, "balance_loss_clip": 1.28204763, "balance_loss_mlp": 0.22098053, "epoch": 0.1531339245453179, "flos": 46973239205760.0, "grad_norm": 2.6994356513097184, "language_loss": 0.83969665, "learning_rate": 3.843048460745779e-06, "loss": 0.8585462, "num_input_tokens_seen": 55327570, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.46948242, "step": 2547, "time_per_iteration": 2.934541940689087 }, { "auxiliary_loss_clip": 0.01634875, "auxiliary_loss_mlp": 0.00249529, "balance_loss_clip": 1.29406452, "balance_loss_mlp": 0.20308527, "epoch": 0.15319404779798587, "flos": 35882049565440.0, "grad_norm": 21.148930899683883, "language_loss": 0.8371681, "learning_rate": 3.842897189706092e-06, "loss": 0.85601217, "num_input_tokens_seen": 55351090, "router_z_loss_clip": 3.41210938, "router_z_loss_mlp": 0.46459961, "step": 2548, "time_per_iteration": 2.8117611408233643 }, { "auxiliary_loss_clip": 0.01595455, "auxiliary_loss_mlp": 0.00226833, "balance_loss_clip": 1.26647949, "balance_loss_mlp": 0.17755231, "epoch": 0.15325417105065384, "flos": 25664638170240.0, "grad_norm": 1.9856712484717172, "language_loss": 0.87923318, "learning_rate": 3.842745848783558e-06, "loss": 0.89745611, "num_input_tokens_seen": 55371050, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.49267578, "step": 2549, "time_per_iteration": 2.6900737285614014 }, { "auxiliary_loss_clip": 0.01583488, "auxiliary_loss_mlp": 0.00231177, "balance_loss_clip": 1.25284767, "balance_loss_mlp": 0.17948794, "epoch": 0.1533142943033218, "flos": 18770831291520.0, "grad_norm": 3.098756463030102, "language_loss": 0.8176403, "learning_rate": 3.842594437983917e-06, "loss": 0.835787, "num_input_tokens_seen": 55390375, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.51708984, "step": 2550, "time_per_iteration": 2.61804461479187 }, { "auxiliary_loss_clip": 0.01592688, "auxiliary_loss_mlp": 0.00229981, "balance_loss_clip": 1.25931847, "balance_loss_mlp": 0.18170102, "epoch": 0.15337441755598977, "flos": 23107367341440.0, "grad_norm": 47.74865767098378, "language_loss": 0.86577916, "learning_rate": 3.8424429573129115e-06, "loss": 0.88400578, "num_input_tokens_seen": 55408890, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.48266602, "step": 2551, "time_per_iteration": 4.052828073501587 }, { "auxiliary_loss_clip": 0.01485482, "auxiliary_loss_mlp": 0.00099328, "balance_loss_clip": 1.25168014, "balance_loss_mlp": 0.07911022, "epoch": 0.15343454080865776, "flos": 59861079227520.0, "grad_norm": 0.9186994025996846, "language_loss": 0.56737566, "learning_rate": 3.842291406776283e-06, "loss": 0.58322382, "num_input_tokens_seen": 55463815, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.20214844, "step": 2552, "time_per_iteration": 3.0606324672698975 }, { "auxiliary_loss_clip": 0.01570777, "auxiliary_loss_mlp": 0.00279082, "balance_loss_clip": 1.24603093, "balance_loss_mlp": 0.23568988, "epoch": 0.15349466406132573, "flos": 11910887959680.0, "grad_norm": 6.884758469108168, "language_loss": 0.972067, "learning_rate": 3.84213978637978e-06, "loss": 0.99056554, "num_input_tokens_seen": 55481050, "router_z_loss_clip": 3.24804688, "router_z_loss_mlp": 0.43408203, "step": 2553, "time_per_iteration": 2.6267240047454834 }, { "auxiliary_loss_clip": 0.01561026, "auxiliary_loss_mlp": 0.00260332, "balance_loss_clip": 1.24011159, "balance_loss_mlp": 0.21517539, "epoch": 0.1535547873139937, "flos": 24096922099200.0, "grad_norm": 9.921957576485191, "language_loss": 0.85563958, "learning_rate": 3.841988096129152e-06, "loss": 0.87385321, "num_input_tokens_seen": 55500050, "router_z_loss_clip": 3.21484375, "router_z_loss_mlp": 0.4519043, "step": 2554, "time_per_iteration": 4.165403127670288 }, { "auxiliary_loss_clip": 0.01541127, "auxiliary_loss_mlp": 0.00307741, "balance_loss_clip": 1.2266022, "balance_loss_mlp": 0.26809162, "epoch": 0.15361491056666166, "flos": 17566459246080.0, "grad_norm": 39.73989395282854, "language_loss": 0.86981714, "learning_rate": 3.841836336030151e-06, "loss": 0.88830578, "num_input_tokens_seen": 55518125, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.39599609, "step": 2555, "time_per_iteration": 2.583627462387085 }, { "auxiliary_loss_clip": 0.01524074, "auxiliary_loss_mlp": 0.00262746, "balance_loss_clip": 1.21455228, "balance_loss_mlp": 0.22283429, "epoch": 0.15367503381932962, "flos": 25046041121280.0, "grad_norm": 36.16690594957559, "language_loss": 0.83966595, "learning_rate": 3.8416845060885305e-06, "loss": 0.85753411, "num_input_tokens_seen": 55540960, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.39916992, "step": 2556, "time_per_iteration": 2.734755039215088 }, { "auxiliary_loss_clip": 0.01516425, "auxiliary_loss_mlp": 0.00280889, "balance_loss_clip": 1.21158671, "balance_loss_mlp": 0.24376762, "epoch": 0.15373515707199759, "flos": 21507332008320.0, "grad_norm": 903.3952187479553, "language_loss": 0.98869383, "learning_rate": 3.84153260631005e-06, "loss": 1.00666702, "num_input_tokens_seen": 55559210, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.37109375, "step": 2557, "time_per_iteration": 4.029478073120117 }, { "auxiliary_loss_clip": 0.01509724, "auxiliary_loss_mlp": 0.00275112, "balance_loss_clip": 1.20548248, "balance_loss_mlp": 0.23837146, "epoch": 0.15379528032466555, "flos": 25994729180160.0, "grad_norm": 30.307277811038187, "language_loss": 0.7766034, "learning_rate": 3.841380636700468e-06, "loss": 0.79445171, "num_input_tokens_seen": 55578925, "router_z_loss_clip": 3.04296875, "router_z_loss_mlp": 0.36743164, "step": 2558, "time_per_iteration": 2.6918795108795166 }, { "auxiliary_loss_clip": 0.01509371, "auxiliary_loss_mlp": 0.00299406, "balance_loss_clip": 1.20460677, "balance_loss_mlp": 0.26307061, "epoch": 0.15385540357733354, "flos": 19277315015040.0, "grad_norm": 4.803617757527174, "language_loss": 1.00611627, "learning_rate": 3.841228597265548e-06, "loss": 1.02420402, "num_input_tokens_seen": 55597255, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.36303711, "step": 2559, "time_per_iteration": 2.6553070545196533 }, { "auxiliary_loss_clip": 0.01495525, "auxiliary_loss_mlp": 0.00310812, "balance_loss_clip": 1.19795132, "balance_loss_mlp": 0.27671832, "epoch": 0.1539155268300015, "flos": 28549126920960.0, "grad_norm": 7.653277802502302, "language_loss": 0.74739462, "learning_rate": 3.841076488011055e-06, "loss": 0.76545799, "num_input_tokens_seen": 55619515, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.34106445, "step": 2560, "time_per_iteration": 2.739847421646118 }, { "auxiliary_loss_clip": 0.01485899, "auxiliary_loss_mlp": 0.00309504, "balance_loss_clip": 1.18430352, "balance_loss_mlp": 0.27393243, "epoch": 0.15397565008266947, "flos": 23547883737600.0, "grad_norm": 6.991742381012541, "language_loss": 0.95442653, "learning_rate": 3.8409243089427574e-06, "loss": 0.97238052, "num_input_tokens_seen": 55640050, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.35571289, "step": 2561, "time_per_iteration": 2.6655654907226562 }, { "auxiliary_loss_clip": 0.01489339, "auxiliary_loss_mlp": 0.00313716, "balance_loss_clip": 1.1975311, "balance_loss_mlp": 0.28152919, "epoch": 0.15403577333533744, "flos": 17129821518720.0, "grad_norm": 21.602920206214563, "language_loss": 0.90192831, "learning_rate": 3.840772060066425e-06, "loss": 0.91995889, "num_input_tokens_seen": 55658695, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.32177734, "step": 2562, "time_per_iteration": 2.620293140411377 }, { "auxiliary_loss_clip": 0.01485022, "auxiliary_loss_mlp": 0.00350451, "balance_loss_clip": 1.18550348, "balance_loss_mlp": 0.31576061, "epoch": 0.1540958965880054, "flos": 17894503180800.0, "grad_norm": 4.98717428674485, "language_loss": 0.8547194, "learning_rate": 3.840619741387832e-06, "loss": 0.87307417, "num_input_tokens_seen": 55676340, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.34741211, "step": 2563, "time_per_iteration": 2.605706214904785 }, { "auxiliary_loss_clip": 0.01479408, "auxiliary_loss_mlp": 0.00332988, "balance_loss_clip": 1.18054247, "balance_loss_mlp": 0.30077717, "epoch": 0.15415601984067337, "flos": 32161057908480.0, "grad_norm": 23.61477204818768, "language_loss": 0.86962521, "learning_rate": 3.8404673529127534e-06, "loss": 0.8877492, "num_input_tokens_seen": 55698890, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.32202148, "step": 2564, "time_per_iteration": 2.7559595108032227 }, { "auxiliary_loss_clip": 0.01475131, "auxiliary_loss_mlp": 0.00325261, "balance_loss_clip": 1.1828655, "balance_loss_mlp": 0.29080963, "epoch": 0.15421614309334136, "flos": 24024418496640.0, "grad_norm": 2.8614464463678955, "language_loss": 0.78839713, "learning_rate": 3.840314894646969e-06, "loss": 0.80640107, "num_input_tokens_seen": 55718535, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.34448242, "step": 2565, "time_per_iteration": 2.6494717597961426 }, { "auxiliary_loss_clip": 0.01475977, "auxiliary_loss_mlp": 0.00332215, "balance_loss_clip": 1.18491316, "balance_loss_mlp": 0.30019492, "epoch": 0.15427626634600933, "flos": 24386290064640.0, "grad_norm": 50.414544979775535, "language_loss": 0.79050028, "learning_rate": 3.840162366596259e-06, "loss": 0.80858219, "num_input_tokens_seen": 55738970, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.3203125, "step": 2566, "time_per_iteration": 2.673837661743164 }, { "auxiliary_loss_clip": 0.01458597, "auxiliary_loss_mlp": 0.00315778, "balance_loss_clip": 1.1742897, "balance_loss_mlp": 0.28442585, "epoch": 0.1543363895986773, "flos": 23331522165120.0, "grad_norm": 6.988010211601425, "language_loss": 0.9099223, "learning_rate": 3.840009768766408e-06, "loss": 0.92766607, "num_input_tokens_seen": 55759585, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.31347656, "step": 2567, "time_per_iteration": 2.635493755340576 }, { "auxiliary_loss_clip": 0.01455228, "auxiliary_loss_mlp": 0.00315324, "balance_loss_clip": 1.16958523, "balance_loss_mlp": 0.28216028, "epoch": 0.15439651285134526, "flos": 24274284480000.0, "grad_norm": 2.752623766419202, "language_loss": 0.86643553, "learning_rate": 3.839857101163202e-06, "loss": 0.88414109, "num_input_tokens_seen": 55779250, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.33154297, "step": 2568, "time_per_iteration": 2.68095064163208 }, { "auxiliary_loss_clip": 0.01467577, "auxiliary_loss_mlp": 0.00343389, "balance_loss_clip": 1.18275833, "balance_loss_mlp": 0.31148869, "epoch": 0.15445663610401322, "flos": 22456163721600.0, "grad_norm": 40.824314213216915, "language_loss": 0.77444482, "learning_rate": 3.83970436379243e-06, "loss": 0.7925545, "num_input_tokens_seen": 55800470, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.3190918, "step": 2569, "time_per_iteration": 2.681640625 }, { "auxiliary_loss_clip": 0.01452415, "auxiliary_loss_mlp": 0.00344314, "balance_loss_clip": 1.16895247, "balance_loss_mlp": 0.31403527, "epoch": 0.1545167593566812, "flos": 22049510872320.0, "grad_norm": 3.413883933694002, "language_loss": 0.83541799, "learning_rate": 3.839551556659884e-06, "loss": 0.85338533, "num_input_tokens_seen": 55817795, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.30249023, "step": 2570, "time_per_iteration": 2.693608283996582 }, { "auxiliary_loss_clip": 0.01457221, "auxiliary_loss_mlp": 0.0029191, "balance_loss_clip": 1.17364204, "balance_loss_mlp": 0.26170245, "epoch": 0.15457688260934915, "flos": 19318253541120.0, "grad_norm": 134.60965151379412, "language_loss": 0.86954844, "learning_rate": 3.839398679771359e-06, "loss": 0.88703978, "num_input_tokens_seen": 55836125, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.30200195, "step": 2571, "time_per_iteration": 2.586700916290283 }, { "auxiliary_loss_clip": 0.0145876, "auxiliary_loss_mlp": 0.00307149, "balance_loss_clip": 1.17471766, "balance_loss_mlp": 0.27369875, "epoch": 0.15463700586201715, "flos": 24133981956480.0, "grad_norm": 10.681428776470964, "language_loss": 0.89696288, "learning_rate": 3.839245733132652e-06, "loss": 0.91462195, "num_input_tokens_seen": 55855280, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.33447266, "step": 2572, "time_per_iteration": 2.7055938243865967 }, { "auxiliary_loss_clip": 0.01465159, "auxiliary_loss_mlp": 0.00346388, "balance_loss_clip": 1.17677069, "balance_loss_mlp": 0.31360537, "epoch": 0.1546971291146851, "flos": 22420935457920.0, "grad_norm": 14.416738600053371, "language_loss": 0.95889413, "learning_rate": 3.839092716749563e-06, "loss": 0.97700959, "num_input_tokens_seen": 55875695, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.328125, "step": 2573, "time_per_iteration": 2.6784615516662598 }, { "auxiliary_loss_clip": 0.01465327, "auxiliary_loss_mlp": 0.00295428, "balance_loss_clip": 1.17368364, "balance_loss_mlp": 0.26507744, "epoch": 0.15475725236735308, "flos": 17530225401600.0, "grad_norm": 431.44591558706276, "language_loss": 0.78680819, "learning_rate": 3.838939630627893e-06, "loss": 0.80441576, "num_input_tokens_seen": 55894575, "router_z_loss_clip": 2.91601562, "router_z_loss_mlp": 0.3034668, "step": 2574, "time_per_iteration": 2.7049922943115234 }, { "auxiliary_loss_clip": 0.01461284, "auxiliary_loss_mlp": 0.00269148, "balance_loss_clip": 1.17161691, "balance_loss_mlp": 0.23872612, "epoch": 0.15481737562002104, "flos": 22561740771840.0, "grad_norm": 11.52945808412478, "language_loss": 0.88106215, "learning_rate": 3.838786474773448e-06, "loss": 0.89836645, "num_input_tokens_seen": 55912855, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.30419922, "step": 2575, "time_per_iteration": 2.6296682357788086 }, { "auxiliary_loss_clip": 0.01480177, "auxiliary_loss_mlp": 0.00314932, "balance_loss_clip": 1.18693161, "balance_loss_mlp": 0.28410494, "epoch": 0.154877498872689, "flos": 24900567039360.0, "grad_norm": 21.16407426982429, "language_loss": 0.89805096, "learning_rate": 3.838633249192036e-06, "loss": 0.91600204, "num_input_tokens_seen": 55932375, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.30786133, "step": 2576, "time_per_iteration": 2.6712937355041504 }, { "auxiliary_loss_clip": 0.01486819, "auxiliary_loss_mlp": 0.00346065, "balance_loss_clip": 1.19123673, "balance_loss_mlp": 0.31397378, "epoch": 0.15493762212535697, "flos": 28147501975680.0, "grad_norm": 12.330311065628313, "language_loss": 0.89937872, "learning_rate": 3.838479953889465e-06, "loss": 0.91770756, "num_input_tokens_seen": 55953970, "router_z_loss_clip": 2.95507812, "router_z_loss_mlp": 0.32104492, "step": 2577, "time_per_iteration": 2.6715023517608643 }, { "auxiliary_loss_clip": 0.01509158, "auxiliary_loss_mlp": 0.00372982, "balance_loss_clip": 1.20803463, "balance_loss_mlp": 0.34288186, "epoch": 0.15499774537802496, "flos": 25411073086080.0, "grad_norm": 13.952922026592379, "language_loss": 0.87261856, "learning_rate": 3.8383265888715525e-06, "loss": 0.89143991, "num_input_tokens_seen": 55973120, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.30114746, "step": 2578, "time_per_iteration": 2.6950864791870117 }, { "auxiliary_loss_clip": 0.01499788, "auxiliary_loss_mlp": 0.00308255, "balance_loss_clip": 1.19632804, "balance_loss_mlp": 0.27723712, "epoch": 0.15505786863069293, "flos": 22091562720000.0, "grad_norm": 3.2017028943249892, "language_loss": 0.88576221, "learning_rate": 3.83817315414411e-06, "loss": 0.90384269, "num_input_tokens_seen": 55993260, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.31054688, "step": 2579, "time_per_iteration": 2.598050355911255 }, { "auxiliary_loss_clip": 0.01502329, "auxiliary_loss_mlp": 0.00293024, "balance_loss_clip": 1.19733095, "balance_loss_mlp": 0.25888291, "epoch": 0.1551179918833609, "flos": 18917131386240.0, "grad_norm": 154.83660107877932, "language_loss": 0.8632561, "learning_rate": 3.838019649712958e-06, "loss": 0.88120967, "num_input_tokens_seen": 56012130, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.34106445, "step": 2580, "time_per_iteration": 2.6332767009735107 }, { "auxiliary_loss_clip": 0.01485671, "auxiliary_loss_mlp": 0.00115499, "balance_loss_clip": 1.24617839, "balance_loss_mlp": 0.10090782, "epoch": 0.15517811513602886, "flos": 66239172587520.0, "grad_norm": 0.8945207642247979, "language_loss": 0.5908972, "learning_rate": 3.8378660755839166e-06, "loss": 0.60690892, "num_input_tokens_seen": 56079045, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.14550781, "step": 2581, "time_per_iteration": 3.2989563941955566 }, { "auxiliary_loss_clip": 0.01525063, "auxiliary_loss_mlp": 0.00292339, "balance_loss_clip": 1.21906114, "balance_loss_mlp": 0.26046211, "epoch": 0.15523823838869683, "flos": 24021078531840.0, "grad_norm": 3.9832219091750147, "language_loss": 0.91446042, "learning_rate": 3.8377124317628095e-06, "loss": 0.93263447, "num_input_tokens_seen": 56098745, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.3190918, "step": 2582, "time_per_iteration": 2.6220510005950928 }, { "auxiliary_loss_clip": 0.01540849, "auxiliary_loss_mlp": 0.00308837, "balance_loss_clip": 1.22973299, "balance_loss_mlp": 0.27588743, "epoch": 0.1552983616413648, "flos": 20485062938880.0, "grad_norm": 58.792296300678316, "language_loss": 0.86030918, "learning_rate": 3.8375587182554625e-06, "loss": 0.87880599, "num_input_tokens_seen": 56117655, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.3293457, "step": 2583, "time_per_iteration": 2.6678519248962402 }, { "auxiliary_loss_clip": 0.01526271, "auxiliary_loss_mlp": 0.00378041, "balance_loss_clip": 1.22040606, "balance_loss_mlp": 0.34382787, "epoch": 0.15535848489403276, "flos": 32123710742400.0, "grad_norm": 21.634238965583553, "language_loss": 0.82757521, "learning_rate": 3.837404935067705e-06, "loss": 0.84661829, "num_input_tokens_seen": 56141960, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.34204102, "step": 2584, "time_per_iteration": 2.7165346145629883 }, { "auxiliary_loss_clip": 0.0155321, "auxiliary_loss_mlp": 0.00350123, "balance_loss_clip": 1.24243534, "balance_loss_mlp": 0.31734061, "epoch": 0.15541860814670075, "flos": 19098444263040.0, "grad_norm": 27.808968027693233, "language_loss": 0.84162283, "learning_rate": 3.837251082205368e-06, "loss": 0.86065614, "num_input_tokens_seen": 56161430, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.328125, "step": 2585, "time_per_iteration": 2.68513560295105 }, { "auxiliary_loss_clip": 0.01541487, "auxiliary_loss_mlp": 0.00326095, "balance_loss_clip": 1.23273206, "balance_loss_mlp": 0.29250151, "epoch": 0.1554787313993687, "flos": 19172097100800.0, "grad_norm": 31.06605321888724, "language_loss": 0.70684701, "learning_rate": 3.837097159674286e-06, "loss": 0.72552288, "num_input_tokens_seen": 56179390, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.33569336, "step": 2586, "time_per_iteration": 2.703218698501587 }, { "auxiliary_loss_clip": 0.01556801, "auxiliary_loss_mlp": 0.00376244, "balance_loss_clip": 1.24627686, "balance_loss_mlp": 0.34141108, "epoch": 0.15553885465203668, "flos": 16143822207360.0, "grad_norm": 16.72471161076826, "language_loss": 0.88593817, "learning_rate": 3.836943167480296e-06, "loss": 0.90526855, "num_input_tokens_seen": 56198020, "router_z_loss_clip": 3.10351562, "router_z_loss_mlp": 0.34790039, "step": 2587, "time_per_iteration": 2.722289562225342 }, { "auxiliary_loss_clip": 0.01566025, "auxiliary_loss_mlp": 0.0039617, "balance_loss_clip": 1.24897683, "balance_loss_mlp": 0.35699767, "epoch": 0.15559897790470464, "flos": 25337779384320.0, "grad_norm": 2.7725611997411184, "language_loss": 0.95930409, "learning_rate": 3.836789105629236e-06, "loss": 0.97892606, "num_input_tokens_seen": 56218165, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.39160156, "step": 2588, "time_per_iteration": 2.710186004638672 }, { "auxiliary_loss_clip": 0.01567545, "auxiliary_loss_mlp": 0.00341813, "balance_loss_clip": 1.2569778, "balance_loss_mlp": 0.30779094, "epoch": 0.1556591011573726, "flos": 23148772744320.0, "grad_norm": 21.950528294190818, "language_loss": 0.73083258, "learning_rate": 3.83663497412695e-06, "loss": 0.74992621, "num_input_tokens_seen": 56237160, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.34008789, "step": 2589, "time_per_iteration": 2.6287307739257812 }, { "auxiliary_loss_clip": 0.01573898, "auxiliary_loss_mlp": 0.00335027, "balance_loss_clip": 1.26334667, "balance_loss_mlp": 0.30131432, "epoch": 0.15571922441004057, "flos": 25370888745600.0, "grad_norm": 6.498655226574372, "language_loss": 0.8839978, "learning_rate": 3.836480772979281e-06, "loss": 0.90308702, "num_input_tokens_seen": 56257610, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.33691406, "step": 2590, "time_per_iteration": 2.6553735733032227 }, { "auxiliary_loss_clip": 0.01578203, "auxiliary_loss_mlp": 0.00333866, "balance_loss_clip": 1.2677635, "balance_loss_mlp": 0.30141741, "epoch": 0.15577934766270854, "flos": 14501375890560.0, "grad_norm": 12.63559943265338, "language_loss": 0.88026792, "learning_rate": 3.836326502192077e-06, "loss": 0.89938861, "num_input_tokens_seen": 56275215, "router_z_loss_clip": 3.10351562, "router_z_loss_mlp": 0.32446289, "step": 2591, "time_per_iteration": 2.639390468597412 }, { "auxiliary_loss_clip": 0.01576345, "auxiliary_loss_mlp": 0.00316978, "balance_loss_clip": 1.26633525, "balance_loss_mlp": 0.28524399, "epoch": 0.15583947091537653, "flos": 37414537372800.0, "grad_norm": 8.264318151868066, "language_loss": 0.74604517, "learning_rate": 3.836172161771189e-06, "loss": 0.76497835, "num_input_tokens_seen": 56297130, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.31762695, "step": 2592, "time_per_iteration": 2.8106682300567627 }, { "auxiliary_loss_clip": 0.01595681, "auxiliary_loss_mlp": 0.00328823, "balance_loss_clip": 1.27481735, "balance_loss_mlp": 0.29554003, "epoch": 0.1558995941680445, "flos": 21834729498240.0, "grad_norm": 11.934889752563848, "language_loss": 0.90146577, "learning_rate": 3.836017751722467e-06, "loss": 0.9207108, "num_input_tokens_seen": 56314995, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.33276367, "step": 2593, "time_per_iteration": 4.023046970367432 }, { "auxiliary_loss_clip": 0.0159261, "auxiliary_loss_mlp": 0.00363401, "balance_loss_clip": 1.28128147, "balance_loss_mlp": 0.32787675, "epoch": 0.15595971742071246, "flos": 19792633484160.0, "grad_norm": 2.7458738174247186, "language_loss": 0.78902453, "learning_rate": 3.8358632720517695e-06, "loss": 0.80858457, "num_input_tokens_seen": 56334005, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.35522461, "step": 2594, "time_per_iteration": 2.635490655899048 }, { "auxiliary_loss_clip": 0.0158967, "auxiliary_loss_mlp": 0.0030033, "balance_loss_clip": 1.2820003, "balance_loss_mlp": 0.26730919, "epoch": 0.15601984067338043, "flos": 26722135503360.0, "grad_norm": 3.0463256708396598, "language_loss": 0.87282169, "learning_rate": 3.835708722764952e-06, "loss": 0.89172173, "num_input_tokens_seen": 56353795, "router_z_loss_clip": 3.07617188, "router_z_loss_mlp": 0.33007812, "step": 2595, "time_per_iteration": 2.770974636077881 }, { "auxiliary_loss_clip": 0.01579849, "auxiliary_loss_mlp": 0.00317172, "balance_loss_clip": 1.26811373, "balance_loss_mlp": 0.28281629, "epoch": 0.1560799639260484, "flos": 18369278173440.0, "grad_norm": 11.410284020540507, "language_loss": 0.93354243, "learning_rate": 3.835554103867876e-06, "loss": 0.95251262, "num_input_tokens_seen": 56373195, "router_z_loss_clip": 3.11523438, "router_z_loss_mlp": 0.34326172, "step": 2596, "time_per_iteration": 4.137763500213623 }, { "auxiliary_loss_clip": 0.01586004, "auxiliary_loss_mlp": 0.00297567, "balance_loss_clip": 1.27753186, "balance_loss_mlp": 0.26440316, "epoch": 0.15614008717871636, "flos": 22598980197120.0, "grad_norm": 3.6540057159934807, "language_loss": 0.73644364, "learning_rate": 3.835399415366404e-06, "loss": 0.75527942, "num_input_tokens_seen": 56391525, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.33178711, "step": 2597, "time_per_iteration": 2.6384782791137695 }, { "auxiliary_loss_clip": 0.01596133, "auxiliary_loss_mlp": 0.00326302, "balance_loss_clip": 1.28960323, "balance_loss_mlp": 0.29485494, "epoch": 0.15620021043138435, "flos": 22746860490240.0, "grad_norm": 5.201426088448285, "language_loss": 0.85769761, "learning_rate": 3.8352446572664035e-06, "loss": 0.87692189, "num_input_tokens_seen": 56410715, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.31494141, "step": 2598, "time_per_iteration": 2.692850351333618 }, { "auxiliary_loss_clip": 0.01600591, "auxiliary_loss_mlp": 0.00331565, "balance_loss_clip": 1.29030657, "balance_loss_mlp": 0.29656544, "epoch": 0.15626033368405232, "flos": 13114936782720.0, "grad_norm": 28.65367191081519, "language_loss": 0.87953258, "learning_rate": 3.8350898295737405e-06, "loss": 0.89885414, "num_input_tokens_seen": 56429170, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.35009766, "step": 2599, "time_per_iteration": 4.153528690338135 }, { "auxiliary_loss_clip": 0.01611025, "auxiliary_loss_mlp": 0.00322718, "balance_loss_clip": 1.2950604, "balance_loss_mlp": 0.2852383, "epoch": 0.15632045693672028, "flos": 16472297105280.0, "grad_norm": 33.551668115540636, "language_loss": 0.88049442, "learning_rate": 3.834934932294287e-06, "loss": 0.89983189, "num_input_tokens_seen": 56445685, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.375, "step": 2600, "time_per_iteration": 2.6539618968963623 }, { "auxiliary_loss_clip": 0.01616843, "auxiliary_loss_mlp": 0.00332869, "balance_loss_clip": 1.30248582, "balance_loss_mlp": 0.29784557, "epoch": 0.15638058018938825, "flos": 20850346298880.0, "grad_norm": 14.637595730480623, "language_loss": 0.94794333, "learning_rate": 3.834779965433917e-06, "loss": 0.96744049, "num_input_tokens_seen": 56465900, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.3503418, "step": 2601, "time_per_iteration": 2.680041790008545 }, { "auxiliary_loss_clip": 0.01639848, "auxiliary_loss_mlp": 0.00311866, "balance_loss_clip": 1.31595731, "balance_loss_mlp": 0.27517292, "epoch": 0.1564407034420562, "flos": 21872220318720.0, "grad_norm": 6.253866759679896, "language_loss": 0.86380893, "learning_rate": 3.834624928998508e-06, "loss": 0.88332605, "num_input_tokens_seen": 56485020, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.36669922, "step": 2602, "time_per_iteration": 2.663860321044922 }, { "auxiliary_loss_clip": 0.01631214, "auxiliary_loss_mlp": 0.00273266, "balance_loss_clip": 1.30994987, "balance_loss_mlp": 0.23862353, "epoch": 0.15650082669472418, "flos": 21834549930240.0, "grad_norm": 3.432073245431192, "language_loss": 0.80171072, "learning_rate": 3.8344698229939376e-06, "loss": 0.82075548, "num_input_tokens_seen": 56505205, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.34619141, "step": 2603, "time_per_iteration": 2.6073925495147705 }, { "auxiliary_loss_clip": 0.01621949, "auxiliary_loss_mlp": 0.002787, "balance_loss_clip": 1.30390775, "balance_loss_mlp": 0.24617946, "epoch": 0.15656094994739214, "flos": 13800542653440.0, "grad_norm": 2.782779524868727, "language_loss": 0.97018343, "learning_rate": 3.8343146474260865e-06, "loss": 0.98918986, "num_input_tokens_seen": 56521495, "router_z_loss_clip": 3.1796875, "router_z_loss_mlp": 0.32519531, "step": 2604, "time_per_iteration": 2.633498191833496 }, { "auxiliary_loss_clip": 0.01638548, "auxiliary_loss_mlp": 0.00246522, "balance_loss_clip": 1.31166792, "balance_loss_mlp": 0.2111406, "epoch": 0.15662107320006013, "flos": 27308197808640.0, "grad_norm": 510.61866400522115, "language_loss": 0.91254258, "learning_rate": 3.834159402300841e-06, "loss": 0.93139327, "num_input_tokens_seen": 56540665, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.35375977, "step": 2605, "time_per_iteration": 2.6894805431365967 }, { "auxiliary_loss_clip": 0.01642257, "auxiliary_loss_mlp": 0.00259192, "balance_loss_clip": 1.3119278, "balance_loss_mlp": 0.22276208, "epoch": 0.1566811964527281, "flos": 26685075646080.0, "grad_norm": 36.90407084438398, "language_loss": 0.82168567, "learning_rate": 3.834004087624087e-06, "loss": 0.84070015, "num_input_tokens_seen": 56560805, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.36376953, "step": 2606, "time_per_iteration": 2.708345651626587 }, { "auxiliary_loss_clip": 0.01657621, "auxiliary_loss_mlp": 0.00241075, "balance_loss_clip": 1.32816553, "balance_loss_mlp": 0.20593223, "epoch": 0.15674131970539606, "flos": 16103422385280.0, "grad_norm": 9.335648599823031, "language_loss": 0.83225858, "learning_rate": 3.8338487034017145e-06, "loss": 0.85124558, "num_input_tokens_seen": 56576335, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.35131836, "step": 2607, "time_per_iteration": 2.5973219871520996 }, { "auxiliary_loss_clip": 0.01641805, "auxiliary_loss_mlp": 0.0022514, "balance_loss_clip": 1.31427228, "balance_loss_mlp": 0.19126114, "epoch": 0.15680144295806403, "flos": 19169690889600.0, "grad_norm": 27.805256096089785, "language_loss": 0.88044912, "learning_rate": 3.833693249639615e-06, "loss": 0.89911854, "num_input_tokens_seen": 56595880, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.33886719, "step": 2608, "time_per_iteration": 2.6723620891571045 }, { "auxiliary_loss_clip": 0.01648134, "auxiliary_loss_mlp": 0.0026406, "balance_loss_clip": 1.31379414, "balance_loss_mlp": 0.22624652, "epoch": 0.156861566210732, "flos": 20813430096000.0, "grad_norm": 5.41946598638552, "language_loss": 0.79876554, "learning_rate": 3.833537726343684e-06, "loss": 0.81788743, "num_input_tokens_seen": 56615130, "router_z_loss_clip": 3.34179688, "router_z_loss_mlp": 0.37817383, "step": 2609, "time_per_iteration": 2.6655771732330322 }, { "auxiliary_loss_clip": 0.01653146, "auxiliary_loss_mlp": 0.00253563, "balance_loss_clip": 1.31731677, "balance_loss_mlp": 0.21844402, "epoch": 0.15692168946339996, "flos": 20047922421120.0, "grad_norm": 21.26977573189445, "language_loss": 0.83414245, "learning_rate": 3.833382133519818e-06, "loss": 0.8532095, "num_input_tokens_seen": 56634005, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.35131836, "step": 2610, "time_per_iteration": 2.622344732284546 }, { "auxiliary_loss_clip": 0.01651632, "auxiliary_loss_mlp": 0.00261938, "balance_loss_clip": 1.31205845, "balance_loss_mlp": 0.22016656, "epoch": 0.15698181271606793, "flos": 21398019943680.0, "grad_norm": 2.6199104824434496, "language_loss": 0.80716926, "learning_rate": 3.833226471173919e-06, "loss": 0.82630491, "num_input_tokens_seen": 56653480, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.41748047, "step": 2611, "time_per_iteration": 2.721489429473877 }, { "auxiliary_loss_clip": 0.01650468, "auxiliary_loss_mlp": 0.00262909, "balance_loss_clip": 1.31496191, "balance_loss_mlp": 0.22833827, "epoch": 0.15704193596873592, "flos": 20845785271680.0, "grad_norm": 11.648019938953155, "language_loss": 0.79038036, "learning_rate": 3.833070739311887e-06, "loss": 0.80951416, "num_input_tokens_seen": 56672270, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.34594727, "step": 2612, "time_per_iteration": 2.6574203968048096 }, { "auxiliary_loss_clip": 0.01649243, "auxiliary_loss_mlp": 0.002369, "balance_loss_clip": 1.31278896, "balance_loss_mlp": 0.20373577, "epoch": 0.15710205922140388, "flos": 21762908254080.0, "grad_norm": 187.5337019008567, "language_loss": 0.84351325, "learning_rate": 3.83291493793963e-06, "loss": 0.86237466, "num_input_tokens_seen": 56691510, "router_z_loss_clip": 3.36132812, "router_z_loss_mlp": 0.33154297, "step": 2613, "time_per_iteration": 2.7396273612976074 }, { "auxiliary_loss_clip": 0.01643194, "auxiliary_loss_mlp": 0.00258654, "balance_loss_clip": 1.30705953, "balance_loss_mlp": 0.22041139, "epoch": 0.15716218247407185, "flos": 25007760201600.0, "grad_norm": 11.17782383318674, "language_loss": 0.74144739, "learning_rate": 3.832759067063055e-06, "loss": 0.76046586, "num_input_tokens_seen": 56712230, "router_z_loss_clip": 3.36132812, "router_z_loss_mlp": 0.38208008, "step": 2614, "time_per_iteration": 2.682082414627075 }, { "auxiliary_loss_clip": 0.01658328, "auxiliary_loss_mlp": 0.00226303, "balance_loss_clip": 1.31434512, "balance_loss_mlp": 0.19225693, "epoch": 0.1572223057267398, "flos": 20191780391040.0, "grad_norm": 55.7729619456281, "language_loss": 0.83776665, "learning_rate": 3.832603126688072e-06, "loss": 0.85661304, "num_input_tokens_seen": 56727490, "router_z_loss_clip": 3.43945312, "router_z_loss_mlp": 0.34057617, "step": 2615, "time_per_iteration": 2.6215434074401855 }, { "auxiliary_loss_clip": 0.01659563, "auxiliary_loss_mlp": 0.00256509, "balance_loss_clip": 1.31827569, "balance_loss_mlp": 0.22084117, "epoch": 0.15728242897940778, "flos": 20959514709120.0, "grad_norm": 9.629710574978073, "language_loss": 0.78766906, "learning_rate": 3.832447116820594e-06, "loss": 0.80682981, "num_input_tokens_seen": 56747385, "router_z_loss_clip": 3.4140625, "router_z_loss_mlp": 0.35693359, "step": 2616, "time_per_iteration": 2.636673927307129 }, { "auxiliary_loss_clip": 0.01656929, "auxiliary_loss_mlp": 0.00248591, "balance_loss_clip": 1.31584907, "balance_loss_mlp": 0.21251872, "epoch": 0.15734255223207574, "flos": 23038275530880.0, "grad_norm": 39.09485207485857, "language_loss": 0.7952913, "learning_rate": 3.832291037466539e-06, "loss": 0.81434655, "num_input_tokens_seen": 56768055, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.36108398, "step": 2617, "time_per_iteration": 2.7147884368896484 }, { "auxiliary_loss_clip": 0.01676724, "auxiliary_loss_mlp": 0.00257395, "balance_loss_clip": 1.33182907, "balance_loss_mlp": 0.22232336, "epoch": 0.15740267548474374, "flos": 20551281661440.0, "grad_norm": 26.424849103071082, "language_loss": 0.80752236, "learning_rate": 3.8321348886318235e-06, "loss": 0.82686353, "num_input_tokens_seen": 56785110, "router_z_loss_clip": 3.4453125, "router_z_loss_mlp": 0.35058594, "step": 2618, "time_per_iteration": 2.596367120742798 }, { "auxiliary_loss_clip": 0.01696111, "auxiliary_loss_mlp": 0.0026809, "balance_loss_clip": 1.33625603, "balance_loss_mlp": 0.22839305, "epoch": 0.1574627987374117, "flos": 22666922772480.0, "grad_norm": 3.56764192313242, "language_loss": 0.86945373, "learning_rate": 3.8319786703223695e-06, "loss": 0.88909572, "num_input_tokens_seen": 56804975, "router_z_loss_clip": 3.59765625, "router_z_loss_mlp": 0.39697266, "step": 2619, "time_per_iteration": 2.647993564605713 }, { "auxiliary_loss_clip": 0.01686203, "auxiliary_loss_mlp": 0.00250304, "balance_loss_clip": 1.33280981, "balance_loss_mlp": 0.21532822, "epoch": 0.15752292199007967, "flos": 16800664262400.0, "grad_norm": 5.078227502989648, "language_loss": 0.81878126, "learning_rate": 3.831822382544101e-06, "loss": 0.83814633, "num_input_tokens_seen": 56822470, "router_z_loss_clip": 3.53515625, "router_z_loss_mlp": 0.35009766, "step": 2620, "time_per_iteration": 2.5828206539154053 }, { "auxiliary_loss_clip": 0.01684453, "auxiliary_loss_mlp": 0.00245744, "balance_loss_clip": 1.33075905, "balance_loss_mlp": 0.20702486, "epoch": 0.15758304524274763, "flos": 29826002568960.0, "grad_norm": 2.591791935082625, "language_loss": 0.78352451, "learning_rate": 3.831666025302944e-06, "loss": 0.80282652, "num_input_tokens_seen": 56842100, "router_z_loss_clip": 3.53125, "router_z_loss_mlp": 0.38720703, "step": 2621, "time_per_iteration": 2.725764513015747 }, { "auxiliary_loss_clip": 0.01695707, "auxiliary_loss_mlp": 0.00254706, "balance_loss_clip": 1.33514118, "balance_loss_mlp": 0.21863329, "epoch": 0.1576431684954156, "flos": 53577426723840.0, "grad_norm": 30.46959782951778, "language_loss": 0.80646968, "learning_rate": 3.831509598604828e-06, "loss": 0.82597381, "num_input_tokens_seen": 56865920, "router_z_loss_clip": 3.60351562, "router_z_loss_mlp": 0.36083984, "step": 2622, "time_per_iteration": 2.9106900691986084 }, { "auxiliary_loss_clip": 0.01695764, "auxiliary_loss_mlp": 0.00247053, "balance_loss_clip": 1.34150732, "balance_loss_mlp": 0.21348388, "epoch": 0.15770329174808356, "flos": 20813609664000.0, "grad_norm": 6.952143479317169, "language_loss": 0.93389189, "learning_rate": 3.831353102455684e-06, "loss": 0.95332009, "num_input_tokens_seen": 56885265, "router_z_loss_clip": 3.54492188, "router_z_loss_mlp": 0.33544922, "step": 2623, "time_per_iteration": 2.6047191619873047 }, { "auxiliary_loss_clip": 0.01688175, "auxiliary_loss_mlp": 0.0025232, "balance_loss_clip": 1.33443284, "balance_loss_mlp": 0.21767816, "epoch": 0.15776341500075153, "flos": 24974004395520.0, "grad_norm": 252.55605854039928, "language_loss": 0.85329765, "learning_rate": 3.831196536861448e-06, "loss": 0.8727026, "num_input_tokens_seen": 56906710, "router_z_loss_clip": 3.53515625, "router_z_loss_mlp": 0.34667969, "step": 2624, "time_per_iteration": 2.6776247024536133 }, { "auxiliary_loss_clip": 0.01689593, "auxiliary_loss_mlp": 0.00251853, "balance_loss_clip": 1.32877612, "balance_loss_mlp": 0.21485044, "epoch": 0.15782353825341952, "flos": 21907915459200.0, "grad_norm": 163.35626486849284, "language_loss": 0.87918651, "learning_rate": 3.831039901828054e-06, "loss": 0.898601, "num_input_tokens_seen": 56924275, "router_z_loss_clip": 3.60742188, "router_z_loss_mlp": 0.37011719, "step": 2625, "time_per_iteration": 2.6763527393341064 }, { "auxiliary_loss_clip": 0.01677082, "auxiliary_loss_mlp": 0.00224205, "balance_loss_clip": 1.32228518, "balance_loss_mlp": 0.19006389, "epoch": 0.15788366150608749, "flos": 26177191292160.0, "grad_norm": 36.51602713488243, "language_loss": 0.8802532, "learning_rate": 3.830883197361445e-06, "loss": 0.89926606, "num_input_tokens_seen": 56941525, "router_z_loss_clip": 3.55078125, "router_z_loss_mlp": 0.34130859, "step": 2626, "time_per_iteration": 2.6627957820892334 }, { "auxiliary_loss_clip": 0.01673849, "auxiliary_loss_mlp": 0.00248605, "balance_loss_clip": 1.32187295, "balance_loss_mlp": 0.21293791, "epoch": 0.15794378475875545, "flos": 27709822753920.0, "grad_norm": 2.1084359455671406, "language_loss": 0.82395166, "learning_rate": 3.830726423467561e-06, "loss": 0.84317625, "num_input_tokens_seen": 56962145, "router_z_loss_clip": 3.52148438, "router_z_loss_mlp": 0.35668945, "step": 2627, "time_per_iteration": 2.667367696762085 }, { "auxiliary_loss_clip": 0.01662976, "auxiliary_loss_mlp": 0.00256148, "balance_loss_clip": 1.31007564, "balance_loss_mlp": 0.222984, "epoch": 0.15800390801142342, "flos": 12130158533760.0, "grad_norm": 11.37584649812117, "language_loss": 0.93822759, "learning_rate": 3.830569580152348e-06, "loss": 0.9574188, "num_input_tokens_seen": 56977505, "router_z_loss_clip": 3.53125, "router_z_loss_mlp": 0.33154297, "step": 2628, "time_per_iteration": 2.5871810913085938 }, { "auxiliary_loss_clip": 0.01649001, "auxiliary_loss_mlp": 0.00236664, "balance_loss_clip": 1.30186307, "balance_loss_mlp": 0.20462099, "epoch": 0.15806403126409138, "flos": 20704728562560.0, "grad_norm": 33.7366911159071, "language_loss": 0.83724153, "learning_rate": 3.830412667421752e-06, "loss": 0.85609818, "num_input_tokens_seen": 56996770, "router_z_loss_clip": 3.47070312, "router_z_loss_mlp": 0.3203125, "step": 2629, "time_per_iteration": 2.632408380508423 }, { "auxiliary_loss_clip": 0.01645039, "auxiliary_loss_mlp": 0.00229443, "balance_loss_clip": 1.29784453, "balance_loss_mlp": 0.1968272, "epoch": 0.15812415451675935, "flos": 17821712269440.0, "grad_norm": 919.5490562472952, "language_loss": 0.83190012, "learning_rate": 3.8302556852817245e-06, "loss": 0.85064495, "num_input_tokens_seen": 57014970, "router_z_loss_clip": 3.47070312, "router_z_loss_mlp": 0.32617188, "step": 2630, "time_per_iteration": 2.69235897064209 }, { "auxiliary_loss_clip": 0.01634841, "auxiliary_loss_mlp": 0.00267264, "balance_loss_clip": 1.28532994, "balance_loss_mlp": 0.23274116, "epoch": 0.15818427776942734, "flos": 20084048524800.0, "grad_norm": 92.57361178130363, "language_loss": 0.92002684, "learning_rate": 3.8300986337382184e-06, "loss": 0.93904793, "num_input_tokens_seen": 57034045, "router_z_loss_clip": 3.49609375, "router_z_loss_mlp": 0.3449707, "step": 2631, "time_per_iteration": 2.6491079330444336 }, { "auxiliary_loss_clip": 0.01621319, "auxiliary_loss_mlp": 0.00272597, "balance_loss_clip": 1.27777445, "balance_loss_mlp": 0.23781152, "epoch": 0.1582444010220953, "flos": 21214911386880.0, "grad_norm": 2.4289931335920336, "language_loss": 0.88437468, "learning_rate": 3.8299415127971895e-06, "loss": 0.90331388, "num_input_tokens_seen": 57053695, "router_z_loss_clip": 3.43554688, "router_z_loss_mlp": 0.34790039, "step": 2632, "time_per_iteration": 2.6516783237457275 }, { "auxiliary_loss_clip": 0.01614379, "auxiliary_loss_mlp": 0.00254858, "balance_loss_clip": 1.27052891, "balance_loss_mlp": 0.22400703, "epoch": 0.15830452427476327, "flos": 17858341163520.0, "grad_norm": 1086.3276714529295, "language_loss": 0.90639508, "learning_rate": 3.829784322464594e-06, "loss": 0.92508745, "num_input_tokens_seen": 57071290, "router_z_loss_clip": 3.44140625, "router_z_loss_mlp": 0.30883789, "step": 2633, "time_per_iteration": 2.6136326789855957 }, { "auxiliary_loss_clip": 0.01615842, "auxiliary_loss_mlp": 0.00274609, "balance_loss_clip": 1.27264607, "balance_loss_mlp": 0.24137366, "epoch": 0.15836464752743123, "flos": 24534960456960.0, "grad_norm": 5.963081096991649, "language_loss": 0.83877909, "learning_rate": 3.829627062746394e-06, "loss": 0.85768366, "num_input_tokens_seen": 57091465, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.33227539, "step": 2634, "time_per_iteration": 2.6822216510772705 }, { "auxiliary_loss_clip": 0.01595213, "auxiliary_loss_mlp": 0.00270926, "balance_loss_clip": 1.24889565, "balance_loss_mlp": 0.23914506, "epoch": 0.1584247707800992, "flos": 20120821073280.0, "grad_norm": 8.437586342943668, "language_loss": 0.95932919, "learning_rate": 3.829469733648552e-06, "loss": 0.97799063, "num_input_tokens_seen": 57110075, "router_z_loss_clip": 3.46289062, "router_z_loss_mlp": 0.31762695, "step": 2635, "time_per_iteration": 4.1620330810546875 }, { "auxiliary_loss_clip": 0.01597383, "auxiliary_loss_mlp": 0.00306145, "balance_loss_clip": 1.25848711, "balance_loss_mlp": 0.27434021, "epoch": 0.15848489403276717, "flos": 20375966355840.0, "grad_norm": 172.2723016430246, "language_loss": 0.8423236, "learning_rate": 3.829312335177034e-06, "loss": 0.86135888, "num_input_tokens_seen": 57128945, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.31811523, "step": 2636, "time_per_iteration": 2.648315191268921 }, { "auxiliary_loss_clip": 0.01613965, "auxiliary_loss_mlp": 0.00290471, "balance_loss_clip": 1.2687459, "balance_loss_mlp": 0.25742584, "epoch": 0.15854501728543513, "flos": 39346890359040.0, "grad_norm": 19.50841596235931, "language_loss": 0.79897535, "learning_rate": 3.82915486733781e-06, "loss": 0.81801975, "num_input_tokens_seen": 57152385, "router_z_loss_clip": 3.45507812, "router_z_loss_mlp": 0.33056641, "step": 2637, "time_per_iteration": 2.8147454261779785 }, { "auxiliary_loss_clip": 0.01574872, "auxiliary_loss_mlp": 0.00272426, "balance_loss_clip": 1.24458027, "balance_loss_mlp": 0.2407399, "epoch": 0.15860514053810312, "flos": 24864225454080.0, "grad_norm": 3.8410829649171396, "language_loss": 0.84517193, "learning_rate": 3.82899733013685e-06, "loss": 0.86364496, "num_input_tokens_seen": 57172620, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.31713867, "step": 2638, "time_per_iteration": 4.168790340423584 }, { "auxiliary_loss_clip": 0.01576488, "auxiliary_loss_mlp": 0.00280258, "balance_loss_clip": 1.24106956, "balance_loss_mlp": 0.24854819, "epoch": 0.1586652637907711, "flos": 26177694082560.0, "grad_norm": 1.7974579543169713, "language_loss": 0.81622612, "learning_rate": 3.828839723580128e-06, "loss": 0.83479363, "num_input_tokens_seen": 57194680, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.31689453, "step": 2639, "time_per_iteration": 4.10056471824646 }, { "auxiliary_loss_clip": 0.0158749, "auxiliary_loss_mlp": 0.00312686, "balance_loss_clip": 1.25399828, "balance_loss_mlp": 0.28151238, "epoch": 0.15872538704343905, "flos": 19792058866560.0, "grad_norm": 12.02060547710641, "language_loss": 0.87061715, "learning_rate": 3.82868204767362e-06, "loss": 0.88961899, "num_input_tokens_seen": 57214675, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.31164551, "step": 2640, "time_per_iteration": 2.629162549972534 }, { "auxiliary_loss_clip": 0.01576248, "auxiliary_loss_mlp": 0.00292534, "balance_loss_clip": 1.24464881, "balance_loss_mlp": 0.26217121, "epoch": 0.15878551029610702, "flos": 28475366342400.0, "grad_norm": 11.902807807471113, "language_loss": 0.72325945, "learning_rate": 3.828524302423306e-06, "loss": 0.74194723, "num_input_tokens_seen": 57235830, "router_z_loss_clip": 3.31640625, "router_z_loss_mlp": 0.30358887, "step": 2641, "time_per_iteration": 4.065160512924194 }, { "auxiliary_loss_clip": 0.01583132, "auxiliary_loss_mlp": 0.00296364, "balance_loss_clip": 1.24132681, "balance_loss_mlp": 0.26389116, "epoch": 0.15884563354877498, "flos": 24206701040640.0, "grad_norm": 71.73898257856055, "language_loss": 0.84056884, "learning_rate": 3.828366487835167e-06, "loss": 0.85936379, "num_input_tokens_seen": 57255970, "router_z_loss_clip": 3.41601562, "router_z_loss_mlp": 0.32446289, "step": 2642, "time_per_iteration": 2.648829460144043 }, { "auxiliary_loss_clip": 0.01592669, "auxiliary_loss_mlp": 0.00296621, "balance_loss_clip": 1.25519586, "balance_loss_mlp": 0.26300442, "epoch": 0.15890575680144295, "flos": 23949795991680.0, "grad_norm": 10.21000418656979, "language_loss": 0.76032066, "learning_rate": 3.828208603915186e-06, "loss": 0.77921361, "num_input_tokens_seen": 57274435, "router_z_loss_clip": 3.37304688, "router_z_loss_mlp": 0.33618164, "step": 2643, "time_per_iteration": 2.6624574661254883 }, { "auxiliary_loss_clip": 0.01584298, "auxiliary_loss_mlp": 0.00262387, "balance_loss_clip": 1.25193501, "balance_loss_mlp": 0.23201287, "epoch": 0.15896588005411091, "flos": 21215019127680.0, "grad_norm": 4.201785397662325, "language_loss": 0.8680833, "learning_rate": 3.828050650669353e-06, "loss": 0.88655013, "num_input_tokens_seen": 57293115, "router_z_loss_clip": 3.32421875, "router_z_loss_mlp": 0.30371094, "step": 2644, "time_per_iteration": 2.6380221843719482 }, { "auxiliary_loss_clip": 0.01585512, "auxiliary_loss_mlp": 0.00332104, "balance_loss_clip": 1.25574005, "balance_loss_mlp": 0.30013174, "epoch": 0.1590260033067789, "flos": 24352390604160.0, "grad_norm": 34.19428091317874, "language_loss": 0.88873821, "learning_rate": 3.827892628103657e-06, "loss": 0.90791434, "num_input_tokens_seen": 57312565, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.31982422, "step": 2645, "time_per_iteration": 2.701603651046753 }, { "auxiliary_loss_clip": 0.01567159, "auxiliary_loss_mlp": 0.00306443, "balance_loss_clip": 1.23598957, "balance_loss_mlp": 0.27325484, "epoch": 0.15908612655944687, "flos": 32048944583040.0, "grad_norm": 10.249123123465564, "language_loss": 0.77741045, "learning_rate": 3.827734536224087e-06, "loss": 0.79614645, "num_input_tokens_seen": 57333360, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.33203125, "step": 2646, "time_per_iteration": 2.722123622894287 }, { "auxiliary_loss_clip": 0.01563278, "auxiliary_loss_mlp": 0.00299807, "balance_loss_clip": 1.23718596, "balance_loss_mlp": 0.26962343, "epoch": 0.15914624981211484, "flos": 17785370684160.0, "grad_norm": 10.423381618675766, "language_loss": 0.70291907, "learning_rate": 3.827576375036642e-06, "loss": 0.72154987, "num_input_tokens_seen": 57350575, "router_z_loss_clip": 3.26367188, "router_z_loss_mlp": 0.30175781, "step": 2647, "time_per_iteration": 2.6305108070373535 }, { "auxiliary_loss_clip": 0.01571646, "auxiliary_loss_mlp": 0.00263591, "balance_loss_clip": 1.2492609, "balance_loss_mlp": 0.23338288, "epoch": 0.1592063730647828, "flos": 17712507945600.0, "grad_norm": 67.61574811986122, "language_loss": 0.9643929, "learning_rate": 3.827418144547318e-06, "loss": 0.98274529, "num_input_tokens_seen": 57367570, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.30200195, "step": 2648, "time_per_iteration": 2.6609647274017334 }, { "auxiliary_loss_clip": 0.01573025, "auxiliary_loss_mlp": 0.00312912, "balance_loss_clip": 1.25080729, "balance_loss_mlp": 0.28333664, "epoch": 0.15926649631745077, "flos": 18803545603200.0, "grad_norm": 3.582292185688921, "language_loss": 0.97373873, "learning_rate": 3.827259844762114e-06, "loss": 0.99259818, "num_input_tokens_seen": 57383980, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.29577637, "step": 2649, "time_per_iteration": 2.6487786769866943 }, { "auxiliary_loss_clip": 0.01591808, "auxiliary_loss_mlp": 0.00346136, "balance_loss_clip": 1.24882817, "balance_loss_mlp": 0.31125581, "epoch": 0.15932661957011873, "flos": 17566243764480.0, "grad_norm": 3.3458307383052537, "language_loss": 0.82949138, "learning_rate": 3.827101475687033e-06, "loss": 0.84887081, "num_input_tokens_seen": 57400840, "router_z_loss_clip": 3.4296875, "router_z_loss_mlp": 0.34887695, "step": 2650, "time_per_iteration": 2.6652326583862305 }, { "auxiliary_loss_clip": 0.0157642, "auxiliary_loss_mlp": 0.00292448, "balance_loss_clip": 1.25345993, "balance_loss_mlp": 0.2608695, "epoch": 0.15938674282278673, "flos": 13334351011200.0, "grad_norm": 25.65695111808917, "language_loss": 0.79484904, "learning_rate": 3.826943037328082e-06, "loss": 0.81353766, "num_input_tokens_seen": 57419230, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.3157959, "step": 2651, "time_per_iteration": 2.622145175933838 }, { "auxiliary_loss_clip": 0.01574955, "auxiliary_loss_mlp": 0.0032264, "balance_loss_clip": 1.25005174, "balance_loss_mlp": 0.29083544, "epoch": 0.1594468660754547, "flos": 22488842119680.0, "grad_norm": 13.629683634613663, "language_loss": 0.85702252, "learning_rate": 3.8267845296912674e-06, "loss": 0.87599844, "num_input_tokens_seen": 57439315, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.31787109, "step": 2652, "time_per_iteration": 2.6910243034362793 }, { "auxiliary_loss_clip": 0.01587214, "auxiliary_loss_mlp": 0.0029708, "balance_loss_clip": 1.26191378, "balance_loss_mlp": 0.26825476, "epoch": 0.15950698932812266, "flos": 15007320910080.0, "grad_norm": 7.27668784656326, "language_loss": 0.79208094, "learning_rate": 3.826625952782601e-06, "loss": 0.81092387, "num_input_tokens_seen": 57454635, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.28820801, "step": 2653, "time_per_iteration": 2.583361864089966 }, { "auxiliary_loss_clip": 0.01571586, "auxiliary_loss_mlp": 0.00294018, "balance_loss_clip": 1.24422383, "balance_loss_mlp": 0.26235652, "epoch": 0.15956711258079062, "flos": 30155052084480.0, "grad_norm": 9.729465956418146, "language_loss": 0.86591345, "learning_rate": 3.826467306608095e-06, "loss": 0.88456953, "num_input_tokens_seen": 57476805, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.31652832, "step": 2654, "time_per_iteration": 2.774886131286621 }, { "auxiliary_loss_clip": 0.01569108, "auxiliary_loss_mlp": 0.00306644, "balance_loss_clip": 1.24487507, "balance_loss_mlp": 0.27686539, "epoch": 0.1596272358334586, "flos": 21032700670080.0, "grad_norm": 8.589924780455577, "language_loss": 0.86640328, "learning_rate": 3.826308591173765e-06, "loss": 0.88516068, "num_input_tokens_seen": 57496400, "router_z_loss_clip": 3.23828125, "router_z_loss_mlp": 0.29785156, "step": 2655, "time_per_iteration": 2.6170763969421387 }, { "auxiliary_loss_clip": 0.01567749, "auxiliary_loss_mlp": 0.00329643, "balance_loss_clip": 1.24568069, "balance_loss_mlp": 0.29979303, "epoch": 0.15968735908612655, "flos": 15268032800640.0, "grad_norm": 36.55571346816118, "language_loss": 0.80531311, "learning_rate": 3.826149806485631e-06, "loss": 0.82428706, "num_input_tokens_seen": 57513700, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.29821777, "step": 2656, "time_per_iteration": 2.659684658050537 }, { "auxiliary_loss_clip": 0.01563948, "auxiliary_loss_mlp": 0.00295472, "balance_loss_clip": 1.24710619, "balance_loss_mlp": 0.26657584, "epoch": 0.15974748233879452, "flos": 52665726695040.0, "grad_norm": 6.863134475577112, "language_loss": 0.83095384, "learning_rate": 3.825990952549713e-06, "loss": 0.8495481, "num_input_tokens_seen": 57536180, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.2890625, "step": 2657, "time_per_iteration": 2.9867470264434814 }, { "auxiliary_loss_clip": 0.01589598, "auxiliary_loss_mlp": 0.00336382, "balance_loss_clip": 1.2624377, "balance_loss_mlp": 0.30500564, "epoch": 0.1598076055914625, "flos": 18733232730240.0, "grad_norm": 424.18736051083096, "language_loss": 0.80669785, "learning_rate": 3.825832029372035e-06, "loss": 0.82595766, "num_input_tokens_seen": 57555025, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.3137207, "step": 2658, "time_per_iteration": 2.6460344791412354 }, { "auxiliary_loss_clip": 0.01578292, "auxiliary_loss_mlp": 0.00332919, "balance_loss_clip": 1.25049782, "balance_loss_mlp": 0.2992304, "epoch": 0.15986772884413047, "flos": 34349238535680.0, "grad_norm": 15.534975021043405, "language_loss": 0.81617999, "learning_rate": 3.825673036958624e-06, "loss": 0.8352921, "num_input_tokens_seen": 57577660, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.33691406, "step": 2659, "time_per_iteration": 2.7632296085357666 }, { "auxiliary_loss_clip": 0.01574144, "auxiliary_loss_mlp": 0.00373306, "balance_loss_clip": 1.24541855, "balance_loss_mlp": 0.33980861, "epoch": 0.15992785209679844, "flos": 22054969739520.0, "grad_norm": 12.74794777171614, "language_loss": 0.98307019, "learning_rate": 3.825513975315508e-06, "loss": 1.00254464, "num_input_tokens_seen": 57596335, "router_z_loss_clip": 3.28515625, "router_z_loss_mlp": 0.33520508, "step": 2660, "time_per_iteration": 2.6263175010681152 }, { "auxiliary_loss_clip": 0.01590633, "auxiliary_loss_mlp": 0.00339649, "balance_loss_clip": 1.2573601, "balance_loss_mlp": 0.30703324, "epoch": 0.1599879753494664, "flos": 33066652625280.0, "grad_norm": 7.270379550388851, "language_loss": 0.84005153, "learning_rate": 3.82535484444872e-06, "loss": 0.85935426, "num_input_tokens_seen": 57616830, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.32617188, "step": 2661, "time_per_iteration": 2.7240869998931885 }, { "auxiliary_loss_clip": 0.01575904, "auxiliary_loss_mlp": 0.0035141, "balance_loss_clip": 1.24956727, "balance_loss_mlp": 0.32027221, "epoch": 0.16004809860213437, "flos": 28038010343040.0, "grad_norm": 6.043222471438579, "language_loss": 0.78394473, "learning_rate": 3.825195644364292e-06, "loss": 0.80321789, "num_input_tokens_seen": 57635515, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.31091309, "step": 2662, "time_per_iteration": 2.700563669204712 }, { "auxiliary_loss_clip": 0.01571625, "auxiliary_loss_mlp": 0.00321206, "balance_loss_clip": 1.24054813, "balance_loss_mlp": 0.28849518, "epoch": 0.16010822185480234, "flos": 22780113505920.0, "grad_norm": 34.49566461666907, "language_loss": 0.89167869, "learning_rate": 3.825036375068263e-06, "loss": 0.91060698, "num_input_tokens_seen": 57654250, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.32739258, "step": 2663, "time_per_iteration": 2.6508285999298096 }, { "auxiliary_loss_clip": 0.01575064, "auxiliary_loss_mlp": 0.0034158, "balance_loss_clip": 1.24843001, "balance_loss_mlp": 0.30908376, "epoch": 0.16016834510747033, "flos": 20084012611200.0, "grad_norm": 3.093260048421951, "language_loss": 0.89060497, "learning_rate": 3.824877036566672e-06, "loss": 0.90977144, "num_input_tokens_seen": 57672645, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.32495117, "step": 2664, "time_per_iteration": 2.643136739730835 }, { "auxiliary_loss_clip": 0.01558862, "auxiliary_loss_mlp": 0.00325449, "balance_loss_clip": 1.23547769, "balance_loss_mlp": 0.29327431, "epoch": 0.1602284683601383, "flos": 21173829206400.0, "grad_norm": 84.10507895827095, "language_loss": 0.99097407, "learning_rate": 3.824717628865561e-06, "loss": 1.00981724, "num_input_tokens_seen": 57691055, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.32141113, "step": 2665, "time_per_iteration": 2.623013496398926 }, { "auxiliary_loss_clip": 0.01572031, "auxiliary_loss_mlp": 0.00306682, "balance_loss_clip": 1.24411285, "balance_loss_mlp": 0.2755447, "epoch": 0.16028859161280626, "flos": 14647568244480.0, "grad_norm": 104.06287967914122, "language_loss": 0.90355217, "learning_rate": 3.824558151970974e-06, "loss": 0.92233932, "num_input_tokens_seen": 57707235, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.3112793, "step": 2666, "time_per_iteration": 2.582685947418213 }, { "auxiliary_loss_clip": 0.01570422, "auxiliary_loss_mlp": 0.00342745, "balance_loss_clip": 1.24499702, "balance_loss_mlp": 0.31067723, "epoch": 0.16034871486547422, "flos": 20990325600000.0, "grad_norm": 21.760050002744773, "language_loss": 0.87986791, "learning_rate": 3.8243986058889595e-06, "loss": 0.89899957, "num_input_tokens_seen": 57724190, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.32080078, "step": 2667, "time_per_iteration": 2.63266921043396 }, { "auxiliary_loss_clip": 0.01590987, "auxiliary_loss_mlp": 0.00352187, "balance_loss_clip": 1.26556277, "balance_loss_mlp": 0.31873661, "epoch": 0.1604088381181422, "flos": 21397732634880.0, "grad_norm": 5.894846609074103, "language_loss": 0.810862, "learning_rate": 3.824238990625567e-06, "loss": 0.83029377, "num_input_tokens_seen": 57743620, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.33447266, "step": 2668, "time_per_iteration": 2.607182741165161 }, { "auxiliary_loss_clip": 0.01558201, "auxiliary_loss_mlp": 0.00358836, "balance_loss_clip": 1.23460722, "balance_loss_mlp": 0.32716221, "epoch": 0.16046896137081015, "flos": 23877040993920.0, "grad_norm": 69.74517656543513, "language_loss": 0.8304494, "learning_rate": 3.824079306186848e-06, "loss": 0.84961975, "num_input_tokens_seen": 57764810, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.31652832, "step": 2669, "time_per_iteration": 2.6844518184661865 }, { "auxiliary_loss_clip": 0.01614687, "auxiliary_loss_mlp": 0.00258412, "balance_loss_clip": 1.34819472, "balance_loss_mlp": 0.24854124, "epoch": 0.16052908462347812, "flos": 59806709015040.0, "grad_norm": 0.8106444371479665, "language_loss": 0.55457699, "learning_rate": 3.823919552578861e-06, "loss": 0.57330793, "num_input_tokens_seen": 57824390, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.09863281, "step": 2670, "time_per_iteration": 3.0058343410491943 }, { "auxiliary_loss_clip": 0.01556094, "auxiliary_loss_mlp": 0.00356893, "balance_loss_clip": 1.23311639, "balance_loss_mlp": 0.32430115, "epoch": 0.1605892078761461, "flos": 18296559089280.0, "grad_norm": 3.917739041216195, "language_loss": 0.85281831, "learning_rate": 3.82375972980766e-06, "loss": 0.87194812, "num_input_tokens_seen": 57843665, "router_z_loss_clip": 3.22851562, "router_z_loss_mlp": 0.32617188, "step": 2671, "time_per_iteration": 2.6324734687805176 }, { "auxiliary_loss_clip": 0.01563785, "auxiliary_loss_mlp": 0.00349293, "balance_loss_clip": 1.24001789, "balance_loss_mlp": 0.31708288, "epoch": 0.16064933112881408, "flos": 32160734686080.0, "grad_norm": 42.28605023878604, "language_loss": 0.72794318, "learning_rate": 3.8235998378793086e-06, "loss": 0.74707395, "num_input_tokens_seen": 57863305, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.32214355, "step": 2672, "time_per_iteration": 2.7562615871429443 }, { "auxiliary_loss_clip": 0.01546132, "auxiliary_loss_mlp": 0.00348406, "balance_loss_clip": 1.22496939, "balance_loss_mlp": 0.31545621, "epoch": 0.16070945438148204, "flos": 19828795501440.0, "grad_norm": 2.840082842040135, "language_loss": 0.93799025, "learning_rate": 3.8234398767998675e-06, "loss": 0.95693564, "num_input_tokens_seen": 57883025, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.32958984, "step": 2673, "time_per_iteration": 2.6335413455963135 }, { "auxiliary_loss_clip": 0.01551736, "auxiliary_loss_mlp": 0.0032407, "balance_loss_clip": 1.2321229, "balance_loss_mlp": 0.2938858, "epoch": 0.16076957763415, "flos": 18913144976640.0, "grad_norm": 31.29970428900479, "language_loss": 0.81051505, "learning_rate": 3.823279846575403e-06, "loss": 0.8292731, "num_input_tokens_seen": 57901430, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.30200195, "step": 2674, "time_per_iteration": 2.6648240089416504 }, { "auxiliary_loss_clip": 0.01519245, "auxiliary_loss_mlp": 0.00323296, "balance_loss_clip": 1.20640838, "balance_loss_mlp": 0.29410189, "epoch": 0.16082970088681797, "flos": 16764358590720.0, "grad_norm": 5.0200210509485474, "language_loss": 0.89212847, "learning_rate": 3.823119747211986e-06, "loss": 0.91055393, "num_input_tokens_seen": 57919550, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.29174805, "step": 2675, "time_per_iteration": 2.6344492435455322 }, { "auxiliary_loss_clip": 0.01540878, "auxiliary_loss_mlp": 0.00357451, "balance_loss_clip": 1.22136497, "balance_loss_mlp": 0.32411981, "epoch": 0.16088982413948594, "flos": 35150261783040.0, "grad_norm": 92.23674115353234, "language_loss": 0.88970441, "learning_rate": 3.822959578715685e-06, "loss": 0.90868771, "num_input_tokens_seen": 57939890, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.33325195, "step": 2676, "time_per_iteration": 2.8063149452209473 }, { "auxiliary_loss_clip": 0.01528491, "auxiliary_loss_mlp": 0.0032605, "balance_loss_clip": 1.21991777, "balance_loss_mlp": 0.29805934, "epoch": 0.1609499473921539, "flos": 18625105814400.0, "grad_norm": 1644.3029219872897, "language_loss": 0.80902827, "learning_rate": 3.822799341092573e-06, "loss": 0.82757366, "num_input_tokens_seen": 57957410, "router_z_loss_clip": 3.0859375, "router_z_loss_mlp": 0.27978516, "step": 2677, "time_per_iteration": 4.018671035766602 }, { "auxiliary_loss_clip": 0.01514041, "auxiliary_loss_mlp": 0.00309619, "balance_loss_clip": 1.20559525, "balance_loss_mlp": 0.27874357, "epoch": 0.1610100706448219, "flos": 33145728416640.0, "grad_norm": 100.98841025221115, "language_loss": 0.82716465, "learning_rate": 3.822639034348728e-06, "loss": 0.84540129, "num_input_tokens_seen": 57977900, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.30883789, "step": 2678, "time_per_iteration": 2.785815954208374 }, { "auxiliary_loss_clip": 0.01508048, "auxiliary_loss_mlp": 0.00354232, "balance_loss_clip": 1.19319367, "balance_loss_mlp": 0.3232491, "epoch": 0.16107019389748986, "flos": 34676707852800.0, "grad_norm": 223.4515652288639, "language_loss": 0.76176667, "learning_rate": 3.822478658490228e-06, "loss": 0.78038949, "num_input_tokens_seen": 57998210, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.30993652, "step": 2679, "time_per_iteration": 2.7988789081573486 }, { "auxiliary_loss_clip": 0.01545061, "auxiliary_loss_mlp": 0.00093612, "balance_loss_clip": 1.28559136, "balance_loss_mlp": 0.08393177, "epoch": 0.16113031715015783, "flos": 65713403260800.0, "grad_norm": 0.7656348627607401, "language_loss": 0.51709282, "learning_rate": 3.822318213523154e-06, "loss": 0.53347957, "num_input_tokens_seen": 58059420, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.09667969, "step": 2680, "time_per_iteration": 4.674199104309082 }, { "auxiliary_loss_clip": 0.01501354, "auxiliary_loss_mlp": 0.00363516, "balance_loss_clip": 1.18978953, "balance_loss_mlp": 0.33142501, "epoch": 0.1611904404028258, "flos": 20810413353600.0, "grad_norm": 28.31040329341951, "language_loss": 0.87912714, "learning_rate": 3.8221576994535925e-06, "loss": 0.89777583, "num_input_tokens_seen": 58078370, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.32080078, "step": 2681, "time_per_iteration": 4.036627292633057 }, { "auxiliary_loss_clip": 0.01501736, "auxiliary_loss_mlp": 0.00338649, "balance_loss_clip": 1.19294286, "balance_loss_mlp": 0.30923998, "epoch": 0.16125056365549376, "flos": 27013335062400.0, "grad_norm": 25.53976665953396, "language_loss": 0.75182319, "learning_rate": 3.821997116287627e-06, "loss": 0.77022696, "num_input_tokens_seen": 58097395, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.29431152, "step": 2682, "time_per_iteration": 2.7181761264801025 }, { "auxiliary_loss_clip": 0.01504543, "auxiliary_loss_mlp": 0.00334228, "balance_loss_clip": 1.19047189, "balance_loss_mlp": 0.30517656, "epoch": 0.16131068690816172, "flos": 19276524915840.0, "grad_norm": 79.47713798211194, "language_loss": 0.94243979, "learning_rate": 3.821836464031348e-06, "loss": 0.96082753, "num_input_tokens_seen": 58115630, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.29016113, "step": 2683, "time_per_iteration": 2.6084911823272705 }, { "auxiliary_loss_clip": 0.01485201, "auxiliary_loss_mlp": 0.00380955, "balance_loss_clip": 1.17942047, "balance_loss_mlp": 0.35136747, "epoch": 0.16137081016082971, "flos": 35337931367040.0, "grad_norm": 3.2506449100138823, "language_loss": 0.79911184, "learning_rate": 3.821675742690849e-06, "loss": 0.8177734, "num_input_tokens_seen": 58138655, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.29614258, "step": 2684, "time_per_iteration": 4.145632266998291 }, { "auxiliary_loss_clip": 0.01494012, "auxiliary_loss_mlp": 0.00352254, "balance_loss_clip": 1.18417799, "balance_loss_mlp": 0.32090202, "epoch": 0.16143093341349768, "flos": 34235257703040.0, "grad_norm": 5.650808995633419, "language_loss": 0.78644603, "learning_rate": 3.821514952272223e-06, "loss": 0.80490863, "num_input_tokens_seen": 58157440, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.31347656, "step": 2685, "time_per_iteration": 2.7523105144500732 }, { "auxiliary_loss_clip": 0.01493039, "auxiliary_loss_mlp": 0.00365117, "balance_loss_clip": 1.18247318, "balance_loss_mlp": 0.33479014, "epoch": 0.16149105666616564, "flos": 27999262546560.0, "grad_norm": 88.80787229962928, "language_loss": 0.77555597, "learning_rate": 3.821354092781567e-06, "loss": 0.7941376, "num_input_tokens_seen": 58176660, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.3034668, "step": 2686, "time_per_iteration": 2.6700022220611572 }, { "auxiliary_loss_clip": 0.01489696, "auxiliary_loss_mlp": 0.00368774, "balance_loss_clip": 1.17607188, "balance_loss_mlp": 0.33768457, "epoch": 0.1615511799188336, "flos": 19422214479360.0, "grad_norm": 4.186947625028718, "language_loss": 0.88581407, "learning_rate": 3.821193164224981e-06, "loss": 0.9043988, "num_input_tokens_seen": 58195085, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.31079102, "step": 2687, "time_per_iteration": 2.725583553314209 }, { "auxiliary_loss_clip": 0.01485894, "auxiliary_loss_mlp": 0.00354146, "balance_loss_clip": 1.17319274, "balance_loss_mlp": 0.32172066, "epoch": 0.16161130317150157, "flos": 22854915578880.0, "grad_norm": 37.2355222020744, "language_loss": 0.79072726, "learning_rate": 3.821032166608568e-06, "loss": 0.80912769, "num_input_tokens_seen": 58213540, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.32446289, "step": 2688, "time_per_iteration": 2.6536107063293457 }, { "auxiliary_loss_clip": 0.01475126, "auxiliary_loss_mlp": 0.00368322, "balance_loss_clip": 1.166134, "balance_loss_mlp": 0.3367793, "epoch": 0.16167142642416954, "flos": 26110577520000.0, "grad_norm": 695.9941335939913, "language_loss": 0.79992294, "learning_rate": 3.8208710999384325e-06, "loss": 0.81835735, "num_input_tokens_seen": 58236995, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.31518555, "step": 2689, "time_per_iteration": 2.770781993865967 }, { "auxiliary_loss_clip": 0.01491304, "auxiliary_loss_mlp": 0.00341294, "balance_loss_clip": 1.18214345, "balance_loss_mlp": 0.31281522, "epoch": 0.1617315496768375, "flos": 22779646629120.0, "grad_norm": 7.5926790475690495, "language_loss": 0.92410958, "learning_rate": 3.820709964220683e-06, "loss": 0.9424355, "num_input_tokens_seen": 58257230, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.28491211, "step": 2690, "time_per_iteration": 2.6786348819732666 }, { "auxiliary_loss_clip": 0.01487629, "auxiliary_loss_mlp": 0.00360201, "balance_loss_clip": 1.17780983, "balance_loss_mlp": 0.33172178, "epoch": 0.1617916729295055, "flos": 22017299351040.0, "grad_norm": 7.711460672445254, "language_loss": 0.92270446, "learning_rate": 3.8205487594614284e-06, "loss": 0.94118279, "num_input_tokens_seen": 58277080, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.28479004, "step": 2691, "time_per_iteration": 2.643651008605957 }, { "auxiliary_loss_clip": 0.01497395, "auxiliary_loss_mlp": 0.0035929, "balance_loss_clip": 1.18159211, "balance_loss_mlp": 0.32679379, "epoch": 0.16185179618217346, "flos": 23438248450560.0, "grad_norm": 5.860582725108861, "language_loss": 0.88634086, "learning_rate": 3.820387485666784e-06, "loss": 0.90490764, "num_input_tokens_seen": 58294815, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.32470703, "step": 2692, "time_per_iteration": 2.652879238128662 }, { "auxiliary_loss_clip": 0.01503198, "auxiliary_loss_mlp": 0.0033079, "balance_loss_clip": 1.18503058, "balance_loss_mlp": 0.29888949, "epoch": 0.16191191943484143, "flos": 25666110627840.0, "grad_norm": 90.94622664893245, "language_loss": 0.9040885, "learning_rate": 3.820226142842862e-06, "loss": 0.92242843, "num_input_tokens_seen": 58313215, "router_z_loss_clip": 3.1796875, "router_z_loss_mlp": 0.31872559, "step": 2693, "time_per_iteration": 2.666032552719116 }, { "auxiliary_loss_clip": 0.01474381, "auxiliary_loss_mlp": 0.00274449, "balance_loss_clip": 1.16376472, "balance_loss_mlp": 0.24951032, "epoch": 0.1619720426875094, "flos": 23477355383040.0, "grad_norm": 4.46592753887704, "language_loss": 0.91420865, "learning_rate": 3.820064730995783e-06, "loss": 0.93169695, "num_input_tokens_seen": 58333215, "router_z_loss_clip": 3.10351562, "router_z_loss_mlp": 0.24938965, "step": 2694, "time_per_iteration": 2.8285701274871826 }, { "auxiliary_loss_clip": 0.01491694, "auxiliary_loss_mlp": 0.00375932, "balance_loss_clip": 1.17725515, "balance_loss_mlp": 0.34659445, "epoch": 0.16203216594017736, "flos": 24133658734080.0, "grad_norm": 29.479188773876324, "language_loss": 0.76774526, "learning_rate": 3.819903250131667e-06, "loss": 0.78642154, "num_input_tokens_seen": 58351160, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.29333496, "step": 2695, "time_per_iteration": 2.6555259227752686 }, { "auxiliary_loss_clip": 0.01490895, "auxiliary_loss_mlp": 0.00348358, "balance_loss_clip": 1.17483401, "balance_loss_mlp": 0.3176139, "epoch": 0.16209228919284532, "flos": 22340889999360.0, "grad_norm": 61.2603312191152, "language_loss": 0.89614302, "learning_rate": 3.819741700256637e-06, "loss": 0.91453552, "num_input_tokens_seen": 58368505, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.30749512, "step": 2696, "time_per_iteration": 2.689598560333252 }, { "auxiliary_loss_clip": 0.01500437, "auxiliary_loss_mlp": 0.00391026, "balance_loss_clip": 1.17920876, "balance_loss_mlp": 0.35814798, "epoch": 0.1621524124455133, "flos": 15815131827840.0, "grad_norm": 4.770994033786437, "language_loss": 1.00805831, "learning_rate": 3.8195800813768194e-06, "loss": 1.02697289, "num_input_tokens_seen": 58385085, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.32885742, "step": 2697, "time_per_iteration": 2.5794692039489746 }, { "auxiliary_loss_clip": 0.01473411, "auxiliary_loss_mlp": 0.00318192, "balance_loss_clip": 1.16698444, "balance_loss_mlp": 0.29063115, "epoch": 0.16221253569818128, "flos": 30186688988160.0, "grad_norm": 13.425801390947946, "language_loss": 0.86699176, "learning_rate": 3.819418393498343e-06, "loss": 0.88490772, "num_input_tokens_seen": 58406985, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.2755127, "step": 2698, "time_per_iteration": 2.7508957386016846 }, { "auxiliary_loss_clip": 0.0148537, "auxiliary_loss_mlp": 0.00357338, "balance_loss_clip": 1.18238366, "balance_loss_mlp": 0.32761919, "epoch": 0.16227265895084925, "flos": 24605991601920.0, "grad_norm": 3.924991351855062, "language_loss": 0.82409328, "learning_rate": 3.819256636627339e-06, "loss": 0.84252036, "num_input_tokens_seen": 58426205, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.29699707, "step": 2699, "time_per_iteration": 2.72969913482666 }, { "auxiliary_loss_clip": 0.01480903, "auxiliary_loss_mlp": 0.0032273, "balance_loss_clip": 1.17293429, "balance_loss_mlp": 0.29547843, "epoch": 0.1623327822035172, "flos": 19573326996480.0, "grad_norm": 3.505079491547812, "language_loss": 0.93516052, "learning_rate": 3.81909481076994e-06, "loss": 0.95319676, "num_input_tokens_seen": 58443830, "router_z_loss_clip": 3.07617188, "router_z_loss_mlp": 0.27258301, "step": 2700, "time_per_iteration": 2.744044065475464 }, { "auxiliary_loss_clip": 0.01468434, "auxiliary_loss_mlp": 0.00299343, "balance_loss_clip": 1.16150403, "balance_loss_mlp": 0.27064919, "epoch": 0.16239290545618518, "flos": 26468462678400.0, "grad_norm": 144.93768435477512, "language_loss": 0.85072631, "learning_rate": 3.818932915932284e-06, "loss": 0.86840409, "num_input_tokens_seen": 58464405, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.28674316, "step": 2701, "time_per_iteration": 2.73068904876709 }, { "auxiliary_loss_clip": 0.01476671, "auxiliary_loss_mlp": 0.00344652, "balance_loss_clip": 1.16599941, "balance_loss_mlp": 0.31282273, "epoch": 0.16245302870885314, "flos": 15851940289920.0, "grad_norm": 41.42322094182128, "language_loss": 0.80711383, "learning_rate": 3.818770952120511e-06, "loss": 0.82532704, "num_input_tokens_seen": 58483295, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.31811523, "step": 2702, "time_per_iteration": 2.639944076538086 }, { "auxiliary_loss_clip": 0.01481008, "auxiliary_loss_mlp": 0.00313635, "balance_loss_clip": 1.172719, "balance_loss_mlp": 0.28390419, "epoch": 0.1625131519615211, "flos": 14756521173120.0, "grad_norm": 2.83468396014323, "language_loss": 0.81229645, "learning_rate": 3.81860891934076e-06, "loss": 0.83024287, "num_input_tokens_seen": 58501205, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.29760742, "step": 2703, "time_per_iteration": 2.582578182220459 }, { "auxiliary_loss_clip": 0.01488281, "auxiliary_loss_mlp": 0.00321263, "balance_loss_clip": 1.17740321, "balance_loss_mlp": 0.28907627, "epoch": 0.1625732752141891, "flos": 28220508368640.0, "grad_norm": 3.1986356517563084, "language_loss": 0.76752788, "learning_rate": 3.818446817599176e-06, "loss": 0.78562331, "num_input_tokens_seen": 58522315, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.32177734, "step": 2704, "time_per_iteration": 2.7193050384521484 }, { "auxiliary_loss_clip": 0.0149688, "auxiliary_loss_mlp": 0.00056253, "balance_loss_clip": 1.25535989, "balance_loss_mlp": 0.04576214, "epoch": 0.16263339846685707, "flos": 67327947688320.0, "grad_norm": 0.774855822536366, "language_loss": 0.53273785, "learning_rate": 3.818284646901907e-06, "loss": 0.54826915, "num_input_tokens_seen": 58586695, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.10498047, "step": 2705, "time_per_iteration": 3.117077350616455 }, { "auxiliary_loss_clip": 0.01493922, "auxiliary_loss_mlp": 0.00326283, "balance_loss_clip": 1.18135762, "balance_loss_mlp": 0.29630196, "epoch": 0.16269352171952503, "flos": 14319165173760.0, "grad_norm": 4.05398422709656, "language_loss": 0.8509953, "learning_rate": 3.818122407255102e-06, "loss": 0.86919737, "num_input_tokens_seen": 58602435, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.29968262, "step": 2706, "time_per_iteration": 2.5767924785614014 }, { "auxiliary_loss_clip": 0.01483845, "auxiliary_loss_mlp": 0.00371876, "balance_loss_clip": 1.17664731, "balance_loss_mlp": 0.34164411, "epoch": 0.162753644972193, "flos": 28361205941760.0, "grad_norm": 3.1104604119048274, "language_loss": 0.79787266, "learning_rate": 3.817960098664914e-06, "loss": 0.81642985, "num_input_tokens_seen": 58621275, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.30224609, "step": 2707, "time_per_iteration": 2.675731897354126 }, { "auxiliary_loss_clip": 0.01527578, "auxiliary_loss_mlp": 0.00359955, "balance_loss_clip": 1.21470177, "balance_loss_mlp": 0.32888907, "epoch": 0.16281376822486096, "flos": 19937856170880.0, "grad_norm": 7.252683704048497, "language_loss": 0.90741777, "learning_rate": 3.817797721137495e-06, "loss": 0.92629313, "num_input_tokens_seen": 58637550, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.31054688, "step": 2708, "time_per_iteration": 2.578489303588867 }, { "auxiliary_loss_clip": 0.01521617, "auxiliary_loss_mlp": 0.00334664, "balance_loss_clip": 1.20661044, "balance_loss_mlp": 0.30185804, "epoch": 0.16287389147752893, "flos": 21251719848960.0, "grad_norm": 4.278918636421453, "language_loss": 0.94295567, "learning_rate": 3.817635274679006e-06, "loss": 0.96151853, "num_input_tokens_seen": 58654135, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.328125, "step": 2709, "time_per_iteration": 2.643327474594116 }, { "auxiliary_loss_clip": 0.01525925, "auxiliary_loss_mlp": 0.00361664, "balance_loss_clip": 1.20881009, "balance_loss_mlp": 0.33035967, "epoch": 0.1629340147301969, "flos": 19244672530560.0, "grad_norm": 8.831786939509964, "language_loss": 0.96190214, "learning_rate": 3.817472759295605e-06, "loss": 0.98077798, "num_input_tokens_seen": 58674320, "router_z_loss_clip": 3.16992188, "router_z_loss_mlp": 0.31323242, "step": 2710, "time_per_iteration": 2.6302928924560547 }, { "auxiliary_loss_clip": 0.01532237, "auxiliary_loss_mlp": 0.00317204, "balance_loss_clip": 1.21722209, "balance_loss_mlp": 0.2853992, "epoch": 0.16299413798286488, "flos": 21249816428160.0, "grad_norm": 10.49339854084623, "language_loss": 0.89061755, "learning_rate": 3.817310174993453e-06, "loss": 0.90911186, "num_input_tokens_seen": 58691000, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.31762695, "step": 2711, "time_per_iteration": 2.6194944381713867 }, { "auxiliary_loss_clip": 0.0153955, "auxiliary_loss_mlp": 0.00319387, "balance_loss_clip": 1.21803784, "balance_loss_mlp": 0.28817818, "epoch": 0.16305426123553285, "flos": 18770579896320.0, "grad_norm": 7.595074077114739, "language_loss": 0.89767945, "learning_rate": 3.817147521778719e-06, "loss": 0.91626883, "num_input_tokens_seen": 58710230, "router_z_loss_clip": 3.21679688, "router_z_loss_mlp": 0.31176758, "step": 2712, "time_per_iteration": 2.690333366394043 }, { "auxiliary_loss_clip": 0.01536871, "auxiliary_loss_mlp": 0.00327091, "balance_loss_clip": 1.22118711, "balance_loss_mlp": 0.29600126, "epoch": 0.16311438448820081, "flos": 22087648137600.0, "grad_norm": 51.28455592055306, "language_loss": 0.83963013, "learning_rate": 3.816984799657568e-06, "loss": 0.85826981, "num_input_tokens_seen": 58728610, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.31079102, "step": 2713, "time_per_iteration": 2.690988063812256 }, { "auxiliary_loss_clip": 0.01546404, "auxiliary_loss_mlp": 0.00309744, "balance_loss_clip": 1.23403871, "balance_loss_mlp": 0.2797513, "epoch": 0.16317450774086878, "flos": 16467700164480.0, "grad_norm": 59.14708318536763, "language_loss": 0.88047516, "learning_rate": 3.8168220086361715e-06, "loss": 0.89903665, "num_input_tokens_seen": 58744385, "router_z_loss_clip": 3.12304688, "router_z_loss_mlp": 0.29980469, "step": 2714, "time_per_iteration": 2.6443002223968506 }, { "auxiliary_loss_clip": 0.01537956, "auxiliary_loss_mlp": 0.00336149, "balance_loss_clip": 1.22752881, "balance_loss_mlp": 0.30596554, "epoch": 0.16323463099353674, "flos": 24352929308160.0, "grad_norm": 31.848344689164975, "language_loss": 0.83808923, "learning_rate": 3.816659148720702e-06, "loss": 0.85683024, "num_input_tokens_seen": 58763905, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.30200195, "step": 2715, "time_per_iteration": 2.684872627258301 }, { "auxiliary_loss_clip": 0.0151838, "auxiliary_loss_mlp": 0.00317532, "balance_loss_clip": 1.20830309, "balance_loss_mlp": 0.287682, "epoch": 0.1632947542462047, "flos": 24900782520960.0, "grad_norm": 69.27278249030107, "language_loss": 0.90019464, "learning_rate": 3.816496219917336e-06, "loss": 0.91855371, "num_input_tokens_seen": 58785580, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.29858398, "step": 2716, "time_per_iteration": 2.7345104217529297 }, { "auxiliary_loss_clip": 0.01517208, "auxiliary_loss_mlp": 0.00352637, "balance_loss_clip": 1.20465088, "balance_loss_mlp": 0.32091558, "epoch": 0.1633548774988727, "flos": 24900279730560.0, "grad_norm": 7573.817811784336, "language_loss": 0.92966998, "learning_rate": 3.816333222232251e-06, "loss": 0.94836837, "num_input_tokens_seen": 58806075, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.31750488, "step": 2717, "time_per_iteration": 2.7030417919158936 }, { "auxiliary_loss_clip": 0.01501098, "auxiliary_loss_mlp": 0.00414825, "balance_loss_clip": 1.19563258, "balance_loss_mlp": 0.38385475, "epoch": 0.16341500075154067, "flos": 30441798357120.0, "grad_norm": 8.630009765511026, "language_loss": 0.84394771, "learning_rate": 3.816170155671629e-06, "loss": 0.86310697, "num_input_tokens_seen": 58827405, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.30957031, "step": 2718, "time_per_iteration": 2.75041127204895 }, { "auxiliary_loss_clip": 0.01494856, "auxiliary_loss_mlp": 0.00486355, "balance_loss_clip": 1.19174278, "balance_loss_mlp": 0.45378727, "epoch": 0.16347512400420863, "flos": 22784530878720.0, "grad_norm": 19.929234099519352, "language_loss": 0.8170321, "learning_rate": 3.816007020241652e-06, "loss": 0.83684421, "num_input_tokens_seen": 58847205, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.32568359, "step": 2719, "time_per_iteration": 4.039881944656372 }, { "auxiliary_loss_clip": 0.01490797, "auxiliary_loss_mlp": 0.00524836, "balance_loss_clip": 1.19029224, "balance_loss_mlp": 0.49202994, "epoch": 0.1635352472568766, "flos": 22633274707200.0, "grad_norm": 9.227010051049746, "language_loss": 0.77038616, "learning_rate": 3.815843815948507e-06, "loss": 0.79054248, "num_input_tokens_seen": 58866865, "router_z_loss_clip": 3.00585938, "router_z_loss_mlp": 0.328125, "step": 2720, "time_per_iteration": 2.6892387866973877 }, { "auxiliary_loss_clip": 0.01478013, "auxiliary_loss_mlp": 0.0060964, "balance_loss_clip": 1.1804626, "balance_loss_mlp": 0.57151645, "epoch": 0.16359537050954456, "flos": 15522998515200.0, "grad_norm": 45.44461887361915, "language_loss": 0.83546692, "learning_rate": 3.8156805427983824e-06, "loss": 0.85634345, "num_input_tokens_seen": 58885200, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.38110352, "step": 2721, "time_per_iteration": 2.578968048095703 }, { "auxiliary_loss_clip": 0.01496874, "auxiliary_loss_mlp": 0.00604759, "balance_loss_clip": 1.18958092, "balance_loss_mlp": 0.56635022, "epoch": 0.16365549376221253, "flos": 22090162089600.0, "grad_norm": 4.765991255892765, "language_loss": 0.83918065, "learning_rate": 3.8155172007974695e-06, "loss": 0.86019701, "num_input_tokens_seen": 58906385, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.38427734, "step": 2722, "time_per_iteration": 2.6746537685394287 }, { "auxiliary_loss_clip": 0.01509678, "auxiliary_loss_mlp": 0.00663292, "balance_loss_clip": 1.20350432, "balance_loss_mlp": 0.6205914, "epoch": 0.1637156170148805, "flos": 24060400945920.0, "grad_norm": 26.9548465627625, "language_loss": 0.92477477, "learning_rate": 3.8153537899519624e-06, "loss": 0.94650447, "num_input_tokens_seen": 58925040, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.42700195, "step": 2723, "time_per_iteration": 5.510082006454468 }, { "auxiliary_loss_clip": 0.01486114, "auxiliary_loss_mlp": 0.0067569, "balance_loss_clip": 1.19219112, "balance_loss_mlp": 0.63692343, "epoch": 0.1637757402675485, "flos": 26685362954880.0, "grad_norm": 4.1000060520368145, "language_loss": 0.76033413, "learning_rate": 3.815190310268058e-06, "loss": 0.78195214, "num_input_tokens_seen": 58944790, "router_z_loss_clip": 2.93945312, "router_z_loss_mlp": 0.38793945, "step": 2724, "time_per_iteration": 2.698376417160034 }, { "auxiliary_loss_clip": 0.01494247, "auxiliary_loss_mlp": 0.00722343, "balance_loss_clip": 1.20127964, "balance_loss_mlp": 0.68078679, "epoch": 0.16383586352021645, "flos": 16106941918080.0, "grad_norm": 16.709666065525667, "language_loss": 0.77340174, "learning_rate": 3.815026761751955e-06, "loss": 0.79556763, "num_input_tokens_seen": 58962500, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.4152832, "step": 2725, "time_per_iteration": 2.6056911945343018 }, { "auxiliary_loss_clip": 0.01488647, "auxiliary_loss_mlp": 0.00687636, "balance_loss_clip": 1.19494009, "balance_loss_mlp": 0.64593613, "epoch": 0.16389598677288442, "flos": 19165991788800.0, "grad_norm": 16.77786495887254, "language_loss": 0.92969573, "learning_rate": 3.814863144409855e-06, "loss": 0.95145857, "num_input_tokens_seen": 58980355, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.41748047, "step": 2726, "time_per_iteration": 4.062068462371826 }, { "auxiliary_loss_clip": 0.01512188, "auxiliary_loss_mlp": 0.00767425, "balance_loss_clip": 1.21410418, "balance_loss_mlp": 0.72529638, "epoch": 0.16395611002555238, "flos": 21507008785920.0, "grad_norm": 44.43119539631893, "language_loss": 0.78856421, "learning_rate": 3.814699458247963e-06, "loss": 0.81136036, "num_input_tokens_seen": 58999505, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.42138672, "step": 2727, "time_per_iteration": 2.622298002243042 }, { "auxiliary_loss_clip": 0.01508921, "auxiliary_loss_mlp": 0.0078251, "balance_loss_clip": 1.21662354, "balance_loss_mlp": 0.74154997, "epoch": 0.16401623327822035, "flos": 21470918595840.0, "grad_norm": 67.47763260228845, "language_loss": 0.88574898, "learning_rate": 3.8145357032724855e-06, "loss": 0.90866327, "num_input_tokens_seen": 59017930, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.40966797, "step": 2728, "time_per_iteration": 2.61509108543396 }, { "auxiliary_loss_clip": 0.01500354, "auxiliary_loss_mlp": 0.00742427, "balance_loss_clip": 1.20129681, "balance_loss_mlp": 0.6977948, "epoch": 0.1640763565308883, "flos": 13626232928640.0, "grad_norm": 19.596142921214614, "language_loss": 0.92963934, "learning_rate": 3.814371879489633e-06, "loss": 0.95206714, "num_input_tokens_seen": 59035130, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.44702148, "step": 2729, "time_per_iteration": 2.592879295349121 }, { "auxiliary_loss_clip": 0.01511738, "auxiliary_loss_mlp": 0.00723265, "balance_loss_clip": 1.21525383, "balance_loss_mlp": 0.6802305, "epoch": 0.16413647978355628, "flos": 15451464579840.0, "grad_norm": 5.415165963883723, "language_loss": 0.81626475, "learning_rate": 3.814207986905616e-06, "loss": 0.83861476, "num_input_tokens_seen": 59053080, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.43041992, "step": 2730, "time_per_iteration": 2.5956099033355713 }, { "auxiliary_loss_clip": 0.0150523, "auxiliary_loss_mlp": 0.00787382, "balance_loss_clip": 1.20745564, "balance_loss_mlp": 0.74027008, "epoch": 0.16419660303622427, "flos": 45878682015360.0, "grad_norm": 4.182912979124772, "language_loss": 0.81178451, "learning_rate": 3.814044025526651e-06, "loss": 0.83471054, "num_input_tokens_seen": 59075610, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.47167969, "step": 2731, "time_per_iteration": 2.8144614696502686 }, { "auxiliary_loss_clip": 0.01499421, "auxiliary_loss_mlp": 0.00751234, "balance_loss_clip": 1.20190048, "balance_loss_mlp": 0.70479012, "epoch": 0.16425672628889224, "flos": 18952826526720.0, "grad_norm": 508.96703258864056, "language_loss": 0.85445988, "learning_rate": 3.8138799953589548e-06, "loss": 0.87696648, "num_input_tokens_seen": 59094555, "router_z_loss_clip": 2.9765625, "router_z_loss_mlp": 0.46459961, "step": 2732, "time_per_iteration": 2.631650924682617 }, { "auxiliary_loss_clip": 0.015121, "auxiliary_loss_mlp": 0.00771298, "balance_loss_clip": 1.21442461, "balance_loss_mlp": 0.72521174, "epoch": 0.1643168495415602, "flos": 24312996362880.0, "grad_norm": 11.22616128029311, "language_loss": 0.74435639, "learning_rate": 3.8137158964087473e-06, "loss": 0.7671904, "num_input_tokens_seen": 59113515, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.46118164, "step": 2733, "time_per_iteration": 2.767143964767456 }, { "auxiliary_loss_clip": 0.01499214, "auxiliary_loss_mlp": 0.00769566, "balance_loss_clip": 1.20607233, "balance_loss_mlp": 0.72445738, "epoch": 0.16437697279422817, "flos": 26428421992320.0, "grad_norm": 3.2040007811817697, "language_loss": 0.8586145, "learning_rate": 3.8135517286822508e-06, "loss": 0.88130236, "num_input_tokens_seen": 59133275, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.45092773, "step": 2734, "time_per_iteration": 2.649622678756714 }, { "auxiliary_loss_clip": 0.01498412, "auxiliary_loss_mlp": 0.00705137, "balance_loss_clip": 1.20352888, "balance_loss_mlp": 0.66274655, "epoch": 0.16443709604689613, "flos": 34532239351680.0, "grad_norm": 3.608109154713085, "language_loss": 0.90270656, "learning_rate": 3.8133874921856914e-06, "loss": 0.9247421, "num_input_tokens_seen": 59154095, "router_z_loss_clip": 2.9453125, "router_z_loss_mlp": 0.42407227, "step": 2735, "time_per_iteration": 2.803454637527466 }, { "auxiliary_loss_clip": 0.01502671, "auxiliary_loss_mlp": 0.00634138, "balance_loss_clip": 1.21297789, "balance_loss_mlp": 0.59606278, "epoch": 0.1644972192995641, "flos": 23258048895360.0, "grad_norm": 21.164701853935167, "language_loss": 0.86329299, "learning_rate": 3.813223186925296e-06, "loss": 0.88466108, "num_input_tokens_seen": 59173795, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.38110352, "step": 2736, "time_per_iteration": 2.5941431522369385 }, { "auxiliary_loss_clip": 0.01520267, "auxiliary_loss_mlp": 0.00555395, "balance_loss_clip": 1.23267269, "balance_loss_mlp": 0.51906013, "epoch": 0.1645573425522321, "flos": 26979543342720.0, "grad_norm": 1233.5714948751527, "language_loss": 0.85806489, "learning_rate": 3.8130588129072964e-06, "loss": 0.87882155, "num_input_tokens_seen": 59191610, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.36303711, "step": 2737, "time_per_iteration": 2.7108564376831055 }, { "auxiliary_loss_clip": 0.01516658, "auxiliary_loss_mlp": 0.00526131, "balance_loss_clip": 1.22384238, "balance_loss_mlp": 0.49094027, "epoch": 0.16461746580490005, "flos": 28731768600960.0, "grad_norm": 2357.4209483279783, "language_loss": 0.92703778, "learning_rate": 3.8128943701379246e-06, "loss": 0.94746566, "num_input_tokens_seen": 59213000, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.35180664, "step": 2738, "time_per_iteration": 2.7346370220184326 }, { "auxiliary_loss_clip": 0.01499073, "auxiliary_loss_mlp": 0.00494643, "balance_loss_clip": 1.20406806, "balance_loss_mlp": 0.46116859, "epoch": 0.16467758905756802, "flos": 24930156867840.0, "grad_norm": 53.364908275663474, "language_loss": 0.76337326, "learning_rate": 3.8127298586234167e-06, "loss": 0.78331041, "num_input_tokens_seen": 59232340, "router_z_loss_clip": 2.94726562, "router_z_loss_mlp": 0.3347168, "step": 2739, "time_per_iteration": 2.6788041591644287 }, { "auxiliary_loss_clip": 0.01507913, "auxiliary_loss_mlp": 0.00483519, "balance_loss_clip": 1.21390259, "balance_loss_mlp": 0.44992566, "epoch": 0.16473771231023598, "flos": 24826519152000.0, "grad_norm": 7.415243957160203, "language_loss": 0.88038731, "learning_rate": 3.8125652783700104e-06, "loss": 0.90030158, "num_input_tokens_seen": 59253950, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.3359375, "step": 2740, "time_per_iteration": 2.663074254989624 }, { "auxiliary_loss_clip": 0.0152237, "auxiliary_loss_mlp": 0.00506633, "balance_loss_clip": 1.22777677, "balance_loss_mlp": 0.47175264, "epoch": 0.16479783556290395, "flos": 39896072375040.0, "grad_norm": 65.14322682502963, "language_loss": 0.7745229, "learning_rate": 3.8124006293839475e-06, "loss": 0.79481292, "num_input_tokens_seen": 59275545, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.34887695, "step": 2741, "time_per_iteration": 2.8516757488250732 }, { "auxiliary_loss_clip": 0.01519222, "auxiliary_loss_mlp": 0.00464428, "balance_loss_clip": 1.22273457, "balance_loss_mlp": 0.43155015, "epoch": 0.16485795881557191, "flos": 19897061299200.0, "grad_norm": 7.524343480413302, "language_loss": 0.85168797, "learning_rate": 3.812235911671472e-06, "loss": 0.87152445, "num_input_tokens_seen": 59293480, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.32885742, "step": 2742, "time_per_iteration": 2.623344659805298 }, { "auxiliary_loss_clip": 0.01531458, "auxiliary_loss_mlp": 0.00425975, "balance_loss_clip": 1.23353493, "balance_loss_mlp": 0.3955524, "epoch": 0.16491808206823988, "flos": 20556129997440.0, "grad_norm": 16.06150438959695, "language_loss": 0.89797938, "learning_rate": 3.8120711252388274e-06, "loss": 0.91755372, "num_input_tokens_seen": 59313435, "router_z_loss_clip": 2.9765625, "router_z_loss_mlp": 0.30444336, "step": 2743, "time_per_iteration": 2.627279043197632 }, { "auxiliary_loss_clip": 0.01501605, "auxiliary_loss_mlp": 0.00458106, "balance_loss_clip": 1.20581722, "balance_loss_mlp": 0.42534706, "epoch": 0.16497820532090787, "flos": 23800802376960.0, "grad_norm": 8.222120306605389, "language_loss": 0.90509182, "learning_rate": 3.811906270092265e-06, "loss": 0.92468894, "num_input_tokens_seen": 59331535, "router_z_loss_clip": 2.95507812, "router_z_loss_mlp": 0.32739258, "step": 2744, "time_per_iteration": 2.6992557048797607 }, { "auxiliary_loss_clip": 0.01496332, "auxiliary_loss_mlp": 0.00414858, "balance_loss_clip": 1.2007879, "balance_loss_mlp": 0.38786942, "epoch": 0.16503832857357584, "flos": 25482642935040.0, "grad_norm": 8.250385815604746, "language_loss": 0.8796792, "learning_rate": 3.811741346238036e-06, "loss": 0.89879107, "num_input_tokens_seen": 59350680, "router_z_loss_clip": 2.95117188, "router_z_loss_mlp": 0.26965332, "step": 2745, "time_per_iteration": 2.610992670059204 }, { "auxiliary_loss_clip": 0.01518094, "auxiliary_loss_mlp": 0.00469632, "balance_loss_clip": 1.22074008, "balance_loss_mlp": 0.43583575, "epoch": 0.1650984518262438, "flos": 17676058619520.0, "grad_norm": 419.22654607005035, "language_loss": 0.8147589, "learning_rate": 3.8115763536823923e-06, "loss": 0.83463615, "num_input_tokens_seen": 59367020, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 0.3380127, "step": 2746, "time_per_iteration": 2.596585750579834 }, { "auxiliary_loss_clip": 0.0152074, "auxiliary_loss_mlp": 0.00427491, "balance_loss_clip": 1.22142982, "balance_loss_mlp": 0.39494729, "epoch": 0.16515857507891177, "flos": 18698327688960.0, "grad_norm": 259.5125276363661, "language_loss": 0.85699075, "learning_rate": 3.811411292431592e-06, "loss": 0.87647307, "num_input_tokens_seen": 59386075, "router_z_loss_clip": 2.99023438, "router_z_loss_mlp": 0.32568359, "step": 2747, "time_per_iteration": 2.592313766479492 }, { "auxiliary_loss_clip": 0.01532142, "auxiliary_loss_mlp": 0.00426771, "balance_loss_clip": 1.23334384, "balance_loss_mlp": 0.39699274, "epoch": 0.16521869833157973, "flos": 15010481306880.0, "grad_norm": 17.751421239793256, "language_loss": 0.76698685, "learning_rate": 3.8112461624918945e-06, "loss": 0.78657597, "num_input_tokens_seen": 59402690, "router_z_loss_clip": 2.98828125, "router_z_loss_mlp": 0.29760742, "step": 2748, "time_per_iteration": 2.62016224861145 }, { "auxiliary_loss_clip": 0.01529937, "auxiliary_loss_mlp": 0.00450328, "balance_loss_clip": 1.2282759, "balance_loss_mlp": 0.42057347, "epoch": 0.1652788215842477, "flos": 22121152548480.0, "grad_norm": 2.5734466565648746, "language_loss": 0.9402017, "learning_rate": 3.811080963869561e-06, "loss": 0.96000433, "num_input_tokens_seen": 59421130, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.29736328, "step": 2749, "time_per_iteration": 2.5881943702697754 }, { "auxiliary_loss_clip": 0.01557474, "auxiliary_loss_mlp": 0.00517101, "balance_loss_clip": 1.25423551, "balance_loss_mlp": 0.480838, "epoch": 0.16533894483691566, "flos": 18333080242560.0, "grad_norm": 21.682374896252586, "language_loss": 0.84844917, "learning_rate": 3.8109156965708557e-06, "loss": 0.86919492, "num_input_tokens_seen": 59438970, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.36254883, "step": 2750, "time_per_iteration": 2.6236283779144287 }, { "auxiliary_loss_clip": 0.01547834, "auxiliary_loss_mlp": 0.00480992, "balance_loss_clip": 1.24434686, "balance_loss_mlp": 0.4478282, "epoch": 0.16539906808958366, "flos": 22382115834240.0, "grad_norm": 10.05253584043569, "language_loss": 1.01491499, "learning_rate": 3.8107503606020455e-06, "loss": 1.03520322, "num_input_tokens_seen": 59458510, "router_z_loss_clip": 3.03710938, "router_z_loss_mlp": 0.33178711, "step": 2751, "time_per_iteration": 2.5954554080963135 }, { "auxiliary_loss_clip": 0.01540075, "auxiliary_loss_mlp": 0.00472519, "balance_loss_clip": 1.23961258, "balance_loss_mlp": 0.43966442, "epoch": 0.16545919134225162, "flos": 22711093522560.0, "grad_norm": 36.96308707823127, "language_loss": 0.77336812, "learning_rate": 3.8105849559693997e-06, "loss": 0.79349411, "num_input_tokens_seen": 59477110, "router_z_loss_clip": 3.00585938, "router_z_loss_mlp": 0.32824707, "step": 2752, "time_per_iteration": 2.6553573608398438 }, { "auxiliary_loss_clip": 0.01680246, "auxiliary_loss_mlp": 0.00306021, "balance_loss_clip": 1.42792106, "balance_loss_mlp": 0.28418145, "epoch": 0.1655193145949196, "flos": 67802974076160.0, "grad_norm": 0.7586558537862054, "language_loss": 0.53750372, "learning_rate": 3.810419482679192e-06, "loss": 0.55736637, "num_input_tokens_seen": 59541155, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.21875, "step": 2753, "time_per_iteration": 3.200726270675659 }, { "auxiliary_loss_clip": 0.01561789, "auxiliary_loss_mlp": 0.00515637, "balance_loss_clip": 1.26514244, "balance_loss_mlp": 0.47925472, "epoch": 0.16557943784758755, "flos": 24280389792000.0, "grad_norm": 8.498209702674462, "language_loss": 0.81139821, "learning_rate": 3.8102539407376954e-06, "loss": 0.83217251, "num_input_tokens_seen": 59561155, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.36401367, "step": 2754, "time_per_iteration": 2.6174275875091553 }, { "auxiliary_loss_clip": 0.01581704, "auxiliary_loss_mlp": 0.00541582, "balance_loss_clip": 1.27486062, "balance_loss_mlp": 0.5073455, "epoch": 0.16563956110025552, "flos": 20083617561600.0, "grad_norm": 4.179186631994736, "language_loss": 0.94976014, "learning_rate": 3.810088330151188e-06, "loss": 0.97099298, "num_input_tokens_seen": 59580460, "router_z_loss_clip": 3.06835938, "router_z_loss_mlp": 0.34228516, "step": 2755, "time_per_iteration": 2.645301103591919 }, { "auxiliary_loss_clip": 0.01574657, "auxiliary_loss_mlp": 0.00527446, "balance_loss_clip": 1.27316022, "balance_loss_mlp": 0.49306589, "epoch": 0.16569968435292348, "flos": 28034454896640.0, "grad_norm": 29.937220770831704, "language_loss": 0.77596354, "learning_rate": 3.80992265092595e-06, "loss": 0.79698455, "num_input_tokens_seen": 59600025, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.34350586, "step": 2756, "time_per_iteration": 2.713578224182129 }, { "auxiliary_loss_clip": 0.01579459, "auxiliary_loss_mlp": 0.00542375, "balance_loss_clip": 1.28080463, "balance_loss_mlp": 0.50787616, "epoch": 0.16575980760559147, "flos": 26250233598720.0, "grad_norm": 174.6002239115907, "language_loss": 0.81242263, "learning_rate": 3.8097569030682636e-06, "loss": 0.83364099, "num_input_tokens_seen": 59620600, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.34472656, "step": 2757, "time_per_iteration": 2.6382925510406494 }, { "auxiliary_loss_clip": 0.01593656, "auxiliary_loss_mlp": 0.00520746, "balance_loss_clip": 1.28837538, "balance_loss_mlp": 0.48743927, "epoch": 0.16581993085825944, "flos": 26943955943040.0, "grad_norm": 7.540760953872778, "language_loss": 0.89776254, "learning_rate": 3.8095910865844137e-06, "loss": 0.91890657, "num_input_tokens_seen": 59641385, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.33300781, "step": 2758, "time_per_iteration": 2.6746013164520264 }, { "auxiliary_loss_clip": 0.01601131, "auxiliary_loss_mlp": 0.00489292, "balance_loss_clip": 1.2964828, "balance_loss_mlp": 0.4580833, "epoch": 0.1658800541109274, "flos": 21653632103040.0, "grad_norm": 159.05122660516275, "language_loss": 0.85809779, "learning_rate": 3.809425201480689e-06, "loss": 0.87900198, "num_input_tokens_seen": 59659865, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.31164551, "step": 2759, "time_per_iteration": 2.6031949520111084 }, { "auxiliary_loss_clip": 0.01582353, "auxiliary_loss_mlp": 0.00505667, "balance_loss_clip": 1.2721498, "balance_loss_mlp": 0.47076252, "epoch": 0.16594017736359537, "flos": 16435488643200.0, "grad_norm": 5.340647119360419, "language_loss": 0.85851657, "learning_rate": 3.8092592477633793e-06, "loss": 0.8793968, "num_input_tokens_seen": 59678780, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.34838867, "step": 2760, "time_per_iteration": 2.5691797733306885 }, { "auxiliary_loss_clip": 0.0159188, "auxiliary_loss_mlp": 0.00533651, "balance_loss_clip": 1.28510141, "balance_loss_mlp": 0.49941453, "epoch": 0.16600030061626334, "flos": 22637297030400.0, "grad_norm": 9.237520839732298, "language_loss": 0.79982007, "learning_rate": 3.8090932254387774e-06, "loss": 0.82107532, "num_input_tokens_seen": 59698795, "router_z_loss_clip": 3.0703125, "router_z_loss_mlp": 0.34277344, "step": 2761, "time_per_iteration": 2.655872106552124 }, { "auxiliary_loss_clip": 0.01577531, "auxiliary_loss_mlp": 0.00520251, "balance_loss_clip": 1.26571012, "balance_loss_mlp": 0.48558456, "epoch": 0.1660604238689313, "flos": 26396569607040.0, "grad_norm": 56.81007174309186, "language_loss": 0.93187624, "learning_rate": 3.8089271345131788e-06, "loss": 0.9528541, "num_input_tokens_seen": 59718795, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.34692383, "step": 2762, "time_per_iteration": 4.067171096801758 }, { "auxiliary_loss_clip": 0.01568703, "auxiliary_loss_mlp": 0.0048465, "balance_loss_clip": 1.25455451, "balance_loss_mlp": 0.45279688, "epoch": 0.16612054712159927, "flos": 23039999383680.0, "grad_norm": 3.122237771536203, "language_loss": 0.9585017, "learning_rate": 3.8087609749928822e-06, "loss": 0.97903526, "num_input_tokens_seen": 59737555, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.31860352, "step": 2763, "time_per_iteration": 2.6473898887634277 }, { "auxiliary_loss_clip": 0.01639962, "auxiliary_loss_mlp": 0.00203403, "balance_loss_clip": 1.38814592, "balance_loss_mlp": 0.18795316, "epoch": 0.16618067037426726, "flos": 59241225202560.0, "grad_norm": 0.8102432604774885, "language_loss": 0.59855008, "learning_rate": 3.8085947468841885e-06, "loss": 0.61698371, "num_input_tokens_seen": 59800915, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.15429688, "step": 2764, "time_per_iteration": 3.158813238143921 }, { "auxiliary_loss_clip": 0.01604923, "auxiliary_loss_mlp": 0.00535246, "balance_loss_clip": 1.29501128, "balance_loss_mlp": 0.50065142, "epoch": 0.16624079362693522, "flos": 27198813916800.0, "grad_norm": 12.571611601855002, "language_loss": 0.86643255, "learning_rate": 3.808428450193401e-06, "loss": 0.88783419, "num_input_tokens_seen": 59822910, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.34594727, "step": 2765, "time_per_iteration": 4.181353569030762 }, { "auxiliary_loss_clip": 0.01597185, "auxiliary_loss_mlp": 0.00530132, "balance_loss_clip": 1.29039037, "balance_loss_mlp": 0.49420235, "epoch": 0.1663009168796032, "flos": 10925068216320.0, "grad_norm": 2.562859459698781, "language_loss": 0.79835808, "learning_rate": 3.8082620849268244e-06, "loss": 0.81963122, "num_input_tokens_seen": 59838805, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.359375, "step": 2766, "time_per_iteration": 2.701281785964966 }, { "auxiliary_loss_clip": 0.01579458, "auxiliary_loss_mlp": 0.00486304, "balance_loss_clip": 1.2717545, "balance_loss_mlp": 0.45561963, "epoch": 0.16636104013227115, "flos": 17894431353600.0, "grad_norm": 25.151213035663982, "language_loss": 0.95959461, "learning_rate": 3.808095651090769e-06, "loss": 0.98025227, "num_input_tokens_seen": 59855345, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.30712891, "step": 2767, "time_per_iteration": 2.545445680618286 }, { "auxiliary_loss_clip": 0.01615571, "auxiliary_loss_mlp": 0.00208724, "balance_loss_clip": 1.37825775, "balance_loss_mlp": 0.19413239, "epoch": 0.16642116338493912, "flos": 66726050463360.0, "grad_norm": 0.6343364974405218, "language_loss": 0.52650172, "learning_rate": 3.8079291486915447e-06, "loss": 0.54474467, "num_input_tokens_seen": 59917710, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.14550781, "step": 2768, "time_per_iteration": 4.549867630004883 }, { "auxiliary_loss_clip": 0.015795, "auxiliary_loss_mlp": 0.00464358, "balance_loss_clip": 1.26934862, "balance_loss_mlp": 0.43250555, "epoch": 0.16648128663760708, "flos": 19026048401280.0, "grad_norm": 18.37728515768898, "language_loss": 0.95549893, "learning_rate": 3.8077625777354667e-06, "loss": 0.97593749, "num_input_tokens_seen": 59935105, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.31884766, "step": 2769, "time_per_iteration": 2.5749552249908447 }, { "auxiliary_loss_clip": 0.0159671, "auxiliary_loss_mlp": 0.0016211, "balance_loss_clip": 1.35810804, "balance_loss_mlp": 0.14894941, "epoch": 0.16654140989027508, "flos": 70134976759680.0, "grad_norm": 0.842726617912059, "language_loss": 0.57276988, "learning_rate": 3.80759593822885e-06, "loss": 0.59035808, "num_input_tokens_seen": 59984085, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.13183594, "step": 2770, "time_per_iteration": 2.9766881465911865 }, { "auxiliary_loss_clip": 0.01583239, "auxiliary_loss_mlp": 0.00173811, "balance_loss_clip": 1.34685087, "balance_loss_mlp": 0.16131769, "epoch": 0.16660153314294304, "flos": 70272406195200.0, "grad_norm": 0.8384171893313379, "language_loss": 0.56193471, "learning_rate": 3.807429230178015e-06, "loss": 0.57950521, "num_input_tokens_seen": 60043470, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.12451172, "step": 2771, "time_per_iteration": 2.9132072925567627 }, { "auxiliary_loss_clip": 0.01552938, "auxiliary_loss_mlp": 0.00420022, "balance_loss_clip": 1.24767756, "balance_loss_mlp": 0.39231783, "epoch": 0.166661656395611, "flos": 23075048079360.0, "grad_norm": 86.22638596179698, "language_loss": 0.7947619, "learning_rate": 3.8072624535892817e-06, "loss": 0.81449157, "num_input_tokens_seen": 60063045, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.27734375, "step": 2772, "time_per_iteration": 2.617250442504883 }, { "auxiliary_loss_clip": 0.01566926, "auxiliary_loss_mlp": 0.00419063, "balance_loss_clip": 1.26243353, "balance_loss_mlp": 0.39225245, "epoch": 0.16672177964827897, "flos": 28366341586560.0, "grad_norm": 481.57444742963145, "language_loss": 0.92570812, "learning_rate": 3.807095608468975e-06, "loss": 0.94556797, "num_input_tokens_seen": 60081945, "router_z_loss_clip": 3.04296875, "router_z_loss_mlp": 0.26806641, "step": 2773, "time_per_iteration": 2.7371411323547363 }, { "auxiliary_loss_clip": 0.01570316, "auxiliary_loss_mlp": 0.00430611, "balance_loss_clip": 1.26422417, "balance_loss_mlp": 0.40219134, "epoch": 0.16678190290094694, "flos": 19091010147840.0, "grad_norm": 5.978239333596565, "language_loss": 0.86509436, "learning_rate": 3.8069286948234224e-06, "loss": 0.8851037, "num_input_tokens_seen": 60096820, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.28430176, "step": 2774, "time_per_iteration": 2.6032941341400146 }, { "auxiliary_loss_clip": 0.01538553, "auxiliary_loss_mlp": 0.00424004, "balance_loss_clip": 1.23546624, "balance_loss_mlp": 0.3956798, "epoch": 0.1668420261536149, "flos": 21799106184960.0, "grad_norm": 8.832182760926356, "language_loss": 0.90142351, "learning_rate": 3.806761712658952e-06, "loss": 0.92104906, "num_input_tokens_seen": 60116140, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.28308105, "step": 2775, "time_per_iteration": 2.6223034858703613 }, { "auxiliary_loss_clip": 0.01547029, "auxiliary_loss_mlp": 0.00441895, "balance_loss_clip": 1.24255562, "balance_loss_mlp": 0.41115141, "epoch": 0.16690214940628287, "flos": 19062533640960.0, "grad_norm": 3.8027990523989175, "language_loss": 0.86457282, "learning_rate": 3.806594661981897e-06, "loss": 0.884462, "num_input_tokens_seen": 60134235, "router_z_loss_clip": 3.046875, "router_z_loss_mlp": 0.30737305, "step": 2776, "time_per_iteration": 2.7541089057922363 }, { "auxiliary_loss_clip": 0.01569447, "auxiliary_loss_mlp": 0.0038257, "balance_loss_clip": 1.26347387, "balance_loss_mlp": 0.35722601, "epoch": 0.16696227265895086, "flos": 18588548747520.0, "grad_norm": 44.109686125714234, "language_loss": 0.84417802, "learning_rate": 3.8064275427985906e-06, "loss": 0.86369824, "num_input_tokens_seen": 60153275, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.25366211, "step": 2777, "time_per_iteration": 2.6745004653930664 }, { "auxiliary_loss_clip": 0.01551495, "auxiliary_loss_mlp": 0.00400781, "balance_loss_clip": 1.240134, "balance_loss_mlp": 0.37357736, "epoch": 0.16702239591161883, "flos": 23294139085440.0, "grad_norm": 2.926498193492357, "language_loss": 0.91054499, "learning_rate": 3.806260355115371e-06, "loss": 0.93006778, "num_input_tokens_seen": 60173215, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.27246094, "step": 2778, "time_per_iteration": 2.645562171936035 }, { "auxiliary_loss_clip": 0.01531061, "auxiliary_loss_mlp": 0.0037395, "balance_loss_clip": 1.22234726, "balance_loss_mlp": 0.34642416, "epoch": 0.1670825191642868, "flos": 24425648392320.0, "grad_norm": 15.960819624421113, "language_loss": 0.82165581, "learning_rate": 3.8060930989385778e-06, "loss": 0.84070593, "num_input_tokens_seen": 60190515, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.27526855, "step": 2779, "time_per_iteration": 2.6379306316375732 }, { "auxiliary_loss_clip": 0.01515334, "auxiliary_loss_mlp": 0.00316151, "balance_loss_clip": 1.19882321, "balance_loss_mlp": 0.28930533, "epoch": 0.16714264241695476, "flos": 26797512193920.0, "grad_norm": 10.668924561281605, "language_loss": 0.74301565, "learning_rate": 3.805925774274554e-06, "loss": 0.76133054, "num_input_tokens_seen": 60211655, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.26818848, "step": 2780, "time_per_iteration": 2.6986050605773926 }, { "auxiliary_loss_clip": 0.01527653, "auxiliary_loss_mlp": 0.00325159, "balance_loss_clip": 1.21148992, "balance_loss_mlp": 0.30024457, "epoch": 0.16720276566962272, "flos": 21835304115840.0, "grad_norm": 2.5041010820845755, "language_loss": 0.86352217, "learning_rate": 3.805758381129643e-06, "loss": 0.88205028, "num_input_tokens_seen": 60230860, "router_z_loss_clip": 3.16015625, "router_z_loss_mlp": 0.24890137, "step": 2781, "time_per_iteration": 2.6219475269317627 }, { "auxiliary_loss_clip": 0.01512222, "auxiliary_loss_mlp": 0.00321511, "balance_loss_clip": 1.20203257, "balance_loss_mlp": 0.29575032, "epoch": 0.1672628889222907, "flos": 21470415805440.0, "grad_norm": 23.297196733120938, "language_loss": 0.80978429, "learning_rate": 3.805590919510193e-06, "loss": 0.82812166, "num_input_tokens_seen": 60250535, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.25769043, "step": 2782, "time_per_iteration": 2.6472222805023193 }, { "auxiliary_loss_clip": 0.015128, "auxiliary_loss_mlp": 0.00309962, "balance_loss_clip": 1.19345582, "balance_loss_mlp": 0.28322306, "epoch": 0.16732301217495865, "flos": 30774008269440.0, "grad_norm": 253.98528690377086, "language_loss": 0.76301354, "learning_rate": 3.8054233894225547e-06, "loss": 0.78124118, "num_input_tokens_seen": 60269530, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.26745605, "step": 2783, "time_per_iteration": 2.7067902088165283 }, { "auxiliary_loss_clip": 0.0151243, "auxiliary_loss_mlp": 0.00295296, "balance_loss_clip": 1.19831514, "balance_loss_mlp": 0.26928422, "epoch": 0.16738313542762664, "flos": 23474625949440.0, "grad_norm": 32.268752284769626, "language_loss": 0.78705752, "learning_rate": 3.805255790873081e-06, "loss": 0.80513477, "num_input_tokens_seen": 60289900, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.2598877, "step": 2784, "time_per_iteration": 2.641408681869507 }, { "auxiliary_loss_clip": 0.01503695, "auxiliary_loss_mlp": 0.00331044, "balance_loss_clip": 1.18442655, "balance_loss_mlp": 0.30336326, "epoch": 0.1674432586802946, "flos": 29789086366080.0, "grad_norm": 237.84675070714448, "language_loss": 0.69798136, "learning_rate": 3.805088123868126e-06, "loss": 0.71632874, "num_input_tokens_seen": 60310025, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.27661133, "step": 2785, "time_per_iteration": 2.6646230220794678 }, { "auxiliary_loss_clip": 0.01465874, "auxiliary_loss_mlp": 0.00085867, "balance_loss_clip": 1.21767473, "balance_loss_mlp": 0.07737903, "epoch": 0.16750338193296258, "flos": 66136073575680.0, "grad_norm": 0.7651810220694134, "language_loss": 0.58054984, "learning_rate": 3.8049203884140492e-06, "loss": 0.59606731, "num_input_tokens_seen": 60377800, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.08496094, "step": 2786, "time_per_iteration": 3.1776509284973145 }, { "auxiliary_loss_clip": 0.01500163, "auxiliary_loss_mlp": 0.0030499, "balance_loss_clip": 1.18208218, "balance_loss_mlp": 0.27866888, "epoch": 0.16756350518563054, "flos": 25696777864320.0, "grad_norm": 327.87107599919943, "language_loss": 0.84408772, "learning_rate": 3.80475258451721e-06, "loss": 0.86213923, "num_input_tokens_seen": 60398215, "router_z_loss_clip": 3.1796875, "router_z_loss_mlp": 0.26306152, "step": 2787, "time_per_iteration": 2.7246758937835693 }, { "auxiliary_loss_clip": 0.01484567, "auxiliary_loss_mlp": 0.00279647, "balance_loss_clip": 1.17085564, "balance_loss_mlp": 0.25480354, "epoch": 0.1676236284382985, "flos": 23836102467840.0, "grad_norm": 2.2608941604488004, "language_loss": 0.85842586, "learning_rate": 3.804584712183972e-06, "loss": 0.876068, "num_input_tokens_seen": 60416910, "router_z_loss_clip": 3.13867188, "router_z_loss_mlp": 0.24829102, "step": 2788, "time_per_iteration": 2.675745725631714 }, { "auxiliary_loss_clip": 0.0150304, "auxiliary_loss_mlp": 0.00087819, "balance_loss_clip": 1.26242805, "balance_loss_mlp": 0.07637501, "epoch": 0.16768375169096647, "flos": 59874902985600.0, "grad_norm": 0.8455351134761456, "language_loss": 0.59354115, "learning_rate": 3.8044167714207013e-06, "loss": 0.60944974, "num_input_tokens_seen": 60468660, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.11425781, "step": 2789, "time_per_iteration": 2.992363691329956 }, { "auxiliary_loss_clip": 0.0149192, "auxiliary_loss_mlp": 0.00281453, "balance_loss_clip": 1.17521858, "balance_loss_mlp": 0.25703883, "epoch": 0.16774387494363446, "flos": 38435657207040.0, "grad_norm": 4.2473990238099955, "language_loss": 0.75751221, "learning_rate": 3.804248762233765e-06, "loss": 0.77524596, "num_input_tokens_seen": 60492370, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.24401855, "step": 2790, "time_per_iteration": 2.816970109939575 }, { "auxiliary_loss_clip": 0.01497149, "auxiliary_loss_mlp": 0.00281097, "balance_loss_clip": 1.18015349, "balance_loss_mlp": 0.25669488, "epoch": 0.16780399819630243, "flos": 22637620252800.0, "grad_norm": 56.11641076431466, "language_loss": 0.85745847, "learning_rate": 3.8040806846295356e-06, "loss": 0.87524092, "num_input_tokens_seen": 60512655, "router_z_loss_clip": 3.16601562, "router_z_loss_mlp": 0.24401855, "step": 2791, "time_per_iteration": 2.673689603805542 }, { "auxiliary_loss_clip": 0.0151221, "auxiliary_loss_mlp": 0.00263106, "balance_loss_clip": 1.19389415, "balance_loss_mlp": 0.23814318, "epoch": 0.1678641214489704, "flos": 32891516887680.0, "grad_norm": 14.241501356933936, "language_loss": 0.80406058, "learning_rate": 3.8039125386143853e-06, "loss": 0.82181376, "num_input_tokens_seen": 60533090, "router_z_loss_clip": 3.1796875, "router_z_loss_mlp": 0.24951172, "step": 2792, "time_per_iteration": 2.7420248985290527 }, { "auxiliary_loss_clip": 0.01518821, "auxiliary_loss_mlp": 0.00270292, "balance_loss_clip": 1.20173311, "balance_loss_mlp": 0.24620029, "epoch": 0.16792424470163836, "flos": 19974916028160.0, "grad_norm": 26.244529015290727, "language_loss": 0.80911475, "learning_rate": 3.803744324194691e-06, "loss": 0.82700586, "num_input_tokens_seen": 60553190, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.2409668, "step": 2793, "time_per_iteration": 2.6261637210845947 }, { "auxiliary_loss_clip": 0.01523985, "auxiliary_loss_mlp": 0.00274408, "balance_loss_clip": 1.20541143, "balance_loss_mlp": 0.2488492, "epoch": 0.16798436795430632, "flos": 19719878486400.0, "grad_norm": 604.4563994635255, "language_loss": 0.84471512, "learning_rate": 3.803576041376831e-06, "loss": 0.86269903, "num_input_tokens_seen": 60571995, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.25549316, "step": 2794, "time_per_iteration": 2.5863535404205322 }, { "auxiliary_loss_clip": 0.01472357, "auxiliary_loss_mlp": 0.00279302, "balance_loss_clip": 1.15909767, "balance_loss_mlp": 0.25377926, "epoch": 0.1680444912069743, "flos": 28104839596800.0, "grad_norm": 8.294355255018363, "language_loss": 0.80171728, "learning_rate": 3.803407690167187e-06, "loss": 0.81923389, "num_input_tokens_seen": 60591275, "router_z_loss_clip": 3.13085938, "router_z_loss_mlp": 0.25524902, "step": 2795, "time_per_iteration": 2.6292593479156494 }, { "auxiliary_loss_clip": 0.01504711, "auxiliary_loss_mlp": 0.00252945, "balance_loss_clip": 1.18623018, "balance_loss_mlp": 0.22879352, "epoch": 0.16810461445964225, "flos": 18075205526400.0, "grad_norm": 149.06251058494846, "language_loss": 0.91055453, "learning_rate": 3.803239270572142e-06, "loss": 0.9281311, "num_input_tokens_seen": 60609235, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.24169922, "step": 2796, "time_per_iteration": 2.5971531867980957 }, { "auxiliary_loss_clip": 0.01525732, "auxiliary_loss_mlp": 0.00291347, "balance_loss_clip": 1.20983219, "balance_loss_mlp": 0.26611087, "epoch": 0.16816473771231025, "flos": 23878657105920.0, "grad_norm": 1191.2003497130668, "language_loss": 0.88844228, "learning_rate": 3.8030707825980838e-06, "loss": 0.90661305, "num_input_tokens_seen": 60629880, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.25231934, "step": 2797, "time_per_iteration": 2.6292848587036133 }, { "auxiliary_loss_clip": 0.01573033, "auxiliary_loss_mlp": 0.00310382, "balance_loss_clip": 1.26075554, "balance_loss_mlp": 0.28679004, "epoch": 0.1682248609649782, "flos": 22783597125120.0, "grad_norm": 8.040820399661333, "language_loss": 0.80318642, "learning_rate": 3.802902226251401e-06, "loss": 0.82202065, "num_input_tokens_seen": 60651175, "router_z_loss_clip": 3.12304688, "router_z_loss_mlp": 0.23620605, "step": 2798, "time_per_iteration": 2.697493553161621 }, { "auxiliary_loss_clip": 0.01575002, "auxiliary_loss_mlp": 0.00310083, "balance_loss_clip": 1.2593956, "balance_loss_mlp": 0.28507242, "epoch": 0.16828498421764618, "flos": 20705123612160.0, "grad_norm": 26.91652603900832, "language_loss": 0.86144269, "learning_rate": 3.8027336015384845e-06, "loss": 0.88029361, "num_input_tokens_seen": 60670210, "router_z_loss_clip": 3.15429688, "router_z_loss_mlp": 0.25036621, "step": 2799, "time_per_iteration": 2.6669394969940186 }, { "auxiliary_loss_clip": 0.01576658, "auxiliary_loss_mlp": 0.00314405, "balance_loss_clip": 1.26029146, "balance_loss_mlp": 0.28637904, "epoch": 0.16834510747031414, "flos": 29420606695680.0, "grad_norm": 31.1364554509091, "language_loss": 0.77877808, "learning_rate": 3.8025649084657296e-06, "loss": 0.79768872, "num_input_tokens_seen": 60690895, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.28039551, "step": 2800, "time_per_iteration": 2.7169628143310547 }, { "auxiliary_loss_clip": 0.0158241, "auxiliary_loss_mlp": 0.00324991, "balance_loss_clip": 1.27189386, "balance_loss_mlp": 0.29641628, "epoch": 0.1684052307229821, "flos": 18145374744960.0, "grad_norm": 35.31192006779947, "language_loss": 0.9041118, "learning_rate": 3.8023961470395326e-06, "loss": 0.92318583, "num_input_tokens_seen": 60708280, "router_z_loss_clip": 3.10351562, "router_z_loss_mlp": 0.28564453, "step": 2801, "time_per_iteration": 2.6092381477355957 }, { "auxiliary_loss_clip": 0.01614615, "auxiliary_loss_mlp": 0.00335256, "balance_loss_clip": 1.29622948, "balance_loss_mlp": 0.308505, "epoch": 0.16846535397565007, "flos": 16574929240320.0, "grad_norm": 181.47337450471304, "language_loss": 0.93160427, "learning_rate": 3.8022273172662933e-06, "loss": 0.95110297, "num_input_tokens_seen": 60724150, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.2677002, "step": 2802, "time_per_iteration": 2.662832498550415 }, { "auxiliary_loss_clip": 0.0163436, "auxiliary_loss_mlp": 0.0034435, "balance_loss_clip": 1.31581306, "balance_loss_mlp": 0.31696722, "epoch": 0.16852547722831807, "flos": 30408868563840.0, "grad_norm": 7.681828039676108, "language_loss": 0.88171291, "learning_rate": 3.802058419152413e-06, "loss": 0.90149999, "num_input_tokens_seen": 60746485, "router_z_loss_clip": 3.18945312, "router_z_loss_mlp": 0.27368164, "step": 2803, "time_per_iteration": 2.7021379470825195 }, { "auxiliary_loss_clip": 0.0162498, "auxiliary_loss_mlp": 0.00336877, "balance_loss_clip": 1.30701327, "balance_loss_mlp": 0.30931574, "epoch": 0.16858560048098603, "flos": 33507420416640.0, "grad_norm": 32.47858904236908, "language_loss": 0.85075468, "learning_rate": 3.801889452704297e-06, "loss": 0.87037331, "num_input_tokens_seen": 60762875, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.27539062, "step": 2804, "time_per_iteration": 4.215130090713501 }, { "auxiliary_loss_clip": 0.01728264, "auxiliary_loss_mlp": 0.00133016, "balance_loss_clip": 1.47415257, "balance_loss_mlp": 0.12157159, "epoch": 0.168645723733654, "flos": 67370502326400.0, "grad_norm": 0.8460022134653995, "language_loss": 0.55344337, "learning_rate": 3.8017204179283526e-06, "loss": 0.57205617, "num_input_tokens_seen": 60825510, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.11425781, "step": 2805, "time_per_iteration": 3.1155099868774414 }, { "auxiliary_loss_clip": 0.01660068, "auxiliary_loss_mlp": 0.00404778, "balance_loss_clip": 1.33746064, "balance_loss_mlp": 0.37424862, "epoch": 0.16870584698632196, "flos": 21324618501120.0, "grad_norm": 216.68695019328382, "language_loss": 0.77917558, "learning_rate": 3.8015513148309892e-06, "loss": 0.79982412, "num_input_tokens_seen": 60844440, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.30529785, "step": 2806, "time_per_iteration": 2.616473436355591 }, { "auxiliary_loss_clip": 0.01634981, "auxiliary_loss_mlp": 0.00363363, "balance_loss_clip": 1.31464696, "balance_loss_mlp": 0.3341094, "epoch": 0.16876597023898993, "flos": 20740746925440.0, "grad_norm": 2.3559913442754365, "language_loss": 0.76223707, "learning_rate": 3.80138214341862e-06, "loss": 0.78222048, "num_input_tokens_seen": 60863210, "router_z_loss_clip": 3.20507812, "router_z_loss_mlp": 0.29248047, "step": 2807, "time_per_iteration": 4.067338228225708 }, { "auxiliary_loss_clip": 0.01655668, "auxiliary_loss_mlp": 0.00390862, "balance_loss_clip": 1.33546805, "balance_loss_mlp": 0.36008215, "epoch": 0.1688260934916579, "flos": 20303498666880.0, "grad_norm": 6.406218688776993, "language_loss": 0.79034853, "learning_rate": 3.8012129036976587e-06, "loss": 0.8108139, "num_input_tokens_seen": 60882510, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.30761719, "step": 2808, "time_per_iteration": 2.625635862350464 }, { "auxiliary_loss_clip": 0.01670111, "auxiliary_loss_mlp": 0.00385115, "balance_loss_clip": 1.34275007, "balance_loss_mlp": 0.35278583, "epoch": 0.16888621674432586, "flos": 20340702178560.0, "grad_norm": 90.9861267908468, "language_loss": 0.89700931, "learning_rate": 3.8010435956745236e-06, "loss": 0.91756153, "num_input_tokens_seen": 60901105, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.32324219, "step": 2809, "time_per_iteration": 2.5992331504821777 }, { "auxiliary_loss_clip": 0.01675969, "auxiliary_loss_mlp": 0.00402916, "balance_loss_clip": 1.35013485, "balance_loss_mlp": 0.37223184, "epoch": 0.16894633999699385, "flos": 16244802316800.0, "grad_norm": 7.5278243519433925, "language_loss": 0.95603281, "learning_rate": 3.8008742193556358e-06, "loss": 0.97682166, "num_input_tokens_seen": 60915340, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.30688477, "step": 2810, "time_per_iteration": 3.968993902206421 }, { "auxiliary_loss_clip": 0.01660066, "auxiliary_loss_mlp": 0.00402561, "balance_loss_clip": 1.3315798, "balance_loss_mlp": 0.37027878, "epoch": 0.16900646324966181, "flos": 19610171372160.0, "grad_norm": 19.405482780028084, "language_loss": 0.99006462, "learning_rate": 3.800704774747416e-06, "loss": 1.01069093, "num_input_tokens_seen": 60933735, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.32348633, "step": 2811, "time_per_iteration": 2.5867760181427 }, { "auxiliary_loss_clip": 0.01647623, "auxiliary_loss_mlp": 0.00372266, "balance_loss_clip": 1.32019138, "balance_loss_mlp": 0.34127158, "epoch": 0.16906658650232978, "flos": 22018089450240.0, "grad_norm": 65.2263568623519, "language_loss": 0.86907345, "learning_rate": 3.800535261856291e-06, "loss": 0.88927233, "num_input_tokens_seen": 60953105, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.30957031, "step": 2812, "time_per_iteration": 2.5909907817840576 }, { "auxiliary_loss_clip": 0.01667008, "auxiliary_loss_mlp": 0.00381654, "balance_loss_clip": 1.33975077, "balance_loss_mlp": 0.35251892, "epoch": 0.16912670975499774, "flos": 11763690024960.0, "grad_norm": 16.548029630438844, "language_loss": 0.83302999, "learning_rate": 3.8003656806886887e-06, "loss": 0.85351658, "num_input_tokens_seen": 60969150, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.29150391, "step": 2813, "time_per_iteration": 2.5343430042266846 }, { "auxiliary_loss_clip": 0.01659957, "auxiliary_loss_mlp": 0.00396537, "balance_loss_clip": 1.33452737, "balance_loss_mlp": 0.364088, "epoch": 0.1691868330076657, "flos": 17161386595200.0, "grad_norm": 131.7533502246331, "language_loss": 0.77569634, "learning_rate": 3.8001960312510396e-06, "loss": 0.79626125, "num_input_tokens_seen": 60982825, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.32470703, "step": 2814, "time_per_iteration": 2.567167282104492 }, { "auxiliary_loss_clip": 0.01675933, "auxiliary_loss_mlp": 0.00407834, "balance_loss_clip": 1.34716904, "balance_loss_mlp": 0.37679175, "epoch": 0.16924695626033368, "flos": 22416553998720.0, "grad_norm": 2.2792626266000338, "language_loss": 0.68135762, "learning_rate": 3.800026313549776e-06, "loss": 0.70219529, "num_input_tokens_seen": 61000875, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.31054688, "step": 2815, "time_per_iteration": 2.644531011581421 }, { "auxiliary_loss_clip": 0.01672263, "auxiliary_loss_mlp": 0.00403161, "balance_loss_clip": 1.34141111, "balance_loss_mlp": 0.37116539, "epoch": 0.16930707951300164, "flos": 25739655724800.0, "grad_norm": 1.7443031059889547, "language_loss": 0.88414025, "learning_rate": 3.7998565275913342e-06, "loss": 0.90489453, "num_input_tokens_seen": 61021940, "router_z_loss_clip": 3.30664062, "router_z_loss_mlp": 0.31982422, "step": 2816, "time_per_iteration": 2.611846446990967 }, { "auxiliary_loss_clip": 0.01677454, "auxiliary_loss_mlp": 0.00419093, "balance_loss_clip": 1.34849858, "balance_loss_mlp": 0.384451, "epoch": 0.16936720276566963, "flos": 22747040058240.0, "grad_norm": 47.46971541708254, "language_loss": 0.96680909, "learning_rate": 3.799686673382153e-06, "loss": 0.98777455, "num_input_tokens_seen": 61040285, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.34643555, "step": 2817, "time_per_iteration": 2.584468126296997 }, { "auxiliary_loss_clip": 0.01649988, "auxiliary_loss_mlp": 0.00384509, "balance_loss_clip": 1.32484341, "balance_loss_mlp": 0.35253704, "epoch": 0.1694273260183376, "flos": 19573973441280.0, "grad_norm": 17.06018620342636, "language_loss": 0.86834371, "learning_rate": 3.799516750928672e-06, "loss": 0.88868868, "num_input_tokens_seen": 61059020, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.31982422, "step": 2818, "time_per_iteration": 2.583502769470215 }, { "auxiliary_loss_clip": 0.01659341, "auxiliary_loss_mlp": 0.00408215, "balance_loss_clip": 1.33772683, "balance_loss_mlp": 0.37526602, "epoch": 0.16948744927100556, "flos": 12457843332480.0, "grad_norm": 44.55527737448798, "language_loss": 0.89862347, "learning_rate": 3.799346760237336e-06, "loss": 0.91929907, "num_input_tokens_seen": 61074245, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.3293457, "step": 2819, "time_per_iteration": 2.5876431465148926 }, { "auxiliary_loss_clip": 0.01715909, "auxiliary_loss_mlp": 0.00078152, "balance_loss_clip": 1.45182276, "balance_loss_mlp": 0.06918719, "epoch": 0.16954757252367353, "flos": 71291694435840.0, "grad_norm": 0.9265422276775296, "language_loss": 0.60369706, "learning_rate": 3.7991767013145902e-06, "loss": 0.6216377, "num_input_tokens_seen": 61127080, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.08984375, "step": 2820, "time_per_iteration": 3.0801053047180176 }, { "auxiliary_loss_clip": 0.01656389, "auxiliary_loss_mlp": 0.00387422, "balance_loss_clip": 1.33244133, "balance_loss_mlp": 0.35421038, "epoch": 0.1696076957763415, "flos": 29606516513280.0, "grad_norm": 554.7585460850772, "language_loss": 0.86952692, "learning_rate": 3.7990065741668844e-06, "loss": 0.88996506, "num_input_tokens_seen": 61146955, "router_z_loss_clip": 3.23828125, "router_z_loss_mlp": 0.33203125, "step": 2821, "time_per_iteration": 2.754221200942993 }, { "auxiliary_loss_clip": 0.01668593, "auxiliary_loss_mlp": 0.00394684, "balance_loss_clip": 1.34350967, "balance_loss_mlp": 0.35925505, "epoch": 0.16966781902900946, "flos": 24388588535040.0, "grad_norm": 54.604614263795426, "language_loss": 0.86286175, "learning_rate": 3.7988363788006685e-06, "loss": 0.88349456, "num_input_tokens_seen": 61166605, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.35473633, "step": 2822, "time_per_iteration": 2.6833648681640625 }, { "auxiliary_loss_clip": 0.01655116, "auxiliary_loss_mlp": 0.00424732, "balance_loss_clip": 1.33501482, "balance_loss_mlp": 0.39316529, "epoch": 0.16972794228167745, "flos": 23038814234880.0, "grad_norm": 56.78750161072761, "language_loss": 0.81057334, "learning_rate": 3.7986661152223967e-06, "loss": 0.83137178, "num_input_tokens_seen": 61186535, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.31591797, "step": 2823, "time_per_iteration": 2.6734437942504883 }, { "auxiliary_loss_clip": 0.01663379, "auxiliary_loss_mlp": 0.00411163, "balance_loss_clip": 1.34112644, "balance_loss_mlp": 0.37737906, "epoch": 0.16978806553434542, "flos": 35228691129600.0, "grad_norm": 5.211912583678025, "language_loss": 0.64940435, "learning_rate": 3.7984957834385257e-06, "loss": 0.6701498, "num_input_tokens_seen": 61208965, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.33837891, "step": 2824, "time_per_iteration": 2.7787842750549316 }, { "auxiliary_loss_clip": 0.01646402, "auxiliary_loss_mlp": 0.00398883, "balance_loss_clip": 1.32455254, "balance_loss_mlp": 0.36564767, "epoch": 0.16984818878701338, "flos": 32014290936960.0, "grad_norm": 18.740832071887603, "language_loss": 0.8000986, "learning_rate": 3.7983253834555144e-06, "loss": 0.82055146, "num_input_tokens_seen": 61230670, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.33276367, "step": 2825, "time_per_iteration": 2.7301807403564453 }, { "auxiliary_loss_clip": 0.01630732, "auxiliary_loss_mlp": 0.00475312, "balance_loss_clip": 1.31052089, "balance_loss_mlp": 0.43847629, "epoch": 0.16990831203968135, "flos": 22818609907200.0, "grad_norm": 24.04716773255392, "language_loss": 0.93173462, "learning_rate": 3.7981549152798245e-06, "loss": 0.95279509, "num_input_tokens_seen": 61249510, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.3684082, "step": 2826, "time_per_iteration": 2.6366043090820312 }, { "auxiliary_loss_clip": 0.01636179, "auxiliary_loss_mlp": 0.00427333, "balance_loss_clip": 1.31419909, "balance_loss_mlp": 0.39414543, "epoch": 0.1699684352923493, "flos": 23039604334080.0, "grad_norm": 2.3689432902324294, "language_loss": 0.87891853, "learning_rate": 3.7979843789179196e-06, "loss": 0.89955372, "num_input_tokens_seen": 61269440, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.33203125, "step": 2827, "time_per_iteration": 2.6390578746795654 }, { "auxiliary_loss_clip": 0.01643637, "auxiliary_loss_mlp": 0.00435449, "balance_loss_clip": 1.3171525, "balance_loss_mlp": 0.39904234, "epoch": 0.17002855854501728, "flos": 21434110133760.0, "grad_norm": 4.010884877575574, "language_loss": 0.80720967, "learning_rate": 3.797813774376267e-06, "loss": 0.82800055, "num_input_tokens_seen": 61288195, "router_z_loss_clip": 3.26367188, "router_z_loss_mlp": 0.36376953, "step": 2828, "time_per_iteration": 2.6938681602478027 }, { "auxiliary_loss_clip": 0.01764534, "auxiliary_loss_mlp": 0.00078511, "balance_loss_clip": 1.51312447, "balance_loss_mlp": 0.0680206, "epoch": 0.17008868179768524, "flos": 71453509205760.0, "grad_norm": 0.781746573654529, "language_loss": 0.56404388, "learning_rate": 3.797643101661336e-06, "loss": 0.58247435, "num_input_tokens_seen": 61350850, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.10498047, "step": 2829, "time_per_iteration": 3.189039468765259 }, { "auxiliary_loss_clip": 0.01654784, "auxiliary_loss_mlp": 0.00441777, "balance_loss_clip": 1.33113551, "balance_loss_mlp": 0.40889949, "epoch": 0.17014880505035324, "flos": 24900315644160.0, "grad_norm": 65.75571949180538, "language_loss": 0.90996832, "learning_rate": 3.7974723607795983e-06, "loss": 0.93093395, "num_input_tokens_seen": 61370765, "router_z_loss_clip": 3.24023438, "router_z_loss_mlp": 0.32885742, "step": 2830, "time_per_iteration": 2.6856963634490967 }, { "auxiliary_loss_clip": 0.01676789, "auxiliary_loss_mlp": 0.00463686, "balance_loss_clip": 1.34546256, "balance_loss_mlp": 0.42646927, "epoch": 0.1702089283030212, "flos": 29862415981440.0, "grad_norm": 13.277333253177126, "language_loss": 0.85197097, "learning_rate": 3.797301551737529e-06, "loss": 0.87337571, "num_input_tokens_seen": 61388935, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.37207031, "step": 2831, "time_per_iteration": 2.6245081424713135 }, { "auxiliary_loss_clip": 0.01641537, "auxiliary_loss_mlp": 0.0045993, "balance_loss_clip": 1.3210541, "balance_loss_mlp": 0.42397696, "epoch": 0.17026905155568917, "flos": 17744180762880.0, "grad_norm": 13.562530573707724, "language_loss": 0.85494024, "learning_rate": 3.7971306745416044e-06, "loss": 0.87595499, "num_input_tokens_seen": 61407350, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.359375, "step": 2832, "time_per_iteration": 2.627206802368164 }, { "auxiliary_loss_clip": 0.01652525, "auxiliary_loss_mlp": 0.00464651, "balance_loss_clip": 1.32758057, "balance_loss_mlp": 0.42860177, "epoch": 0.17032917480835713, "flos": 23148665003520.0, "grad_norm": 8.35840670056133, "language_loss": 0.94780076, "learning_rate": 3.7969597291983046e-06, "loss": 0.96897256, "num_input_tokens_seen": 61429010, "router_z_loss_clip": 3.24804688, "router_z_loss_mlp": 0.3605957, "step": 2833, "time_per_iteration": 2.584639310836792 }, { "auxiliary_loss_clip": 0.01656563, "auxiliary_loss_mlp": 0.00399127, "balance_loss_clip": 1.33496213, "balance_loss_mlp": 0.36410373, "epoch": 0.1703892980610251, "flos": 39202565512320.0, "grad_norm": 8.038519835931204, "language_loss": 0.80757254, "learning_rate": 3.7967887157141115e-06, "loss": 0.82812947, "num_input_tokens_seen": 61450040, "router_z_loss_clip": 3.21679688, "router_z_loss_mlp": 0.35009766, "step": 2834, "time_per_iteration": 2.823976516723633 }, { "auxiliary_loss_clip": 0.01653492, "auxiliary_loss_mlp": 0.00398106, "balance_loss_clip": 1.32546937, "balance_loss_mlp": 0.36441779, "epoch": 0.17044942131369306, "flos": 23039101543680.0, "grad_norm": 5.063417193159619, "language_loss": 0.93216622, "learning_rate": 3.7966176340955106e-06, "loss": 0.95268214, "num_input_tokens_seen": 61468585, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.33691406, "step": 2835, "time_per_iteration": 2.684882164001465 }, { "auxiliary_loss_clip": 0.01629655, "auxiliary_loss_mlp": 0.00446716, "balance_loss_clip": 1.29893351, "balance_loss_mlp": 0.40706676, "epoch": 0.17050954456636103, "flos": 17054983532160.0, "grad_norm": 1987.830848429452, "language_loss": 0.84194738, "learning_rate": 3.796446484348989e-06, "loss": 0.86271107, "num_input_tokens_seen": 61486330, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.39648438, "step": 2836, "time_per_iteration": 2.635378122329712 }, { "auxiliary_loss_clip": 0.01640859, "auxiliary_loss_mlp": 0.0047696, "balance_loss_clip": 1.31160736, "balance_loss_mlp": 0.44134012, "epoch": 0.17056966781902902, "flos": 16836969934080.0, "grad_norm": 9.71112704236635, "language_loss": 0.88252681, "learning_rate": 3.796275266481036e-06, "loss": 0.903705, "num_input_tokens_seen": 61503950, "router_z_loss_clip": 3.29296875, "router_z_loss_mlp": 0.35620117, "step": 2837, "time_per_iteration": 2.606732130050659 }, { "auxiliary_loss_clip": 0.01626567, "auxiliary_loss_mlp": 0.00378911, "balance_loss_clip": 1.30357409, "balance_loss_mlp": 0.34620047, "epoch": 0.17062979107169698, "flos": 17712543859200.0, "grad_norm": 12.361487907693673, "language_loss": 0.90404415, "learning_rate": 3.7961039804981456e-06, "loss": 0.92409891, "num_input_tokens_seen": 61523550, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.32714844, "step": 2838, "time_per_iteration": 2.630979299545288 }, { "auxiliary_loss_clip": 0.01642016, "auxiliary_loss_mlp": 0.00414983, "balance_loss_clip": 1.31354284, "balance_loss_mlp": 0.3816998, "epoch": 0.17068991432436495, "flos": 22525040050560.0, "grad_norm": 13.836299511764548, "language_loss": 0.99272883, "learning_rate": 3.795932626406812e-06, "loss": 1.01329887, "num_input_tokens_seen": 61542720, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.33300781, "step": 2839, "time_per_iteration": 2.5943171977996826 }, { "auxiliary_loss_clip": 0.01638842, "auxiliary_loss_mlp": 0.0040927, "balance_loss_clip": 1.31083298, "balance_loss_mlp": 0.37407941, "epoch": 0.17075003757703291, "flos": 25882939077120.0, "grad_norm": 3.833333150688125, "language_loss": 0.89484662, "learning_rate": 3.7957612042135336e-06, "loss": 0.91532779, "num_input_tokens_seen": 61563040, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.35205078, "step": 2840, "time_per_iteration": 2.631910800933838 }, { "auxiliary_loss_clip": 0.01628894, "auxiliary_loss_mlp": 0.00369787, "balance_loss_clip": 1.29778123, "balance_loss_mlp": 0.33628887, "epoch": 0.17081016082970088, "flos": 20120713332480.0, "grad_norm": 81.37501012014637, "language_loss": 0.82781237, "learning_rate": 3.79558971392481e-06, "loss": 0.84779918, "num_input_tokens_seen": 61581890, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.3347168, "step": 2841, "time_per_iteration": 2.61309814453125 }, { "auxiliary_loss_clip": 0.01623514, "auxiliary_loss_mlp": 0.00356261, "balance_loss_clip": 1.29632163, "balance_loss_mlp": 0.32612506, "epoch": 0.17087028408236885, "flos": 24936477661440.0, "grad_norm": 20.446512603463738, "language_loss": 0.83108056, "learning_rate": 3.7954181555471443e-06, "loss": 0.8508783, "num_input_tokens_seen": 61602095, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.30175781, "step": 2842, "time_per_iteration": 2.7079126834869385 }, { "auxiliary_loss_clip": 0.0164326, "auxiliary_loss_mlp": 0.00373944, "balance_loss_clip": 1.31246006, "balance_loss_mlp": 0.34190035, "epoch": 0.17093040733503684, "flos": 19057864872960.0, "grad_norm": 9.800974212912745, "language_loss": 0.91800606, "learning_rate": 3.795246529087043e-06, "loss": 0.93817818, "num_input_tokens_seen": 61620400, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.3203125, "step": 2843, "time_per_iteration": 2.6590230464935303 }, { "auxiliary_loss_clip": 0.01643642, "auxiliary_loss_mlp": 0.00375031, "balance_loss_clip": 1.31347764, "balance_loss_mlp": 0.34227204, "epoch": 0.1709905305877048, "flos": 13078954333440.0, "grad_norm": 24.490727227864625, "language_loss": 0.76118588, "learning_rate": 3.7950748345510126e-06, "loss": 0.78137261, "num_input_tokens_seen": 61637680, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.32739258, "step": 2844, "time_per_iteration": 2.6607186794281006 }, { "auxiliary_loss_clip": 0.01630198, "auxiliary_loss_mlp": 0.00370138, "balance_loss_clip": 1.29966712, "balance_loss_mlp": 0.33635366, "epoch": 0.17105065384037277, "flos": 19209336526080.0, "grad_norm": 186.08842537310363, "language_loss": 0.84722114, "learning_rate": 3.7949030719455646e-06, "loss": 0.86722445, "num_input_tokens_seen": 61655630, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.33789062, "step": 2845, "time_per_iteration": 2.629296064376831 }, { "auxiliary_loss_clip": 0.01614087, "auxiliary_loss_mlp": 0.00349518, "balance_loss_clip": 1.28028154, "balance_loss_mlp": 0.31592497, "epoch": 0.17111077709304073, "flos": 18515183218560.0, "grad_norm": 13.082785880506599, "language_loss": 0.86483365, "learning_rate": 3.7947312412772127e-06, "loss": 0.88446969, "num_input_tokens_seen": 61673475, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.33618164, "step": 2846, "time_per_iteration": 3.971649646759033 }, { "auxiliary_loss_clip": 0.01630911, "auxiliary_loss_mlp": 0.00329462, "balance_loss_clip": 1.29876733, "balance_loss_mlp": 0.29648834, "epoch": 0.1711709003457087, "flos": 25082670015360.0, "grad_norm": 400.72037222133144, "language_loss": 0.85869235, "learning_rate": 3.794559342552472e-06, "loss": 0.87829608, "num_input_tokens_seen": 61693370, "router_z_loss_clip": 3.32226562, "router_z_loss_mlp": 0.32958984, "step": 2847, "time_per_iteration": 2.633068323135376 }, { "auxiliary_loss_clip": 0.0162445, "auxiliary_loss_mlp": 0.00387494, "balance_loss_clip": 1.28304482, "balance_loss_mlp": 0.35120708, "epoch": 0.17123102359837666, "flos": 17566387418880.0, "grad_norm": 15.425640227217771, "language_loss": 0.94748604, "learning_rate": 3.7943873757778614e-06, "loss": 0.96760553, "num_input_tokens_seen": 61710820, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.36303711, "step": 2848, "time_per_iteration": 2.626005172729492 }, { "auxiliary_loss_clip": 0.01600459, "auxiliary_loss_mlp": 0.0039117, "balance_loss_clip": 1.26286459, "balance_loss_mlp": 0.35776731, "epoch": 0.17129114685104463, "flos": 26173635845760.0, "grad_norm": 10.1262811475921, "language_loss": 0.81883603, "learning_rate": 3.794215340959902e-06, "loss": 0.83875227, "num_input_tokens_seen": 61729855, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.33422852, "step": 2849, "time_per_iteration": 2.6590452194213867 }, { "auxiliary_loss_clip": 0.01572823, "auxiliary_loss_mlp": 0.00082247, "balance_loss_clip": 1.33486319, "balance_loss_mlp": 0.07032644, "epoch": 0.17135127010371262, "flos": 69269710037760.0, "grad_norm": 0.9335737949777978, "language_loss": 0.57459354, "learning_rate": 3.7940432381051163e-06, "loss": 0.59114426, "num_input_tokens_seen": 61790290, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.11914062, "step": 2850, "time_per_iteration": 4.603308916091919 }, { "auxiliary_loss_clip": 0.01612907, "auxiliary_loss_mlp": 0.00370566, "balance_loss_clip": 1.27980733, "balance_loss_mlp": 0.337044, "epoch": 0.1714113933563806, "flos": 23550110380800.0, "grad_norm": 28.298466022075466, "language_loss": 0.86533904, "learning_rate": 3.793871067220031e-06, "loss": 0.8851738, "num_input_tokens_seen": 61809265, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.33520508, "step": 2851, "time_per_iteration": 2.5969908237457275 }, { "auxiliary_loss_clip": 0.01615864, "auxiliary_loss_mlp": 0.00368225, "balance_loss_clip": 1.27923226, "balance_loss_mlp": 0.33379769, "epoch": 0.17147151660904855, "flos": 21142443697920.0, "grad_norm": 54.052901534375486, "language_loss": 1.01103592, "learning_rate": 3.7936988283111764e-06, "loss": 1.03087687, "num_input_tokens_seen": 61828980, "router_z_loss_clip": 3.3671875, "router_z_loss_mlp": 0.34423828, "step": 2852, "time_per_iteration": 4.044406414031982 }, { "auxiliary_loss_clip": 0.01618915, "auxiliary_loss_mlp": 0.00435226, "balance_loss_clip": 1.27672994, "balance_loss_mlp": 0.39536253, "epoch": 0.17153163986171652, "flos": 18624890332800.0, "grad_norm": 2.1055841302276557, "language_loss": 0.7540015, "learning_rate": 3.7935265213850817e-06, "loss": 0.77454293, "num_input_tokens_seen": 61847915, "router_z_loss_clip": 3.42382812, "router_z_loss_mlp": 0.39892578, "step": 2853, "time_per_iteration": 2.6490702629089355 }, { "auxiliary_loss_clip": 0.01613396, "auxiliary_loss_mlp": 0.00403401, "balance_loss_clip": 1.27666473, "balance_loss_mlp": 0.36792457, "epoch": 0.17159176311438448, "flos": 18223265387520.0, "grad_norm": 7.127158032635832, "language_loss": 0.76272076, "learning_rate": 3.7933541464482815e-06, "loss": 0.78288871, "num_input_tokens_seen": 61865570, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.35498047, "step": 2854, "time_per_iteration": 2.607835292816162 }, { "auxiliary_loss_clip": 0.01620031, "auxiliary_loss_mlp": 0.00396779, "balance_loss_clip": 1.28297758, "balance_loss_mlp": 0.36468828, "epoch": 0.17165188636705245, "flos": 20738987159040.0, "grad_norm": 27.280916722840864, "language_loss": 0.94863307, "learning_rate": 3.7931817035073124e-06, "loss": 0.96880126, "num_input_tokens_seen": 61883340, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.32128906, "step": 2855, "time_per_iteration": 2.6052746772766113 }, { "auxiliary_loss_clip": 0.01611442, "auxiliary_loss_mlp": 0.00396367, "balance_loss_clip": 1.27662611, "balance_loss_mlp": 0.36387029, "epoch": 0.17171200961972044, "flos": 24899884680960.0, "grad_norm": 5.145771858813035, "language_loss": 0.909863, "learning_rate": 3.7930091925687134e-06, "loss": 0.92994106, "num_input_tokens_seen": 61900610, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.32458496, "step": 2856, "time_per_iteration": 2.645092725753784 }, { "auxiliary_loss_clip": 0.01608803, "auxiliary_loss_mlp": 0.0041388, "balance_loss_clip": 1.27200615, "balance_loss_mlp": 0.37952423, "epoch": 0.1717721328723884, "flos": 20157234485760.0, "grad_norm": 9.834272703315236, "language_loss": 0.93375814, "learning_rate": 3.792836613639026e-06, "loss": 0.95398498, "num_input_tokens_seen": 61916795, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.34350586, "step": 2857, "time_per_iteration": 2.6408448219299316 }, { "auxiliary_loss_clip": 0.01598982, "auxiliary_loss_mlp": 0.00386329, "balance_loss_clip": 1.2617979, "balance_loss_mlp": 0.35275951, "epoch": 0.17183225612505637, "flos": 23361650697600.0, "grad_norm": 5.767570431749359, "language_loss": 0.84936011, "learning_rate": 3.7926639667247947e-06, "loss": 0.86921322, "num_input_tokens_seen": 61936665, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.33532715, "step": 2858, "time_per_iteration": 2.6481873989105225 }, { "auxiliary_loss_clip": 0.0160334, "auxiliary_loss_mlp": 0.0039149, "balance_loss_clip": 1.26237941, "balance_loss_mlp": 0.35413021, "epoch": 0.17189237937772434, "flos": 18114240631680.0, "grad_norm": 7.252808194461094, "language_loss": 0.83874846, "learning_rate": 3.7924912518325663e-06, "loss": 0.85869682, "num_input_tokens_seen": 61954415, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.37329102, "step": 2859, "time_per_iteration": 2.60473895072937 }, { "auxiliary_loss_clip": 0.01591254, "auxiliary_loss_mlp": 0.00411685, "balance_loss_clip": 1.25407791, "balance_loss_mlp": 0.37716195, "epoch": 0.1719525026303923, "flos": 23258408031360.0, "grad_norm": 29.671229301623022, "language_loss": 0.81750619, "learning_rate": 3.7923184689688902e-06, "loss": 0.83753562, "num_input_tokens_seen": 61973940, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.34521484, "step": 2860, "time_per_iteration": 2.6374683380126953 }, { "auxiliary_loss_clip": 0.01579502, "auxiliary_loss_mlp": 0.00403119, "balance_loss_clip": 1.24171948, "balance_loss_mlp": 0.36761808, "epoch": 0.17201262588306027, "flos": 20810413353600.0, "grad_norm": 10.7246006357813, "language_loss": 0.88041478, "learning_rate": 3.792145618140317e-06, "loss": 0.90024102, "num_input_tokens_seen": 61991845, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.35498047, "step": 2861, "time_per_iteration": 2.581437349319458 }, { "auxiliary_loss_clip": 0.01596656, "auxiliary_loss_mlp": 0.0040611, "balance_loss_clip": 1.25437844, "balance_loss_mlp": 0.37118199, "epoch": 0.17207274913572823, "flos": 20375858615040.0, "grad_norm": 11.668236617794646, "language_loss": 0.93676561, "learning_rate": 3.7919726993534038e-06, "loss": 0.95679331, "num_input_tokens_seen": 62009395, "router_z_loss_clip": 3.41992188, "router_z_loss_mlp": 0.34912109, "step": 2862, "time_per_iteration": 2.6143763065338135 }, { "auxiliary_loss_clip": 0.01590826, "auxiliary_loss_mlp": 0.00391797, "balance_loss_clip": 1.25542021, "balance_loss_mlp": 0.35789347, "epoch": 0.17213287238839622, "flos": 26797727675520.0, "grad_norm": 6.172739402101407, "language_loss": 0.83415174, "learning_rate": 3.7917997126147054e-06, "loss": 0.85397792, "num_input_tokens_seen": 62029005, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.33886719, "step": 2863, "time_per_iteration": 2.737022876739502 }, { "auxiliary_loss_clip": 0.01590477, "auxiliary_loss_mlp": 0.00401119, "balance_loss_clip": 1.25725436, "balance_loss_mlp": 0.36311495, "epoch": 0.1721929956410642, "flos": 26030819370240.0, "grad_norm": 1.8819394461679213, "language_loss": 0.77633798, "learning_rate": 3.7916266579307823e-06, "loss": 0.79625386, "num_input_tokens_seen": 62048730, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.37988281, "step": 2864, "time_per_iteration": 2.7099649906158447 }, { "auxiliary_loss_clip": 0.01582786, "auxiliary_loss_mlp": 0.00432173, "balance_loss_clip": 1.24684215, "balance_loss_mlp": 0.39636245, "epoch": 0.17225311889373215, "flos": 22273091078400.0, "grad_norm": 15.269430613730918, "language_loss": 0.80278313, "learning_rate": 3.7914535353081973e-06, "loss": 0.82293278, "num_input_tokens_seen": 62069000, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.35791016, "step": 2865, "time_per_iteration": 2.6830692291259766 }, { "auxiliary_loss_clip": 0.01576678, "auxiliary_loss_mlp": 0.0039839, "balance_loss_clip": 1.23932767, "balance_loss_mlp": 0.36227, "epoch": 0.17231324214640012, "flos": 21287774125440.0, "grad_norm": 16.62298810063414, "language_loss": 0.8651793, "learning_rate": 3.7912803447535145e-06, "loss": 0.88493001, "num_input_tokens_seen": 62086750, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.36132812, "step": 2866, "time_per_iteration": 2.661207675933838 }, { "auxiliary_loss_clip": 0.01592461, "auxiliary_loss_mlp": 0.00424143, "balance_loss_clip": 1.25185013, "balance_loss_mlp": 0.38234797, "epoch": 0.17237336539906808, "flos": 19680735640320.0, "grad_norm": 2.3027466197440725, "language_loss": 0.85206628, "learning_rate": 3.7911070862733016e-06, "loss": 0.87223232, "num_input_tokens_seen": 62106240, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.41748047, "step": 2867, "time_per_iteration": 2.6447298526763916 }, { "auxiliary_loss_clip": 0.01583327, "auxiliary_loss_mlp": 0.00428947, "balance_loss_clip": 1.24739957, "balance_loss_mlp": 0.39153942, "epoch": 0.17243348865173605, "flos": 17529650784000.0, "grad_norm": 28.360011264946444, "language_loss": 0.85568464, "learning_rate": 3.7909337598741276e-06, "loss": 0.8758074, "num_input_tokens_seen": 62124895, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.37451172, "step": 2868, "time_per_iteration": 2.700200319290161 }, { "auxiliary_loss_clip": 0.01590867, "auxiliary_loss_mlp": 0.00437063, "balance_loss_clip": 1.25185227, "balance_loss_mlp": 0.39882106, "epoch": 0.17249361190440402, "flos": 18259858368000.0, "grad_norm": 10.913980959844753, "language_loss": 0.89937437, "learning_rate": 3.7907603655625674e-06, "loss": 0.91965365, "num_input_tokens_seen": 62143510, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.38256836, "step": 2869, "time_per_iteration": 2.572721242904663 }, { "auxiliary_loss_clip": 0.01577495, "auxiliary_loss_mlp": 0.00369882, "balance_loss_clip": 1.24228406, "balance_loss_mlp": 0.33471549, "epoch": 0.172553735157072, "flos": 21174367910400.0, "grad_norm": 48.51235025531055, "language_loss": 0.83528829, "learning_rate": 3.7905869033451932e-06, "loss": 0.85476208, "num_input_tokens_seen": 62162285, "router_z_loss_clip": 3.3515625, "router_z_loss_mlp": 0.35180664, "step": 2870, "time_per_iteration": 2.657640218734741 }, { "auxiliary_loss_clip": 0.01587733, "auxiliary_loss_mlp": 0.0040183, "balance_loss_clip": 1.25555539, "balance_loss_mlp": 0.36921456, "epoch": 0.17261385840973997, "flos": 22273270646400.0, "grad_norm": 7.677152564749584, "language_loss": 0.82970178, "learning_rate": 3.7904133732285857e-06, "loss": 0.84959745, "num_input_tokens_seen": 62180970, "router_z_loss_clip": 3.32421875, "router_z_loss_mlp": 0.32592773, "step": 2871, "time_per_iteration": 2.5949013233184814 }, { "auxiliary_loss_clip": 0.01593417, "auxiliary_loss_mlp": 0.00371812, "balance_loss_clip": 1.2535646, "balance_loss_mlp": 0.33838564, "epoch": 0.17267398166240794, "flos": 27922233830400.0, "grad_norm": 27.564721315137902, "language_loss": 0.83275104, "learning_rate": 3.7902397752193228e-06, "loss": 0.85240334, "num_input_tokens_seen": 62198965, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.33422852, "step": 2872, "time_per_iteration": 2.750235080718994 }, { "auxiliary_loss_clip": 0.01577911, "auxiliary_loss_mlp": 0.00362736, "balance_loss_clip": 1.24798131, "balance_loss_mlp": 0.32654357, "epoch": 0.1727341049150759, "flos": 21945118970880.0, "grad_norm": 12.922535113781615, "language_loss": 0.87429404, "learning_rate": 3.790066109323988e-06, "loss": 0.8937006, "num_input_tokens_seen": 62219890, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.36206055, "step": 2873, "time_per_iteration": 2.6698966026306152 }, { "auxiliary_loss_clip": 0.01585722, "auxiliary_loss_mlp": 0.00339382, "balance_loss_clip": 1.2517271, "balance_loss_mlp": 0.30450135, "epoch": 0.17279422816774387, "flos": 18107883924480.0, "grad_norm": 8.626853250133276, "language_loss": 0.81331134, "learning_rate": 3.7898923755491678e-06, "loss": 0.83256233, "num_input_tokens_seen": 62237140, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.34912109, "step": 2874, "time_per_iteration": 2.619825839996338 }, { "auxiliary_loss_clip": 0.01596486, "auxiliary_loss_mlp": 0.00386172, "balance_loss_clip": 1.25883949, "balance_loss_mlp": 0.34571266, "epoch": 0.17285435142041183, "flos": 21835447770240.0, "grad_norm": 3.035800439321808, "language_loss": 0.88006592, "learning_rate": 3.7897185739014487e-06, "loss": 0.89989251, "num_input_tokens_seen": 62255405, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.40478516, "step": 2875, "time_per_iteration": 2.583841562271118 }, { "auxiliary_loss_clip": 0.01580444, "auxiliary_loss_mlp": 0.00371598, "balance_loss_clip": 1.24768186, "balance_loss_mlp": 0.33433324, "epoch": 0.17291447467307983, "flos": 18368452160640.0, "grad_norm": 4.674852513174099, "language_loss": 0.97543585, "learning_rate": 3.7895447043874217e-06, "loss": 0.99495631, "num_input_tokens_seen": 62271280, "router_z_loss_clip": 3.328125, "router_z_loss_mlp": 0.37304688, "step": 2876, "time_per_iteration": 2.574932336807251 }, { "auxiliary_loss_clip": 0.01601962, "auxiliary_loss_mlp": 0.00367482, "balance_loss_clip": 1.27176642, "balance_loss_mlp": 0.3329111, "epoch": 0.1729745979257478, "flos": 18624638937600.0, "grad_norm": 6.499217159425447, "language_loss": 0.91084963, "learning_rate": 3.789370767013681e-06, "loss": 0.93054402, "num_input_tokens_seen": 62289140, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.34570312, "step": 2877, "time_per_iteration": 2.628335952758789 }, { "auxiliary_loss_clip": 0.0160922, "auxiliary_loss_mlp": 0.00359238, "balance_loss_clip": 1.27446008, "balance_loss_mlp": 0.32285553, "epoch": 0.17303472117841576, "flos": 22998234844800.0, "grad_norm": 45.291922993575035, "language_loss": 0.84763825, "learning_rate": 3.7891967617868204e-06, "loss": 0.8673228, "num_input_tokens_seen": 62307490, "router_z_loss_clip": 3.3515625, "router_z_loss_mlp": 0.36352539, "step": 2878, "time_per_iteration": 2.60723876953125 }, { "auxiliary_loss_clip": 0.01590195, "auxiliary_loss_mlp": 0.0035414, "balance_loss_clip": 1.25985241, "balance_loss_mlp": 0.31806698, "epoch": 0.17309484443108372, "flos": 25664386775040.0, "grad_norm": 6.531736077439721, "language_loss": 0.76387012, "learning_rate": 3.78902268871344e-06, "loss": 0.78331351, "num_input_tokens_seen": 62328570, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.36035156, "step": 2879, "time_per_iteration": 2.6278786659240723 }, { "auxiliary_loss_clip": 0.01608715, "auxiliary_loss_mlp": 0.00378227, "balance_loss_clip": 1.27330804, "balance_loss_mlp": 0.34027085, "epoch": 0.1731549676837517, "flos": 13552903313280.0, "grad_norm": 68.48929197781315, "language_loss": 0.91919184, "learning_rate": 3.78884854780014e-06, "loss": 0.93906128, "num_input_tokens_seen": 62345735, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.37963867, "step": 2880, "time_per_iteration": 2.612574577331543 }, { "auxiliary_loss_clip": 0.01618108, "auxiliary_loss_mlp": 0.00389651, "balance_loss_clip": 1.28299117, "balance_loss_mlp": 0.35107481, "epoch": 0.17321509093641965, "flos": 22857070394880.0, "grad_norm": 10.968661474908936, "language_loss": 0.87553382, "learning_rate": 3.7886743390535236e-06, "loss": 0.89561146, "num_input_tokens_seen": 62365525, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.38598633, "step": 2881, "time_per_iteration": 2.730792284011841 }, { "auxiliary_loss_clip": 0.01598009, "auxiliary_loss_mlp": 0.00368109, "balance_loss_clip": 1.2681396, "balance_loss_mlp": 0.33267987, "epoch": 0.17327521418908762, "flos": 24352785653760.0, "grad_norm": 10.183450504537923, "language_loss": 0.83834422, "learning_rate": 3.788500062480197e-06, "loss": 0.8580054, "num_input_tokens_seen": 62385160, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.35424805, "step": 2882, "time_per_iteration": 2.7336947917938232 }, { "auxiliary_loss_clip": 0.01611117, "auxiliary_loss_mlp": 0.00366005, "balance_loss_clip": 1.28305721, "balance_loss_mlp": 0.33002803, "epoch": 0.1733353374417556, "flos": 33105651816960.0, "grad_norm": 3.228718968165747, "language_loss": 0.81622279, "learning_rate": 3.788325718086769e-06, "loss": 0.83599401, "num_input_tokens_seen": 62405280, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.35961914, "step": 2883, "time_per_iteration": 2.7368886470794678 }, { "auxiliary_loss_clip": 0.01598902, "auxiliary_loss_mlp": 0.00371454, "balance_loss_clip": 1.26809013, "balance_loss_mlp": 0.33330733, "epoch": 0.17339546069442358, "flos": 24388947671040.0, "grad_norm": 13.028930845886626, "language_loss": 0.91194785, "learning_rate": 3.7881513058798503e-06, "loss": 0.93165141, "num_input_tokens_seen": 62423665, "router_z_loss_clip": 3.30664062, "router_z_loss_mlp": 0.38183594, "step": 2884, "time_per_iteration": 2.621738910675049 }, { "auxiliary_loss_clip": 0.01631193, "auxiliary_loss_mlp": 0.00387395, "balance_loss_clip": 1.29682231, "balance_loss_mlp": 0.35170352, "epoch": 0.17345558394709154, "flos": 27454174680960.0, "grad_norm": 16.70027813332801, "language_loss": 0.80197388, "learning_rate": 3.787976825866055e-06, "loss": 0.82215977, "num_input_tokens_seen": 62445170, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.35693359, "step": 2885, "time_per_iteration": 2.695791006088257 }, { "auxiliary_loss_clip": 0.01609248, "auxiliary_loss_mlp": 0.00318444, "balance_loss_clip": 1.28542733, "balance_loss_mlp": 0.28713953, "epoch": 0.1735157071997595, "flos": 24682158391680.0, "grad_norm": 6.932721830328607, "language_loss": 0.75935137, "learning_rate": 3.7878022780519998e-06, "loss": 0.77862829, "num_input_tokens_seen": 62466135, "router_z_loss_clip": 3.234375, "router_z_loss_mlp": 0.31274414, "step": 2886, "time_per_iteration": 2.6621205806732178 }, { "auxiliary_loss_clip": 0.0161782, "auxiliary_loss_mlp": 0.0036193, "balance_loss_clip": 1.28891516, "balance_loss_mlp": 0.32385454, "epoch": 0.17357583045242747, "flos": 21688932193920.0, "grad_norm": 9.930228298911933, "language_loss": 0.77582335, "learning_rate": 3.7876276624443024e-06, "loss": 0.79562086, "num_input_tokens_seen": 62483910, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.38061523, "step": 2887, "time_per_iteration": 2.6286046504974365 }, { "auxiliary_loss_clip": 0.01608346, "auxiliary_loss_mlp": 0.00374176, "balance_loss_clip": 1.28270197, "balance_loss_mlp": 0.33605278, "epoch": 0.17363595370509544, "flos": 15375728753280.0, "grad_norm": 66.80689679060514, "language_loss": 0.91617334, "learning_rate": 3.787452979049585e-06, "loss": 0.93599856, "num_input_tokens_seen": 62501530, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.38110352, "step": 2888, "time_per_iteration": 4.02417516708374 }, { "auxiliary_loss_clip": 0.0163217, "auxiliary_loss_mlp": 0.00386811, "balance_loss_clip": 1.29867196, "balance_loss_mlp": 0.34628007, "epoch": 0.1736960769577634, "flos": 23440941970560.0, "grad_norm": 19.88501214419633, "language_loss": 0.88021713, "learning_rate": 3.7872782278744718e-06, "loss": 0.90040696, "num_input_tokens_seen": 62521295, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.40527344, "step": 2889, "time_per_iteration": 2.6377077102661133 }, { "auxiliary_loss_clip": 0.01596061, "auxiliary_loss_mlp": 0.00360905, "balance_loss_clip": 1.27159023, "balance_loss_mlp": 0.32633463, "epoch": 0.1737562002104314, "flos": 18587830475520.0, "grad_norm": 133.64726117882464, "language_loss": 0.91790432, "learning_rate": 3.7871034089255883e-06, "loss": 0.93747401, "num_input_tokens_seen": 62539615, "router_z_loss_clip": 3.24414062, "router_z_loss_mlp": 0.34570312, "step": 2890, "time_per_iteration": 2.64852237701416 }, { "auxiliary_loss_clip": 0.01608835, "auxiliary_loss_mlp": 0.00386094, "balance_loss_clip": 1.27760363, "balance_loss_mlp": 0.34890032, "epoch": 0.17381632346309936, "flos": 15998060816640.0, "grad_norm": 15.781227661519257, "language_loss": 0.89875042, "learning_rate": 3.7869285222095653e-06, "loss": 0.91869974, "num_input_tokens_seen": 62556820, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.37207031, "step": 2891, "time_per_iteration": 2.5430305004119873 }, { "auxiliary_loss_clip": 0.01590069, "auxiliary_loss_mlp": 0.00364138, "balance_loss_clip": 1.26050878, "balance_loss_mlp": 0.32951951, "epoch": 0.17387644671576732, "flos": 13369830670080.0, "grad_norm": 8.993123411689751, "language_loss": 0.88440019, "learning_rate": 3.7867535677330334e-06, "loss": 0.90394223, "num_input_tokens_seen": 62572450, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.34643555, "step": 2892, "time_per_iteration": 3.9830968379974365 }, { "auxiliary_loss_clip": 0.01611171, "auxiliary_loss_mlp": 0.00396143, "balance_loss_clip": 1.28262115, "balance_loss_mlp": 0.35461077, "epoch": 0.1739365699684353, "flos": 26615516958720.0, "grad_norm": 12.351561806973393, "language_loss": 0.81906676, "learning_rate": 3.786578545502627e-06, "loss": 0.83913994, "num_input_tokens_seen": 62592580, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.41503906, "step": 2893, "time_per_iteration": 2.6654129028320312 }, { "auxiliary_loss_clip": 0.01602249, "auxiliary_loss_mlp": 0.00401749, "balance_loss_clip": 1.2743336, "balance_loss_mlp": 0.3629581, "epoch": 0.17399669322110325, "flos": 23367971491200.0, "grad_norm": 24.058606751004426, "language_loss": 0.8996911, "learning_rate": 3.7864034555249828e-06, "loss": 0.91973102, "num_input_tokens_seen": 62611220, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.38793945, "step": 2894, "time_per_iteration": 4.076308250427246 }, { "auxiliary_loss_clip": 0.0161018, "auxiliary_loss_mlp": 0.00363714, "balance_loss_clip": 1.28257573, "balance_loss_mlp": 0.32296783, "epoch": 0.17405681647377122, "flos": 22054107813120.0, "grad_norm": 10.093248332996742, "language_loss": 0.81053376, "learning_rate": 3.786228297806741e-06, "loss": 0.83027267, "num_input_tokens_seen": 62629185, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.4074707, "step": 2895, "time_per_iteration": 2.703037977218628 }, { "auxiliary_loss_clip": 0.01532734, "auxiliary_loss_mlp": 0.00160578, "balance_loss_clip": 1.27944827, "balance_loss_mlp": 0.14856122, "epoch": 0.1741169397264392, "flos": 61457559114240.0, "grad_norm": 0.8615490908528676, "language_loss": 0.62389958, "learning_rate": 3.7860530723545435e-06, "loss": 0.64083266, "num_input_tokens_seen": 62691895, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.12011719, "step": 2896, "time_per_iteration": 3.1833369731903076 }, { "auxiliary_loss_clip": 0.01604582, "auxiliary_loss_mlp": 0.00371149, "balance_loss_clip": 1.27812648, "balance_loss_mlp": 0.33345526, "epoch": 0.17417706297910718, "flos": 27017680608000.0, "grad_norm": 4.568816182369421, "language_loss": 0.8203221, "learning_rate": 3.785877779175034e-06, "loss": 0.84007943, "num_input_tokens_seen": 62713790, "router_z_loss_clip": 3.26757812, "router_z_loss_mlp": 0.37695312, "step": 2897, "time_per_iteration": 2.663837432861328 }, { "auxiliary_loss_clip": 0.01607266, "auxiliary_loss_mlp": 0.00374937, "balance_loss_clip": 1.28561187, "balance_loss_mlp": 0.33729118, "epoch": 0.17423718623177514, "flos": 33508856960640.0, "grad_norm": 37.498714508948694, "language_loss": 0.7570107, "learning_rate": 3.7857024182748606e-06, "loss": 0.7768327, "num_input_tokens_seen": 62736285, "router_z_loss_clip": 3.21679688, "router_z_loss_mlp": 0.3762207, "step": 2898, "time_per_iteration": 2.7769758701324463 }, { "auxiliary_loss_clip": 0.0161216, "auxiliary_loss_mlp": 0.00400165, "balance_loss_clip": 1.28530169, "balance_loss_mlp": 0.36082608, "epoch": 0.1742973094844431, "flos": 27198634348800.0, "grad_norm": 4.268951667534003, "language_loss": 0.85133278, "learning_rate": 3.7855269896606717e-06, "loss": 0.87145603, "num_input_tokens_seen": 62756240, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.39331055, "step": 2899, "time_per_iteration": 2.6533725261688232 }, { "auxiliary_loss_clip": 0.01582769, "auxiliary_loss_mlp": 0.00361439, "balance_loss_clip": 1.2630254, "balance_loss_mlp": 0.3262009, "epoch": 0.17435743273711107, "flos": 22710734386560.0, "grad_norm": 38.374200433209104, "language_loss": 0.77501225, "learning_rate": 3.785351493339121e-06, "loss": 0.79445434, "num_input_tokens_seen": 62775910, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.35205078, "step": 2900, "time_per_iteration": 2.613215684890747 }, { "auxiliary_loss_clip": 0.01597426, "auxiliary_loss_mlp": 0.00357499, "balance_loss_clip": 1.27692652, "balance_loss_mlp": 0.32114041, "epoch": 0.17441755598977904, "flos": 41646466039680.0, "grad_norm": 6.686397054162063, "language_loss": 0.75523388, "learning_rate": 3.785175929316863e-06, "loss": 0.77478313, "num_input_tokens_seen": 62799385, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.36352539, "step": 2901, "time_per_iteration": 2.758744239807129 }, { "auxiliary_loss_clip": 0.01617854, "auxiliary_loss_mlp": 0.00396284, "balance_loss_clip": 1.29500806, "balance_loss_mlp": 0.3600443, "epoch": 0.174477679242447, "flos": 26287077974400.0, "grad_norm": 3.325677044410505, "language_loss": 0.79899657, "learning_rate": 3.7850002976005543e-06, "loss": 0.81913793, "num_input_tokens_seen": 62819380, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.36230469, "step": 2902, "time_per_iteration": 2.6566319465637207 }, { "auxiliary_loss_clip": 0.01601601, "auxiliary_loss_mlp": 0.00379343, "balance_loss_clip": 1.28042245, "balance_loss_mlp": 0.34210208, "epoch": 0.174537802495115, "flos": 17858412990720.0, "grad_norm": 52.36683900420012, "language_loss": 0.86866605, "learning_rate": 3.7848245981968558e-06, "loss": 0.88847554, "num_input_tokens_seen": 62836205, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.37231445, "step": 2903, "time_per_iteration": 2.5493059158325195 }, { "auxiliary_loss_clip": 0.0159092, "auxiliary_loss_mlp": 0.00409244, "balance_loss_clip": 1.27330947, "balance_loss_mlp": 0.37271842, "epoch": 0.17459792574778296, "flos": 16940715390720.0, "grad_norm": 1.751282112256498, "language_loss": 0.78762889, "learning_rate": 3.784648831112429e-06, "loss": 0.80763054, "num_input_tokens_seen": 62854045, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.36572266, "step": 2904, "time_per_iteration": 2.5679984092712402 }, { "auxiliary_loss_clip": 0.01589778, "auxiliary_loss_mlp": 0.00326536, "balance_loss_clip": 1.2674185, "balance_loss_mlp": 0.29434934, "epoch": 0.17465804900045093, "flos": 25520026014720.0, "grad_norm": 2.4991700814130833, "language_loss": 0.71851599, "learning_rate": 3.7844729963539406e-06, "loss": 0.73767912, "num_input_tokens_seen": 62873075, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.32177734, "step": 2905, "time_per_iteration": 2.6622025966644287 }, { "auxiliary_loss_clip": 0.01601111, "auxiliary_loss_mlp": 0.00398787, "balance_loss_clip": 1.27539778, "balance_loss_mlp": 0.36183202, "epoch": 0.1747181722531189, "flos": 24129708238080.0, "grad_norm": 48.261952315982604, "language_loss": 0.84766102, "learning_rate": 3.7842970939280566e-06, "loss": 0.86766005, "num_input_tokens_seen": 62892675, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.36938477, "step": 2906, "time_per_iteration": 2.6521172523498535 }, { "auxiliary_loss_clip": 0.01599127, "auxiliary_loss_mlp": 0.00383298, "balance_loss_clip": 1.27790368, "balance_loss_mlp": 0.34708241, "epoch": 0.17477829550578686, "flos": 17748813617280.0, "grad_norm": 10.63724007889585, "language_loss": 0.86949801, "learning_rate": 3.784121123841449e-06, "loss": 0.88932228, "num_input_tokens_seen": 62910675, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.36206055, "step": 2907, "time_per_iteration": 2.6125190258026123 }, { "auxiliary_loss_clip": 0.01585183, "auxiliary_loss_mlp": 0.00361715, "balance_loss_clip": 1.26188719, "balance_loss_mlp": 0.32712024, "epoch": 0.17483841875845482, "flos": 15377344865280.0, "grad_norm": 21.354094184309663, "language_loss": 0.88914657, "learning_rate": 3.7839450861007886e-06, "loss": 0.90861547, "num_input_tokens_seen": 62928130, "router_z_loss_clip": 3.23046875, "router_z_loss_mlp": 0.34619141, "step": 2908, "time_per_iteration": 2.6068763732910156 }, { "auxiliary_loss_clip": 0.01587302, "auxiliary_loss_mlp": 0.00390461, "balance_loss_clip": 1.26719642, "balance_loss_mlp": 0.35620067, "epoch": 0.17489854201112282, "flos": 17163254102400.0, "grad_norm": 14.529207281734951, "language_loss": 0.89763999, "learning_rate": 3.7837689807127518e-06, "loss": 0.91741759, "num_input_tokens_seen": 62944290, "router_z_loss_clip": 3.20507812, "router_z_loss_mlp": 0.3425293, "step": 2909, "time_per_iteration": 2.538262128829956 }, { "auxiliary_loss_clip": 0.01598266, "auxiliary_loss_mlp": 0.00436541, "balance_loss_clip": 1.2728889, "balance_loss_mlp": 0.39631939, "epoch": 0.17495866526379078, "flos": 19755286318080.0, "grad_norm": 5.124005520305537, "language_loss": 0.82317436, "learning_rate": 3.783592807684017e-06, "loss": 0.84352243, "num_input_tokens_seen": 62963505, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.40234375, "step": 2910, "time_per_iteration": 2.6428537368774414 }, { "auxiliary_loss_clip": 0.01578194, "auxiliary_loss_mlp": 0.00378694, "balance_loss_clip": 1.25941813, "balance_loss_mlp": 0.3385438, "epoch": 0.17501878851645875, "flos": 28511133310080.0, "grad_norm": 3.90100587792205, "language_loss": 0.91507351, "learning_rate": 3.7834165670212645e-06, "loss": 0.93464231, "num_input_tokens_seen": 62985020, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.40185547, "step": 2911, "time_per_iteration": 2.660956382751465 }, { "auxiliary_loss_clip": 0.01582655, "auxiliary_loss_mlp": 0.00393829, "balance_loss_clip": 1.25783026, "balance_loss_mlp": 0.35615936, "epoch": 0.1750789117691267, "flos": 17931203902080.0, "grad_norm": 147.70845487285666, "language_loss": 0.9506216, "learning_rate": 3.7832402587311764e-06, "loss": 0.97038651, "num_input_tokens_seen": 63001745, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.37646484, "step": 2912, "time_per_iteration": 2.610936403274536 }, { "auxiliary_loss_clip": 0.01575759, "auxiliary_loss_mlp": 0.00379939, "balance_loss_clip": 1.2592423, "balance_loss_mlp": 0.34353295, "epoch": 0.17513903502179468, "flos": 18259427404800.0, "grad_norm": 26.16938017364483, "language_loss": 0.80345166, "learning_rate": 3.783063882820439e-06, "loss": 0.82300872, "num_input_tokens_seen": 63019750, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.36425781, "step": 2913, "time_per_iteration": 2.6082494258880615 }, { "auxiliary_loss_clip": 0.01592171, "auxiliary_loss_mlp": 0.00365247, "balance_loss_clip": 1.27639866, "balance_loss_mlp": 0.33134434, "epoch": 0.17519915827446264, "flos": 20704728562560.0, "grad_norm": 5.242876795117796, "language_loss": 0.77149445, "learning_rate": 3.782887439295741e-06, "loss": 0.79106867, "num_input_tokens_seen": 63039500, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.33911133, "step": 2914, "time_per_iteration": 2.6174118518829346 }, { "auxiliary_loss_clip": 0.01600625, "auxiliary_loss_mlp": 0.00430517, "balance_loss_clip": 1.28212202, "balance_loss_mlp": 0.39141595, "epoch": 0.1752592815271306, "flos": 20523415685760.0, "grad_norm": 5.795631581172803, "language_loss": 0.98740137, "learning_rate": 3.782710928163772e-06, "loss": 1.00771284, "num_input_tokens_seen": 63059785, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.39086914, "step": 2915, "time_per_iteration": 2.6912147998809814 }, { "auxiliary_loss_clip": 0.01587949, "auxiliary_loss_mlp": 0.00414755, "balance_loss_clip": 1.26779032, "balance_loss_mlp": 0.37830073, "epoch": 0.1753194047797986, "flos": 21799178012160.0, "grad_norm": 350.89470061078777, "language_loss": 0.86085725, "learning_rate": 3.782534349431226e-06, "loss": 0.88088429, "num_input_tokens_seen": 63079385, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.36474609, "step": 2916, "time_per_iteration": 2.681788921356201 }, { "auxiliary_loss_clip": 0.01618591, "auxiliary_loss_mlp": 0.00408516, "balance_loss_clip": 1.29518652, "balance_loss_mlp": 0.37179971, "epoch": 0.17537952803246656, "flos": 20668351063680.0, "grad_norm": 491.97340489743493, "language_loss": 0.79733551, "learning_rate": 3.782357703104799e-06, "loss": 0.81760657, "num_input_tokens_seen": 63098970, "router_z_loss_clip": 3.23046875, "router_z_loss_mlp": 0.36743164, "step": 2917, "time_per_iteration": 2.674717426300049 }, { "auxiliary_loss_clip": 0.01589015, "auxiliary_loss_mlp": 0.00373005, "balance_loss_clip": 1.27175236, "balance_loss_mlp": 0.33709908, "epoch": 0.17543965128513453, "flos": 23295072839040.0, "grad_norm": 5.496399629848053, "language_loss": 0.8346073, "learning_rate": 3.7821809891911897e-06, "loss": 0.85422754, "num_input_tokens_seen": 63118750, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.35888672, "step": 2918, "time_per_iteration": 2.602198600769043 }, { "auxiliary_loss_clip": 0.01604583, "auxiliary_loss_mlp": 0.00388511, "balance_loss_clip": 1.27981532, "balance_loss_mlp": 0.35205692, "epoch": 0.1754997745378025, "flos": 29095615416960.0, "grad_norm": 25.443771634742056, "language_loss": 0.81094688, "learning_rate": 3.782004207697098e-06, "loss": 0.83087778, "num_input_tokens_seen": 63136865, "router_z_loss_clip": 3.24804688, "router_z_loss_mlp": 0.36450195, "step": 2919, "time_per_iteration": 2.681159257888794 }, { "auxiliary_loss_clip": 0.01605154, "auxiliary_loss_mlp": 0.0043776, "balance_loss_clip": 1.28187251, "balance_loss_mlp": 0.40037608, "epoch": 0.17555989779047046, "flos": 30371844620160.0, "grad_norm": 21.530003988813743, "language_loss": 0.79337358, "learning_rate": 3.781827358629228e-06, "loss": 0.81380284, "num_input_tokens_seen": 63158325, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.3737793, "step": 2920, "time_per_iteration": 2.659468650817871 }, { "auxiliary_loss_clip": 0.01602124, "auxiliary_loss_mlp": 0.00417619, "balance_loss_clip": 1.28167009, "balance_loss_mlp": 0.38185662, "epoch": 0.17562002104313842, "flos": 23287746464640.0, "grad_norm": 75.49101778596429, "language_loss": 0.86763257, "learning_rate": 3.7816504419942873e-06, "loss": 0.88783002, "num_input_tokens_seen": 63173115, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.35766602, "step": 2921, "time_per_iteration": 2.675539970397949 }, { "auxiliary_loss_clip": 0.01608216, "auxiliary_loss_mlp": 0.00463809, "balance_loss_clip": 1.28321838, "balance_loss_mlp": 0.42568564, "epoch": 0.1756801442958064, "flos": 24790500789120.0, "grad_norm": 26.40520267773833, "language_loss": 0.92113149, "learning_rate": 3.7814734577989823e-06, "loss": 0.94185174, "num_input_tokens_seen": 63192880, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.38110352, "step": 2922, "time_per_iteration": 2.6112544536590576 }, { "auxiliary_loss_clip": 0.01610331, "auxiliary_loss_mlp": 0.00428057, "balance_loss_clip": 1.28202629, "balance_loss_mlp": 0.38878909, "epoch": 0.17574026754847438, "flos": 25771651764480.0, "grad_norm": 7.002267340918597, "language_loss": 0.73321843, "learning_rate": 3.7812964060500253e-06, "loss": 0.75360233, "num_input_tokens_seen": 63214395, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.39282227, "step": 2923, "time_per_iteration": 2.6469991207122803 }, { "auxiliary_loss_clip": 0.0163022, "auxiliary_loss_mlp": 0.00443538, "balance_loss_clip": 1.30487692, "balance_loss_mlp": 0.40353128, "epoch": 0.17580039080114235, "flos": 17456608477440.0, "grad_norm": 9.667956862944681, "language_loss": 0.89992672, "learning_rate": 3.78111928675413e-06, "loss": 0.92066431, "num_input_tokens_seen": 63231020, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.40014648, "step": 2924, "time_per_iteration": 2.554562568664551 }, { "auxiliary_loss_clip": 0.01632631, "auxiliary_loss_mlp": 0.00442953, "balance_loss_clip": 1.30426228, "balance_loss_mlp": 0.40337533, "epoch": 0.1758605140538103, "flos": 14864648088960.0, "grad_norm": 21.73680363197503, "language_loss": 0.79385144, "learning_rate": 3.7809420999180126e-06, "loss": 0.81460726, "num_input_tokens_seen": 63246245, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.39550781, "step": 2925, "time_per_iteration": 2.560873508453369 }, { "auxiliary_loss_clip": 0.01628887, "auxiliary_loss_mlp": 0.00437879, "balance_loss_clip": 1.30445957, "balance_loss_mlp": 0.40028062, "epoch": 0.17592063730647828, "flos": 23004268329600.0, "grad_norm": 2.4194001496581854, "language_loss": 0.74484652, "learning_rate": 3.7807648455483934e-06, "loss": 0.76551414, "num_input_tokens_seen": 63267790, "router_z_loss_clip": 3.24414062, "router_z_loss_mlp": 0.37548828, "step": 2926, "time_per_iteration": 2.603431224822998 }, { "auxiliary_loss_clip": 0.01632783, "auxiliary_loss_mlp": 0.00442581, "balance_loss_clip": 1.29832661, "balance_loss_mlp": 0.40111953, "epoch": 0.17598076055914624, "flos": 20741501111040.0, "grad_norm": 146.88174429117282, "language_loss": 0.93348622, "learning_rate": 3.7805875236519918e-06, "loss": 0.95423979, "num_input_tokens_seen": 63286830, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.41430664, "step": 2927, "time_per_iteration": 2.6514618396759033 }, { "auxiliary_loss_clip": 0.01641799, "auxiliary_loss_mlp": 0.00390354, "balance_loss_clip": 1.31493592, "balance_loss_mlp": 0.35656977, "epoch": 0.1760408838118142, "flos": 34092441227520.0, "grad_norm": 21.36636927406695, "language_loss": 0.76367545, "learning_rate": 3.7804101342355336e-06, "loss": 0.78399694, "num_input_tokens_seen": 63308870, "router_z_loss_clip": 3.265625, "router_z_loss_mlp": 0.33789062, "step": 2928, "time_per_iteration": 2.792037010192871 }, { "auxiliary_loss_clip": 0.01647981, "auxiliary_loss_mlp": 0.004187, "balance_loss_clip": 1.31929374, "balance_loss_mlp": 0.3821741, "epoch": 0.1761010070644822, "flos": 24168384207360.0, "grad_norm": 25.618610409814156, "language_loss": 0.88052809, "learning_rate": 3.780232677305744e-06, "loss": 0.90119493, "num_input_tokens_seen": 63329005, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.36499023, "step": 2929, "time_per_iteration": 2.637108564376831 }, { "auxiliary_loss_clip": 0.01657872, "auxiliary_loss_mlp": 0.00438965, "balance_loss_clip": 1.32631516, "balance_loss_mlp": 0.40167686, "epoch": 0.17616113031715017, "flos": 26576697335040.0, "grad_norm": 38.91506179530911, "language_loss": 0.86022699, "learning_rate": 3.7800551528693535e-06, "loss": 0.88119531, "num_input_tokens_seen": 63349390, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.37304688, "step": 2930, "time_per_iteration": 4.185786724090576 }, { "auxiliary_loss_clip": 0.01645231, "auxiliary_loss_mlp": 0.00402437, "balance_loss_clip": 1.31648839, "balance_loss_mlp": 0.36603099, "epoch": 0.17622125356981813, "flos": 25666685245440.0, "grad_norm": 13.05541651622911, "language_loss": 0.84028715, "learning_rate": 3.7798775609330927e-06, "loss": 0.86076379, "num_input_tokens_seen": 63368835, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.36425781, "step": 2931, "time_per_iteration": 2.6340599060058594 }, { "auxiliary_loss_clip": 0.01674262, "auxiliary_loss_mlp": 0.00403149, "balance_loss_clip": 1.34208357, "balance_loss_mlp": 0.37008068, "epoch": 0.1762813768224861, "flos": 16508530949760.0, "grad_norm": 5.4447707557350835, "language_loss": 0.83436322, "learning_rate": 3.779699901503696e-06, "loss": 0.85513735, "num_input_tokens_seen": 63385220, "router_z_loss_clip": 3.32226562, "router_z_loss_mlp": 0.33056641, "step": 2932, "time_per_iteration": 2.669804334640503 }, { "auxiliary_loss_clip": 0.01650202, "auxiliary_loss_mlp": 0.00442855, "balance_loss_clip": 1.31151271, "balance_loss_mlp": 0.40451756, "epoch": 0.17634150007515406, "flos": 11211850402560.0, "grad_norm": 6.97909541168818, "language_loss": 0.96766657, "learning_rate": 3.7795221745879016e-06, "loss": 0.98859715, "num_input_tokens_seen": 63400865, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.38330078, "step": 2933, "time_per_iteration": 2.6442039012908936 }, { "auxiliary_loss_clip": 0.01676802, "auxiliary_loss_mlp": 0.00445503, "balance_loss_clip": 1.34833312, "balance_loss_mlp": 0.4096688, "epoch": 0.17640162332782203, "flos": 23659925235840.0, "grad_norm": 22.062400279232012, "language_loss": 0.92080224, "learning_rate": 3.779344380192448e-06, "loss": 0.9420253, "num_input_tokens_seen": 63421390, "router_z_loss_clip": 3.28515625, "router_z_loss_mlp": 0.3581543, "step": 2934, "time_per_iteration": 4.027655601501465 }, { "auxiliary_loss_clip": 0.01676797, "auxiliary_loss_mlp": 0.00419326, "balance_loss_clip": 1.34497797, "balance_loss_mlp": 0.38294357, "epoch": 0.17646174658049, "flos": 53796984606720.0, "grad_norm": 8.430966167994338, "language_loss": 0.76824796, "learning_rate": 3.779166518324077e-06, "loss": 0.78920925, "num_input_tokens_seen": 63444715, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.36401367, "step": 2935, "time_per_iteration": 2.8843462467193604 }, { "auxiliary_loss_clip": 0.0167338, "auxiliary_loss_mlp": 0.00418557, "balance_loss_clip": 1.33619153, "balance_loss_mlp": 0.38312793, "epoch": 0.17652186983315798, "flos": 24243868638720.0, "grad_norm": 215.89171324135654, "language_loss": 0.7804361, "learning_rate": 3.7789885889895325e-06, "loss": 0.80135548, "num_input_tokens_seen": 63465525, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.35449219, "step": 2936, "time_per_iteration": 2.638343095779419 }, { "auxiliary_loss_clip": 0.01672947, "auxiliary_loss_mlp": 0.00422136, "balance_loss_clip": 1.34283781, "balance_loss_mlp": 0.38665906, "epoch": 0.17658199308582595, "flos": 27454282421760.0, "grad_norm": 5.151603016907603, "language_loss": 0.78139842, "learning_rate": 3.7788105921955634e-06, "loss": 0.80234921, "num_input_tokens_seen": 63485815, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.35498047, "step": 2937, "time_per_iteration": 4.051487684249878 }, { "auxiliary_loss_clip": 0.01677436, "auxiliary_loss_mlp": 0.00401206, "balance_loss_clip": 1.34153032, "balance_loss_mlp": 0.36513281, "epoch": 0.17664211633849392, "flos": 22418672901120.0, "grad_norm": 113.96221877319367, "language_loss": 0.84845877, "learning_rate": 3.7786325279489184e-06, "loss": 0.86924517, "num_input_tokens_seen": 63503905, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.36083984, "step": 2938, "time_per_iteration": 2.638329267501831 }, { "auxiliary_loss_clip": 0.01692726, "auxiliary_loss_mlp": 0.00382023, "balance_loss_clip": 1.35435128, "balance_loss_mlp": 0.34547395, "epoch": 0.17670223959116188, "flos": 24715124098560.0, "grad_norm": 9.834567763018205, "language_loss": 0.80581206, "learning_rate": 3.7784543962563495e-06, "loss": 0.82655954, "num_input_tokens_seen": 63521985, "router_z_loss_clip": 3.3828125, "router_z_loss_mlp": 0.36523438, "step": 2939, "time_per_iteration": 2.5841493606567383 }, { "auxiliary_loss_clip": 0.0169262, "auxiliary_loss_mlp": 0.00374332, "balance_loss_clip": 1.36250257, "balance_loss_mlp": 0.34212148, "epoch": 0.17676236284382985, "flos": 22527051212160.0, "grad_norm": 18.31124762552899, "language_loss": 0.83250117, "learning_rate": 3.7782761971246115e-06, "loss": 0.85317075, "num_input_tokens_seen": 63539830, "router_z_loss_clip": 3.29882812, "router_z_loss_mlp": 0.3215332, "step": 2940, "time_per_iteration": 2.6149258613586426 }, { "auxiliary_loss_clip": 0.01696319, "auxiliary_loss_mlp": 0.00385421, "balance_loss_clip": 1.36533344, "balance_loss_mlp": 0.35225686, "epoch": 0.1768224860964978, "flos": 12385160161920.0, "grad_norm": 86.14131534271506, "language_loss": 0.96077275, "learning_rate": 3.7780979305604616e-06, "loss": 0.98159015, "num_input_tokens_seen": 63555495, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.33178711, "step": 2941, "time_per_iteration": 2.5285208225250244 }, { "auxiliary_loss_clip": 0.01680912, "auxiliary_loss_mlp": 0.00368907, "balance_loss_clip": 1.34848094, "balance_loss_mlp": 0.33238173, "epoch": 0.1768826093491658, "flos": 24353360271360.0, "grad_norm": 67.75715478845214, "language_loss": 0.83356899, "learning_rate": 3.7779195965706607e-06, "loss": 0.85406715, "num_input_tokens_seen": 63575290, "router_z_loss_clip": 3.32617188, "router_z_loss_mlp": 0.36572266, "step": 2942, "time_per_iteration": 2.63105845451355 }, { "auxiliary_loss_clip": 0.01695853, "auxiliary_loss_mlp": 0.00364749, "balance_loss_clip": 1.36694086, "balance_loss_mlp": 0.32853323, "epoch": 0.17694273260183377, "flos": 23587062497280.0, "grad_norm": 11.366194466393583, "language_loss": 0.8774178, "learning_rate": 3.77774119516197e-06, "loss": 0.89802384, "num_input_tokens_seen": 63594670, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.36206055, "step": 2943, "time_per_iteration": 2.6408491134643555 }, { "auxiliary_loss_clip": 0.01697855, "auxiliary_loss_mlp": 0.00365484, "balance_loss_clip": 1.36424184, "balance_loss_mlp": 0.32862401, "epoch": 0.17700285585450173, "flos": 26760991040640.0, "grad_norm": 274.3347478158071, "language_loss": 0.87030655, "learning_rate": 3.777562726341155e-06, "loss": 0.89093995, "num_input_tokens_seen": 63614780, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.36816406, "step": 2944, "time_per_iteration": 2.624326467514038 }, { "auxiliary_loss_clip": 0.01704861, "auxiliary_loss_mlp": 0.00361897, "balance_loss_clip": 1.37440419, "balance_loss_mlp": 0.32696825, "epoch": 0.1770629791071697, "flos": 42776323320960.0, "grad_norm": 2.5124722969935656, "language_loss": 0.79054618, "learning_rate": 3.7773841901149835e-06, "loss": 0.81121373, "num_input_tokens_seen": 63637190, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.34936523, "step": 2945, "time_per_iteration": 2.759526014328003 }, { "auxiliary_loss_clip": 0.01703649, "auxiliary_loss_mlp": 0.00344105, "balance_loss_clip": 1.37417841, "balance_loss_mlp": 0.31151336, "epoch": 0.17712310235983766, "flos": 17345572560000.0, "grad_norm": 36.61454995864865, "language_loss": 0.87209976, "learning_rate": 3.7772055864902256e-06, "loss": 0.89257729, "num_input_tokens_seen": 63652140, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.32543945, "step": 2946, "time_per_iteration": 2.5537919998168945 }, { "auxiliary_loss_clip": 0.01707089, "auxiliary_loss_mlp": 0.00331169, "balance_loss_clip": 1.38010812, "balance_loss_mlp": 0.29788613, "epoch": 0.17718322561250563, "flos": 23878477537920.0, "grad_norm": 10.783045203490603, "language_loss": 0.83512717, "learning_rate": 3.7770269154736535e-06, "loss": 0.85550976, "num_input_tokens_seen": 63671700, "router_z_loss_clip": 3.26757812, "router_z_loss_mlp": 0.33276367, "step": 2947, "time_per_iteration": 2.5929367542266846 }, { "auxiliary_loss_clip": 0.01727974, "auxiliary_loss_mlp": 0.00353401, "balance_loss_clip": 1.40152514, "balance_loss_mlp": 0.31697044, "epoch": 0.1772433488651736, "flos": 36466352104320.0, "grad_norm": 54.89672201836535, "language_loss": 0.78584141, "learning_rate": 3.7768481770720424e-06, "loss": 0.80665517, "num_input_tokens_seen": 63691685, "router_z_loss_clip": 3.26757812, "router_z_loss_mlp": 0.36450195, "step": 2948, "time_per_iteration": 2.7245473861694336 }, { "auxiliary_loss_clip": 0.01719247, "auxiliary_loss_mlp": 0.00331097, "balance_loss_clip": 1.39323151, "balance_loss_mlp": 0.29812342, "epoch": 0.1773034721178416, "flos": 26684716510080.0, "grad_norm": 11.237604013678606, "language_loss": 0.86718899, "learning_rate": 3.776669371292171e-06, "loss": 0.88769239, "num_input_tokens_seen": 63711720, "router_z_loss_clip": 3.25976562, "router_z_loss_mlp": 0.32958984, "step": 2949, "time_per_iteration": 2.6410107612609863 }, { "auxiliary_loss_clip": 0.01627314, "auxiliary_loss_mlp": 0.00149781, "balance_loss_clip": 1.39291215, "balance_loss_mlp": 0.13847953, "epoch": 0.17736359537050955, "flos": 57117467617920.0, "grad_norm": 0.7861432250145216, "language_loss": 0.6513325, "learning_rate": 3.7764904981408186e-06, "loss": 0.66910338, "num_input_tokens_seen": 63776280, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.11279297, "step": 2950, "time_per_iteration": 3.1811716556549072 }, { "auxiliary_loss_clip": 0.01736788, "auxiliary_loss_mlp": 0.0034283, "balance_loss_clip": 1.41344702, "balance_loss_mlp": 0.30747265, "epoch": 0.17742371862317752, "flos": 27198203385600.0, "grad_norm": 10.759356774054934, "language_loss": 0.89189374, "learning_rate": 3.7763115576247686e-06, "loss": 0.91268992, "num_input_tokens_seen": 63797535, "router_z_loss_clip": 3.24023438, "router_z_loss_mlp": 0.35375977, "step": 2951, "time_per_iteration": 2.6922221183776855 }, { "auxiliary_loss_clip": 0.01729113, "auxiliary_loss_mlp": 0.00321605, "balance_loss_clip": 1.39563727, "balance_loss_mlp": 0.2859613, "epoch": 0.17748384187584548, "flos": 20959694277120.0, "grad_norm": 1011.9644537042604, "language_loss": 0.86944306, "learning_rate": 3.776132549750806e-06, "loss": 0.88995028, "num_input_tokens_seen": 63817045, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.35644531, "step": 2952, "time_per_iteration": 2.5570600032806396 }, { "auxiliary_loss_clip": 0.01738004, "auxiliary_loss_mlp": 0.00289055, "balance_loss_clip": 1.41097569, "balance_loss_mlp": 0.25469866, "epoch": 0.17754396512851345, "flos": 25009986844800.0, "grad_norm": 12.462222984757997, "language_loss": 0.87606007, "learning_rate": 3.7759534745257194e-06, "loss": 0.8963306, "num_input_tokens_seen": 63837665, "router_z_loss_clip": 3.26757812, "router_z_loss_mlp": 0.34375, "step": 2953, "time_per_iteration": 2.639462471008301 }, { "auxiliary_loss_clip": 0.01735225, "auxiliary_loss_mlp": 0.00328721, "balance_loss_clip": 1.39782691, "balance_loss_mlp": 0.29305398, "epoch": 0.1776040883811814, "flos": 32051566275840.0, "grad_norm": 3.371151765940661, "language_loss": 0.93892044, "learning_rate": 3.7757743319562994e-06, "loss": 0.95955998, "num_input_tokens_seen": 63858455, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.35668945, "step": 2954, "time_per_iteration": 2.7158682346343994 }, { "auxiliary_loss_clip": 0.01729755, "auxiliary_loss_mlp": 0.00333638, "balance_loss_clip": 1.40050173, "balance_loss_mlp": 0.29983044, "epoch": 0.17766421163384938, "flos": 21574125348480.0, "grad_norm": 8.437973349051529, "language_loss": 0.89335573, "learning_rate": 3.7755951220493386e-06, "loss": 0.91398966, "num_input_tokens_seen": 63876935, "router_z_loss_clip": 3.29296875, "router_z_loss_mlp": 0.33813477, "step": 2955, "time_per_iteration": 2.6209347248077393 }, { "auxiliary_loss_clip": 0.01737991, "auxiliary_loss_mlp": 0.00319681, "balance_loss_clip": 1.40009212, "balance_loss_mlp": 0.2837511, "epoch": 0.17772433488651737, "flos": 22419319345920.0, "grad_norm": 33.24802141760314, "language_loss": 0.78516418, "learning_rate": 3.7754158448116327e-06, "loss": 0.80574095, "num_input_tokens_seen": 63896815, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.35913086, "step": 2956, "time_per_iteration": 2.6015970706939697 }, { "auxiliary_loss_clip": 0.01744026, "auxiliary_loss_mlp": 0.00307125, "balance_loss_clip": 1.41112041, "balance_loss_mlp": 0.27195811, "epoch": 0.17778445813918534, "flos": 25629445820160.0, "grad_norm": 4.115659050266342, "language_loss": 0.89783955, "learning_rate": 3.7752365002499795e-06, "loss": 0.91835111, "num_input_tokens_seen": 63916140, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.35180664, "step": 2957, "time_per_iteration": 2.655021905899048 }, { "auxiliary_loss_clip": 0.01738989, "auxiliary_loss_mlp": 0.0031842, "balance_loss_clip": 1.40319526, "balance_loss_mlp": 0.2835632, "epoch": 0.1778445813918533, "flos": 25628871202560.0, "grad_norm": 28.016264880930855, "language_loss": 0.81137729, "learning_rate": 3.7750570883711807e-06, "loss": 0.83195144, "num_input_tokens_seen": 63935220, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.34838867, "step": 2958, "time_per_iteration": 2.6327555179595947 }, { "auxiliary_loss_clip": 0.01755487, "auxiliary_loss_mlp": 0.00323676, "balance_loss_clip": 1.41350555, "balance_loss_mlp": 0.2887243, "epoch": 0.17790470464452127, "flos": 22345522853760.0, "grad_norm": 66.3605069460464, "language_loss": 0.87486207, "learning_rate": 3.7748776091820397e-06, "loss": 0.89565372, "num_input_tokens_seen": 63954550, "router_z_loss_clip": 3.41796875, "router_z_loss_mlp": 0.34985352, "step": 2959, "time_per_iteration": 2.6341238021850586 }, { "auxiliary_loss_clip": 0.01740333, "auxiliary_loss_mlp": 0.00349226, "balance_loss_clip": 1.39871073, "balance_loss_mlp": 0.31122184, "epoch": 0.17796482789718923, "flos": 18765875214720.0, "grad_norm": 19.947908678351737, "language_loss": 0.61702979, "learning_rate": 3.774698062689362e-06, "loss": 0.63792551, "num_input_tokens_seen": 63972425, "router_z_loss_clip": 3.41601562, "router_z_loss_mlp": 0.38012695, "step": 2960, "time_per_iteration": 2.6058034896850586 }, { "auxiliary_loss_clip": 0.01751233, "auxiliary_loss_mlp": 0.00314273, "balance_loss_clip": 1.41222453, "balance_loss_mlp": 0.27579197, "epoch": 0.1780249511498572, "flos": 23440941970560.0, "grad_norm": 14.328636854777201, "language_loss": 0.94532275, "learning_rate": 3.7745184488999548e-06, "loss": 0.96597779, "num_input_tokens_seen": 63992165, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.38500977, "step": 2961, "time_per_iteration": 2.641568899154663 }, { "auxiliary_loss_clip": 0.01732506, "auxiliary_loss_mlp": 0.00317439, "balance_loss_clip": 1.3925246, "balance_loss_mlp": 0.27848125, "epoch": 0.1780850744025252, "flos": 23367468700800.0, "grad_norm": 9.9922042343844, "language_loss": 0.84823644, "learning_rate": 3.774338767820631e-06, "loss": 0.86873591, "num_input_tokens_seen": 64013470, "router_z_loss_clip": 3.39648438, "router_z_loss_mlp": 0.38989258, "step": 2962, "time_per_iteration": 2.59748911857605 }, { "auxiliary_loss_clip": 0.01743214, "auxiliary_loss_mlp": 0.00329258, "balance_loss_clip": 1.40280378, "balance_loss_mlp": 0.29287505, "epoch": 0.17814519765519315, "flos": 13771994319360.0, "grad_norm": 3.4282332768667527, "language_loss": 0.81049562, "learning_rate": 3.774159019458203e-06, "loss": 0.83122039, "num_input_tokens_seen": 64030975, "router_z_loss_clip": 3.40625, "router_z_loss_mlp": 0.36352539, "step": 2963, "time_per_iteration": 2.6482770442962646 }, { "auxiliary_loss_clip": 0.01749487, "auxiliary_loss_mlp": 0.00338113, "balance_loss_clip": 1.40777266, "balance_loss_mlp": 0.29910779, "epoch": 0.17820532090786112, "flos": 21976396738560.0, "grad_norm": 2.897777777567761, "language_loss": 0.84387374, "learning_rate": 3.7739792038194877e-06, "loss": 0.86474967, "num_input_tokens_seen": 64050075, "router_z_loss_clip": 3.41601562, "router_z_loss_mlp": 0.39013672, "step": 2964, "time_per_iteration": 2.578401565551758 }, { "auxiliary_loss_clip": 0.01729338, "auxiliary_loss_mlp": 0.00321472, "balance_loss_clip": 1.38907647, "balance_loss_mlp": 0.28415996, "epoch": 0.17826544416052909, "flos": 24790752184320.0, "grad_norm": 2.1681078261603846, "language_loss": 0.86494362, "learning_rate": 3.7737993209113027e-06, "loss": 0.88545173, "num_input_tokens_seen": 64071920, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.37329102, "step": 2965, "time_per_iteration": 2.6014368534088135 }, { "auxiliary_loss_clip": 0.01738834, "auxiliary_loss_mlp": 0.00321579, "balance_loss_clip": 1.39769292, "balance_loss_mlp": 0.2841236, "epoch": 0.17832556741319705, "flos": 13879582531200.0, "grad_norm": 2.477764050587281, "language_loss": 1.02825379, "learning_rate": 3.7736193707404698e-06, "loss": 1.04885793, "num_input_tokens_seen": 64086835, "router_z_loss_clip": 3.41210938, "router_z_loss_mlp": 0.37451172, "step": 2966, "time_per_iteration": 2.59429669380188 }, { "auxiliary_loss_clip": 0.01732891, "auxiliary_loss_mlp": 0.00327143, "balance_loss_clip": 1.39397597, "balance_loss_mlp": 0.28723198, "epoch": 0.17838569066586502, "flos": 36641703323520.0, "grad_norm": 12.094449908993102, "language_loss": 0.79137981, "learning_rate": 3.7734393533138127e-06, "loss": 0.81198013, "num_input_tokens_seen": 64107360, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.39916992, "step": 2967, "time_per_iteration": 2.6967883110046387 }, { "auxiliary_loss_clip": 0.01745885, "auxiliary_loss_mlp": 0.00304427, "balance_loss_clip": 1.40467668, "balance_loss_mlp": 0.26873618, "epoch": 0.17844581391853298, "flos": 18727271072640.0, "grad_norm": 5.043658559497623, "language_loss": 0.83195406, "learning_rate": 3.773259268638157e-06, "loss": 0.85245723, "num_input_tokens_seen": 64124690, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.35717773, "step": 2968, "time_per_iteration": 2.6440539360046387 }, { "auxiliary_loss_clip": 0.0173193, "auxiliary_loss_mlp": 0.00319039, "balance_loss_clip": 1.3951993, "balance_loss_mlp": 0.28144023, "epoch": 0.17850593717120097, "flos": 27378259286400.0, "grad_norm": 3.989225412152135, "language_loss": 0.81231725, "learning_rate": 3.7730791167203333e-06, "loss": 0.83282691, "num_input_tokens_seen": 64146315, "router_z_loss_clip": 3.3671875, "router_z_loss_mlp": 0.37597656, "step": 2969, "time_per_iteration": 2.6635966300964355 }, { "auxiliary_loss_clip": 0.01582409, "auxiliary_loss_mlp": 0.00183316, "balance_loss_clip": 1.34134841, "balance_loss_mlp": 0.17091796, "epoch": 0.17856606042386894, "flos": 66996025084800.0, "grad_norm": 0.8162667103462051, "language_loss": 0.69044936, "learning_rate": 3.772898897567171e-06, "loss": 0.70810652, "num_input_tokens_seen": 64210875, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.12402344, "step": 2970, "time_per_iteration": 3.1826894283294678 }, { "auxiliary_loss_clip": 0.01709007, "auxiliary_loss_mlp": 0.00326818, "balance_loss_clip": 1.37270594, "balance_loss_mlp": 0.29405957, "epoch": 0.1786261836765369, "flos": 36977001805440.0, "grad_norm": 12.770457510303977, "language_loss": 0.75235891, "learning_rate": 3.772718611185505e-06, "loss": 0.77271712, "num_input_tokens_seen": 64230740, "router_z_loss_clip": 3.36132812, "router_z_loss_mlp": 0.32788086, "step": 2971, "time_per_iteration": 2.7497899532318115 }, { "auxiliary_loss_clip": 0.01714477, "auxiliary_loss_mlp": 0.00319479, "balance_loss_clip": 1.37695849, "balance_loss_mlp": 0.28452706, "epoch": 0.17868630692920487, "flos": 24825441744000.0, "grad_norm": 13.889424513544334, "language_loss": 0.94690609, "learning_rate": 3.7725382575821717e-06, "loss": 0.9672457, "num_input_tokens_seen": 64252300, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.34912109, "step": 2972, "time_per_iteration": 2.690767765045166 }, { "auxiliary_loss_clip": 0.01722765, "auxiliary_loss_mlp": 0.00328235, "balance_loss_clip": 1.38340521, "balance_loss_mlp": 0.29087442, "epoch": 0.17874643018187283, "flos": 16981977139200.0, "grad_norm": 8.055810466751607, "language_loss": 0.95602608, "learning_rate": 3.77235783676401e-06, "loss": 0.9765361, "num_input_tokens_seen": 64270105, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.3737793, "step": 2973, "time_per_iteration": 4.099715232849121 }, { "auxiliary_loss_clip": 0.01712391, "auxiliary_loss_mlp": 0.00335766, "balance_loss_clip": 1.37017941, "balance_loss_mlp": 0.29854932, "epoch": 0.1788065534345408, "flos": 21032233793280.0, "grad_norm": 22.90800371097213, "language_loss": 0.83586377, "learning_rate": 3.7721773487378615e-06, "loss": 0.8563453, "num_input_tokens_seen": 64287250, "router_z_loss_clip": 3.41992188, "router_z_loss_mlp": 0.37255859, "step": 2974, "time_per_iteration": 2.623253345489502 }, { "auxiliary_loss_clip": 0.01705575, "auxiliary_loss_mlp": 0.00322176, "balance_loss_clip": 1.37818599, "balance_loss_mlp": 0.28665155, "epoch": 0.17886667668720876, "flos": 23987717775360.0, "grad_norm": 213.7282657397749, "language_loss": 0.83928984, "learning_rate": 3.7719967935105705e-06, "loss": 0.8595674, "num_input_tokens_seen": 64307140, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.35546875, "step": 2975, "time_per_iteration": 2.6972732543945312 }, { "auxiliary_loss_clip": 0.0173197, "auxiliary_loss_mlp": 0.00347597, "balance_loss_clip": 1.40147626, "balance_loss_mlp": 0.31200093, "epoch": 0.17892679993987676, "flos": 25739476156800.0, "grad_norm": 12.268018027700547, "language_loss": 0.77136731, "learning_rate": 3.7718161710889833e-06, "loss": 0.79216301, "num_input_tokens_seen": 64328760, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.35595703, "step": 2976, "time_per_iteration": 4.171847343444824 }, { "auxiliary_loss_clip": 0.01731515, "auxiliary_loss_mlp": 0.00303757, "balance_loss_clip": 1.39559615, "balance_loss_mlp": 0.26801774, "epoch": 0.17898692319254472, "flos": 25699686865920.0, "grad_norm": 53.69721796196957, "language_loss": 0.82290632, "learning_rate": 3.7716354814799495e-06, "loss": 0.84325904, "num_input_tokens_seen": 64348800, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.35717773, "step": 2977, "time_per_iteration": 2.6970789432525635 }, { "auxiliary_loss_clip": 0.01725318, "auxiliary_loss_mlp": 0.00305957, "balance_loss_clip": 1.39877892, "balance_loss_mlp": 0.27076614, "epoch": 0.1790470464452127, "flos": 19317786664320.0, "grad_norm": 14.271263707593098, "language_loss": 0.87599075, "learning_rate": 3.7714547246903203e-06, "loss": 0.89630353, "num_input_tokens_seen": 64367955, "router_z_loss_clip": 3.265625, "router_z_loss_mlp": 0.35205078, "step": 2978, "time_per_iteration": 2.718693256378174 }, { "auxiliary_loss_clip": 0.01736702, "auxiliary_loss_mlp": 0.003478, "balance_loss_clip": 1.40664399, "balance_loss_mlp": 0.31361043, "epoch": 0.17910716969788065, "flos": 30044267562240.0, "grad_norm": 3.96724576160593, "language_loss": 0.8139168, "learning_rate": 3.7712739007269508e-06, "loss": 0.83476174, "num_input_tokens_seen": 64389805, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.34204102, "step": 2979, "time_per_iteration": 4.12359881401062 }, { "auxiliary_loss_clip": 0.01723404, "auxiliary_loss_mlp": 0.00336503, "balance_loss_clip": 1.39784646, "balance_loss_mlp": 0.30073985, "epoch": 0.17916729295054862, "flos": 19427709260160.0, "grad_norm": 87.9157679879234, "language_loss": 0.74933898, "learning_rate": 3.7710930095966976e-06, "loss": 0.76993799, "num_input_tokens_seen": 64408220, "router_z_loss_clip": 3.2578125, "router_z_loss_mlp": 0.35766602, "step": 2980, "time_per_iteration": 2.595059394836426 }, { "auxiliary_loss_clip": 0.01700824, "auxiliary_loss_mlp": 0.00354776, "balance_loss_clip": 1.37049806, "balance_loss_mlp": 0.31681943, "epoch": 0.17922741620321658, "flos": 14611549881600.0, "grad_norm": 3.95538549600532, "language_loss": 0.77905303, "learning_rate": 3.7709120513064196e-06, "loss": 0.79960901, "num_input_tokens_seen": 64426380, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.37963867, "step": 2981, "time_per_iteration": 2.579810857772827 }, { "auxiliary_loss_clip": 0.01707679, "auxiliary_loss_mlp": 0.00329053, "balance_loss_clip": 1.37563455, "balance_loss_mlp": 0.29224092, "epoch": 0.17928753945588458, "flos": 17165301177600.0, "grad_norm": 50.23924997308542, "language_loss": 0.90267283, "learning_rate": 3.7707310258629796e-06, "loss": 0.92304015, "num_input_tokens_seen": 64444355, "router_z_loss_clip": 3.31640625, "router_z_loss_mlp": 0.36791992, "step": 2982, "time_per_iteration": 2.562204122543335 }, { "auxiliary_loss_clip": 0.01703467, "auxiliary_loss_mlp": 0.00308121, "balance_loss_clip": 1.37842298, "balance_loss_mlp": 0.27550513, "epoch": 0.17934766270855254, "flos": 31395622060800.0, "grad_norm": 19.187454847675095, "language_loss": 0.88739014, "learning_rate": 3.7705499332732413e-06, "loss": 0.90750599, "num_input_tokens_seen": 64467800, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.32592773, "step": 2983, "time_per_iteration": 2.6947436332702637 }, { "auxiliary_loss_clip": 0.01713181, "auxiliary_loss_mlp": 0.00342694, "balance_loss_clip": 1.37918186, "balance_loss_mlp": 0.30571482, "epoch": 0.1794077859612205, "flos": 20814184281600.0, "grad_norm": 7.513000495462913, "language_loss": 0.95339924, "learning_rate": 3.7703687735440718e-06, "loss": 0.97395802, "num_input_tokens_seen": 64487230, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.36987305, "step": 2984, "time_per_iteration": 2.593282699584961 }, { "auxiliary_loss_clip": 0.01697895, "auxiliary_loss_mlp": 0.00354002, "balance_loss_clip": 1.36893857, "balance_loss_mlp": 0.31773835, "epoch": 0.17946790921388847, "flos": 28986447006720.0, "grad_norm": 328.4096078659858, "language_loss": 0.9513039, "learning_rate": 3.7701875466823416e-06, "loss": 0.97182292, "num_input_tokens_seen": 64509165, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.36303711, "step": 2985, "time_per_iteration": 2.6815977096557617 }, { "auxiliary_loss_clip": 0.01696503, "auxiliary_loss_mlp": 0.00300552, "balance_loss_clip": 1.37294292, "balance_loss_mlp": 0.26769772, "epoch": 0.17952803246655644, "flos": 20737406960640.0, "grad_norm": 12.560622880019952, "language_loss": 0.77010977, "learning_rate": 3.770006252694922e-06, "loss": 0.79008037, "num_input_tokens_seen": 64527940, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.32861328, "step": 2986, "time_per_iteration": 2.567049980163574 }, { "auxiliary_loss_clip": 0.01696403, "auxiliary_loss_mlp": 0.00321083, "balance_loss_clip": 1.36649585, "balance_loss_mlp": 0.28775212, "epoch": 0.1795881557192244, "flos": 28255988027520.0, "grad_norm": 70.71938500588794, "language_loss": 0.8672213, "learning_rate": 3.769824891588688e-06, "loss": 0.88739622, "num_input_tokens_seen": 64545230, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.33325195, "step": 2987, "time_per_iteration": 2.7324111461639404 }, { "auxiliary_loss_clip": 0.01712289, "auxiliary_loss_mlp": 0.00346523, "balance_loss_clip": 1.37701678, "balance_loss_mlp": 0.30873358, "epoch": 0.17964827897189237, "flos": 18552027594240.0, "grad_norm": 8.425555809036089, "language_loss": 0.84660536, "learning_rate": 3.7696434633705164e-06, "loss": 0.86719346, "num_input_tokens_seen": 64563820, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.37768555, "step": 2988, "time_per_iteration": 2.556929588317871 }, { "auxiliary_loss_clip": 0.01489245, "auxiliary_loss_mlp": 0.00175781, "balance_loss_clip": 1.2391988, "balance_loss_mlp": 0.16543382, "epoch": 0.17970840222456036, "flos": 58165088711040.0, "grad_norm": 0.7378195025042584, "language_loss": 0.624349, "learning_rate": 3.7694619680472875e-06, "loss": 0.64099932, "num_input_tokens_seen": 64621315, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.10351562, "step": 2989, "time_per_iteration": 3.0365383625030518 }, { "auxiliary_loss_clip": 0.0167958, "auxiliary_loss_mlp": 0.00320419, "balance_loss_clip": 1.35192096, "balance_loss_mlp": 0.28721875, "epoch": 0.17976852547722832, "flos": 20300805146880.0, "grad_norm": 11.45524792244918, "language_loss": 0.78051841, "learning_rate": 3.7692804056258837e-06, "loss": 0.80051845, "num_input_tokens_seen": 64639885, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.33215332, "step": 2990, "time_per_iteration": 2.5805442333221436 }, { "auxiliary_loss_clip": 0.01690764, "auxiliary_loss_mlp": 0.00364355, "balance_loss_clip": 1.35487235, "balance_loss_mlp": 0.32880697, "epoch": 0.1798286487298963, "flos": 39669367685760.0, "grad_norm": 224.03424478886822, "language_loss": 0.75368762, "learning_rate": 3.7690987761131893e-06, "loss": 0.77423882, "num_input_tokens_seen": 64661220, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.35571289, "step": 2991, "time_per_iteration": 2.7790701389312744 }, { "auxiliary_loss_clip": 0.01685172, "auxiliary_loss_mlp": 0.00332463, "balance_loss_clip": 1.35609412, "balance_loss_mlp": 0.29569882, "epoch": 0.17988877198256426, "flos": 25520313323520.0, "grad_norm": 78.71793802308387, "language_loss": 0.87999076, "learning_rate": 3.7689170795160924e-06, "loss": 0.90016711, "num_input_tokens_seen": 64682530, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.36767578, "step": 2992, "time_per_iteration": 2.7004621028900146 }, { "auxiliary_loss_clip": 0.01679679, "auxiliary_loss_mlp": 0.00299524, "balance_loss_clip": 1.35797906, "balance_loss_mlp": 0.26507223, "epoch": 0.17994889523523222, "flos": 18807496099200.0, "grad_norm": 8.702798553513622, "language_loss": 0.88998121, "learning_rate": 3.7687353158414822e-06, "loss": 0.90977323, "num_input_tokens_seen": 64701025, "router_z_loss_clip": 3.21679688, "router_z_loss_mlp": 0.34411621, "step": 2993, "time_per_iteration": 2.6051950454711914 }, { "auxiliary_loss_clip": 0.01655502, "auxiliary_loss_mlp": 0.00345757, "balance_loss_clip": 1.32746911, "balance_loss_mlp": 0.30930272, "epoch": 0.18000901848790019, "flos": 21104450087040.0, "grad_norm": 5.913629477648111, "language_loss": 0.84598565, "learning_rate": 3.7685534850962517e-06, "loss": 0.86599827, "num_input_tokens_seen": 64719570, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.36450195, "step": 2994, "time_per_iteration": 2.609543561935425 }, { "auxiliary_loss_clip": 0.01662038, "auxiliary_loss_mlp": 0.00315141, "balance_loss_clip": 1.33400416, "balance_loss_mlp": 0.28037998, "epoch": 0.18006914174056818, "flos": 19646441130240.0, "grad_norm": 19.75128363653461, "language_loss": 0.89762259, "learning_rate": 3.768371587287296e-06, "loss": 0.9173944, "num_input_tokens_seen": 64738110, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.34765625, "step": 2995, "time_per_iteration": 2.6026792526245117 }, { "auxiliary_loss_clip": 0.01694431, "auxiliary_loss_mlp": 0.00330936, "balance_loss_clip": 1.36711013, "balance_loss_mlp": 0.29627013, "epoch": 0.18012926499323614, "flos": 19499889640320.0, "grad_norm": 2.151278568227547, "language_loss": 0.89566374, "learning_rate": 3.768189622421512e-06, "loss": 0.9159174, "num_input_tokens_seen": 64756345, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.34667969, "step": 2996, "time_per_iteration": 2.5955255031585693 }, { "auxiliary_loss_clip": 0.01673255, "auxiliary_loss_mlp": 0.00304325, "balance_loss_clip": 1.3561064, "balance_loss_mlp": 0.27120847, "epoch": 0.1801893882459041, "flos": 19464553635840.0, "grad_norm": 13.890924310523276, "language_loss": 0.9211309, "learning_rate": 3.7680075905058006e-06, "loss": 0.94090664, "num_input_tokens_seen": 64776375, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.33105469, "step": 2997, "time_per_iteration": 2.6080853939056396 }, { "auxiliary_loss_clip": 0.01685063, "auxiliary_loss_mlp": 0.00328987, "balance_loss_clip": 1.35412836, "balance_loss_mlp": 0.29483312, "epoch": 0.18024951149857207, "flos": 26870590414080.0, "grad_norm": 2.9129340973406768, "language_loss": 0.92107046, "learning_rate": 3.7678254915470643e-06, "loss": 0.94121099, "num_input_tokens_seen": 64796210, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.3416748, "step": 2998, "time_per_iteration": 2.6586430072784424 }, { "auxiliary_loss_clip": 0.01662252, "auxiliary_loss_mlp": 0.00288985, "balance_loss_clip": 1.34279728, "balance_loss_mlp": 0.25708452, "epoch": 0.18030963475124004, "flos": 30226621933440.0, "grad_norm": 44.86701455215213, "language_loss": 0.91554213, "learning_rate": 3.7676433255522084e-06, "loss": 0.93505454, "num_input_tokens_seen": 64818590, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.3190918, "step": 2999, "time_per_iteration": 2.7043211460113525 }, { "auxiliary_loss_clip": 0.01682654, "auxiliary_loss_mlp": 0.00333092, "balance_loss_clip": 1.35433412, "balance_loss_mlp": 0.30047619, "epoch": 0.180369758003908, "flos": 22307493329280.0, "grad_norm": 22.19195405369308, "language_loss": 0.80710542, "learning_rate": 3.76746109252814e-06, "loss": 0.82726288, "num_input_tokens_seen": 64838350, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.32617188, "step": 3000, "time_per_iteration": 2.6410958766937256 }, { "auxiliary_loss_clip": 0.0168598, "auxiliary_loss_mlp": 0.00323721, "balance_loss_clip": 1.3685329, "balance_loss_mlp": 0.28948399, "epoch": 0.18042988125657597, "flos": 23732033788800.0, "grad_norm": 3.6694151905245174, "language_loss": 0.76644146, "learning_rate": 3.76727879248177e-06, "loss": 0.78653854, "num_input_tokens_seen": 64858065, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.34228516, "step": 3001, "time_per_iteration": 2.6299209594726562 }, { "auxiliary_loss_clip": 0.01665649, "auxiliary_loss_mlp": 0.00301173, "balance_loss_clip": 1.34528089, "balance_loss_mlp": 0.26824784, "epoch": 0.18049000450924396, "flos": 24093582134400.0, "grad_norm": 2006.06929397038, "language_loss": 0.94246328, "learning_rate": 3.767096425420011e-06, "loss": 0.9621315, "num_input_tokens_seen": 64877305, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.3293457, "step": 3002, "time_per_iteration": 2.676598310470581 }, { "auxiliary_loss_clip": 0.01698899, "auxiliary_loss_mlp": 0.00302597, "balance_loss_clip": 1.37676084, "balance_loss_mlp": 0.270291, "epoch": 0.18055012776191193, "flos": 22163168482560.0, "grad_norm": 27.942912998788085, "language_loss": 0.89463246, "learning_rate": 3.7669139913497788e-06, "loss": 0.91464746, "num_input_tokens_seen": 64896955, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.32299805, "step": 3003, "time_per_iteration": 2.592668294906616 }, { "auxiliary_loss_clip": 0.01676738, "auxiliary_loss_mlp": 0.0030622, "balance_loss_clip": 1.35590661, "balance_loss_mlp": 0.27238861, "epoch": 0.1806102510145799, "flos": 28913512440960.0, "grad_norm": 11.006163318859617, "language_loss": 0.7503584, "learning_rate": 3.7667314902779907e-06, "loss": 0.77018797, "num_input_tokens_seen": 64917080, "router_z_loss_clip": 3.20507812, "router_z_loss_mlp": 0.33813477, "step": 3004, "time_per_iteration": 2.6520204544067383 }, { "auxiliary_loss_clip": 0.01659004, "auxiliary_loss_mlp": 0.00292867, "balance_loss_clip": 1.33897948, "balance_loss_mlp": 0.25901124, "epoch": 0.18067037426724786, "flos": 19025689265280.0, "grad_norm": 65.28706522446724, "language_loss": 0.90715241, "learning_rate": 3.7665489222115677e-06, "loss": 0.92667115, "num_input_tokens_seen": 64935215, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.33837891, "step": 3005, "time_per_iteration": 2.5417354106903076 }, { "auxiliary_loss_clip": 0.01675851, "auxiliary_loss_mlp": 0.00258744, "balance_loss_clip": 1.36522603, "balance_loss_mlp": 0.22553219, "epoch": 0.18073049751991582, "flos": 27453635976960.0, "grad_norm": 2.1086146418220304, "language_loss": 0.90258849, "learning_rate": 3.766366287157432e-06, "loss": 0.92193437, "num_input_tokens_seen": 64956275, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.33227539, "step": 3006, "time_per_iteration": 2.6629040241241455 }, { "auxiliary_loss_clip": 0.01676902, "auxiliary_loss_mlp": 0.00270779, "balance_loss_clip": 1.36334813, "balance_loss_mlp": 0.23978503, "epoch": 0.1807906207725838, "flos": 28729039167360.0, "grad_norm": 28.04779476156249, "language_loss": 0.82791805, "learning_rate": 3.7661835851225103e-06, "loss": 0.84739488, "num_input_tokens_seen": 64979390, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.31005859, "step": 3007, "time_per_iteration": 2.6661691665649414 }, { "auxiliary_loss_clip": 0.01457761, "auxiliary_loss_mlp": 0.00079587, "balance_loss_clip": 1.24021363, "balance_loss_mlp": 0.06623592, "epoch": 0.18085074402525175, "flos": 64466515468800.0, "grad_norm": 0.8395286573026242, "language_loss": 0.56903189, "learning_rate": 3.7660008161137294e-06, "loss": 0.58440542, "num_input_tokens_seen": 65043135, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.13378906, "step": 3008, "time_per_iteration": 3.3074886798858643 }, { "auxiliary_loss_clip": 0.01651575, "auxiliary_loss_mlp": 0.00283196, "balance_loss_clip": 1.34233356, "balance_loss_mlp": 0.24945962, "epoch": 0.18091086727791975, "flos": 23476960333440.0, "grad_norm": 4.526457623270812, "language_loss": 0.7566669, "learning_rate": 3.765817980138021e-06, "loss": 0.77601463, "num_input_tokens_seen": 65062845, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.33740234, "step": 3009, "time_per_iteration": 2.597426176071167 }, { "auxiliary_loss_clip": 0.016532, "auxiliary_loss_mlp": 0.00261022, "balance_loss_clip": 1.34916711, "balance_loss_mlp": 0.22962236, "epoch": 0.1809709905305877, "flos": 24170467196160.0, "grad_norm": 2.390700977920598, "language_loss": 0.83269376, "learning_rate": 3.7656350772023177e-06, "loss": 0.85183597, "num_input_tokens_seen": 65082110, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.31420898, "step": 3010, "time_per_iteration": 2.7888777256011963 }, { "auxiliary_loss_clip": 0.01640787, "auxiliary_loss_mlp": 0.00233289, "balance_loss_clip": 1.34223425, "balance_loss_mlp": 0.20012541, "epoch": 0.18103111378325568, "flos": 21650902669440.0, "grad_norm": 6.964429054075578, "language_loss": 0.72664809, "learning_rate": 3.7654521073135553e-06, "loss": 0.74538887, "num_input_tokens_seen": 65101985, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.33154297, "step": 3011, "time_per_iteration": 2.6180331707000732 }, { "auxiliary_loss_clip": 0.01646313, "auxiliary_loss_mlp": 0.00271565, "balance_loss_clip": 1.34096861, "balance_loss_mlp": 0.23980737, "epoch": 0.18109123703592364, "flos": 53686918356480.0, "grad_norm": 3.1153833235035484, "language_loss": 0.75622672, "learning_rate": 3.7652690704786723e-06, "loss": 0.77540553, "num_input_tokens_seen": 65129295, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.31738281, "step": 3012, "time_per_iteration": 2.8878352642059326 }, { "auxiliary_loss_clip": 0.01628623, "auxiliary_loss_mlp": 0.00273187, "balance_loss_clip": 1.32895041, "balance_loss_mlp": 0.23883133, "epoch": 0.1811513602885916, "flos": 35845564325760.0, "grad_norm": 12.810328237979878, "language_loss": 0.70167935, "learning_rate": 3.765085966704609e-06, "loss": 0.7206974, "num_input_tokens_seen": 65150625, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.34350586, "step": 3013, "time_per_iteration": 2.721540927886963 }, { "auxiliary_loss_clip": 0.01640371, "auxiliary_loss_mlp": 0.00238664, "balance_loss_clip": 1.33793116, "balance_loss_mlp": 0.20881385, "epoch": 0.18121148354125957, "flos": 23732572492800.0, "grad_norm": 14.987145821908094, "language_loss": 0.82502991, "learning_rate": 3.764902795998309e-06, "loss": 0.84382027, "num_input_tokens_seen": 65170880, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.29858398, "step": 3014, "time_per_iteration": 2.599924325942993 }, { "auxiliary_loss_clip": 0.01630344, "auxiliary_loss_mlp": 0.00261756, "balance_loss_clip": 1.32219374, "balance_loss_mlp": 0.22775801, "epoch": 0.18127160679392756, "flos": 28728320895360.0, "grad_norm": 6.522643119014666, "language_loss": 0.74301332, "learning_rate": 3.7647195583667184e-06, "loss": 0.76193428, "num_input_tokens_seen": 65192530, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.33959961, "step": 3015, "time_per_iteration": 4.063644647598267 }, { "auxiliary_loss_clip": 0.01641756, "auxiliary_loss_mlp": 0.00234972, "balance_loss_clip": 1.34390068, "balance_loss_mlp": 0.19978154, "epoch": 0.18133173004659553, "flos": 20485062938880.0, "grad_norm": 31.766011942606454, "language_loss": 0.84133601, "learning_rate": 3.764536253816785e-06, "loss": 0.86010331, "num_input_tokens_seen": 65211675, "router_z_loss_clip": 2.9765625, "router_z_loss_mlp": 0.3515625, "step": 3016, "time_per_iteration": 2.6370136737823486 }, { "auxiliary_loss_clip": 0.0164514, "auxiliary_loss_mlp": 0.00251869, "balance_loss_clip": 1.34361911, "balance_loss_mlp": 0.21915779, "epoch": 0.1813918532992635, "flos": 22852078404480.0, "grad_norm": 5.223894471365019, "language_loss": 0.89790022, "learning_rate": 3.7643528823554602e-06, "loss": 0.91687024, "num_input_tokens_seen": 65231185, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.32714844, "step": 3017, "time_per_iteration": 2.6008665561676025 }, { "auxiliary_loss_clip": 0.01629628, "auxiliary_loss_mlp": 0.00213637, "balance_loss_clip": 1.33357525, "balance_loss_mlp": 0.1813077, "epoch": 0.18145197655193146, "flos": 36065122208640.0, "grad_norm": 23.724683614973813, "language_loss": 0.7380234, "learning_rate": 3.764169443989697e-06, "loss": 0.75645614, "num_input_tokens_seen": 65251645, "router_z_loss_clip": 2.95703125, "router_z_loss_mlp": 0.32324219, "step": 3018, "time_per_iteration": 4.182072162628174 }, { "auxiliary_loss_clip": 0.01611859, "auxiliary_loss_mlp": 0.00219539, "balance_loss_clip": 1.31341958, "balance_loss_mlp": 0.18637478, "epoch": 0.18151209980459942, "flos": 24023951619840.0, "grad_norm": 2.470574539886916, "language_loss": 0.84734225, "learning_rate": 3.7639859387264518e-06, "loss": 0.86565626, "num_input_tokens_seen": 65271125, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.33154297, "step": 3019, "time_per_iteration": 2.601876974105835 }, { "auxiliary_loss_clip": 0.01592414, "auxiliary_loss_mlp": 0.00240348, "balance_loss_clip": 1.29247987, "balance_loss_mlp": 0.20627841, "epoch": 0.1815722230572674, "flos": 23951627585280.0, "grad_norm": 10.297823279830373, "language_loss": 0.89034975, "learning_rate": 3.7638023665726834e-06, "loss": 0.90867734, "num_input_tokens_seen": 65290600, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.34082031, "step": 3020, "time_per_iteration": 2.6422231197357178 }, { "auxiliary_loss_clip": 0.01599681, "auxiliary_loss_mlp": 0.0022762, "balance_loss_clip": 1.29366398, "balance_loss_mlp": 0.19445556, "epoch": 0.18163234630993536, "flos": 24386469632640.0, "grad_norm": 2.5844281565568217, "language_loss": 0.86744654, "learning_rate": 3.763618727535352e-06, "loss": 0.88571954, "num_input_tokens_seen": 65311040, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.33154297, "step": 3021, "time_per_iteration": 2.6211998462677 }, { "auxiliary_loss_clip": 0.01566757, "auxiliary_loss_mlp": 0.00192223, "balance_loss_clip": 1.26606131, "balance_loss_mlp": 0.15872476, "epoch": 0.18169246956260335, "flos": 24681332378880.0, "grad_norm": 8.45266447611984, "language_loss": 0.90282154, "learning_rate": 3.763435021621422e-06, "loss": 0.92041135, "num_input_tokens_seen": 65332115, "router_z_loss_clip": 3.00585938, "router_z_loss_mlp": 0.33496094, "step": 3022, "time_per_iteration": 4.022431135177612 }, { "auxiliary_loss_clip": 0.01572966, "auxiliary_loss_mlp": 0.00203187, "balance_loss_clip": 1.26314616, "balance_loss_mlp": 0.16930732, "epoch": 0.1817525928152713, "flos": 24243294021120.0, "grad_norm": 2.4853845226181566, "language_loss": 0.78706867, "learning_rate": 3.763251248837859e-06, "loss": 0.80483019, "num_input_tokens_seen": 65352210, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.33862305, "step": 3023, "time_per_iteration": 2.695844888687134 }, { "auxiliary_loss_clip": 0.01566108, "auxiliary_loss_mlp": 0.00214371, "balance_loss_clip": 1.25170684, "balance_loss_mlp": 0.17906076, "epoch": 0.18181271606793928, "flos": 16472081623680.0, "grad_norm": 12.689207528060845, "language_loss": 0.80776989, "learning_rate": 3.7630674091916317e-06, "loss": 0.8255747, "num_input_tokens_seen": 65370600, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.35302734, "step": 3024, "time_per_iteration": 2.6130528450012207 }, { "auxiliary_loss_clip": 0.01569997, "auxiliary_loss_mlp": 0.00208506, "balance_loss_clip": 1.2528584, "balance_loss_mlp": 0.17491266, "epoch": 0.18187283932060724, "flos": 18581042805120.0, "grad_norm": 8.947148205378314, "language_loss": 0.97816902, "learning_rate": 3.7628835026897123e-06, "loss": 0.9959541, "num_input_tokens_seen": 65387270, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.3359375, "step": 3025, "time_per_iteration": 2.6164488792419434 }, { "auxiliary_loss_clip": 0.01550449, "auxiliary_loss_mlp": 0.00215074, "balance_loss_clip": 1.23061681, "balance_loss_mlp": 0.18155271, "epoch": 0.1819329625732752, "flos": 20266833859200.0, "grad_norm": 5.084657501051491, "language_loss": 0.85703528, "learning_rate": 3.7626995293390735e-06, "loss": 0.87469053, "num_input_tokens_seen": 65406550, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.33520508, "step": 3026, "time_per_iteration": 2.5609233379364014 }, { "auxiliary_loss_clip": 0.01555259, "auxiliary_loss_mlp": 0.00214967, "balance_loss_clip": 1.22758722, "balance_loss_mlp": 0.18299495, "epoch": 0.18199308582594317, "flos": 25915186512000.0, "grad_norm": 6.087223560368189, "language_loss": 0.82150435, "learning_rate": 3.762515489146692e-06, "loss": 0.83920658, "num_input_tokens_seen": 65425955, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.31982422, "step": 3027, "time_per_iteration": 2.6569607257843018 }, { "auxiliary_loss_clip": 0.01537206, "auxiliary_loss_mlp": 0.0021924, "balance_loss_clip": 1.20712817, "balance_loss_mlp": 0.18640974, "epoch": 0.18205320907861114, "flos": 15377524433280.0, "grad_norm": 4.148316799096969, "language_loss": 0.9392786, "learning_rate": 3.762331382119546e-06, "loss": 0.95684302, "num_input_tokens_seen": 65442820, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.328125, "step": 3028, "time_per_iteration": 2.540600061416626 }, { "auxiliary_loss_clip": 0.01543849, "auxiliary_loss_mlp": 0.00231834, "balance_loss_clip": 1.20713091, "balance_loss_mlp": 0.20186469, "epoch": 0.18211333233127913, "flos": 25624310175360.0, "grad_norm": 22.35304574870908, "language_loss": 0.90079319, "learning_rate": 3.7621472082646183e-06, "loss": 0.91855001, "num_input_tokens_seen": 65461825, "router_z_loss_clip": 3.3671875, "router_z_loss_mlp": 0.29980469, "step": 3029, "time_per_iteration": 2.6343696117401123 }, { "auxiliary_loss_clip": 0.01549099, "auxiliary_loss_mlp": 0.00207912, "balance_loss_clip": 1.21343362, "balance_loss_mlp": 0.17298374, "epoch": 0.1821734555839471, "flos": 14976007228800.0, "grad_norm": 47.87984202987346, "language_loss": 0.89879811, "learning_rate": 3.761962967588891e-06, "loss": 0.91636825, "num_input_tokens_seen": 65479480, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.34960938, "step": 3030, "time_per_iteration": 2.557075262069702 }, { "auxiliary_loss_clip": 0.01530729, "auxiliary_loss_mlp": 0.00200664, "balance_loss_clip": 1.1906873, "balance_loss_mlp": 0.16685635, "epoch": 0.18223357883661506, "flos": 20194007034240.0, "grad_norm": 20.100231624213162, "language_loss": 0.93519866, "learning_rate": 3.761778660099352e-06, "loss": 0.95251262, "num_input_tokens_seen": 65497775, "router_z_loss_clip": 3.40039062, "router_z_loss_mlp": 0.33837891, "step": 3031, "time_per_iteration": 2.6064417362213135 }, { "auxiliary_loss_clip": 0.01536648, "auxiliary_loss_mlp": 0.00196139, "balance_loss_clip": 1.19581473, "balance_loss_mlp": 0.1639998, "epoch": 0.18229370208928303, "flos": 15231978524160.0, "grad_norm": 3.4775715214089935, "language_loss": 0.8847568, "learning_rate": 3.76159428580299e-06, "loss": 0.90208465, "num_input_tokens_seen": 65516505, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.3215332, "step": 3032, "time_per_iteration": 2.6153273582458496 }, { "auxiliary_loss_clip": 0.01524865, "auxiliary_loss_mlp": 0.00256709, "balance_loss_clip": 1.173051, "balance_loss_mlp": 0.22514214, "epoch": 0.182353825341951, "flos": 23840483927040.0, "grad_norm": 10.635545684782272, "language_loss": 0.91162407, "learning_rate": 3.761409844706795e-06, "loss": 0.92943978, "num_input_tokens_seen": 65536160, "router_z_loss_clip": 3.51171875, "router_z_loss_mlp": 0.31567383, "step": 3033, "time_per_iteration": 2.6098062992095947 }, { "auxiliary_loss_clip": 0.01452129, "auxiliary_loss_mlp": 0.00111752, "balance_loss_clip": 1.13004816, "balance_loss_mlp": 0.10121351, "epoch": 0.18241394859461896, "flos": 61190957393280.0, "grad_norm": 0.9288711447928368, "language_loss": 0.6333729, "learning_rate": 3.7612253368177625e-06, "loss": 0.64901173, "num_input_tokens_seen": 65589375, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.10546875, "step": 3034, "time_per_iteration": 3.0390024185180664 }, { "auxiliary_loss_clip": 0.01532379, "auxiliary_loss_mlp": 0.00268305, "balance_loss_clip": 1.17863894, "balance_loss_mlp": 0.23599888, "epoch": 0.18247407184728695, "flos": 18471694826880.0, "grad_norm": 8.327328192466585, "language_loss": 0.8887347, "learning_rate": 3.7610407621428893e-06, "loss": 0.9067415, "num_input_tokens_seen": 65606720, "router_z_loss_clip": 3.53710938, "router_z_loss_mlp": 0.32299805, "step": 3035, "time_per_iteration": 2.5623927116394043 }, { "auxiliary_loss_clip": 0.01539332, "auxiliary_loss_mlp": 0.00252808, "balance_loss_clip": 1.18825841, "balance_loss_mlp": 0.22469863, "epoch": 0.18253419509995492, "flos": 21795191602560.0, "grad_norm": 275.4651971801066, "language_loss": 0.90366232, "learning_rate": 3.7608561206891735e-06, "loss": 0.92158365, "num_input_tokens_seen": 65625495, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.28100586, "step": 3036, "time_per_iteration": 2.6810507774353027 }, { "auxiliary_loss_clip": 0.01539592, "auxiliary_loss_mlp": 0.00246177, "balance_loss_clip": 1.18787551, "balance_loss_mlp": 0.21929517, "epoch": 0.18259431835262288, "flos": 20149764456960.0, "grad_norm": 17.017284671938828, "language_loss": 0.85899162, "learning_rate": 3.760671412463617e-06, "loss": 0.87684929, "num_input_tokens_seen": 65643515, "router_z_loss_clip": 3.51367188, "router_z_loss_mlp": 0.26904297, "step": 3037, "time_per_iteration": 2.5536224842071533 }, { "auxiliary_loss_clip": 0.01522676, "auxiliary_loss_mlp": 0.00343535, "balance_loss_clip": 1.16861236, "balance_loss_mlp": 0.31254059, "epoch": 0.18265444160529085, "flos": 16981653916800.0, "grad_norm": 21.853940020894015, "language_loss": 0.91725051, "learning_rate": 3.7604866374732246e-06, "loss": 0.93591261, "num_input_tokens_seen": 65658155, "router_z_loss_clip": 3.54101562, "router_z_loss_mlp": 0.31005859, "step": 3038, "time_per_iteration": 2.6585006713867188 }, { "auxiliary_loss_clip": 0.01533899, "auxiliary_loss_mlp": 0.00283749, "balance_loss_clip": 1.18551874, "balance_loss_mlp": 0.25741538, "epoch": 0.1827145648579588, "flos": 34423250509440.0, "grad_norm": 3.751234473651476, "language_loss": 0.76615125, "learning_rate": 3.7603017957250023e-06, "loss": 0.78432775, "num_input_tokens_seen": 65679310, "router_z_loss_clip": 3.484375, "router_z_loss_mlp": 0.26318359, "step": 3039, "time_per_iteration": 2.6795053482055664 }, { "auxiliary_loss_clip": 0.015226, "auxiliary_loss_mlp": 0.00305527, "balance_loss_clip": 1.17529881, "balance_loss_mlp": 0.27733353, "epoch": 0.18277468811062678, "flos": 53287017264000.0, "grad_norm": 132.1370017285496, "language_loss": 0.81015903, "learning_rate": 3.7601168872259593e-06, "loss": 0.82844025, "num_input_tokens_seen": 65705235, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.28173828, "step": 3040, "time_per_iteration": 2.900322914123535 }, { "auxiliary_loss_clip": 0.01532154, "auxiliary_loss_mlp": 0.00314611, "balance_loss_clip": 1.19415569, "balance_loss_mlp": 0.28788412, "epoch": 0.18283481136329474, "flos": 31650659602560.0, "grad_norm": 58.77434475039216, "language_loss": 0.68659317, "learning_rate": 3.7599319119831075e-06, "loss": 0.70506078, "num_input_tokens_seen": 65727575, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.26696777, "step": 3041, "time_per_iteration": 2.676438808441162 }, { "auxiliary_loss_clip": 0.01535405, "auxiliary_loss_mlp": 0.00307351, "balance_loss_clip": 1.1921407, "balance_loss_mlp": 0.27885988, "epoch": 0.18289493461596273, "flos": 53137664513280.0, "grad_norm": 44.839220510283674, "language_loss": 0.66380012, "learning_rate": 3.7597468700034616e-06, "loss": 0.68222767, "num_input_tokens_seen": 65751370, "router_z_loss_clip": 3.4296875, "router_z_loss_mlp": 0.28527832, "step": 3042, "time_per_iteration": 2.8691580295562744 }, { "auxiliary_loss_clip": 0.01507125, "auxiliary_loss_mlp": 0.00391083, "balance_loss_clip": 1.15712988, "balance_loss_mlp": 0.36426112, "epoch": 0.1829550578686307, "flos": 25589369220480.0, "grad_norm": 22.513259382506913, "language_loss": 0.92922699, "learning_rate": 3.7595617612940374e-06, "loss": 0.94820911, "num_input_tokens_seen": 65771040, "router_z_loss_clip": 3.49804688, "router_z_loss_mlp": 0.26831055, "step": 3043, "time_per_iteration": 2.6782615184783936 }, { "auxiliary_loss_clip": 0.01516849, "auxiliary_loss_mlp": 0.00349954, "balance_loss_clip": 1.17193842, "balance_loss_mlp": 0.32086667, "epoch": 0.18301518112129866, "flos": 22601422321920.0, "grad_norm": 7.130446727406232, "language_loss": 0.79814345, "learning_rate": 3.7593765858618552e-06, "loss": 0.8168115, "num_input_tokens_seen": 65789345, "router_z_loss_clip": 3.45117188, "router_z_loss_mlp": 0.29077148, "step": 3044, "time_per_iteration": 2.595714807510376 }, { "auxiliary_loss_clip": 0.01524968, "auxiliary_loss_mlp": 0.00394175, "balance_loss_clip": 1.18041754, "balance_loss_mlp": 0.3638007, "epoch": 0.18307530437396663, "flos": 34020799551360.0, "grad_norm": 10.12515332767027, "language_loss": 0.70910883, "learning_rate": 3.7591913437139365e-06, "loss": 0.72830027, "num_input_tokens_seen": 65810990, "router_z_loss_clip": 3.4453125, "router_z_loss_mlp": 0.30371094, "step": 3045, "time_per_iteration": 2.7020103931427 }, { "auxiliary_loss_clip": 0.01535054, "auxiliary_loss_mlp": 0.00313165, "balance_loss_clip": 1.19381785, "balance_loss_mlp": 0.28496039, "epoch": 0.1831354276266346, "flos": 21279765392640.0, "grad_norm": 4.093416381513643, "language_loss": 0.88761044, "learning_rate": 3.7590060348573066e-06, "loss": 0.90609264, "num_input_tokens_seen": 65827230, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.28186035, "step": 3046, "time_per_iteration": 2.555715560913086 }, { "auxiliary_loss_clip": 0.01499988, "auxiliary_loss_mlp": 0.00371524, "balance_loss_clip": 1.15956402, "balance_loss_mlp": 0.33874139, "epoch": 0.18319555087930256, "flos": 21032952065280.0, "grad_norm": 3.3708243403215166, "language_loss": 0.877698, "learning_rate": 3.7588206592989903e-06, "loss": 0.89641315, "num_input_tokens_seen": 65845900, "router_z_loss_clip": 3.40429688, "router_z_loss_mlp": 0.32788086, "step": 3047, "time_per_iteration": 2.6175742149353027 }, { "auxiliary_loss_clip": 0.01524021, "auxiliary_loss_mlp": 0.00361434, "balance_loss_clip": 1.1801064, "balance_loss_mlp": 0.33370563, "epoch": 0.18325567413197055, "flos": 34382958428160.0, "grad_norm": 12.606682037121889, "language_loss": 0.84488767, "learning_rate": 3.7586352170460194e-06, "loss": 0.86374223, "num_input_tokens_seen": 65868730, "router_z_loss_clip": 3.44140625, "router_z_loss_mlp": 0.27722168, "step": 3048, "time_per_iteration": 2.72282338142395 }, { "auxiliary_loss_clip": 0.01511098, "auxiliary_loss_mlp": 0.00382315, "balance_loss_clip": 1.17021632, "balance_loss_mlp": 0.35394287, "epoch": 0.18331579738463852, "flos": 20558464381440.0, "grad_norm": 3.3494763780074455, "language_loss": 0.94696087, "learning_rate": 3.758449708105424e-06, "loss": 0.965895, "num_input_tokens_seen": 65888420, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.28344727, "step": 3049, "time_per_iteration": 2.6326675415039062 }, { "auxiliary_loss_clip": 0.01518786, "auxiliary_loss_mlp": 0.00336595, "balance_loss_clip": 1.17543769, "balance_loss_mlp": 0.30669704, "epoch": 0.18337592063730648, "flos": 19607872901760.0, "grad_norm": 13.068060450919587, "language_loss": 0.90371454, "learning_rate": 3.75826413248424e-06, "loss": 0.92226833, "num_input_tokens_seen": 65905840, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.2989502, "step": 3050, "time_per_iteration": 2.548203945159912 }, { "auxiliary_loss_clip": 0.01508053, "auxiliary_loss_mlp": 0.00420147, "balance_loss_clip": 1.16921198, "balance_loss_mlp": 0.38860396, "epoch": 0.18343604388997445, "flos": 20850885002880.0, "grad_norm": 20.892119590154767, "language_loss": 1.0634172, "learning_rate": 3.7580784901895035e-06, "loss": 1.08269906, "num_input_tokens_seen": 65922845, "router_z_loss_clip": 3.39257812, "router_z_loss_mlp": 0.31567383, "step": 3051, "time_per_iteration": 2.616490602493286 }, { "auxiliary_loss_clip": 0.0150332, "auxiliary_loss_mlp": 0.00334232, "balance_loss_clip": 1.16466808, "balance_loss_mlp": 0.30650395, "epoch": 0.1834961671426424, "flos": 24394370624640.0, "grad_norm": 4.514240941947403, "language_loss": 0.92719942, "learning_rate": 3.7578927812282542e-06, "loss": 0.94557488, "num_input_tokens_seen": 65945555, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.27734375, "step": 3052, "time_per_iteration": 2.655106544494629 }, { "auxiliary_loss_clip": 0.01507711, "auxiliary_loss_mlp": 0.00361111, "balance_loss_clip": 1.17524457, "balance_loss_mlp": 0.33248913, "epoch": 0.18355629039531038, "flos": 21251612108160.0, "grad_norm": 2.154978564720787, "language_loss": 0.80751288, "learning_rate": 3.7577070056075356e-06, "loss": 0.82620108, "num_input_tokens_seen": 65963965, "router_z_loss_clip": 3.32421875, "router_z_loss_mlp": 0.28662109, "step": 3053, "time_per_iteration": 2.6018311977386475 }, { "auxiliary_loss_clip": 0.01512497, "auxiliary_loss_mlp": 0.00438294, "balance_loss_clip": 1.1777308, "balance_loss_mlp": 0.40584546, "epoch": 0.18361641364797834, "flos": 28656499651200.0, "grad_norm": 7.501178405587123, "language_loss": 0.70897812, "learning_rate": 3.7575211633343902e-06, "loss": 0.728486, "num_input_tokens_seen": 65985965, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.32446289, "step": 3054, "time_per_iteration": 2.649409532546997 }, { "auxiliary_loss_clip": 0.01511643, "auxiliary_loss_mlp": 0.00396537, "balance_loss_clip": 1.18099594, "balance_loss_mlp": 0.36690125, "epoch": 0.18367653690064634, "flos": 20918827578240.0, "grad_norm": 2.4464338155383376, "language_loss": 0.85562807, "learning_rate": 3.7573352544158663e-06, "loss": 0.87470996, "num_input_tokens_seen": 66005645, "router_z_loss_clip": 3.30664062, "router_z_loss_mlp": 0.29614258, "step": 3055, "time_per_iteration": 2.657930850982666 }, { "auxiliary_loss_clip": 0.01507194, "auxiliary_loss_mlp": 0.00347019, "balance_loss_clip": 1.17737567, "balance_loss_mlp": 0.31924361, "epoch": 0.1837366601533143, "flos": 28765596234240.0, "grad_norm": 15.959901365618911, "language_loss": 0.76733142, "learning_rate": 3.757149278859014e-06, "loss": 0.78587353, "num_input_tokens_seen": 66025675, "router_z_loss_clip": 3.29296875, "router_z_loss_mlp": 0.2779541, "step": 3056, "time_per_iteration": 2.6315054893493652 }, { "auxiliary_loss_clip": 0.0149491, "auxiliary_loss_mlp": 0.00350059, "balance_loss_clip": 1.17263365, "balance_loss_mlp": 0.32339215, "epoch": 0.18379678340598227, "flos": 21251432540160.0, "grad_norm": 5.2901480736551445, "language_loss": 0.85555905, "learning_rate": 3.7569632366708842e-06, "loss": 0.87400877, "num_input_tokens_seen": 66046125, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.26647949, "step": 3057, "time_per_iteration": 4.03260064125061 }, { "auxiliary_loss_clip": 0.01507422, "auxiliary_loss_mlp": 0.00329852, "balance_loss_clip": 1.18009496, "balance_loss_mlp": 0.30114681, "epoch": 0.18385690665865023, "flos": 20449619193600.0, "grad_norm": 3.9438564556305855, "language_loss": 0.92049032, "learning_rate": 3.756777127858533e-06, "loss": 0.93886304, "num_input_tokens_seen": 66064375, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.28686523, "step": 3058, "time_per_iteration": 2.6942460536956787 }, { "auxiliary_loss_clip": 0.0150065, "auxiliary_loss_mlp": 0.00361643, "balance_loss_clip": 1.17639399, "balance_loss_mlp": 0.33272249, "epoch": 0.1839170299113182, "flos": 26140562398080.0, "grad_norm": 17.464637340676852, "language_loss": 0.92989159, "learning_rate": 3.756590952429017e-06, "loss": 0.94851446, "num_input_tokens_seen": 66084590, "router_z_loss_clip": 3.2421875, "router_z_loss_mlp": 0.28930664, "step": 3059, "time_per_iteration": 2.6272411346435547 }, { "auxiliary_loss_clip": 0.01508019, "auxiliary_loss_mlp": 0.00281497, "balance_loss_clip": 1.1859473, "balance_loss_mlp": 0.25206465, "epoch": 0.18397715316398616, "flos": 31758032332800.0, "grad_norm": 120.29838418827718, "language_loss": 0.78195632, "learning_rate": 3.756404710389396e-06, "loss": 0.79985142, "num_input_tokens_seen": 66107105, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.29443359, "step": 3060, "time_per_iteration": 4.124142169952393 }, { "auxiliary_loss_clip": 0.01515102, "auxiliary_loss_mlp": 0.00327383, "balance_loss_clip": 1.1929487, "balance_loss_mlp": 0.29693681, "epoch": 0.18403727641665413, "flos": 24611989173120.0, "grad_norm": 37.123678917385796, "language_loss": 0.78681952, "learning_rate": 3.7562184017467323e-06, "loss": 0.80524433, "num_input_tokens_seen": 66129295, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.30444336, "step": 3061, "time_per_iteration": 2.645393133163452 }, { "auxiliary_loss_clip": 0.01531172, "auxiliary_loss_mlp": 0.00281633, "balance_loss_clip": 1.21432424, "balance_loss_mlp": 0.25206938, "epoch": 0.18409739966932212, "flos": 23439900476160.0, "grad_norm": 14.703830509584694, "language_loss": 0.87885475, "learning_rate": 3.7560320265080906e-06, "loss": 0.89698279, "num_input_tokens_seen": 66146910, "router_z_loss_clip": 3.16992188, "router_z_loss_mlp": 0.2956543, "step": 3062, "time_per_iteration": 2.6376729011535645 }, { "auxiliary_loss_clip": 0.0153072, "auxiliary_loss_mlp": 0.003261, "balance_loss_clip": 1.20866191, "balance_loss_mlp": 0.29831272, "epoch": 0.18415752292199009, "flos": 21872112577920.0, "grad_norm": 9.588343245182154, "language_loss": 0.83274961, "learning_rate": 3.7558455846805383e-06, "loss": 0.85131776, "num_input_tokens_seen": 66165370, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.27807617, "step": 3063, "time_per_iteration": 2.6708319187164307 }, { "auxiliary_loss_clip": 0.01524029, "auxiliary_loss_mlp": 0.00253417, "balance_loss_clip": 1.20539212, "balance_loss_mlp": 0.22456875, "epoch": 0.18421764617465805, "flos": 25410678036480.0, "grad_norm": 2.6304513275061354, "language_loss": 0.73307568, "learning_rate": 3.7556590762711463e-06, "loss": 0.75085014, "num_input_tokens_seen": 66186210, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.28857422, "step": 3064, "time_per_iteration": 4.002074241638184 }, { "auxiliary_loss_clip": 0.01530815, "auxiliary_loss_mlp": 0.00286609, "balance_loss_clip": 1.21132636, "balance_loss_mlp": 0.25666338, "epoch": 0.18427776942732602, "flos": 27198131558400.0, "grad_norm": 2.0957779810111776, "language_loss": 0.77010643, "learning_rate": 3.7554725012869853e-06, "loss": 0.78828073, "num_input_tokens_seen": 66204800, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.29907227, "step": 3065, "time_per_iteration": 2.663722038269043 }, { "auxiliary_loss_clip": 0.01538587, "auxiliary_loss_mlp": 0.00298987, "balance_loss_clip": 1.22327256, "balance_loss_mlp": 0.26932755, "epoch": 0.18433789267999398, "flos": 27852351920640.0, "grad_norm": 14.850803833542345, "language_loss": 0.82790166, "learning_rate": 3.7552858597351318e-06, "loss": 0.84627736, "num_input_tokens_seen": 66222195, "router_z_loss_clip": 3.15429688, "router_z_loss_mlp": 0.29638672, "step": 3066, "time_per_iteration": 2.6276907920837402 }, { "auxiliary_loss_clip": 0.0155798, "auxiliary_loss_mlp": 0.00259755, "balance_loss_clip": 1.24285328, "balance_loss_mlp": 0.23258701, "epoch": 0.18439801593266195, "flos": 17856940533120.0, "grad_norm": 10.66565467096761, "language_loss": 0.89436281, "learning_rate": 3.7550991516226622e-06, "loss": 0.91254008, "num_input_tokens_seen": 66239505, "router_z_loss_clip": 3.15429688, "router_z_loss_mlp": 0.27172852, "step": 3067, "time_per_iteration": 2.563652992248535 }, { "auxiliary_loss_clip": 0.01607479, "auxiliary_loss_mlp": 0.00629785, "balance_loss_clip": 1.28700876, "balance_loss_mlp": 0.61567092, "epoch": 0.18445813918532994, "flos": 56389522590720.0, "grad_norm": 0.9182244779711894, "language_loss": 0.59430492, "learning_rate": 3.754912376956657e-06, "loss": 0.61667752, "num_input_tokens_seen": 66295695, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.14160156, "step": 3068, "time_per_iteration": 2.9921112060546875 }, { "auxiliary_loss_clip": 0.01572683, "auxiliary_loss_mlp": 0.00251538, "balance_loss_clip": 1.25611138, "balance_loss_mlp": 0.22259408, "epoch": 0.1845182624379979, "flos": 20957180325120.0, "grad_norm": 10.637392938624341, "language_loss": 0.83823407, "learning_rate": 3.7547255357441987e-06, "loss": 0.85647631, "num_input_tokens_seen": 66315315, "router_z_loss_clip": 3.16601562, "router_z_loss_mlp": 0.28918457, "step": 3069, "time_per_iteration": 2.601224899291992 }, { "auxiliary_loss_clip": 0.01588986, "auxiliary_loss_mlp": 0.00253962, "balance_loss_clip": 1.27510524, "balance_loss_mlp": 0.2214891, "epoch": 0.18457838569066587, "flos": 20485170679680.0, "grad_norm": 44.36376174698488, "language_loss": 0.92129409, "learning_rate": 3.7545386279923718e-06, "loss": 0.93972355, "num_input_tokens_seen": 66333675, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.32470703, "step": 3070, "time_per_iteration": 2.6287648677825928 }, { "auxiliary_loss_clip": 0.01609731, "auxiliary_loss_mlp": 0.00262295, "balance_loss_clip": 1.28948689, "balance_loss_mlp": 0.22991753, "epoch": 0.18463850894333383, "flos": 25010022758400.0, "grad_norm": 8.82375874693897, "language_loss": 0.85198462, "learning_rate": 3.754351653708265e-06, "loss": 0.87070489, "num_input_tokens_seen": 66354075, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.32373047, "step": 3071, "time_per_iteration": 2.710223436355591 }, { "auxiliary_loss_clip": 0.01619681, "auxiliary_loss_mlp": 0.0027835, "balance_loss_clip": 1.30217505, "balance_loss_mlp": 0.24797508, "epoch": 0.1846986321960018, "flos": 16800628348800.0, "grad_norm": 33.72595325429311, "language_loss": 0.8729825, "learning_rate": 3.7541646128989674e-06, "loss": 0.89196277, "num_input_tokens_seen": 66372520, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.30395508, "step": 3072, "time_per_iteration": 2.581709146499634 }, { "auxiliary_loss_clip": 0.01622678, "auxiliary_loss_mlp": 0.00283665, "balance_loss_clip": 1.29768562, "balance_loss_mlp": 0.25133556, "epoch": 0.18475875544866976, "flos": 20814327936000.0, "grad_norm": 2.5826192403305406, "language_loss": 0.93197834, "learning_rate": 3.7539775055715715e-06, "loss": 0.95104182, "num_input_tokens_seen": 66390745, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.32324219, "step": 3073, "time_per_iteration": 2.5655696392059326 }, { "auxiliary_loss_clip": 0.01646229, "auxiliary_loss_mlp": 0.00256326, "balance_loss_clip": 1.3219862, "balance_loss_mlp": 0.2251054, "epoch": 0.18481887870133773, "flos": 22601422321920.0, "grad_norm": 22.803176044839347, "language_loss": 1.0104413, "learning_rate": 3.7537903317331732e-06, "loss": 1.02946675, "num_input_tokens_seen": 66410525, "router_z_loss_clip": 3.2421875, "router_z_loss_mlp": 0.31237793, "step": 3074, "time_per_iteration": 2.6393306255340576 }, { "auxiliary_loss_clip": 0.01622781, "auxiliary_loss_mlp": 0.00272142, "balance_loss_clip": 1.30226135, "balance_loss_mlp": 0.23831087, "epoch": 0.18487900195400572, "flos": 29458815788160.0, "grad_norm": 3.7651780498563587, "language_loss": 0.71848583, "learning_rate": 3.75360309139087e-06, "loss": 0.73743504, "num_input_tokens_seen": 66432535, "router_z_loss_clip": 3.20507812, "router_z_loss_mlp": 0.33813477, "step": 3075, "time_per_iteration": 2.731102705001831 }, { "auxiliary_loss_clip": 0.01644452, "auxiliary_loss_mlp": 0.00264533, "balance_loss_clip": 1.32234311, "balance_loss_mlp": 0.23388433, "epoch": 0.1849391252066737, "flos": 20628777254400.0, "grad_norm": 53.663054589538426, "language_loss": 0.8111192, "learning_rate": 3.753415784551761e-06, "loss": 0.83020908, "num_input_tokens_seen": 66450620, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.30627441, "step": 3076, "time_per_iteration": 2.744987726211548 }, { "auxiliary_loss_clip": 0.01637985, "auxiliary_loss_mlp": 0.00255599, "balance_loss_clip": 1.31661785, "balance_loss_mlp": 0.22424677, "epoch": 0.18499924845934165, "flos": 14428549065600.0, "grad_norm": 34.71133263912376, "language_loss": 0.9111377, "learning_rate": 3.7532284112229507e-06, "loss": 0.9300735, "num_input_tokens_seen": 66467865, "router_z_loss_clip": 3.21484375, "router_z_loss_mlp": 0.31347656, "step": 3077, "time_per_iteration": 2.5895066261291504 }, { "auxiliary_loss_clip": 0.01628087, "auxiliary_loss_mlp": 0.00249541, "balance_loss_clip": 1.30823159, "balance_loss_mlp": 0.21966726, "epoch": 0.18505937171200962, "flos": 23727652329600.0, "grad_norm": 49.11950615877729, "language_loss": 0.84906769, "learning_rate": 3.7530409714115424e-06, "loss": 0.86784393, "num_input_tokens_seen": 66486245, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.29858398, "step": 3078, "time_per_iteration": 2.626169443130493 }, { "auxiliary_loss_clip": 0.01666263, "auxiliary_loss_mlp": 0.00273188, "balance_loss_clip": 1.34567809, "balance_loss_mlp": 0.24221763, "epoch": 0.18511949496467758, "flos": 25957489754880.0, "grad_norm": 998.3502174207551, "language_loss": 0.84222996, "learning_rate": 3.7528534651246453e-06, "loss": 0.86162454, "num_input_tokens_seen": 66506510, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.30981445, "step": 3079, "time_per_iteration": 2.606353759765625 }, { "auxiliary_loss_clip": 0.01631679, "auxiliary_loss_mlp": 0.00273747, "balance_loss_clip": 1.31730044, "balance_loss_mlp": 0.24349129, "epoch": 0.18517961821734555, "flos": 42413553912960.0, "grad_norm": 19.33584671295054, "language_loss": 0.86888385, "learning_rate": 3.752665892369369e-06, "loss": 0.88793814, "num_input_tokens_seen": 66530960, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.30236816, "step": 3080, "time_per_iteration": 2.788346767425537 }, { "auxiliary_loss_clip": 0.01637691, "auxiliary_loss_mlp": 0.00276854, "balance_loss_clip": 1.31555009, "balance_loss_mlp": 0.24333228, "epoch": 0.18523974147001354, "flos": 24097568544000.0, "grad_norm": 18.569887118270916, "language_loss": 0.83369905, "learning_rate": 3.7524782531528266e-06, "loss": 0.85284448, "num_input_tokens_seen": 66550275, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.33496094, "step": 3081, "time_per_iteration": 2.658571243286133 }, { "auxiliary_loss_clip": 0.01661257, "auxiliary_loss_mlp": 0.00281242, "balance_loss_clip": 1.33963203, "balance_loss_mlp": 0.24745817, "epoch": 0.1852998647226815, "flos": 27375278457600.0, "grad_norm": 20.21335762303755, "language_loss": 0.80306554, "learning_rate": 3.7522905474821334e-06, "loss": 0.82249051, "num_input_tokens_seen": 66569040, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.33789062, "step": 3082, "time_per_iteration": 2.6207902431488037 }, { "auxiliary_loss_clip": 0.01673668, "auxiliary_loss_mlp": 0.0031532, "balance_loss_clip": 1.34993315, "balance_loss_mlp": 0.28055903, "epoch": 0.18535998797534947, "flos": 18332757020160.0, "grad_norm": 34.48322051706046, "language_loss": 0.78025854, "learning_rate": 3.752102775364407e-06, "loss": 0.80014837, "num_input_tokens_seen": 66587775, "router_z_loss_clip": 3.24023438, "router_z_loss_mlp": 0.34765625, "step": 3083, "time_per_iteration": 2.588942766189575 }, { "auxiliary_loss_clip": 0.01655624, "auxiliary_loss_mlp": 0.00295205, "balance_loss_clip": 1.33394718, "balance_loss_mlp": 0.26232731, "epoch": 0.18542011122801744, "flos": 37845859887360.0, "grad_norm": 5.541538689232545, "language_loss": 0.75980663, "learning_rate": 3.751914936806767e-06, "loss": 0.77931488, "num_input_tokens_seen": 66610800, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.32910156, "step": 3084, "time_per_iteration": 2.7301909923553467 }, { "auxiliary_loss_clip": 0.01656585, "auxiliary_loss_mlp": 0.00296004, "balance_loss_clip": 1.33580422, "balance_loss_mlp": 0.2652007, "epoch": 0.1854802344806854, "flos": 25186128163200.0, "grad_norm": 15.226640048535147, "language_loss": 0.82346809, "learning_rate": 3.7517270318163377e-06, "loss": 0.84299397, "num_input_tokens_seen": 66630960, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.30810547, "step": 3085, "time_per_iteration": 2.678938150405884 }, { "auxiliary_loss_clip": 0.0166076, "auxiliary_loss_mlp": 0.00307056, "balance_loss_clip": 1.33820057, "balance_loss_mlp": 0.2752271, "epoch": 0.18554035773335337, "flos": 26684788337280.0, "grad_norm": 2.272015834789527, "language_loss": 0.8073827, "learning_rate": 3.751539060400244e-06, "loss": 0.82706082, "num_input_tokens_seen": 66650585, "router_z_loss_clip": 3.22851562, "router_z_loss_mlp": 0.31811523, "step": 3086, "time_per_iteration": 2.630436420440674 }, { "auxiliary_loss_clip": 0.01653726, "auxiliary_loss_mlp": 0.00301985, "balance_loss_clip": 1.33216238, "balance_loss_mlp": 0.27101445, "epoch": 0.18560048098602133, "flos": 22346887570560.0, "grad_norm": 7.536261808767377, "language_loss": 0.79234982, "learning_rate": 3.7513510225656132e-06, "loss": 0.81190693, "num_input_tokens_seen": 66670045, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.30981445, "step": 3087, "time_per_iteration": 2.627048969268799 }, { "auxiliary_loss_clip": 0.01656228, "auxiliary_loss_mlp": 0.00313633, "balance_loss_clip": 1.33613563, "balance_loss_mlp": 0.28056473, "epoch": 0.18566060423868933, "flos": 17748526308480.0, "grad_norm": 3.247123598900782, "language_loss": 0.80494082, "learning_rate": 3.7511629183195764e-06, "loss": 0.82463944, "num_input_tokens_seen": 66688790, "router_z_loss_clip": 3.20507812, "router_z_loss_mlp": 0.33056641, "step": 3088, "time_per_iteration": 2.570807933807373 }, { "auxiliary_loss_clip": 0.01675206, "auxiliary_loss_mlp": 0.00334502, "balance_loss_clip": 1.3531971, "balance_loss_mlp": 0.30086082, "epoch": 0.1857207274913573, "flos": 24677274142080.0, "grad_norm": 1180.1543405635794, "language_loss": 1.00281715, "learning_rate": 3.7509747476692663e-06, "loss": 1.02291417, "num_input_tokens_seen": 66708090, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.33618164, "step": 3089, "time_per_iteration": 2.6639456748962402 }, { "auxiliary_loss_clip": 0.01664038, "auxiliary_loss_mlp": 0.00311069, "balance_loss_clip": 1.33855033, "balance_loss_mlp": 0.27790463, "epoch": 0.18578085074402526, "flos": 28147825198080.0, "grad_norm": 3.16832276232297, "language_loss": 0.65056133, "learning_rate": 3.7507865106218176e-06, "loss": 0.67031246, "num_input_tokens_seen": 66727320, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.33154297, "step": 3090, "time_per_iteration": 2.681551218032837 }, { "auxiliary_loss_clip": 0.01643296, "auxiliary_loss_mlp": 0.00320944, "balance_loss_clip": 1.32096982, "balance_loss_mlp": 0.28873318, "epoch": 0.18584097399669322, "flos": 23951878980480.0, "grad_norm": 2.037289682048942, "language_loss": 0.86448622, "learning_rate": 3.7505982071843695e-06, "loss": 0.88412857, "num_input_tokens_seen": 66747505, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.32177734, "step": 3091, "time_per_iteration": 2.6657636165618896 }, { "auxiliary_loss_clip": 0.01650816, "auxiliary_loss_mlp": 0.00345887, "balance_loss_clip": 1.32416439, "balance_loss_mlp": 0.31174541, "epoch": 0.18590109724936119, "flos": 17201678676480.0, "grad_norm": 65.93320914315568, "language_loss": 0.92212653, "learning_rate": 3.7504098373640617e-06, "loss": 0.94209355, "num_input_tokens_seen": 66766425, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.34130859, "step": 3092, "time_per_iteration": 2.5892868041992188 }, { "auxiliary_loss_clip": 0.01659514, "auxiliary_loss_mlp": 0.00338185, "balance_loss_clip": 1.32853985, "balance_loss_mlp": 0.30404329, "epoch": 0.18596122050202915, "flos": 17234644383360.0, "grad_norm": 534.2992507356756, "language_loss": 1.01749563, "learning_rate": 3.750221401168038e-06, "loss": 1.03747261, "num_input_tokens_seen": 66781130, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.34130859, "step": 3093, "time_per_iteration": 2.5600461959838867 }, { "auxiliary_loss_clip": 0.0164149, "auxiliary_loss_mlp": 0.00359868, "balance_loss_clip": 1.32020283, "balance_loss_mlp": 0.32777715, "epoch": 0.18602134375469712, "flos": 19020733188480.0, "grad_norm": 44.38019003382586, "language_loss": 0.82982826, "learning_rate": 3.750032898603443e-06, "loss": 0.84984183, "num_input_tokens_seen": 66797535, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.32104492, "step": 3094, "time_per_iteration": 2.5621337890625 }, { "auxiliary_loss_clip": 0.01650664, "auxiliary_loss_mlp": 0.0033597, "balance_loss_clip": 1.32793999, "balance_loss_mlp": 0.30449882, "epoch": 0.1860814670073651, "flos": 50950094417280.0, "grad_norm": 103.00390043862315, "language_loss": 0.75940788, "learning_rate": 3.749844329677425e-06, "loss": 0.77927423, "num_input_tokens_seen": 66821720, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.31469727, "step": 3095, "time_per_iteration": 2.8782858848571777 }, { "auxiliary_loss_clip": 0.01651775, "auxiliary_loss_mlp": 0.00376687, "balance_loss_clip": 1.31888914, "balance_loss_mlp": 0.34135303, "epoch": 0.18614159026003307, "flos": 19390972625280.0, "grad_norm": 8.645657819694993, "language_loss": 0.87413257, "learning_rate": 3.749655694397135e-06, "loss": 0.89441717, "num_input_tokens_seen": 66839060, "router_z_loss_clip": 3.328125, "router_z_loss_mlp": 0.35302734, "step": 3096, "time_per_iteration": 2.5710058212280273 }, { "auxiliary_loss_clip": 0.0165442, "auxiliary_loss_mlp": 0.00368992, "balance_loss_clip": 1.3217485, "balance_loss_mlp": 0.33353949, "epoch": 0.18620171351270104, "flos": 21798782962560.0, "grad_norm": 43.14234731923158, "language_loss": 0.82141632, "learning_rate": 3.7494669927697255e-06, "loss": 0.84165043, "num_input_tokens_seen": 66857760, "router_z_loss_clip": 3.328125, "router_z_loss_mlp": 0.35424805, "step": 3097, "time_per_iteration": 2.595466375350952 }, { "auxiliary_loss_clip": 0.01667801, "auxiliary_loss_mlp": 0.00388953, "balance_loss_clip": 1.33644772, "balance_loss_mlp": 0.35402519, "epoch": 0.186261836765369, "flos": 16362877299840.0, "grad_norm": 10.359216165902295, "language_loss": 0.73551846, "learning_rate": 3.749278224802352e-06, "loss": 0.75608605, "num_input_tokens_seen": 66876460, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.34912109, "step": 3098, "time_per_iteration": 2.566417932510376 }, { "auxiliary_loss_clip": 0.01672423, "auxiliary_loss_mlp": 0.00425081, "balance_loss_clip": 1.33346868, "balance_loss_mlp": 0.38319075, "epoch": 0.18632196001803697, "flos": 23370054480000.0, "grad_norm": 18.66550426789156, "language_loss": 0.75581026, "learning_rate": 3.7490893905021733e-06, "loss": 0.77678531, "num_input_tokens_seen": 66897960, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.41894531, "step": 3099, "time_per_iteration": 4.084285020828247 }, { "auxiliary_loss_clip": 0.01683319, "auxiliary_loss_mlp": 0.00433986, "balance_loss_clip": 1.34544551, "balance_loss_mlp": 0.39469451, "epoch": 0.18638208327070493, "flos": 22492002516480.0, "grad_norm": 4.045755484108542, "language_loss": 0.78052115, "learning_rate": 3.7489004898763494e-06, "loss": 0.80169415, "num_input_tokens_seen": 66917675, "router_z_loss_clip": 3.38476562, "router_z_loss_mlp": 0.39282227, "step": 3100, "time_per_iteration": 2.6126701831817627 }, { "auxiliary_loss_clip": 0.01700443, "auxiliary_loss_mlp": 0.00452558, "balance_loss_clip": 1.35660517, "balance_loss_mlp": 0.40983325, "epoch": 0.18644220652337293, "flos": 29165245931520.0, "grad_norm": 286.22405897435686, "language_loss": 0.85808468, "learning_rate": 3.7487115229320444e-06, "loss": 0.87961471, "num_input_tokens_seen": 66936000, "router_z_loss_clip": 3.43359375, "router_z_loss_mlp": 0.42700195, "step": 3101, "time_per_iteration": 2.6593501567840576 }, { "auxiliary_loss_clip": 0.01691024, "auxiliary_loss_mlp": 0.00427232, "balance_loss_clip": 1.34987283, "balance_loss_mlp": 0.38677257, "epoch": 0.1865023297760409, "flos": 24243796811520.0, "grad_norm": 41.57034909205917, "language_loss": 0.81701523, "learning_rate": 3.7485224896764222e-06, "loss": 0.83819783, "num_input_tokens_seen": 66955700, "router_z_loss_clip": 3.41210938, "router_z_loss_mlp": 0.40454102, "step": 3102, "time_per_iteration": 4.114346742630005 }, { "auxiliary_loss_clip": 0.01686224, "auxiliary_loss_mlp": 0.0045867, "balance_loss_clip": 1.34484947, "balance_loss_mlp": 0.41594553, "epoch": 0.18656245302870886, "flos": 19128716449920.0, "grad_norm": 35.20736077242844, "language_loss": 0.82689422, "learning_rate": 3.7483333901166525e-06, "loss": 0.84834313, "num_input_tokens_seen": 66972815, "router_z_loss_clip": 3.41601562, "router_z_loss_mlp": 0.42749023, "step": 3103, "time_per_iteration": 2.6716675758361816 }, { "auxiliary_loss_clip": 0.01700646, "auxiliary_loss_mlp": 0.00467027, "balance_loss_clip": 1.35464787, "balance_loss_mlp": 0.42303836, "epoch": 0.18662257628137682, "flos": 17786088956160.0, "grad_norm": 25.84075737771718, "language_loss": 0.85729289, "learning_rate": 3.7481442242599054e-06, "loss": 0.87896967, "num_input_tokens_seen": 66992280, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.44018555, "step": 3104, "time_per_iteration": 2.567841053009033 }, { "auxiliary_loss_clip": 0.01698843, "auxiliary_loss_mlp": 0.00485334, "balance_loss_clip": 1.35364544, "balance_loss_mlp": 0.44022554, "epoch": 0.1866826995340448, "flos": 24024382583040.0, "grad_norm": 4.352019525951, "language_loss": 0.9217062, "learning_rate": 3.747954992113354e-06, "loss": 0.94354802, "num_input_tokens_seen": 67012220, "router_z_loss_clip": 3.453125, "router_z_loss_mlp": 0.45117188, "step": 3105, "time_per_iteration": 2.7265758514404297 }, { "auxiliary_loss_clip": 0.01682993, "auxiliary_loss_mlp": 0.00488468, "balance_loss_clip": 1.32703757, "balance_loss_mlp": 0.44214332, "epoch": 0.18674282278671275, "flos": 26141244756480.0, "grad_norm": 136.91393057563036, "language_loss": 0.94989121, "learning_rate": 3.7477656936841742e-06, "loss": 0.97160578, "num_input_tokens_seen": 67032030, "router_z_loss_clip": 3.55859375, "router_z_loss_mlp": 0.46313477, "step": 3106, "time_per_iteration": 2.6285946369171143 }, { "auxiliary_loss_clip": 0.01716439, "auxiliary_loss_mlp": 0.00506679, "balance_loss_clip": 1.3561151, "balance_loss_mlp": 0.45944789, "epoch": 0.18680294603938072, "flos": 19201938324480.0, "grad_norm": 255.61588346492675, "language_loss": 0.84183848, "learning_rate": 3.7475763289795445e-06, "loss": 0.86406964, "num_input_tokens_seen": 67048920, "router_z_loss_clip": 3.60546875, "router_z_loss_mlp": 0.47216797, "step": 3107, "time_per_iteration": 3.9831018447875977 }, { "auxiliary_loss_clip": 0.01703067, "auxiliary_loss_mlp": 0.00551313, "balance_loss_clip": 1.34396195, "balance_loss_mlp": 0.49881315, "epoch": 0.1868630692920487, "flos": 28544889116160.0, "grad_norm": 7.365904566217898, "language_loss": 0.81877816, "learning_rate": 3.7473868980066446e-06, "loss": 0.84132195, "num_input_tokens_seen": 67068645, "router_z_loss_clip": 3.58984375, "router_z_loss_mlp": 0.52490234, "step": 3108, "time_per_iteration": 2.6588711738586426 }, { "auxiliary_loss_clip": 0.01718029, "auxiliary_loss_mlp": 0.0052768, "balance_loss_clip": 1.35255516, "balance_loss_mlp": 0.47546685, "epoch": 0.18692319254471668, "flos": 17238020261760.0, "grad_norm": 25.62521045093071, "language_loss": 0.80681783, "learning_rate": 3.747197400772658e-06, "loss": 0.82927495, "num_input_tokens_seen": 67087075, "router_z_loss_clip": 3.65429688, "router_z_loss_mlp": 0.5222168, "step": 3109, "time_per_iteration": 2.550187826156616 }, { "auxiliary_loss_clip": 0.01723372, "auxiliary_loss_mlp": 0.00507979, "balance_loss_clip": 1.35829604, "balance_loss_mlp": 0.456862, "epoch": 0.18698331579738464, "flos": 23185186156800.0, "grad_norm": 14.426702453136896, "language_loss": 0.90136915, "learning_rate": 3.747007837284772e-06, "loss": 0.92368269, "num_input_tokens_seen": 67108040, "router_z_loss_clip": 3.6484375, "router_z_loss_mlp": 0.51098633, "step": 3110, "time_per_iteration": 2.7198309898376465 }, { "auxiliary_loss_clip": 0.0171439, "auxiliary_loss_mlp": 0.00546005, "balance_loss_clip": 1.34636474, "balance_loss_mlp": 0.49469697, "epoch": 0.1870434390500526, "flos": 25516721963520.0, "grad_norm": 13.59083231314776, "language_loss": 0.89388096, "learning_rate": 3.7468182075501737e-06, "loss": 0.91648483, "num_input_tokens_seen": 67127605, "router_z_loss_clip": 3.68164062, "router_z_loss_mlp": 0.51293945, "step": 3111, "time_per_iteration": 2.6117324829101562 }, { "auxiliary_loss_clip": 0.01723638, "auxiliary_loss_mlp": 0.00524833, "balance_loss_clip": 1.34981394, "balance_loss_mlp": 0.47505161, "epoch": 0.18710356230272057, "flos": 19500823393920.0, "grad_norm": 7.4288106980061945, "language_loss": 0.83689344, "learning_rate": 3.7466285115760536e-06, "loss": 0.85937822, "num_input_tokens_seen": 67145785, "router_z_loss_clip": 3.7421875, "router_z_loss_mlp": 0.49829102, "step": 3112, "time_per_iteration": 2.61617374420166 }, { "auxiliary_loss_clip": 0.01712126, "auxiliary_loss_mlp": 0.00533277, "balance_loss_clip": 1.3341136, "balance_loss_mlp": 0.48521161, "epoch": 0.18716368555538854, "flos": 26760847386240.0, "grad_norm": 10.66984050648729, "language_loss": 0.71814096, "learning_rate": 3.7464387493696046e-06, "loss": 0.74059498, "num_input_tokens_seen": 67165930, "router_z_loss_clip": 3.77734375, "router_z_loss_mlp": 0.48046875, "step": 3113, "time_per_iteration": 2.650690793991089 }, { "auxiliary_loss_clip": 0.01734602, "auxiliary_loss_mlp": 0.00580481, "balance_loss_clip": 1.34600663, "balance_loss_mlp": 0.52504909, "epoch": 0.1872238088080565, "flos": 25189827264000.0, "grad_norm": 27.495445565186536, "language_loss": 0.87814927, "learning_rate": 3.746248920938024e-06, "loss": 0.90130013, "num_input_tokens_seen": 67185830, "router_z_loss_clip": 3.88085938, "router_z_loss_mlp": 0.55517578, "step": 3114, "time_per_iteration": 2.6658849716186523 }, { "auxiliary_loss_clip": 0.01721143, "auxiliary_loss_mlp": 0.00608043, "balance_loss_clip": 1.32654834, "balance_loss_mlp": 0.54877222, "epoch": 0.1872839320607245, "flos": 24134305178880.0, "grad_norm": 7.214666887042284, "language_loss": 0.64978838, "learning_rate": 3.74605902628851e-06, "loss": 0.67308021, "num_input_tokens_seen": 67206930, "router_z_loss_clip": 3.94335938, "router_z_loss_mlp": 0.59204102, "step": 3115, "time_per_iteration": 2.6355226039886475 }, { "auxiliary_loss_clip": 0.01748832, "auxiliary_loss_mlp": 0.00545443, "balance_loss_clip": 1.35340738, "balance_loss_mlp": 0.49396843, "epoch": 0.18734405531339246, "flos": 21173793292800.0, "grad_norm": 5.508442552968976, "language_loss": 0.77868479, "learning_rate": 3.745869065428261e-06, "loss": 0.80162752, "num_input_tokens_seen": 67226290, "router_z_loss_clip": 3.95507812, "router_z_loss_mlp": 0.51513672, "step": 3116, "time_per_iteration": 2.6324305534362793 }, { "auxiliary_loss_clip": 0.01751178, "auxiliary_loss_mlp": 0.0052754, "balance_loss_clip": 1.34480417, "balance_loss_mlp": 0.47601774, "epoch": 0.18740417856606043, "flos": 17237697039360.0, "grad_norm": 7.915268138544755, "language_loss": 0.86085224, "learning_rate": 3.7456790383644833e-06, "loss": 0.88363945, "num_input_tokens_seen": 67244410, "router_z_loss_clip": 4.0625, "router_z_loss_mlp": 0.51513672, "step": 3117, "time_per_iteration": 2.605804681777954 }, { "auxiliary_loss_clip": 0.01752969, "auxiliary_loss_mlp": 0.00576902, "balance_loss_clip": 1.3463378, "balance_loss_mlp": 0.5221135, "epoch": 0.1874643018187284, "flos": 32558049999360.0, "grad_norm": 40.72247025046098, "language_loss": 0.88354445, "learning_rate": 3.745488945104381e-06, "loss": 0.90684319, "num_input_tokens_seen": 67264470, "router_z_loss_clip": 4.0703125, "router_z_loss_mlp": 0.5480957, "step": 3118, "time_per_iteration": 2.7086985111236572 }, { "auxiliary_loss_clip": 0.01715843, "auxiliary_loss_mlp": 0.00540873, "balance_loss_clip": 1.3060267, "balance_loss_mlp": 0.49032873, "epoch": 0.18752442507139636, "flos": 23258156636160.0, "grad_norm": 186.65513159591933, "language_loss": 0.82141459, "learning_rate": 3.7452987856551636e-06, "loss": 0.84398174, "num_input_tokens_seen": 67284315, "router_z_loss_clip": 4.09960938, "router_z_loss_mlp": 0.50585938, "step": 3119, "time_per_iteration": 2.73049259185791 }, { "auxiliary_loss_clip": 0.01744478, "auxiliary_loss_mlp": 0.00524367, "balance_loss_clip": 1.32349443, "balance_loss_mlp": 0.47513381, "epoch": 0.18758454832406432, "flos": 21760933006080.0, "grad_norm": 10.159702496633102, "language_loss": 0.86584151, "learning_rate": 3.7451085600240406e-06, "loss": 0.88853002, "num_input_tokens_seen": 67302780, "router_z_loss_clip": 4.20703125, "router_z_loss_mlp": 0.49267578, "step": 3120, "time_per_iteration": 2.623866081237793 }, { "auxiliary_loss_clip": 0.01742199, "auxiliary_loss_mlp": 0.00517546, "balance_loss_clip": 1.31460452, "balance_loss_mlp": 0.46750218, "epoch": 0.1876446715767323, "flos": 29570210841600.0, "grad_norm": 6.2013043520552396, "language_loss": 0.90806007, "learning_rate": 3.7449182682182263e-06, "loss": 0.93065751, "num_input_tokens_seen": 67323405, "router_z_loss_clip": 4.28125, "router_z_loss_mlp": 0.5, "step": 3121, "time_per_iteration": 2.693110466003418 }, { "auxiliary_loss_clip": 0.01787672, "auxiliary_loss_mlp": 0.00542846, "balance_loss_clip": 1.34576845, "balance_loss_mlp": 0.49184823, "epoch": 0.18770479482940028, "flos": 30339992234880.0, "grad_norm": 31.033603838031425, "language_loss": 0.78074563, "learning_rate": 3.744727910244937e-06, "loss": 0.8040508, "num_input_tokens_seen": 67345800, "router_z_loss_clip": 4.421875, "router_z_loss_mlp": 0.50976562, "step": 3122, "time_per_iteration": 2.6971802711486816 }, { "auxiliary_loss_clip": 0.01764462, "auxiliary_loss_mlp": 0.00530373, "balance_loss_clip": 1.32902455, "balance_loss_mlp": 0.48092464, "epoch": 0.18776491808206824, "flos": 14465357527680.0, "grad_norm": 4.321147745477317, "language_loss": 0.77292508, "learning_rate": 3.7445374861113905e-06, "loss": 0.7958734, "num_input_tokens_seen": 67363575, "router_z_loss_clip": 4.35546875, "router_z_loss_mlp": 0.49511719, "step": 3123, "time_per_iteration": 2.5785975456237793 }, { "auxiliary_loss_clip": 0.01767734, "auxiliary_loss_mlp": 0.00488694, "balance_loss_clip": 1.32811284, "balance_loss_mlp": 0.44177362, "epoch": 0.1878250413347362, "flos": 24498547044480.0, "grad_norm": 48.374349847369565, "language_loss": 0.8068195, "learning_rate": 3.7443469958248066e-06, "loss": 0.82938373, "num_input_tokens_seen": 67381765, "router_z_loss_clip": 4.390625, "router_z_loss_mlp": 0.46923828, "step": 3124, "time_per_iteration": 2.6318585872650146 }, { "auxiliary_loss_clip": 0.01752127, "auxiliary_loss_mlp": 0.00562982, "balance_loss_clip": 1.30620027, "balance_loss_mlp": 0.5112927, "epoch": 0.18788516458740417, "flos": 39786185692800.0, "grad_norm": 15.218568735909521, "language_loss": 0.86856413, "learning_rate": 3.7441564393924106e-06, "loss": 0.89171529, "num_input_tokens_seen": 67405000, "router_z_loss_clip": 4.45703125, "router_z_loss_mlp": 0.51733398, "step": 3125, "time_per_iteration": 2.862583637237549 }, { "auxiliary_loss_clip": 0.01939333, "auxiliary_loss_mlp": 0.00723333, "balance_loss_clip": 1.38223183, "balance_loss_mlp": 0.68289757, "epoch": 0.18794528784007214, "flos": 64699250664960.0, "grad_norm": 1.0481687503649295, "language_loss": 0.63828248, "learning_rate": 3.7439658168214273e-06, "loss": 0.66490912, "num_input_tokens_seen": 67467140, "router_z_loss_clip": 5.5625, "router_z_loss_mlp": 0.40429688, "step": 3126, "time_per_iteration": 3.166856527328491 }, { "auxiliary_loss_clip": 0.01766415, "auxiliary_loss_mlp": 0.00525707, "balance_loss_clip": 1.32589114, "balance_loss_mlp": 0.47728425, "epoch": 0.1880054110927401, "flos": 28622061486720.0, "grad_norm": 3.6620536353428688, "language_loss": 0.85888076, "learning_rate": 3.7437751281190857e-06, "loss": 0.88180196, "num_input_tokens_seen": 67487980, "router_z_loss_clip": 4.4140625, "router_z_loss_mlp": 0.48413086, "step": 3127, "time_per_iteration": 2.7034950256347656 }, { "auxiliary_loss_clip": 0.01933443, "auxiliary_loss_mlp": 0.00490453, "balance_loss_clip": 1.35019016, "balance_loss_mlp": 0.46336871, "epoch": 0.1880655343454081, "flos": 64488958490880.0, "grad_norm": 1.0074141727830415, "language_loss": 0.61944276, "learning_rate": 3.7435843732926164e-06, "loss": 0.6436817, "num_input_tokens_seen": 67552500, "router_z_loss_clip": 5.84375, "router_z_loss_mlp": 0.27148438, "step": 3128, "time_per_iteration": 3.1770265102386475 }, { "auxiliary_loss_clip": 0.01735192, "auxiliary_loss_mlp": 0.00454923, "balance_loss_clip": 1.31358731, "balance_loss_mlp": 0.4123894, "epoch": 0.18812565759807606, "flos": 32124464928000.0, "grad_norm": 249.8593586899356, "language_loss": 0.79409349, "learning_rate": 3.7433935523492536e-06, "loss": 0.81599474, "num_input_tokens_seen": 67573295, "router_z_loss_clip": 4.2109375, "router_z_loss_mlp": 0.42553711, "step": 3129, "time_per_iteration": 2.7239208221435547 }, { "auxiliary_loss_clip": 0.01733419, "auxiliary_loss_mlp": 0.00443824, "balance_loss_clip": 1.32632768, "balance_loss_mlp": 0.40045533, "epoch": 0.18818578085074403, "flos": 20624539449600.0, "grad_norm": 3.200542563102962, "language_loss": 0.92112917, "learning_rate": 3.7432026652962314e-06, "loss": 0.94290161, "num_input_tokens_seen": 67590010, "router_z_loss_clip": 4.06835938, "router_z_loss_mlp": 0.43383789, "step": 3130, "time_per_iteration": 2.598921060562134 }, { "auxiliary_loss_clip": 0.0171425, "auxiliary_loss_mlp": 0.00416584, "balance_loss_clip": 1.30755246, "balance_loss_mlp": 0.37760258, "epoch": 0.188245904103412, "flos": 28840506048000.0, "grad_norm": 5.470956120316693, "language_loss": 0.82556313, "learning_rate": 3.7430117121407897e-06, "loss": 0.8468715, "num_input_tokens_seen": 67611110, "router_z_loss_clip": 4.06835938, "router_z_loss_mlp": 0.38989258, "step": 3131, "time_per_iteration": 2.6537117958068848 }, { "auxiliary_loss_clip": 0.01702177, "auxiliary_loss_mlp": 0.00432302, "balance_loss_clip": 1.31361103, "balance_loss_mlp": 0.39251003, "epoch": 0.18830602735607996, "flos": 29420319386880.0, "grad_norm": 27.450373763479938, "language_loss": 0.88559705, "learning_rate": 3.74282069289017e-06, "loss": 0.90694189, "num_input_tokens_seen": 67631990, "router_z_loss_clip": 3.88085938, "router_z_loss_mlp": 0.39819336, "step": 3132, "time_per_iteration": 2.756167411804199 }, { "auxiliary_loss_clip": 0.01691329, "auxiliary_loss_mlp": 0.0046265, "balance_loss_clip": 1.30717707, "balance_loss_mlp": 0.41982996, "epoch": 0.18836615060874792, "flos": 28872933050880.0, "grad_norm": 24.080784711086867, "language_loss": 0.86716044, "learning_rate": 3.742629607551614e-06, "loss": 0.88870019, "num_input_tokens_seen": 67650490, "router_z_loss_clip": 3.84570312, "router_z_loss_mlp": 0.42822266, "step": 3133, "time_per_iteration": 2.6437807083129883 }, { "auxiliary_loss_clip": 0.01684743, "auxiliary_loss_mlp": 0.00440287, "balance_loss_clip": 1.31290007, "balance_loss_mlp": 0.39961284, "epoch": 0.18842627386141592, "flos": 22601673717120.0, "grad_norm": 2.581375290161202, "language_loss": 0.876647, "learning_rate": 3.7424384561323698e-06, "loss": 0.8978973, "num_input_tokens_seen": 67668860, "router_z_loss_clip": 3.71875, "router_z_loss_mlp": 0.40673828, "step": 3134, "time_per_iteration": 2.67865252494812 }, { "auxiliary_loss_clip": 0.01681223, "auxiliary_loss_mlp": 0.00418853, "balance_loss_clip": 1.31104696, "balance_loss_mlp": 0.37918055, "epoch": 0.18848639711408388, "flos": 24573600512640.0, "grad_norm": 15.99826296251864, "language_loss": 0.87127078, "learning_rate": 3.742247238639684e-06, "loss": 0.89227152, "num_input_tokens_seen": 67690220, "router_z_loss_clip": 3.70117188, "router_z_loss_mlp": 0.39697266, "step": 3135, "time_per_iteration": 2.6381287574768066 }, { "auxiliary_loss_clip": 0.01684208, "auxiliary_loss_mlp": 0.00389484, "balance_loss_clip": 1.32177138, "balance_loss_mlp": 0.35486591, "epoch": 0.18854652036675185, "flos": 34166920078080.0, "grad_norm": 4.954205564589724, "language_loss": 0.84542209, "learning_rate": 3.7420559550808083e-06, "loss": 0.86615896, "num_input_tokens_seen": 67709820, "router_z_loss_clip": 3.62304688, "router_z_loss_mlp": 0.34643555, "step": 3136, "time_per_iteration": 2.7060134410858154 }, { "auxiliary_loss_clip": 0.0169789, "auxiliary_loss_mlp": 0.00448964, "balance_loss_clip": 1.33808458, "balance_loss_mlp": 0.4105069, "epoch": 0.1886066436194198, "flos": 24200236592640.0, "grad_norm": 7.025106882528393, "language_loss": 0.88601649, "learning_rate": 3.741864605462996e-06, "loss": 0.90748501, "num_input_tokens_seen": 67729490, "router_z_loss_clip": 3.60351562, "router_z_loss_mlp": 0.38452148, "step": 3137, "time_per_iteration": 2.60099720954895 }, { "auxiliary_loss_clip": 0.01712367, "auxiliary_loss_mlp": 0.00466973, "balance_loss_clip": 1.35512495, "balance_loss_mlp": 0.42954099, "epoch": 0.18866676687208778, "flos": 21251109317760.0, "grad_norm": 4.566958245055196, "language_loss": 0.85802186, "learning_rate": 3.741673189793504e-06, "loss": 0.87981522, "num_input_tokens_seen": 67749665, "router_z_loss_clip": 3.57421875, "router_z_loss_mlp": 0.37451172, "step": 3138, "time_per_iteration": 2.6089391708374023 }, { "auxiliary_loss_clip": 0.0169109, "auxiliary_loss_mlp": 0.00439093, "balance_loss_clip": 1.33560061, "balance_loss_mlp": 0.40066001, "epoch": 0.18872689012475574, "flos": 37308673013760.0, "grad_norm": 13.546352358803889, "language_loss": 0.70658588, "learning_rate": 3.7414817080795896e-06, "loss": 0.72788775, "num_input_tokens_seen": 67776230, "router_z_loss_clip": 3.55664062, "router_z_loss_mlp": 0.38427734, "step": 3139, "time_per_iteration": 2.758195638656616 }, { "auxiliary_loss_clip": 0.01672774, "auxiliary_loss_mlp": 0.00460229, "balance_loss_clip": 1.32042575, "balance_loss_mlp": 0.42174792, "epoch": 0.1887870133774237, "flos": 21652303299840.0, "grad_norm": 91.98992531998626, "language_loss": 0.77013695, "learning_rate": 3.741290160328514e-06, "loss": 0.79146701, "num_input_tokens_seen": 67795080, "router_z_loss_clip": 3.52148438, "router_z_loss_mlp": 0.38500977, "step": 3140, "time_per_iteration": 2.5791168212890625 }, { "auxiliary_loss_clip": 0.01659691, "auxiliary_loss_mlp": 0.00432422, "balance_loss_clip": 1.30910778, "balance_loss_mlp": 0.39572984, "epoch": 0.1888471366300917, "flos": 15924659374080.0, "grad_norm": 14.615255164886163, "language_loss": 0.94353902, "learning_rate": 3.7410985465475412e-06, "loss": 0.96446013, "num_input_tokens_seen": 67813110, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.36645508, "step": 3141, "time_per_iteration": 3.9974100589752197 }, { "auxiliary_loss_clip": 0.0168594, "auxiliary_loss_mlp": 0.00455319, "balance_loss_clip": 1.33716142, "balance_loss_mlp": 0.4157179, "epoch": 0.18890725988275966, "flos": 18551955767040.0, "grad_norm": 49.767196238323, "language_loss": 0.82598871, "learning_rate": 3.7409068667439378e-06, "loss": 0.84740138, "num_input_tokens_seen": 67831070, "router_z_loss_clip": 3.48632812, "router_z_loss_mlp": 0.39575195, "step": 3142, "time_per_iteration": 2.5662922859191895 }, { "auxiliary_loss_clip": 0.01699314, "auxiliary_loss_mlp": 0.00487268, "balance_loss_clip": 1.35408139, "balance_loss_mlp": 0.45255435, "epoch": 0.18896738313542763, "flos": 28840865184000.0, "grad_norm": 60.26195208321762, "language_loss": 0.84146488, "learning_rate": 3.740715120924971e-06, "loss": 0.86333066, "num_input_tokens_seen": 67852170, "router_z_loss_clip": 3.45117188, "router_z_loss_mlp": 0.34716797, "step": 3143, "time_per_iteration": 2.6825618743896484 }, { "auxiliary_loss_clip": 0.01682691, "auxiliary_loss_mlp": 0.00458, "balance_loss_clip": 1.33822024, "balance_loss_mlp": 0.42173609, "epoch": 0.1890275063880956, "flos": 22412747157120.0, "grad_norm": 5.944887681619039, "language_loss": 0.777179, "learning_rate": 3.740523309097912e-06, "loss": 0.79858589, "num_input_tokens_seen": 67869945, "router_z_loss_clip": 3.4453125, "router_z_loss_mlp": 0.36254883, "step": 3144, "time_per_iteration": 4.012053728103638 }, { "auxiliary_loss_clip": 0.01685102, "auxiliary_loss_mlp": 0.00424817, "balance_loss_clip": 1.34271133, "balance_loss_mlp": 0.38793316, "epoch": 0.18908762964076356, "flos": 24243904552320.0, "grad_norm": 358.6020331637659, "language_loss": 0.82951605, "learning_rate": 3.7403314312700356e-06, "loss": 0.8506152, "num_input_tokens_seen": 67890240, "router_z_loss_clip": 3.42382812, "router_z_loss_mlp": 0.36865234, "step": 3145, "time_per_iteration": 2.6142802238464355 }, { "auxiliary_loss_clip": 0.01684259, "auxiliary_loss_mlp": 0.0045995, "balance_loss_clip": 1.34727931, "balance_loss_mlp": 0.4238776, "epoch": 0.18914775289343153, "flos": 16982910892800.0, "grad_norm": 24.256245211400067, "language_loss": 0.84361869, "learning_rate": 3.740139487448616e-06, "loss": 0.86506081, "num_input_tokens_seen": 67907825, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.36108398, "step": 3146, "time_per_iteration": 2.5728037357330322 }, { "auxiliary_loss_clip": 0.0169326, "auxiliary_loss_mlp": 0.00451143, "balance_loss_clip": 1.35486603, "balance_loss_mlp": 0.41475987, "epoch": 0.1892078761460995, "flos": 21543781334400.0, "grad_norm": 8.667830695849041, "language_loss": 0.85222363, "learning_rate": 3.7399474776409326e-06, "loss": 0.87366766, "num_input_tokens_seen": 67926670, "router_z_loss_clip": 3.38085938, "router_z_loss_mlp": 0.36401367, "step": 3147, "time_per_iteration": 2.6194007396698 }, { "auxiliary_loss_clip": 0.01688237, "auxiliary_loss_mlp": 0.00493153, "balance_loss_clip": 1.35143888, "balance_loss_mlp": 0.45569712, "epoch": 0.18926799939876748, "flos": 23001538896000.0, "grad_norm": 34.37827438415028, "language_loss": 0.73301387, "learning_rate": 3.739755401854267e-06, "loss": 0.75482774, "num_input_tokens_seen": 67943645, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.37475586, "step": 3148, "time_per_iteration": 2.6175315380096436 }, { "auxiliary_loss_clip": 0.01682207, "auxiliary_loss_mlp": 0.00494901, "balance_loss_clip": 1.34355259, "balance_loss_mlp": 0.45601457, "epoch": 0.18932812265143545, "flos": 22273019251200.0, "grad_norm": 7.522618279025313, "language_loss": 0.81988382, "learning_rate": 3.739563260095902e-06, "loss": 0.8416549, "num_input_tokens_seen": 67962345, "router_z_loss_clip": 3.3828125, "router_z_loss_mlp": 0.38891602, "step": 3149, "time_per_iteration": 4.02577543258667 }, { "auxiliary_loss_clip": 0.01682246, "auxiliary_loss_mlp": 0.00458202, "balance_loss_clip": 1.34741235, "balance_loss_mlp": 0.42360789, "epoch": 0.1893882459041034, "flos": 18624423456000.0, "grad_norm": 17.840704409322555, "language_loss": 0.88037741, "learning_rate": 3.7393710523731245e-06, "loss": 0.90178192, "num_input_tokens_seen": 67979760, "router_z_loss_clip": 3.3515625, "router_z_loss_mlp": 0.34594727, "step": 3150, "time_per_iteration": 2.6240153312683105 }, { "auxiliary_loss_clip": 0.01692824, "auxiliary_loss_mlp": 0.00477347, "balance_loss_clip": 1.35719061, "balance_loss_mlp": 0.44105953, "epoch": 0.18944836915677138, "flos": 22892981016960.0, "grad_norm": 4.225318384464344, "language_loss": 0.91111046, "learning_rate": 3.7391787786932215e-06, "loss": 0.93281221, "num_input_tokens_seen": 67996895, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.36230469, "step": 3151, "time_per_iteration": 2.6164162158966064 }, { "auxiliary_loss_clip": 0.01726362, "auxiliary_loss_mlp": 0.00470049, "balance_loss_clip": 1.38360023, "balance_loss_mlp": 0.4327361, "epoch": 0.18950849240943934, "flos": 26796542526720.0, "grad_norm": 5.1562454968299285, "language_loss": 0.80787349, "learning_rate": 3.7389864390634857e-06, "loss": 0.82983756, "num_input_tokens_seen": 68018365, "router_z_loss_clip": 3.42773438, "router_z_loss_mlp": 0.37304688, "step": 3152, "time_per_iteration": 2.623049020767212 }, { "auxiliary_loss_clip": 0.01711428, "auxiliary_loss_mlp": 0.00472568, "balance_loss_clip": 1.37572014, "balance_loss_mlp": 0.43499294, "epoch": 0.1895686156621073, "flos": 24971239048320.0, "grad_norm": 119.27796595724571, "language_loss": 0.81564045, "learning_rate": 3.738794033491209e-06, "loss": 0.83748049, "num_input_tokens_seen": 68037985, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.37573242, "step": 3153, "time_per_iteration": 2.6324825286865234 }, { "auxiliary_loss_clip": 0.01699784, "auxiliary_loss_mlp": 0.00488908, "balance_loss_clip": 1.36432338, "balance_loss_mlp": 0.45176214, "epoch": 0.1896287389147753, "flos": 21944544353280.0, "grad_norm": 10.716313949959078, "language_loss": 0.85321635, "learning_rate": 3.7386015619836887e-06, "loss": 0.87510324, "num_input_tokens_seen": 68057975, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.37158203, "step": 3154, "time_per_iteration": 2.633535623550415 }, { "auxiliary_loss_clip": 0.01719398, "auxiliary_loss_mlp": 0.00499323, "balance_loss_clip": 1.38208508, "balance_loss_mlp": 0.45945951, "epoch": 0.18968886216744327, "flos": 18179058723840.0, "grad_norm": 10.827747172857457, "language_loss": 0.79158169, "learning_rate": 3.738409024548223e-06, "loss": 0.81376886, "num_input_tokens_seen": 68074175, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.39892578, "step": 3155, "time_per_iteration": 2.5578527450561523 }, { "auxiliary_loss_clip": 0.01706976, "auxiliary_loss_mlp": 0.00481855, "balance_loss_clip": 1.37100351, "balance_loss_mlp": 0.44361269, "epoch": 0.18974898542011123, "flos": 20412487509120.0, "grad_norm": 11.8894964010461, "language_loss": 0.79751754, "learning_rate": 3.7382164211921136e-06, "loss": 0.81940585, "num_input_tokens_seen": 68095230, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.38208008, "step": 3156, "time_per_iteration": 2.603217363357544 }, { "auxiliary_loss_clip": 0.01682323, "auxiliary_loss_mlp": 0.00507131, "balance_loss_clip": 1.35021579, "balance_loss_mlp": 0.46917507, "epoch": 0.1898091086727792, "flos": 23985024255360.0, "grad_norm": 6.864596717716565, "language_loss": 0.73516095, "learning_rate": 3.7380237519226623e-06, "loss": 0.75705552, "num_input_tokens_seen": 68113805, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.37939453, "step": 3157, "time_per_iteration": 2.6270220279693604 }, { "auxiliary_loss_clip": 0.01734374, "auxiliary_loss_mlp": 0.00545288, "balance_loss_clip": 1.39743471, "balance_loss_mlp": 0.50346923, "epoch": 0.18986923192544716, "flos": 27637067756160.0, "grad_norm": 14.539530760746285, "language_loss": 0.87216377, "learning_rate": 3.737831016747176e-06, "loss": 0.8949604, "num_input_tokens_seen": 68133190, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.41821289, "step": 3158, "time_per_iteration": 2.636578321456909 }, { "auxiliary_loss_clip": 0.01724401, "auxiliary_loss_mlp": 0.00569128, "balance_loss_clip": 1.38515735, "balance_loss_mlp": 0.52687973, "epoch": 0.18992935517811513, "flos": 25484151306240.0, "grad_norm": 80.11180810353741, "language_loss": 0.78750932, "learning_rate": 3.737638215672964e-06, "loss": 0.81044465, "num_input_tokens_seen": 68152330, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.42260742, "step": 3159, "time_per_iteration": 2.6683781147003174 }, { "auxiliary_loss_clip": 0.01742414, "auxiliary_loss_mlp": 0.00582249, "balance_loss_clip": 1.4096148, "balance_loss_mlp": 0.53678286, "epoch": 0.1899894784307831, "flos": 17420805596160.0, "grad_norm": 22.18055371203775, "language_loss": 0.92426836, "learning_rate": 3.7374453487073366e-06, "loss": 0.94751501, "num_input_tokens_seen": 68170185, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.45458984, "step": 3160, "time_per_iteration": 2.5529603958129883 }, { "auxiliary_loss_clip": 0.01728278, "auxiliary_loss_mlp": 0.00513645, "balance_loss_clip": 1.39832497, "balance_loss_mlp": 0.47661859, "epoch": 0.19004960168345109, "flos": 27492240119040.0, "grad_norm": 21.50903482356712, "language_loss": 0.78881133, "learning_rate": 3.7372524158576074e-06, "loss": 0.81123054, "num_input_tokens_seen": 68191665, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.37011719, "step": 3161, "time_per_iteration": 2.6545088291168213 }, { "auxiliary_loss_clip": 0.01721999, "auxiliary_loss_mlp": 0.00592185, "balance_loss_clip": 1.39157426, "balance_loss_mlp": 0.54810095, "epoch": 0.19010972493611905, "flos": 38654676385920.0, "grad_norm": 3.244544897909727, "language_loss": 0.85762691, "learning_rate": 3.7370594171310926e-06, "loss": 0.88076878, "num_input_tokens_seen": 68214635, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.44116211, "step": 3162, "time_per_iteration": 2.720372200012207 }, { "auxiliary_loss_clip": 0.017322, "auxiliary_loss_mlp": 0.00565331, "balance_loss_clip": 1.39665806, "balance_loss_mlp": 0.52267808, "epoch": 0.19016984818878702, "flos": 19244744357760.0, "grad_norm": 35.066218610506546, "language_loss": 0.82353741, "learning_rate": 3.73686635253511e-06, "loss": 0.84651268, "num_input_tokens_seen": 68232150, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.42675781, "step": 3163, "time_per_iteration": 2.568563938140869 }, { "auxiliary_loss_clip": 0.017303, "auxiliary_loss_mlp": 0.00540716, "balance_loss_clip": 1.39999807, "balance_loss_mlp": 0.50082821, "epoch": 0.19022997144145498, "flos": 37596891744000.0, "grad_norm": 24.98464141287195, "language_loss": 0.79829139, "learning_rate": 3.736673222076982e-06, "loss": 0.82100153, "num_input_tokens_seen": 68253370, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.39868164, "step": 3164, "time_per_iteration": 2.7046163082122803 }, { "auxiliary_loss_clip": 0.01737076, "auxiliary_loss_mlp": 0.0054128, "balance_loss_clip": 1.40142763, "balance_loss_mlp": 0.50155962, "epoch": 0.19029009469412295, "flos": 61530921665280.0, "grad_norm": 110.19614337506819, "language_loss": 0.72108704, "learning_rate": 3.7364800257640313e-06, "loss": 0.74387056, "num_input_tokens_seen": 68278895, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.39697266, "step": 3165, "time_per_iteration": 2.9933810234069824 }, { "auxiliary_loss_clip": 0.01736881, "auxiliary_loss_mlp": 0.00582006, "balance_loss_clip": 1.40213513, "balance_loss_mlp": 0.53916168, "epoch": 0.1903502179467909, "flos": 13954851480960.0, "grad_norm": 20.500342837392825, "language_loss": 0.81278324, "learning_rate": 3.7362867636035835e-06, "loss": 0.83597219, "num_input_tokens_seen": 68294880, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.4284668, "step": 3166, "time_per_iteration": 2.6156959533691406 }, { "auxiliary_loss_clip": 0.01700595, "auxiliary_loss_mlp": 0.00305785, "balance_loss_clip": 1.38142514, "balance_loss_mlp": 0.29195657, "epoch": 0.1904103411994589, "flos": 66899641916160.0, "grad_norm": 0.8115192165274928, "language_loss": 0.50529623, "learning_rate": 3.736093435602968e-06, "loss": 0.52535999, "num_input_tokens_seen": 68359665, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.13867188, "step": 3167, "time_per_iteration": 3.141352891921997 }, { "auxiliary_loss_clip": 0.01699917, "auxiliary_loss_mlp": 0.00507817, "balance_loss_clip": 1.3670541, "balance_loss_mlp": 0.47057557, "epoch": 0.19047046445212687, "flos": 21908741472000.0, "grad_norm": 3.4659399671662197, "language_loss": 0.79691911, "learning_rate": 3.7359000417695156e-06, "loss": 0.81899655, "num_input_tokens_seen": 68378950, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.37207031, "step": 3168, "time_per_iteration": 2.6150496006011963 }, { "auxiliary_loss_clip": 0.01656833, "auxiliary_loss_mlp": 0.00325207, "balance_loss_clip": 1.34073234, "balance_loss_mlp": 0.30947137, "epoch": 0.19053058770479483, "flos": 59255156701440.0, "grad_norm": 0.8571464627917788, "language_loss": 0.60157406, "learning_rate": 3.73570658211056e-06, "loss": 0.62139446, "num_input_tokens_seen": 68434235, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.15722656, "step": 3169, "time_per_iteration": 3.0184872150421143 }, { "auxiliary_loss_clip": 0.01698343, "auxiliary_loss_mlp": 0.00585109, "balance_loss_clip": 1.35985923, "balance_loss_mlp": 0.54543638, "epoch": 0.1905907109574628, "flos": 23951304362880.0, "grad_norm": 154.06182242227857, "language_loss": 0.84933531, "learning_rate": 3.735513056633436e-06, "loss": 0.87216985, "num_input_tokens_seen": 68453830, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.39672852, "step": 3170, "time_per_iteration": 2.661673069000244 }, { "auxiliary_loss_clip": 0.01680322, "auxiliary_loss_mlp": 0.00512149, "balance_loss_clip": 1.3440311, "balance_loss_mlp": 0.47517067, "epoch": 0.19065083421013077, "flos": 20812316774400.0, "grad_norm": 133.88367285757852, "language_loss": 0.83186501, "learning_rate": 3.7353194653454834e-06, "loss": 0.85378969, "num_input_tokens_seen": 68473005, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.36938477, "step": 3171, "time_per_iteration": 2.578706979751587 }, { "auxiliary_loss_clip": 0.01664233, "auxiliary_loss_mlp": 0.00523108, "balance_loss_clip": 1.32850647, "balance_loss_mlp": 0.48319691, "epoch": 0.19071095746279873, "flos": 31284981192960.0, "grad_norm": 9.470600559681218, "language_loss": 0.86916053, "learning_rate": 3.7351258082540426e-06, "loss": 0.89103401, "num_input_tokens_seen": 68493470, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.39916992, "step": 3172, "time_per_iteration": 2.707645893096924 }, { "auxiliary_loss_clip": 0.01662084, "auxiliary_loss_mlp": 0.00467493, "balance_loss_clip": 1.32937312, "balance_loss_mlp": 0.43256474, "epoch": 0.1907710807154667, "flos": 14356117290240.0, "grad_norm": 7.421585674796623, "language_loss": 0.86654681, "learning_rate": 3.7349320853664576e-06, "loss": 0.88784254, "num_input_tokens_seen": 68511290, "router_z_loss_clip": 3.32617188, "router_z_loss_mlp": 0.34936523, "step": 3173, "time_per_iteration": 2.573373556137085 }, { "auxiliary_loss_clip": 0.01643711, "auxiliary_loss_mlp": 0.0047709, "balance_loss_clip": 1.31029963, "balance_loss_mlp": 0.44027787, "epoch": 0.1908312039681347, "flos": 26907039740160.0, "grad_norm": 3.257679032550108, "language_loss": 0.85263026, "learning_rate": 3.7347382966900735e-06, "loss": 0.87383819, "num_input_tokens_seen": 68532575, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.36816406, "step": 3174, "time_per_iteration": 2.6593265533447266 }, { "auxiliary_loss_clip": 0.01657578, "auxiliary_loss_mlp": 0.00445548, "balance_loss_clip": 1.32520199, "balance_loss_mlp": 0.41140658, "epoch": 0.19089132722080265, "flos": 14494695960960.0, "grad_norm": 54.74568498638887, "language_loss": 0.87250876, "learning_rate": 3.7345444422322395e-06, "loss": 0.89354002, "num_input_tokens_seen": 68548760, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.34155273, "step": 3175, "time_per_iteration": 2.61513352394104 }, { "auxiliary_loss_clip": 0.01656494, "auxiliary_loss_mlp": 0.00486098, "balance_loss_clip": 1.32302237, "balance_loss_mlp": 0.44821298, "epoch": 0.19095145047347062, "flos": 13952876232960.0, "grad_norm": 5.834205291434537, "language_loss": 0.93476737, "learning_rate": 3.7343505220003067e-06, "loss": 0.95619321, "num_input_tokens_seen": 68563100, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.37890625, "step": 3176, "time_per_iteration": 2.5750370025634766 }, { "auxiliary_loss_clip": 0.01643382, "auxiliary_loss_mlp": 0.00453435, "balance_loss_clip": 1.3091687, "balance_loss_mlp": 0.41679049, "epoch": 0.19101157372613858, "flos": 25301832848640.0, "grad_norm": 3.8591835531981067, "language_loss": 0.87430769, "learning_rate": 3.7341565360016285e-06, "loss": 0.89527595, "num_input_tokens_seen": 68581650, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.36621094, "step": 3177, "time_per_iteration": 2.634519100189209 }, { "auxiliary_loss_clip": 0.01644322, "auxiliary_loss_mlp": 0.0044323, "balance_loss_clip": 1.31537127, "balance_loss_mlp": 0.4094218, "epoch": 0.19107169697880655, "flos": 20558212986240.0, "grad_norm": 22.533891966428826, "language_loss": 0.8448416, "learning_rate": 3.73396248424356e-06, "loss": 0.86571711, "num_input_tokens_seen": 68600360, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.33862305, "step": 3178, "time_per_iteration": 2.625331163406372 }, { "auxiliary_loss_clip": 0.01640721, "auxiliary_loss_mlp": 0.00424712, "balance_loss_clip": 1.31341505, "balance_loss_mlp": 0.39104712, "epoch": 0.19113182023147451, "flos": 22163204396160.0, "grad_norm": 13.038697754679694, "language_loss": 0.87683761, "learning_rate": 3.7337683667334606e-06, "loss": 0.89749193, "num_input_tokens_seen": 68617885, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.33666992, "step": 3179, "time_per_iteration": 2.6003575325012207 }, { "auxiliary_loss_clip": 0.01657215, "auxiliary_loss_mlp": 0.00386645, "balance_loss_clip": 1.32963037, "balance_loss_mlp": 0.35610369, "epoch": 0.19119194348414248, "flos": 18581796990720.0, "grad_norm": 3.491195817568203, "language_loss": 0.89037251, "learning_rate": 3.733574183478691e-06, "loss": 0.91081113, "num_input_tokens_seen": 68634550, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.30517578, "step": 3180, "time_per_iteration": 2.589942216873169 }, { "auxiliary_loss_clip": 0.01637488, "auxiliary_loss_mlp": 0.00402802, "balance_loss_clip": 1.30825853, "balance_loss_mlp": 0.37023354, "epoch": 0.19125206673681047, "flos": 19026623018880.0, "grad_norm": 4.0711558769185965, "language_loss": 0.85761929, "learning_rate": 3.733379934486615e-06, "loss": 0.87802219, "num_input_tokens_seen": 68651895, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.32556152, "step": 3181, "time_per_iteration": 2.6085383892059326 }, { "auxiliary_loss_clip": 0.01653187, "auxiliary_loss_mlp": 0.00387023, "balance_loss_clip": 1.32197189, "balance_loss_mlp": 0.35753012, "epoch": 0.19131218998947844, "flos": 21690153256320.0, "grad_norm": 128.23821637245732, "language_loss": 0.80774218, "learning_rate": 3.7331856197645973e-06, "loss": 0.82814425, "num_input_tokens_seen": 68671500, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.29516602, "step": 3182, "time_per_iteration": 2.5882487297058105 }, { "auxiliary_loss_clip": 0.01628729, "auxiliary_loss_mlp": 0.00384122, "balance_loss_clip": 1.3030479, "balance_loss_mlp": 0.35353342, "epoch": 0.1913723132421464, "flos": 18442500048000.0, "grad_norm": 2.06982825414926, "language_loss": 0.70666963, "learning_rate": 3.7329912393200084e-06, "loss": 0.72679818, "num_input_tokens_seen": 68690570, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.30578613, "step": 3183, "time_per_iteration": 3.947877883911133 }, { "auxiliary_loss_clip": 0.01623501, "auxiliary_loss_mlp": 0.00383398, "balance_loss_clip": 1.29447615, "balance_loss_mlp": 0.35056737, "epoch": 0.19143243649481437, "flos": 27160102033920.0, "grad_norm": 2.821852268310014, "language_loss": 0.7835089, "learning_rate": 3.7327967931602173e-06, "loss": 0.8035779, "num_input_tokens_seen": 68709735, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.32836914, "step": 3184, "time_per_iteration": 2.613609790802002 }, { "auxiliary_loss_clip": 0.01632295, "auxiliary_loss_mlp": 0.00338942, "balance_loss_clip": 1.30450284, "balance_loss_mlp": 0.30735135, "epoch": 0.19149255974748233, "flos": 21718952985600.0, "grad_norm": 4.244454214301981, "language_loss": 0.95121968, "learning_rate": 3.732602281292598e-06, "loss": 0.97093201, "num_input_tokens_seen": 68727565, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.31591797, "step": 3185, "time_per_iteration": 2.5876574516296387 }, { "auxiliary_loss_clip": 0.01604428, "auxiliary_loss_mlp": 0.0031529, "balance_loss_clip": 1.27808881, "balance_loss_mlp": 0.28729957, "epoch": 0.1915526830001503, "flos": 22963293889920.0, "grad_norm": 2.2611775775228935, "language_loss": 0.7927134, "learning_rate": 3.7324077037245267e-06, "loss": 0.81191051, "num_input_tokens_seen": 68748110, "router_z_loss_clip": 3.26367188, "router_z_loss_mlp": 0.27966309, "step": 3186, "time_per_iteration": 4.122814178466797 }, { "auxiliary_loss_clip": 0.01608653, "auxiliary_loss_mlp": 0.00294895, "balance_loss_clip": 1.28335381, "balance_loss_mlp": 0.26711908, "epoch": 0.1916128062528183, "flos": 26140741966080.0, "grad_norm": 20.10331539568603, "language_loss": 0.91689736, "learning_rate": 3.7322130604633825e-06, "loss": 0.93593287, "num_input_tokens_seen": 68769765, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.27807617, "step": 3187, "time_per_iteration": 2.615020275115967 }, { "auxiliary_loss_clip": 0.01625526, "auxiliary_loss_mlp": 0.00149605, "balance_loss_clip": 1.28951526, "balance_loss_mlp": 0.14016354, "epoch": 0.19167292950548626, "flos": 54925767457920.0, "grad_norm": 0.9244426911560535, "language_loss": 0.5581308, "learning_rate": 3.732018351516544e-06, "loss": 0.57588208, "num_input_tokens_seen": 68826815, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.09423828, "step": 3188, "time_per_iteration": 3.146901845932007 }, { "auxiliary_loss_clip": 0.01578909, "auxiliary_loss_mlp": 0.0032034, "balance_loss_clip": 1.25893736, "balance_loss_mlp": 0.29158688, "epoch": 0.19173305275815422, "flos": 29935601942400.0, "grad_norm": 6.987586508764683, "language_loss": 0.77237087, "learning_rate": 3.731823576891397e-06, "loss": 0.79136336, "num_input_tokens_seen": 68847585, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.28771973, "step": 3189, "time_per_iteration": 2.675635576248169 }, { "auxiliary_loss_clip": 0.01586177, "auxiliary_loss_mlp": 0.00250596, "balance_loss_clip": 1.26246405, "balance_loss_mlp": 0.22519284, "epoch": 0.1917931760108222, "flos": 24752471264640.0, "grad_norm": 5.529098131234571, "language_loss": 0.80960274, "learning_rate": 3.7316287365953266e-06, "loss": 0.8279705, "num_input_tokens_seen": 68866620, "router_z_loss_clip": 3.24023438, "router_z_loss_mlp": 0.2545166, "step": 3190, "time_per_iteration": 2.646608352661133 }, { "auxiliary_loss_clip": 0.01589356, "auxiliary_loss_mlp": 0.00283156, "balance_loss_clip": 1.26543069, "balance_loss_mlp": 0.25727558, "epoch": 0.19185329926349015, "flos": 18843550375680.0, "grad_norm": 11.718236291407097, "language_loss": 0.93292677, "learning_rate": 3.73143383063572e-06, "loss": 0.95165187, "num_input_tokens_seen": 68885515, "router_z_loss_clip": 3.23828125, "router_z_loss_mlp": 0.2590332, "step": 3191, "time_per_iteration": 2.563753843307495 }, { "auxiliary_loss_clip": 0.01580256, "auxiliary_loss_mlp": 0.00286726, "balance_loss_clip": 1.26097918, "balance_loss_mlp": 0.26052403, "epoch": 0.19191342251615812, "flos": 22086858038400.0, "grad_norm": 3.363389266911956, "language_loss": 0.96057326, "learning_rate": 3.73123885901997e-06, "loss": 0.9792431, "num_input_tokens_seen": 68903225, "router_z_loss_clip": 3.1953125, "router_z_loss_mlp": 0.26220703, "step": 3192, "time_per_iteration": 3.994459867477417 }, { "auxiliary_loss_clip": 0.01571292, "auxiliary_loss_mlp": 0.0028291, "balance_loss_clip": 1.2452451, "balance_loss_mlp": 0.2560637, "epoch": 0.19197354576882608, "flos": 22199115018240.0, "grad_norm": 30.930871743137494, "language_loss": 0.84659785, "learning_rate": 3.7310438217554687e-06, "loss": 0.86513984, "num_input_tokens_seen": 68922860, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.26843262, "step": 3193, "time_per_iteration": 2.6646499633789062 }, { "auxiliary_loss_clip": 0.01577765, "auxiliary_loss_mlp": 0.0027571, "balance_loss_clip": 1.24861217, "balance_loss_mlp": 0.24957965, "epoch": 0.19203366902149407, "flos": 24896185580160.0, "grad_norm": 1.9357922858737837, "language_loss": 0.82536793, "learning_rate": 3.730848718849612e-06, "loss": 0.84390271, "num_input_tokens_seen": 68943000, "router_z_loss_clip": 3.29296875, "router_z_loss_mlp": 0.2611084, "step": 3194, "time_per_iteration": 2.647264003753662 }, { "auxiliary_loss_clip": 0.01498183, "auxiliary_loss_mlp": 0.0005207, "balance_loss_clip": 1.17927778, "balance_loss_mlp": 0.04167522, "epoch": 0.19209379227416204, "flos": 68416722789120.0, "grad_norm": 0.7479671120193637, "language_loss": 0.68280375, "learning_rate": 3.7306535503097985e-06, "loss": 0.69830626, "num_input_tokens_seen": 69000255, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.10400391, "step": 3195, "time_per_iteration": 3.0740277767181396 }, { "auxiliary_loss_clip": 0.01571938, "auxiliary_loss_mlp": 0.00292027, "balance_loss_clip": 1.24250722, "balance_loss_mlp": 0.26557493, "epoch": 0.19215391552683, "flos": 22055185221120.0, "grad_norm": 3.51951529921557, "language_loss": 0.82061028, "learning_rate": 3.730458316143429e-06, "loss": 0.83924997, "num_input_tokens_seen": 69019665, "router_z_loss_clip": 3.29296875, "router_z_loss_mlp": 0.26477051, "step": 3196, "time_per_iteration": 2.6202383041381836 }, { "auxiliary_loss_clip": 0.0155831, "auxiliary_loss_mlp": 0.00270302, "balance_loss_clip": 1.22946107, "balance_loss_mlp": 0.24414796, "epoch": 0.19221403877949797, "flos": 20302959962880.0, "grad_norm": 12.42846778506849, "language_loss": 0.91353506, "learning_rate": 3.7302630163579068e-06, "loss": 0.93182123, "num_input_tokens_seen": 69039055, "router_z_loss_clip": 3.28515625, "router_z_loss_mlp": 0.26123047, "step": 3197, "time_per_iteration": 2.641179323196411 }, { "auxiliary_loss_clip": 0.015395, "auxiliary_loss_mlp": 0.00296278, "balance_loss_clip": 1.21631181, "balance_loss_mlp": 0.26938394, "epoch": 0.19227416203216594, "flos": 23185329811200.0, "grad_norm": 7.319071579042978, "language_loss": 0.89620799, "learning_rate": 3.7300676509606373e-06, "loss": 0.9145658, "num_input_tokens_seen": 69056370, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.2689209, "step": 3198, "time_per_iteration": 2.631232738494873 }, { "auxiliary_loss_clip": 0.01552381, "auxiliary_loss_mlp": 0.00318306, "balance_loss_clip": 1.23324513, "balance_loss_mlp": 0.28938597, "epoch": 0.1923342852848339, "flos": 25776607841280.0, "grad_norm": 5008.620430211642, "language_loss": 0.8664639, "learning_rate": 3.729872219959029e-06, "loss": 0.88517076, "num_input_tokens_seen": 69075915, "router_z_loss_clip": 3.18945312, "router_z_loss_mlp": 0.28955078, "step": 3199, "time_per_iteration": 2.7067389488220215 }, { "auxiliary_loss_clip": 0.01545267, "auxiliary_loss_mlp": 0.00292914, "balance_loss_clip": 1.22488368, "balance_loss_mlp": 0.26781997, "epoch": 0.19239440853750187, "flos": 17128349061120.0, "grad_norm": 4.831782627042346, "language_loss": 0.94012517, "learning_rate": 3.7296767233604934e-06, "loss": 0.958507, "num_input_tokens_seen": 69094145, "router_z_loss_clip": 3.20507812, "router_z_loss_mlp": 0.25085449, "step": 3200, "time_per_iteration": 2.560321807861328 }, { "auxiliary_loss_clip": 0.01536913, "auxiliary_loss_mlp": 0.00269732, "balance_loss_clip": 1.22186017, "balance_loss_mlp": 0.24672502, "epoch": 0.19245453179016986, "flos": 16435093593600.0, "grad_norm": 7.5696250507158584, "language_loss": 0.86634266, "learning_rate": 3.729481161172443e-06, "loss": 0.88440907, "num_input_tokens_seen": 69111110, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.23022461, "step": 3201, "time_per_iteration": 2.546917200088501 }, { "auxiliary_loss_clip": 0.01541743, "auxiliary_loss_mlp": 0.00305686, "balance_loss_clip": 1.22569704, "balance_loss_mlp": 0.27935275, "epoch": 0.19251465504283782, "flos": 20230276792320.0, "grad_norm": 11.262665574878952, "language_loss": 0.7813201, "learning_rate": 3.7292855334022927e-06, "loss": 0.79979432, "num_input_tokens_seen": 69130280, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.26318359, "step": 3202, "time_per_iteration": 2.5913736820220947 }, { "auxiliary_loss_clip": 0.01520481, "auxiliary_loss_mlp": 0.00246281, "balance_loss_clip": 1.20529473, "balance_loss_mlp": 0.22431117, "epoch": 0.1925747782955058, "flos": 19464374067840.0, "grad_norm": 1.8181907214211948, "language_loss": 0.99412167, "learning_rate": 3.7290898400574627e-06, "loss": 1.01178932, "num_input_tokens_seen": 69149570, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.21984863, "step": 3203, "time_per_iteration": 2.5787971019744873 }, { "auxiliary_loss_clip": 0.01531795, "auxiliary_loss_mlp": 0.00288486, "balance_loss_clip": 1.21618617, "balance_loss_mlp": 0.26091325, "epoch": 0.19263490154817375, "flos": 17785586165760.0, "grad_norm": 9.223906728087742, "language_loss": 0.93070364, "learning_rate": 3.7288940811453725e-06, "loss": 0.94890642, "num_input_tokens_seen": 69168190, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.27563477, "step": 3204, "time_per_iteration": 2.563422441482544 }, { "auxiliary_loss_clip": 0.01536477, "auxiliary_loss_mlp": 0.00280499, "balance_loss_clip": 1.2221663, "balance_loss_mlp": 0.25464231, "epoch": 0.19269502480084172, "flos": 17457075354240.0, "grad_norm": 7.90045946676244, "language_loss": 0.83433664, "learning_rate": 3.7286982566734454e-06, "loss": 0.8525064, "num_input_tokens_seen": 69186950, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.25866699, "step": 3205, "time_per_iteration": 2.568598747253418 }, { "auxiliary_loss_clip": 0.01529023, "auxiliary_loss_mlp": 0.00289945, "balance_loss_clip": 1.2153759, "balance_loss_mlp": 0.2654475, "epoch": 0.19275514805350968, "flos": 21506901045120.0, "grad_norm": 2.918520140873357, "language_loss": 0.93835187, "learning_rate": 3.728502366649107e-06, "loss": 0.9565416, "num_input_tokens_seen": 69204850, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.24499512, "step": 3206, "time_per_iteration": 2.6173465251922607 }, { "auxiliary_loss_clip": 0.01507756, "auxiliary_loss_mlp": 0.00122668, "balance_loss_clip": 1.18335152, "balance_loss_mlp": 0.11327467, "epoch": 0.19281527130617768, "flos": 47695979738880.0, "grad_norm": 0.8302323192630588, "language_loss": 0.60544157, "learning_rate": 3.728306411079786e-06, "loss": 0.62174582, "num_input_tokens_seen": 69259200, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.09375, "step": 3207, "time_per_iteration": 2.9312236309051514 }, { "auxiliary_loss_clip": 0.01511495, "auxiliary_loss_mlp": 0.00266999, "balance_loss_clip": 1.19941664, "balance_loss_mlp": 0.24179861, "epoch": 0.19287539455884564, "flos": 11801252672640.0, "grad_norm": 9.03414216578795, "language_loss": 0.87089837, "learning_rate": 3.7281103899729125e-06, "loss": 0.88868332, "num_input_tokens_seen": 69275835, "router_z_loss_clip": 3.12304688, "router_z_loss_mlp": 0.25183105, "step": 3208, "time_per_iteration": 2.5573394298553467 }, { "auxiliary_loss_clip": 0.0152567, "auxiliary_loss_mlp": 0.00282935, "balance_loss_clip": 1.21338844, "balance_loss_mlp": 0.25691134, "epoch": 0.1929355178115136, "flos": 20631434860800.0, "grad_norm": 7.985640367104113, "language_loss": 0.70671535, "learning_rate": 3.7279143033359195e-06, "loss": 0.72480136, "num_input_tokens_seen": 69294810, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.26037598, "step": 3209, "time_per_iteration": 2.59397292137146 }, { "auxiliary_loss_clip": 0.01523835, "auxiliary_loss_mlp": 0.00304598, "balance_loss_clip": 1.2110033, "balance_loss_mlp": 0.27912319, "epoch": 0.19299564106418157, "flos": 40807916058240.0, "grad_norm": 2.7675692869286768, "language_loss": 0.91212916, "learning_rate": 3.727718151176243e-06, "loss": 0.93041348, "num_input_tokens_seen": 69316065, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.25488281, "step": 3210, "time_per_iteration": 2.8131754398345947 }, { "auxiliary_loss_clip": 0.01527552, "auxiliary_loss_mlp": 0.00251844, "balance_loss_clip": 1.21480989, "balance_loss_mlp": 0.22967061, "epoch": 0.19305576431684954, "flos": 11361418634880.0, "grad_norm": 49.43777214939405, "language_loss": 0.90687996, "learning_rate": 3.7275219335013217e-06, "loss": 0.92467391, "num_input_tokens_seen": 69332900, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.22180176, "step": 3211, "time_per_iteration": 2.5791499614715576 }, { "auxiliary_loss_clip": 0.01420028, "auxiliary_loss_mlp": 0.00063104, "balance_loss_clip": 1.1288532, "balance_loss_mlp": 0.04660593, "epoch": 0.1931158875695175, "flos": 54511895975040.0, "grad_norm": 0.9865258043191949, "language_loss": 0.6354875, "learning_rate": 3.7273256503185953e-06, "loss": 0.6503188, "num_input_tokens_seen": 69382535, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.16503906, "step": 3212, "time_per_iteration": 3.007046699523926 }, { "auxiliary_loss_clip": 0.01525118, "auxiliary_loss_mlp": 0.00293143, "balance_loss_clip": 1.2150358, "balance_loss_mlp": 0.27007562, "epoch": 0.19317601082218547, "flos": 19828436365440.0, "grad_norm": 7.176983289105826, "language_loss": 0.8222028, "learning_rate": 3.7271293016355074e-06, "loss": 0.84038544, "num_input_tokens_seen": 69400600, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.23071289, "step": 3213, "time_per_iteration": 2.647026300430298 }, { "auxiliary_loss_clip": 0.01531024, "auxiliary_loss_mlp": 0.00298517, "balance_loss_clip": 1.21970582, "balance_loss_mlp": 0.27368575, "epoch": 0.19323613407485346, "flos": 13152068467200.0, "grad_norm": 4.383594630789381, "language_loss": 0.80707854, "learning_rate": 3.726932887459503e-06, "loss": 0.82537401, "num_input_tokens_seen": 69417350, "router_z_loss_clip": 3.11523438, "router_z_loss_mlp": 0.24841309, "step": 3214, "time_per_iteration": 2.7162423133850098 }, { "auxiliary_loss_clip": 0.01531078, "auxiliary_loss_mlp": 0.00309725, "balance_loss_clip": 1.22213387, "balance_loss_mlp": 0.28093591, "epoch": 0.19329625732752143, "flos": 14027247342720.0, "grad_norm": 38.00908272136673, "language_loss": 0.84704304, "learning_rate": 3.72673640779803e-06, "loss": 0.8654511, "num_input_tokens_seen": 69431845, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.2878418, "step": 3215, "time_per_iteration": 2.5573880672454834 }, { "auxiliary_loss_clip": 0.01561551, "auxiliary_loss_mlp": 0.00294325, "balance_loss_clip": 1.2487669, "balance_loss_mlp": 0.26898113, "epoch": 0.1933563805801894, "flos": 23441732069760.0, "grad_norm": 43.10569843756784, "language_loss": 0.93581194, "learning_rate": 3.72653986265854e-06, "loss": 0.95437062, "num_input_tokens_seen": 69453275, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.25354004, "step": 3216, "time_per_iteration": 2.59704327583313 }, { "auxiliary_loss_clip": 0.0155366, "auxiliary_loss_mlp": 0.00287761, "balance_loss_clip": 1.24043894, "balance_loss_mlp": 0.26365662, "epoch": 0.19341650383285736, "flos": 20485314334080.0, "grad_norm": 3.4903914627740993, "language_loss": 0.87523758, "learning_rate": 3.726343252048485e-06, "loss": 0.89365184, "num_input_tokens_seen": 69471830, "router_z_loss_clip": 3.13085938, "router_z_loss_mlp": 0.24108887, "step": 3217, "time_per_iteration": 2.5640950202941895 }, { "auxiliary_loss_clip": 0.01536309, "auxiliary_loss_mlp": 0.00347293, "balance_loss_clip": 1.22123694, "balance_loss_mlp": 0.31625053, "epoch": 0.19347662708552532, "flos": 17858484817920.0, "grad_norm": 70.59647263118362, "language_loss": 0.77558446, "learning_rate": 3.7261465759753206e-06, "loss": 0.79442048, "num_input_tokens_seen": 69489320, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.31054688, "step": 3218, "time_per_iteration": 2.589664936065674 }, { "auxiliary_loss_clip": 0.01568165, "auxiliary_loss_mlp": 0.00342923, "balance_loss_clip": 1.2515831, "balance_loss_mlp": 0.31334701, "epoch": 0.1935367503381933, "flos": 18187247024640.0, "grad_norm": 4.300051209379901, "language_loss": 0.86463904, "learning_rate": 3.7259498344465053e-06, "loss": 0.8837499, "num_input_tokens_seen": 69506665, "router_z_loss_clip": 3.16992188, "router_z_loss_mlp": 0.29614258, "step": 3219, "time_per_iteration": 2.57383394241333 }, { "auxiliary_loss_clip": 0.01584395, "auxiliary_loss_mlp": 0.0030114, "balance_loss_clip": 1.2677269, "balance_loss_mlp": 0.27634415, "epoch": 0.19359687359086128, "flos": 15957122290560.0, "grad_norm": 4.287716622493703, "language_loss": 0.96456349, "learning_rate": 3.7257530274694993e-06, "loss": 0.98341876, "num_input_tokens_seen": 69523835, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.24829102, "step": 3220, "time_per_iteration": 2.578460693359375 }, { "auxiliary_loss_clip": 0.01606903, "auxiliary_loss_mlp": 0.00274851, "balance_loss_clip": 1.29085171, "balance_loss_mlp": 0.25056759, "epoch": 0.19365699684352924, "flos": 21215198695680.0, "grad_norm": 6.073350953552274, "language_loss": 0.90303695, "learning_rate": 3.725556155051766e-06, "loss": 0.92185444, "num_input_tokens_seen": 69542620, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.24267578, "step": 3221, "time_per_iteration": 2.66359806060791 }, { "auxiliary_loss_clip": 0.0160858, "auxiliary_loss_mlp": 0.00253991, "balance_loss_clip": 1.29554391, "balance_loss_mlp": 0.23078066, "epoch": 0.1937171200961972, "flos": 17311098481920.0, "grad_norm": 160.08196496133073, "language_loss": 0.93217432, "learning_rate": 3.7253592172007702e-06, "loss": 0.95080006, "num_input_tokens_seen": 69561130, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.23193359, "step": 3222, "time_per_iteration": 2.6556615829467773 }, { "auxiliary_loss_clip": 0.01615682, "auxiliary_loss_mlp": 0.00252913, "balance_loss_clip": 1.30022717, "balance_loss_mlp": 0.22747336, "epoch": 0.19377724334886517, "flos": 22635968227200.0, "grad_norm": 9.60117720472628, "language_loss": 0.85160828, "learning_rate": 3.72516221392398e-06, "loss": 0.87029421, "num_input_tokens_seen": 69580425, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.25463867, "step": 3223, "time_per_iteration": 2.6099977493286133 }, { "auxiliary_loss_clip": 0.01617165, "auxiliary_loss_mlp": 0.00308245, "balance_loss_clip": 1.30267155, "balance_loss_mlp": 0.2793963, "epoch": 0.19383736660153314, "flos": 15077813351040.0, "grad_norm": 23.97655141826144, "language_loss": 0.84178293, "learning_rate": 3.7249651452288653e-06, "loss": 0.86103702, "num_input_tokens_seen": 69597085, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.28833008, "step": 3224, "time_per_iteration": 2.5219767093658447 }, { "auxiliary_loss_clip": 0.0161095, "auxiliary_loss_mlp": 0.00242923, "balance_loss_clip": 1.2951715, "balance_loss_mlp": 0.21772206, "epoch": 0.1938974898542011, "flos": 47119934350080.0, "grad_norm": 4.889514434809209, "language_loss": 0.801862, "learning_rate": 3.7247680111229e-06, "loss": 0.82040071, "num_input_tokens_seen": 69618885, "router_z_loss_clip": 3.16015625, "router_z_loss_mlp": 0.2520752, "step": 3225, "time_per_iteration": 2.8198182582855225 }, { "auxiliary_loss_clip": 0.01625276, "auxiliary_loss_mlp": 0.00241733, "balance_loss_clip": 1.3116461, "balance_loss_mlp": 0.21551889, "epoch": 0.19395761310686907, "flos": 25812554376960.0, "grad_norm": 13.425299724030042, "language_loss": 0.78042305, "learning_rate": 3.7245708116135585e-06, "loss": 0.79909313, "num_input_tokens_seen": 69638200, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.26196289, "step": 3226, "time_per_iteration": 4.014801740646362 }, { "auxiliary_loss_clip": 0.01652959, "auxiliary_loss_mlp": 0.00259572, "balance_loss_clip": 1.33562589, "balance_loss_mlp": 0.23044896, "epoch": 0.19401773635953706, "flos": 23039604334080.0, "grad_norm": 1.6251928367387278, "language_loss": 0.83147204, "learning_rate": 3.7243735467083193e-06, "loss": 0.85059738, "num_input_tokens_seen": 69657550, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.29125977, "step": 3227, "time_per_iteration": 2.607297658920288 }, { "auxiliary_loss_clip": 0.0164526, "auxiliary_loss_mlp": 0.00236606, "balance_loss_clip": 1.33034325, "balance_loss_mlp": 0.21111855, "epoch": 0.19407785961220503, "flos": 15920780705280.0, "grad_norm": 4.537221649175126, "language_loss": 0.76813185, "learning_rate": 3.724176216414662e-06, "loss": 0.78695053, "num_input_tokens_seen": 69675005, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.25488281, "step": 3228, "time_per_iteration": 3.99920654296875 }, { "auxiliary_loss_clip": 0.01646969, "auxiliary_loss_mlp": 0.00244338, "balance_loss_clip": 1.33159196, "balance_loss_mlp": 0.22060341, "epoch": 0.194137982864873, "flos": 25921722787200.0, "grad_norm": 48.87272818523903, "language_loss": 0.80981684, "learning_rate": 3.72397882074007e-06, "loss": 0.82872987, "num_input_tokens_seen": 69696455, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.23742676, "step": 3229, "time_per_iteration": 4.061859846115112 }, { "auxiliary_loss_clip": 0.01666606, "auxiliary_loss_mlp": 0.00237628, "balance_loss_clip": 1.3459748, "balance_loss_mlp": 0.20993581, "epoch": 0.19419810611754096, "flos": 13261344618240.0, "grad_norm": 182.51295499116637, "language_loss": 0.73077691, "learning_rate": 3.7237813596920285e-06, "loss": 0.74981922, "num_input_tokens_seen": 69714245, "router_z_loss_clip": 3.20507812, "router_z_loss_mlp": 0.2767334, "step": 3230, "time_per_iteration": 2.5338315963745117 }, { "auxiliary_loss_clip": 0.01657417, "auxiliary_loss_mlp": 0.00211156, "balance_loss_clip": 1.34110689, "balance_loss_mlp": 0.18384495, "epoch": 0.19425822937020892, "flos": 15705568368000.0, "grad_norm": 82.72635826933988, "language_loss": 0.89025158, "learning_rate": 3.7235838332780254e-06, "loss": 0.90893734, "num_input_tokens_seen": 69731515, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.27307129, "step": 3231, "time_per_iteration": 2.6019997596740723 }, { "auxiliary_loss_clip": 0.01670709, "auxiliary_loss_mlp": 0.00250838, "balance_loss_clip": 1.34881997, "balance_loss_mlp": 0.22206089, "epoch": 0.1943183526228769, "flos": 23105392093440.0, "grad_norm": 2.274731211728245, "language_loss": 0.95267725, "learning_rate": 3.72338624150555e-06, "loss": 0.97189271, "num_input_tokens_seen": 69748885, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.2878418, "step": 3232, "time_per_iteration": 2.6186575889587402 }, { "auxiliary_loss_clip": 0.01683839, "auxiliary_loss_mlp": 0.00233035, "balance_loss_clip": 1.36116242, "balance_loss_mlp": 0.20688051, "epoch": 0.19437847587554485, "flos": 24712610146560.0, "grad_norm": 11.33676279920379, "language_loss": 0.91370392, "learning_rate": 3.723188584382096e-06, "loss": 0.93287265, "num_input_tokens_seen": 69767540, "router_z_loss_clip": 3.22851562, "router_z_loss_mlp": 0.26184082, "step": 3233, "time_per_iteration": 2.725860357284546 }, { "auxiliary_loss_clip": 0.0168499, "auxiliary_loss_mlp": 0.00235845, "balance_loss_clip": 1.36028886, "balance_loss_mlp": 0.20960662, "epoch": 0.19443859912821285, "flos": 23116130259840.0, "grad_norm": 4.8888295552587415, "language_loss": 0.94255215, "learning_rate": 3.722990861915158e-06, "loss": 0.96176052, "num_input_tokens_seen": 69789340, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.2623291, "step": 3234, "time_per_iteration": 4.124841213226318 }, { "auxiliary_loss_clip": 0.01684681, "auxiliary_loss_mlp": 0.00265449, "balance_loss_clip": 1.35862708, "balance_loss_mlp": 0.23521727, "epoch": 0.1944987223808808, "flos": 15084385539840.0, "grad_norm": 4.748176247558305, "language_loss": 0.90344107, "learning_rate": 3.722793074112234e-06, "loss": 0.9229424, "num_input_tokens_seen": 69806470, "router_z_loss_clip": 3.2578125, "router_z_loss_mlp": 0.30249023, "step": 3235, "time_per_iteration": 2.5719330310821533 }, { "auxiliary_loss_clip": 0.01715049, "auxiliary_loss_mlp": 0.00215271, "balance_loss_clip": 1.39019072, "balance_loss_mlp": 0.18805593, "epoch": 0.19455884563354878, "flos": 17126876603520.0, "grad_norm": 19.095545863884347, "language_loss": 0.86214787, "learning_rate": 3.7225952209808233e-06, "loss": 0.88145107, "num_input_tokens_seen": 69822655, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.2722168, "step": 3236, "time_per_iteration": 2.6139965057373047 }, { "auxiliary_loss_clip": 0.01731205, "auxiliary_loss_mlp": 0.00223825, "balance_loss_clip": 1.39889121, "balance_loss_mlp": 0.19558483, "epoch": 0.19461896888621674, "flos": 20193396503040.0, "grad_norm": 8.616125287420768, "language_loss": 0.84036893, "learning_rate": 3.72239730252843e-06, "loss": 0.85991919, "num_input_tokens_seen": 69841895, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.28222656, "step": 3237, "time_per_iteration": 2.563685894012451 }, { "auxiliary_loss_clip": 0.01697933, "auxiliary_loss_mlp": 0.00242972, "balance_loss_clip": 1.36904168, "balance_loss_mlp": 0.21443306, "epoch": 0.1946790921388847, "flos": 25301365971840.0, "grad_norm": 249.32142840880246, "language_loss": 0.81131625, "learning_rate": 3.7221993187625583e-06, "loss": 0.83072525, "num_input_tokens_seen": 69862220, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.28540039, "step": 3238, "time_per_iteration": 2.6591615676879883 }, { "auxiliary_loss_clip": 0.01704329, "auxiliary_loss_mlp": 0.00200079, "balance_loss_clip": 1.37938893, "balance_loss_mlp": 0.17373402, "epoch": 0.19473921539155267, "flos": 20193396503040.0, "grad_norm": 11.472237620737886, "language_loss": 0.83058816, "learning_rate": 3.7220012696907155e-06, "loss": 0.84963226, "num_input_tokens_seen": 69881830, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.26379395, "step": 3239, "time_per_iteration": 2.5623295307159424 }, { "auxiliary_loss_clip": 0.01694685, "auxiliary_loss_mlp": 0.00232481, "balance_loss_clip": 1.37377417, "balance_loss_mlp": 0.20549163, "epoch": 0.19479933864422067, "flos": 20887549810560.0, "grad_norm": 141.88924090250637, "language_loss": 0.80549228, "learning_rate": 3.721803155320412e-06, "loss": 0.82476383, "num_input_tokens_seen": 69900515, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.2701416, "step": 3240, "time_per_iteration": 2.6704821586608887 }, { "auxiliary_loss_clip": 0.01677755, "auxiliary_loss_mlp": 0.00198554, "balance_loss_clip": 1.35479772, "balance_loss_mlp": 0.17063521, "epoch": 0.19485946189688863, "flos": 23295072839040.0, "grad_norm": 4.764016391708516, "language_loss": 0.78066552, "learning_rate": 3.7216049756591606e-06, "loss": 0.79942864, "num_input_tokens_seen": 69920060, "router_z_loss_clip": 3.22851562, "router_z_loss_mlp": 0.27929688, "step": 3241, "time_per_iteration": 2.599961996078491 }, { "auxiliary_loss_clip": 0.01676667, "auxiliary_loss_mlp": 0.00217073, "balance_loss_clip": 1.35208106, "balance_loss_mlp": 0.18963075, "epoch": 0.1949195851495566, "flos": 23295036925440.0, "grad_norm": 652.8906793645195, "language_loss": 0.8882848, "learning_rate": 3.7214067307144754e-06, "loss": 0.90722215, "num_input_tokens_seen": 69939820, "router_z_loss_clip": 3.2421875, "router_z_loss_mlp": 0.2746582, "step": 3242, "time_per_iteration": 2.6265292167663574 }, { "auxiliary_loss_clip": 0.01607915, "auxiliary_loss_mlp": 0.00054748, "balance_loss_clip": 1.31007528, "balance_loss_mlp": 0.04196914, "epoch": 0.19497970840222456, "flos": 64962871557120.0, "grad_norm": 0.8195852625520489, "language_loss": 0.5745995, "learning_rate": 3.721208420493875e-06, "loss": 0.5912261, "num_input_tokens_seen": 70002145, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.12792969, "step": 3243, "time_per_iteration": 3.100541591644287 }, { "auxiliary_loss_clip": 0.0166141, "auxiliary_loss_mlp": 0.00209324, "balance_loss_clip": 1.33653462, "balance_loss_mlp": 0.18331209, "epoch": 0.19503983165489253, "flos": 19644717277440.0, "grad_norm": 27.70060724365806, "language_loss": 0.91429317, "learning_rate": 3.7210100450048784e-06, "loss": 0.93300056, "num_input_tokens_seen": 70020510, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.26025391, "step": 3244, "time_per_iteration": 2.6034953594207764 }, { "auxiliary_loss_clip": 0.016588, "auxiliary_loss_mlp": 0.00238076, "balance_loss_clip": 1.33229518, "balance_loss_mlp": 0.2112059, "epoch": 0.1950999549075605, "flos": 21141976821120.0, "grad_norm": 30.40523846212217, "language_loss": 0.84722865, "learning_rate": 3.7208116042550088e-06, "loss": 0.86619747, "num_input_tokens_seen": 70040760, "router_z_loss_clip": 3.26757812, "router_z_loss_mlp": 0.26879883, "step": 3245, "time_per_iteration": 2.5805068016052246 }, { "auxiliary_loss_clip": 0.01651127, "auxiliary_loss_mlp": 0.00233567, "balance_loss_clip": 1.32023668, "balance_loss_mlp": 0.20632738, "epoch": 0.19516007816022846, "flos": 20884820376960.0, "grad_norm": 6.987449608936733, "language_loss": 0.92598778, "learning_rate": 3.7206130982517906e-06, "loss": 0.94483471, "num_input_tokens_seen": 70058720, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.27282715, "step": 3246, "time_per_iteration": 2.5855815410614014 }, { "auxiliary_loss_clip": 0.01666601, "auxiliary_loss_mlp": 0.00253858, "balance_loss_clip": 1.33901501, "balance_loss_mlp": 0.22639214, "epoch": 0.19522020141289645, "flos": 16910515031040.0, "grad_norm": 6.410970970995804, "language_loss": 0.85470837, "learning_rate": 3.7204145270027514e-06, "loss": 0.87391299, "num_input_tokens_seen": 70076470, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.27478027, "step": 3247, "time_per_iteration": 2.555216073989868 }, { "auxiliary_loss_clip": 0.01669246, "auxiliary_loss_mlp": 0.00232711, "balance_loss_clip": 1.33948183, "balance_loss_mlp": 0.20658062, "epoch": 0.19528032466556441, "flos": 26724829023360.0, "grad_norm": 20.109160932901432, "language_loss": 0.81755388, "learning_rate": 3.720215890515421e-06, "loss": 0.83657348, "num_input_tokens_seen": 70096220, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.26123047, "step": 3248, "time_per_iteration": 2.6028952598571777 }, { "auxiliary_loss_clip": 0.01654535, "auxiliary_loss_mlp": 0.00255235, "balance_loss_clip": 1.31965685, "balance_loss_mlp": 0.22847281, "epoch": 0.19534044791823238, "flos": 21032808410880.0, "grad_norm": 3.9015438136738982, "language_loss": 0.85646415, "learning_rate": 3.7200171887973316e-06, "loss": 0.87556183, "num_input_tokens_seen": 70114800, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.2677002, "step": 3249, "time_per_iteration": 2.561229705810547 }, { "auxiliary_loss_clip": 0.0164282, "auxiliary_loss_mlp": 0.00253993, "balance_loss_clip": 1.3118726, "balance_loss_mlp": 0.22720672, "epoch": 0.19540057117090034, "flos": 22344050396160.0, "grad_norm": 7.707379802710601, "language_loss": 0.8037318, "learning_rate": 3.7198184218560176e-06, "loss": 0.8226999, "num_input_tokens_seen": 70134930, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.26806641, "step": 3250, "time_per_iteration": 2.607150077819824 }, { "auxiliary_loss_clip": 0.01651784, "auxiliary_loss_mlp": 0.00222811, "balance_loss_clip": 1.31708229, "balance_loss_mlp": 0.19526163, "epoch": 0.1954606944235683, "flos": 20301631159680.0, "grad_norm": 50.738659312622794, "language_loss": 0.8640238, "learning_rate": 3.719619589699017e-06, "loss": 0.8827697, "num_input_tokens_seen": 70152045, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.27563477, "step": 3251, "time_per_iteration": 2.5346450805664062 }, { "auxiliary_loss_clip": 0.01626174, "auxiliary_loss_mlp": 0.00268794, "balance_loss_clip": 1.29893017, "balance_loss_mlp": 0.24137592, "epoch": 0.19552081767623627, "flos": 17346865449600.0, "grad_norm": 4.593978402027078, "language_loss": 0.91841972, "learning_rate": 3.7194206923338695e-06, "loss": 0.93736941, "num_input_tokens_seen": 70169240, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.27380371, "step": 3252, "time_per_iteration": 2.5551297664642334 }, { "auxiliary_loss_clip": 0.01598297, "auxiliary_loss_mlp": 0.0027466, "balance_loss_clip": 1.2698493, "balance_loss_mlp": 0.24552572, "epoch": 0.19558094092890424, "flos": 31977626129280.0, "grad_norm": 7.008590108362895, "language_loss": 0.78824085, "learning_rate": 3.719221729768117e-06, "loss": 0.80697036, "num_input_tokens_seen": 70192690, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.29150391, "step": 3253, "time_per_iteration": 2.6741738319396973 }, { "auxiliary_loss_clip": 0.01598262, "auxiliary_loss_mlp": 0.00263282, "balance_loss_clip": 1.26974499, "balance_loss_mlp": 0.2355302, "epoch": 0.19564106418157223, "flos": 22268889187200.0, "grad_norm": 12.051633557383417, "language_loss": 0.85108376, "learning_rate": 3.7190227020093037e-06, "loss": 0.86969924, "num_input_tokens_seen": 70209685, "router_z_loss_clip": 3.28515625, "router_z_loss_mlp": 0.27758789, "step": 3254, "time_per_iteration": 2.637686252593994 }, { "auxiliary_loss_clip": 0.01556163, "auxiliary_loss_mlp": 0.00053273, "balance_loss_clip": 1.23647332, "balance_loss_mlp": 0.03810933, "epoch": 0.1957011874342402, "flos": 54364554385920.0, "grad_norm": 0.859020874271997, "language_loss": 0.54955333, "learning_rate": 3.7188236090649774e-06, "loss": 0.56564772, "num_input_tokens_seen": 70265050, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.15136719, "step": 3255, "time_per_iteration": 3.1449332237243652 }, { "auxiliary_loss_clip": 0.01597168, "auxiliary_loss_mlp": 0.00314119, "balance_loss_clip": 1.26381254, "balance_loss_mlp": 0.28549671, "epoch": 0.19576131068690816, "flos": 16506699356160.0, "grad_norm": 323.8292888862172, "language_loss": 0.8376385, "learning_rate": 3.718624450942688e-06, "loss": 0.85675132, "num_input_tokens_seen": 70281830, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.28625488, "step": 3256, "time_per_iteration": 2.5308613777160645 }, { "auxiliary_loss_clip": 0.01592761, "auxiliary_loss_mlp": 0.00247345, "balance_loss_clip": 1.25916219, "balance_loss_mlp": 0.22283517, "epoch": 0.19582143393957613, "flos": 14719676797440.0, "grad_norm": 24.582982927365926, "language_loss": 0.89647067, "learning_rate": 3.718425227649987e-06, "loss": 0.91487163, "num_input_tokens_seen": 70297420, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.24511719, "step": 3257, "time_per_iteration": 2.5857226848602295 }, { "auxiliary_loss_clip": 0.01594991, "auxiliary_loss_mlp": 0.0022352, "balance_loss_clip": 1.2615484, "balance_loss_mlp": 0.19847424, "epoch": 0.1958815571922441, "flos": 24425504737920.0, "grad_norm": 37.01552989354877, "language_loss": 0.82487923, "learning_rate": 3.7182259391944292e-06, "loss": 0.84306431, "num_input_tokens_seen": 70319210, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.25061035, "step": 3258, "time_per_iteration": 2.6177713871002197 }, { "auxiliary_loss_clip": 0.01579942, "auxiliary_loss_mlp": 0.00272231, "balance_loss_clip": 1.24838233, "balance_loss_mlp": 0.24596961, "epoch": 0.19594168044491206, "flos": 24900279730560.0, "grad_norm": 11.72590324452655, "language_loss": 0.82417285, "learning_rate": 3.7180265855835714e-06, "loss": 0.84269464, "num_input_tokens_seen": 70339045, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.2623291, "step": 3259, "time_per_iteration": 2.6536808013916016 }, { "auxiliary_loss_clip": 0.01594172, "auxiliary_loss_mlp": 0.00287591, "balance_loss_clip": 1.25965738, "balance_loss_mlp": 0.26028055, "epoch": 0.19600180369758005, "flos": 12057008486400.0, "grad_norm": 7.262466502680778, "language_loss": 0.87930322, "learning_rate": 3.7178271668249735e-06, "loss": 0.89812082, "num_input_tokens_seen": 70356505, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.27319336, "step": 3260, "time_per_iteration": 2.5647459030151367 }, { "auxiliary_loss_clip": 0.01591344, "auxiliary_loss_mlp": 0.00269369, "balance_loss_clip": 1.25700498, "balance_loss_mlp": 0.24350038, "epoch": 0.19606192695024802, "flos": 20850202644480.0, "grad_norm": 2.724765949332248, "language_loss": 0.91394007, "learning_rate": 3.7176276829261975e-06, "loss": 0.93254721, "num_input_tokens_seen": 70375410, "router_z_loss_clip": 3.34179688, "router_z_loss_mlp": 0.25891113, "step": 3261, "time_per_iteration": 2.6633474826812744 }, { "auxiliary_loss_clip": 0.01605398, "auxiliary_loss_mlp": 0.00290633, "balance_loss_clip": 1.2678467, "balance_loss_mlp": 0.26277351, "epoch": 0.19612205020291598, "flos": 28475509996800.0, "grad_norm": 34.69001845062122, "language_loss": 0.82035899, "learning_rate": 3.717428133894807e-06, "loss": 0.83931929, "num_input_tokens_seen": 70396315, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.27880859, "step": 3262, "time_per_iteration": 2.7044639587402344 }, { "auxiliary_loss_clip": 0.01594504, "auxiliary_loss_mlp": 0.00268086, "balance_loss_clip": 1.25316107, "balance_loss_mlp": 0.24197899, "epoch": 0.19618217345558395, "flos": 25556618995200.0, "grad_norm": 2.283297995293271, "language_loss": 0.8979274, "learning_rate": 3.71722851973837e-06, "loss": 0.91655332, "num_input_tokens_seen": 70417945, "router_z_loss_clip": 3.41601562, "router_z_loss_mlp": 0.26098633, "step": 3263, "time_per_iteration": 2.66318416595459 }, { "auxiliary_loss_clip": 0.01593306, "auxiliary_loss_mlp": 0.0027123, "balance_loss_clip": 1.25414801, "balance_loss_mlp": 0.24481362, "epoch": 0.1962422967082519, "flos": 25264413855360.0, "grad_norm": 1.690568253237969, "language_loss": 0.81223977, "learning_rate": 3.717028840464455e-06, "loss": 0.83088517, "num_input_tokens_seen": 70438690, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.26428223, "step": 3264, "time_per_iteration": 2.6438686847686768 }, { "auxiliary_loss_clip": 0.01586305, "auxiliary_loss_mlp": 0.00280257, "balance_loss_clip": 1.24798751, "balance_loss_mlp": 0.25484174, "epoch": 0.19630241996091988, "flos": 18807352444800.0, "grad_norm": 74.70703631031417, "language_loss": 0.88453114, "learning_rate": 3.7168290960806344e-06, "loss": 0.90319681, "num_input_tokens_seen": 70455385, "router_z_loss_clip": 3.3828125, "router_z_loss_mlp": 0.2545166, "step": 3265, "time_per_iteration": 2.600177526473999 }, { "auxiliary_loss_clip": 0.01509854, "auxiliary_loss_mlp": 0.00118852, "balance_loss_clip": 1.1622293, "balance_loss_mlp": 0.10569097, "epoch": 0.19636254321358784, "flos": 62321137896960.0, "grad_norm": 0.8339496170310328, "language_loss": 0.52962291, "learning_rate": 3.716629286594483e-06, "loss": 0.54591, "num_input_tokens_seen": 70514280, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.13183594, "step": 3266, "time_per_iteration": 3.160677671432495 }, { "auxiliary_loss_clip": 0.01569376, "auxiliary_loss_mlp": 0.00231915, "balance_loss_clip": 1.23266482, "balance_loss_mlp": 0.20585546, "epoch": 0.19642266646625584, "flos": 21069329564160.0, "grad_norm": 117.83953483340771, "language_loss": 0.89349329, "learning_rate": 3.7164294120135767e-06, "loss": 0.91150624, "num_input_tokens_seen": 70531800, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.26074219, "step": 3267, "time_per_iteration": 2.5824849605560303 }, { "auxiliary_loss_clip": 0.01574902, "auxiliary_loss_mlp": 0.00229888, "balance_loss_clip": 1.23787069, "balance_loss_mlp": 0.20539078, "epoch": 0.1964827897189238, "flos": 14538651229440.0, "grad_norm": 55.96121150539209, "language_loss": 0.94693041, "learning_rate": 3.7162294723454953e-06, "loss": 0.96497834, "num_input_tokens_seen": 70550615, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.24487305, "step": 3268, "time_per_iteration": 3.924222230911255 }, { "auxiliary_loss_clip": 0.01580856, "auxiliary_loss_mlp": 0.00236368, "balance_loss_clip": 1.24035406, "balance_loss_mlp": 0.21330056, "epoch": 0.19654291297159177, "flos": 19244636616960.0, "grad_norm": 13.958782838057187, "language_loss": 0.77618098, "learning_rate": 3.7160294675978197e-06, "loss": 0.79435325, "num_input_tokens_seen": 70568690, "router_z_loss_clip": 3.40625, "router_z_loss_mlp": 0.23095703, "step": 3269, "time_per_iteration": 2.566984176635742 }, { "auxiliary_loss_clip": 0.01568333, "auxiliary_loss_mlp": 0.00232781, "balance_loss_clip": 1.23283708, "balance_loss_mlp": 0.2069488, "epoch": 0.19660303622425973, "flos": 25775710001280.0, "grad_norm": 9.828762208481296, "language_loss": 0.88941944, "learning_rate": 3.715829397778135e-06, "loss": 0.90743059, "num_input_tokens_seen": 70588665, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.25805664, "step": 3270, "time_per_iteration": 2.6500797271728516 }, { "auxiliary_loss_clip": 0.01579069, "auxiliary_loss_mlp": 0.00213741, "balance_loss_clip": 1.24414897, "balance_loss_mlp": 0.18778956, "epoch": 0.1966631594769277, "flos": 20595093275520.0, "grad_norm": 14.859535110237664, "language_loss": 0.91404366, "learning_rate": 3.715629262894028e-06, "loss": 0.93197179, "num_input_tokens_seen": 70606900, "router_z_loss_clip": 3.3515625, "router_z_loss_mlp": 0.25927734, "step": 3271, "time_per_iteration": 5.381535530090332 }, { "auxiliary_loss_clip": 0.01606197, "auxiliary_loss_mlp": 0.00236038, "balance_loss_clip": 1.26541901, "balance_loss_mlp": 0.21176746, "epoch": 0.19672328272959566, "flos": 23623188600960.0, "grad_norm": 24.964557214334917, "language_loss": 0.86506951, "learning_rate": 3.715429062953087e-06, "loss": 0.88349193, "num_input_tokens_seen": 70625955, "router_z_loss_clip": 3.40429688, "router_z_loss_mlp": 0.24279785, "step": 3272, "time_per_iteration": 2.581650495529175 }, { "auxiliary_loss_clip": 0.016054, "auxiliary_loss_mlp": 0.00244732, "balance_loss_clip": 1.26355278, "balance_loss_mlp": 0.21558538, "epoch": 0.19678340598226365, "flos": 23110922787840.0, "grad_norm": 3.952600402349236, "language_loss": 0.89029813, "learning_rate": 3.7152287979629043e-06, "loss": 0.90879941, "num_input_tokens_seen": 70646090, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.29174805, "step": 3273, "time_per_iteration": 2.5990960597991943 }, { "auxiliary_loss_clip": 0.01632096, "auxiliary_loss_mlp": 0.0023012, "balance_loss_clip": 1.28645742, "balance_loss_mlp": 0.20512211, "epoch": 0.19684352923493162, "flos": 24534852716160.0, "grad_norm": 15.225643396958727, "language_loss": 0.86697149, "learning_rate": 3.7150284679310735e-06, "loss": 0.88559365, "num_input_tokens_seen": 70666065, "router_z_loss_clip": 3.453125, "router_z_loss_mlp": 0.25036621, "step": 3274, "time_per_iteration": 2.6105189323425293 }, { "auxiliary_loss_clip": 0.01629337, "auxiliary_loss_mlp": 0.00267273, "balance_loss_clip": 1.28881109, "balance_loss_mlp": 0.2388058, "epoch": 0.19690365248759958, "flos": 21796448578560.0, "grad_norm": 7.28893316008536, "language_loss": 0.905047, "learning_rate": 3.7148280728651914e-06, "loss": 0.92401314, "num_input_tokens_seen": 70681580, "router_z_loss_clip": 3.40625, "router_z_loss_mlp": 0.28479004, "step": 3275, "time_per_iteration": 2.592250347137451 }, { "auxiliary_loss_clip": 0.01648171, "auxiliary_loss_mlp": 0.00230493, "balance_loss_clip": 1.2998004, "balance_loss_mlp": 0.20337246, "epoch": 0.19696377574026755, "flos": 19056643810560.0, "grad_norm": 95.67508139728109, "language_loss": 0.88158441, "learning_rate": 3.7146276127728563e-06, "loss": 0.90037107, "num_input_tokens_seen": 70697745, "router_z_loss_clip": 3.484375, "router_z_loss_mlp": 0.27099609, "step": 3276, "time_per_iteration": 4.026085615158081 }, { "auxiliary_loss_clip": 0.01654294, "auxiliary_loss_mlp": 0.00233652, "balance_loss_clip": 1.30787325, "balance_loss_mlp": 0.20588838, "epoch": 0.19702389899293551, "flos": 22820656982400.0, "grad_norm": 68.74576517792558, "language_loss": 0.97608268, "learning_rate": 3.7144270876616713e-06, "loss": 0.9949621, "num_input_tokens_seen": 70715110, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.27770996, "step": 3277, "time_per_iteration": 2.561866283416748 }, { "auxiliary_loss_clip": 0.01649894, "auxiliary_loss_mlp": 0.00244971, "balance_loss_clip": 1.29940057, "balance_loss_mlp": 0.21577683, "epoch": 0.19708402224560348, "flos": 22894237992960.0, "grad_norm": 46.74227684443058, "language_loss": 0.73361373, "learning_rate": 3.714226497539239e-06, "loss": 0.75256228, "num_input_tokens_seen": 70734715, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.29162598, "step": 3278, "time_per_iteration": 2.591475009918213 }, { "auxiliary_loss_clip": 0.01655038, "auxiliary_loss_mlp": 0.00246773, "balance_loss_clip": 1.3059113, "balance_loss_mlp": 0.21732855, "epoch": 0.19714414549827144, "flos": 25662519267840.0, "grad_norm": 154.38355946251596, "language_loss": 0.85475028, "learning_rate": 3.714025842413166e-06, "loss": 0.87376845, "num_input_tokens_seen": 70752650, "router_z_loss_clip": 3.4921875, "router_z_loss_mlp": 0.29443359, "step": 3279, "time_per_iteration": 2.6198086738586426 }, { "auxiliary_loss_clip": 0.01695197, "auxiliary_loss_mlp": 0.00235821, "balance_loss_clip": 1.34679103, "balance_loss_mlp": 0.20877242, "epoch": 0.19720426875093944, "flos": 23915824704000.0, "grad_norm": 5.044738938900593, "language_loss": 0.88805658, "learning_rate": 3.713825122291061e-06, "loss": 0.90736675, "num_input_tokens_seen": 70772365, "router_z_loss_clip": 3.48242188, "router_z_loss_mlp": 0.27050781, "step": 3280, "time_per_iteration": 2.5816550254821777 }, { "auxiliary_loss_clip": 0.01713167, "auxiliary_loss_mlp": 0.00213185, "balance_loss_clip": 1.35716152, "balance_loss_mlp": 0.1839671, "epoch": 0.1972643920036074, "flos": 13881952828800.0, "grad_norm": 124.44930068522754, "language_loss": 0.84941959, "learning_rate": 3.713624337180536e-06, "loss": 0.8686831, "num_input_tokens_seen": 70790340, "router_z_loss_clip": 3.56054688, "router_z_loss_mlp": 0.29211426, "step": 3281, "time_per_iteration": 2.544574737548828 }, { "auxiliary_loss_clip": 0.01716477, "auxiliary_loss_mlp": 0.00219945, "balance_loss_clip": 1.37145531, "balance_loss_mlp": 0.19258657, "epoch": 0.19732451525627537, "flos": 19863592801920.0, "grad_norm": 12.622962000113143, "language_loss": 0.86611956, "learning_rate": 3.7134234870892045e-06, "loss": 0.88548386, "num_input_tokens_seen": 70809295, "router_z_loss_clip": 3.44921875, "router_z_loss_mlp": 0.27355957, "step": 3282, "time_per_iteration": 2.5634145736694336 }, { "auxiliary_loss_clip": 0.01716792, "auxiliary_loss_mlp": 0.00228697, "balance_loss_clip": 1.37278748, "balance_loss_mlp": 0.19964564, "epoch": 0.19738463850894333, "flos": 24973429777920.0, "grad_norm": 7.243081746527281, "language_loss": 0.7910558, "learning_rate": 3.7132225720246826e-06, "loss": 0.81051069, "num_input_tokens_seen": 70828765, "router_z_loss_clip": 3.43945312, "router_z_loss_mlp": 0.29064941, "step": 3283, "time_per_iteration": 2.657886505126953 }, { "auxiliary_loss_clip": 0.01736662, "auxiliary_loss_mlp": 0.00233106, "balance_loss_clip": 1.38794756, "balance_loss_mlp": 0.20308878, "epoch": 0.1974447617616113, "flos": 18368883123840.0, "grad_norm": 3.7591961800815237, "language_loss": 0.86467797, "learning_rate": 3.7130215919945886e-06, "loss": 0.88437563, "num_input_tokens_seen": 70846805, "router_z_loss_clip": 3.48828125, "router_z_loss_mlp": 0.30029297, "step": 3284, "time_per_iteration": 2.6388537883758545 }, { "auxiliary_loss_clip": 0.01751444, "auxiliary_loss_mlp": 0.00224701, "balance_loss_clip": 1.40368629, "balance_loss_mlp": 0.19451755, "epoch": 0.19750488501427926, "flos": 22892945103360.0, "grad_norm": 110.83966114522237, "language_loss": 0.93079031, "learning_rate": 3.7128205470065445e-06, "loss": 0.95055181, "num_input_tokens_seen": 70863805, "router_z_loss_clip": 3.4765625, "router_z_loss_mlp": 0.30175781, "step": 3285, "time_per_iteration": 2.5931429862976074 }, { "auxiliary_loss_clip": 0.01740417, "auxiliary_loss_mlp": 0.0021716, "balance_loss_clip": 1.39478922, "balance_loss_mlp": 0.18721455, "epoch": 0.19756500826694723, "flos": 21871502046720.0, "grad_norm": 3.3545109221201104, "language_loss": 0.97753358, "learning_rate": 3.712619437068174e-06, "loss": 0.99710935, "num_input_tokens_seen": 70882660, "router_z_loss_clip": 3.45898438, "router_z_loss_mlp": 0.29931641, "step": 3286, "time_per_iteration": 2.6027541160583496 }, { "auxiliary_loss_clip": 0.01740278, "auxiliary_loss_mlp": 0.00242743, "balance_loss_clip": 1.39507473, "balance_loss_mlp": 0.20652735, "epoch": 0.19762513151961522, "flos": 15158972131200.0, "grad_norm": 630.9313052211288, "language_loss": 0.88830584, "learning_rate": 3.712418262187102e-06, "loss": 0.90813601, "num_input_tokens_seen": 70898765, "router_z_loss_clip": 3.453125, "router_z_loss_mlp": 0.36230469, "step": 3287, "time_per_iteration": 2.587345838546753 }, { "auxiliary_loss_clip": 0.01735259, "auxiliary_loss_mlp": 0.00243358, "balance_loss_clip": 1.38841462, "balance_loss_mlp": 0.21112339, "epoch": 0.1976852547722832, "flos": 16979175878400.0, "grad_norm": 3.862111390240117, "language_loss": 0.91753525, "learning_rate": 3.7122170223709584e-06, "loss": 0.93732142, "num_input_tokens_seen": 70916370, "router_z_loss_clip": 3.46679688, "router_z_loss_mlp": 0.32250977, "step": 3288, "time_per_iteration": 2.5336194038391113 }, { "auxiliary_loss_clip": 0.01730471, "auxiliary_loss_mlp": 0.00237552, "balance_loss_clip": 1.38795483, "balance_loss_mlp": 0.20436421, "epoch": 0.19774537802495115, "flos": 20302924049280.0, "grad_norm": 13.162279079117496, "language_loss": 0.78869843, "learning_rate": 3.712015717627374e-06, "loss": 0.80837864, "num_input_tokens_seen": 70934870, "router_z_loss_clip": 3.41992188, "router_z_loss_mlp": 0.33203125, "step": 3289, "time_per_iteration": 2.629972219467163 }, { "auxiliary_loss_clip": 0.01739909, "auxiliary_loss_mlp": 0.00222653, "balance_loss_clip": 1.39211178, "balance_loss_mlp": 0.19218348, "epoch": 0.19780550127761912, "flos": 27235478724480.0, "grad_norm": 3.0284548510970453, "language_loss": 0.86550552, "learning_rate": 3.7118143479639813e-06, "loss": 0.88513112, "num_input_tokens_seen": 70955140, "router_z_loss_clip": 3.47460938, "router_z_loss_mlp": 0.3046875, "step": 3290, "time_per_iteration": 2.6082053184509277 }, { "auxiliary_loss_clip": 0.01742699, "auxiliary_loss_mlp": 0.0010688, "balance_loss_clip": 1.45337915, "balance_loss_mlp": 0.0966282, "epoch": 0.19786562453028708, "flos": 63550972684800.0, "grad_norm": 0.9105546651491299, "language_loss": 0.60590959, "learning_rate": 3.711612913388418e-06, "loss": 0.62440538, "num_input_tokens_seen": 71012005, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.10253906, "step": 3291, "time_per_iteration": 3.11200213432312 }, { "auxiliary_loss_clip": 0.0171837, "auxiliary_loss_mlp": 0.0025278, "balance_loss_clip": 1.37536407, "balance_loss_mlp": 0.21661225, "epoch": 0.19792574778295505, "flos": 26286647011200.0, "grad_norm": 6.936450996173116, "language_loss": 0.90199518, "learning_rate": 3.7114114139083204e-06, "loss": 0.92170674, "num_input_tokens_seen": 71031140, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.36157227, "step": 3292, "time_per_iteration": 2.6358885765075684 }, { "auxiliary_loss_clip": 0.01711311, "auxiliary_loss_mlp": 0.00235949, "balance_loss_clip": 1.37358785, "balance_loss_mlp": 0.20698127, "epoch": 0.19798587103562304, "flos": 19938107566080.0, "grad_norm": 6.387644399729946, "language_loss": 0.8901372, "learning_rate": 3.7112098495313313e-06, "loss": 0.90960979, "num_input_tokens_seen": 71050250, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.28930664, "step": 3293, "time_per_iteration": 2.604734420776367 }, { "auxiliary_loss_clip": 0.01698447, "auxiliary_loss_mlp": 0.00324164, "balance_loss_clip": 1.35732806, "balance_loss_mlp": 0.29014122, "epoch": 0.198045994288291, "flos": 20120282369280.0, "grad_norm": 11.302691434676905, "language_loss": 0.72843885, "learning_rate": 3.711008220265093e-06, "loss": 0.74866492, "num_input_tokens_seen": 71068665, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.34057617, "step": 3294, "time_per_iteration": 2.6524415016174316 }, { "auxiliary_loss_clip": 0.0170503, "auxiliary_loss_mlp": 0.00246196, "balance_loss_clip": 1.36619186, "balance_loss_mlp": 0.21522534, "epoch": 0.19810611754095897, "flos": 17967653228160.0, "grad_norm": 153.68008581875495, "language_loss": 0.97740352, "learning_rate": 3.710806526117251e-06, "loss": 0.99691582, "num_input_tokens_seen": 71085320, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.30981445, "step": 3295, "time_per_iteration": 2.543225049972534 }, { "auxiliary_loss_clip": 0.0169991, "auxiliary_loss_mlp": 0.00247386, "balance_loss_clip": 1.36094975, "balance_loss_mlp": 0.21753648, "epoch": 0.19816624079362694, "flos": 15084996071040.0, "grad_norm": 10.439087509340725, "language_loss": 0.88308656, "learning_rate": 3.7106047670954544e-06, "loss": 0.90255952, "num_input_tokens_seen": 71102020, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.29858398, "step": 3296, "time_per_iteration": 2.5369954109191895 }, { "auxiliary_loss_clip": 0.01700846, "auxiliary_loss_mlp": 0.00288074, "balance_loss_clip": 1.36334109, "balance_loss_mlp": 0.25545788, "epoch": 0.1982263640462949, "flos": 24900315644160.0, "grad_norm": 7.633848241817781, "language_loss": 0.74273217, "learning_rate": 3.710402943207354e-06, "loss": 0.7626214, "num_input_tokens_seen": 71123390, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.32592773, "step": 3297, "time_per_iteration": 2.668119192123413 }, { "auxiliary_loss_clip": 0.01706412, "auxiliary_loss_mlp": 0.00256837, "balance_loss_clip": 1.36589777, "balance_loss_mlp": 0.22654626, "epoch": 0.19828648729896287, "flos": 20376181837440.0, "grad_norm": 8.157638779079988, "language_loss": 0.89080775, "learning_rate": 3.7102010544606016e-06, "loss": 0.91044021, "num_input_tokens_seen": 71141800, "router_z_loss_clip": 3.40625, "router_z_loss_mlp": 0.30322266, "step": 3298, "time_per_iteration": 2.5893654823303223 }, { "auxiliary_loss_clip": 0.01687409, "auxiliary_loss_mlp": 0.00281477, "balance_loss_clip": 1.3476038, "balance_loss_mlp": 0.24845564, "epoch": 0.19834661055163083, "flos": 18880035615360.0, "grad_norm": 14.265885134114791, "language_loss": 0.91875201, "learning_rate": 3.7099991008628544e-06, "loss": 0.9384408, "num_input_tokens_seen": 71159505, "router_z_loss_clip": 3.39648438, "router_z_loss_mlp": 0.33032227, "step": 3299, "time_per_iteration": 2.640726327896118 }, { "auxiliary_loss_clip": 0.01773688, "auxiliary_loss_mlp": 0.00086313, "balance_loss_clip": 1.49320865, "balance_loss_mlp": 0.07749163, "epoch": 0.19840673380429882, "flos": 60259184640000.0, "grad_norm": 0.7508830124101357, "language_loss": 0.5337733, "learning_rate": 3.7097970824217706e-06, "loss": 0.55237329, "num_input_tokens_seen": 71223265, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.08837891, "step": 3300, "time_per_iteration": 3.0891201496124268 }, { "auxiliary_loss_clip": 0.01683655, "auxiliary_loss_mlp": 0.0028706, "balance_loss_clip": 1.34750783, "balance_loss_mlp": 0.25411052, "epoch": 0.1984668570569668, "flos": 19902017376000.0, "grad_norm": 3.6401044588337967, "language_loss": 0.80211008, "learning_rate": 3.7095949991450093e-06, "loss": 0.82181722, "num_input_tokens_seen": 71242385, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.32958984, "step": 3301, "time_per_iteration": 2.6006088256835938 }, { "auxiliary_loss_clip": 0.01662954, "auxiliary_loss_mlp": 0.00263136, "balance_loss_clip": 1.33083558, "balance_loss_mlp": 0.23433554, "epoch": 0.19852698030963475, "flos": 15630766295040.0, "grad_norm": 7.479622966378967, "language_loss": 0.97151214, "learning_rate": 3.709392851040235e-06, "loss": 0.99077308, "num_input_tokens_seen": 71258990, "router_z_loss_clip": 3.32226562, "router_z_loss_mlp": 0.2878418, "step": 3302, "time_per_iteration": 2.611330509185791 }, { "auxiliary_loss_clip": 0.0166016, "auxiliary_loss_mlp": 0.00256904, "balance_loss_clip": 1.32933974, "balance_loss_mlp": 0.22672033, "epoch": 0.19858710356230272, "flos": 43143007311360.0, "grad_norm": 10.686118776709614, "language_loss": 0.82607174, "learning_rate": 3.709190638115111e-06, "loss": 0.84524238, "num_input_tokens_seen": 71282770, "router_z_loss_clip": 3.30664062, "router_z_loss_mlp": 0.30175781, "step": 3303, "time_per_iteration": 2.7769429683685303 }, { "auxiliary_loss_clip": 0.01636269, "auxiliary_loss_mlp": 0.00263882, "balance_loss_clip": 1.30631685, "balance_loss_mlp": 0.23518831, "epoch": 0.19864722681497068, "flos": 35144084643840.0, "grad_norm": 8.281393734692466, "language_loss": 0.83481812, "learning_rate": 3.7089883603773084e-06, "loss": 0.85381961, "num_input_tokens_seen": 71301410, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.28686523, "step": 3304, "time_per_iteration": 2.72213077545166 }, { "auxiliary_loss_clip": 0.01629099, "auxiliary_loss_mlp": 0.00252444, "balance_loss_clip": 1.3002938, "balance_loss_mlp": 0.22285652, "epoch": 0.19870735006763865, "flos": 19426200888960.0, "grad_norm": 23.41635539005961, "language_loss": 0.92207527, "learning_rate": 3.7087860178344955e-06, "loss": 0.94089067, "num_input_tokens_seen": 71319670, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.29614258, "step": 3305, "time_per_iteration": 2.5579800605773926 }, { "auxiliary_loss_clip": 0.01625723, "auxiliary_loss_mlp": 0.00302223, "balance_loss_clip": 1.29301739, "balance_loss_mlp": 0.27300435, "epoch": 0.19876747332030664, "flos": 23547380947200.0, "grad_norm": 40.70781241757538, "language_loss": 0.75657457, "learning_rate": 3.7085836104943445e-06, "loss": 0.77585399, "num_input_tokens_seen": 71339850, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.29187012, "step": 3306, "time_per_iteration": 2.6503536701202393 }, { "auxiliary_loss_clip": 0.01604146, "auxiliary_loss_mlp": 0.0032183, "balance_loss_clip": 1.27577662, "balance_loss_mlp": 0.29296976, "epoch": 0.1988275965729746, "flos": 19829406032640.0, "grad_norm": 6.690909754332828, "language_loss": 0.83644748, "learning_rate": 3.7083811383645332e-06, "loss": 0.85570717, "num_input_tokens_seen": 71359795, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.28869629, "step": 3307, "time_per_iteration": 2.6274518966674805 }, { "auxiliary_loss_clip": 0.0160666, "auxiliary_loss_mlp": 0.00315734, "balance_loss_clip": 1.27453637, "balance_loss_mlp": 0.28835118, "epoch": 0.19888771982564257, "flos": 23513625141120.0, "grad_norm": 18.171112408422946, "language_loss": 0.82150424, "learning_rate": 3.708178601452737e-06, "loss": 0.84072816, "num_input_tokens_seen": 71378885, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.27404785, "step": 3308, "time_per_iteration": 2.6042158603668213 }, { "auxiliary_loss_clip": 0.01596576, "auxiliary_loss_mlp": 0.00351054, "balance_loss_clip": 1.25541186, "balance_loss_mlp": 0.32194272, "epoch": 0.19894784307831054, "flos": 18150510389760.0, "grad_norm": 306.08984528305217, "language_loss": 0.83588636, "learning_rate": 3.7079759997666374e-06, "loss": 0.85536265, "num_input_tokens_seen": 71397285, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.29089355, "step": 3309, "time_per_iteration": 2.683746814727783 }, { "auxiliary_loss_clip": 0.01596646, "auxiliary_loss_mlp": 0.00371989, "balance_loss_clip": 1.25151408, "balance_loss_mlp": 0.34399891, "epoch": 0.1990079663309785, "flos": 24276044246400.0, "grad_norm": 4.172722762901281, "language_loss": 0.93227172, "learning_rate": 3.707773333313917e-06, "loss": 0.95195812, "num_input_tokens_seen": 71415775, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.2800293, "step": 3310, "time_per_iteration": 4.0087034702301025 }, { "auxiliary_loss_clip": 0.01595356, "auxiliary_loss_mlp": 0.0035237, "balance_loss_clip": 1.24066317, "balance_loss_mlp": 0.32683563, "epoch": 0.19906808958364647, "flos": 34897666366080.0, "grad_norm": 7.433018810100654, "language_loss": 0.70414293, "learning_rate": 3.70757060210226e-06, "loss": 0.72362018, "num_input_tokens_seen": 71437315, "router_z_loss_clip": 3.54296875, "router_z_loss_mlp": 0.25512695, "step": 3311, "time_per_iteration": 2.744691848754883 }, { "auxiliary_loss_clip": 0.01593955, "auxiliary_loss_mlp": 0.00384696, "balance_loss_clip": 1.23552847, "balance_loss_mlp": 0.35560927, "epoch": 0.19912821283631443, "flos": 24024885373440.0, "grad_norm": 2.514730262116759, "language_loss": 0.84682864, "learning_rate": 3.707367806139355e-06, "loss": 0.86661518, "num_input_tokens_seen": 71456320, "router_z_loss_clip": 3.58203125, "router_z_loss_mlp": 0.29101562, "step": 3312, "time_per_iteration": 2.5754823684692383 }, { "auxiliary_loss_clip": 0.01595006, "auxiliary_loss_mlp": 0.00438479, "balance_loss_clip": 1.23171496, "balance_loss_mlp": 0.40889117, "epoch": 0.19918833608898243, "flos": 19859031774720.0, "grad_norm": 10.902971306348944, "language_loss": 0.90011317, "learning_rate": 3.7071649454328915e-06, "loss": 0.92044806, "num_input_tokens_seen": 71475360, "router_z_loss_clip": 3.63476562, "router_z_loss_mlp": 0.29589844, "step": 3313, "time_per_iteration": 5.489395380020142 }, { "auxiliary_loss_clip": 0.01588437, "auxiliary_loss_mlp": 0.00413274, "balance_loss_clip": 1.21301162, "balance_loss_mlp": 0.3861773, "epoch": 0.1992484593416504, "flos": 29095794984960.0, "grad_norm": 7.682048378198502, "language_loss": 0.87474686, "learning_rate": 3.7069620199905625e-06, "loss": 0.89476395, "num_input_tokens_seen": 71496155, "router_z_loss_clip": 3.75390625, "router_z_loss_mlp": 0.27111816, "step": 3314, "time_per_iteration": 2.6807971000671387 }, { "auxiliary_loss_clip": 0.01585371, "auxiliary_loss_mlp": 0.00391069, "balance_loss_clip": 1.20168972, "balance_loss_mlp": 0.36464036, "epoch": 0.19930858259431836, "flos": 23295001011840.0, "grad_norm": 4.936649381620733, "language_loss": 0.93458748, "learning_rate": 3.7067590298200627e-06, "loss": 0.9543519, "num_input_tokens_seen": 71517295, "router_z_loss_clip": 3.83984375, "router_z_loss_mlp": 0.26428223, "step": 3315, "time_per_iteration": 2.643106460571289 }, { "auxiliary_loss_clip": 0.01593602, "auxiliary_loss_mlp": 0.00396192, "balance_loss_clip": 1.20489264, "balance_loss_mlp": 0.37008524, "epoch": 0.19936870584698632, "flos": 25378825651200.0, "grad_norm": 4.554801574021615, "language_loss": 0.78209412, "learning_rate": 3.7065559749290892e-06, "loss": 0.80199212, "num_input_tokens_seen": 71540000, "router_z_loss_clip": 3.88867188, "router_z_loss_mlp": 0.26123047, "step": 3316, "time_per_iteration": 2.6912410259246826 }, { "auxiliary_loss_clip": 0.01704241, "auxiliary_loss_mlp": 0.0017224, "balance_loss_clip": 1.33988428, "balance_loss_mlp": 0.1609392, "epoch": 0.1994288290996543, "flos": 62168053109760.0, "grad_norm": 0.8424836031286608, "language_loss": 0.66263014, "learning_rate": 3.706352855325342e-06, "loss": 0.68139499, "num_input_tokens_seen": 71607880, "router_z_loss_clip": 3.65625, "router_z_loss_mlp": 0.11279297, "step": 3317, "time_per_iteration": 3.1789515018463135 }, { "auxiliary_loss_clip": 0.01581061, "auxiliary_loss_mlp": 0.00354117, "balance_loss_clip": 1.19659793, "balance_loss_mlp": 0.32766449, "epoch": 0.19948895235232225, "flos": 19025832919680.0, "grad_norm": 3.608630906279698, "language_loss": 0.79290491, "learning_rate": 3.7061496710165233e-06, "loss": 0.81225669, "num_input_tokens_seen": 71625695, "router_z_loss_clip": 3.84960938, "router_z_loss_mlp": 0.26452637, "step": 3318, "time_per_iteration": 4.025881290435791 }, { "auxiliary_loss_clip": 0.01583877, "auxiliary_loss_mlp": 0.00328978, "balance_loss_clip": 1.1956749, "balance_loss_mlp": 0.30297869, "epoch": 0.19954907560499022, "flos": 37815803182080.0, "grad_norm": 35.982699549367446, "language_loss": 0.85352486, "learning_rate": 3.7059464220103385e-06, "loss": 0.87265337, "num_input_tokens_seen": 71648520, "router_z_loss_clip": 3.88085938, "router_z_loss_mlp": 0.25964355, "step": 3319, "time_per_iteration": 2.7836551666259766 }, { "auxiliary_loss_clip": 0.01572318, "auxiliary_loss_mlp": 0.00343487, "balance_loss_clip": 1.18881881, "balance_loss_mlp": 0.31500751, "epoch": 0.1996091988576582, "flos": 49565199594240.0, "grad_norm": 4.2297402216123325, "language_loss": 0.83273607, "learning_rate": 3.7057431083144945e-06, "loss": 0.85189408, "num_input_tokens_seen": 71672185, "router_z_loss_clip": 3.83007812, "router_z_loss_mlp": 0.28442383, "step": 3320, "time_per_iteration": 2.8594417572021484 }, { "auxiliary_loss_clip": 0.01564143, "auxiliary_loss_mlp": 0.00385496, "balance_loss_clip": 1.1853652, "balance_loss_mlp": 0.35818481, "epoch": 0.19966932211032618, "flos": 22635788659200.0, "grad_norm": 3.295923545560843, "language_loss": 0.86883295, "learning_rate": 3.705539729936701e-06, "loss": 0.88832927, "num_input_tokens_seen": 71692890, "router_z_loss_clip": 3.78515625, "router_z_loss_mlp": 0.27294922, "step": 3321, "time_per_iteration": 2.6478614807128906 }, { "auxiliary_loss_clip": 0.01671173, "auxiliary_loss_mlp": 0.00104199, "balance_loss_clip": 1.33721757, "balance_loss_mlp": 0.0940423, "epoch": 0.19972944536299414, "flos": 54082117745280.0, "grad_norm": 0.9342064908490068, "language_loss": 0.65404201, "learning_rate": 3.7053362868846696e-06, "loss": 0.67179573, "num_input_tokens_seen": 71745815, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.1015625, "step": 3322, "time_per_iteration": 2.966479778289795 }, { "auxiliary_loss_clip": 0.01691782, "auxiliary_loss_mlp": 0.00210978, "balance_loss_clip": 1.34446049, "balance_loss_mlp": 0.19910431, "epoch": 0.1997895686156621, "flos": 69355031817600.0, "grad_norm": 1.3034215182918623, "language_loss": 0.56620556, "learning_rate": 3.7051327791661153e-06, "loss": 0.58523315, "num_input_tokens_seen": 71806915, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.11865234, "step": 3323, "time_per_iteration": 3.2528433799743652 }, { "auxiliary_loss_clip": 0.01561051, "auxiliary_loss_mlp": 0.00403117, "balance_loss_clip": 1.18227172, "balance_loss_mlp": 0.37512687, "epoch": 0.19984969186833007, "flos": 18552063507840.0, "grad_norm": 14.035140574822844, "language_loss": 0.88788182, "learning_rate": 3.7049292067887555e-06, "loss": 0.90752351, "num_input_tokens_seen": 71824645, "router_z_loss_clip": 3.78515625, "router_z_loss_mlp": 0.27990723, "step": 3324, "time_per_iteration": 2.620413064956665 }, { "auxiliary_loss_clip": 0.01549292, "auxiliary_loss_mlp": 0.00387552, "balance_loss_clip": 1.17324889, "balance_loss_mlp": 0.35793999, "epoch": 0.19990981512099804, "flos": 26429678968320.0, "grad_norm": 33.84582193987661, "language_loss": 0.60351312, "learning_rate": 3.7047255697603092e-06, "loss": 0.62288159, "num_input_tokens_seen": 71845125, "router_z_loss_clip": 3.7578125, "router_z_loss_mlp": 0.29614258, "step": 3325, "time_per_iteration": 2.629257917404175 }, { "auxiliary_loss_clip": 0.01549427, "auxiliary_loss_mlp": 0.00421685, "balance_loss_clip": 1.17834485, "balance_loss_mlp": 0.39147726, "epoch": 0.19996993837366603, "flos": 16325997010560.0, "grad_norm": 14.631101618667708, "language_loss": 0.92372203, "learning_rate": 3.7045218680884984e-06, "loss": 0.94343317, "num_input_tokens_seen": 71863500, "router_z_loss_clip": 3.71679688, "router_z_loss_mlp": 0.30224609, "step": 3326, "time_per_iteration": 2.5861263275146484 }, { "auxiliary_loss_clip": 0.01531438, "auxiliary_loss_mlp": 0.00508936, "balance_loss_clip": 1.16712594, "balance_loss_mlp": 0.47820407, "epoch": 0.200030061626334, "flos": 20844169159680.0, "grad_norm": 5.712766190258668, "language_loss": 0.79595602, "learning_rate": 3.7043181017810476e-06, "loss": 0.81635982, "num_input_tokens_seen": 71881845, "router_z_loss_clip": 3.64257812, "router_z_loss_mlp": 0.30761719, "step": 3327, "time_per_iteration": 2.5910654067993164 }, { "auxiliary_loss_clip": 0.01531077, "auxiliary_loss_mlp": 0.00447128, "balance_loss_clip": 1.16439843, "balance_loss_mlp": 0.41563246, "epoch": 0.20009018487900196, "flos": 23762629198080.0, "grad_norm": 13.395911085146682, "language_loss": 0.83659095, "learning_rate": 3.7041142708456833e-06, "loss": 0.85637295, "num_input_tokens_seen": 71900940, "router_z_loss_clip": 3.66601562, "router_z_loss_mlp": 0.31518555, "step": 3328, "time_per_iteration": 2.6371023654937744 }, { "auxiliary_loss_clip": 0.01533503, "auxiliary_loss_mlp": 0.0044544, "balance_loss_clip": 1.16094589, "balance_loss_mlp": 0.41616213, "epoch": 0.20015030813166992, "flos": 28111555440000.0, "grad_norm": 9.32326841338631, "language_loss": 0.74682915, "learning_rate": 3.7039103752901353e-06, "loss": 0.76661861, "num_input_tokens_seen": 71921925, "router_z_loss_clip": 3.72460938, "router_z_loss_mlp": 0.29248047, "step": 3329, "time_per_iteration": 2.7418808937072754 }, { "auxiliary_loss_clip": 0.01527665, "auxiliary_loss_mlp": 0.00448887, "balance_loss_clip": 1.16034532, "balance_loss_mlp": 0.41702259, "epoch": 0.2002104313843379, "flos": 26067160955520.0, "grad_norm": 8.924714763284829, "language_loss": 0.86036003, "learning_rate": 3.7037064151221353e-06, "loss": 0.88012546, "num_input_tokens_seen": 71941855, "router_z_loss_clip": 3.67578125, "router_z_loss_mlp": 0.31848145, "step": 3330, "time_per_iteration": 2.7741734981536865 }, { "auxiliary_loss_clip": 0.01522699, "auxiliary_loss_mlp": 0.00457109, "balance_loss_clip": 1.15919459, "balance_loss_mlp": 0.42678258, "epoch": 0.20027055463700585, "flos": 22966633854720.0, "grad_norm": 44.498109570884026, "language_loss": 0.84106565, "learning_rate": 3.703502390349417e-06, "loss": 0.86086369, "num_input_tokens_seen": 71960915, "router_z_loss_clip": 3.63085938, "router_z_loss_mlp": 0.3034668, "step": 3331, "time_per_iteration": 2.756143808364868 }, { "auxiliary_loss_clip": 0.01517393, "auxiliary_loss_mlp": 0.00505927, "balance_loss_clip": 1.15742302, "balance_loss_mlp": 0.47211909, "epoch": 0.20033067788967382, "flos": 17165660313600.0, "grad_norm": 1649.7640189605431, "language_loss": 0.87632197, "learning_rate": 3.7032983009797176e-06, "loss": 0.89655513, "num_input_tokens_seen": 71979220, "router_z_loss_clip": 3.59960938, "router_z_loss_mlp": 0.33789062, "step": 3332, "time_per_iteration": 2.692411184310913 }, { "auxiliary_loss_clip": 0.01567493, "auxiliary_loss_mlp": 0.00349, "balance_loss_clip": 1.28823555, "balance_loss_mlp": 0.33507681, "epoch": 0.2003908011423418, "flos": 60825566292480.0, "grad_norm": 0.9259355977721593, "language_loss": 0.61901093, "learning_rate": 3.703094147020776e-06, "loss": 0.63817596, "num_input_tokens_seen": 72033950, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.13964844, "step": 3333, "time_per_iteration": 3.038001775741577 }, { "auxiliary_loss_clip": 0.01519179, "auxiliary_loss_mlp": 0.00500474, "balance_loss_clip": 1.15896297, "balance_loss_mlp": 0.46683359, "epoch": 0.20045092439500978, "flos": 24206234163840.0, "grad_norm": 66.62227776734161, "language_loss": 0.89682627, "learning_rate": 3.7028899284803334e-06, "loss": 0.91702282, "num_input_tokens_seen": 72051395, "router_z_loss_clip": 3.6015625, "router_z_loss_mlp": 0.33642578, "step": 3334, "time_per_iteration": 2.6374704837799072 }, { "auxiliary_loss_clip": 0.01529288, "auxiliary_loss_mlp": 0.0047696, "balance_loss_clip": 1.16898465, "balance_loss_mlp": 0.44093537, "epoch": 0.20051104764767774, "flos": 29387605075200.0, "grad_norm": 10.686283717365457, "language_loss": 0.84153485, "learning_rate": 3.702685645366134e-06, "loss": 0.8615973, "num_input_tokens_seen": 72071305, "router_z_loss_clip": 3.6015625, "router_z_loss_mlp": 0.36035156, "step": 3335, "time_per_iteration": 2.7275431156158447 }, { "auxiliary_loss_clip": 0.01514967, "auxiliary_loss_mlp": 0.00501732, "balance_loss_clip": 1.16363955, "balance_loss_mlp": 0.46816313, "epoch": 0.2005711709003457, "flos": 23513804709120.0, "grad_norm": 2.173876061963717, "language_loss": 0.85632032, "learning_rate": 3.7024812976859243e-06, "loss": 0.87648726, "num_input_tokens_seen": 72090165, "router_z_loss_clip": 3.51367188, "router_z_loss_mlp": 0.33544922, "step": 3336, "time_per_iteration": 2.682661294937134 }, { "auxiliary_loss_clip": 0.01510042, "auxiliary_loss_mlp": 0.00427259, "balance_loss_clip": 1.15812051, "balance_loss_mlp": 0.39473909, "epoch": 0.20063129415301367, "flos": 22523388024960.0, "grad_norm": 7.607943727113308, "language_loss": 0.86350936, "learning_rate": 3.7022768854474532e-06, "loss": 0.88288236, "num_input_tokens_seen": 72107210, "router_z_loss_clip": 3.51757812, "router_z_loss_mlp": 0.32519531, "step": 3337, "time_per_iteration": 2.6008236408233643 }, { "auxiliary_loss_clip": 0.01518032, "auxiliary_loss_mlp": 0.0039276, "balance_loss_clip": 1.16708767, "balance_loss_mlp": 0.36084771, "epoch": 0.20069141740568164, "flos": 25958243940480.0, "grad_norm": 317.1963663038814, "language_loss": 0.75748283, "learning_rate": 3.7020724086584724e-06, "loss": 0.77659082, "num_input_tokens_seen": 72126315, "router_z_loss_clip": 3.51367188, "router_z_loss_mlp": 0.31896973, "step": 3338, "time_per_iteration": 2.7257776260375977 }, { "auxiliary_loss_clip": 0.01504782, "auxiliary_loss_mlp": 0.00417458, "balance_loss_clip": 1.15869117, "balance_loss_mlp": 0.3872022, "epoch": 0.2007515406583496, "flos": 24790608529920.0, "grad_norm": 677.7815077373339, "language_loss": 0.78648484, "learning_rate": 3.701867867326735e-06, "loss": 0.80570716, "num_input_tokens_seen": 72146470, "router_z_loss_clip": 3.4609375, "router_z_loss_mlp": 0.30273438, "step": 3339, "time_per_iteration": 2.689835786819458 }, { "auxiliary_loss_clip": 0.01509634, "auxiliary_loss_mlp": 0.00414509, "balance_loss_clip": 1.16250086, "balance_loss_mlp": 0.38160771, "epoch": 0.2008116639110176, "flos": 37925582123520.0, "grad_norm": 9.40711996023775, "language_loss": 0.77309477, "learning_rate": 3.7016632614599974e-06, "loss": 0.79233623, "num_input_tokens_seen": 72166600, "router_z_loss_clip": 3.47070312, "router_z_loss_mlp": 0.32885742, "step": 3340, "time_per_iteration": 2.831068277359009 }, { "auxiliary_loss_clip": 0.01508913, "auxiliary_loss_mlp": 0.00376285, "balance_loss_clip": 1.16042948, "balance_loss_mlp": 0.34571967, "epoch": 0.20087178716368556, "flos": 20740531443840.0, "grad_norm": 6.766968220932683, "language_loss": 0.82865059, "learning_rate": 3.701458591066019e-06, "loss": 0.84750253, "num_input_tokens_seen": 72185160, "router_z_loss_clip": 3.48828125, "router_z_loss_mlp": 0.30566406, "step": 3341, "time_per_iteration": 2.643202543258667 }, { "auxiliary_loss_clip": 0.01507529, "auxiliary_loss_mlp": 0.00363777, "balance_loss_clip": 1.16245139, "balance_loss_mlp": 0.33409435, "epoch": 0.20093191041635353, "flos": 23842279607040.0, "grad_norm": 3.157524871586432, "language_loss": 0.79988825, "learning_rate": 3.70125385615256e-06, "loss": 0.81860131, "num_input_tokens_seen": 72205160, "router_z_loss_clip": 3.44921875, "router_z_loss_mlp": 0.29663086, "step": 3342, "time_per_iteration": 2.631683111190796 }, { "auxiliary_loss_clip": 0.01517439, "auxiliary_loss_mlp": 0.00377055, "balance_loss_clip": 1.17156649, "balance_loss_mlp": 0.34741932, "epoch": 0.2009920336690215, "flos": 21792067119360.0, "grad_norm": 4.184161842415689, "language_loss": 0.81668246, "learning_rate": 3.701049056727384e-06, "loss": 0.83562744, "num_input_tokens_seen": 72223555, "router_z_loss_clip": 3.4609375, "router_z_loss_mlp": 0.29638672, "step": 3343, "time_per_iteration": 2.6480438709259033 }, { "auxiliary_loss_clip": 0.01517147, "auxiliary_loss_mlp": 0.00341978, "balance_loss_clip": 1.1715318, "balance_loss_mlp": 0.31036386, "epoch": 0.20105215692168946, "flos": 26359222440960.0, "grad_norm": 134.9078972057988, "language_loss": 0.89974582, "learning_rate": 3.7008441927982574e-06, "loss": 0.91833711, "num_input_tokens_seen": 72242465, "router_z_loss_clip": 3.45703125, "router_z_loss_mlp": 0.31616211, "step": 3344, "time_per_iteration": 2.669680118560791 }, { "auxiliary_loss_clip": 0.01518323, "auxiliary_loss_mlp": 0.00332701, "balance_loss_clip": 1.17399549, "balance_loss_mlp": 0.30084848, "epoch": 0.20111228017435742, "flos": 18807280617600.0, "grad_norm": 99.58664664894089, "language_loss": 0.92340553, "learning_rate": 3.700639264372948e-06, "loss": 0.94191575, "num_input_tokens_seen": 72260655, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.31835938, "step": 3345, "time_per_iteration": 2.6143481731414795 }, { "auxiliary_loss_clip": 0.01521331, "auxiliary_loss_mlp": 0.00320309, "balance_loss_clip": 1.17720163, "balance_loss_mlp": 0.29314113, "epoch": 0.20117240342702541, "flos": 19975059682560.0, "grad_norm": 9.367958661174363, "language_loss": 0.75286436, "learning_rate": 3.7004342714592283e-06, "loss": 0.77128077, "num_input_tokens_seen": 72279055, "router_z_loss_clip": 3.43945312, "router_z_loss_mlp": 0.27185059, "step": 3346, "time_per_iteration": 2.614147901535034 }, { "auxiliary_loss_clip": 0.01508834, "auxiliary_loss_mlp": 0.00307114, "balance_loss_clip": 1.1719538, "balance_loss_mlp": 0.2783961, "epoch": 0.20123252667969338, "flos": 23142703345920.0, "grad_norm": 7.673286117842933, "language_loss": 0.80577469, "learning_rate": 3.70022921406487e-06, "loss": 0.82393426, "num_input_tokens_seen": 72297895, "router_z_loss_clip": 3.3671875, "router_z_loss_mlp": 0.2869873, "step": 3347, "time_per_iteration": 2.7201085090637207 }, { "auxiliary_loss_clip": 0.01520355, "auxiliary_loss_mlp": 0.0029426, "balance_loss_clip": 1.18578732, "balance_loss_mlp": 0.26641315, "epoch": 0.20129264993236134, "flos": 23221671396480.0, "grad_norm": 2.0396528566841767, "language_loss": 0.93233061, "learning_rate": 3.70002409219765e-06, "loss": 0.95047677, "num_input_tokens_seen": 72318385, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.27844238, "step": 3348, "time_per_iteration": 2.63254976272583 }, { "auxiliary_loss_clip": 0.0151194, "auxiliary_loss_mlp": 0.00318252, "balance_loss_clip": 1.1773535, "balance_loss_mlp": 0.29028562, "epoch": 0.2013527731850293, "flos": 21871466133120.0, "grad_norm": 237.23598117162115, "language_loss": 0.77857113, "learning_rate": 3.699818905865346e-06, "loss": 0.79687303, "num_input_tokens_seen": 72338235, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.27978516, "step": 3349, "time_per_iteration": 2.599344253540039 }, { "auxiliary_loss_clip": 0.01514325, "auxiliary_loss_mlp": 0.00273531, "balance_loss_clip": 1.18013287, "balance_loss_mlp": 0.24509984, "epoch": 0.20141289643769728, "flos": 18040803275520.0, "grad_norm": 1.614194178611948, "language_loss": 0.78708875, "learning_rate": 3.6996136550757377e-06, "loss": 0.80496728, "num_input_tokens_seen": 72357825, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.28442383, "step": 3350, "time_per_iteration": 2.5658459663391113 }, { "auxiliary_loss_clip": 0.01502884, "auxiliary_loss_mlp": 0.00270649, "balance_loss_clip": 1.16854692, "balance_loss_mlp": 0.24201477, "epoch": 0.20147301969036524, "flos": 23951412103680.0, "grad_norm": 22.350809425950196, "language_loss": 0.84767377, "learning_rate": 3.69940833983661e-06, "loss": 0.86540902, "num_input_tokens_seen": 72376335, "router_z_loss_clip": 3.34179688, "router_z_loss_mlp": 0.28662109, "step": 3351, "time_per_iteration": 2.603834390640259 }, { "auxiliary_loss_clip": 0.01512502, "auxiliary_loss_mlp": 0.00295265, "balance_loss_clip": 1.18027329, "balance_loss_mlp": 0.26577234, "epoch": 0.2015331429430332, "flos": 25588471380480.0, "grad_norm": 2.7912079088564217, "language_loss": 0.86898744, "learning_rate": 3.699202960155748e-06, "loss": 0.88706505, "num_input_tokens_seen": 72395440, "router_z_loss_clip": 3.32421875, "router_z_loss_mlp": 0.29492188, "step": 3352, "time_per_iteration": 4.127259016036987 }, { "auxiliary_loss_clip": 0.01507006, "auxiliary_loss_mlp": 0.00276564, "balance_loss_clip": 1.17614794, "balance_loss_mlp": 0.24877609, "epoch": 0.2015932661957012, "flos": 26724972677760.0, "grad_norm": 6.577881360054237, "language_loss": 0.87050712, "learning_rate": 3.6989975160409396e-06, "loss": 0.88834274, "num_input_tokens_seen": 72414670, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.27807617, "step": 3353, "time_per_iteration": 2.6864709854125977 }, { "auxiliary_loss_clip": 0.01510269, "auxiliary_loss_mlp": 0.00244704, "balance_loss_clip": 1.18215239, "balance_loss_mlp": 0.21581987, "epoch": 0.20165338944836916, "flos": 15633136592640.0, "grad_norm": 726.9587299641073, "language_loss": 0.97875476, "learning_rate": 3.6987920074999747e-06, "loss": 0.99630451, "num_input_tokens_seen": 72432210, "router_z_loss_clip": 3.28515625, "router_z_loss_mlp": 0.28869629, "step": 3354, "time_per_iteration": 2.575960636138916 }, { "auxiliary_loss_clip": 0.01597888, "auxiliary_loss_mlp": 0.00122063, "balance_loss_clip": 1.30263436, "balance_loss_mlp": 0.11214524, "epoch": 0.20171351270103713, "flos": 57912529207680.0, "grad_norm": 0.8228369603789506, "language_loss": 0.55678731, "learning_rate": 3.6985864345406465e-06, "loss": 0.57398683, "num_input_tokens_seen": 72489225, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.09912109, "step": 3355, "time_per_iteration": 5.961647987365723 }, { "auxiliary_loss_clip": 0.01506457, "auxiliary_loss_mlp": 0.00236028, "balance_loss_clip": 1.18018508, "balance_loss_mlp": 0.21021903, "epoch": 0.2017736359537051, "flos": 20814363849600.0, "grad_norm": 5.963360373944326, "language_loss": 0.89915085, "learning_rate": 3.698380797170751e-06, "loss": 0.91657567, "num_input_tokens_seen": 72508715, "router_z_loss_clip": 3.26367188, "router_z_loss_mlp": 0.25793457, "step": 3356, "time_per_iteration": 2.62278151512146 }, { "auxiliary_loss_clip": 0.01491746, "auxiliary_loss_mlp": 0.00333766, "balance_loss_clip": 1.16633165, "balance_loss_mlp": 0.30186585, "epoch": 0.20183375920637306, "flos": 17092043389440.0, "grad_norm": 21.369250909035507, "language_loss": 0.81281453, "learning_rate": 3.698175095398085e-06, "loss": 0.83106959, "num_input_tokens_seen": 72525135, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.31884766, "step": 3357, "time_per_iteration": 2.620171070098877 }, { "auxiliary_loss_clip": 0.01497693, "auxiliary_loss_mlp": 0.00285772, "balance_loss_clip": 1.17324197, "balance_loss_mlp": 0.25637469, "epoch": 0.20189388245904102, "flos": 18661339658880.0, "grad_norm": 3.2318712401182994, "language_loss": 0.80791003, "learning_rate": 3.6979693292304493e-06, "loss": 0.82574469, "num_input_tokens_seen": 72543690, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.29394531, "step": 3358, "time_per_iteration": 2.6616175174713135 }, { "auxiliary_loss_clip": 0.01494767, "auxiliary_loss_mlp": 0.00308913, "balance_loss_clip": 1.17580438, "balance_loss_mlp": 0.28311577, "epoch": 0.20195400571170902, "flos": 16797539779200.0, "grad_norm": 9.617034052988245, "language_loss": 0.89514023, "learning_rate": 3.6977634986756463e-06, "loss": 0.91317701, "num_input_tokens_seen": 72560725, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.2578125, "step": 3359, "time_per_iteration": 2.5680928230285645 }, { "auxiliary_loss_clip": 0.01541946, "auxiliary_loss_mlp": 0.00069522, "balance_loss_clip": 1.23367977, "balance_loss_mlp": 0.0575535, "epoch": 0.20201412896437698, "flos": 67174716268800.0, "grad_norm": 0.7841451856825833, "language_loss": 0.58765346, "learning_rate": 3.697557603741482e-06, "loss": 0.60376817, "num_input_tokens_seen": 72621940, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.11962891, "step": 3360, "time_per_iteration": 4.528645992279053 }, { "auxiliary_loss_clip": 0.01496429, "auxiliary_loss_mlp": 0.00287216, "balance_loss_clip": 1.17936158, "balance_loss_mlp": 0.26072752, "epoch": 0.20207425221704495, "flos": 21325013550720.0, "grad_norm": 25.45651006361267, "language_loss": 0.71807045, "learning_rate": 3.697351644435763e-06, "loss": 0.7359069, "num_input_tokens_seen": 72639135, "router_z_loss_clip": 3.16992188, "router_z_loss_mlp": 0.26477051, "step": 3361, "time_per_iteration": 2.6669304370880127 }, { "auxiliary_loss_clip": 0.01503622, "auxiliary_loss_mlp": 0.00312401, "balance_loss_clip": 1.18721151, "balance_loss_mlp": 0.28662819, "epoch": 0.2021343754697129, "flos": 22527158952960.0, "grad_norm": 80.34154106419717, "language_loss": 0.85824645, "learning_rate": 3.6971456207662993e-06, "loss": 0.87640667, "num_input_tokens_seen": 72658525, "router_z_loss_clip": 3.1640625, "router_z_loss_mlp": 0.25793457, "step": 3362, "time_per_iteration": 2.661623477935791 }, { "auxiliary_loss_clip": 0.01505652, "auxiliary_loss_mlp": 0.00282386, "balance_loss_clip": 1.1877079, "balance_loss_mlp": 0.25598109, "epoch": 0.20219449872238088, "flos": 19062785036160.0, "grad_norm": 2.8500539898704607, "language_loss": 0.83720404, "learning_rate": 3.6969395327409035e-06, "loss": 0.85508442, "num_input_tokens_seen": 72678085, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.2644043, "step": 3363, "time_per_iteration": 2.6145687103271484 }, { "auxiliary_loss_clip": 0.01510939, "auxiliary_loss_mlp": 0.00320953, "balance_loss_clip": 1.19561934, "balance_loss_mlp": 0.2960856, "epoch": 0.20225462197504884, "flos": 24717027519360.0, "grad_norm": 34.14082761948568, "language_loss": 0.80552053, "learning_rate": 3.696733380367391e-06, "loss": 0.82383949, "num_input_tokens_seen": 72698695, "router_z_loss_clip": 3.15234375, "router_z_loss_mlp": 0.24890137, "step": 3364, "time_per_iteration": 2.631901264190674 }, { "auxiliary_loss_clip": 0.01515703, "auxiliary_loss_mlp": 0.00316571, "balance_loss_clip": 1.19852555, "balance_loss_mlp": 0.29020232, "epoch": 0.2023147452277168, "flos": 22018304931840.0, "grad_norm": 3.030024924426711, "language_loss": 0.80118322, "learning_rate": 3.6965271636535783e-06, "loss": 0.81950593, "num_input_tokens_seen": 72717880, "router_z_loss_clip": 3.16992188, "router_z_loss_mlp": 0.26391602, "step": 3365, "time_per_iteration": 2.6476588249206543 }, { "auxiliary_loss_clip": 0.01522185, "auxiliary_loss_mlp": 0.00333287, "balance_loss_clip": 1.20441651, "balance_loss_mlp": 0.30806217, "epoch": 0.2023748684803848, "flos": 17745365911680.0, "grad_norm": 8.108945897119938, "language_loss": 0.91959721, "learning_rate": 3.696320882607286e-06, "loss": 0.93815196, "num_input_tokens_seen": 72736410, "router_z_loss_clip": 3.17773438, "router_z_loss_mlp": 0.25219727, "step": 3366, "time_per_iteration": 2.631880760192871 }, { "auxiliary_loss_clip": 0.01520155, "auxiliary_loss_mlp": 0.00262881, "balance_loss_clip": 1.1992054, "balance_loss_mlp": 0.23762105, "epoch": 0.20243499173305277, "flos": 31138932493440.0, "grad_norm": 6.546396227672464, "language_loss": 0.75997448, "learning_rate": 3.696114537236335e-06, "loss": 0.77780485, "num_input_tokens_seen": 72758295, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.25231934, "step": 3367, "time_per_iteration": 2.7376925945281982 }, { "auxiliary_loss_clip": 0.01525276, "auxiliary_loss_mlp": 0.00312198, "balance_loss_clip": 1.19821644, "balance_loss_mlp": 0.28307551, "epoch": 0.20249511498572073, "flos": 33839235279360.0, "grad_norm": 12.747926838855287, "language_loss": 0.74108064, "learning_rate": 3.6959081275485512e-06, "loss": 0.75945538, "num_input_tokens_seen": 72782495, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.29138184, "step": 3368, "time_per_iteration": 2.819646120071411 }, { "auxiliary_loss_clip": 0.01534908, "auxiliary_loss_mlp": 0.00237421, "balance_loss_clip": 1.21162701, "balance_loss_mlp": 0.21369854, "epoch": 0.2025552382383887, "flos": 21215629658880.0, "grad_norm": 12.827577633490813, "language_loss": 0.85566616, "learning_rate": 3.6957016535517615e-06, "loss": 0.87338936, "num_input_tokens_seen": 72801885, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.23730469, "step": 3369, "time_per_iteration": 2.6154844760894775 }, { "auxiliary_loss_clip": 0.01543912, "auxiliary_loss_mlp": 0.00260122, "balance_loss_clip": 1.21384406, "balance_loss_mlp": 0.23542157, "epoch": 0.20261536149105666, "flos": 14647388676480.0, "grad_norm": 5.2475378444922205, "language_loss": 0.77221775, "learning_rate": 3.695495115253795e-06, "loss": 0.79025805, "num_input_tokens_seen": 72816990, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.24719238, "step": 3370, "time_per_iteration": 2.5543203353881836 }, { "auxiliary_loss_clip": 0.01671191, "auxiliary_loss_mlp": 0.00097779, "balance_loss_clip": 1.3596853, "balance_loss_mlp": 0.08995858, "epoch": 0.20267548474372463, "flos": 66783649921920.0, "grad_norm": 0.6849595341462454, "language_loss": 0.5826382, "learning_rate": 3.6952885126624834e-06, "loss": 0.60032791, "num_input_tokens_seen": 72879240, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.078125, "step": 3371, "time_per_iteration": 3.165196657180786 }, { "auxiliary_loss_clip": 0.01548251, "auxiliary_loss_mlp": 0.00216743, "balance_loss_clip": 1.21448016, "balance_loss_mlp": 0.19358087, "epoch": 0.2027356079963926, "flos": 24680793674880.0, "grad_norm": 6.037760275745413, "language_loss": 0.97728312, "learning_rate": 3.6950818457856617e-06, "loss": 0.99493301, "num_input_tokens_seen": 72899030, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.23156738, "step": 3372, "time_per_iteration": 2.611786365509033 }, { "auxiliary_loss_clip": 0.01551281, "auxiliary_loss_mlp": 0.00253058, "balance_loss_clip": 1.21597433, "balance_loss_mlp": 0.22778599, "epoch": 0.20279573124906058, "flos": 26392762765440.0, "grad_norm": 49.567010792146085, "language_loss": 0.85446185, "learning_rate": 3.694875114631167e-06, "loss": 0.87250525, "num_input_tokens_seen": 72919190, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.25280762, "step": 3373, "time_per_iteration": 2.6386327743530273 }, { "auxiliary_loss_clip": 0.01561693, "auxiliary_loss_mlp": 0.00238562, "balance_loss_clip": 1.22808337, "balance_loss_mlp": 0.21612637, "epoch": 0.20285585450172855, "flos": 33799984692480.0, "grad_norm": 54.81005446443365, "language_loss": 0.77972323, "learning_rate": 3.6946683192068377e-06, "loss": 0.7977258, "num_input_tokens_seen": 72939720, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.2244873, "step": 3374, "time_per_iteration": 2.7467145919799805 }, { "auxiliary_loss_clip": 0.0169152, "auxiliary_loss_mlp": 0.00121682, "balance_loss_clip": 1.39716697, "balance_loss_mlp": 0.11209805, "epoch": 0.20291597775439651, "flos": 71164823598720.0, "grad_norm": 1.0436133620611483, "language_loss": 0.6257416, "learning_rate": 3.694461459520516e-06, "loss": 0.64387357, "num_input_tokens_seen": 73000015, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.09570312, "step": 3375, "time_per_iteration": 3.0800652503967285 }, { "auxiliary_loss_clip": 0.01567303, "auxiliary_loss_mlp": 0.00217794, "balance_loss_clip": 1.23085845, "balance_loss_mlp": 0.19321296, "epoch": 0.20297610100706448, "flos": 19494287118720.0, "grad_norm": 3.3206856613344953, "language_loss": 0.87056077, "learning_rate": 3.6942545355800463e-06, "loss": 0.88841176, "num_input_tokens_seen": 73017675, "router_z_loss_clip": 3.3671875, "router_z_loss_mlp": 0.24609375, "step": 3376, "time_per_iteration": 2.5829644203186035 }, { "auxiliary_loss_clip": 0.01562887, "auxiliary_loss_mlp": 0.00261626, "balance_loss_clip": 1.22263026, "balance_loss_mlp": 0.23393402, "epoch": 0.20303622425973245, "flos": 25044245441280.0, "grad_norm": 2.799308150810884, "language_loss": 0.88729787, "learning_rate": 3.6940475473932743e-06, "loss": 0.90554303, "num_input_tokens_seen": 73036135, "router_z_loss_clip": 3.40039062, "router_z_loss_mlp": 0.2767334, "step": 3377, "time_per_iteration": 2.6033878326416016 }, { "auxiliary_loss_clip": 0.01585014, "auxiliary_loss_mlp": 0.00240337, "balance_loss_clip": 1.24871802, "balance_loss_mlp": 0.21356282, "epoch": 0.2030963475124004, "flos": 21979988098560.0, "grad_norm": 28.6956874020484, "language_loss": 0.8586973, "learning_rate": 3.69384049496805e-06, "loss": 0.87695086, "num_input_tokens_seen": 73054075, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.26806641, "step": 3378, "time_per_iteration": 2.5993027687072754 }, { "auxiliary_loss_clip": 0.015947, "auxiliary_loss_mlp": 0.0022938, "balance_loss_clip": 1.25321138, "balance_loss_mlp": 0.20230751, "epoch": 0.2031564707650684, "flos": 19500392430720.0, "grad_norm": 1.9801851521290597, "language_loss": 0.89292645, "learning_rate": 3.6936333783122242e-06, "loss": 0.91116726, "num_input_tokens_seen": 73073530, "router_z_loss_clip": 3.4140625, "router_z_loss_mlp": 0.27087402, "step": 3379, "time_per_iteration": 2.6782355308532715 }, { "auxiliary_loss_clip": 0.01604663, "auxiliary_loss_mlp": 0.00219142, "balance_loss_clip": 1.26123857, "balance_loss_mlp": 0.19432312, "epoch": 0.20321659401773637, "flos": 22747075971840.0, "grad_norm": 4.36700379299658, "language_loss": 0.92286992, "learning_rate": 3.6934261974336505e-06, "loss": 0.94110799, "num_input_tokens_seen": 73092820, "router_z_loss_clip": 3.43554688, "router_z_loss_mlp": 0.24804688, "step": 3380, "time_per_iteration": 2.6686646938323975 }, { "auxiliary_loss_clip": 0.01596012, "auxiliary_loss_mlp": 0.00273094, "balance_loss_clip": 1.25996852, "balance_loss_mlp": 0.24314886, "epoch": 0.20327671727040433, "flos": 22455840499200.0, "grad_norm": 5.09450527223547, "language_loss": 0.86512101, "learning_rate": 3.693218952340186e-06, "loss": 0.88381207, "num_input_tokens_seen": 73113385, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.29956055, "step": 3381, "time_per_iteration": 2.6566500663757324 }, { "auxiliary_loss_clip": 0.01608725, "auxiliary_loss_mlp": 0.00223643, "balance_loss_clip": 1.27324712, "balance_loss_mlp": 0.19550988, "epoch": 0.2033368405230723, "flos": 19535010163200.0, "grad_norm": 2.5919786411866346, "language_loss": 0.86449647, "learning_rate": 3.6930116430396895e-06, "loss": 0.88282019, "num_input_tokens_seen": 73131195, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.28161621, "step": 3382, "time_per_iteration": 2.601025342941284 }, { "auxiliary_loss_clip": 0.01598668, "auxiliary_loss_mlp": 0.00240579, "balance_loss_clip": 1.2686615, "balance_loss_mlp": 0.21035984, "epoch": 0.20339696377574026, "flos": 13809233744640.0, "grad_norm": 2.1146859180999975, "language_loss": 0.86342019, "learning_rate": 3.6928042695400214e-06, "loss": 0.88181263, "num_input_tokens_seen": 73148850, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.30236816, "step": 3383, "time_per_iteration": 2.6038742065429688 }, { "auxiliary_loss_clip": 0.01611558, "auxiliary_loss_mlp": 0.00221967, "balance_loss_clip": 1.28001082, "balance_loss_mlp": 0.19433458, "epoch": 0.20345708702840823, "flos": 20339409288960.0, "grad_norm": 9.197926958184379, "language_loss": 0.85287344, "learning_rate": 3.6925968318490464e-06, "loss": 0.87120867, "num_input_tokens_seen": 73166775, "router_z_loss_clip": 3.31640625, "router_z_loss_mlp": 0.27685547, "step": 3384, "time_per_iteration": 2.6110074520111084 }, { "auxiliary_loss_clip": 0.01595453, "auxiliary_loss_mlp": 0.0025757, "balance_loss_clip": 1.26830411, "balance_loss_mlp": 0.22873281, "epoch": 0.2035172102810762, "flos": 20333950421760.0, "grad_norm": 20.205458880514904, "language_loss": 0.88728279, "learning_rate": 3.6923893299746293e-06, "loss": 0.90581298, "num_input_tokens_seen": 73183215, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.28845215, "step": 3385, "time_per_iteration": 2.621403217315674 }, { "auxiliary_loss_clip": 0.01600082, "auxiliary_loss_mlp": 0.00223345, "balance_loss_clip": 1.28044963, "balance_loss_mlp": 0.19666633, "epoch": 0.2035773335337442, "flos": 23330983461120.0, "grad_norm": 417.981607225826, "language_loss": 0.7760759, "learning_rate": 3.692181763924639e-06, "loss": 0.79431009, "num_input_tokens_seen": 73203290, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.26696777, "step": 3386, "time_per_iteration": 2.596284866333008 }, { "auxiliary_loss_clip": 0.01602364, "auxiliary_loss_mlp": 0.00245257, "balance_loss_clip": 1.28420913, "balance_loss_mlp": 0.21680161, "epoch": 0.20363745678641215, "flos": 28330287310080.0, "grad_norm": 5.111261424118402, "language_loss": 0.86794835, "learning_rate": 3.691974133706947e-06, "loss": 0.88642454, "num_input_tokens_seen": 73226185, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.28466797, "step": 3387, "time_per_iteration": 2.6831936836242676 }, { "auxiliary_loss_clip": 0.01622614, "auxiliary_loss_mlp": 0.00232352, "balance_loss_clip": 1.30398202, "balance_loss_mlp": 0.20442149, "epoch": 0.20369758003908012, "flos": 18915658928640.0, "grad_norm": 5.361596552323315, "language_loss": 0.88992524, "learning_rate": 3.6917664393294262e-06, "loss": 0.90847486, "num_input_tokens_seen": 73243300, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.27905273, "step": 3388, "time_per_iteration": 2.6128056049346924 }, { "auxiliary_loss_clip": 0.01610407, "auxiliary_loss_mlp": 0.00242753, "balance_loss_clip": 1.29408455, "balance_loss_mlp": 0.21641986, "epoch": 0.20375770329174808, "flos": 19206499351680.0, "grad_norm": 16.864635970179933, "language_loss": 0.79363906, "learning_rate": 3.6915586807999527e-06, "loss": 0.81217074, "num_input_tokens_seen": 73261490, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.26342773, "step": 3389, "time_per_iteration": 2.678162097930908 }, { "auxiliary_loss_clip": 0.01642832, "auxiliary_loss_mlp": 0.00252772, "balance_loss_clip": 1.31946445, "balance_loss_mlp": 0.22486468, "epoch": 0.20381782654441605, "flos": 19391008538880.0, "grad_norm": 19.53617162703793, "language_loss": 0.93168974, "learning_rate": 3.691350858126404e-06, "loss": 0.9506458, "num_input_tokens_seen": 73280180, "router_z_loss_clip": 3.234375, "router_z_loss_mlp": 0.27905273, "step": 3390, "time_per_iteration": 2.561232805252075 }, { "auxiliary_loss_clip": 0.01638442, "auxiliary_loss_mlp": 0.00254923, "balance_loss_clip": 1.31780708, "balance_loss_mlp": 0.22915001, "epoch": 0.203877949797084, "flos": 24827704300800.0, "grad_norm": 9.215542849925786, "language_loss": 0.8014183, "learning_rate": 3.691142971316662e-06, "loss": 0.82035196, "num_input_tokens_seen": 73300680, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.2578125, "step": 3391, "time_per_iteration": 2.7119133472442627 }, { "auxiliary_loss_clip": 0.01650216, "auxiliary_loss_mlp": 0.00255684, "balance_loss_clip": 1.32348835, "balance_loss_mlp": 0.22790802, "epoch": 0.20393807304975198, "flos": 18003707504640.0, "grad_norm": 81.13360015909967, "language_loss": 0.96719253, "learning_rate": 3.6909350203786086e-06, "loss": 0.98625159, "num_input_tokens_seen": 73316760, "router_z_loss_clip": 3.26757812, "router_z_loss_mlp": 0.27819824, "step": 3392, "time_per_iteration": 2.551901340484619 }, { "auxiliary_loss_clip": 0.01628498, "auxiliary_loss_mlp": 0.00260821, "balance_loss_clip": 1.30063677, "balance_loss_mlp": 0.23340249, "epoch": 0.20399819630241997, "flos": 24206988349440.0, "grad_norm": 343.009814005169, "language_loss": 0.85664582, "learning_rate": 3.69072700532013e-06, "loss": 0.875539, "num_input_tokens_seen": 73339385, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.27429199, "step": 3393, "time_per_iteration": 2.6687848567962646 }, { "auxiliary_loss_clip": 0.0164115, "auxiliary_loss_mlp": 0.00229031, "balance_loss_clip": 1.31089234, "balance_loss_mlp": 0.20267418, "epoch": 0.20405831955508794, "flos": 20777124424320.0, "grad_norm": 572.2904127198888, "language_loss": 0.93745506, "learning_rate": 3.6905189261491137e-06, "loss": 0.95615685, "num_input_tokens_seen": 73357235, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.26367188, "step": 3394, "time_per_iteration": 4.007560968399048 }, { "auxiliary_loss_clip": 0.01654198, "auxiliary_loss_mlp": 0.00269416, "balance_loss_clip": 1.32305634, "balance_loss_mlp": 0.24342832, "epoch": 0.2041184428077559, "flos": 15486908325120.0, "grad_norm": 8.836536611023096, "language_loss": 0.93967533, "learning_rate": 3.69031078287345e-06, "loss": 0.95891148, "num_input_tokens_seen": 73374435, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.25976562, "step": 3395, "time_per_iteration": 2.6000635623931885 }, { "auxiliary_loss_clip": 0.01659335, "auxiliary_loss_mlp": 0.00274768, "balance_loss_clip": 1.32388389, "balance_loss_mlp": 0.24811271, "epoch": 0.20417856606042387, "flos": 15588463052160.0, "grad_norm": 35.41051793387967, "language_loss": 0.93267965, "learning_rate": 3.690102575501033e-06, "loss": 0.95202076, "num_input_tokens_seen": 73391025, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.26660156, "step": 3396, "time_per_iteration": 2.5839202404022217 }, { "auxiliary_loss_clip": 0.01647255, "auxiliary_loss_mlp": 0.0025339, "balance_loss_clip": 1.3145597, "balance_loss_mlp": 0.22824919, "epoch": 0.20423868931309183, "flos": 24279348297600.0, "grad_norm": 1.784467159728919, "language_loss": 0.85187602, "learning_rate": 3.6898943040397556e-06, "loss": 0.87088245, "num_input_tokens_seen": 73409270, "router_z_loss_clip": 3.328125, "router_z_loss_mlp": 0.25146484, "step": 3397, "time_per_iteration": 5.472602367401123 }, { "auxiliary_loss_clip": 0.01650926, "auxiliary_loss_mlp": 0.00314472, "balance_loss_clip": 1.31654835, "balance_loss_mlp": 0.28639823, "epoch": 0.2042988125657598, "flos": 18614870438400.0, "grad_norm": 42676.41699550635, "language_loss": 0.96768296, "learning_rate": 3.689685968497518e-06, "loss": 0.98733693, "num_input_tokens_seen": 73425225, "router_z_loss_clip": 3.34179688, "router_z_loss_mlp": 0.28088379, "step": 3398, "time_per_iteration": 2.566080331802368 }, { "auxiliary_loss_clip": 0.01655344, "auxiliary_loss_mlp": 0.00291827, "balance_loss_clip": 1.31904101, "balance_loss_mlp": 0.26390821, "epoch": 0.2043589358184278, "flos": 17851230270720.0, "grad_norm": 221.27310823486246, "language_loss": 0.86389589, "learning_rate": 3.6894775688822186e-06, "loss": 0.88336754, "num_input_tokens_seen": 73440940, "router_z_loss_clip": 3.3671875, "router_z_loss_mlp": 0.27929688, "step": 3399, "time_per_iteration": 2.602679491043091 }, { "auxiliary_loss_clip": 0.0165212, "auxiliary_loss_mlp": 0.00258034, "balance_loss_clip": 1.31164551, "balance_loss_mlp": 0.23223668, "epoch": 0.20441905907109575, "flos": 21435223455360.0, "grad_norm": 2.8727257365162533, "language_loss": 0.83160019, "learning_rate": 3.6892691052017603e-06, "loss": 0.85070169, "num_input_tokens_seen": 73458805, "router_z_loss_clip": 3.40429688, "router_z_loss_mlp": 0.25805664, "step": 3400, "time_per_iteration": 2.6926517486572266 }, { "auxiliary_loss_clip": 0.01653535, "auxiliary_loss_mlp": 0.00266974, "balance_loss_clip": 1.3145498, "balance_loss_mlp": 0.24148686, "epoch": 0.20447918232376372, "flos": 27707703851520.0, "grad_norm": 77.54622684439366, "language_loss": 0.857126, "learning_rate": 3.6890605774640487e-06, "loss": 0.87633109, "num_input_tokens_seen": 73479380, "router_z_loss_clip": 3.39257812, "router_z_loss_mlp": 0.25476074, "step": 3401, "time_per_iteration": 2.7303481101989746 }, { "auxiliary_loss_clip": 0.01649737, "auxiliary_loss_mlp": 0.00305939, "balance_loss_clip": 1.3101964, "balance_loss_mlp": 0.27719805, "epoch": 0.20453930557643168, "flos": 30524214113280.0, "grad_norm": 21.619341932435333, "language_loss": 0.75505161, "learning_rate": 3.688851985676991e-06, "loss": 0.77460837, "num_input_tokens_seen": 73505105, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.28747559, "step": 3402, "time_per_iteration": 4.172479629516602 }, { "auxiliary_loss_clip": 0.0166139, "auxiliary_loss_mlp": 0.00314098, "balance_loss_clip": 1.32149839, "balance_loss_mlp": 0.28529733, "epoch": 0.20459942882909965, "flos": 18987767481600.0, "grad_norm": 3.782809885665757, "language_loss": 0.89789522, "learning_rate": 3.688643329848496e-06, "loss": 0.9176501, "num_input_tokens_seen": 73523700, "router_z_loss_clip": 3.3984375, "router_z_loss_mlp": 0.28808594, "step": 3403, "time_per_iteration": 2.5721402168273926 }, { "auxiliary_loss_clip": 0.01655339, "auxiliary_loss_mlp": 0.00343152, "balance_loss_clip": 1.31937504, "balance_loss_mlp": 0.31634146, "epoch": 0.20465955208176762, "flos": 20339050152960.0, "grad_norm": 3.259970943546447, "language_loss": 0.91736883, "learning_rate": 3.6884346099864772e-06, "loss": 0.93735373, "num_input_tokens_seen": 73542625, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.26831055, "step": 3404, "time_per_iteration": 2.6321499347686768 }, { "auxiliary_loss_clip": 0.01649667, "auxiliary_loss_mlp": 0.00331272, "balance_loss_clip": 1.31059027, "balance_loss_mlp": 0.30409181, "epoch": 0.20471967533443558, "flos": 21251288885760.0, "grad_norm": 4.7994159641283956, "language_loss": 0.92647988, "learning_rate": 3.6882258260988487e-06, "loss": 0.9462893, "num_input_tokens_seen": 73561450, "router_z_loss_clip": 3.39257812, "router_z_loss_mlp": 0.27148438, "step": 3405, "time_per_iteration": 2.656306266784668 }, { "auxiliary_loss_clip": 0.01644965, "auxiliary_loss_mlp": 0.0033092, "balance_loss_clip": 1.30679595, "balance_loss_mlp": 0.30394292, "epoch": 0.20477979858710357, "flos": 14501555458560.0, "grad_norm": 5.208504489349091, "language_loss": 0.91892326, "learning_rate": 3.6880169781935276e-06, "loss": 0.93868208, "num_input_tokens_seen": 73577155, "router_z_loss_clip": 3.38085938, "router_z_loss_mlp": 0.26965332, "step": 3406, "time_per_iteration": 2.631866693496704 }, { "auxiliary_loss_clip": 0.01635769, "auxiliary_loss_mlp": 0.00381906, "balance_loss_clip": 1.30233002, "balance_loss_mlp": 0.35484517, "epoch": 0.20483992183977154, "flos": 11400310085760.0, "grad_norm": 6.3971463467254, "language_loss": 0.76506352, "learning_rate": 3.6878080662784336e-06, "loss": 0.78524029, "num_input_tokens_seen": 73594900, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.27062988, "step": 3407, "time_per_iteration": 2.636451244354248 }, { "auxiliary_loss_clip": 0.01628363, "auxiliary_loss_mlp": 0.00394079, "balance_loss_clip": 1.29657114, "balance_loss_mlp": 0.36774603, "epoch": 0.2049000450924395, "flos": 19060271084160.0, "grad_norm": 35.91726013818885, "language_loss": 0.91911602, "learning_rate": 3.6875990903614886e-06, "loss": 0.93934047, "num_input_tokens_seen": 73613810, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.26330566, "step": 3408, "time_per_iteration": 2.61816668510437 }, { "auxiliary_loss_clip": 0.01634087, "auxiliary_loss_mlp": 0.00354034, "balance_loss_clip": 1.29848421, "balance_loss_mlp": 0.326985, "epoch": 0.20496016834510747, "flos": 14574561851520.0, "grad_norm": 524.091563396857, "language_loss": 0.76707935, "learning_rate": 3.6873900504506166e-06, "loss": 0.78696066, "num_input_tokens_seen": 73631495, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.27050781, "step": 3409, "time_per_iteration": 2.6266613006591797 }, { "auxiliary_loss_clip": 0.01620664, "auxiliary_loss_mlp": 0.00389432, "balance_loss_clip": 1.29065633, "balance_loss_mlp": 0.36132252, "epoch": 0.20502029159777543, "flos": 22126647329280.0, "grad_norm": 2.5733695051172374, "language_loss": 0.85703129, "learning_rate": 3.687180946553745e-06, "loss": 0.8771323, "num_input_tokens_seen": 73652840, "router_z_loss_clip": 3.29882812, "router_z_loss_mlp": 0.28076172, "step": 3410, "time_per_iteration": 2.64896559715271 }, { "auxiliary_loss_clip": 0.01632537, "auxiliary_loss_mlp": 0.00374717, "balance_loss_clip": 1.30517364, "balance_loss_mlp": 0.34739411, "epoch": 0.2050804148504434, "flos": 25367907916800.0, "grad_norm": 2.8881756770158913, "language_loss": 0.81938505, "learning_rate": 3.686971778678803e-06, "loss": 0.83945763, "num_input_tokens_seen": 73672150, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.27331543, "step": 3411, "time_per_iteration": 2.639554977416992 }, { "auxiliary_loss_clip": 0.01630737, "auxiliary_loss_mlp": 0.00359529, "balance_loss_clip": 1.30510497, "balance_loss_mlp": 0.33219397, "epoch": 0.2051405381031114, "flos": 23620171858560.0, "grad_norm": 21.217442003131445, "language_loss": 0.79906112, "learning_rate": 3.686762546833722e-06, "loss": 0.81896377, "num_input_tokens_seen": 73691940, "router_z_loss_clip": 3.2578125, "router_z_loss_mlp": 0.27307129, "step": 3412, "time_per_iteration": 2.658635377883911 }, { "auxiliary_loss_clip": 0.01611325, "auxiliary_loss_mlp": 0.00347606, "balance_loss_clip": 1.28489017, "balance_loss_mlp": 0.32123655, "epoch": 0.20520066135577936, "flos": 19565533745280.0, "grad_norm": 295.601258342676, "language_loss": 0.90155691, "learning_rate": 3.6865532510264362e-06, "loss": 0.92114627, "num_input_tokens_seen": 73709080, "router_z_loss_clip": 3.26757812, "router_z_loss_mlp": 0.2635498, "step": 3413, "time_per_iteration": 2.721214771270752 }, { "auxiliary_loss_clip": 0.01615723, "auxiliary_loss_mlp": 0.00346379, "balance_loss_clip": 1.29449153, "balance_loss_mlp": 0.32073668, "epoch": 0.20526078460844732, "flos": 17676345928320.0, "grad_norm": 5.0327704699043405, "language_loss": 0.89967704, "learning_rate": 3.6863438912648823e-06, "loss": 0.91929805, "num_input_tokens_seen": 73727670, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.25646973, "step": 3414, "time_per_iteration": 2.6033012866973877 }, { "auxiliary_loss_clip": 0.01595973, "auxiliary_loss_mlp": 0.00370183, "balance_loss_clip": 1.27703285, "balance_loss_mlp": 0.34356323, "epoch": 0.2053209078611153, "flos": 21500328856320.0, "grad_norm": 10.63310112067813, "language_loss": 0.86197746, "learning_rate": 3.6861344675569986e-06, "loss": 0.881639, "num_input_tokens_seen": 73747170, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.26635742, "step": 3415, "time_per_iteration": 2.6020355224609375 }, { "auxiliary_loss_clip": 0.01615826, "auxiliary_loss_mlp": 0.00364803, "balance_loss_clip": 1.29708862, "balance_loss_mlp": 0.33854145, "epoch": 0.20538103111378325, "flos": 25663524848640.0, "grad_norm": 2.277293005570793, "language_loss": 0.78654754, "learning_rate": 3.6859249799107275e-06, "loss": 0.80635381, "num_input_tokens_seen": 73767690, "router_z_loss_clip": 3.18945312, "router_z_loss_mlp": 0.26245117, "step": 3416, "time_per_iteration": 2.6249454021453857 }, { "auxiliary_loss_clip": 0.01593092, "auxiliary_loss_mlp": 0.00339258, "balance_loss_clip": 1.27881312, "balance_loss_mlp": 0.31509417, "epoch": 0.20544115436645122, "flos": 23148952312320.0, "grad_norm": 9.862127523498302, "language_loss": 0.85359675, "learning_rate": 3.6857154283340115e-06, "loss": 0.87292033, "num_input_tokens_seen": 73786900, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.24121094, "step": 3417, "time_per_iteration": 2.64970326423645 }, { "auxiliary_loss_clip": 0.01596697, "auxiliary_loss_mlp": 0.003335, "balance_loss_clip": 1.28046095, "balance_loss_mlp": 0.30789363, "epoch": 0.20550127761911918, "flos": 19390433921280.0, "grad_norm": 67.74697726379345, "language_loss": 0.94924796, "learning_rate": 3.685505812834798e-06, "loss": 0.96854997, "num_input_tokens_seen": 73804515, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.25598145, "step": 3418, "time_per_iteration": 2.559479236602783 }, { "auxiliary_loss_clip": 0.01581149, "auxiliary_loss_mlp": 0.00395809, "balance_loss_clip": 1.26559734, "balance_loss_mlp": 0.36650687, "epoch": 0.20556140087178718, "flos": 22893124671360.0, "grad_norm": 20.825222298144745, "language_loss": 0.72692871, "learning_rate": 3.685296133421035e-06, "loss": 0.74669838, "num_input_tokens_seen": 73822910, "router_z_loss_clip": 3.15234375, "router_z_loss_mlp": 0.29296875, "step": 3419, "time_per_iteration": 2.610783815383911 }, { "auxiliary_loss_clip": 0.01599869, "auxiliary_loss_mlp": 0.00373111, "balance_loss_clip": 1.2835573, "balance_loss_mlp": 0.34608638, "epoch": 0.20562152412445514, "flos": 19789652655360.0, "grad_norm": 2.7608884073767523, "language_loss": 0.92610961, "learning_rate": 3.685086390100674e-06, "loss": 0.94583941, "num_input_tokens_seen": 73841160, "router_z_loss_clip": 3.1640625, "router_z_loss_mlp": 0.27026367, "step": 3420, "time_per_iteration": 2.640362501144409 }, { "auxiliary_loss_clip": 0.01603708, "auxiliary_loss_mlp": 0.00360196, "balance_loss_clip": 1.28931236, "balance_loss_mlp": 0.3322649, "epoch": 0.2056816473771231, "flos": 31501989210240.0, "grad_norm": 55.96737164770995, "language_loss": 0.77535105, "learning_rate": 3.684876582881668e-06, "loss": 0.79499006, "num_input_tokens_seen": 73862795, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.27941895, "step": 3421, "time_per_iteration": 2.7539775371551514 }, { "auxiliary_loss_clip": 0.01599289, "auxiliary_loss_mlp": 0.00365219, "balance_loss_clip": 1.2847743, "balance_loss_mlp": 0.34018552, "epoch": 0.20574177062979107, "flos": 23258372117760.0, "grad_norm": 2323.579834697472, "language_loss": 0.7795496, "learning_rate": 3.6846667117719732e-06, "loss": 0.79919469, "num_input_tokens_seen": 73881525, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.25048828, "step": 3422, "time_per_iteration": 2.6117100715637207 }, { "auxiliary_loss_clip": 0.019372, "auxiliary_loss_mlp": 0.00191137, "balance_loss_clip": 1.66549277, "balance_loss_mlp": 0.17807183, "epoch": 0.20580189388245904, "flos": 70312518708480.0, "grad_norm": 0.7490472599166385, "language_loss": 0.55186224, "learning_rate": 3.684456776779548e-06, "loss": 0.57314557, "num_input_tokens_seen": 73937775, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.13085938, "step": 3423, "time_per_iteration": 3.162081003189087 }, { "auxiliary_loss_clip": 0.01596263, "auxiliary_loss_mlp": 0.00394264, "balance_loss_clip": 1.27944934, "balance_loss_mlp": 0.36726344, "epoch": 0.205862017135127, "flos": 30737846252160.0, "grad_norm": 8.66890325869687, "language_loss": 0.79612827, "learning_rate": 3.684246777912353e-06, "loss": 0.81603354, "num_input_tokens_seen": 73958250, "router_z_loss_clip": 3.16992188, "router_z_loss_mlp": 0.26989746, "step": 3424, "time_per_iteration": 2.677460193634033 }, { "auxiliary_loss_clip": 0.01623967, "auxiliary_loss_mlp": 0.0038267, "balance_loss_clip": 1.3021934, "balance_loss_mlp": 0.35659844, "epoch": 0.20592214038779497, "flos": 21324546673920.0, "grad_norm": 2.574371186720462, "language_loss": 0.79404241, "learning_rate": 3.684036715178351e-06, "loss": 0.81410879, "num_input_tokens_seen": 73977775, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.26074219, "step": 3425, "time_per_iteration": 2.614572048187256 }, { "auxiliary_loss_clip": 0.01623601, "auxiliary_loss_mlp": 0.00364461, "balance_loss_clip": 1.30400586, "balance_loss_mlp": 0.3375439, "epoch": 0.20598226364046296, "flos": 22891652213760.0, "grad_norm": 8.314073934996204, "language_loss": 0.93471134, "learning_rate": 3.683826588585508e-06, "loss": 0.95459193, "num_input_tokens_seen": 73996590, "router_z_loss_clip": 3.1953125, "router_z_loss_mlp": 0.2689209, "step": 3426, "time_per_iteration": 2.6819353103637695 }, { "auxiliary_loss_clip": 0.01636193, "auxiliary_loss_mlp": 0.00384769, "balance_loss_clip": 1.31070876, "balance_loss_mlp": 0.35747048, "epoch": 0.20604238689313092, "flos": 23878549365120.0, "grad_norm": 4.675669870223246, "language_loss": 0.82711005, "learning_rate": 3.6836163981417926e-06, "loss": 0.84731966, "num_input_tokens_seen": 74015935, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.27307129, "step": 3427, "time_per_iteration": 2.660818576812744 }, { "auxiliary_loss_clip": 0.01626741, "auxiliary_loss_mlp": 0.00375657, "balance_loss_clip": 1.29882133, "balance_loss_mlp": 0.34904966, "epoch": 0.2061025101457989, "flos": 22491535639680.0, "grad_norm": 12.120466228516872, "language_loss": 0.8118363, "learning_rate": 3.683406143855174e-06, "loss": 0.8318603, "num_input_tokens_seen": 74036575, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.26623535, "step": 3428, "time_per_iteration": 2.716301202774048 }, { "auxiliary_loss_clip": 0.01622234, "auxiliary_loss_mlp": 0.00387349, "balance_loss_clip": 1.29597163, "balance_loss_mlp": 0.35938269, "epoch": 0.20616263339846685, "flos": 22778928357120.0, "grad_norm": 13.043607647072895, "language_loss": 0.79431707, "learning_rate": 3.6831958257336256e-06, "loss": 0.81441295, "num_input_tokens_seen": 74055365, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.27990723, "step": 3429, "time_per_iteration": 2.64923357963562 }, { "auxiliary_loss_clip": 0.01628854, "auxiliary_loss_mlp": 0.00404414, "balance_loss_clip": 1.30584359, "balance_loss_mlp": 0.37587526, "epoch": 0.20622275665113482, "flos": 20882198684160.0, "grad_norm": 43.03457891061135, "language_loss": 0.90849483, "learning_rate": 3.6829854437851237e-06, "loss": 0.92882746, "num_input_tokens_seen": 74074875, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.28540039, "step": 3430, "time_per_iteration": 2.714949131011963 }, { "auxiliary_loss_clip": 0.01614169, "auxiliary_loss_mlp": 0.00399461, "balance_loss_clip": 1.28958392, "balance_loss_mlp": 0.37079161, "epoch": 0.20628287990380278, "flos": 19354415558400.0, "grad_norm": 1.6863102550098865, "language_loss": 0.75216895, "learning_rate": 3.6827749980176444e-06, "loss": 0.77230525, "num_input_tokens_seen": 74094505, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.28674316, "step": 3431, "time_per_iteration": 2.6162221431732178 }, { "auxiliary_loss_clip": 0.01979379, "auxiliary_loss_mlp": 0.00135157, "balance_loss_clip": 1.68527603, "balance_loss_mlp": 0.1239995, "epoch": 0.20634300315647078, "flos": 71517932248320.0, "grad_norm": 0.7982807750735834, "language_loss": 0.60388893, "learning_rate": 3.6825644884391693e-06, "loss": 0.62503433, "num_input_tokens_seen": 74158500, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.11181641, "step": 3432, "time_per_iteration": 3.2824318408966064 }, { "auxiliary_loss_clip": 0.01630191, "auxiliary_loss_mlp": 0.00403265, "balance_loss_clip": 1.30411184, "balance_loss_mlp": 0.3750959, "epoch": 0.20640312640913874, "flos": 21723944976000.0, "grad_norm": 3.6011748710983533, "language_loss": 0.78365314, "learning_rate": 3.682353915057679e-06, "loss": 0.80398774, "num_input_tokens_seen": 74176685, "router_z_loss_clip": 3.2578125, "router_z_loss_mlp": 0.28198242, "step": 3433, "time_per_iteration": 2.6188950538635254 }, { "auxiliary_loss_clip": 0.01618594, "auxiliary_loss_mlp": 0.00364079, "balance_loss_clip": 1.29421425, "balance_loss_mlp": 0.33627987, "epoch": 0.2064632496618067, "flos": 20554621626240.0, "grad_norm": 10.199312787621931, "language_loss": 0.91035402, "learning_rate": 3.6821432778811604e-06, "loss": 0.93018079, "num_input_tokens_seen": 74194935, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.27783203, "step": 3434, "time_per_iteration": 2.636929988861084 }, { "auxiliary_loss_clip": 0.01624412, "auxiliary_loss_mlp": 0.00399533, "balance_loss_clip": 1.29842997, "balance_loss_mlp": 0.37156606, "epoch": 0.20652337291447467, "flos": 29823273135360.0, "grad_norm": 7.4466518976917895, "language_loss": 0.75249171, "learning_rate": 3.6819325769176004e-06, "loss": 0.77273118, "num_input_tokens_seen": 74215400, "router_z_loss_clip": 3.2578125, "router_z_loss_mlp": 0.27990723, "step": 3435, "time_per_iteration": 2.695666790008545 }, { "auxiliary_loss_clip": 0.01615234, "auxiliary_loss_mlp": 0.00366358, "balance_loss_clip": 1.29581654, "balance_loss_mlp": 0.33860642, "epoch": 0.20658349616714264, "flos": 26213640618240.0, "grad_norm": 3.8371164130045314, "language_loss": 0.95477957, "learning_rate": 3.681721812174988e-06, "loss": 0.97459549, "num_input_tokens_seen": 74234090, "router_z_loss_clip": 3.1953125, "router_z_loss_mlp": 0.27758789, "step": 3436, "time_per_iteration": 2.6524510383605957 }, { "auxiliary_loss_clip": 0.01607378, "auxiliary_loss_mlp": 0.00348222, "balance_loss_clip": 1.28746057, "balance_loss_mlp": 0.32128114, "epoch": 0.2066436194198106, "flos": 25994370044160.0, "grad_norm": 3.150885301141018, "language_loss": 0.83754802, "learning_rate": 3.6815109836613163e-06, "loss": 0.857104, "num_input_tokens_seen": 74253345, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.26940918, "step": 3437, "time_per_iteration": 4.058243274688721 }, { "auxiliary_loss_clip": 0.0158777, "auxiliary_loss_mlp": 0.00348412, "balance_loss_clip": 1.27042508, "balance_loss_mlp": 0.31989723, "epoch": 0.20670374267247857, "flos": 21361067827200.0, "grad_norm": 14.743508904785255, "language_loss": 0.84167385, "learning_rate": 3.6813000913845795e-06, "loss": 0.86103565, "num_input_tokens_seen": 74271615, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.28527832, "step": 3438, "time_per_iteration": 2.5773627758026123 }, { "auxiliary_loss_clip": 0.0195629, "auxiliary_loss_mlp": 0.0013994, "balance_loss_clip": 1.65516067, "balance_loss_mlp": 0.12744719, "epoch": 0.20676386592514656, "flos": 66383281952640.0, "grad_norm": 0.8067213537012692, "language_loss": 0.66886163, "learning_rate": 3.6810891353527747e-06, "loss": 0.68982399, "num_input_tokens_seen": 74331390, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.125, "step": 3439, "time_per_iteration": 4.422634840011597 }, { "auxiliary_loss_clip": 0.01577008, "auxiliary_loss_mlp": 0.00292686, "balance_loss_clip": 1.26101398, "balance_loss_mlp": 0.26352757, "epoch": 0.20682398917781453, "flos": 17274577328640.0, "grad_norm": 14.549635434568739, "language_loss": 0.915043, "learning_rate": 3.6808781155739014e-06, "loss": 0.9337399, "num_input_tokens_seen": 74347335, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.29150391, "step": 3440, "time_per_iteration": 4.1038658618927 }, { "auxiliary_loss_clip": 0.01561346, "auxiliary_loss_mlp": 0.00273275, "balance_loss_clip": 1.25119352, "balance_loss_mlp": 0.24517736, "epoch": 0.2068841124304825, "flos": 18077288515200.0, "grad_norm": 9.99898717846034, "language_loss": 0.92073292, "learning_rate": 3.6806670320559614e-06, "loss": 0.93907917, "num_input_tokens_seen": 74366310, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.28088379, "step": 3441, "time_per_iteration": 2.6067090034484863 }, { "auxiliary_loss_clip": 0.01559821, "auxiliary_loss_mlp": 0.00252097, "balance_loss_clip": 1.25086105, "balance_loss_mlp": 0.22430936, "epoch": 0.20694423568315046, "flos": 27347017432320.0, "grad_norm": 6.950946598884631, "language_loss": 0.91382182, "learning_rate": 3.680455884806959e-06, "loss": 0.93194097, "num_input_tokens_seen": 74387100, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.27807617, "step": 3442, "time_per_iteration": 2.6761128902435303 }, { "auxiliary_loss_clip": 0.01557453, "auxiliary_loss_mlp": 0.00261859, "balance_loss_clip": 1.24520755, "balance_loss_mlp": 0.2311033, "epoch": 0.20700435893581842, "flos": 20229845829120.0, "grad_norm": 4.806985466567603, "language_loss": 0.79234052, "learning_rate": 3.6802446738349014e-06, "loss": 0.81053364, "num_input_tokens_seen": 74404460, "router_z_loss_clip": 3.12109375, "router_z_loss_mlp": 0.30737305, "step": 3443, "time_per_iteration": 2.657576560974121 }, { "auxiliary_loss_clip": 0.0154284, "auxiliary_loss_mlp": 0.00263485, "balance_loss_clip": 1.24033225, "balance_loss_mlp": 0.23775993, "epoch": 0.2070644821884864, "flos": 20631111638400.0, "grad_norm": 6.095224429213428, "language_loss": 0.92028964, "learning_rate": 3.680033399147797e-06, "loss": 0.93835294, "num_input_tokens_seen": 74423790, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.25769043, "step": 3444, "time_per_iteration": 4.159827470779419 }, { "auxiliary_loss_clip": 0.01824833, "auxiliary_loss_mlp": 0.00117273, "balance_loss_clip": 1.57938385, "balance_loss_mlp": 0.10520913, "epoch": 0.20712460544115438, "flos": 65941077617280.0, "grad_norm": 1.299388323095974, "language_loss": 0.56714767, "learning_rate": 3.6798220607536585e-06, "loss": 0.58656865, "num_input_tokens_seen": 74488130, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.12060547, "step": 3445, "time_per_iteration": 3.089937448501587 }, { "auxiliary_loss_clip": 0.01552392, "auxiliary_loss_mlp": 0.00242712, "balance_loss_clip": 1.25052214, "balance_loss_mlp": 0.21496055, "epoch": 0.20718472869382235, "flos": 19425734012160.0, "grad_norm": 4.9941269192228015, "language_loss": 0.84043962, "learning_rate": 3.6796106586604987e-06, "loss": 0.85839069, "num_input_tokens_seen": 74506720, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.27770996, "step": 3446, "time_per_iteration": 2.657238721847534 }, { "auxiliary_loss_clip": 0.01539615, "auxiliary_loss_mlp": 0.00317822, "balance_loss_clip": 1.23502886, "balance_loss_mlp": 0.28670806, "epoch": 0.2072448519464903, "flos": 24499049834880.0, "grad_norm": 15.051926211485783, "language_loss": 0.73498869, "learning_rate": 3.679399192876334e-06, "loss": 0.75356311, "num_input_tokens_seen": 74525330, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.31079102, "step": 3447, "time_per_iteration": 2.7353036403656006 }, { "auxiliary_loss_clip": 0.01547214, "auxiliary_loss_mlp": 0.00263206, "balance_loss_clip": 1.24945951, "balance_loss_mlp": 0.23349863, "epoch": 0.20730497519915828, "flos": 23075694524160.0, "grad_norm": 4.123866600566712, "language_loss": 0.93480545, "learning_rate": 3.679187663409184e-06, "loss": 0.95290959, "num_input_tokens_seen": 74544535, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.29736328, "step": 3448, "time_per_iteration": 2.630033493041992 }, { "auxiliary_loss_clip": 0.0153343, "auxiliary_loss_mlp": 0.00260254, "balance_loss_clip": 1.23837566, "balance_loss_mlp": 0.22890174, "epoch": 0.20736509845182624, "flos": 21069042255360.0, "grad_norm": 3.9479438323139466, "language_loss": 0.8394224, "learning_rate": 3.6789760702670696e-06, "loss": 0.85735929, "num_input_tokens_seen": 74562300, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.31359863, "step": 3449, "time_per_iteration": 2.6151719093322754 }, { "auxiliary_loss_clip": 0.01531342, "auxiliary_loss_mlp": 0.0028821, "balance_loss_clip": 1.23594165, "balance_loss_mlp": 0.25478372, "epoch": 0.2074252217044942, "flos": 17633288499840.0, "grad_norm": 3.41647008663153, "language_loss": 0.86091793, "learning_rate": 3.6787644134580134e-06, "loss": 0.87911344, "num_input_tokens_seen": 74580080, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.33422852, "step": 3450, "time_per_iteration": 2.608027219772339 }, { "auxiliary_loss_clip": 0.01530017, "auxiliary_loss_mlp": 0.00251156, "balance_loss_clip": 1.23736644, "balance_loss_mlp": 0.22041172, "epoch": 0.20748534495716217, "flos": 23546985897600.0, "grad_norm": 290.4984777867855, "language_loss": 0.88355505, "learning_rate": 3.6785526929900436e-06, "loss": 0.90136671, "num_input_tokens_seen": 74598980, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.30749512, "step": 3451, "time_per_iteration": 2.6280319690704346 }, { "auxiliary_loss_clip": 0.0172223, "auxiliary_loss_mlp": 0.00063115, "balance_loss_clip": 1.51969755, "balance_loss_mlp": 0.05143248, "epoch": 0.20754546820983016, "flos": 52252935598080.0, "grad_norm": 0.7779057981045974, "language_loss": 0.56402302, "learning_rate": 3.6783409088711875e-06, "loss": 0.58187652, "num_input_tokens_seen": 74655275, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.11669922, "step": 3452, "time_per_iteration": 3.044283866882324 }, { "auxiliary_loss_clip": 0.01530528, "auxiliary_loss_mlp": 0.00260037, "balance_loss_clip": 1.2421329, "balance_loss_mlp": 0.22770774, "epoch": 0.20760559146249813, "flos": 20412379768320.0, "grad_norm": 7.9543338709598315, "language_loss": 0.96564621, "learning_rate": 3.6781290611094755e-06, "loss": 0.98355186, "num_input_tokens_seen": 74674560, "router_z_loss_clip": 2.88476562, "router_z_loss_mlp": 0.32324219, "step": 3453, "time_per_iteration": 2.658588171005249 }, { "auxiliary_loss_clip": 0.01541158, "auxiliary_loss_mlp": 0.00285819, "balance_loss_clip": 1.25256503, "balance_loss_mlp": 0.25239253, "epoch": 0.2076657147151661, "flos": 23186012169600.0, "grad_norm": 56.34865306124164, "language_loss": 0.86570168, "learning_rate": 3.6779171497129407e-06, "loss": 0.88397145, "num_input_tokens_seen": 74694500, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.33422852, "step": 3454, "time_per_iteration": 2.746335983276367 }, { "auxiliary_loss_clip": 0.01527543, "auxiliary_loss_mlp": 0.00214838, "balance_loss_clip": 1.24100876, "balance_loss_mlp": 0.18546452, "epoch": 0.20772583796783406, "flos": 18293219124480.0, "grad_norm": 141.10677190198854, "language_loss": 0.87963068, "learning_rate": 3.6777051746896202e-06, "loss": 0.89705443, "num_input_tokens_seen": 74710485, "router_z_loss_clip": 2.86328125, "router_z_loss_mlp": 0.2935791, "step": 3455, "time_per_iteration": 2.657067060470581 }, { "auxiliary_loss_clip": 0.01529163, "auxiliary_loss_mlp": 0.00242774, "balance_loss_clip": 1.24341655, "balance_loss_mlp": 0.21161285, "epoch": 0.20778596122050202, "flos": 17602800831360.0, "grad_norm": 8.093492405228416, "language_loss": 0.8911888, "learning_rate": 3.6774931360475516e-06, "loss": 0.90890825, "num_input_tokens_seen": 74727450, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.31201172, "step": 3456, "time_per_iteration": 2.607266902923584 }, { "auxiliary_loss_clip": 0.01522489, "auxiliary_loss_mlp": 0.00255058, "balance_loss_clip": 1.23261213, "balance_loss_mlp": 0.21865159, "epoch": 0.20784608447317, "flos": 23805578885760.0, "grad_norm": 1.9875749830715173, "language_loss": 0.86231649, "learning_rate": 3.6772810337947745e-06, "loss": 0.88009191, "num_input_tokens_seen": 74746725, "router_z_loss_clip": 2.90234375, "router_z_loss_mlp": 0.36401367, "step": 3457, "time_per_iteration": 2.7015886306762695 }, { "auxiliary_loss_clip": 0.0151803, "auxiliary_loss_mlp": 0.00239295, "balance_loss_clip": 1.23102736, "balance_loss_mlp": 0.20517752, "epoch": 0.20790620772583795, "flos": 17639286071040.0, "grad_norm": 4.908145904256742, "language_loss": 0.92116439, "learning_rate": 3.677068867939333e-06, "loss": 0.93873763, "num_input_tokens_seen": 74765255, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.34082031, "step": 3458, "time_per_iteration": 2.6327426433563232 }, { "auxiliary_loss_clip": 0.01522829, "auxiliary_loss_mlp": 0.0023263, "balance_loss_clip": 1.23837709, "balance_loss_mlp": 0.20120651, "epoch": 0.20796633097850595, "flos": 27673481168640.0, "grad_norm": 11.016511292540851, "language_loss": 0.82655764, "learning_rate": 3.676856638489272e-06, "loss": 0.84411216, "num_input_tokens_seen": 74785710, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.31396484, "step": 3459, "time_per_iteration": 2.7089052200317383 }, { "auxiliary_loss_clip": 0.01523641, "auxiliary_loss_mlp": 0.00223303, "balance_loss_clip": 1.23591876, "balance_loss_mlp": 0.19259456, "epoch": 0.2080264542311739, "flos": 19245606284160.0, "grad_norm": 12.664261057694537, "language_loss": 0.84410602, "learning_rate": 3.6766443454526382e-06, "loss": 0.86157548, "num_input_tokens_seen": 74804490, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.30712891, "step": 3460, "time_per_iteration": 2.6089885234832764 }, { "auxiliary_loss_clip": 0.01502653, "auxiliary_loss_mlp": 0.0023224, "balance_loss_clip": 1.21884668, "balance_loss_mlp": 0.20231888, "epoch": 0.20808657748384188, "flos": 27525924097920.0, "grad_norm": 4.47772688058867, "language_loss": 0.80849522, "learning_rate": 3.6764319888374836e-06, "loss": 0.82584417, "num_input_tokens_seen": 74826340, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.29907227, "step": 3461, "time_per_iteration": 2.6895523071289062 }, { "auxiliary_loss_clip": 0.01507396, "auxiliary_loss_mlp": 0.00262131, "balance_loss_clip": 1.21937275, "balance_loss_mlp": 0.22863282, "epoch": 0.20814670073650984, "flos": 26906931999360.0, "grad_norm": 13.538259324522214, "language_loss": 0.96118182, "learning_rate": 3.6762195686518604e-06, "loss": 0.97887707, "num_input_tokens_seen": 74844960, "router_z_loss_clip": 2.88476562, "router_z_loss_mlp": 0.3347168, "step": 3462, "time_per_iteration": 2.6575088500976562 }, { "auxiliary_loss_clip": 0.01658133, "auxiliary_loss_mlp": 0.00098021, "balance_loss_clip": 1.45861745, "balance_loss_mlp": 0.0837159, "epoch": 0.2082068239891778, "flos": 70175735717760.0, "grad_norm": 0.7468089376822732, "language_loss": 0.58884764, "learning_rate": 3.6760070849038226e-06, "loss": 0.60640913, "num_input_tokens_seen": 74909075, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.14257812, "step": 3463, "time_per_iteration": 3.221330165863037 }, { "auxiliary_loss_clip": 0.01494031, "auxiliary_loss_mlp": 0.00250351, "balance_loss_clip": 1.2094084, "balance_loss_mlp": 0.21816432, "epoch": 0.20826694724184577, "flos": 24608074590720.0, "grad_norm": 142.2321810307915, "language_loss": 0.77303743, "learning_rate": 3.675794537601429e-06, "loss": 0.79048133, "num_input_tokens_seen": 74928125, "router_z_loss_clip": 2.84570312, "router_z_loss_mlp": 0.32177734, "step": 3464, "time_per_iteration": 2.6308348178863525 }, { "auxiliary_loss_clip": 0.01515384, "auxiliary_loss_mlp": 0.00247369, "balance_loss_clip": 1.22611904, "balance_loss_mlp": 0.21780525, "epoch": 0.20832707049451377, "flos": 12892829034240.0, "grad_norm": 14.905240692430773, "language_loss": 0.91202343, "learning_rate": 3.6755819267527373e-06, "loss": 0.92965096, "num_input_tokens_seen": 74945090, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.29614258, "step": 3465, "time_per_iteration": 2.6086678504943848 }, { "auxiliary_loss_clip": 0.01502501, "auxiliary_loss_mlp": 0.00234264, "balance_loss_clip": 1.21559024, "balance_loss_mlp": 0.20396104, "epoch": 0.20838719374718173, "flos": 22198827709440.0, "grad_norm": 10.340112984963907, "language_loss": 0.90856552, "learning_rate": 3.6753692523658113e-06, "loss": 0.92593324, "num_input_tokens_seen": 74963630, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.30322266, "step": 3466, "time_per_iteration": 2.7197132110595703 }, { "auxiliary_loss_clip": 0.01494578, "auxiliary_loss_mlp": 0.00205797, "balance_loss_clip": 1.20628679, "balance_loss_mlp": 0.17784223, "epoch": 0.2084473169998497, "flos": 15158648908800.0, "grad_norm": 2.482656317275437, "language_loss": 0.89261448, "learning_rate": 3.675156514448716e-06, "loss": 0.90961826, "num_input_tokens_seen": 74981875, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.27941895, "step": 3467, "time_per_iteration": 2.646552801132202 }, { "auxiliary_loss_clip": 0.01498714, "auxiliary_loss_mlp": 0.00202872, "balance_loss_clip": 1.21592093, "balance_loss_mlp": 0.17689598, "epoch": 0.20850744025251766, "flos": 17456788045440.0, "grad_norm": 4.953790343819207, "language_loss": 0.88777995, "learning_rate": 3.674943713009518e-06, "loss": 0.90479583, "num_input_tokens_seen": 74999155, "router_z_loss_clip": 2.82617188, "router_z_loss_mlp": 0.25976562, "step": 3468, "time_per_iteration": 2.5931687355041504 }, { "auxiliary_loss_clip": 0.01495698, "auxiliary_loss_mlp": 0.00232745, "balance_loss_clip": 1.20579147, "balance_loss_mlp": 0.20196518, "epoch": 0.20856756350518563, "flos": 25698968593920.0, "grad_norm": 2.4463649288419425, "language_loss": 0.96888745, "learning_rate": 3.6747308480562856e-06, "loss": 0.98617196, "num_input_tokens_seen": 75017850, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.30761719, "step": 3469, "time_per_iteration": 2.6519081592559814 }, { "auxiliary_loss_clip": 0.01501217, "auxiliary_loss_mlp": 0.00197717, "balance_loss_clip": 1.20796692, "balance_loss_mlp": 0.16913083, "epoch": 0.2086276867578536, "flos": 37889060970240.0, "grad_norm": 51.27594386544996, "language_loss": 0.83493388, "learning_rate": 3.674517919597092e-06, "loss": 0.85192323, "num_input_tokens_seen": 75039270, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.28588867, "step": 3470, "time_per_iteration": 2.808732748031616 }, { "auxiliary_loss_clip": 0.01490091, "auxiliary_loss_mlp": 0.00212821, "balance_loss_clip": 1.20005763, "balance_loss_mlp": 0.18525934, "epoch": 0.20868781001052156, "flos": 25557049958400.0, "grad_norm": 6.610273730900486, "language_loss": 0.81131488, "learning_rate": 3.674304927640011e-06, "loss": 0.82834399, "num_input_tokens_seen": 75059350, "router_z_loss_clip": 2.90039062, "router_z_loss_mlp": 0.27587891, "step": 3471, "time_per_iteration": 2.74190092086792 }, { "auxiliary_loss_clip": 0.01498807, "auxiliary_loss_mlp": 0.00228271, "balance_loss_clip": 1.19936085, "balance_loss_mlp": 0.19819482, "epoch": 0.20874793326318955, "flos": 27529192235520.0, "grad_norm": 8.704251116889646, "language_loss": 0.84175128, "learning_rate": 3.67409187219312e-06, "loss": 0.85902202, "num_input_tokens_seen": 75080150, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.30078125, "step": 3472, "time_per_iteration": 2.7337372303009033 }, { "auxiliary_loss_clip": 0.01497449, "auxiliary_loss_mlp": 0.00221215, "balance_loss_clip": 1.20417047, "balance_loss_mlp": 0.19348675, "epoch": 0.20880805651585752, "flos": 18548795370240.0, "grad_norm": 4.007316835636918, "language_loss": 0.92276502, "learning_rate": 3.6738787532644966e-06, "loss": 0.93995166, "num_input_tokens_seen": 75097920, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.27746582, "step": 3473, "time_per_iteration": 2.625120162963867 }, { "auxiliary_loss_clip": 0.01615163, "auxiliary_loss_mlp": 0.00078627, "balance_loss_clip": 1.41480267, "balance_loss_mlp": 0.06923302, "epoch": 0.20886817976852548, "flos": 65946644225280.0, "grad_norm": 0.8768414329438375, "language_loss": 0.63612819, "learning_rate": 3.6736655708622235e-06, "loss": 0.65306604, "num_input_tokens_seen": 75152410, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.09375, "step": 3474, "time_per_iteration": 3.0562610626220703 }, { "auxiliary_loss_clip": 0.01485888, "auxiliary_loss_mlp": 0.00216708, "balance_loss_clip": 1.19228137, "balance_loss_mlp": 0.18945722, "epoch": 0.20892830302119345, "flos": 36539178929280.0, "grad_norm": 7.946646497610969, "language_loss": 0.79150808, "learning_rate": 3.6734523249943844e-06, "loss": 0.80853403, "num_input_tokens_seen": 75173265, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.27270508, "step": 3475, "time_per_iteration": 2.773393392562866 }, { "auxiliary_loss_clip": 0.01480855, "auxiliary_loss_mlp": 0.00243083, "balance_loss_clip": 1.18371022, "balance_loss_mlp": 0.21478267, "epoch": 0.2089884262738614, "flos": 20956749361920.0, "grad_norm": 4.279477986365849, "language_loss": 0.77272713, "learning_rate": 3.673239015669065e-06, "loss": 0.78996652, "num_input_tokens_seen": 75193640, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.28295898, "step": 3476, "time_per_iteration": 2.654015302658081 }, { "auxiliary_loss_clip": 0.01469564, "auxiliary_loss_mlp": 0.00189927, "balance_loss_clip": 1.17942548, "balance_loss_mlp": 0.1625091, "epoch": 0.20904854952652938, "flos": 22784028088320.0, "grad_norm": 22.029054802019886, "language_loss": 0.97070217, "learning_rate": 3.6730256428943544e-06, "loss": 0.98729706, "num_input_tokens_seen": 75212545, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.27404785, "step": 3477, "time_per_iteration": 2.597456932067871 }, { "auxiliary_loss_clip": 0.01482688, "auxiliary_loss_mlp": 0.00216822, "balance_loss_clip": 1.18844676, "balance_loss_mlp": 0.18969023, "epoch": 0.20910867277919734, "flos": 27303277645440.0, "grad_norm": 35.464782932038545, "language_loss": 0.76445377, "learning_rate": 3.672812206678344e-06, "loss": 0.78144884, "num_input_tokens_seen": 75230865, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.27124023, "step": 3478, "time_per_iteration": 2.6702702045440674 }, { "auxiliary_loss_clip": 0.01482825, "auxiliary_loss_mlp": 0.00218926, "balance_loss_clip": 1.18798089, "balance_loss_mlp": 0.19162694, "epoch": 0.20916879603186533, "flos": 14319237000960.0, "grad_norm": 5.148566983734678, "language_loss": 0.91909617, "learning_rate": 3.672598707029127e-06, "loss": 0.93611377, "num_input_tokens_seen": 75248285, "router_z_loss_clip": 2.94726562, "router_z_loss_mlp": 0.27294922, "step": 3479, "time_per_iteration": 3.9548451900482178 }, { "auxiliary_loss_clip": 0.01477847, "auxiliary_loss_mlp": 0.00253086, "balance_loss_clip": 1.18404937, "balance_loss_mlp": 0.22514299, "epoch": 0.2092289192845333, "flos": 22273019251200.0, "grad_norm": 8.477505895438382, "language_loss": 0.84408134, "learning_rate": 3.6723851439548003e-06, "loss": 0.86139071, "num_input_tokens_seen": 75266310, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.27954102, "step": 3480, "time_per_iteration": 2.6153805255889893 }, { "auxiliary_loss_clip": 0.01480879, "auxiliary_loss_mlp": 0.00200692, "balance_loss_clip": 1.18732297, "balance_loss_mlp": 0.17454913, "epoch": 0.20928904253720126, "flos": 14830712714880.0, "grad_norm": 164.24451642077236, "language_loss": 0.83763111, "learning_rate": 3.67217151746346e-06, "loss": 0.85444671, "num_input_tokens_seen": 75284175, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.26171875, "step": 3481, "time_per_iteration": 2.6292145252227783 }, { "auxiliary_loss_clip": 0.01480962, "auxiliary_loss_mlp": 0.00195333, "balance_loss_clip": 1.18230557, "balance_loss_mlp": 0.16865417, "epoch": 0.20934916578986923, "flos": 23259162216960.0, "grad_norm": 26.412597338915898, "language_loss": 0.91040158, "learning_rate": 3.671957827563209e-06, "loss": 0.92716455, "num_input_tokens_seen": 75303465, "router_z_loss_clip": 2.98828125, "router_z_loss_mlp": 0.26672363, "step": 3482, "time_per_iteration": 5.551847696304321 }, { "auxiliary_loss_clip": 0.01490848, "auxiliary_loss_mlp": 0.00196787, "balance_loss_clip": 1.19272411, "balance_loss_mlp": 0.17132419, "epoch": 0.2094092890425372, "flos": 32014398677760.0, "grad_norm": 14.797961797233237, "language_loss": 0.78600037, "learning_rate": 3.6717440742621494e-06, "loss": 0.80287671, "num_input_tokens_seen": 75325290, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.25463867, "step": 3483, "time_per_iteration": 2.7233521938323975 }, { "auxiliary_loss_clip": 0.01472007, "auxiliary_loss_mlp": 0.00244405, "balance_loss_clip": 1.1781522, "balance_loss_mlp": 0.21782085, "epoch": 0.20946941229520516, "flos": 20010647082240.0, "grad_norm": 5.749147764887854, "language_loss": 0.82755196, "learning_rate": 3.6715302575683865e-06, "loss": 0.84471601, "num_input_tokens_seen": 75343895, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.26586914, "step": 3484, "time_per_iteration": 2.6083261966705322 }, { "auxiliary_loss_clip": 0.01480855, "auxiliary_loss_mlp": 0.00253267, "balance_loss_clip": 1.1821003, "balance_loss_mlp": 0.22558701, "epoch": 0.20952953554787315, "flos": 30740072895360.0, "grad_norm": 11.361404794956952, "language_loss": 0.75073242, "learning_rate": 3.6713163774900292e-06, "loss": 0.76807368, "num_input_tokens_seen": 75367100, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.27685547, "step": 3485, "time_per_iteration": 2.71386456489563 }, { "auxiliary_loss_clip": 0.01492268, "auxiliary_loss_mlp": 0.00230229, "balance_loss_clip": 1.19099259, "balance_loss_mlp": 0.20083177, "epoch": 0.20958965880054112, "flos": 27049209770880.0, "grad_norm": 29.6434983057903, "language_loss": 0.88840181, "learning_rate": 3.6711024340351875e-06, "loss": 0.90562671, "num_input_tokens_seen": 75389925, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.29418945, "step": 3486, "time_per_iteration": 2.6766676902770996 }, { "auxiliary_loss_clip": 0.01487948, "auxiliary_loss_mlp": 0.00230185, "balance_loss_clip": 1.18558085, "balance_loss_mlp": 0.20228976, "epoch": 0.20964978205320908, "flos": 34204123589760.0, "grad_norm": 31.46009411164163, "language_loss": 0.92274857, "learning_rate": 3.6708884272119737e-06, "loss": 0.93992984, "num_input_tokens_seen": 75408575, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.2791748, "step": 3487, "time_per_iteration": 4.210130453109741 }, { "auxiliary_loss_clip": 0.01487901, "auxiliary_loss_mlp": 0.0020866, "balance_loss_clip": 1.18521476, "balance_loss_mlp": 0.18076494, "epoch": 0.20970990530587705, "flos": 23477391296640.0, "grad_norm": 7.854326500418065, "language_loss": 0.81484342, "learning_rate": 3.670674357028504e-06, "loss": 0.83180904, "num_input_tokens_seen": 75427155, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.27893066, "step": 3488, "time_per_iteration": 2.6238105297088623 }, { "auxiliary_loss_clip": 0.01509061, "auxiliary_loss_mlp": 0.00227107, "balance_loss_clip": 1.20090771, "balance_loss_mlp": 0.19956946, "epoch": 0.209770028558545, "flos": 18551452976640.0, "grad_norm": 20.4358991811247, "language_loss": 0.87407899, "learning_rate": 3.6704602234928945e-06, "loss": 0.89144075, "num_input_tokens_seen": 75444450, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.27563477, "step": 3489, "time_per_iteration": 2.593170166015625 }, { "auxiliary_loss_clip": 0.01501359, "auxiliary_loss_mlp": 0.00222404, "balance_loss_clip": 1.19152629, "balance_loss_mlp": 0.19379413, "epoch": 0.20983015181121298, "flos": 21617003208960.0, "grad_norm": 8.804924387027768, "language_loss": 0.80008614, "learning_rate": 3.670246026613266e-06, "loss": 0.8173238, "num_input_tokens_seen": 75462625, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.28613281, "step": 3490, "time_per_iteration": 2.588587522506714 }, { "auxiliary_loss_clip": 0.01490922, "auxiliary_loss_mlp": 0.00185676, "balance_loss_clip": 1.19164777, "balance_loss_mlp": 0.15984355, "epoch": 0.20989027506388094, "flos": 16614718531200.0, "grad_norm": 26.151648460501747, "language_loss": 0.7758534, "learning_rate": 3.6700317663977415e-06, "loss": 0.79261935, "num_input_tokens_seen": 75480640, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.25817871, "step": 3491, "time_per_iteration": 2.5811173915863037 }, { "auxiliary_loss_clip": 0.01505034, "auxiliary_loss_mlp": 0.00197941, "balance_loss_clip": 1.19101465, "balance_loss_mlp": 0.16840082, "epoch": 0.20995039831654894, "flos": 23216823060480.0, "grad_norm": 61.74098755375713, "language_loss": 0.87104452, "learning_rate": 3.669817442854444e-06, "loss": 0.88807428, "num_input_tokens_seen": 75494900, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.29504395, "step": 3492, "time_per_iteration": 2.651395082473755 }, { "auxiliary_loss_clip": 0.01508371, "auxiliary_loss_mlp": 0.00210292, "balance_loss_clip": 1.19839001, "balance_loss_mlp": 0.1828979, "epoch": 0.2100105215692169, "flos": 18147493647360.0, "grad_norm": 1.9903957774159589, "language_loss": 0.9237057, "learning_rate": 3.669603055991502e-06, "loss": 0.94089234, "num_input_tokens_seen": 75513370, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.27404785, "step": 3493, "time_per_iteration": 2.7487635612487793 }, { "auxiliary_loss_clip": 0.01533556, "auxiliary_loss_mlp": 0.00204379, "balance_loss_clip": 1.21669531, "balance_loss_mlp": 0.1763171, "epoch": 0.21007064482188487, "flos": 15961611490560.0, "grad_norm": 18.04546089057488, "language_loss": 0.78289151, "learning_rate": 3.6693886058170455e-06, "loss": 0.80027086, "num_input_tokens_seen": 75532480, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.28051758, "step": 3494, "time_per_iteration": 2.6804728507995605 }, { "auxiliary_loss_clip": 0.01544451, "auxiliary_loss_mlp": 0.00245147, "balance_loss_clip": 1.22331512, "balance_loss_mlp": 0.21317488, "epoch": 0.21013076807455283, "flos": 32234315696640.0, "grad_norm": 39.84490826542003, "language_loss": 0.86445463, "learning_rate": 3.6691740923392053e-06, "loss": 0.88235068, "num_input_tokens_seen": 75552745, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.32006836, "step": 3495, "time_per_iteration": 2.79950213432312 }, { "auxiliary_loss_clip": 0.0152981, "auxiliary_loss_mlp": 0.00221033, "balance_loss_clip": 1.21053052, "balance_loss_mlp": 0.19322111, "epoch": 0.2101908913272208, "flos": 23696625957120.0, "grad_norm": 24.408553154962004, "language_loss": 0.83914042, "learning_rate": 3.668959515566116e-06, "loss": 0.8566488, "num_input_tokens_seen": 75574355, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.27807617, "step": 3496, "time_per_iteration": 2.6924169063568115 }, { "auxiliary_loss_clip": 0.01535553, "auxiliary_loss_mlp": 0.00231215, "balance_loss_clip": 1.2144016, "balance_loss_mlp": 0.20219994, "epoch": 0.21025101457988876, "flos": 20375786787840.0, "grad_norm": 8.449118682189528, "language_loss": 0.88033271, "learning_rate": 3.668744875505915e-06, "loss": 0.89800036, "num_input_tokens_seen": 75592215, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.28991699, "step": 3497, "time_per_iteration": 2.5901620388031006 }, { "auxiliary_loss_clip": 0.01538673, "auxiliary_loss_mlp": 0.00251983, "balance_loss_clip": 1.21785069, "balance_loss_mlp": 0.22189459, "epoch": 0.21031113783255675, "flos": 25775638174080.0, "grad_norm": 11.388320156954588, "language_loss": 0.75902945, "learning_rate": 3.668530172166741e-06, "loss": 0.77693605, "num_input_tokens_seen": 75610740, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.30078125, "step": 3498, "time_per_iteration": 2.648350238800049 }, { "auxiliary_loss_clip": 0.01528122, "auxiliary_loss_mlp": 0.00226798, "balance_loss_clip": 1.20507419, "balance_loss_mlp": 0.19639927, "epoch": 0.21037126108522472, "flos": 22018197191040.0, "grad_norm": 4.54234971675551, "language_loss": 0.89614248, "learning_rate": 3.6683154055567352e-06, "loss": 0.9136917, "num_input_tokens_seen": 75631005, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.30395508, "step": 3499, "time_per_iteration": 2.629744529724121 }, { "auxiliary_loss_clip": 0.01556102, "auxiliary_loss_mlp": 0.00215606, "balance_loss_clip": 1.22362602, "balance_loss_mlp": 0.18468267, "epoch": 0.21043138433789269, "flos": 25334403505920.0, "grad_norm": 402.3850211550297, "language_loss": 0.83963621, "learning_rate": 3.668100575684043e-06, "loss": 0.85735327, "num_input_tokens_seen": 75650655, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.30908203, "step": 3500, "time_per_iteration": 2.7254345417022705 }, { "auxiliary_loss_clip": 0.01557779, "auxiliary_loss_mlp": 0.00226362, "balance_loss_clip": 1.22231936, "balance_loss_mlp": 0.19570157, "epoch": 0.21049150759056065, "flos": 25556654908800.0, "grad_norm": 354.4054396455617, "language_loss": 0.80452132, "learning_rate": 3.6678856825568094e-06, "loss": 0.82236272, "num_input_tokens_seen": 75669895, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.30664062, "step": 3501, "time_per_iteration": 2.6668622493743896 }, { "auxiliary_loss_clip": 0.01547741, "auxiliary_loss_mlp": 0.0019994, "balance_loss_clip": 1.22049856, "balance_loss_mlp": 0.17144856, "epoch": 0.21055163084322862, "flos": 24495602129280.0, "grad_norm": 3.3899259846031553, "language_loss": 0.80962002, "learning_rate": 3.667670726183183e-06, "loss": 0.82709682, "num_input_tokens_seen": 75689535, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.28503418, "step": 3502, "time_per_iteration": 2.6850807666778564 }, { "auxiliary_loss_clip": 0.01558205, "auxiliary_loss_mlp": 0.00239932, "balance_loss_clip": 1.2299875, "balance_loss_mlp": 0.20996287, "epoch": 0.21061175409589658, "flos": 25739045193600.0, "grad_norm": 38.60052175615511, "language_loss": 0.82800674, "learning_rate": 3.667455706571316e-06, "loss": 0.84598804, "num_input_tokens_seen": 75709265, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.29980469, "step": 3503, "time_per_iteration": 2.647056818008423 }, { "auxiliary_loss_clip": 0.01535799, "auxiliary_loss_mlp": 0.00253065, "balance_loss_clip": 1.2103188, "balance_loss_mlp": 0.22078341, "epoch": 0.21067187734856455, "flos": 18989168112000.0, "grad_norm": 18.134579816138285, "language_loss": 0.88127548, "learning_rate": 3.6672406237293617e-06, "loss": 0.89916408, "num_input_tokens_seen": 75727050, "router_z_loss_clip": 3.2578125, "router_z_loss_mlp": 0.32299805, "step": 3504, "time_per_iteration": 2.6129584312438965 }, { "auxiliary_loss_clip": 0.01527364, "auxiliary_loss_mlp": 0.00263775, "balance_loss_clip": 1.20505333, "balance_loss_mlp": 0.23266172, "epoch": 0.21073200060123254, "flos": 24681368292480.0, "grad_norm": 7.343272605920992, "language_loss": 0.83080912, "learning_rate": 3.6670254776654754e-06, "loss": 0.84872055, "num_input_tokens_seen": 75747175, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.31103516, "step": 3505, "time_per_iteration": 2.6567957401275635 }, { "auxiliary_loss_clip": 0.01538722, "auxiliary_loss_mlp": 0.0022108, "balance_loss_clip": 1.21839213, "balance_loss_mlp": 0.19350719, "epoch": 0.2107921238539005, "flos": 28549342402560.0, "grad_norm": 140.83561714918048, "language_loss": 0.69307148, "learning_rate": 3.6668102683878163e-06, "loss": 0.71066952, "num_input_tokens_seen": 75767690, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.27575684, "step": 3506, "time_per_iteration": 2.6852028369903564 }, { "auxiliary_loss_clip": 0.01538043, "auxiliary_loss_mlp": 0.00245583, "balance_loss_clip": 1.21938384, "balance_loss_mlp": 0.21711631, "epoch": 0.21085224710656847, "flos": 25885848078720.0, "grad_norm": 52.73467750653108, "language_loss": 0.89146876, "learning_rate": 3.6665949959045443e-06, "loss": 0.90930498, "num_input_tokens_seen": 75787255, "router_z_loss_clip": 3.18945312, "router_z_loss_mlp": 0.28466797, "step": 3507, "time_per_iteration": 2.66064715385437 }, { "auxiliary_loss_clip": 0.01538427, "auxiliary_loss_mlp": 0.00272655, "balance_loss_clip": 1.21901286, "balance_loss_mlp": 0.24075416, "epoch": 0.21091237035923643, "flos": 14976294537600.0, "grad_norm": 10.175504902393257, "language_loss": 0.82136476, "learning_rate": 3.666379660223824e-06, "loss": 0.83947563, "num_input_tokens_seen": 75805890, "router_z_loss_clip": 3.1953125, "router_z_loss_mlp": 0.3190918, "step": 3508, "time_per_iteration": 2.6710569858551025 }, { "auxiliary_loss_clip": 0.01528845, "auxiliary_loss_mlp": 0.00259464, "balance_loss_clip": 1.21056652, "balance_loss_mlp": 0.22627652, "epoch": 0.2109724936119044, "flos": 16362518163840.0, "grad_norm": 19.2577446774282, "language_loss": 0.94984043, "learning_rate": 3.6661642613538192e-06, "loss": 0.96772349, "num_input_tokens_seen": 75821620, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.33190918, "step": 3509, "time_per_iteration": 2.6409966945648193 }, { "auxiliary_loss_clip": 0.0149752, "auxiliary_loss_mlp": 0.00230924, "balance_loss_clip": 1.19069064, "balance_loss_mlp": 0.20102611, "epoch": 0.21103261686457236, "flos": 31502492000640.0, "grad_norm": 10.838864593052367, "language_loss": 0.75497186, "learning_rate": 3.6659487993026987e-06, "loss": 0.77225626, "num_input_tokens_seen": 75842490, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.29882812, "step": 3510, "time_per_iteration": 2.7410929203033447 }, { "auxiliary_loss_clip": 0.014979, "auxiliary_loss_mlp": 0.00267964, "balance_loss_clip": 1.192047, "balance_loss_mlp": 0.23781613, "epoch": 0.21109274011724033, "flos": 27344072517120.0, "grad_norm": 7.129628729414589, "language_loss": 0.79466194, "learning_rate": 3.6657332740786327e-06, "loss": 0.81232059, "num_input_tokens_seen": 75865985, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.30163574, "step": 3511, "time_per_iteration": 2.7011308670043945 }, { "auxiliary_loss_clip": 0.01492957, "auxiliary_loss_mlp": 0.00282699, "balance_loss_clip": 1.18841696, "balance_loss_mlp": 0.24972582, "epoch": 0.21115286336990832, "flos": 17820383466240.0, "grad_norm": 35.824160281195475, "language_loss": 0.80935407, "learning_rate": 3.665517685689794e-06, "loss": 0.82711065, "num_input_tokens_seen": 75882745, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.32983398, "step": 3512, "time_per_iteration": 2.594832181930542 }, { "auxiliary_loss_clip": 0.01500483, "auxiliary_loss_mlp": 0.00288051, "balance_loss_clip": 1.19742107, "balance_loss_mlp": 0.25612646, "epoch": 0.2112129866225763, "flos": 27197987904000.0, "grad_norm": 4.785938310653663, "language_loss": 0.79294688, "learning_rate": 3.6653020341443584e-06, "loss": 0.81083226, "num_input_tokens_seen": 75904305, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.31933594, "step": 3513, "time_per_iteration": 2.708507776260376 }, { "auxiliary_loss_clip": 0.01526241, "auxiliary_loss_mlp": 0.0029864, "balance_loss_clip": 1.22208738, "balance_loss_mlp": 0.26976743, "epoch": 0.21127310987524425, "flos": 23731279603200.0, "grad_norm": 18.36364981717912, "language_loss": 0.80223513, "learning_rate": 3.665086319450502e-06, "loss": 0.82048392, "num_input_tokens_seen": 75923710, "router_z_loss_clip": 3.04296875, "router_z_loss_mlp": 0.28881836, "step": 3514, "time_per_iteration": 2.6958229541778564 }, { "auxiliary_loss_clip": 0.01516106, "auxiliary_loss_mlp": 0.00376062, "balance_loss_clip": 1.21598077, "balance_loss_mlp": 0.34485298, "epoch": 0.21133323312791222, "flos": 18332505624960.0, "grad_norm": 96.25954307651718, "language_loss": 0.84451127, "learning_rate": 3.6648705416164062e-06, "loss": 0.863433, "num_input_tokens_seen": 75942625, "router_z_loss_clip": 3.00390625, "router_z_loss_mlp": 0.31176758, "step": 3515, "time_per_iteration": 2.7391021251678467 }, { "auxiliary_loss_clip": 0.01519823, "auxiliary_loss_mlp": 0.00406673, "balance_loss_clip": 1.22169757, "balance_loss_mlp": 0.37398604, "epoch": 0.21139335638058018, "flos": 17931203902080.0, "grad_norm": 130.405471773741, "language_loss": 0.76774585, "learning_rate": 3.6646547006502518e-06, "loss": 0.78701079, "num_input_tokens_seen": 75959930, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.3269043, "step": 3516, "time_per_iteration": 2.641533613204956 }, { "auxiliary_loss_clip": 0.01538098, "auxiliary_loss_mlp": 0.00372015, "balance_loss_clip": 1.23154581, "balance_loss_mlp": 0.34049639, "epoch": 0.21145347963324815, "flos": 24572092141440.0, "grad_norm": 9104.606487295456, "language_loss": 0.91043049, "learning_rate": 3.664438796560225e-06, "loss": 0.92953157, "num_input_tokens_seen": 75980335, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.31518555, "step": 3517, "time_per_iteration": 2.746525764465332 }, { "auxiliary_loss_clip": 0.0154429, "auxiliary_loss_mlp": 0.00415685, "balance_loss_clip": 1.23841453, "balance_loss_mlp": 0.38136476, "epoch": 0.21151360288591614, "flos": 35845959375360.0, "grad_norm": 453.92933228073844, "language_loss": 0.71328694, "learning_rate": 3.664222829354512e-06, "loss": 0.73288667, "num_input_tokens_seen": 76002095, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.34338379, "step": 3518, "time_per_iteration": 2.752349376678467 }, { "auxiliary_loss_clip": 0.01555475, "auxiliary_loss_mlp": 0.00463259, "balance_loss_clip": 1.24923182, "balance_loss_mlp": 0.42928439, "epoch": 0.2115737261385841, "flos": 24641579001600.0, "grad_norm": 6.511846865051982, "language_loss": 0.95659697, "learning_rate": 3.664006799041303e-06, "loss": 0.97678429, "num_input_tokens_seen": 76020425, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.33959961, "step": 3519, "time_per_iteration": 2.639662504196167 }, { "auxiliary_loss_clip": 0.0156777, "auxiliary_loss_mlp": 0.0047406, "balance_loss_clip": 1.2578584, "balance_loss_mlp": 0.43853587, "epoch": 0.21163384939125207, "flos": 25226887121280.0, "grad_norm": 7.415547956023514, "language_loss": 0.88037896, "learning_rate": 3.6637907056287886e-06, "loss": 0.90079725, "num_input_tokens_seen": 76041210, "router_z_loss_clip": 3.09570312, "router_z_loss_mlp": 0.35546875, "step": 3520, "time_per_iteration": 2.699019432067871 }, { "auxiliary_loss_clip": 0.0156633, "auxiliary_loss_mlp": 0.00458917, "balance_loss_clip": 1.25914204, "balance_loss_mlp": 0.42701644, "epoch": 0.21169397264392004, "flos": 26067520091520.0, "grad_norm": 3.8671903082137256, "language_loss": 0.83037317, "learning_rate": 3.6635745491251642e-06, "loss": 0.85062557, "num_input_tokens_seen": 76062685, "router_z_loss_clip": 3.0703125, "router_z_loss_mlp": 0.31933594, "step": 3521, "time_per_iteration": 4.166273593902588 }, { "auxiliary_loss_clip": 0.01577077, "auxiliary_loss_mlp": 0.00486296, "balance_loss_clip": 1.2651813, "balance_loss_mlp": 0.45377594, "epoch": 0.211754095896588, "flos": 23108265181440.0, "grad_norm": 3.256129224582501, "language_loss": 0.80915916, "learning_rate": 3.663358329538626e-06, "loss": 0.82979286, "num_input_tokens_seen": 76082300, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.32495117, "step": 3522, "time_per_iteration": 2.657609701156616 }, { "auxiliary_loss_clip": 0.01568454, "auxiliary_loss_mlp": 0.0051276, "balance_loss_clip": 1.25672102, "balance_loss_mlp": 0.47647256, "epoch": 0.21181421914925597, "flos": 27922341571200.0, "grad_norm": 5.931239695254143, "language_loss": 0.7791158, "learning_rate": 3.663142046877374e-06, "loss": 0.79992795, "num_input_tokens_seen": 76101135, "router_z_loss_clip": 3.11523438, "router_z_loss_mlp": 0.36279297, "step": 3523, "time_per_iteration": 2.639361619949341 }, { "auxiliary_loss_clip": 0.01584483, "auxiliary_loss_mlp": 0.00490012, "balance_loss_clip": 1.27363229, "balance_loss_mlp": 0.45505947, "epoch": 0.21187434240192393, "flos": 17128636369920.0, "grad_norm": 15.140942372848487, "language_loss": 0.8680563, "learning_rate": 3.6629257011496085e-06, "loss": 0.88880128, "num_input_tokens_seen": 76119320, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.34936523, "step": 3524, "time_per_iteration": 3.956183910369873 }, { "auxiliary_loss_clip": 0.01573636, "auxiliary_loss_mlp": 0.00530113, "balance_loss_clip": 1.2591753, "balance_loss_mlp": 0.49094087, "epoch": 0.21193446565459192, "flos": 22347318533760.0, "grad_norm": 6.4569390805065074, "language_loss": 0.87437773, "learning_rate": 3.6627092923635338e-06, "loss": 0.89541531, "num_input_tokens_seen": 76137445, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.3918457, "step": 3525, "time_per_iteration": 4.136544704437256 }, { "auxiliary_loss_clip": 0.01585764, "auxiliary_loss_mlp": 0.00531254, "balance_loss_clip": 1.27133441, "balance_loss_mlp": 0.49375117, "epoch": 0.2119945889072599, "flos": 27199316707200.0, "grad_norm": 10.714436092308228, "language_loss": 0.79705751, "learning_rate": 3.662492820527356e-06, "loss": 0.81822777, "num_input_tokens_seen": 76159500, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.375, "step": 3526, "time_per_iteration": 2.702420949935913 }, { "auxiliary_loss_clip": 0.01581238, "auxiliary_loss_mlp": 0.00574719, "balance_loss_clip": 1.26688337, "balance_loss_mlp": 0.53464133, "epoch": 0.21205471215992786, "flos": 20991869884800.0, "grad_norm": 5.9199236975499785, "language_loss": 0.81740439, "learning_rate": 3.662276285649284e-06, "loss": 0.83896399, "num_input_tokens_seen": 76177990, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.40063477, "step": 3527, "time_per_iteration": 2.633538007736206 }, { "auxiliary_loss_clip": 0.01581897, "auxiliary_loss_mlp": 0.00574969, "balance_loss_clip": 1.2673223, "balance_loss_mlp": 0.53582108, "epoch": 0.21211483541259582, "flos": 20777663128320.0, "grad_norm": 111.26718897438622, "language_loss": 0.83883709, "learning_rate": 3.662059687737528e-06, "loss": 0.86040574, "num_input_tokens_seen": 76197125, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.39160156, "step": 3528, "time_per_iteration": 2.6740565299987793 }, { "auxiliary_loss_clip": 0.01578632, "auxiliary_loss_mlp": 0.00534272, "balance_loss_clip": 1.26310039, "balance_loss_mlp": 0.49777019, "epoch": 0.21217495866526379, "flos": 18989994124800.0, "grad_norm": 7.078812931316298, "language_loss": 0.87107188, "learning_rate": 3.6618430268003024e-06, "loss": 0.89220095, "num_input_tokens_seen": 76216215, "router_z_loss_clip": 3.15429688, "router_z_loss_mlp": 0.36523438, "step": 3529, "time_per_iteration": 4.106525182723999 }, { "auxiliary_loss_clip": 0.01581274, "auxiliary_loss_mlp": 0.00596017, "balance_loss_clip": 1.26610243, "balance_loss_mlp": 0.55384088, "epoch": 0.21223508191793175, "flos": 20667309569280.0, "grad_norm": 5.2021284433646935, "language_loss": 0.84080982, "learning_rate": 3.6616263028458235e-06, "loss": 0.86258268, "num_input_tokens_seen": 76237010, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.421875, "step": 3530, "time_per_iteration": 2.6949081420898438 }, { "auxiliary_loss_clip": 0.01578875, "auxiliary_loss_mlp": 0.00608957, "balance_loss_clip": 1.26700258, "balance_loss_mlp": 0.5665189, "epoch": 0.21229520517059972, "flos": 21616464504960.0, "grad_norm": 11.377219446846189, "language_loss": 0.89893597, "learning_rate": 3.661409515882308e-06, "loss": 0.92081428, "num_input_tokens_seen": 76255965, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.42456055, "step": 3531, "time_per_iteration": 2.7014217376708984 }, { "auxiliary_loss_clip": 0.01590062, "auxiliary_loss_mlp": 0.0063157, "balance_loss_clip": 1.27010214, "balance_loss_mlp": 0.58607936, "epoch": 0.2123553284232677, "flos": 13991049411840.0, "grad_norm": 20.188967616155526, "language_loss": 0.83662504, "learning_rate": 3.661192665917977e-06, "loss": 0.85884136, "num_input_tokens_seen": 76272150, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.45483398, "step": 3532, "time_per_iteration": 2.623870611190796 }, { "auxiliary_loss_clip": 0.0159624, "auxiliary_loss_mlp": 0.00596399, "balance_loss_clip": 1.27719057, "balance_loss_mlp": 0.5519343, "epoch": 0.21241545167593567, "flos": 18296774570880.0, "grad_norm": 16.077896471226165, "language_loss": 0.80648291, "learning_rate": 3.660975752961054e-06, "loss": 0.82840931, "num_input_tokens_seen": 76291425, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.44482422, "step": 3533, "time_per_iteration": 2.6571602821350098 }, { "auxiliary_loss_clip": 0.01613153, "auxiliary_loss_mlp": 0.00684336, "balance_loss_clip": 1.29096937, "balance_loss_mlp": 0.63553202, "epoch": 0.21247557492860364, "flos": 34713121265280.0, "grad_norm": 26.05169960462859, "language_loss": 0.77599633, "learning_rate": 3.6607587770197634e-06, "loss": 0.79897118, "num_input_tokens_seen": 76313975, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.48803711, "step": 3534, "time_per_iteration": 2.7503480911254883 }, { "auxiliary_loss_clip": 0.01598569, "auxiliary_loss_mlp": 0.0061579, "balance_loss_clip": 1.28294849, "balance_loss_mlp": 0.57060945, "epoch": 0.2125356981812716, "flos": 22053820504320.0, "grad_norm": 3.462957302647503, "language_loss": 0.79321229, "learning_rate": 3.6605417381023346e-06, "loss": 0.8153559, "num_input_tokens_seen": 76330955, "router_z_loss_clip": 3.15429688, "router_z_loss_mlp": 0.45166016, "step": 3535, "time_per_iteration": 2.6188313961029053 }, { "auxiliary_loss_clip": 0.01595813, "auxiliary_loss_mlp": 0.00685888, "balance_loss_clip": 1.27403021, "balance_loss_mlp": 0.63620126, "epoch": 0.21259582143393957, "flos": 28548336821760.0, "grad_norm": 19.126589326487302, "language_loss": 0.75741333, "learning_rate": 3.660324636216996e-06, "loss": 0.78023034, "num_input_tokens_seen": 76352680, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.49707031, "step": 3536, "time_per_iteration": 2.7014362812042236 }, { "auxiliary_loss_clip": 0.01608657, "auxiliary_loss_mlp": 0.00734843, "balance_loss_clip": 1.27982819, "balance_loss_mlp": 0.6812939, "epoch": 0.21265594468660753, "flos": 20120892900480.0, "grad_norm": 15.160573951972586, "language_loss": 0.92015445, "learning_rate": 3.660107471371981e-06, "loss": 0.94358939, "num_input_tokens_seen": 76370750, "router_z_loss_clip": 3.28515625, "router_z_loss_mlp": 0.53564453, "step": 3537, "time_per_iteration": 2.679823637008667 }, { "auxiliary_loss_clip": 0.01595316, "auxiliary_loss_mlp": 0.00651314, "balance_loss_clip": 1.27577579, "balance_loss_mlp": 0.60653949, "epoch": 0.21271606793927553, "flos": 23076161400960.0, "grad_norm": 132.46391691158146, "language_loss": 0.86416423, "learning_rate": 3.659890243575524e-06, "loss": 0.88663054, "num_input_tokens_seen": 76390610, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.44775391, "step": 3538, "time_per_iteration": 2.68831467628479 }, { "auxiliary_loss_clip": 0.01599395, "auxiliary_loss_mlp": 0.00668754, "balance_loss_clip": 1.27373612, "balance_loss_mlp": 0.62250042, "epoch": 0.2127761911919435, "flos": 26388201738240.0, "grad_norm": 103.5069674492121, "language_loss": 0.92263675, "learning_rate": 3.659672952835863e-06, "loss": 0.94531822, "num_input_tokens_seen": 76408860, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.46264648, "step": 3539, "time_per_iteration": 2.6827430725097656 }, { "auxiliary_loss_clip": 0.01603163, "auxiliary_loss_mlp": 0.00611129, "balance_loss_clip": 1.28095055, "balance_loss_mlp": 0.56332618, "epoch": 0.21283631444461146, "flos": 20228265630720.0, "grad_norm": 3.6687371724914826, "language_loss": 0.64338869, "learning_rate": 3.659455599161237e-06, "loss": 0.66553164, "num_input_tokens_seen": 76424980, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.47802734, "step": 3540, "time_per_iteration": 2.75003981590271 }, { "auxiliary_loss_clip": 0.0160631, "auxiliary_loss_mlp": 0.00584412, "balance_loss_clip": 1.28448749, "balance_loss_mlp": 0.54204488, "epoch": 0.21289643769727942, "flos": 13516992691200.0, "grad_norm": 8.276254849339201, "language_loss": 0.84061462, "learning_rate": 3.659238182559888e-06, "loss": 0.86252183, "num_input_tokens_seen": 76443135, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.42382812, "step": 3541, "time_per_iteration": 2.611401081085205 }, { "auxiliary_loss_clip": 0.01611458, "auxiliary_loss_mlp": 0.00633496, "balance_loss_clip": 1.2889514, "balance_loss_mlp": 0.58798158, "epoch": 0.2129565609499474, "flos": 24827021942400.0, "grad_norm": 271.8847615453416, "language_loss": 0.74647337, "learning_rate": 3.6590207030400615e-06, "loss": 0.76892292, "num_input_tokens_seen": 76462470, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.45556641, "step": 3542, "time_per_iteration": 2.7024431228637695 }, { "auxiliary_loss_clip": 0.01615127, "auxiliary_loss_mlp": 0.00618678, "balance_loss_clip": 1.29320455, "balance_loss_mlp": 0.57440352, "epoch": 0.21301668420261535, "flos": 23659242877440.0, "grad_norm": 45.609277626604346, "language_loss": 0.82277358, "learning_rate": 3.658803160610004e-06, "loss": 0.84511167, "num_input_tokens_seen": 76481995, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.44287109, "step": 3543, "time_per_iteration": 2.6531195640563965 }, { "auxiliary_loss_clip": 0.01616016, "auxiliary_loss_mlp": 0.00659962, "balance_loss_clip": 1.2910974, "balance_loss_mlp": 0.60901177, "epoch": 0.21307680745528332, "flos": 16362805472640.0, "grad_norm": 34.83856056682472, "language_loss": 0.73383784, "learning_rate": 3.6585855552779634e-06, "loss": 0.75659758, "num_input_tokens_seen": 76500245, "router_z_loss_clip": 3.24804688, "router_z_loss_mlp": 0.50927734, "step": 3544, "time_per_iteration": 2.63032865524292 }, { "auxiliary_loss_clip": 0.01608176, "auxiliary_loss_mlp": 0.00682209, "balance_loss_clip": 1.28621614, "balance_loss_mlp": 0.63462031, "epoch": 0.2131369307079513, "flos": 19099054794240.0, "grad_norm": 24.570260596662447, "language_loss": 0.74626791, "learning_rate": 3.6583678870521934e-06, "loss": 0.76917171, "num_input_tokens_seen": 76519535, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.47558594, "step": 3545, "time_per_iteration": 2.612578868865967 }, { "auxiliary_loss_clip": 0.01631618, "auxiliary_loss_mlp": 0.00626375, "balance_loss_clip": 1.3050437, "balance_loss_mlp": 0.57921571, "epoch": 0.21319705396061928, "flos": 30372275583360.0, "grad_norm": 7.185184840338022, "language_loss": 0.77602899, "learning_rate": 3.658150155940946e-06, "loss": 0.7986089, "num_input_tokens_seen": 76542065, "router_z_loss_clip": 3.265625, "router_z_loss_mlp": 0.47143555, "step": 3546, "time_per_iteration": 2.7468297481536865 }, { "auxiliary_loss_clip": 0.01612409, "auxiliary_loss_mlp": 0.00648089, "balance_loss_clip": 1.28975141, "balance_loss_mlp": 0.59988129, "epoch": 0.21325717721328724, "flos": 21756192410880.0, "grad_norm": 2.079554332889995, "language_loss": 0.85735279, "learning_rate": 3.657932361952479e-06, "loss": 0.8799578, "num_input_tokens_seen": 76560540, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.48193359, "step": 3547, "time_per_iteration": 2.731757640838623 }, { "auxiliary_loss_clip": 0.01615674, "auxiliary_loss_mlp": 0.00659372, "balance_loss_clip": 1.28627634, "balance_loss_mlp": 0.61314309, "epoch": 0.2133173004659552, "flos": 28730870760960.0, "grad_norm": 10.680871888319698, "language_loss": 0.83509248, "learning_rate": 3.6577145050950504e-06, "loss": 0.85784292, "num_input_tokens_seen": 76581760, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.4621582, "step": 3548, "time_per_iteration": 2.8374104499816895 }, { "auxiliary_loss_clip": 0.01597379, "auxiliary_loss_mlp": 0.00599635, "balance_loss_clip": 1.27274036, "balance_loss_mlp": 0.55633843, "epoch": 0.21337742371862317, "flos": 16837077674880.0, "grad_norm": 5.771211318013403, "language_loss": 0.7896198, "learning_rate": 3.657496585376922e-06, "loss": 0.81158996, "num_input_tokens_seen": 76599940, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.43310547, "step": 3549, "time_per_iteration": 2.618839740753174 }, { "auxiliary_loss_clip": 0.01600251, "auxiliary_loss_mlp": 0.0062073, "balance_loss_clip": 1.27925646, "balance_loss_mlp": 0.57562095, "epoch": 0.21343754697129114, "flos": 24424930120320.0, "grad_norm": 10.168011860006528, "language_loss": 0.86470902, "learning_rate": 3.657278602806357e-06, "loss": 0.88691884, "num_input_tokens_seen": 76619580, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.45092773, "step": 3550, "time_per_iteration": 2.7149343490600586 }, { "auxiliary_loss_clip": 0.01626127, "auxiliary_loss_mlp": 0.00655214, "balance_loss_clip": 1.299106, "balance_loss_mlp": 0.61029577, "epoch": 0.21349767022395913, "flos": 19277817805440.0, "grad_norm": 2.754757517747534, "language_loss": 0.9227618, "learning_rate": 3.657060557391621e-06, "loss": 0.94557512, "num_input_tokens_seen": 76638195, "router_z_loss_clip": 3.26757812, "router_z_loss_mlp": 0.44921875, "step": 3551, "time_per_iteration": 2.6197052001953125 }, { "auxiliary_loss_clip": 0.01610374, "auxiliary_loss_mlp": 0.00618287, "balance_loss_clip": 1.28685582, "balance_loss_mlp": 0.57561004, "epoch": 0.2135577934766271, "flos": 17347547808000.0, "grad_norm": 164.5561652624733, "language_loss": 0.88956332, "learning_rate": 3.656842449140983e-06, "loss": 0.91184998, "num_input_tokens_seen": 76656695, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.42700195, "step": 3552, "time_per_iteration": 2.6898481845855713 }, { "auxiliary_loss_clip": 0.01615719, "auxiliary_loss_mlp": 0.00641799, "balance_loss_clip": 1.28691602, "balance_loss_mlp": 0.5951643, "epoch": 0.21361791672929506, "flos": 24057204635520.0, "grad_norm": 60.68013624817904, "language_loss": 0.81090081, "learning_rate": 3.656624278062713e-06, "loss": 0.83347595, "num_input_tokens_seen": 76677430, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.46655273, "step": 3553, "time_per_iteration": 2.680987596511841 }, { "auxiliary_loss_clip": 0.01621022, "auxiliary_loss_mlp": 0.00636929, "balance_loss_clip": 1.29349518, "balance_loss_mlp": 0.59010327, "epoch": 0.21367803998196302, "flos": 22162306556160.0, "grad_norm": 2.4852681813628412, "language_loss": 0.7648114, "learning_rate": 3.6564060441650843e-06, "loss": 0.78739095, "num_input_tokens_seen": 76697615, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.46875, "step": 3554, "time_per_iteration": 2.7070655822753906 }, { "auxiliary_loss_clip": 0.01615229, "auxiliary_loss_mlp": 0.0058122, "balance_loss_clip": 1.28855956, "balance_loss_mlp": 0.53801858, "epoch": 0.213738163234631, "flos": 20886867452160.0, "grad_norm": 181.47876599521854, "language_loss": 0.73288745, "learning_rate": 3.6561877474563724e-06, "loss": 0.75485194, "num_input_tokens_seen": 76715685, "router_z_loss_clip": 3.26757812, "router_z_loss_mlp": 0.43188477, "step": 3555, "time_per_iteration": 2.676180839538574 }, { "auxiliary_loss_clip": 0.01611665, "auxiliary_loss_mlp": 0.00583947, "balance_loss_clip": 1.28573143, "balance_loss_mlp": 0.54067409, "epoch": 0.21379828648729896, "flos": 28403114135040.0, "grad_norm": 203.5868596792386, "language_loss": 0.7100777, "learning_rate": 3.6559693879448553e-06, "loss": 0.73203385, "num_input_tokens_seen": 76735405, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.43286133, "step": 3556, "time_per_iteration": 2.7333500385284424 }, { "auxiliary_loss_clip": 0.01620226, "auxiliary_loss_mlp": 0.0055627, "balance_loss_clip": 1.29113889, "balance_loss_mlp": 0.51440394, "epoch": 0.21385840973996692, "flos": 25479662106240.0, "grad_norm": 664.7459288170553, "language_loss": 0.78061938, "learning_rate": 3.6557509656388125e-06, "loss": 0.80238432, "num_input_tokens_seen": 76754395, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.41894531, "step": 3557, "time_per_iteration": 2.696516275405884 }, { "auxiliary_loss_clip": 0.01611461, "auxiliary_loss_mlp": 0.0056113, "balance_loss_clip": 1.28610384, "balance_loss_mlp": 0.51711786, "epoch": 0.2139185329926349, "flos": 28074280101120.0, "grad_norm": 47.40082189320505, "language_loss": 0.75722682, "learning_rate": 3.655532480546528e-06, "loss": 0.77895272, "num_input_tokens_seen": 76777210, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.43994141, "step": 3558, "time_per_iteration": 2.7421951293945312 }, { "auxiliary_loss_clip": 0.01604195, "auxiliary_loss_mlp": 0.0063334, "balance_loss_clip": 1.27351284, "balance_loss_mlp": 0.58718228, "epoch": 0.21397865624530288, "flos": 19608698914560.0, "grad_norm": 12.451226471490383, "language_loss": 0.85294104, "learning_rate": 3.655313932676286e-06, "loss": 0.87531638, "num_input_tokens_seen": 76795830, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.46142578, "step": 3559, "time_per_iteration": 2.6936323642730713 }, { "auxiliary_loss_clip": 0.01613819, "auxiliary_loss_mlp": 0.00566056, "balance_loss_clip": 1.28635943, "balance_loss_mlp": 0.52678847, "epoch": 0.21403877949797084, "flos": 24681476033280.0, "grad_norm": 5.457140000666865, "language_loss": 0.73492026, "learning_rate": 3.655095322036373e-06, "loss": 0.75671899, "num_input_tokens_seen": 76814700, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.39282227, "step": 3560, "time_per_iteration": 2.8082358837127686 }, { "auxiliary_loss_clip": 0.01611615, "auxiliary_loss_mlp": 0.00527259, "balance_loss_clip": 1.28539073, "balance_loss_mlp": 0.48615605, "epoch": 0.2140989027506388, "flos": 19861150677120.0, "grad_norm": 12.329174368508658, "language_loss": 0.79341072, "learning_rate": 3.65487664863508e-06, "loss": 0.81479937, "num_input_tokens_seen": 76833400, "router_z_loss_clip": 3.2578125, "router_z_loss_mlp": 0.41088867, "step": 3561, "time_per_iteration": 2.8042449951171875 }, { "auxiliary_loss_clip": 0.01599742, "auxiliary_loss_mlp": 0.00541028, "balance_loss_clip": 1.27621269, "balance_loss_mlp": 0.50052029, "epoch": 0.21415902600330677, "flos": 19135324552320.0, "grad_norm": 25.146478973999574, "language_loss": 0.85120016, "learning_rate": 3.654657912480698e-06, "loss": 0.87260783, "num_input_tokens_seen": 76850645, "router_z_loss_clip": 3.234375, "router_z_loss_mlp": 0.40551758, "step": 3562, "time_per_iteration": 2.625723361968994 }, { "auxiliary_loss_clip": 0.01609645, "auxiliary_loss_mlp": 0.00498843, "balance_loss_clip": 1.28910136, "balance_loss_mlp": 0.46071953, "epoch": 0.21421914925597474, "flos": 22272624201600.0, "grad_norm": 2.1470309161166465, "language_loss": 0.88472712, "learning_rate": 3.6544391135815237e-06, "loss": 0.90581203, "num_input_tokens_seen": 76870135, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.38110352, "step": 3563, "time_per_iteration": 4.006052494049072 }, { "auxiliary_loss_clip": 0.01609517, "auxiliary_loss_mlp": 0.00486364, "balance_loss_clip": 1.2903173, "balance_loss_mlp": 0.44993401, "epoch": 0.2142792725086427, "flos": 33875109987840.0, "grad_norm": 18.783197066617884, "language_loss": 0.82118702, "learning_rate": 3.6542202519458507e-06, "loss": 0.8421458, "num_input_tokens_seen": 76893905, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.36425781, "step": 3564, "time_per_iteration": 2.721467971801758 }, { "auxiliary_loss_clip": 0.01611963, "auxiliary_loss_mlp": 0.00462994, "balance_loss_clip": 1.29568875, "balance_loss_mlp": 0.42599124, "epoch": 0.2143393957613107, "flos": 19860216923520.0, "grad_norm": 18.116011642723535, "language_loss": 0.93642807, "learning_rate": 3.654001327581981e-06, "loss": 0.95717764, "num_input_tokens_seen": 76914205, "router_z_loss_clip": 3.16015625, "router_z_loss_mlp": 0.37011719, "step": 3565, "time_per_iteration": 2.6702754497528076 }, { "auxiliary_loss_clip": 0.01743221, "auxiliary_loss_mlp": 0.00225471, "balance_loss_clip": 1.48064268, "balance_loss_mlp": 0.21345477, "epoch": 0.21439951901397866, "flos": 68530093090560.0, "grad_norm": 0.8274634318473537, "language_loss": 0.52509737, "learning_rate": 3.653782340498215e-06, "loss": 0.54478431, "num_input_tokens_seen": 76975650, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.12011719, "step": 3566, "time_per_iteration": 4.469447612762451 }, { "auxiliary_loss_clip": 0.01603152, "auxiliary_loss_mlp": 0.00424292, "balance_loss_clip": 1.29048538, "balance_loss_mlp": 0.3909373, "epoch": 0.21445964226664663, "flos": 19682998197120.0, "grad_norm": 30.57776267841068, "language_loss": 0.72807431, "learning_rate": 3.6535632907028566e-06, "loss": 0.74834877, "num_input_tokens_seen": 76992615, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.33374023, "step": 3567, "time_per_iteration": 4.13170313835144 }, { "auxiliary_loss_clip": 0.01617737, "auxiliary_loss_mlp": 0.0044175, "balance_loss_clip": 1.30540657, "balance_loss_mlp": 0.40801346, "epoch": 0.2145197655193146, "flos": 31107259676160.0, "grad_norm": 2.1115443391644275, "language_loss": 0.79030573, "learning_rate": 3.6533441782042126e-06, "loss": 0.81090063, "num_input_tokens_seen": 77017005, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.3371582, "step": 3568, "time_per_iteration": 2.7216739654541016 }, { "auxiliary_loss_clip": 0.01609355, "auxiliary_loss_mlp": 0.00484986, "balance_loss_clip": 1.29591012, "balance_loss_mlp": 0.44812635, "epoch": 0.21457988877198256, "flos": 20120785159680.0, "grad_norm": 7.244945462412931, "language_loss": 0.83599573, "learning_rate": 3.6531250030105917e-06, "loss": 0.85693914, "num_input_tokens_seen": 77034990, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.36865234, "step": 3569, "time_per_iteration": 2.6145713329315186 }, { "auxiliary_loss_clip": 0.01597671, "auxiliary_loss_mlp": 0.00435858, "balance_loss_clip": 1.28060377, "balance_loss_mlp": 0.39935666, "epoch": 0.21464001202465052, "flos": 18588045957120.0, "grad_norm": 3.9143759532378075, "language_loss": 0.77891064, "learning_rate": 3.6529057651303053e-06, "loss": 0.79924595, "num_input_tokens_seen": 77052610, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.36499023, "step": 3570, "time_per_iteration": 2.5947957038879395 }, { "auxiliary_loss_clip": 0.01605448, "auxiliary_loss_mlp": 0.00434065, "balance_loss_clip": 1.28878891, "balance_loss_mlp": 0.39854047, "epoch": 0.21470013527731852, "flos": 21835160461440.0, "grad_norm": 11.880048098882025, "language_loss": 0.88286018, "learning_rate": 3.6526864645716666e-06, "loss": 0.90325534, "num_input_tokens_seen": 77072475, "router_z_loss_clip": 3.16992188, "router_z_loss_mlp": 0.35522461, "step": 3571, "time_per_iteration": 2.6347668170928955 }, { "auxiliary_loss_clip": 0.01609458, "auxiliary_loss_mlp": 0.00409241, "balance_loss_clip": 1.29536915, "balance_loss_mlp": 0.37333542, "epoch": 0.21476025852998648, "flos": 17603195880960.0, "grad_norm": 11.20668068463244, "language_loss": 0.90071762, "learning_rate": 3.652467101342991e-06, "loss": 0.92090464, "num_input_tokens_seen": 77089930, "router_z_loss_clip": 3.13867188, "router_z_loss_mlp": 0.35888672, "step": 3572, "time_per_iteration": 4.1089417934417725 }, { "auxiliary_loss_clip": 0.01622002, "auxiliary_loss_mlp": 0.00390946, "balance_loss_clip": 1.30426836, "balance_loss_mlp": 0.35737681, "epoch": 0.21482038178265445, "flos": 24828135264000.0, "grad_norm": 110.60516493026257, "language_loss": 0.73189861, "learning_rate": 3.652247675452598e-06, "loss": 0.75202811, "num_input_tokens_seen": 77108970, "router_z_loss_clip": 3.17773438, "router_z_loss_mlp": 0.33569336, "step": 3573, "time_per_iteration": 2.673861026763916 }, { "auxiliary_loss_clip": 0.01638209, "auxiliary_loss_mlp": 0.00388955, "balance_loss_clip": 1.32073927, "balance_loss_mlp": 0.35700718, "epoch": 0.2148805050353224, "flos": 23258228463360.0, "grad_norm": 1.6114417897002828, "language_loss": 0.81473088, "learning_rate": 3.652028186908807e-06, "loss": 0.83500254, "num_input_tokens_seen": 77126045, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.31958008, "step": 3574, "time_per_iteration": 2.713569402694702 }, { "auxiliary_loss_clip": 0.01615701, "auxiliary_loss_mlp": 0.00432443, "balance_loss_clip": 1.30130589, "balance_loss_mlp": 0.3948679, "epoch": 0.21494062828799038, "flos": 21321098968320.0, "grad_norm": 35.682494162341726, "language_loss": 0.79578614, "learning_rate": 3.6518086357199416e-06, "loss": 0.81626749, "num_input_tokens_seen": 77144600, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.37597656, "step": 3575, "time_per_iteration": 2.6348962783813477 }, { "auxiliary_loss_clip": 0.01618482, "auxiliary_loss_mlp": 0.00346297, "balance_loss_clip": 1.31248116, "balance_loss_mlp": 0.31644681, "epoch": 0.21500075154065834, "flos": 18843334894080.0, "grad_norm": 4.70306139093771, "language_loss": 0.77017117, "learning_rate": 3.6515890218943277e-06, "loss": 0.789819, "num_input_tokens_seen": 77162965, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.29858398, "step": 3576, "time_per_iteration": 2.618419885635376 }, { "auxiliary_loss_clip": 0.01628386, "auxiliary_loss_mlp": 0.00357671, "balance_loss_clip": 1.31679893, "balance_loss_mlp": 0.32473364, "epoch": 0.2150608747933263, "flos": 18441997257600.0, "grad_norm": 140.24231158390728, "language_loss": 0.96199095, "learning_rate": 3.651369345440292e-06, "loss": 0.98185146, "num_input_tokens_seen": 77179960, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.32946777, "step": 3577, "time_per_iteration": 2.6062519550323486 }, { "auxiliary_loss_clip": 0.01757007, "auxiliary_loss_mlp": 0.00077283, "balance_loss_clip": 1.49158573, "balance_loss_mlp": 0.06741296, "epoch": 0.2151209980459943, "flos": 66598242894720.0, "grad_norm": 0.7923399700887624, "language_loss": 0.5619089, "learning_rate": 3.6511496063661654e-06, "loss": 0.58025175, "num_input_tokens_seen": 77239500, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.09863281, "step": 3578, "time_per_iteration": 3.0878729820251465 }, { "auxiliary_loss_clip": 0.01624158, "auxiliary_loss_mlp": 0.00341301, "balance_loss_clip": 1.31240582, "balance_loss_mlp": 0.3105208, "epoch": 0.21518112129866226, "flos": 21575885114880.0, "grad_norm": 2.5537150796915746, "language_loss": 0.92943764, "learning_rate": 3.6509298046802807e-06, "loss": 0.94909221, "num_input_tokens_seen": 77254680, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.30786133, "step": 3579, "time_per_iteration": 2.6308815479278564 }, { "auxiliary_loss_clip": 0.01611754, "auxiliary_loss_mlp": 0.00359959, "balance_loss_clip": 1.30226254, "balance_loss_mlp": 0.32770085, "epoch": 0.21524124455133023, "flos": 20047635112320.0, "grad_norm": 2.861230724015034, "language_loss": 0.85691249, "learning_rate": 3.650709940390972e-06, "loss": 0.87662971, "num_input_tokens_seen": 77274060, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.32250977, "step": 3580, "time_per_iteration": 2.639620065689087 }, { "auxiliary_loss_clip": 0.01619946, "auxiliary_loss_mlp": 0.00307822, "balance_loss_clip": 1.31015444, "balance_loss_mlp": 0.27785242, "epoch": 0.2153013678039982, "flos": 23951807153280.0, "grad_norm": 245.56357021240936, "language_loss": 0.80181307, "learning_rate": 3.6504900135065775e-06, "loss": 0.82109076, "num_input_tokens_seen": 77293255, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.29956055, "step": 3581, "time_per_iteration": 2.6372814178466797 }, { "auxiliary_loss_clip": 0.01612019, "auxiliary_loss_mlp": 0.00364072, "balance_loss_clip": 1.30388212, "balance_loss_mlp": 0.33102736, "epoch": 0.21536149105666616, "flos": 20594841880320.0, "grad_norm": 9.879129195384373, "language_loss": 0.79593885, "learning_rate": 3.6502700240354357e-06, "loss": 0.81569976, "num_input_tokens_seen": 77312390, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.33032227, "step": 3582, "time_per_iteration": 2.6042470932006836 }, { "auxiliary_loss_clip": 0.01629051, "auxiliary_loss_mlp": 0.00336263, "balance_loss_clip": 1.32163811, "balance_loss_mlp": 0.30641347, "epoch": 0.21542161430933413, "flos": 12860042895360.0, "grad_norm": 25.544822826624216, "language_loss": 0.92148799, "learning_rate": 3.650049971985889e-06, "loss": 0.94114113, "num_input_tokens_seen": 77330985, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.29858398, "step": 3583, "time_per_iteration": 2.5743021965026855 }, { "auxiliary_loss_clip": 0.01608495, "auxiliary_loss_mlp": 0.00319018, "balance_loss_clip": 1.3014946, "balance_loss_mlp": 0.28752255, "epoch": 0.21548173756200212, "flos": 26103933504000.0, "grad_norm": 50.85353086599087, "language_loss": 0.9232893, "learning_rate": 3.6498298573662824e-06, "loss": 0.94256443, "num_input_tokens_seen": 77350770, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.31494141, "step": 3584, "time_per_iteration": 2.643791437149048 }, { "auxiliary_loss_clip": 0.01632622, "auxiliary_loss_mlp": 0.00336199, "balance_loss_clip": 1.32591414, "balance_loss_mlp": 0.30637279, "epoch": 0.21554186081467008, "flos": 22163779013760.0, "grad_norm": 111.01140886041367, "language_loss": 0.97755063, "learning_rate": 3.6496096801849625e-06, "loss": 0.99723887, "num_input_tokens_seen": 77370510, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.29833984, "step": 3585, "time_per_iteration": 2.6343488693237305 }, { "auxiliary_loss_clip": 0.01607208, "auxiliary_loss_mlp": 0.00321859, "balance_loss_clip": 1.30417967, "balance_loss_mlp": 0.29218817, "epoch": 0.21560198406733805, "flos": 22966741595520.0, "grad_norm": 37.09758843206317, "language_loss": 0.81669307, "learning_rate": 3.649389440450277e-06, "loss": 0.83598375, "num_input_tokens_seen": 77390645, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.29650879, "step": 3586, "time_per_iteration": 2.7761101722717285 }, { "auxiliary_loss_clip": 0.01618581, "auxiliary_loss_mlp": 0.00335566, "balance_loss_clip": 1.3100487, "balance_loss_mlp": 0.30512005, "epoch": 0.215662107320006, "flos": 22784064001920.0, "grad_norm": 5.179206637889916, "language_loss": 0.87364805, "learning_rate": 3.6491691381705804e-06, "loss": 0.89318955, "num_input_tokens_seen": 77409655, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.3046875, "step": 3587, "time_per_iteration": 2.7846498489379883 }, { "auxiliary_loss_clip": 0.01590963, "auxiliary_loss_mlp": 0.00322595, "balance_loss_clip": 1.2886883, "balance_loss_mlp": 0.29307848, "epoch": 0.21572223057267398, "flos": 30883859038080.0, "grad_norm": 12.709707346691976, "language_loss": 0.81413066, "learning_rate": 3.648948773354224e-06, "loss": 0.83326626, "num_input_tokens_seen": 77430560, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.29504395, "step": 3588, "time_per_iteration": 2.7178738117218018 }, { "auxiliary_loss_clip": 0.0159708, "auxiliary_loss_mlp": 0.00332506, "balance_loss_clip": 1.29345131, "balance_loss_mlp": 0.29996216, "epoch": 0.21578235382534194, "flos": 26910487445760.0, "grad_norm": 2.069396974763748, "language_loss": 0.87841856, "learning_rate": 3.6487283460095643e-06, "loss": 0.89771444, "num_input_tokens_seen": 77455000, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.32568359, "step": 3589, "time_per_iteration": 2.6888983249664307 }, { "auxiliary_loss_clip": 0.01604977, "auxiliary_loss_mlp": 0.00331383, "balance_loss_clip": 1.30385602, "balance_loss_mlp": 0.30320179, "epoch": 0.2158424770780099, "flos": 24425720219520.0, "grad_norm": 30.299339912258606, "language_loss": 0.81366181, "learning_rate": 3.648507856144961e-06, "loss": 0.83302546, "num_input_tokens_seen": 77475075, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.28173828, "step": 3590, "time_per_iteration": 2.673574447631836 }, { "auxiliary_loss_clip": 0.01580707, "auxiliary_loss_mlp": 0.00352271, "balance_loss_clip": 1.27869153, "balance_loss_mlp": 0.32156304, "epoch": 0.2159026003306779, "flos": 23949975559680.0, "grad_norm": 31.024732360451562, "language_loss": 0.91058779, "learning_rate": 3.648287303768775e-06, "loss": 0.92991751, "num_input_tokens_seen": 77495945, "router_z_loss_clip": 3.01953125, "router_z_loss_mlp": 0.30712891, "step": 3591, "time_per_iteration": 2.6728835105895996 }, { "auxiliary_loss_clip": 0.01597792, "auxiliary_loss_mlp": 0.0030179, "balance_loss_clip": 1.29531598, "balance_loss_mlp": 0.27074832, "epoch": 0.21596272358334587, "flos": 30040963511040.0, "grad_norm": 44.18946965478503, "language_loss": 0.75404805, "learning_rate": 3.6480666888893686e-06, "loss": 0.77304387, "num_input_tokens_seen": 77517140, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.31079102, "step": 3592, "time_per_iteration": 2.6805336475372314 }, { "auxiliary_loss_clip": 0.01584708, "auxiliary_loss_mlp": 0.00333337, "balance_loss_clip": 1.28172755, "balance_loss_mlp": 0.30323684, "epoch": 0.21602284683601383, "flos": 20376217751040.0, "grad_norm": 4.895406907456475, "language_loss": 0.94500703, "learning_rate": 3.647846011515108e-06, "loss": 0.9641875, "num_input_tokens_seen": 77536085, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.30102539, "step": 3593, "time_per_iteration": 2.630737781524658 }, { "auxiliary_loss_clip": 0.01582085, "auxiliary_loss_mlp": 0.00319524, "balance_loss_clip": 1.28340971, "balance_loss_mlp": 0.28769544, "epoch": 0.2160829700886818, "flos": 20777339905920.0, "grad_norm": 17.05854244068264, "language_loss": 0.86483097, "learning_rate": 3.6476252716543625e-06, "loss": 0.88384706, "num_input_tokens_seen": 77553675, "router_z_loss_clip": 2.99023438, "router_z_loss_mlp": 0.31835938, "step": 3594, "time_per_iteration": 2.7454886436462402 }, { "auxiliary_loss_clip": 0.01600923, "auxiliary_loss_mlp": 0.00315912, "balance_loss_clip": 1.29909515, "balance_loss_mlp": 0.28819564, "epoch": 0.21614309334134976, "flos": 22309755886080.0, "grad_norm": 3.8498528696883967, "language_loss": 0.86016357, "learning_rate": 3.6474044693155007e-06, "loss": 0.87933195, "num_input_tokens_seen": 77573360, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.27697754, "step": 3595, "time_per_iteration": 2.640489101409912 }, { "auxiliary_loss_clip": 0.01602087, "auxiliary_loss_mlp": 0.00343905, "balance_loss_clip": 1.29777539, "balance_loss_mlp": 0.31238642, "epoch": 0.21620321659401773, "flos": 19609524927360.0, "grad_norm": 269.0310978174164, "language_loss": 0.87357748, "learning_rate": 3.647183604506897e-06, "loss": 0.89303732, "num_input_tokens_seen": 77591865, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.31518555, "step": 3596, "time_per_iteration": 2.6259968280792236 }, { "auxiliary_loss_clip": 0.01590429, "auxiliary_loss_mlp": 0.00336159, "balance_loss_clip": 1.29005289, "balance_loss_mlp": 0.30579594, "epoch": 0.2162633398466857, "flos": 18844555956480.0, "grad_norm": 1.9049586985351887, "language_loss": 0.9040032, "learning_rate": 3.6469626772369253e-06, "loss": 0.92326909, "num_input_tokens_seen": 77611600, "router_z_loss_clip": 3.00390625, "router_z_loss_mlp": 0.30358887, "step": 3597, "time_per_iteration": 2.6933882236480713 }, { "auxiliary_loss_clip": 0.01584057, "auxiliary_loss_mlp": 0.00351418, "balance_loss_clip": 1.28141689, "balance_loss_mlp": 0.32051891, "epoch": 0.21632346309935369, "flos": 18768820129920.0, "grad_norm": 5.9334144670770215, "language_loss": 0.86746895, "learning_rate": 3.6467416875139642e-06, "loss": 0.88682365, "num_input_tokens_seen": 77630665, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.30883789, "step": 3598, "time_per_iteration": 2.637389898300171 }, { "auxiliary_loss_clip": 0.01559019, "auxiliary_loss_mlp": 0.00338031, "balance_loss_clip": 1.25699127, "balance_loss_mlp": 0.30586845, "epoch": 0.21638358635202165, "flos": 26324173745280.0, "grad_norm": 113.12075916965986, "language_loss": 0.89655286, "learning_rate": 3.6465206353463934e-06, "loss": 0.91552335, "num_input_tokens_seen": 77650835, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.3215332, "step": 3599, "time_per_iteration": 2.731397867202759 }, { "auxiliary_loss_clip": 0.01585792, "auxiliary_loss_mlp": 0.00276876, "balance_loss_clip": 1.28496027, "balance_loss_mlp": 0.25007752, "epoch": 0.21644370960468962, "flos": 20740854666240.0, "grad_norm": 7.911784792153688, "language_loss": 0.83855212, "learning_rate": 3.6462995207425947e-06, "loss": 0.85717887, "num_input_tokens_seen": 77669000, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.26806641, "step": 3600, "time_per_iteration": 2.6961028575897217 }, { "auxiliary_loss_clip": 0.01581557, "auxiliary_loss_mlp": 0.00298126, "balance_loss_clip": 1.28253198, "balance_loss_mlp": 0.27273422, "epoch": 0.21650383285735758, "flos": 23952238116480.0, "grad_norm": 2.3133056860684396, "language_loss": 0.87551039, "learning_rate": 3.6460783437109533e-06, "loss": 0.89430726, "num_input_tokens_seen": 77688745, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.25390625, "step": 3601, "time_per_iteration": 2.6702797412872314 }, { "auxiliary_loss_clip": 0.01572629, "auxiliary_loss_mlp": 0.00334606, "balance_loss_clip": 1.26482129, "balance_loss_mlp": 0.30334905, "epoch": 0.21656395611002555, "flos": 23696087253120.0, "grad_norm": 8.98951499991499, "language_loss": 0.89949334, "learning_rate": 3.6458571042598565e-06, "loss": 0.91856575, "num_input_tokens_seen": 77708445, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.3125, "step": 3602, "time_per_iteration": 2.72745418548584 }, { "auxiliary_loss_clip": 0.01586391, "auxiliary_loss_mlp": 0.00296696, "balance_loss_clip": 1.28147399, "balance_loss_mlp": 0.26805001, "epoch": 0.2166240793626935, "flos": 20666052593280.0, "grad_norm": 6.130699066643353, "language_loss": 0.81232178, "learning_rate": 3.645635802397693e-06, "loss": 0.83115268, "num_input_tokens_seen": 77728465, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.28625488, "step": 3603, "time_per_iteration": 2.6937286853790283 }, { "auxiliary_loss_clip": 0.01592004, "auxiliary_loss_mlp": 0.0029874, "balance_loss_clip": 1.28992546, "balance_loss_mlp": 0.27198949, "epoch": 0.2166842026153615, "flos": 21580410228480.0, "grad_norm": 2.798811588946276, "language_loss": 0.8123793, "learning_rate": 3.645414438132855e-06, "loss": 0.83128679, "num_input_tokens_seen": 77746735, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.26757812, "step": 3604, "time_per_iteration": 2.638385534286499 }, { "auxiliary_loss_clip": 0.01584435, "auxiliary_loss_mlp": 0.00318493, "balance_loss_clip": 1.28471923, "balance_loss_mlp": 0.29143226, "epoch": 0.21674432586802947, "flos": 25629948610560.0, "grad_norm": 11.822759585450322, "language_loss": 0.85824043, "learning_rate": 3.6451930114737366e-06, "loss": 0.87726974, "num_input_tokens_seen": 77768105, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.27075195, "step": 3605, "time_per_iteration": 4.071672439575195 }, { "auxiliary_loss_clip": 0.01611744, "auxiliary_loss_mlp": 0.00042082, "balance_loss_clip": 1.37092054, "balance_loss_mlp": 0.02548792, "epoch": 0.21680444912069743, "flos": 56417783616000.0, "grad_norm": 0.7103392438815617, "language_loss": 0.58334601, "learning_rate": 3.6449715224287347e-06, "loss": 0.59988427, "num_input_tokens_seen": 77833750, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.16601562, "step": 3606, "time_per_iteration": 3.211568832397461 }, { "auxiliary_loss_clip": 0.01557575, "auxiliary_loss_mlp": 0.00370442, "balance_loss_clip": 1.25399697, "balance_loss_mlp": 0.34238037, "epoch": 0.2168645723733654, "flos": 23878944414720.0, "grad_norm": 3.9003917202347, "language_loss": 0.79156017, "learning_rate": 3.644749971006248e-06, "loss": 0.81084037, "num_input_tokens_seen": 77853780, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.28100586, "step": 3607, "time_per_iteration": 2.6477012634277344 }, { "auxiliary_loss_clip": 0.01563207, "auxiliary_loss_mlp": 0.00386717, "balance_loss_clip": 1.26087523, "balance_loss_mlp": 0.35605663, "epoch": 0.21692469562603336, "flos": 16946174257920.0, "grad_norm": 4.005072169982187, "language_loss": 0.83983433, "learning_rate": 3.6445283572146765e-06, "loss": 0.85933363, "num_input_tokens_seen": 77872575, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.30664062, "step": 3608, "time_per_iteration": 4.08153510093689 }, { "auxiliary_loss_clip": 0.01558851, "auxiliary_loss_mlp": 0.00383946, "balance_loss_clip": 1.25543809, "balance_loss_mlp": 0.3561821, "epoch": 0.21698481887870133, "flos": 25119047514240.0, "grad_norm": 3.239181229483383, "language_loss": 0.81626153, "learning_rate": 3.6443066810624255e-06, "loss": 0.83568949, "num_input_tokens_seen": 77892700, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.27746582, "step": 3609, "time_per_iteration": 2.6214609146118164 }, { "auxiliary_loss_clip": 0.01556379, "auxiliary_loss_mlp": 0.00322462, "balance_loss_clip": 1.25432277, "balance_loss_mlp": 0.29435202, "epoch": 0.2170449421313693, "flos": 17894682748800.0, "grad_norm": 1327.7567291259452, "language_loss": 0.94945294, "learning_rate": 3.6440849425579e-06, "loss": 0.96824139, "num_input_tokens_seen": 77911060, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.28100586, "step": 3610, "time_per_iteration": 4.094866514205933 }, { "auxiliary_loss_clip": 0.01563675, "auxiliary_loss_mlp": 0.00395896, "balance_loss_clip": 1.25885713, "balance_loss_mlp": 0.36744028, "epoch": 0.2171050653840373, "flos": 22638446265600.0, "grad_norm": 55.431700528637236, "language_loss": 0.8280986, "learning_rate": 3.6438631417095095e-06, "loss": 0.84769434, "num_input_tokens_seen": 77929930, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.2845459, "step": 3611, "time_per_iteration": 2.646890878677368 }, { "auxiliary_loss_clip": 0.01553765, "auxiliary_loss_mlp": 0.00365081, "balance_loss_clip": 1.25075793, "balance_loss_mlp": 0.33881894, "epoch": 0.21716518863670525, "flos": 19499997381120.0, "grad_norm": 4.652950267164679, "language_loss": 0.70067012, "learning_rate": 3.6436412785256637e-06, "loss": 0.71985853, "num_input_tokens_seen": 77949060, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.26257324, "step": 3612, "time_per_iteration": 2.602277994155884 }, { "auxiliary_loss_clip": 0.01536523, "auxiliary_loss_mlp": 0.00403988, "balance_loss_clip": 1.23250353, "balance_loss_mlp": 0.37683168, "epoch": 0.21722531188937322, "flos": 19792022952960.0, "grad_norm": 198.38230887964176, "language_loss": 0.83389562, "learning_rate": 3.643419353014776e-06, "loss": 0.85330069, "num_input_tokens_seen": 77967920, "router_z_loss_clip": 3.04296875, "router_z_loss_mlp": 0.27172852, "step": 3613, "time_per_iteration": 2.635981559753418 }, { "auxiliary_loss_clip": 0.01534403, "auxiliary_loss_mlp": 0.00358383, "balance_loss_clip": 1.23171639, "balance_loss_mlp": 0.33400464, "epoch": 0.21728543514204118, "flos": 13334386924800.0, "grad_norm": 282.2804790611201, "language_loss": 0.805839, "learning_rate": 3.643197365185261e-06, "loss": 0.82476687, "num_input_tokens_seen": 77985330, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.24389648, "step": 3614, "time_per_iteration": 4.127549886703491 }, { "auxiliary_loss_clip": 0.01510351, "auxiliary_loss_mlp": 0.00364931, "balance_loss_clip": 1.2087276, "balance_loss_mlp": 0.33760819, "epoch": 0.21734555839470915, "flos": 15231870783360.0, "grad_norm": 6.183239903385005, "language_loss": 0.7923674, "learning_rate": 3.6429753150455378e-06, "loss": 0.81112027, "num_input_tokens_seen": 78003105, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.27319336, "step": 3615, "time_per_iteration": 2.6451001167297363 }, { "auxiliary_loss_clip": 0.01526353, "auxiliary_loss_mlp": 0.00367161, "balance_loss_clip": 1.22114015, "balance_loss_mlp": 0.34000546, "epoch": 0.2174056816473771, "flos": 19973982274560.0, "grad_norm": 3.306767416996963, "language_loss": 0.98737121, "learning_rate": 3.6427532026040263e-06, "loss": 1.00630641, "num_input_tokens_seen": 78019655, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.27148438, "step": 3616, "time_per_iteration": 2.662276268005371 }, { "auxiliary_loss_clip": 0.01500849, "auxiliary_loss_mlp": 0.00322974, "balance_loss_clip": 1.19858265, "balance_loss_mlp": 0.29732016, "epoch": 0.21746580490004508, "flos": 16687293960960.0, "grad_norm": 53.219548055313005, "language_loss": 0.90027666, "learning_rate": 3.642531027869148e-06, "loss": 0.91851485, "num_input_tokens_seen": 78036025, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.2565918, "step": 3617, "time_per_iteration": 2.640181303024292 }, { "auxiliary_loss_clip": 0.01514932, "auxiliary_loss_mlp": 0.00324952, "balance_loss_clip": 1.20930672, "balance_loss_mlp": 0.29811791, "epoch": 0.21752592815271307, "flos": 25772298209280.0, "grad_norm": 122.80807014486898, "language_loss": 0.82387662, "learning_rate": 3.642308790849329e-06, "loss": 0.84227544, "num_input_tokens_seen": 78055645, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.26843262, "step": 3618, "time_per_iteration": 2.7579987049102783 }, { "auxiliary_loss_clip": 0.01511759, "auxiliary_loss_mlp": 0.00351525, "balance_loss_clip": 1.20727944, "balance_loss_mlp": 0.32491758, "epoch": 0.21758605140538104, "flos": 11254692349440.0, "grad_norm": 8.121300650501102, "language_loss": 0.79000998, "learning_rate": 3.642086491552996e-06, "loss": 0.80864286, "num_input_tokens_seen": 78071660, "router_z_loss_clip": 3.04296875, "router_z_loss_mlp": 0.26623535, "step": 3619, "time_per_iteration": 2.757826328277588 }, { "auxiliary_loss_clip": 0.01494259, "auxiliary_loss_mlp": 0.00337156, "balance_loss_clip": 1.18929112, "balance_loss_mlp": 0.31318319, "epoch": 0.217646174658049, "flos": 19242625455360.0, "grad_norm": 2.0717208756951093, "language_loss": 0.83535737, "learning_rate": 3.641864129988579e-06, "loss": 0.85367155, "num_input_tokens_seen": 78091265, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.23962402, "step": 3620, "time_per_iteration": 2.673243522644043 }, { "auxiliary_loss_clip": 0.01492603, "auxiliary_loss_mlp": 0.00311423, "balance_loss_clip": 1.19221616, "balance_loss_mlp": 0.28678209, "epoch": 0.21770629791071697, "flos": 21945083057280.0, "grad_norm": 7.199974518599563, "language_loss": 0.8570466, "learning_rate": 3.641641706164509e-06, "loss": 0.87508678, "num_input_tokens_seen": 78110095, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.24658203, "step": 3621, "time_per_iteration": 2.6308305263519287 }, { "auxiliary_loss_clip": 0.01474822, "auxiliary_loss_mlp": 0.00298996, "balance_loss_clip": 1.17260945, "balance_loss_mlp": 0.27336639, "epoch": 0.21776642116338493, "flos": 24936764970240.0, "grad_norm": 15.797736676534301, "language_loss": 0.94199121, "learning_rate": 3.641419220089221e-06, "loss": 0.95972937, "num_input_tokens_seen": 78129475, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.25598145, "step": 3622, "time_per_iteration": 2.6791775226593018 }, { "auxiliary_loss_clip": 0.01475598, "auxiliary_loss_mlp": 0.00371809, "balance_loss_clip": 1.17092824, "balance_loss_mlp": 0.34268624, "epoch": 0.2178265444160529, "flos": 17821317219840.0, "grad_norm": 20.797157275913374, "language_loss": 0.85582066, "learning_rate": 3.641196671771152e-06, "loss": 0.87429476, "num_input_tokens_seen": 78146880, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.29125977, "step": 3623, "time_per_iteration": 2.6242311000823975 }, { "auxiliary_loss_clip": 0.01473738, "auxiliary_loss_mlp": 0.00282092, "balance_loss_clip": 1.16955161, "balance_loss_mlp": 0.25654545, "epoch": 0.2178866676687209, "flos": 17712902995200.0, "grad_norm": 10.534770842798665, "language_loss": 0.94053268, "learning_rate": 3.640974061218741e-06, "loss": 0.95809102, "num_input_tokens_seen": 78165065, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.25561523, "step": 3624, "time_per_iteration": 2.6518168449401855 }, { "auxiliary_loss_clip": 0.0148315, "auxiliary_loss_mlp": 0.00332319, "balance_loss_clip": 1.1710602, "balance_loss_mlp": 0.30275527, "epoch": 0.21794679092138886, "flos": 16945851035520.0, "grad_norm": 8.321961132450257, "language_loss": 0.88432467, "learning_rate": 3.640751388440429e-06, "loss": 0.90247935, "num_input_tokens_seen": 78180005, "router_z_loss_clip": 3.12109375, "router_z_loss_mlp": 0.29577637, "step": 3625, "time_per_iteration": 2.595280885696411 }, { "auxiliary_loss_clip": 0.01454073, "auxiliary_loss_mlp": 0.00060412, "balance_loss_clip": 1.20475459, "balance_loss_mlp": 0.05354583, "epoch": 0.21800691417405682, "flos": 63718566566400.0, "grad_norm": 0.8039210779528595, "language_loss": 0.60797095, "learning_rate": 3.64052865344466e-06, "loss": 0.62311578, "num_input_tokens_seen": 78245350, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.06884766, "step": 3626, "time_per_iteration": 3.2164504528045654 }, { "auxiliary_loss_clip": 0.01476105, "auxiliary_loss_mlp": 0.00319917, "balance_loss_clip": 1.15810919, "balance_loss_mlp": 0.29118714, "epoch": 0.21806703742672479, "flos": 21616392677760.0, "grad_norm": 80.05716598415836, "language_loss": 0.97005838, "learning_rate": 3.6403058562398795e-06, "loss": 0.98801857, "num_input_tokens_seen": 78264165, "router_z_loss_clip": 3.17773438, "router_z_loss_mlp": 0.28710938, "step": 3627, "time_per_iteration": 2.6412737369537354 }, { "auxiliary_loss_clip": 0.01474004, "auxiliary_loss_mlp": 0.00279736, "balance_loss_clip": 1.15360701, "balance_loss_mlp": 0.25335547, "epoch": 0.21812716067939275, "flos": 19354882435200.0, "grad_norm": 5.984024173833941, "language_loss": 0.80515563, "learning_rate": 3.6400829968345365e-06, "loss": 0.82269305, "num_input_tokens_seen": 78283745, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.26403809, "step": 3628, "time_per_iteration": 2.6414167881011963 }, { "auxiliary_loss_clip": 0.01470077, "auxiliary_loss_mlp": 0.00284279, "balance_loss_clip": 1.14756417, "balance_loss_mlp": 0.25676596, "epoch": 0.21818728393206072, "flos": 23548063305600.0, "grad_norm": 8.00374301430814, "language_loss": 0.85298753, "learning_rate": 3.6398600752370826e-06, "loss": 0.87053108, "num_input_tokens_seen": 78302900, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.27502441, "step": 3629, "time_per_iteration": 2.7501423358917236 }, { "auxiliary_loss_clip": 0.01476061, "auxiliary_loss_mlp": 0.00261044, "balance_loss_clip": 1.15055919, "balance_loss_mlp": 0.23530687, "epoch": 0.21824740718472868, "flos": 30225652266240.0, "grad_norm": 179.89535502976875, "language_loss": 0.79891992, "learning_rate": 3.63963709145597e-06, "loss": 0.81629092, "num_input_tokens_seen": 78326470, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.25732422, "step": 3630, "time_per_iteration": 2.7024085521698 }, { "auxiliary_loss_clip": 0.01466415, "auxiliary_loss_mlp": 0.0026295, "balance_loss_clip": 1.13904953, "balance_loss_mlp": 0.23530491, "epoch": 0.21830753043739667, "flos": 26134672567680.0, "grad_norm": 3.148414843381209, "language_loss": 0.84773189, "learning_rate": 3.6394140454996544e-06, "loss": 0.86502552, "num_input_tokens_seen": 78345810, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.27648926, "step": 3631, "time_per_iteration": 2.66243839263916 }, { "auxiliary_loss_clip": 0.01463087, "auxiliary_loss_mlp": 0.00271391, "balance_loss_clip": 1.13489938, "balance_loss_mlp": 0.24100481, "epoch": 0.21836765369006464, "flos": 21720712752000.0, "grad_norm": 3.128521113035861, "language_loss": 0.8404448, "learning_rate": 3.639190937376594e-06, "loss": 0.85778958, "num_input_tokens_seen": 78364085, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.30383301, "step": 3632, "time_per_iteration": 2.62469744682312 }, { "auxiliary_loss_clip": 0.01483347, "auxiliary_loss_mlp": 0.00259056, "balance_loss_clip": 1.15066552, "balance_loss_mlp": 0.23057702, "epoch": 0.2184277769427326, "flos": 19937604775680.0, "grad_norm": 16.15847755517661, "language_loss": 0.91857946, "learning_rate": 3.638967767095249e-06, "loss": 0.93600345, "num_input_tokens_seen": 78381385, "router_z_loss_clip": 3.328125, "router_z_loss_mlp": 0.28479004, "step": 3633, "time_per_iteration": 2.845135450363159 }, { "auxiliary_loss_clip": 0.01473281, "auxiliary_loss_mlp": 0.00283533, "balance_loss_clip": 1.14247918, "balance_loss_mlp": 0.25538787, "epoch": 0.21848790019540057, "flos": 20340235301760.0, "grad_norm": 11.467027853750524, "language_loss": 0.86116087, "learning_rate": 3.6387445346640823e-06, "loss": 0.87872899, "num_input_tokens_seen": 78400500, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.28161621, "step": 3634, "time_per_iteration": 2.664198398590088 }, { "auxiliary_loss_clip": 0.0146853, "auxiliary_loss_mlp": 0.00267856, "balance_loss_clip": 1.13478589, "balance_loss_mlp": 0.24027067, "epoch": 0.21854802344806853, "flos": 15450818135040.0, "grad_norm": 2.5295402711664545, "language_loss": 0.81908536, "learning_rate": 3.638521240091558e-06, "loss": 0.83644921, "num_input_tokens_seen": 78418340, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.27612305, "step": 3635, "time_per_iteration": 2.683044672012329 }, { "auxiliary_loss_clip": 0.01466096, "auxiliary_loss_mlp": 0.00249068, "balance_loss_clip": 1.13058686, "balance_loss_mlp": 0.22128071, "epoch": 0.2186081467007365, "flos": 16320717711360.0, "grad_norm": 3.076503544580779, "language_loss": 0.95559514, "learning_rate": 3.6382978833861445e-06, "loss": 0.97274679, "num_input_tokens_seen": 78434375, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.27758789, "step": 3636, "time_per_iteration": 2.614664077758789 }, { "auxiliary_loss_clip": 0.01459183, "auxiliary_loss_mlp": 0.0026344, "balance_loss_clip": 1.12749934, "balance_loss_mlp": 0.23497277, "epoch": 0.2186682699534045, "flos": 21689255416320.0, "grad_norm": 4.253278726280932, "language_loss": 0.83852023, "learning_rate": 3.638074464556311e-06, "loss": 0.85574651, "num_input_tokens_seen": 78451735, "router_z_loss_clip": 3.31835938, "router_z_loss_mlp": 0.2845459, "step": 3637, "time_per_iteration": 2.6767051219940186 }, { "auxiliary_loss_clip": 0.01485066, "auxiliary_loss_mlp": 0.00307206, "balance_loss_clip": 1.14823139, "balance_loss_mlp": 0.27382761, "epoch": 0.21872839320607246, "flos": 17739260599680.0, "grad_norm": 85.31997841805858, "language_loss": 0.99180686, "learning_rate": 3.63785098361053e-06, "loss": 1.00972962, "num_input_tokens_seen": 78462730, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.33374023, "step": 3638, "time_per_iteration": 2.610776901245117 }, { "auxiliary_loss_clip": 0.0147862, "auxiliary_loss_mlp": 0.0028786, "balance_loss_clip": 1.14642251, "balance_loss_mlp": 0.25777155, "epoch": 0.21878851645874042, "flos": 18652289431680.0, "grad_norm": 2.7578090612849078, "language_loss": 0.99263585, "learning_rate": 3.637627440557275e-06, "loss": 1.01030064, "num_input_tokens_seen": 78476300, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.30065918, "step": 3639, "time_per_iteration": 2.6057791709899902 }, { "auxiliary_loss_clip": 0.01495865, "auxiliary_loss_mlp": 0.00265454, "balance_loss_clip": 1.15387738, "balance_loss_mlp": 0.23727328, "epoch": 0.2188486397114084, "flos": 25557301353600.0, "grad_norm": 10.085462622413498, "language_loss": 0.86392325, "learning_rate": 3.637403835405024e-06, "loss": 0.88153642, "num_input_tokens_seen": 78496135, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.28173828, "step": 3640, "time_per_iteration": 2.7305824756622314 }, { "auxiliary_loss_clip": 0.01496772, "auxiliary_loss_mlp": 0.00353767, "balance_loss_clip": 1.15911663, "balance_loss_mlp": 0.31893423, "epoch": 0.21890876296407635, "flos": 17892061056000.0, "grad_norm": 2532.3504278844507, "language_loss": 0.79868865, "learning_rate": 3.637180168162255e-06, "loss": 0.81719404, "num_input_tokens_seen": 78513855, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.34814453, "step": 3641, "time_per_iteration": 2.6700491905212402 }, { "auxiliary_loss_clip": 0.01525157, "auxiliary_loss_mlp": 0.00322348, "balance_loss_clip": 1.17823148, "balance_loss_mlp": 0.29013729, "epoch": 0.21896888621674432, "flos": 17749100926080.0, "grad_norm": 12.461890884511147, "language_loss": 0.87278593, "learning_rate": 3.63695643883745e-06, "loss": 0.89126098, "num_input_tokens_seen": 78531740, "router_z_loss_clip": 3.46679688, "router_z_loss_mlp": 0.32226562, "step": 3642, "time_per_iteration": 2.67175555229187 }, { "auxiliary_loss_clip": 0.01496852, "auxiliary_loss_mlp": 0.00301082, "balance_loss_clip": 1.15004599, "balance_loss_mlp": 0.26880044, "epoch": 0.21902900946941228, "flos": 23076161400960.0, "grad_norm": 7.5319830973416, "language_loss": 0.79980826, "learning_rate": 3.6367326474390928e-06, "loss": 0.81778765, "num_input_tokens_seen": 78549600, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.32299805, "step": 3643, "time_per_iteration": 2.645763397216797 }, { "auxiliary_loss_clip": 0.01522684, "auxiliary_loss_mlp": 0.00308077, "balance_loss_clip": 1.16307104, "balance_loss_mlp": 0.27722591, "epoch": 0.21908913272208028, "flos": 48178545004800.0, "grad_norm": 15.28325632444656, "language_loss": 0.75391221, "learning_rate": 3.6365087939756696e-06, "loss": 0.77221978, "num_input_tokens_seen": 78573350, "router_z_loss_clip": 3.59375, "router_z_loss_mlp": 0.30834961, "step": 3644, "time_per_iteration": 2.8755621910095215 }, { "auxiliary_loss_clip": 0.01530973, "auxiliary_loss_mlp": 0.00317677, "balance_loss_clip": 1.16815829, "balance_loss_mlp": 0.28472719, "epoch": 0.21914925597474824, "flos": 22236749493120.0, "grad_norm": 186.38821355849947, "language_loss": 0.88407528, "learning_rate": 3.636284878455669e-06, "loss": 0.90256178, "num_input_tokens_seen": 78591005, "router_z_loss_clip": 3.62890625, "router_z_loss_mlp": 0.32958984, "step": 3645, "time_per_iteration": 2.639883518218994 }, { "auxiliary_loss_clip": 0.01528067, "auxiliary_loss_mlp": 0.00349339, "balance_loss_clip": 1.17261732, "balance_loss_mlp": 0.31977522, "epoch": 0.2192093792274162, "flos": 22125605834880.0, "grad_norm": 3.1650002445162437, "language_loss": 0.88617051, "learning_rate": 3.636060900887582e-06, "loss": 0.9049446, "num_input_tokens_seen": 78610645, "router_z_loss_clip": 3.55664062, "router_z_loss_mlp": 0.29541016, "step": 3646, "time_per_iteration": 2.676959276199341 }, { "auxiliary_loss_clip": 0.01526124, "auxiliary_loss_mlp": 0.00318893, "balance_loss_clip": 1.17214346, "balance_loss_mlp": 0.29049695, "epoch": 0.21926950248008417, "flos": 15669442264320.0, "grad_norm": 115.50145798923924, "language_loss": 0.89224422, "learning_rate": 3.635836861279901e-06, "loss": 0.91069442, "num_input_tokens_seen": 78628340, "router_z_loss_clip": 3.54101562, "router_z_loss_mlp": 0.28417969, "step": 3647, "time_per_iteration": 4.060911178588867 }, { "auxiliary_loss_clip": 0.01526715, "auxiliary_loss_mlp": 0.00308123, "balance_loss_clip": 1.17320442, "balance_loss_mlp": 0.28029931, "epoch": 0.21932962573275214, "flos": 30262496641920.0, "grad_norm": 7.346363070088784, "language_loss": 0.77557892, "learning_rate": 3.635612759641123e-06, "loss": 0.79392731, "num_input_tokens_seen": 78649355, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.27832031, "step": 3648, "time_per_iteration": 2.7174322605133057 }, { "auxiliary_loss_clip": 0.01546703, "auxiliary_loss_mlp": 0.00358211, "balance_loss_clip": 1.18365908, "balance_loss_mlp": 0.32457009, "epoch": 0.2193897489854201, "flos": 10780132838400.0, "grad_norm": 3.8617938494841764, "language_loss": 0.81729341, "learning_rate": 3.635388595979745e-06, "loss": 0.83634257, "num_input_tokens_seen": 78664915, "router_z_loss_clip": 3.63085938, "router_z_loss_mlp": 0.33642578, "step": 3649, "time_per_iteration": 2.65700364112854 }, { "auxiliary_loss_clip": 0.01543353, "auxiliary_loss_mlp": 0.00334243, "balance_loss_clip": 1.1911056, "balance_loss_mlp": 0.30433309, "epoch": 0.21944987223808807, "flos": 19133313390720.0, "grad_norm": 5.4947272753916945, "language_loss": 0.92733514, "learning_rate": 3.635164370304267e-06, "loss": 0.94611096, "num_input_tokens_seen": 78681475, "router_z_loss_clip": 3.5234375, "router_z_loss_mlp": 0.2989502, "step": 3650, "time_per_iteration": 4.070281505584717 }, { "auxiliary_loss_clip": 0.0154735, "auxiliary_loss_mlp": 0.00319668, "balance_loss_clip": 1.19203889, "balance_loss_mlp": 0.28802982, "epoch": 0.21950999549075606, "flos": 22711093522560.0, "grad_norm": 7.316677456886712, "language_loss": 0.91237068, "learning_rate": 3.6349400826231927e-06, "loss": 0.93104088, "num_input_tokens_seen": 78702300, "router_z_loss_clip": 3.55664062, "router_z_loss_mlp": 0.31616211, "step": 3651, "time_per_iteration": 2.668449640274048 }, { "auxiliary_loss_clip": 0.01544067, "auxiliary_loss_mlp": 0.0031907, "balance_loss_clip": 1.19180608, "balance_loss_mlp": 0.29032844, "epoch": 0.21957011874342403, "flos": 10561329141120.0, "grad_norm": 29.706014275918317, "language_loss": 0.83289635, "learning_rate": 3.634715732945027e-06, "loss": 0.85152781, "num_input_tokens_seen": 78720230, "router_z_loss_clip": 3.5234375, "router_z_loss_mlp": 0.28747559, "step": 3652, "time_per_iteration": 4.025532007217407 }, { "auxiliary_loss_clip": 0.01466882, "auxiliary_loss_mlp": 0.0012509, "balance_loss_clip": 1.21648204, "balance_loss_mlp": 0.11769893, "epoch": 0.219630241996092, "flos": 65747913252480.0, "grad_norm": 0.7334509122641438, "language_loss": 0.51426327, "learning_rate": 3.6344913212782764e-06, "loss": 0.53018302, "num_input_tokens_seen": 78780200, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.07373047, "step": 3653, "time_per_iteration": 3.1085758209228516 }, { "auxiliary_loss_clip": 0.01549517, "auxiliary_loss_mlp": 0.00324187, "balance_loss_clip": 1.1953429, "balance_loss_mlp": 0.2941106, "epoch": 0.21969036524875996, "flos": 23696518216320.0, "grad_norm": 3.3551344997472445, "language_loss": 0.80652672, "learning_rate": 3.6342668476314514e-06, "loss": 0.82526374, "num_input_tokens_seen": 78800575, "router_z_loss_clip": 3.5390625, "router_z_loss_mlp": 0.30078125, "step": 3654, "time_per_iteration": 2.6303417682647705 }, { "auxiliary_loss_clip": 0.01534793, "auxiliary_loss_mlp": 0.0033485, "balance_loss_clip": 1.18015969, "balance_loss_mlp": 0.303188, "epoch": 0.21975048850142792, "flos": 19640910435840.0, "grad_norm": 95.05182995387307, "language_loss": 0.79085732, "learning_rate": 3.634042312013064e-06, "loss": 0.80955374, "num_input_tokens_seen": 78819585, "router_z_loss_clip": 3.54882812, "router_z_loss_mlp": 0.31689453, "step": 3655, "time_per_iteration": 2.5903360843658447 }, { "auxiliary_loss_clip": 0.01517805, "auxiliary_loss_mlp": 0.00306166, "balance_loss_clip": 1.16896844, "balance_loss_mlp": 0.27777046, "epoch": 0.21981061175409589, "flos": 22448550038400.0, "grad_norm": 348.9580756303383, "language_loss": 0.86485106, "learning_rate": 3.6338177144316276e-06, "loss": 0.88309073, "num_input_tokens_seen": 78837330, "router_z_loss_clip": 3.48828125, "router_z_loss_mlp": 0.28417969, "step": 3656, "time_per_iteration": 2.6252734661102295 }, { "auxiliary_loss_clip": 0.01549753, "auxiliary_loss_mlp": 0.00307802, "balance_loss_clip": 1.19306862, "balance_loss_mlp": 0.27795249, "epoch": 0.21987073500676388, "flos": 18151049093760.0, "grad_norm": 4.913705309308351, "language_loss": 0.92876476, "learning_rate": 3.63359305489566e-06, "loss": 0.94734037, "num_input_tokens_seen": 78854955, "router_z_loss_clip": 3.5703125, "router_z_loss_mlp": 0.29870605, "step": 3657, "time_per_iteration": 3.977330446243286 }, { "auxiliary_loss_clip": 0.01521146, "auxiliary_loss_mlp": 0.00344456, "balance_loss_clip": 1.16878676, "balance_loss_mlp": 0.31217372, "epoch": 0.21993085825943184, "flos": 25626177682560.0, "grad_norm": 179.05664953194702, "language_loss": 0.88201547, "learning_rate": 3.6333683334136803e-06, "loss": 0.90067148, "num_input_tokens_seen": 78874965, "router_z_loss_clip": 3.5234375, "router_z_loss_mlp": 0.32275391, "step": 3658, "time_per_iteration": 2.6576168537139893 }, { "auxiliary_loss_clip": 0.01455089, "auxiliary_loss_mlp": 0.0007329, "balance_loss_clip": 1.17938018, "balance_loss_mlp": 0.06609001, "epoch": 0.2199909815120998, "flos": 70923217743360.0, "grad_norm": 0.7877157342308037, "language_loss": 0.57986796, "learning_rate": 3.6331435499942095e-06, "loss": 0.59515178, "num_input_tokens_seen": 78937740, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.07177734, "step": 3659, "time_per_iteration": 3.222860813140869 }, { "auxiliary_loss_clip": 0.01525743, "auxiliary_loss_mlp": 0.00277454, "balance_loss_clip": 1.16858852, "balance_loss_mlp": 0.25039399, "epoch": 0.22005110476476777, "flos": 21543529939200.0, "grad_norm": 4.133740285734391, "language_loss": 0.8123244, "learning_rate": 3.632918704645772e-06, "loss": 0.83035642, "num_input_tokens_seen": 78955055, "router_z_loss_clip": 3.57226562, "router_z_loss_mlp": 0.27038574, "step": 3660, "time_per_iteration": 2.643080711364746 }, { "auxiliary_loss_clip": 0.01522395, "auxiliary_loss_mlp": 0.00280333, "balance_loss_clip": 1.16250086, "balance_loss_mlp": 0.25597811, "epoch": 0.22011122801743574, "flos": 22054502862720.0, "grad_norm": 1.8506001459858434, "language_loss": 0.86928678, "learning_rate": 3.632693797376893e-06, "loss": 0.88731408, "num_input_tokens_seen": 78974895, "router_z_loss_clip": 3.59960938, "router_z_loss_mlp": 0.24365234, "step": 3661, "time_per_iteration": 2.715698003768921 }, { "auxiliary_loss_clip": 0.01546378, "auxiliary_loss_mlp": 0.00248976, "balance_loss_clip": 1.18486845, "balance_loss_mlp": 0.22435948, "epoch": 0.2201713512701037, "flos": 26687589598080.0, "grad_norm": 2.3508647921349213, "language_loss": 0.78548348, "learning_rate": 3.632468828196102e-06, "loss": 0.80343699, "num_input_tokens_seen": 78994990, "router_z_loss_clip": 3.61523438, "router_z_loss_mlp": 0.24621582, "step": 3662, "time_per_iteration": 2.71220064163208 }, { "auxiliary_loss_clip": 0.01532182, "auxiliary_loss_mlp": 0.00302943, "balance_loss_clip": 1.17915702, "balance_loss_mlp": 0.27674121, "epoch": 0.22023147452277167, "flos": 22162198815360.0, "grad_norm": 1.7961677842534731, "language_loss": 0.83944595, "learning_rate": 3.632243797111929e-06, "loss": 0.85779721, "num_input_tokens_seen": 79014405, "router_z_loss_clip": 3.52929688, "router_z_loss_mlp": 0.26184082, "step": 3663, "time_per_iteration": 2.6385631561279297 }, { "auxiliary_loss_clip": 0.0155674, "auxiliary_loss_mlp": 0.00320358, "balance_loss_clip": 1.19525719, "balance_loss_mlp": 0.29237962, "epoch": 0.22029159777543966, "flos": 22523280284160.0, "grad_norm": 3.3494031345199176, "language_loss": 0.8724978, "learning_rate": 3.632018704132908e-06, "loss": 0.89126873, "num_input_tokens_seen": 79032375, "router_z_loss_clip": 3.6171875, "router_z_loss_mlp": 0.2800293, "step": 3664, "time_per_iteration": 2.625037908554077 }, { "auxiliary_loss_clip": 0.01535083, "auxiliary_loss_mlp": 0.00355846, "balance_loss_clip": 1.17787862, "balance_loss_mlp": 0.32373053, "epoch": 0.22035172102810763, "flos": 13042469093760.0, "grad_norm": 384.55832231825053, "language_loss": 0.8577944, "learning_rate": 3.6317935492675742e-06, "loss": 0.87670362, "num_input_tokens_seen": 79049635, "router_z_loss_clip": 3.57421875, "router_z_loss_mlp": 0.32128906, "step": 3665, "time_per_iteration": 2.625831127166748 }, { "auxiliary_loss_clip": 0.01552814, "auxiliary_loss_mlp": 0.00315692, "balance_loss_clip": 1.19230723, "balance_loss_mlp": 0.2868433, "epoch": 0.2204118442807756, "flos": 12165817760640.0, "grad_norm": 7.28471252235349, "language_loss": 1.0463388, "learning_rate": 3.631568332524466e-06, "loss": 1.0650239, "num_input_tokens_seen": 79062890, "router_z_loss_clip": 3.60546875, "router_z_loss_mlp": 0.28845215, "step": 3666, "time_per_iteration": 2.5694448947906494 }, { "auxiliary_loss_clip": 0.01540949, "auxiliary_loss_mlp": 0.00278159, "balance_loss_clip": 1.19062483, "balance_loss_mlp": 0.25189722, "epoch": 0.22047196753344356, "flos": 40108806673920.0, "grad_norm": 2.542331521339245, "language_loss": 0.84623539, "learning_rate": 3.631343053912122e-06, "loss": 0.86442649, "num_input_tokens_seen": 79085495, "router_z_loss_clip": 3.5, "router_z_loss_mlp": 0.26245117, "step": 3667, "time_per_iteration": 2.775805711746216 }, { "auxiliary_loss_clip": 0.01549659, "auxiliary_loss_mlp": 0.00328319, "balance_loss_clip": 1.19482636, "balance_loss_mlp": 0.29730076, "epoch": 0.22053209078611152, "flos": 20701137202560.0, "grad_norm": 6.6077892324985195, "language_loss": 0.84722066, "learning_rate": 3.631117713439087e-06, "loss": 0.86600041, "num_input_tokens_seen": 79101820, "router_z_loss_clip": 3.55273438, "router_z_loss_mlp": 0.31030273, "step": 3668, "time_per_iteration": 2.6298768520355225 }, { "auxiliary_loss_clip": 0.01535656, "auxiliary_loss_mlp": 0.00271464, "balance_loss_clip": 1.18987036, "balance_loss_mlp": 0.2437951, "epoch": 0.2205922140387795, "flos": 24716309247360.0, "grad_norm": 5.451919598688381, "language_loss": 0.77815115, "learning_rate": 3.630892311113904e-06, "loss": 0.79622245, "num_input_tokens_seen": 79123320, "router_z_loss_clip": 3.45507812, "router_z_loss_mlp": 0.27697754, "step": 3669, "time_per_iteration": 2.6486165523529053 }, { "auxiliary_loss_clip": 0.01534398, "auxiliary_loss_mlp": 0.00296183, "balance_loss_clip": 1.18771958, "balance_loss_mlp": 0.27112502, "epoch": 0.22065233729144745, "flos": 23477247642240.0, "grad_norm": 3.6751953844631573, "language_loss": 0.90529603, "learning_rate": 3.6306668469451215e-06, "loss": 0.92360187, "num_input_tokens_seen": 79141615, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.25048828, "step": 3670, "time_per_iteration": 2.6147139072418213 }, { "auxiliary_loss_clip": 0.01571282, "auxiliary_loss_mlp": 0.00282816, "balance_loss_clip": 1.21981442, "balance_loss_mlp": 0.25542152, "epoch": 0.22071246054411545, "flos": 35225566646400.0, "grad_norm": 2.4633340471157505, "language_loss": 0.83720291, "learning_rate": 3.6304413209412886e-06, "loss": 0.85574389, "num_input_tokens_seen": 79164910, "router_z_loss_clip": 3.515625, "router_z_loss_mlp": 0.27404785, "step": 3671, "time_per_iteration": 2.6902925968170166 }, { "auxiliary_loss_clip": 0.01572812, "auxiliary_loss_mlp": 0.00244554, "balance_loss_clip": 1.22162938, "balance_loss_mlp": 0.2215583, "epoch": 0.2207725837967834, "flos": 18150294908160.0, "grad_norm": 2.204238601109321, "language_loss": 0.88393527, "learning_rate": 3.6302157331109573e-06, "loss": 0.90210891, "num_input_tokens_seen": 79179685, "router_z_loss_clip": 3.51171875, "router_z_loss_mlp": 0.22998047, "step": 3672, "time_per_iteration": 2.5950493812561035 }, { "auxiliary_loss_clip": 0.01552066, "auxiliary_loss_mlp": 0.00272158, "balance_loss_clip": 1.2056942, "balance_loss_mlp": 0.24626553, "epoch": 0.22083270704945138, "flos": 20479675898880.0, "grad_norm": 1.7615580898166965, "language_loss": 0.80253083, "learning_rate": 3.629990083462682e-06, "loss": 0.82077307, "num_input_tokens_seen": 79196285, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.25878906, "step": 3673, "time_per_iteration": 2.6458864212036133 }, { "auxiliary_loss_clip": 0.01555227, "auxiliary_loss_mlp": 0.00287149, "balance_loss_clip": 1.21455431, "balance_loss_mlp": 0.25923008, "epoch": 0.22089283030211934, "flos": 34125801984000.0, "grad_norm": 11.781788948197653, "language_loss": 0.85621011, "learning_rate": 3.6297643720050203e-06, "loss": 0.87463385, "num_input_tokens_seen": 79216060, "router_z_loss_clip": 3.40625, "router_z_loss_mlp": 0.27929688, "step": 3674, "time_per_iteration": 2.7116451263427734 }, { "auxiliary_loss_clip": 0.01559789, "auxiliary_loss_mlp": 0.00293964, "balance_loss_clip": 1.21605468, "balance_loss_mlp": 0.26558033, "epoch": 0.2209529535547873, "flos": 18077216688000.0, "grad_norm": 1.8237942258806847, "language_loss": 0.84058917, "learning_rate": 3.6295385987465293e-06, "loss": 0.85912669, "num_input_tokens_seen": 79235145, "router_z_loss_clip": 3.43359375, "router_z_loss_mlp": 0.28393555, "step": 3675, "time_per_iteration": 2.6111960411071777 }, { "auxiliary_loss_clip": 0.01561282, "auxiliary_loss_mlp": 0.00284995, "balance_loss_clip": 1.22122908, "balance_loss_mlp": 0.25822049, "epoch": 0.22101307680745527, "flos": 27235335070080.0, "grad_norm": 7.848373884358802, "language_loss": 0.87316263, "learning_rate": 3.629312763695772e-06, "loss": 0.8916254, "num_input_tokens_seen": 79256960, "router_z_loss_clip": 3.40625, "router_z_loss_mlp": 0.26782227, "step": 3676, "time_per_iteration": 2.6955549716949463 }, { "auxiliary_loss_clip": 0.01549374, "auxiliary_loss_mlp": 0.00304344, "balance_loss_clip": 1.21278012, "balance_loss_mlp": 0.27621022, "epoch": 0.22107320006012326, "flos": 16543256423040.0, "grad_norm": 3.747263336852701, "language_loss": 0.86282802, "learning_rate": 3.6290868668613107e-06, "loss": 0.88136518, "num_input_tokens_seen": 79274860, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.28125, "step": 3677, "time_per_iteration": 2.59421968460083 }, { "auxiliary_loss_clip": 0.01574273, "auxiliary_loss_mlp": 0.00288401, "balance_loss_clip": 1.22534668, "balance_loss_mlp": 0.26159057, "epoch": 0.22113332331279123, "flos": 22054466949120.0, "grad_norm": 11.14396513509158, "language_loss": 0.91137254, "learning_rate": 3.628860908251712e-06, "loss": 0.92999923, "num_input_tokens_seen": 79294005, "router_z_loss_clip": 3.49023438, "router_z_loss_mlp": 0.26794434, "step": 3678, "time_per_iteration": 2.643198251724243 }, { "auxiliary_loss_clip": 0.01575697, "auxiliary_loss_mlp": 0.00275592, "balance_loss_clip": 1.22563601, "balance_loss_mlp": 0.24915087, "epoch": 0.2211934465654592, "flos": 26612787525120.0, "grad_norm": 1.787366371080943, "language_loss": 0.95562923, "learning_rate": 3.6286348878755452e-06, "loss": 0.97414219, "num_input_tokens_seen": 79314005, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.2644043, "step": 3679, "time_per_iteration": 2.693951368331909 }, { "auxiliary_loss_clip": 0.01613782, "auxiliary_loss_mlp": 0.00287671, "balance_loss_clip": 1.25679982, "balance_loss_mlp": 0.26180291, "epoch": 0.22125356981812716, "flos": 16360363347840.0, "grad_norm": 18.093333379184173, "language_loss": 0.94428951, "learning_rate": 3.6284088057413803e-06, "loss": 0.96330404, "num_input_tokens_seen": 79331030, "router_z_loss_clip": 3.5703125, "router_z_loss_mlp": 0.25915527, "step": 3680, "time_per_iteration": 2.6228039264678955 }, { "auxiliary_loss_clip": 0.01623027, "auxiliary_loss_mlp": 0.00276076, "balance_loss_clip": 1.26449943, "balance_loss_mlp": 0.24900314, "epoch": 0.22131369307079513, "flos": 21651118151040.0, "grad_norm": 24.435980862523223, "language_loss": 0.88085759, "learning_rate": 3.6281826618577894e-06, "loss": 0.89984852, "num_input_tokens_seen": 79348560, "router_z_loss_clip": 3.59179688, "router_z_loss_mlp": 0.27062988, "step": 3681, "time_per_iteration": 2.6269021034240723 }, { "auxiliary_loss_clip": 0.01608963, "auxiliary_loss_mlp": 0.00278049, "balance_loss_clip": 1.25551772, "balance_loss_mlp": 0.25238347, "epoch": 0.2213738163234631, "flos": 19609524927360.0, "grad_norm": 8.5076292668002, "language_loss": 0.88539588, "learning_rate": 3.62795645623335e-06, "loss": 0.904266, "num_input_tokens_seen": 79367175, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.25683594, "step": 3682, "time_per_iteration": 2.5884249210357666 }, { "auxiliary_loss_clip": 0.01619954, "auxiliary_loss_mlp": 0.00307035, "balance_loss_clip": 1.26633716, "balance_loss_mlp": 0.28039202, "epoch": 0.22143393957613106, "flos": 23623404082560.0, "grad_norm": 2.768790432028993, "language_loss": 0.8463006, "learning_rate": 3.627730188876638e-06, "loss": 0.86557055, "num_input_tokens_seen": 79388435, "router_z_loss_clip": 3.53515625, "router_z_loss_mlp": 0.26635742, "step": 3683, "time_per_iteration": 2.6336348056793213 }, { "auxiliary_loss_clip": 0.01608906, "auxiliary_loss_mlp": 0.00294743, "balance_loss_clip": 1.25589383, "balance_loss_mlp": 0.26802772, "epoch": 0.22149406282879905, "flos": 26177801823360.0, "grad_norm": 8.180395690943065, "language_loss": 0.8207382, "learning_rate": 3.627503859796234e-06, "loss": 0.83977461, "num_input_tokens_seen": 79407910, "router_z_loss_clip": 3.53125, "router_z_loss_mlp": 0.26696777, "step": 3684, "time_per_iteration": 2.6592659950256348 }, { "auxiliary_loss_clip": 0.01625066, "auxiliary_loss_mlp": 0.00287154, "balance_loss_clip": 1.2682054, "balance_loss_mlp": 0.25909197, "epoch": 0.221554186081467, "flos": 14538758970240.0, "grad_norm": 2.8508009955158307, "language_loss": 0.87378955, "learning_rate": 3.6272774690007207e-06, "loss": 0.89291167, "num_input_tokens_seen": 79424020, "router_z_loss_clip": 3.56835938, "router_z_loss_mlp": 0.28063965, "step": 3685, "time_per_iteration": 2.6591267585754395 }, { "auxiliary_loss_clip": 0.01647171, "auxiliary_loss_mlp": 0.00262565, "balance_loss_clip": 1.28293014, "balance_loss_mlp": 0.23700607, "epoch": 0.22161430933413498, "flos": 22238257864320.0, "grad_norm": 1.3897668354112773, "language_loss": 0.91917908, "learning_rate": 3.6270510164986823e-06, "loss": 0.93827653, "num_input_tokens_seen": 79445605, "router_z_loss_clip": 3.64453125, "router_z_loss_mlp": 0.2557373, "step": 3686, "time_per_iteration": 2.652676582336426 }, { "auxiliary_loss_clip": 0.01629834, "auxiliary_loss_mlp": 0.00379878, "balance_loss_clip": 1.27197981, "balance_loss_mlp": 0.34922922, "epoch": 0.22167443258680294, "flos": 23476529370240.0, "grad_norm": 5.822234974423388, "language_loss": 0.86127198, "learning_rate": 3.626824502298707e-06, "loss": 0.88136911, "num_input_tokens_seen": 79463850, "router_z_loss_clip": 3.57617188, "router_z_loss_mlp": 0.30639648, "step": 3687, "time_per_iteration": 2.6901378631591797 }, { "auxiliary_loss_clip": 0.01642433, "auxiliary_loss_mlp": 0.00331323, "balance_loss_clip": 1.28225315, "balance_loss_mlp": 0.3009485, "epoch": 0.2217345558394709, "flos": 23221132692480.0, "grad_norm": 13.64467537306214, "language_loss": 0.93890786, "learning_rate": 3.626597926409383e-06, "loss": 0.95864534, "num_input_tokens_seen": 79482845, "router_z_loss_clip": 3.60546875, "router_z_loss_mlp": 0.30371094, "step": 3688, "time_per_iteration": 2.617197036743164 }, { "auxiliary_loss_clip": 0.01657546, "auxiliary_loss_mlp": 0.00372826, "balance_loss_clip": 1.30028629, "balance_loss_mlp": 0.33963817, "epoch": 0.22179467909213887, "flos": 20011078045440.0, "grad_norm": 24.42787567537049, "language_loss": 0.87923419, "learning_rate": 3.6263712888393027e-06, "loss": 0.89953792, "num_input_tokens_seen": 79501550, "router_z_loss_clip": 3.57226562, "router_z_loss_mlp": 0.33178711, "step": 3689, "time_per_iteration": 4.033870458602905 }, { "auxiliary_loss_clip": 0.01672387, "auxiliary_loss_mlp": 0.00348362, "balance_loss_clip": 1.31137133, "balance_loss_mlp": 0.31888172, "epoch": 0.22185480234480687, "flos": 19683034110720.0, "grad_norm": 2.9699809532193893, "language_loss": 0.78957111, "learning_rate": 3.626144589597061e-06, "loss": 0.80977857, "num_input_tokens_seen": 79519680, "router_z_loss_clip": 3.60546875, "router_z_loss_mlp": 0.2947998, "step": 3690, "time_per_iteration": 2.629692554473877 }, { "auxiliary_loss_clip": 0.01688025, "auxiliary_loss_mlp": 0.00370755, "balance_loss_clip": 1.32806337, "balance_loss_mlp": 0.33840209, "epoch": 0.22191492559747483, "flos": 21981316901760.0, "grad_norm": 17.317343953410802, "language_loss": 0.81772292, "learning_rate": 3.6259178286912528e-06, "loss": 0.83831072, "num_input_tokens_seen": 79539000, "router_z_loss_clip": 3.59960938, "router_z_loss_mlp": 0.32324219, "step": 3691, "time_per_iteration": 2.6158835887908936 }, { "auxiliary_loss_clip": 0.01698609, "auxiliary_loss_mlp": 0.00353021, "balance_loss_clip": 1.33793259, "balance_loss_mlp": 0.32047677, "epoch": 0.2219750488501428, "flos": 23222066446080.0, "grad_norm": 5.607678534291826, "language_loss": 0.81631404, "learning_rate": 3.625691006130477e-06, "loss": 0.83683038, "num_input_tokens_seen": 79559695, "router_z_loss_clip": 3.60742188, "router_z_loss_mlp": 0.32543945, "step": 3692, "time_per_iteration": 4.0299012660980225 }, { "auxiliary_loss_clip": 0.01697891, "auxiliary_loss_mlp": 0.00374294, "balance_loss_clip": 1.34024358, "balance_loss_mlp": 0.34189284, "epoch": 0.22203517210281076, "flos": 22453685683200.0, "grad_norm": 11.286296493580691, "language_loss": 0.93502867, "learning_rate": 3.6254641219233362e-06, "loss": 0.95575052, "num_input_tokens_seen": 79579095, "router_z_loss_clip": 3.57617188, "router_z_loss_mlp": 0.32397461, "step": 3693, "time_per_iteration": 2.6183626651763916 }, { "auxiliary_loss_clip": 0.01696813, "auxiliary_loss_mlp": 0.00323026, "balance_loss_clip": 1.33565855, "balance_loss_mlp": 0.29379582, "epoch": 0.22209529535547873, "flos": 17564555825280.0, "grad_norm": 11.638232903147339, "language_loss": 0.94132411, "learning_rate": 3.6252371760784325e-06, "loss": 0.96152258, "num_input_tokens_seen": 79596430, "router_z_loss_clip": 3.61523438, "router_z_loss_mlp": 0.29248047, "step": 3694, "time_per_iteration": 2.5532171726226807 }, { "auxiliary_loss_clip": 0.01696223, "auxiliary_loss_mlp": 0.00341683, "balance_loss_clip": 1.32694256, "balance_loss_mlp": 0.3092584, "epoch": 0.2221554186081467, "flos": 21469015175040.0, "grad_norm": 1.9933793717720325, "language_loss": 0.7982434, "learning_rate": 3.6250101686043725e-06, "loss": 0.81862247, "num_input_tokens_seen": 79615825, "router_z_loss_clip": 3.6953125, "router_z_loss_mlp": 0.32421875, "step": 3695, "time_per_iteration": 4.054062366485596 }, { "auxiliary_loss_clip": 0.01712687, "auxiliary_loss_mlp": 0.00365992, "balance_loss_clip": 1.35601735, "balance_loss_mlp": 0.33487841, "epoch": 0.22221554186081466, "flos": 27673445255040.0, "grad_norm": 50.33291954235224, "language_loss": 0.77830058, "learning_rate": 3.6247830995097637e-06, "loss": 0.79908735, "num_input_tokens_seen": 79637875, "router_z_loss_clip": 3.56445312, "router_z_loss_mlp": 0.3112793, "step": 3696, "time_per_iteration": 2.6508381366729736 }, { "auxiliary_loss_clip": 0.01721258, "auxiliary_loss_mlp": 0.00355809, "balance_loss_clip": 1.35634518, "balance_loss_mlp": 0.32448041, "epoch": 0.22227566511348265, "flos": 25958926298880.0, "grad_norm": 23.22595493798862, "language_loss": 0.9437325, "learning_rate": 3.624555968803217e-06, "loss": 0.96450323, "num_input_tokens_seen": 79656970, "router_z_loss_clip": 3.65039062, "router_z_loss_mlp": 0.31347656, "step": 3697, "time_per_iteration": 2.628314256668091 }, { "auxiliary_loss_clip": 0.01733714, "auxiliary_loss_mlp": 0.00316092, "balance_loss_clip": 1.36933804, "balance_loss_mlp": 0.28569365, "epoch": 0.22233578836615062, "flos": 39203678833920.0, "grad_norm": 1.5316105139934753, "language_loss": 0.73240703, "learning_rate": 3.624328776493346e-06, "loss": 0.75290513, "num_input_tokens_seen": 79680275, "router_z_loss_clip": 3.64453125, "router_z_loss_mlp": 0.30395508, "step": 3698, "time_per_iteration": 2.74393892288208 }, { "auxiliary_loss_clip": 0.01728017, "auxiliary_loss_mlp": 0.00347417, "balance_loss_clip": 1.36542499, "balance_loss_mlp": 0.31413391, "epoch": 0.22239591161881858, "flos": 36283782251520.0, "grad_norm": 79.84971668382472, "language_loss": 0.90833133, "learning_rate": 3.6241015225887637e-06, "loss": 0.92908561, "num_input_tokens_seen": 79701255, "router_z_loss_clip": 3.62890625, "router_z_loss_mlp": 0.33276367, "step": 3699, "time_per_iteration": 4.102436304092407 }, { "auxiliary_loss_clip": 0.01722163, "auxiliary_loss_mlp": 0.00320874, "balance_loss_clip": 1.36665392, "balance_loss_mlp": 0.29054725, "epoch": 0.22245603487148655, "flos": 19719591177600.0, "grad_norm": 2.054063251891168, "language_loss": 0.87160373, "learning_rate": 3.62387420709809e-06, "loss": 0.89203405, "num_input_tokens_seen": 79721315, "router_z_loss_clip": 3.55859375, "router_z_loss_mlp": 0.30273438, "step": 3700, "time_per_iteration": 2.6151325702667236 }, { "auxiliary_loss_clip": 0.01728938, "auxiliary_loss_mlp": 0.00365476, "balance_loss_clip": 1.37781692, "balance_loss_mlp": 0.33104861, "epoch": 0.2225161581241545, "flos": 46280450615040.0, "grad_norm": 16.31634392870071, "language_loss": 0.80071652, "learning_rate": 3.623646830029943e-06, "loss": 0.82166064, "num_input_tokens_seen": 79742705, "router_z_loss_clip": 3.50976562, "router_z_loss_mlp": 0.34399414, "step": 3701, "time_per_iteration": 2.809668779373169 }, { "auxiliary_loss_clip": 0.01752385, "auxiliary_loss_mlp": 0.0031881, "balance_loss_clip": 1.4018718, "balance_loss_mlp": 0.28678989, "epoch": 0.22257628137682248, "flos": 23696194993920.0, "grad_norm": 31.486857422584386, "language_loss": 0.8939867, "learning_rate": 3.6234193913929454e-06, "loss": 0.91469866, "num_input_tokens_seen": 79763000, "router_z_loss_clip": 3.50976562, "router_z_loss_mlp": 0.32006836, "step": 3702, "time_per_iteration": 2.6198415756225586 }, { "auxiliary_loss_clip": 0.01733855, "auxiliary_loss_mlp": 0.00303546, "balance_loss_clip": 1.39256454, "balance_loss_mlp": 0.27383897, "epoch": 0.22263640462949044, "flos": 19353984595200.0, "grad_norm": 101.47959777847205, "language_loss": 0.86793268, "learning_rate": 3.623191891195723e-06, "loss": 0.88830674, "num_input_tokens_seen": 79781335, "router_z_loss_clip": 3.4140625, "router_z_loss_mlp": 0.29736328, "step": 3703, "time_per_iteration": 2.614367961883545 }, { "auxiliary_loss_clip": 0.0172518, "auxiliary_loss_mlp": 0.00306889, "balance_loss_clip": 1.38021469, "balance_loss_mlp": 0.27551284, "epoch": 0.22269652788215843, "flos": 20776047016320.0, "grad_norm": 22.16151273097123, "language_loss": 0.83292335, "learning_rate": 3.6229643294469005e-06, "loss": 0.85324407, "num_input_tokens_seen": 79800150, "router_z_loss_clip": 3.4453125, "router_z_loss_mlp": 0.31396484, "step": 3704, "time_per_iteration": 2.631744623184204 }, { "auxiliary_loss_clip": 0.01740922, "auxiliary_loss_mlp": 0.00297918, "balance_loss_clip": 1.39778256, "balance_loss_mlp": 0.26801986, "epoch": 0.2227566511348264, "flos": 47958843467520.0, "grad_norm": 9.415190522015516, "language_loss": 0.71521211, "learning_rate": 3.6227367061551074e-06, "loss": 0.73560047, "num_input_tokens_seen": 79822390, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.29907227, "step": 3705, "time_per_iteration": 2.8331565856933594 }, { "auxiliary_loss_clip": 0.01713981, "auxiliary_loss_mlp": 0.00064361, "balance_loss_clip": 1.4706037, "balance_loss_mlp": 0.0575418, "epoch": 0.22281677438749437, "flos": 66218953230720.0, "grad_norm": 1.0600923250007073, "language_loss": 0.64869475, "learning_rate": 3.6225090213289766e-06, "loss": 0.66647816, "num_input_tokens_seen": 79873350, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.06835938, "step": 3706, "time_per_iteration": 3.0434606075286865 }, { "auxiliary_loss_clip": 0.01760707, "auxiliary_loss_mlp": 0.00309879, "balance_loss_clip": 1.41355371, "balance_loss_mlp": 0.27659565, "epoch": 0.22287689764016233, "flos": 21871609787520.0, "grad_norm": 12.079676041156633, "language_loss": 0.88239419, "learning_rate": 3.622281274977141e-06, "loss": 0.90310007, "num_input_tokens_seen": 79891715, "router_z_loss_clip": 3.47070312, "router_z_loss_mlp": 0.33276367, "step": 3707, "time_per_iteration": 2.7407302856445312 }, { "auxiliary_loss_clip": 0.01747122, "auxiliary_loss_mlp": 0.00286677, "balance_loss_clip": 1.40257502, "balance_loss_mlp": 0.25776899, "epoch": 0.2229370208928303, "flos": 27672475587840.0, "grad_norm": 536.1346735333221, "language_loss": 0.86869705, "learning_rate": 3.6220534671082367e-06, "loss": 0.88903505, "num_input_tokens_seen": 79911175, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.2890625, "step": 3708, "time_per_iteration": 2.645179510116577 }, { "auxiliary_loss_clip": 0.01768878, "auxiliary_loss_mlp": 0.00292178, "balance_loss_clip": 1.41839325, "balance_loss_mlp": 0.25946712, "epoch": 0.22299714414549826, "flos": 30154657034880.0, "grad_norm": 5.761874765795653, "language_loss": 0.89920354, "learning_rate": 3.6218255977309024e-06, "loss": 0.91981405, "num_input_tokens_seen": 79931875, "router_z_loss_clip": 3.50390625, "router_z_loss_mlp": 0.32714844, "step": 3709, "time_per_iteration": 2.674984931945801 }, { "auxiliary_loss_clip": 0.01753318, "auxiliary_loss_mlp": 0.00312178, "balance_loss_clip": 1.40389371, "balance_loss_mlp": 0.27786934, "epoch": 0.22305726739816625, "flos": 23143134309120.0, "grad_norm": 3.1592723537410716, "language_loss": 0.78936088, "learning_rate": 3.6215976668537787e-06, "loss": 0.81001586, "num_input_tokens_seen": 79952445, "router_z_loss_clip": 3.49609375, "router_z_loss_mlp": 0.34289551, "step": 3710, "time_per_iteration": 2.633990526199341 }, { "auxiliary_loss_clip": 0.01743961, "auxiliary_loss_mlp": 0.0028421, "balance_loss_clip": 1.40137339, "balance_loss_mlp": 0.25409758, "epoch": 0.22311739065083422, "flos": 19172061187200.0, "grad_norm": 6.357035540803494, "language_loss": 0.97972631, "learning_rate": 3.6213696744855096e-06, "loss": 1.00000811, "num_input_tokens_seen": 79971030, "router_z_loss_clip": 3.4296875, "router_z_loss_mlp": 0.30102539, "step": 3711, "time_per_iteration": 2.5900700092315674 }, { "auxiliary_loss_clip": 0.01745471, "auxiliary_loss_mlp": 0.00321092, "balance_loss_clip": 1.40614247, "balance_loss_mlp": 0.28659284, "epoch": 0.22317751390350218, "flos": 13617757319040.0, "grad_norm": 33.553328462623845, "language_loss": 0.99660897, "learning_rate": 3.6211416206347395e-06, "loss": 1.01727462, "num_input_tokens_seen": 79982085, "router_z_loss_clip": 3.39257812, "router_z_loss_mlp": 0.3449707, "step": 3712, "time_per_iteration": 2.5452916622161865 }, { "auxiliary_loss_clip": 0.0174944, "auxiliary_loss_mlp": 0.00277402, "balance_loss_clip": 1.41678905, "balance_loss_mlp": 0.24478626, "epoch": 0.22323763715617015, "flos": 11029065068160.0, "grad_norm": 5.096945967082815, "language_loss": 0.86157012, "learning_rate": 3.620913505310117e-06, "loss": 0.88183856, "num_input_tokens_seen": 79997460, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.32653809, "step": 3713, "time_per_iteration": 2.5520167350769043 }, { "auxiliary_loss_clip": 0.01746571, "auxiliary_loss_mlp": 0.00291547, "balance_loss_clip": 1.40656614, "balance_loss_mlp": 0.26095751, "epoch": 0.22329776040883811, "flos": 41351531466240.0, "grad_norm": 76.01407686239082, "language_loss": 0.69798458, "learning_rate": 3.6206853285202917e-06, "loss": 0.71836579, "num_input_tokens_seen": 80022450, "router_z_loss_clip": 3.40039062, "router_z_loss_mlp": 0.30615234, "step": 3714, "time_per_iteration": 2.7956221103668213 }, { "auxiliary_loss_clip": 0.0175136, "auxiliary_loss_mlp": 0.00290504, "balance_loss_clip": 1.41132259, "balance_loss_mlp": 0.25977197, "epoch": 0.22335788366150608, "flos": 25119478477440.0, "grad_norm": 5.1323160327537165, "language_loss": 0.86282134, "learning_rate": 3.6204570902739164e-06, "loss": 0.88323998, "num_input_tokens_seen": 80042100, "router_z_loss_clip": 3.40039062, "router_z_loss_mlp": 0.30761719, "step": 3715, "time_per_iteration": 2.644925832748413 }, { "auxiliary_loss_clip": 0.01729833, "auxiliary_loss_mlp": 0.00285261, "balance_loss_clip": 1.40240288, "balance_loss_mlp": 0.25469548, "epoch": 0.22341800691417404, "flos": 16983377769600.0, "grad_norm": 385.06320342901046, "language_loss": 0.8525815, "learning_rate": 3.620228790579645e-06, "loss": 0.8727324, "num_input_tokens_seen": 80059690, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.30566406, "step": 3716, "time_per_iteration": 2.586158514022827 }, { "auxiliary_loss_clip": 0.01753008, "auxiliary_loss_mlp": 0.00277589, "balance_loss_clip": 1.41590047, "balance_loss_mlp": 0.24630815, "epoch": 0.22347813016684204, "flos": 14136738975360.0, "grad_norm": 14.075362001549422, "language_loss": 0.88795424, "learning_rate": 3.6200004294461367e-06, "loss": 0.90826023, "num_input_tokens_seen": 80076060, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.31262207, "step": 3717, "time_per_iteration": 2.5936636924743652 }, { "auxiliary_loss_clip": 0.01759981, "auxiliary_loss_mlp": 0.00293332, "balance_loss_clip": 1.41568518, "balance_loss_mlp": 0.25711617, "epoch": 0.22353825341951, "flos": 23583147914880.0, "grad_norm": 1684.0348916195737, "language_loss": 0.7658022, "learning_rate": 3.6197720068820497e-06, "loss": 0.78633535, "num_input_tokens_seen": 80094760, "router_z_loss_clip": 3.44140625, "router_z_loss_mlp": 0.36206055, "step": 3718, "time_per_iteration": 2.6291565895080566 }, { "auxiliary_loss_clip": 0.01760422, "auxiliary_loss_mlp": 0.00290654, "balance_loss_clip": 1.42101526, "balance_loss_mlp": 0.25384185, "epoch": 0.22359837667217797, "flos": 29824206888960.0, "grad_norm": 2.9258486960805072, "language_loss": 0.8650775, "learning_rate": 3.619543522896045e-06, "loss": 0.88558829, "num_input_tokens_seen": 80114475, "router_z_loss_clip": 3.39257812, "router_z_loss_mlp": 0.36816406, "step": 3719, "time_per_iteration": 2.658836603164673 }, { "auxiliary_loss_clip": 0.01748132, "auxiliary_loss_mlp": 0.00285808, "balance_loss_clip": 1.40807343, "balance_loss_mlp": 0.24863864, "epoch": 0.22365849992484593, "flos": 17603088140160.0, "grad_norm": 18.19607218504418, "language_loss": 0.95700097, "learning_rate": 3.6193149774967885e-06, "loss": 0.97734034, "num_input_tokens_seen": 80132920, "router_z_loss_clip": 3.40429688, "router_z_loss_mlp": 0.37158203, "step": 3720, "time_per_iteration": 2.607684373855591 }, { "auxiliary_loss_clip": 0.01748168, "auxiliary_loss_mlp": 0.00256905, "balance_loss_clip": 1.41860306, "balance_loss_mlp": 0.2235266, "epoch": 0.2237186231775139, "flos": 22710949868160.0, "grad_norm": 1.883720739458755, "language_loss": 0.79906577, "learning_rate": 3.619086370692945e-06, "loss": 0.81911647, "num_input_tokens_seen": 80152845, "router_z_loss_clip": 3.29882812, "router_z_loss_mlp": 0.33374023, "step": 3721, "time_per_iteration": 2.6033802032470703 }, { "auxiliary_loss_clip": 0.01753427, "auxiliary_loss_mlp": 0.00281108, "balance_loss_clip": 1.41026485, "balance_loss_mlp": 0.24584566, "epoch": 0.22377874643018186, "flos": 13371518609280.0, "grad_norm": 2.5992682267625895, "language_loss": 0.88445222, "learning_rate": 3.6188577024931844e-06, "loss": 0.90479761, "num_input_tokens_seen": 80170680, "router_z_loss_clip": 3.43359375, "router_z_loss_mlp": 0.35253906, "step": 3722, "time_per_iteration": 2.6264517307281494 }, { "auxiliary_loss_clip": 0.01716987, "auxiliary_loss_mlp": 0.00275511, "balance_loss_clip": 1.38306284, "balance_loss_mlp": 0.24468386, "epoch": 0.22383886968284986, "flos": 17894970057600.0, "grad_norm": 4.237143029277335, "language_loss": 0.90874046, "learning_rate": 3.618628972906178e-06, "loss": 0.92866552, "num_input_tokens_seen": 80189030, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.30834961, "step": 3723, "time_per_iteration": 2.566830635070801 }, { "auxiliary_loss_clip": 0.01707523, "auxiliary_loss_mlp": 0.00300966, "balance_loss_clip": 1.37428427, "balance_loss_mlp": 0.26658565, "epoch": 0.22389899293551782, "flos": 23879123982720.0, "grad_norm": 41.43982533131782, "language_loss": 0.93413842, "learning_rate": 3.6184001819405984e-06, "loss": 0.95422328, "num_input_tokens_seen": 80208365, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.34375, "step": 3724, "time_per_iteration": 2.6416313648223877 }, { "auxiliary_loss_clip": 0.0172444, "auxiliary_loss_mlp": 0.00282778, "balance_loss_clip": 1.38867891, "balance_loss_mlp": 0.24773084, "epoch": 0.2239591161881858, "flos": 27272430840960.0, "grad_norm": 7.476115715257439, "language_loss": 0.87355769, "learning_rate": 3.618171329605121e-06, "loss": 0.89362991, "num_input_tokens_seen": 80228685, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.3503418, "step": 3725, "time_per_iteration": 2.640695095062256 }, { "auxiliary_loss_clip": 0.01709947, "auxiliary_loss_mlp": 0.0027517, "balance_loss_clip": 1.37383604, "balance_loss_mlp": 0.24431907, "epoch": 0.22401923944085375, "flos": 22236857233920.0, "grad_norm": 2.1957370696906846, "language_loss": 0.8588782, "learning_rate": 3.6179424159084254e-06, "loss": 0.87872934, "num_input_tokens_seen": 80247635, "router_z_loss_clip": 3.36132812, "router_z_loss_mlp": 0.30859375, "step": 3726, "time_per_iteration": 2.6212878227233887 }, { "auxiliary_loss_clip": 0.01706481, "auxiliary_loss_mlp": 0.00308717, "balance_loss_clip": 1.36788344, "balance_loss_mlp": 0.27307314, "epoch": 0.22407936269352172, "flos": 12053668521600.0, "grad_norm": 6.265205322844015, "language_loss": 0.86108208, "learning_rate": 3.6177134408591914e-06, "loss": 0.88123411, "num_input_tokens_seen": 80260045, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.35644531, "step": 3727, "time_per_iteration": 2.750154972076416 }, { "auxiliary_loss_clip": 0.016848, "auxiliary_loss_mlp": 0.00297886, "balance_loss_clip": 1.34830165, "balance_loss_mlp": 0.26226667, "epoch": 0.22413948594618968, "flos": 19353553632000.0, "grad_norm": 2.9916809250902845, "language_loss": 0.9560867, "learning_rate": 3.6174844044661013e-06, "loss": 0.97591352, "num_input_tokens_seen": 80277680, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.35668945, "step": 3728, "time_per_iteration": 2.674445390701294 }, { "auxiliary_loss_clip": 0.01666801, "auxiliary_loss_mlp": 0.00289349, "balance_loss_clip": 1.33292699, "balance_loss_mlp": 0.25725788, "epoch": 0.22419960919885765, "flos": 24170000319360.0, "grad_norm": 3.487450119551203, "language_loss": 0.91883874, "learning_rate": 3.6172553067378406e-06, "loss": 0.93840027, "num_input_tokens_seen": 80294795, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.32104492, "step": 3729, "time_per_iteration": 2.6321234703063965 }, { "auxiliary_loss_clip": 0.01651864, "auxiliary_loss_mlp": 0.00260111, "balance_loss_clip": 1.32876813, "balance_loss_mlp": 0.23073794, "epoch": 0.22425973245152564, "flos": 27378977558400.0, "grad_norm": 1.8051955359619953, "language_loss": 0.93593144, "learning_rate": 3.6170261476830964e-06, "loss": 0.95505118, "num_input_tokens_seen": 80315425, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.2935791, "step": 3730, "time_per_iteration": 2.642503499984741 }, { "auxiliary_loss_clip": 0.01644362, "auxiliary_loss_mlp": 0.00287921, "balance_loss_clip": 1.31437838, "balance_loss_mlp": 0.25578254, "epoch": 0.2243198557041936, "flos": 13735652734080.0, "grad_norm": 23.69721076733346, "language_loss": 0.80117583, "learning_rate": 3.616796927310559e-06, "loss": 0.82049865, "num_input_tokens_seen": 80333905, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.32128906, "step": 3731, "time_per_iteration": 2.6223978996276855 }, { "auxiliary_loss_clip": 0.01629544, "auxiliary_loss_mlp": 0.00284443, "balance_loss_clip": 1.29849982, "balance_loss_mlp": 0.25284061, "epoch": 0.22437997895686157, "flos": 19530700531200.0, "grad_norm": 128.58797139745107, "language_loss": 0.83693612, "learning_rate": 3.6165676456289195e-06, "loss": 0.856076, "num_input_tokens_seen": 80352165, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.31555176, "step": 3732, "time_per_iteration": 4.01387357711792 }, { "auxiliary_loss_clip": 0.01622681, "auxiliary_loss_mlp": 0.00261306, "balance_loss_clip": 1.28904772, "balance_loss_mlp": 0.23097873, "epoch": 0.22444010220952954, "flos": 23696230907520.0, "grad_norm": 1.6243654256547913, "language_loss": 0.94479221, "learning_rate": 3.616338302646873e-06, "loss": 0.96363205, "num_input_tokens_seen": 80371305, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.3034668, "step": 3733, "time_per_iteration": 2.6539969444274902 }, { "auxiliary_loss_clip": 0.01596926, "auxiliary_loss_mlp": 0.00281174, "balance_loss_clip": 1.26911247, "balance_loss_mlp": 0.25057322, "epoch": 0.2245002254621975, "flos": 22382905933440.0, "grad_norm": 6.296600100130652, "language_loss": 0.90009594, "learning_rate": 3.6161088983731166e-06, "loss": 0.91887701, "num_input_tokens_seen": 80391020, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.30615234, "step": 3734, "time_per_iteration": 4.120077133178711 }, { "auxiliary_loss_clip": 0.01586135, "auxiliary_loss_mlp": 0.00239123, "balance_loss_clip": 1.25241518, "balance_loss_mlp": 0.21006027, "epoch": 0.22456034871486547, "flos": 26942303917440.0, "grad_norm": 14.868012265174627, "language_loss": 0.82364428, "learning_rate": 3.6158794328163482e-06, "loss": 0.84189683, "num_input_tokens_seen": 80411365, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.29077148, "step": 3735, "time_per_iteration": 2.6559417247772217 }, { "auxiliary_loss_clip": 0.01574622, "auxiliary_loss_mlp": 0.00240886, "balance_loss_clip": 1.23917282, "balance_loss_mlp": 0.2146365, "epoch": 0.22462047196753343, "flos": 28983538005120.0, "grad_norm": 4.090248746192041, "language_loss": 0.90533888, "learning_rate": 3.6156499059852702e-06, "loss": 0.92349398, "num_input_tokens_seen": 80431075, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.26220703, "step": 3736, "time_per_iteration": 2.703596353530884 }, { "auxiliary_loss_clip": 0.01589858, "auxiliary_loss_mlp": 0.0028323, "balance_loss_clip": 1.24937797, "balance_loss_mlp": 0.25446469, "epoch": 0.22468059522020142, "flos": 20011329440640.0, "grad_norm": 15.264752636765813, "language_loss": 0.93908215, "learning_rate": 3.615420317888586e-06, "loss": 0.95781296, "num_input_tokens_seen": 80449240, "router_z_loss_clip": 3.40625, "router_z_loss_mlp": 0.28747559, "step": 3737, "time_per_iteration": 4.047317743301392 }, { "auxiliary_loss_clip": 0.01580588, "auxiliary_loss_mlp": 0.00300648, "balance_loss_clip": 1.24001813, "balance_loss_mlp": 0.26821095, "epoch": 0.2247407184728694, "flos": 29314239546240.0, "grad_norm": 3.1243568449216528, "language_loss": 0.85876656, "learning_rate": 3.6151906685350006e-06, "loss": 0.87757885, "num_input_tokens_seen": 80467900, "router_z_loss_clip": 3.40625, "router_z_loss_mlp": 0.32446289, "step": 3738, "time_per_iteration": 2.658029079437256 }, { "auxiliary_loss_clip": 0.01575859, "auxiliary_loss_mlp": 0.00268173, "balance_loss_clip": 1.23830271, "balance_loss_mlp": 0.2392171, "epoch": 0.22480084172553735, "flos": 22310366417280.0, "grad_norm": 13.242282304508533, "language_loss": 0.84311473, "learning_rate": 3.614960957933224e-06, "loss": 0.86155498, "num_input_tokens_seen": 80487100, "router_z_loss_clip": 3.37304688, "router_z_loss_mlp": 0.28955078, "step": 3739, "time_per_iteration": 2.6988308429718018 }, { "auxiliary_loss_clip": 0.01565207, "auxiliary_loss_mlp": 0.00265972, "balance_loss_clip": 1.22739661, "balance_loss_mlp": 0.23357069, "epoch": 0.22486096497820532, "flos": 25591272641280.0, "grad_norm": 18.970612564592223, "language_loss": 0.81274772, "learning_rate": 3.6147311860919655e-06, "loss": 0.83105946, "num_input_tokens_seen": 80508625, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.32446289, "step": 3740, "time_per_iteration": 2.6940972805023193 }, { "auxiliary_loss_clip": 0.01563965, "auxiliary_loss_mlp": 0.00247245, "balance_loss_clip": 1.2300539, "balance_loss_mlp": 0.22020839, "epoch": 0.22492108823087328, "flos": 17639824775040.0, "grad_norm": 6.6661159020940985, "language_loss": 0.83149922, "learning_rate": 3.614501353019939e-06, "loss": 0.84961128, "num_input_tokens_seen": 80527345, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.27050781, "step": 3741, "time_per_iteration": 2.5805277824401855 }, { "auxiliary_loss_clip": 0.01547146, "auxiliary_loss_mlp": 0.00263119, "balance_loss_clip": 1.20943642, "balance_loss_mlp": 0.2343777, "epoch": 0.22498121148354125, "flos": 16034653797120.0, "grad_norm": 6.624517730269669, "language_loss": 0.96296012, "learning_rate": 3.6142714587258592e-06, "loss": 0.98106277, "num_input_tokens_seen": 80545545, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.28771973, "step": 3742, "time_per_iteration": 3.9571785926818848 }, { "auxiliary_loss_clip": 0.01542076, "auxiliary_loss_mlp": 0.00261836, "balance_loss_clip": 1.20222473, "balance_loss_mlp": 0.23543085, "epoch": 0.22504133473620924, "flos": 24023772051840.0, "grad_norm": 32.35585523370647, "language_loss": 0.8777771, "learning_rate": 3.614041503218444e-06, "loss": 0.89581627, "num_input_tokens_seen": 80565040, "router_z_loss_clip": 3.3984375, "router_z_loss_mlp": 0.26403809, "step": 3743, "time_per_iteration": 2.6378064155578613 }, { "auxiliary_loss_clip": 0.01578814, "auxiliary_loss_mlp": 0.00282835, "balance_loss_clip": 1.22328711, "balance_loss_mlp": 0.25262767, "epoch": 0.2251014579888772, "flos": 16763963541120.0, "grad_norm": 4.199372369551484, "language_loss": 0.7343837, "learning_rate": 3.6138114865064134e-06, "loss": 0.7530002, "num_input_tokens_seen": 80582815, "router_z_loss_clip": 3.5546875, "router_z_loss_mlp": 0.30163574, "step": 3744, "time_per_iteration": 2.5655248165130615 }, { "auxiliary_loss_clip": 0.01542687, "auxiliary_loss_mlp": 0.0026768, "balance_loss_clip": 1.19179702, "balance_loss_mlp": 0.24072722, "epoch": 0.22516158124154517, "flos": 13991013498240.0, "grad_norm": 93.14452409530932, "language_loss": 0.8547467, "learning_rate": 3.613581408598489e-06, "loss": 0.87285042, "num_input_tokens_seen": 80600865, "router_z_loss_clip": 3.51367188, "router_z_loss_mlp": 0.26965332, "step": 3745, "time_per_iteration": 2.590364694595337 }, { "auxiliary_loss_clip": 0.01555736, "auxiliary_loss_mlp": 0.00259223, "balance_loss_clip": 1.19764459, "balance_loss_mlp": 0.23321182, "epoch": 0.22522170449421314, "flos": 14390016750720.0, "grad_norm": 3.6015674738140118, "language_loss": 0.86922884, "learning_rate": 3.6133512695033965e-06, "loss": 0.88737845, "num_input_tokens_seen": 80617455, "router_z_loss_clip": 3.58007812, "router_z_loss_mlp": 0.26000977, "step": 3746, "time_per_iteration": 2.5995683670043945 }, { "auxiliary_loss_clip": 0.01547277, "auxiliary_loss_mlp": 0.00290854, "balance_loss_clip": 1.18504667, "balance_loss_mlp": 0.26328075, "epoch": 0.2252818277468811, "flos": 23805542972160.0, "grad_norm": 3.2540980932069905, "language_loss": 0.94966918, "learning_rate": 3.613121069229862e-06, "loss": 0.96805042, "num_input_tokens_seen": 80635125, "router_z_loss_clip": 3.62304688, "router_z_loss_mlp": 0.27575684, "step": 3747, "time_per_iteration": 2.6372578144073486 }, { "auxiliary_loss_clip": 0.01546306, "auxiliary_loss_mlp": 0.00268358, "balance_loss_clip": 1.18590236, "balance_loss_mlp": 0.24060571, "epoch": 0.22534195099954907, "flos": 24718033100160.0, "grad_norm": 2.114602440126379, "language_loss": 0.83550823, "learning_rate": 3.6128908077866145e-06, "loss": 0.85365486, "num_input_tokens_seen": 80656370, "router_z_loss_clip": 3.60351562, "router_z_loss_mlp": 0.27770996, "step": 3748, "time_per_iteration": 2.652848958969116 }, { "auxiliary_loss_clip": 0.01518413, "auxiliary_loss_mlp": 0.00289816, "balance_loss_clip": 1.16724515, "balance_loss_mlp": 0.26287472, "epoch": 0.22540207425221703, "flos": 21032341534080.0, "grad_norm": 13.006428734948704, "language_loss": 0.8602258, "learning_rate": 3.6126604851823864e-06, "loss": 0.87830806, "num_input_tokens_seen": 80676495, "router_z_loss_clip": 3.51367188, "router_z_loss_mlp": 0.26940918, "step": 3749, "time_per_iteration": 2.6743314266204834 }, { "auxiliary_loss_clip": 0.01494517, "auxiliary_loss_mlp": 0.00263137, "balance_loss_clip": 1.15013492, "balance_loss_mlp": 0.23548055, "epoch": 0.22546219750488503, "flos": 19390362094080.0, "grad_norm": 2.998229513291716, "language_loss": 0.87159061, "learning_rate": 3.6124301014259108e-06, "loss": 0.88916719, "num_input_tokens_seen": 80694755, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.2767334, "step": 3750, "time_per_iteration": 2.7304012775421143 }, { "auxiliary_loss_clip": 0.01501942, "auxiliary_loss_mlp": 0.00268673, "balance_loss_clip": 1.15010595, "balance_loss_mlp": 0.24101678, "epoch": 0.225522320757553, "flos": 25192628524800.0, "grad_norm": 10.307570585188342, "language_loss": 0.90418935, "learning_rate": 3.6121996565259244e-06, "loss": 0.9218955, "num_input_tokens_seen": 80713670, "router_z_loss_clip": 3.52148438, "router_z_loss_mlp": 0.2767334, "step": 3751, "time_per_iteration": 2.7133405208587646 }, { "auxiliary_loss_clip": 0.01489346, "auxiliary_loss_mlp": 0.00235388, "balance_loss_clip": 1.14491463, "balance_loss_mlp": 0.20911434, "epoch": 0.22558244401022096, "flos": 17163110448000.0, "grad_norm": 3.322885603348074, "language_loss": 0.9160583, "learning_rate": 3.611969150491165e-06, "loss": 0.93330568, "num_input_tokens_seen": 80731450, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.2623291, "step": 3752, "time_per_iteration": 2.666675567626953 }, { "auxiliary_loss_clip": 0.01479082, "auxiliary_loss_mlp": 0.00254369, "balance_loss_clip": 1.13345361, "balance_loss_mlp": 0.22838143, "epoch": 0.22564256726288892, "flos": 15231008856960.0, "grad_norm": 66.5203879579358, "language_loss": 0.84538692, "learning_rate": 3.611738583330375e-06, "loss": 0.86272144, "num_input_tokens_seen": 80748415, "router_z_loss_clip": 3.45703125, "router_z_loss_mlp": 0.25976562, "step": 3753, "time_per_iteration": 2.5854814052581787 }, { "auxiliary_loss_clip": 0.01472189, "auxiliary_loss_mlp": 0.0024686, "balance_loss_clip": 1.13263881, "balance_loss_mlp": 0.22046663, "epoch": 0.2257026905155569, "flos": 34568652764160.0, "grad_norm": 2.748066550416106, "language_loss": 0.8437615, "learning_rate": 3.611507955052295e-06, "loss": 0.86095202, "num_input_tokens_seen": 80770835, "router_z_loss_clip": 3.39648438, "router_z_loss_mlp": 0.26403809, "step": 3754, "time_per_iteration": 2.7454335689544678 }, { "auxiliary_loss_clip": 0.01494609, "auxiliary_loss_mlp": 0.00258218, "balance_loss_clip": 1.14446044, "balance_loss_mlp": 0.23300573, "epoch": 0.22576281376822485, "flos": 19938430788480.0, "grad_norm": 2.532103702961982, "language_loss": 0.77478129, "learning_rate": 3.6112772656656727e-06, "loss": 0.79230952, "num_input_tokens_seen": 80787840, "router_z_loss_clip": 3.5, "router_z_loss_mlp": 0.25195312, "step": 3755, "time_per_iteration": 2.611109733581543 }, { "auxiliary_loss_clip": 0.0146902, "auxiliary_loss_mlp": 0.00254977, "balance_loss_clip": 1.11967146, "balance_loss_mlp": 0.2274162, "epoch": 0.22582293702089282, "flos": 24602005192320.0, "grad_norm": 2.7168041333282806, "language_loss": 0.8662923, "learning_rate": 3.6110465151792547e-06, "loss": 0.88353229, "num_input_tokens_seen": 80806335, "router_z_loss_clip": 3.49609375, "router_z_loss_mlp": 0.2755127, "step": 3756, "time_per_iteration": 2.6402969360351562 }, { "auxiliary_loss_clip": 0.01476135, "auxiliary_loss_mlp": 0.00263765, "balance_loss_clip": 1.13475609, "balance_loss_mlp": 0.23526214, "epoch": 0.2258830602735608, "flos": 23035438356480.0, "grad_norm": 3.452118910324784, "language_loss": 0.89350921, "learning_rate": 3.6108157036017916e-06, "loss": 0.91090822, "num_input_tokens_seen": 80825355, "router_z_loss_clip": 3.4140625, "router_z_loss_mlp": 0.28479004, "step": 3757, "time_per_iteration": 2.6229939460754395 }, { "auxiliary_loss_clip": 0.01484057, "auxiliary_loss_mlp": 0.00282326, "balance_loss_clip": 1.13879275, "balance_loss_mlp": 0.25311926, "epoch": 0.22594318352622877, "flos": 22158427887360.0, "grad_norm": 33.213973999448974, "language_loss": 0.82079387, "learning_rate": 3.6105848309420358e-06, "loss": 0.8384577, "num_input_tokens_seen": 80842570, "router_z_loss_clip": 3.45703125, "router_z_loss_mlp": 0.29211426, "step": 3758, "time_per_iteration": 2.6361501216888428 }, { "auxiliary_loss_clip": 0.01478545, "auxiliary_loss_mlp": 0.00255803, "balance_loss_clip": 1.12851083, "balance_loss_mlp": 0.22830112, "epoch": 0.22600330677889674, "flos": 20594303176320.0, "grad_norm": 3.7980699276349865, "language_loss": 0.84744501, "learning_rate": 3.6103538972087412e-06, "loss": 0.86478847, "num_input_tokens_seen": 80858745, "router_z_loss_clip": 3.5, "router_z_loss_mlp": 0.27526855, "step": 3759, "time_per_iteration": 2.5729997158050537 }, { "auxiliary_loss_clip": 0.01467708, "auxiliary_loss_mlp": 0.00248822, "balance_loss_clip": 1.119766, "balance_loss_mlp": 0.22210692, "epoch": 0.2260634300315647, "flos": 35659798162560.0, "grad_norm": 2.2022061986511683, "language_loss": 0.84686321, "learning_rate": 3.6101229024106655e-06, "loss": 0.86402851, "num_input_tokens_seen": 80880085, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.26708984, "step": 3760, "time_per_iteration": 2.702336311340332 }, { "auxiliary_loss_clip": 0.01604861, "auxiliary_loss_mlp": 0.00126645, "balance_loss_clip": 1.34817743, "balance_loss_mlp": 0.11734698, "epoch": 0.22612355328423267, "flos": 72090455126400.0, "grad_norm": 0.8789520094836868, "language_loss": 0.60041964, "learning_rate": 3.609891846556569e-06, "loss": 0.61773479, "num_input_tokens_seen": 80937660, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.09277344, "step": 3761, "time_per_iteration": 3.076446771621704 }, { "auxiliary_loss_clip": 0.01469807, "auxiliary_loss_mlp": 0.0024709, "balance_loss_clip": 1.12058938, "balance_loss_mlp": 0.21968368, "epoch": 0.22618367653690064, "flos": 22783776693120.0, "grad_norm": 1105.949890832068, "language_loss": 0.86608762, "learning_rate": 3.609660729655211e-06, "loss": 0.88325655, "num_input_tokens_seen": 80956265, "router_z_loss_clip": 3.49023438, "router_z_loss_mlp": 0.27416992, "step": 3762, "time_per_iteration": 2.6155059337615967 }, { "auxiliary_loss_clip": 0.01487241, "auxiliary_loss_mlp": 0.00238571, "balance_loss_clip": 1.13581181, "balance_loss_mlp": 0.21089014, "epoch": 0.22624379978956863, "flos": 20448254476800.0, "grad_norm": 39.59204086053606, "language_loss": 0.8986603, "learning_rate": 3.6094295517153573e-06, "loss": 0.91591841, "num_input_tokens_seen": 80975185, "router_z_loss_clip": 3.515625, "router_z_loss_mlp": 0.2767334, "step": 3763, "time_per_iteration": 2.5660717487335205 }, { "auxiliary_loss_clip": 0.01491278, "auxiliary_loss_mlp": 0.00251748, "balance_loss_clip": 1.13375616, "balance_loss_mlp": 0.22434179, "epoch": 0.2263039230422366, "flos": 17494314779520.0, "grad_norm": 2.7505557506183895, "language_loss": 0.98194957, "learning_rate": 3.6091983127457743e-06, "loss": 0.99937987, "num_input_tokens_seen": 80992830, "router_z_loss_clip": 3.57617188, "router_z_loss_mlp": 0.27429199, "step": 3764, "time_per_iteration": 2.574394702911377 }, { "auxiliary_loss_clip": 0.01472884, "auxiliary_loss_mlp": 0.00198136, "balance_loss_clip": 1.12239766, "balance_loss_mlp": 0.1741989, "epoch": 0.22636404629490456, "flos": 28329748606080.0, "grad_norm": 7.40099722990444, "language_loss": 0.82427269, "learning_rate": 3.6089670127552293e-06, "loss": 0.84098285, "num_input_tokens_seen": 81013675, "router_z_loss_clip": 3.50390625, "router_z_loss_mlp": 0.23913574, "step": 3765, "time_per_iteration": 2.678882122039795 }, { "auxiliary_loss_clip": 0.01498731, "auxiliary_loss_mlp": 0.00196275, "balance_loss_clip": 1.14206707, "balance_loss_mlp": 0.17203987, "epoch": 0.22642416954757252, "flos": 17489143221120.0, "grad_norm": 85.58254028116588, "language_loss": 0.99533308, "learning_rate": 3.608735651752494e-06, "loss": 1.01228309, "num_input_tokens_seen": 81030345, "router_z_loss_clip": 3.56445312, "router_z_loss_mlp": 0.2421875, "step": 3766, "time_per_iteration": 2.592388153076172 }, { "auxiliary_loss_clip": 0.0148388, "auxiliary_loss_mlp": 0.00190062, "balance_loss_clip": 1.13614202, "balance_loss_mlp": 0.16313274, "epoch": 0.2264842928002405, "flos": 24384530298240.0, "grad_norm": 83.31321765837072, "language_loss": 0.8187238, "learning_rate": 3.6085042297463417e-06, "loss": 0.83546317, "num_input_tokens_seen": 81051000, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.26928711, "step": 3767, "time_per_iteration": 2.6785457134246826 }, { "auxiliary_loss_clip": 0.01480999, "auxiliary_loss_mlp": 0.00221725, "balance_loss_clip": 1.13329673, "balance_loss_mlp": 0.19591665, "epoch": 0.22654441605290845, "flos": 19830519354240.0, "grad_norm": 15853.12459299125, "language_loss": 0.79736352, "learning_rate": 3.6082727467455477e-06, "loss": 0.81439078, "num_input_tokens_seen": 81071205, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.25793457, "step": 3768, "time_per_iteration": 2.6148195266723633 }, { "auxiliary_loss_clip": 0.01494432, "auxiliary_loss_mlp": 0.00205481, "balance_loss_clip": 1.14576077, "balance_loss_mlp": 0.18023278, "epoch": 0.22660453930557642, "flos": 27454569730560.0, "grad_norm": 7.01395108341076, "language_loss": 0.8604964, "learning_rate": 3.6080412027588905e-06, "loss": 0.87749553, "num_input_tokens_seen": 81091880, "router_z_loss_clip": 3.48242188, "router_z_loss_mlp": 0.25244141, "step": 3769, "time_per_iteration": 2.6502463817596436 }, { "auxiliary_loss_clip": 0.01483367, "auxiliary_loss_mlp": 0.00201625, "balance_loss_clip": 1.1342144, "balance_loss_mlp": 0.17488611, "epoch": 0.2266646625582444, "flos": 23988148738560.0, "grad_norm": 5.193023306427383, "language_loss": 0.78396207, "learning_rate": 3.6078095977951488e-06, "loss": 0.80081201, "num_input_tokens_seen": 81113290, "router_z_loss_clip": 3.48828125, "router_z_loss_mlp": 0.26745605, "step": 3770, "time_per_iteration": 2.640263557434082 }, { "auxiliary_loss_clip": 0.01489396, "auxiliary_loss_mlp": 0.001862, "balance_loss_clip": 1.13894057, "balance_loss_mlp": 0.16139276, "epoch": 0.22672478581091238, "flos": 26028054023040.0, "grad_norm": 1309.76988638453, "language_loss": 0.87205958, "learning_rate": 3.6075779318631067e-06, "loss": 0.88881552, "num_input_tokens_seen": 81133535, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.24829102, "step": 3771, "time_per_iteration": 2.6380138397216797 }, { "auxiliary_loss_clip": 0.01460856, "auxiliary_loss_mlp": 0.00190784, "balance_loss_clip": 1.12229681, "balance_loss_mlp": 0.16679901, "epoch": 0.22678490906358034, "flos": 23841812730240.0, "grad_norm": 13.853751436531944, "language_loss": 0.84655017, "learning_rate": 3.6073462049715486e-06, "loss": 0.86306655, "num_input_tokens_seen": 81154650, "router_z_loss_clip": 3.38476562, "router_z_loss_mlp": 0.23974609, "step": 3772, "time_per_iteration": 2.6567471027374268 }, { "auxiliary_loss_clip": 0.01540068, "auxiliary_loss_mlp": 0.00055507, "balance_loss_clip": 1.29785252, "balance_loss_mlp": 0.04797298, "epoch": 0.2268450323162483, "flos": 65048088574080.0, "grad_norm": 0.6519527005091391, "language_loss": 0.53768098, "learning_rate": 3.607114417129261e-06, "loss": 0.55363679, "num_input_tokens_seen": 81221240, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.07519531, "step": 3773, "time_per_iteration": 3.1942477226257324 }, { "auxiliary_loss_clip": 0.01463465, "auxiliary_loss_mlp": 0.00197421, "balance_loss_clip": 1.12790728, "balance_loss_mlp": 0.17342424, "epoch": 0.22690515556891627, "flos": 22526081544960.0, "grad_norm": 15.714425528651002, "language_loss": 0.77801788, "learning_rate": 3.6068825683450334e-06, "loss": 0.79462677, "num_input_tokens_seen": 81241520, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.23999023, "step": 3774, "time_per_iteration": 4.2597901821136475 }, { "auxiliary_loss_clip": 0.01465873, "auxiliary_loss_mlp": 0.0019841, "balance_loss_clip": 1.1298455, "balance_loss_mlp": 0.17224374, "epoch": 0.22696527882158424, "flos": 18223444955520.0, "grad_norm": 2.8794006614258225, "language_loss": 0.82740462, "learning_rate": 3.606650658627658e-06, "loss": 0.84404743, "num_input_tokens_seen": 81256825, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.26159668, "step": 3775, "time_per_iteration": 2.589857816696167 }, { "auxiliary_loss_clip": 0.01441417, "auxiliary_loss_mlp": 0.00174584, "balance_loss_clip": 1.11505008, "balance_loss_mlp": 0.15059876, "epoch": 0.22702540207425223, "flos": 17019252478080.0, "grad_norm": 3.5694158333676933, "language_loss": 0.93185186, "learning_rate": 3.606418687985928e-06, "loss": 0.94801188, "num_input_tokens_seen": 81275695, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.23962402, "step": 3776, "time_per_iteration": 4.018434286117554 }, { "auxiliary_loss_clip": 0.01447866, "auxiliary_loss_mlp": 0.00213538, "balance_loss_clip": 1.11782956, "balance_loss_mlp": 0.18926661, "epoch": 0.2270855253269202, "flos": 21325731822720.0, "grad_norm": 2.4656605359535906, "language_loss": 0.90881091, "learning_rate": 3.606186656428641e-06, "loss": 0.92542493, "num_input_tokens_seen": 81294920, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.24267578, "step": 3777, "time_per_iteration": 2.6232762336730957 }, { "auxiliary_loss_clip": 0.01462126, "auxiliary_loss_mlp": 0.00220729, "balance_loss_clip": 1.13179731, "balance_loss_mlp": 0.1959458, "epoch": 0.22714564857958816, "flos": 23550469516800.0, "grad_norm": 5.715832934674001, "language_loss": 0.84445029, "learning_rate": 3.6059545639645955e-06, "loss": 0.86127883, "num_input_tokens_seen": 81314275, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.2479248, "step": 3778, "time_per_iteration": 2.651299238204956 }, { "auxiliary_loss_clip": 0.01447511, "auxiliary_loss_mlp": 0.00231076, "balance_loss_clip": 1.11835694, "balance_loss_mlp": 0.20443246, "epoch": 0.22720577183225613, "flos": 25989880844160.0, "grad_norm": 4.409052486835874, "language_loss": 0.76447302, "learning_rate": 3.605722410602591e-06, "loss": 0.78125894, "num_input_tokens_seen": 81333890, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.26647949, "step": 3779, "time_per_iteration": 2.7251527309417725 }, { "auxiliary_loss_clip": 0.01441671, "auxiliary_loss_mlp": 0.00205789, "balance_loss_clip": 1.12188244, "balance_loss_mlp": 0.17933664, "epoch": 0.2272658950849241, "flos": 20814076540800.0, "grad_norm": 13.372673067637644, "language_loss": 0.78403294, "learning_rate": 3.6054901963514323e-06, "loss": 0.80050755, "num_input_tokens_seen": 81353640, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.26452637, "step": 3780, "time_per_iteration": 4.226995944976807 }, { "auxiliary_loss_clip": 0.01457722, "auxiliary_loss_mlp": 0.00241159, "balance_loss_clip": 1.13814187, "balance_loss_mlp": 0.2146354, "epoch": 0.22732601833759206, "flos": 23909324342400.0, "grad_norm": 34.868951577999894, "language_loss": 0.96005428, "learning_rate": 3.6052579212199246e-06, "loss": 0.97704315, "num_input_tokens_seen": 81371595, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.26538086, "step": 3781, "time_per_iteration": 2.66371488571167 }, { "auxiliary_loss_clip": 0.01444655, "auxiliary_loss_mlp": 0.00240751, "balance_loss_clip": 1.11860919, "balance_loss_mlp": 0.21237871, "epoch": 0.22738614159026002, "flos": 15924407978880.0, "grad_norm": 26.395089601407683, "language_loss": 0.86673003, "learning_rate": 3.6050255852168753e-06, "loss": 0.88358414, "num_input_tokens_seen": 81388435, "router_z_loss_clip": 3.26367188, "router_z_loss_mlp": 0.28369141, "step": 3782, "time_per_iteration": 2.617506265640259 }, { "auxiliary_loss_clip": 0.01440137, "auxiliary_loss_mlp": 0.00239254, "balance_loss_clip": 1.11913311, "balance_loss_mlp": 0.21442321, "epoch": 0.22744626484292801, "flos": 24205515891840.0, "grad_norm": 4.946512366229814, "language_loss": 0.88735139, "learning_rate": 3.604793188351095e-06, "loss": 0.9041453, "num_input_tokens_seen": 81410195, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.24841309, "step": 3783, "time_per_iteration": 2.6770713329315186 }, { "auxiliary_loss_clip": 0.0145269, "auxiliary_loss_mlp": 0.00260614, "balance_loss_clip": 1.12478542, "balance_loss_mlp": 0.23455521, "epoch": 0.22750638809559598, "flos": 24791614110720.0, "grad_norm": 712.6147564479653, "language_loss": 0.8250649, "learning_rate": 3.6045607306313964e-06, "loss": 0.8421979, "num_input_tokens_seen": 81430060, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.26062012, "step": 3784, "time_per_iteration": 4.01696515083313 }, { "auxiliary_loss_clip": 0.01460735, "auxiliary_loss_mlp": 0.00235702, "balance_loss_clip": 1.13296795, "balance_loss_mlp": 0.20705557, "epoch": 0.22756651134826394, "flos": 22236498097920.0, "grad_norm": 9.835490130864995, "language_loss": 0.78016859, "learning_rate": 3.604328212066594e-06, "loss": 0.79713297, "num_input_tokens_seen": 81447375, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.28662109, "step": 3785, "time_per_iteration": 2.588087558746338 }, { "auxiliary_loss_clip": 0.01578094, "auxiliary_loss_mlp": 0.00112157, "balance_loss_clip": 1.33320618, "balance_loss_mlp": 0.1003793, "epoch": 0.2276266346009319, "flos": 62707466626560.0, "grad_norm": 0.8739821698525461, "language_loss": 0.62171841, "learning_rate": 3.6040956326655047e-06, "loss": 0.63862097, "num_input_tokens_seen": 81505235, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.11767578, "step": 3786, "time_per_iteration": 3.0779669284820557 }, { "auxiliary_loss_clip": 0.01481345, "auxiliary_loss_mlp": 0.00241773, "balance_loss_clip": 1.14729977, "balance_loss_mlp": 0.21570194, "epoch": 0.22768675785359987, "flos": 18613936684800.0, "grad_norm": 4.2170069054870485, "language_loss": 0.96787018, "learning_rate": 3.6038629924369486e-06, "loss": 0.98510134, "num_input_tokens_seen": 81518685, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.26062012, "step": 3787, "time_per_iteration": 2.566694736480713 }, { "auxiliary_loss_clip": 0.01464301, "auxiliary_loss_mlp": 0.00259188, "balance_loss_clip": 1.13692188, "balance_loss_mlp": 0.23259225, "epoch": 0.22774688110626784, "flos": 26870195364480.0, "grad_norm": 2.3676336910294573, "language_loss": 0.78211159, "learning_rate": 3.6036302913897474e-06, "loss": 0.79934657, "num_input_tokens_seen": 81538940, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.26611328, "step": 3788, "time_per_iteration": 2.6707265377044678 }, { "auxiliary_loss_clip": 0.01486565, "auxiliary_loss_mlp": 0.00254891, "balance_loss_clip": 1.15287161, "balance_loss_mlp": 0.22747239, "epoch": 0.2278070043589358, "flos": 15553593924480.0, "grad_norm": 2.3874569399524868, "language_loss": 0.76778829, "learning_rate": 3.6033975295327243e-06, "loss": 0.78520286, "num_input_tokens_seen": 81555525, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.27441406, "step": 3789, "time_per_iteration": 2.5700623989105225 }, { "auxiliary_loss_clip": 0.01496191, "auxiliary_loss_mlp": 0.0025867, "balance_loss_clip": 1.16265082, "balance_loss_mlp": 0.23174061, "epoch": 0.2278671276116038, "flos": 22416805393920.0, "grad_norm": 26.160126235566878, "language_loss": 0.81849235, "learning_rate": 3.6031647068747065e-06, "loss": 0.83604097, "num_input_tokens_seen": 81576305, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.26940918, "step": 3790, "time_per_iteration": 2.6664602756500244 }, { "auxiliary_loss_clip": 0.01496293, "auxiliary_loss_mlp": 0.00236631, "balance_loss_clip": 1.16245019, "balance_loss_mlp": 0.20852157, "epoch": 0.22792725086427176, "flos": 20631363033600.0, "grad_norm": 6.61602102945212, "language_loss": 0.99717367, "learning_rate": 3.602931823424522e-06, "loss": 1.01450288, "num_input_tokens_seen": 81594115, "router_z_loss_clip": 3.34179688, "router_z_loss_mlp": 0.28100586, "step": 3791, "time_per_iteration": 2.5914418697357178 }, { "auxiliary_loss_clip": 0.01509863, "auxiliary_loss_mlp": 0.00264067, "balance_loss_clip": 1.17673326, "balance_loss_mlp": 0.23431244, "epoch": 0.22798737411693973, "flos": 31428946903680.0, "grad_norm": 7.37214409417311, "language_loss": 0.90857595, "learning_rate": 3.6026988791910026e-06, "loss": 0.92631525, "num_input_tokens_seen": 81615355, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.29785156, "step": 3792, "time_per_iteration": 2.6922590732574463 }, { "auxiliary_loss_clip": 0.01549448, "auxiliary_loss_mlp": 0.00089354, "balance_loss_clip": 1.32915473, "balance_loss_mlp": 0.07609776, "epoch": 0.2280474973696077, "flos": 52396685827200.0, "grad_norm": 1.1340433642462766, "language_loss": 0.65839267, "learning_rate": 3.602465874182981e-06, "loss": 0.67478073, "num_input_tokens_seen": 81662075, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.1328125, "step": 3793, "time_per_iteration": 2.9107420444488525 }, { "auxiliary_loss_clip": 0.01537376, "auxiliary_loss_mlp": 0.00317057, "balance_loss_clip": 1.1983161, "balance_loss_mlp": 0.28417861, "epoch": 0.22810762062227566, "flos": 26396066816640.0, "grad_norm": 324.3256417574254, "language_loss": 0.86184204, "learning_rate": 3.602232808409293e-06, "loss": 0.88038635, "num_input_tokens_seen": 81681625, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.32861328, "step": 3794, "time_per_iteration": 2.684657335281372 }, { "auxiliary_loss_clip": 0.01527243, "auxiliary_loss_mlp": 0.00300376, "balance_loss_clip": 1.18967474, "balance_loss_mlp": 0.27057418, "epoch": 0.22816774387494362, "flos": 25630271832960.0, "grad_norm": 62.848019560234285, "language_loss": 0.87247902, "learning_rate": 3.6019996818787755e-06, "loss": 0.89075524, "num_input_tokens_seen": 81701170, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.2980957, "step": 3795, "time_per_iteration": 2.6837921142578125 }, { "auxiliary_loss_clip": 0.01513221, "auxiliary_loss_mlp": 0.00285835, "balance_loss_clip": 1.18411303, "balance_loss_mlp": 0.25900108, "epoch": 0.22822786712761162, "flos": 22451602694400.0, "grad_norm": 310.3910527622936, "language_loss": 0.84874344, "learning_rate": 3.6017664946002704e-06, "loss": 0.86673403, "num_input_tokens_seen": 81721265, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.26843262, "step": 3796, "time_per_iteration": 2.616368293762207 }, { "auxiliary_loss_clip": 0.0154804, "auxiliary_loss_mlp": 0.00298538, "balance_loss_clip": 1.21022463, "balance_loss_mlp": 0.26828295, "epoch": 0.22828799038027958, "flos": 12202554395520.0, "grad_norm": 3.2248013044324124, "language_loss": 1.04688621, "learning_rate": 3.6015332465826188e-06, "loss": 1.06535196, "num_input_tokens_seen": 81736565, "router_z_loss_clip": 3.3828125, "router_z_loss_mlp": 0.30200195, "step": 3797, "time_per_iteration": 2.6025280952453613 }, { "auxiliary_loss_clip": 0.01536033, "auxiliary_loss_mlp": 0.00283494, "balance_loss_clip": 1.20216739, "balance_loss_mlp": 0.25440741, "epoch": 0.22834811363294755, "flos": 22085708803200.0, "grad_norm": 176.68408993618127, "language_loss": 0.87116992, "learning_rate": 3.601299937834666e-06, "loss": 0.88936526, "num_input_tokens_seen": 81756240, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.29125977, "step": 3798, "time_per_iteration": 2.644073009490967 }, { "auxiliary_loss_clip": 0.01534388, "auxiliary_loss_mlp": 0.00329668, "balance_loss_clip": 1.19908309, "balance_loss_mlp": 0.29767185, "epoch": 0.2284082368856155, "flos": 24860634094080.0, "grad_norm": 2.518291054392234, "language_loss": 0.88611174, "learning_rate": 3.6010665683652596e-06, "loss": 0.90475225, "num_input_tokens_seen": 81775720, "router_z_loss_clip": 3.3515625, "router_z_loss_mlp": 0.31982422, "step": 3799, "time_per_iteration": 2.6403379440307617 }, { "auxiliary_loss_clip": 0.01546335, "auxiliary_loss_mlp": 0.00264631, "balance_loss_clip": 1.21076083, "balance_loss_mlp": 0.23726091, "epoch": 0.22846836013828348, "flos": 23292882109440.0, "grad_norm": 10.010092907979706, "language_loss": 0.82764274, "learning_rate": 3.6008331381832484e-06, "loss": 0.84575242, "num_input_tokens_seen": 81795830, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.27392578, "step": 3800, "time_per_iteration": 2.5893261432647705 }, { "auxiliary_loss_clip": 0.01532576, "auxiliary_loss_mlp": 0.00272501, "balance_loss_clip": 1.2019701, "balance_loss_mlp": 0.24360511, "epoch": 0.22852848339095144, "flos": 27416288810880.0, "grad_norm": 58.83597301498925, "language_loss": 0.73710966, "learning_rate": 3.600599647297484e-06, "loss": 0.75516045, "num_input_tokens_seen": 81815745, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.28918457, "step": 3801, "time_per_iteration": 2.677828550338745 }, { "auxiliary_loss_clip": 0.01587788, "auxiliary_loss_mlp": 0.00279791, "balance_loss_clip": 1.24426794, "balance_loss_mlp": 0.25160986, "epoch": 0.2285886066436194, "flos": 26321157002880.0, "grad_norm": 10.03252907367889, "language_loss": 0.88022298, "learning_rate": 3.60036609571682e-06, "loss": 0.89889872, "num_input_tokens_seen": 81835155, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.28173828, "step": 3802, "time_per_iteration": 2.610530376434326 }, { "auxiliary_loss_clip": 0.01622384, "auxiliary_loss_mlp": 0.00289162, "balance_loss_clip": 1.26447868, "balance_loss_mlp": 0.25673699, "epoch": 0.2286487298962874, "flos": 29716475022720.0, "grad_norm": 5.161547203267033, "language_loss": 0.86699682, "learning_rate": 3.600132483450114e-06, "loss": 0.88611233, "num_input_tokens_seen": 81855655, "router_z_loss_clip": 3.58007812, "router_z_loss_mlp": 0.32470703, "step": 3803, "time_per_iteration": 2.6577377319335938 }, { "auxiliary_loss_clip": 0.01635849, "auxiliary_loss_mlp": 0.00275, "balance_loss_clip": 1.26880074, "balance_loss_mlp": 0.24307598, "epoch": 0.22870885314895537, "flos": 21287199507840.0, "grad_norm": 5.5224944268367455, "language_loss": 0.92798424, "learning_rate": 3.5998988105062235e-06, "loss": 0.94709271, "num_input_tokens_seen": 81876385, "router_z_loss_clip": 3.66992188, "router_z_loss_mlp": 0.3190918, "step": 3804, "time_per_iteration": 2.6192097663879395 }, { "auxiliary_loss_clip": 0.01637202, "auxiliary_loss_mlp": 0.0029951, "balance_loss_clip": 1.26698458, "balance_loss_mlp": 0.2688489, "epoch": 0.22876897640162333, "flos": 14939450161920.0, "grad_norm": 72.61022764030776, "language_loss": 0.85961944, "learning_rate": 3.59966507689401e-06, "loss": 0.87898654, "num_input_tokens_seen": 81893225, "router_z_loss_clip": 3.70703125, "router_z_loss_mlp": 0.30712891, "step": 3805, "time_per_iteration": 2.619967222213745 }, { "auxiliary_loss_clip": 0.01682763, "auxiliary_loss_mlp": 0.00328826, "balance_loss_clip": 1.29840386, "balance_loss_mlp": 0.29616219, "epoch": 0.2288290996542913, "flos": 18113917409280.0, "grad_norm": 73.30421704403635, "language_loss": 0.88582981, "learning_rate": 3.5994312826223363e-06, "loss": 0.90594572, "num_input_tokens_seen": 81911350, "router_z_loss_clip": 3.84375, "router_z_loss_mlp": 0.32666016, "step": 3806, "time_per_iteration": 2.5871024131774902 }, { "auxiliary_loss_clip": 0.01708374, "auxiliary_loss_mlp": 0.00272325, "balance_loss_clip": 1.3144151, "balance_loss_mlp": 0.23999512, "epoch": 0.22888922290695926, "flos": 39855457071360.0, "grad_norm": 28.008932547167074, "language_loss": 0.79271215, "learning_rate": 3.5991974277000684e-06, "loss": 0.81251913, "num_input_tokens_seen": 81935420, "router_z_loss_clip": 3.9375, "router_z_loss_mlp": 0.32299805, "step": 3807, "time_per_iteration": 2.744049072265625 }, { "auxiliary_loss_clip": 0.0171314, "auxiliary_loss_mlp": 0.00299618, "balance_loss_clip": 1.31703544, "balance_loss_mlp": 0.26781267, "epoch": 0.22894934615962723, "flos": 23403774372480.0, "grad_norm": 2.3153668310295186, "language_loss": 0.745327, "learning_rate": 3.5989635121360733e-06, "loss": 0.76545465, "num_input_tokens_seen": 81953845, "router_z_loss_clip": 3.9609375, "router_z_loss_mlp": 0.31774902, "step": 3808, "time_per_iteration": 2.5916390419006348 }, { "auxiliary_loss_clip": 0.01726959, "auxiliary_loss_mlp": 0.00294458, "balance_loss_clip": 1.32122016, "balance_loss_mlp": 0.2615087, "epoch": 0.22900946941229522, "flos": 18843011671680.0, "grad_norm": 2.1288330982280472, "language_loss": 0.81988668, "learning_rate": 3.598729535939222e-06, "loss": 0.84010088, "num_input_tokens_seen": 81972100, "router_z_loss_clip": 4.05859375, "router_z_loss_mlp": 0.32958984, "step": 3809, "time_per_iteration": 2.592005491256714 }, { "auxiliary_loss_clip": 0.01710172, "auxiliary_loss_mlp": 0.00291151, "balance_loss_clip": 1.31287169, "balance_loss_mlp": 0.26080048, "epoch": 0.22906959266496318, "flos": 22929394429440.0, "grad_norm": 4.741205062903025, "language_loss": 0.86548162, "learning_rate": 3.5984954991183862e-06, "loss": 0.88549483, "num_input_tokens_seen": 81992760, "router_z_loss_clip": 3.97265625, "router_z_loss_mlp": 0.3034668, "step": 3810, "time_per_iteration": 2.6395115852355957 }, { "auxiliary_loss_clip": 0.01745461, "auxiliary_loss_mlp": 0.00288113, "balance_loss_clip": 1.34165359, "balance_loss_mlp": 0.25528285, "epoch": 0.22912971591763115, "flos": 19354523299200.0, "grad_norm": 4.150973826416358, "language_loss": 0.86122358, "learning_rate": 3.598261401682441e-06, "loss": 0.88155937, "num_input_tokens_seen": 82009080, "router_z_loss_clip": 4.0390625, "router_z_loss_mlp": 0.328125, "step": 3811, "time_per_iteration": 2.5900938510894775 }, { "auxiliary_loss_clip": 0.01726433, "auxiliary_loss_mlp": 0.00301015, "balance_loss_clip": 1.32037854, "balance_loss_mlp": 0.26980636, "epoch": 0.22918983917029911, "flos": 19933546538880.0, "grad_norm": 3.60620158171666, "language_loss": 0.90086806, "learning_rate": 3.5980272436402632e-06, "loss": 0.92114258, "num_input_tokens_seen": 82026705, "router_z_loss_clip": 4.05664062, "router_z_loss_mlp": 0.31201172, "step": 3812, "time_per_iteration": 2.5985283851623535 }, { "auxiliary_loss_clip": 0.01741628, "auxiliary_loss_mlp": 0.00326192, "balance_loss_clip": 1.33623874, "balance_loss_mlp": 0.29524538, "epoch": 0.22924996242296708, "flos": 16690885320960.0, "grad_norm": 3.1728373343400027, "language_loss": 0.94890583, "learning_rate": 3.5977930250007324e-06, "loss": 0.96958405, "num_input_tokens_seen": 82043245, "router_z_loss_clip": 4.0546875, "router_z_loss_mlp": 0.30932617, "step": 3813, "time_per_iteration": 2.5441997051239014 }, { "auxiliary_loss_clip": 0.01720725, "auxiliary_loss_mlp": 0.00334028, "balance_loss_clip": 1.3231467, "balance_loss_mlp": 0.30222341, "epoch": 0.22931008567563504, "flos": 33036164956800.0, "grad_norm": 31.602049187358634, "language_loss": 0.76296902, "learning_rate": 3.5975587457727298e-06, "loss": 0.78351653, "num_input_tokens_seen": 82066870, "router_z_loss_clip": 3.97460938, "router_z_loss_mlp": 0.31787109, "step": 3814, "time_per_iteration": 2.7498624324798584 }, { "auxiliary_loss_clip": 0.01717085, "auxiliary_loss_mlp": 0.00280211, "balance_loss_clip": 1.31670189, "balance_loss_mlp": 0.24735676, "epoch": 0.229370208928303, "flos": 23330696152320.0, "grad_norm": 7.992459309920149, "language_loss": 0.76270485, "learning_rate": 3.597324405965139e-06, "loss": 0.78267777, "num_input_tokens_seen": 82083180, "router_z_loss_clip": 4.00390625, "router_z_loss_mlp": 0.32836914, "step": 3815, "time_per_iteration": 2.59753680229187 }, { "auxiliary_loss_clip": 0.01722675, "auxiliary_loss_mlp": 0.00317691, "balance_loss_clip": 1.32510972, "balance_loss_mlp": 0.28312021, "epoch": 0.229430332180971, "flos": 28617213150720.0, "grad_norm": 9.298115920921022, "language_loss": 0.90096962, "learning_rate": 3.597090005586848e-06, "loss": 0.92137331, "num_input_tokens_seen": 82102950, "router_z_loss_clip": 3.97460938, "router_z_loss_mlp": 0.34570312, "step": 3816, "time_per_iteration": 4.144388437271118 }, { "auxiliary_loss_clip": 0.01715904, "auxiliary_loss_mlp": 0.00309111, "balance_loss_clip": 1.32122886, "balance_loss_mlp": 0.27699572, "epoch": 0.22949045543363897, "flos": 17238199829760.0, "grad_norm": 3.2765136115425486, "language_loss": 0.98105812, "learning_rate": 3.596855544646742e-06, "loss": 1.0013082, "num_input_tokens_seen": 82119510, "router_z_loss_clip": 3.9453125, "router_z_loss_mlp": 0.32104492, "step": 3817, "time_per_iteration": 2.5716075897216797 }, { "auxiliary_loss_clip": 0.01686057, "auxiliary_loss_mlp": 0.00336664, "balance_loss_clip": 1.3010509, "balance_loss_mlp": 0.30543089, "epoch": 0.22955057868630693, "flos": 27489438858240.0, "grad_norm": 3.046691292552805, "language_loss": 0.82034242, "learning_rate": 3.5966210231537154e-06, "loss": 0.84056962, "num_input_tokens_seen": 82140095, "router_z_loss_clip": 3.84960938, "router_z_loss_mlp": 0.31225586, "step": 3818, "time_per_iteration": 4.065706491470337 }, { "auxiliary_loss_clip": 0.01680708, "auxiliary_loss_mlp": 0.00367283, "balance_loss_clip": 1.30041564, "balance_loss_mlp": 0.33457211, "epoch": 0.2296107019389749, "flos": 23476421629440.0, "grad_norm": 5.742223213919984, "language_loss": 0.81604946, "learning_rate": 3.596386441116659e-06, "loss": 0.83652937, "num_input_tokens_seen": 82159510, "router_z_loss_clip": 3.80273438, "router_z_loss_mlp": 0.3269043, "step": 3819, "time_per_iteration": 2.688422441482544 }, { "auxiliary_loss_clip": 0.01672863, "auxiliary_loss_mlp": 0.00343884, "balance_loss_clip": 1.30121374, "balance_loss_mlp": 0.31095839, "epoch": 0.22967082519164286, "flos": 31285160760960.0, "grad_norm": 9.991675958048456, "language_loss": 0.86893058, "learning_rate": 3.5961517985444684e-06, "loss": 0.88909805, "num_input_tokens_seen": 82179580, "router_z_loss_clip": 3.71875, "router_z_loss_mlp": 0.3293457, "step": 3820, "time_per_iteration": 2.7714638710021973 }, { "auxiliary_loss_clip": 0.01664681, "auxiliary_loss_mlp": 0.00382529, "balance_loss_clip": 1.29442072, "balance_loss_mlp": 0.34616998, "epoch": 0.22973094844431083, "flos": 14642935390080.0, "grad_norm": 2.978612404594324, "language_loss": 0.75874436, "learning_rate": 3.595917095446042e-06, "loss": 0.77921641, "num_input_tokens_seen": 82195585, "router_z_loss_clip": 3.703125, "router_z_loss_mlp": 0.36303711, "step": 3821, "time_per_iteration": 2.642810583114624 }, { "auxiliary_loss_clip": 0.01654071, "auxiliary_loss_mlp": 0.00358299, "balance_loss_clip": 1.28819036, "balance_loss_mlp": 0.32673243, "epoch": 0.2297910716969788, "flos": 22823853292800.0, "grad_norm": 27.680267535752098, "language_loss": 0.87942874, "learning_rate": 3.5956823318302796e-06, "loss": 0.89955246, "num_input_tokens_seen": 82217530, "router_z_loss_clip": 3.66015625, "router_z_loss_mlp": 0.31567383, "step": 3822, "time_per_iteration": 4.121351480484009 }, { "auxiliary_loss_clip": 0.01639652, "auxiliary_loss_mlp": 0.00362421, "balance_loss_clip": 1.28446388, "balance_loss_mlp": 0.32942411, "epoch": 0.2298511949496468, "flos": 23039029716480.0, "grad_norm": 18.796079892317774, "language_loss": 0.72759765, "learning_rate": 3.5954475077060833e-06, "loss": 0.74761832, "num_input_tokens_seen": 82237980, "router_z_loss_clip": 3.55664062, "router_z_loss_mlp": 0.32983398, "step": 3823, "time_per_iteration": 2.623065710067749 }, { "auxiliary_loss_clip": 0.01615789, "auxiliary_loss_mlp": 0.00326492, "balance_loss_clip": 1.37119973, "balance_loss_mlp": 0.31247327, "epoch": 0.22991131820231475, "flos": 66890914911360.0, "grad_norm": 0.8326668672781224, "language_loss": 0.57068962, "learning_rate": 3.595212623082357e-06, "loss": 0.59011245, "num_input_tokens_seen": 82301785, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.140625, "step": 3824, "time_per_iteration": 3.1868696212768555 }, { "auxiliary_loss_clip": 0.01583706, "auxiliary_loss_mlp": 0.00349511, "balance_loss_clip": 1.24801302, "balance_loss_mlp": 0.31882608, "epoch": 0.22997144145498272, "flos": 17887248633600.0, "grad_norm": 2.1120881025767955, "language_loss": 0.79923916, "learning_rate": 3.594977677968009e-06, "loss": 0.81857133, "num_input_tokens_seen": 82317355, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.30664062, "step": 3825, "time_per_iteration": 2.552391290664673 }, { "auxiliary_loss_clip": 0.01585987, "auxiliary_loss_mlp": 0.00385327, "balance_loss_clip": 1.25231576, "balance_loss_mlp": 0.35213926, "epoch": 0.23003156470765068, "flos": 24676843178880.0, "grad_norm": 52.40753069588256, "language_loss": 0.9418394, "learning_rate": 3.5947426723719473e-06, "loss": 0.96155262, "num_input_tokens_seen": 82336645, "router_z_loss_clip": 3.33789062, "router_z_loss_mlp": 0.33203125, "step": 3826, "time_per_iteration": 2.6896347999572754 }, { "auxiliary_loss_clip": 0.01586305, "auxiliary_loss_mlp": 0.0040504, "balance_loss_clip": 1.25150514, "balance_loss_mlp": 0.36715513, "epoch": 0.23009168796031865, "flos": 15814126247040.0, "grad_norm": 32.76298135644684, "language_loss": 0.93496799, "learning_rate": 3.594507606303083e-06, "loss": 0.95488131, "num_input_tokens_seen": 82354225, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.37890625, "step": 3827, "time_per_iteration": 4.0243189334869385 }, { "auxiliary_loss_clip": 0.01559938, "auxiliary_loss_mlp": 0.00445952, "balance_loss_clip": 1.23246908, "balance_loss_mlp": 0.40759015, "epoch": 0.2301518112129866, "flos": 16212842190720.0, "grad_norm": 3.6689640330061577, "language_loss": 0.93316716, "learning_rate": 3.5942724797703314e-06, "loss": 0.95322609, "num_input_tokens_seen": 82370240, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.38378906, "step": 3828, "time_per_iteration": 2.572387218475342 }, { "auxiliary_loss_clip": 0.01550184, "auxiliary_loss_mlp": 0.00372985, "balance_loss_clip": 1.22506833, "balance_loss_mlp": 0.33936799, "epoch": 0.2302119344656546, "flos": 20595452411520.0, "grad_norm": 15.907263742799122, "language_loss": 0.77885407, "learning_rate": 3.594037292782607e-06, "loss": 0.79808581, "num_input_tokens_seen": 82389145, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.33618164, "step": 3829, "time_per_iteration": 2.583833932876587 }, { "auxiliary_loss_clip": 0.01523718, "auxiliary_loss_mlp": 0.00389387, "balance_loss_clip": 1.21057415, "balance_loss_mlp": 0.35650897, "epoch": 0.23027205771832257, "flos": 26796901662720.0, "grad_norm": 7.753957548504881, "language_loss": 0.91551393, "learning_rate": 3.5938020453488293e-06, "loss": 0.93464494, "num_input_tokens_seen": 82409185, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.32885742, "step": 3830, "time_per_iteration": 2.632283926010132 }, { "auxiliary_loss_clip": 0.01514876, "auxiliary_loss_mlp": 0.0034917, "balance_loss_clip": 1.20565343, "balance_loss_mlp": 0.31626865, "epoch": 0.23033218097099054, "flos": 43873143068160.0, "grad_norm": 5.837360026030787, "language_loss": 0.73600936, "learning_rate": 3.5935667374779177e-06, "loss": 0.75464982, "num_input_tokens_seen": 82432070, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.32885742, "step": 3831, "time_per_iteration": 2.842564105987549 }, { "auxiliary_loss_clip": 0.01509839, "auxiliary_loss_mlp": 0.00399993, "balance_loss_clip": 1.19554758, "balance_loss_mlp": 0.36449289, "epoch": 0.2303923042236585, "flos": 26067663745920.0, "grad_norm": 9.065490226263991, "language_loss": 0.84183788, "learning_rate": 3.5933313691787957e-06, "loss": 0.86093616, "num_input_tokens_seen": 82450625, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.35522461, "step": 3832, "time_per_iteration": 2.6516799926757812 }, { "auxiliary_loss_clip": 0.01467074, "auxiliary_loss_mlp": 0.00407116, "balance_loss_clip": 1.16796291, "balance_loss_mlp": 0.37290311, "epoch": 0.23045242747632647, "flos": 18296379521280.0, "grad_norm": 59.04356698156164, "language_loss": 0.94544077, "learning_rate": 3.593095940460389e-06, "loss": 0.96418273, "num_input_tokens_seen": 82468575, "router_z_loss_clip": 2.99023438, "router_z_loss_mlp": 0.34228516, "step": 3833, "time_per_iteration": 2.5972213745117188 }, { "auxiliary_loss_clip": 0.01473096, "auxiliary_loss_mlp": 0.00411363, "balance_loss_clip": 1.1681056, "balance_loss_mlp": 0.37450361, "epoch": 0.23051255072899443, "flos": 25520528805120.0, "grad_norm": 3.0994795522221166, "language_loss": 0.83798862, "learning_rate": 3.592860451331624e-06, "loss": 0.85683322, "num_input_tokens_seen": 82488655, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.36816406, "step": 3834, "time_per_iteration": 2.6614646911621094 }, { "auxiliary_loss_clip": 0.01438258, "auxiliary_loss_mlp": 0.0041101, "balance_loss_clip": 1.14316475, "balance_loss_mlp": 0.37601, "epoch": 0.2305726739816624, "flos": 21215198695680.0, "grad_norm": 2.9920172433264614, "language_loss": 0.9582963, "learning_rate": 3.592624901801432e-06, "loss": 0.976789, "num_input_tokens_seen": 82507220, "router_z_loss_clip": 2.95117188, "router_z_loss_mlp": 0.34985352, "step": 3835, "time_per_iteration": 2.6086299419403076 }, { "auxiliary_loss_clip": 0.01439442, "auxiliary_loss_mlp": 0.00409891, "balance_loss_clip": 1.14746451, "balance_loss_mlp": 0.37510532, "epoch": 0.2306327972343304, "flos": 23331127115520.0, "grad_norm": 62.031530529009196, "language_loss": 0.92512608, "learning_rate": 3.5923892918787432e-06, "loss": 0.94361937, "num_input_tokens_seen": 82527920, "router_z_loss_clip": 2.92578125, "router_z_loss_mlp": 0.34790039, "step": 3836, "time_per_iteration": 2.618842124938965 }, { "auxiliary_loss_clip": 0.01435394, "auxiliary_loss_mlp": 0.00428894, "balance_loss_clip": 1.14295268, "balance_loss_mlp": 0.3941564, "epoch": 0.23069292048699835, "flos": 20666734951680.0, "grad_norm": 2.6031159319443775, "language_loss": 0.86546838, "learning_rate": 3.5921536215724934e-06, "loss": 0.88411129, "num_input_tokens_seen": 82549040, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.34716797, "step": 3837, "time_per_iteration": 2.7157907485961914 }, { "auxiliary_loss_clip": 0.01468477, "auxiliary_loss_mlp": 0.00147442, "balance_loss_clip": 1.26969779, "balance_loss_mlp": 0.1356639, "epoch": 0.23075304373966632, "flos": 70454832393600.0, "grad_norm": 0.9772372181541251, "language_loss": 0.65615225, "learning_rate": 3.5919178908916184e-06, "loss": 0.67231143, "num_input_tokens_seen": 82604070, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.11767578, "step": 3838, "time_per_iteration": 3.0955965518951416 }, { "auxiliary_loss_clip": 0.01413751, "auxiliary_loss_mlp": 0.00400222, "balance_loss_clip": 1.12745798, "balance_loss_mlp": 0.36796394, "epoch": 0.23081316699233428, "flos": 16617986668800.0, "grad_norm": 9.318951434495228, "language_loss": 0.82238513, "learning_rate": 3.591682099845058e-06, "loss": 0.84052485, "num_input_tokens_seen": 82619665, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.32202148, "step": 3839, "time_per_iteration": 2.6256251335144043 }, { "auxiliary_loss_clip": 0.01407861, "auxiliary_loss_mlp": 0.00408014, "balance_loss_clip": 1.12268281, "balance_loss_mlp": 0.37270442, "epoch": 0.23087329024500225, "flos": 13298081253120.0, "grad_norm": 20.068595315099323, "language_loss": 0.79343551, "learning_rate": 3.591446248441752e-06, "loss": 0.81159425, "num_input_tokens_seen": 82637530, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.35302734, "step": 3840, "time_per_iteration": 2.6267831325531006 }, { "auxiliary_loss_clip": 0.01410534, "auxiliary_loss_mlp": 0.00445238, "balance_loss_clip": 1.12444711, "balance_loss_mlp": 0.4068765, "epoch": 0.23093341349767021, "flos": 17785729820160.0, "grad_norm": 3422.678944921393, "language_loss": 0.85435545, "learning_rate": 3.591210336690645e-06, "loss": 0.87291312, "num_input_tokens_seen": 82656130, "router_z_loss_clip": 2.86132812, "router_z_loss_mlp": 0.38354492, "step": 3841, "time_per_iteration": 2.6340572834014893 }, { "auxiliary_loss_clip": 0.01386594, "auxiliary_loss_mlp": 0.00373744, "balance_loss_clip": 1.10743558, "balance_loss_mlp": 0.34301153, "epoch": 0.23099353675033818, "flos": 23988076911360.0, "grad_norm": 19.489356456138427, "language_loss": 0.90224278, "learning_rate": 3.590974364600683e-06, "loss": 0.91984618, "num_input_tokens_seen": 82675295, "router_z_loss_clip": 2.79296875, "router_z_loss_mlp": 0.30712891, "step": 3842, "time_per_iteration": 2.6355743408203125 }, { "auxiliary_loss_clip": 0.01400999, "auxiliary_loss_mlp": 0.00388384, "balance_loss_clip": 1.12165689, "balance_loss_mlp": 0.35641241, "epoch": 0.23105366000300617, "flos": 35995168471680.0, "grad_norm": 3.739288758117505, "language_loss": 0.72183323, "learning_rate": 3.5907383321808135e-06, "loss": 0.73972702, "num_input_tokens_seen": 82703260, "router_z_loss_clip": 2.79492188, "router_z_loss_mlp": 0.31958008, "step": 3843, "time_per_iteration": 2.819784164428711 }, { "auxiliary_loss_clip": 0.01385939, "auxiliary_loss_mlp": 0.00416082, "balance_loss_clip": 1.10552216, "balance_loss_mlp": 0.38062924, "epoch": 0.23111378325567414, "flos": 31245335556480.0, "grad_norm": 14.49094016202683, "language_loss": 0.83434343, "learning_rate": 3.590502239439987e-06, "loss": 0.85236365, "num_input_tokens_seen": 82725060, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.35449219, "step": 3844, "time_per_iteration": 2.6809773445129395 }, { "auxiliary_loss_clip": 0.01401291, "auxiliary_loss_mlp": 0.00487767, "balance_loss_clip": 1.1176616, "balance_loss_mlp": 0.44807065, "epoch": 0.2311739065083421, "flos": 19208223204480.0, "grad_norm": 57.73295418574718, "language_loss": 0.84471834, "learning_rate": 3.590266086387156e-06, "loss": 0.86360896, "num_input_tokens_seen": 82742960, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.3972168, "step": 3845, "time_per_iteration": 2.602656364440918 }, { "auxiliary_loss_clip": 0.01403874, "auxiliary_loss_mlp": 0.0045598, "balance_loss_clip": 1.12005186, "balance_loss_mlp": 0.41907299, "epoch": 0.23123402976101007, "flos": 23360178240000.0, "grad_norm": 3.86277212373702, "language_loss": 0.82519698, "learning_rate": 3.590029873031276e-06, "loss": 0.84379548, "num_input_tokens_seen": 82760205, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.36914062, "step": 3846, "time_per_iteration": 2.581758737564087 }, { "auxiliary_loss_clip": 0.0142243, "auxiliary_loss_mlp": 0.00487723, "balance_loss_clip": 1.1349932, "balance_loss_mlp": 0.44671541, "epoch": 0.23129415301367803, "flos": 13735365425280.0, "grad_norm": 1.9351935870896522, "language_loss": 0.79531229, "learning_rate": 3.589793599381304e-06, "loss": 0.81441379, "num_input_tokens_seen": 82778590, "router_z_loss_clip": 2.87695312, "router_z_loss_mlp": 0.41015625, "step": 3847, "time_per_iteration": 2.5760838985443115 }, { "auxiliary_loss_clip": 0.01402315, "auxiliary_loss_mlp": 0.00219273, "balance_loss_clip": 1.21649718, "balance_loss_mlp": 0.20401409, "epoch": 0.231354276266346, "flos": 69737015001600.0, "grad_norm": 0.769502429711866, "language_loss": 0.60847056, "learning_rate": 3.589557265446198e-06, "loss": 0.62468636, "num_input_tokens_seen": 82833925, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.15234375, "step": 3848, "time_per_iteration": 3.000399589538574 }, { "auxiliary_loss_clip": 0.01413613, "auxiliary_loss_mlp": 0.00506104, "balance_loss_clip": 1.13069916, "balance_loss_mlp": 0.46497688, "epoch": 0.231414399519014, "flos": 18835900778880.0, "grad_norm": 11.863643549778564, "language_loss": 0.87480986, "learning_rate": 3.589320871234923e-06, "loss": 0.89400697, "num_input_tokens_seen": 82850625, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.41137695, "step": 3849, "time_per_iteration": 2.600926160812378 }, { "auxiliary_loss_clip": 0.01441006, "auxiliary_loss_mlp": 0.00559546, "balance_loss_clip": 1.14792323, "balance_loss_mlp": 0.51307869, "epoch": 0.23147452277168196, "flos": 36135470995200.0, "grad_norm": 13.56552653560103, "language_loss": 0.78588045, "learning_rate": 3.5890844167564405e-06, "loss": 0.80588591, "num_input_tokens_seen": 82872105, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.46508789, "step": 3850, "time_per_iteration": 2.703482151031494 }, { "auxiliary_loss_clip": 0.01421209, "auxiliary_loss_mlp": 0.00583498, "balance_loss_clip": 1.13228464, "balance_loss_mlp": 0.5360049, "epoch": 0.23153464602434992, "flos": 20812927305600.0, "grad_norm": 6.632644562568718, "language_loss": 0.84451443, "learning_rate": 3.588847902019718e-06, "loss": 0.86456144, "num_input_tokens_seen": 82890595, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.47485352, "step": 3851, "time_per_iteration": 2.5967869758605957 }, { "auxiliary_loss_clip": 0.01419735, "auxiliary_loss_mlp": 0.00559549, "balance_loss_clip": 1.12682176, "balance_loss_mlp": 0.51041144, "epoch": 0.2315947692770179, "flos": 19939256801280.0, "grad_norm": 7.128407186335358, "language_loss": 0.75948274, "learning_rate": 3.588611327033723e-06, "loss": 0.7792756, "num_input_tokens_seen": 82908910, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.4909668, "step": 3852, "time_per_iteration": 2.5646016597747803 }, { "auxiliary_loss_clip": 0.01437059, "auxiliary_loss_mlp": 0.00520789, "balance_loss_clip": 1.14139032, "balance_loss_mlp": 0.47565588, "epoch": 0.23165489252968585, "flos": 12855553695360.0, "grad_norm": 21.462240782163338, "language_loss": 0.75076616, "learning_rate": 3.588374691807428e-06, "loss": 0.77034461, "num_input_tokens_seen": 82925405, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.45141602, "step": 3853, "time_per_iteration": 2.597895383834839 }, { "auxiliary_loss_clip": 0.01465847, "auxiliary_loss_mlp": 0.00535809, "balance_loss_clip": 1.16989088, "balance_loss_mlp": 0.48807761, "epoch": 0.23171501578235382, "flos": 30628282792320.0, "grad_norm": 24.353038918382964, "language_loss": 0.86582476, "learning_rate": 3.5881379963498053e-06, "loss": 0.88584125, "num_input_tokens_seen": 82945615, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.47753906, "step": 3854, "time_per_iteration": 2.6793253421783447 }, { "auxiliary_loss_clip": 0.01465794, "auxiliary_loss_mlp": 0.00526728, "balance_loss_clip": 1.15848422, "balance_loss_mlp": 0.47794712, "epoch": 0.23177513903502178, "flos": 23842782397440.0, "grad_norm": 24.66245290716232, "language_loss": 0.74803782, "learning_rate": 3.587901240669831e-06, "loss": 0.76796299, "num_input_tokens_seen": 82967570, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.48803711, "step": 3855, "time_per_iteration": 2.659219980239868 }, { "auxiliary_loss_clip": 0.01448953, "auxiliary_loss_mlp": 0.00532536, "balance_loss_clip": 1.14742708, "balance_loss_mlp": 0.48768905, "epoch": 0.23183526228768978, "flos": 29570282668800.0, "grad_norm": 17.106602927614528, "language_loss": 0.7810297, "learning_rate": 3.5876644247764815e-06, "loss": 0.80084455, "num_input_tokens_seen": 82987435, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.44848633, "step": 3856, "time_per_iteration": 2.6995129585266113 }, { "auxiliary_loss_clip": 0.01440016, "auxiliary_loss_mlp": 0.00580546, "balance_loss_clip": 1.13458836, "balance_loss_mlp": 0.53369677, "epoch": 0.23189538554035774, "flos": 34458694254720.0, "grad_norm": 2.91834406866174, "language_loss": 0.83261752, "learning_rate": 3.5874275486787387e-06, "loss": 0.85282314, "num_input_tokens_seen": 83010505, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.46875, "step": 3857, "time_per_iteration": 2.7494008541107178 }, { "auxiliary_loss_clip": 0.014752, "auxiliary_loss_mlp": 0.00516679, "balance_loss_clip": 1.16538775, "balance_loss_mlp": 0.46761233, "epoch": 0.2319555087930257, "flos": 18003815245440.0, "grad_norm": 3.660252918202163, "language_loss": 0.97188991, "learning_rate": 3.587190612385584e-06, "loss": 0.99180871, "num_input_tokens_seen": 83026705, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.4909668, "step": 3858, "time_per_iteration": 4.023346424102783 }, { "auxiliary_loss_clip": 0.0147247, "auxiliary_loss_mlp": 0.00497449, "balance_loss_clip": 1.16436124, "balance_loss_mlp": 0.45293635, "epoch": 0.23201563204569367, "flos": 23143852581120.0, "grad_norm": 2.2905369744355544, "language_loss": 0.83435965, "learning_rate": 3.5869536159060026e-06, "loss": 0.85405886, "num_input_tokens_seen": 83046500, "router_z_loss_clip": 3.08203125, "router_z_loss_mlp": 0.44482422, "step": 3859, "time_per_iteration": 2.589754819869995 }, { "auxiliary_loss_clip": 0.01459842, "auxiliary_loss_mlp": 0.00532099, "balance_loss_clip": 1.15122652, "balance_loss_mlp": 0.48670369, "epoch": 0.23207575529836164, "flos": 20667991927680.0, "grad_norm": 3.1997030226697083, "language_loss": 0.88831747, "learning_rate": 3.58671655924898e-06, "loss": 0.90823686, "num_input_tokens_seen": 83065280, "router_z_loss_clip": 3.08203125, "router_z_loss_mlp": 0.45410156, "step": 3860, "time_per_iteration": 2.6342296600341797 }, { "auxiliary_loss_clip": 0.01464441, "auxiliary_loss_mlp": 0.00469904, "balance_loss_clip": 1.15824533, "balance_loss_mlp": 0.42989695, "epoch": 0.2321358785510296, "flos": 16472189364480.0, "grad_norm": 9.94350977514072, "language_loss": 0.91551816, "learning_rate": 3.586479442423508e-06, "loss": 0.93486166, "num_input_tokens_seen": 83082310, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.40014648, "step": 3861, "time_per_iteration": 3.9328665733337402 }, { "auxiliary_loss_clip": 0.01485871, "auxiliary_loss_mlp": 0.00446362, "balance_loss_clip": 1.16955948, "balance_loss_mlp": 0.40537792, "epoch": 0.2321960018036976, "flos": 21616320850560.0, "grad_norm": 3.1704372012580637, "language_loss": 0.91320801, "learning_rate": 3.586242265438576e-06, "loss": 0.93253028, "num_input_tokens_seen": 83102065, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.40966797, "step": 3862, "time_per_iteration": 2.601963520050049 }, { "auxiliary_loss_clip": 0.01468829, "auxiliary_loss_mlp": 0.00433543, "balance_loss_clip": 1.15992713, "balance_loss_mlp": 0.39537245, "epoch": 0.23225612505636556, "flos": 22271474966400.0, "grad_norm": 22.589469095175428, "language_loss": 0.80228698, "learning_rate": 3.5860050283031773e-06, "loss": 0.8213107, "num_input_tokens_seen": 83121445, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.3815918, "step": 3863, "time_per_iteration": 2.6034138202667236 }, { "auxiliary_loss_clip": 0.01481181, "auxiliary_loss_mlp": 0.00375875, "balance_loss_clip": 1.16672933, "balance_loss_mlp": 0.34359351, "epoch": 0.23231624830903352, "flos": 17052325925760.0, "grad_norm": 6.011408603416355, "language_loss": 0.8142789, "learning_rate": 3.58576773102631e-06, "loss": 0.8328495, "num_input_tokens_seen": 83138175, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.32299805, "step": 3864, "time_per_iteration": 2.573729991912842 }, { "auxiliary_loss_clip": 0.01503191, "auxiliary_loss_mlp": 0.00447438, "balance_loss_clip": 1.18139279, "balance_loss_mlp": 0.40759784, "epoch": 0.2323763715617015, "flos": 34640043045120.0, "grad_norm": 6.475105030320255, "language_loss": 0.77081662, "learning_rate": 3.5855303736169714e-06, "loss": 0.7903229, "num_input_tokens_seen": 83161975, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.3984375, "step": 3865, "time_per_iteration": 4.195216655731201 }, { "auxiliary_loss_clip": 0.01515262, "auxiliary_loss_mlp": 0.00407075, "balance_loss_clip": 1.18790758, "balance_loss_mlp": 0.36773598, "epoch": 0.23243649481436945, "flos": 25551698832000.0, "grad_norm": 8.690797802150158, "language_loss": 1.04475856, "learning_rate": 3.5852929560841617e-06, "loss": 1.06398201, "num_input_tokens_seen": 83180905, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.39331055, "step": 3866, "time_per_iteration": 2.6873841285705566 }, { "auxiliary_loss_clip": 0.01506616, "auxiliary_loss_mlp": 0.00412907, "balance_loss_clip": 1.18787265, "balance_loss_mlp": 0.37602413, "epoch": 0.23249661806703742, "flos": 20483482740480.0, "grad_norm": 16.55336281660246, "language_loss": 0.81595457, "learning_rate": 3.5850554784368846e-06, "loss": 0.83514977, "num_input_tokens_seen": 83196390, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.36865234, "step": 3867, "time_per_iteration": 2.641598701477051 }, { "auxiliary_loss_clip": 0.01491443, "auxiliary_loss_mlp": 0.00393825, "balance_loss_clip": 1.1706624, "balance_loss_mlp": 0.35706037, "epoch": 0.23255674131970538, "flos": 20376612800640.0, "grad_norm": 28.88283394645634, "language_loss": 0.88434041, "learning_rate": 3.584817940684145e-06, "loss": 0.90319312, "num_input_tokens_seen": 83216165, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.3671875, "step": 3868, "time_per_iteration": 2.600832223892212 }, { "auxiliary_loss_clip": 0.01499517, "auxiliary_loss_mlp": 0.00351573, "balance_loss_clip": 1.17711627, "balance_loss_mlp": 0.31826597, "epoch": 0.23261686457237338, "flos": 17056096853760.0, "grad_norm": 5.4040744742976985, "language_loss": 0.8227948, "learning_rate": 3.58458034283495e-06, "loss": 0.84130561, "num_input_tokens_seen": 83233845, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.33300781, "step": 3869, "time_per_iteration": 4.027331113815308 }, { "auxiliary_loss_clip": 0.01511296, "auxiliary_loss_mlp": 0.00392207, "balance_loss_clip": 1.19231784, "balance_loss_mlp": 0.3572548, "epoch": 0.23267698782504134, "flos": 29169878785920.0, "grad_norm": 2.393801602651625, "language_loss": 0.87253606, "learning_rate": 3.5843426848983097e-06, "loss": 0.8915711, "num_input_tokens_seen": 83254930, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.34960938, "step": 3870, "time_per_iteration": 2.7029871940612793 }, { "auxiliary_loss_clip": 0.01528549, "auxiliary_loss_mlp": 0.00425756, "balance_loss_clip": 1.19548178, "balance_loss_mlp": 0.38838363, "epoch": 0.2327371110777093, "flos": 21174655219200.0, "grad_norm": 483.7557305808067, "language_loss": 0.80113006, "learning_rate": 3.5841049668832357e-06, "loss": 0.82067311, "num_input_tokens_seen": 83272095, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.37365723, "step": 3871, "time_per_iteration": 2.561936378479004 }, { "auxiliary_loss_clip": 0.01518446, "auxiliary_loss_mlp": 0.00376531, "balance_loss_clip": 1.18750668, "balance_loss_mlp": 0.33824122, "epoch": 0.23279723433037727, "flos": 24863112132480.0, "grad_norm": 8.550990807151678, "language_loss": 0.76674926, "learning_rate": 3.5838671887987433e-06, "loss": 0.78569907, "num_input_tokens_seen": 83290980, "router_z_loss_clip": 3.30664062, "router_z_loss_mlp": 0.38305664, "step": 3872, "time_per_iteration": 2.641291856765747 }, { "auxiliary_loss_clip": 0.01518912, "auxiliary_loss_mlp": 0.00390294, "balance_loss_clip": 1.1905818, "balance_loss_mlp": 0.35503209, "epoch": 0.23285735758304524, "flos": 38800617344640.0, "grad_norm": 1.6446157696032937, "language_loss": 0.8568064, "learning_rate": 3.5836293506538474e-06, "loss": 0.87589842, "num_input_tokens_seen": 83315175, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.3527832, "step": 3873, "time_per_iteration": 2.7859528064727783 }, { "auxiliary_loss_clip": 0.01470829, "auxiliary_loss_mlp": 0.00130258, "balance_loss_clip": 1.27159762, "balance_loss_mlp": 0.12019652, "epoch": 0.2329174808357132, "flos": 53944113692160.0, "grad_norm": 0.8422233868496591, "language_loss": 0.60677892, "learning_rate": 3.5833914524575687e-06, "loss": 0.6227898, "num_input_tokens_seen": 83372060, "router_z_loss_clip": 1.9921875, "router_z_loss_mlp": 0.10058594, "step": 3874, "time_per_iteration": 3.028733730316162 }, { "auxiliary_loss_clip": 0.01506636, "auxiliary_loss_mlp": 0.00346939, "balance_loss_clip": 1.18330979, "balance_loss_mlp": 0.31496736, "epoch": 0.23297760408838117, "flos": 21216024708480.0, "grad_norm": 4.386811172320261, "language_loss": 0.91494286, "learning_rate": 3.583153494218927e-06, "loss": 0.93347859, "num_input_tokens_seen": 83389795, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.31982422, "step": 3875, "time_per_iteration": 2.6044623851776123 }, { "auxiliary_loss_clip": 0.01508442, "auxiliary_loss_mlp": 0.00361785, "balance_loss_clip": 1.1867516, "balance_loss_mlp": 0.32792968, "epoch": 0.23303772734104916, "flos": 28403006394240.0, "grad_norm": 6.228929072294961, "language_loss": 0.68952233, "learning_rate": 3.5829154759469464e-06, "loss": 0.70822465, "num_input_tokens_seen": 83410005, "router_z_loss_clip": 3.21679688, "router_z_loss_mlp": 0.33874512, "step": 3876, "time_per_iteration": 2.6666629314422607 }, { "auxiliary_loss_clip": 0.0153161, "auxiliary_loss_mlp": 0.00364566, "balance_loss_clip": 1.20668232, "balance_loss_mlp": 0.33190301, "epoch": 0.23309785059371713, "flos": 24314720215680.0, "grad_norm": 9.68407982865053, "language_loss": 0.78422785, "learning_rate": 3.5826773976506523e-06, "loss": 0.80318964, "num_input_tokens_seen": 83430250, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.32617188, "step": 3877, "time_per_iteration": 2.6487319469451904 }, { "auxiliary_loss_clip": 0.01520727, "auxiliary_loss_mlp": 0.00337759, "balance_loss_clip": 1.19071329, "balance_loss_mlp": 0.30635971, "epoch": 0.2331579738463851, "flos": 15992925171840.0, "grad_norm": 857.9270381883572, "language_loss": 0.89301956, "learning_rate": 3.582439259339073e-06, "loss": 0.9116044, "num_input_tokens_seen": 83447950, "router_z_loss_clip": 3.29492188, "router_z_loss_mlp": 0.31408691, "step": 3878, "time_per_iteration": 2.5793309211730957 }, { "auxiliary_loss_clip": 0.01535064, "auxiliary_loss_mlp": 0.00374088, "balance_loss_clip": 1.20047045, "balance_loss_mlp": 0.3397316, "epoch": 0.23321809709905306, "flos": 36426957863040.0, "grad_norm": 1.8077024783088067, "language_loss": 0.80507064, "learning_rate": 3.5822010610212374e-06, "loss": 0.82416213, "num_input_tokens_seen": 83467785, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.34350586, "step": 3879, "time_per_iteration": 2.7233481407165527 }, { "auxiliary_loss_clip": 0.0152801, "auxiliary_loss_mlp": 0.00347914, "balance_loss_clip": 1.19555938, "balance_loss_mlp": 0.31576347, "epoch": 0.23327822035172102, "flos": 21324762155520.0, "grad_norm": 4.049167064397569, "language_loss": 0.95763439, "learning_rate": 3.5819628027061795e-06, "loss": 0.97639358, "num_input_tokens_seen": 83485390, "router_z_loss_clip": 3.32226562, "router_z_loss_mlp": 0.3215332, "step": 3880, "time_per_iteration": 2.61074161529541 }, { "auxiliary_loss_clip": 0.01533362, "auxiliary_loss_mlp": 0.00361031, "balance_loss_clip": 1.20314157, "balance_loss_mlp": 0.32731897, "epoch": 0.233338343604389, "flos": 19171881619200.0, "grad_norm": 2418.88008758777, "language_loss": 0.78795087, "learning_rate": 3.5817244844029334e-06, "loss": 0.80689484, "num_input_tokens_seen": 83504890, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.3371582, "step": 3881, "time_per_iteration": 2.6547446250915527 }, { "auxiliary_loss_clip": 0.01537589, "auxiliary_loss_mlp": 0.00319899, "balance_loss_clip": 1.21122742, "balance_loss_mlp": 0.29021594, "epoch": 0.23339846685705698, "flos": 26908368543360.0, "grad_norm": 7.137942466611316, "language_loss": 0.73574126, "learning_rate": 3.581486106120537e-06, "loss": 0.75431615, "num_input_tokens_seen": 83526475, "router_z_loss_clip": 3.265625, "router_z_loss_mlp": 0.29650879, "step": 3882, "time_per_iteration": 2.6802635192871094 }, { "auxiliary_loss_clip": 0.01552831, "auxiliary_loss_mlp": 0.00309094, "balance_loss_clip": 1.22341371, "balance_loss_mlp": 0.27752763, "epoch": 0.23345859010972494, "flos": 32343160884480.0, "grad_norm": 7.19393593533623, "language_loss": 0.8375268, "learning_rate": 3.5812476678680287e-06, "loss": 0.85614604, "num_input_tokens_seen": 83546620, "router_z_loss_clip": 3.29882812, "router_z_loss_mlp": 0.31567383, "step": 3883, "time_per_iteration": 2.7177138328552246 }, { "auxiliary_loss_clip": 0.0146364, "auxiliary_loss_mlp": 0.00100657, "balance_loss_clip": 1.2681303, "balance_loss_mlp": 0.09107301, "epoch": 0.2335187133623929, "flos": 58484229050880.0, "grad_norm": 0.7681811969274355, "language_loss": 0.59314609, "learning_rate": 3.58100916965445e-06, "loss": 0.60878909, "num_input_tokens_seen": 83616160, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.09570312, "step": 3884, "time_per_iteration": 3.292201042175293 }, { "auxiliary_loss_clip": 0.01549048, "auxiliary_loss_mlp": 0.00248486, "balance_loss_clip": 1.21497607, "balance_loss_mlp": 0.21947059, "epoch": 0.23357883661506088, "flos": 24502317972480.0, "grad_norm": 2.26561415972392, "language_loss": 0.85807645, "learning_rate": 3.5807706114888455e-06, "loss": 0.87605178, "num_input_tokens_seen": 83636795, "router_z_loss_clip": 3.33789062, "router_z_loss_mlp": 0.29052734, "step": 3885, "time_per_iteration": 2.6868319511413574 }, { "auxiliary_loss_clip": 0.01549705, "auxiliary_loss_mlp": 0.00307875, "balance_loss_clip": 1.21884155, "balance_loss_mlp": 0.276833, "epoch": 0.23363895986772884, "flos": 18948516894720.0, "grad_norm": 19.236034654201397, "language_loss": 0.93934977, "learning_rate": 3.580531993380261e-06, "loss": 0.95792556, "num_input_tokens_seen": 83654050, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.3104248, "step": 3886, "time_per_iteration": 2.592223882675171 }, { "auxiliary_loss_clip": 0.01582786, "auxiliary_loss_mlp": 0.00283644, "balance_loss_clip": 1.24247432, "balance_loss_mlp": 0.25303078, "epoch": 0.2336990831203968, "flos": 31686821619840.0, "grad_norm": 3.0421341141889515, "language_loss": 0.79404575, "learning_rate": 3.5802933153377445e-06, "loss": 0.81271005, "num_input_tokens_seen": 83673720, "router_z_loss_clip": 3.40625, "router_z_loss_mlp": 0.30566406, "step": 3887, "time_per_iteration": 2.67746901512146 }, { "auxiliary_loss_clip": 0.01594643, "auxiliary_loss_mlp": 0.00308224, "balance_loss_clip": 1.24935484, "balance_loss_mlp": 0.2748934, "epoch": 0.23375920637306477, "flos": 27709750926720.0, "grad_norm": 26.781479898230245, "language_loss": 0.92225468, "learning_rate": 3.5800545773703475e-06, "loss": 0.94128335, "num_input_tokens_seen": 83693470, "router_z_loss_clip": 3.45117188, "router_z_loss_mlp": 0.33349609, "step": 3888, "time_per_iteration": 2.667341709136963 }, { "auxiliary_loss_clip": 0.01583932, "auxiliary_loss_mlp": 0.00264804, "balance_loss_clip": 1.24117899, "balance_loss_mlp": 0.2349301, "epoch": 0.23381932962573276, "flos": 17675627656320.0, "grad_norm": 25.14247936784745, "language_loss": 0.98027706, "learning_rate": 3.5798157794871225e-06, "loss": 0.9987644, "num_input_tokens_seen": 83711620, "router_z_loss_clip": 3.4296875, "router_z_loss_mlp": 0.29846191, "step": 3889, "time_per_iteration": 2.587050676345825 }, { "auxiliary_loss_clip": 0.01565128, "auxiliary_loss_mlp": 0.00273003, "balance_loss_clip": 1.23403776, "balance_loss_mlp": 0.24067374, "epoch": 0.23387945287840073, "flos": 14390842763520.0, "grad_norm": 10.23434922795639, "language_loss": 0.8841154, "learning_rate": 3.579576921697125e-06, "loss": 0.9024967, "num_input_tokens_seen": 83727890, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.32324219, "step": 3890, "time_per_iteration": 2.5834460258483887 }, { "auxiliary_loss_clip": 0.01554162, "auxiliary_loss_mlp": 0.00266516, "balance_loss_clip": 1.22774434, "balance_loss_mlp": 0.23385252, "epoch": 0.2339395761310687, "flos": 46097988503040.0, "grad_norm": 91.72431315299482, "language_loss": 0.80513477, "learning_rate": 3.579338004009412e-06, "loss": 0.82334161, "num_input_tokens_seen": 83749370, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.3269043, "step": 3891, "time_per_iteration": 2.813584566116333 }, { "auxiliary_loss_clip": 0.01566615, "auxiliary_loss_mlp": 0.00263504, "balance_loss_clip": 1.23669791, "balance_loss_mlp": 0.23184188, "epoch": 0.23399969938373666, "flos": 22382044007040.0, "grad_norm": 90.87057371080303, "language_loss": 0.90019822, "learning_rate": 3.5790990264330433e-06, "loss": 0.91849947, "num_input_tokens_seen": 83769560, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.31640625, "step": 3892, "time_per_iteration": 2.670623540878296 }, { "auxiliary_loss_clip": 0.01573988, "auxiliary_loss_mlp": 0.00258747, "balance_loss_clip": 1.23813069, "balance_loss_mlp": 0.22620277, "epoch": 0.23405982263640462, "flos": 43508542066560.0, "grad_norm": 14.914749340877671, "language_loss": 0.72299528, "learning_rate": 3.578859988977082e-06, "loss": 0.74132258, "num_input_tokens_seen": 83795635, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.32543945, "step": 3893, "time_per_iteration": 2.8169593811035156 }, { "auxiliary_loss_clip": 0.01587422, "auxiliary_loss_mlp": 0.00251067, "balance_loss_clip": 1.24884081, "balance_loss_mlp": 0.2194528, "epoch": 0.2341199458890726, "flos": 22564685687040.0, "grad_norm": 2.399695096691209, "language_loss": 0.87677372, "learning_rate": 3.5786208916505916e-06, "loss": 0.89515865, "num_input_tokens_seen": 83814090, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.31591797, "step": 3894, "time_per_iteration": 2.5976035594940186 }, { "auxiliary_loss_clip": 0.01566574, "auxiliary_loss_mlp": 0.00268501, "balance_loss_clip": 1.23478985, "balance_loss_mlp": 0.23617128, "epoch": 0.23418006914174055, "flos": 25633970933760.0, "grad_norm": 5.193771558812267, "language_loss": 0.88060844, "learning_rate": 3.5783817344626383e-06, "loss": 0.89895928, "num_input_tokens_seen": 83836870, "router_z_loss_clip": 3.31640625, "router_z_loss_mlp": 0.32324219, "step": 3895, "time_per_iteration": 2.687689781188965 }, { "auxiliary_loss_clip": 0.01578566, "auxiliary_loss_mlp": 0.00287456, "balance_loss_clip": 1.241799, "balance_loss_mlp": 0.25410175, "epoch": 0.23424019239440855, "flos": 13545936074880.0, "grad_norm": 21.868334041643614, "language_loss": 0.92191064, "learning_rate": 3.578142517422292e-06, "loss": 0.94057083, "num_input_tokens_seen": 83853275, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.33349609, "step": 3896, "time_per_iteration": 2.6185250282287598 }, { "auxiliary_loss_clip": 0.01571084, "auxiliary_loss_mlp": 0.00259424, "balance_loss_clip": 1.23473108, "balance_loss_mlp": 0.22738072, "epoch": 0.2343003156470765, "flos": 22419498913920.0, "grad_norm": 21.44322702814386, "language_loss": 0.89202905, "learning_rate": 3.577903240538623e-06, "loss": 0.91033411, "num_input_tokens_seen": 83872340, "router_z_loss_clip": 3.36132812, "router_z_loss_mlp": 0.3203125, "step": 3897, "time_per_iteration": 2.627413749694824 }, { "auxiliary_loss_clip": 0.01566192, "auxiliary_loss_mlp": 0.00267953, "balance_loss_clip": 1.2369715, "balance_loss_mlp": 0.23457402, "epoch": 0.23436043889974448, "flos": 14790815683200.0, "grad_norm": 2.7420126972733248, "language_loss": 0.86911112, "learning_rate": 3.577663903820705e-06, "loss": 0.8874526, "num_input_tokens_seen": 83888795, "router_z_loss_clip": 3.29296875, "router_z_loss_mlp": 0.33398438, "step": 3898, "time_per_iteration": 2.637450695037842 }, { "auxiliary_loss_clip": 0.01574462, "auxiliary_loss_mlp": 0.0022852, "balance_loss_clip": 1.24930453, "balance_loss_mlp": 0.1982166, "epoch": 0.23442056215241244, "flos": 22965700101120.0, "grad_norm": 1.9103512585332605, "language_loss": 0.82676625, "learning_rate": 3.577424507277614e-06, "loss": 0.84479606, "num_input_tokens_seen": 83906820, "router_z_loss_clip": 3.24804688, "router_z_loss_mlp": 0.30297852, "step": 3899, "time_per_iteration": 2.669923782348633 }, { "auxiliary_loss_clip": 0.01588296, "auxiliary_loss_mlp": 0.0026412, "balance_loss_clip": 1.24891829, "balance_loss_mlp": 0.2306221, "epoch": 0.2344806854050804, "flos": 23071887682560.0, "grad_norm": 16.727755607081637, "language_loss": 0.83395565, "learning_rate": 3.5771850509184277e-06, "loss": 0.85247976, "num_input_tokens_seen": 83926370, "router_z_loss_clip": 3.3984375, "router_z_loss_mlp": 0.33496094, "step": 3900, "time_per_iteration": 4.065730571746826 }, { "auxiliary_loss_clip": 0.01560803, "auxiliary_loss_mlp": 0.00255706, "balance_loss_clip": 1.23531055, "balance_loss_mlp": 0.2193234, "epoch": 0.23454080865774837, "flos": 16327074418560.0, "grad_norm": 16.746783477497107, "language_loss": 0.73628408, "learning_rate": 3.5769455347522256e-06, "loss": 0.75444913, "num_input_tokens_seen": 83944600, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.36352539, "step": 3901, "time_per_iteration": 2.5840258598327637 }, { "auxiliary_loss_clip": 0.01514717, "auxiliary_loss_mlp": 0.00101665, "balance_loss_clip": 1.33315003, "balance_loss_mlp": 0.08468911, "epoch": 0.23460093191041637, "flos": 67760958142080.0, "grad_norm": 0.7420805585575118, "language_loss": 0.58182955, "learning_rate": 3.576705958788091e-06, "loss": 0.59799337, "num_input_tokens_seen": 84005100, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.16992188, "step": 3902, "time_per_iteration": 3.0634796619415283 }, { "auxiliary_loss_clip": 0.0158465, "auxiliary_loss_mlp": 0.00217457, "balance_loss_clip": 1.25498676, "balance_loss_mlp": 0.18231453, "epoch": 0.23466105516308433, "flos": 20077619990400.0, "grad_norm": 31.02036356202509, "language_loss": 0.91951847, "learning_rate": 3.576466323035108e-06, "loss": 0.93753952, "num_input_tokens_seen": 84023775, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.35131836, "step": 3903, "time_per_iteration": 4.025574445724487 }, { "auxiliary_loss_clip": 0.01554509, "auxiliary_loss_mlp": 0.00246712, "balance_loss_clip": 1.23120761, "balance_loss_mlp": 0.21180737, "epoch": 0.2347211784157523, "flos": 24535714642560.0, "grad_norm": 16.82525038503733, "language_loss": 0.89066505, "learning_rate": 3.5762266275023645e-06, "loss": 0.90867722, "num_input_tokens_seen": 84042605, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.34936523, "step": 3904, "time_per_iteration": 2.63858699798584 }, { "auxiliary_loss_clip": 0.0158458, "auxiliary_loss_mlp": 0.00255746, "balance_loss_clip": 1.25399172, "balance_loss_mlp": 0.217289, "epoch": 0.23478130166842026, "flos": 23805040181760.0, "grad_norm": 33.03334815152209, "language_loss": 0.81339985, "learning_rate": 3.57598687219895e-06, "loss": 0.83180308, "num_input_tokens_seen": 84061520, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.38452148, "step": 3905, "time_per_iteration": 2.6143176555633545 }, { "auxiliary_loss_clip": 0.01561623, "auxiliary_loss_mlp": 0.00248992, "balance_loss_clip": 1.2428596, "balance_loss_mlp": 0.21816465, "epoch": 0.23484142492108823, "flos": 24093618048000.0, "grad_norm": 7.284202622819403, "language_loss": 0.77291584, "learning_rate": 3.5757470571339543e-06, "loss": 0.79102194, "num_input_tokens_seen": 84081800, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.30810547, "step": 3906, "time_per_iteration": 2.64517879486084 }, { "auxiliary_loss_clip": 0.01550132, "auxiliary_loss_mlp": 0.00253157, "balance_loss_clip": 1.23197889, "balance_loss_mlp": 0.2165361, "epoch": 0.2349015481737562, "flos": 29095830898560.0, "grad_norm": 7.350384070206175, "language_loss": 0.84044558, "learning_rate": 3.575507182316473e-06, "loss": 0.85847849, "num_input_tokens_seen": 84102340, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.36645508, "step": 3907, "time_per_iteration": 4.168622732162476 }, { "auxiliary_loss_clip": 0.01573689, "auxiliary_loss_mlp": 0.00259579, "balance_loss_clip": 1.24849987, "balance_loss_mlp": 0.2226243, "epoch": 0.23496167142642416, "flos": 18916305373440.0, "grad_norm": 27.04044802041194, "language_loss": 0.80529988, "learning_rate": 3.575267247755601e-06, "loss": 0.8236326, "num_input_tokens_seen": 84120370, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.36938477, "step": 3908, "time_per_iteration": 2.608916759490967 }, { "auxiliary_loss_clip": 0.01486048, "auxiliary_loss_mlp": 0.00105317, "balance_loss_clip": 1.31350732, "balance_loss_mlp": 0.09005806, "epoch": 0.23502179467909215, "flos": 55868062896000.0, "grad_norm": 1.0581299642734718, "language_loss": 0.73459184, "learning_rate": 3.5750272534604367e-06, "loss": 0.75050551, "num_input_tokens_seen": 84165515, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.15234375, "step": 3909, "time_per_iteration": 2.8471546173095703 }, { "auxiliary_loss_clip": 0.01540532, "auxiliary_loss_mlp": 0.00260073, "balance_loss_clip": 1.22871268, "balance_loss_mlp": 0.22752905, "epoch": 0.23508191793176011, "flos": 23401763210880.0, "grad_norm": 2.9206773803145456, "language_loss": 0.94145197, "learning_rate": 3.5747871994400822e-06, "loss": 0.95945805, "num_input_tokens_seen": 84184540, "router_z_loss_clip": 3.12304688, "router_z_loss_mlp": 0.32568359, "step": 3910, "time_per_iteration": 2.6145718097686768 }, { "auxiliary_loss_clip": 0.01554889, "auxiliary_loss_mlp": 0.00252051, "balance_loss_clip": 1.24737096, "balance_loss_mlp": 0.21957824, "epoch": 0.23514204118442808, "flos": 20047671025920.0, "grad_norm": 14.916258518016361, "language_loss": 0.85056615, "learning_rate": 3.5745470857036386e-06, "loss": 0.86863554, "num_input_tokens_seen": 84202025, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.32470703, "step": 3911, "time_per_iteration": 2.6060173511505127 }, { "auxiliary_loss_clip": 0.01566302, "auxiliary_loss_mlp": 0.00259928, "balance_loss_clip": 1.2565794, "balance_loss_mlp": 0.22373646, "epoch": 0.23520216443709605, "flos": 21580589796480.0, "grad_norm": 15.7874487418103, "language_loss": 0.864685, "learning_rate": 3.5743069122602122e-06, "loss": 0.88294727, "num_input_tokens_seen": 84221895, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.36181641, "step": 3912, "time_per_iteration": 4.077056169509888 }, { "auxiliary_loss_clip": 0.01543309, "auxiliary_loss_mlp": 0.00240574, "balance_loss_clip": 1.23798919, "balance_loss_mlp": 0.20876943, "epoch": 0.235262287689764, "flos": 23185796688000.0, "grad_norm": 6.192101228155215, "language_loss": 0.80841827, "learning_rate": 3.574066679118909e-06, "loss": 0.82625711, "num_input_tokens_seen": 84240455, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.31835938, "step": 3913, "time_per_iteration": 2.773421049118042 }, { "auxiliary_loss_clip": 0.01554037, "auxiliary_loss_mlp": 0.00243949, "balance_loss_clip": 1.2464658, "balance_loss_mlp": 0.21030819, "epoch": 0.23532241094243198, "flos": 23185222070400.0, "grad_norm": 8.748131638046283, "language_loss": 0.84424198, "learning_rate": 3.57382638628884e-06, "loss": 0.86222184, "num_input_tokens_seen": 84261605, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.33642578, "step": 3914, "time_per_iteration": 2.6995866298675537 }, { "auxiliary_loss_clip": 0.01545427, "auxiliary_loss_mlp": 0.00251648, "balance_loss_clip": 1.24605656, "balance_loss_mlp": 0.21853197, "epoch": 0.23538253419509997, "flos": 17019324305280.0, "grad_norm": 2.063064134486771, "language_loss": 0.99065745, "learning_rate": 3.5735860337791174e-06, "loss": 1.00862825, "num_input_tokens_seen": 84278675, "router_z_loss_clip": 2.98828125, "router_z_loss_mlp": 0.33105469, "step": 3915, "time_per_iteration": 2.558112621307373 }, { "auxiliary_loss_clip": 0.01492564, "auxiliary_loss_mlp": 0.00106461, "balance_loss_clip": 1.32614124, "balance_loss_mlp": 0.0959233, "epoch": 0.23544265744776793, "flos": 63448588967040.0, "grad_norm": 0.8061902987727162, "language_loss": 0.5936991, "learning_rate": 3.573345621598854e-06, "loss": 0.60968935, "num_input_tokens_seen": 84329765, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.10546875, "step": 3916, "time_per_iteration": 3.0434319972991943 }, { "auxiliary_loss_clip": 0.01492433, "auxiliary_loss_mlp": 0.00159461, "balance_loss_clip": 1.32925057, "balance_loss_mlp": 0.14715844, "epoch": 0.2355027807004359, "flos": 70515343831680.0, "grad_norm": 0.7655231812963825, "language_loss": 0.49393615, "learning_rate": 3.5731051497571675e-06, "loss": 0.51045513, "num_input_tokens_seen": 84393680, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.12255859, "step": 3917, "time_per_iteration": 3.137397289276123 }, { "auxiliary_loss_clip": 0.0155895, "auxiliary_loss_mlp": 0.00270919, "balance_loss_clip": 1.25439644, "balance_loss_mlp": 0.23899487, "epoch": 0.23556290395310386, "flos": 21434289701760.0, "grad_norm": 2.1371563650068857, "language_loss": 0.83699918, "learning_rate": 3.5728646182631756e-06, "loss": 0.8552978, "num_input_tokens_seen": 84412640, "router_z_loss_clip": 3.04296875, "router_z_loss_mlp": 0.3190918, "step": 3918, "time_per_iteration": 2.5929818153381348 }, { "auxiliary_loss_clip": 0.01544257, "auxiliary_loss_mlp": 0.00256567, "balance_loss_clip": 1.24220395, "balance_loss_mlp": 0.22438005, "epoch": 0.23562302720577183, "flos": 18186421011840.0, "grad_norm": 4.381977124747024, "language_loss": 0.78792334, "learning_rate": 3.5726240271259995e-06, "loss": 0.80593157, "num_input_tokens_seen": 84431605, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.32202148, "step": 3919, "time_per_iteration": 2.61092209815979 }, { "auxiliary_loss_clip": 0.01535642, "auxiliary_loss_mlp": 0.00256333, "balance_loss_clip": 1.24478459, "balance_loss_mlp": 0.22598219, "epoch": 0.2356831504584398, "flos": 33730497832320.0, "grad_norm": 10.733740379184598, "language_loss": 0.75733346, "learning_rate": 3.5723833763547634e-06, "loss": 0.77525318, "num_input_tokens_seen": 84454210, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.30322266, "step": 3920, "time_per_iteration": 2.8123748302459717 }, { "auxiliary_loss_clip": 0.01556207, "auxiliary_loss_mlp": 0.00290161, "balance_loss_clip": 1.26170325, "balance_loss_mlp": 0.25935712, "epoch": 0.23574327371110776, "flos": 24932778560640.0, "grad_norm": 5.393722228254328, "language_loss": 0.83442342, "learning_rate": 3.5721426659585916e-06, "loss": 0.85288709, "num_input_tokens_seen": 84475540, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.30810547, "step": 3921, "time_per_iteration": 2.6680800914764404 }, { "auxiliary_loss_clip": 0.01545339, "auxiliary_loss_mlp": 0.00257986, "balance_loss_clip": 1.25230217, "balance_loss_mlp": 0.22341537, "epoch": 0.23580339696377575, "flos": 17822107319040.0, "grad_norm": 2.3089907563975576, "language_loss": 0.86559844, "learning_rate": 3.571901895946612e-06, "loss": 0.88363171, "num_input_tokens_seen": 84494580, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.34545898, "step": 3922, "time_per_iteration": 2.5993423461914062 }, { "auxiliary_loss_clip": 0.01549468, "auxiliary_loss_mlp": 0.00267292, "balance_loss_clip": 1.25801146, "balance_loss_mlp": 0.23501037, "epoch": 0.23586352021644372, "flos": 26286611097600.0, "grad_norm": 7.836508164740288, "language_loss": 0.87763554, "learning_rate": 3.571661066327956e-06, "loss": 0.89580315, "num_input_tokens_seen": 84513850, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.32299805, "step": 3923, "time_per_iteration": 2.657937526702881 }, { "auxiliary_loss_clip": 0.01581766, "auxiliary_loss_mlp": 0.00242517, "balance_loss_clip": 1.29043818, "balance_loss_mlp": 0.20978169, "epoch": 0.23592364346911168, "flos": 14246697484800.0, "grad_norm": 15.539235542254026, "language_loss": 0.81207317, "learning_rate": 3.571420177111754e-06, "loss": 0.83031607, "num_input_tokens_seen": 84532315, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.32739258, "step": 3924, "time_per_iteration": 2.6227288246154785 }, { "auxiliary_loss_clip": 0.01592422, "auxiliary_loss_mlp": 0.00278589, "balance_loss_clip": 1.29647946, "balance_loss_mlp": 0.24301709, "epoch": 0.23598376672177965, "flos": 18587938216320.0, "grad_norm": 1.6616119983045639, "language_loss": 0.90760148, "learning_rate": 3.5711792283071416e-06, "loss": 0.92631155, "num_input_tokens_seen": 84550970, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.35571289, "step": 3925, "time_per_iteration": 2.6524109840393066 }, { "auxiliary_loss_clip": 0.01584101, "auxiliary_loss_mlp": 0.00267022, "balance_loss_clip": 1.29239333, "balance_loss_mlp": 0.23078194, "epoch": 0.2360438899744476, "flos": 22675542036480.0, "grad_norm": 22.822191881419794, "language_loss": 0.68925118, "learning_rate": 3.5709382199232564e-06, "loss": 0.70776242, "num_input_tokens_seen": 84571655, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.36230469, "step": 3926, "time_per_iteration": 2.6537747383117676 }, { "auxiliary_loss_clip": 0.01609847, "auxiliary_loss_mlp": 0.00316836, "balance_loss_clip": 1.31829071, "balance_loss_mlp": 0.28135914, "epoch": 0.23610401322711558, "flos": 29570139014400.0, "grad_norm": 5.729943695238413, "language_loss": 0.78991246, "learning_rate": 3.570697151969235e-06, "loss": 0.80917937, "num_input_tokens_seen": 84593130, "router_z_loss_clip": 2.91601562, "router_z_loss_mlp": 0.35498047, "step": 3927, "time_per_iteration": 2.7240242958068848 }, { "auxiliary_loss_clip": 0.01612617, "auxiliary_loss_mlp": 0.00275788, "balance_loss_clip": 1.31724811, "balance_loss_mlp": 0.23892856, "epoch": 0.23616413647978354, "flos": 17858520731520.0, "grad_norm": 23.753180391826476, "language_loss": 0.81736124, "learning_rate": 3.570456024454221e-06, "loss": 0.8362453, "num_input_tokens_seen": 84612410, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.36889648, "step": 3928, "time_per_iteration": 2.6664931774139404 }, { "auxiliary_loss_clip": 0.0164579, "auxiliary_loss_mlp": 0.00296392, "balance_loss_clip": 1.34759724, "balance_loss_mlp": 0.25772041, "epoch": 0.23622425973245154, "flos": 11034847157760.0, "grad_norm": 26.802330543272586, "language_loss": 0.93908441, "learning_rate": 3.5702148373873576e-06, "loss": 0.95850623, "num_input_tokens_seen": 84627610, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.38671875, "step": 3929, "time_per_iteration": 2.60994815826416 }, { "auxiliary_loss_clip": 0.01644935, "auxiliary_loss_mlp": 0.00351134, "balance_loss_clip": 1.34153318, "balance_loss_mlp": 0.30964893, "epoch": 0.2362843829851195, "flos": 23404061681280.0, "grad_norm": 12.508779584627433, "language_loss": 0.80607295, "learning_rate": 3.569973590777789e-06, "loss": 0.82603365, "num_input_tokens_seen": 84648415, "router_z_loss_clip": 3.03515625, "router_z_loss_mlp": 0.41479492, "step": 3930, "time_per_iteration": 2.6540067195892334 }, { "auxiliary_loss_clip": 0.01649709, "auxiliary_loss_mlp": 0.00312186, "balance_loss_clip": 1.35591149, "balance_loss_mlp": 0.27203614, "epoch": 0.23634450623778747, "flos": 39529855261440.0, "grad_norm": 160.3118174208196, "language_loss": 0.82710195, "learning_rate": 3.569732284634665e-06, "loss": 0.84672087, "num_input_tokens_seen": 84670080, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.40185547, "step": 3931, "time_per_iteration": 2.768934726715088 }, { "auxiliary_loss_clip": 0.01660748, "auxiliary_loss_mlp": 0.00310808, "balance_loss_clip": 1.35857034, "balance_loss_mlp": 0.27013344, "epoch": 0.23640462949045543, "flos": 24207167917440.0, "grad_norm": 11.467637750819025, "language_loss": 0.89064109, "learning_rate": 3.569490918967136e-06, "loss": 0.9103567, "num_input_tokens_seen": 84686465, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.40673828, "step": 3932, "time_per_iteration": 2.6669669151306152 }, { "auxiliary_loss_clip": 0.0165614, "auxiliary_loss_mlp": 0.00359892, "balance_loss_clip": 1.35943317, "balance_loss_mlp": 0.32069585, "epoch": 0.2364647527431234, "flos": 26177622255360.0, "grad_norm": 13.497292911386934, "language_loss": 0.92114955, "learning_rate": 3.5692494937843537e-06, "loss": 0.94130981, "num_input_tokens_seen": 84708825, "router_z_loss_clip": 2.96679688, "router_z_loss_mlp": 0.3918457, "step": 3933, "time_per_iteration": 2.6621618270874023 }, { "auxiliary_loss_clip": 0.01653134, "auxiliary_loss_mlp": 0.00383917, "balance_loss_clip": 1.34688485, "balance_loss_mlp": 0.3440769, "epoch": 0.23652487599579136, "flos": 22637009721600.0, "grad_norm": 2.775742774450453, "language_loss": 0.90382171, "learning_rate": 3.5690080090954727e-06, "loss": 0.92419225, "num_input_tokens_seen": 84726165, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.39868164, "step": 3934, "time_per_iteration": 2.6503539085388184 }, { "auxiliary_loss_clip": 0.01642164, "auxiliary_loss_mlp": 0.00348172, "balance_loss_clip": 1.34210443, "balance_loss_mlp": 0.30952418, "epoch": 0.23658499924845935, "flos": 21762261809280.0, "grad_norm": 6.584901106765564, "language_loss": 0.86059344, "learning_rate": 3.5687664649096515e-06, "loss": 0.8804968, "num_input_tokens_seen": 84745815, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.38671875, "step": 3935, "time_per_iteration": 2.633786916732788 }, { "auxiliary_loss_clip": 0.01621958, "auxiliary_loss_mlp": 0.00351701, "balance_loss_clip": 1.33459544, "balance_loss_mlp": 0.31591401, "epoch": 0.23664512250112732, "flos": 21798998444160.0, "grad_norm": 3.688213620248369, "language_loss": 0.87282515, "learning_rate": 3.5685248612360487e-06, "loss": 0.89256179, "num_input_tokens_seen": 84765415, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.35791016, "step": 3936, "time_per_iteration": 2.6274874210357666 }, { "auxiliary_loss_clip": 0.01615905, "auxiliary_loss_mlp": 0.00386871, "balance_loss_clip": 1.32150865, "balance_loss_mlp": 0.34958249, "epoch": 0.23670524575379528, "flos": 22637871648000.0, "grad_norm": 4.4074001977466954, "language_loss": 0.84176922, "learning_rate": 3.568283198083826e-06, "loss": 0.86179703, "num_input_tokens_seen": 84787080, "router_z_loss_clip": 2.94726562, "router_z_loss_mlp": 0.37255859, "step": 3937, "time_per_iteration": 2.643369674682617 }, { "auxiliary_loss_clip": 0.01652543, "auxiliary_loss_mlp": 0.00457271, "balance_loss_clip": 1.35462093, "balance_loss_mlp": 0.417503, "epoch": 0.23676536900646325, "flos": 16725000263040.0, "grad_norm": 121.01535407188125, "language_loss": 0.92565048, "learning_rate": 3.568041475462147e-06, "loss": 0.94674861, "num_input_tokens_seen": 84805395, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.39770508, "step": 3938, "time_per_iteration": 2.582885980606079 }, { "auxiliary_loss_clip": 0.01644519, "auxiliary_loss_mlp": 0.00487901, "balance_loss_clip": 1.34440875, "balance_loss_mlp": 0.44663098, "epoch": 0.23682549225913122, "flos": 11135611785600.0, "grad_norm": 47.63368999001478, "language_loss": 1.01336813, "learning_rate": 3.5677996933801785e-06, "loss": 1.03469241, "num_input_tokens_seen": 84818090, "router_z_loss_clip": 3.00195312, "router_z_loss_mlp": 0.41259766, "step": 3939, "time_per_iteration": 2.525876760482788 }, { "auxiliary_loss_clip": 0.01645792, "auxiliary_loss_mlp": 0.00459089, "balance_loss_clip": 1.33927917, "balance_loss_mlp": 0.41877213, "epoch": 0.23688561551179918, "flos": 22559226819840.0, "grad_norm": 2.005170846304699, "language_loss": 0.89588594, "learning_rate": 3.567557851847088e-06, "loss": 0.91693473, "num_input_tokens_seen": 84837695, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.40332031, "step": 3940, "time_per_iteration": 2.6353864669799805 }, { "auxiliary_loss_clip": 0.01648145, "auxiliary_loss_mlp": 0.00444867, "balance_loss_clip": 1.33388793, "balance_loss_mlp": 0.40531337, "epoch": 0.23694573876446715, "flos": 18514895909760.0, "grad_norm": 5.406575191533512, "language_loss": 0.99016082, "learning_rate": 3.5673159508720464e-06, "loss": 1.01109099, "num_input_tokens_seen": 84854630, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.39550781, "step": 3941, "time_per_iteration": 2.5834462642669678 }, { "auxiliary_loss_clip": 0.01638895, "auxiliary_loss_mlp": 0.00448999, "balance_loss_clip": 1.325899, "balance_loss_mlp": 0.40818125, "epoch": 0.23700586201713514, "flos": 15335723980800.0, "grad_norm": 29.92943228445727, "language_loss": 0.93808508, "learning_rate": 3.5670739904642274e-06, "loss": 0.95896399, "num_input_tokens_seen": 84871805, "router_z_loss_clip": 3.13085938, "router_z_loss_mlp": 0.40820312, "step": 3942, "time_per_iteration": 4.1977598667144775 }, { "auxiliary_loss_clip": 0.01648596, "auxiliary_loss_mlp": 0.00480081, "balance_loss_clip": 1.33617735, "balance_loss_mlp": 0.43995482, "epoch": 0.2370659852698031, "flos": 23947605262080.0, "grad_norm": 5.860146191166942, "language_loss": 0.89329016, "learning_rate": 3.5668319706328065e-06, "loss": 0.91457689, "num_input_tokens_seen": 84889815, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.40136719, "step": 3943, "time_per_iteration": 2.661034345626831 }, { "auxiliary_loss_clip": 0.01654389, "auxiliary_loss_mlp": 0.00436482, "balance_loss_clip": 1.32853842, "balance_loss_mlp": 0.39666647, "epoch": 0.23712610852247107, "flos": 15332527670400.0, "grad_norm": 97.690761144732, "language_loss": 0.78468108, "learning_rate": 3.566589891386959e-06, "loss": 0.80558985, "num_input_tokens_seen": 84904380, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.39794922, "step": 3944, "time_per_iteration": 2.6178956031799316 }, { "auxiliary_loss_clip": 0.0165479, "auxiliary_loss_mlp": 0.00423613, "balance_loss_clip": 1.32936287, "balance_loss_mlp": 0.38703954, "epoch": 0.23718623177513903, "flos": 19682567233920.0, "grad_norm": 3.488330412869094, "language_loss": 0.86114967, "learning_rate": 3.566347752735866e-06, "loss": 0.88193369, "num_input_tokens_seen": 84922935, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.3659668, "step": 3945, "time_per_iteration": 4.039342164993286 }, { "auxiliary_loss_clip": 0.01670172, "auxiliary_loss_mlp": 0.0042583, "balance_loss_clip": 1.34577441, "balance_loss_mlp": 0.38873214, "epoch": 0.237246355027807, "flos": 24973322037120.0, "grad_norm": 7.270644829318704, "language_loss": 0.70462501, "learning_rate": 3.5661055546887094e-06, "loss": 0.72558498, "num_input_tokens_seen": 84943685, "router_z_loss_clip": 3.24414062, "router_z_loss_mlp": 0.37084961, "step": 3946, "time_per_iteration": 2.676464080810547 }, { "auxiliary_loss_clip": 0.01634354, "auxiliary_loss_mlp": 0.00384909, "balance_loss_clip": 1.32062054, "balance_loss_mlp": 0.34714314, "epoch": 0.23730647828047496, "flos": 15377416692480.0, "grad_norm": 5.727668811218663, "language_loss": 0.83991826, "learning_rate": 3.5658632972546734e-06, "loss": 0.860111, "num_input_tokens_seen": 84959505, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.37768555, "step": 3947, "time_per_iteration": 2.6729941368103027 }, { "auxiliary_loss_clip": 0.01650991, "auxiliary_loss_mlp": 0.00338129, "balance_loss_clip": 1.33138084, "balance_loss_mlp": 0.30110276, "epoch": 0.23736660153314296, "flos": 28150662372480.0, "grad_norm": 107.73161182776765, "language_loss": 0.87244987, "learning_rate": 3.565620980442944e-06, "loss": 0.89234114, "num_input_tokens_seen": 84982130, "router_z_loss_clip": 3.1953125, "router_z_loss_mlp": 0.37036133, "step": 3948, "time_per_iteration": 2.7135183811187744 }, { "auxiliary_loss_clip": 0.01628378, "auxiliary_loss_mlp": 0.00389448, "balance_loss_clip": 1.31541967, "balance_loss_mlp": 0.35292196, "epoch": 0.23742672478581092, "flos": 22086570729600.0, "grad_norm": 25.556796644547752, "language_loss": 0.87699342, "learning_rate": 3.5653786042627107e-06, "loss": 0.89717168, "num_input_tokens_seen": 85000640, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.36474609, "step": 3949, "time_per_iteration": 2.6440789699554443 }, { "auxiliary_loss_clip": 0.01608256, "auxiliary_loss_mlp": 0.00339505, "balance_loss_clip": 1.29179096, "balance_loss_mlp": 0.30154908, "epoch": 0.2374868480384789, "flos": 19537093152000.0, "grad_norm": 2.64894777950134, "language_loss": 0.80734909, "learning_rate": 3.565136168723163e-06, "loss": 0.82682675, "num_input_tokens_seen": 85018970, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.37939453, "step": 3950, "time_per_iteration": 4.183924674987793 }, { "auxiliary_loss_clip": 0.01570186, "auxiliary_loss_mlp": 0.00302515, "balance_loss_clip": 1.26669753, "balance_loss_mlp": 0.26582262, "epoch": 0.23754697129114685, "flos": 19422501788160.0, "grad_norm": 11.545321788611842, "language_loss": 0.78510273, "learning_rate": 3.564893673833495e-06, "loss": 0.80382979, "num_input_tokens_seen": 85035905, "router_z_loss_clip": 3.03515625, "router_z_loss_mlp": 0.36694336, "step": 3951, "time_per_iteration": 2.6240272521972656 }, { "auxiliary_loss_clip": 0.01566021, "auxiliary_loss_mlp": 0.00320402, "balance_loss_clip": 1.2649008, "balance_loss_mlp": 0.28661782, "epoch": 0.23760709454381482, "flos": 19501002961920.0, "grad_norm": 14.373065667816087, "language_loss": 0.81967396, "learning_rate": 3.564651119602903e-06, "loss": 0.83853817, "num_input_tokens_seen": 85054560, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.33789062, "step": 3952, "time_per_iteration": 2.609994411468506 }, { "auxiliary_loss_clip": 0.01546732, "auxiliary_loss_mlp": 0.00285538, "balance_loss_clip": 1.24812555, "balance_loss_mlp": 0.25416243, "epoch": 0.23766721779648278, "flos": 27636600879360.0, "grad_norm": 1855.2640625322074, "language_loss": 0.78238112, "learning_rate": 3.564408506040583e-06, "loss": 0.80070382, "num_input_tokens_seen": 85074425, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.31347656, "step": 3953, "time_per_iteration": 2.6848113536834717 }, { "auxiliary_loss_clip": 0.01533833, "auxiliary_loss_mlp": 0.00271547, "balance_loss_clip": 1.23571539, "balance_loss_mlp": 0.23644015, "epoch": 0.23772734104915075, "flos": 23404348990080.0, "grad_norm": 3.4353021497600142, "language_loss": 0.90531409, "learning_rate": 3.5641658331557356e-06, "loss": 0.92336792, "num_input_tokens_seen": 85092865, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.35119629, "step": 3954, "time_per_iteration": 4.022048234939575 }, { "auxiliary_loss_clip": 0.01533009, "auxiliary_loss_mlp": 0.00287061, "balance_loss_clip": 1.23560345, "balance_loss_mlp": 0.2527523, "epoch": 0.23778746430181874, "flos": 15705496540800.0, "grad_norm": 11.091313293914116, "language_loss": 0.76538479, "learning_rate": 3.5639231009575634e-06, "loss": 0.78358549, "num_input_tokens_seen": 85110175, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 0.34301758, "step": 3955, "time_per_iteration": 2.6496119499206543 }, { "auxiliary_loss_clip": 0.01513011, "auxiliary_loss_mlp": 0.00270749, "balance_loss_clip": 1.22067261, "balance_loss_mlp": 0.2366786, "epoch": 0.2378475875544867, "flos": 19426452284160.0, "grad_norm": 1.514591256806729, "language_loss": 0.89229727, "learning_rate": 3.5636803094552704e-06, "loss": 0.91013491, "num_input_tokens_seen": 85129925, "router_z_loss_clip": 2.92382812, "router_z_loss_mlp": 0.34082031, "step": 3956, "time_per_iteration": 2.6496944427490234 }, { "auxiliary_loss_clip": 0.01495179, "auxiliary_loss_mlp": 0.00298973, "balance_loss_clip": 1.21028054, "balance_loss_mlp": 0.26752606, "epoch": 0.23790771080715467, "flos": 22268565964800.0, "grad_norm": 32.466568387755466, "language_loss": 0.92815328, "learning_rate": 3.5634374586580635e-06, "loss": 0.94609487, "num_input_tokens_seen": 85147755, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.31445312, "step": 3957, "time_per_iteration": 2.6973254680633545 }, { "auxiliary_loss_clip": 0.01475926, "auxiliary_loss_mlp": 0.002825, "balance_loss_clip": 1.18812442, "balance_loss_mlp": 0.25200653, "epoch": 0.23796783405982264, "flos": 20047311889920.0, "grad_norm": 24.846990447340048, "language_loss": 0.77539384, "learning_rate": 3.563194548575151e-06, "loss": 0.79297811, "num_input_tokens_seen": 85165270, "router_z_loss_clip": 2.88085938, "router_z_loss_mlp": 0.30493164, "step": 3958, "time_per_iteration": 2.6372926235198975 }, { "auxiliary_loss_clip": 0.01488619, "auxiliary_loss_mlp": 0.00249739, "balance_loss_clip": 1.19796634, "balance_loss_mlp": 0.21776709, "epoch": 0.2380279573124906, "flos": 14245943299200.0, "grad_norm": 39.59851545745892, "language_loss": 0.76945907, "learning_rate": 3.562951579215745e-06, "loss": 0.78684258, "num_input_tokens_seen": 85181555, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.31958008, "step": 3959, "time_per_iteration": 2.723525047302246 }, { "auxiliary_loss_clip": 0.01478225, "auxiliary_loss_mlp": 0.00258809, "balance_loss_clip": 1.18916094, "balance_loss_mlp": 0.22836268, "epoch": 0.23808808056515857, "flos": 21179180332800.0, "grad_norm": 11.601468047184996, "language_loss": 0.79523379, "learning_rate": 3.5627085505890586e-06, "loss": 0.81260413, "num_input_tokens_seen": 85199455, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.30444336, "step": 3960, "time_per_iteration": 2.6715517044067383 }, { "auxiliary_loss_clip": 0.01471644, "auxiliary_loss_mlp": 0.00238101, "balance_loss_clip": 1.18275928, "balance_loss_mlp": 0.20811982, "epoch": 0.23814820381782653, "flos": 22528308188160.0, "grad_norm": 27.015769932336003, "language_loss": 0.82552099, "learning_rate": 3.562465462704307e-06, "loss": 0.84261835, "num_input_tokens_seen": 85219170, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.29968262, "step": 3961, "time_per_iteration": 2.6212189197540283 }, { "auxiliary_loss_clip": 0.01475616, "auxiliary_loss_mlp": 0.00262092, "balance_loss_clip": 1.18285549, "balance_loss_mlp": 0.23019123, "epoch": 0.23820832707049452, "flos": 22304332932480.0, "grad_norm": 6.477860557403541, "language_loss": 0.74430829, "learning_rate": 3.5622223155707085e-06, "loss": 0.76168537, "num_input_tokens_seen": 85238480, "router_z_loss_clip": 2.92382812, "router_z_loss_mlp": 0.31860352, "step": 3962, "time_per_iteration": 2.6233973503112793 }, { "auxiliary_loss_clip": 0.01442909, "auxiliary_loss_mlp": 0.00226516, "balance_loss_clip": 1.16206479, "balance_loss_mlp": 0.1965825, "epoch": 0.2382684503231625, "flos": 24864225454080.0, "grad_norm": 3.4707071771198255, "language_loss": 0.82333398, "learning_rate": 3.561979109197483e-06, "loss": 0.84002823, "num_input_tokens_seen": 85259180, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.29919434, "step": 3963, "time_per_iteration": 2.689762830734253 }, { "auxiliary_loss_clip": 0.01470324, "auxiliary_loss_mlp": 0.0026217, "balance_loss_clip": 1.18095839, "balance_loss_mlp": 0.23022181, "epoch": 0.23832857357583045, "flos": 21871609787520.0, "grad_norm": 22.221095597183197, "language_loss": 0.86760092, "learning_rate": 3.5617358435938538e-06, "loss": 0.88492584, "num_input_tokens_seen": 85278550, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.31933594, "step": 3964, "time_per_iteration": 2.6445839405059814 }, { "auxiliary_loss_clip": 0.01447027, "auxiliary_loss_mlp": 0.00258091, "balance_loss_clip": 1.15965831, "balance_loss_mlp": 0.22833669, "epoch": 0.23838869682849842, "flos": 21288061434240.0, "grad_norm": 2.420332916241352, "language_loss": 0.80662549, "learning_rate": 3.561492518769045e-06, "loss": 0.82367671, "num_input_tokens_seen": 85297345, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.29711914, "step": 3965, "time_per_iteration": 2.627964973449707 }, { "auxiliary_loss_clip": 0.01460842, "auxiliary_loss_mlp": 0.00255318, "balance_loss_clip": 1.17231417, "balance_loss_mlp": 0.22568239, "epoch": 0.23844882008116638, "flos": 16180594755840.0, "grad_norm": 4.099865366198051, "language_loss": 0.87728095, "learning_rate": 3.561249134732282e-06, "loss": 0.89444256, "num_input_tokens_seen": 85315105, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.29663086, "step": 3966, "time_per_iteration": 2.6249730587005615 }, { "auxiliary_loss_clip": 0.01441323, "auxiliary_loss_mlp": 0.00251915, "balance_loss_clip": 1.15775037, "balance_loss_mlp": 0.22320952, "epoch": 0.23850894333383435, "flos": 21069724613760.0, "grad_norm": 2.78017290886812, "language_loss": 0.75793874, "learning_rate": 3.561005691492797e-06, "loss": 0.77487123, "num_input_tokens_seen": 85334735, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.28735352, "step": 3967, "time_per_iteration": 2.7137513160705566 }, { "auxiliary_loss_clip": 0.01446857, "auxiliary_loss_mlp": 0.00238927, "balance_loss_clip": 1.16647339, "balance_loss_mlp": 0.21068624, "epoch": 0.23856906658650234, "flos": 17201606849280.0, "grad_norm": 5.815829030838157, "language_loss": 0.77416205, "learning_rate": 3.5607621890598185e-06, "loss": 0.79101992, "num_input_tokens_seen": 85352875, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.28271484, "step": 3968, "time_per_iteration": 2.6641108989715576 }, { "auxiliary_loss_clip": 0.01411662, "auxiliary_loss_mlp": 0.00243993, "balance_loss_clip": 1.13764894, "balance_loss_mlp": 0.21500111, "epoch": 0.2386291898391703, "flos": 29494223619840.0, "grad_norm": 342.5621656772212, "language_loss": 0.83635175, "learning_rate": 3.5605186274425823e-06, "loss": 0.85290825, "num_input_tokens_seen": 85372205, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.29003906, "step": 3969, "time_per_iteration": 2.7201578617095947 }, { "auxiliary_loss_clip": 0.014072, "auxiliary_loss_mlp": 0.00201002, "balance_loss_clip": 1.13388729, "balance_loss_mlp": 0.1745494, "epoch": 0.23868931309183827, "flos": 21142443697920.0, "grad_norm": 3.903847908375964, "language_loss": 0.84533596, "learning_rate": 3.5602750066503225e-06, "loss": 0.86141801, "num_input_tokens_seen": 85389705, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.26464844, "step": 3970, "time_per_iteration": 2.6480729579925537 }, { "auxiliary_loss_clip": 0.01422776, "auxiliary_loss_mlp": 0.00257367, "balance_loss_clip": 1.14847374, "balance_loss_mlp": 0.22830355, "epoch": 0.23874943634450624, "flos": 25659394784640.0, "grad_norm": 15.270517772228047, "language_loss": 0.93937302, "learning_rate": 3.5600313266922793e-06, "loss": 0.95617437, "num_input_tokens_seen": 85407855, "router_z_loss_clip": 2.74609375, "router_z_loss_mlp": 0.29077148, "step": 3971, "time_per_iteration": 2.708425998687744 }, { "auxiliary_loss_clip": 0.01355738, "auxiliary_loss_mlp": 0.00077352, "balance_loss_clip": 1.20065081, "balance_loss_mlp": 0.06853019, "epoch": 0.2388095595971742, "flos": 58986618624000.0, "grad_norm": 0.7490609841624442, "language_loss": 0.62479568, "learning_rate": 3.5597875875776915e-06, "loss": 0.63912654, "num_input_tokens_seen": 85470885, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.08837891, "step": 3972, "time_per_iteration": 3.212846279144287 }, { "auxiliary_loss_clip": 0.01428399, "auxiliary_loss_mlp": 0.00245764, "balance_loss_clip": 1.15253651, "balance_loss_mlp": 0.21755911, "epoch": 0.23886968284984217, "flos": 16800341040000.0, "grad_norm": 5.510743683047225, "language_loss": 0.89110172, "learning_rate": 3.5595437893158013e-06, "loss": 0.90784335, "num_input_tokens_seen": 85488460, "router_z_loss_clip": 2.75976562, "router_z_loss_mlp": 0.2824707, "step": 3973, "time_per_iteration": 2.629711151123047 }, { "auxiliary_loss_clip": 0.01428039, "auxiliary_loss_mlp": 0.00260507, "balance_loss_clip": 1.15457368, "balance_loss_mlp": 0.23335147, "epoch": 0.23892980610251013, "flos": 22382654538240.0, "grad_norm": 1.7892156681987128, "language_loss": 0.85115939, "learning_rate": 3.5592999319158546e-06, "loss": 0.86804485, "num_input_tokens_seen": 85508590, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.27185059, "step": 3974, "time_per_iteration": 2.627185106277466 }, { "auxiliary_loss_clip": 0.01438368, "auxiliary_loss_mlp": 0.00246295, "balance_loss_clip": 1.1560638, "balance_loss_mlp": 0.21894856, "epoch": 0.23898992935517813, "flos": 12823198519680.0, "grad_norm": 4.488444165019723, "language_loss": 0.93234682, "learning_rate": 3.5590560153870984e-06, "loss": 0.94919342, "num_input_tokens_seen": 85525970, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.27331543, "step": 3975, "time_per_iteration": 2.6112895011901855 }, { "auxiliary_loss_clip": 0.01437789, "auxiliary_loss_mlp": 0.00225662, "balance_loss_clip": 1.15942562, "balance_loss_mlp": 0.19879206, "epoch": 0.2390500526078461, "flos": 22345666508160.0, "grad_norm": 3.5207995961601575, "language_loss": 0.94903219, "learning_rate": 3.5588120397387816e-06, "loss": 0.96566677, "num_input_tokens_seen": 85543700, "router_z_loss_clip": 2.78320312, "router_z_loss_mlp": 0.2689209, "step": 3976, "time_per_iteration": 2.615145683288574 }, { "auxiliary_loss_clip": 0.01445834, "auxiliary_loss_mlp": 0.00241606, "balance_loss_clip": 1.16587532, "balance_loss_mlp": 0.21528485, "epoch": 0.23911017586051406, "flos": 22635142214400.0, "grad_norm": 11.909951959059281, "language_loss": 0.79477876, "learning_rate": 3.5585680049801566e-06, "loss": 0.8116532, "num_input_tokens_seen": 85562765, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.26330566, "step": 3977, "time_per_iteration": 2.6397438049316406 }, { "auxiliary_loss_clip": 0.01454776, "auxiliary_loss_mlp": 0.00254609, "balance_loss_clip": 1.16945338, "balance_loss_mlp": 0.2245442, "epoch": 0.23917029911318202, "flos": 23653281219840.0, "grad_norm": 3.551392295607186, "language_loss": 0.78315997, "learning_rate": 3.5583239111204764e-06, "loss": 0.80025375, "num_input_tokens_seen": 85581755, "router_z_loss_clip": 2.85351562, "router_z_loss_mlp": 0.30053711, "step": 3978, "time_per_iteration": 2.621234893798828 }, { "auxiliary_loss_clip": 0.01496589, "auxiliary_loss_mlp": 0.00228627, "balance_loss_clip": 1.2032913, "balance_loss_mlp": 0.19944505, "epoch": 0.23923042236585, "flos": 22783597125120.0, "grad_norm": 183.0761118855229, "language_loss": 0.89345855, "learning_rate": 3.558079758168997e-06, "loss": 0.91071075, "num_input_tokens_seen": 85599455, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.29199219, "step": 3979, "time_per_iteration": 2.6349189281463623 }, { "auxiliary_loss_clip": 0.01492647, "auxiliary_loss_mlp": 0.00240628, "balance_loss_clip": 1.20275784, "balance_loss_mlp": 0.21009868, "epoch": 0.23929054561851795, "flos": 28147717457280.0, "grad_norm": 93.95361243140225, "language_loss": 0.89174962, "learning_rate": 3.557835546134977e-06, "loss": 0.90908241, "num_input_tokens_seen": 85619970, "router_z_loss_clip": 2.90234375, "router_z_loss_mlp": 0.30529785, "step": 3980, "time_per_iteration": 2.731947898864746 }, { "auxiliary_loss_clip": 0.01496884, "auxiliary_loss_mlp": 0.00223731, "balance_loss_clip": 1.20652723, "balance_loss_mlp": 0.19218794, "epoch": 0.23935066887118592, "flos": 21686525982720.0, "grad_norm": 8.299072590549539, "language_loss": 0.90774637, "learning_rate": 3.5575912750276775e-06, "loss": 0.92495251, "num_input_tokens_seen": 85638850, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.31542969, "step": 3981, "time_per_iteration": 2.6254470348358154 }, { "auxiliary_loss_clip": 0.01505853, "auxiliary_loss_mlp": 0.00255128, "balance_loss_clip": 1.21459556, "balance_loss_mlp": 0.2216306, "epoch": 0.2394107921238539, "flos": 32122274198400.0, "grad_norm": 2.7316425743621875, "language_loss": 0.84854817, "learning_rate": 3.5573469448563607e-06, "loss": 0.86615801, "num_input_tokens_seen": 85656285, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.3347168, "step": 3982, "time_per_iteration": 2.7111430168151855 }, { "auxiliary_loss_clip": 0.01502883, "auxiliary_loss_mlp": 0.00245068, "balance_loss_clip": 1.21609163, "balance_loss_mlp": 0.21405007, "epoch": 0.23947091537652188, "flos": 17019180650880.0, "grad_norm": 7.510907698285992, "language_loss": 0.85020339, "learning_rate": 3.5571025556302915e-06, "loss": 0.86768293, "num_input_tokens_seen": 85673020, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.31005859, "step": 3983, "time_per_iteration": 2.616969108581543 }, { "auxiliary_loss_clip": 0.0151528, "auxiliary_loss_mlp": 0.00268483, "balance_loss_clip": 1.22330046, "balance_loss_mlp": 0.23410329, "epoch": 0.23953103862918984, "flos": 20593584904320.0, "grad_norm": 2.082014653407247, "language_loss": 0.79926229, "learning_rate": 3.556858107358737e-06, "loss": 0.81709993, "num_input_tokens_seen": 85692565, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.34375, "step": 3984, "time_per_iteration": 4.0893638134002686 }, { "auxiliary_loss_clip": 0.01509697, "auxiliary_loss_mlp": 0.00261698, "balance_loss_clip": 1.21801066, "balance_loss_mlp": 0.23075168, "epoch": 0.2395911618818578, "flos": 20704405340160.0, "grad_norm": 12.754420812163612, "language_loss": 0.87867707, "learning_rate": 3.5566136000509674e-06, "loss": 0.89639103, "num_input_tokens_seen": 85709730, "router_z_loss_clip": 2.91601562, "router_z_loss_mlp": 0.30932617, "step": 3985, "time_per_iteration": 2.676412343978882 }, { "auxiliary_loss_clip": 0.01516537, "auxiliary_loss_mlp": 0.00242394, "balance_loss_clip": 1.22189903, "balance_loss_mlp": 0.21001635, "epoch": 0.23965128513452577, "flos": 27053519402880.0, "grad_norm": 33.16018614193123, "language_loss": 0.81226176, "learning_rate": 3.556369033716254e-06, "loss": 0.82985109, "num_input_tokens_seen": 85730045, "router_z_loss_clip": 2.9453125, "router_z_loss_mlp": 0.32373047, "step": 3986, "time_per_iteration": 2.7242431640625 }, { "auxiliary_loss_clip": 0.01532144, "auxiliary_loss_mlp": 0.00283033, "balance_loss_clip": 1.23358738, "balance_loss_mlp": 0.24963051, "epoch": 0.23971140838719374, "flos": 23144319457920.0, "grad_norm": 6.573784297303717, "language_loss": 0.94496262, "learning_rate": 3.556124408363871e-06, "loss": 0.96311438, "num_input_tokens_seen": 85747590, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.33422852, "step": 3987, "time_per_iteration": 4.047123908996582 }, { "auxiliary_loss_clip": 0.01538512, "auxiliary_loss_mlp": 0.0023808, "balance_loss_clip": 1.2503531, "balance_loss_mlp": 0.20823035, "epoch": 0.23977153163986173, "flos": 18034554309120.0, "grad_norm": 39.797067360883126, "language_loss": 0.88829052, "learning_rate": 3.5558797240030945e-06, "loss": 0.90605646, "num_input_tokens_seen": 85763460, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.29882812, "step": 3988, "time_per_iteration": 2.5939865112304688 }, { "auxiliary_loss_clip": 0.01519394, "auxiliary_loss_mlp": 0.00276943, "balance_loss_clip": 1.23009777, "balance_loss_mlp": 0.24417222, "epoch": 0.2398316548925297, "flos": 18113378705280.0, "grad_norm": 1.7075536099014303, "language_loss": 0.92908394, "learning_rate": 3.5556349806432035e-06, "loss": 0.94704723, "num_input_tokens_seen": 85782050, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.32800293, "step": 3989, "time_per_iteration": 2.5944008827209473 }, { "auxiliary_loss_clip": 0.01515515, "auxiliary_loss_mlp": 0.00232092, "balance_loss_clip": 1.22836828, "balance_loss_mlp": 0.20257606, "epoch": 0.23989177814519766, "flos": 12567730014720.0, "grad_norm": 101.71186192355533, "language_loss": 0.91692352, "learning_rate": 3.555390178293477e-06, "loss": 0.93439955, "num_input_tokens_seen": 85797400, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.29541016, "step": 3990, "time_per_iteration": 2.6023075580596924 }, { "auxiliary_loss_clip": 0.01515019, "auxiliary_loss_mlp": 0.00250389, "balance_loss_clip": 1.22752619, "balance_loss_mlp": 0.21982333, "epoch": 0.23995190139786562, "flos": 25264593423360.0, "grad_norm": 1.8001088674027574, "language_loss": 0.80636954, "learning_rate": 3.5551453169631994e-06, "loss": 0.82402354, "num_input_tokens_seen": 85818995, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.3059082, "step": 3991, "time_per_iteration": 2.659005880355835 }, { "auxiliary_loss_clip": 0.01360896, "auxiliary_loss_mlp": 0.00107439, "balance_loss_clip": 1.19947219, "balance_loss_mlp": 0.09671061, "epoch": 0.2400120246505336, "flos": 61960379650560.0, "grad_norm": 0.8553043211173433, "language_loss": 0.63141495, "learning_rate": 3.554900396661656e-06, "loss": 0.64609826, "num_input_tokens_seen": 85876695, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.10742188, "step": 3992, "time_per_iteration": 4.4299726486206055 }, { "auxiliary_loss_clip": 0.01360402, "auxiliary_loss_mlp": 0.00109359, "balance_loss_clip": 1.19582486, "balance_loss_mlp": 0.0978668, "epoch": 0.24007214790320155, "flos": 66708560540160.0, "grad_norm": 0.7527208033166083, "language_loss": 0.62805325, "learning_rate": 3.5546554173981334e-06, "loss": 0.6427508, "num_input_tokens_seen": 85940990, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.11474609, "step": 3993, "time_per_iteration": 3.1985270977020264 }, { "auxiliary_loss_clip": 0.0154231, "auxiliary_loss_mlp": 0.00269525, "balance_loss_clip": 1.25286222, "balance_loss_mlp": 0.23731467, "epoch": 0.24013227115586952, "flos": 25809070757760.0, "grad_norm": 23.043453896550425, "language_loss": 0.82550782, "learning_rate": 3.5544103791819218e-06, "loss": 0.84362614, "num_input_tokens_seen": 85961165, "router_z_loss_clip": 2.89453125, "router_z_loss_mlp": 0.32177734, "step": 3994, "time_per_iteration": 2.6905274391174316 }, { "auxiliary_loss_clip": 0.01525478, "auxiliary_loss_mlp": 0.00248531, "balance_loss_clip": 1.23757696, "balance_loss_mlp": 0.21684489, "epoch": 0.2401923944085375, "flos": 25557480921600.0, "grad_norm": 8.859938911076036, "language_loss": 0.85656154, "learning_rate": 3.5541652820223124e-06, "loss": 0.87430155, "num_input_tokens_seen": 85982710, "router_z_loss_clip": 2.88085938, "router_z_loss_mlp": 0.31665039, "step": 3995, "time_per_iteration": 2.6377129554748535 }, { "auxiliary_loss_clip": 0.01321272, "auxiliary_loss_mlp": 0.000829, "balance_loss_clip": 1.16401136, "balance_loss_mlp": 0.07426915, "epoch": 0.24025251766120548, "flos": 54941138478720.0, "grad_norm": 0.8797027980534742, "language_loss": 0.63524348, "learning_rate": 3.5539201259286006e-06, "loss": 0.64928514, "num_input_tokens_seen": 86046935, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.08642578, "step": 3996, "time_per_iteration": 3.1851038932800293 }, { "auxiliary_loss_clip": 0.01521266, "auxiliary_loss_mlp": 0.00235083, "balance_loss_clip": 1.23336935, "balance_loss_mlp": 0.20665173, "epoch": 0.24031264091387344, "flos": 20631075724800.0, "grad_norm": 5.537774957588464, "language_loss": 0.79404795, "learning_rate": 3.5536749109100808e-06, "loss": 0.81161147, "num_input_tokens_seen": 86064355, "router_z_loss_clip": 2.87695312, "router_z_loss_mlp": 0.28430176, "step": 3997, "time_per_iteration": 4.008476495742798 }, { "auxiliary_loss_clip": 0.0150619, "auxiliary_loss_mlp": 0.00210122, "balance_loss_clip": 1.22635341, "balance_loss_mlp": 0.18017697, "epoch": 0.2403727641665414, "flos": 20886256920960.0, "grad_norm": 3.233083621421874, "language_loss": 0.9484967, "learning_rate": 3.5534296369760535e-06, "loss": 0.96565986, "num_input_tokens_seen": 86081340, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.29919434, "step": 3998, "time_per_iteration": 2.592376947402954 }, { "auxiliary_loss_clip": 0.0149374, "auxiliary_loss_mlp": 0.00265953, "balance_loss_clip": 1.20979762, "balance_loss_mlp": 0.23383847, "epoch": 0.24043288741920937, "flos": 22820046451200.0, "grad_norm": 205.43370281249335, "language_loss": 0.83863509, "learning_rate": 3.5531843041358183e-06, "loss": 0.85623205, "num_input_tokens_seen": 86102260, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.32080078, "step": 3999, "time_per_iteration": 2.6365256309509277 }, { "auxiliary_loss_clip": 0.01495974, "auxiliary_loss_mlp": 0.00240861, "balance_loss_clip": 1.22068083, "balance_loss_mlp": 0.21258414, "epoch": 0.24049301067187734, "flos": 27959652823680.0, "grad_norm": 320.1400145577789, "language_loss": 0.80222535, "learning_rate": 3.552938912398679e-06, "loss": 0.81959373, "num_input_tokens_seen": 86123400, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.28295898, "step": 4000, "time_per_iteration": 2.6764094829559326 }, { "auxiliary_loss_clip": 0.01480468, "auxiliary_loss_mlp": 0.00273726, "balance_loss_clip": 1.20321667, "balance_loss_mlp": 0.2427312, "epoch": 0.24055313392454533, "flos": 27451409333760.0, "grad_norm": 31.309971904986266, "language_loss": 0.7288897, "learning_rate": 3.5526934617739397e-06, "loss": 0.74643165, "num_input_tokens_seen": 86144060, "router_z_loss_clip": 2.77148438, "router_z_loss_mlp": 0.30957031, "step": 4001, "time_per_iteration": 2.7127201557159424 }, { "auxiliary_loss_clip": 0.01496248, "auxiliary_loss_mlp": 0.00270949, "balance_loss_clip": 1.21928477, "balance_loss_mlp": 0.24175429, "epoch": 0.2406132571772133, "flos": 25556618995200.0, "grad_norm": 3.340568788889956, "language_loss": 0.91458923, "learning_rate": 3.5524479522709095e-06, "loss": 0.93226123, "num_input_tokens_seen": 86163005, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.29211426, "step": 4002, "time_per_iteration": 2.6655960083007812 }, { "auxiliary_loss_clip": 0.01465055, "auxiliary_loss_mlp": 0.00244879, "balance_loss_clip": 1.1938051, "balance_loss_mlp": 0.21734208, "epoch": 0.24067338042988126, "flos": 24791398629120.0, "grad_norm": 81.37571598942792, "language_loss": 0.90693623, "learning_rate": 3.552202383898897e-06, "loss": 0.92403561, "num_input_tokens_seen": 86182580, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.27563477, "step": 4003, "time_per_iteration": 2.6436946392059326 }, { "auxiliary_loss_clip": 0.01501058, "auxiliary_loss_mlp": 0.00287109, "balance_loss_clip": 1.22187805, "balance_loss_mlp": 0.25969076, "epoch": 0.24073350368254923, "flos": 21177923356800.0, "grad_norm": 393.25979736817106, "language_loss": 0.95673907, "learning_rate": 3.551956756667215e-06, "loss": 0.97462082, "num_input_tokens_seen": 86200665, "router_z_loss_clip": 2.79296875, "router_z_loss_mlp": 0.27441406, "step": 4004, "time_per_iteration": 2.6052279472351074 }, { "auxiliary_loss_clip": 0.01477048, "auxiliary_loss_mlp": 0.00275946, "balance_loss_clip": 1.20478272, "balance_loss_mlp": 0.24659631, "epoch": 0.2407936269352172, "flos": 22494300986880.0, "grad_norm": 201.7028671500432, "language_loss": 0.8600027, "learning_rate": 3.551711070585177e-06, "loss": 0.8775326, "num_input_tokens_seen": 86221640, "router_z_loss_clip": 2.72265625, "router_z_loss_mlp": 0.29333496, "step": 4005, "time_per_iteration": 2.693500518798828 }, { "auxiliary_loss_clip": 0.01466334, "auxiliary_loss_mlp": 0.00269573, "balance_loss_clip": 1.19920468, "balance_loss_mlp": 0.24304859, "epoch": 0.24085375018788516, "flos": 18551129754240.0, "grad_norm": 27.381339812084644, "language_loss": 0.85162103, "learning_rate": 3.5514653256620995e-06, "loss": 0.86898005, "num_input_tokens_seen": 86240795, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.26501465, "step": 4006, "time_per_iteration": 2.7737812995910645 }, { "auxiliary_loss_clip": 0.01471403, "auxiliary_loss_mlp": 0.00327318, "balance_loss_clip": 1.196679, "balance_loss_mlp": 0.29506022, "epoch": 0.24091387344055312, "flos": 24170539023360.0, "grad_norm": 1.8127224714865762, "language_loss": 0.7868005, "learning_rate": 3.551219521907302e-06, "loss": 0.80478764, "num_input_tokens_seen": 86262000, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.32226562, "step": 4007, "time_per_iteration": 2.7195253372192383 }, { "auxiliary_loss_clip": 0.01479124, "auxiliary_loss_mlp": 0.0031145, "balance_loss_clip": 1.20970583, "balance_loss_mlp": 0.28332835, "epoch": 0.24097399669322112, "flos": 11036319615360.0, "grad_norm": 2.4931790095153823, "language_loss": 0.83009696, "learning_rate": 3.5509736593301042e-06, "loss": 0.84800267, "num_input_tokens_seen": 86279680, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.28137207, "step": 4008, "time_per_iteration": 2.605273485183716 }, { "auxiliary_loss_clip": 0.01473602, "auxiliary_loss_mlp": 0.00321273, "balance_loss_clip": 1.20357525, "balance_loss_mlp": 0.2902433, "epoch": 0.24103411994588908, "flos": 17165085696000.0, "grad_norm": 12.248530909853292, "language_loss": 0.83389544, "learning_rate": 3.5507277379398295e-06, "loss": 0.85184419, "num_input_tokens_seen": 86297180, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.31030273, "step": 4009, "time_per_iteration": 2.6008353233337402 }, { "auxiliary_loss_clip": 0.01465699, "auxiliary_loss_mlp": 0.00348324, "balance_loss_clip": 1.19841075, "balance_loss_mlp": 0.31784236, "epoch": 0.24109424319855705, "flos": 20667956014080.0, "grad_norm": 33.118222941660704, "language_loss": 0.87025714, "learning_rate": 3.550481757745804e-06, "loss": 0.88839734, "num_input_tokens_seen": 86317660, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.30493164, "step": 4010, "time_per_iteration": 2.626300096511841 }, { "auxiliary_loss_clip": 0.01478667, "auxiliary_loss_mlp": 0.00334837, "balance_loss_clip": 1.20592046, "balance_loss_mlp": 0.30157775, "epoch": 0.241154366451225, "flos": 28181796485760.0, "grad_norm": 9.657541087390337, "language_loss": 0.77107638, "learning_rate": 3.5502357187573555e-06, "loss": 0.78921139, "num_input_tokens_seen": 86338325, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.33251953, "step": 4011, "time_per_iteration": 2.7033228874206543 }, { "auxiliary_loss_clip": 0.01481032, "auxiliary_loss_mlp": 0.00362706, "balance_loss_clip": 1.21088171, "balance_loss_mlp": 0.33116302, "epoch": 0.24121448970389298, "flos": 21689722293120.0, "grad_norm": 319.7125193687515, "language_loss": 0.77801669, "learning_rate": 3.5499896209838118e-06, "loss": 0.79645413, "num_input_tokens_seen": 86357615, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.31494141, "step": 4012, "time_per_iteration": 2.7173452377319336 }, { "auxiliary_loss_clip": 0.01503069, "auxiliary_loss_mlp": 0.00366104, "balance_loss_clip": 1.22667623, "balance_loss_mlp": 0.33050781, "epoch": 0.24127461295656094, "flos": 39676191269760.0, "grad_norm": 226.70463881660595, "language_loss": 0.80997932, "learning_rate": 3.5497434644345073e-06, "loss": 0.82867098, "num_input_tokens_seen": 86380355, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 0.35546875, "step": 4013, "time_per_iteration": 2.864473581314087 }, { "auxiliary_loss_clip": 0.01525062, "auxiliary_loss_mlp": 0.00354063, "balance_loss_clip": 1.24722195, "balance_loss_mlp": 0.32201958, "epoch": 0.2413347362092289, "flos": 19135863256320.0, "grad_norm": 2.832949217862195, "language_loss": 0.97509915, "learning_rate": 3.5494972491187753e-06, "loss": 0.9938904, "num_input_tokens_seen": 86399125, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.32055664, "step": 4014, "time_per_iteration": 2.65303111076355 }, { "auxiliary_loss_clip": 0.01518794, "auxiliary_loss_mlp": 0.00348939, "balance_loss_clip": 1.23692012, "balance_loss_mlp": 0.31732506, "epoch": 0.2413948594618969, "flos": 26939430829440.0, "grad_norm": 5.776382086421996, "language_loss": 1.04411352, "learning_rate": 3.549250975045952e-06, "loss": 1.06279087, "num_input_tokens_seen": 86418625, "router_z_loss_clip": 2.8203125, "router_z_loss_mlp": 0.31616211, "step": 4015, "time_per_iteration": 2.676043748855591 }, { "auxiliary_loss_clip": 0.01556943, "auxiliary_loss_mlp": 0.00389868, "balance_loss_clip": 1.27391219, "balance_loss_mlp": 0.35603654, "epoch": 0.24145498271456486, "flos": 25228108183680.0, "grad_norm": 2.6841854221133414, "language_loss": 0.88839149, "learning_rate": 3.5490046422253768e-06, "loss": 0.90785956, "num_input_tokens_seen": 86438375, "router_z_loss_clip": 2.83007812, "router_z_loss_mlp": 0.33813477, "step": 4016, "time_per_iteration": 2.6809616088867188 }, { "auxiliary_loss_clip": 0.01555219, "auxiliary_loss_mlp": 0.00333865, "balance_loss_clip": 1.2772187, "balance_loss_mlp": 0.30375263, "epoch": 0.24151510596723283, "flos": 40661759617920.0, "grad_norm": 4.528299992829508, "language_loss": 0.77027035, "learning_rate": 3.54875825066639e-06, "loss": 0.78916115, "num_input_tokens_seen": 86463230, "router_z_loss_clip": 2.78320312, "router_z_loss_mlp": 0.30151367, "step": 4017, "time_per_iteration": 2.8239240646362305 }, { "auxiliary_loss_clip": 0.01585729, "auxiliary_loss_mlp": 0.00365415, "balance_loss_clip": 1.28597271, "balance_loss_mlp": 0.32972404, "epoch": 0.2415752292199008, "flos": 18146667634560.0, "grad_norm": 25.79908216408453, "language_loss": 0.91970849, "learning_rate": 3.5485118003783353e-06, "loss": 0.93921995, "num_input_tokens_seen": 86481230, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.35693359, "step": 4018, "time_per_iteration": 2.605440139770508 }, { "auxiliary_loss_clip": 0.01474609, "auxiliary_loss_mlp": 0.00086238, "balance_loss_clip": 1.29541183, "balance_loss_mlp": 0.07841775, "epoch": 0.24163535247256876, "flos": 67288409792640.0, "grad_norm": 0.8580887066501497, "language_loss": 0.60503095, "learning_rate": 3.548265291370558e-06, "loss": 0.62063938, "num_input_tokens_seen": 86541260, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.078125, "step": 4019, "time_per_iteration": 3.173668622970581 }, { "auxiliary_loss_clip": 0.0159065, "auxiliary_loss_mlp": 0.00383293, "balance_loss_clip": 1.29534733, "balance_loss_mlp": 0.35077298, "epoch": 0.24169547572523672, "flos": 24929941386240.0, "grad_norm": 5.092596183938838, "language_loss": 0.79514277, "learning_rate": 3.5480187236524055e-06, "loss": 0.81488216, "num_input_tokens_seen": 86559580, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.32519531, "step": 4020, "time_per_iteration": 2.6483712196350098 }, { "auxiliary_loss_clip": 0.01615371, "auxiliary_loss_mlp": 0.004128, "balance_loss_clip": 1.31255746, "balance_loss_mlp": 0.37665591, "epoch": 0.24175559897790472, "flos": 18728312567040.0, "grad_norm": 3.2464722178492997, "language_loss": 0.89343703, "learning_rate": 3.5477720972332285e-06, "loss": 0.91371876, "num_input_tokens_seen": 86577560, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.36132812, "step": 4021, "time_per_iteration": 2.5894668102264404 }, { "auxiliary_loss_clip": 0.01627271, "auxiliary_loss_mlp": 0.00399852, "balance_loss_clip": 1.31680799, "balance_loss_mlp": 0.36218238, "epoch": 0.24181572223057268, "flos": 23039281111680.0, "grad_norm": 7.549929140657058, "language_loss": 0.82998955, "learning_rate": 3.547525412122378e-06, "loss": 0.85026085, "num_input_tokens_seen": 86595350, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.37646484, "step": 4022, "time_per_iteration": 2.5951194763183594 }, { "auxiliary_loss_clip": 0.01636957, "auxiliary_loss_mlp": 0.00439849, "balance_loss_clip": 1.31789923, "balance_loss_mlp": 0.39896074, "epoch": 0.24187584548324065, "flos": 20376145923840.0, "grad_norm": 7.561232859376897, "language_loss": 0.83350331, "learning_rate": 3.5472786683292083e-06, "loss": 0.85427135, "num_input_tokens_seen": 86614805, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.40869141, "step": 4023, "time_per_iteration": 2.655895233154297 }, { "auxiliary_loss_clip": 0.01631751, "auxiliary_loss_mlp": 0.00403891, "balance_loss_clip": 1.31449032, "balance_loss_mlp": 0.36564913, "epoch": 0.2419359687359086, "flos": 21397517153280.0, "grad_norm": 2.1899396332086187, "language_loss": 0.90049273, "learning_rate": 3.5470318658630766e-06, "loss": 0.9208492, "num_input_tokens_seen": 86633700, "router_z_loss_clip": 3.16992188, "router_z_loss_mlp": 0.38256836, "step": 4024, "time_per_iteration": 2.6063201427459717 }, { "auxiliary_loss_clip": 0.0163505, "auxiliary_loss_mlp": 0.00394875, "balance_loss_clip": 1.31734872, "balance_loss_mlp": 0.3584919, "epoch": 0.24199609198857658, "flos": 18369385914240.0, "grad_norm": 2.7135802243744065, "language_loss": 0.90361714, "learning_rate": 3.5467850047333424e-06, "loss": 0.9239164, "num_input_tokens_seen": 86650905, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.36401367, "step": 4025, "time_per_iteration": 2.6469478607177734 }, { "auxiliary_loss_clip": 0.01634767, "auxiliary_loss_mlp": 0.00392658, "balance_loss_clip": 1.31463265, "balance_loss_mlp": 0.35708624, "epoch": 0.24205621524124454, "flos": 19463871277440.0, "grad_norm": 3.807417277597529, "language_loss": 0.79531276, "learning_rate": 3.546538084949365e-06, "loss": 0.81558698, "num_input_tokens_seen": 86669185, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.35546875, "step": 4026, "time_per_iteration": 4.0419745445251465 }, { "auxiliary_loss_clip": 0.01630671, "auxiliary_loss_mlp": 0.0041835, "balance_loss_clip": 1.32209706, "balance_loss_mlp": 0.37948778, "epoch": 0.2421163384939125, "flos": 14976330451200.0, "grad_norm": 14.369806801897061, "language_loss": 0.71087444, "learning_rate": 3.546291106520509e-06, "loss": 0.73136467, "num_input_tokens_seen": 86686805, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.38867188, "step": 4027, "time_per_iteration": 2.6133980751037598 }, { "auxiliary_loss_clip": 0.0164711, "auxiliary_loss_mlp": 0.00410443, "balance_loss_clip": 1.32540178, "balance_loss_mlp": 0.37141353, "epoch": 0.2421764617465805, "flos": 18662057930880.0, "grad_norm": 2.7674177674863922, "language_loss": 0.78581321, "learning_rate": 3.5460440694561388e-06, "loss": 0.80638874, "num_input_tokens_seen": 86705520, "router_z_loss_clip": 3.21484375, "router_z_loss_mlp": 0.39038086, "step": 4028, "time_per_iteration": 2.674367904663086 }, { "auxiliary_loss_clip": 0.01544092, "auxiliary_loss_mlp": 0.00074855, "balance_loss_clip": 1.33819389, "balance_loss_mlp": 0.06565236, "epoch": 0.24223658499924847, "flos": 64347327164160.0, "grad_norm": 0.8549094930137469, "language_loss": 0.55439085, "learning_rate": 3.545796973765623e-06, "loss": 0.57058036, "num_input_tokens_seen": 86767320, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.09179688, "step": 4029, "time_per_iteration": 4.540846347808838 }, { "auxiliary_loss_clip": 0.01617844, "auxiliary_loss_mlp": 0.00408794, "balance_loss_clip": 1.30667186, "balance_loss_mlp": 0.3730073, "epoch": 0.24229670825191643, "flos": 25775243124480.0, "grad_norm": 5.488057818348033, "language_loss": 0.80174839, "learning_rate": 3.54554981945833e-06, "loss": 0.82201475, "num_input_tokens_seen": 86788110, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.35766602, "step": 4030, "time_per_iteration": 2.6995301246643066 }, { "auxiliary_loss_clip": 0.01638867, "auxiliary_loss_mlp": 0.00444825, "balance_loss_clip": 1.32291567, "balance_loss_mlp": 0.40586767, "epoch": 0.2423568315045844, "flos": 20667094087680.0, "grad_norm": 160.6522362737671, "language_loss": 0.83747768, "learning_rate": 3.5453026065436343e-06, "loss": 0.85831469, "num_input_tokens_seen": 86807640, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.38964844, "step": 4031, "time_per_iteration": 2.687688112258911 }, { "auxiliary_loss_clip": 0.0162627, "auxiliary_loss_mlp": 0.00423719, "balance_loss_clip": 1.30040359, "balance_loss_mlp": 0.38330728, "epoch": 0.24241695475725236, "flos": 22416805393920.0, "grad_norm": 3.578725892548548, "language_loss": 0.75508857, "learning_rate": 3.5450553350309083e-06, "loss": 0.77558851, "num_input_tokens_seen": 86826795, "router_z_loss_clip": 3.25976562, "router_z_loss_mlp": 0.40380859, "step": 4032, "time_per_iteration": 2.621288776397705 }, { "auxiliary_loss_clip": 0.01649014, "auxiliary_loss_mlp": 0.00428162, "balance_loss_clip": 1.32856894, "balance_loss_mlp": 0.39065832, "epoch": 0.24247707800992033, "flos": 17128995505920.0, "grad_norm": 10.539217935540377, "language_loss": 0.87637466, "learning_rate": 3.5448080049295286e-06, "loss": 0.8971464, "num_input_tokens_seen": 86843175, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.37475586, "step": 4033, "time_per_iteration": 2.6018104553222656 }, { "auxiliary_loss_clip": 0.01642308, "auxiliary_loss_mlp": 0.00393317, "balance_loss_clip": 1.32119632, "balance_loss_mlp": 0.35345381, "epoch": 0.2425372012625883, "flos": 31613743399680.0, "grad_norm": 7.49211548424063, "language_loss": 0.74672133, "learning_rate": 3.5445606162488754e-06, "loss": 0.76707762, "num_input_tokens_seen": 86863185, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.3984375, "step": 4034, "time_per_iteration": 4.149355173110962 }, { "auxiliary_loss_clip": 0.01652773, "auxiliary_loss_mlp": 0.0044026, "balance_loss_clip": 1.32992697, "balance_loss_mlp": 0.40189832, "epoch": 0.24259732451525629, "flos": 16326032924160.0, "grad_norm": 2.8203308671644467, "language_loss": 1.02710199, "learning_rate": 3.5443131689983283e-06, "loss": 1.04803228, "num_input_tokens_seen": 86880040, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.38354492, "step": 4035, "time_per_iteration": 2.6441774368286133 }, { "auxiliary_loss_clip": 0.01651524, "auxiliary_loss_mlp": 0.00431818, "balance_loss_clip": 1.33247519, "balance_loss_mlp": 0.39226428, "epoch": 0.24265744776792425, "flos": 22856639431680.0, "grad_norm": 9.808424142605672, "language_loss": 0.84412038, "learning_rate": 3.5440656631872715e-06, "loss": 0.86495376, "num_input_tokens_seen": 86900610, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.39575195, "step": 4036, "time_per_iteration": 2.6744649410247803 }, { "auxiliary_loss_clip": 0.01654809, "auxiliary_loss_mlp": 0.00396976, "balance_loss_clip": 1.32816863, "balance_loss_mlp": 0.35551465, "epoch": 0.24271757102059222, "flos": 21871573873920.0, "grad_norm": 208.85647006991022, "language_loss": 0.80662501, "learning_rate": 3.5438180988250898e-06, "loss": 0.82714283, "num_input_tokens_seen": 86919385, "router_z_loss_clip": 3.265625, "router_z_loss_mlp": 0.41430664, "step": 4037, "time_per_iteration": 2.6264445781707764 }, { "auxiliary_loss_clip": 0.01638349, "auxiliary_loss_mlp": 0.00385149, "balance_loss_clip": 1.31586456, "balance_loss_mlp": 0.34645322, "epoch": 0.24277769427326018, "flos": 19208582340480.0, "grad_norm": 294.2207455447814, "language_loss": 0.86380303, "learning_rate": 3.543570475921171e-06, "loss": 0.88403797, "num_input_tokens_seen": 86938885, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.38623047, "step": 4038, "time_per_iteration": 2.645948886871338 }, { "auxiliary_loss_clip": 0.01621172, "auxiliary_loss_mlp": 0.00396858, "balance_loss_clip": 1.29863667, "balance_loss_mlp": 0.3559449, "epoch": 0.24283781752592815, "flos": 19499889640320.0, "grad_norm": 17.81797288427343, "language_loss": 0.78902888, "learning_rate": 3.543322794484905e-06, "loss": 0.80920917, "num_input_tokens_seen": 86957705, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.40869141, "step": 4039, "time_per_iteration": 4.016241788864136 }, { "auxiliary_loss_clip": 0.0164204, "auxiliary_loss_mlp": 0.00427717, "balance_loss_clip": 1.31436002, "balance_loss_mlp": 0.38503975, "epoch": 0.2428979407785961, "flos": 19902196944000.0, "grad_norm": 16.239942266194465, "language_loss": 0.84406078, "learning_rate": 3.5430750545256843e-06, "loss": 0.86475837, "num_input_tokens_seen": 86975845, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.42651367, "step": 4040, "time_per_iteration": 2.6640191078186035 }, { "auxiliary_loss_clip": 0.01655402, "auxiliary_loss_mlp": 0.00337098, "balance_loss_clip": 1.33219814, "balance_loss_mlp": 0.2996659, "epoch": 0.2429580640312641, "flos": 24715878284160.0, "grad_norm": 3.025987311901787, "language_loss": 0.87836266, "learning_rate": 3.5428272560529027e-06, "loss": 0.89828771, "num_input_tokens_seen": 86994800, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.37426758, "step": 4041, "time_per_iteration": 2.6808066368103027 }, { "auxiliary_loss_clip": 0.01636036, "auxiliary_loss_mlp": 0.00389701, "balance_loss_clip": 1.31724751, "balance_loss_mlp": 0.35048071, "epoch": 0.24301818728393207, "flos": 25630343660160.0, "grad_norm": 2.903359448513012, "language_loss": 0.81794715, "learning_rate": 3.542579399075957e-06, "loss": 0.8382045, "num_input_tokens_seen": 87016845, "router_z_loss_clip": 3.18945312, "router_z_loss_mlp": 0.39208984, "step": 4042, "time_per_iteration": 2.665313720703125 }, { "auxiliary_loss_clip": 0.01642039, "auxiliary_loss_mlp": 0.0038113, "balance_loss_clip": 1.32389355, "balance_loss_mlp": 0.34179121, "epoch": 0.24307831053660003, "flos": 26141388410880.0, "grad_norm": 3.8025967647269128, "language_loss": 0.86710137, "learning_rate": 3.542331483604246e-06, "loss": 0.8873331, "num_input_tokens_seen": 87036270, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.39355469, "step": 4043, "time_per_iteration": 2.6969072818756104 }, { "auxiliary_loss_clip": 0.01630551, "auxiliary_loss_mlp": 0.00398971, "balance_loss_clip": 1.30158758, "balance_loss_mlp": 0.35975143, "epoch": 0.243138433789268, "flos": 14972415868800.0, "grad_norm": 343.23369440830544, "language_loss": 0.81866592, "learning_rate": 3.5420835096471706e-06, "loss": 0.83896112, "num_input_tokens_seen": 87049920, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.39257812, "step": 4044, "time_per_iteration": 2.6264166831970215 }, { "auxiliary_loss_clip": 0.01626098, "auxiliary_loss_mlp": 0.00358241, "balance_loss_clip": 1.30834985, "balance_loss_mlp": 0.31949806, "epoch": 0.24319855704193596, "flos": 25191694771200.0, "grad_norm": 2.982591802394121, "language_loss": 0.90360993, "learning_rate": 3.5418354772141337e-06, "loss": 0.92345333, "num_input_tokens_seen": 87068230, "router_z_loss_clip": 3.1796875, "router_z_loss_mlp": 0.38745117, "step": 4045, "time_per_iteration": 2.681691884994507 }, { "auxiliary_loss_clip": 0.01613263, "auxiliary_loss_mlp": 0.00350835, "balance_loss_clip": 1.29409838, "balance_loss_mlp": 0.3125689, "epoch": 0.24325868029460393, "flos": 22127221946880.0, "grad_norm": 251.1124119243344, "language_loss": 0.93659747, "learning_rate": 3.541587386314541e-06, "loss": 0.95623851, "num_input_tokens_seen": 87086435, "router_z_loss_clip": 3.18945312, "router_z_loss_mlp": 0.38256836, "step": 4046, "time_per_iteration": 2.6540229320526123 }, { "auxiliary_loss_clip": 0.01626368, "auxiliary_loss_mlp": 0.00389399, "balance_loss_clip": 1.30363488, "balance_loss_mlp": 0.34812844, "epoch": 0.2433188035472719, "flos": 23582106420480.0, "grad_norm": 14.346983871100031, "language_loss": 0.79286879, "learning_rate": 3.5413392369578e-06, "loss": 0.81302649, "num_input_tokens_seen": 87105340, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.41259766, "step": 4047, "time_per_iteration": 2.664977550506592 }, { "auxiliary_loss_clip": 0.01619084, "auxiliary_loss_mlp": 0.00364173, "balance_loss_clip": 1.2967273, "balance_loss_mlp": 0.32445198, "epoch": 0.2433789267999399, "flos": 24462815990400.0, "grad_norm": 2.9132014661097294, "language_loss": 0.80298364, "learning_rate": 3.5410910291533213e-06, "loss": 0.82281625, "num_input_tokens_seen": 87125780, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.39770508, "step": 4048, "time_per_iteration": 2.6766440868377686 }, { "auxiliary_loss_clip": 0.01632401, "auxiliary_loss_mlp": 0.00353289, "balance_loss_clip": 1.30529225, "balance_loss_mlp": 0.31487948, "epoch": 0.24343905005260785, "flos": 16727909264640.0, "grad_norm": 24.376232773704785, "language_loss": 0.80592018, "learning_rate": 3.5408427629105155e-06, "loss": 0.82577711, "num_input_tokens_seen": 87144470, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.3840332, "step": 4049, "time_per_iteration": 2.648660182952881 }, { "auxiliary_loss_clip": 0.01620866, "auxiliary_loss_mlp": 0.00341922, "balance_loss_clip": 1.30119562, "balance_loss_mlp": 0.30298811, "epoch": 0.24349917330527582, "flos": 20043756443520.0, "grad_norm": 219.07682909842322, "language_loss": 0.80543625, "learning_rate": 3.5405944382387985e-06, "loss": 0.82506406, "num_input_tokens_seen": 87162830, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.38916016, "step": 4050, "time_per_iteration": 2.641662359237671 }, { "auxiliary_loss_clip": 0.01614013, "auxiliary_loss_mlp": 0.00310429, "balance_loss_clip": 1.29336119, "balance_loss_mlp": 0.27218711, "epoch": 0.24355929655794378, "flos": 17420554200960.0, "grad_norm": 7.7001967244823, "language_loss": 0.8316865, "learning_rate": 3.5403460551475854e-06, "loss": 0.85093093, "num_input_tokens_seen": 87180905, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.3828125, "step": 4051, "time_per_iteration": 2.681220293045044 }, { "auxiliary_loss_clip": 0.01632796, "auxiliary_loss_mlp": 0.00393848, "balance_loss_clip": 1.30073822, "balance_loss_mlp": 0.35393625, "epoch": 0.24361941981061175, "flos": 25410929431680.0, "grad_norm": 38.816229818503025, "language_loss": 0.79124653, "learning_rate": 3.540097613646296e-06, "loss": 0.81151295, "num_input_tokens_seen": 87202290, "router_z_loss_clip": 3.31835938, "router_z_loss_mlp": 0.39892578, "step": 4052, "time_per_iteration": 2.821835994720459 }, { "auxiliary_loss_clip": 0.01631439, "auxiliary_loss_mlp": 0.00354311, "balance_loss_clip": 1.31202209, "balance_loss_mlp": 0.31656969, "epoch": 0.2436795430632797, "flos": 22820800636800.0, "grad_norm": 6.675292255985764, "language_loss": 0.8632583, "learning_rate": 3.539849113744351e-06, "loss": 0.88311577, "num_input_tokens_seen": 87221650, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.37768555, "step": 4053, "time_per_iteration": 2.7326395511627197 }, { "auxiliary_loss_clip": 0.0161879, "auxiliary_loss_mlp": 0.00361161, "balance_loss_clip": 1.29348612, "balance_loss_mlp": 0.32215583, "epoch": 0.2437396663159477, "flos": 15157786982400.0, "grad_norm": 6.29863228357168, "language_loss": 0.83742344, "learning_rate": 3.539600555451172e-06, "loss": 0.85722303, "num_input_tokens_seen": 87238515, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.38989258, "step": 4054, "time_per_iteration": 2.618770122528076 }, { "auxiliary_loss_clip": 0.01651357, "auxiliary_loss_mlp": 0.00387666, "balance_loss_clip": 1.31672871, "balance_loss_mlp": 0.34584761, "epoch": 0.24379978956861567, "flos": 22091131756800.0, "grad_norm": 7.97909212853329, "language_loss": 0.89997321, "learning_rate": 3.5393519387761866e-06, "loss": 0.92036343, "num_input_tokens_seen": 87256290, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.41821289, "step": 4055, "time_per_iteration": 2.6917226314544678 }, { "auxiliary_loss_clip": 0.0162168, "auxiliary_loss_mlp": 0.00351616, "balance_loss_clip": 1.28798747, "balance_loss_mlp": 0.30910605, "epoch": 0.24385991282128364, "flos": 31467766527360.0, "grad_norm": 32.5416955023098, "language_loss": 0.64655161, "learning_rate": 3.5391032637288217e-06, "loss": 0.66628462, "num_input_tokens_seen": 87277085, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.42504883, "step": 4056, "time_per_iteration": 2.770857095718384 }, { "auxiliary_loss_clip": 0.01677059, "auxiliary_loss_mlp": 0.0035839, "balance_loss_clip": 1.34126019, "balance_loss_mlp": 0.31447369, "epoch": 0.2439200360739516, "flos": 23838795987840.0, "grad_norm": 11.68339687262265, "language_loss": 0.88752818, "learning_rate": 3.538854530318506e-06, "loss": 0.90788269, "num_input_tokens_seen": 87293020, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.43969727, "step": 4057, "time_per_iteration": 2.6745340824127197 }, { "auxiliary_loss_clip": 0.01672642, "auxiliary_loss_mlp": 0.00366565, "balance_loss_clip": 1.33638358, "balance_loss_mlp": 0.32596266, "epoch": 0.24398015932661957, "flos": 19169978198400.0, "grad_norm": 9.027204546362029, "language_loss": 0.85394537, "learning_rate": 3.538605738554673e-06, "loss": 0.87433743, "num_input_tokens_seen": 87311445, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.40576172, "step": 4058, "time_per_iteration": 2.658832311630249 }, { "auxiliary_loss_clip": 0.01678353, "auxiliary_loss_mlp": 0.00353384, "balance_loss_clip": 1.3300699, "balance_loss_mlp": 0.31094587, "epoch": 0.24404028257928753, "flos": 25262474520960.0, "grad_norm": 17.421361725824354, "language_loss": 0.91296279, "learning_rate": 3.538356888446756e-06, "loss": 0.93328023, "num_input_tokens_seen": 87332055, "router_z_loss_clip": 3.48242188, "router_z_loss_mlp": 0.42431641, "step": 4059, "time_per_iteration": 2.749457597732544 }, { "auxiliary_loss_clip": 0.016906, "auxiliary_loss_mlp": 0.00362655, "balance_loss_clip": 1.34981775, "balance_loss_mlp": 0.32298195, "epoch": 0.2441004058319555, "flos": 26467600752000.0, "grad_norm": 9449.228278831311, "language_loss": 0.78562701, "learning_rate": 3.5381079800041913e-06, "loss": 0.80615956, "num_input_tokens_seen": 87351295, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.39672852, "step": 4060, "time_per_iteration": 2.807614326477051 }, { "auxiliary_loss_clip": 0.01702156, "auxiliary_loss_mlp": 0.00414233, "balance_loss_clip": 1.35087037, "balance_loss_mlp": 0.36974409, "epoch": 0.2441605290846235, "flos": 26760524163840.0, "grad_norm": 3300.2049392401477, "language_loss": 0.82141334, "learning_rate": 3.5378590132364182e-06, "loss": 0.84257722, "num_input_tokens_seen": 87370650, "router_z_loss_clip": 3.515625, "router_z_loss_mlp": 0.4453125, "step": 4061, "time_per_iteration": 2.72826886177063 }, { "auxiliary_loss_clip": 0.01720814, "auxiliary_loss_mlp": 0.00360998, "balance_loss_clip": 1.36935711, "balance_loss_mlp": 0.31557935, "epoch": 0.24422065233729146, "flos": 21105850717440.0, "grad_norm": 5.184566758326707, "language_loss": 0.82850248, "learning_rate": 3.5376099881528768e-06, "loss": 0.84932065, "num_input_tokens_seen": 87389020, "router_z_loss_clip": 3.515625, "router_z_loss_mlp": 0.45385742, "step": 4062, "time_per_iteration": 2.6990461349487305 }, { "auxiliary_loss_clip": 0.01699404, "auxiliary_loss_mlp": 0.00341968, "balance_loss_clip": 1.35635757, "balance_loss_mlp": 0.30146098, "epoch": 0.24428077558995942, "flos": 25263156879360.0, "grad_norm": 2.716712954470212, "language_loss": 0.8957181, "learning_rate": 3.537360904763011e-06, "loss": 0.91613179, "num_input_tokens_seen": 87409695, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.4050293, "step": 4063, "time_per_iteration": 2.6793694496154785 }, { "auxiliary_loss_clip": 0.0171584, "auxiliary_loss_mlp": 0.00362117, "balance_loss_clip": 1.35005546, "balance_loss_mlp": 0.31982154, "epoch": 0.24434089884262739, "flos": 20485278420480.0, "grad_norm": 9.667026749703734, "language_loss": 0.79313439, "learning_rate": 3.5371117630762656e-06, "loss": 0.81391394, "num_input_tokens_seen": 87428250, "router_z_loss_clip": 3.65820312, "router_z_loss_mlp": 0.42285156, "step": 4064, "time_per_iteration": 2.651258707046509 }, { "auxiliary_loss_clip": 0.01704108, "auxiliary_loss_mlp": 0.00360596, "balance_loss_clip": 1.35152292, "balance_loss_mlp": 0.31675038, "epoch": 0.24440102209529535, "flos": 23621895711360.0, "grad_norm": 3.2805419588388856, "language_loss": 0.75577211, "learning_rate": 3.536862563102088e-06, "loss": 0.7764191, "num_input_tokens_seen": 87449380, "router_z_loss_clip": 3.52539062, "router_z_loss_mlp": 0.43847656, "step": 4065, "time_per_iteration": 2.7352805137634277 }, { "auxiliary_loss_clip": 0.01679526, "auxiliary_loss_mlp": 0.00367896, "balance_loss_clip": 1.32755089, "balance_loss_mlp": 0.32011673, "epoch": 0.24446114534796332, "flos": 20554729367040.0, "grad_norm": 37.87336671302953, "language_loss": 0.91744608, "learning_rate": 3.5366133048499282e-06, "loss": 0.93792033, "num_input_tokens_seen": 87465365, "router_z_loss_clip": 3.51953125, "router_z_loss_mlp": 0.47827148, "step": 4066, "time_per_iteration": 2.6320135593414307 }, { "auxiliary_loss_clip": 0.01593239, "auxiliary_loss_mlp": 0.00165235, "balance_loss_clip": 1.39066124, "balance_loss_mlp": 0.15093008, "epoch": 0.24452126860063128, "flos": 60389575009920.0, "grad_norm": 0.7350235095971702, "language_loss": 0.52088296, "learning_rate": 3.5363639883292374e-06, "loss": 0.53846776, "num_input_tokens_seen": 87522525, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.14257812, "step": 4067, "time_per_iteration": 3.0438199043273926 }, { "auxiliary_loss_clip": 0.01700334, "auxiliary_loss_mlp": 0.00335543, "balance_loss_clip": 1.3478663, "balance_loss_mlp": 0.29451129, "epoch": 0.24458139185329927, "flos": 15121660878720.0, "grad_norm": 33.26523606196786, "language_loss": 0.81912065, "learning_rate": 3.5361146135494706e-06, "loss": 0.83947939, "num_input_tokens_seen": 87539170, "router_z_loss_clip": 3.52929688, "router_z_loss_mlp": 0.41064453, "step": 4068, "time_per_iteration": 2.620424509048462 }, { "auxiliary_loss_clip": 0.01703145, "auxiliary_loss_mlp": 0.00307511, "balance_loss_clip": 1.35665369, "balance_loss_mlp": 0.2666218, "epoch": 0.24464151510596724, "flos": 27998723842560.0, "grad_norm": 5.332173264784658, "language_loss": 0.84044564, "learning_rate": 3.5358651805200835e-06, "loss": 0.86055225, "num_input_tokens_seen": 87558875, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.40917969, "step": 4069, "time_per_iteration": 4.095356702804565 }, { "auxiliary_loss_clip": 0.01681753, "auxiliary_loss_mlp": 0.00331061, "balance_loss_clip": 1.33749247, "balance_loss_mlp": 0.28869432, "epoch": 0.2447016383586352, "flos": 19792884879360.0, "grad_norm": 2.2545112977960238, "language_loss": 0.86880082, "learning_rate": 3.5356156892505347e-06, "loss": 0.88892901, "num_input_tokens_seen": 87576485, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.42382812, "step": 4070, "time_per_iteration": 2.6493489742279053 }, { "auxiliary_loss_clip": 0.01686764, "auxiliary_loss_mlp": 0.0031284, "balance_loss_clip": 1.33608198, "balance_loss_mlp": 0.27280974, "epoch": 0.24476176161130317, "flos": 26067340523520.0, "grad_norm": 3.1998643619847718, "language_loss": 0.89118576, "learning_rate": 3.5353661397502854e-06, "loss": 0.91118181, "num_input_tokens_seen": 87598620, "router_z_loss_clip": 3.5078125, "router_z_loss_mlp": 0.40063477, "step": 4071, "time_per_iteration": 2.695143938064575 }, { "auxiliary_loss_clip": 0.01643533, "auxiliary_loss_mlp": 0.00338322, "balance_loss_clip": 1.29758096, "balance_loss_mlp": 0.29767177, "epoch": 0.24482188486397113, "flos": 18843550375680.0, "grad_norm": 7.082563541770859, "language_loss": 0.86913276, "learning_rate": 3.535116532028798e-06, "loss": 0.8889513, "num_input_tokens_seen": 87616595, "router_z_loss_clip": 3.45703125, "router_z_loss_mlp": 0.40625, "step": 4072, "time_per_iteration": 4.085922002792358 }, { "auxiliary_loss_clip": 0.01681664, "auxiliary_loss_mlp": 0.00310393, "balance_loss_clip": 1.33177543, "balance_loss_mlp": 0.2702198, "epoch": 0.2448820081166391, "flos": 21251791676160.0, "grad_norm": 6.579187034653531, "language_loss": 0.76599199, "learning_rate": 3.5348668660955382e-06, "loss": 0.78591251, "num_input_tokens_seen": 87635755, "router_z_loss_clip": 3.49804688, "router_z_loss_mlp": 0.40185547, "step": 4073, "time_per_iteration": 2.700016498565674 }, { "auxiliary_loss_clip": 0.0168248, "auxiliary_loss_mlp": 0.00312356, "balance_loss_clip": 1.33674979, "balance_loss_mlp": 0.27280283, "epoch": 0.2449421313693071, "flos": 23950586090880.0, "grad_norm": 4.99629472936577, "language_loss": 0.76435769, "learning_rate": 3.5346171419599728e-06, "loss": 0.78430605, "num_input_tokens_seen": 87652885, "router_z_loss_clip": 3.45703125, "router_z_loss_mlp": 0.39575195, "step": 4074, "time_per_iteration": 2.6851930618286133 }, { "auxiliary_loss_clip": 0.01493718, "auxiliary_loss_mlp": 0.00182872, "balance_loss_clip": 1.31435633, "balance_loss_mlp": 0.16894864, "epoch": 0.24500225462197506, "flos": 60687669980160.0, "grad_norm": 0.9072845897780876, "language_loss": 0.68515682, "learning_rate": 3.5343673596315718e-06, "loss": 0.70192277, "num_input_tokens_seen": 87713220, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.13964844, "step": 4075, "time_per_iteration": 3.23795485496521 }, { "auxiliary_loss_clip": 0.01640555, "auxiliary_loss_mlp": 0.00360257, "balance_loss_clip": 1.3034656, "balance_loss_mlp": 0.32034534, "epoch": 0.24506237787464302, "flos": 26284204886400.0, "grad_norm": 58.985263983326654, "language_loss": 0.86912954, "learning_rate": 3.5341175191198063e-06, "loss": 0.88913763, "num_input_tokens_seen": 87732680, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.39916992, "step": 4076, "time_per_iteration": 4.147852897644043 }, { "auxiliary_loss_clip": 0.01654063, "auxiliary_loss_mlp": 0.00354476, "balance_loss_clip": 1.30275679, "balance_loss_mlp": 0.31125027, "epoch": 0.245122501127311, "flos": 20552287242240.0, "grad_norm": 21.501932079705817, "language_loss": 0.88386375, "learning_rate": 3.533867620434151e-06, "loss": 0.90394914, "num_input_tokens_seen": 87751880, "router_z_loss_clip": 3.515625, "router_z_loss_mlp": 0.43212891, "step": 4077, "time_per_iteration": 2.697634696960449 }, { "auxiliary_loss_clip": 0.01663002, "auxiliary_loss_mlp": 0.00329453, "balance_loss_clip": 1.31272531, "balance_loss_mlp": 0.28610888, "epoch": 0.24518262437997895, "flos": 29132603447040.0, "grad_norm": 29.648568041383562, "language_loss": 0.69991684, "learning_rate": 3.533617663584082e-06, "loss": 0.71984136, "num_input_tokens_seen": 87771795, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.43334961, "step": 4078, "time_per_iteration": 2.7280759811401367 }, { "auxiliary_loss_clip": 0.01626605, "auxiliary_loss_mlp": 0.00317804, "balance_loss_clip": 1.28407025, "balance_loss_mlp": 0.28008634, "epoch": 0.24524274763264692, "flos": 23476924419840.0, "grad_norm": 3.4996030849645807, "language_loss": 0.80390996, "learning_rate": 3.5333676485790765e-06, "loss": 0.82335407, "num_input_tokens_seen": 87793640, "router_z_loss_clip": 3.42773438, "router_z_loss_mlp": 0.37695312, "step": 4079, "time_per_iteration": 2.727792739868164 }, { "auxiliary_loss_clip": 0.01643981, "auxiliary_loss_mlp": 0.0034326, "balance_loss_clip": 1.30307317, "balance_loss_mlp": 0.30544716, "epoch": 0.24530287088531488, "flos": 17201175886080.0, "grad_norm": 8.665821652051525, "language_loss": 0.83709198, "learning_rate": 3.5331175754286173e-06, "loss": 0.85696435, "num_input_tokens_seen": 87812390, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.37841797, "step": 4080, "time_per_iteration": 2.627474308013916 }, { "auxiliary_loss_clip": 0.01600778, "auxiliary_loss_mlp": 0.00334341, "balance_loss_clip": 1.26621485, "balance_loss_mlp": 0.29700503, "epoch": 0.24536299413798288, "flos": 14867449349760.0, "grad_norm": 3.5238721234300625, "language_loss": 0.8967607, "learning_rate": 3.532867444142186e-06, "loss": 0.91611195, "num_input_tokens_seen": 87830640, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.37353516, "step": 4081, "time_per_iteration": 2.6587443351745605 }, { "auxiliary_loss_clip": 0.01621193, "auxiliary_loss_mlp": 0.00320958, "balance_loss_clip": 1.28541923, "balance_loss_mlp": 0.28312111, "epoch": 0.24542311739065084, "flos": 35262051886080.0, "grad_norm": 6.149470609982144, "language_loss": 0.80525708, "learning_rate": 3.532617254729267e-06, "loss": 0.82467854, "num_input_tokens_seen": 87850450, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.37841797, "step": 4082, "time_per_iteration": 4.157889366149902 }, { "auxiliary_loss_clip": 0.01650843, "auxiliary_loss_mlp": 0.00334914, "balance_loss_clip": 1.3079443, "balance_loss_mlp": 0.2950266, "epoch": 0.2454832406433188, "flos": 21503130117120.0, "grad_norm": 6.327889263559792, "language_loss": 0.77003419, "learning_rate": 3.5323670071993485e-06, "loss": 0.78989172, "num_input_tokens_seen": 87868810, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.39892578, "step": 4083, "time_per_iteration": 2.6434004306793213 }, { "auxiliary_loss_clip": 0.01642518, "auxiliary_loss_mlp": 0.00347934, "balance_loss_clip": 1.29663515, "balance_loss_mlp": 0.30702111, "epoch": 0.24554336389598677, "flos": 14756664827520.0, "grad_norm": 3.9438764169614116, "language_loss": 0.82520747, "learning_rate": 3.532116701561919e-06, "loss": 0.84511197, "num_input_tokens_seen": 87885685, "router_z_loss_clip": 3.45898438, "router_z_loss_mlp": 0.40917969, "step": 4084, "time_per_iteration": 2.619974136352539 }, { "auxiliary_loss_clip": 0.01608016, "auxiliary_loss_mlp": 0.00317599, "balance_loss_clip": 1.2708565, "balance_loss_mlp": 0.27742499, "epoch": 0.24560348714865474, "flos": 14976402278400.0, "grad_norm": 8.364700557550242, "language_loss": 0.91903043, "learning_rate": 3.531866337826471e-06, "loss": 0.93828654, "num_input_tokens_seen": 87903715, "router_z_loss_clip": 3.37304688, "router_z_loss_mlp": 0.40185547, "step": 4085, "time_per_iteration": 2.6119370460510254 }, { "auxiliary_loss_clip": 0.0162692, "auxiliary_loss_mlp": 0.00346695, "balance_loss_clip": 1.28602362, "balance_loss_mlp": 0.30876249, "epoch": 0.2456636104013227, "flos": 22675326554880.0, "grad_norm": 6.0274868134171244, "language_loss": 0.86455661, "learning_rate": 3.5316159160024982e-06, "loss": 0.88429272, "num_input_tokens_seen": 87923375, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.37939453, "step": 4086, "time_per_iteration": 2.69118070602417 }, { "auxiliary_loss_clip": 0.01612097, "auxiliary_loss_mlp": 0.00323966, "balance_loss_clip": 1.27446938, "balance_loss_mlp": 0.28796464, "epoch": 0.2457237336539907, "flos": 27417869009280.0, "grad_norm": 1.6881962128297683, "language_loss": 0.80459458, "learning_rate": 3.531365436099496e-06, "loss": 0.82395518, "num_input_tokens_seen": 87943115, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.36010742, "step": 4087, "time_per_iteration": 2.761948823928833 }, { "auxiliary_loss_clip": 0.01659255, "auxiliary_loss_mlp": 0.00375422, "balance_loss_clip": 1.30153334, "balance_loss_mlp": 0.3354634, "epoch": 0.24578385690665866, "flos": 20412379768320.0, "grad_norm": 5.260589205122971, "language_loss": 0.87534219, "learning_rate": 3.5311148981269635e-06, "loss": 0.89568895, "num_input_tokens_seen": 87959505, "router_z_loss_clip": 3.57617188, "router_z_loss_mlp": 0.3996582, "step": 4088, "time_per_iteration": 2.6609017848968506 }, { "auxiliary_loss_clip": 0.0162987, "auxiliary_loss_mlp": 0.00340543, "balance_loss_clip": 1.28189683, "balance_loss_mlp": 0.30244383, "epoch": 0.24584398015932662, "flos": 23915393740800.0, "grad_norm": 7.44917209199784, "language_loss": 0.82691753, "learning_rate": 3.5308643020944e-06, "loss": 0.84662163, "num_input_tokens_seen": 87979725, "router_z_loss_clip": 3.48046875, "router_z_loss_mlp": 0.38110352, "step": 4089, "time_per_iteration": 2.725630760192871 }, { "auxiliary_loss_clip": 0.01629656, "auxiliary_loss_mlp": 0.00342092, "balance_loss_clip": 1.27485704, "balance_loss_mlp": 0.3040646, "epoch": 0.2459041034119946, "flos": 41496359103360.0, "grad_norm": 32.784828436748356, "language_loss": 0.86701292, "learning_rate": 3.530613648011309e-06, "loss": 0.88673037, "num_input_tokens_seen": 87998270, "router_z_loss_clip": 3.546875, "router_z_loss_mlp": 0.38012695, "step": 4090, "time_per_iteration": 2.8051397800445557 }, { "auxiliary_loss_clip": 0.01655695, "auxiliary_loss_mlp": 0.00341459, "balance_loss_clip": 1.29525399, "balance_loss_mlp": 0.30030793, "epoch": 0.24596422666466256, "flos": 19936814676480.0, "grad_norm": 10.72643422794924, "language_loss": 0.81976682, "learning_rate": 3.5303629358871946e-06, "loss": 0.83973837, "num_input_tokens_seen": 88016760, "router_z_loss_clip": 3.60546875, "router_z_loss_mlp": 0.41162109, "step": 4091, "time_per_iteration": 2.6356942653656006 }, { "auxiliary_loss_clip": 0.01658048, "auxiliary_loss_mlp": 0.00339047, "balance_loss_clip": 1.2961092, "balance_loss_mlp": 0.30054224, "epoch": 0.24602434991733052, "flos": 21544391865600.0, "grad_norm": 136.983693700579, "language_loss": 0.8320356, "learning_rate": 3.5301121657315653e-06, "loss": 0.85200661, "num_input_tokens_seen": 88036465, "router_z_loss_clip": 3.62109375, "router_z_loss_mlp": 0.38549805, "step": 4092, "time_per_iteration": 2.657351493835449 }, { "auxiliary_loss_clip": 0.01660216, "auxiliary_loss_mlp": 0.00373482, "balance_loss_clip": 1.28746927, "balance_loss_mlp": 0.33407182, "epoch": 0.24608447316999849, "flos": 23185078416000.0, "grad_norm": 51.45237817894789, "language_loss": 0.89129144, "learning_rate": 3.5298613375539287e-06, "loss": 0.91162848, "num_input_tokens_seen": 88053270, "router_z_loss_clip": 3.73046875, "router_z_loss_mlp": 0.39428711, "step": 4093, "time_per_iteration": 2.674542188644409 }, { "auxiliary_loss_clip": 0.01668889, "auxiliary_loss_mlp": 0.00370208, "balance_loss_clip": 1.29081655, "balance_loss_mlp": 0.32414562, "epoch": 0.24614459642266648, "flos": 19641951930240.0, "grad_norm": 7.534603615635604, "language_loss": 0.93290472, "learning_rate": 3.529610451363797e-06, "loss": 0.95329571, "num_input_tokens_seen": 88072305, "router_z_loss_clip": 3.78515625, "router_z_loss_mlp": 0.4609375, "step": 4094, "time_per_iteration": 2.6294965744018555 }, { "auxiliary_loss_clip": 0.01501504, "auxiliary_loss_mlp": 0.00175475, "balance_loss_clip": 1.30555201, "balance_loss_mlp": 0.16164683, "epoch": 0.24620471967533444, "flos": 61739816186880.0, "grad_norm": 1.3061354965899592, "language_loss": 0.56896842, "learning_rate": 3.5293595071706833e-06, "loss": 0.58573824, "num_input_tokens_seen": 88137995, "router_z_loss_clip": 1.9609375, "router_z_loss_mlp": 0.13867188, "step": 4095, "time_per_iteration": 3.221538543701172 }, { "auxiliary_loss_clip": 0.01526037, "auxiliary_loss_mlp": 0.00238153, "balance_loss_clip": 1.32304013, "balance_loss_mlp": 0.22270302, "epoch": 0.2462648429280024, "flos": 69154436315520.0, "grad_norm": 0.6860879517498415, "language_loss": 0.56359452, "learning_rate": 3.5291085049841042e-06, "loss": 0.58123636, "num_input_tokens_seen": 88208490, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.15429688, "step": 4096, "time_per_iteration": 3.260131359100342 }, { "auxiliary_loss_clip": 0.01648998, "auxiliary_loss_mlp": 0.0034901, "balance_loss_clip": 1.27948403, "balance_loss_mlp": 0.30657101, "epoch": 0.24632496618067037, "flos": 29459605887360.0, "grad_norm": 20.535292674275055, "language_loss": 0.83947718, "learning_rate": 3.5288574448135773e-06, "loss": 0.85945725, "num_input_tokens_seen": 88228050, "router_z_loss_clip": 3.69335938, "router_z_loss_mlp": 0.42456055, "step": 4097, "time_per_iteration": 2.7194676399230957 }, { "auxiliary_loss_clip": 0.01644025, "auxiliary_loss_mlp": 0.00356552, "balance_loss_clip": 1.26546001, "balance_loss_mlp": 0.31451833, "epoch": 0.24638508943333834, "flos": 24316444068480.0, "grad_norm": 35.486127284077924, "language_loss": 0.81548774, "learning_rate": 3.5286063266686235e-06, "loss": 0.83549356, "num_input_tokens_seen": 88248090, "router_z_loss_clip": 3.78320312, "router_z_loss_mlp": 0.42041016, "step": 4098, "time_per_iteration": 2.849799394607544 }, { "auxiliary_loss_clip": 0.01640582, "auxiliary_loss_mlp": 0.00344412, "balance_loss_clip": 1.26842737, "balance_loss_mlp": 0.30397645, "epoch": 0.2464452126860063, "flos": 26613254401920.0, "grad_norm": 16.075970307857546, "language_loss": 0.75530541, "learning_rate": 3.528355150558764e-06, "loss": 0.77515537, "num_input_tokens_seen": 88267545, "router_z_loss_clip": 3.72265625, "router_z_loss_mlp": 0.40405273, "step": 4099, "time_per_iteration": 2.814713954925537 }, { "auxiliary_loss_clip": 0.01641718, "auxiliary_loss_mlp": 0.00335346, "balance_loss_clip": 1.27401161, "balance_loss_mlp": 0.29641211, "epoch": 0.24650533593867427, "flos": 31212405763200.0, "grad_norm": 4.568995210169731, "language_loss": 0.71435982, "learning_rate": 3.5281039164935237e-06, "loss": 0.73413044, "num_input_tokens_seen": 88289785, "router_z_loss_clip": 3.67578125, "router_z_loss_mlp": 0.38916016, "step": 4100, "time_per_iteration": 2.782658815383911 }, { "auxiliary_loss_clip": 0.01483643, "auxiliary_loss_mlp": 0.00188695, "balance_loss_clip": 1.29511964, "balance_loss_mlp": 0.17248254, "epoch": 0.24656545919134226, "flos": 68494002900480.0, "grad_norm": 0.7077342202645737, "language_loss": 0.61420721, "learning_rate": 3.5278526244824304e-06, "loss": 0.6309306, "num_input_tokens_seen": 88357320, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.16210938, "step": 4101, "time_per_iteration": 3.2875173091888428 }, { "auxiliary_loss_clip": 0.01625483, "auxiliary_loss_mlp": 0.00338431, "balance_loss_clip": 1.25559545, "balance_loss_mlp": 0.29883015, "epoch": 0.24662558244401023, "flos": 20084192179200.0, "grad_norm": 7.298211494756958, "language_loss": 0.78749192, "learning_rate": 3.527601274535012e-06, "loss": 0.80713111, "num_input_tokens_seen": 88377040, "router_z_loss_clip": 3.69921875, "router_z_loss_mlp": 0.39575195, "step": 4102, "time_per_iteration": 2.630338668823242 }, { "auxiliary_loss_clip": 0.01621932, "auxiliary_loss_mlp": 0.00322387, "balance_loss_clip": 1.2472657, "balance_loss_mlp": 0.28497928, "epoch": 0.2466857056966782, "flos": 30701361012480.0, "grad_norm": 96.5505603023039, "language_loss": 0.81840634, "learning_rate": 3.5273498666608004e-06, "loss": 0.8378495, "num_input_tokens_seen": 88395085, "router_z_loss_clip": 3.7421875, "router_z_loss_mlp": 0.37402344, "step": 4103, "time_per_iteration": 2.6935436725616455 }, { "auxiliary_loss_clip": 0.01630983, "auxiliary_loss_mlp": 0.00315578, "balance_loss_clip": 1.2552824, "balance_loss_mlp": 0.27597648, "epoch": 0.24674582894934616, "flos": 22528523669760.0, "grad_norm": 8.341043070306116, "language_loss": 0.85758579, "learning_rate": 3.5270984008693288e-06, "loss": 0.87705135, "num_input_tokens_seen": 88413205, "router_z_loss_clip": 3.7578125, "router_z_loss_mlp": 0.39599609, "step": 4104, "time_per_iteration": 2.625530242919922 }, { "auxiliary_loss_clip": 0.01616517, "auxiliary_loss_mlp": 0.00308269, "balance_loss_clip": 1.24409103, "balance_loss_mlp": 0.27117145, "epoch": 0.24680595220201412, "flos": 20704297599360.0, "grad_norm": 15.119404904229249, "language_loss": 0.89451385, "learning_rate": 3.526846877170133e-06, "loss": 0.91376173, "num_input_tokens_seen": 88431525, "router_z_loss_clip": 3.72265625, "router_z_loss_mlp": 0.37109375, "step": 4105, "time_per_iteration": 2.680436134338379 }, { "auxiliary_loss_clip": 0.01610958, "auxiliary_loss_mlp": 0.00338641, "balance_loss_clip": 1.24424136, "balance_loss_mlp": 0.30161488, "epoch": 0.2468660754546821, "flos": 21831174051840.0, "grad_norm": 98.66766138659817, "language_loss": 0.83992261, "learning_rate": 3.52659529557275e-06, "loss": 0.85941863, "num_input_tokens_seen": 88451210, "router_z_loss_clip": 3.66796875, "router_z_loss_mlp": 0.37036133, "step": 4106, "time_per_iteration": 2.6495416164398193 }, { "auxiliary_loss_clip": 0.01617868, "auxiliary_loss_mlp": 0.00364904, "balance_loss_clip": 1.24129784, "balance_loss_mlp": 0.32642344, "epoch": 0.24692619870735008, "flos": 15267709578240.0, "grad_norm": 14.343963145145667, "language_loss": 0.8030206, "learning_rate": 3.5263436560867205e-06, "loss": 0.82284826, "num_input_tokens_seen": 88467790, "router_z_loss_clip": 3.76757812, "router_z_loss_mlp": 0.38476562, "step": 4107, "time_per_iteration": 2.6064810752868652 }, { "auxiliary_loss_clip": 0.01627781, "auxiliary_loss_mlp": 0.00332309, "balance_loss_clip": 1.25384164, "balance_loss_mlp": 0.29287487, "epoch": 0.24698632196001805, "flos": 29680097523840.0, "grad_norm": 13.286853741541114, "language_loss": 0.74721944, "learning_rate": 3.526091958721587e-06, "loss": 0.76682037, "num_input_tokens_seen": 88490330, "router_z_loss_clip": 3.74023438, "router_z_loss_mlp": 0.39453125, "step": 4108, "time_per_iteration": 2.6833479404449463 }, { "auxiliary_loss_clip": 0.01586324, "auxiliary_loss_mlp": 0.0035055, "balance_loss_clip": 1.22097564, "balance_loss_mlp": 0.31133023, "epoch": 0.247046445212686, "flos": 39165469741440.0, "grad_norm": 2.6725233528591823, "language_loss": 0.80024087, "learning_rate": 3.5258402034868936e-06, "loss": 0.81960964, "num_input_tokens_seen": 88512435, "router_z_loss_clip": 3.65429688, "router_z_loss_mlp": 0.39208984, "step": 4109, "time_per_iteration": 2.8353703022003174 }, { "auxiliary_loss_clip": 0.01579438, "auxiliary_loss_mlp": 0.00352227, "balance_loss_clip": 1.2160387, "balance_loss_mlp": 0.31522477, "epoch": 0.24710656846535398, "flos": 22998845376000.0, "grad_norm": 4.132501333207132, "language_loss": 0.84114712, "learning_rate": 3.5255883903921866e-06, "loss": 0.86046374, "num_input_tokens_seen": 88529780, "router_z_loss_clip": 3.63671875, "router_z_loss_mlp": 0.36987305, "step": 4110, "time_per_iteration": 2.6308388710021973 }, { "auxiliary_loss_clip": 0.01615343, "auxiliary_loss_mlp": 0.00331821, "balance_loss_clip": 1.24703717, "balance_loss_mlp": 0.29312548, "epoch": 0.24716669171802194, "flos": 26432803451520.0, "grad_norm": 71.16425149046033, "language_loss": 0.88919067, "learning_rate": 3.5253365194470144e-06, "loss": 0.90866226, "num_input_tokens_seen": 88547200, "router_z_loss_clip": 3.6796875, "router_z_loss_mlp": 0.38696289, "step": 4111, "time_per_iteration": 4.085323333740234 }, { "auxiliary_loss_clip": 0.01603114, "auxiliary_loss_mlp": 0.00358025, "balance_loss_clip": 1.23195708, "balance_loss_mlp": 0.31949612, "epoch": 0.2472268149706899, "flos": 23329870139520.0, "grad_norm": 11.44956199850168, "language_loss": 0.8055141, "learning_rate": 3.5250845906609294e-06, "loss": 0.82512558, "num_input_tokens_seen": 88566415, "router_z_loss_clip": 3.7109375, "router_z_loss_mlp": 0.38549805, "step": 4112, "time_per_iteration": 2.643277883529663 }, { "auxiliary_loss_clip": 0.01600087, "auxiliary_loss_mlp": 0.00344439, "balance_loss_clip": 1.22714186, "balance_loss_mlp": 0.30698308, "epoch": 0.24728693822335787, "flos": 23768734510080.0, "grad_norm": 3.3616311445321077, "language_loss": 0.89675593, "learning_rate": 3.5248326040434835e-06, "loss": 0.91620123, "num_input_tokens_seen": 88585225, "router_z_loss_clip": 3.73046875, "router_z_loss_mlp": 0.37426758, "step": 4113, "time_per_iteration": 2.6380510330200195 }, { "auxiliary_loss_clip": 0.01563954, "auxiliary_loss_mlp": 0.00299357, "balance_loss_clip": 1.20575559, "balance_loss_mlp": 0.26118588, "epoch": 0.24734706147602586, "flos": 19317499355520.0, "grad_norm": 2.560508202444428, "language_loss": 0.95158076, "learning_rate": 3.5245805596042322e-06, "loss": 0.97021389, "num_input_tokens_seen": 88603280, "router_z_loss_clip": 3.58007812, "router_z_loss_mlp": 0.3815918, "step": 4114, "time_per_iteration": 4.060837984085083 }, { "auxiliary_loss_clip": 0.01577795, "auxiliary_loss_mlp": 0.0030376, "balance_loss_clip": 1.21452498, "balance_loss_mlp": 0.26928455, "epoch": 0.24740718472869383, "flos": 28036932935040.0, "grad_norm": 2.593634770959704, "language_loss": 0.81411016, "learning_rate": 3.524328457352734e-06, "loss": 0.83292568, "num_input_tokens_seen": 88624925, "router_z_loss_clip": 3.63085938, "router_z_loss_mlp": 0.3449707, "step": 4115, "time_per_iteration": 2.7037220001220703 }, { "auxiliary_loss_clip": 0.01404145, "auxiliary_loss_mlp": 0.00076857, "balance_loss_clip": 1.24002099, "balance_loss_mlp": 0.05959515, "epoch": 0.2474673079813618, "flos": 68107569408000.0, "grad_norm": 1.5225242893981896, "language_loss": 0.57638657, "learning_rate": 3.5240762972985475e-06, "loss": 0.59119654, "num_input_tokens_seen": 88691475, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.17285156, "step": 4116, "time_per_iteration": 3.2344231605529785 }, { "auxiliary_loss_clip": 0.01558125, "auxiliary_loss_mlp": 0.00283354, "balance_loss_clip": 1.2040813, "balance_loss_mlp": 0.24730554, "epoch": 0.24752743123402976, "flos": 29462119839360.0, "grad_norm": 14.660377114872066, "language_loss": 0.8925041, "learning_rate": 3.523824079451235e-06, "loss": 0.91091889, "num_input_tokens_seen": 88713425, "router_z_loss_clip": 3.54101562, "router_z_loss_mlp": 0.3605957, "step": 4117, "time_per_iteration": 2.7813124656677246 }, { "auxiliary_loss_clip": 0.01378716, "auxiliary_loss_mlp": 0.00087638, "balance_loss_clip": 1.20988894, "balance_loss_mlp": 0.07485904, "epoch": 0.24758755448669773, "flos": 58350459824640.0, "grad_norm": 0.8751892768462821, "language_loss": 0.63379711, "learning_rate": 3.5235718038203602e-06, "loss": 0.64846063, "num_input_tokens_seen": 88769995, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.12792969, "step": 4118, "time_per_iteration": 4.359997749328613 }, { "auxiliary_loss_clip": 0.01583993, "auxiliary_loss_mlp": 0.00320558, "balance_loss_clip": 1.22221804, "balance_loss_mlp": 0.28367478, "epoch": 0.2476476777393657, "flos": 20484416494080.0, "grad_norm": 20.689716289313967, "language_loss": 0.8559221, "learning_rate": 3.523319470415491e-06, "loss": 0.87496758, "num_input_tokens_seen": 88789970, "router_z_loss_clip": 3.61914062, "router_z_loss_mlp": 0.3684082, "step": 4119, "time_per_iteration": 2.708580732345581 }, { "auxiliary_loss_clip": 0.01558951, "auxiliary_loss_mlp": 0.00287547, "balance_loss_clip": 1.20591235, "balance_loss_mlp": 0.25052103, "epoch": 0.24770780099203366, "flos": 20485853038080.0, "grad_norm": 10.007822824854461, "language_loss": 0.81251168, "learning_rate": 3.5230670792461943e-06, "loss": 0.83097667, "num_input_tokens_seen": 88810000, "router_z_loss_clip": 3.52929688, "router_z_loss_mlp": 0.37036133, "step": 4120, "time_per_iteration": 2.650531530380249 }, { "auxiliary_loss_clip": 0.01560447, "auxiliary_loss_mlp": 0.00324771, "balance_loss_clip": 1.20536828, "balance_loss_mlp": 0.28929389, "epoch": 0.24776792424470165, "flos": 15153405523200.0, "grad_norm": 16.67719211000628, "language_loss": 0.95121467, "learning_rate": 3.522814630322041e-06, "loss": 0.97006691, "num_input_tokens_seen": 88827515, "router_z_loss_clip": 3.55273438, "router_z_loss_mlp": 0.35498047, "step": 4121, "time_per_iteration": 2.5972020626068115 }, { "auxiliary_loss_clip": 0.01559296, "auxiliary_loss_mlp": 0.00308306, "balance_loss_clip": 1.20173848, "balance_loss_mlp": 0.27089852, "epoch": 0.2478280474973696, "flos": 21725453347200.0, "grad_norm": 3.444485956588, "language_loss": 0.78672338, "learning_rate": 3.5225621236526045e-06, "loss": 0.80539942, "num_input_tokens_seen": 88845025, "router_z_loss_clip": 3.578125, "router_z_loss_mlp": 0.37402344, "step": 4122, "time_per_iteration": 2.647784948348999 }, { "auxiliary_loss_clip": 0.01550757, "auxiliary_loss_mlp": 0.00282857, "balance_loss_clip": 1.19222713, "balance_loss_mlp": 0.24578337, "epoch": 0.24788817075003758, "flos": 20412200200320.0, "grad_norm": 2.2510618426609623, "language_loss": 0.86900854, "learning_rate": 3.5223095592474596e-06, "loss": 0.88734466, "num_input_tokens_seen": 88861740, "router_z_loss_clip": 3.58789062, "router_z_loss_mlp": 0.37060547, "step": 4123, "time_per_iteration": 2.631120443344116 }, { "auxiliary_loss_clip": 0.01566763, "auxiliary_loss_mlp": 0.00296299, "balance_loss_clip": 1.21172571, "balance_loss_mlp": 0.26139462, "epoch": 0.24794829400270554, "flos": 22594455083520.0, "grad_norm": 3.529432966033417, "language_loss": 0.79914916, "learning_rate": 3.5220569371161846e-06, "loss": 0.81777978, "num_input_tokens_seen": 88879740, "router_z_loss_clip": 3.546875, "router_z_loss_mlp": 0.34912109, "step": 4124, "time_per_iteration": 4.1663713455200195 }, { "auxiliary_loss_clip": 0.01560277, "auxiliary_loss_mlp": 0.00297022, "balance_loss_clip": 1.20577228, "balance_loss_mlp": 0.26364291, "epoch": 0.2480084172553735, "flos": 39676047615360.0, "grad_norm": 6.504756330539726, "language_loss": 0.79161829, "learning_rate": 3.521804257268357e-06, "loss": 0.81019127, "num_input_tokens_seen": 88904095, "router_z_loss_clip": 3.54492188, "router_z_loss_mlp": 0.33398438, "step": 4125, "time_per_iteration": 2.7839128971099854 }, { "auxiliary_loss_clip": 0.01582479, "auxiliary_loss_mlp": 0.00349939, "balance_loss_clip": 1.21150875, "balance_loss_mlp": 0.31288838, "epoch": 0.24806854050804147, "flos": 22053712763520.0, "grad_norm": 30.57992046959528, "language_loss": 0.77504444, "learning_rate": 3.5215515197135595e-06, "loss": 0.79436862, "num_input_tokens_seen": 88920740, "router_z_loss_clip": 3.71484375, "router_z_loss_mlp": 0.37060547, "step": 4126, "time_per_iteration": 2.621971845626831 }, { "auxiliary_loss_clip": 0.01588919, "auxiliary_loss_mlp": 0.00328616, "balance_loss_clip": 1.22084582, "balance_loss_mlp": 0.29237676, "epoch": 0.24812866376070947, "flos": 15486764670720.0, "grad_norm": 3.494661066054162, "language_loss": 0.8892861, "learning_rate": 3.5212987244613764e-06, "loss": 0.90846145, "num_input_tokens_seen": 88938510, "router_z_loss_clip": 3.68164062, "router_z_loss_mlp": 0.36230469, "step": 4127, "time_per_iteration": 2.619274854660034 }, { "auxiliary_loss_clip": 0.01593439, "auxiliary_loss_mlp": 0.00300869, "balance_loss_clip": 1.22469234, "balance_loss_mlp": 0.26434332, "epoch": 0.24818878701337743, "flos": 14757419013120.0, "grad_norm": 10.847052260671141, "language_loss": 0.91142428, "learning_rate": 3.5210458715213927e-06, "loss": 0.93036735, "num_input_tokens_seen": 88955235, "router_z_loss_clip": 3.6875, "router_z_loss_mlp": 0.36499023, "step": 4128, "time_per_iteration": 2.594078302383423 }, { "auxiliary_loss_clip": 0.01588823, "auxiliary_loss_mlp": 0.00328826, "balance_loss_clip": 1.21915174, "balance_loss_mlp": 0.29127467, "epoch": 0.2482489102660454, "flos": 27089501852160.0, "grad_norm": 70.58542331293015, "language_loss": 0.73438179, "learning_rate": 3.5207929609031973e-06, "loss": 0.75355828, "num_input_tokens_seen": 88975210, "router_z_loss_clip": 3.6953125, "router_z_loss_mlp": 0.37548828, "step": 4129, "time_per_iteration": 2.6914682388305664 }, { "auxiliary_loss_clip": 0.01575628, "auxiliary_loss_mlp": 0.0034094, "balance_loss_clip": 1.20976615, "balance_loss_mlp": 0.30434281, "epoch": 0.24830903351871336, "flos": 26467528924800.0, "grad_norm": 35.98961304762489, "language_loss": 0.81956077, "learning_rate": 3.5205399926163806e-06, "loss": 0.83872646, "num_input_tokens_seen": 88996120, "router_z_loss_clip": 3.66015625, "router_z_loss_mlp": 0.36621094, "step": 4130, "time_per_iteration": 2.686846971511841 }, { "auxiliary_loss_clip": 0.01597437, "auxiliary_loss_mlp": 0.00357673, "balance_loss_clip": 1.22617579, "balance_loss_mlp": 0.31926346, "epoch": 0.24836915677138133, "flos": 10228436870400.0, "grad_norm": 33.63542345259908, "language_loss": 0.86513841, "learning_rate": 3.520286966670535e-06, "loss": 0.88468957, "num_input_tokens_seen": 89008685, "router_z_loss_clip": 3.71289062, "router_z_loss_mlp": 0.3840332, "step": 4131, "time_per_iteration": 2.600212574005127 }, { "auxiliary_loss_clip": 0.01606668, "auxiliary_loss_mlp": 0.00311862, "balance_loss_clip": 1.23475814, "balance_loss_mlp": 0.27605128, "epoch": 0.2484292800240493, "flos": 30080429579520.0, "grad_norm": 40.67960033110541, "language_loss": 0.88529718, "learning_rate": 3.520033883075255e-06, "loss": 0.90448248, "num_input_tokens_seen": 89031160, "router_z_loss_clip": 3.71679688, "router_z_loss_mlp": 0.35791016, "step": 4132, "time_per_iteration": 2.76163649559021 }, { "auxiliary_loss_clip": 0.01624537, "auxiliary_loss_mlp": 0.00358648, "balance_loss_clip": 1.24673653, "balance_loss_mlp": 0.31587541, "epoch": 0.24848940327671726, "flos": 13442944803840.0, "grad_norm": 22.434479328811985, "language_loss": 0.77397722, "learning_rate": 3.5197807418401386e-06, "loss": 0.79380906, "num_input_tokens_seen": 89047235, "router_z_loss_clip": 3.77929688, "router_z_loss_mlp": 0.42797852, "step": 4133, "time_per_iteration": 2.6229987144470215 }, { "auxiliary_loss_clip": 0.0163632, "auxiliary_loss_mlp": 0.00339809, "balance_loss_clip": 1.24445057, "balance_loss_mlp": 0.30073196, "epoch": 0.24854952652938525, "flos": 19970247260160.0, "grad_norm": 318.8768316898681, "language_loss": 0.71988219, "learning_rate": 3.5195275429747834e-06, "loss": 0.73964345, "num_input_tokens_seen": 89064790, "router_z_loss_clip": 3.91796875, "router_z_loss_mlp": 0.390625, "step": 4134, "time_per_iteration": 2.6628546714782715 }, { "auxiliary_loss_clip": 0.01632215, "auxiliary_loss_mlp": 0.00311941, "balance_loss_clip": 1.24456239, "balance_loss_mlp": 0.27531958, "epoch": 0.24860964978205322, "flos": 18150187167360.0, "grad_norm": 98.16990109395587, "language_loss": 0.8436414, "learning_rate": 3.5192742864887914e-06, "loss": 0.86308295, "num_input_tokens_seen": 89083250, "router_z_loss_clip": 3.87695312, "router_z_loss_mlp": 0.36621094, "step": 4135, "time_per_iteration": 2.6176228523254395 }, { "auxiliary_loss_clip": 0.01618182, "auxiliary_loss_mlp": 0.00358081, "balance_loss_clip": 1.23771203, "balance_loss_mlp": 0.31957677, "epoch": 0.24866977303472118, "flos": 11728641329280.0, "grad_norm": 6.034747879130095, "language_loss": 0.91327989, "learning_rate": 3.5190209723917662e-06, "loss": 0.93304253, "num_input_tokens_seen": 89100905, "router_z_loss_clip": 3.8046875, "router_z_loss_mlp": 0.38500977, "step": 4136, "time_per_iteration": 2.636693000793457 }, { "auxiliary_loss_clip": 0.0165105, "auxiliary_loss_mlp": 0.00315477, "balance_loss_clip": 1.2525146, "balance_loss_mlp": 0.27702013, "epoch": 0.24872989628738915, "flos": 34823582565120.0, "grad_norm": 3.9493585424694375, "language_loss": 0.78993183, "learning_rate": 3.518767600693314e-06, "loss": 0.80959713, "num_input_tokens_seen": 89122630, "router_z_loss_clip": 3.98632812, "router_z_loss_mlp": 0.38452148, "step": 4137, "time_per_iteration": 2.8064165115356445 }, { "auxiliary_loss_clip": 0.0165912, "auxiliary_loss_mlp": 0.00344801, "balance_loss_clip": 1.25828147, "balance_loss_mlp": 0.30474666, "epoch": 0.2487900195400571, "flos": 13699347062400.0, "grad_norm": 48.897771210315916, "language_loss": 0.74513113, "learning_rate": 3.518514171403042e-06, "loss": 0.76517034, "num_input_tokens_seen": 89141050, "router_z_loss_clip": 4.0078125, "router_z_loss_mlp": 0.40063477, "step": 4138, "time_per_iteration": 2.6587345600128174 }, { "auxiliary_loss_clip": 0.01681615, "auxiliary_loss_mlp": 0.00332011, "balance_loss_clip": 1.27690828, "balance_loss_mlp": 0.2916702, "epoch": 0.24885014279272508, "flos": 25337815297920.0, "grad_norm": 4.99907537749592, "language_loss": 0.88505793, "learning_rate": 3.51826068453056e-06, "loss": 0.90519416, "num_input_tokens_seen": 89160810, "router_z_loss_clip": 4.046875, "router_z_loss_mlp": 0.40332031, "step": 4139, "time_per_iteration": 2.680232524871826 }, { "auxiliary_loss_clip": 0.01670372, "auxiliary_loss_mlp": 0.00368882, "balance_loss_clip": 1.26457274, "balance_loss_mlp": 0.32897103, "epoch": 0.24891026604539307, "flos": 20631434860800.0, "grad_norm": 3.733303265796573, "language_loss": 0.85245162, "learning_rate": 3.518007140085481e-06, "loss": 0.87284416, "num_input_tokens_seen": 89180610, "router_z_loss_clip": 4.06640625, "router_z_loss_mlp": 0.39868164, "step": 4140, "time_per_iteration": 2.649062395095825 }, { "auxiliary_loss_clip": 0.01378362, "auxiliary_loss_mlp": 0.000798, "balance_loss_clip": 1.19434214, "balance_loss_mlp": 0.06644814, "epoch": 0.24897038929806103, "flos": 66960294030720.0, "grad_norm": 0.7916896047721058, "language_loss": 0.6031363, "learning_rate": 3.51775353807742e-06, "loss": 0.61771792, "num_input_tokens_seen": 89241880, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.13378906, "step": 4141, "time_per_iteration": 3.223546028137207 }, { "auxiliary_loss_clip": 0.01667314, "auxiliary_loss_mlp": 0.00338031, "balance_loss_clip": 1.26513076, "balance_loss_mlp": 0.29833466, "epoch": 0.249030512550729, "flos": 36392555612160.0, "grad_norm": 2.2462992065935645, "language_loss": 0.79525411, "learning_rate": 3.5174998785159913e-06, "loss": 0.81530756, "num_input_tokens_seen": 89263340, "router_z_loss_clip": 4.02148438, "router_z_loss_mlp": 0.39672852, "step": 4142, "time_per_iteration": 2.818804979324341 }, { "auxiliary_loss_clip": 0.01677843, "auxiliary_loss_mlp": 0.00368829, "balance_loss_clip": 1.27150941, "balance_loss_mlp": 0.32708192, "epoch": 0.24909063580339696, "flos": 20154576879360.0, "grad_norm": 8.88068899723159, "language_loss": 0.86440003, "learning_rate": 3.5172461614108157e-06, "loss": 0.88486671, "num_input_tokens_seen": 89282870, "router_z_loss_clip": 4.06054688, "router_z_loss_mlp": 0.41723633, "step": 4143, "time_per_iteration": 2.615807294845581 }, { "auxiliary_loss_clip": 0.01673431, "auxiliary_loss_mlp": 0.00350429, "balance_loss_clip": 1.26996374, "balance_loss_mlp": 0.31030327, "epoch": 0.24915075905606493, "flos": 26396569607040.0, "grad_norm": 10.378688528502895, "language_loss": 0.64769375, "learning_rate": 3.5169923867715137e-06, "loss": 0.66793239, "num_input_tokens_seen": 89303830, "router_z_loss_clip": 4.03515625, "router_z_loss_mlp": 0.40136719, "step": 4144, "time_per_iteration": 2.78159236907959 }, { "auxiliary_loss_clip": 0.01682066, "auxiliary_loss_mlp": 0.00320384, "balance_loss_clip": 1.27979279, "balance_loss_mlp": 0.28249896, "epoch": 0.2492108823087329, "flos": 27527216987520.0, "grad_norm": 2.287360875936286, "language_loss": 0.84512997, "learning_rate": 3.516738554607708e-06, "loss": 0.8651545, "num_input_tokens_seen": 89324350, "router_z_loss_clip": 4.015625, "router_z_loss_mlp": 0.37866211, "step": 4145, "time_per_iteration": 2.8103249073028564 }, { "auxiliary_loss_clip": 0.01687868, "auxiliary_loss_mlp": 0.0036843, "balance_loss_clip": 1.28472412, "balance_loss_mlp": 0.32506174, "epoch": 0.24927100556140086, "flos": 16691388111360.0, "grad_norm": 88.56428761737007, "language_loss": 0.76088572, "learning_rate": 3.5164846649290253e-06, "loss": 0.78144872, "num_input_tokens_seen": 89342875, "router_z_loss_clip": 4.03125, "router_z_loss_mlp": 0.43359375, "step": 4146, "time_per_iteration": 2.6481122970581055 }, { "auxiliary_loss_clip": 0.01433006, "auxiliary_loss_mlp": 0.0007412, "balance_loss_clip": 1.24524558, "balance_loss_mlp": 0.05809785, "epoch": 0.24933112881406885, "flos": 62772464286720.0, "grad_norm": 0.9391876255772994, "language_loss": 0.67174792, "learning_rate": 3.5162307177450915e-06, "loss": 0.6868192, "num_input_tokens_seen": 89404925, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.16015625, "step": 4147, "time_per_iteration": 3.293832302093506 }, { "auxiliary_loss_clip": 0.01676762, "auxiliary_loss_mlp": 0.00356581, "balance_loss_clip": 1.27315474, "balance_loss_mlp": 0.31650287, "epoch": 0.24939125206673682, "flos": 26651894457600.0, "grad_norm": 1.6993454730732647, "language_loss": 0.95997477, "learning_rate": 3.5159767130655366e-06, "loss": 0.98030818, "num_input_tokens_seen": 89425090, "router_z_loss_clip": 4.03710938, "router_z_loss_mlp": 0.40087891, "step": 4148, "time_per_iteration": 2.700181245803833 }, { "auxiliary_loss_clip": 0.01702047, "auxiliary_loss_mlp": 0.0036396, "balance_loss_clip": 1.28868723, "balance_loss_mlp": 0.32319051, "epoch": 0.24945137531940478, "flos": 20704333512960.0, "grad_norm": 68.05735301965714, "language_loss": 0.75568092, "learning_rate": 3.5157226508999935e-06, "loss": 0.77634096, "num_input_tokens_seen": 89442615, "router_z_loss_clip": 4.1328125, "router_z_loss_mlp": 0.40795898, "step": 4149, "time_per_iteration": 2.679426431655884 }, { "auxiliary_loss_clip": 0.01679139, "auxiliary_loss_mlp": 0.00361903, "balance_loss_clip": 1.28914356, "balance_loss_mlp": 0.32289767, "epoch": 0.24951149857207275, "flos": 23768662682880.0, "grad_norm": 12.978639419053055, "language_loss": 0.76382625, "learning_rate": 3.515468531258095e-06, "loss": 0.78423667, "num_input_tokens_seen": 89463025, "router_z_loss_clip": 3.90429688, "router_z_loss_mlp": 0.38989258, "step": 4150, "time_per_iteration": 2.6783668994903564 }, { "auxiliary_loss_clip": 0.0165879, "auxiliary_loss_mlp": 0.00374914, "balance_loss_clip": 1.26337111, "balance_loss_mlp": 0.33433527, "epoch": 0.2495716218247407, "flos": 15664881237120.0, "grad_norm": 15.940898924040004, "language_loss": 0.78920627, "learning_rate": 3.515214354149478e-06, "loss": 0.80954325, "num_input_tokens_seen": 89480225, "router_z_loss_clip": 3.95507812, "router_z_loss_mlp": 0.40600586, "step": 4151, "time_per_iteration": 2.6855456829071045 }, { "auxiliary_loss_clip": 0.01670458, "auxiliary_loss_mlp": 0.00378467, "balance_loss_clip": 1.27448392, "balance_loss_mlp": 0.33693409, "epoch": 0.24963174507740868, "flos": 24052499953920.0, "grad_norm": 24.948380849339397, "language_loss": 0.73644125, "learning_rate": 3.514960119583781e-06, "loss": 0.75693047, "num_input_tokens_seen": 89496985, "router_z_loss_clip": 3.9609375, "router_z_loss_mlp": 0.41503906, "step": 4152, "time_per_iteration": 2.7001969814300537 }, { "auxiliary_loss_clip": 0.01693832, "auxiliary_loss_mlp": 0.00368962, "balance_loss_clip": 1.29443991, "balance_loss_mlp": 0.32809728, "epoch": 0.24969186833007664, "flos": 21799501234560.0, "grad_norm": 6.18107207295018, "language_loss": 0.83690363, "learning_rate": 3.514705827570645e-06, "loss": 0.85753155, "num_input_tokens_seen": 89514420, "router_z_loss_clip": 3.99609375, "router_z_loss_mlp": 0.40869141, "step": 4153, "time_per_iteration": 4.045004367828369 }, { "auxiliary_loss_clip": 0.01688524, "auxiliary_loss_mlp": 0.00369848, "balance_loss_clip": 1.29476607, "balance_loss_mlp": 0.33055663, "epoch": 0.24975199158274464, "flos": 19938143479680.0, "grad_norm": 3.0147549487533696, "language_loss": 0.83342552, "learning_rate": 3.514451478119711e-06, "loss": 0.85400921, "num_input_tokens_seen": 89532925, "router_z_loss_clip": 3.93945312, "router_z_loss_mlp": 0.39257812, "step": 4154, "time_per_iteration": 2.659853935241699 }, { "auxiliary_loss_clip": 0.01699165, "auxiliary_loss_mlp": 0.00382485, "balance_loss_clip": 1.29056191, "balance_loss_mlp": 0.33775753, "epoch": 0.2498121148354126, "flos": 25338389915520.0, "grad_norm": 19.498646034506738, "language_loss": 0.76917291, "learning_rate": 3.5141970712406258e-06, "loss": 0.78998947, "num_input_tokens_seen": 89552855, "router_z_loss_clip": 4.08789062, "router_z_loss_mlp": 0.44775391, "step": 4155, "time_per_iteration": 2.6661174297332764 }, { "auxiliary_loss_clip": 0.01713254, "auxiliary_loss_mlp": 0.00394735, "balance_loss_clip": 1.29966712, "balance_loss_mlp": 0.35353652, "epoch": 0.24987223808808057, "flos": 20558787603840.0, "grad_norm": 3.388049981669927, "language_loss": 0.82391047, "learning_rate": 3.513942606943036e-06, "loss": 0.84499037, "num_input_tokens_seen": 89572830, "router_z_loss_clip": 4.13671875, "router_z_loss_mlp": 0.41210938, "step": 4156, "time_per_iteration": 4.044252872467041 }, { "auxiliary_loss_clip": 0.01722149, "auxiliary_loss_mlp": 0.00382439, "balance_loss_clip": 1.31101, "balance_loss_mlp": 0.33880812, "epoch": 0.24993236134074853, "flos": 19749037351680.0, "grad_norm": 8.458180837666884, "language_loss": 0.85453051, "learning_rate": 3.513688085236591e-06, "loss": 0.87557638, "num_input_tokens_seen": 89590345, "router_z_loss_clip": 4.11523438, "router_z_loss_mlp": 0.43652344, "step": 4157, "time_per_iteration": 2.656287431716919 }, { "auxiliary_loss_clip": 0.01697187, "auxiliary_loss_mlp": 0.00382409, "balance_loss_clip": 1.28843164, "balance_loss_mlp": 0.33977976, "epoch": 0.2499924845934165, "flos": 18770292587520.0, "grad_norm": 8.385051716522147, "language_loss": 0.86735475, "learning_rate": 3.513433506130942e-06, "loss": 0.88815069, "num_input_tokens_seen": 89610295, "router_z_loss_clip": 4.08789062, "router_z_loss_mlp": 0.42602539, "step": 4158, "time_per_iteration": 2.6642305850982666 }, { "auxiliary_loss_clip": 0.01746591, "auxiliary_loss_mlp": 0.00377001, "balance_loss_clip": 1.32438135, "balance_loss_mlp": 0.33539721, "epoch": 0.25005260784608446, "flos": 16872198197760.0, "grad_norm": 5.182674637914498, "language_loss": 0.82302916, "learning_rate": 3.5131788696357427e-06, "loss": 0.84426504, "num_input_tokens_seen": 89627795, "router_z_loss_clip": 4.2265625, "router_z_loss_mlp": 0.41601562, "step": 4159, "time_per_iteration": 2.6599841117858887 }, { "auxiliary_loss_clip": 0.01736726, "auxiliary_loss_mlp": 0.00380718, "balance_loss_clip": 1.31545138, "balance_loss_mlp": 0.33498913, "epoch": 0.2501127310987524, "flos": 22124923476480.0, "grad_norm": 4.360900478936708, "language_loss": 0.77357417, "learning_rate": 3.512924175760649e-06, "loss": 0.79474854, "num_input_tokens_seen": 89648090, "router_z_loss_clip": 4.2109375, "router_z_loss_mlp": 0.45703125, "step": 4160, "time_per_iteration": 4.100468158721924 }, { "auxiliary_loss_clip": 0.0147425, "auxiliary_loss_mlp": 0.00069325, "balance_loss_clip": 1.28157544, "balance_loss_mlp": 0.0516822, "epoch": 0.2501728543514204, "flos": 69458061980160.0, "grad_norm": 1.1044878035630226, "language_loss": 0.566706, "learning_rate": 3.5126694245153186e-06, "loss": 0.58214176, "num_input_tokens_seen": 89710345, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.17675781, "step": 4161, "time_per_iteration": 3.2302401065826416 }, { "auxiliary_loss_clip": 0.01750971, "auxiliary_loss_mlp": 0.00400915, "balance_loss_clip": 1.32357037, "balance_loss_mlp": 0.35790455, "epoch": 0.25023297760408836, "flos": 16289978647680.0, "grad_norm": 27.10750325978225, "language_loss": 0.8851552, "learning_rate": 3.5124146159094125e-06, "loss": 0.90667403, "num_input_tokens_seen": 89729390, "router_z_loss_clip": 4.27734375, "router_z_loss_mlp": 0.4296875, "step": 4162, "time_per_iteration": 2.740518093109131 }, { "auxiliary_loss_clip": 0.01757411, "auxiliary_loss_mlp": 0.00400136, "balance_loss_clip": 1.32371676, "balance_loss_mlp": 0.358055, "epoch": 0.2502931008567563, "flos": 12237998140800.0, "grad_norm": 4.924283023869376, "language_loss": 0.94599736, "learning_rate": 3.5121597499525927e-06, "loss": 0.96757287, "num_input_tokens_seen": 89742805, "router_z_loss_clip": 4.34179688, "router_z_loss_mlp": 0.42089844, "step": 4163, "time_per_iteration": 2.5928537845611572 }, { "auxiliary_loss_clip": 0.01773059, "auxiliary_loss_mlp": 0.00403892, "balance_loss_clip": 1.33595204, "balance_loss_mlp": 0.36114311, "epoch": 0.25035322410942434, "flos": 23181882105600.0, "grad_norm": 3.712350454006176, "language_loss": 0.89638305, "learning_rate": 3.5119048266545232e-06, "loss": 0.91815257, "num_input_tokens_seen": 89761145, "router_z_loss_clip": 4.3671875, "router_z_loss_mlp": 0.42749023, "step": 4164, "time_per_iteration": 2.680172920227051 }, { "auxiliary_loss_clip": 0.01740181, "auxiliary_loss_mlp": 0.00399107, "balance_loss_clip": 1.31957436, "balance_loss_mlp": 0.35597757, "epoch": 0.2504133473620923, "flos": 20917534688640.0, "grad_norm": 14.820452076107669, "language_loss": 0.78638607, "learning_rate": 3.5116498460248716e-06, "loss": 0.80777895, "num_input_tokens_seen": 89780905, "router_z_loss_clip": 4.2109375, "router_z_loss_mlp": 0.43139648, "step": 4165, "time_per_iteration": 2.6597282886505127 }, { "auxiliary_loss_clip": 0.01765597, "auxiliary_loss_mlp": 0.00430017, "balance_loss_clip": 1.32789409, "balance_loss_mlp": 0.38321573, "epoch": 0.2504734706147603, "flos": 20776549806720.0, "grad_norm": 8.285484897117435, "language_loss": 0.79103994, "learning_rate": 3.5113948080733062e-06, "loss": 0.81299615, "num_input_tokens_seen": 89799230, "router_z_loss_clip": 4.375, "router_z_loss_mlp": 0.46777344, "step": 4166, "time_per_iteration": 2.6459267139434814 }, { "auxiliary_loss_clip": 0.01725853, "auxiliary_loss_mlp": 0.00375757, "balance_loss_clip": 1.30199277, "balance_loss_mlp": 0.33365217, "epoch": 0.25053359386742824, "flos": 24349373861760.0, "grad_norm": 12.23596311459132, "language_loss": 0.87565714, "learning_rate": 3.5111397128094973e-06, "loss": 0.8966732, "num_input_tokens_seen": 89818240, "router_z_loss_clip": 4.2421875, "router_z_loss_mlp": 0.42138672, "step": 4167, "time_per_iteration": 4.156676530838013 }, { "auxiliary_loss_clip": 0.0172705, "auxiliary_loss_mlp": 0.00371574, "balance_loss_clip": 1.31781411, "balance_loss_mlp": 0.33159083, "epoch": 0.2505937171200962, "flos": 21214336769280.0, "grad_norm": 58.219360444293386, "language_loss": 0.85679185, "learning_rate": 3.51088456024312e-06, "loss": 0.87777817, "num_input_tokens_seen": 89834485, "router_z_loss_clip": 4.09765625, "router_z_loss_mlp": 0.3996582, "step": 4168, "time_per_iteration": 2.630072832107544 }, { "auxiliary_loss_clip": 0.01735012, "auxiliary_loss_mlp": 0.00395198, "balance_loss_clip": 1.30172074, "balance_loss_mlp": 0.34784758, "epoch": 0.25065384037276417, "flos": 41427231379200.0, "grad_norm": 27.149577690611306, "language_loss": 0.78424931, "learning_rate": 3.510629350383849e-06, "loss": 0.80555141, "num_input_tokens_seen": 89855645, "router_z_loss_clip": 4.328125, "router_z_loss_mlp": 0.47412109, "step": 4169, "time_per_iteration": 2.7959632873535156 }, { "auxiliary_loss_clip": 0.01721828, "auxiliary_loss_mlp": 0.00345627, "balance_loss_clip": 1.30208993, "balance_loss_mlp": 0.30366546, "epoch": 0.25071396362543213, "flos": 26102389219200.0, "grad_norm": 75.86475815591345, "language_loss": 0.84749293, "learning_rate": 3.510374083241361e-06, "loss": 0.86816752, "num_input_tokens_seen": 89874895, "router_z_loss_clip": 4.1953125, "router_z_loss_mlp": 0.41967773, "step": 4170, "time_per_iteration": 2.7124831676483154 }, { "auxiliary_loss_clip": 0.01733714, "auxiliary_loss_mlp": 0.00361093, "balance_loss_clip": 1.30724955, "balance_loss_mlp": 0.31979859, "epoch": 0.2507740868781001, "flos": 19098982967040.0, "grad_norm": 4.287048929377042, "language_loss": 0.83214402, "learning_rate": 3.5101187588253368e-06, "loss": 0.85309207, "num_input_tokens_seen": 89891700, "router_z_loss_clip": 4.25, "router_z_loss_mlp": 0.4128418, "step": 4171, "time_per_iteration": 2.702528953552246 }, { "auxiliary_loss_clip": 0.01562006, "auxiliary_loss_mlp": 0.00115442, "balance_loss_clip": 1.34955525, "balance_loss_mlp": 0.10075565, "epoch": 0.25083421013076806, "flos": 64341868296960.0, "grad_norm": 1.4656045544706184, "language_loss": 0.60125053, "learning_rate": 3.509863377145458e-06, "loss": 0.618025, "num_input_tokens_seen": 89955775, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.14648438, "step": 4172, "time_per_iteration": 3.1474382877349854 }, { "auxiliary_loss_clip": 0.0172086, "auxiliary_loss_mlp": 0.00355263, "balance_loss_clip": 1.29701805, "balance_loss_mlp": 0.31375426, "epoch": 0.25089433338343603, "flos": 24279599692800.0, "grad_norm": 379.96847912480564, "language_loss": 0.84558362, "learning_rate": 3.509607938211409e-06, "loss": 0.86634487, "num_input_tokens_seen": 89977150, "router_z_loss_clip": 4.23828125, "router_z_loss_mlp": 0.41552734, "step": 4173, "time_per_iteration": 2.7111175060272217 }, { "auxiliary_loss_clip": 0.01766929, "auxiliary_loss_mlp": 0.00361936, "balance_loss_clip": 1.32879353, "balance_loss_mlp": 0.3186388, "epoch": 0.250954456636104, "flos": 14721472477440.0, "grad_norm": 2.108061658194272, "language_loss": 0.91189992, "learning_rate": 3.509352442032875e-06, "loss": 0.93318856, "num_input_tokens_seen": 89994925, "router_z_loss_clip": 4.37890625, "router_z_loss_mlp": 0.43310547, "step": 4174, "time_per_iteration": 2.747072219848633 }, { "auxiliary_loss_clip": 0.01746269, "auxiliary_loss_mlp": 0.00382044, "balance_loss_clip": 1.32358027, "balance_loss_mlp": 0.3358624, "epoch": 0.25101457988877196, "flos": 22273593868800.0, "grad_norm": 1683.8000366236479, "language_loss": 0.78481686, "learning_rate": 3.509096888619545e-06, "loss": 0.80610001, "num_input_tokens_seen": 90013235, "router_z_loss_clip": 4.23046875, "router_z_loss_mlp": 0.46191406, "step": 4175, "time_per_iteration": 2.7203521728515625 }, { "auxiliary_loss_clip": 0.01723251, "auxiliary_loss_mlp": 0.00368967, "balance_loss_clip": 1.29564548, "balance_loss_mlp": 0.32431072, "epoch": 0.2510747031414399, "flos": 25188929424000.0, "grad_norm": 2.8406437051574653, "language_loss": 0.86460221, "learning_rate": 3.50884127798111e-06, "loss": 0.88552439, "num_input_tokens_seen": 90032150, "router_z_loss_clip": 4.27734375, "router_z_loss_mlp": 0.4465332, "step": 4176, "time_per_iteration": 2.6984965801239014 }, { "auxiliary_loss_clip": 0.01709031, "auxiliary_loss_mlp": 0.00347616, "balance_loss_clip": 1.29871988, "balance_loss_mlp": 0.30779988, "epoch": 0.25113482639410795, "flos": 20704189858560.0, "grad_norm": 9.894091325548603, "language_loss": 0.88829297, "learning_rate": 3.5085856101272623e-06, "loss": 0.90885949, "num_input_tokens_seen": 90049085, "router_z_loss_clip": 4.10351562, "router_z_loss_mlp": 0.39819336, "step": 4177, "time_per_iteration": 2.6713273525238037 }, { "auxiliary_loss_clip": 0.01740001, "auxiliary_loss_mlp": 0.00366186, "balance_loss_clip": 1.32228446, "balance_loss_mlp": 0.32277048, "epoch": 0.2511949496467759, "flos": 21506936958720.0, "grad_norm": 13.24306334502241, "language_loss": 0.90009606, "learning_rate": 3.508329885067698e-06, "loss": 0.92115796, "num_input_tokens_seen": 90067695, "router_z_loss_clip": 4.171875, "router_z_loss_mlp": 0.43383789, "step": 4178, "time_per_iteration": 2.6550631523132324 }, { "auxiliary_loss_clip": 0.01724731, "auxiliary_loss_mlp": 0.00339863, "balance_loss_clip": 1.30957627, "balance_loss_mlp": 0.29823545, "epoch": 0.2512550728994439, "flos": 20701999128960.0, "grad_norm": 16.460488558139318, "language_loss": 0.83062792, "learning_rate": 3.508074102812112e-06, "loss": 0.85127389, "num_input_tokens_seen": 90083890, "router_z_loss_clip": 4.15234375, "router_z_loss_mlp": 0.41650391, "step": 4179, "time_per_iteration": 2.656933546066284 }, { "auxiliary_loss_clip": 0.01721718, "auxiliary_loss_mlp": 0.00365529, "balance_loss_clip": 1.31100845, "balance_loss_mlp": 0.32232744, "epoch": 0.25131519615211184, "flos": 18478626151680.0, "grad_norm": 25.137492552808496, "language_loss": 0.78887159, "learning_rate": 3.507818263370206e-06, "loss": 0.80974406, "num_input_tokens_seen": 90100995, "router_z_loss_clip": 4.10742188, "router_z_loss_mlp": 0.43188477, "step": 4180, "time_per_iteration": 2.676567554473877 }, { "auxiliary_loss_clip": 0.017452, "auxiliary_loss_mlp": 0.00355899, "balance_loss_clip": 1.33292937, "balance_loss_mlp": 0.31701279, "epoch": 0.2513753194047798, "flos": 20484955198080.0, "grad_norm": 48.8172449956689, "language_loss": 0.91048241, "learning_rate": 3.5075623667516796e-06, "loss": 0.93149334, "num_input_tokens_seen": 90120365, "router_z_loss_clip": 4.12109375, "router_z_loss_mlp": 0.38842773, "step": 4181, "time_per_iteration": 2.659618854522705 }, { "auxiliary_loss_clip": 0.01725769, "auxiliary_loss_mlp": 0.00361301, "balance_loss_clip": 1.3215152, "balance_loss_mlp": 0.3193866, "epoch": 0.25143544265744777, "flos": 37670077704960.0, "grad_norm": 41.31115987201298, "language_loss": 0.74679357, "learning_rate": 3.507306412966238e-06, "loss": 0.76766431, "num_input_tokens_seen": 90142610, "router_z_loss_clip": 4.04492188, "router_z_loss_mlp": 0.41918945, "step": 4182, "time_per_iteration": 2.825866222381592 }, { "auxiliary_loss_clip": 0.01496348, "auxiliary_loss_mlp": 0.0014551, "balance_loss_clip": 1.27973676, "balance_loss_mlp": 0.1359729, "epoch": 0.25149556591011574, "flos": 69367457923200.0, "grad_norm": 0.8748226155282093, "language_loss": 0.70048219, "learning_rate": 3.5070504020235853e-06, "loss": 0.71690077, "num_input_tokens_seen": 90200555, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.09521484, "step": 4183, "time_per_iteration": 3.179516315460205 }, { "auxiliary_loss_clip": 0.01713858, "auxiliary_loss_mlp": 0.00335354, "balance_loss_clip": 1.30644178, "balance_loss_mlp": 0.2945841, "epoch": 0.2515556891627837, "flos": 13990402967040.0, "grad_norm": 13.194161648142764, "language_loss": 0.81278026, "learning_rate": 3.506794333933431e-06, "loss": 0.83327246, "num_input_tokens_seen": 90218120, "router_z_loss_clip": 4.07421875, "router_z_loss_mlp": 0.4074707, "step": 4184, "time_per_iteration": 2.6457557678222656 }, { "auxiliary_loss_clip": 0.01703966, "auxiliary_loss_mlp": 0.00336853, "balance_loss_clip": 1.31213653, "balance_loss_mlp": 0.29667962, "epoch": 0.25161581241545167, "flos": 22163527618560.0, "grad_norm": 4.824601058984388, "language_loss": 0.90101665, "learning_rate": 3.506538208705484e-06, "loss": 0.92142493, "num_input_tokens_seen": 90236790, "router_z_loss_clip": 3.921875, "router_z_loss_mlp": 0.40209961, "step": 4185, "time_per_iteration": 2.668301820755005 }, { "auxiliary_loss_clip": 0.01425414, "auxiliary_loss_mlp": 0.00159628, "balance_loss_clip": 1.23715138, "balance_loss_mlp": 0.15195122, "epoch": 0.25167593566811963, "flos": 69358407696000.0, "grad_norm": 0.805679358016476, "language_loss": 0.61374801, "learning_rate": 3.5062820263494574e-06, "loss": 0.62959844, "num_input_tokens_seen": 90297070, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.07666016, "step": 4186, "time_per_iteration": 3.0358080863952637 }, { "auxiliary_loss_clip": 0.01684407, "auxiliary_loss_mlp": 0.00353376, "balance_loss_clip": 1.30350399, "balance_loss_mlp": 0.31353626, "epoch": 0.2517360589207876, "flos": 13261452359040.0, "grad_norm": 7.956608087473492, "language_loss": 0.86799961, "learning_rate": 3.5060257868750656e-06, "loss": 0.88837743, "num_input_tokens_seen": 90315255, "router_z_loss_clip": 3.81054688, "router_z_loss_mlp": 0.39819336, "step": 4187, "time_per_iteration": 2.6614584922790527 }, { "auxiliary_loss_clip": 0.01673042, "auxiliary_loss_mlp": 0.00312322, "balance_loss_clip": 1.30131435, "balance_loss_mlp": 0.27500916, "epoch": 0.25179618217345556, "flos": 20376828282240.0, "grad_norm": 1.9757761599786032, "language_loss": 0.85114944, "learning_rate": 3.5057694902920244e-06, "loss": 0.87100315, "num_input_tokens_seen": 90334990, "router_z_loss_clip": 3.71875, "router_z_loss_mlp": 0.37304688, "step": 4188, "time_per_iteration": 2.6738290786743164 }, { "auxiliary_loss_clip": 0.01677668, "auxiliary_loss_mlp": 0.00314982, "balance_loss_clip": 1.30055034, "balance_loss_mlp": 0.27578634, "epoch": 0.25185630542612353, "flos": 27664718250240.0, "grad_norm": 2.3627564519292568, "language_loss": 0.81015015, "learning_rate": 3.5055131366100534e-06, "loss": 0.83007669, "num_input_tokens_seen": 90351825, "router_z_loss_clip": 3.77148438, "router_z_loss_mlp": 0.39160156, "step": 4189, "time_per_iteration": 2.8758323192596436 }, { "auxiliary_loss_clip": 0.01650569, "auxiliary_loss_mlp": 0.00296342, "balance_loss_clip": 1.28457665, "balance_loss_mlp": 0.26043588, "epoch": 0.25191642867879155, "flos": 20996430912000.0, "grad_norm": 7.851090411967549, "language_loss": 0.91027111, "learning_rate": 3.5052567258388745e-06, "loss": 0.92974019, "num_input_tokens_seen": 90369860, "router_z_loss_clip": 3.6640625, "router_z_loss_mlp": 0.35888672, "step": 4190, "time_per_iteration": 2.817430257797241 }, { "auxiliary_loss_clip": 0.01625086, "auxiliary_loss_mlp": 0.00318007, "balance_loss_clip": 1.26379275, "balance_loss_mlp": 0.27759552, "epoch": 0.2519765519314595, "flos": 21105671149440.0, "grad_norm": 5.153109194520077, "language_loss": 0.83802366, "learning_rate": 3.5050002579882082e-06, "loss": 0.8574546, "num_input_tokens_seen": 90389245, "router_z_loss_clip": 3.61328125, "router_z_loss_mlp": 0.40405273, "step": 4191, "time_per_iteration": 2.6803903579711914 }, { "auxiliary_loss_clip": 0.01372878, "auxiliary_loss_mlp": 0.0011357, "balance_loss_clip": 1.18128633, "balance_loss_mlp": 0.1053211, "epoch": 0.2520366751841275, "flos": 62744993360640.0, "grad_norm": 0.7210883294475887, "language_loss": 0.5648126, "learning_rate": 3.5047437330677823e-06, "loss": 0.5796771, "num_input_tokens_seen": 90456735, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.08251953, "step": 4192, "time_per_iteration": 3.2181878089904785 }, { "auxiliary_loss_clip": 0.01609207, "auxiliary_loss_mlp": 0.00297123, "balance_loss_clip": 1.26611495, "balance_loss_mlp": 0.2615037, "epoch": 0.25209679843679544, "flos": 22230716008320.0, "grad_norm": 2.5153332872618144, "language_loss": 0.82166004, "learning_rate": 3.504487151087323e-06, "loss": 0.84072334, "num_input_tokens_seen": 90474165, "router_z_loss_clip": 3.4296875, "router_z_loss_mlp": 0.35595703, "step": 4193, "time_per_iteration": 2.6363346576690674 }, { "auxiliary_loss_clip": 0.01604959, "auxiliary_loss_mlp": 0.00360085, "balance_loss_clip": 1.25498414, "balance_loss_mlp": 0.32036462, "epoch": 0.2521569216894634, "flos": 12166643773440.0, "grad_norm": 15.25054986623241, "language_loss": 0.91243666, "learning_rate": 3.5042305120565598e-06, "loss": 0.93208712, "num_input_tokens_seen": 90491660, "router_z_loss_clip": 3.5, "router_z_loss_mlp": 0.3972168, "step": 4194, "time_per_iteration": 2.616605520248413 }, { "auxiliary_loss_clip": 0.01589619, "auxiliary_loss_mlp": 0.00330798, "balance_loss_clip": 1.24687767, "balance_loss_mlp": 0.29503548, "epoch": 0.2522170449421314, "flos": 23699786353920.0, "grad_norm": 8.071640701383187, "language_loss": 0.91762942, "learning_rate": 3.5039738159852253e-06, "loss": 0.93683362, "num_input_tokens_seen": 90514025, "router_z_loss_clip": 3.42773438, "router_z_loss_mlp": 0.35766602, "step": 4195, "time_per_iteration": 4.136380195617676 }, { "auxiliary_loss_clip": 0.01615715, "auxiliary_loss_mlp": 0.00306963, "balance_loss_clip": 1.26009738, "balance_loss_mlp": 0.26504895, "epoch": 0.25227716819479934, "flos": 20955456472320.0, "grad_norm": 6.70556622227512, "language_loss": 0.9156577, "learning_rate": 3.503717062883053e-06, "loss": 0.93488455, "num_input_tokens_seen": 90533530, "router_z_loss_clip": 3.55664062, "router_z_loss_mlp": 0.41918945, "step": 4196, "time_per_iteration": 2.66886305809021 }, { "auxiliary_loss_clip": 0.0158347, "auxiliary_loss_mlp": 0.00341941, "balance_loss_clip": 1.23952961, "balance_loss_mlp": 0.30667895, "epoch": 0.2523372914474673, "flos": 23331342597120.0, "grad_norm": 42.29913244984571, "language_loss": 0.88880169, "learning_rate": 3.5034602527597786e-06, "loss": 0.90805578, "num_input_tokens_seen": 90554025, "router_z_loss_clip": 3.44140625, "router_z_loss_mlp": 0.3527832, "step": 4197, "time_per_iteration": 2.747697591781616 }, { "auxiliary_loss_clip": 0.01583687, "auxiliary_loss_mlp": 0.00327252, "balance_loss_clip": 1.23915565, "balance_loss_mlp": 0.28645813, "epoch": 0.25239741470013527, "flos": 36970321875840.0, "grad_norm": 3.4742372347917962, "language_loss": 0.79797244, "learning_rate": 3.5032033856251405e-06, "loss": 0.81708187, "num_input_tokens_seen": 90576930, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.4074707, "step": 4198, "time_per_iteration": 4.180483341217041 }, { "auxiliary_loss_clip": 0.01574, "auxiliary_loss_mlp": 0.00371439, "balance_loss_clip": 1.22474766, "balance_loss_mlp": 0.32661596, "epoch": 0.25245753795280323, "flos": 18515757836160.0, "grad_norm": 4.734608765369558, "language_loss": 0.82854933, "learning_rate": 3.50294646148888e-06, "loss": 0.84800375, "num_input_tokens_seen": 90595710, "router_z_loss_clip": 3.4921875, "router_z_loss_mlp": 0.44799805, "step": 4199, "time_per_iteration": 2.6905980110168457 }, { "auxiliary_loss_clip": 0.01555098, "auxiliary_loss_mlp": 0.00320783, "balance_loss_clip": 1.21420097, "balance_loss_mlp": 0.28122896, "epoch": 0.2525176612054712, "flos": 32344884737280.0, "grad_norm": 28.722720742043276, "language_loss": 0.80940276, "learning_rate": 3.502689480360739e-06, "loss": 0.8281616, "num_input_tokens_seen": 90617945, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.39526367, "step": 4200, "time_per_iteration": 2.7869579792022705 }, { "auxiliary_loss_clip": 0.01571189, "auxiliary_loss_mlp": 0.00319041, "balance_loss_clip": 1.22936296, "balance_loss_mlp": 0.28113225, "epoch": 0.25257778445813917, "flos": 45258217459200.0, "grad_norm": 3.506313192125652, "language_loss": 0.87721264, "learning_rate": 3.5024324422504616e-06, "loss": 0.89611501, "num_input_tokens_seen": 90640855, "router_z_loss_clip": 3.41796875, "router_z_loss_mlp": 0.37915039, "step": 4201, "time_per_iteration": 2.8783254623413086 }, { "auxiliary_loss_clip": 0.01568816, "auxiliary_loss_mlp": 0.00335331, "balance_loss_clip": 1.22597408, "balance_loss_mlp": 0.29830423, "epoch": 0.25263790771080713, "flos": 23367791923200.0, "grad_norm": 5.749991287640342, "language_loss": 0.81828117, "learning_rate": 3.5021753471677965e-06, "loss": 0.83732271, "num_input_tokens_seen": 90661350, "router_z_loss_clip": 3.42773438, "router_z_loss_mlp": 0.37036133, "step": 4202, "time_per_iteration": 4.1148083209991455 }, { "auxiliary_loss_clip": 0.01555945, "auxiliary_loss_mlp": 0.00323972, "balance_loss_clip": 1.22036195, "balance_loss_mlp": 0.28837645, "epoch": 0.25269803096347515, "flos": 18515039564160.0, "grad_norm": 9.629399845386747, "language_loss": 0.81411541, "learning_rate": 3.501918195122491e-06, "loss": 0.83291459, "num_input_tokens_seen": 90680540, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.35620117, "step": 4203, "time_per_iteration": 2.6485085487365723 }, { "auxiliary_loss_clip": 0.01554237, "auxiliary_loss_mlp": 0.00353593, "balance_loss_clip": 1.21712935, "balance_loss_mlp": 0.3138963, "epoch": 0.2527581542161431, "flos": 24610552629120.0, "grad_norm": 120.57566395971459, "language_loss": 0.82850516, "learning_rate": 3.501660986124297e-06, "loss": 0.84758347, "num_input_tokens_seen": 90703460, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.39697266, "step": 4204, "time_per_iteration": 2.7450177669525146 }, { "auxiliary_loss_clip": 0.01539411, "auxiliary_loss_mlp": 0.00287728, "balance_loss_clip": 1.20598948, "balance_loss_mlp": 0.24915197, "epoch": 0.2528182774688111, "flos": 12641275111680.0, "grad_norm": 8.807417309699238, "language_loss": 0.82680202, "learning_rate": 3.5014037201829684e-06, "loss": 0.84507334, "num_input_tokens_seen": 90718815, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.38574219, "step": 4205, "time_per_iteration": 2.723830461502075 }, { "auxiliary_loss_clip": 0.01526451, "auxiliary_loss_mlp": 0.00282255, "balance_loss_clip": 1.20809531, "balance_loss_mlp": 0.24866211, "epoch": 0.25287840072147905, "flos": 46936789879680.0, "grad_norm": 75.79627030497613, "language_loss": 0.80593145, "learning_rate": 3.50114639730826e-06, "loss": 0.82401848, "num_input_tokens_seen": 90742125, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.3359375, "step": 4206, "time_per_iteration": 2.8919711112976074 }, { "auxiliary_loss_clip": 0.01530199, "auxiliary_loss_mlp": 0.0028257, "balance_loss_clip": 1.20517254, "balance_loss_mlp": 0.24842826, "epoch": 0.252938523974147, "flos": 18879712392960.0, "grad_norm": 11.022642687157369, "language_loss": 0.86206937, "learning_rate": 3.5008890175099296e-06, "loss": 0.88019705, "num_input_tokens_seen": 90760785, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.34130859, "step": 4207, "time_per_iteration": 2.6290981769561768 }, { "auxiliary_loss_clip": 0.01516154, "auxiliary_loss_mlp": 0.00303017, "balance_loss_clip": 1.195081, "balance_loss_mlp": 0.2696389, "epoch": 0.252998647226815, "flos": 21434720664960.0, "grad_norm": 321.42915364295374, "language_loss": 0.82478523, "learning_rate": 3.5006315807977375e-06, "loss": 0.84297693, "num_input_tokens_seen": 90780045, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.33349609, "step": 4208, "time_per_iteration": 2.647425889968872 }, { "auxiliary_loss_clip": 0.01525648, "auxiliary_loss_mlp": 0.0028523, "balance_loss_clip": 1.20681, "balance_loss_mlp": 0.25111198, "epoch": 0.25305877047948294, "flos": 25442171285760.0, "grad_norm": 29.090275724091324, "language_loss": 0.75740939, "learning_rate": 3.5003740871814456e-06, "loss": 0.77551818, "num_input_tokens_seen": 90797980, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.34106445, "step": 4209, "time_per_iteration": 4.050754070281982 }, { "auxiliary_loss_clip": 0.01275699, "auxiliary_loss_mlp": 0.00086552, "balance_loss_clip": 1.07934809, "balance_loss_mlp": 0.07820731, "epoch": 0.2531188937321509, "flos": 60185603629440.0, "grad_norm": 0.7519981503578049, "language_loss": 0.55076367, "learning_rate": 3.5001165366708175e-06, "loss": 0.56438619, "num_input_tokens_seen": 90864865, "router_z_loss_clip": 1.9609375, "router_z_loss_mlp": 0.08349609, "step": 4210, "time_per_iteration": 3.2179529666900635 }, { "auxiliary_loss_clip": 0.01521211, "auxiliary_loss_mlp": 0.00302474, "balance_loss_clip": 1.19806159, "balance_loss_mlp": 0.26740283, "epoch": 0.25317901698481887, "flos": 19682387665920.0, "grad_norm": 2.4354892209208345, "language_loss": 0.85134113, "learning_rate": 3.4998589292756204e-06, "loss": 0.869578, "num_input_tokens_seen": 90882885, "router_z_loss_clip": 3.23046875, "router_z_loss_mlp": 0.3503418, "step": 4211, "time_per_iteration": 2.642101764678955 }, { "auxiliary_loss_clip": 0.0153171, "auxiliary_loss_mlp": 0.00291641, "balance_loss_clip": 1.21180427, "balance_loss_mlp": 0.25907263, "epoch": 0.25323914023748684, "flos": 24424355502720.0, "grad_norm": 2.839678746800825, "language_loss": 0.82798111, "learning_rate": 3.499601265005622e-06, "loss": 0.84621465, "num_input_tokens_seen": 90902985, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.32592773, "step": 4212, "time_per_iteration": 2.6925222873687744 }, { "auxiliary_loss_clip": 0.01531661, "auxiliary_loss_mlp": 0.00312455, "balance_loss_clip": 1.20815301, "balance_loss_mlp": 0.27564344, "epoch": 0.2532992634901548, "flos": 25447450584960.0, "grad_norm": 65.12943478300109, "language_loss": 0.62873435, "learning_rate": 3.4993435438705938e-06, "loss": 0.64717555, "num_input_tokens_seen": 90923550, "router_z_loss_clip": 3.24023438, "router_z_loss_mlp": 0.36816406, "step": 4213, "time_per_iteration": 2.6699957847595215 }, { "auxiliary_loss_clip": 0.01523382, "auxiliary_loss_mlp": 0.00301076, "balance_loss_clip": 1.2063601, "balance_loss_mlp": 0.26385936, "epoch": 0.25335938674282277, "flos": 18880538405760.0, "grad_norm": 8.896055469047226, "language_loss": 0.72874963, "learning_rate": 3.499085765880308e-06, "loss": 0.7469942, "num_input_tokens_seen": 90943260, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.37231445, "step": 4214, "time_per_iteration": 2.61181902885437 }, { "auxiliary_loss_clip": 0.01295669, "auxiliary_loss_mlp": 0.00135991, "balance_loss_clip": 1.10108078, "balance_loss_mlp": 0.12759836, "epoch": 0.25341950999549073, "flos": 53062649936640.0, "grad_norm": 3.2014141401046796, "language_loss": 0.57677394, "learning_rate": 3.4988279310445396e-06, "loss": 0.5910905, "num_input_tokens_seen": 90996295, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.08398438, "step": 4215, "time_per_iteration": 2.9706528186798096 }, { "auxiliary_loss_clip": 0.01538172, "auxiliary_loss_mlp": 0.00292261, "balance_loss_clip": 1.21908438, "balance_loss_mlp": 0.25811917, "epoch": 0.2534796332481587, "flos": 39020247054720.0, "grad_norm": 172.05271943626448, "language_loss": 0.90038514, "learning_rate": 3.498570039373066e-06, "loss": 0.91868949, "num_input_tokens_seen": 91017545, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.34155273, "step": 4216, "time_per_iteration": 2.8323018550872803 }, { "auxiliary_loss_clip": 0.01528391, "auxiliary_loss_mlp": 0.00285296, "balance_loss_clip": 1.21001101, "balance_loss_mlp": 0.25206086, "epoch": 0.2535397565008267, "flos": 23586990670080.0, "grad_norm": 4.074770662778696, "language_loss": 0.86563402, "learning_rate": 3.498312090875666e-06, "loss": 0.88377088, "num_input_tokens_seen": 91037715, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.33227539, "step": 4217, "time_per_iteration": 2.680891990661621 }, { "auxiliary_loss_clip": 0.01540522, "auxiliary_loss_mlp": 0.00304752, "balance_loss_clip": 1.22231889, "balance_loss_mlp": 0.27022874, "epoch": 0.2535998797534947, "flos": 19281373251840.0, "grad_norm": 5.752200767529421, "language_loss": 0.83950114, "learning_rate": 3.4980540855621218e-06, "loss": 0.85795391, "num_input_tokens_seen": 91055295, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.34521484, "step": 4218, "time_per_iteration": 2.6433663368225098 }, { "auxiliary_loss_clip": 0.01543577, "auxiliary_loss_mlp": 0.0030859, "balance_loss_clip": 1.22418165, "balance_loss_mlp": 0.27192146, "epoch": 0.25366000300616265, "flos": 24024382583040.0, "grad_norm": 9.055378812375205, "language_loss": 0.81815314, "learning_rate": 3.4977960234422167e-06, "loss": 0.83667481, "num_input_tokens_seen": 91075485, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.36645508, "step": 4219, "time_per_iteration": 2.7160122394561768 }, { "auxiliary_loss_clip": 0.0156381, "auxiliary_loss_mlp": 0.00311778, "balance_loss_clip": 1.23722303, "balance_loss_mlp": 0.2755388, "epoch": 0.2537201262588306, "flos": 16289368116480.0, "grad_norm": 18.78032902138606, "language_loss": 0.86756265, "learning_rate": 3.497537904525736e-06, "loss": 0.88631856, "num_input_tokens_seen": 91093620, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.36230469, "step": 4220, "time_per_iteration": 2.619300603866577 }, { "auxiliary_loss_clip": 0.01555862, "auxiliary_loss_mlp": 0.00334232, "balance_loss_clip": 1.2344842, "balance_loss_mlp": 0.29784924, "epoch": 0.2537802495114986, "flos": 23294677789440.0, "grad_norm": 5.322390171449813, "language_loss": 0.78420442, "learning_rate": 3.497279728822468e-06, "loss": 0.80310535, "num_input_tokens_seen": 91114110, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.36352539, "step": 4221, "time_per_iteration": 2.788115978240967 }, { "auxiliary_loss_clip": 0.01553838, "auxiliary_loss_mlp": 0.00338031, "balance_loss_clip": 1.22697818, "balance_loss_mlp": 0.29962146, "epoch": 0.25384037276416654, "flos": 17639142416640.0, "grad_norm": 7.085697036148153, "language_loss": 0.69304383, "learning_rate": 3.497021496342202e-06, "loss": 0.71196252, "num_input_tokens_seen": 91133135, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.38427734, "step": 4222, "time_per_iteration": 2.655081033706665 }, { "auxiliary_loss_clip": 0.01566229, "auxiliary_loss_mlp": 0.00358182, "balance_loss_clip": 1.24218881, "balance_loss_mlp": 0.32077387, "epoch": 0.2539004960168345, "flos": 21507044699520.0, "grad_norm": 5.033238237654, "language_loss": 0.8004297, "learning_rate": 3.496763207094731e-06, "loss": 0.81967378, "num_input_tokens_seen": 91151805, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.37402344, "step": 4223, "time_per_iteration": 2.730522394180298 }, { "auxiliary_loss_clip": 0.01569542, "auxiliary_loss_mlp": 0.0034086, "balance_loss_clip": 1.24866748, "balance_loss_mlp": 0.30619407, "epoch": 0.2539606192695025, "flos": 23950909313280.0, "grad_norm": 2.50144211063995, "language_loss": 0.85496986, "learning_rate": 3.49650486108985e-06, "loss": 0.87407386, "num_input_tokens_seen": 91172270, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.34643555, "step": 4224, "time_per_iteration": 2.69619083404541 }, { "auxiliary_loss_clip": 0.01556332, "auxiliary_loss_mlp": 0.00346988, "balance_loss_clip": 1.23694634, "balance_loss_mlp": 0.30915052, "epoch": 0.25402074252217044, "flos": 24169784837760.0, "grad_norm": 3.7296491149198823, "language_loss": 0.8279196, "learning_rate": 3.496246458337354e-06, "loss": 0.84695274, "num_input_tokens_seen": 91192080, "router_z_loss_clip": 3.1953125, "router_z_loss_mlp": 0.37792969, "step": 4225, "time_per_iteration": 2.7021117210388184 }, { "auxiliary_loss_clip": 0.01555082, "auxiliary_loss_mlp": 0.00359136, "balance_loss_clip": 1.2422998, "balance_loss_mlp": 0.32208586, "epoch": 0.2540808657748384, "flos": 22303758314880.0, "grad_norm": 7.718800424302671, "language_loss": 0.89968348, "learning_rate": 3.4959879988470426e-06, "loss": 0.91882569, "num_input_tokens_seen": 91211450, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.37036133, "step": 4226, "time_per_iteration": 2.6514675617218018 }, { "auxiliary_loss_clip": 0.0153839, "auxiliary_loss_mlp": 0.00333168, "balance_loss_clip": 1.22410047, "balance_loss_mlp": 0.29723793, "epoch": 0.25414098902750637, "flos": 27599541022080.0, "grad_norm": 11.675907811798416, "language_loss": 0.77429795, "learning_rate": 3.4957294826287164e-06, "loss": 0.79301351, "num_input_tokens_seen": 91231835, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.35913086, "step": 4227, "time_per_iteration": 2.7726974487304688 }, { "auxiliary_loss_clip": 0.0137435, "auxiliary_loss_mlp": 0.00197491, "balance_loss_clip": 1.20258451, "balance_loss_mlp": 0.1893373, "epoch": 0.25420111228017434, "flos": 58170834887040.0, "grad_norm": 1.17377216590894, "language_loss": 0.61718476, "learning_rate": 3.4954709096921785e-06, "loss": 0.63290322, "num_input_tokens_seen": 91288755, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.08154297, "step": 4228, "time_per_iteration": 3.0211005210876465 }, { "auxiliary_loss_clip": 0.01555417, "auxiliary_loss_mlp": 0.00334805, "balance_loss_clip": 1.23528361, "balance_loss_mlp": 0.29777879, "epoch": 0.2542612355328423, "flos": 11464409905920.0, "grad_norm": 115.09788547368345, "language_loss": 0.95322537, "learning_rate": 3.4952122800472336e-06, "loss": 0.97212756, "num_input_tokens_seen": 91302485, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.37036133, "step": 4229, "time_per_iteration": 2.5889062881469727 }, { "auxiliary_loss_clip": 0.01563052, "auxiliary_loss_mlp": 0.003312, "balance_loss_clip": 1.25449443, "balance_loss_mlp": 0.29488844, "epoch": 0.2543213587855103, "flos": 22965879669120.0, "grad_norm": 12.378293995279423, "language_loss": 0.83237767, "learning_rate": 3.4949535937036892e-06, "loss": 0.85132015, "num_input_tokens_seen": 91321120, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.36303711, "step": 4230, "time_per_iteration": 2.6581215858459473 }, { "auxiliary_loss_clip": 0.01543072, "auxiliary_loss_mlp": 0.00356755, "balance_loss_clip": 1.23031211, "balance_loss_mlp": 0.31820303, "epoch": 0.2543814820381783, "flos": 18253178438400.0, "grad_norm": 11.142742653261285, "language_loss": 0.80322611, "learning_rate": 3.4946948506713544e-06, "loss": 0.82222444, "num_input_tokens_seen": 91338575, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.38574219, "step": 4231, "time_per_iteration": 2.6209990978240967 }, { "auxiliary_loss_clip": 0.01556348, "auxiliary_loss_mlp": 0.00347078, "balance_loss_clip": 1.23863435, "balance_loss_mlp": 0.31234065, "epoch": 0.25444160529084625, "flos": 15632705629440.0, "grad_norm": 7.895121350920693, "language_loss": 0.80303478, "learning_rate": 3.4944360509600416e-06, "loss": 0.82206905, "num_input_tokens_seen": 91357355, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.34765625, "step": 4232, "time_per_iteration": 2.654867172241211 }, { "auxiliary_loss_clip": 0.01543942, "auxiliary_loss_mlp": 0.00359716, "balance_loss_clip": 1.2314595, "balance_loss_mlp": 0.32400137, "epoch": 0.2545017285435142, "flos": 24601610142720.0, "grad_norm": 3.3498167264767993, "language_loss": 0.9206326, "learning_rate": 3.4941771945795637e-06, "loss": 0.93966913, "num_input_tokens_seen": 91376515, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.35693359, "step": 4233, "time_per_iteration": 2.6950345039367676 }, { "auxiliary_loss_clip": 0.0153012, "auxiliary_loss_mlp": 0.00313456, "balance_loss_clip": 1.22450542, "balance_loss_mlp": 0.28081632, "epoch": 0.2545618517961822, "flos": 24679069822080.0, "grad_norm": 7.002956704915364, "language_loss": 0.78518867, "learning_rate": 3.493918281539737e-06, "loss": 0.80362439, "num_input_tokens_seen": 91397595, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.32641602, "step": 4234, "time_per_iteration": 2.7019336223602295 }, { "auxiliary_loss_clip": 0.01541375, "auxiliary_loss_mlp": 0.00355886, "balance_loss_clip": 1.22939312, "balance_loss_mlp": 0.32174408, "epoch": 0.25462197504885015, "flos": 23915106432000.0, "grad_norm": 10.402531013342383, "language_loss": 0.8060208, "learning_rate": 3.493659311850379e-06, "loss": 0.82499343, "num_input_tokens_seen": 91417775, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.34179688, "step": 4235, "time_per_iteration": 2.7035655975341797 }, { "auxiliary_loss_clip": 0.01569076, "auxiliary_loss_mlp": 0.00388091, "balance_loss_clip": 1.2464385, "balance_loss_mlp": 0.34891897, "epoch": 0.2546820983015181, "flos": 24789387467520.0, "grad_norm": 3.7740590285915814, "language_loss": 0.75472283, "learning_rate": 3.4934002855213106e-06, "loss": 0.7742945, "num_input_tokens_seen": 91437665, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.39160156, "step": 4236, "time_per_iteration": 2.866189956665039 }, { "auxiliary_loss_clip": 0.01566775, "auxiliary_loss_mlp": 0.00353041, "balance_loss_clip": 1.24700987, "balance_loss_mlp": 0.31861368, "epoch": 0.2547422215541861, "flos": 18734130570240.0, "grad_norm": 25.45997715702305, "language_loss": 0.72652656, "learning_rate": 3.493141202562354e-06, "loss": 0.74572468, "num_input_tokens_seen": 91456705, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.34448242, "step": 4237, "time_per_iteration": 4.105910062789917 }, { "auxiliary_loss_clip": 0.01557582, "auxiliary_loss_mlp": 0.00351021, "balance_loss_clip": 1.23008311, "balance_loss_mlp": 0.3124209, "epoch": 0.25480234480685404, "flos": 21032449274880.0, "grad_norm": 5.970043901183035, "language_loss": 0.80425054, "learning_rate": 3.492882062983333e-06, "loss": 0.8233366, "num_input_tokens_seen": 91475535, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.38623047, "step": 4238, "time_per_iteration": 2.638857126235962 }, { "auxiliary_loss_clip": 0.01557818, "auxiliary_loss_mlp": 0.00347302, "balance_loss_clip": 1.2334609, "balance_loss_mlp": 0.30934554, "epoch": 0.254862468059522, "flos": 25082167224960.0, "grad_norm": 31.52544064120883, "language_loss": 0.8644948, "learning_rate": 3.492622866794074e-06, "loss": 0.88354599, "num_input_tokens_seen": 91499140, "router_z_loss_clip": 3.24414062, "router_z_loss_mlp": 0.37963867, "step": 4239, "time_per_iteration": 2.6800105571746826 }, { "auxiliary_loss_clip": 0.01561366, "auxiliary_loss_mlp": 0.00368979, "balance_loss_clip": 1.24142289, "balance_loss_mlp": 0.32913893, "epoch": 0.25492259131219, "flos": 20558392554240.0, "grad_norm": 68.89486892408794, "language_loss": 0.82912207, "learning_rate": 3.492363614004407e-06, "loss": 0.84842545, "num_input_tokens_seen": 91518335, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.39819336, "step": 4240, "time_per_iteration": 2.6444733142852783 }, { "auxiliary_loss_clip": 0.01574133, "auxiliary_loss_mlp": 0.00396973, "balance_loss_clip": 1.24367714, "balance_loss_mlp": 0.3557744, "epoch": 0.25498271456485794, "flos": 25042485674880.0, "grad_norm": 3.4659288683785596, "language_loss": 0.90255392, "learning_rate": 3.492104304624162e-06, "loss": 0.92226499, "num_input_tokens_seen": 91537655, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.41210938, "step": 4241, "time_per_iteration": 4.037973403930664 }, { "auxiliary_loss_clip": 0.01578156, "auxiliary_loss_mlp": 0.00363639, "balance_loss_clip": 1.25023651, "balance_loss_mlp": 0.32825744, "epoch": 0.2550428378175259, "flos": 26178412354560.0, "grad_norm": 25.210081573691333, "language_loss": 0.79022247, "learning_rate": 3.4918449386631725e-06, "loss": 0.80964041, "num_input_tokens_seen": 91557545, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.35375977, "step": 4242, "time_per_iteration": 2.801414728164673 }, { "auxiliary_loss_clip": 0.0157073, "auxiliary_loss_mlp": 0.00341454, "balance_loss_clip": 1.24361038, "balance_loss_mlp": 0.30683526, "epoch": 0.2551029610701939, "flos": 15267170874240.0, "grad_norm": 13.06068062012221, "language_loss": 0.8026762, "learning_rate": 3.491585516131273e-06, "loss": 0.82179809, "num_input_tokens_seen": 91574405, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.34643555, "step": 4243, "time_per_iteration": 2.6196041107177734 }, { "auxiliary_loss_clip": 0.01570652, "auxiliary_loss_mlp": 0.00374247, "balance_loss_clip": 1.24193931, "balance_loss_mlp": 0.33488452, "epoch": 0.2551630843228619, "flos": 18112193556480.0, "grad_norm": 104.11673399723135, "language_loss": 0.87608498, "learning_rate": 3.491326037038301e-06, "loss": 0.89553404, "num_input_tokens_seen": 91593755, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.39379883, "step": 4244, "time_per_iteration": 4.106152057647705 }, { "auxiliary_loss_clip": 0.0127739, "auxiliary_loss_mlp": 0.00184849, "balance_loss_clip": 1.10293484, "balance_loss_mlp": 0.17645696, "epoch": 0.25522320757552985, "flos": 70520192167680.0, "grad_norm": 0.6923643241181449, "language_loss": 0.57200611, "learning_rate": 3.4910665013940967e-06, "loss": 0.5866285, "num_input_tokens_seen": 91660335, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.08398438, "step": 4245, "time_per_iteration": 3.2411646842956543 }, { "auxiliary_loss_clip": 0.01584353, "auxiliary_loss_mlp": 0.00405443, "balance_loss_clip": 1.24506044, "balance_loss_mlp": 0.36054856, "epoch": 0.2552833308281978, "flos": 22893088757760.0, "grad_norm": 121.86447046187399, "language_loss": 0.73609692, "learning_rate": 3.4908069092085015e-06, "loss": 0.75599492, "num_input_tokens_seen": 91678500, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.44897461, "step": 4246, "time_per_iteration": 2.6821351051330566 }, { "auxiliary_loss_clip": 0.01572623, "auxiliary_loss_mlp": 0.00402169, "balance_loss_clip": 1.24350858, "balance_loss_mlp": 0.36316413, "epoch": 0.2553434540808658, "flos": 22053605022720.0, "grad_norm": 3.0970510035740286, "language_loss": 0.85547251, "learning_rate": 3.4905472604913585e-06, "loss": 0.87522042, "num_input_tokens_seen": 91696430, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.38964844, "step": 4247, "time_per_iteration": 2.657200336456299 }, { "auxiliary_loss_clip": 0.01620953, "auxiliary_loss_mlp": 0.00454791, "balance_loss_clip": 1.26221287, "balance_loss_mlp": 0.40968269, "epoch": 0.25540357733353375, "flos": 16544190176640.0, "grad_norm": 12.93149386933323, "language_loss": 0.90837109, "learning_rate": 3.490287555252514e-06, "loss": 0.92912853, "num_input_tokens_seen": 91713270, "router_z_loss_clip": 3.58398438, "router_z_loss_mlp": 0.45117188, "step": 4248, "time_per_iteration": 2.7281994819641113 }, { "auxiliary_loss_clip": 0.01601338, "auxiliary_loss_mlp": 0.0040274, "balance_loss_clip": 1.25351977, "balance_loss_mlp": 0.36182764, "epoch": 0.2554637005862017, "flos": 17565022702080.0, "grad_norm": 58.06943548327985, "language_loss": 0.90978038, "learning_rate": 3.4900277935018166e-06, "loss": 0.92982119, "num_input_tokens_seen": 91728865, "router_z_loss_clip": 3.4765625, "router_z_loss_mlp": 0.40917969, "step": 4249, "time_per_iteration": 2.6962203979492188 }, { "auxiliary_loss_clip": 0.01283519, "auxiliary_loss_mlp": 0.00151265, "balance_loss_clip": 1.10314035, "balance_loss_mlp": 0.1431583, "epoch": 0.2555238238388697, "flos": 72244763953920.0, "grad_norm": 0.7234700626928396, "language_loss": 0.55921382, "learning_rate": 3.489767975249115e-06, "loss": 0.57356167, "num_input_tokens_seen": 91787470, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.08105469, "step": 4250, "time_per_iteration": 3.152906894683838 }, { "auxiliary_loss_clip": 0.01600296, "auxiliary_loss_mlp": 0.00401073, "balance_loss_clip": 1.24474072, "balance_loss_mlp": 0.36094749, "epoch": 0.25558394709153764, "flos": 24389414547840.0, "grad_norm": 40.99894601352705, "language_loss": 0.87354898, "learning_rate": 3.4895081005042632e-06, "loss": 0.89356267, "num_input_tokens_seen": 91805640, "router_z_loss_clip": 3.55078125, "router_z_loss_mlp": 0.40112305, "step": 4251, "time_per_iteration": 2.67527437210083 }, { "auxiliary_loss_clip": 0.01284076, "auxiliary_loss_mlp": 0.00138508, "balance_loss_clip": 1.10090721, "balance_loss_mlp": 0.12987719, "epoch": 0.2556440703442056, "flos": 69231213636480.0, "grad_norm": 0.8541731565296798, "language_loss": 0.66198182, "learning_rate": 3.4892481692771146e-06, "loss": 0.67620766, "num_input_tokens_seen": 91869695, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.08642578, "step": 4252, "time_per_iteration": 4.534997224807739 }, { "auxiliary_loss_clip": 0.01595341, "auxiliary_loss_mlp": 0.00377438, "balance_loss_clip": 1.25456691, "balance_loss_mlp": 0.34174639, "epoch": 0.2557041935968736, "flos": 24863902231680.0, "grad_norm": 10.682323469473737, "language_loss": 0.80098146, "learning_rate": 3.4889881815775267e-06, "loss": 0.82070929, "num_input_tokens_seen": 91889920, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.35668945, "step": 4253, "time_per_iteration": 2.663130521774292 }, { "auxiliary_loss_clip": 0.0161196, "auxiliary_loss_mlp": 0.00369733, "balance_loss_clip": 1.26501918, "balance_loss_mlp": 0.33220643, "epoch": 0.25576431684954154, "flos": 22492110257280.0, "grad_norm": 14.924121978140311, "language_loss": 0.78818291, "learning_rate": 3.488728137415357e-06, "loss": 0.80799985, "num_input_tokens_seen": 91908665, "router_z_loss_clip": 3.47070312, "router_z_loss_mlp": 0.37548828, "step": 4254, "time_per_iteration": 2.632819175720215 }, { "auxiliary_loss_clip": 0.01609731, "auxiliary_loss_mlp": 0.00412729, "balance_loss_clip": 1.25645936, "balance_loss_mlp": 0.37195963, "epoch": 0.2558244401022095, "flos": 19826748426240.0, "grad_norm": 3.0176308131749168, "language_loss": 0.86101925, "learning_rate": 3.4884680368004675e-06, "loss": 0.88124382, "num_input_tokens_seen": 91927855, "router_z_loss_clip": 3.5390625, "router_z_loss_mlp": 0.4074707, "step": 4255, "time_per_iteration": 2.6358561515808105 }, { "auxiliary_loss_clip": 0.01590787, "auxiliary_loss_mlp": 0.00375015, "balance_loss_clip": 1.24432445, "balance_loss_mlp": 0.33681998, "epoch": 0.2558845633548775, "flos": 23220486247680.0, "grad_norm": 13.130333463635614, "language_loss": 0.89639199, "learning_rate": 3.488207879742721e-06, "loss": 0.91604996, "num_input_tokens_seen": 91948500, "router_z_loss_clip": 3.46289062, "router_z_loss_mlp": 0.38183594, "step": 4256, "time_per_iteration": 2.6508142948150635 }, { "auxiliary_loss_clip": 0.01630649, "auxiliary_loss_mlp": 0.00411186, "balance_loss_clip": 1.27241397, "balance_loss_mlp": 0.37160835, "epoch": 0.2559446866075455, "flos": 16837867774080.0, "grad_norm": 2.0854665398401577, "language_loss": 0.82245088, "learning_rate": 3.4879476662519826e-06, "loss": 0.84286922, "num_input_tokens_seen": 91968375, "router_z_loss_clip": 3.58398438, "router_z_loss_mlp": 0.39599609, "step": 4257, "time_per_iteration": 2.65822696685791 }, { "auxiliary_loss_clip": 0.01314279, "auxiliary_loss_mlp": 0.00136997, "balance_loss_clip": 1.12814426, "balance_loss_mlp": 0.130178, "epoch": 0.25600480986021346, "flos": 57593786895360.0, "grad_norm": 0.7917209406458895, "language_loss": 0.64963633, "learning_rate": 3.4876873963381196e-06, "loss": 0.66414911, "num_input_tokens_seen": 92028490, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.06835938, "step": 4258, "time_per_iteration": 3.1186623573303223 }, { "auxiliary_loss_clip": 0.01617232, "auxiliary_loss_mlp": 0.00394954, "balance_loss_clip": 1.26560569, "balance_loss_mlp": 0.35282522, "epoch": 0.2560649331128814, "flos": 27819529868160.0, "grad_norm": 36.88756584068166, "language_loss": 0.82149684, "learning_rate": 3.4874270700110013e-06, "loss": 0.84161872, "num_input_tokens_seen": 92048060, "router_z_loss_clip": 3.515625, "router_z_loss_mlp": 0.42138672, "step": 4259, "time_per_iteration": 2.7915873527526855 }, { "auxiliary_loss_clip": 0.01299911, "auxiliary_loss_mlp": 0.00128024, "balance_loss_clip": 1.11840034, "balance_loss_mlp": 0.12101467, "epoch": 0.2561250563655494, "flos": 70950509101440.0, "grad_norm": 0.7887197038478593, "language_loss": 0.58340627, "learning_rate": 3.4871666872804994e-06, "loss": 0.59768564, "num_input_tokens_seen": 92118180, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.0703125, "step": 4260, "time_per_iteration": 3.2170069217681885 }, { "auxiliary_loss_clip": 0.0160055, "auxiliary_loss_mlp": 0.00371382, "balance_loss_clip": 1.25319278, "balance_loss_mlp": 0.33344954, "epoch": 0.25618517961821735, "flos": 27012329481600.0, "grad_norm": 655.9381536059058, "language_loss": 0.82652152, "learning_rate": 3.4869062481564875e-06, "loss": 0.84624088, "num_input_tokens_seen": 92137570, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.37939453, "step": 4261, "time_per_iteration": 2.7703909873962402 }, { "auxiliary_loss_clip": 0.01591277, "auxiliary_loss_mlp": 0.00342695, "balance_loss_clip": 1.2477951, "balance_loss_mlp": 0.30731374, "epoch": 0.2562453028708853, "flos": 23068296322560.0, "grad_norm": 9.727791640840573, "language_loss": 0.86148041, "learning_rate": 3.486645752648842e-06, "loss": 0.88082016, "num_input_tokens_seen": 92157625, "router_z_loss_clip": 3.43554688, "router_z_loss_mlp": 0.35400391, "step": 4262, "time_per_iteration": 2.7573652267456055 }, { "auxiliary_loss_clip": 0.01581384, "auxiliary_loss_mlp": 0.00406145, "balance_loss_clip": 1.24133468, "balance_loss_mlp": 0.36609069, "epoch": 0.2563054261235533, "flos": 15120942606720.0, "grad_norm": 3.9580665729128177, "language_loss": 0.82364696, "learning_rate": 3.4863852007674405e-06, "loss": 0.84352225, "num_input_tokens_seen": 92175350, "router_z_loss_clip": 3.40039062, "router_z_loss_mlp": 0.40039062, "step": 4263, "time_per_iteration": 2.6295158863067627 }, { "auxiliary_loss_clip": 0.0160586, "auxiliary_loss_mlp": 0.0033018, "balance_loss_clip": 1.26005793, "balance_loss_mlp": 0.2941789, "epoch": 0.25636554937622125, "flos": 27854865872640.0, "grad_norm": 24.479787640707276, "language_loss": 0.87377059, "learning_rate": 3.486124592522163e-06, "loss": 0.89313102, "num_input_tokens_seen": 92196070, "router_z_loss_clip": 3.45703125, "router_z_loss_mlp": 0.36035156, "step": 4264, "time_per_iteration": 2.7303380966186523 }, { "auxiliary_loss_clip": 0.01601949, "auxiliary_loss_mlp": 0.00351841, "balance_loss_clip": 1.25344443, "balance_loss_mlp": 0.31331247, "epoch": 0.2564256726288892, "flos": 28906509288960.0, "grad_norm": 3.652980672540202, "language_loss": 0.80776316, "learning_rate": 3.4858639279228924e-06, "loss": 0.82730103, "num_input_tokens_seen": 92216310, "router_z_loss_clip": 3.484375, "router_z_loss_mlp": 0.38525391, "step": 4265, "time_per_iteration": 2.7210803031921387 }, { "auxiliary_loss_clip": 0.01596456, "auxiliary_loss_mlp": 0.00341342, "balance_loss_clip": 1.24985456, "balance_loss_mlp": 0.30221707, "epoch": 0.2564857958815572, "flos": 18514931823360.0, "grad_norm": 3.7265312932983075, "language_loss": 0.88204211, "learning_rate": 3.485603206979513e-06, "loss": 0.90142012, "num_input_tokens_seen": 92234510, "router_z_loss_clip": 3.46679688, "router_z_loss_mlp": 0.39111328, "step": 4266, "time_per_iteration": 2.6343631744384766 }, { "auxiliary_loss_clip": 0.01574934, "auxiliary_loss_mlp": 0.00348282, "balance_loss_clip": 1.2363081, "balance_loss_mlp": 0.31096992, "epoch": 0.25654591913422514, "flos": 25808280658560.0, "grad_norm": 140.73430313468518, "language_loss": 0.83885169, "learning_rate": 3.4853424297019103e-06, "loss": 0.85808384, "num_input_tokens_seen": 92254070, "router_z_loss_clip": 3.38476562, "router_z_loss_mlp": 0.37329102, "step": 4267, "time_per_iteration": 2.696073055267334 }, { "auxiliary_loss_clip": 0.01593715, "auxiliary_loss_mlp": 0.00326718, "balance_loss_clip": 1.26043797, "balance_loss_mlp": 0.28985846, "epoch": 0.2566060423868931, "flos": 19099665325440.0, "grad_norm": 18.95924583779467, "language_loss": 0.8391819, "learning_rate": 3.4850815960999736e-06, "loss": 0.85838628, "num_input_tokens_seen": 92275060, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.36865234, "step": 4268, "time_per_iteration": 2.686746597290039 }, { "auxiliary_loss_clip": 0.01561113, "auxiliary_loss_mlp": 0.0038754, "balance_loss_clip": 1.22384131, "balance_loss_mlp": 0.34624612, "epoch": 0.25666616563956113, "flos": 23842674656640.0, "grad_norm": 7.499204100822425, "language_loss": 0.73866612, "learning_rate": 3.484820706183595e-06, "loss": 0.75815266, "num_input_tokens_seen": 92293610, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.4128418, "step": 4269, "time_per_iteration": 2.7734789848327637 }, { "auxiliary_loss_clip": 0.01572622, "auxiliary_loss_mlp": 0.0037158, "balance_loss_clip": 1.23302507, "balance_loss_mlp": 0.33345681, "epoch": 0.2567262888922291, "flos": 14604259420800.0, "grad_norm": 18.997769613455297, "language_loss": 0.89150369, "learning_rate": 3.484559759962666e-06, "loss": 0.91094571, "num_input_tokens_seen": 92308305, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.38134766, "step": 4270, "time_per_iteration": 2.7033731937408447 }, { "auxiliary_loss_clip": 0.01604338, "auxiliary_loss_mlp": 0.00382277, "balance_loss_clip": 1.25166416, "balance_loss_mlp": 0.33959994, "epoch": 0.25678641214489706, "flos": 32923117877760.0, "grad_norm": 10.76505531285129, "language_loss": 0.76590908, "learning_rate": 3.4842987574470816e-06, "loss": 0.78577518, "num_input_tokens_seen": 92329875, "router_z_loss_clip": 3.52539062, "router_z_loss_mlp": 0.42675781, "step": 4271, "time_per_iteration": 2.7528584003448486 }, { "auxiliary_loss_clip": 0.01603635, "auxiliary_loss_mlp": 0.0041348, "balance_loss_clip": 1.24735165, "balance_loss_mlp": 0.37075499, "epoch": 0.256846535397565, "flos": 24098933260800.0, "grad_norm": 54.62397528591798, "language_loss": 0.90503711, "learning_rate": 3.4840376986467403e-06, "loss": 0.92520821, "num_input_tokens_seen": 92348780, "router_z_loss_clip": 3.5625, "router_z_loss_mlp": 0.42700195, "step": 4272, "time_per_iteration": 2.693373441696167 }, { "auxiliary_loss_clip": 0.01601696, "auxiliary_loss_mlp": 0.00384962, "balance_loss_clip": 1.25500405, "balance_loss_mlp": 0.34316683, "epoch": 0.256906658650233, "flos": 19718441942400.0, "grad_norm": 119.8238468301448, "language_loss": 0.89091432, "learning_rate": 3.483776583571541e-06, "loss": 0.91078091, "num_input_tokens_seen": 92368175, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.41796875, "step": 4273, "time_per_iteration": 2.635831117630005 }, { "auxiliary_loss_clip": 0.01569391, "auxiliary_loss_mlp": 0.00348969, "balance_loss_clip": 1.23402607, "balance_loss_mlp": 0.3118712, "epoch": 0.25696678190290095, "flos": 22926018551040.0, "grad_norm": 3.846553842427074, "language_loss": 0.82876581, "learning_rate": 3.4835154122313846e-06, "loss": 0.84794939, "num_input_tokens_seen": 92387755, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.37109375, "step": 4274, "time_per_iteration": 2.711742401123047 }, { "auxiliary_loss_clip": 0.01556831, "auxiliary_loss_mlp": 0.00350051, "balance_loss_clip": 1.22377217, "balance_loss_mlp": 0.31455049, "epoch": 0.2570269051555689, "flos": 27307838672640.0, "grad_norm": 184.48437825694162, "language_loss": 0.90028185, "learning_rate": 3.4832541846361743e-06, "loss": 0.91935074, "num_input_tokens_seen": 92409850, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.35498047, "step": 4275, "time_per_iteration": 2.764719247817993 }, { "auxiliary_loss_clip": 0.01597012, "auxiliary_loss_mlp": 0.00404477, "balance_loss_clip": 1.2432121, "balance_loss_mlp": 0.35912991, "epoch": 0.2570870284082369, "flos": 27563414918400.0, "grad_norm": 12.402653841018454, "language_loss": 0.83852744, "learning_rate": 3.4829929007958175e-06, "loss": 0.85854232, "num_input_tokens_seen": 92431250, "router_z_loss_clip": 3.5390625, "router_z_loss_mlp": 0.45336914, "step": 4276, "time_per_iteration": 2.738550901412964 }, { "auxiliary_loss_clip": 0.01579367, "auxiliary_loss_mlp": 0.0040313, "balance_loss_clip": 1.23732352, "balance_loss_mlp": 0.36162144, "epoch": 0.25714715166090485, "flos": 28730834847360.0, "grad_norm": 5.314268200947001, "language_loss": 0.85130978, "learning_rate": 3.4827315607202214e-06, "loss": 0.8711347, "num_input_tokens_seen": 92452065, "router_z_loss_clip": 3.41796875, "router_z_loss_mlp": 0.41503906, "step": 4277, "time_per_iteration": 2.7512271404266357 }, { "auxiliary_loss_clip": 0.01596766, "auxiliary_loss_mlp": 0.00381432, "balance_loss_clip": 1.25220597, "balance_loss_mlp": 0.33980396, "epoch": 0.2572072749135728, "flos": 20116152305280.0, "grad_norm": 10.27914051228256, "language_loss": 0.84481692, "learning_rate": 3.482470164419295e-06, "loss": 0.86459887, "num_input_tokens_seen": 92470025, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.41650391, "step": 4278, "time_per_iteration": 2.641101598739624 }, { "auxiliary_loss_clip": 0.01611673, "auxiliary_loss_mlp": 0.0039117, "balance_loss_clip": 1.25830817, "balance_loss_mlp": 0.3509016, "epoch": 0.2572673981662408, "flos": 26030855283840.0, "grad_norm": 5.104598185502512, "language_loss": 0.81155163, "learning_rate": 3.482208711902952e-06, "loss": 0.83158004, "num_input_tokens_seen": 92489825, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.40283203, "step": 4279, "time_per_iteration": 2.6795058250427246 }, { "auxiliary_loss_clip": 0.0161259, "auxiliary_loss_mlp": 0.00418857, "balance_loss_clip": 1.26013613, "balance_loss_mlp": 0.37196046, "epoch": 0.25732752141890874, "flos": 16106618695680.0, "grad_norm": 2.897643250747758, "language_loss": 0.91896927, "learning_rate": 3.4819472031811065e-06, "loss": 0.93928373, "num_input_tokens_seen": 92507270, "router_z_loss_clip": 3.52539062, "router_z_loss_mlp": 0.46899414, "step": 4280, "time_per_iteration": 4.12591814994812 }, { "auxiliary_loss_clip": 0.0160431, "auxiliary_loss_mlp": 0.00408189, "balance_loss_clip": 1.25457716, "balance_loss_mlp": 0.36610764, "epoch": 0.2573876446715767, "flos": 22524429519360.0, "grad_norm": 125.36500864329933, "language_loss": 0.85371846, "learning_rate": 3.4816856382636744e-06, "loss": 0.87384343, "num_input_tokens_seen": 92526300, "router_z_loss_clip": 3.49414062, "router_z_loss_mlp": 0.42089844, "step": 4281, "time_per_iteration": 2.8399553298950195 }, { "auxiliary_loss_clip": 0.01604141, "auxiliary_loss_mlp": 0.00411512, "balance_loss_clip": 1.26104164, "balance_loss_mlp": 0.36654642, "epoch": 0.2574477679242447, "flos": 23950837486080.0, "grad_norm": 61.39868000406039, "language_loss": 0.91258442, "learning_rate": 3.4814240171605737e-06, "loss": 0.93274093, "num_input_tokens_seen": 92546465, "router_z_loss_clip": 3.42578125, "router_z_loss_mlp": 0.44970703, "step": 4282, "time_per_iteration": 2.6627583503723145 }, { "auxiliary_loss_clip": 0.01604249, "auxiliary_loss_mlp": 0.00420262, "balance_loss_clip": 1.2580719, "balance_loss_mlp": 0.37987396, "epoch": 0.2575078911769127, "flos": 21981711951360.0, "grad_norm": 4.7154341544829, "language_loss": 0.77212214, "learning_rate": 3.4811623398817267e-06, "loss": 0.79236728, "num_input_tokens_seen": 92567260, "router_z_loss_clip": 3.45898438, "router_z_loss_mlp": 0.40380859, "step": 4283, "time_per_iteration": 4.092426538467407 }, { "auxiliary_loss_clip": 0.01609844, "auxiliary_loss_mlp": 0.00384799, "balance_loss_clip": 1.27080762, "balance_loss_mlp": 0.34646088, "epoch": 0.25756801442958066, "flos": 21945406279680.0, "grad_norm": 14.062485300645683, "language_loss": 0.85539663, "learning_rate": 3.4809006064370553e-06, "loss": 0.87534308, "num_input_tokens_seen": 92585425, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.38330078, "step": 4284, "time_per_iteration": 2.658836603164673 }, { "auxiliary_loss_clip": 0.01613571, "auxiliary_loss_mlp": 0.00379593, "balance_loss_clip": 1.26241946, "balance_loss_mlp": 0.33827484, "epoch": 0.2576281376822486, "flos": 35261980058880.0, "grad_norm": 150.05128645686236, "language_loss": 0.77642357, "learning_rate": 3.4806388168364835e-06, "loss": 0.79635519, "num_input_tokens_seen": 92604770, "router_z_loss_clip": 3.5078125, "router_z_loss_mlp": 0.41308594, "step": 4285, "time_per_iteration": 2.7907333374023438 }, { "auxiliary_loss_clip": 0.01622956, "auxiliary_loss_mlp": 0.00398892, "balance_loss_clip": 1.2759769, "balance_loss_mlp": 0.35941017, "epoch": 0.2576882609349166, "flos": 14132285688960.0, "grad_norm": 4.138459867012965, "language_loss": 0.65171874, "learning_rate": 3.4803769710899402e-06, "loss": 0.67193723, "num_input_tokens_seen": 92622635, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.39453125, "step": 4286, "time_per_iteration": 2.663689613342285 }, { "auxiliary_loss_clip": 0.01625226, "auxiliary_loss_mlp": 0.00409983, "balance_loss_clip": 1.2739948, "balance_loss_mlp": 0.36725861, "epoch": 0.25774838418758456, "flos": 23258336204160.0, "grad_norm": 1.8153738326139868, "language_loss": 0.70541805, "learning_rate": 3.480115069207354e-06, "loss": 0.72577018, "num_input_tokens_seen": 92642960, "router_z_loss_clip": 3.50976562, "router_z_loss_mlp": 0.42724609, "step": 4287, "time_per_iteration": 4.211211681365967 }, { "auxiliary_loss_clip": 0.01617148, "auxiliary_loss_mlp": 0.00408724, "balance_loss_clip": 1.26280737, "balance_loss_mlp": 0.36444989, "epoch": 0.2578085074402525, "flos": 22601745544320.0, "grad_norm": 42.03122823544241, "language_loss": 0.77232003, "learning_rate": 3.4798531111986557e-06, "loss": 0.79257882, "num_input_tokens_seen": 92662455, "router_z_loss_clip": 3.546875, "router_z_loss_mlp": 0.44287109, "step": 4288, "time_per_iteration": 2.686741352081299 }, { "auxiliary_loss_clip": 0.01619896, "auxiliary_loss_mlp": 0.00404427, "balance_loss_clip": 1.26980758, "balance_loss_mlp": 0.36325249, "epoch": 0.2578686306929205, "flos": 24571840746240.0, "grad_norm": 23.642214276729618, "language_loss": 0.82053757, "learning_rate": 3.4795910970737786e-06, "loss": 0.84078074, "num_input_tokens_seen": 92683520, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.41186523, "step": 4289, "time_per_iteration": 2.7459123134613037 }, { "auxiliary_loss_clip": 0.01636609, "auxiliary_loss_mlp": 0.00432989, "balance_loss_clip": 1.27812791, "balance_loss_mlp": 0.38993108, "epoch": 0.25792875394558845, "flos": 18113953322880.0, "grad_norm": 1224.7041528629234, "language_loss": 0.91947466, "learning_rate": 3.4793290268426592e-06, "loss": 0.94017065, "num_input_tokens_seen": 92701450, "router_z_loss_clip": 3.5859375, "router_z_loss_mlp": 0.43066406, "step": 4290, "time_per_iteration": 2.6794793605804443 }, { "auxiliary_loss_clip": 0.01640249, "auxiliary_loss_mlp": 0.00435662, "balance_loss_clip": 1.28351521, "balance_loss_mlp": 0.38971841, "epoch": 0.2579888771982564, "flos": 17712902995200.0, "grad_norm": 11.973228876341132, "language_loss": 0.79227549, "learning_rate": 3.4790669005152354e-06, "loss": 0.81303465, "num_input_tokens_seen": 92720355, "router_z_loss_clip": 3.5625, "router_z_loss_mlp": 0.45947266, "step": 4291, "time_per_iteration": 2.6781442165374756 }, { "auxiliary_loss_clip": 0.01630446, "auxiliary_loss_mlp": 0.0041568, "balance_loss_clip": 1.26933157, "balance_loss_mlp": 0.37224039, "epoch": 0.2580490004509244, "flos": 16434878112000.0, "grad_norm": 140.8880845760531, "language_loss": 0.86449242, "learning_rate": 3.4788047181014458e-06, "loss": 0.88495368, "num_input_tokens_seen": 92736755, "router_z_loss_clip": 3.60546875, "router_z_loss_mlp": 0.43408203, "step": 4292, "time_per_iteration": 2.5928194522857666 }, { "auxiliary_loss_clip": 0.01625425, "auxiliary_loss_mlp": 0.00427696, "balance_loss_clip": 1.2706238, "balance_loss_mlp": 0.384161, "epoch": 0.25810912370359235, "flos": 33835141128960.0, "grad_norm": 182.34580909256167, "language_loss": 0.73902482, "learning_rate": 3.4785424796112337e-06, "loss": 0.75955606, "num_input_tokens_seen": 92757655, "router_z_loss_clip": 3.55273438, "router_z_loss_mlp": 0.43505859, "step": 4293, "time_per_iteration": 2.7577500343322754 }, { "auxiliary_loss_clip": 0.01613756, "auxiliary_loss_mlp": 0.00378855, "balance_loss_clip": 1.26349521, "balance_loss_mlp": 0.33954015, "epoch": 0.2581692469562603, "flos": 25192197561600.0, "grad_norm": 16.64140203140871, "language_loss": 0.81131637, "learning_rate": 3.478280185054542e-06, "loss": 0.83124256, "num_input_tokens_seen": 92776100, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.39331055, "step": 4294, "time_per_iteration": 4.058233737945557 }, { "auxiliary_loss_clip": 0.01637403, "auxiliary_loss_mlp": 0.00390434, "balance_loss_clip": 1.28039443, "balance_loss_mlp": 0.34966433, "epoch": 0.2582293702089283, "flos": 34932212271360.0, "grad_norm": 2.9349595077038333, "language_loss": 0.8725425, "learning_rate": 3.478017834441318e-06, "loss": 0.89282089, "num_input_tokens_seen": 92798880, "router_z_loss_clip": 3.56835938, "router_z_loss_mlp": 0.40771484, "step": 4295, "time_per_iteration": 2.753767728805542 }, { "auxiliary_loss_clip": 0.01647666, "auxiliary_loss_mlp": 0.00450913, "balance_loss_clip": 1.27451801, "balance_loss_mlp": 0.40103593, "epoch": 0.2582894934615963, "flos": 26833746038400.0, "grad_norm": 102.74124785454518, "language_loss": 0.78106725, "learning_rate": 3.4777554277815096e-06, "loss": 0.80205303, "num_input_tokens_seen": 92817750, "router_z_loss_clip": 3.73632812, "router_z_loss_mlp": 0.4987793, "step": 4296, "time_per_iteration": 2.6904144287109375 }, { "auxiliary_loss_clip": 0.01628487, "auxiliary_loss_mlp": 0.00438152, "balance_loss_clip": 1.26937652, "balance_loss_mlp": 0.39089775, "epoch": 0.25834961671426426, "flos": 23515241253120.0, "grad_norm": 3.4303382615519435, "language_loss": 0.91841668, "learning_rate": 3.477492965085067e-06, "loss": 0.9390831, "num_input_tokens_seen": 92837995, "router_z_loss_clip": 3.59375, "router_z_loss_mlp": 0.47290039, "step": 4297, "time_per_iteration": 2.654670476913452 }, { "auxiliary_loss_clip": 0.01617403, "auxiliary_loss_mlp": 0.00429378, "balance_loss_clip": 1.25857663, "balance_loss_mlp": 0.3871063, "epoch": 0.25840973996693223, "flos": 22451028076800.0, "grad_norm": 3.0165916962340984, "language_loss": 0.8894074, "learning_rate": 3.477230446361943e-06, "loss": 0.90987515, "num_input_tokens_seen": 92857245, "router_z_loss_clip": 3.59179688, "router_z_loss_mlp": 0.42285156, "step": 4298, "time_per_iteration": 2.635442018508911 }, { "auxiliary_loss_clip": 0.01604089, "auxiliary_loss_mlp": 0.00404133, "balance_loss_clip": 1.25603473, "balance_loss_mlp": 0.36214763, "epoch": 0.2584698632196002, "flos": 11290854366720.0, "grad_norm": 4.6054608122235425, "language_loss": 0.9096446, "learning_rate": 3.4769678716220927e-06, "loss": 0.92972684, "num_input_tokens_seen": 92873265, "router_z_loss_clip": 3.484375, "router_z_loss_mlp": 0.41992188, "step": 4299, "time_per_iteration": 2.598755359649658 }, { "auxiliary_loss_clip": 0.01600372, "auxiliary_loss_mlp": 0.00392036, "balance_loss_clip": 1.25373793, "balance_loss_mlp": 0.35064644, "epoch": 0.25852998647226816, "flos": 17929982839680.0, "grad_norm": 3.3078295769091697, "language_loss": 0.89459771, "learning_rate": 3.4767052408754726e-06, "loss": 0.91452175, "num_input_tokens_seen": 92890880, "router_z_loss_clip": 3.46679688, "router_z_loss_mlp": 0.41381836, "step": 4300, "time_per_iteration": 2.622708320617676 }, { "auxiliary_loss_clip": 0.01603688, "auxiliary_loss_mlp": 0.00438602, "balance_loss_clip": 1.25486732, "balance_loss_mlp": 0.39540058, "epoch": 0.2585901097249361, "flos": 33256117889280.0, "grad_norm": 18.96064488499278, "language_loss": 0.74269748, "learning_rate": 3.4764425541320417e-06, "loss": 0.76312035, "num_input_tokens_seen": 92910770, "router_z_loss_clip": 3.48632812, "router_z_loss_mlp": 0.43237305, "step": 4301, "time_per_iteration": 2.7964694499969482 }, { "auxiliary_loss_clip": 0.01596575, "auxiliary_loss_mlp": 0.00436663, "balance_loss_clip": 1.2460959, "balance_loss_mlp": 0.39191133, "epoch": 0.2586502329776041, "flos": 18441278985600.0, "grad_norm": 42.41336524681273, "language_loss": 0.90777385, "learning_rate": 3.4761798114017617e-06, "loss": 0.92810619, "num_input_tokens_seen": 92929520, "router_z_loss_clip": 3.50390625, "router_z_loss_mlp": 0.44750977, "step": 4302, "time_per_iteration": 2.5983996391296387 }, { "auxiliary_loss_clip": 0.01592039, "auxiliary_loss_mlp": 0.00431757, "balance_loss_clip": 1.24890995, "balance_loss_mlp": 0.39031965, "epoch": 0.25871035623027205, "flos": 17968120104960.0, "grad_norm": 16.105681164555772, "language_loss": 0.96994853, "learning_rate": 3.475917012694595e-06, "loss": 0.99018651, "num_input_tokens_seen": 92947890, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.41430664, "step": 4303, "time_per_iteration": 2.6844773292541504 }, { "auxiliary_loss_clip": 0.01586734, "auxiliary_loss_mlp": 0.00387979, "balance_loss_clip": 1.24113822, "balance_loss_mlp": 0.34198844, "epoch": 0.25877047948294, "flos": 27777729415680.0, "grad_norm": 194.4802105937529, "language_loss": 0.73129588, "learning_rate": 3.475654158020507e-06, "loss": 0.75104302, "num_input_tokens_seen": 92967690, "router_z_loss_clip": 3.45703125, "router_z_loss_mlp": 0.4597168, "step": 4304, "time_per_iteration": 2.71366286277771 }, { "auxiliary_loss_clip": 0.0161088, "auxiliary_loss_mlp": 0.0042917, "balance_loss_clip": 1.25213313, "balance_loss_mlp": 0.38425225, "epoch": 0.258830602735608, "flos": 27125843437440.0, "grad_norm": 55.338316121111596, "language_loss": 0.79223841, "learning_rate": 3.4753912473894657e-06, "loss": 0.81263888, "num_input_tokens_seen": 92986830, "router_z_loss_clip": 3.5859375, "router_z_loss_mlp": 0.44921875, "step": 4305, "time_per_iteration": 2.708440065383911 }, { "auxiliary_loss_clip": 0.01592599, "auxiliary_loss_mlp": 0.00450774, "balance_loss_clip": 1.24355602, "balance_loss_mlp": 0.40270904, "epoch": 0.25889072598827595, "flos": 17891486438400.0, "grad_norm": 8.268687918225556, "language_loss": 0.83387464, "learning_rate": 3.4751282808114403e-06, "loss": 0.85430837, "num_input_tokens_seen": 93002740, "router_z_loss_clip": 3.48828125, "router_z_loss_mlp": 0.48071289, "step": 4306, "time_per_iteration": 2.658363103866577 }, { "auxiliary_loss_clip": 0.01279781, "auxiliary_loss_mlp": 0.00081462, "balance_loss_clip": 1.11517692, "balance_loss_mlp": 0.0735467, "epoch": 0.2589508492409439, "flos": 53934955724160.0, "grad_norm": 0.7954183830100308, "language_loss": 0.56544894, "learning_rate": 3.474865258296403e-06, "loss": 0.57906139, "num_input_tokens_seen": 93058645, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.07910156, "step": 4307, "time_per_iteration": 3.1115903854370117 }, { "auxiliary_loss_clip": 0.01576248, "auxiliary_loss_mlp": 0.00362544, "balance_loss_clip": 1.2506566, "balance_loss_mlp": 0.32358682, "epoch": 0.2590109724936119, "flos": 22125785402880.0, "grad_norm": 5.070811505358024, "language_loss": 0.76528203, "learning_rate": 3.474602179854327e-06, "loss": 0.78466994, "num_input_tokens_seen": 93077140, "router_z_loss_clip": 3.25976562, "router_z_loss_mlp": 0.38964844, "step": 4308, "time_per_iteration": 2.66080904006958 }, { "auxiliary_loss_clip": 0.01594706, "auxiliary_loss_mlp": 0.00388852, "balance_loss_clip": 1.25092006, "balance_loss_mlp": 0.34607965, "epoch": 0.2590710957462799, "flos": 13474294398720.0, "grad_norm": 10.983139668061368, "language_loss": 0.90052921, "learning_rate": 3.4743390454951886e-06, "loss": 0.9203648, "num_input_tokens_seen": 93093580, "router_z_loss_clip": 3.43945312, "router_z_loss_mlp": 0.42773438, "step": 4309, "time_per_iteration": 2.6558055877685547 }, { "auxiliary_loss_clip": 0.01584926, "auxiliary_loss_mlp": 0.00369823, "balance_loss_clip": 1.25293779, "balance_loss_mlp": 0.33100802, "epoch": 0.25913121899894787, "flos": 22307098279680.0, "grad_norm": 23.14402985940009, "language_loss": 0.88763458, "learning_rate": 3.474075855228966e-06, "loss": 0.90718204, "num_input_tokens_seen": 93112345, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.38818359, "step": 4310, "time_per_iteration": 2.6646313667297363 }, { "auxiliary_loss_clip": 0.01596395, "auxiliary_loss_mlp": 0.00432543, "balance_loss_clip": 1.26152253, "balance_loss_mlp": 0.38984227, "epoch": 0.25919134225161583, "flos": 25811728364160.0, "grad_norm": 9.310200179315895, "language_loss": 0.84004331, "learning_rate": 3.473812609065639e-06, "loss": 0.86033267, "num_input_tokens_seen": 93131545, "router_z_loss_clip": 3.3515625, "router_z_loss_mlp": 0.42675781, "step": 4311, "time_per_iteration": 2.6830973625183105 }, { "auxiliary_loss_clip": 0.01619912, "auxiliary_loss_mlp": 0.00412675, "balance_loss_clip": 1.27518725, "balance_loss_mlp": 0.36832952, "epoch": 0.2592514655042838, "flos": 31212262108800.0, "grad_norm": 2.833986316170463, "language_loss": 0.77660483, "learning_rate": 3.4735493070151904e-06, "loss": 0.79693067, "num_input_tokens_seen": 93150730, "router_z_loss_clip": 3.44921875, "router_z_loss_mlp": 0.44335938, "step": 4312, "time_per_iteration": 2.7557132244110107 }, { "auxiliary_loss_clip": 0.01604011, "auxiliary_loss_mlp": 0.00407215, "balance_loss_clip": 1.26379502, "balance_loss_mlp": 0.36348861, "epoch": 0.25931158875695176, "flos": 18474998878080.0, "grad_norm": 98.00910946601336, "language_loss": 0.76556087, "learning_rate": 3.4732859490876044e-06, "loss": 0.78567314, "num_input_tokens_seen": 93167895, "router_z_loss_clip": 3.40625, "router_z_loss_mlp": 0.43652344, "step": 4313, "time_per_iteration": 2.681765079498291 }, { "auxiliary_loss_clip": 0.01592302, "auxiliary_loss_mlp": 0.00393752, "balance_loss_clip": 1.2581594, "balance_loss_mlp": 0.35136092, "epoch": 0.2593717120096197, "flos": 19207935895680.0, "grad_norm": 5.939091800278994, "language_loss": 0.85633421, "learning_rate": 3.473022535292867e-06, "loss": 0.87619478, "num_input_tokens_seen": 93187650, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.42407227, "step": 4314, "time_per_iteration": 2.6464781761169434 }, { "auxiliary_loss_clip": 0.01608404, "auxiliary_loss_mlp": 0.00426106, "balance_loss_clip": 1.26118195, "balance_loss_mlp": 0.38259488, "epoch": 0.2594318352622877, "flos": 31248100903680.0, "grad_norm": 4.229394872690892, "language_loss": 0.72468436, "learning_rate": 3.472759065640968e-06, "loss": 0.74502945, "num_input_tokens_seen": 93207370, "router_z_loss_clip": 3.47460938, "router_z_loss_mlp": 0.43530273, "step": 4315, "time_per_iteration": 2.726323366165161 }, { "auxiliary_loss_clip": 0.01601574, "auxiliary_loss_mlp": 0.00384952, "balance_loss_clip": 1.26553917, "balance_loss_mlp": 0.34437346, "epoch": 0.25949195851495566, "flos": 22237144542720.0, "grad_norm": 1.7731085239065583, "language_loss": 0.84760374, "learning_rate": 3.4724955401418976e-06, "loss": 0.86746895, "num_input_tokens_seen": 93227925, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.40576172, "step": 4316, "time_per_iteration": 2.620793342590332 }, { "auxiliary_loss_clip": 0.01623841, "auxiliary_loss_mlp": 0.00446893, "balance_loss_clip": 1.27619672, "balance_loss_mlp": 0.39954293, "epoch": 0.2595520817676236, "flos": 28075716645120.0, "grad_norm": 6.548440562813901, "language_loss": 0.81215823, "learning_rate": 3.4722319588056487e-06, "loss": 0.8328656, "num_input_tokens_seen": 93250020, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.47363281, "step": 4317, "time_per_iteration": 2.7009530067443848 }, { "auxiliary_loss_clip": 0.0159178, "auxiliary_loss_mlp": 0.00449044, "balance_loss_clip": 1.25915325, "balance_loss_mlp": 0.40333965, "epoch": 0.2596122050202916, "flos": 20190954378240.0, "grad_norm": 93.44735429904904, "language_loss": 0.82745624, "learning_rate": 3.4719683216422163e-06, "loss": 0.84786445, "num_input_tokens_seen": 93269070, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.45703125, "step": 4318, "time_per_iteration": 2.6476423740386963 }, { "auxiliary_loss_clip": 0.01585567, "auxiliary_loss_mlp": 0.00406372, "balance_loss_clip": 1.25922859, "balance_loss_mlp": 0.36467263, "epoch": 0.25967232827295955, "flos": 22527949052160.0, "grad_norm": 3.4522883704824414, "language_loss": 0.80410099, "learning_rate": 3.471704628661598e-06, "loss": 0.82402033, "num_input_tokens_seen": 93290250, "router_z_loss_clip": 3.26367188, "router_z_loss_mlp": 0.41699219, "step": 4319, "time_per_iteration": 2.661287546157837 }, { "auxiliary_loss_clip": 0.01576422, "auxiliary_loss_mlp": 0.00433006, "balance_loss_clip": 1.2508738, "balance_loss_mlp": 0.3883259, "epoch": 0.2597324515256275, "flos": 21068252156160.0, "grad_norm": 10.712097905220364, "language_loss": 0.82053566, "learning_rate": 3.4714408798737925e-06, "loss": 0.84062999, "num_input_tokens_seen": 93310090, "router_z_loss_clip": 3.25976562, "router_z_loss_mlp": 0.4465332, "step": 4320, "time_per_iteration": 2.663148880004883 }, { "auxiliary_loss_clip": 0.01569728, "auxiliary_loss_mlp": 0.00406678, "balance_loss_clip": 1.24328756, "balance_loss_mlp": 0.36106858, "epoch": 0.2597925747782955, "flos": 22050013662720.0, "grad_norm": 5.945908854569021, "language_loss": 0.77153206, "learning_rate": 3.471177075288801e-06, "loss": 0.79129612, "num_input_tokens_seen": 93329570, "router_z_loss_clip": 3.265625, "router_z_loss_mlp": 0.45556641, "step": 4321, "time_per_iteration": 2.663271427154541 }, { "auxiliary_loss_clip": 0.01589012, "auxiliary_loss_mlp": 0.00431169, "balance_loss_clip": 1.2539854, "balance_loss_mlp": 0.38572633, "epoch": 0.2598526980309635, "flos": 19536949497600.0, "grad_norm": 11.358765896560024, "language_loss": 0.80499792, "learning_rate": 3.4709132149166277e-06, "loss": 0.82519972, "num_input_tokens_seen": 93347920, "router_z_loss_clip": 3.3515625, "router_z_loss_mlp": 0.4543457, "step": 4322, "time_per_iteration": 4.150545120239258 }, { "auxiliary_loss_clip": 0.0157569, "auxiliary_loss_mlp": 0.00445891, "balance_loss_clip": 1.24828589, "balance_loss_mlp": 0.39837414, "epoch": 0.25991282128363147, "flos": 24495207079680.0, "grad_norm": 18.618925739278914, "language_loss": 0.79090673, "learning_rate": 3.470649298767278e-06, "loss": 0.81112254, "num_input_tokens_seen": 93367145, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.47509766, "step": 4323, "time_per_iteration": 2.6640539169311523 }, { "auxiliary_loss_clip": 0.01593962, "auxiliary_loss_mlp": 0.00455599, "balance_loss_clip": 1.25926042, "balance_loss_mlp": 0.40886915, "epoch": 0.25997294453629943, "flos": 24201457655040.0, "grad_norm": 4.112078007119001, "language_loss": 0.72384733, "learning_rate": 3.4703853268507597e-06, "loss": 0.74434298, "num_input_tokens_seen": 93386555, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.46679688, "step": 4324, "time_per_iteration": 2.7136664390563965 }, { "auxiliary_loss_clip": 0.01581835, "auxiliary_loss_mlp": 0.00411698, "balance_loss_clip": 1.25407541, "balance_loss_mlp": 0.3700223, "epoch": 0.2600330677889674, "flos": 31431460855680.0, "grad_norm": 22.01950689656422, "language_loss": 0.75527906, "learning_rate": 3.470121299177082e-06, "loss": 0.77521437, "num_input_tokens_seen": 93405590, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.41625977, "step": 4325, "time_per_iteration": 4.164029836654663 }, { "auxiliary_loss_clip": 0.01574096, "auxiliary_loss_mlp": 0.00431408, "balance_loss_clip": 1.25117552, "balance_loss_mlp": 0.38587043, "epoch": 0.26009319104163536, "flos": 32266527217920.0, "grad_norm": 13.647746112935126, "language_loss": 0.78311121, "learning_rate": 3.469857215756257e-06, "loss": 0.80316627, "num_input_tokens_seen": 93424750, "router_z_loss_clip": 3.22851562, "router_z_loss_mlp": 0.45532227, "step": 4326, "time_per_iteration": 2.7986278533935547 }, { "auxiliary_loss_clip": 0.01577629, "auxiliary_loss_mlp": 0.00416787, "balance_loss_clip": 1.25449705, "balance_loss_mlp": 0.37277511, "epoch": 0.26015331429430333, "flos": 26286754752000.0, "grad_norm": 5.431525880549466, "language_loss": 0.91785192, "learning_rate": 3.4695930765982997e-06, "loss": 0.93779612, "num_input_tokens_seen": 93443465, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.44042969, "step": 4327, "time_per_iteration": 2.726407051086426 }, { "auxiliary_loss_clip": 0.01597404, "auxiliary_loss_mlp": 0.00473068, "balance_loss_clip": 1.26838315, "balance_loss_mlp": 0.42424011, "epoch": 0.2602134375469713, "flos": 21142335957120.0, "grad_norm": 6.840347219264271, "language_loss": 0.85066736, "learning_rate": 3.4693288817132255e-06, "loss": 0.8713721, "num_input_tokens_seen": 93462580, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.48803711, "step": 4328, "time_per_iteration": 2.655184745788574 }, { "auxiliary_loss_clip": 0.01576222, "auxiliary_loss_mlp": 0.00424598, "balance_loss_clip": 1.25083423, "balance_loss_mlp": 0.38034743, "epoch": 0.26027356079963926, "flos": 25921327737600.0, "grad_norm": 22.781754071934714, "language_loss": 0.92214918, "learning_rate": 3.4690646311110525e-06, "loss": 0.94215745, "num_input_tokens_seen": 93482790, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.44213867, "step": 4329, "time_per_iteration": 4.126258373260498 }, { "auxiliary_loss_clip": 0.01569635, "auxiliary_loss_mlp": 0.00403015, "balance_loss_clip": 1.25131428, "balance_loss_mlp": 0.3617689, "epoch": 0.2603336840523072, "flos": 26359222440960.0, "grad_norm": 11.805415607317057, "language_loss": 0.82169896, "learning_rate": 3.468800324801802e-06, "loss": 0.84142548, "num_input_tokens_seen": 93498795, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.41235352, "step": 4330, "time_per_iteration": 2.694737672805786 }, { "auxiliary_loss_clip": 0.01586112, "auxiliary_loss_mlp": 0.00451207, "balance_loss_clip": 1.25692773, "balance_loss_mlp": 0.40562153, "epoch": 0.2603938073049752, "flos": 23513661054720.0, "grad_norm": 51.03110825523836, "language_loss": 0.80559319, "learning_rate": 3.4685359627954958e-06, "loss": 0.82596642, "num_input_tokens_seen": 93518335, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.45629883, "step": 4331, "time_per_iteration": 2.6537725925445557 }, { "auxiliary_loss_clip": 0.01609368, "auxiliary_loss_mlp": 0.00410992, "balance_loss_clip": 1.28021526, "balance_loss_mlp": 0.36805314, "epoch": 0.26045393055764315, "flos": 25374300537600.0, "grad_norm": 23.555371069240355, "language_loss": 0.73883736, "learning_rate": 3.4682715451021584e-06, "loss": 0.75904101, "num_input_tokens_seen": 93539170, "router_z_loss_clip": 3.29296875, "router_z_loss_mlp": 0.4296875, "step": 4332, "time_per_iteration": 2.697411298751831 }, { "auxiliary_loss_clip": 0.01586065, "auxiliary_loss_mlp": 0.00425533, "balance_loss_clip": 1.25465679, "balance_loss_mlp": 0.37780195, "epoch": 0.2605140538103111, "flos": 27635272076160.0, "grad_norm": 6.087085659935331, "language_loss": 0.83345699, "learning_rate": 3.4680070717318174e-06, "loss": 0.85357296, "num_input_tokens_seen": 93558480, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.47705078, "step": 4333, "time_per_iteration": 2.7642416954040527 }, { "auxiliary_loss_clip": 0.01593222, "auxiliary_loss_mlp": 0.00419103, "balance_loss_clip": 1.26704526, "balance_loss_mlp": 0.37840497, "epoch": 0.2605741770629791, "flos": 13769839503360.0, "grad_norm": 10.871069290868418, "language_loss": 0.84198534, "learning_rate": 3.467742542694501e-06, "loss": 0.86210859, "num_input_tokens_seen": 93575220, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.40698242, "step": 4334, "time_per_iteration": 2.7419238090515137 }, { "auxiliary_loss_clip": 0.01590673, "auxiliary_loss_mlp": 0.00419634, "balance_loss_clip": 1.26041031, "balance_loss_mlp": 0.37509763, "epoch": 0.26063430031564705, "flos": 26031681296640.0, "grad_norm": 2.3907817648259426, "language_loss": 0.83854085, "learning_rate": 3.46747795800024e-06, "loss": 0.85864395, "num_input_tokens_seen": 93597015, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.44555664, "step": 4335, "time_per_iteration": 2.6968600749969482 }, { "auxiliary_loss_clip": 0.01434337, "auxiliary_loss_mlp": 0.00096329, "balance_loss_clip": 1.20947778, "balance_loss_mlp": 0.08655354, "epoch": 0.26069442356831507, "flos": 62443809820800.0, "grad_norm": 0.8402256951789285, "language_loss": 0.61005062, "learning_rate": 3.467213317659068e-06, "loss": 0.62535727, "num_input_tokens_seen": 93657775, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.09765625, "step": 4336, "time_per_iteration": 3.0961384773254395 }, { "auxiliary_loss_clip": 0.0161221, "auxiliary_loss_mlp": 0.00420248, "balance_loss_clip": 1.27358568, "balance_loss_mlp": 0.37771386, "epoch": 0.26075454682098304, "flos": 13626376583040.0, "grad_norm": 2.799172392868859, "language_loss": 0.84572655, "learning_rate": 3.46694862168102e-06, "loss": 0.86605108, "num_input_tokens_seen": 93676145, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.42504883, "step": 4337, "time_per_iteration": 4.061982154846191 }, { "auxiliary_loss_clip": 0.01608895, "auxiliary_loss_mlp": 0.00409573, "balance_loss_clip": 1.27347529, "balance_loss_mlp": 0.36684799, "epoch": 0.260814670073651, "flos": 12126531260160.0, "grad_norm": 628.2514120574647, "language_loss": 0.82055104, "learning_rate": 3.4666838700761334e-06, "loss": 0.84073573, "num_input_tokens_seen": 93692480, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.42675781, "step": 4338, "time_per_iteration": 2.613847017288208 }, { "auxiliary_loss_clip": 0.0160815, "auxiliary_loss_mlp": 0.00455261, "balance_loss_clip": 1.27008176, "balance_loss_mlp": 0.40972298, "epoch": 0.26087479332631897, "flos": 15122522805120.0, "grad_norm": 22.27445067746055, "language_loss": 0.86933112, "learning_rate": 3.466419062854447e-06, "loss": 0.88996518, "num_input_tokens_seen": 93710165, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.45532227, "step": 4339, "time_per_iteration": 2.6236824989318848 }, { "auxiliary_loss_clip": 0.0160059, "auxiliary_loss_mlp": 0.00399235, "balance_loss_clip": 1.27053785, "balance_loss_mlp": 0.3610163, "epoch": 0.26093491657898693, "flos": 24680937329280.0, "grad_norm": 17.83804118828831, "language_loss": 0.80930722, "learning_rate": 3.4661542000260033e-06, "loss": 0.82930547, "num_input_tokens_seen": 93730185, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.38208008, "step": 4340, "time_per_iteration": 2.656724691390991 }, { "auxiliary_loss_clip": 0.01605632, "auxiliary_loss_mlp": 0.00415437, "balance_loss_clip": 1.26772189, "balance_loss_mlp": 0.37435716, "epoch": 0.2609950398316549, "flos": 25116138512640.0, "grad_norm": 2.2506691994915733, "language_loss": 0.87632167, "learning_rate": 3.465889281600845e-06, "loss": 0.8965323, "num_input_tokens_seen": 93747690, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.41137695, "step": 4341, "time_per_iteration": 2.660353422164917 }, { "auxiliary_loss_clip": 0.01589287, "auxiliary_loss_mlp": 0.00439066, "balance_loss_clip": 1.25824142, "balance_loss_mlp": 0.39529243, "epoch": 0.26105516308432286, "flos": 28548588216960.0, "grad_norm": 8.598968899034103, "language_loss": 0.82364178, "learning_rate": 3.4656243075890183e-06, "loss": 0.84392524, "num_input_tokens_seen": 93767405, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.43774414, "step": 4342, "time_per_iteration": 2.7670674324035645 }, { "auxiliary_loss_clip": 0.01594681, "auxiliary_loss_mlp": 0.00433157, "balance_loss_clip": 1.26438308, "balance_loss_mlp": 0.38661808, "epoch": 0.2611152863369908, "flos": 39530609447040.0, "grad_norm": 73.19985743565881, "language_loss": 0.70387244, "learning_rate": 3.4653592780005707e-06, "loss": 0.72415078, "num_input_tokens_seen": 93789950, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.46533203, "step": 4343, "time_per_iteration": 2.7839484214782715 }, { "auxiliary_loss_clip": 0.01596289, "auxiliary_loss_mlp": 0.00413977, "balance_loss_clip": 1.26153946, "balance_loss_mlp": 0.37425691, "epoch": 0.2611754095896588, "flos": 13735329511680.0, "grad_norm": 2701.0898486327687, "language_loss": 0.79664528, "learning_rate": 3.465094192845553e-06, "loss": 0.8167479, "num_input_tokens_seen": 93807835, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.39697266, "step": 4344, "time_per_iteration": 2.609745979309082 }, { "auxiliary_loss_clip": 0.01602399, "auxiliary_loss_mlp": 0.00444106, "balance_loss_clip": 1.26933086, "balance_loss_mlp": 0.3980912, "epoch": 0.26123553284232676, "flos": 21506649649920.0, "grad_norm": 3.0385612342096993, "language_loss": 0.92440724, "learning_rate": 3.4648290521340165e-06, "loss": 0.94487232, "num_input_tokens_seen": 93825670, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.46020508, "step": 4345, "time_per_iteration": 2.679110288619995 }, { "auxiliary_loss_clip": 0.01593604, "auxiliary_loss_mlp": 0.00414481, "balance_loss_clip": 1.2656002, "balance_loss_mlp": 0.37290066, "epoch": 0.2612956560949947, "flos": 21139786091520.0, "grad_norm": 22.146764244372324, "language_loss": 0.82104564, "learning_rate": 3.464563855876015e-06, "loss": 0.84112644, "num_input_tokens_seen": 93844045, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.41552734, "step": 4346, "time_per_iteration": 2.6394100189208984 }, { "auxiliary_loss_clip": 0.01605986, "auxiliary_loss_mlp": 0.00403328, "balance_loss_clip": 1.267712, "balance_loss_mlp": 0.36220074, "epoch": 0.2613557793476627, "flos": 25119011600640.0, "grad_norm": 7.83741586146149, "language_loss": 0.80571508, "learning_rate": 3.464298604081606e-06, "loss": 0.82580829, "num_input_tokens_seen": 93864380, "router_z_loss_clip": 3.38476562, "router_z_loss_mlp": 0.41137695, "step": 4347, "time_per_iteration": 2.6952731609344482 }, { "auxiliary_loss_clip": 0.0158921, "auxiliary_loss_mlp": 0.00406489, "balance_loss_clip": 1.25962901, "balance_loss_mlp": 0.3681038, "epoch": 0.26141590260033065, "flos": 26067699659520.0, "grad_norm": 186.0934521535218, "language_loss": 0.79181105, "learning_rate": 3.4640332967608476e-06, "loss": 0.81176805, "num_input_tokens_seen": 93885475, "router_z_loss_clip": 3.29296875, "router_z_loss_mlp": 0.3840332, "step": 4348, "time_per_iteration": 2.7072737216949463 }, { "auxiliary_loss_clip": 0.01605126, "auxiliary_loss_mlp": 0.00426008, "balance_loss_clip": 1.26639664, "balance_loss_mlp": 0.38428435, "epoch": 0.2614760258529987, "flos": 25701518459520.0, "grad_norm": 21.29350366992205, "language_loss": 0.96784019, "learning_rate": 3.463767933923799e-06, "loss": 0.98815155, "num_input_tokens_seen": 93905545, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.41748047, "step": 4349, "time_per_iteration": 2.684600830078125 }, { "auxiliary_loss_clip": 0.01593976, "auxiliary_loss_mlp": 0.00396517, "balance_loss_clip": 1.25685024, "balance_loss_mlp": 0.35867995, "epoch": 0.26153614910566664, "flos": 17457147181440.0, "grad_norm": 3.210882687861878, "language_loss": 0.84925383, "learning_rate": 3.463502515580524e-06, "loss": 0.86915874, "num_input_tokens_seen": 93924185, "router_z_loss_clip": 3.37304688, "router_z_loss_mlp": 0.37841797, "step": 4350, "time_per_iteration": 2.67560076713562 }, { "auxiliary_loss_clip": 0.01597224, "auxiliary_loss_mlp": 0.00413685, "balance_loss_clip": 1.25719774, "balance_loss_mlp": 0.37601489, "epoch": 0.2615962723583346, "flos": 17712831168000.0, "grad_norm": 10.69151414604349, "language_loss": 0.6798712, "learning_rate": 3.4632370417410866e-06, "loss": 0.69998032, "num_input_tokens_seen": 93942825, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.37695312, "step": 4351, "time_per_iteration": 2.6551854610443115 }, { "auxiliary_loss_clip": 0.01615624, "auxiliary_loss_mlp": 0.00438704, "balance_loss_clip": 1.26244843, "balance_loss_mlp": 0.39574108, "epoch": 0.26165639561100257, "flos": 23257725672960.0, "grad_norm": 5.7130367530858495, "language_loss": 0.89691973, "learning_rate": 3.462971512415555e-06, "loss": 0.91746294, "num_input_tokens_seen": 93962045, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.4296875, "step": 4352, "time_per_iteration": 2.642085075378418 }, { "auxiliary_loss_clip": 0.01452806, "auxiliary_loss_mlp": 0.00142172, "balance_loss_clip": 1.19030929, "balance_loss_mlp": 0.13058463, "epoch": 0.26171651886367053, "flos": 66737970800640.0, "grad_norm": 0.7903744642486397, "language_loss": 0.70106006, "learning_rate": 3.462705927613996e-06, "loss": 0.71700978, "num_input_tokens_seen": 94021175, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.11572266, "step": 4353, "time_per_iteration": 3.059678792953491 }, { "auxiliary_loss_clip": 0.0161771, "auxiliary_loss_mlp": 0.00377098, "balance_loss_clip": 1.26782155, "balance_loss_mlp": 0.33766392, "epoch": 0.2617766421163385, "flos": 22349581090560.0, "grad_norm": 3.567774847989798, "language_loss": 0.83331227, "learning_rate": 3.4624402873464816e-06, "loss": 0.8532604, "num_input_tokens_seen": 94043370, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.39404297, "step": 4354, "time_per_iteration": 2.673826217651367 }, { "auxiliary_loss_clip": 0.01611028, "auxiliary_loss_mlp": 0.00401768, "balance_loss_clip": 1.25300574, "balance_loss_mlp": 0.36302531, "epoch": 0.26183676536900646, "flos": 26067125041920.0, "grad_norm": 39.416878313967764, "language_loss": 0.75324029, "learning_rate": 3.462174591623085e-06, "loss": 0.77336824, "num_input_tokens_seen": 94063510, "router_z_loss_clip": 3.578125, "router_z_loss_mlp": 0.38745117, "step": 4355, "time_per_iteration": 2.7014710903167725 }, { "auxiliary_loss_clip": 0.01597082, "auxiliary_loss_mlp": 0.0037993, "balance_loss_clip": 1.24688053, "balance_loss_mlp": 0.33670494, "epoch": 0.26189688862167443, "flos": 20996466825600.0, "grad_norm": 221.80664695221512, "language_loss": 0.73810726, "learning_rate": 3.4619088404538815e-06, "loss": 0.75787735, "num_input_tokens_seen": 94083865, "router_z_loss_clip": 3.5, "router_z_loss_mlp": 0.43212891, "step": 4356, "time_per_iteration": 2.7149598598480225 }, { "auxiliary_loss_clip": 0.01425486, "auxiliary_loss_mlp": 0.00099175, "balance_loss_clip": 1.18707776, "balance_loss_mlp": 0.08715858, "epoch": 0.2619570118743424, "flos": 65798261141760.0, "grad_norm": 0.7693750162069145, "language_loss": 0.53090072, "learning_rate": 3.4616430338489487e-06, "loss": 0.54614735, "num_input_tokens_seen": 94144095, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.12011719, "step": 4357, "time_per_iteration": 3.0530319213867188 }, { "auxiliary_loss_clip": 0.01605364, "auxiliary_loss_mlp": 0.00393669, "balance_loss_clip": 1.25234079, "balance_loss_mlp": 0.35430631, "epoch": 0.26201713512701036, "flos": 28766817296640.0, "grad_norm": 17.609466924667792, "language_loss": 0.90785289, "learning_rate": 3.4613771718183654e-06, "loss": 0.92784321, "num_input_tokens_seen": 94163035, "router_z_loss_clip": 3.52929688, "router_z_loss_mlp": 0.39404297, "step": 4358, "time_per_iteration": 2.688920736312866 }, { "auxiliary_loss_clip": 0.01595565, "auxiliary_loss_mlp": 0.00453183, "balance_loss_clip": 1.23778796, "balance_loss_mlp": 0.40578526, "epoch": 0.2620772583796783, "flos": 26432516142720.0, "grad_norm": 5.937081696078898, "language_loss": 0.73830462, "learning_rate": 3.4611112543722127e-06, "loss": 0.7587921, "num_input_tokens_seen": 94182520, "router_z_loss_clip": 3.57617188, "router_z_loss_mlp": 0.47363281, "step": 4359, "time_per_iteration": 2.664734125137329 }, { "auxiliary_loss_clip": 0.0160437, "auxiliary_loss_mlp": 0.004025, "balance_loss_clip": 1.25245309, "balance_loss_mlp": 0.36275613, "epoch": 0.2621373816323463, "flos": 20156552127360.0, "grad_norm": 82.24860916503191, "language_loss": 0.8299247, "learning_rate": 3.4608452815205757e-06, "loss": 0.84999341, "num_input_tokens_seen": 94201795, "router_z_loss_clip": 3.51757812, "router_z_loss_mlp": 0.39770508, "step": 4360, "time_per_iteration": 2.6236424446105957 }, { "auxiliary_loss_clip": 0.01593245, "auxiliary_loss_mlp": 0.0037216, "balance_loss_clip": 1.24547696, "balance_loss_mlp": 0.33339328, "epoch": 0.26219750488501425, "flos": 28621235473920.0, "grad_norm": 185.87731922635396, "language_loss": 0.73393184, "learning_rate": 3.4605792532735387e-06, "loss": 0.75358588, "num_input_tokens_seen": 94222390, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.38769531, "step": 4361, "time_per_iteration": 2.694024085998535 }, { "auxiliary_loss_clip": 0.0160537, "auxiliary_loss_mlp": 0.00422127, "balance_loss_clip": 1.25430942, "balance_loss_mlp": 0.37990308, "epoch": 0.2622576281376823, "flos": 15042549173760.0, "grad_norm": 5.907545738207341, "language_loss": 0.89360207, "learning_rate": 3.46031316964119e-06, "loss": 0.91387701, "num_input_tokens_seen": 94239980, "router_z_loss_clip": 3.51367188, "router_z_loss_mlp": 0.42211914, "step": 4362, "time_per_iteration": 2.6335391998291016 }, { "auxiliary_loss_clip": 0.01587155, "auxiliary_loss_mlp": 0.00389422, "balance_loss_clip": 1.24581385, "balance_loss_mlp": 0.34259671, "epoch": 0.26231775139035024, "flos": 26396174557440.0, "grad_norm": 29.241391893927197, "language_loss": 0.72831702, "learning_rate": 3.4600470306336197e-06, "loss": 0.74808276, "num_input_tokens_seen": 94260715, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.46875, "step": 4363, "time_per_iteration": 2.6818857192993164 }, { "auxiliary_loss_clip": 0.01402475, "auxiliary_loss_mlp": 0.00099538, "balance_loss_clip": 1.18080544, "balance_loss_mlp": 0.08628213, "epoch": 0.2623778746430182, "flos": 65408918647680.0, "grad_norm": 2.4978947748342613, "language_loss": 0.60720742, "learning_rate": 3.4597808362609194e-06, "loss": 0.62222755, "num_input_tokens_seen": 94321285, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.1328125, "step": 4364, "time_per_iteration": 4.593913793563843 }, { "auxiliary_loss_clip": 0.0158554, "auxiliary_loss_mlp": 0.00373655, "balance_loss_clip": 1.24395728, "balance_loss_mlp": 0.33014402, "epoch": 0.26243799789568617, "flos": 12604215254400.0, "grad_norm": 3.8533919999684376, "language_loss": 0.80207872, "learning_rate": 3.459514586533184e-06, "loss": 0.82167065, "num_input_tokens_seen": 94335420, "router_z_loss_clip": 3.41796875, "router_z_loss_mlp": 0.43554688, "step": 4365, "time_per_iteration": 2.638861894607544 }, { "auxiliary_loss_clip": 0.01593626, "auxiliary_loss_mlp": 0.00352879, "balance_loss_clip": 1.25105584, "balance_loss_mlp": 0.31418362, "epoch": 0.26249812114835414, "flos": 28623821253120.0, "grad_norm": 11.802399362668922, "language_loss": 0.8288877, "learning_rate": 3.459248281460509e-06, "loss": 0.84835279, "num_input_tokens_seen": 94357440, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.38696289, "step": 4366, "time_per_iteration": 2.6985559463500977 }, { "auxiliary_loss_clip": 0.01604727, "auxiliary_loss_mlp": 0.00359689, "balance_loss_clip": 1.25908136, "balance_loss_mlp": 0.32051647, "epoch": 0.2625582444010221, "flos": 14465393441280.0, "grad_norm": 36.15860051653756, "language_loss": 0.81951666, "learning_rate": 3.4589819210529927e-06, "loss": 0.8391608, "num_input_tokens_seen": 94375690, "router_z_loss_clip": 3.45507812, "router_z_loss_mlp": 0.3918457, "step": 4367, "time_per_iteration": 2.6854851245880127 }, { "auxiliary_loss_clip": 0.01598255, "auxiliary_loss_mlp": 0.00349349, "balance_loss_clip": 1.25806236, "balance_loss_mlp": 0.30724403, "epoch": 0.26261836765369007, "flos": 16613174246400.0, "grad_norm": 9.296935961972375, "language_loss": 0.74417782, "learning_rate": 3.458715505320736e-06, "loss": 0.76365387, "num_input_tokens_seen": 94393190, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.42138672, "step": 4368, "time_per_iteration": 4.02173376083374 }, { "auxiliary_loss_clip": 0.01596189, "auxiliary_loss_mlp": 0.00383913, "balance_loss_clip": 1.24771821, "balance_loss_mlp": 0.33827996, "epoch": 0.26267849090635803, "flos": 20519932066560.0, "grad_norm": 9.50455581045785, "language_loss": 0.84595597, "learning_rate": 3.458449034273841e-06, "loss": 0.86575705, "num_input_tokens_seen": 94410975, "router_z_loss_clip": 3.484375, "router_z_loss_mlp": 0.45605469, "step": 4369, "time_per_iteration": 2.6540353298187256 }, { "auxiliary_loss_clip": 0.01610584, "auxiliary_loss_mlp": 0.00365747, "balance_loss_clip": 1.26062822, "balance_loss_mlp": 0.32640785, "epoch": 0.262738614159026, "flos": 21323936142720.0, "grad_norm": 8.488637794856688, "language_loss": 0.88776124, "learning_rate": 3.4581825079224133e-06, "loss": 0.90752459, "num_input_tokens_seen": 94429985, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.39355469, "step": 4370, "time_per_iteration": 2.669826030731201 }, { "auxiliary_loss_clip": 0.01617663, "auxiliary_loss_mlp": 0.00425765, "balance_loss_clip": 1.25833035, "balance_loss_mlp": 0.37991661, "epoch": 0.26279873741169396, "flos": 17603590930560.0, "grad_norm": 16.1981727452957, "language_loss": 0.76926953, "learning_rate": 3.4579159262765575e-06, "loss": 0.78970379, "num_input_tokens_seen": 94448660, "router_z_loss_clip": 3.58984375, "router_z_loss_mlp": 0.45800781, "step": 4371, "time_per_iteration": 4.075396776199341 }, { "auxiliary_loss_clip": 0.01444178, "auxiliary_loss_mlp": 0.00107062, "balance_loss_clip": 1.19938016, "balance_loss_mlp": 0.0970008, "epoch": 0.2628588606643619, "flos": 60949746587520.0, "grad_norm": 0.7200117470464382, "language_loss": 0.55954444, "learning_rate": 3.457649289346384e-06, "loss": 0.57505691, "num_input_tokens_seen": 94515630, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.10058594, "step": 4372, "time_per_iteration": 3.353477954864502 }, { "auxiliary_loss_clip": 0.01608575, "auxiliary_loss_mlp": 0.00358184, "balance_loss_clip": 1.25803757, "balance_loss_mlp": 0.31534058, "epoch": 0.2629189839170299, "flos": 27016315891200.0, "grad_norm": 3.6508751177348, "language_loss": 0.8308627, "learning_rate": 3.4573825971420042e-06, "loss": 0.85053027, "num_input_tokens_seen": 94535385, "router_z_loss_clip": 3.50976562, "router_z_loss_mlp": 0.42871094, "step": 4373, "time_per_iteration": 2.717026472091675 }, { "auxiliary_loss_clip": 0.01600023, "auxiliary_loss_mlp": 0.00355614, "balance_loss_clip": 1.25338936, "balance_loss_mlp": 0.31484419, "epoch": 0.26297910716969786, "flos": 17019863009280.0, "grad_norm": 1010.8596292285557, "language_loss": 0.76675153, "learning_rate": 3.4571158496735294e-06, "loss": 0.78630787, "num_input_tokens_seen": 94552650, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.40771484, "step": 4374, "time_per_iteration": 2.6224489212036133 }, { "auxiliary_loss_clip": 0.0162273, "auxiliary_loss_mlp": 0.00341257, "balance_loss_clip": 1.26611972, "balance_loss_mlp": 0.30110729, "epoch": 0.2630392304223659, "flos": 24897370728960.0, "grad_norm": 4.9265379581341, "language_loss": 0.85816771, "learning_rate": 3.4568490469510756e-06, "loss": 0.8778075, "num_input_tokens_seen": 94574075, "router_z_loss_clip": 3.56445312, "router_z_loss_mlp": 0.40136719, "step": 4375, "time_per_iteration": 2.72318434715271 }, { "auxiliary_loss_clip": 0.01598343, "auxiliary_loss_mlp": 0.00345231, "balance_loss_clip": 1.25150204, "balance_loss_mlp": 0.30670267, "epoch": 0.26309935367503384, "flos": 32854026067200.0, "grad_norm": 8.58512537889125, "language_loss": 0.72630352, "learning_rate": 3.4565821889847603e-06, "loss": 0.74573922, "num_input_tokens_seen": 94594255, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.38549805, "step": 4376, "time_per_iteration": 2.8121542930603027 }, { "auxiliary_loss_clip": 0.0162395, "auxiliary_loss_mlp": 0.00384045, "balance_loss_clip": 1.26842117, "balance_loss_mlp": 0.3423928, "epoch": 0.2631594769277018, "flos": 15887958652800.0, "grad_norm": 14.622921557832578, "language_loss": 0.75015295, "learning_rate": 3.4563152757847026e-06, "loss": 0.77023292, "num_input_tokens_seen": 94611410, "router_z_loss_clip": 3.55664062, "router_z_loss_mlp": 0.41674805, "step": 4377, "time_per_iteration": 2.7024106979370117 }, { "auxiliary_loss_clip": 0.01603406, "auxiliary_loss_mlp": 0.00348568, "balance_loss_clip": 1.25489414, "balance_loss_mlp": 0.30875248, "epoch": 0.2632196001803698, "flos": 50804943557760.0, "grad_norm": 28.891185012586693, "language_loss": 0.84862113, "learning_rate": 3.4560483073610233e-06, "loss": 0.86814094, "num_input_tokens_seen": 94636575, "router_z_loss_clip": 3.48242188, "router_z_loss_mlp": 0.39819336, "step": 4378, "time_per_iteration": 2.9192392826080322 }, { "auxiliary_loss_clip": 0.01593389, "auxiliary_loss_mlp": 0.00349063, "balance_loss_clip": 1.2545321, "balance_loss_mlp": 0.31172633, "epoch": 0.26327972343303774, "flos": 13733031041280.0, "grad_norm": 2.25301397337326, "language_loss": 0.83861887, "learning_rate": 3.455781283723846e-06, "loss": 0.85804343, "num_input_tokens_seen": 94654345, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.37353516, "step": 4379, "time_per_iteration": 4.116482496261597 }, { "auxiliary_loss_clip": 0.01622029, "auxiliary_loss_mlp": 0.00384969, "balance_loss_clip": 1.26492608, "balance_loss_mlp": 0.34205407, "epoch": 0.2633398466857057, "flos": 23769057732480.0, "grad_norm": 18.9534499346279, "language_loss": 0.85572368, "learning_rate": 3.4555142048832975e-06, "loss": 0.8757937, "num_input_tokens_seen": 94673985, "router_z_loss_clip": 3.5703125, "router_z_loss_mlp": 0.42895508, "step": 4380, "time_per_iteration": 2.691673755645752 }, { "auxiliary_loss_clip": 0.01594592, "auxiliary_loss_mlp": 0.00338465, "balance_loss_clip": 1.24240088, "balance_loss_mlp": 0.29969791, "epoch": 0.26339996993837367, "flos": 27600223380480.0, "grad_norm": 5.60441707904213, "language_loss": 0.71773791, "learning_rate": 3.4552470708495036e-06, "loss": 0.73706853, "num_input_tokens_seen": 94693145, "router_z_loss_clip": 3.5234375, "router_z_loss_mlp": 0.38769531, "step": 4381, "time_per_iteration": 2.6739449501037598 }, { "auxiliary_loss_clip": 0.01592409, "auxiliary_loss_mlp": 0.00330871, "balance_loss_clip": 1.24904776, "balance_loss_mlp": 0.28983968, "epoch": 0.26346009319104163, "flos": 16946317912320.0, "grad_norm": 4.409276731995388, "language_loss": 0.88812619, "learning_rate": 3.454979881632595e-06, "loss": 0.907359, "num_input_tokens_seen": 94710185, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.41040039, "step": 4382, "time_per_iteration": 2.6391866207122803 }, { "auxiliary_loss_clip": 0.01604257, "auxiliary_loss_mlp": 0.00384775, "balance_loss_clip": 1.25388157, "balance_loss_mlp": 0.34195471, "epoch": 0.2635202164437096, "flos": 37232218915200.0, "grad_norm": 80.48621024341162, "language_loss": 0.76218367, "learning_rate": 3.4547126372427035e-06, "loss": 0.78207397, "num_input_tokens_seen": 94730280, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.42797852, "step": 4383, "time_per_iteration": 2.755786895751953 }, { "auxiliary_loss_clip": 0.01603106, "auxiliary_loss_mlp": 0.00362329, "balance_loss_clip": 1.25436664, "balance_loss_mlp": 0.32461131, "epoch": 0.26358033969637756, "flos": 20996359084800.0, "grad_norm": 11.457913830100518, "language_loss": 0.74340796, "learning_rate": 3.4544453376899638e-06, "loss": 0.76306236, "num_input_tokens_seen": 94748560, "router_z_loss_clip": 3.48632812, "router_z_loss_mlp": 0.37744141, "step": 4384, "time_per_iteration": 2.634168863296509 }, { "auxiliary_loss_clip": 0.01588677, "auxiliary_loss_mlp": 0.00352312, "balance_loss_clip": 1.24986625, "balance_loss_mlp": 0.31254393, "epoch": 0.26364046294904553, "flos": 27746092512000.0, "grad_norm": 10.605461962296028, "language_loss": 0.75145555, "learning_rate": 3.45417798298451e-06, "loss": 0.77086538, "num_input_tokens_seen": 94767570, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.39770508, "step": 4385, "time_per_iteration": 2.6744024753570557 }, { "auxiliary_loss_clip": 0.01593112, "auxiliary_loss_mlp": 0.0037309, "balance_loss_clip": 1.25106788, "balance_loss_mlp": 0.33317852, "epoch": 0.2637005862017135, "flos": 22893088757760.0, "grad_norm": 1.8728132422185797, "language_loss": 0.88271332, "learning_rate": 3.453910573136482e-06, "loss": 0.90237534, "num_input_tokens_seen": 94784985, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.39916992, "step": 4386, "time_per_iteration": 2.6595780849456787 }, { "auxiliary_loss_clip": 0.01588997, "auxiliary_loss_mlp": 0.003341, "balance_loss_clip": 1.25372493, "balance_loss_mlp": 0.29697776, "epoch": 0.26376070945438146, "flos": 15048834053760.0, "grad_norm": 22.7332080326983, "language_loss": 0.84224373, "learning_rate": 3.4536431081560196e-06, "loss": 0.86147463, "num_input_tokens_seen": 94802545, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.37109375, "step": 4387, "time_per_iteration": 2.624607563018799 }, { "auxiliary_loss_clip": 0.01588959, "auxiliary_loss_mlp": 0.00346927, "balance_loss_clip": 1.2509594, "balance_loss_mlp": 0.30878007, "epoch": 0.2638208327070494, "flos": 21141833166720.0, "grad_norm": 22.939609459723986, "language_loss": 0.81708795, "learning_rate": 3.453375588053264e-06, "loss": 0.83644676, "num_input_tokens_seen": 94820730, "router_z_loss_clip": 3.3828125, "router_z_loss_mlp": 0.3815918, "step": 4388, "time_per_iteration": 2.679506540298462 }, { "auxiliary_loss_clip": 0.01570917, "auxiliary_loss_mlp": 0.00349363, "balance_loss_clip": 1.24104393, "balance_loss_mlp": 0.31028616, "epoch": 0.26388095595971744, "flos": 21725597001600.0, "grad_norm": 24.366060921065614, "language_loss": 0.92738712, "learning_rate": 3.4531080128383617e-06, "loss": 0.94658995, "num_input_tokens_seen": 94839175, "router_z_loss_clip": 3.29882812, "router_z_loss_mlp": 0.390625, "step": 4389, "time_per_iteration": 2.653801441192627 }, { "auxiliary_loss_clip": 0.01391482, "auxiliary_loss_mlp": 0.000795, "balance_loss_clip": 1.18577051, "balance_loss_mlp": 0.06891456, "epoch": 0.2639410792123854, "flos": 65515537192320.0, "grad_norm": 0.8013620600417956, "language_loss": 0.60044217, "learning_rate": 3.452840382521457e-06, "loss": 0.615152, "num_input_tokens_seen": 94898865, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.10595703, "step": 4390, "time_per_iteration": 3.158923625946045 }, { "auxiliary_loss_clip": 0.01594783, "auxiliary_loss_mlp": 0.00364634, "balance_loss_clip": 1.24718022, "balance_loss_mlp": 0.32481843, "epoch": 0.2640012024650534, "flos": 23948574929280.0, "grad_norm": 9.140828030675603, "language_loss": 0.82767463, "learning_rate": 3.4525726971127e-06, "loss": 0.84726882, "num_input_tokens_seen": 94917490, "router_z_loss_clip": 3.4765625, "router_z_loss_mlp": 0.3984375, "step": 4391, "time_per_iteration": 2.7188563346862793 }, { "auxiliary_loss_clip": 0.0138959, "auxiliary_loss_mlp": 0.00076307, "balance_loss_clip": 1.18095422, "balance_loss_mlp": 0.06314629, "epoch": 0.26406132571772134, "flos": 56441163369600.0, "grad_norm": 0.9164135787524301, "language_loss": 0.58860999, "learning_rate": 3.45230495662224e-06, "loss": 0.60326898, "num_input_tokens_seen": 94969065, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.13183594, "step": 4392, "time_per_iteration": 3.118898630142212 }, { "auxiliary_loss_clip": 0.01583723, "auxiliary_loss_mlp": 0.00383535, "balance_loss_clip": 1.24660981, "balance_loss_mlp": 0.34257454, "epoch": 0.2641214489703893, "flos": 22090557139200.0, "grad_norm": 171.42948722094826, "language_loss": 0.76012051, "learning_rate": 3.4520371610602306e-06, "loss": 0.77979308, "num_input_tokens_seen": 94988540, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.40966797, "step": 4393, "time_per_iteration": 2.661250352859497 }, { "auxiliary_loss_clip": 0.01623831, "auxiliary_loss_mlp": 0.00384944, "balance_loss_clip": 1.2655344, "balance_loss_mlp": 0.34126598, "epoch": 0.26418157222305727, "flos": 16544764794240.0, "grad_norm": 11.708284844087663, "language_loss": 0.91368806, "learning_rate": 3.4517693104368267e-06, "loss": 0.93377578, "num_input_tokens_seen": 95004810, "router_z_loss_clip": 3.58203125, "router_z_loss_mlp": 0.43676758, "step": 4394, "time_per_iteration": 2.6276464462280273 }, { "auxiliary_loss_clip": 0.01600912, "auxiliary_loss_mlp": 0.00402194, "balance_loss_clip": 1.25420022, "balance_loss_mlp": 0.35656053, "epoch": 0.26424169547572524, "flos": 18002486442240.0, "grad_norm": 1482.765293011046, "language_loss": 0.77639329, "learning_rate": 3.4515014047621856e-06, "loss": 0.79642433, "num_input_tokens_seen": 95024085, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.45629883, "step": 4395, "time_per_iteration": 2.619351625442505 }, { "auxiliary_loss_clip": 0.01588415, "auxiliary_loss_mlp": 0.00340152, "balance_loss_clip": 1.25052786, "balance_loss_mlp": 0.30238697, "epoch": 0.2643018187283932, "flos": 16983162288000.0, "grad_norm": 6.6543545038175225, "language_loss": 0.94116664, "learning_rate": 3.4512334440464655e-06, "loss": 0.96045232, "num_input_tokens_seen": 95042515, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.37768555, "step": 4396, "time_per_iteration": 2.6286208629608154 }, { "auxiliary_loss_clip": 0.01428902, "auxiliary_loss_mlp": 0.00104129, "balance_loss_clip": 1.2148546, "balance_loss_mlp": 0.09483024, "epoch": 0.26436194198106117, "flos": 59664359416320.0, "grad_norm": 51.951619106525534, "language_loss": 0.5497418, "learning_rate": 3.4509654282998277e-06, "loss": 0.56507212, "num_input_tokens_seen": 95094835, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.09277344, "step": 4397, "time_per_iteration": 2.955902576446533 }, { "auxiliary_loss_clip": 0.01610627, "auxiliary_loss_mlp": 0.00362872, "balance_loss_clip": 1.26934361, "balance_loss_mlp": 0.3245101, "epoch": 0.26442206523372913, "flos": 32921322197760.0, "grad_norm": 12.10005954139989, "language_loss": 0.84692836, "learning_rate": 3.450697357532435e-06, "loss": 0.8666634, "num_input_tokens_seen": 95113480, "router_z_loss_clip": 3.41210938, "router_z_loss_mlp": 0.38354492, "step": 4398, "time_per_iteration": 2.7408740520477295 }, { "auxiliary_loss_clip": 0.01623277, "auxiliary_loss_mlp": 0.00371295, "balance_loss_clip": 1.27660108, "balance_loss_mlp": 0.33064455, "epoch": 0.2644821884863971, "flos": 21031300039680.0, "grad_norm": 69.63306269543864, "language_loss": 0.72652328, "learning_rate": 3.4504292317544534e-06, "loss": 0.74646902, "num_input_tokens_seen": 95132580, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.40625, "step": 4399, "time_per_iteration": 2.647465705871582 }, { "auxiliary_loss_clip": 0.01605932, "auxiliary_loss_mlp": 0.0035963, "balance_loss_clip": 1.27070642, "balance_loss_mlp": 0.3201957, "epoch": 0.26454231173906506, "flos": 20776801201920.0, "grad_norm": 11.332896086100925, "language_loss": 0.91432726, "learning_rate": 3.4501610509760504e-06, "loss": 0.93398285, "num_input_tokens_seen": 95152375, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.39453125, "step": 4400, "time_per_iteration": 2.6431796550750732 }, { "auxiliary_loss_clip": 0.01630613, "auxiliary_loss_mlp": 0.00354536, "balance_loss_clip": 1.27521586, "balance_loss_mlp": 0.31469631, "epoch": 0.264602434991733, "flos": 16618669027200.0, "grad_norm": 1.618481801358154, "language_loss": 0.82527089, "learning_rate": 3.4498928152073944e-06, "loss": 0.8451224, "num_input_tokens_seen": 95170265, "router_z_loss_clip": 3.55273438, "router_z_loss_mlp": 0.39892578, "step": 4401, "time_per_iteration": 2.6326613426208496 }, { "auxiliary_loss_clip": 0.01647247, "auxiliary_loss_mlp": 0.00358207, "balance_loss_clip": 1.29132462, "balance_loss_mlp": 0.31791455, "epoch": 0.26466255824440105, "flos": 19062677295360.0, "grad_norm": 102.91629804216895, "language_loss": 0.941486, "learning_rate": 3.4496245244586577e-06, "loss": 0.96154058, "num_input_tokens_seen": 95188655, "router_z_loss_clip": 3.55859375, "router_z_loss_mlp": 0.40307617, "step": 4402, "time_per_iteration": 2.655700922012329 }, { "auxiliary_loss_clip": 0.01635588, "auxiliary_loss_mlp": 0.00383494, "balance_loss_clip": 1.28456175, "balance_loss_mlp": 0.34384471, "epoch": 0.264722681497069, "flos": 22638554006400.0, "grad_norm": 5.817912925504186, "language_loss": 0.82817233, "learning_rate": 3.4493561787400137e-06, "loss": 0.84836316, "num_input_tokens_seen": 95209615, "router_z_loss_clip": 3.51367188, "router_z_loss_mlp": 0.39624023, "step": 4403, "time_per_iteration": 2.7161617279052734 }, { "auxiliary_loss_clip": 0.01621117, "auxiliary_loss_mlp": 0.00361903, "balance_loss_clip": 1.27246547, "balance_loss_mlp": 0.32292202, "epoch": 0.264782804749737, "flos": 22492253911680.0, "grad_norm": 7.951867323753539, "language_loss": 0.95772052, "learning_rate": 3.4490877780616387e-06, "loss": 0.97755075, "num_input_tokens_seen": 95227810, "router_z_loss_clip": 3.48632812, "router_z_loss_mlp": 0.3894043, "step": 4404, "time_per_iteration": 2.703840970993042 }, { "auxiliary_loss_clip": 0.01647617, "auxiliary_loss_mlp": 0.00380297, "balance_loss_clip": 1.29062009, "balance_loss_mlp": 0.3413631, "epoch": 0.26484292800240494, "flos": 16800269212800.0, "grad_norm": 154.77053076397786, "language_loss": 0.82240272, "learning_rate": 3.448819322433709e-06, "loss": 0.84268188, "num_input_tokens_seen": 95245890, "router_z_loss_clip": 3.5703125, "router_z_loss_mlp": 0.38964844, "step": 4405, "time_per_iteration": 2.66880202293396 }, { "auxiliary_loss_clip": 0.01665182, "auxiliary_loss_mlp": 0.00376446, "balance_loss_clip": 1.30851531, "balance_loss_mlp": 0.33565244, "epoch": 0.2649030512550729, "flos": 20449583280000.0, "grad_norm": 6.358367720175698, "language_loss": 0.76658416, "learning_rate": 3.4485508118664066e-06, "loss": 0.78700048, "num_input_tokens_seen": 95264955, "router_z_loss_clip": 3.56640625, "router_z_loss_mlp": 0.40771484, "step": 4406, "time_per_iteration": 4.046116828918457 }, { "auxiliary_loss_clip": 0.01644135, "auxiliary_loss_mlp": 0.00364583, "balance_loss_clip": 1.29360342, "balance_loss_mlp": 0.32715079, "epoch": 0.2649631745077409, "flos": 22416123035520.0, "grad_norm": 4.451931143264114, "language_loss": 0.89456713, "learning_rate": 3.448282246369912e-06, "loss": 0.91465431, "num_input_tokens_seen": 95284245, "router_z_loss_clip": 3.50390625, "router_z_loss_mlp": 0.37426758, "step": 4407, "time_per_iteration": 2.6576969623565674 }, { "auxiliary_loss_clip": 0.01661058, "auxiliary_loss_mlp": 0.00373256, "balance_loss_clip": 1.30885744, "balance_loss_mlp": 0.3305552, "epoch": 0.26502329776040884, "flos": 35116110927360.0, "grad_norm": 13.79497465849659, "language_loss": 0.81820953, "learning_rate": 3.4480136259544084e-06, "loss": 0.83855265, "num_input_tokens_seen": 95307125, "router_z_loss_clip": 3.5234375, "router_z_loss_mlp": 0.42700195, "step": 4408, "time_per_iteration": 2.768663167953491 }, { "auxiliary_loss_clip": 0.01665107, "auxiliary_loss_mlp": 0.00341423, "balance_loss_clip": 1.31174171, "balance_loss_mlp": 0.30129707, "epoch": 0.2650834210130768, "flos": 38687498438400.0, "grad_norm": 65.89275684587346, "language_loss": 0.75916332, "learning_rate": 3.447744950630084e-06, "loss": 0.77922857, "num_input_tokens_seen": 95329150, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.40136719, "step": 4409, "time_per_iteration": 2.782188892364502 }, { "auxiliary_loss_clip": 0.01649175, "auxiliary_loss_mlp": 0.00365, "balance_loss_clip": 1.30138433, "balance_loss_mlp": 0.32260871, "epoch": 0.26514354426574477, "flos": 24716847951360.0, "grad_norm": 2.298126350193679, "language_loss": 0.79863763, "learning_rate": 3.4474762204071253e-06, "loss": 0.81877935, "num_input_tokens_seen": 95349880, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.42407227, "step": 4410, "time_per_iteration": 4.194639682769775 }, { "auxiliary_loss_clip": 0.01661276, "auxiliary_loss_mlp": 0.00370911, "balance_loss_clip": 1.30788457, "balance_loss_mlp": 0.33304971, "epoch": 0.26520366751841273, "flos": 20340055733760.0, "grad_norm": 3016.7396988165387, "language_loss": 0.79149592, "learning_rate": 3.4472074352957244e-06, "loss": 0.81181777, "num_input_tokens_seen": 95368570, "router_z_loss_clip": 3.53515625, "router_z_loss_mlp": 0.37866211, "step": 4411, "time_per_iteration": 2.685380220413208 }, { "auxiliary_loss_clip": 0.01665414, "auxiliary_loss_mlp": 0.00365644, "balance_loss_clip": 1.3137666, "balance_loss_mlp": 0.32575637, "epoch": 0.2652637907710807, "flos": 22343870828160.0, "grad_norm": 6.344420228011144, "language_loss": 0.86301613, "learning_rate": 3.446938595306071e-06, "loss": 0.88332671, "num_input_tokens_seen": 95387065, "router_z_loss_clip": 3.515625, "router_z_loss_mlp": 0.39892578, "step": 4412, "time_per_iteration": 2.684338092803955 }, { "auxiliary_loss_clip": 0.01662976, "auxiliary_loss_mlp": 0.00381177, "balance_loss_clip": 1.31664872, "balance_loss_mlp": 0.33845234, "epoch": 0.26532391402374866, "flos": 19354235990400.0, "grad_norm": 19.09315943351729, "language_loss": 0.81205696, "learning_rate": 3.4466697004483622e-06, "loss": 0.83249843, "num_input_tokens_seen": 95406345, "router_z_loss_clip": 3.46289062, "router_z_loss_mlp": 0.42724609, "step": 4413, "time_per_iteration": 4.190816164016724 }, { "auxiliary_loss_clip": 0.01560515, "auxiliary_loss_mlp": 0.00081885, "balance_loss_clip": 1.34129024, "balance_loss_mlp": 0.07220526, "epoch": 0.26538403727641663, "flos": 44787611422080.0, "grad_norm": 2.6152054337818296, "language_loss": 0.56897759, "learning_rate": 3.446400750732793e-06, "loss": 0.58540159, "num_input_tokens_seen": 95463595, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.09667969, "step": 4414, "time_per_iteration": 3.086897134780884 }, { "auxiliary_loss_clip": 0.01670697, "auxiliary_loss_mlp": 0.00316586, "balance_loss_clip": 1.32437074, "balance_loss_mlp": 0.28335014, "epoch": 0.26544416052908465, "flos": 28182119708160.0, "grad_norm": 4.432633040028551, "language_loss": 0.7979036, "learning_rate": 3.4461317461695625e-06, "loss": 0.81777644, "num_input_tokens_seen": 95484115, "router_z_loss_clip": 3.46289062, "router_z_loss_mlp": 0.33203125, "step": 4415, "time_per_iteration": 2.7279345989227295 }, { "auxiliary_loss_clip": 0.01706293, "auxiliary_loss_mlp": 0.00341173, "balance_loss_clip": 1.34350157, "balance_loss_mlp": 0.29904473, "epoch": 0.2655042837817526, "flos": 17565274097280.0, "grad_norm": 3.5749072695502946, "language_loss": 0.93746585, "learning_rate": 3.4458626867688707e-06, "loss": 0.95794052, "num_input_tokens_seen": 95501435, "router_z_loss_clip": 3.62890625, "router_z_loss_mlp": 0.42138672, "step": 4416, "time_per_iteration": 2.6295835971832275 }, { "auxiliary_loss_clip": 0.01679806, "auxiliary_loss_mlp": 0.00394389, "balance_loss_clip": 1.33170605, "balance_loss_mlp": 0.35283297, "epoch": 0.2655644070344206, "flos": 23404636298880.0, "grad_norm": 33.0397158480753, "language_loss": 0.82156336, "learning_rate": 3.4455935725409217e-06, "loss": 0.84230536, "num_input_tokens_seen": 95520135, "router_z_loss_clip": 3.48242188, "router_z_loss_mlp": 0.41601562, "step": 4417, "time_per_iteration": 2.722395896911621 }, { "auxiliary_loss_clip": 0.01673118, "auxiliary_loss_mlp": 0.00365148, "balance_loss_clip": 1.32694411, "balance_loss_mlp": 0.32695347, "epoch": 0.26562453028708854, "flos": 26468462678400.0, "grad_norm": 18.254020775525536, "language_loss": 0.85134661, "learning_rate": 3.4453244034959196e-06, "loss": 0.87172925, "num_input_tokens_seen": 95541705, "router_z_loss_clip": 3.46289062, "router_z_loss_mlp": 0.38183594, "step": 4418, "time_per_iteration": 2.802130937576294 }, { "auxiliary_loss_clip": 0.01686134, "auxiliary_loss_mlp": 0.00381115, "balance_loss_clip": 1.3326745, "balance_loss_mlp": 0.34096509, "epoch": 0.2656846535397565, "flos": 19207576759680.0, "grad_norm": 1338.003638675426, "language_loss": 0.74313211, "learning_rate": 3.445055179644071e-06, "loss": 0.76380467, "num_input_tokens_seen": 95560300, "router_z_loss_clip": 3.53125, "router_z_loss_mlp": 0.40161133, "step": 4419, "time_per_iteration": 2.642695188522339 }, { "auxiliary_loss_clip": 0.01695314, "auxiliary_loss_mlp": 0.00395932, "balance_loss_clip": 1.33759522, "balance_loss_mlp": 0.35227793, "epoch": 0.2657447767924245, "flos": 30551325903360.0, "grad_norm": 1.9715654798980515, "language_loss": 0.83187604, "learning_rate": 3.444785900995585e-06, "loss": 0.85278857, "num_input_tokens_seen": 95580150, "router_z_loss_clip": 3.57617188, "router_z_loss_mlp": 0.43603516, "step": 4420, "time_per_iteration": 2.768308401107788 }, { "auxiliary_loss_clip": 0.0169135, "auxiliary_loss_mlp": 0.00390772, "balance_loss_clip": 1.33756638, "balance_loss_mlp": 0.34690344, "epoch": 0.26580490004509244, "flos": 20922742160640.0, "grad_norm": 66.77281440717351, "language_loss": 0.88456047, "learning_rate": 3.444516567560673e-06, "loss": 0.9053818, "num_input_tokens_seen": 95597570, "router_z_loss_clip": 3.53515625, "router_z_loss_mlp": 0.43823242, "step": 4421, "time_per_iteration": 2.645838975906372 }, { "auxiliary_loss_clip": 0.01691712, "auxiliary_loss_mlp": 0.00380703, "balance_loss_clip": 1.34299827, "balance_loss_mlp": 0.34036285, "epoch": 0.2658650232977604, "flos": 43945682584320.0, "grad_norm": 27.242346534237473, "language_loss": 0.71819156, "learning_rate": 3.444247179349548e-06, "loss": 0.73891568, "num_input_tokens_seen": 95619415, "router_z_loss_clip": 3.48828125, "router_z_loss_mlp": 0.40380859, "step": 4422, "time_per_iteration": 4.398077487945557 }, { "auxiliary_loss_clip": 0.01713157, "auxiliary_loss_mlp": 0.003808, "balance_loss_clip": 1.35866356, "balance_loss_mlp": 0.34262967, "epoch": 0.26592514655042837, "flos": 29716439109120.0, "grad_norm": 3.2531377722438743, "language_loss": 0.81198126, "learning_rate": 3.4439777363724252e-06, "loss": 0.83292079, "num_input_tokens_seen": 95639155, "router_z_loss_clip": 3.54296875, "router_z_loss_mlp": 0.3815918, "step": 4423, "time_per_iteration": 2.691225290298462 }, { "auxiliary_loss_clip": 0.0171078, "auxiliary_loss_mlp": 0.00359497, "balance_loss_clip": 1.35023379, "balance_loss_mlp": 0.32156426, "epoch": 0.26598526980309634, "flos": 46677730014720.0, "grad_norm": 107.92113502015309, "language_loss": 0.83775306, "learning_rate": 3.443708238639522e-06, "loss": 0.85845584, "num_input_tokens_seen": 95663320, "router_z_loss_clip": 3.61132812, "router_z_loss_mlp": 0.37890625, "step": 4424, "time_per_iteration": 2.865556240081787 }, { "auxiliary_loss_clip": 0.01720516, "auxiliary_loss_mlp": 0.00370531, "balance_loss_clip": 1.35926199, "balance_loss_mlp": 0.33028615, "epoch": 0.2660453930557643, "flos": 11509442582400.0, "grad_norm": 2.8169754673398706, "language_loss": 0.8712703, "learning_rate": 3.4434386861610573e-06, "loss": 0.89218074, "num_input_tokens_seen": 95680260, "router_z_loss_clip": 3.61328125, "router_z_loss_mlp": 0.40185547, "step": 4425, "time_per_iteration": 2.717451572418213 }, { "auxiliary_loss_clip": 0.01707991, "auxiliary_loss_mlp": 0.00346806, "balance_loss_clip": 1.34944868, "balance_loss_mlp": 0.31073329, "epoch": 0.26610551630843227, "flos": 24791578197120.0, "grad_norm": 29.326340744807283, "language_loss": 0.87234187, "learning_rate": 3.4431690789472532e-06, "loss": 0.89288986, "num_input_tokens_seen": 95701140, "router_z_loss_clip": 3.58398438, "router_z_loss_mlp": 0.36108398, "step": 4426, "time_per_iteration": 2.7240235805511475 }, { "auxiliary_loss_clip": 0.01722144, "auxiliary_loss_mlp": 0.00396775, "balance_loss_clip": 1.35879016, "balance_loss_mlp": 0.3560057, "epoch": 0.26616563956110023, "flos": 27636385397760.0, "grad_norm": 10.245466372152071, "language_loss": 0.81480241, "learning_rate": 3.442899417008333e-06, "loss": 0.83599162, "num_input_tokens_seen": 95722060, "router_z_loss_clip": 3.63476562, "router_z_loss_mlp": 0.40795898, "step": 4427, "time_per_iteration": 2.719569206237793 }, { "auxiliary_loss_clip": 0.01732347, "auxiliary_loss_mlp": 0.00372393, "balance_loss_clip": 1.37612939, "balance_loss_mlp": 0.33388823, "epoch": 0.26622576281376825, "flos": 28362893880960.0, "grad_norm": 134.32217586471037, "language_loss": 0.81191504, "learning_rate": 3.4426297003545227e-06, "loss": 0.83296239, "num_input_tokens_seen": 95742495, "router_z_loss_clip": 3.56640625, "router_z_loss_mlp": 0.38500977, "step": 4428, "time_per_iteration": 2.7029848098754883 }, { "auxiliary_loss_clip": 0.01729139, "auxiliary_loss_mlp": 0.00399247, "balance_loss_clip": 1.3653276, "balance_loss_mlp": 0.361696, "epoch": 0.2662858860664362, "flos": 18041341979520.0, "grad_norm": 3.559619986218037, "language_loss": 0.90190566, "learning_rate": 3.4423599289960495e-06, "loss": 0.92318952, "num_input_tokens_seen": 95761510, "router_z_loss_clip": 3.63671875, "router_z_loss_mlp": 0.37548828, "step": 4429, "time_per_iteration": 2.6381616592407227 }, { "auxiliary_loss_clip": 0.01764834, "auxiliary_loss_mlp": 0.00371279, "balance_loss_clip": 1.3990221, "balance_loss_mlp": 0.3300091, "epoch": 0.2663460093191042, "flos": 22745818995840.0, "grad_norm": 14.965997464731696, "language_loss": 0.78619695, "learning_rate": 3.442090102943143e-06, "loss": 0.80755806, "num_input_tokens_seen": 95782385, "router_z_loss_clip": 3.66210938, "router_z_loss_mlp": 0.41259766, "step": 4430, "time_per_iteration": 2.7359862327575684 }, { "auxiliary_loss_clip": 0.01748496, "auxiliary_loss_mlp": 0.00388265, "balance_loss_clip": 1.38243437, "balance_loss_mlp": 0.34725672, "epoch": 0.26640613257177215, "flos": 16508782344960.0, "grad_norm": 20.40977637072055, "language_loss": 0.89106119, "learning_rate": 3.441820222206035e-06, "loss": 0.91242874, "num_input_tokens_seen": 95800595, "router_z_loss_clip": 3.66015625, "router_z_loss_mlp": 0.41064453, "step": 4431, "time_per_iteration": 2.633929491043091 }, { "auxiliary_loss_clip": 0.01754074, "auxiliary_loss_mlp": 0.00381877, "balance_loss_clip": 1.37961435, "balance_loss_mlp": 0.34196621, "epoch": 0.2664662558244401, "flos": 23075945919360.0, "grad_norm": 3.6592495184331066, "language_loss": 0.83306289, "learning_rate": 3.44155028679496e-06, "loss": 0.85442233, "num_input_tokens_seen": 95818480, "router_z_loss_clip": 3.74414062, "router_z_loss_mlp": 0.39916992, "step": 4432, "time_per_iteration": 2.6431877613067627 }, { "auxiliary_loss_clip": 0.01759343, "auxiliary_loss_mlp": 0.00406631, "balance_loss_clip": 1.39300656, "balance_loss_mlp": 0.36576644, "epoch": 0.2665263790771081, "flos": 23769273214080.0, "grad_norm": 13.483417166048925, "language_loss": 0.87939644, "learning_rate": 3.441280296720154e-06, "loss": 0.90105623, "num_input_tokens_seen": 95837205, "router_z_loss_clip": 3.66210938, "router_z_loss_mlp": 0.40869141, "step": 4433, "time_per_iteration": 2.7074031829833984 }, { "auxiliary_loss_clip": 0.01767544, "auxiliary_loss_mlp": 0.00396169, "balance_loss_clip": 1.40131783, "balance_loss_mlp": 0.35775954, "epoch": 0.26658650232977604, "flos": 28001273708160.0, "grad_norm": 12.273698615442134, "language_loss": 0.82364643, "learning_rate": 3.441010251991854e-06, "loss": 0.84528351, "num_input_tokens_seen": 95858395, "router_z_loss_clip": 3.66601562, "router_z_loss_mlp": 0.38427734, "step": 4434, "time_per_iteration": 2.7192370891571045 }, { "auxiliary_loss_clip": 0.01765668, "auxiliary_loss_mlp": 0.00337562, "balance_loss_clip": 1.40214968, "balance_loss_mlp": 0.30065489, "epoch": 0.266646625582444, "flos": 22163635359360.0, "grad_norm": 194.27621476208552, "language_loss": 0.89281571, "learning_rate": 3.440740152620301e-06, "loss": 0.91384798, "num_input_tokens_seen": 95877875, "router_z_loss_clip": 3.63476562, "router_z_loss_mlp": 0.36889648, "step": 4435, "time_per_iteration": 2.686112642288208 }, { "auxiliary_loss_clip": 0.01779304, "auxiliary_loss_mlp": 0.0036657, "balance_loss_clip": 1.40237665, "balance_loss_mlp": 0.32274842, "epoch": 0.266706748835112, "flos": 27853537069440.0, "grad_norm": 5.654495295303697, "language_loss": 0.93507802, "learning_rate": 3.4404699986157376e-06, "loss": 0.95653677, "num_input_tokens_seen": 95895820, "router_z_loss_clip": 3.77148438, "router_z_loss_mlp": 0.43847656, "step": 4436, "time_per_iteration": 2.739887237548828 }, { "auxiliary_loss_clip": 0.01743841, "auxiliary_loss_mlp": 0.00360209, "balance_loss_clip": 1.38872135, "balance_loss_mlp": 0.32151347, "epoch": 0.26676687208777994, "flos": 25812123413760.0, "grad_norm": 14.256208074931841, "language_loss": 0.82260609, "learning_rate": 3.440199789988407e-06, "loss": 0.84364659, "num_input_tokens_seen": 95918025, "router_z_loss_clip": 3.55078125, "router_z_loss_mlp": 0.38696289, "step": 4437, "time_per_iteration": 2.7306854724884033 }, { "auxiliary_loss_clip": 0.0175288, "auxiliary_loss_mlp": 0.00394381, "balance_loss_clip": 1.39313245, "balance_loss_mlp": 0.3524195, "epoch": 0.2668269953404479, "flos": 36064583504640.0, "grad_norm": 7.619170753496409, "language_loss": 0.73111832, "learning_rate": 3.439929526748556e-06, "loss": 0.75259089, "num_input_tokens_seen": 95937725, "router_z_loss_clip": 3.59960938, "router_z_loss_mlp": 0.41967773, "step": 4438, "time_per_iteration": 2.800873041152954 }, { "auxiliary_loss_clip": 0.01753735, "auxiliary_loss_mlp": 0.00397802, "balance_loss_clip": 1.39476538, "balance_loss_mlp": 0.35650772, "epoch": 0.26688711859311587, "flos": 26570987072640.0, "grad_norm": 93.20551592953444, "language_loss": 0.81169271, "learning_rate": 3.4396592089064334e-06, "loss": 0.83320808, "num_input_tokens_seen": 95956335, "router_z_loss_clip": 3.58789062, "router_z_loss_mlp": 0.41308594, "step": 4439, "time_per_iteration": 2.7216665744781494 }, { "auxiliary_loss_clip": 0.01786916, "auxiliary_loss_mlp": 0.00360374, "balance_loss_clip": 1.41924095, "balance_loss_mlp": 0.32012898, "epoch": 0.26694724184578383, "flos": 26761565658240.0, "grad_norm": 11.122130703391738, "language_loss": 0.76121038, "learning_rate": 3.4393888364722897e-06, "loss": 0.78268325, "num_input_tokens_seen": 95977135, "router_z_loss_clip": 3.67578125, "router_z_loss_mlp": 0.40234375, "step": 4440, "time_per_iteration": 2.7328786849975586 }, { "auxiliary_loss_clip": 0.0175465, "auxiliary_loss_mlp": 0.00359095, "balance_loss_clip": 1.39617252, "balance_loss_mlp": 0.32025683, "epoch": 0.2670073650984518, "flos": 20959586536320.0, "grad_norm": 7.144158743098327, "language_loss": 0.74710035, "learning_rate": 3.439118409456376e-06, "loss": 0.76823771, "num_input_tokens_seen": 95995435, "router_z_loss_clip": 3.58789062, "router_z_loss_mlp": 0.38867188, "step": 4441, "time_per_iteration": 2.6313068866729736 }, { "auxiliary_loss_clip": 0.01786913, "auxiliary_loss_mlp": 0.00347142, "balance_loss_clip": 1.42233169, "balance_loss_mlp": 0.30832708, "epoch": 0.2670674883511198, "flos": 28366054277760.0, "grad_norm": 4.910556405614218, "language_loss": 0.8046118, "learning_rate": 3.4388479278689486e-06, "loss": 0.82595229, "num_input_tokens_seen": 96016340, "router_z_loss_clip": 3.65039062, "router_z_loss_mlp": 0.38818359, "step": 4442, "time_per_iteration": 2.736863613128662 }, { "auxiliary_loss_clip": 0.01572611, "auxiliary_loss_mlp": 0.00081311, "balance_loss_clip": 1.33654523, "balance_loss_mlp": 0.07134517, "epoch": 0.2671276116037878, "flos": 58971319430400.0, "grad_norm": 77.75508826375645, "language_loss": 0.6122694, "learning_rate": 3.4385773917202637e-06, "loss": 0.62880862, "num_input_tokens_seen": 96071205, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.09960938, "step": 4443, "time_per_iteration": 3.0510714054107666 }, { "auxiliary_loss_clip": 0.01781453, "auxiliary_loss_mlp": 0.0033858, "balance_loss_clip": 1.41875398, "balance_loss_mlp": 0.30079019, "epoch": 0.26718773485645575, "flos": 43945072053120.0, "grad_norm": 957.5126358590235, "language_loss": 0.81968659, "learning_rate": 3.4383068010205793e-06, "loss": 0.84088689, "num_input_tokens_seen": 96094240, "router_z_loss_clip": 3.62695312, "router_z_loss_mlp": 0.37768555, "step": 4444, "time_per_iteration": 2.847710609436035 }, { "auxiliary_loss_clip": 0.01775713, "auxiliary_loss_mlp": 0.00362103, "balance_loss_clip": 1.41267347, "balance_loss_mlp": 0.32111874, "epoch": 0.2672478581091237, "flos": 25228323665280.0, "grad_norm": 186.15953464878817, "language_loss": 0.85374832, "learning_rate": 3.438036155780158e-06, "loss": 0.87512648, "num_input_tokens_seen": 96114105, "router_z_loss_clip": 3.63085938, "router_z_loss_mlp": 0.40966797, "step": 4445, "time_per_iteration": 2.6781675815582275 }, { "auxiliary_loss_clip": 0.01790274, "auxiliary_loss_mlp": 0.0034569, "balance_loss_clip": 1.42550921, "balance_loss_mlp": 0.30835325, "epoch": 0.2673079813617917, "flos": 15268176455040.0, "grad_norm": 4.288001138192591, "language_loss": 0.96481597, "learning_rate": 3.43776545600926e-06, "loss": 0.9861756, "num_input_tokens_seen": 96132140, "router_z_loss_clip": 3.65039062, "router_z_loss_mlp": 0.37329102, "step": 4446, "time_per_iteration": 2.606088399887085 }, { "auxiliary_loss_clip": 0.0178934, "auxiliary_loss_mlp": 0.0034773, "balance_loss_clip": 1.42279673, "balance_loss_mlp": 0.31199145, "epoch": 0.26736810461445965, "flos": 25812733944960.0, "grad_norm": 21.588094182953714, "language_loss": 0.73234665, "learning_rate": 3.437494701718153e-06, "loss": 0.7537173, "num_input_tokens_seen": 96152090, "router_z_loss_clip": 3.6640625, "router_z_loss_mlp": 0.35717773, "step": 4447, "time_per_iteration": 2.746830940246582 }, { "auxiliary_loss_clip": 0.01802767, "auxiliary_loss_mlp": 0.00328992, "balance_loss_clip": 1.43480897, "balance_loss_mlp": 0.29227582, "epoch": 0.2674282278671276, "flos": 24312709054080.0, "grad_norm": 9.008351583015127, "language_loss": 0.8982963, "learning_rate": 3.4372238929171026e-06, "loss": 0.9196139, "num_input_tokens_seen": 96170015, "router_z_loss_clip": 3.68359375, "router_z_loss_mlp": 0.3671875, "step": 4448, "time_per_iteration": 4.102078437805176 }, { "auxiliary_loss_clip": 0.01782475, "auxiliary_loss_mlp": 0.00327441, "balance_loss_clip": 1.42072201, "balance_loss_mlp": 0.29165423, "epoch": 0.2674883511197956, "flos": 22815521337600.0, "grad_norm": 39.76491722339182, "language_loss": 0.90724409, "learning_rate": 3.436953029616378e-06, "loss": 0.9283433, "num_input_tokens_seen": 96188065, "router_z_loss_clip": 3.61328125, "router_z_loss_mlp": 0.35766602, "step": 4449, "time_per_iteration": 2.7006447315216064 }, { "auxiliary_loss_clip": 0.01771308, "auxiliary_loss_mlp": 0.00349603, "balance_loss_clip": 1.40084016, "balance_loss_mlp": 0.31126535, "epoch": 0.26754847437246354, "flos": 25370170473600.0, "grad_norm": 14.33238402684806, "language_loss": 0.9034735, "learning_rate": 3.4366821118262506e-06, "loss": 0.92468262, "num_input_tokens_seen": 96205780, "router_z_loss_clip": 3.703125, "router_z_loss_mlp": 0.38330078, "step": 4450, "time_per_iteration": 2.6777524948120117 }, { "auxiliary_loss_clip": 0.01766435, "auxiliary_loss_mlp": 0.00320998, "balance_loss_clip": 1.41061568, "balance_loss_mlp": 0.28385222, "epoch": 0.2676085976251315, "flos": 20230420446720.0, "grad_norm": 1.9881956311920828, "language_loss": 0.8626948, "learning_rate": 3.4364111395569937e-06, "loss": 0.88356912, "num_input_tokens_seen": 96224990, "router_z_loss_clip": 3.5546875, "router_z_loss_mlp": 0.37109375, "step": 4451, "time_per_iteration": 2.695317506790161 }, { "auxiliary_loss_clip": 0.01785034, "auxiliary_loss_mlp": 0.00314527, "balance_loss_clip": 1.42931795, "balance_loss_mlp": 0.27673724, "epoch": 0.26766872087779947, "flos": 28038225824640.0, "grad_norm": 2.8883458846529817, "language_loss": 0.92019361, "learning_rate": 3.436140112818882e-06, "loss": 0.94118923, "num_input_tokens_seen": 96245345, "router_z_loss_clip": 3.55859375, "router_z_loss_mlp": 0.37817383, "step": 4452, "time_per_iteration": 4.181818962097168 }, { "auxiliary_loss_clip": 0.01750966, "auxiliary_loss_mlp": 0.00304844, "balance_loss_clip": 1.40393758, "balance_loss_mlp": 0.26943928, "epoch": 0.26772884413046744, "flos": 18325179250560.0, "grad_norm": 66.5727138171409, "language_loss": 0.92458737, "learning_rate": 3.435869031622194e-06, "loss": 0.94514549, "num_input_tokens_seen": 96259000, "router_z_loss_clip": 3.46679688, "router_z_loss_mlp": 0.35400391, "step": 4453, "time_per_iteration": 2.6070289611816406 }, { "auxiliary_loss_clip": 0.01780198, "auxiliary_loss_mlp": 0.00305377, "balance_loss_clip": 1.41953754, "balance_loss_mlp": 0.26878011, "epoch": 0.2677889673831354, "flos": 22127509255680.0, "grad_norm": 6.405915311322177, "language_loss": 0.8398279, "learning_rate": 3.435597895977208e-06, "loss": 0.86068368, "num_input_tokens_seen": 96277000, "router_z_loss_clip": 3.609375, "router_z_loss_mlp": 0.3659668, "step": 4454, "time_per_iteration": 2.6772449016571045 }, { "auxiliary_loss_clip": 0.01761056, "auxiliary_loss_mlp": 0.0031654, "balance_loss_clip": 1.40583742, "balance_loss_mlp": 0.27941856, "epoch": 0.2678490906358034, "flos": 23729699404800.0, "grad_norm": 233.98706347356833, "language_loss": 0.77264929, "learning_rate": 3.435326705894206e-06, "loss": 0.79342526, "num_input_tokens_seen": 96297010, "router_z_loss_clip": 3.55273438, "router_z_loss_mlp": 0.37133789, "step": 4455, "time_per_iteration": 4.074885845184326 }, { "auxiliary_loss_clip": 0.01761384, "auxiliary_loss_mlp": 0.00285488, "balance_loss_clip": 1.41740203, "balance_loss_mlp": 0.2490097, "epoch": 0.2679092138884714, "flos": 21762872340480.0, "grad_norm": 5.017561767758398, "language_loss": 0.79117537, "learning_rate": 3.435055461383471e-06, "loss": 0.81164408, "num_input_tokens_seen": 96315780, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.36474609, "step": 4456, "time_per_iteration": 2.6519463062286377 }, { "auxiliary_loss_clip": 0.01770818, "auxiliary_loss_mlp": 0.00333291, "balance_loss_clip": 1.4104718, "balance_loss_mlp": 0.29612172, "epoch": 0.26796933714113935, "flos": 19861186590720.0, "grad_norm": 22.019987963505788, "language_loss": 0.772807, "learning_rate": 3.4347841624552896e-06, "loss": 0.7938481, "num_input_tokens_seen": 96333465, "router_z_loss_clip": 3.59960938, "router_z_loss_mlp": 0.37133789, "step": 4457, "time_per_iteration": 2.646531105041504 }, { "auxiliary_loss_clip": 0.01784807, "auxiliary_loss_mlp": 0.00320674, "balance_loss_clip": 1.41868258, "balance_loss_mlp": 0.28255093, "epoch": 0.2680294603938073, "flos": 20047886507520.0, "grad_norm": 16.210136382448397, "language_loss": 0.85734677, "learning_rate": 3.4345128091199493e-06, "loss": 0.87840158, "num_input_tokens_seen": 96352005, "router_z_loss_clip": 3.66015625, "router_z_loss_mlp": 0.3815918, "step": 4458, "time_per_iteration": 2.7151293754577637 }, { "auxiliary_loss_clip": 0.01662258, "auxiliary_loss_mlp": 0.00054031, "balance_loss_clip": 1.42428493, "balance_loss_mlp": 0.04253971, "epoch": 0.2680895836464753, "flos": 72113763052800.0, "grad_norm": 0.8319328967421279, "language_loss": 0.58521545, "learning_rate": 3.434241401387739e-06, "loss": 0.60237837, "num_input_tokens_seen": 96406265, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.11474609, "step": 4459, "time_per_iteration": 3.1260011196136475 }, { "auxiliary_loss_clip": 0.01768002, "auxiliary_loss_mlp": 0.00295442, "balance_loss_clip": 1.41497231, "balance_loss_mlp": 0.25801033, "epoch": 0.26814970689914325, "flos": 20449044576000.0, "grad_norm": 18.251251138733224, "language_loss": 0.91461968, "learning_rate": 3.4339699392689507e-06, "loss": 0.9352541, "num_input_tokens_seen": 96425225, "router_z_loss_clip": 3.52929688, "router_z_loss_mlp": 0.37451172, "step": 4460, "time_per_iteration": 2.6385910511016846 }, { "auxiliary_loss_clip": 0.01752205, "auxiliary_loss_mlp": 0.00282046, "balance_loss_clip": 1.40861535, "balance_loss_mlp": 0.24640219, "epoch": 0.2682098301518112, "flos": 17566674727680.0, "grad_norm": 38.99845780083298, "language_loss": 0.76697206, "learning_rate": 3.4336984227738796e-06, "loss": 0.78731453, "num_input_tokens_seen": 96443780, "router_z_loss_clip": 3.43554688, "router_z_loss_mlp": 0.35644531, "step": 4461, "time_per_iteration": 2.630237102508545 }, { "auxiliary_loss_clip": 0.0176419, "auxiliary_loss_mlp": 0.0030782, "balance_loss_clip": 1.41325641, "balance_loss_mlp": 0.27291578, "epoch": 0.2682699534044792, "flos": 18333259810560.0, "grad_norm": 2.540563100055694, "language_loss": 0.74809861, "learning_rate": 3.43342685191282e-06, "loss": 0.76881874, "num_input_tokens_seen": 96464530, "router_z_loss_clip": 3.50976562, "router_z_loss_mlp": 0.34912109, "step": 4462, "time_per_iteration": 2.671095132827759 }, { "auxiliary_loss_clip": 0.01781115, "auxiliary_loss_mlp": 0.00299002, "balance_loss_clip": 1.42416835, "balance_loss_mlp": 0.26123697, "epoch": 0.26833007665714714, "flos": 25301294144640.0, "grad_norm": 11.784904295892973, "language_loss": 0.7622776, "learning_rate": 3.4331552266960705e-06, "loss": 0.78307879, "num_input_tokens_seen": 96483345, "router_z_loss_clip": 3.5703125, "router_z_loss_mlp": 0.37768555, "step": 4463, "time_per_iteration": 2.843959331512451 }, { "auxiliary_loss_clip": 0.0176902, "auxiliary_loss_mlp": 0.003234, "balance_loss_clip": 1.41163754, "balance_loss_mlp": 0.28429893, "epoch": 0.2683901999098151, "flos": 16099759198080.0, "grad_norm": 14.257168781003871, "language_loss": 0.85564226, "learning_rate": 3.432883547133931e-06, "loss": 0.87656647, "num_input_tokens_seen": 96498305, "router_z_loss_clip": 3.57421875, "router_z_loss_mlp": 0.39086914, "step": 4464, "time_per_iteration": 4.185925245285034 }, { "auxiliary_loss_clip": 0.01756228, "auxiliary_loss_mlp": 0.00306774, "balance_loss_clip": 1.40784335, "balance_loss_mlp": 0.27153599, "epoch": 0.2684503231624831, "flos": 27308054154240.0, "grad_norm": 102.28070255080105, "language_loss": 0.77004272, "learning_rate": 3.432611813236704e-06, "loss": 0.79067278, "num_input_tokens_seen": 96519740, "router_z_loss_clip": 3.484375, "router_z_loss_mlp": 0.3527832, "step": 4465, "time_per_iteration": 2.710642099380493 }, { "auxiliary_loss_clip": 0.01637127, "auxiliary_loss_mlp": 0.00074215, "balance_loss_clip": 1.41617489, "balance_loss_mlp": 0.06424913, "epoch": 0.26851044641515104, "flos": 71858007239040.0, "grad_norm": 0.6763758809177138, "language_loss": 0.52398598, "learning_rate": 3.4323400250146943e-06, "loss": 0.54109943, "num_input_tokens_seen": 96588870, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.09960938, "step": 4466, "time_per_iteration": 3.2841269969940186 }, { "auxiliary_loss_clip": 0.01739963, "auxiliary_loss_mlp": 0.00330925, "balance_loss_clip": 1.40518951, "balance_loss_mlp": 0.29375508, "epoch": 0.268570569667819, "flos": 18733771434240.0, "grad_norm": 6626.189657638971, "language_loss": 0.79805493, "learning_rate": 3.4320681824782057e-06, "loss": 0.81876385, "num_input_tokens_seen": 96605100, "router_z_loss_clip": 3.3515625, "router_z_loss_mlp": 0.37182617, "step": 4467, "time_per_iteration": 2.6106374263763428 }, { "auxiliary_loss_clip": 0.01767249, "auxiliary_loss_mlp": 0.00324373, "balance_loss_clip": 1.41641057, "balance_loss_mlp": 0.28815705, "epoch": 0.268630692920487, "flos": 18178376365440.0, "grad_norm": 12.602406983964583, "language_loss": 0.89848471, "learning_rate": 3.4317962856375493e-06, "loss": 0.91940087, "num_input_tokens_seen": 96621410, "router_z_loss_clip": 3.5078125, "router_z_loss_mlp": 0.36181641, "step": 4468, "time_per_iteration": 2.6108169555664062 }, { "auxiliary_loss_clip": 0.01607251, "auxiliary_loss_mlp": 0.00068945, "balance_loss_clip": 1.38639426, "balance_loss_mlp": 0.05912209, "epoch": 0.268690816173155, "flos": 68731768978560.0, "grad_norm": 0.8441198338643263, "language_loss": 0.59129828, "learning_rate": 3.4315243345030334e-06, "loss": 0.60806024, "num_input_tokens_seen": 96684810, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.09814453, "step": 4469, "time_per_iteration": 3.2028920650482178 }, { "auxiliary_loss_clip": 0.01735918, "auxiliary_loss_mlp": 0.00330745, "balance_loss_clip": 1.39865398, "balance_loss_mlp": 0.29333708, "epoch": 0.26875093942582295, "flos": 23293636295040.0, "grad_norm": 10.924709919227064, "language_loss": 0.86566275, "learning_rate": 3.431252329084972e-06, "loss": 0.88632941, "num_input_tokens_seen": 96701920, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.37426758, "step": 4470, "time_per_iteration": 2.6719727516174316 }, { "auxiliary_loss_clip": 0.01721326, "auxiliary_loss_mlp": 0.00301509, "balance_loss_clip": 1.39632893, "balance_loss_mlp": 0.26569846, "epoch": 0.2688110626784909, "flos": 21543458112000.0, "grad_norm": 4.803991781136725, "language_loss": 0.87866032, "learning_rate": 3.4309802693936786e-06, "loss": 0.89888871, "num_input_tokens_seen": 96721260, "router_z_loss_clip": 3.24804688, "router_z_loss_mlp": 0.3581543, "step": 4471, "time_per_iteration": 2.7073724269866943 }, { "auxiliary_loss_clip": 0.01706195, "auxiliary_loss_mlp": 0.00298573, "balance_loss_clip": 1.38725162, "balance_loss_mlp": 0.26431227, "epoch": 0.2688711859311589, "flos": 28400600183040.0, "grad_norm": 4.156181291209035, "language_loss": 0.78360713, "learning_rate": 3.43070815543947e-06, "loss": 0.80365479, "num_input_tokens_seen": 96740385, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.3425293, "step": 4472, "time_per_iteration": 2.676076650619507 }, { "auxiliary_loss_clip": 0.01706889, "auxiliary_loss_mlp": 0.00316085, "balance_loss_clip": 1.38579977, "balance_loss_mlp": 0.2809898, "epoch": 0.26893130918382685, "flos": 25994944661760.0, "grad_norm": 12.349018296914668, "language_loss": 0.73826396, "learning_rate": 3.4304359872326656e-06, "loss": 0.75849366, "num_input_tokens_seen": 96761860, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.35131836, "step": 4473, "time_per_iteration": 2.6946754455566406 }, { "auxiliary_loss_clip": 0.01694648, "auxiliary_loss_mlp": 0.00276977, "balance_loss_clip": 1.37541938, "balance_loss_mlp": 0.24629208, "epoch": 0.2689914324364948, "flos": 20339624770560.0, "grad_norm": 34.630055758193194, "language_loss": 0.89245272, "learning_rate": 3.4301637647835843e-06, "loss": 0.91216898, "num_input_tokens_seen": 96781890, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.30664062, "step": 4474, "time_per_iteration": 2.6751511096954346 }, { "auxiliary_loss_clip": 0.01711676, "auxiliary_loss_mlp": 0.00286971, "balance_loss_clip": 1.39995587, "balance_loss_mlp": 0.25594082, "epoch": 0.2690515556891628, "flos": 19464553635840.0, "grad_norm": 16.903318376535246, "language_loss": 0.76115656, "learning_rate": 3.4298914881025494e-06, "loss": 0.78114307, "num_input_tokens_seen": 96800390, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.30993652, "step": 4475, "time_per_iteration": 2.6512725353240967 }, { "auxiliary_loss_clip": 0.01684704, "auxiliary_loss_mlp": 0.00286093, "balance_loss_clip": 1.37024283, "balance_loss_mlp": 0.2488279, "epoch": 0.26911167894183075, "flos": 18146631720960.0, "grad_norm": 20.100159250195027, "language_loss": 0.79686952, "learning_rate": 3.4296191571998863e-06, "loss": 0.81657743, "num_input_tokens_seen": 96816685, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.37255859, "step": 4476, "time_per_iteration": 2.624776601791382 }, { "auxiliary_loss_clip": 0.01696074, "auxiliary_loss_mlp": 0.00303159, "balance_loss_clip": 1.38118839, "balance_loss_mlp": 0.26966119, "epoch": 0.2691718021944987, "flos": 19975131509760.0, "grad_norm": 23.78184398543077, "language_loss": 0.85709059, "learning_rate": 3.429346772085922e-06, "loss": 0.87708294, "num_input_tokens_seen": 96836285, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.33496094, "step": 4477, "time_per_iteration": 2.6755316257476807 }, { "auxiliary_loss_clip": 0.01698122, "auxiliary_loss_mlp": 0.00288514, "balance_loss_clip": 1.37971854, "balance_loss_mlp": 0.25229815, "epoch": 0.2692319254471667, "flos": 37447215770880.0, "grad_norm": 181.54917570871226, "language_loss": 0.73722064, "learning_rate": 3.429074332770984e-06, "loss": 0.75708699, "num_input_tokens_seen": 96857745, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.36230469, "step": 4478, "time_per_iteration": 2.805828094482422 }, { "auxiliary_loss_clip": 0.01682559, "auxiliary_loss_mlp": 0.00298534, "balance_loss_clip": 1.37139797, "balance_loss_mlp": 0.26169872, "epoch": 0.26929204869983464, "flos": 22127796564480.0, "grad_norm": 2.5756406746945992, "language_loss": 0.87561667, "learning_rate": 3.4288018392654047e-06, "loss": 0.89542758, "num_input_tokens_seen": 96877295, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.36791992, "step": 4479, "time_per_iteration": 2.6652278900146484 }, { "auxiliary_loss_clip": 0.01704959, "auxiliary_loss_mlp": 0.00295705, "balance_loss_clip": 1.38935804, "balance_loss_mlp": 0.26084787, "epoch": 0.2693521719525026, "flos": 19792813052160.0, "grad_norm": 6.56392758113658, "language_loss": 0.86834037, "learning_rate": 3.4285292915795166e-06, "loss": 0.88834697, "num_input_tokens_seen": 96896160, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.34814453, "step": 4480, "time_per_iteration": 2.666550636291504 }, { "auxiliary_loss_clip": 0.0168871, "auxiliary_loss_mlp": 0.0028692, "balance_loss_clip": 1.37468195, "balance_loss_mlp": 0.252635, "epoch": 0.2694122952051706, "flos": 20994383836800.0, "grad_norm": 58.51533988753343, "language_loss": 0.82221937, "learning_rate": 3.4282566897236543e-06, "loss": 0.84197569, "num_input_tokens_seen": 96915410, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.3425293, "step": 4481, "time_per_iteration": 2.644855499267578 }, { "auxiliary_loss_clip": 0.0169266, "auxiliary_loss_mlp": 0.00294961, "balance_loss_clip": 1.38109052, "balance_loss_mlp": 0.25936559, "epoch": 0.2694724184578386, "flos": 25849291011840.0, "grad_norm": 20.60393661477104, "language_loss": 0.80874395, "learning_rate": 3.4279840337081547e-06, "loss": 0.8286202, "num_input_tokens_seen": 96937865, "router_z_loss_clip": 3.11523438, "router_z_loss_mlp": 0.35571289, "step": 4482, "time_per_iteration": 2.749721050262451 }, { "auxiliary_loss_clip": 0.01681001, "auxiliary_loss_mlp": 0.00286486, "balance_loss_clip": 1.37541044, "balance_loss_mlp": 0.25007966, "epoch": 0.26953254171050656, "flos": 21726961718400.0, "grad_norm": 2.938147658778023, "language_loss": 0.80795681, "learning_rate": 3.4277113235433584e-06, "loss": 0.82763165, "num_input_tokens_seen": 96957710, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.36401367, "step": 4483, "time_per_iteration": 2.6267523765563965 }, { "auxiliary_loss_clip": 0.01706408, "auxiliary_loss_mlp": 0.00308111, "balance_loss_clip": 1.38593841, "balance_loss_mlp": 0.27137083, "epoch": 0.2695926649631745, "flos": 19682926369920.0, "grad_norm": 7.157626379297422, "language_loss": 0.94431788, "learning_rate": 3.427438559239605e-06, "loss": 0.96446306, "num_input_tokens_seen": 96975890, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.36743164, "step": 4484, "time_per_iteration": 2.6421194076538086 }, { "auxiliary_loss_clip": 0.01659697, "auxiliary_loss_mlp": 0.00270112, "balance_loss_clip": 1.35131443, "balance_loss_mlp": 0.23635209, "epoch": 0.2696527882158425, "flos": 32886596724480.0, "grad_norm": 4.832069808662705, "language_loss": 0.72256792, "learning_rate": 3.427165740807239e-06, "loss": 0.74186599, "num_input_tokens_seen": 96998595, "router_z_loss_clip": 3.08203125, "router_z_loss_mlp": 0.33789062, "step": 4485, "time_per_iteration": 2.741579294204712 }, { "auxiliary_loss_clip": 0.01677425, "auxiliary_loss_mlp": 0.00299118, "balance_loss_clip": 1.36963844, "balance_loss_mlp": 0.26392734, "epoch": 0.26971291146851045, "flos": 12124843320960.0, "grad_norm": 29.566356653867278, "language_loss": 0.81364131, "learning_rate": 3.426892868256604e-06, "loss": 0.83340681, "num_input_tokens_seen": 97013715, "router_z_loss_clip": 3.07617188, "router_z_loss_mlp": 0.35205078, "step": 4486, "time_per_iteration": 2.5904295444488525 }, { "auxiliary_loss_clip": 0.01690448, "auxiliary_loss_mlp": 0.00274593, "balance_loss_clip": 1.38178396, "balance_loss_mlp": 0.23880598, "epoch": 0.2697730347211784, "flos": 22634459856000.0, "grad_norm": 35.99737293295793, "language_loss": 0.91206032, "learning_rate": 3.4266199415980495e-06, "loss": 0.93171072, "num_input_tokens_seen": 97031570, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.35791016, "step": 4487, "time_per_iteration": 2.6221632957458496 }, { "auxiliary_loss_clip": 0.01733248, "auxiliary_loss_mlp": 0.0027786, "balance_loss_clip": 1.40866137, "balance_loss_mlp": 0.24328962, "epoch": 0.2698331579738464, "flos": 23513050523520.0, "grad_norm": 28.78178003156099, "language_loss": 0.81006193, "learning_rate": 3.4263469608419234e-06, "loss": 0.83017302, "num_input_tokens_seen": 97049815, "router_z_loss_clip": 3.24804688, "router_z_loss_mlp": 0.34545898, "step": 4488, "time_per_iteration": 2.633152484893799 }, { "auxiliary_loss_clip": 0.01676429, "auxiliary_loss_mlp": 0.00288868, "balance_loss_clip": 1.37065637, "balance_loss_mlp": 0.25091141, "epoch": 0.26989328122651435, "flos": 24641040297600.0, "grad_norm": 18.437308599081437, "language_loss": 0.88005269, "learning_rate": 3.426073925998578e-06, "loss": 0.89970565, "num_input_tokens_seen": 97067570, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.37963867, "step": 4489, "time_per_iteration": 2.656083822250366 }, { "auxiliary_loss_clip": 0.01667246, "auxiliary_loss_mlp": 0.00273887, "balance_loss_clip": 1.35664487, "balance_loss_mlp": 0.23900652, "epoch": 0.2699534044791823, "flos": 10772555068800.0, "grad_norm": 26.817061084326124, "language_loss": 0.99627924, "learning_rate": 3.4258008370783656e-06, "loss": 1.01569057, "num_input_tokens_seen": 97082180, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.34887695, "step": 4490, "time_per_iteration": 2.599663496017456 }, { "auxiliary_loss_clip": 0.01681697, "auxiliary_loss_mlp": 0.00282812, "balance_loss_clip": 1.37665784, "balance_loss_mlp": 0.24812186, "epoch": 0.2700135277318503, "flos": 36171597098880.0, "grad_norm": 24.995421529669034, "language_loss": 0.78603721, "learning_rate": 3.4255276940916434e-06, "loss": 0.8056823, "num_input_tokens_seen": 97103470, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.34692383, "step": 4491, "time_per_iteration": 4.181100130081177 }, { "auxiliary_loss_clip": 0.01684412, "auxiliary_loss_mlp": 0.00260792, "balance_loss_clip": 1.37476671, "balance_loss_mlp": 0.22834361, "epoch": 0.27007365098451824, "flos": 17418614866560.0, "grad_norm": 53.92831361287904, "language_loss": 0.82157719, "learning_rate": 3.4252544970487676e-06, "loss": 0.84102923, "num_input_tokens_seen": 97118100, "router_z_loss_clip": 3.09570312, "router_z_loss_mlp": 0.32421875, "step": 4492, "time_per_iteration": 2.5942978858947754 }, { "auxiliary_loss_clip": 0.01656345, "auxiliary_loss_mlp": 0.00252656, "balance_loss_clip": 1.35415673, "balance_loss_mlp": 0.21937284, "epoch": 0.2701337742371862, "flos": 23185688947200.0, "grad_norm": 2.529860711929969, "language_loss": 0.9497028, "learning_rate": 3.4249812459600986e-06, "loss": 0.9687928, "num_input_tokens_seen": 97136765, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.33276367, "step": 4493, "time_per_iteration": 2.639547348022461 }, { "auxiliary_loss_clip": 0.01682025, "auxiliary_loss_mlp": 0.00268539, "balance_loss_clip": 1.37612128, "balance_loss_mlp": 0.23451675, "epoch": 0.2701938974898542, "flos": 24389450461440.0, "grad_norm": 1.9574098045686603, "language_loss": 0.76822555, "learning_rate": 3.424707940835998e-06, "loss": 0.78773123, "num_input_tokens_seen": 97157470, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.34008789, "step": 4494, "time_per_iteration": 4.083948135375977 }, { "auxiliary_loss_clip": 0.01675873, "auxiliary_loss_mlp": 0.00268968, "balance_loss_clip": 1.37010264, "balance_loss_mlp": 0.23415874, "epoch": 0.2702540207425222, "flos": 26214322976640.0, "grad_norm": 4.0136051907628385, "language_loss": 0.92234242, "learning_rate": 3.42443458168683e-06, "loss": 0.94179088, "num_input_tokens_seen": 97176905, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.34838867, "step": 4495, "time_per_iteration": 2.6980011463165283 }, { "auxiliary_loss_clip": 0.01680658, "auxiliary_loss_mlp": 0.00302762, "balance_loss_clip": 1.36451626, "balance_loss_mlp": 0.26716647, "epoch": 0.27031414399519016, "flos": 22926377687040.0, "grad_norm": 7.201571547512301, "language_loss": 0.82061613, "learning_rate": 3.424161168522959e-06, "loss": 0.84045029, "num_input_tokens_seen": 97196380, "router_z_loss_clip": 3.16601562, "router_z_loss_mlp": 0.35620117, "step": 4496, "time_per_iteration": 2.64264178276062 }, { "auxiliary_loss_clip": 0.01565977, "auxiliary_loss_mlp": 0.00054981, "balance_loss_clip": 1.31791544, "balance_loss_mlp": 0.04029403, "epoch": 0.2703742672478581, "flos": 63019780404480.0, "grad_norm": 0.6655178149962462, "language_loss": 0.49771115, "learning_rate": 3.423887701354754e-06, "loss": 0.51392072, "num_input_tokens_seen": 97260100, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.14648438, "step": 4497, "time_per_iteration": 4.574505805969238 }, { "auxiliary_loss_clip": 0.01667637, "auxiliary_loss_mlp": 0.00253031, "balance_loss_clip": 1.36435997, "balance_loss_mlp": 0.2204397, "epoch": 0.2704343905005261, "flos": 18840820942080.0, "grad_norm": 14.128998143477038, "language_loss": 0.79037404, "learning_rate": 3.4236141801925847e-06, "loss": 0.80958074, "num_input_tokens_seen": 97277935, "router_z_loss_clip": 3.03515625, "router_z_loss_mlp": 0.32568359, "step": 4498, "time_per_iteration": 2.6200194358825684 }, { "auxiliary_loss_clip": 0.01581378, "auxiliary_loss_mlp": 0.00076427, "balance_loss_clip": 1.33077693, "balance_loss_mlp": 0.06693798, "epoch": 0.27049451375319405, "flos": 71233412618880.0, "grad_norm": 0.7788683687025697, "language_loss": 0.59258878, "learning_rate": 3.4233406050468237e-06, "loss": 0.60916686, "num_input_tokens_seen": 97338845, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.09472656, "step": 4499, "time_per_iteration": 3.1540472507476807 }, { "auxiliary_loss_clip": 0.01663261, "auxiliary_loss_mlp": 0.00277904, "balance_loss_clip": 1.36053824, "balance_loss_mlp": 0.243524, "epoch": 0.270554637005862, "flos": 24278594112000.0, "grad_norm": 10.72484320307758, "language_loss": 0.79905742, "learning_rate": 3.4230669759278438e-06, "loss": 0.81846905, "num_input_tokens_seen": 97356640, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.34399414, "step": 4500, "time_per_iteration": 2.662043809890747 }, { "auxiliary_loss_clip": 0.01636725, "auxiliary_loss_mlp": 0.00246854, "balance_loss_clip": 1.3433311, "balance_loss_mlp": 0.21292754, "epoch": 0.27061476025853, "flos": 17632318832640.0, "grad_norm": 3.5457561509121778, "language_loss": 0.89406443, "learning_rate": 3.4227932928460215e-06, "loss": 0.91290021, "num_input_tokens_seen": 97372585, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.33911133, "step": 4501, "time_per_iteration": 2.6114330291748047 }, { "auxiliary_loss_clip": 0.01662413, "auxiliary_loss_mlp": 0.00317017, "balance_loss_clip": 1.34897351, "balance_loss_mlp": 0.27910829, "epoch": 0.27067488351119795, "flos": 22710123855360.0, "grad_norm": 3.587713853830485, "language_loss": 0.78681165, "learning_rate": 3.422519555811735e-06, "loss": 0.80660594, "num_input_tokens_seen": 97393315, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.37915039, "step": 4502, "time_per_iteration": 2.688307523727417 }, { "auxiliary_loss_clip": 0.01638532, "auxiliary_loss_mlp": 0.002648, "balance_loss_clip": 1.33166909, "balance_loss_mlp": 0.22932306, "epoch": 0.2707350067638659, "flos": 41719616087040.0, "grad_norm": 41.73281732953292, "language_loss": 0.74053931, "learning_rate": 3.4222457648353642e-06, "loss": 0.75957257, "num_input_tokens_seen": 97417860, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.35473633, "step": 4503, "time_per_iteration": 2.889307737350464 }, { "auxiliary_loss_clip": 0.01600911, "auxiliary_loss_mlp": 0.00284945, "balance_loss_clip": 1.31221223, "balance_loss_mlp": 0.2519713, "epoch": 0.2707951300165339, "flos": 20193037367040.0, "grad_norm": 4.631348426877167, "language_loss": 0.73773015, "learning_rate": 3.4219719199272918e-06, "loss": 0.7565887, "num_input_tokens_seen": 97436780, "router_z_loss_clip": 2.88476562, "router_z_loss_mlp": 0.3293457, "step": 4504, "time_per_iteration": 2.5923943519592285 }, { "auxiliary_loss_clip": 0.01643696, "auxiliary_loss_mlp": 0.00276331, "balance_loss_clip": 1.3393935, "balance_loss_mlp": 0.24416858, "epoch": 0.27085525326920185, "flos": 21433966479360.0, "grad_norm": 45.758168622599094, "language_loss": 0.82134485, "learning_rate": 3.421698021097902e-06, "loss": 0.84054512, "num_input_tokens_seen": 97456190, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.32177734, "step": 4505, "time_per_iteration": 2.629586696624756 }, { "auxiliary_loss_clip": 0.01618632, "auxiliary_loss_mlp": 0.0023327, "balance_loss_clip": 1.32128811, "balance_loss_mlp": 0.20208445, "epoch": 0.2709153765218698, "flos": 17675232606720.0, "grad_norm": 93.03845038139863, "language_loss": 0.83357722, "learning_rate": 3.42142406835758e-06, "loss": 0.8520962, "num_input_tokens_seen": 97474545, "router_z_loss_clip": 2.9765625, "router_z_loss_mlp": 0.31176758, "step": 4506, "time_per_iteration": 4.037301778793335 }, { "auxiliary_loss_clip": 0.01604797, "auxiliary_loss_mlp": 0.00256585, "balance_loss_clip": 1.31236601, "balance_loss_mlp": 0.22220516, "epoch": 0.2709754997745378, "flos": 24456243801600.0, "grad_norm": 241.97278426955253, "language_loss": 0.87722474, "learning_rate": 3.421150061716715e-06, "loss": 0.89583862, "num_input_tokens_seen": 97494520, "router_z_loss_clip": 2.92578125, "router_z_loss_mlp": 0.34350586, "step": 4507, "time_per_iteration": 2.6379196643829346 }, { "auxiliary_loss_clip": 0.01489475, "auxiliary_loss_mlp": 0.00054432, "balance_loss_clip": 1.23242295, "balance_loss_mlp": 0.04360763, "epoch": 0.2710356230272058, "flos": 65210798206080.0, "grad_norm": 0.7155295055090518, "language_loss": 0.50395942, "learning_rate": 3.420876001185698e-06, "loss": 0.51939845, "num_input_tokens_seen": 97552455, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.10839844, "step": 4508, "time_per_iteration": 3.0530025959014893 }, { "auxiliary_loss_clip": 0.01589289, "auxiliary_loss_mlp": 0.00201446, "balance_loss_clip": 1.30300808, "balance_loss_mlp": 0.16999874, "epoch": 0.27109574627987376, "flos": 25484438615040.0, "grad_norm": 297.20728185365516, "language_loss": 0.79585135, "learning_rate": 3.4206018867749197e-06, "loss": 0.81375867, "num_input_tokens_seen": 97572650, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.31445312, "step": 4509, "time_per_iteration": 2.719332695007324 }, { "auxiliary_loss_clip": 0.01584801, "auxiliary_loss_mlp": 0.00215445, "balance_loss_clip": 1.29736161, "balance_loss_mlp": 0.18313883, "epoch": 0.2711558695325417, "flos": 19682782715520.0, "grad_norm": 9.667674131157119, "language_loss": 0.76468623, "learning_rate": 3.4203277184947757e-06, "loss": 0.78268874, "num_input_tokens_seen": 97591150, "router_z_loss_clip": 2.87890625, "router_z_loss_mlp": 0.32275391, "step": 4510, "time_per_iteration": 2.6951777935028076 }, { "auxiliary_loss_clip": 0.01612475, "auxiliary_loss_mlp": 0.00216543, "balance_loss_clip": 1.31533873, "balance_loss_mlp": 0.17899203, "epoch": 0.2712159927852097, "flos": 18587758648320.0, "grad_norm": 32.48281965278007, "language_loss": 0.82016909, "learning_rate": 3.4200534963556627e-06, "loss": 0.83845925, "num_input_tokens_seen": 97607410, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 0.37573242, "step": 4511, "time_per_iteration": 2.5786476135253906 }, { "auxiliary_loss_clip": 0.01592467, "auxiliary_loss_mlp": 0.00247218, "balance_loss_clip": 1.29618835, "balance_loss_mlp": 0.21472114, "epoch": 0.27127611603787766, "flos": 25630235919360.0, "grad_norm": 279.7785326352359, "language_loss": 0.89946163, "learning_rate": 3.419779220367979e-06, "loss": 0.91785848, "num_input_tokens_seen": 97626870, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.32495117, "step": 4512, "time_per_iteration": 2.6616363525390625 }, { "auxiliary_loss_clip": 0.01577477, "auxiliary_loss_mlp": 0.00197582, "balance_loss_clip": 1.28884339, "balance_loss_mlp": 0.16351163, "epoch": 0.2713362392905456, "flos": 23148952312320.0, "grad_norm": 153.42615274126126, "language_loss": 0.87076247, "learning_rate": 3.419504890542124e-06, "loss": 0.88851309, "num_input_tokens_seen": 97646595, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.34106445, "step": 4513, "time_per_iteration": 2.624129295349121 }, { "auxiliary_loss_clip": 0.01585788, "auxiliary_loss_mlp": 0.00215651, "balance_loss_clip": 1.29194307, "balance_loss_mlp": 0.17967391, "epoch": 0.2713963625432136, "flos": 18366045949440.0, "grad_norm": 15.258985277576064, "language_loss": 0.96288073, "learning_rate": 3.4192305068885026e-06, "loss": 0.9808951, "num_input_tokens_seen": 97665485, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.35961914, "step": 4514, "time_per_iteration": 2.645146608352661 }, { "auxiliary_loss_clip": 0.01594056, "auxiliary_loss_mlp": 0.0021771, "balance_loss_clip": 1.30066657, "balance_loss_mlp": 0.18354481, "epoch": 0.27145648579588155, "flos": 22491751121280.0, "grad_norm": 6.454196491718074, "language_loss": 0.97536933, "learning_rate": 3.418956069417517e-06, "loss": 0.993487, "num_input_tokens_seen": 97683800, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.34204102, "step": 4515, "time_per_iteration": 2.650865077972412 }, { "auxiliary_loss_clip": 0.01623932, "auxiliary_loss_mlp": 0.00233635, "balance_loss_clip": 1.31792414, "balance_loss_mlp": 0.19503519, "epoch": 0.2715166090485495, "flos": 19239177749760.0, "grad_norm": 17.985410175775158, "language_loss": 0.83682394, "learning_rate": 3.4186815781395756e-06, "loss": 0.85539961, "num_input_tokens_seen": 97700505, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.38549805, "step": 4516, "time_per_iteration": 2.653933525085449 }, { "auxiliary_loss_clip": 0.01587757, "auxiliary_loss_mlp": 0.00200057, "balance_loss_clip": 1.3017118, "balance_loss_mlp": 0.16343635, "epoch": 0.2715767323012175, "flos": 17709598944000.0, "grad_norm": 15.82888663250305, "language_loss": 0.83089334, "learning_rate": 3.4184070330650866e-06, "loss": 0.84877151, "num_input_tokens_seen": 97717410, "router_z_loss_clip": 2.86328125, "router_z_loss_mlp": 0.36572266, "step": 4517, "time_per_iteration": 2.8193275928497314 }, { "auxiliary_loss_clip": 0.01596604, "auxiliary_loss_mlp": 0.00210912, "balance_loss_clip": 1.30403149, "balance_loss_mlp": 0.17340913, "epoch": 0.27163685555388545, "flos": 22382834106240.0, "grad_norm": 106.97815448376242, "language_loss": 0.8964029, "learning_rate": 3.4181324342044607e-06, "loss": 0.91447806, "num_input_tokens_seen": 97734545, "router_z_loss_clip": 2.92578125, "router_z_loss_mlp": 0.37475586, "step": 4518, "time_per_iteration": 2.6116461753845215 }, { "auxiliary_loss_clip": 0.01581281, "auxiliary_loss_mlp": 0.00207656, "balance_loss_clip": 1.29104161, "balance_loss_mlp": 0.17067704, "epoch": 0.2716969788065534, "flos": 22346708002560.0, "grad_norm": 7.485683560264491, "language_loss": 0.76281214, "learning_rate": 3.41785778156811e-06, "loss": 0.78070152, "num_input_tokens_seen": 97754000, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.36987305, "step": 4519, "time_per_iteration": 2.678581476211548 }, { "auxiliary_loss_clip": 0.01567893, "auxiliary_loss_mlp": 0.00206799, "balance_loss_clip": 1.28632236, "balance_loss_mlp": 0.17075002, "epoch": 0.2717571020592214, "flos": 25228467319680.0, "grad_norm": 13.208768356617812, "language_loss": 0.8048104, "learning_rate": 3.417583075166451e-06, "loss": 0.82255727, "num_input_tokens_seen": 97772080, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.36083984, "step": 4520, "time_per_iteration": 2.648935317993164 }, { "auxiliary_loss_clip": 0.01598637, "auxiliary_loss_mlp": 0.00214173, "balance_loss_clip": 1.30178118, "balance_loss_mlp": 0.17659777, "epoch": 0.2718172253118894, "flos": 20189769229440.0, "grad_norm": 4.186069540744834, "language_loss": 0.83829969, "learning_rate": 3.4173083150099e-06, "loss": 0.85642779, "num_input_tokens_seen": 97789370, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.3762207, "step": 4521, "time_per_iteration": 2.6424646377563477 }, { "auxiliary_loss_clip": 0.01586136, "auxiliary_loss_mlp": 0.00228377, "balance_loss_clip": 1.29025292, "balance_loss_mlp": 0.19259053, "epoch": 0.27187734856455736, "flos": 14319129260160.0, "grad_norm": 14.642421876621789, "language_loss": 0.86559737, "learning_rate": 3.417033501108875e-06, "loss": 0.88374257, "num_input_tokens_seen": 97807385, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.35791016, "step": 4522, "time_per_iteration": 2.652073383331299 }, { "auxiliary_loss_clip": 0.01591828, "auxiliary_loss_mlp": 0.00226403, "balance_loss_clip": 1.30702603, "balance_loss_mlp": 0.18997248, "epoch": 0.27193747181722533, "flos": 21107682311040.0, "grad_norm": 21.90759340674125, "language_loss": 0.79146367, "learning_rate": 3.416758633473798e-06, "loss": 0.80964595, "num_input_tokens_seen": 97827930, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.36425781, "step": 4523, "time_per_iteration": 2.6658334732055664 }, { "auxiliary_loss_clip": 0.01562874, "auxiliary_loss_mlp": 0.00193414, "balance_loss_clip": 1.28031731, "balance_loss_mlp": 0.16060755, "epoch": 0.2719975950698933, "flos": 19682782715520.0, "grad_norm": 12.924650959984083, "language_loss": 0.80908144, "learning_rate": 3.4164837121150915e-06, "loss": 0.82664436, "num_input_tokens_seen": 97847440, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.32788086, "step": 4524, "time_per_iteration": 2.6434154510498047 }, { "auxiliary_loss_clip": 0.01555893, "auxiliary_loss_mlp": 0.00215255, "balance_loss_clip": 1.27634561, "balance_loss_mlp": 0.18085107, "epoch": 0.27205771832256126, "flos": 24754482426240.0, "grad_norm": 12.800930601429771, "language_loss": 0.8150292, "learning_rate": 3.4162087370431803e-06, "loss": 0.83274066, "num_input_tokens_seen": 97867620, "router_z_loss_clip": 2.79296875, "router_z_loss_mlp": 0.34423828, "step": 4525, "time_per_iteration": 2.7041895389556885 }, { "auxiliary_loss_clip": 0.01580622, "auxiliary_loss_mlp": 0.00199854, "balance_loss_clip": 1.30059242, "balance_loss_mlp": 0.16418707, "epoch": 0.2721178415752292, "flos": 21755581879680.0, "grad_norm": 25.182707188554623, "language_loss": 0.8877486, "learning_rate": 3.4159337082684926e-06, "loss": 0.90555334, "num_input_tokens_seen": 97884345, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.35668945, "step": 4526, "time_per_iteration": 2.5921270847320557 }, { "auxiliary_loss_clip": 0.01579248, "auxiliary_loss_mlp": 0.00251242, "balance_loss_clip": 1.28871465, "balance_loss_mlp": 0.21495487, "epoch": 0.2721779648278972, "flos": 12676826597760.0, "grad_norm": 27.065362799928746, "language_loss": 0.87825394, "learning_rate": 3.4156586258014566e-06, "loss": 0.89655888, "num_input_tokens_seen": 97901500, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.36303711, "step": 4527, "time_per_iteration": 2.6026248931884766 }, { "auxiliary_loss_clip": 0.0156393, "auxiliary_loss_mlp": 0.00224202, "balance_loss_clip": 1.28189707, "balance_loss_mlp": 0.18824852, "epoch": 0.27223808808056515, "flos": 16253206099200.0, "grad_norm": 3.933387745037403, "language_loss": 0.89046276, "learning_rate": 3.415383489652503e-06, "loss": 0.90834403, "num_input_tokens_seen": 97917800, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.35961914, "step": 4528, "time_per_iteration": 2.6785829067230225 }, { "auxiliary_loss_clip": 0.0159149, "auxiliary_loss_mlp": 0.00203177, "balance_loss_clip": 1.30633378, "balance_loss_mlp": 0.16908276, "epoch": 0.2722982113332331, "flos": 27745805203200.0, "grad_norm": 8.53623919862263, "language_loss": 0.83549362, "learning_rate": 3.4151082998320666e-06, "loss": 0.85344028, "num_input_tokens_seen": 97937225, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.34082031, "step": 4529, "time_per_iteration": 2.6636667251586914 }, { "auxiliary_loss_clip": 0.01571559, "auxiliary_loss_mlp": 0.00229147, "balance_loss_clip": 1.2881186, "balance_loss_mlp": 0.19526795, "epoch": 0.2723583345859011, "flos": 21726243446400.0, "grad_norm": 14.76010063563752, "language_loss": 0.89296508, "learning_rate": 3.4148330563505805e-06, "loss": 0.91097206, "num_input_tokens_seen": 97956845, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.33911133, "step": 4530, "time_per_iteration": 2.612440824508667 }, { "auxiliary_loss_clip": 0.01570555, "auxiliary_loss_mlp": 0.002156, "balance_loss_clip": 1.28752685, "balance_loss_mlp": 0.18176824, "epoch": 0.27241845783856905, "flos": 17347260499200.0, "grad_norm": 9.007807897527684, "language_loss": 0.98103809, "learning_rate": 3.4145577592184838e-06, "loss": 0.99889964, "num_input_tokens_seen": 97972465, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.33837891, "step": 4531, "time_per_iteration": 2.5635035037994385 }, { "auxiliary_loss_clip": 0.01574109, "auxiliary_loss_mlp": 0.00253767, "balance_loss_clip": 1.28733099, "balance_loss_mlp": 0.22117475, "epoch": 0.272478581091237, "flos": 24754302858240.0, "grad_norm": 4.049562540688933, "language_loss": 0.82821423, "learning_rate": 3.4142824084462155e-06, "loss": 0.84649301, "num_input_tokens_seen": 97990770, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.32592773, "step": 4532, "time_per_iteration": 2.6456568241119385 }, { "auxiliary_loss_clip": 0.01568879, "auxiliary_loss_mlp": 0.0020716, "balance_loss_clip": 1.29179931, "balance_loss_mlp": 0.17549773, "epoch": 0.272538704343905, "flos": 17890624512000.0, "grad_norm": 3.6818997040403305, "language_loss": 0.9597581, "learning_rate": 3.4140070040442162e-06, "loss": 0.97751856, "num_input_tokens_seen": 98005775, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.31640625, "step": 4533, "time_per_iteration": 3.9501445293426514 }, { "auxiliary_loss_clip": 0.01610925, "auxiliary_loss_mlp": 0.00235166, "balance_loss_clip": 1.31797981, "balance_loss_mlp": 0.20159701, "epoch": 0.272598827596573, "flos": 22932016122240.0, "grad_norm": 4.620463960491849, "language_loss": 0.77031398, "learning_rate": 3.413731546022929e-06, "loss": 0.78877485, "num_input_tokens_seen": 98025750, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.33569336, "step": 4534, "time_per_iteration": 2.6633293628692627 }, { "auxiliary_loss_clip": 0.01597553, "auxiliary_loss_mlp": 0.00253482, "balance_loss_clip": 1.2975142, "balance_loss_mlp": 0.21759978, "epoch": 0.27265895084924097, "flos": 24238409771520.0, "grad_norm": 2.4704819573654664, "language_loss": 0.97477418, "learning_rate": 3.4134560343928005e-06, "loss": 0.99328452, "num_input_tokens_seen": 98044955, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.35888672, "step": 4535, "time_per_iteration": 2.6271297931671143 }, { "auxiliary_loss_clip": 0.01610417, "auxiliary_loss_mlp": 0.0025068, "balance_loss_clip": 1.31044078, "balance_loss_mlp": 0.2171587, "epoch": 0.27271907410190893, "flos": 27013155494400.0, "grad_norm": 3.928089233974998, "language_loss": 0.78289747, "learning_rate": 3.4131804691642778e-06, "loss": 0.80150843, "num_input_tokens_seen": 98065860, "router_z_loss_clip": 3.00195312, "router_z_loss_mlp": 0.33544922, "step": 4536, "time_per_iteration": 4.0961244106292725 }, { "auxiliary_loss_clip": 0.01621163, "auxiliary_loss_mlp": 0.00265989, "balance_loss_clip": 1.32249188, "balance_loss_mlp": 0.23089348, "epoch": 0.2727791973545769, "flos": 34452588942720.0, "grad_norm": 4.245097272879593, "language_loss": 0.78587723, "learning_rate": 3.41290485034781e-06, "loss": 0.80474877, "num_input_tokens_seen": 98085450, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.35083008, "step": 4537, "time_per_iteration": 2.7355167865753174 }, { "auxiliary_loss_clip": 0.01604007, "auxiliary_loss_mlp": 0.00258939, "balance_loss_clip": 1.308429, "balance_loss_mlp": 0.22532186, "epoch": 0.27283932060724486, "flos": 15041723160960.0, "grad_norm": 7.078910352249782, "language_loss": 0.84412825, "learning_rate": 3.4126291779538485e-06, "loss": 0.86275774, "num_input_tokens_seen": 98099115, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.3359375, "step": 4538, "time_per_iteration": 2.684952974319458 }, { "auxiliary_loss_clip": 0.0157595, "auxiliary_loss_mlp": 0.00243181, "balance_loss_clip": 1.28280413, "balance_loss_mlp": 0.21068488, "epoch": 0.2728994438599128, "flos": 21652411040640.0, "grad_norm": 8.696028760497006, "language_loss": 0.9532699, "learning_rate": 3.412353451992847e-06, "loss": 0.97146124, "num_input_tokens_seen": 98118415, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.32519531, "step": 4539, "time_per_iteration": 4.230685710906982 }, { "auxiliary_loss_clip": 0.01565665, "auxiliary_loss_mlp": 0.00233023, "balance_loss_clip": 1.27626729, "balance_loss_mlp": 0.19568679, "epoch": 0.2729595671125808, "flos": 17488424949120.0, "grad_norm": 49.308237499078786, "language_loss": 0.93591702, "learning_rate": 3.4120776724752607e-06, "loss": 0.95390385, "num_input_tokens_seen": 98136300, "router_z_loss_clip": 2.89453125, "router_z_loss_mlp": 0.37329102, "step": 4540, "time_per_iteration": 2.659888505935669 }, { "auxiliary_loss_clip": 0.01555021, "auxiliary_loss_mlp": 0.00246336, "balance_loss_clip": 1.26966333, "balance_loss_mlp": 0.20988151, "epoch": 0.27301969036524876, "flos": 19318145800320.0, "grad_norm": 39.50255514683836, "language_loss": 0.88286555, "learning_rate": 3.4118018394115476e-06, "loss": 0.90087914, "num_input_tokens_seen": 98154580, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.36450195, "step": 4541, "time_per_iteration": 2.584987163543701 }, { "auxiliary_loss_clip": 0.01548189, "auxiliary_loss_mlp": 0.00252421, "balance_loss_clip": 1.26388788, "balance_loss_mlp": 0.21508458, "epoch": 0.2730798136179167, "flos": 21065666376960.0, "grad_norm": 60.49664232164366, "language_loss": 0.86837971, "learning_rate": 3.4115259528121678e-06, "loss": 0.8863858, "num_input_tokens_seen": 98173115, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.37329102, "step": 4542, "time_per_iteration": 2.608703136444092 }, { "auxiliary_loss_clip": 0.01550708, "auxiliary_loss_mlp": 0.00226694, "balance_loss_clip": 1.26698625, "balance_loss_mlp": 0.19424537, "epoch": 0.2731399368705847, "flos": 19171737964800.0, "grad_norm": 482.30258575543763, "language_loss": 0.99043071, "learning_rate": 3.411250012687582e-06, "loss": 1.0082047, "num_input_tokens_seen": 98190260, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.32470703, "step": 4543, "time_per_iteration": 2.640106678009033 }, { "auxiliary_loss_clip": 0.01587291, "auxiliary_loss_mlp": 0.00250202, "balance_loss_clip": 1.28601694, "balance_loss_mlp": 0.21639435, "epoch": 0.27320006012325265, "flos": 18290130554880.0, "grad_norm": 51.07060010193714, "language_loss": 0.74302918, "learning_rate": 3.410974019048255e-06, "loss": 0.7614041, "num_input_tokens_seen": 98207115, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.33813477, "step": 4544, "time_per_iteration": 2.6073198318481445 }, { "auxiliary_loss_clip": 0.01561144, "auxiliary_loss_mlp": 0.00235551, "balance_loss_clip": 1.27134192, "balance_loss_mlp": 0.20193404, "epoch": 0.2732601833759206, "flos": 34860929731200.0, "grad_norm": 6.343290049931752, "language_loss": 0.78909522, "learning_rate": 3.410697971904651e-06, "loss": 0.80706221, "num_input_tokens_seen": 98230610, "router_z_loss_clip": 2.90039062, "router_z_loss_mlp": 0.33618164, "step": 4545, "time_per_iteration": 2.7494277954101562 }, { "auxiliary_loss_clip": 0.01417486, "auxiliary_loss_mlp": 0.00040026, "balance_loss_clip": 1.16310596, "balance_loss_mlp": 0.02247802, "epoch": 0.2733203066285886, "flos": 53910824762880.0, "grad_norm": 0.803259201801223, "language_loss": 0.61814582, "learning_rate": 3.4104218712672383e-06, "loss": 0.63272095, "num_input_tokens_seen": 98293585, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.17578125, "step": 4546, "time_per_iteration": 3.1723639965057373 }, { "auxiliary_loss_clip": 0.0155984, "auxiliary_loss_mlp": 0.00241295, "balance_loss_clip": 1.27459109, "balance_loss_mlp": 0.2072245, "epoch": 0.2733804298812566, "flos": 20660378244480.0, "grad_norm": 85.27070750515162, "language_loss": 0.70286882, "learning_rate": 3.410145717146488e-06, "loss": 0.72088015, "num_input_tokens_seen": 98311680, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.34082031, "step": 4547, "time_per_iteration": 2.606595277786255 }, { "auxiliary_loss_clip": 0.01560127, "auxiliary_loss_mlp": 0.00183877, "balance_loss_clip": 1.27357268, "balance_loss_mlp": 0.15250131, "epoch": 0.27344055313392457, "flos": 25884339707520.0, "grad_norm": 8.405477558205277, "language_loss": 0.85866594, "learning_rate": 3.4098695095528694e-06, "loss": 0.87610596, "num_input_tokens_seen": 98330770, "router_z_loss_clip": 2.86132812, "router_z_loss_mlp": 0.3137207, "step": 4548, "time_per_iteration": 4.0167882442474365 }, { "auxiliary_loss_clip": 0.01587427, "auxiliary_loss_mlp": 0.00216485, "balance_loss_clip": 1.2914114, "balance_loss_mlp": 0.1845842, "epoch": 0.27350067638659253, "flos": 22929753565440.0, "grad_norm": 4.76584582241432, "language_loss": 0.89299512, "learning_rate": 3.4095932484968585e-06, "loss": 0.91103423, "num_input_tokens_seen": 98349860, "router_z_loss_clip": 2.9609375, "router_z_loss_mlp": 0.3190918, "step": 4549, "time_per_iteration": 2.6597344875335693 }, { "auxiliary_loss_clip": 0.01552492, "auxiliary_loss_mlp": 0.00214591, "balance_loss_clip": 1.26523459, "balance_loss_mlp": 0.17720735, "epoch": 0.2735607996392605, "flos": 16574821499520.0, "grad_norm": 20.245757536829103, "language_loss": 0.79713255, "learning_rate": 3.4093169339889305e-06, "loss": 0.81480336, "num_input_tokens_seen": 98367040, "router_z_loss_clip": 2.87109375, "router_z_loss_mlp": 0.3737793, "step": 4550, "time_per_iteration": 2.592839479446411 }, { "auxiliary_loss_clip": 0.01574346, "auxiliary_loss_mlp": 0.00234094, "balance_loss_clip": 1.28493273, "balance_loss_mlp": 0.20317113, "epoch": 0.27362092289192846, "flos": 19645291895040.0, "grad_norm": 6.956141497705398, "language_loss": 0.86810625, "learning_rate": 3.409040566039563e-06, "loss": 0.88619065, "num_input_tokens_seen": 98384010, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.30932617, "step": 4551, "time_per_iteration": 2.588219165802002 }, { "auxiliary_loss_clip": 0.01553227, "auxiliary_loss_mlp": 0.00214298, "balance_loss_clip": 1.26951385, "balance_loss_mlp": 0.18001331, "epoch": 0.27368104614459643, "flos": 17639142416640.0, "grad_norm": 24.714237349373025, "language_loss": 0.82119995, "learning_rate": 3.4087641446592362e-06, "loss": 0.83887517, "num_input_tokens_seen": 98399625, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.3425293, "step": 4552, "time_per_iteration": 2.6232831478118896 }, { "auxiliary_loss_clip": 0.01555495, "auxiliary_loss_mlp": 0.00202436, "balance_loss_clip": 1.26928425, "balance_loss_mlp": 0.16979647, "epoch": 0.2737411693972644, "flos": 21580015178880.0, "grad_norm": 32.11255804101521, "language_loss": 0.79995441, "learning_rate": 3.408487669858431e-06, "loss": 0.81753367, "num_input_tokens_seen": 98417310, "router_z_loss_clip": 2.86132812, "router_z_loss_mlp": 0.32617188, "step": 4553, "time_per_iteration": 2.6763126850128174 }, { "auxiliary_loss_clip": 0.01557217, "auxiliary_loss_mlp": 0.0021954, "balance_loss_clip": 1.27436233, "balance_loss_mlp": 0.18711527, "epoch": 0.27380129264993236, "flos": 25484043565440.0, "grad_norm": 168.10114577567407, "language_loss": 0.68187088, "learning_rate": 3.4082111416476337e-06, "loss": 0.69963849, "num_input_tokens_seen": 98438670, "router_z_loss_clip": 2.83007812, "router_z_loss_mlp": 0.32446289, "step": 4554, "time_per_iteration": 2.6798346042633057 }, { "auxiliary_loss_clip": 0.01548404, "auxiliary_loss_mlp": 0.00224483, "balance_loss_clip": 1.25889504, "balance_loss_mlp": 0.19027036, "epoch": 0.2738614159026003, "flos": 18661196004480.0, "grad_norm": 43.154530756939145, "language_loss": 0.83406091, "learning_rate": 3.4079345600373275e-06, "loss": 0.85178977, "num_input_tokens_seen": 98456060, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.34204102, "step": 4555, "time_per_iteration": 2.563953399658203 }, { "auxiliary_loss_clip": 0.01556879, "auxiliary_loss_mlp": 0.00243326, "balance_loss_clip": 1.26807499, "balance_loss_mlp": 0.21259362, "epoch": 0.2739215391552683, "flos": 23477139901440.0, "grad_norm": 5.188196555011678, "language_loss": 0.85815299, "learning_rate": 3.407657925038002e-06, "loss": 0.87615502, "num_input_tokens_seen": 98473765, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.30712891, "step": 4556, "time_per_iteration": 2.6894569396972656 }, { "auxiliary_loss_clip": 0.01578891, "auxiliary_loss_mlp": 0.00285515, "balance_loss_clip": 1.27357841, "balance_loss_mlp": 0.25077695, "epoch": 0.27398166240793626, "flos": 17128636369920.0, "grad_norm": 13.735363744192822, "language_loss": 0.90234208, "learning_rate": 3.4073812366601473e-06, "loss": 0.92098618, "num_input_tokens_seen": 98490590, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.34716797, "step": 4557, "time_per_iteration": 2.6907575130462646 }, { "auxiliary_loss_clip": 0.01574506, "auxiliary_loss_mlp": 0.00238983, "balance_loss_clip": 1.2814914, "balance_loss_mlp": 0.20934774, "epoch": 0.2740417856606042, "flos": 23404744039680.0, "grad_norm": 7.670273866644852, "language_loss": 0.82653081, "learning_rate": 3.4071044949142547e-06, "loss": 0.84466565, "num_input_tokens_seen": 98510590, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.29626465, "step": 4558, "time_per_iteration": 2.6964802742004395 }, { "auxiliary_loss_clip": 0.01540571, "auxiliary_loss_mlp": 0.00236606, "balance_loss_clip": 1.2568326, "balance_loss_mlp": 0.20580187, "epoch": 0.2741019089132722, "flos": 12780428400000.0, "grad_norm": 3.4186623044187914, "language_loss": 0.74825639, "learning_rate": 3.406827699810819e-06, "loss": 0.76602811, "num_input_tokens_seen": 98527875, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.30786133, "step": 4559, "time_per_iteration": 2.6025753021240234 }, { "auxiliary_loss_clip": 0.01543561, "auxiliary_loss_mlp": 0.00218461, "balance_loss_clip": 1.25338006, "balance_loss_mlp": 0.18875408, "epoch": 0.27416203216594015, "flos": 20631542601600.0, "grad_norm": 8.436019213709066, "language_loss": 0.78768599, "learning_rate": 3.4065508513603353e-06, "loss": 0.8053062, "num_input_tokens_seen": 98547575, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.296875, "step": 4560, "time_per_iteration": 2.6664299964904785 }, { "auxiliary_loss_clip": 0.01560351, "auxiliary_loss_mlp": 0.00234128, "balance_loss_clip": 1.26830733, "balance_loss_mlp": 0.20346749, "epoch": 0.27422215541860817, "flos": 26541576812160.0, "grad_norm": 148.49572921329076, "language_loss": 0.89215982, "learning_rate": 3.406273949573303e-06, "loss": 0.91010463, "num_input_tokens_seen": 98566290, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.30639648, "step": 4561, "time_per_iteration": 2.6673169136047363 }, { "auxiliary_loss_clip": 0.01566875, "auxiliary_loss_mlp": 0.00242383, "balance_loss_clip": 1.27127349, "balance_loss_mlp": 0.21394014, "epoch": 0.27428227867127614, "flos": 23331163029120.0, "grad_norm": 8.59651917189317, "language_loss": 0.81945741, "learning_rate": 3.4059969944602214e-06, "loss": 0.83754998, "num_input_tokens_seen": 98586255, "router_z_loss_clip": 2.95703125, "router_z_loss_mlp": 0.28442383, "step": 4562, "time_per_iteration": 2.648052930831909 }, { "auxiliary_loss_clip": 0.01559518, "auxiliary_loss_mlp": 0.00251725, "balance_loss_clip": 1.26429749, "balance_loss_mlp": 0.22061148, "epoch": 0.2743424019239441, "flos": 23035115134080.0, "grad_norm": 2.096790067883379, "language_loss": 0.81354201, "learning_rate": 3.4057199860315928e-06, "loss": 0.83165443, "num_input_tokens_seen": 98606030, "router_z_loss_clip": 2.95117188, "router_z_loss_mlp": 0.3112793, "step": 4563, "time_per_iteration": 2.6519968509674072 }, { "auxiliary_loss_clip": 0.01585975, "auxiliary_loss_mlp": 0.00275068, "balance_loss_clip": 1.27970743, "balance_loss_mlp": 0.24285811, "epoch": 0.27440252517661207, "flos": 21981101420160.0, "grad_norm": 3.0384243622099882, "language_loss": 0.75125164, "learning_rate": 3.4054429242979213e-06, "loss": 0.76986206, "num_input_tokens_seen": 98625225, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.32202148, "step": 4564, "time_per_iteration": 2.7213807106018066 }, { "auxiliary_loss_clip": 0.01568673, "auxiliary_loss_mlp": 0.00213148, "balance_loss_clip": 1.27170277, "balance_loss_mlp": 0.1846216, "epoch": 0.27446264842928003, "flos": 40187451502080.0, "grad_norm": 10.898261233037164, "language_loss": 0.8656745, "learning_rate": 3.4051658092697135e-06, "loss": 0.88349271, "num_input_tokens_seen": 98649470, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.28527832, "step": 4565, "time_per_iteration": 2.8316197395324707 }, { "auxiliary_loss_clip": 0.01585748, "auxiliary_loss_mlp": 0.00210357, "balance_loss_clip": 1.28903079, "balance_loss_mlp": 0.17967269, "epoch": 0.274522771681948, "flos": 13479681438720.0, "grad_norm": 30.805194613072974, "language_loss": 0.78944337, "learning_rate": 3.404888640957477e-06, "loss": 0.80740446, "num_input_tokens_seen": 98666915, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.30664062, "step": 4566, "time_per_iteration": 2.6731512546539307 }, { "auxiliary_loss_clip": 0.01541336, "auxiliary_loss_mlp": 0.00200881, "balance_loss_clip": 1.24842954, "balance_loss_mlp": 0.17155552, "epoch": 0.27458289493461596, "flos": 28622133313920.0, "grad_norm": 4.864807161841828, "language_loss": 0.65775865, "learning_rate": 3.404611419371723e-06, "loss": 0.67518079, "num_input_tokens_seen": 98688240, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.29321289, "step": 4567, "time_per_iteration": 2.757650852203369 }, { "auxiliary_loss_clip": 0.01560498, "auxiliary_loss_mlp": 0.00242634, "balance_loss_clip": 1.26447725, "balance_loss_mlp": 0.20961335, "epoch": 0.2746430181872839, "flos": 20119815492480.0, "grad_norm": 6.293623505219834, "language_loss": 0.88528204, "learning_rate": 3.4043341445229627e-06, "loss": 0.9033134, "num_input_tokens_seen": 98708245, "router_z_loss_clip": 2.95703125, "router_z_loss_mlp": 0.33032227, "step": 4568, "time_per_iteration": 2.762847423553467 }, { "auxiliary_loss_clip": 0.01574402, "auxiliary_loss_mlp": 0.00231094, "balance_loss_clip": 1.26895523, "balance_loss_mlp": 0.19924147, "epoch": 0.2747031414399519, "flos": 20193468330240.0, "grad_norm": 21.698888943414957, "language_loss": 0.75778788, "learning_rate": 3.4040568164217117e-06, "loss": 0.77584291, "num_input_tokens_seen": 98724575, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.31860352, "step": 4569, "time_per_iteration": 2.6298258304595947 }, { "auxiliary_loss_clip": 0.01538891, "auxiliary_loss_mlp": 0.00234025, "balance_loss_clip": 1.24009252, "balance_loss_mlp": 0.20183812, "epoch": 0.27476326469261986, "flos": 13516346246400.0, "grad_norm": 4.31692128628831, "language_loss": 0.79828942, "learning_rate": 3.4037794350784848e-06, "loss": 0.81601858, "num_input_tokens_seen": 98740700, "router_z_loss_clip": 2.99023438, "router_z_loss_mlp": 0.32177734, "step": 4570, "time_per_iteration": 2.61742901802063 }, { "auxiliary_loss_clip": 0.01559462, "auxiliary_loss_mlp": 0.00143068, "balance_loss_clip": 1.24628294, "balance_loss_mlp": 0.13515237, "epoch": 0.2748233879452878, "flos": 65937127121280.0, "grad_norm": 0.7138605174132817, "language_loss": 0.55784851, "learning_rate": 3.4035020005038014e-06, "loss": 0.57487381, "num_input_tokens_seen": 98803030, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.07910156, "step": 4571, "time_per_iteration": 3.244525671005249 }, { "auxiliary_loss_clip": 0.01567733, "auxiliary_loss_mlp": 0.00269095, "balance_loss_clip": 1.26961446, "balance_loss_mlp": 0.23855332, "epoch": 0.2748835111979558, "flos": 17384212615680.0, "grad_norm": 1.8936855702083126, "language_loss": 0.86809409, "learning_rate": 3.4032245127081812e-06, "loss": 0.88646233, "num_input_tokens_seen": 98820505, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.30517578, "step": 4572, "time_per_iteration": 2.6719746589660645 }, { "auxiliary_loss_clip": 0.01541569, "auxiliary_loss_mlp": 0.00182776, "balance_loss_clip": 1.2499969, "balance_loss_mlp": 0.15410665, "epoch": 0.27494363445062375, "flos": 23587565287680.0, "grad_norm": 33.204937483664864, "language_loss": 0.85948527, "learning_rate": 3.402946971702147e-06, "loss": 0.87672871, "num_input_tokens_seen": 98842150, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.28686523, "step": 4573, "time_per_iteration": 2.634859561920166 }, { "auxiliary_loss_clip": 0.01533638, "auxiliary_loss_mlp": 0.00221935, "balance_loss_clip": 1.24083972, "balance_loss_mlp": 0.19337264, "epoch": 0.2750037577032918, "flos": 17164582905600.0, "grad_norm": 4.785212972862184, "language_loss": 0.85274392, "learning_rate": 3.402669377496223e-06, "loss": 0.8702997, "num_input_tokens_seen": 98861050, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.28564453, "step": 4574, "time_per_iteration": 2.578932762145996 }, { "auxiliary_loss_clip": 0.01560866, "auxiliary_loss_mlp": 0.00249093, "balance_loss_clip": 1.26251721, "balance_loss_mlp": 0.22063813, "epoch": 0.27506388095595974, "flos": 24491903028480.0, "grad_norm": 8.60742642731296, "language_loss": 0.81396818, "learning_rate": 3.402391730100936e-06, "loss": 0.83206773, "num_input_tokens_seen": 98879695, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.28442383, "step": 4575, "time_per_iteration": 4.026278257369995 }, { "auxiliary_loss_clip": 0.01552218, "auxiliary_loss_mlp": 0.00231002, "balance_loss_clip": 1.25690234, "balance_loss_mlp": 0.20165282, "epoch": 0.2751240042086277, "flos": 38764706722560.0, "grad_norm": 12.464961256452915, "language_loss": 0.78146648, "learning_rate": 3.402114029526814e-06, "loss": 0.7992987, "num_input_tokens_seen": 98902035, "router_z_loss_clip": 2.95507812, "router_z_loss_mlp": 0.29345703, "step": 4576, "time_per_iteration": 2.7828738689422607 }, { "auxiliary_loss_clip": 0.0155861, "auxiliary_loss_mlp": 0.00226401, "balance_loss_clip": 1.25946987, "balance_loss_mlp": 0.19590728, "epoch": 0.27518412746129567, "flos": 26907039740160.0, "grad_norm": 3.781584753355348, "language_loss": 0.79547864, "learning_rate": 3.4018362757843866e-06, "loss": 0.81332874, "num_input_tokens_seen": 98921835, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.30493164, "step": 4577, "time_per_iteration": 2.682868480682373 }, { "auxiliary_loss_clip": 0.01584812, "auxiliary_loss_mlp": 0.00234274, "balance_loss_clip": 1.28283978, "balance_loss_mlp": 0.20335117, "epoch": 0.27524425071396363, "flos": 24900531125760.0, "grad_norm": 59.06258941851503, "language_loss": 0.82773811, "learning_rate": 3.401558468884188e-06, "loss": 0.84592891, "num_input_tokens_seen": 98939610, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.30932617, "step": 4578, "time_per_iteration": 2.6752095222473145 }, { "auxiliary_loss_clip": 0.01568351, "auxiliary_loss_mlp": 0.00255424, "balance_loss_clip": 1.26870418, "balance_loss_mlp": 0.22278449, "epoch": 0.2753043739666316, "flos": 26288047641600.0, "grad_norm": 26.07687039856782, "language_loss": 0.73417926, "learning_rate": 3.4012806088367516e-06, "loss": 0.75241697, "num_input_tokens_seen": 98962250, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.32641602, "step": 4579, "time_per_iteration": 4.184458494186401 }, { "auxiliary_loss_clip": 0.01547117, "auxiliary_loss_mlp": 0.00267841, "balance_loss_clip": 1.24732852, "balance_loss_mlp": 0.2333895, "epoch": 0.27536449721929956, "flos": 24206772867840.0, "grad_norm": 183.74328617105823, "language_loss": 0.86277461, "learning_rate": 3.4010026956526137e-06, "loss": 0.88092422, "num_input_tokens_seen": 98981845, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.34448242, "step": 4580, "time_per_iteration": 2.6636626720428467 }, { "auxiliary_loss_clip": 0.0155742, "auxiliary_loss_mlp": 0.0025084, "balance_loss_clip": 1.25918639, "balance_loss_mlp": 0.21798617, "epoch": 0.27542462047196753, "flos": 19537272720000.0, "grad_norm": 1.4910340166500349, "language_loss": 0.74840295, "learning_rate": 3.4007247293423137e-06, "loss": 0.76648557, "num_input_tokens_seen": 99001855, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.328125, "step": 4581, "time_per_iteration": 4.140495538711548 }, { "auxiliary_loss_clip": 0.01550674, "auxiliary_loss_mlp": 0.00240921, "balance_loss_clip": 1.25018132, "balance_loss_mlp": 0.2078758, "epoch": 0.2754847437246355, "flos": 14319165173760.0, "grad_norm": 10.781380615845523, "language_loss": 0.85684592, "learning_rate": 3.400446709916392e-06, "loss": 0.87476182, "num_input_tokens_seen": 99019880, "router_z_loss_clip": 3.00585938, "router_z_loss_mlp": 0.33056641, "step": 4582, "time_per_iteration": 2.716481924057007 }, { "auxiliary_loss_clip": 0.01556011, "auxiliary_loss_mlp": 0.00195127, "balance_loss_clip": 1.25652575, "balance_loss_mlp": 0.16769692, "epoch": 0.27554486697730346, "flos": 18838773866880.0, "grad_norm": 1.8510802448190473, "language_loss": 0.9054963, "learning_rate": 3.4001686373853895e-06, "loss": 0.92300773, "num_input_tokens_seen": 99037570, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.27441406, "step": 4583, "time_per_iteration": 2.647376537322998 }, { "auxiliary_loss_clip": 0.01581235, "auxiliary_loss_mlp": 0.00255033, "balance_loss_clip": 1.26916552, "balance_loss_mlp": 0.22470617, "epoch": 0.2756049902299714, "flos": 22382295402240.0, "grad_norm": 5.2913647695758215, "language_loss": 0.76768696, "learning_rate": 3.3998905117598528e-06, "loss": 0.7860496, "num_input_tokens_seen": 99056875, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.3034668, "step": 4584, "time_per_iteration": 2.6411688327789307 }, { "auxiliary_loss_clip": 0.01589055, "auxiliary_loss_mlp": 0.00223729, "balance_loss_clip": 1.2789104, "balance_loss_mlp": 0.19437999, "epoch": 0.2756651134826394, "flos": 19573901614080.0, "grad_norm": 171.144612461705, "language_loss": 0.83416164, "learning_rate": 3.399612333050327e-06, "loss": 0.8522895, "num_input_tokens_seen": 99074685, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.29345703, "step": 4585, "time_per_iteration": 2.6393234729766846 }, { "auxiliary_loss_clip": 0.01571638, "auxiliary_loss_mlp": 0.00253059, "balance_loss_clip": 1.25662816, "balance_loss_mlp": 0.2205389, "epoch": 0.27572523673530736, "flos": 23586559706880.0, "grad_norm": 71.47468639620806, "language_loss": 0.80204725, "learning_rate": 3.399334101267362e-06, "loss": 0.8202942, "num_input_tokens_seen": 99095300, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.32519531, "step": 4586, "time_per_iteration": 2.6717560291290283 }, { "auxiliary_loss_clip": 0.01564564, "auxiliary_loss_mlp": 0.00244752, "balance_loss_clip": 1.25843489, "balance_loss_mlp": 0.21452026, "epoch": 0.2757853599879754, "flos": 22820118278400.0, "grad_norm": 279.62898056645224, "language_loss": 0.86335826, "learning_rate": 3.3990558164215073e-06, "loss": 0.88145143, "num_input_tokens_seen": 99115965, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.30249023, "step": 4587, "time_per_iteration": 2.655555009841919 }, { "auxiliary_loss_clip": 0.01542043, "auxiliary_loss_mlp": 0.00230618, "balance_loss_clip": 1.2402252, "balance_loss_mlp": 0.20142324, "epoch": 0.27584548324064334, "flos": 18551704371840.0, "grad_norm": 6.193081232064679, "language_loss": 0.88156557, "learning_rate": 3.398777478523316e-06, "loss": 0.89929211, "num_input_tokens_seen": 99134265, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.29211426, "step": 4588, "time_per_iteration": 2.679084539413452 }, { "auxiliary_loss_clip": 0.01548361, "auxiliary_loss_mlp": 0.00239055, "balance_loss_clip": 1.24653399, "balance_loss_mlp": 0.21132722, "epoch": 0.2759056064933113, "flos": 23769883745280.0, "grad_norm": 1.956529467711396, "language_loss": 0.80575848, "learning_rate": 3.398499087583342e-06, "loss": 0.82363272, "num_input_tokens_seen": 99156185, "router_z_loss_clip": 3.01953125, "router_z_loss_mlp": 0.27734375, "step": 4589, "time_per_iteration": 2.677325963973999 }, { "auxiliary_loss_clip": 0.01565342, "auxiliary_loss_mlp": 0.00254723, "balance_loss_clip": 1.25678253, "balance_loss_mlp": 0.22499266, "epoch": 0.27596572974597927, "flos": 24281898163200.0, "grad_norm": 17.179852967305877, "language_loss": 0.94184494, "learning_rate": 3.398220643612143e-06, "loss": 0.96004558, "num_input_tokens_seen": 99176735, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.29736328, "step": 4590, "time_per_iteration": 4.126562595367432 }, { "auxiliary_loss_clip": 0.01579523, "auxiliary_loss_mlp": 0.00276757, "balance_loss_clip": 1.2651031, "balance_loss_mlp": 0.24769396, "epoch": 0.27602585299864724, "flos": 35040985632000.0, "grad_norm": 14.111477365743127, "language_loss": 0.77559626, "learning_rate": 3.397942146620277e-06, "loss": 0.79415905, "num_input_tokens_seen": 99199765, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.29064941, "step": 4591, "time_per_iteration": 2.7779150009155273 }, { "auxiliary_loss_clip": 0.01544526, "auxiliary_loss_mlp": 0.00269327, "balance_loss_clip": 1.24699283, "balance_loss_mlp": 0.23809452, "epoch": 0.2760859762513152, "flos": 24309405002880.0, "grad_norm": 553.2485182405113, "language_loss": 0.86157554, "learning_rate": 3.3976635966183046e-06, "loss": 0.87971413, "num_input_tokens_seen": 99218435, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.31225586, "step": 4592, "time_per_iteration": 2.6586720943450928 }, { "auxiliary_loss_clip": 0.01465138, "auxiliary_loss_mlp": 0.0008516, "balance_loss_clip": 1.18611026, "balance_loss_mlp": 0.07476512, "epoch": 0.27614609950398317, "flos": 71260739890560.0, "grad_norm": 0.6920349547374054, "language_loss": 0.61460841, "learning_rate": 3.3973849936167886e-06, "loss": 0.63011134, "num_input_tokens_seen": 99276200, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.10400391, "step": 4593, "time_per_iteration": 3.1123058795928955 }, { "auxiliary_loss_clip": 0.01554453, "auxiliary_loss_mlp": 0.00273624, "balance_loss_clip": 1.24871111, "balance_loss_mlp": 0.24518113, "epoch": 0.27620622275665113, "flos": 29674854138240.0, "grad_norm": 3.5755904911077616, "language_loss": 0.82169378, "learning_rate": 3.3971063376262937e-06, "loss": 0.83997452, "num_input_tokens_seen": 99297625, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.28417969, "step": 4594, "time_per_iteration": 2.6748263835906982 }, { "auxiliary_loss_clip": 0.01554257, "auxiliary_loss_mlp": 0.00258631, "balance_loss_clip": 1.24984527, "balance_loss_mlp": 0.23054521, "epoch": 0.2762663460093191, "flos": 15378063137280.0, "grad_norm": 24.147520501970828, "language_loss": 0.97446072, "learning_rate": 3.3968276286573866e-06, "loss": 0.99258959, "num_input_tokens_seen": 99315790, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.28100586, "step": 4595, "time_per_iteration": 2.5887420177459717 }, { "auxiliary_loss_clip": 0.01576256, "auxiliary_loss_mlp": 0.00272136, "balance_loss_clip": 1.26428056, "balance_loss_mlp": 0.24004443, "epoch": 0.27632646926198706, "flos": 20704082117760.0, "grad_norm": 62.121422483627335, "language_loss": 0.7627759, "learning_rate": 3.3965488667206353e-06, "loss": 0.78125983, "num_input_tokens_seen": 99334615, "router_z_loss_clip": 3.12109375, "router_z_loss_mlp": 0.32104492, "step": 4596, "time_per_iteration": 2.6254708766937256 }, { "auxiliary_loss_clip": 0.01577308, "auxiliary_loss_mlp": 0.00274628, "balance_loss_clip": 1.26431072, "balance_loss_mlp": 0.24591047, "epoch": 0.276386592514655, "flos": 32813374849920.0, "grad_norm": 4.03019144248856, "language_loss": 0.69006264, "learning_rate": 3.3962700518266113e-06, "loss": 0.70858204, "num_input_tokens_seen": 99356685, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.28723145, "step": 4597, "time_per_iteration": 2.719001531600952 }, { "auxiliary_loss_clip": 0.01544764, "auxiliary_loss_mlp": 0.00251833, "balance_loss_clip": 1.24715638, "balance_loss_mlp": 0.22423568, "epoch": 0.276446715767323, "flos": 18551704371840.0, "grad_norm": 3.1210994758290025, "language_loss": 0.90733171, "learning_rate": 3.395991183985887e-06, "loss": 0.92529768, "num_input_tokens_seen": 99374810, "router_z_loss_clip": 2.9765625, "router_z_loss_mlp": 0.27612305, "step": 4598, "time_per_iteration": 2.6177024841308594 }, { "auxiliary_loss_clip": 0.01596747, "auxiliary_loss_mlp": 0.00266645, "balance_loss_clip": 1.2749362, "balance_loss_mlp": 0.23625827, "epoch": 0.27650683901999096, "flos": 22819615488000.0, "grad_norm": 3.2681317333661553, "language_loss": 0.8700099, "learning_rate": 3.395712263209037e-06, "loss": 0.8886438, "num_input_tokens_seen": 99391290, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.30383301, "step": 4599, "time_per_iteration": 2.610305070877075 }, { "auxiliary_loss_clip": 0.01574816, "auxiliary_loss_mlp": 0.00313987, "balance_loss_clip": 1.25939584, "balance_loss_mlp": 0.28418446, "epoch": 0.276566962272659, "flos": 21361534704000.0, "grad_norm": 4.270466681588254, "language_loss": 0.87004256, "learning_rate": 3.395433289506639e-06, "loss": 0.88893056, "num_input_tokens_seen": 99409120, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.29760742, "step": 4600, "time_per_iteration": 2.633394718170166 }, { "auxiliary_loss_clip": 0.01568736, "auxiliary_loss_mlp": 0.00326036, "balance_loss_clip": 1.25808847, "balance_loss_mlp": 0.29630488, "epoch": 0.27662708552532694, "flos": 17710604524800.0, "grad_norm": 35.42518767289026, "language_loss": 0.80994654, "learning_rate": 3.3951542628892694e-06, "loss": 0.82889426, "num_input_tokens_seen": 99426180, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.29711914, "step": 4601, "time_per_iteration": 2.625255584716797 }, { "auxiliary_loss_clip": 0.01599379, "auxiliary_loss_mlp": 0.00313667, "balance_loss_clip": 1.28081119, "balance_loss_mlp": 0.28281522, "epoch": 0.2766872087779949, "flos": 21252725429760.0, "grad_norm": 1.784662101693216, "language_loss": 0.8779704, "learning_rate": 3.3948751833675113e-06, "loss": 0.89710087, "num_input_tokens_seen": 99447720, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.30859375, "step": 4602, "time_per_iteration": 2.67585825920105 }, { "auxiliary_loss_clip": 0.01579147, "auxiliary_loss_mlp": 0.00367879, "balance_loss_clip": 1.26756072, "balance_loss_mlp": 0.3339994, "epoch": 0.2767473320306629, "flos": 12931900053120.0, "grad_norm": 7.9571264333128475, "language_loss": 0.85257864, "learning_rate": 3.3945960509519455e-06, "loss": 0.87204885, "num_input_tokens_seen": 99464720, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.33862305, "step": 4603, "time_per_iteration": 2.7897605895996094 }, { "auxiliary_loss_clip": 0.01592217, "auxiliary_loss_mlp": 0.00352924, "balance_loss_clip": 1.2744571, "balance_loss_mlp": 0.32477909, "epoch": 0.27680745528333084, "flos": 15012851604480.0, "grad_norm": 6.454686226042073, "language_loss": 0.87110883, "learning_rate": 3.3943168656531585e-06, "loss": 0.89056027, "num_input_tokens_seen": 99482310, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.28137207, "step": 4604, "time_per_iteration": 2.6881730556488037 }, { "auxiliary_loss_clip": 0.01578487, "auxiliary_loss_mlp": 0.00321918, "balance_loss_clip": 1.26448548, "balance_loss_mlp": 0.29130489, "epoch": 0.2768675785359988, "flos": 22637835734400.0, "grad_norm": 13.351706690390365, "language_loss": 0.7570765, "learning_rate": 3.3940376274817363e-06, "loss": 0.77608061, "num_input_tokens_seen": 99501255, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.30615234, "step": 4605, "time_per_iteration": 2.6751046180725098 }, { "auxiliary_loss_clip": 0.0149603, "auxiliary_loss_mlp": 0.00130662, "balance_loss_clip": 1.21722198, "balance_loss_mlp": 0.12184059, "epoch": 0.27692770178866677, "flos": 66130542881280.0, "grad_norm": 0.699221176498872, "language_loss": 0.57032561, "learning_rate": 3.3937583364482673e-06, "loss": 0.58659256, "num_input_tokens_seen": 99568925, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.08837891, "step": 4606, "time_per_iteration": 3.2314839363098145 }, { "auxiliary_loss_clip": 0.01606027, "auxiliary_loss_mlp": 0.00342882, "balance_loss_clip": 1.28120494, "balance_loss_mlp": 0.31148258, "epoch": 0.27698782504133473, "flos": 26464979059200.0, "grad_norm": 9.557620257868262, "language_loss": 0.76025939, "learning_rate": 3.3934789925633424e-06, "loss": 0.77974844, "num_input_tokens_seen": 99588455, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.31420898, "step": 4607, "time_per_iteration": 2.6667613983154297 }, { "auxiliary_loss_clip": 0.01584137, "auxiliary_loss_mlp": 0.00331055, "balance_loss_clip": 1.27396917, "balance_loss_mlp": 0.30224198, "epoch": 0.2770479482940027, "flos": 25884806584320.0, "grad_norm": 42.289663712935145, "language_loss": 0.75252306, "learning_rate": 3.393199595837555e-06, "loss": 0.77167499, "num_input_tokens_seen": 99609355, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.28796387, "step": 4608, "time_per_iteration": 2.679741144180298 }, { "auxiliary_loss_clip": 0.01573651, "auxiliary_loss_mlp": 0.00327693, "balance_loss_clip": 1.25963879, "balance_loss_mlp": 0.29541135, "epoch": 0.27710807154667066, "flos": 22857249962880.0, "grad_norm": 36.69369011424916, "language_loss": 0.80163956, "learning_rate": 3.392920146281499e-06, "loss": 0.82065296, "num_input_tokens_seen": 99628780, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.32275391, "step": 4609, "time_per_iteration": 2.637974739074707 }, { "auxiliary_loss_clip": 0.01593924, "auxiliary_loss_mlp": 0.00323209, "balance_loss_clip": 1.27589703, "balance_loss_mlp": 0.29214314, "epoch": 0.27716819479933863, "flos": 17711071401600.0, "grad_norm": 12.174152792573059, "language_loss": 0.90147734, "learning_rate": 3.3926406439057714e-06, "loss": 0.92064863, "num_input_tokens_seen": 99644545, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.31079102, "step": 4610, "time_per_iteration": 2.7526721954345703 }, { "auxiliary_loss_clip": 0.0159957, "auxiliary_loss_mlp": 0.00317456, "balance_loss_clip": 1.27541697, "balance_loss_mlp": 0.28677145, "epoch": 0.2772283180520066, "flos": 19646046080640.0, "grad_norm": 68.29123998369232, "language_loss": 0.79367256, "learning_rate": 3.3923610887209705e-06, "loss": 0.81284285, "num_input_tokens_seen": 99663125, "router_z_loss_clip": 3.24023438, "router_z_loss_mlp": 0.30688477, "step": 4611, "time_per_iteration": 2.6406233310699463 }, { "auxiliary_loss_clip": 0.01583327, "auxiliary_loss_mlp": 0.0027775, "balance_loss_clip": 1.2707541, "balance_loss_mlp": 0.24764964, "epoch": 0.27728844130467456, "flos": 21032628842880.0, "grad_norm": 1.5966653729018665, "language_loss": 0.81046247, "learning_rate": 3.392081480737698e-06, "loss": 0.82907319, "num_input_tokens_seen": 99682645, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.30102539, "step": 4612, "time_per_iteration": 2.695617914199829 }, { "auxiliary_loss_clip": 0.01618339, "auxiliary_loss_mlp": 0.00337004, "balance_loss_clip": 1.2900331, "balance_loss_mlp": 0.30577078, "epoch": 0.2773485645573425, "flos": 18989204025600.0, "grad_norm": 5.549447105402128, "language_loss": 0.75465161, "learning_rate": 3.3918018199665563e-06, "loss": 0.77420503, "num_input_tokens_seen": 99700520, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.3125, "step": 4613, "time_per_iteration": 2.6503517627716064 }, { "auxiliary_loss_clip": 0.01580183, "auxiliary_loss_mlp": 0.0028275, "balance_loss_clip": 1.26779532, "balance_loss_mlp": 0.25170782, "epoch": 0.27740868781001055, "flos": 21468440557440.0, "grad_norm": 10.460367033417189, "language_loss": 0.85215008, "learning_rate": 3.39152210641815e-06, "loss": 0.87077934, "num_input_tokens_seen": 99720355, "router_z_loss_clip": 3.12304688, "router_z_loss_mlp": 0.31079102, "step": 4614, "time_per_iteration": 2.633056402206421 }, { "auxiliary_loss_clip": 0.01601937, "auxiliary_loss_mlp": 0.00293702, "balance_loss_clip": 1.27660966, "balance_loss_mlp": 0.26380435, "epoch": 0.2774688110626785, "flos": 19827825834240.0, "grad_norm": 34.519143047106844, "language_loss": 0.91765571, "learning_rate": 3.3912423401030865e-06, "loss": 0.93661213, "num_input_tokens_seen": 99736090, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.29907227, "step": 4615, "time_per_iteration": 2.652782440185547 }, { "auxiliary_loss_clip": 0.0159202, "auxiliary_loss_mlp": 0.00279342, "balance_loss_clip": 1.27205348, "balance_loss_mlp": 0.24952741, "epoch": 0.2775289343153465, "flos": 18216226321920.0, "grad_norm": 15.855423503002251, "language_loss": 0.76201063, "learning_rate": 3.3909625210319735e-06, "loss": 0.78072423, "num_input_tokens_seen": 99751805, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.29846191, "step": 4616, "time_per_iteration": 2.6125993728637695 }, { "auxiliary_loss_clip": 0.01616348, "auxiliary_loss_mlp": 0.0033009, "balance_loss_clip": 1.28769088, "balance_loss_mlp": 0.29759404, "epoch": 0.27758905756801444, "flos": 16472476673280.0, "grad_norm": 2.6021579603519385, "language_loss": 0.90389383, "learning_rate": 3.3906826492154226e-06, "loss": 0.9233582, "num_input_tokens_seen": 99770610, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.32519531, "step": 4617, "time_per_iteration": 4.029827833175659 }, { "auxiliary_loss_clip": 0.01612508, "auxiliary_loss_mlp": 0.0031352, "balance_loss_clip": 1.28824675, "balance_loss_mlp": 0.28040391, "epoch": 0.2776491808206824, "flos": 18728240739840.0, "grad_norm": 9.347561743591502, "language_loss": 0.83301258, "learning_rate": 3.3904027246640458e-06, "loss": 0.85227287, "num_input_tokens_seen": 99787305, "router_z_loss_clip": 3.24414062, "router_z_loss_mlp": 0.33081055, "step": 4618, "time_per_iteration": 2.6006414890289307 }, { "auxiliary_loss_clip": 0.01608787, "auxiliary_loss_mlp": 0.00279806, "balance_loss_clip": 1.2868216, "balance_loss_mlp": 0.25158954, "epoch": 0.27770930407335037, "flos": 28038189911040.0, "grad_norm": 6.199086347798929, "language_loss": 0.92086506, "learning_rate": 3.390122747388459e-06, "loss": 0.93975103, "num_input_tokens_seen": 99808940, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.28210449, "step": 4619, "time_per_iteration": 2.692800283432007 }, { "auxiliary_loss_clip": 0.01585512, "auxiliary_loss_mlp": 0.00283077, "balance_loss_clip": 1.2729212, "balance_loss_mlp": 0.25406125, "epoch": 0.27776942732601834, "flos": 23549823072000.0, "grad_norm": 33.29519430612466, "language_loss": 0.82342708, "learning_rate": 3.3898427173992778e-06, "loss": 0.84211296, "num_input_tokens_seen": 99829575, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.29003906, "step": 4620, "time_per_iteration": 2.6956193447113037 }, { "auxiliary_loss_clip": 0.01574375, "auxiliary_loss_mlp": 0.00315389, "balance_loss_clip": 1.26078653, "balance_loss_mlp": 0.28355998, "epoch": 0.2778295505786863, "flos": 23908713811200.0, "grad_norm": 18.652174699886842, "language_loss": 0.82735336, "learning_rate": 3.389562634707122e-06, "loss": 0.84625101, "num_input_tokens_seen": 99847575, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.31799316, "step": 4621, "time_per_iteration": 4.007266998291016 }, { "auxiliary_loss_clip": 0.01598988, "auxiliary_loss_mlp": 0.00279542, "balance_loss_clip": 1.27712619, "balance_loss_mlp": 0.2485718, "epoch": 0.27788967383135427, "flos": 25554571920000.0, "grad_norm": 3.370325993982739, "language_loss": 0.94443864, "learning_rate": 3.389282499322611e-06, "loss": 0.96322405, "num_input_tokens_seen": 99864995, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.30957031, "step": 4622, "time_per_iteration": 2.637813091278076 }, { "auxiliary_loss_clip": 0.01575449, "auxiliary_loss_mlp": 0.00311614, "balance_loss_clip": 1.25880432, "balance_loss_mlp": 0.28040501, "epoch": 0.27794979708402223, "flos": 16252631481600.0, "grad_norm": 36.25405837134604, "language_loss": 0.90045464, "learning_rate": 3.389002311256369e-06, "loss": 0.91932523, "num_input_tokens_seen": 99881540, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.31225586, "step": 4623, "time_per_iteration": 4.141775608062744 }, { "auxiliary_loss_clip": 0.01608817, "auxiliary_loss_mlp": 0.00304766, "balance_loss_clip": 1.28664196, "balance_loss_mlp": 0.27424854, "epoch": 0.2780099203366902, "flos": 20667632791680.0, "grad_norm": 21.36671290110447, "language_loss": 0.88555604, "learning_rate": 3.3887220705190204e-06, "loss": 0.90469182, "num_input_tokens_seen": 99899595, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.30493164, "step": 4624, "time_per_iteration": 2.686006784439087 }, { "auxiliary_loss_clip": 0.01596086, "auxiliary_loss_mlp": 0.00267632, "balance_loss_clip": 1.27988625, "balance_loss_mlp": 0.23620847, "epoch": 0.27807004358935816, "flos": 17739583822080.0, "grad_norm": 210.86693173980385, "language_loss": 0.84688556, "learning_rate": 3.388441777121191e-06, "loss": 0.86552274, "num_input_tokens_seen": 99913020, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.31445312, "step": 4625, "time_per_iteration": 2.646076202392578 }, { "auxiliary_loss_clip": 0.01583808, "auxiliary_loss_mlp": 0.00275669, "balance_loss_clip": 1.27165687, "balance_loss_mlp": 0.24379244, "epoch": 0.2781301668420261, "flos": 16727119165440.0, "grad_norm": 68.9408105294936, "language_loss": 0.77647233, "learning_rate": 3.388161431073511e-06, "loss": 0.79506707, "num_input_tokens_seen": 99931405, "router_z_loss_clip": 3.12109375, "router_z_loss_mlp": 0.31872559, "step": 4626, "time_per_iteration": 2.6243302822113037 }, { "auxiliary_loss_clip": 0.01619271, "auxiliary_loss_mlp": 0.00299992, "balance_loss_clip": 1.29222524, "balance_loss_mlp": 0.26878339, "epoch": 0.27819029009469415, "flos": 13844749317120.0, "grad_norm": 2.7969993226072294, "language_loss": 1.02144611, "learning_rate": 3.38788103238661e-06, "loss": 1.04063869, "num_input_tokens_seen": 99948100, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.31225586, "step": 4627, "time_per_iteration": 2.607041120529175 }, { "auxiliary_loss_clip": 0.0159841, "auxiliary_loss_mlp": 0.00299188, "balance_loss_clip": 1.2749207, "balance_loss_mlp": 0.27014881, "epoch": 0.2782504133473621, "flos": 27089286370560.0, "grad_norm": 3.1215468305476413, "language_loss": 0.91142803, "learning_rate": 3.387600581071121e-06, "loss": 0.93040407, "num_input_tokens_seen": 99966470, "router_z_loss_clip": 3.234375, "router_z_loss_mlp": 0.29016113, "step": 4628, "time_per_iteration": 2.694209575653076 }, { "auxiliary_loss_clip": 0.01582885, "auxiliary_loss_mlp": 0.0030818, "balance_loss_clip": 1.26539481, "balance_loss_mlp": 0.27487323, "epoch": 0.2783105366000301, "flos": 21068826773760.0, "grad_norm": 5.889525070693907, "language_loss": 0.84191138, "learning_rate": 3.387320077137679e-06, "loss": 0.86082202, "num_input_tokens_seen": 99985930, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.33276367, "step": 4629, "time_per_iteration": 2.7062413692474365 }, { "auxiliary_loss_clip": 0.01574135, "auxiliary_loss_mlp": 0.00307547, "balance_loss_clip": 1.26568246, "balance_loss_mlp": 0.27811375, "epoch": 0.27837065985269804, "flos": 26501823434880.0, "grad_norm": 8.882886865317023, "language_loss": 0.89122188, "learning_rate": 3.3870395205969208e-06, "loss": 0.91003871, "num_input_tokens_seen": 100006235, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.29431152, "step": 4630, "time_per_iteration": 2.7005350589752197 }, { "auxiliary_loss_clip": 0.01568982, "auxiliary_loss_mlp": 0.00291474, "balance_loss_clip": 1.2619797, "balance_loss_mlp": 0.26083672, "epoch": 0.278430783105366, "flos": 20223201813120.0, "grad_norm": 125.04851336576725, "language_loss": 0.90355325, "learning_rate": 3.386758911459485e-06, "loss": 0.92215776, "num_input_tokens_seen": 100023655, "router_z_loss_clip": 3.0703125, "router_z_loss_mlp": 0.30615234, "step": 4631, "time_per_iteration": 2.6541709899902344 }, { "auxiliary_loss_clip": 0.01578673, "auxiliary_loss_mlp": 0.0034988, "balance_loss_clip": 1.26995945, "balance_loss_mlp": 0.31790826, "epoch": 0.278490906358034, "flos": 25592888753280.0, "grad_norm": 78.68601412124104, "language_loss": 0.79526138, "learning_rate": 3.3864782497360126e-06, "loss": 0.81454688, "num_input_tokens_seen": 100043280, "router_z_loss_clip": 3.0859375, "router_z_loss_mlp": 0.31982422, "step": 4632, "time_per_iteration": 4.065192699432373 }, { "auxiliary_loss_clip": 0.01603373, "auxiliary_loss_mlp": 0.00296513, "balance_loss_clip": 1.29069543, "balance_loss_mlp": 0.26597169, "epoch": 0.27855102961070194, "flos": 16171544528640.0, "grad_norm": 43.06459144275631, "language_loss": 0.87519211, "learning_rate": 3.386197535437145e-06, "loss": 0.89419097, "num_input_tokens_seen": 100057690, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.30517578, "step": 4633, "time_per_iteration": 2.5964860916137695 }, { "auxiliary_loss_clip": 0.01602117, "auxiliary_loss_mlp": 0.00329289, "balance_loss_clip": 1.28624964, "balance_loss_mlp": 0.2972818, "epoch": 0.2786111528633699, "flos": 22927598749440.0, "grad_norm": 47.658037151536945, "language_loss": 0.92635447, "learning_rate": 3.385916768573529e-06, "loss": 0.94566846, "num_input_tokens_seen": 100075875, "router_z_loss_clip": 3.16015625, "router_z_loss_mlp": 0.31994629, "step": 4634, "time_per_iteration": 2.663015604019165 }, { "auxiliary_loss_clip": 0.01611334, "auxiliary_loss_mlp": 0.00355778, "balance_loss_clip": 1.29139757, "balance_loss_mlp": 0.32168448, "epoch": 0.27867127611603787, "flos": 23404205335680.0, "grad_norm": 7.674788406087617, "language_loss": 0.83177418, "learning_rate": 3.38563594915581e-06, "loss": 0.85144532, "num_input_tokens_seen": 100092930, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.34082031, "step": 4635, "time_per_iteration": 2.734494686126709 }, { "auxiliary_loss_clip": 0.01604974, "auxiliary_loss_mlp": 0.00363121, "balance_loss_clip": 1.28768635, "balance_loss_mlp": 0.33245987, "epoch": 0.27873139936870583, "flos": 19829010983040.0, "grad_norm": 5.527385940079714, "language_loss": 0.72194642, "learning_rate": 3.385355077194637e-06, "loss": 0.74162734, "num_input_tokens_seen": 100110790, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.30639648, "step": 4636, "time_per_iteration": 2.650726795196533 }, { "auxiliary_loss_clip": 0.01569717, "auxiliary_loss_mlp": 0.00370269, "balance_loss_clip": 1.26028883, "balance_loss_mlp": 0.33827296, "epoch": 0.2787915226213738, "flos": 17707659609600.0, "grad_norm": 16.21700259165247, "language_loss": 0.93640453, "learning_rate": 3.3850741527006604e-06, "loss": 0.95580435, "num_input_tokens_seen": 100126970, "router_z_loss_clip": 3.09570312, "router_z_loss_mlp": 0.31982422, "step": 4637, "time_per_iteration": 2.607879400253296 }, { "auxiliary_loss_clip": 0.01584217, "auxiliary_loss_mlp": 0.00326159, "balance_loss_clip": 1.27805281, "balance_loss_mlp": 0.29723868, "epoch": 0.27885164587404176, "flos": 22090557139200.0, "grad_norm": 11.459380443733988, "language_loss": 0.83055186, "learning_rate": 3.384793175684533e-06, "loss": 0.84965563, "num_input_tokens_seen": 100146720, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.28918457, "step": 4638, "time_per_iteration": 2.6357314586639404 }, { "auxiliary_loss_clip": 0.01599448, "auxiliary_loss_mlp": 0.00317989, "balance_loss_clip": 1.28620887, "balance_loss_mlp": 0.28832984, "epoch": 0.27891176912670973, "flos": 19207684500480.0, "grad_norm": 3.1586450689574246, "language_loss": 0.79116398, "learning_rate": 3.38451214615691e-06, "loss": 0.81033838, "num_input_tokens_seen": 100165920, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.29663086, "step": 4639, "time_per_iteration": 2.6228363513946533 }, { "auxiliary_loss_clip": 0.01602918, "auxiliary_loss_mlp": 0.00338244, "balance_loss_clip": 1.28529024, "balance_loss_mlp": 0.30534187, "epoch": 0.27897189237937775, "flos": 27600007898880.0, "grad_norm": 3.3646375847196306, "language_loss": 0.75042534, "learning_rate": 3.384231064128447e-06, "loss": 0.7698369, "num_input_tokens_seen": 100185525, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.32910156, "step": 4640, "time_per_iteration": 2.680684804916382 }, { "auxiliary_loss_clip": 0.01589465, "auxiliary_loss_mlp": 0.00293235, "balance_loss_clip": 1.27726936, "balance_loss_mlp": 0.26505345, "epoch": 0.2790320156320457, "flos": 21178210665600.0, "grad_norm": 8.587089712339315, "language_loss": 0.79932547, "learning_rate": 3.383949929609804e-06, "loss": 0.81815243, "num_input_tokens_seen": 100204850, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.28198242, "step": 4641, "time_per_iteration": 2.6388065814971924 }, { "auxiliary_loss_clip": 0.0159008, "auxiliary_loss_mlp": 0.00307602, "balance_loss_clip": 1.27904952, "balance_loss_mlp": 0.27589187, "epoch": 0.2790921388847137, "flos": 22783920347520.0, "grad_norm": 3.7395298864133086, "language_loss": 0.84073955, "learning_rate": 3.383668742611641e-06, "loss": 0.8597163, "num_input_tokens_seen": 100224520, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.31713867, "step": 4642, "time_per_iteration": 2.6814684867858887 }, { "auxiliary_loss_clip": 0.01583364, "auxiliary_loss_mlp": 0.00365986, "balance_loss_clip": 1.27534604, "balance_loss_mlp": 0.33563581, "epoch": 0.27915226213738165, "flos": 23400649889280.0, "grad_norm": 9.89059585062372, "language_loss": 0.91325057, "learning_rate": 3.3833875031446205e-06, "loss": 0.93274409, "num_input_tokens_seen": 100243935, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.30322266, "step": 4643, "time_per_iteration": 2.6561787128448486 }, { "auxiliary_loss_clip": 0.01580578, "auxiliary_loss_mlp": 0.00320976, "balance_loss_clip": 1.26607442, "balance_loss_mlp": 0.29036266, "epoch": 0.2792123853900496, "flos": 22747794243840.0, "grad_norm": 2.240433272018533, "language_loss": 0.88273239, "learning_rate": 3.383106211219407e-06, "loss": 0.90174794, "num_input_tokens_seen": 100262290, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.30615234, "step": 4644, "time_per_iteration": 2.612318277359009 }, { "auxiliary_loss_clip": 0.01595326, "auxiliary_loss_mlp": 0.00313591, "balance_loss_clip": 1.28029823, "balance_loss_mlp": 0.28269216, "epoch": 0.2792725086427176, "flos": 15049372757760.0, "grad_norm": 4.4783898912065645, "language_loss": 0.86636418, "learning_rate": 3.3828248668466673e-06, "loss": 0.88545334, "num_input_tokens_seen": 100280015, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.30908203, "step": 4645, "time_per_iteration": 2.6397786140441895 }, { "auxiliary_loss_clip": 0.01345419, "auxiliary_loss_mlp": 0.00121615, "balance_loss_clip": 1.11451316, "balance_loss_mlp": 0.11498653, "epoch": 0.27933263189538554, "flos": 62544861757440.0, "grad_norm": 0.7693157033174948, "language_loss": 0.62047958, "learning_rate": 3.3825434700370705e-06, "loss": 0.63514996, "num_input_tokens_seen": 100338935, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.06640625, "step": 4646, "time_per_iteration": 3.0961034297943115 }, { "auxiliary_loss_clip": 0.01599694, "auxiliary_loss_mlp": 0.00279291, "balance_loss_clip": 1.28531325, "balance_loss_mlp": 0.24886864, "epoch": 0.2793927551480535, "flos": 25118365155840.0, "grad_norm": 9.400963403858782, "language_loss": 0.92530847, "learning_rate": 3.3822620208012865e-06, "loss": 0.94409835, "num_input_tokens_seen": 100359905, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.30407715, "step": 4647, "time_per_iteration": 2.6632654666900635 }, { "auxiliary_loss_clip": 0.0159555, "auxiliary_loss_mlp": 0.00335325, "balance_loss_clip": 1.28249013, "balance_loss_mlp": 0.30504617, "epoch": 0.27945287840072147, "flos": 21324582587520.0, "grad_norm": 21.792047087840604, "language_loss": 0.9149158, "learning_rate": 3.381980519149988e-06, "loss": 0.93422461, "num_input_tokens_seen": 100376955, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.30297852, "step": 4648, "time_per_iteration": 2.690333127975464 }, { "auxiliary_loss_clip": 0.01594285, "auxiliary_loss_mlp": 0.00326331, "balance_loss_clip": 1.2798692, "balance_loss_mlp": 0.29237992, "epoch": 0.27951300165338944, "flos": 27450547407360.0, "grad_norm": 6.03952927016642, "language_loss": 0.80887538, "learning_rate": 3.38169896509385e-06, "loss": 0.82808161, "num_input_tokens_seen": 100397545, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.33935547, "step": 4649, "time_per_iteration": 2.724867105484009 }, { "auxiliary_loss_clip": 0.0158349, "auxiliary_loss_mlp": 0.00296855, "balance_loss_clip": 1.28237367, "balance_loss_mlp": 0.26409596, "epoch": 0.2795731249060574, "flos": 15159008044800.0, "grad_norm": 2.309082706861934, "language_loss": 0.8973282, "learning_rate": 3.381417358643549e-06, "loss": 0.91613162, "num_input_tokens_seen": 100415080, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.32739258, "step": 4650, "time_per_iteration": 2.72514009475708 }, { "auxiliary_loss_clip": 0.01318539, "auxiliary_loss_mlp": 0.0010631, "balance_loss_clip": 1.09032428, "balance_loss_mlp": 0.10001602, "epoch": 0.27963324815872537, "flos": 60120103178880.0, "grad_norm": 0.8065755201521717, "language_loss": 0.58598304, "learning_rate": 3.3811356998097624e-06, "loss": 0.60023153, "num_input_tokens_seen": 100471105, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.06298828, "step": 4651, "time_per_iteration": 3.1868698596954346 }, { "auxiliary_loss_clip": 0.01598704, "auxiliary_loss_mlp": 0.002965, "balance_loss_clip": 1.28565764, "balance_loss_mlp": 0.26426566, "epoch": 0.27969337141139333, "flos": 21765960910080.0, "grad_norm": 2.409292006228867, "language_loss": 0.81082332, "learning_rate": 3.3808539886031726e-06, "loss": 0.82977533, "num_input_tokens_seen": 100492520, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.32275391, "step": 4652, "time_per_iteration": 2.676682710647583 }, { "auxiliary_loss_clip": 0.01599309, "auxiliary_loss_mlp": 0.00273548, "balance_loss_clip": 1.29238999, "balance_loss_mlp": 0.24128976, "epoch": 0.27975349466406135, "flos": 39851398834560.0, "grad_norm": 2.282969532173594, "language_loss": 0.85929394, "learning_rate": 3.380572225034461e-06, "loss": 0.87802249, "num_input_tokens_seen": 100512870, "router_z_loss_clip": 3.06835938, "router_z_loss_mlp": 0.32250977, "step": 4653, "time_per_iteration": 2.820068120956421 }, { "auxiliary_loss_clip": 0.01587644, "auxiliary_loss_mlp": 0.00284054, "balance_loss_clip": 1.28299057, "balance_loss_mlp": 0.25478843, "epoch": 0.2798136179167293, "flos": 21579799697280.0, "grad_norm": 6.011123518874792, "language_loss": 0.87384433, "learning_rate": 3.380290409114312e-06, "loss": 0.89256132, "num_input_tokens_seen": 100531655, "router_z_loss_clip": 3.04296875, "router_z_loss_mlp": 0.29284668, "step": 4654, "time_per_iteration": 2.6798083782196045 }, { "auxiliary_loss_clip": 0.01635706, "auxiliary_loss_mlp": 0.00267069, "balance_loss_clip": 1.31629753, "balance_loss_mlp": 0.23596719, "epoch": 0.2798737411693973, "flos": 21537676022400.0, "grad_norm": 118.2723476332369, "language_loss": 0.89278573, "learning_rate": 3.3800085408534127e-06, "loss": 0.9118135, "num_input_tokens_seen": 100548005, "router_z_loss_clip": 3.1953125, "router_z_loss_mlp": 0.31091309, "step": 4655, "time_per_iteration": 2.687196731567383 }, { "auxiliary_loss_clip": 0.01612749, "auxiliary_loss_mlp": 0.0026334, "balance_loss_clip": 1.30090249, "balance_loss_mlp": 0.23322769, "epoch": 0.27993386442206525, "flos": 26981051713920.0, "grad_norm": 202.88922219888045, "language_loss": 0.87620574, "learning_rate": 3.3797266202624506e-06, "loss": 0.8949666, "num_input_tokens_seen": 100567980, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.30102539, "step": 4656, "time_per_iteration": 2.7527873516082764 }, { "auxiliary_loss_clip": 0.0160799, "auxiliary_loss_mlp": 0.00253687, "balance_loss_clip": 1.30306864, "balance_loss_mlp": 0.22533867, "epoch": 0.2799939876747332, "flos": 24349876652160.0, "grad_norm": 55.75280247176119, "language_loss": 0.88150108, "learning_rate": 3.3794446473521176e-06, "loss": 0.90011787, "num_input_tokens_seen": 100588630, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.28356934, "step": 4657, "time_per_iteration": 2.7641730308532715 }, { "auxiliary_loss_clip": 0.01633502, "auxiliary_loss_mlp": 0.00275981, "balance_loss_clip": 1.32360387, "balance_loss_mlp": 0.24665537, "epoch": 0.2800541109274012, "flos": 33656988648960.0, "grad_norm": 67.93379707015656, "language_loss": 0.72181153, "learning_rate": 3.379162622133105e-06, "loss": 0.74090642, "num_input_tokens_seen": 100608775, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.29321289, "step": 4658, "time_per_iteration": 2.749008893966675 }, { "auxiliary_loss_clip": 0.0164013, "auxiliary_loss_mlp": 0.00253387, "balance_loss_clip": 1.32042897, "balance_loss_mlp": 0.22210626, "epoch": 0.28011423418006914, "flos": 21614417429760.0, "grad_norm": 2.147522034004586, "language_loss": 0.87796485, "learning_rate": 3.3788805446161073e-06, "loss": 0.8969, "num_input_tokens_seen": 100627975, "router_z_loss_clip": 3.1953125, "router_z_loss_mlp": 0.31298828, "step": 4659, "time_per_iteration": 4.165915250778198 }, { "auxiliary_loss_clip": 0.01620776, "auxiliary_loss_mlp": 0.00243709, "balance_loss_clip": 1.31906676, "balance_loss_mlp": 0.21369207, "epoch": 0.2801743574327371, "flos": 23112431159040.0, "grad_norm": 6.023846862849659, "language_loss": 0.86748183, "learning_rate": 3.3785984148118215e-06, "loss": 0.88612676, "num_input_tokens_seen": 100645430, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.29980469, "step": 4660, "time_per_iteration": 2.637861490249634 }, { "auxiliary_loss_clip": 0.01656307, "auxiliary_loss_mlp": 0.00255534, "balance_loss_clip": 1.34179986, "balance_loss_mlp": 0.22675736, "epoch": 0.2802344806854051, "flos": 12641418766080.0, "grad_norm": 21.37424897571569, "language_loss": 0.90345782, "learning_rate": 3.3783162327309453e-06, "loss": 0.92257631, "num_input_tokens_seen": 100663775, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.28759766, "step": 4661, "time_per_iteration": 2.6170201301574707 }, { "auxiliary_loss_clip": 0.01696332, "auxiliary_loss_mlp": 0.00270203, "balance_loss_clip": 1.3813374, "balance_loss_mlp": 0.2392564, "epoch": 0.28029460393807304, "flos": 37267878142080.0, "grad_norm": 4.4410778757899845, "language_loss": 0.85267138, "learning_rate": 3.3780339983841794e-06, "loss": 0.87233675, "num_input_tokens_seen": 100686085, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.30957031, "step": 4662, "time_per_iteration": 2.835235595703125 }, { "auxiliary_loss_clip": 0.01659932, "auxiliary_loss_mlp": 0.00279634, "balance_loss_clip": 1.3438859, "balance_loss_mlp": 0.24456219, "epoch": 0.280354727190741, "flos": 20741106061440.0, "grad_norm": 12.22859604157899, "language_loss": 0.77876139, "learning_rate": 3.377751711782227e-06, "loss": 0.79815704, "num_input_tokens_seen": 100705135, "router_z_loss_clip": 3.16015625, "router_z_loss_mlp": 0.35083008, "step": 4663, "time_per_iteration": 4.022468328475952 }, { "auxiliary_loss_clip": 0.01674705, "auxiliary_loss_mlp": 0.00258098, "balance_loss_clip": 1.35780787, "balance_loss_mlp": 0.22715141, "epoch": 0.28041485044340897, "flos": 21471026336640.0, "grad_norm": 59.48558963338177, "language_loss": 0.87360942, "learning_rate": 3.377469372935791e-06, "loss": 0.89293742, "num_input_tokens_seen": 100724960, "router_z_loss_clip": 3.16992188, "router_z_loss_mlp": 0.30957031, "step": 4664, "time_per_iteration": 2.6585397720336914 }, { "auxiliary_loss_clip": 0.01651125, "auxiliary_loss_mlp": 0.00246852, "balance_loss_clip": 1.34845352, "balance_loss_mlp": 0.21609627, "epoch": 0.28047497369607693, "flos": 14794263388800.0, "grad_norm": 2.486962505327289, "language_loss": 0.86085814, "learning_rate": 3.377186981855578e-06, "loss": 0.87983793, "num_input_tokens_seen": 100741995, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.30749512, "step": 4665, "time_per_iteration": 4.002792596817017 }, { "auxiliary_loss_clip": 0.0165657, "auxiliary_loss_mlp": 0.00238621, "balance_loss_clip": 1.34856427, "balance_loss_mlp": 0.20891395, "epoch": 0.2805350969487449, "flos": 23070738447360.0, "grad_norm": 38.928299230022056, "language_loss": 0.8765465, "learning_rate": 3.3769045385522968e-06, "loss": 0.89549839, "num_input_tokens_seen": 100758985, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.29711914, "step": 4666, "time_per_iteration": 2.6525614261627197 }, { "auxiliary_loss_clip": 0.01690248, "auxiliary_loss_mlp": 0.00283673, "balance_loss_clip": 1.37830782, "balance_loss_mlp": 0.25370368, "epoch": 0.2805952202014129, "flos": 20479855466880.0, "grad_norm": 19.345630329293527, "language_loss": 0.92719686, "learning_rate": 3.376622043036658e-06, "loss": 0.94693613, "num_input_tokens_seen": 100777820, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.29968262, "step": 4667, "time_per_iteration": 2.7392404079437256 }, { "auxiliary_loss_clip": 0.01682229, "auxiliary_loss_mlp": 0.00258197, "balance_loss_clip": 1.36356843, "balance_loss_mlp": 0.22858526, "epoch": 0.2806553434540809, "flos": 27417330305280.0, "grad_norm": 69.1957259778857, "language_loss": 0.86132646, "learning_rate": 3.376339495319373e-06, "loss": 0.88073069, "num_input_tokens_seen": 100798205, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.29602051, "step": 4668, "time_per_iteration": 2.7748167514801025 }, { "auxiliary_loss_clip": 0.01698897, "auxiliary_loss_mlp": 0.00264513, "balance_loss_clip": 1.37199473, "balance_loss_mlp": 0.22905988, "epoch": 0.28071546670674885, "flos": 26505019745280.0, "grad_norm": 8.043220347628944, "language_loss": 0.81349754, "learning_rate": 3.3760568954111563e-06, "loss": 0.83313167, "num_input_tokens_seen": 100819800, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.35449219, "step": 4669, "time_per_iteration": 2.7716145515441895 }, { "auxiliary_loss_clip": 0.01665705, "auxiliary_loss_mlp": 0.00255011, "balance_loss_clip": 1.35506904, "balance_loss_mlp": 0.22327772, "epoch": 0.2807755899594168, "flos": 20558679863040.0, "grad_norm": 963.2182609046885, "language_loss": 0.87411159, "learning_rate": 3.375774243322725e-06, "loss": 0.89331877, "num_input_tokens_seen": 100837880, "router_z_loss_clip": 3.10351562, "router_z_loss_mlp": 0.31738281, "step": 4670, "time_per_iteration": 2.6526458263397217 }, { "auxiliary_loss_clip": 0.01674316, "auxiliary_loss_mlp": 0.00253883, "balance_loss_clip": 1.36345243, "balance_loss_mlp": 0.2204091, "epoch": 0.2808357132120848, "flos": 24313319585280.0, "grad_norm": 40.388095305024486, "language_loss": 0.89394146, "learning_rate": 3.3754915390647955e-06, "loss": 0.91322351, "num_input_tokens_seen": 100856350, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.33447266, "step": 4671, "time_per_iteration": 2.6808464527130127 }, { "auxiliary_loss_clip": 0.01693974, "auxiliary_loss_mlp": 0.0023103, "balance_loss_clip": 1.38039494, "balance_loss_mlp": 0.20084664, "epoch": 0.28089583646475275, "flos": 26432408401920.0, "grad_norm": 5.325396187758985, "language_loss": 0.81257355, "learning_rate": 3.37520878264809e-06, "loss": 0.83182359, "num_input_tokens_seen": 100876135, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.30200195, "step": 4672, "time_per_iteration": 2.677990436553955 }, { "auxiliary_loss_clip": 0.01684123, "auxiliary_loss_mlp": 0.00261831, "balance_loss_clip": 1.36937022, "balance_loss_mlp": 0.22735575, "epoch": 0.2809559597174207, "flos": 23111820627840.0, "grad_norm": 75.45867886544858, "language_loss": 0.85301149, "learning_rate": 3.3749259740833286e-06, "loss": 0.87247097, "num_input_tokens_seen": 100894790, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.3449707, "step": 4673, "time_per_iteration": 2.6267406940460205 }, { "auxiliary_loss_clip": 0.01674223, "auxiliary_loss_mlp": 0.00255654, "balance_loss_clip": 1.36602974, "balance_loss_mlp": 0.22380091, "epoch": 0.2810160829700887, "flos": 20923496346240.0, "grad_norm": 9.332460671539687, "language_loss": 0.79087758, "learning_rate": 3.374643113381237e-06, "loss": 0.81017631, "num_input_tokens_seen": 100915100, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.31860352, "step": 4674, "time_per_iteration": 4.024905204772949 }, { "auxiliary_loss_clip": 0.01657641, "auxiliary_loss_mlp": 0.0022761, "balance_loss_clip": 1.35396099, "balance_loss_mlp": 0.19661546, "epoch": 0.28107620622275664, "flos": 14355901808640.0, "grad_norm": 11.516676175197956, "language_loss": 0.8464402, "learning_rate": 3.374360200552541e-06, "loss": 0.86529273, "num_input_tokens_seen": 100932795, "router_z_loss_clip": 3.03515625, "router_z_loss_mlp": 0.31030273, "step": 4675, "time_per_iteration": 2.6272220611572266 }, { "auxiliary_loss_clip": 0.0166156, "auxiliary_loss_mlp": 0.00225744, "balance_loss_clip": 1.35475457, "balance_loss_mlp": 0.19470204, "epoch": 0.2811363294754246, "flos": 20919078973440.0, "grad_norm": 19.224872172372645, "language_loss": 0.78491974, "learning_rate": 3.374077235607968e-06, "loss": 0.80379283, "num_input_tokens_seen": 100950505, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.31054688, "step": 4676, "time_per_iteration": 2.694166898727417 }, { "auxiliary_loss_clip": 0.01661827, "auxiliary_loss_mlp": 0.00229228, "balance_loss_clip": 1.35355949, "balance_loss_mlp": 0.19966441, "epoch": 0.28119645272809257, "flos": 20594841880320.0, "grad_norm": 179.06059161888058, "language_loss": 0.77235931, "learning_rate": 3.3737942185582487e-06, "loss": 0.7912699, "num_input_tokens_seen": 100968790, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.29541016, "step": 4677, "time_per_iteration": 2.622537851333618 }, { "auxiliary_loss_clip": 0.01628561, "auxiliary_loss_mlp": 0.00237767, "balance_loss_clip": 1.33231544, "balance_loss_mlp": 0.20739228, "epoch": 0.28125657598076054, "flos": 25337420248320.0, "grad_norm": 17.650262076095398, "language_loss": 0.69706202, "learning_rate": 3.3735111494141153e-06, "loss": 0.7157253, "num_input_tokens_seen": 100990205, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.3034668, "step": 4678, "time_per_iteration": 2.6600399017333984 }, { "auxiliary_loss_clip": 0.01608766, "auxiliary_loss_mlp": 0.00224259, "balance_loss_clip": 1.31726384, "balance_loss_mlp": 0.19419467, "epoch": 0.2813166992334285, "flos": 24827093769600.0, "grad_norm": 50.077661588530745, "language_loss": 0.77269304, "learning_rate": 3.3732280281863013e-06, "loss": 0.79102325, "num_input_tokens_seen": 101009815, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.30078125, "step": 4679, "time_per_iteration": 2.6743009090423584 }, { "auxiliary_loss_clip": 0.01644159, "auxiliary_loss_mlp": 0.00220089, "balance_loss_clip": 1.34293056, "balance_loss_mlp": 0.18778336, "epoch": 0.2813768224860965, "flos": 21760753438080.0, "grad_norm": 3.054637002136198, "language_loss": 0.82242954, "learning_rate": 3.3729448548855422e-06, "loss": 0.84107202, "num_input_tokens_seen": 101026780, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.32299805, "step": 4680, "time_per_iteration": 2.637751340866089 }, { "auxiliary_loss_clip": 0.01603688, "auxiliary_loss_mlp": 0.00225466, "balance_loss_clip": 1.31495237, "balance_loss_mlp": 0.19618854, "epoch": 0.2814369457387645, "flos": 24316803204480.0, "grad_norm": 57.16248163343582, "language_loss": 0.84591174, "learning_rate": 3.3726616295225774e-06, "loss": 0.86420333, "num_input_tokens_seen": 101046215, "router_z_loss_clip": 2.88476562, "router_z_loss_mlp": 0.29309082, "step": 4681, "time_per_iteration": 2.6839046478271484 }, { "auxiliary_loss_clip": 0.0164386, "auxiliary_loss_mlp": 0.00257883, "balance_loss_clip": 1.33828688, "balance_loss_mlp": 0.22681752, "epoch": 0.28149706899143245, "flos": 18515326872960.0, "grad_norm": 55.90385149030006, "language_loss": 0.83569509, "learning_rate": 3.372378352108146e-06, "loss": 0.85471255, "num_input_tokens_seen": 101063365, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.31066895, "step": 4682, "time_per_iteration": 2.636246919631958 }, { "auxiliary_loss_clip": 0.0163367, "auxiliary_loss_mlp": 0.00230523, "balance_loss_clip": 1.33894205, "balance_loss_mlp": 0.20236562, "epoch": 0.2815571922441004, "flos": 24863255786880.0, "grad_norm": 16.4274315057748, "language_loss": 0.87505263, "learning_rate": 3.3720950226529894e-06, "loss": 0.89369458, "num_input_tokens_seen": 101083835, "router_z_loss_clip": 2.94921875, "router_z_loss_mlp": 0.28149414, "step": 4683, "time_per_iteration": 2.652150869369507 }, { "auxiliary_loss_clip": 0.01625457, "auxiliary_loss_mlp": 0.00256421, "balance_loss_clip": 1.32566202, "balance_loss_mlp": 0.22602242, "epoch": 0.2816173154967684, "flos": 19901622326400.0, "grad_norm": 3682.7688563605384, "language_loss": 0.82746804, "learning_rate": 3.371811641167852e-06, "loss": 0.84628689, "num_input_tokens_seen": 101101740, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.30395508, "step": 4684, "time_per_iteration": 2.6531569957733154 }, { "auxiliary_loss_clip": 0.01601331, "auxiliary_loss_mlp": 0.00196203, "balance_loss_clip": 1.31345677, "balance_loss_mlp": 0.16570926, "epoch": 0.28167743874943635, "flos": 17491333950720.0, "grad_norm": 1.986915627223217, "language_loss": 0.85059953, "learning_rate": 3.3715282076634807e-06, "loss": 0.86857486, "num_input_tokens_seen": 101120480, "router_z_loss_clip": 2.87890625, "router_z_loss_mlp": 0.30493164, "step": 4685, "time_per_iteration": 2.612409830093384 }, { "auxiliary_loss_clip": 0.01618363, "auxiliary_loss_mlp": 0.00262594, "balance_loss_clip": 1.32211316, "balance_loss_mlp": 0.23090824, "epoch": 0.2817375620021043, "flos": 25302120157440.0, "grad_norm": 2.3967575770447476, "language_loss": 0.82967412, "learning_rate": 3.3712447221506218e-06, "loss": 0.84848362, "num_input_tokens_seen": 101142910, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.31665039, "step": 4686, "time_per_iteration": 2.6875386238098145 }, { "auxiliary_loss_clip": 0.01609973, "auxiliary_loss_mlp": 0.00285256, "balance_loss_clip": 1.31171894, "balance_loss_mlp": 0.24927905, "epoch": 0.2817976852547723, "flos": 18693227957760.0, "grad_norm": 3.1787296748717178, "language_loss": 0.75641596, "learning_rate": 3.370961184640025e-06, "loss": 0.77536827, "num_input_tokens_seen": 101160030, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.35986328, "step": 4687, "time_per_iteration": 2.6094439029693604 }, { "auxiliary_loss_clip": 0.01587667, "auxiliary_loss_mlp": 0.0027306, "balance_loss_clip": 1.29992533, "balance_loss_mlp": 0.24082616, "epoch": 0.28185780850744024, "flos": 22742263549440.0, "grad_norm": 2.9333868868963364, "language_loss": 0.81783873, "learning_rate": 3.3706775951424433e-06, "loss": 0.83644599, "num_input_tokens_seen": 101177675, "router_z_loss_clip": 2.87890625, "router_z_loss_mlp": 0.32202148, "step": 4688, "time_per_iteration": 2.6644248962402344 }, { "auxiliary_loss_clip": 0.01624791, "auxiliary_loss_mlp": 0.00246188, "balance_loss_clip": 1.32582569, "balance_loss_mlp": 0.21804251, "epoch": 0.2819179317601082, "flos": 14933919467520.0, "grad_norm": 9.43176201369757, "language_loss": 0.86257523, "learning_rate": 3.37039395366863e-06, "loss": 0.88128507, "num_input_tokens_seen": 101192225, "router_z_loss_clip": 2.98828125, "router_z_loss_mlp": 0.28161621, "step": 4689, "time_per_iteration": 2.6134145259857178 }, { "auxiliary_loss_clip": 0.01613291, "auxiliary_loss_mlp": 0.00279853, "balance_loss_clip": 1.31959069, "balance_loss_mlp": 0.24957344, "epoch": 0.2819780550127762, "flos": 23145325038720.0, "grad_norm": 24.25863405412492, "language_loss": 0.84549701, "learning_rate": 3.37011026022934e-06, "loss": 0.86442846, "num_input_tokens_seen": 101210870, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.30297852, "step": 4690, "time_per_iteration": 2.6709845066070557 }, { "auxiliary_loss_clip": 0.01597002, "auxiliary_loss_mlp": 0.00286513, "balance_loss_clip": 1.30673528, "balance_loss_mlp": 0.2578311, "epoch": 0.28203817826544414, "flos": 21616356764160.0, "grad_norm": 3.4096801956659735, "language_loss": 0.96184105, "learning_rate": 3.369826514835332e-06, "loss": 0.98067617, "num_input_tokens_seen": 101229965, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.28710938, "step": 4691, "time_per_iteration": 2.697049379348755 }, { "auxiliary_loss_clip": 0.01616103, "auxiliary_loss_mlp": 0.00268462, "balance_loss_clip": 1.3186121, "balance_loss_mlp": 0.23875535, "epoch": 0.2820983015181121, "flos": 24026788794240.0, "grad_norm": 31.19083363063642, "language_loss": 0.87851799, "learning_rate": 3.3695427174973654e-06, "loss": 0.89736366, "num_input_tokens_seen": 101250980, "router_z_loss_clip": 2.9765625, "router_z_loss_mlp": 0.29663086, "step": 4692, "time_per_iteration": 2.6714847087860107 }, { "auxiliary_loss_clip": 0.01608532, "auxiliary_loss_mlp": 0.00305365, "balance_loss_clip": 1.31909978, "balance_loss_mlp": 0.27561063, "epoch": 0.2821584247707801, "flos": 30007925976960.0, "grad_norm": 4.525560818070334, "language_loss": 0.80368114, "learning_rate": 3.3692588682262022e-06, "loss": 0.82282007, "num_input_tokens_seen": 101273335, "router_z_loss_clip": 2.89453125, "router_z_loss_mlp": 0.29760742, "step": 4693, "time_per_iteration": 2.721050500869751 }, { "auxiliary_loss_clip": 0.01604025, "auxiliary_loss_mlp": 0.0029195, "balance_loss_clip": 1.31114423, "balance_loss_mlp": 0.26167127, "epoch": 0.2822185480234481, "flos": 21396762967680.0, "grad_norm": 2.8311450666874833, "language_loss": 0.85904908, "learning_rate": 3.3689749670326046e-06, "loss": 0.87800884, "num_input_tokens_seen": 101292110, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.30273438, "step": 4694, "time_per_iteration": 2.7414908409118652 }, { "auxiliary_loss_clip": 0.01617627, "auxiliary_loss_mlp": 0.00293478, "balance_loss_clip": 1.31991625, "balance_loss_mlp": 0.26460534, "epoch": 0.28227867127611606, "flos": 27452809964160.0, "grad_norm": 9.731992252037038, "language_loss": 0.72721547, "learning_rate": 3.3686910139273392e-06, "loss": 0.74632645, "num_input_tokens_seen": 101312815, "router_z_loss_clip": 2.98242188, "router_z_loss_mlp": 0.28857422, "step": 4695, "time_per_iteration": 2.7868998050689697 }, { "auxiliary_loss_clip": 0.01633638, "auxiliary_loss_mlp": 0.00343873, "balance_loss_clip": 1.33331573, "balance_loss_mlp": 0.31109065, "epoch": 0.282338794528784, "flos": 22593736811520.0, "grad_norm": 13.943455410931433, "language_loss": 0.84865189, "learning_rate": 3.3684070089211736e-06, "loss": 0.86842704, "num_input_tokens_seen": 101329045, "router_z_loss_clip": 3.00195312, "router_z_loss_mlp": 0.32763672, "step": 4696, "time_per_iteration": 2.7816009521484375 }, { "auxiliary_loss_clip": 0.01584245, "auxiliary_loss_mlp": 0.00325356, "balance_loss_clip": 1.29527271, "balance_loss_mlp": 0.29548216, "epoch": 0.282398917781452, "flos": 42010923386880.0, "grad_norm": 48.82236989271834, "language_loss": 0.68643045, "learning_rate": 3.368122952024877e-06, "loss": 0.70552647, "num_input_tokens_seen": 101352715, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.29833984, "step": 4697, "time_per_iteration": 2.86149001121521 }, { "auxiliary_loss_clip": 0.0158217, "auxiliary_loss_mlp": 0.00307672, "balance_loss_clip": 1.29461598, "balance_loss_mlp": 0.27932429, "epoch": 0.28245904103411995, "flos": 23224724052480.0, "grad_norm": 151.0128702357655, "language_loss": 0.78714895, "learning_rate": 3.3678388432492214e-06, "loss": 0.80604738, "num_input_tokens_seen": 101374640, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.28369141, "step": 4698, "time_per_iteration": 2.755028247833252 }, { "auxiliary_loss_clip": 0.016012, "auxiliary_loss_mlp": 0.00285391, "balance_loss_clip": 1.30688918, "balance_loss_mlp": 0.2548492, "epoch": 0.2825191642867879, "flos": 25374623760000.0, "grad_norm": 3.534610631394604, "language_loss": 0.82200825, "learning_rate": 3.3675546826049788e-06, "loss": 0.8408742, "num_input_tokens_seen": 101393595, "router_z_loss_clip": 2.94335938, "router_z_loss_mlp": 0.30541992, "step": 4699, "time_per_iteration": 2.6520562171936035 }, { "auxiliary_loss_clip": 0.01628401, "auxiliary_loss_mlp": 0.00311691, "balance_loss_clip": 1.32533312, "balance_loss_mlp": 0.28091121, "epoch": 0.2825792875394559, "flos": 17236799199360.0, "grad_norm": 11.106100989706265, "language_loss": 0.94904602, "learning_rate": 3.3672704701029265e-06, "loss": 0.96844697, "num_input_tokens_seen": 101409265, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.30761719, "step": 4700, "time_per_iteration": 2.619940757751465 }, { "auxiliary_loss_clip": 0.01608649, "auxiliary_loss_mlp": 0.00277641, "balance_loss_clip": 1.30747867, "balance_loss_mlp": 0.25031805, "epoch": 0.28263941079212385, "flos": 26723967096960.0, "grad_norm": 7.796887182821752, "language_loss": 0.90085959, "learning_rate": 3.3669862057538402e-06, "loss": 0.9197225, "num_input_tokens_seen": 101428365, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.2734375, "step": 4701, "time_per_iteration": 4.11996865272522 }, { "auxiliary_loss_clip": 0.01640716, "auxiliary_loss_mlp": 0.0029967, "balance_loss_clip": 1.33063745, "balance_loss_mlp": 0.26972461, "epoch": 0.2826995340447918, "flos": 25921327737600.0, "grad_norm": 8.710553034497698, "language_loss": 0.82103479, "learning_rate": 3.3667018895685004e-06, "loss": 0.8404386, "num_input_tokens_seen": 101447280, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.29943848, "step": 4702, "time_per_iteration": 2.701606273651123 }, { "auxiliary_loss_clip": 0.01625542, "auxiliary_loss_mlp": 0.00305274, "balance_loss_clip": 1.32742643, "balance_loss_mlp": 0.27741483, "epoch": 0.2827596572974598, "flos": 22379709623040.0, "grad_norm": 4.979133582492381, "language_loss": 0.84778261, "learning_rate": 3.3664175215576886e-06, "loss": 0.86709082, "num_input_tokens_seen": 101465435, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.27844238, "step": 4703, "time_per_iteration": 2.8046929836273193 }, { "auxiliary_loss_clip": 0.01624231, "auxiliary_loss_mlp": 0.00327248, "balance_loss_clip": 1.32174087, "balance_loss_mlp": 0.29546687, "epoch": 0.28281978055012774, "flos": 33547137880320.0, "grad_norm": 14.798348708245324, "language_loss": 0.76076365, "learning_rate": 3.3661331017321867e-06, "loss": 0.78027844, "num_input_tokens_seen": 101486355, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.31762695, "step": 4704, "time_per_iteration": 2.7232794761657715 }, { "auxiliary_loss_clip": 0.016388, "auxiliary_loss_mlp": 0.00338723, "balance_loss_clip": 1.32434821, "balance_loss_mlp": 0.3053925, "epoch": 0.2828799038027957, "flos": 23440870143360.0, "grad_norm": 4.0721411801949365, "language_loss": 0.77615917, "learning_rate": 3.3658486301027807e-06, "loss": 0.79593444, "num_input_tokens_seen": 101505875, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.33325195, "step": 4705, "time_per_iteration": 4.069917440414429 }, { "auxiliary_loss_clip": 0.01472091, "auxiliary_loss_mlp": 0.00268633, "balance_loss_clip": 1.25661004, "balance_loss_mlp": 0.25337416, "epoch": 0.2829400270554637, "flos": 69873690251520.0, "grad_norm": 0.7191338470940035, "language_loss": 0.59051615, "learning_rate": 3.3655641066802577e-06, "loss": 0.60792339, "num_input_tokens_seen": 101565045, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.15234375, "step": 4706, "time_per_iteration": 3.23539662361145 }, { "auxiliary_loss_clip": 0.01627421, "auxiliary_loss_mlp": 0.00356863, "balance_loss_clip": 1.32206213, "balance_loss_mlp": 0.32673913, "epoch": 0.2830001503081317, "flos": 24789028331520.0, "grad_norm": 1.5433311862065702, "language_loss": 0.86172593, "learning_rate": 3.365279531475407e-06, "loss": 0.88156879, "num_input_tokens_seen": 101585825, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.30126953, "step": 4707, "time_per_iteration": 2.656975746154785 }, { "auxiliary_loss_clip": 0.01613262, "auxiliary_loss_mlp": 0.00345227, "balance_loss_clip": 1.30934143, "balance_loss_mlp": 0.31287348, "epoch": 0.28306027356079966, "flos": 27669387018240.0, "grad_norm": 11.232725228398703, "language_loss": 0.87092268, "learning_rate": 3.36499490449902e-06, "loss": 0.89050758, "num_input_tokens_seen": 101606105, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.32348633, "step": 4708, "time_per_iteration": 4.1207544803619385 }, { "auxiliary_loss_clip": 0.01430991, "auxiliary_loss_mlp": 0.00305249, "balance_loss_clip": 1.22730792, "balance_loss_mlp": 0.28979927, "epoch": 0.2831203968134676, "flos": 60527938199040.0, "grad_norm": 6.908157066103251, "language_loss": 0.62797505, "learning_rate": 3.3647102257618895e-06, "loss": 0.6453374, "num_input_tokens_seen": 101656875, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.15429688, "step": 4709, "time_per_iteration": 3.005810260772705 }, { "auxiliary_loss_clip": 0.01614144, "auxiliary_loss_mlp": 0.00336154, "balance_loss_clip": 1.30951631, "balance_loss_mlp": 0.30525488, "epoch": 0.2831805200661356, "flos": 22054790171520.0, "grad_norm": 6.8076593262392215, "language_loss": 0.78794026, "learning_rate": 3.3644254952748103e-06, "loss": 0.8074432, "num_input_tokens_seen": 101676225, "router_z_loss_clip": 3.04296875, "router_z_loss_mlp": 0.30883789, "step": 4710, "time_per_iteration": 2.66361403465271 }, { "auxiliary_loss_clip": 0.01638491, "auxiliary_loss_mlp": 0.00341773, "balance_loss_clip": 1.33143508, "balance_loss_mlp": 0.3090862, "epoch": 0.28324064331880355, "flos": 22600668136320.0, "grad_norm": 23.299886379806146, "language_loss": 0.87041718, "learning_rate": 3.364140713048579e-06, "loss": 0.89021981, "num_input_tokens_seen": 101693710, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.32666016, "step": 4711, "time_per_iteration": 2.670058488845825 }, { "auxiliary_loss_clip": 0.0162569, "auxiliary_loss_mlp": 0.00367168, "balance_loss_clip": 1.31859636, "balance_loss_mlp": 0.33498132, "epoch": 0.2833007665714715, "flos": 30404127968640.0, "grad_norm": 9.591739854435675, "language_loss": 0.7717135, "learning_rate": 3.363855879093996e-06, "loss": 0.79164207, "num_input_tokens_seen": 101714010, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.32177734, "step": 4712, "time_per_iteration": 2.804062604904175 }, { "auxiliary_loss_clip": 0.01641787, "auxiliary_loss_mlp": 0.00364611, "balance_loss_clip": 1.33097219, "balance_loss_mlp": 0.33283031, "epoch": 0.2833608898241395, "flos": 23549499849600.0, "grad_norm": 80.04840149261338, "language_loss": 0.89691377, "learning_rate": 3.3635709934218605e-06, "loss": 0.91697776, "num_input_tokens_seen": 101732995, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.31787109, "step": 4713, "time_per_iteration": 2.7151596546173096 }, { "auxiliary_loss_clip": 0.01680615, "auxiliary_loss_mlp": 0.00362322, "balance_loss_clip": 1.35769987, "balance_loss_mlp": 0.32853818, "epoch": 0.28342101307680745, "flos": 20266726118400.0, "grad_norm": 25.37827047740509, "language_loss": 0.83040041, "learning_rate": 3.3632860560429766e-06, "loss": 0.85082984, "num_input_tokens_seen": 101751385, "router_z_loss_clip": 3.23046875, "router_z_loss_mlp": 0.33764648, "step": 4714, "time_per_iteration": 2.751180648803711 }, { "auxiliary_loss_clip": 0.01644597, "auxiliary_loss_mlp": 0.00359398, "balance_loss_clip": 1.33058047, "balance_loss_mlp": 0.32609135, "epoch": 0.2834811363294754, "flos": 30847050576000.0, "grad_norm": 6.336650456278102, "language_loss": 0.82404578, "learning_rate": 3.3630010669681494e-06, "loss": 0.84408569, "num_input_tokens_seen": 101773825, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.33325195, "step": 4715, "time_per_iteration": 2.8691794872283936 }, { "auxiliary_loss_clip": 0.0162932, "auxiliary_loss_mlp": 0.0033, "balance_loss_clip": 1.31813455, "balance_loss_mlp": 0.29786116, "epoch": 0.2835412595821434, "flos": 22711021695360.0, "grad_norm": 72.33621248752729, "language_loss": 0.81234157, "learning_rate": 3.3627160262081845e-06, "loss": 0.83193475, "num_input_tokens_seen": 101791920, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.32128906, "step": 4716, "time_per_iteration": 4.063138723373413 }, { "auxiliary_loss_clip": 0.01637748, "auxiliary_loss_mlp": 0.00370397, "balance_loss_clip": 1.31979907, "balance_loss_mlp": 0.33570671, "epoch": 0.28360138283481134, "flos": 18077719478400.0, "grad_norm": 11.928890607737554, "language_loss": 0.83308464, "learning_rate": 3.3624309337738917e-06, "loss": 0.8531661, "num_input_tokens_seen": 101809515, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.34692383, "step": 4717, "time_per_iteration": 2.6545255184173584 }, { "auxiliary_loss_clip": 0.01646356, "auxiliary_loss_mlp": 0.00408443, "balance_loss_clip": 1.32293224, "balance_loss_mlp": 0.3754932, "epoch": 0.2836615060874793, "flos": 17854785717120.0, "grad_norm": 43.37671053489301, "language_loss": 0.73054206, "learning_rate": 3.3621457896760813e-06, "loss": 0.75109005, "num_input_tokens_seen": 101827735, "router_z_loss_clip": 3.234375, "router_z_loss_mlp": 0.3293457, "step": 4718, "time_per_iteration": 2.695160388946533 }, { "auxiliary_loss_clip": 0.01622051, "auxiliary_loss_mlp": 0.00376113, "balance_loss_clip": 1.3099432, "balance_loss_mlp": 0.34173262, "epoch": 0.2837216293401473, "flos": 25740302169600.0, "grad_norm": 1.9613817921563024, "language_loss": 0.79504067, "learning_rate": 3.361860593925566e-06, "loss": 0.81502229, "num_input_tokens_seen": 101845970, "router_z_loss_clip": 3.12109375, "router_z_loss_mlp": 0.34375, "step": 4719, "time_per_iteration": 2.7974069118499756 }, { "auxiliary_loss_clip": 0.01660217, "auxiliary_loss_mlp": 0.00373299, "balance_loss_clip": 1.33689141, "balance_loss_mlp": 0.34256691, "epoch": 0.2837817525928153, "flos": 20923532259840.0, "grad_norm": 33.921076365597564, "language_loss": 0.85820121, "learning_rate": 3.3615753465331605e-06, "loss": 0.87853634, "num_input_tokens_seen": 101865040, "router_z_loss_clip": 3.234375, "router_z_loss_mlp": 0.30761719, "step": 4720, "time_per_iteration": 2.6898224353790283 }, { "auxiliary_loss_clip": 0.0166991, "auxiliary_loss_mlp": 0.00424739, "balance_loss_clip": 1.34136808, "balance_loss_mlp": 0.3868542, "epoch": 0.28384187584548326, "flos": 18916700423040.0, "grad_norm": 27.639367740142806, "language_loss": 0.86362374, "learning_rate": 3.3612900475096817e-06, "loss": 0.88457024, "num_input_tokens_seen": 101883735, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.37866211, "step": 4721, "time_per_iteration": 2.6308140754699707 }, { "auxiliary_loss_clip": 0.01652298, "auxiliary_loss_mlp": 0.00432028, "balance_loss_clip": 1.3315804, "balance_loss_mlp": 0.39731473, "epoch": 0.2839019990981512, "flos": 27343964776320.0, "grad_norm": 5.675479148334503, "language_loss": 0.8884697, "learning_rate": 3.3610046968659474e-06, "loss": 0.90931296, "num_input_tokens_seen": 101903025, "router_z_loss_clip": 3.20507812, "router_z_loss_mlp": 0.34716797, "step": 4722, "time_per_iteration": 2.665184736251831 }, { "auxiliary_loss_clip": 0.0165571, "auxiliary_loss_mlp": 0.00418037, "balance_loss_clip": 1.32827926, "balance_loss_mlp": 0.38329929, "epoch": 0.2839621223508192, "flos": 18114312458880.0, "grad_norm": 10.777808646051835, "language_loss": 0.76082671, "learning_rate": 3.3607192946127785e-06, "loss": 0.78156412, "num_input_tokens_seen": 101922255, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.34741211, "step": 4723, "time_per_iteration": 2.6743991374969482 }, { "auxiliary_loss_clip": 0.01649796, "auxiliary_loss_mlp": 0.00423899, "balance_loss_clip": 1.32952523, "balance_loss_mlp": 0.39094913, "epoch": 0.28402224560348716, "flos": 26358360514560.0, "grad_norm": 36.89805844279536, "language_loss": 0.83698797, "learning_rate": 3.360433840760998e-06, "loss": 0.85772491, "num_input_tokens_seen": 101943100, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.3293457, "step": 4724, "time_per_iteration": 2.641019105911255 }, { "auxiliary_loss_clip": 0.01681194, "auxiliary_loss_mlp": 0.00449209, "balance_loss_clip": 1.34770525, "balance_loss_mlp": 0.41475749, "epoch": 0.2840823688561551, "flos": 24060795995520.0, "grad_norm": 2.2576141937711496, "language_loss": 0.97874498, "learning_rate": 3.36014833532143e-06, "loss": 1.000049, "num_input_tokens_seen": 101963160, "router_z_loss_clip": 3.33789062, "router_z_loss_mlp": 0.34448242, "step": 4725, "time_per_iteration": 2.6407768726348877 }, { "auxiliary_loss_clip": 0.01684952, "auxiliary_loss_mlp": 0.00478397, "balance_loss_clip": 1.34878635, "balance_loss_mlp": 0.43979704, "epoch": 0.2841424921088231, "flos": 29459821368960.0, "grad_norm": 4.311832322493749, "language_loss": 0.92909765, "learning_rate": 3.3598627783049e-06, "loss": 0.95073116, "num_input_tokens_seen": 101984300, "router_z_loss_clip": 3.3671875, "router_z_loss_mlp": 0.38598633, "step": 4726, "time_per_iteration": 2.685722589492798 }, { "auxiliary_loss_clip": 0.01689871, "auxiliary_loss_mlp": 0.00447824, "balance_loss_clip": 1.35041809, "balance_loss_mlp": 0.41096491, "epoch": 0.28420261536149105, "flos": 48100367053440.0, "grad_norm": 70.1651328154248, "language_loss": 0.84216112, "learning_rate": 3.359577169722238e-06, "loss": 0.86353803, "num_input_tokens_seen": 102005765, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.36889648, "step": 4727, "time_per_iteration": 2.8862416744232178 }, { "auxiliary_loss_clip": 0.01678133, "auxiliary_loss_mlp": 0.00462783, "balance_loss_clip": 1.35101616, "balance_loss_mlp": 0.42613822, "epoch": 0.284262738614159, "flos": 25666146541440.0, "grad_norm": 57.33930053976764, "language_loss": 0.74120742, "learning_rate": 3.3592915095842733e-06, "loss": 0.76261657, "num_input_tokens_seen": 102022755, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.36645508, "step": 4728, "time_per_iteration": 2.655653476715088 }, { "auxiliary_loss_clip": 0.01679475, "auxiliary_loss_mlp": 0.00440545, "balance_loss_clip": 1.34363747, "balance_loss_mlp": 0.40826344, "epoch": 0.284322861866827, "flos": 19718980646400.0, "grad_norm": 16.548196432894038, "language_loss": 0.82207078, "learning_rate": 3.3590057979018386e-06, "loss": 0.84327096, "num_input_tokens_seen": 102041850, "router_z_loss_clip": 3.36132812, "router_z_loss_mlp": 0.32299805, "step": 4729, "time_per_iteration": 2.640824317932129 }, { "auxiliary_loss_clip": 0.01685122, "auxiliary_loss_mlp": 0.0047972, "balance_loss_clip": 1.35071135, "balance_loss_mlp": 0.44264546, "epoch": 0.28438298511949495, "flos": 23915250086400.0, "grad_norm": 33.03200505205498, "language_loss": 0.73445129, "learning_rate": 3.3587200346857674e-06, "loss": 0.7560997, "num_input_tokens_seen": 102059500, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.37084961, "step": 4730, "time_per_iteration": 2.648637294769287 }, { "auxiliary_loss_clip": 0.01719756, "auxiliary_loss_mlp": 0.00551905, "balance_loss_clip": 1.37237728, "balance_loss_mlp": 0.50910872, "epoch": 0.2844431083721629, "flos": 26067340523520.0, "grad_norm": 52.62437878941061, "language_loss": 0.81529248, "learning_rate": 3.3584342199468965e-06, "loss": 0.83800906, "num_input_tokens_seen": 102080460, "router_z_loss_clip": 3.4765625, "router_z_loss_mlp": 0.42797852, "step": 4731, "time_per_iteration": 2.7473647594451904 }, { "auxiliary_loss_clip": 0.01688423, "auxiliary_loss_mlp": 0.00471841, "balance_loss_clip": 1.35306919, "balance_loss_mlp": 0.43440884, "epoch": 0.2845032316248309, "flos": 25810435474560.0, "grad_norm": 2.3797141618486024, "language_loss": 0.89230806, "learning_rate": 3.3581483536960638e-06, "loss": 0.91391069, "num_input_tokens_seen": 102100950, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.37451172, "step": 4732, "time_per_iteration": 2.7198784351348877 }, { "auxiliary_loss_clip": 0.01677734, "auxiliary_loss_mlp": 0.00517084, "balance_loss_clip": 1.34523416, "balance_loss_mlp": 0.47724438, "epoch": 0.2845633548774989, "flos": 19823192979840.0, "grad_norm": 556.4611932214924, "language_loss": 0.85835612, "learning_rate": 3.357862435944109e-06, "loss": 0.88030434, "num_input_tokens_seen": 102119345, "router_z_loss_clip": 3.32617188, "router_z_loss_mlp": 0.39868164, "step": 4733, "time_per_iteration": 2.6868326663970947 }, { "auxiliary_loss_clip": 0.01703913, "auxiliary_loss_mlp": 0.00526034, "balance_loss_clip": 1.3662113, "balance_loss_mlp": 0.48624146, "epoch": 0.28462347813016686, "flos": 23182815859200.0, "grad_norm": 6.179714114532041, "language_loss": 0.79664147, "learning_rate": 3.357576466701875e-06, "loss": 0.81894088, "num_input_tokens_seen": 102139050, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.39819336, "step": 4734, "time_per_iteration": 2.728217601776123 }, { "auxiliary_loss_clip": 0.01663983, "auxiliary_loss_mlp": 0.0045438, "balance_loss_clip": 1.33523583, "balance_loss_mlp": 0.41804507, "epoch": 0.2846836013828348, "flos": 18660477732480.0, "grad_norm": 3.4907970988250683, "language_loss": 0.80195856, "learning_rate": 3.3572904459802056e-06, "loss": 0.82314211, "num_input_tokens_seen": 102157935, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.36328125, "step": 4735, "time_per_iteration": 2.6204795837402344 }, { "auxiliary_loss_clip": 0.01673121, "auxiliary_loss_mlp": 0.00452085, "balance_loss_clip": 1.33877921, "balance_loss_mlp": 0.41805047, "epoch": 0.2847437246355028, "flos": 14173511523840.0, "grad_norm": 5.277331044899071, "language_loss": 0.85466617, "learning_rate": 3.357004373789946e-06, "loss": 0.87591827, "num_input_tokens_seen": 102175325, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.34020996, "step": 4736, "time_per_iteration": 2.65429949760437 }, { "auxiliary_loss_clip": 0.01690696, "auxiliary_loss_mlp": 0.0042369, "balance_loss_clip": 1.35440588, "balance_loss_mlp": 0.38883293, "epoch": 0.28480384788817076, "flos": 29278364837760.0, "grad_norm": 52.409289487851055, "language_loss": 0.6969009, "learning_rate": 3.3567182501419453e-06, "loss": 0.71804476, "num_input_tokens_seen": 102196625, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.34851074, "step": 4737, "time_per_iteration": 2.736737012863159 }, { "auxiliary_loss_clip": 0.016614, "auxiliary_loss_mlp": 0.00417948, "balance_loss_clip": 1.32993078, "balance_loss_mlp": 0.38447374, "epoch": 0.2848639711408387, "flos": 22601314581120.0, "grad_norm": 10.03821632265725, "language_loss": 0.91728246, "learning_rate": 3.356432075047052e-06, "loss": 0.9380759, "num_input_tokens_seen": 102214975, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.3347168, "step": 4738, "time_per_iteration": 2.6771883964538574 }, { "auxiliary_loss_clip": 0.016737, "auxiliary_loss_mlp": 0.00409548, "balance_loss_clip": 1.34243917, "balance_loss_mlp": 0.37745661, "epoch": 0.2849240943935067, "flos": 17599460866560.0, "grad_norm": 5.083780003124386, "language_loss": 0.97753686, "learning_rate": 3.356145848516118e-06, "loss": 0.9983694, "num_input_tokens_seen": 102231885, "router_z_loss_clip": 3.31640625, "router_z_loss_mlp": 0.32092285, "step": 4739, "time_per_iteration": 2.621105909347534 }, { "auxiliary_loss_clip": 0.0163544, "auxiliary_loss_mlp": 0.00398734, "balance_loss_clip": 1.31303287, "balance_loss_mlp": 0.36750126, "epoch": 0.28498421764617465, "flos": 24862573428480.0, "grad_norm": 2.004535449589495, "language_loss": 0.76820028, "learning_rate": 3.355859570559998e-06, "loss": 0.78854197, "num_input_tokens_seen": 102252725, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.3125, "step": 4740, "time_per_iteration": 2.7275452613830566 }, { "auxiliary_loss_clip": 0.01667668, "auxiliary_loss_mlp": 0.00400953, "balance_loss_clip": 1.3356818, "balance_loss_mlp": 0.36690688, "epoch": 0.2850443408988426, "flos": 22782555630720.0, "grad_norm": 9.149122077533956, "language_loss": 0.84876359, "learning_rate": 3.3555732411895477e-06, "loss": 0.86944985, "num_input_tokens_seen": 102271730, "router_z_loss_clip": 3.31835938, "router_z_loss_mlp": 0.34082031, "step": 4741, "time_per_iteration": 2.686898946762085 }, { "auxiliary_loss_clip": 0.01652195, "auxiliary_loss_mlp": 0.00386765, "balance_loss_clip": 1.31984651, "balance_loss_mlp": 0.35548449, "epoch": 0.2851044641515106, "flos": 18844053166080.0, "grad_norm": 115.25523814839043, "language_loss": 0.83979428, "learning_rate": 3.3552868604156235e-06, "loss": 0.86018389, "num_input_tokens_seen": 102291325, "router_z_loss_clip": 3.32226562, "router_z_loss_mlp": 0.31286621, "step": 4742, "time_per_iteration": 2.829045057296753 }, { "auxiliary_loss_clip": 0.01642266, "auxiliary_loss_mlp": 0.00397029, "balance_loss_clip": 1.31850743, "balance_loss_mlp": 0.36319703, "epoch": 0.28516458740417855, "flos": 18880502492160.0, "grad_norm": 4.651365881113107, "language_loss": 0.6352371, "learning_rate": 3.355000428249086e-06, "loss": 0.65563011, "num_input_tokens_seen": 102309000, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.33837891, "step": 4743, "time_per_iteration": 2.692364454269409 }, { "auxiliary_loss_clip": 0.01673313, "auxiliary_loss_mlp": 0.00366421, "balance_loss_clip": 1.35053217, "balance_loss_mlp": 0.33545026, "epoch": 0.2852247106568465, "flos": 25299821687040.0, "grad_norm": 3.7721856710939146, "language_loss": 0.80997211, "learning_rate": 3.354713944700797e-06, "loss": 0.83036941, "num_input_tokens_seen": 102329240, "router_z_loss_clip": 3.23046875, "router_z_loss_mlp": 0.30944824, "step": 4744, "time_per_iteration": 4.150479793548584 }, { "auxiliary_loss_clip": 0.01625699, "auxiliary_loss_mlp": 0.0035636, "balance_loss_clip": 1.30040932, "balance_loss_mlp": 0.32542509, "epoch": 0.2852848339095145, "flos": 11655383541120.0, "grad_norm": 2.5885334714699337, "language_loss": 0.84211826, "learning_rate": 3.3544274097816185e-06, "loss": 0.86193883, "num_input_tokens_seen": 102344440, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.30969238, "step": 4745, "time_per_iteration": 2.6903557777404785 }, { "auxiliary_loss_clip": 0.01660049, "auxiliary_loss_mlp": 0.00353006, "balance_loss_clip": 1.33082187, "balance_loss_mlp": 0.32348967, "epoch": 0.2853449571621825, "flos": 12933228856320.0, "grad_norm": 458.5000759223159, "language_loss": 0.88189304, "learning_rate": 3.3541408235024173e-06, "loss": 0.90202355, "num_input_tokens_seen": 102360985, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.29516602, "step": 4746, "time_per_iteration": 2.592804431915283 }, { "auxiliary_loss_clip": 0.01682021, "auxiliary_loss_mlp": 0.00403192, "balance_loss_clip": 1.34150887, "balance_loss_mlp": 0.36938453, "epoch": 0.28540508041485046, "flos": 20010575255040.0, "grad_norm": 11.869548954391876, "language_loss": 0.87527508, "learning_rate": 3.3538541858740604e-06, "loss": 0.89612716, "num_input_tokens_seen": 102380320, "router_z_loss_clip": 3.40429688, "router_z_loss_mlp": 0.33813477, "step": 4747, "time_per_iteration": 4.056623220443726 }, { "auxiliary_loss_clip": 0.01443868, "auxiliary_loss_mlp": 0.00086535, "balance_loss_clip": 1.21674323, "balance_loss_mlp": 0.07747462, "epoch": 0.28546520366751843, "flos": 68139349966080.0, "grad_norm": 0.7553747870637408, "language_loss": 0.59956741, "learning_rate": 3.3535674969074173e-06, "loss": 0.61487144, "num_input_tokens_seen": 102439140, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.09082031, "step": 4748, "time_per_iteration": 3.142232894897461 }, { "auxiliary_loss_clip": 0.01641018, "auxiliary_loss_mlp": 0.0035461, "balance_loss_clip": 1.31471443, "balance_loss_mlp": 0.32471216, "epoch": 0.2855253269201864, "flos": 13251540205440.0, "grad_norm": 8.089513093174133, "language_loss": 0.90290046, "learning_rate": 3.3532807566133592e-06, "loss": 0.92285681, "num_input_tokens_seen": 102450990, "router_z_loss_clip": 3.26367188, "router_z_loss_mlp": 0.29882812, "step": 4749, "time_per_iteration": 2.6235690116882324 }, { "auxiliary_loss_clip": 0.01657807, "auxiliary_loss_mlp": 0.00340109, "balance_loss_clip": 1.32656264, "balance_loss_mlp": 0.30985385, "epoch": 0.28558545017285436, "flos": 28620876337920.0, "grad_norm": 3.694778315074223, "language_loss": 0.76693666, "learning_rate": 3.3529939650027587e-06, "loss": 0.78691578, "num_input_tokens_seen": 102471820, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.30236816, "step": 4750, "time_per_iteration": 4.106104373931885 }, { "auxiliary_loss_clip": 0.01656803, "auxiliary_loss_mlp": 0.0033863, "balance_loss_clip": 1.32879901, "balance_loss_mlp": 0.30954331, "epoch": 0.2856455734255223, "flos": 34130470752000.0, "grad_norm": 8.789912390883961, "language_loss": 0.86551785, "learning_rate": 3.3527071220864917e-06, "loss": 0.88547218, "num_input_tokens_seen": 102492625, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.29077148, "step": 4751, "time_per_iteration": 2.7182555198669434 }, { "auxiliary_loss_clip": 0.01641722, "auxiliary_loss_mlp": 0.00342539, "balance_loss_clip": 1.31488681, "balance_loss_mlp": 0.31160438, "epoch": 0.2857056966781903, "flos": 39786149779200.0, "grad_norm": 9.624447152484379, "language_loss": 0.86576116, "learning_rate": 3.3524202278754353e-06, "loss": 0.88560379, "num_input_tokens_seen": 102514145, "router_z_loss_clip": 3.26367188, "router_z_loss_mlp": 0.3092041, "step": 4752, "time_per_iteration": 2.8040924072265625 }, { "auxiliary_loss_clip": 0.01618184, "auxiliary_loss_mlp": 0.00328896, "balance_loss_clip": 1.30217719, "balance_loss_mlp": 0.29768687, "epoch": 0.28576581993085826, "flos": 21872292145920.0, "grad_norm": 30.008528234611813, "language_loss": 0.84944707, "learning_rate": 3.3521332823804676e-06, "loss": 0.86891782, "num_input_tokens_seen": 102532365, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.31225586, "step": 4753, "time_per_iteration": 2.666517496109009 }, { "auxiliary_loss_clip": 0.01670862, "auxiliary_loss_mlp": 0.0035002, "balance_loss_clip": 1.33373332, "balance_loss_mlp": 0.3175593, "epoch": 0.2858259431835262, "flos": 19091656592640.0, "grad_norm": 12.196293430503038, "language_loss": 0.97219503, "learning_rate": 3.3518462856124704e-06, "loss": 0.99240381, "num_input_tokens_seen": 102548425, "router_z_loss_clip": 3.37304688, "router_z_loss_mlp": 0.32434082, "step": 4754, "time_per_iteration": 2.628031015396118 }, { "auxiliary_loss_clip": 0.01650731, "auxiliary_loss_mlp": 0.00277255, "balance_loss_clip": 1.32245409, "balance_loss_mlp": 0.25151801, "epoch": 0.2858860664361942, "flos": 20334309557760.0, "grad_norm": 11.729621378802253, "language_loss": 0.86892104, "learning_rate": 3.3515592375823267e-06, "loss": 0.88820088, "num_input_tokens_seen": 102566370, "router_z_loss_clip": 3.28515625, "router_z_loss_mlp": 0.25744629, "step": 4755, "time_per_iteration": 2.665055990219116 }, { "auxiliary_loss_clip": 0.01625849, "auxiliary_loss_mlp": 0.00317417, "balance_loss_clip": 1.30061221, "balance_loss_mlp": 0.28875953, "epoch": 0.28594618968886215, "flos": 24461738582400.0, "grad_norm": 2.9883454009622588, "language_loss": 0.89202553, "learning_rate": 3.351272138300922e-06, "loss": 0.91145819, "num_input_tokens_seen": 102588715, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.28686523, "step": 4756, "time_per_iteration": 2.7143256664276123 }, { "auxiliary_loss_clip": 0.01473114, "auxiliary_loss_mlp": 0.00142423, "balance_loss_clip": 1.22799516, "balance_loss_mlp": 0.13026337, "epoch": 0.2860063129415301, "flos": 71652850709760.0, "grad_norm": 0.852295904037903, "language_loss": 0.60485053, "learning_rate": 3.350984987779142e-06, "loss": 0.62100595, "num_input_tokens_seen": 102656715, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.12158203, "step": 4757, "time_per_iteration": 3.2938787937164307 }, { "auxiliary_loss_clip": 0.01680445, "auxiliary_loss_mlp": 0.00310904, "balance_loss_clip": 1.3455708, "balance_loss_mlp": 0.28269893, "epoch": 0.2860664361941981, "flos": 20558679863040.0, "grad_norm": 4.1818602296924325, "language_loss": 0.7452535, "learning_rate": 3.3506977860278756e-06, "loss": 0.765167, "num_input_tokens_seen": 102676545, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.28210449, "step": 4758, "time_per_iteration": 4.044320344924927 }, { "auxiliary_loss_clip": 0.01659889, "auxiliary_loss_mlp": 0.00309473, "balance_loss_clip": 1.32836509, "balance_loss_mlp": 0.28055304, "epoch": 0.2861265594468661, "flos": 35996389534080.0, "grad_norm": 1.3786935989320641, "language_loss": 0.68704075, "learning_rate": 3.3504105330580143e-06, "loss": 0.70673436, "num_input_tokens_seen": 102702875, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.28930664, "step": 4759, "time_per_iteration": 2.7863552570343018 }, { "auxiliary_loss_clip": 0.01657324, "auxiliary_loss_mlp": 0.00291857, "balance_loss_clip": 1.32922268, "balance_loss_mlp": 0.26187578, "epoch": 0.28618668269953407, "flos": 20047419630720.0, "grad_norm": 5.202456839164399, "language_loss": 0.80678689, "learning_rate": 3.3501232288804496e-06, "loss": 0.82627875, "num_input_tokens_seen": 102723160, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.29992676, "step": 4760, "time_per_iteration": 2.679760217666626 }, { "auxiliary_loss_clip": 0.01632986, "auxiliary_loss_mlp": 0.00281558, "balance_loss_clip": 1.31194329, "balance_loss_mlp": 0.25424671, "epoch": 0.28624680595220203, "flos": 24971849579520.0, "grad_norm": 23.984817874841426, "language_loss": 0.78277826, "learning_rate": 3.3498358735060773e-06, "loss": 0.80192375, "num_input_tokens_seen": 102743855, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.27319336, "step": 4761, "time_per_iteration": 2.667949676513672 }, { "auxiliary_loss_clip": 0.01675902, "auxiliary_loss_mlp": 0.00328906, "balance_loss_clip": 1.34229481, "balance_loss_mlp": 0.29741102, "epoch": 0.28630692920487, "flos": 22492253911680.0, "grad_norm": 9.823831461277473, "language_loss": 0.81806457, "learning_rate": 3.349548466945793e-06, "loss": 0.83811271, "num_input_tokens_seen": 102761370, "router_z_loss_clip": 3.33789062, "router_z_loss_mlp": 0.31469727, "step": 4762, "time_per_iteration": 2.6710431575775146 }, { "auxiliary_loss_clip": 0.01652345, "auxiliary_loss_mlp": 0.00350796, "balance_loss_clip": 1.32453847, "balance_loss_mlp": 0.31768003, "epoch": 0.28636705245753796, "flos": 21249888255360.0, "grad_norm": 2.8241413257704373, "language_loss": 0.80020863, "learning_rate": 3.349261009210496e-06, "loss": 0.82024002, "num_input_tokens_seen": 102780885, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.33105469, "step": 4763, "time_per_iteration": 2.647678852081299 }, { "auxiliary_loss_clip": 0.01670702, "auxiliary_loss_mlp": 0.00303097, "balance_loss_clip": 1.33634388, "balance_loss_mlp": 0.27138737, "epoch": 0.28642717571020593, "flos": 24095772864000.0, "grad_norm": 6.731645448575013, "language_loss": 0.84453988, "learning_rate": 3.348973500311086e-06, "loss": 0.86427784, "num_input_tokens_seen": 102801000, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.31713867, "step": 4764, "time_per_iteration": 2.665630578994751 }, { "auxiliary_loss_clip": 0.01687639, "auxiliary_loss_mlp": 0.0030719, "balance_loss_clip": 1.34561205, "balance_loss_mlp": 0.27621984, "epoch": 0.2864872989628739, "flos": 22601386408320.0, "grad_norm": 4.040443755256701, "language_loss": 0.79863954, "learning_rate": 3.348685940258466e-06, "loss": 0.81858784, "num_input_tokens_seen": 102820230, "router_z_loss_clip": 3.41992188, "router_z_loss_mlp": 0.30981445, "step": 4765, "time_per_iteration": 2.6523478031158447 }, { "auxiliary_loss_clip": 0.01686934, "auxiliary_loss_mlp": 0.00293848, "balance_loss_clip": 1.34787929, "balance_loss_mlp": 0.26218569, "epoch": 0.28654742221554186, "flos": 32745073138560.0, "grad_norm": 6.051850583241571, "language_loss": 0.817572, "learning_rate": 3.3483983290635395e-06, "loss": 0.83737987, "num_input_tokens_seen": 102842670, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.31665039, "step": 4766, "time_per_iteration": 2.80873441696167 }, { "auxiliary_loss_clip": 0.01669734, "auxiliary_loss_mlp": 0.00261962, "balance_loss_clip": 1.33112037, "balance_loss_mlp": 0.23361361, "epoch": 0.2866075454682098, "flos": 26981626331520.0, "grad_norm": 1.6388542575784177, "language_loss": 0.84339178, "learning_rate": 3.348110666737214e-06, "loss": 0.86270869, "num_input_tokens_seen": 102864480, "router_z_loss_clip": 3.3828125, "router_z_loss_mlp": 0.28344727, "step": 4767, "time_per_iteration": 2.7050046920776367 }, { "auxiliary_loss_clip": 0.01677774, "auxiliary_loss_mlp": 0.00302355, "balance_loss_clip": 1.34827065, "balance_loss_mlp": 0.27071714, "epoch": 0.2866676687208778, "flos": 23253847004160.0, "grad_norm": 3.895020177199508, "language_loss": 0.6969856, "learning_rate": 3.3478229532903956e-06, "loss": 0.71678686, "num_input_tokens_seen": 102883740, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.31616211, "step": 4768, "time_per_iteration": 2.6171796321868896 }, { "auxiliary_loss_clip": 0.01686857, "auxiliary_loss_mlp": 0.00310207, "balance_loss_clip": 1.34253561, "balance_loss_mlp": 0.27875975, "epoch": 0.28672779197354575, "flos": 21579727870080.0, "grad_norm": 1.9284736349868663, "language_loss": 0.77705914, "learning_rate": 3.3475351887339967e-06, "loss": 0.79702979, "num_input_tokens_seen": 102902945, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.3145752, "step": 4769, "time_per_iteration": 2.681192636489868 }, { "auxiliary_loss_clip": 0.01679064, "auxiliary_loss_mlp": 0.00294678, "balance_loss_clip": 1.33691299, "balance_loss_mlp": 0.26635385, "epoch": 0.2867879152262137, "flos": 19865568049920.0, "grad_norm": 3.0359543880802358, "language_loss": 0.81082606, "learning_rate": 3.3472473730789288e-06, "loss": 0.83056343, "num_input_tokens_seen": 102922405, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.28344727, "step": 4770, "time_per_iteration": 2.648583173751831 }, { "auxiliary_loss_clip": 0.01642054, "auxiliary_loss_mlp": 0.00306987, "balance_loss_clip": 1.31379163, "balance_loss_mlp": 0.27711278, "epoch": 0.2868480384788817, "flos": 28213325648640.0, "grad_norm": 40.99671387033889, "language_loss": 0.73237789, "learning_rate": 3.3469595063361045e-06, "loss": 0.75186831, "num_input_tokens_seen": 102938980, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.29907227, "step": 4771, "time_per_iteration": 2.703432321548462 }, { "auxiliary_loss_clip": 0.0145368, "auxiliary_loss_mlp": 0.00115222, "balance_loss_clip": 1.20026374, "balance_loss_mlp": 0.10816433, "epoch": 0.2869081617315497, "flos": 65424286690560.0, "grad_norm": 0.8233291929627559, "language_loss": 0.56518191, "learning_rate": 3.3466715885164414e-06, "loss": 0.58087093, "num_input_tokens_seen": 103000405, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.07080078, "step": 4772, "time_per_iteration": 3.078024387359619 }, { "auxiliary_loss_clip": 0.01649319, "auxiliary_loss_mlp": 0.00278669, "balance_loss_clip": 1.31290674, "balance_loss_mlp": 0.25058341, "epoch": 0.28696828498421767, "flos": 18660729127680.0, "grad_norm": 9.987008598473734, "language_loss": 0.92112017, "learning_rate": 3.346383619630856e-06, "loss": 0.9404, "num_input_tokens_seen": 103017970, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.28088379, "step": 4773, "time_per_iteration": 2.6300106048583984 }, { "auxiliary_loss_clip": 0.01649684, "auxiliary_loss_mlp": 0.00290662, "balance_loss_clip": 1.30729628, "balance_loss_mlp": 0.25952432, "epoch": 0.28702840823688563, "flos": 23659745667840.0, "grad_norm": 50.01136461617431, "language_loss": 0.8476274, "learning_rate": 3.34609559969027e-06, "loss": 0.8670308, "num_input_tokens_seen": 103036385, "router_z_loss_clip": 3.42382812, "router_z_loss_mlp": 0.31152344, "step": 4774, "time_per_iteration": 2.6683661937713623 }, { "auxiliary_loss_clip": 0.01674586, "auxiliary_loss_mlp": 0.00311903, "balance_loss_clip": 1.32749319, "balance_loss_mlp": 0.28093201, "epoch": 0.2870885314895536, "flos": 13804744544640.0, "grad_norm": 30.95612836902878, "language_loss": 0.8029207, "learning_rate": 3.3458075287056034e-06, "loss": 0.82278562, "num_input_tokens_seen": 103052170, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.30981445, "step": 4775, "time_per_iteration": 2.6364853382110596 }, { "auxiliary_loss_clip": 0.01660686, "auxiliary_loss_mlp": 0.00289553, "balance_loss_clip": 1.31991279, "balance_loss_mlp": 0.2611689, "epoch": 0.28714865474222157, "flos": 17786771314560.0, "grad_norm": 2.006411866678454, "language_loss": 0.93145108, "learning_rate": 3.34551940668778e-06, "loss": 0.95095354, "num_input_tokens_seen": 103070510, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.28405762, "step": 4776, "time_per_iteration": 2.6232988834381104 }, { "auxiliary_loss_clip": 0.01672578, "auxiliary_loss_mlp": 0.00266974, "balance_loss_clip": 1.32477593, "balance_loss_mlp": 0.23988923, "epoch": 0.28720877799488953, "flos": 15997486199040.0, "grad_norm": 4.316749534407035, "language_loss": 0.82511544, "learning_rate": 3.345231233647726e-06, "loss": 0.84451097, "num_input_tokens_seen": 103089590, "router_z_loss_clip": 3.4765625, "router_z_loss_mlp": 0.27087402, "step": 4777, "time_per_iteration": 2.656352996826172 }, { "auxiliary_loss_clip": 0.01708032, "auxiliary_loss_mlp": 0.00306882, "balance_loss_clip": 1.34554982, "balance_loss_mlp": 0.27502948, "epoch": 0.2872689012475575, "flos": 20923137210240.0, "grad_norm": 6.4693903740004375, "language_loss": 0.88194549, "learning_rate": 3.3449430095963696e-06, "loss": 0.90209466, "num_input_tokens_seen": 103109080, "router_z_loss_clip": 3.62304688, "router_z_loss_mlp": 0.31884766, "step": 4778, "time_per_iteration": 2.6604461669921875 }, { "auxiliary_loss_clip": 0.01673888, "auxiliary_loss_mlp": 0.002766, "balance_loss_clip": 1.3239857, "balance_loss_mlp": 0.24793009, "epoch": 0.28732902450022546, "flos": 21325121291520.0, "grad_norm": 5.011305570582617, "language_loss": 0.80356604, "learning_rate": 3.3446547345446386e-06, "loss": 0.82307088, "num_input_tokens_seen": 103127755, "router_z_loss_clip": 3.49609375, "router_z_loss_mlp": 0.28649902, "step": 4779, "time_per_iteration": 2.6423301696777344 }, { "auxiliary_loss_clip": 0.01695165, "auxiliary_loss_mlp": 0.00309672, "balance_loss_clip": 1.3351326, "balance_loss_mlp": 0.28025091, "epoch": 0.2873891477528934, "flos": 20850382212480.0, "grad_norm": 12.225589067643705, "language_loss": 0.83389133, "learning_rate": 3.3443664085034656e-06, "loss": 0.85393965, "num_input_tokens_seen": 103147035, "router_z_loss_clip": 3.6015625, "router_z_loss_mlp": 0.29406738, "step": 4780, "time_per_iteration": 2.6619162559509277 }, { "auxiliary_loss_clip": 0.01667874, "auxiliary_loss_mlp": 0.00284101, "balance_loss_clip": 1.31709576, "balance_loss_mlp": 0.25581264, "epoch": 0.2874492710055614, "flos": 17420051410560.0, "grad_norm": 4.249654516878094, "language_loss": 0.86969936, "learning_rate": 3.344078031483784e-06, "loss": 0.88921916, "num_input_tokens_seen": 103165410, "router_z_loss_clip": 3.5078125, "router_z_loss_mlp": 0.28259277, "step": 4781, "time_per_iteration": 2.648014783859253 }, { "auxiliary_loss_clip": 0.01683341, "auxiliary_loss_mlp": 0.00325633, "balance_loss_clip": 1.32571673, "balance_loss_mlp": 0.29363674, "epoch": 0.28750939425822936, "flos": 13406818700160.0, "grad_norm": 5.974564792894227, "language_loss": 0.92900705, "learning_rate": 3.3437896034965283e-06, "loss": 0.9490968, "num_input_tokens_seen": 103183710, "router_z_loss_clip": 3.578125, "router_z_loss_mlp": 0.31982422, "step": 4782, "time_per_iteration": 2.689769744873047 }, { "auxiliary_loss_clip": 0.01722175, "auxiliary_loss_mlp": 0.0030027, "balance_loss_clip": 1.35292161, "balance_loss_mlp": 0.26903713, "epoch": 0.2875695175108973, "flos": 21870029589120.0, "grad_norm": 70.12287953806398, "language_loss": 0.77562463, "learning_rate": 3.3435011245526357e-06, "loss": 0.79584908, "num_input_tokens_seen": 103203790, "router_z_loss_clip": 3.69726562, "router_z_loss_mlp": 0.31237793, "step": 4783, "time_per_iteration": 2.719494104385376 }, { "auxiliary_loss_clip": 0.01661883, "auxiliary_loss_mlp": 0.00324803, "balance_loss_clip": 1.3119719, "balance_loss_mlp": 0.29483342, "epoch": 0.2876296407635653, "flos": 26245457089920.0, "grad_norm": 9.68682042705836, "language_loss": 0.82716024, "learning_rate": 3.343212594663047e-06, "loss": 0.84702718, "num_input_tokens_seen": 103223925, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.29980469, "step": 4784, "time_per_iteration": 2.7448737621307373 }, { "auxiliary_loss_clip": 0.01643667, "auxiliary_loss_mlp": 0.00272484, "balance_loss_clip": 1.29573107, "balance_loss_mlp": 0.24430273, "epoch": 0.28768976401623325, "flos": 25373654092800.0, "grad_norm": 2.3625077222097524, "language_loss": 0.81009507, "learning_rate": 3.3429240138387015e-06, "loss": 0.82925659, "num_input_tokens_seen": 103244760, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.28186035, "step": 4785, "time_per_iteration": 2.6764068603515625 }, { "auxiliary_loss_clip": 0.01641456, "auxiliary_loss_mlp": 0.00243396, "balance_loss_clip": 1.29124331, "balance_loss_mlp": 0.21576294, "epoch": 0.28774988726890127, "flos": 30664372982400.0, "grad_norm": 17.042018825673708, "language_loss": 0.89909041, "learning_rate": 3.3426353820905425e-06, "loss": 0.91793895, "num_input_tokens_seen": 103261995, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.27648926, "step": 4786, "time_per_iteration": 4.065742015838623 }, { "auxiliary_loss_clip": 0.01625866, "auxiliary_loss_mlp": 0.00298757, "balance_loss_clip": 1.27702427, "balance_loss_mlp": 0.26969364, "epoch": 0.28781001052156924, "flos": 20595452411520.0, "grad_norm": 4.835435076941311, "language_loss": 0.8515864, "learning_rate": 3.342346699429516e-06, "loss": 0.87083256, "num_input_tokens_seen": 103279780, "router_z_loss_clip": 3.48632812, "router_z_loss_mlp": 0.29077148, "step": 4787, "time_per_iteration": 2.6337380409240723 }, { "auxiliary_loss_clip": 0.01663861, "auxiliary_loss_mlp": 0.00314687, "balance_loss_clip": 1.30284643, "balance_loss_mlp": 0.28505141, "epoch": 0.2878701337742372, "flos": 26542330997760.0, "grad_norm": 10.311346404534666, "language_loss": 0.90249598, "learning_rate": 3.3420579658665677e-06, "loss": 0.9222815, "num_input_tokens_seen": 103300580, "router_z_loss_clip": 3.61328125, "router_z_loss_mlp": 0.29626465, "step": 4788, "time_per_iteration": 2.813897132873535 }, { "auxiliary_loss_clip": 0.01650028, "auxiliary_loss_mlp": 0.00292405, "balance_loss_clip": 1.2937634, "balance_loss_mlp": 0.26329422, "epoch": 0.28793025702690517, "flos": 28146855530880.0, "grad_norm": 45.1170240152689, "language_loss": 0.81358898, "learning_rate": 3.3417691814126468e-06, "loss": 0.8330133, "num_input_tokens_seen": 103320430, "router_z_loss_clip": 3.5625, "router_z_loss_mlp": 0.29125977, "step": 4789, "time_per_iteration": 4.286560773849487 }, { "auxiliary_loss_clip": 0.01598901, "auxiliary_loss_mlp": 0.00242069, "balance_loss_clip": 1.26793861, "balance_loss_mlp": 0.21626063, "epoch": 0.28799038027957313, "flos": 23805471144960.0, "grad_norm": 9.920060553375782, "language_loss": 0.90827483, "learning_rate": 3.341480346078704e-06, "loss": 0.92668462, "num_input_tokens_seen": 103337695, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.2578125, "step": 4790, "time_per_iteration": 2.6528830528259277 }, { "auxiliary_loss_clip": 0.01634669, "auxiliary_loss_mlp": 0.00302728, "balance_loss_clip": 1.28570485, "balance_loss_mlp": 0.27192438, "epoch": 0.2880505035322411, "flos": 22344122223360.0, "grad_norm": 10.897009097718461, "language_loss": 0.84922725, "learning_rate": 3.3411914598756922e-06, "loss": 0.8686012, "num_input_tokens_seen": 103357010, "router_z_loss_clip": 3.4921875, "router_z_loss_mlp": 0.30810547, "step": 4791, "time_per_iteration": 2.6346404552459717 }, { "auxiliary_loss_clip": 0.01622634, "auxiliary_loss_mlp": 0.00290359, "balance_loss_clip": 1.27290916, "balance_loss_mlp": 0.25844651, "epoch": 0.28811062678490906, "flos": 18004246208640.0, "grad_norm": 41.09127223671159, "language_loss": 0.78944194, "learning_rate": 3.3409025228145654e-06, "loss": 0.80857182, "num_input_tokens_seen": 103375600, "router_z_loss_clip": 3.5, "router_z_loss_mlp": 0.31896973, "step": 4792, "time_per_iteration": 4.0592734813690186 }, { "auxiliary_loss_clip": 0.01607942, "auxiliary_loss_mlp": 0.00276573, "balance_loss_clip": 1.26671064, "balance_loss_mlp": 0.25019157, "epoch": 0.28817075003757703, "flos": 22090880361600.0, "grad_norm": 17.75419520868001, "language_loss": 0.88692939, "learning_rate": 3.3406135349062812e-06, "loss": 0.90577453, "num_input_tokens_seen": 103395225, "router_z_loss_clip": 3.41601562, "router_z_loss_mlp": 0.26391602, "step": 4793, "time_per_iteration": 2.6383016109466553 }, { "auxiliary_loss_clip": 0.01612502, "auxiliary_loss_mlp": 0.00281107, "balance_loss_clip": 1.26984143, "balance_loss_mlp": 0.25522679, "epoch": 0.288230873290245, "flos": 41683130847360.0, "grad_norm": 12.159592890495626, "language_loss": 0.82287252, "learning_rate": 3.340324496161797e-06, "loss": 0.84180856, "num_input_tokens_seen": 103417245, "router_z_loss_clip": 3.4296875, "router_z_loss_mlp": 0.25891113, "step": 4794, "time_per_iteration": 2.9083666801452637 }, { "auxiliary_loss_clip": 0.0160476, "auxiliary_loss_mlp": 0.00281194, "balance_loss_clip": 1.2637006, "balance_loss_mlp": 0.25239331, "epoch": 0.28829099654291296, "flos": 18624423456000.0, "grad_norm": 2.910094433452658, "language_loss": 0.89856994, "learning_rate": 3.340035406592074e-06, "loss": 0.91742945, "num_input_tokens_seen": 103435500, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.2878418, "step": 4795, "time_per_iteration": 2.683037757873535 }, { "auxiliary_loss_clip": 0.01585295, "auxiliary_loss_mlp": 0.00240303, "balance_loss_clip": 1.25398135, "balance_loss_mlp": 0.21617499, "epoch": 0.2883511197955809, "flos": 24674832017280.0, "grad_norm": 7.665158179075347, "language_loss": 0.79001808, "learning_rate": 3.339746266208074e-06, "loss": 0.80827403, "num_input_tokens_seen": 103451040, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.24121094, "step": 4796, "time_per_iteration": 2.6798603534698486 }, { "auxiliary_loss_clip": 0.01617628, "auxiliary_loss_mlp": 0.00272055, "balance_loss_clip": 1.27667665, "balance_loss_mlp": 0.24232399, "epoch": 0.2884112430482489, "flos": 23112143850240.0, "grad_norm": 2.1965696795318856, "language_loss": 0.81033301, "learning_rate": 3.3394570750207614e-06, "loss": 0.82922983, "num_input_tokens_seen": 103471330, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.29699707, "step": 4797, "time_per_iteration": 2.668360710144043 }, { "auxiliary_loss_clip": 0.01592366, "auxiliary_loss_mlp": 0.00254092, "balance_loss_clip": 1.25612426, "balance_loss_mlp": 0.22525528, "epoch": 0.28847136630091685, "flos": 16873347432960.0, "grad_norm": 73.94060012211794, "language_loss": 0.81609476, "learning_rate": 3.3391678330411017e-06, "loss": 0.83455932, "num_input_tokens_seen": 103488060, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.28796387, "step": 4798, "time_per_iteration": 2.643634557723999 }, { "auxiliary_loss_clip": 0.01637858, "auxiliary_loss_mlp": 0.00263752, "balance_loss_clip": 1.29387975, "balance_loss_mlp": 0.23499903, "epoch": 0.2885314895535849, "flos": 25657527277440.0, "grad_norm": 5.030926605102282, "language_loss": 0.74981374, "learning_rate": 3.3388785402800642e-06, "loss": 0.76882982, "num_input_tokens_seen": 103503600, "router_z_loss_clip": 3.43945312, "router_z_loss_mlp": 0.2878418, "step": 4799, "time_per_iteration": 2.6968932151794434 }, { "auxiliary_loss_clip": 0.01610547, "auxiliary_loss_mlp": 0.00289617, "balance_loss_clip": 1.27320933, "balance_loss_mlp": 0.2598151, "epoch": 0.28859161280625284, "flos": 21107251347840.0, "grad_norm": 2.0686137486137146, "language_loss": 0.89502317, "learning_rate": 3.3385891967486178e-06, "loss": 0.91402483, "num_input_tokens_seen": 103524195, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.2980957, "step": 4800, "time_per_iteration": 4.116274356842041 }, { "auxiliary_loss_clip": 0.0158209, "auxiliary_loss_mlp": 0.00236079, "balance_loss_clip": 1.25681961, "balance_loss_mlp": 0.21009117, "epoch": 0.2886517360589208, "flos": 26469540086400.0, "grad_norm": 28.830285294923623, "language_loss": 0.9635098, "learning_rate": 3.3382998024577347e-06, "loss": 0.98169154, "num_input_tokens_seen": 103545235, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.26000977, "step": 4801, "time_per_iteration": 2.696899890899658 }, { "auxiliary_loss_clip": 0.01562938, "auxiliary_loss_mlp": 0.00241659, "balance_loss_clip": 1.24283099, "balance_loss_mlp": 0.21581489, "epoch": 0.28871185931158877, "flos": 25265275781760.0, "grad_norm": 82.82088692858193, "language_loss": 0.80222648, "learning_rate": 3.33801035741839e-06, "loss": 0.82027245, "num_input_tokens_seen": 103563305, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.25817871, "step": 4802, "time_per_iteration": 2.6930692195892334 }, { "auxiliary_loss_clip": 0.01397537, "auxiliary_loss_mlp": 0.00055171, "balance_loss_clip": 1.15106225, "balance_loss_mlp": 0.0474466, "epoch": 0.28877198256425674, "flos": 66665431284480.0, "grad_norm": 0.7872175937567806, "language_loss": 0.62818098, "learning_rate": 3.337720861641558e-06, "loss": 0.64270806, "num_input_tokens_seen": 103625025, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.07714844, "step": 4803, "time_per_iteration": 3.0930697917938232 }, { "auxiliary_loss_clip": 0.01601592, "auxiliary_loss_mlp": 0.00246875, "balance_loss_clip": 1.27128816, "balance_loss_mlp": 0.2200053, "epoch": 0.2888321058169247, "flos": 20303031790080.0, "grad_norm": 23.64662922413243, "language_loss": 0.79756731, "learning_rate": 3.3374313151382165e-06, "loss": 0.81605196, "num_input_tokens_seen": 103644235, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.26855469, "step": 4804, "time_per_iteration": 2.6821911334991455 }, { "auxiliary_loss_clip": 0.01589767, "auxiliary_loss_mlp": 0.00241795, "balance_loss_clip": 1.26114416, "balance_loss_mlp": 0.21351878, "epoch": 0.28889222906959267, "flos": 25516721963520.0, "grad_norm": 1.958763590811428, "language_loss": 0.74934655, "learning_rate": 3.337141717919346e-06, "loss": 0.76766217, "num_input_tokens_seen": 103664700, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.28283691, "step": 4805, "time_per_iteration": 2.671834707260132 }, { "auxiliary_loss_clip": 0.01602693, "auxiliary_loss_mlp": 0.00236796, "balance_loss_clip": 1.26799297, "balance_loss_mlp": 0.20789948, "epoch": 0.28895235232226063, "flos": 32671312560000.0, "grad_norm": 25.202732228764393, "language_loss": 0.76788545, "learning_rate": 3.3368520699959272e-06, "loss": 0.78628039, "num_input_tokens_seen": 103686595, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.28881836, "step": 4806, "time_per_iteration": 2.727370262145996 }, { "auxiliary_loss_clip": 0.01584635, "auxiliary_loss_mlp": 0.00245318, "balance_loss_clip": 1.26524901, "balance_loss_mlp": 0.21708894, "epoch": 0.2890124755749286, "flos": 29714679342720.0, "grad_norm": 1.6747753336095474, "language_loss": 0.77108669, "learning_rate": 3.3365623713789443e-06, "loss": 0.78938627, "num_input_tokens_seen": 103707525, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.28222656, "step": 4807, "time_per_iteration": 2.6979117393493652 }, { "auxiliary_loss_clip": 0.01569129, "auxiliary_loss_mlp": 0.0026319, "balance_loss_clip": 1.25403309, "balance_loss_mlp": 0.23607016, "epoch": 0.28907259882759656, "flos": 22674464628480.0, "grad_norm": 10.334349585692953, "language_loss": 0.86822766, "learning_rate": 3.336272622079382e-06, "loss": 0.88655084, "num_input_tokens_seen": 103727905, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.2713623, "step": 4808, "time_per_iteration": 2.7061989307403564 }, { "auxiliary_loss_clip": 0.01594575, "auxiliary_loss_mlp": 0.00241956, "balance_loss_clip": 1.27660847, "balance_loss_mlp": 0.21257126, "epoch": 0.2891327220802645, "flos": 22566050403840.0, "grad_norm": 10.08881012835152, "language_loss": 0.84944999, "learning_rate": 3.3359828221082276e-06, "loss": 0.86781538, "num_input_tokens_seen": 103748335, "router_z_loss_clip": 3.17773438, "router_z_loss_mlp": 0.2935791, "step": 4809, "time_per_iteration": 2.759411096572876 }, { "auxiliary_loss_clip": 0.01595753, "auxiliary_loss_mlp": 0.0026893, "balance_loss_clip": 1.27048254, "balance_loss_mlp": 0.23958036, "epoch": 0.2891928453329325, "flos": 21652806090240.0, "grad_norm": 233.1375924690938, "language_loss": 0.86557746, "learning_rate": 3.3356929714764714e-06, "loss": 0.8842243, "num_input_tokens_seen": 103767020, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.29370117, "step": 4810, "time_per_iteration": 2.636070966720581 }, { "auxiliary_loss_clip": 0.01597768, "auxiliary_loss_mlp": 0.00237317, "balance_loss_clip": 1.28222716, "balance_loss_mlp": 0.20887412, "epoch": 0.28925296858560046, "flos": 23222102359680.0, "grad_norm": 132.10036904199978, "language_loss": 0.82876122, "learning_rate": 3.3354030701951032e-06, "loss": 0.84711218, "num_input_tokens_seen": 103786355, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.2845459, "step": 4811, "time_per_iteration": 2.636136293411255 }, { "auxiliary_loss_clip": 0.01632703, "auxiliary_loss_mlp": 0.00252371, "balance_loss_clip": 1.30275273, "balance_loss_mlp": 0.2211501, "epoch": 0.2893130918382685, "flos": 28621666437120.0, "grad_norm": 17.481864445837573, "language_loss": 0.83418995, "learning_rate": 3.335113118275117e-06, "loss": 0.8530407, "num_input_tokens_seen": 103809345, "router_z_loss_clip": 3.29492188, "router_z_loss_mlp": 0.31237793, "step": 4812, "time_per_iteration": 2.680908441543579 }, { "auxiliary_loss_clip": 0.01454647, "auxiliary_loss_mlp": 0.00081054, "balance_loss_clip": 1.23036754, "balance_loss_mlp": 0.07433056, "epoch": 0.28937321509093644, "flos": 72301288982400.0, "grad_norm": 0.8097895392727522, "language_loss": 0.59659529, "learning_rate": 3.3348231157275085e-06, "loss": 0.6119523, "num_input_tokens_seen": 103871180, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.06738281, "step": 4813, "time_per_iteration": 3.2664732933044434 }, { "auxiliary_loss_clip": 0.0160387, "auxiliary_loss_mlp": 0.00277395, "balance_loss_clip": 1.29045105, "balance_loss_mlp": 0.24831973, "epoch": 0.2894333383436044, "flos": 16216397637120.0, "grad_norm": 2.363460538940302, "language_loss": 0.90440464, "learning_rate": 3.3345330625632725e-06, "loss": 0.92321736, "num_input_tokens_seen": 103889040, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.29064941, "step": 4814, "time_per_iteration": 2.617704153060913 }, { "auxiliary_loss_clip": 0.01586083, "auxiliary_loss_mlp": 0.00288101, "balance_loss_clip": 1.27895033, "balance_loss_mlp": 0.2580964, "epoch": 0.2894934615962724, "flos": 24828278918400.0, "grad_norm": 3.9419578685944634, "language_loss": 0.80413449, "learning_rate": 3.3342429587934094e-06, "loss": 0.82287639, "num_input_tokens_seen": 103910380, "router_z_loss_clip": 3.0703125, "router_z_loss_mlp": 0.30041504, "step": 4815, "time_per_iteration": 2.6765856742858887 }, { "auxiliary_loss_clip": 0.01583924, "auxiliary_loss_mlp": 0.00243495, "balance_loss_clip": 1.27972627, "balance_loss_mlp": 0.21606499, "epoch": 0.28955358484894034, "flos": 20449978329600.0, "grad_norm": 1.917147202938252, "language_loss": 0.77486467, "learning_rate": 3.3339528044289198e-06, "loss": 0.79313886, "num_input_tokens_seen": 103929955, "router_z_loss_clip": 3.04296875, "router_z_loss_mlp": 0.27404785, "step": 4816, "time_per_iteration": 2.636842966079712 }, { "auxiliary_loss_clip": 0.01625698, "auxiliary_loss_mlp": 0.00322508, "balance_loss_clip": 1.30101562, "balance_loss_mlp": 0.28908166, "epoch": 0.2896137081016083, "flos": 22565188477440.0, "grad_norm": 2.542180714341053, "language_loss": 0.85157263, "learning_rate": 3.3336625994808055e-06, "loss": 0.87105465, "num_input_tokens_seen": 103948020, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.33398438, "step": 4817, "time_per_iteration": 2.732513427734375 }, { "auxiliary_loss_clip": 0.01599643, "auxiliary_loss_mlp": 0.00294656, "balance_loss_clip": 1.28001785, "balance_loss_mlp": 0.26299381, "epoch": 0.28967383135427627, "flos": 26687948734080.0, "grad_norm": 3.3082428432894715, "language_loss": 0.83311236, "learning_rate": 3.3333723439600723e-06, "loss": 0.85205537, "num_input_tokens_seen": 103968740, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.31665039, "step": 4818, "time_per_iteration": 2.73789119720459 }, { "auxiliary_loss_clip": 0.01607485, "auxiliary_loss_mlp": 0.00285724, "balance_loss_clip": 1.29191935, "balance_loss_mlp": 0.25291806, "epoch": 0.28973395460694423, "flos": 15558262692480.0, "grad_norm": 2.724967322174943, "language_loss": 0.87565136, "learning_rate": 3.3330820378777263e-06, "loss": 0.89458346, "num_input_tokens_seen": 103986005, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.32800293, "step": 4819, "time_per_iteration": 2.6208438873291016 }, { "auxiliary_loss_clip": 0.01608801, "auxiliary_loss_mlp": 0.00307827, "balance_loss_clip": 1.29608893, "balance_loss_mlp": 0.27566403, "epoch": 0.2897940778596122, "flos": 18697465762560.0, "grad_norm": 20.21590245976609, "language_loss": 0.87885189, "learning_rate": 3.332791681244776e-06, "loss": 0.89801812, "num_input_tokens_seen": 104005070, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.32177734, "step": 4820, "time_per_iteration": 2.6680848598480225 }, { "auxiliary_loss_clip": 0.01633195, "auxiliary_loss_mlp": 0.0030273, "balance_loss_clip": 1.30922461, "balance_loss_mlp": 0.27218905, "epoch": 0.28985420111228016, "flos": 18770292587520.0, "grad_norm": 30.81024535643987, "language_loss": 0.80478859, "learning_rate": 3.332501274072231e-06, "loss": 0.82414788, "num_input_tokens_seen": 104022945, "router_z_loss_clip": 3.2421875, "router_z_loss_mlp": 0.30541992, "step": 4821, "time_per_iteration": 2.652608871459961 }, { "auxiliary_loss_clip": 0.01629125, "auxiliary_loss_mlp": 0.00321887, "balance_loss_clip": 1.30606699, "balance_loss_mlp": 0.28775722, "epoch": 0.28991432436494813, "flos": 23069840607360.0, "grad_norm": 1.692796420877047, "language_loss": 0.78658867, "learning_rate": 3.332210816371104e-06, "loss": 0.80609882, "num_input_tokens_seen": 104042080, "router_z_loss_clip": 3.234375, "router_z_loss_mlp": 0.34143066, "step": 4822, "time_per_iteration": 2.6353065967559814 }, { "auxiliary_loss_clip": 0.01610408, "auxiliary_loss_mlp": 0.00276937, "balance_loss_clip": 1.29201293, "balance_loss_mlp": 0.24720654, "epoch": 0.2899744476176161, "flos": 17603195880960.0, "grad_norm": 30.53333565960886, "language_loss": 0.72391796, "learning_rate": 3.3319203081524102e-06, "loss": 0.74279141, "num_input_tokens_seen": 104060975, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.29711914, "step": 4823, "time_per_iteration": 2.6165010929107666 }, { "auxiliary_loss_clip": 0.01592048, "auxiliary_loss_mlp": 0.00301793, "balance_loss_clip": 1.28302622, "balance_loss_mlp": 0.27182376, "epoch": 0.29003457087028406, "flos": 22309360836480.0, "grad_norm": 2.2463158085212016, "language_loss": 0.87091196, "learning_rate": 3.331629749427164e-06, "loss": 0.88985032, "num_input_tokens_seen": 104081395, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.29980469, "step": 4824, "time_per_iteration": 2.6951851844787598 }, { "auxiliary_loss_clip": 0.01634006, "auxiliary_loss_mlp": 0.00296321, "balance_loss_clip": 1.30878782, "balance_loss_mlp": 0.26294214, "epoch": 0.2900946941229521, "flos": 21944975316480.0, "grad_norm": 10.813071944621353, "language_loss": 0.79662025, "learning_rate": 3.331339140206385e-06, "loss": 0.81592357, "num_input_tokens_seen": 104099995, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.33349609, "step": 4825, "time_per_iteration": 2.680131673812866 }, { "auxiliary_loss_clip": 0.01619788, "auxiliary_loss_mlp": 0.00318059, "balance_loss_clip": 1.29726756, "balance_loss_mlp": 0.28527653, "epoch": 0.29015481737562004, "flos": 17932173569280.0, "grad_norm": 48.36906058988632, "language_loss": 0.8257221, "learning_rate": 3.331048480501092e-06, "loss": 0.84510064, "num_input_tokens_seen": 104118930, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.328125, "step": 4826, "time_per_iteration": 2.6892335414886475 }, { "auxiliary_loss_clip": 0.01624866, "auxiliary_loss_mlp": 0.00318043, "balance_loss_clip": 1.29952872, "balance_loss_mlp": 0.28661966, "epoch": 0.290214940628288, "flos": 22783525297920.0, "grad_norm": 3.1437969848418517, "language_loss": 0.7661615, "learning_rate": 3.3307577703223073e-06, "loss": 0.78559059, "num_input_tokens_seen": 104136940, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.31445312, "step": 4827, "time_per_iteration": 2.7381391525268555 }, { "auxiliary_loss_clip": 0.01613012, "auxiliary_loss_mlp": 0.00307927, "balance_loss_clip": 1.30014324, "balance_loss_mlp": 0.2754786, "epoch": 0.290275063880956, "flos": 20006481104640.0, "grad_norm": 1.7735862724061389, "language_loss": 0.86312288, "learning_rate": 3.3304670096810545e-06, "loss": 0.88233227, "num_input_tokens_seen": 104154280, "router_z_loss_clip": 3.13085938, "router_z_loss_mlp": 0.32470703, "step": 4828, "time_per_iteration": 4.106780767440796 }, { "auxiliary_loss_clip": 0.01587902, "auxiliary_loss_mlp": 0.00293388, "balance_loss_clip": 1.28251517, "balance_loss_mlp": 0.26270345, "epoch": 0.29033518713362394, "flos": 22053605022720.0, "grad_norm": 4.176256488719725, "language_loss": 0.85667294, "learning_rate": 3.33017619858836e-06, "loss": 0.87548578, "num_input_tokens_seen": 104172605, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.30688477, "step": 4829, "time_per_iteration": 2.622544288635254 }, { "auxiliary_loss_clip": 0.01592658, "auxiliary_loss_mlp": 0.00268923, "balance_loss_clip": 1.28586483, "balance_loss_mlp": 0.24142139, "epoch": 0.2903953103862919, "flos": 25630056351360.0, "grad_norm": 2.0296812207273325, "language_loss": 0.87396967, "learning_rate": 3.329885337055249e-06, "loss": 0.89258552, "num_input_tokens_seen": 104194120, "router_z_loss_clip": 3.0703125, "router_z_loss_mlp": 0.27490234, "step": 4830, "time_per_iteration": 2.676800489425659 }, { "auxiliary_loss_clip": 0.01627058, "auxiliary_loss_mlp": 0.00329696, "balance_loss_clip": 1.31066072, "balance_loss_mlp": 0.29631752, "epoch": 0.29045543363895987, "flos": 16945851035520.0, "grad_norm": 135.88901008841094, "language_loss": 0.87789178, "learning_rate": 3.3295944250927546e-06, "loss": 0.89745939, "num_input_tokens_seen": 104210875, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.3338623, "step": 4831, "time_per_iteration": 4.213700771331787 }, { "auxiliary_loss_clip": 0.0159734, "auxiliary_loss_mlp": 0.00281121, "balance_loss_clip": 1.2922405, "balance_loss_mlp": 0.25046048, "epoch": 0.29051555689162784, "flos": 26395492199040.0, "grad_norm": 10.8383879246354, "language_loss": 0.80703014, "learning_rate": 3.3293034627119055e-06, "loss": 0.82581472, "num_input_tokens_seen": 104229875, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.30688477, "step": 4832, "time_per_iteration": 2.7080366611480713 }, { "auxiliary_loss_clip": 0.01582505, "auxiliary_loss_mlp": 0.00304957, "balance_loss_clip": 1.27031338, "balance_loss_mlp": 0.27603707, "epoch": 0.2905756801442958, "flos": 21103875469440.0, "grad_norm": 242.73364488234517, "language_loss": 0.81474078, "learning_rate": 3.329012449923736e-06, "loss": 0.83361542, "num_input_tokens_seen": 104250405, "router_z_loss_clip": 3.12304688, "router_z_loss_mlp": 0.28930664, "step": 4833, "time_per_iteration": 2.7506818771362305 }, { "auxiliary_loss_clip": 0.01580332, "auxiliary_loss_mlp": 0.00289654, "balance_loss_clip": 1.27149081, "balance_loss_mlp": 0.2601018, "epoch": 0.29063580339696377, "flos": 15706071158400.0, "grad_norm": 3.3672665753678586, "language_loss": 0.73633504, "learning_rate": 3.3287213867392813e-06, "loss": 0.75503486, "num_input_tokens_seen": 104269185, "router_z_loss_clip": 3.0859375, "router_z_loss_mlp": 0.29528809, "step": 4834, "time_per_iteration": 2.814420461654663 }, { "auxiliary_loss_clip": 0.01583481, "auxiliary_loss_mlp": 0.00279231, "balance_loss_clip": 1.27226496, "balance_loss_mlp": 0.24747351, "epoch": 0.29069592664963173, "flos": 24644990793600.0, "grad_norm": 2.6084360107463933, "language_loss": 0.76715267, "learning_rate": 3.3284302731695783e-06, "loss": 0.78577983, "num_input_tokens_seen": 104289400, "router_z_loss_clip": 3.11132812, "router_z_loss_mlp": 0.31738281, "step": 4835, "time_per_iteration": 4.108250141143799 }, { "auxiliary_loss_clip": 0.01568995, "auxiliary_loss_mlp": 0.0029273, "balance_loss_clip": 1.26424408, "balance_loss_mlp": 0.26166445, "epoch": 0.2907560499022997, "flos": 24973753000320.0, "grad_norm": 3.4932471983634414, "language_loss": 0.86801314, "learning_rate": 3.3281391092256668e-06, "loss": 0.88663042, "num_input_tokens_seen": 104310485, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.31079102, "step": 4836, "time_per_iteration": 2.697174310684204 }, { "auxiliary_loss_clip": 0.01573095, "auxiliary_loss_mlp": 0.00279948, "balance_loss_clip": 1.271981, "balance_loss_mlp": 0.24969263, "epoch": 0.29081617315496766, "flos": 18657496903680.0, "grad_norm": 62.29199338589051, "language_loss": 0.86402881, "learning_rate": 3.3278478949185865e-06, "loss": 0.88255924, "num_input_tokens_seen": 104327330, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.30236816, "step": 4837, "time_per_iteration": 2.618614912033081 }, { "auxiliary_loss_clip": 0.0156619, "auxiliary_loss_mlp": 0.00298499, "balance_loss_clip": 1.26220489, "balance_loss_mlp": 0.26681301, "epoch": 0.2908762964076356, "flos": 35331035955840.0, "grad_norm": 14.619181300918413, "language_loss": 0.73421693, "learning_rate": 3.327556630259381e-06, "loss": 0.75286388, "num_input_tokens_seen": 104350350, "router_z_loss_clip": 3.03710938, "router_z_loss_mlp": 0.31665039, "step": 4838, "time_per_iteration": 2.8038620948791504 }, { "auxiliary_loss_clip": 0.01568258, "auxiliary_loss_mlp": 0.00289305, "balance_loss_clip": 1.26272178, "balance_loss_mlp": 0.25943065, "epoch": 0.29093641966030365, "flos": 23076305055360.0, "grad_norm": 4.3222222011141165, "language_loss": 0.78650403, "learning_rate": 3.327265315259095e-06, "loss": 0.8050797, "num_input_tokens_seen": 104369995, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.29858398, "step": 4839, "time_per_iteration": 2.6770036220550537 }, { "auxiliary_loss_clip": 0.0158025, "auxiliary_loss_mlp": 0.00302923, "balance_loss_clip": 1.2756592, "balance_loss_mlp": 0.27045065, "epoch": 0.2909965429129716, "flos": 35955415094400.0, "grad_norm": 3.177759521372114, "language_loss": 0.83013737, "learning_rate": 3.326973949928776e-06, "loss": 0.8489691, "num_input_tokens_seen": 104392285, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.32446289, "step": 4840, "time_per_iteration": 2.8860912322998047 }, { "auxiliary_loss_clip": 0.01581242, "auxiliary_loss_mlp": 0.00279988, "balance_loss_clip": 1.27577853, "balance_loss_mlp": 0.24858877, "epoch": 0.2910566661656396, "flos": 30880231764480.0, "grad_norm": 103.12975403831918, "language_loss": 0.70267689, "learning_rate": 3.326682534279471e-06, "loss": 0.72128922, "num_input_tokens_seen": 104412640, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.31396484, "step": 4841, "time_per_iteration": 2.7527477741241455 }, { "auxiliary_loss_clip": 0.01561867, "auxiliary_loss_mlp": 0.00272769, "balance_loss_clip": 1.26434779, "balance_loss_mlp": 0.24299042, "epoch": 0.29111678941830754, "flos": 30010188533760.0, "grad_norm": 2.1470009958189222, "language_loss": 0.76646912, "learning_rate": 3.326391068322232e-06, "loss": 0.78481549, "num_input_tokens_seen": 104435245, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.29760742, "step": 4842, "time_per_iteration": 4.1642467975616455 }, { "auxiliary_loss_clip": 0.01568762, "auxiliary_loss_mlp": 0.00278663, "balance_loss_clip": 1.26454294, "balance_loss_mlp": 0.24840814, "epoch": 0.2911769126709755, "flos": 22857393617280.0, "grad_norm": 2.4205394423608584, "language_loss": 0.80978823, "learning_rate": 3.3260995520681098e-06, "loss": 0.82826257, "num_input_tokens_seen": 104455395, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.3026123, "step": 4843, "time_per_iteration": 2.7080800533294678 }, { "auxiliary_loss_clip": 0.01577086, "auxiliary_loss_mlp": 0.00264048, "balance_loss_clip": 1.2671206, "balance_loss_mlp": 0.23336369, "epoch": 0.2912370359236435, "flos": 21650507619840.0, "grad_norm": 3.791888943379148, "language_loss": 0.67201537, "learning_rate": 3.3258079855281602e-06, "loss": 0.69042671, "num_input_tokens_seen": 104473350, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.30688477, "step": 4844, "time_per_iteration": 2.692915439605713 }, { "auxiliary_loss_clip": 0.01577903, "auxiliary_loss_mlp": 0.00289838, "balance_loss_clip": 1.27152145, "balance_loss_mlp": 0.2560299, "epoch": 0.29129715917631144, "flos": 22893340152960.0, "grad_norm": 2.560732944449527, "language_loss": 0.94854546, "learning_rate": 3.3255163687134396e-06, "loss": 0.96722293, "num_input_tokens_seen": 104492265, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.33837891, "step": 4845, "time_per_iteration": 2.7472190856933594 }, { "auxiliary_loss_clip": 0.01547994, "auxiliary_loss_mlp": 0.00295216, "balance_loss_clip": 1.2566328, "balance_loss_mlp": 0.26376811, "epoch": 0.2913572824289794, "flos": 22674464628480.0, "grad_norm": 44.13642092755913, "language_loss": 0.73461169, "learning_rate": 3.3252247016350046e-06, "loss": 0.75304377, "num_input_tokens_seen": 104510755, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.31445312, "step": 4846, "time_per_iteration": 2.6958000659942627 }, { "auxiliary_loss_clip": 0.01534262, "auxiliary_loss_mlp": 0.00253265, "balance_loss_clip": 1.24763036, "balance_loss_mlp": 0.22552454, "epoch": 0.29141740568164737, "flos": 23107403255040.0, "grad_norm": 5.2253917170121955, "language_loss": 0.76103383, "learning_rate": 3.3249329843039166e-06, "loss": 0.77890909, "num_input_tokens_seen": 104530830, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 0.27758789, "step": 4847, "time_per_iteration": 2.684823989868164 }, { "auxiliary_loss_clip": 0.0154264, "auxiliary_loss_mlp": 0.00272411, "balance_loss_clip": 1.24918795, "balance_loss_mlp": 0.24251309, "epoch": 0.29147752893431533, "flos": 23587026583680.0, "grad_norm": 2.623932573418787, "language_loss": 0.80926484, "learning_rate": 3.324641216731237e-06, "loss": 0.82741535, "num_input_tokens_seen": 104550115, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.29882812, "step": 4848, "time_per_iteration": 2.7317447662353516 }, { "auxiliary_loss_clip": 0.01568333, "auxiliary_loss_mlp": 0.00292211, "balance_loss_clip": 1.2686348, "balance_loss_mlp": 0.26256335, "epoch": 0.2915376521869833, "flos": 20591968792320.0, "grad_norm": 17.577846442460558, "language_loss": 0.85034895, "learning_rate": 3.3243493989280295e-06, "loss": 0.86895442, "num_input_tokens_seen": 104566255, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.29663086, "step": 4849, "time_per_iteration": 2.6301426887512207 }, { "auxiliary_loss_clip": 0.01595768, "auxiliary_loss_mlp": 0.0029886, "balance_loss_clip": 1.28440905, "balance_loss_mlp": 0.26600593, "epoch": 0.29159777543965126, "flos": 20811490761600.0, "grad_norm": 4.050617406388676, "language_loss": 0.8564505, "learning_rate": 3.3240575309053596e-06, "loss": 0.87539673, "num_input_tokens_seen": 104585235, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.32861328, "step": 4850, "time_per_iteration": 2.6469013690948486 }, { "auxiliary_loss_clip": 0.01586234, "auxiliary_loss_mlp": 0.00280953, "balance_loss_clip": 1.28378403, "balance_loss_mlp": 0.24988672, "epoch": 0.29165789869231923, "flos": 24244155947520.0, "grad_norm": 17.82449392684173, "language_loss": 0.83497405, "learning_rate": 3.323765612674296e-06, "loss": 0.85364592, "num_input_tokens_seen": 104605315, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.31030273, "step": 4851, "time_per_iteration": 2.638295888900757 }, { "auxiliary_loss_clip": 0.01573609, "auxiliary_loss_mlp": 0.00261814, "balance_loss_clip": 1.27817523, "balance_loss_mlp": 0.23346606, "epoch": 0.29171802194498725, "flos": 28949925853440.0, "grad_norm": 1.9361715836929503, "language_loss": 0.82053888, "learning_rate": 3.3234736442459078e-06, "loss": 0.83889306, "num_input_tokens_seen": 104626055, "router_z_loss_clip": 2.95507812, "router_z_loss_mlp": 0.28393555, "step": 4852, "time_per_iteration": 2.7473742961883545 }, { "auxiliary_loss_clip": 0.01616088, "auxiliary_loss_mlp": 0.00273195, "balance_loss_clip": 1.30166507, "balance_loss_mlp": 0.24251074, "epoch": 0.2917781451976552, "flos": 22598226011520.0, "grad_norm": 12.791631457804728, "language_loss": 0.83584166, "learning_rate": 3.3231816256312665e-06, "loss": 0.85473454, "num_input_tokens_seen": 104646005, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.30664062, "step": 4853, "time_per_iteration": 2.7051782608032227 }, { "auxiliary_loss_clip": 0.01569866, "auxiliary_loss_mlp": 0.00269121, "balance_loss_clip": 1.26828516, "balance_loss_mlp": 0.23641036, "epoch": 0.2918382684503232, "flos": 21574448570880.0, "grad_norm": 4.195435542249867, "language_loss": 0.93571937, "learning_rate": 3.322889556841445e-06, "loss": 0.95410931, "num_input_tokens_seen": 104661620, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.3269043, "step": 4854, "time_per_iteration": 2.6016831398010254 }, { "auxiliary_loss_clip": 0.01602876, "auxiliary_loss_mlp": 0.00265811, "balance_loss_clip": 1.29563642, "balance_loss_mlp": 0.23267131, "epoch": 0.29189839170299114, "flos": 24353503925760.0, "grad_norm": 12.959596729799504, "language_loss": 0.90871692, "learning_rate": 3.322597437887519e-06, "loss": 0.92740381, "num_input_tokens_seen": 104681445, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.33154297, "step": 4855, "time_per_iteration": 2.676748752593994 }, { "auxiliary_loss_clip": 0.01391324, "auxiliary_loss_mlp": 0.00133189, "balance_loss_clip": 1.13202024, "balance_loss_mlp": 0.12322346, "epoch": 0.2919585149556591, "flos": 71316726215040.0, "grad_norm": 0.7962725211885296, "language_loss": 0.60009313, "learning_rate": 3.322305268780566e-06, "loss": 0.61533827, "num_input_tokens_seen": 104747945, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.09960938, "step": 4856, "time_per_iteration": 3.255371332168579 }, { "auxiliary_loss_clip": 0.01577311, "auxiliary_loss_mlp": 0.00249907, "balance_loss_clip": 1.28431904, "balance_loss_mlp": 0.21681458, "epoch": 0.2920186382083271, "flos": 15633208419840.0, "grad_norm": 11.359451870640555, "language_loss": 0.76261055, "learning_rate": 3.322013049531664e-06, "loss": 0.78088272, "num_input_tokens_seen": 104766225, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.33105469, "step": 4857, "time_per_iteration": 2.618434190750122 }, { "auxiliary_loss_clip": 0.01595119, "auxiliary_loss_mlp": 0.00274457, "balance_loss_clip": 1.29268241, "balance_loss_mlp": 0.2450359, "epoch": 0.29207876146099504, "flos": 28366018364160.0, "grad_norm": 25.411934396594226, "language_loss": 0.90066671, "learning_rate": 3.321720780151895e-06, "loss": 0.91936243, "num_input_tokens_seen": 104785345, "router_z_loss_clip": 3.01953125, "router_z_loss_mlp": 0.29467773, "step": 4858, "time_per_iteration": 2.7145328521728516 }, { "auxiliary_loss_clip": 0.01614129, "auxiliary_loss_mlp": 0.00268406, "balance_loss_clip": 1.31104529, "balance_loss_mlp": 0.2351228, "epoch": 0.292138884713663, "flos": 21870963342720.0, "grad_norm": 2.235097327405344, "language_loss": 0.84422982, "learning_rate": 3.321428460652342e-06, "loss": 0.86305511, "num_input_tokens_seen": 104804560, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.33251953, "step": 4859, "time_per_iteration": 2.711402177810669 }, { "auxiliary_loss_clip": 0.01612589, "auxiliary_loss_mlp": 0.00286582, "balance_loss_clip": 1.30119109, "balance_loss_mlp": 0.25353739, "epoch": 0.29219900796633097, "flos": 20992552243200.0, "grad_norm": 7.123776477872965, "language_loss": 0.79723531, "learning_rate": 3.3211360910440885e-06, "loss": 0.81622708, "num_input_tokens_seen": 104821105, "router_z_loss_clip": 3.11132812, "router_z_loss_mlp": 0.33056641, "step": 4860, "time_per_iteration": 2.7873964309692383 }, { "auxiliary_loss_clip": 0.01619301, "auxiliary_loss_mlp": 0.00255276, "balance_loss_clip": 1.31606317, "balance_loss_mlp": 0.22375695, "epoch": 0.29225913121899894, "flos": 35004608133120.0, "grad_norm": 13.87536757520556, "language_loss": 0.81999749, "learning_rate": 3.320843671338222e-06, "loss": 0.83874333, "num_input_tokens_seen": 104841440, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.31494141, "step": 4861, "time_per_iteration": 2.8162572383880615 }, { "auxiliary_loss_clip": 0.01639005, "auxiliary_loss_mlp": 0.0024486, "balance_loss_clip": 1.32986903, "balance_loss_mlp": 0.21164876, "epoch": 0.2923192544716669, "flos": 13515663888000.0, "grad_norm": 159.49943672448535, "language_loss": 0.98120123, "learning_rate": 3.320551201545832e-06, "loss": 1.00003982, "num_input_tokens_seen": 104858210, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.33203125, "step": 4862, "time_per_iteration": 2.6080386638641357 }, { "auxiliary_loss_clip": 0.01590607, "auxiliary_loss_mlp": 0.0025822, "balance_loss_clip": 1.29491806, "balance_loss_mlp": 0.22596201, "epoch": 0.29237937772433487, "flos": 19463512141440.0, "grad_norm": 27.17185689422184, "language_loss": 0.81215799, "learning_rate": 3.320258681678008e-06, "loss": 0.83064628, "num_input_tokens_seen": 104875620, "router_z_loss_clip": 2.95507812, "router_z_loss_mlp": 0.32299805, "step": 4863, "time_per_iteration": 2.62457013130188 }, { "auxiliary_loss_clip": 0.01585761, "auxiliary_loss_mlp": 0.00225091, "balance_loss_clip": 1.29397345, "balance_loss_mlp": 0.19257085, "epoch": 0.29243950097700283, "flos": 20850597694080.0, "grad_norm": 6.0708105272629, "language_loss": 0.85696501, "learning_rate": 3.319966111745842e-06, "loss": 0.87507355, "num_input_tokens_seen": 104894600, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.32543945, "step": 4864, "time_per_iteration": 2.6734812259674072 }, { "auxiliary_loss_clip": 0.01599777, "auxiliary_loss_mlp": 0.00241702, "balance_loss_clip": 1.30576277, "balance_loss_mlp": 0.2077989, "epoch": 0.29249962422967085, "flos": 23584225322880.0, "grad_norm": 3.2930103635377006, "language_loss": 0.87697041, "learning_rate": 3.319673491760429e-06, "loss": 0.89538515, "num_input_tokens_seen": 104914530, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.33935547, "step": 4865, "time_per_iteration": 2.676692247390747 }, { "auxiliary_loss_clip": 0.01621299, "auxiliary_loss_mlp": 0.00253523, "balance_loss_clip": 1.31580913, "balance_loss_mlp": 0.21964367, "epoch": 0.2925597474823388, "flos": 22273342473600.0, "grad_norm": 9.349034200563327, "language_loss": 0.93262506, "learning_rate": 3.3193808217328645e-06, "loss": 0.95137334, "num_input_tokens_seen": 104933460, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.33911133, "step": 4866, "time_per_iteration": 2.631282329559326 }, { "auxiliary_loss_clip": 0.0158654, "auxiliary_loss_mlp": 0.00238703, "balance_loss_clip": 1.29215837, "balance_loss_mlp": 0.20737515, "epoch": 0.2926198707350068, "flos": 34456108475520.0, "grad_norm": 23.1504506729568, "language_loss": 0.82083607, "learning_rate": 3.3190881016742476e-06, "loss": 0.8390885, "num_input_tokens_seen": 104954495, "router_z_loss_clip": 2.9453125, "router_z_loss_mlp": 0.31298828, "step": 4867, "time_per_iteration": 2.7648160457611084 }, { "auxiliary_loss_clip": 0.01589795, "auxiliary_loss_mlp": 0.0025033, "balance_loss_clip": 1.29170275, "balance_loss_mlp": 0.21595022, "epoch": 0.29267999398767475, "flos": 20704153944960.0, "grad_norm": 431.9207819175144, "language_loss": 0.79193532, "learning_rate": 3.3187953315956776e-06, "loss": 0.81033653, "num_input_tokens_seen": 104971915, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.34375, "step": 4868, "time_per_iteration": 2.6399261951446533 }, { "auxiliary_loss_clip": 0.01563624, "auxiliary_loss_mlp": 0.0024095, "balance_loss_clip": 1.27770591, "balance_loss_mlp": 0.21057546, "epoch": 0.2927401172403427, "flos": 18368667642240.0, "grad_norm": 3.686849990221056, "language_loss": 0.79627752, "learning_rate": 3.3185025115082566e-06, "loss": 0.81432331, "num_input_tokens_seen": 104991335, "router_z_loss_clip": 2.86132812, "router_z_loss_mlp": 0.30395508, "step": 4869, "time_per_iteration": 2.650019407272339 }, { "auxiliary_loss_clip": 0.0157325, "auxiliary_loss_mlp": 0.0023359, "balance_loss_clip": 1.28253591, "balance_loss_mlp": 0.19868596, "epoch": 0.2928002404930107, "flos": 26104041244800.0, "grad_norm": 9.510378483080297, "language_loss": 0.82524168, "learning_rate": 3.318209641423088e-06, "loss": 0.84331006, "num_input_tokens_seen": 105012015, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.34887695, "step": 4870, "time_per_iteration": 2.730355978012085 }, { "auxiliary_loss_clip": 0.01588317, "auxiliary_loss_mlp": 0.00264624, "balance_loss_clip": 1.29045367, "balance_loss_mlp": 0.23124561, "epoch": 0.29286036374567864, "flos": 21324726241920.0, "grad_norm": 6.00621640419454, "language_loss": 0.77302265, "learning_rate": 3.3179167213512777e-06, "loss": 0.79155207, "num_input_tokens_seen": 105031460, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.33398438, "step": 4871, "time_per_iteration": 4.191540718078613 }, { "auxiliary_loss_clip": 0.01561923, "auxiliary_loss_mlp": 0.00234273, "balance_loss_clip": 1.27241158, "balance_loss_mlp": 0.20294437, "epoch": 0.2929204869983466, "flos": 29569492569600.0, "grad_norm": 2.8572046450462216, "language_loss": 0.84662962, "learning_rate": 3.317623751303933e-06, "loss": 0.8645916, "num_input_tokens_seen": 105052965, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.31347656, "step": 4872, "time_per_iteration": 2.718592882156372 }, { "auxiliary_loss_clip": 0.01609723, "auxiliary_loss_mlp": 0.00269416, "balance_loss_clip": 1.30285883, "balance_loss_mlp": 0.23680004, "epoch": 0.2929806102510146, "flos": 19058259922560.0, "grad_norm": 3.7030218909441066, "language_loss": 0.78123587, "learning_rate": 3.317330731292164e-06, "loss": 0.80002725, "num_input_tokens_seen": 105071840, "router_z_loss_clip": 3.06835938, "router_z_loss_mlp": 0.32641602, "step": 4873, "time_per_iteration": 2.656299591064453 }, { "auxiliary_loss_clip": 0.01583732, "auxiliary_loss_mlp": 0.00275961, "balance_loss_clip": 1.28719759, "balance_loss_mlp": 0.24353588, "epoch": 0.29304073350368254, "flos": 21944221130880.0, "grad_norm": 14.459734128343657, "language_loss": 0.8505441, "learning_rate": 3.3170376613270812e-06, "loss": 0.86914098, "num_input_tokens_seen": 105089445, "router_z_loss_clip": 2.96679688, "router_z_loss_mlp": 0.32397461, "step": 4874, "time_per_iteration": 4.185588836669922 }, { "auxiliary_loss_clip": 0.01600913, "auxiliary_loss_mlp": 0.00261548, "balance_loss_clip": 1.29444075, "balance_loss_mlp": 0.22866964, "epoch": 0.2931008567563505, "flos": 15450818135040.0, "grad_norm": 77.0191525701655, "language_loss": 0.85980272, "learning_rate": 3.3167445414197985e-06, "loss": 0.87842727, "num_input_tokens_seen": 105106210, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.32910156, "step": 4875, "time_per_iteration": 2.6289570331573486 }, { "auxiliary_loss_clip": 0.01567314, "auxiliary_loss_mlp": 0.00245722, "balance_loss_clip": 1.2759335, "balance_loss_mlp": 0.21442959, "epoch": 0.29316098000901847, "flos": 16983162288000.0, "grad_norm": 2.9196948242157665, "language_loss": 0.76263922, "learning_rate": 3.316451371581431e-06, "loss": 0.78076959, "num_input_tokens_seen": 105124200, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.31311035, "step": 4876, "time_per_iteration": 2.617654323577881 }, { "auxiliary_loss_clip": 0.01579555, "auxiliary_loss_mlp": 0.00249227, "balance_loss_clip": 1.28276205, "balance_loss_mlp": 0.21665952, "epoch": 0.29322110326168643, "flos": 16357705741440.0, "grad_norm": 84.99190085639044, "language_loss": 0.90022069, "learning_rate": 3.316158151823096e-06, "loss": 0.91850853, "num_input_tokens_seen": 105140400, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.32568359, "step": 4877, "time_per_iteration": 4.060422658920288 }, { "auxiliary_loss_clip": 0.01596862, "auxiliary_loss_mlp": 0.00263144, "balance_loss_clip": 1.29449081, "balance_loss_mlp": 0.23214975, "epoch": 0.29328122651435445, "flos": 13990869843840.0, "grad_norm": 121.85008833822694, "language_loss": 0.79009748, "learning_rate": 3.315864882155911e-06, "loss": 0.80869758, "num_input_tokens_seen": 105157535, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.30957031, "step": 4878, "time_per_iteration": 2.626777410507202 }, { "auxiliary_loss_clip": 0.01603554, "auxiliary_loss_mlp": 0.00260925, "balance_loss_clip": 1.30484915, "balance_loss_mlp": 0.22852355, "epoch": 0.2933413497670224, "flos": 25264593423360.0, "grad_norm": 4.676252663622893, "language_loss": 0.81738377, "learning_rate": 3.3155715625909982e-06, "loss": 0.83602858, "num_input_tokens_seen": 105175185, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.32421875, "step": 4879, "time_per_iteration": 2.776411771774292 }, { "auxiliary_loss_clip": 0.01626965, "auxiliary_loss_mlp": 0.00263191, "balance_loss_clip": 1.31708217, "balance_loss_mlp": 0.23238684, "epoch": 0.2934014730196904, "flos": 32123746656000.0, "grad_norm": 3.393169267782962, "language_loss": 0.74953312, "learning_rate": 3.3152781931394803e-06, "loss": 0.76843464, "num_input_tokens_seen": 105194540, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.30810547, "step": 4880, "time_per_iteration": 2.7478859424591064 }, { "auxiliary_loss_clip": 0.01599791, "auxiliary_loss_mlp": 0.00266532, "balance_loss_clip": 1.29189992, "balance_loss_mlp": 0.23210463, "epoch": 0.29346159627235835, "flos": 24352498344960.0, "grad_norm": 7.115627434233218, "language_loss": 0.81892186, "learning_rate": 3.314984773812481e-06, "loss": 0.83758509, "num_input_tokens_seen": 105213215, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.34448242, "step": 4881, "time_per_iteration": 2.6834983825683594 }, { "auxiliary_loss_clip": 0.01619698, "auxiliary_loss_mlp": 0.00276802, "balance_loss_clip": 1.31173456, "balance_loss_mlp": 0.24299452, "epoch": 0.2935217195250263, "flos": 22746752749440.0, "grad_norm": 9.904588074905043, "language_loss": 0.88614523, "learning_rate": 3.314691304621127e-06, "loss": 0.90511024, "num_input_tokens_seen": 105231585, "router_z_loss_clip": 3.08203125, "router_z_loss_mlp": 0.33789062, "step": 4882, "time_per_iteration": 2.6612088680267334 }, { "auxiliary_loss_clip": 0.01636963, "auxiliary_loss_mlp": 0.00271059, "balance_loss_clip": 1.31880379, "balance_loss_mlp": 0.23801431, "epoch": 0.2935818427776943, "flos": 21725561088000.0, "grad_norm": 5.004506428808566, "language_loss": 0.84553063, "learning_rate": 3.314397785576548e-06, "loss": 0.86461079, "num_input_tokens_seen": 105250120, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.33056641, "step": 4883, "time_per_iteration": 2.704188585281372 }, { "auxiliary_loss_clip": 0.01633524, "auxiliary_loss_mlp": 0.00260989, "balance_loss_clip": 1.32149863, "balance_loss_mlp": 0.22470142, "epoch": 0.29364196603036224, "flos": 23804968354560.0, "grad_norm": 26.724161491589474, "language_loss": 0.99148232, "learning_rate": 3.3141042166898726e-06, "loss": 1.01042747, "num_input_tokens_seen": 105266065, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.36279297, "step": 4884, "time_per_iteration": 2.708307981491089 }, { "auxiliary_loss_clip": 0.01609257, "auxiliary_loss_mlp": 0.00261951, "balance_loss_clip": 1.30433774, "balance_loss_mlp": 0.22955039, "epoch": 0.2937020892830302, "flos": 23470064922240.0, "grad_norm": 19.61680200977082, "language_loss": 0.81169522, "learning_rate": 3.313810597972234e-06, "loss": 0.83040732, "num_input_tokens_seen": 105282155, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.32409668, "step": 4885, "time_per_iteration": 4.126938581466675 }, { "auxiliary_loss_clip": 0.01590925, "auxiliary_loss_mlp": 0.00241976, "balance_loss_clip": 1.28907287, "balance_loss_mlp": 0.20916972, "epoch": 0.2937622125356982, "flos": 24272740195200.0, "grad_norm": 8.852823358963004, "language_loss": 0.91809684, "learning_rate": 3.3135169294347655e-06, "loss": 0.93642581, "num_input_tokens_seen": 105299225, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.32788086, "step": 4886, "time_per_iteration": 2.7778303623199463 }, { "auxiliary_loss_clip": 0.01567158, "auxiliary_loss_mlp": 0.00239144, "balance_loss_clip": 1.26939094, "balance_loss_mlp": 0.20690984, "epoch": 0.29382233578836614, "flos": 20662461233280.0, "grad_norm": 3.1583114443597164, "language_loss": 0.84297299, "learning_rate": 3.313223211088603e-06, "loss": 0.86103594, "num_input_tokens_seen": 105315710, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.3223877, "step": 4887, "time_per_iteration": 2.674790382385254 }, { "auxiliary_loss_clip": 0.01602078, "auxiliary_loss_mlp": 0.00241221, "balance_loss_clip": 1.29781556, "balance_loss_mlp": 0.20803355, "epoch": 0.2938824590410341, "flos": 16545052103040.0, "grad_norm": 70.45312626741318, "language_loss": 0.8845064, "learning_rate": 3.3129294429448855e-06, "loss": 0.90293944, "num_input_tokens_seen": 105333505, "router_z_loss_clip": 3.03710938, "router_z_loss_mlp": 0.33178711, "step": 4888, "time_per_iteration": 2.6701881885528564 }, { "auxiliary_loss_clip": 0.01556964, "auxiliary_loss_mlp": 0.00233421, "balance_loss_clip": 1.26061451, "balance_loss_mlp": 0.20032865, "epoch": 0.29394258229370207, "flos": 37925474382720.0, "grad_norm": 2.072068413199802, "language_loss": 0.6124239, "learning_rate": 3.3126356250147517e-06, "loss": 0.63032764, "num_input_tokens_seen": 105355605, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.33129883, "step": 4889, "time_per_iteration": 2.80428409576416 }, { "auxiliary_loss_clip": 0.01560569, "auxiliary_loss_mlp": 0.0024648, "balance_loss_clip": 1.26463056, "balance_loss_mlp": 0.21541393, "epoch": 0.29400270554637004, "flos": 20044690197120.0, "grad_norm": 4.9476499333792505, "language_loss": 0.91354263, "learning_rate": 3.3123417573093434e-06, "loss": 0.93161309, "num_input_tokens_seen": 105374225, "router_z_loss_clip": 2.9609375, "router_z_loss_mlp": 0.31079102, "step": 4890, "time_per_iteration": 2.6812074184417725 }, { "auxiliary_loss_clip": 0.01585664, "auxiliary_loss_mlp": 0.00255856, "balance_loss_clip": 1.27331817, "balance_loss_mlp": 0.22088045, "epoch": 0.294062828799038, "flos": 15266380775040.0, "grad_norm": 2.763896941661604, "language_loss": 0.80707657, "learning_rate": 3.3120478398398046e-06, "loss": 0.82549179, "num_input_tokens_seen": 105391565, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.35009766, "step": 4891, "time_per_iteration": 2.691605806350708 }, { "auxiliary_loss_clip": 0.01565829, "auxiliary_loss_mlp": 0.00254881, "balance_loss_clip": 1.2630744, "balance_loss_mlp": 0.22256364, "epoch": 0.294122952051706, "flos": 22747147799040.0, "grad_norm": 195.97269192635505, "language_loss": 0.84506822, "learning_rate": 3.3117538726172797e-06, "loss": 0.86327541, "num_input_tokens_seen": 105409840, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.32324219, "step": 4892, "time_per_iteration": 2.65470552444458 }, { "auxiliary_loss_clip": 0.01544291, "auxiliary_loss_mlp": 0.00255681, "balance_loss_clip": 1.25093508, "balance_loss_mlp": 0.22187361, "epoch": 0.294183075304374, "flos": 24972891073920.0, "grad_norm": 14.981574963847866, "language_loss": 0.82390839, "learning_rate": 3.3114598556529164e-06, "loss": 0.8419081, "num_input_tokens_seen": 105428645, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.33813477, "step": 4893, "time_per_iteration": 2.6904594898223877 }, { "auxiliary_loss_clip": 0.01560109, "auxiliary_loss_mlp": 0.00244409, "balance_loss_clip": 1.26016784, "balance_loss_mlp": 0.21153116, "epoch": 0.29424319855704195, "flos": 30952986762240.0, "grad_norm": 7.943945494125445, "language_loss": 0.89851779, "learning_rate": 3.311165788957864e-06, "loss": 0.91656291, "num_input_tokens_seen": 105447480, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.32885742, "step": 4894, "time_per_iteration": 2.706162929534912 }, { "auxiliary_loss_clip": 0.01564191, "auxiliary_loss_mlp": 0.00246161, "balance_loss_clip": 1.25974727, "balance_loss_mlp": 0.21259144, "epoch": 0.2943033218097099, "flos": 15231583474560.0, "grad_norm": 554.7390429639485, "language_loss": 0.97925097, "learning_rate": 3.310871672543274e-06, "loss": 0.99735451, "num_input_tokens_seen": 105464600, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.33569336, "step": 4895, "time_per_iteration": 2.644888401031494 }, { "auxiliary_loss_clip": 0.01572693, "auxiliary_loss_mlp": 0.00284124, "balance_loss_clip": 1.26775539, "balance_loss_mlp": 0.25038785, "epoch": 0.2943634450623779, "flos": 21725884310400.0, "grad_norm": 3.745031108136354, "language_loss": 0.96112728, "learning_rate": 3.3105775064202982e-06, "loss": 0.97969544, "num_input_tokens_seen": 105481510, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.33740234, "step": 4896, "time_per_iteration": 2.74212384223938 }, { "auxiliary_loss_clip": 0.01555809, "auxiliary_loss_mlp": 0.00266837, "balance_loss_clip": 1.25291657, "balance_loss_mlp": 0.2322664, "epoch": 0.29442356831504585, "flos": 22602104680320.0, "grad_norm": 27.79940233167017, "language_loss": 0.79677582, "learning_rate": 3.3102832906000924e-06, "loss": 0.8150022, "num_input_tokens_seen": 105501390, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.34545898, "step": 4897, "time_per_iteration": 2.6942689418792725 }, { "auxiliary_loss_clip": 0.01572964, "auxiliary_loss_mlp": 0.00251935, "balance_loss_clip": 1.26245177, "balance_loss_mlp": 0.21912819, "epoch": 0.2944836915677138, "flos": 20011401267840.0, "grad_norm": 28.666296256917633, "language_loss": 0.83659971, "learning_rate": 3.309989025093813e-06, "loss": 0.85484862, "num_input_tokens_seen": 105519600, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.328125, "step": 4898, "time_per_iteration": 2.6339564323425293 }, { "auxiliary_loss_clip": 0.01586863, "auxiliary_loss_mlp": 0.00312794, "balance_loss_clip": 1.27552974, "balance_loss_mlp": 0.27564791, "epoch": 0.2945438148203818, "flos": 20045875345920.0, "grad_norm": 3.398844834666672, "language_loss": 0.82648867, "learning_rate": 3.309694709912618e-06, "loss": 0.84548527, "num_input_tokens_seen": 105535970, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.37133789, "step": 4899, "time_per_iteration": 2.6374709606170654 }, { "auxiliary_loss_clip": 0.01552376, "auxiliary_loss_mlp": 0.00305971, "balance_loss_clip": 1.25357175, "balance_loss_mlp": 0.27235356, "epoch": 0.29460393807304974, "flos": 23733542160000.0, "grad_norm": 87.18935639466329, "language_loss": 0.86218584, "learning_rate": 3.3094003450676685e-06, "loss": 0.88076931, "num_input_tokens_seen": 105556735, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.3359375, "step": 4900, "time_per_iteration": 2.6559653282165527 }, { "auxiliary_loss_clip": 0.01567218, "auxiliary_loss_mlp": 0.00284819, "balance_loss_clip": 1.26311111, "balance_loss_mlp": 0.25296587, "epoch": 0.2946640613257177, "flos": 14976079056000.0, "grad_norm": 36.04462282724581, "language_loss": 0.8727411, "learning_rate": 3.3091059305701268e-06, "loss": 0.89126152, "num_input_tokens_seen": 105574875, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.31835938, "step": 4901, "time_per_iteration": 2.6325416564941406 }, { "auxiliary_loss_clip": 0.01576879, "auxiliary_loss_mlp": 0.00277652, "balance_loss_clip": 1.2828455, "balance_loss_mlp": 0.24724174, "epoch": 0.2947241845783857, "flos": 24243904552320.0, "grad_norm": 4.442421748483573, "language_loss": 0.66384214, "learning_rate": 3.308811466431157e-06, "loss": 0.68238747, "num_input_tokens_seen": 105594225, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.30407715, "step": 4902, "time_per_iteration": 2.6815147399902344 }, { "auxiliary_loss_clip": 0.0156737, "auxiliary_loss_mlp": 0.00296715, "balance_loss_clip": 1.26849294, "balance_loss_mlp": 0.26574424, "epoch": 0.29478430783105364, "flos": 19938394874880.0, "grad_norm": 3.8208432734531836, "language_loss": 0.82519406, "learning_rate": 3.308516952661925e-06, "loss": 0.84383494, "num_input_tokens_seen": 105614000, "router_z_loss_clip": 2.99023438, "router_z_loss_mlp": 0.30981445, "step": 4903, "time_per_iteration": 2.7038252353668213 }, { "auxiliary_loss_clip": 0.01616479, "auxiliary_loss_mlp": 0.00297667, "balance_loss_clip": 1.30352068, "balance_loss_mlp": 0.26598161, "epoch": 0.2948444310837216, "flos": 27381347856000.0, "grad_norm": 666.165085415943, "language_loss": 0.70741051, "learning_rate": 3.3082223892736e-06, "loss": 0.72655201, "num_input_tokens_seen": 105634575, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.31713867, "step": 4904, "time_per_iteration": 2.7333292961120605 }, { "auxiliary_loss_clip": 0.0160765, "auxiliary_loss_mlp": 0.00308697, "balance_loss_clip": 1.29696178, "balance_loss_mlp": 0.27529496, "epoch": 0.2949045543363896, "flos": 23405462311680.0, "grad_norm": 3.399832230751802, "language_loss": 0.81180233, "learning_rate": 3.3079277762773496e-06, "loss": 0.83096588, "num_input_tokens_seen": 105654385, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.33422852, "step": 4905, "time_per_iteration": 2.817603349685669 }, { "auxiliary_loss_clip": 0.0161021, "auxiliary_loss_mlp": 0.00306947, "balance_loss_clip": 1.29880846, "balance_loss_mlp": 0.27442691, "epoch": 0.2949646775890576, "flos": 23951483930880.0, "grad_norm": 27.08106214973461, "language_loss": 0.88693029, "learning_rate": 3.3076331136843476e-06, "loss": 0.90610194, "num_input_tokens_seen": 105673570, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.32495117, "step": 4906, "time_per_iteration": 2.666475296020508 }, { "auxiliary_loss_clip": 0.01602357, "auxiliary_loss_mlp": 0.00265874, "balance_loss_clip": 1.29971695, "balance_loss_mlp": 0.23502308, "epoch": 0.29502480084172555, "flos": 22784315397120.0, "grad_norm": 11.488511977439128, "language_loss": 0.93487805, "learning_rate": 3.3073384015057667e-06, "loss": 0.95356041, "num_input_tokens_seen": 105691940, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.30859375, "step": 4907, "time_per_iteration": 2.7563719749450684 }, { "auxiliary_loss_clip": 0.01624353, "auxiliary_loss_mlp": 0.00302169, "balance_loss_clip": 1.30920923, "balance_loss_mlp": 0.26926696, "epoch": 0.2950849240943935, "flos": 19646656611840.0, "grad_norm": 11.44433723251566, "language_loss": 0.89514732, "learning_rate": 3.307043639752782e-06, "loss": 0.9144125, "num_input_tokens_seen": 105709825, "router_z_loss_clip": 3.15429688, "router_z_loss_mlp": 0.32910156, "step": 4908, "time_per_iteration": 2.652764320373535 }, { "auxiliary_loss_clip": 0.01654949, "auxiliary_loss_mlp": 0.00082233, "balance_loss_clip": 1.34268427, "balance_loss_mlp": 0.07198129, "epoch": 0.2951450473470615, "flos": 71002829260800.0, "grad_norm": 0.7920966601209799, "language_loss": 0.57292002, "learning_rate": 3.3067488284365728e-06, "loss": 0.5902918, "num_input_tokens_seen": 105766880, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.10253906, "step": 4909, "time_per_iteration": 3.0749146938323975 }, { "auxiliary_loss_clip": 0.01591594, "auxiliary_loss_mlp": 0.00303792, "balance_loss_clip": 1.28472471, "balance_loss_mlp": 0.27086619, "epoch": 0.29520517059972945, "flos": 22966310632320.0, "grad_norm": 4.519770571304, "language_loss": 0.92212129, "learning_rate": 3.3064539675683163e-06, "loss": 0.94107509, "num_input_tokens_seen": 105786875, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.32958984, "step": 4910, "time_per_iteration": 2.7072455883026123 }, { "auxiliary_loss_clip": 0.0160599, "auxiliary_loss_mlp": 0.00305976, "balance_loss_clip": 1.29235601, "balance_loss_mlp": 0.27750924, "epoch": 0.2952652938523974, "flos": 20485673470080.0, "grad_norm": 3.7948636123712816, "language_loss": 0.80481577, "learning_rate": 3.3061590571591946e-06, "loss": 0.82393551, "num_input_tokens_seen": 105805315, "router_z_loss_clip": 3.13867188, "router_z_loss_mlp": 0.28466797, "step": 4911, "time_per_iteration": 2.6546552181243896 }, { "auxiliary_loss_clip": 0.01607194, "auxiliary_loss_mlp": 0.00250682, "balance_loss_clip": 1.30395615, "balance_loss_mlp": 0.22185689, "epoch": 0.2953254171050654, "flos": 19646584784640.0, "grad_norm": 175.54456370892424, "language_loss": 0.9602294, "learning_rate": 3.3058640972203904e-06, "loss": 0.9788081, "num_input_tokens_seen": 105825125, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.28857422, "step": 4912, "time_per_iteration": 2.6215813159942627 }, { "auxiliary_loss_clip": 0.015909, "auxiliary_loss_mlp": 0.00309109, "balance_loss_clip": 1.28518355, "balance_loss_mlp": 0.27661273, "epoch": 0.29538554035773334, "flos": 22747973811840.0, "grad_norm": 1.5758892268386497, "language_loss": 0.88258433, "learning_rate": 3.3055690877630894e-06, "loss": 0.90158445, "num_input_tokens_seen": 105846085, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.32495117, "step": 4913, "time_per_iteration": 4.183511257171631 }, { "auxiliary_loss_clip": 0.01593531, "auxiliary_loss_mlp": 0.00300485, "balance_loss_clip": 1.29208779, "balance_loss_mlp": 0.27214882, "epoch": 0.2954456636104013, "flos": 21871861182720.0, "grad_norm": 1.852993788676443, "language_loss": 0.83459616, "learning_rate": 3.3052740287984765e-06, "loss": 0.85353631, "num_input_tokens_seen": 105865400, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.28344727, "step": 4914, "time_per_iteration": 2.6378183364868164 }, { "auxiliary_loss_clip": 0.01589841, "auxiliary_loss_mlp": 0.00290538, "balance_loss_clip": 1.28439045, "balance_loss_mlp": 0.25928137, "epoch": 0.2955057868630693, "flos": 40442560871040.0, "grad_norm": 38.62096331755229, "language_loss": 0.87942666, "learning_rate": 3.3049789203377424e-06, "loss": 0.89823043, "num_input_tokens_seen": 105887920, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.31225586, "step": 4915, "time_per_iteration": 2.7893927097320557 }, { "auxiliary_loss_clip": 0.01602097, "auxiliary_loss_mlp": 0.00300702, "balance_loss_clip": 1.29442477, "balance_loss_mlp": 0.27191314, "epoch": 0.29556591011573724, "flos": 22564506119040.0, "grad_norm": 150.3059342064791, "language_loss": 0.90921313, "learning_rate": 3.3046837623920772e-06, "loss": 0.92824113, "num_input_tokens_seen": 105904035, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.2878418, "step": 4916, "time_per_iteration": 4.11733603477478 }, { "auxiliary_loss_clip": 0.01594235, "auxiliary_loss_mlp": 0.00266542, "balance_loss_clip": 1.28754973, "balance_loss_mlp": 0.23751502, "epoch": 0.2956260333684052, "flos": 22089300163200.0, "grad_norm": 183.88559603613297, "language_loss": 0.76850086, "learning_rate": 3.3043885549726723e-06, "loss": 0.78710866, "num_input_tokens_seen": 105922685, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.29040527, "step": 4917, "time_per_iteration": 2.6759583950042725 }, { "auxiliary_loss_clip": 0.0160035, "auxiliary_loss_mlp": 0.00293559, "balance_loss_clip": 1.28847826, "balance_loss_mlp": 0.26397097, "epoch": 0.2956861566210732, "flos": 16435488643200.0, "grad_norm": 8.533540732928461, "language_loss": 0.96433914, "learning_rate": 3.3040932980907226e-06, "loss": 0.98327827, "num_input_tokens_seen": 105940425, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.29589844, "step": 4918, "time_per_iteration": 2.6252222061157227 }, { "auxiliary_loss_clip": 0.01613226, "auxiliary_loss_mlp": 0.00312972, "balance_loss_clip": 1.29670823, "balance_loss_mlp": 0.27859169, "epoch": 0.2957462798737412, "flos": 25812087500160.0, "grad_norm": 23.457157713718594, "language_loss": 0.79637992, "learning_rate": 3.303797991757425e-06, "loss": 0.81564188, "num_input_tokens_seen": 105960550, "router_z_loss_clip": 3.16992188, "router_z_loss_mlp": 0.34399414, "step": 4919, "time_per_iteration": 4.086320638656616 }, { "auxiliary_loss_clip": 0.01604819, "auxiliary_loss_mlp": 0.00272211, "balance_loss_clip": 1.29391265, "balance_loss_mlp": 0.24133614, "epoch": 0.29580640312640916, "flos": 16690849407360.0, "grad_norm": 3.6834424393955145, "language_loss": 0.82658887, "learning_rate": 3.3035026359839763e-06, "loss": 0.84535921, "num_input_tokens_seen": 105978820, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.30883789, "step": 4920, "time_per_iteration": 2.6230740547180176 }, { "auxiliary_loss_clip": 0.01635977, "auxiliary_loss_mlp": 0.00303199, "balance_loss_clip": 1.31319344, "balance_loss_mlp": 0.27415985, "epoch": 0.2958665263790771, "flos": 23945594100480.0, "grad_norm": 9.423401882495394, "language_loss": 0.7804265, "learning_rate": 3.3032072307815774e-06, "loss": 0.79981828, "num_input_tokens_seen": 105997545, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.2902832, "step": 4921, "time_per_iteration": 2.6944007873535156 }, { "auxiliary_loss_clip": 0.01650793, "auxiliary_loss_mlp": 0.00306487, "balance_loss_clip": 1.32203031, "balance_loss_mlp": 0.2718451, "epoch": 0.2959266496317451, "flos": 18478410670080.0, "grad_norm": 24.727532860372353, "language_loss": 0.84502423, "learning_rate": 3.3029117761614298e-06, "loss": 0.86459708, "num_input_tokens_seen": 106015320, "router_z_loss_clip": 3.28515625, "router_z_loss_mlp": 0.34643555, "step": 4922, "time_per_iteration": 2.665149211883545 }, { "auxiliary_loss_clip": 0.01649781, "auxiliary_loss_mlp": 0.00307338, "balance_loss_clip": 1.32230449, "balance_loss_mlp": 0.27565229, "epoch": 0.29598677288441305, "flos": 25957489754880.0, "grad_norm": 13.845178702256723, "language_loss": 0.83494866, "learning_rate": 3.302616272134737e-06, "loss": 0.85451984, "num_input_tokens_seen": 106034555, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.31665039, "step": 4923, "time_per_iteration": 2.694350481033325 }, { "auxiliary_loss_clip": 0.01658387, "auxiliary_loss_mlp": 0.00285673, "balance_loss_clip": 1.33211541, "balance_loss_mlp": 0.25591865, "epoch": 0.296046896137081, "flos": 25155999630720.0, "grad_norm": 8.089433610443926, "language_loss": 0.92656386, "learning_rate": 3.3023207187127042e-06, "loss": 0.94600451, "num_input_tokens_seen": 106054200, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.29760742, "step": 4924, "time_per_iteration": 2.7178127765655518 }, { "auxiliary_loss_clip": 0.01652166, "auxiliary_loss_mlp": 0.00259242, "balance_loss_clip": 1.32675529, "balance_loss_mlp": 0.23122747, "epoch": 0.296107019389749, "flos": 21761148487680.0, "grad_norm": 9.092421568828517, "language_loss": 0.86735064, "learning_rate": 3.3020251159065396e-06, "loss": 0.88646472, "num_input_tokens_seen": 106074700, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.27978516, "step": 4925, "time_per_iteration": 2.721133232116699 }, { "auxiliary_loss_clip": 0.01688705, "auxiliary_loss_mlp": 0.00268822, "balance_loss_clip": 1.3541553, "balance_loss_mlp": 0.2379947, "epoch": 0.29616714264241695, "flos": 17960039544960.0, "grad_norm": 3.1811037333663292, "language_loss": 0.95747721, "learning_rate": 3.301729463727452e-06, "loss": 0.97705245, "num_input_tokens_seen": 106091415, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.30834961, "step": 4926, "time_per_iteration": 2.881401300430298 }, { "auxiliary_loss_clip": 0.01692702, "auxiliary_loss_mlp": 0.00289708, "balance_loss_clip": 1.35046053, "balance_loss_mlp": 0.25928593, "epoch": 0.2962272658950849, "flos": 15012779777280.0, "grad_norm": 6.1646601781246035, "language_loss": 0.93989438, "learning_rate": 3.3014337621866527e-06, "loss": 0.95971847, "num_input_tokens_seen": 106109135, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.30395508, "step": 4927, "time_per_iteration": 4.056729555130005 }, { "auxiliary_loss_clip": 0.01682744, "auxiliary_loss_mlp": 0.0025731, "balance_loss_clip": 1.3491497, "balance_loss_mlp": 0.22927216, "epoch": 0.2962873891477529, "flos": 14720861946240.0, "grad_norm": 1.6235061446641497, "language_loss": 0.85556185, "learning_rate": 3.3011380112953553e-06, "loss": 0.87496239, "num_input_tokens_seen": 106125750, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.28039551, "step": 4928, "time_per_iteration": 2.638834238052368 }, { "auxiliary_loss_clip": 0.01711647, "auxiliary_loss_mlp": 0.00335023, "balance_loss_clip": 1.36009383, "balance_loss_mlp": 0.29947478, "epoch": 0.29634751240042084, "flos": 26723787528960.0, "grad_norm": 103.2197233480817, "language_loss": 0.81449342, "learning_rate": 3.300842211064773e-06, "loss": 0.8349601, "num_input_tokens_seen": 106142835, "router_z_loss_clip": 3.51171875, "router_z_loss_mlp": 0.35546875, "step": 4929, "time_per_iteration": 2.713148355484009 }, { "auxiliary_loss_clip": 0.01730431, "auxiliary_loss_mlp": 0.00310093, "balance_loss_clip": 1.37953722, "balance_loss_mlp": 0.28062445, "epoch": 0.2964076356530888, "flos": 14571293713920.0, "grad_norm": 26.086242054240792, "language_loss": 0.81994665, "learning_rate": 3.3005463615061246e-06, "loss": 0.84035188, "num_input_tokens_seen": 106160680, "router_z_loss_clip": 3.5078125, "router_z_loss_mlp": 0.29467773, "step": 4930, "time_per_iteration": 2.636342763900757 }, { "auxiliary_loss_clip": 0.01756445, "auxiliary_loss_mlp": 0.00084774, "balance_loss_clip": 1.35945892, "balance_loss_mlp": 0.07371178, "epoch": 0.29646775890575683, "flos": 63104315063040.0, "grad_norm": 0.8395289189806204, "language_loss": 0.60482299, "learning_rate": 3.3002504626306275e-06, "loss": 0.62323523, "num_input_tokens_seen": 106224415, "router_z_loss_clip": 3.96875, "router_z_loss_mlp": 0.11083984, "step": 4931, "time_per_iteration": 3.077744722366333 }, { "auxiliary_loss_clip": 0.0169344, "auxiliary_loss_mlp": 0.00050582, "balance_loss_clip": 1.31790805, "balance_loss_mlp": 0.042333, "epoch": 0.2965278821584248, "flos": 63067686168960.0, "grad_norm": 0.7490603683206308, "language_loss": 0.52329993, "learning_rate": 3.2999545144495023e-06, "loss": 0.54074013, "num_input_tokens_seen": 106279140, "router_z_loss_clip": 3.75, "router_z_loss_mlp": 0.08251953, "step": 4932, "time_per_iteration": 3.013166666030884 }, { "auxiliary_loss_clip": 0.01717327, "auxiliary_loss_mlp": 0.00306046, "balance_loss_clip": 1.36932659, "balance_loss_mlp": 0.2762194, "epoch": 0.29658800541109276, "flos": 23768734510080.0, "grad_norm": 5.006329736980159, "language_loss": 0.88136572, "learning_rate": 3.299658516973972e-06, "loss": 0.90159947, "num_input_tokens_seen": 106298190, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.29846191, "step": 4933, "time_per_iteration": 2.686816692352295 }, { "auxiliary_loss_clip": 0.01704717, "auxiliary_loss_mlp": 0.00329146, "balance_loss_clip": 1.36000872, "balance_loss_mlp": 0.2993196, "epoch": 0.2966481286637607, "flos": 23988543788160.0, "grad_norm": 12.471326253350428, "language_loss": 0.81125712, "learning_rate": 3.299362470215261e-06, "loss": 0.83159572, "num_input_tokens_seen": 106319065, "router_z_loss_clip": 3.45117188, "router_z_loss_mlp": 0.2980957, "step": 4934, "time_per_iteration": 2.852874755859375 }, { "auxiliary_loss_clip": 0.01725719, "auxiliary_loss_mlp": 0.00360179, "balance_loss_clip": 1.37859702, "balance_loss_mlp": 0.3284691, "epoch": 0.2967082519164287, "flos": 17165157523200.0, "grad_norm": 13.5330203989292, "language_loss": 0.70213258, "learning_rate": 3.299066374184594e-06, "loss": 0.72299159, "num_input_tokens_seen": 106338040, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.31726074, "step": 4935, "time_per_iteration": 2.677365303039551 }, { "auxiliary_loss_clip": 0.01738119, "auxiliary_loss_mlp": 0.00343072, "balance_loss_clip": 1.38659739, "balance_loss_mlp": 0.31059954, "epoch": 0.29676837516909665, "flos": 29387712816000.0, "grad_norm": 1.9738736223028253, "language_loss": 0.85467857, "learning_rate": 3.2987702288932e-06, "loss": 0.87549049, "num_input_tokens_seen": 106358900, "router_z_loss_clip": 3.51367188, "router_z_loss_mlp": 0.32470703, "step": 4936, "time_per_iteration": 2.6857948303222656 }, { "auxiliary_loss_clip": 0.01720873, "auxiliary_loss_mlp": 0.00359349, "balance_loss_clip": 1.37080598, "balance_loss_mlp": 0.32627994, "epoch": 0.2968284984217646, "flos": 34751222616960.0, "grad_norm": 4.919400437168214, "language_loss": 0.80583334, "learning_rate": 3.298474034352309e-06, "loss": 0.8266356, "num_input_tokens_seen": 106381805, "router_z_loss_clip": 3.5, "router_z_loss_mlp": 0.33056641, "step": 4937, "time_per_iteration": 2.7490267753601074 }, { "auxiliary_loss_clip": 0.01711689, "auxiliary_loss_mlp": 0.0035362, "balance_loss_clip": 1.36515403, "balance_loss_mlp": 0.32143331, "epoch": 0.2968886216744326, "flos": 21544104556800.0, "grad_norm": 83.78442434392386, "language_loss": 0.82916129, "learning_rate": 3.2981777905731526e-06, "loss": 0.84981441, "num_input_tokens_seen": 106402365, "router_z_loss_clip": 3.4609375, "router_z_loss_mlp": 0.32177734, "step": 4938, "time_per_iteration": 2.6939804553985596 }, { "auxiliary_loss_clip": 0.01728143, "auxiliary_loss_mlp": 0.00373097, "balance_loss_clip": 1.37632906, "balance_loss_mlp": 0.34050527, "epoch": 0.29694874492710055, "flos": 12787323811200.0, "grad_norm": 62.41011748030156, "language_loss": 0.85228348, "learning_rate": 3.297881497566964e-06, "loss": 0.87329578, "num_input_tokens_seen": 106419800, "router_z_loss_clip": 3.51953125, "router_z_loss_mlp": 0.32568359, "step": 4939, "time_per_iteration": 2.642885446548462 }, { "auxiliary_loss_clip": 0.01711265, "auxiliary_loss_mlp": 0.00379214, "balance_loss_clip": 1.36799002, "balance_loss_mlp": 0.34593093, "epoch": 0.2970088681797685, "flos": 24569973239040.0, "grad_norm": 25.005099773318666, "language_loss": 0.84399682, "learning_rate": 3.297585155344979e-06, "loss": 0.86490154, "num_input_tokens_seen": 106440300, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.33300781, "step": 4940, "time_per_iteration": 2.6727726459503174 }, { "auxiliary_loss_clip": 0.01701915, "auxiliary_loss_mlp": 0.00349592, "balance_loss_clip": 1.35936463, "balance_loss_mlp": 0.31568897, "epoch": 0.2970689914324365, "flos": 23659171050240.0, "grad_norm": 2.662894199931893, "language_loss": 0.82166958, "learning_rate": 3.297288763918435e-06, "loss": 0.84218466, "num_input_tokens_seen": 106460035, "router_z_loss_clip": 3.42578125, "router_z_loss_mlp": 0.33911133, "step": 4941, "time_per_iteration": 2.6557960510253906 }, { "auxiliary_loss_clip": 0.01754728, "auxiliary_loss_mlp": 0.00355255, "balance_loss_clip": 1.3927412, "balance_loss_mlp": 0.32254425, "epoch": 0.29712911468510445, "flos": 39670301439360.0, "grad_norm": 7.414456211423165, "language_loss": 0.8236354, "learning_rate": 3.2969923232985712e-06, "loss": 0.84473521, "num_input_tokens_seen": 106481095, "router_z_loss_clip": 3.61914062, "router_z_loss_mlp": 0.32714844, "step": 4942, "time_per_iteration": 2.784954786300659 }, { "auxiliary_loss_clip": 0.01716824, "auxiliary_loss_mlp": 0.00417176, "balance_loss_clip": 1.36406326, "balance_loss_mlp": 0.37883782, "epoch": 0.2971892379377724, "flos": 26395312631040.0, "grad_norm": 2.5255736836105753, "language_loss": 0.77059448, "learning_rate": 3.2966958334966287e-06, "loss": 0.79193449, "num_input_tokens_seen": 106501590, "router_z_loss_clip": 3.52929688, "router_z_loss_mlp": 0.38354492, "step": 4943, "time_per_iteration": 2.6953606605529785 }, { "auxiliary_loss_clip": 0.01728877, "auxiliary_loss_mlp": 0.0035889, "balance_loss_clip": 1.37924385, "balance_loss_mlp": 0.32734677, "epoch": 0.2972493611904404, "flos": 17603195880960.0, "grad_norm": 24.355685272639125, "language_loss": 0.8683399, "learning_rate": 3.2963992945238497e-06, "loss": 0.8892175, "num_input_tokens_seen": 106519430, "router_z_loss_clip": 3.49609375, "router_z_loss_mlp": 0.31542969, "step": 4944, "time_per_iteration": 2.621790647506714 }, { "auxiliary_loss_clip": 0.01685646, "auxiliary_loss_mlp": 0.00328918, "balance_loss_clip": 1.34927201, "balance_loss_mlp": 0.29902041, "epoch": 0.2973094844431084, "flos": 20412774817920.0, "grad_norm": 14.222754648308573, "language_loss": 0.90426636, "learning_rate": 3.2961027063914795e-06, "loss": 0.92441201, "num_input_tokens_seen": 106535870, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.29907227, "step": 4945, "time_per_iteration": 2.6186470985412598 }, { "auxiliary_loss_clip": 0.01695984, "auxiliary_loss_mlp": 0.00336795, "balance_loss_clip": 1.36048913, "balance_loss_mlp": 0.30698121, "epoch": 0.29736960769577636, "flos": 17493488766720.0, "grad_norm": 7.789959551905341, "language_loss": 0.7390101, "learning_rate": 3.2958060691107654e-06, "loss": 0.7593379, "num_input_tokens_seen": 106553560, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.29797363, "step": 4946, "time_per_iteration": 2.622807502746582 }, { "auxiliary_loss_clip": 0.01683624, "auxiliary_loss_mlp": 0.00324586, "balance_loss_clip": 1.34923196, "balance_loss_mlp": 0.29202995, "epoch": 0.2974297309484443, "flos": 26103969417600.0, "grad_norm": 112.10609749797582, "language_loss": 0.80707502, "learning_rate": 3.2955093826929547e-06, "loss": 0.82715702, "num_input_tokens_seen": 106574115, "router_z_loss_clip": 3.34179688, "router_z_loss_mlp": 0.3260498, "step": 4947, "time_per_iteration": 2.786508321762085 }, { "auxiliary_loss_clip": 0.01694662, "auxiliary_loss_mlp": 0.00366281, "balance_loss_clip": 1.35385048, "balance_loss_mlp": 0.32927811, "epoch": 0.2974898542011123, "flos": 25666433850240.0, "grad_norm": 2.333096832391043, "language_loss": 0.80562615, "learning_rate": 3.2952126471492985e-06, "loss": 0.82623553, "num_input_tokens_seen": 106593070, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.36987305, "step": 4948, "time_per_iteration": 2.740069627761841 }, { "auxiliary_loss_clip": 0.0169712, "auxiliary_loss_mlp": 0.00339116, "balance_loss_clip": 1.36234105, "balance_loss_mlp": 0.3093375, "epoch": 0.29754997745378026, "flos": 18661339658880.0, "grad_norm": 12.560784986578224, "language_loss": 0.89112902, "learning_rate": 3.2949158624910497e-06, "loss": 0.91149139, "num_input_tokens_seen": 106610695, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.29772949, "step": 4949, "time_per_iteration": 2.6404497623443604 }, { "auxiliary_loss_clip": 0.01684274, "auxiliary_loss_mlp": 0.00354539, "balance_loss_clip": 1.34668064, "balance_loss_mlp": 0.32211423, "epoch": 0.2976101007064482, "flos": 22274599449600.0, "grad_norm": 11.434321101350271, "language_loss": 0.77820522, "learning_rate": 3.2946190287294603e-06, "loss": 0.79859328, "num_input_tokens_seen": 106631300, "router_z_loss_clip": 3.37304688, "router_z_loss_mlp": 0.32470703, "step": 4950, "time_per_iteration": 2.737156867980957 }, { "auxiliary_loss_clip": 0.01666781, "auxiliary_loss_mlp": 0.00333309, "balance_loss_clip": 1.34051967, "balance_loss_mlp": 0.30467492, "epoch": 0.2976702239591162, "flos": 21945657674880.0, "grad_norm": 60.689704056939306, "language_loss": 0.88564229, "learning_rate": 3.294322145875789e-06, "loss": 0.90564322, "num_input_tokens_seen": 106650065, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.28662109, "step": 4951, "time_per_iteration": 2.6988019943237305 }, { "auxiliary_loss_clip": 0.01686661, "auxiliary_loss_mlp": 0.00339625, "balance_loss_clip": 1.34840012, "balance_loss_mlp": 0.30624673, "epoch": 0.29773034721178415, "flos": 24637197542400.0, "grad_norm": 4.13527628266465, "language_loss": 0.81783247, "learning_rate": 3.2940252139412912e-06, "loss": 0.83809537, "num_input_tokens_seen": 106668230, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.33349609, "step": 4952, "time_per_iteration": 2.6742208003997803 }, { "auxiliary_loss_clip": 0.01699122, "auxiliary_loss_mlp": 0.00354355, "balance_loss_clip": 1.36175394, "balance_loss_mlp": 0.32059479, "epoch": 0.2977904704644521, "flos": 20557566541440.0, "grad_norm": 2.0767210987190334, "language_loss": 0.90701425, "learning_rate": 3.293728232937228e-06, "loss": 0.927549, "num_input_tokens_seen": 106687785, "router_z_loss_clip": 3.37304688, "router_z_loss_mlp": 0.33764648, "step": 4953, "time_per_iteration": 2.6548681259155273 }, { "auxiliary_loss_clip": 0.01704368, "auxiliary_loss_mlp": 0.00342496, "balance_loss_clip": 1.36177695, "balance_loss_mlp": 0.31115553, "epoch": 0.2978505937171201, "flos": 18916449027840.0, "grad_norm": 2.1466075183501694, "language_loss": 0.83146346, "learning_rate": 3.2934312028748597e-06, "loss": 0.85193217, "num_input_tokens_seen": 106706875, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.31335449, "step": 4954, "time_per_iteration": 2.636214017868042 }, { "auxiliary_loss_clip": 0.01691459, "auxiliary_loss_mlp": 0.00356159, "balance_loss_clip": 1.35653162, "balance_loss_mlp": 0.32406762, "epoch": 0.29791071696978805, "flos": 19317750750720.0, "grad_norm": 5.496937605806224, "language_loss": 0.81063008, "learning_rate": 3.293134123765452e-06, "loss": 0.83110631, "num_input_tokens_seen": 106725105, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.32043457, "step": 4955, "time_per_iteration": 4.060130596160889 }, { "auxiliary_loss_clip": 0.01694551, "auxiliary_loss_mlp": 0.00364888, "balance_loss_clip": 1.36031723, "balance_loss_mlp": 0.33170056, "epoch": 0.297970840222456, "flos": 18806813740800.0, "grad_norm": 6.964146340119644, "language_loss": 0.78587067, "learning_rate": 3.2928369956202684e-06, "loss": 0.80646509, "num_input_tokens_seen": 106744780, "router_z_loss_clip": 3.34179688, "router_z_loss_mlp": 0.33166504, "step": 4956, "time_per_iteration": 2.663576126098633 }, { "auxiliary_loss_clip": 0.01720805, "auxiliary_loss_mlp": 0.00397301, "balance_loss_clip": 1.3787148, "balance_loss_mlp": 0.36177617, "epoch": 0.298030963475124, "flos": 22852760762880.0, "grad_norm": 32.935383236697355, "language_loss": 0.84394133, "learning_rate": 3.2925398184505754e-06, "loss": 0.86512232, "num_input_tokens_seen": 106764670, "router_z_loss_clip": 3.41992188, "router_z_loss_mlp": 0.35546875, "step": 4957, "time_per_iteration": 2.6786117553710938 }, { "auxiliary_loss_clip": 0.01708245, "auxiliary_loss_mlp": 0.00331537, "balance_loss_clip": 1.37632608, "balance_loss_mlp": 0.3008762, "epoch": 0.298091086727792, "flos": 21868485304320.0, "grad_norm": 5.816596348544819, "language_loss": 0.75882053, "learning_rate": 3.2922425922676437e-06, "loss": 0.77921832, "num_input_tokens_seen": 106783695, "router_z_loss_clip": 3.32226562, "router_z_loss_mlp": 0.30639648, "step": 4958, "time_per_iteration": 4.065936088562012 }, { "auxiliary_loss_clip": 0.01683528, "auxiliary_loss_mlp": 0.00356702, "balance_loss_clip": 1.35480547, "balance_loss_mlp": 0.32546926, "epoch": 0.29815120998045996, "flos": 21175014355200.0, "grad_norm": 2.363870600071229, "language_loss": 0.83223379, "learning_rate": 3.291945317082743e-06, "loss": 0.85263604, "num_input_tokens_seen": 106803150, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.3125, "step": 4959, "time_per_iteration": 2.6780941486358643 }, { "auxiliary_loss_clip": 0.01690392, "auxiliary_loss_mlp": 0.00346485, "balance_loss_clip": 1.3516134, "balance_loss_mlp": 0.31743366, "epoch": 0.29821133323312793, "flos": 19896271200000.0, "grad_norm": 6.086673121639434, "language_loss": 0.85522449, "learning_rate": 3.291647992907147e-06, "loss": 0.8755933, "num_input_tokens_seen": 106820705, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.29040527, "step": 4960, "time_per_iteration": 2.700474262237549 }, { "auxiliary_loss_clip": 0.0168649, "auxiliary_loss_mlp": 0.00408454, "balance_loss_clip": 1.3477627, "balance_loss_mlp": 0.36994928, "epoch": 0.2982714564857959, "flos": 12750766744320.0, "grad_norm": 23.379369036407812, "language_loss": 0.82705277, "learning_rate": 3.291350619752129e-06, "loss": 0.84800225, "num_input_tokens_seen": 106837335, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.38500977, "step": 4961, "time_per_iteration": 4.192182302474976 }, { "auxiliary_loss_clip": 0.01700282, "auxiliary_loss_mlp": 0.00375028, "balance_loss_clip": 1.36463237, "balance_loss_mlp": 0.34133989, "epoch": 0.29833157973846386, "flos": 22271905929600.0, "grad_norm": 265.8479635417841, "language_loss": 0.68132955, "learning_rate": 3.291053197628967e-06, "loss": 0.70208269, "num_input_tokens_seen": 106856250, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.33666992, "step": 4962, "time_per_iteration": 2.657818555831909 }, { "auxiliary_loss_clip": 0.01669434, "auxiliary_loss_mlp": 0.00323708, "balance_loss_clip": 1.34221232, "balance_loss_mlp": 0.29190347, "epoch": 0.2983917029911318, "flos": 15372999319680.0, "grad_norm": 62.37006561410585, "language_loss": 0.88467997, "learning_rate": 3.2907557265489375e-06, "loss": 0.90461135, "num_input_tokens_seen": 106873370, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.31811523, "step": 4963, "time_per_iteration": 2.605969190597534 }, { "auxiliary_loss_clip": 0.01712563, "auxiliary_loss_mlp": 0.00346866, "balance_loss_clip": 1.37072539, "balance_loss_mlp": 0.31744492, "epoch": 0.2984518262437998, "flos": 15377632174080.0, "grad_norm": 9.146345555641421, "language_loss": 0.75778848, "learning_rate": 3.290458206523322e-06, "loss": 0.77838278, "num_input_tokens_seen": 106890330, "router_z_loss_clip": 3.41796875, "router_z_loss_mlp": 0.29443359, "step": 4964, "time_per_iteration": 2.618959903717041 }, { "auxiliary_loss_clip": 0.01650455, "auxiliary_loss_mlp": 0.00318648, "balance_loss_clip": 1.32756376, "balance_loss_mlp": 0.28946561, "epoch": 0.29851194949646775, "flos": 18108458542080.0, "grad_norm": 9.454379289133874, "language_loss": 0.77372408, "learning_rate": 3.2901606375634015e-06, "loss": 0.79341513, "num_input_tokens_seen": 106909190, "router_z_loss_clip": 3.22851562, "router_z_loss_mlp": 0.29199219, "step": 4965, "time_per_iteration": 2.617427349090576 }, { "auxiliary_loss_clip": 0.0164847, "auxiliary_loss_mlp": 0.00331337, "balance_loss_clip": 1.32997966, "balance_loss_mlp": 0.3005099, "epoch": 0.2985720727491357, "flos": 22018233104640.0, "grad_norm": 7.415887358225868, "language_loss": 0.74740005, "learning_rate": 3.289863019680461e-06, "loss": 0.76719815, "num_input_tokens_seen": 106927825, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.30822754, "step": 4966, "time_per_iteration": 2.6575918197631836 }, { "auxiliary_loss_clip": 0.01676717, "auxiliary_loss_mlp": 0.00385841, "balance_loss_clip": 1.35216451, "balance_loss_mlp": 0.35322571, "epoch": 0.2986321960018037, "flos": 13041355772160.0, "grad_norm": 153.47733363732107, "language_loss": 0.83440101, "learning_rate": 3.289565352885785e-06, "loss": 0.85502666, "num_input_tokens_seen": 106943155, "router_z_loss_clip": 3.24804688, "router_z_loss_mlp": 0.32617188, "step": 4967, "time_per_iteration": 2.5925536155700684 }, { "auxiliary_loss_clip": 0.01663894, "auxiliary_loss_mlp": 0.00360224, "balance_loss_clip": 1.33939183, "balance_loss_mlp": 0.3293013, "epoch": 0.29869231925447165, "flos": 14465034305280.0, "grad_norm": 48.10295530623364, "language_loss": 0.78370941, "learning_rate": 3.2892676371906614e-06, "loss": 0.80395061, "num_input_tokens_seen": 106960295, "router_z_loss_clip": 3.2421875, "router_z_loss_mlp": 0.30932617, "step": 4968, "time_per_iteration": 2.620279312133789 }, { "auxiliary_loss_clip": 0.01669872, "auxiliary_loss_mlp": 0.00343684, "balance_loss_clip": 1.34404039, "balance_loss_mlp": 0.30942309, "epoch": 0.2987524425071396, "flos": 31650228639360.0, "grad_norm": 98.41468214947817, "language_loss": 0.82691276, "learning_rate": 3.2889698726063805e-06, "loss": 0.84704834, "num_input_tokens_seen": 106982870, "router_z_loss_clip": 3.2578125, "router_z_loss_mlp": 0.34277344, "step": 4969, "time_per_iteration": 4.149555683135986 }, { "auxiliary_loss_clip": 0.01659601, "auxiliary_loss_mlp": 0.0029979, "balance_loss_clip": 1.33852386, "balance_loss_mlp": 0.27231205, "epoch": 0.2988125657598076, "flos": 21433427775360.0, "grad_norm": 37.36652711622734, "language_loss": 0.77078378, "learning_rate": 3.2886720591442327e-06, "loss": 0.79037774, "num_input_tokens_seen": 107002405, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.27514648, "step": 4970, "time_per_iteration": 2.6591849327087402 }, { "auxiliary_loss_clip": 0.01697458, "auxiliary_loss_mlp": 0.00354223, "balance_loss_clip": 1.36034024, "balance_loss_mlp": 0.32067734, "epoch": 0.2988726890124756, "flos": 18076965292800.0, "grad_norm": 12.371556101898067, "language_loss": 0.91362405, "learning_rate": 3.2883741968155103e-06, "loss": 0.93414092, "num_input_tokens_seen": 107017310, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.33496094, "step": 4971, "time_per_iteration": 2.9182960987091064 }, { "auxiliary_loss_clip": 0.01659505, "auxiliary_loss_mlp": 0.00310547, "balance_loss_clip": 1.34335136, "balance_loss_mlp": 0.28031504, "epoch": 0.29893281226514357, "flos": 21755653706880.0, "grad_norm": 74.36891516350391, "language_loss": 0.85126305, "learning_rate": 3.2880762856315107e-06, "loss": 0.87096357, "num_input_tokens_seen": 107034645, "router_z_loss_clip": 3.16015625, "router_z_loss_mlp": 0.30212402, "step": 4972, "time_per_iteration": 2.771169424057007 }, { "auxiliary_loss_clip": 0.01674858, "auxiliary_loss_mlp": 0.00350461, "balance_loss_clip": 1.34933317, "balance_loss_mlp": 0.31665373, "epoch": 0.29899293551781153, "flos": 16836718538880.0, "grad_norm": 266.2677418236052, "language_loss": 0.91842389, "learning_rate": 3.2877783256035285e-06, "loss": 0.93867707, "num_input_tokens_seen": 107051125, "router_z_loss_clip": 3.25976562, "router_z_loss_mlp": 0.33813477, "step": 4973, "time_per_iteration": 2.663741111755371 }, { "auxiliary_loss_clip": 0.01706555, "auxiliary_loss_mlp": 0.00308146, "balance_loss_clip": 1.37542152, "balance_loss_mlp": 0.27650821, "epoch": 0.2990530587704795, "flos": 11729215946880.0, "grad_norm": 9.372460173876998, "language_loss": 0.82528132, "learning_rate": 3.287480316742863e-06, "loss": 0.84542835, "num_input_tokens_seen": 107068815, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.31640625, "step": 4974, "time_per_iteration": 2.6302711963653564 }, { "auxiliary_loss_clip": 0.01687826, "auxiliary_loss_mlp": 0.00300713, "balance_loss_clip": 1.36573255, "balance_loss_mlp": 0.27167404, "epoch": 0.29911318202314746, "flos": 28039877850240.0, "grad_norm": 19.07694509376567, "language_loss": 0.78437209, "learning_rate": 3.287182259060815e-06, "loss": 0.80425745, "num_input_tokens_seen": 107090420, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.29052734, "step": 4975, "time_per_iteration": 2.7536585330963135 }, { "auxiliary_loss_clip": 0.01659983, "auxiliary_loss_mlp": 0.00321823, "balance_loss_clip": 1.33849978, "balance_loss_mlp": 0.28772882, "epoch": 0.2991733052758154, "flos": 18733555952640.0, "grad_norm": 7.593827750029448, "language_loss": 0.82728332, "learning_rate": 3.286884152568687e-06, "loss": 0.84710139, "num_input_tokens_seen": 107107255, "router_z_loss_clip": 3.21484375, "router_z_loss_mlp": 0.34057617, "step": 4976, "time_per_iteration": 2.6658823490142822 }, { "auxiliary_loss_clip": 0.01647791, "auxiliary_loss_mlp": 0.0033883, "balance_loss_clip": 1.3317616, "balance_loss_mlp": 0.30716839, "epoch": 0.2992334285284834, "flos": 15559160532480.0, "grad_norm": 65.10500317731223, "language_loss": 0.94397801, "learning_rate": 3.2865859972777827e-06, "loss": 0.96384424, "num_input_tokens_seen": 107123840, "router_z_loss_clip": 3.16015625, "router_z_loss_mlp": 0.31665039, "step": 4977, "time_per_iteration": 2.7095606327056885 }, { "auxiliary_loss_clip": 0.01666869, "auxiliary_loss_mlp": 0.00303712, "balance_loss_clip": 1.34636617, "balance_loss_mlp": 0.2714541, "epoch": 0.29929355178115136, "flos": 21797561900160.0, "grad_norm": 6.605630584820512, "language_loss": 0.75482094, "learning_rate": 3.2862877931994088e-06, "loss": 0.77452672, "num_input_tokens_seen": 107143475, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.3223877, "step": 4978, "time_per_iteration": 2.7401673793792725 }, { "auxiliary_loss_clip": 0.01676154, "auxiliary_loss_mlp": 0.00320566, "balance_loss_clip": 1.35202265, "balance_loss_mlp": 0.2897979, "epoch": 0.2993536750338193, "flos": 21178533888000.0, "grad_norm": 3.230585652342804, "language_loss": 0.83918703, "learning_rate": 3.2859895403448726e-06, "loss": 0.85915422, "num_input_tokens_seen": 107161725, "router_z_loss_clip": 3.24414062, "router_z_loss_mlp": 0.30773926, "step": 4979, "time_per_iteration": 2.6197946071624756 }, { "auxiliary_loss_clip": 0.01660663, "auxiliary_loss_mlp": 0.00339504, "balance_loss_clip": 1.33979928, "balance_loss_mlp": 0.30960613, "epoch": 0.2994137982864873, "flos": 32122130544000.0, "grad_norm": 8.187358813677962, "language_loss": 0.75501168, "learning_rate": 3.285691238725484e-06, "loss": 0.77501333, "num_input_tokens_seen": 107183935, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.29907227, "step": 4980, "time_per_iteration": 2.720956563949585 }, { "auxiliary_loss_clip": 0.01632354, "auxiliary_loss_mlp": 0.00269343, "balance_loss_clip": 1.32113278, "balance_loss_mlp": 0.24217474, "epoch": 0.29947392153915525, "flos": 21105419754240.0, "grad_norm": 5.642963770437911, "language_loss": 0.79236364, "learning_rate": 3.285392888352555e-06, "loss": 0.81138062, "num_input_tokens_seen": 107204285, "router_z_loss_clip": 3.11523438, "router_z_loss_mlp": 0.27160645, "step": 4981, "time_per_iteration": 2.7009975910186768 }, { "auxiliary_loss_clip": 0.01660525, "auxiliary_loss_mlp": 0.00370159, "balance_loss_clip": 1.33084178, "balance_loss_mlp": 0.33544526, "epoch": 0.2995340447918232, "flos": 21542632099200.0, "grad_norm": 2.103036377819465, "language_loss": 0.90376019, "learning_rate": 3.2850944892373987e-06, "loss": 0.92406702, "num_input_tokens_seen": 107225265, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.34716797, "step": 4982, "time_per_iteration": 2.6899919509887695 }, { "auxiliary_loss_clip": 0.01657403, "auxiliary_loss_mlp": 0.00326814, "balance_loss_clip": 1.34037805, "balance_loss_mlp": 0.29317355, "epoch": 0.2995941680444912, "flos": 16725143917440.0, "grad_norm": 41.121788176537265, "language_loss": 0.93816894, "learning_rate": 3.2847960413913307e-06, "loss": 0.95801115, "num_input_tokens_seen": 107241335, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.33642578, "step": 4983, "time_per_iteration": 2.6623270511627197 }, { "auxiliary_loss_clip": 0.01659323, "auxiliary_loss_mlp": 0.00332192, "balance_loss_clip": 1.33900368, "balance_loss_mlp": 0.30162647, "epoch": 0.2996542912971592, "flos": 20923496346240.0, "grad_norm": 12.95384173923672, "language_loss": 0.84014714, "learning_rate": 3.284497544825668e-06, "loss": 0.8600623, "num_input_tokens_seen": 107259375, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.30541992, "step": 4984, "time_per_iteration": 2.6239659786224365 }, { "auxiliary_loss_clip": 0.01657784, "auxiliary_loss_mlp": 0.00291065, "balance_loss_clip": 1.33363974, "balance_loss_mlp": 0.25880659, "epoch": 0.29971441454982717, "flos": 25079868754560.0, "grad_norm": 1.773444121738521, "language_loss": 0.84749371, "learning_rate": 3.2841989995517303e-06, "loss": 0.86698216, "num_input_tokens_seen": 107279890, "router_z_loss_clip": 3.24414062, "router_z_loss_mlp": 0.32250977, "step": 4985, "time_per_iteration": 2.685504198074341 }, { "auxiliary_loss_clip": 0.01676028, "auxiliary_loss_mlp": 0.00341582, "balance_loss_clip": 1.34401655, "balance_loss_mlp": 0.30734521, "epoch": 0.29977453780249513, "flos": 52555911840000.0, "grad_norm": 10.394232960241824, "language_loss": 0.78879648, "learning_rate": 3.283900405580837e-06, "loss": 0.8089726, "num_input_tokens_seen": 107303430, "router_z_loss_clip": 3.31640625, "router_z_loss_mlp": 0.34277344, "step": 4986, "time_per_iteration": 2.9662270545959473 }, { "auxiliary_loss_clip": 0.01682903, "auxiliary_loss_mlp": 0.00311108, "balance_loss_clip": 1.35201192, "balance_loss_mlp": 0.2781111, "epoch": 0.2998346610551631, "flos": 22237144542720.0, "grad_norm": 2.4230602280886866, "language_loss": 0.8256802, "learning_rate": 3.283601762924312e-06, "loss": 0.84562033, "num_input_tokens_seen": 107323700, "router_z_loss_clip": 3.30664062, "router_z_loss_mlp": 0.32958984, "step": 4987, "time_per_iteration": 2.637967348098755 }, { "auxiliary_loss_clip": 0.01655969, "auxiliary_loss_mlp": 0.00309042, "balance_loss_clip": 1.32787645, "balance_loss_mlp": 0.2782979, "epoch": 0.29989478430783106, "flos": 16873203778560.0, "grad_norm": 10.668973591842143, "language_loss": 0.86789912, "learning_rate": 3.2833030715934793e-06, "loss": 0.88754928, "num_input_tokens_seen": 107341965, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.30749512, "step": 4988, "time_per_iteration": 2.599900722503662 }, { "auxiliary_loss_clip": 0.0168316, "auxiliary_loss_mlp": 0.00325076, "balance_loss_clip": 1.35022473, "balance_loss_mlp": 0.29257968, "epoch": 0.29995490756049903, "flos": 23768878164480.0, "grad_norm": 6.731550311590487, "language_loss": 0.76265168, "learning_rate": 3.2830043315996658e-06, "loss": 0.78273404, "num_input_tokens_seen": 107362615, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.32495117, "step": 4989, "time_per_iteration": 2.6515250205993652 }, { "auxiliary_loss_clip": 0.0168226, "auxiliary_loss_mlp": 0.0034487, "balance_loss_clip": 1.34617054, "balance_loss_mlp": 0.30991817, "epoch": 0.300015030813167, "flos": 14465321614080.0, "grad_norm": 10.756527976583039, "language_loss": 0.91291529, "learning_rate": 3.282705542954199e-06, "loss": 0.93318659, "num_input_tokens_seen": 107378980, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.34960938, "step": 4990, "time_per_iteration": 2.6346352100372314 }, { "auxiliary_loss_clip": 0.01707629, "auxiliary_loss_mlp": 0.00340612, "balance_loss_clip": 1.36520207, "balance_loss_mlp": 0.3055402, "epoch": 0.30007515406583496, "flos": 25191982080000.0, "grad_norm": 7.178048812923887, "language_loss": 0.72436082, "learning_rate": 3.28240670566841e-06, "loss": 0.74484324, "num_input_tokens_seen": 107397640, "router_z_loss_clip": 3.42382812, "router_z_loss_mlp": 0.35058594, "step": 4991, "time_per_iteration": 2.724764585494995 }, { "auxiliary_loss_clip": 0.01691934, "auxiliary_loss_mlp": 0.00326868, "balance_loss_clip": 1.35565495, "balance_loss_mlp": 0.29348969, "epoch": 0.3001352773185029, "flos": 19391188106880.0, "grad_norm": 4.184707140041777, "language_loss": 0.87251562, "learning_rate": 3.28210781975363e-06, "loss": 0.89270365, "num_input_tokens_seen": 107416020, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.33398438, "step": 4992, "time_per_iteration": 2.6661415100097656 }, { "auxiliary_loss_clip": 0.01688964, "auxiliary_loss_mlp": 0.00310974, "balance_loss_clip": 1.35439634, "balance_loss_mlp": 0.27945489, "epoch": 0.3001954005711709, "flos": 21543853161600.0, "grad_norm": 148.43595987562165, "language_loss": 0.89925432, "learning_rate": 3.281808885221193e-06, "loss": 0.91925371, "num_input_tokens_seen": 107436340, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.31494141, "step": 4993, "time_per_iteration": 2.6864328384399414 }, { "auxiliary_loss_clip": 0.01719055, "auxiliary_loss_mlp": 0.00360724, "balance_loss_clip": 1.37434721, "balance_loss_mlp": 0.3226012, "epoch": 0.30025552382383885, "flos": 17384320356480.0, "grad_norm": 6.576550322099237, "language_loss": 0.94676054, "learning_rate": 3.2815099020824345e-06, "loss": 0.96755832, "num_input_tokens_seen": 107454585, "router_z_loss_clip": 3.4453125, "router_z_loss_mlp": 0.38085938, "step": 4994, "time_per_iteration": 2.6785879135131836 }, { "auxiliary_loss_clip": 0.01711434, "auxiliary_loss_mlp": 0.00328793, "balance_loss_clip": 1.36500406, "balance_loss_mlp": 0.29653525, "epoch": 0.3003156470765068, "flos": 29533330552320.0, "grad_norm": 10.257352334571467, "language_loss": 0.86412692, "learning_rate": 3.2812108703486924e-06, "loss": 0.88452923, "num_input_tokens_seen": 107477180, "router_z_loss_clip": 3.46679688, "router_z_loss_mlp": 0.32275391, "step": 4995, "time_per_iteration": 2.7360002994537354 }, { "auxiliary_loss_clip": 0.01710779, "auxiliary_loss_mlp": 0.00296442, "balance_loss_clip": 1.36548114, "balance_loss_mlp": 0.26587716, "epoch": 0.3003757703291748, "flos": 43646402465280.0, "grad_norm": 3.5796208087272485, "language_loss": 0.72241712, "learning_rate": 3.2809117900313055e-06, "loss": 0.74248934, "num_input_tokens_seen": 107500250, "router_z_loss_clip": 3.453125, "router_z_loss_mlp": 0.30566406, "step": 4996, "time_per_iteration": 2.8499486446380615 }, { "auxiliary_loss_clip": 0.01697402, "auxiliary_loss_mlp": 0.00329553, "balance_loss_clip": 1.36001289, "balance_loss_mlp": 0.29674622, "epoch": 0.30043589358184275, "flos": 22528380015360.0, "grad_norm": 10.53163906149369, "language_loss": 0.81874996, "learning_rate": 3.280612661141615e-06, "loss": 0.83901954, "num_input_tokens_seen": 107520070, "router_z_loss_clip": 3.37304688, "router_z_loss_mlp": 0.328125, "step": 4997, "time_per_iteration": 4.112249374389648 }, { "auxiliary_loss_clip": 0.01715089, "auxiliary_loss_mlp": 0.00327925, "balance_loss_clip": 1.36983418, "balance_loss_mlp": 0.29442692, "epoch": 0.30049601683451077, "flos": 20995892208000.0, "grad_norm": 3.9715840663230835, "language_loss": 0.84117079, "learning_rate": 3.2803134836909646e-06, "loss": 0.861601, "num_input_tokens_seen": 107539285, "router_z_loss_clip": 3.44921875, "router_z_loss_mlp": 0.33496094, "step": 4998, "time_per_iteration": 2.675410270690918 }, { "auxiliary_loss_clip": 0.01721982, "auxiliary_loss_mlp": 0.00321721, "balance_loss_clip": 1.37504089, "balance_loss_mlp": 0.29101297, "epoch": 0.30055614008717874, "flos": 23916004272000.0, "grad_norm": 10.479033523189317, "language_loss": 0.78989244, "learning_rate": 3.2800142576906985e-06, "loss": 0.81032956, "num_input_tokens_seen": 107560260, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.30737305, "step": 4999, "time_per_iteration": 2.747399091720581 }, { "auxiliary_loss_clip": 0.0172623, "auxiliary_loss_mlp": 0.00325133, "balance_loss_clip": 1.37625778, "balance_loss_mlp": 0.28979951, "epoch": 0.3006162633398467, "flos": 19169798630400.0, "grad_norm": 14.056966546728663, "language_loss": 0.82011443, "learning_rate": 3.2797149831521626e-06, "loss": 0.84062803, "num_input_tokens_seen": 107579260, "router_z_loss_clip": 3.49804688, "router_z_loss_mlp": 0.35327148, "step": 5000, "time_per_iteration": 4.122182607650757 }, { "auxiliary_loss_clip": 0.01721291, "auxiliary_loss_mlp": 0.00312279, "balance_loss_clip": 1.37576985, "balance_loss_mlp": 0.2811175, "epoch": 0.30067638659251467, "flos": 14679241061760.0, "grad_norm": 5.8313290589078335, "language_loss": 0.86561656, "learning_rate": 3.2794156600867073e-06, "loss": 0.88595223, "num_input_tokens_seen": 107595245, "router_z_loss_clip": 3.45703125, "router_z_loss_mlp": 0.31152344, "step": 5001, "time_per_iteration": 2.67290997505188 }, { "auxiliary_loss_clip": 0.01758843, "auxiliary_loss_mlp": 0.00331558, "balance_loss_clip": 1.41155815, "balance_loss_mlp": 0.29837042, "epoch": 0.30073650984518263, "flos": 23368007404800.0, "grad_norm": 47.77280351128787, "language_loss": 0.87144363, "learning_rate": 3.2791162885056815e-06, "loss": 0.89234769, "num_input_tokens_seen": 107613985, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.33203125, "step": 5002, "time_per_iteration": 2.635960102081299 }, { "auxiliary_loss_clip": 0.01754752, "auxiliary_loss_mlp": 0.00310328, "balance_loss_clip": 1.39504528, "balance_loss_mlp": 0.27659163, "epoch": 0.3007966330978506, "flos": 22966633854720.0, "grad_norm": 3.5274273634667255, "language_loss": 0.7579124, "learning_rate": 3.2788168684204376e-06, "loss": 0.7785632, "num_input_tokens_seen": 107631435, "router_z_loss_clip": 3.59960938, "router_z_loss_mlp": 0.33740234, "step": 5003, "time_per_iteration": 4.087014675140381 }, { "auxiliary_loss_clip": 0.01718029, "auxiliary_loss_mlp": 0.00320253, "balance_loss_clip": 1.36763215, "balance_loss_mlp": 0.28787535, "epoch": 0.30085675635051856, "flos": 27818452460160.0, "grad_norm": 5.072252406270049, "language_loss": 0.7743358, "learning_rate": 3.27851739984233e-06, "loss": 0.79471862, "num_input_tokens_seen": 107650530, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.32348633, "step": 5004, "time_per_iteration": 2.7146992683410645 }, { "auxiliary_loss_clip": 0.01747196, "auxiliary_loss_mlp": 0.00343222, "balance_loss_clip": 1.38922048, "balance_loss_mlp": 0.30736393, "epoch": 0.3009168796031865, "flos": 10882729059840.0, "grad_norm": 5.9531154662068335, "language_loss": 0.88306785, "learning_rate": 3.278217882782715e-06, "loss": 0.90397203, "num_input_tokens_seen": 107662240, "router_z_loss_clip": 3.578125, "router_z_loss_mlp": 0.35839844, "step": 5005, "time_per_iteration": 2.6417694091796875 }, { "auxiliary_loss_clip": 0.01728636, "auxiliary_loss_mlp": 0.00298763, "balance_loss_clip": 1.38282931, "balance_loss_mlp": 0.26770869, "epoch": 0.3009770028558545, "flos": 23805399317760.0, "grad_norm": 11.067783974671691, "language_loss": 0.82049155, "learning_rate": 3.2779183172529497e-06, "loss": 0.8407656, "num_input_tokens_seen": 107680330, "router_z_loss_clip": 3.45703125, "router_z_loss_mlp": 0.31066895, "step": 5006, "time_per_iteration": 2.6451570987701416 }, { "auxiliary_loss_clip": 0.01695053, "auxiliary_loss_mlp": 0.00328943, "balance_loss_clip": 1.35923886, "balance_loss_mlp": 0.29630321, "epoch": 0.30103712610852246, "flos": 26468211283200.0, "grad_norm": 2.774300457650169, "language_loss": 0.78321826, "learning_rate": 3.2776187032643932e-06, "loss": 0.80345815, "num_input_tokens_seen": 107700020, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.32641602, "step": 5007, "time_per_iteration": 2.68097186088562 }, { "auxiliary_loss_clip": 0.01727418, "auxiliary_loss_mlp": 0.00331311, "balance_loss_clip": 1.37650716, "balance_loss_mlp": 0.29411763, "epoch": 0.3010972493611904, "flos": 22856459863680.0, "grad_norm": 7.783775968336854, "language_loss": 0.83831722, "learning_rate": 3.2773190408284075e-06, "loss": 0.85890454, "num_input_tokens_seen": 107718575, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.37207031, "step": 5008, "time_per_iteration": 2.644542932510376 }, { "auxiliary_loss_clip": 0.0174071, "auxiliary_loss_mlp": 0.00377858, "balance_loss_clip": 1.38738561, "balance_loss_mlp": 0.3403067, "epoch": 0.3011573726138584, "flos": 24053685102720.0, "grad_norm": 8.247217436154942, "language_loss": 0.90940523, "learning_rate": 3.2770193299563564e-06, "loss": 0.93059087, "num_input_tokens_seen": 107738635, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.37573242, "step": 5009, "time_per_iteration": 2.6476383209228516 }, { "auxiliary_loss_clip": 0.01728671, "auxiliary_loss_mlp": 0.00376503, "balance_loss_clip": 1.3757751, "balance_loss_mlp": 0.34064525, "epoch": 0.30121749586652635, "flos": 20259687052800.0, "grad_norm": 6.048461281922027, "language_loss": 0.91159689, "learning_rate": 3.276719570659604e-06, "loss": 0.93264866, "num_input_tokens_seen": 107753415, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.35864258, "step": 5010, "time_per_iteration": 2.6815600395202637 }, { "auxiliary_loss_clip": 0.01715341, "auxiliary_loss_mlp": 0.00345556, "balance_loss_clip": 1.36675942, "balance_loss_mlp": 0.31208175, "epoch": 0.3012776191191944, "flos": 26943058103040.0, "grad_norm": 26.32035322248561, "language_loss": 0.92299318, "learning_rate": 3.2764197629495176e-06, "loss": 0.94360209, "num_input_tokens_seen": 107773840, "router_z_loss_clip": 3.48828125, "router_z_loss_mlp": 0.33422852, "step": 5011, "time_per_iteration": 4.133784294128418 }, { "auxiliary_loss_clip": 0.01720126, "auxiliary_loss_mlp": 0.00320168, "balance_loss_clip": 1.37171352, "balance_loss_mlp": 0.28771937, "epoch": 0.30133774237186234, "flos": 20412307941120.0, "grad_norm": 134.61118847974026, "language_loss": 0.79049039, "learning_rate": 3.2761199068374656e-06, "loss": 0.8108933, "num_input_tokens_seen": 107792020, "router_z_loss_clip": 3.48632812, "router_z_loss_mlp": 0.32446289, "step": 5012, "time_per_iteration": 2.645742177963257 }, { "auxiliary_loss_clip": 0.01713601, "auxiliary_loss_mlp": 0.00349841, "balance_loss_clip": 1.36996698, "balance_loss_mlp": 0.31572306, "epoch": 0.3013978656245303, "flos": 19792453916160.0, "grad_norm": 43.40790410931366, "language_loss": 0.95440465, "learning_rate": 3.275820002334819e-06, "loss": 0.97503906, "num_input_tokens_seen": 107809595, "router_z_loss_clip": 3.43945312, "router_z_loss_mlp": 0.34106445, "step": 5013, "time_per_iteration": 2.6414270401000977 }, { "auxiliary_loss_clip": 0.01719333, "auxiliary_loss_mlp": 0.00365829, "balance_loss_clip": 1.36974883, "balance_loss_mlp": 0.32758641, "epoch": 0.30145798887719827, "flos": 16249650652800.0, "grad_norm": 39.91372373469943, "language_loss": 0.90036809, "learning_rate": 3.2755200494529496e-06, "loss": 0.92121971, "num_input_tokens_seen": 107827230, "router_z_loss_clip": 3.49804688, "router_z_loss_mlp": 0.3828125, "step": 5014, "time_per_iteration": 2.6194136142730713 }, { "auxiliary_loss_clip": 0.01703318, "auxiliary_loss_mlp": 0.00337547, "balance_loss_clip": 1.36410832, "balance_loss_mlp": 0.30464554, "epoch": 0.30151811212986623, "flos": 24571733005440.0, "grad_norm": 2.364963849294682, "language_loss": 0.74670434, "learning_rate": 3.2752200482032323e-06, "loss": 0.76711297, "num_input_tokens_seen": 107847195, "router_z_loss_clip": 3.39257812, "router_z_loss_mlp": 0.32873535, "step": 5015, "time_per_iteration": 2.7236123085021973 }, { "auxiliary_loss_clip": 0.01689977, "auxiliary_loss_mlp": 0.00332771, "balance_loss_clip": 1.35040426, "balance_loss_mlp": 0.29851013, "epoch": 0.3015782353825342, "flos": 21872076664320.0, "grad_norm": 35.4442736143837, "language_loss": 0.82249701, "learning_rate": 3.2749199985970436e-06, "loss": 0.84272456, "num_input_tokens_seen": 107866420, "router_z_loss_clip": 3.39648438, "router_z_loss_mlp": 0.34277344, "step": 5016, "time_per_iteration": 2.683513879776001 }, { "auxiliary_loss_clip": 0.01703553, "auxiliary_loss_mlp": 0.00350149, "balance_loss_clip": 1.36030126, "balance_loss_mlp": 0.31438616, "epoch": 0.30163835863520216, "flos": 28769331248640.0, "grad_norm": 297.29508393605505, "language_loss": 0.71986508, "learning_rate": 3.2746199006457603e-06, "loss": 0.7404021, "num_input_tokens_seen": 107889090, "router_z_loss_clip": 3.43359375, "router_z_loss_mlp": 0.35717773, "step": 5017, "time_per_iteration": 2.7585113048553467 }, { "auxiliary_loss_clip": 0.01727802, "auxiliary_loss_mlp": 0.00341306, "balance_loss_clip": 1.37879956, "balance_loss_mlp": 0.30914325, "epoch": 0.30169848188787013, "flos": 22966202891520.0, "grad_norm": 21.00277780380629, "language_loss": 0.74886215, "learning_rate": 3.2743197543607628e-06, "loss": 0.76955318, "num_input_tokens_seen": 107907520, "router_z_loss_clip": 3.48632812, "router_z_loss_mlp": 0.3215332, "step": 5018, "time_per_iteration": 2.7499818801879883 }, { "auxiliary_loss_clip": 0.01691003, "auxiliary_loss_mlp": 0.00309482, "balance_loss_clip": 1.35711861, "balance_loss_mlp": 0.2781533, "epoch": 0.3017586051405381, "flos": 21835268202240.0, "grad_norm": 3.5423418358641996, "language_loss": 0.82930624, "learning_rate": 3.2740195597534327e-06, "loss": 0.84931099, "num_input_tokens_seen": 107925650, "router_z_loss_clip": 3.34179688, "router_z_loss_mlp": 0.31347656, "step": 5019, "time_per_iteration": 2.8014278411865234 }, { "auxiliary_loss_clip": 0.01695461, "auxiliary_loss_mlp": 0.0032915, "balance_loss_clip": 1.35117388, "balance_loss_mlp": 0.29169479, "epoch": 0.30181872839320606, "flos": 22160403135360.0, "grad_norm": 9.33474981092558, "language_loss": 0.76284552, "learning_rate": 3.2737193168351527e-06, "loss": 0.78309166, "num_input_tokens_seen": 107943975, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.37475586, "step": 5020, "time_per_iteration": 2.674314498901367 }, { "auxiliary_loss_clip": 0.01758082, "auxiliary_loss_mlp": 0.0035312, "balance_loss_clip": 1.39264846, "balance_loss_mlp": 0.32086208, "epoch": 0.301878851645874, "flos": 18114168804480.0, "grad_norm": 18.924779812829176, "language_loss": 0.84685081, "learning_rate": 3.2734190256173085e-06, "loss": 0.86796284, "num_input_tokens_seen": 107962950, "router_z_loss_clip": 3.65234375, "router_z_loss_mlp": 0.32250977, "step": 5021, "time_per_iteration": 2.706972122192383 }, { "auxiliary_loss_clip": 0.01727384, "auxiliary_loss_mlp": 0.00309722, "balance_loss_clip": 1.37036753, "balance_loss_mlp": 0.27665353, "epoch": 0.301938974898542, "flos": 17602226213760.0, "grad_norm": 2.6508545074569856, "language_loss": 0.83589041, "learning_rate": 3.2731186861112877e-06, "loss": 0.85626149, "num_input_tokens_seen": 107979700, "router_z_loss_clip": 3.57226562, "router_z_loss_mlp": 0.33032227, "step": 5022, "time_per_iteration": 2.6335978507995605 }, { "auxiliary_loss_clip": 0.01733012, "auxiliary_loss_mlp": 0.0033621, "balance_loss_clip": 1.3746438, "balance_loss_mlp": 0.30347469, "epoch": 0.30199909815120995, "flos": 11181219079680.0, "grad_norm": 1.91572862461492, "language_loss": 0.74512047, "learning_rate": 3.2728182983284793e-06, "loss": 0.7658127, "num_input_tokens_seen": 107996645, "router_z_loss_clip": 3.58007812, "router_z_loss_mlp": 0.32714844, "step": 5023, "time_per_iteration": 2.640486717224121 }, { "auxiliary_loss_clip": 0.01727792, "auxiliary_loss_mlp": 0.00300615, "balance_loss_clip": 1.36529016, "balance_loss_mlp": 0.26795131, "epoch": 0.302059221403878, "flos": 21907843632000.0, "grad_norm": 2.5118044101033066, "language_loss": 0.77036607, "learning_rate": 3.2725178622802724e-06, "loss": 0.79065013, "num_input_tokens_seen": 108015020, "router_z_loss_clip": 3.62695312, "router_z_loss_mlp": 0.3269043, "step": 5024, "time_per_iteration": 2.6287717819213867 }, { "auxiliary_loss_clip": 0.01703377, "auxiliary_loss_mlp": 0.00294333, "balance_loss_clip": 1.35528588, "balance_loss_mlp": 0.25911853, "epoch": 0.30211934465654594, "flos": 26396390039040.0, "grad_norm": 2.8148181488349877, "language_loss": 0.79657149, "learning_rate": 3.272217377978061e-06, "loss": 0.81654865, "num_input_tokens_seen": 108036430, "router_z_loss_clip": 3.48046875, "router_z_loss_mlp": 0.35229492, "step": 5025, "time_per_iteration": 2.665497064590454 }, { "auxiliary_loss_clip": 0.01718012, "auxiliary_loss_mlp": 0.00318234, "balance_loss_clip": 1.3644706, "balance_loss_mlp": 0.28392592, "epoch": 0.3021794679092139, "flos": 23400470321280.0, "grad_norm": 19.087753369737943, "language_loss": 0.72601795, "learning_rate": 3.2719168454332387e-06, "loss": 0.74638045, "num_input_tokens_seen": 108054250, "router_z_loss_clip": 3.53125, "router_z_loss_mlp": 0.34326172, "step": 5026, "time_per_iteration": 2.701030731201172 }, { "auxiliary_loss_clip": 0.01711553, "auxiliary_loss_mlp": 0.00288556, "balance_loss_clip": 1.35355639, "balance_loss_mlp": 0.25529629, "epoch": 0.30223959116188187, "flos": 20260979942400.0, "grad_norm": 42.18497862430351, "language_loss": 0.89787143, "learning_rate": 3.2716162646572034e-06, "loss": 0.91787255, "num_input_tokens_seen": 108071495, "router_z_loss_clip": 3.58007812, "router_z_loss_mlp": 0.33276367, "step": 5027, "time_per_iteration": 2.6296753883361816 }, { "auxiliary_loss_clip": 0.01699864, "auxiliary_loss_mlp": 0.00295428, "balance_loss_clip": 1.3492291, "balance_loss_mlp": 0.26188284, "epoch": 0.30229971441454984, "flos": 26687840993280.0, "grad_norm": 75.35000688746985, "language_loss": 0.82110631, "learning_rate": 3.271315635661351e-06, "loss": 0.84105927, "num_input_tokens_seen": 108092135, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.33569336, "step": 5028, "time_per_iteration": 2.686184883117676 }, { "auxiliary_loss_clip": 0.01698529, "auxiliary_loss_mlp": 0.00293606, "balance_loss_clip": 1.34654367, "balance_loss_mlp": 0.25977415, "epoch": 0.3023598376672178, "flos": 34345323953280.0, "grad_norm": 38.42331342577075, "language_loss": 0.8354497, "learning_rate": 3.2710149584570826e-06, "loss": 0.85537106, "num_input_tokens_seen": 108112945, "router_z_loss_clip": 3.52539062, "router_z_loss_mlp": 0.33862305, "step": 5029, "time_per_iteration": 2.7305829524993896 }, { "auxiliary_loss_clip": 0.01735173, "auxiliary_loss_mlp": 0.00319435, "balance_loss_clip": 1.36588573, "balance_loss_mlp": 0.28095415, "epoch": 0.30241996091988577, "flos": 23112143850240.0, "grad_norm": 3.8956889257423013, "language_loss": 0.89389646, "learning_rate": 3.2707142330557993e-06, "loss": 0.9144426, "num_input_tokens_seen": 108130325, "router_z_loss_clip": 3.69726562, "router_z_loss_mlp": 0.38476562, "step": 5030, "time_per_iteration": 2.6335182189941406 }, { "auxiliary_loss_clip": 0.01723231, "auxiliary_loss_mlp": 0.00281763, "balance_loss_clip": 1.3625586, "balance_loss_mlp": 0.24638139, "epoch": 0.30248008417255373, "flos": 19390002958080.0, "grad_norm": 5.722549567813594, "language_loss": 0.76277351, "learning_rate": 3.270413459468905e-06, "loss": 0.78282344, "num_input_tokens_seen": 108150300, "router_z_loss_clip": 3.60546875, "router_z_loss_mlp": 0.35400391, "step": 5031, "time_per_iteration": 2.658543348312378 }, { "auxiliary_loss_clip": 0.01719766, "auxiliary_loss_mlp": 0.00278056, "balance_loss_clip": 1.3509798, "balance_loss_mlp": 0.24229363, "epoch": 0.3025402074252217, "flos": 23769704177280.0, "grad_norm": 15.225132401950619, "language_loss": 0.8781023, "learning_rate": 3.2701126377078047e-06, "loss": 0.89808059, "num_input_tokens_seen": 108170330, "router_z_loss_clip": 3.68554688, "router_z_loss_mlp": 0.35742188, "step": 5032, "time_per_iteration": 2.6790599822998047 }, { "auxiliary_loss_clip": 0.01721841, "auxiliary_loss_mlp": 0.00320415, "balance_loss_clip": 1.35872877, "balance_loss_mlp": 0.28272104, "epoch": 0.30260033067788966, "flos": 25994118648960.0, "grad_norm": 18.27178497464597, "language_loss": 0.81584972, "learning_rate": 3.269811767783906e-06, "loss": 0.83627224, "num_input_tokens_seen": 108191265, "router_z_loss_clip": 3.62890625, "router_z_loss_mlp": 0.37744141, "step": 5033, "time_per_iteration": 2.711526870727539 }, { "auxiliary_loss_clip": 0.01685281, "auxiliary_loss_mlp": 0.00268887, "balance_loss_clip": 1.33443975, "balance_loss_mlp": 0.23474541, "epoch": 0.3026604539305576, "flos": 25374551932800.0, "grad_norm": 3.5308632377996823, "language_loss": 0.80328631, "learning_rate": 3.2695108497086185e-06, "loss": 0.82282799, "num_input_tokens_seen": 108211615, "router_z_loss_clip": 3.5078125, "router_z_loss_mlp": 0.34155273, "step": 5034, "time_per_iteration": 2.671795606613159 }, { "auxiliary_loss_clip": 0.01726979, "auxiliary_loss_mlp": 0.00275612, "balance_loss_clip": 1.35879481, "balance_loss_mlp": 0.23813248, "epoch": 0.3027205771832256, "flos": 25812733944960.0, "grad_norm": 2.7502961568949726, "language_loss": 0.79537642, "learning_rate": 3.269209883493352e-06, "loss": 0.81540227, "num_input_tokens_seen": 108231080, "router_z_loss_clip": 3.68359375, "router_z_loss_mlp": 0.37475586, "step": 5035, "time_per_iteration": 2.7078192234039307 }, { "auxiliary_loss_clip": 0.01701185, "auxiliary_loss_mlp": 0.00263252, "balance_loss_clip": 1.34740114, "balance_loss_mlp": 0.22779962, "epoch": 0.30278070043589356, "flos": 27344539393920.0, "grad_norm": 9.382495955022954, "language_loss": 0.93496794, "learning_rate": 3.2689088691495196e-06, "loss": 0.95461226, "num_input_tokens_seen": 108251125, "router_z_loss_clip": 3.5390625, "router_z_loss_mlp": 0.35473633, "step": 5036, "time_per_iteration": 2.7551615238189697 }, { "auxiliary_loss_clip": 0.01698353, "auxiliary_loss_mlp": 0.00268642, "balance_loss_clip": 1.34341669, "balance_loss_mlp": 0.23252192, "epoch": 0.3028408236885616, "flos": 24786227070720.0, "grad_norm": 5.444388563434392, "language_loss": 0.83113027, "learning_rate": 3.268607806688536e-06, "loss": 0.85080028, "num_input_tokens_seen": 108272545, "router_z_loss_clip": 3.54882812, "router_z_loss_mlp": 0.36083984, "step": 5037, "time_per_iteration": 2.7718801498413086 }, { "auxiliary_loss_clip": 0.01716359, "auxiliary_loss_mlp": 0.00301495, "balance_loss_clip": 1.34728754, "balance_loss_mlp": 0.2664476, "epoch": 0.30290094694122954, "flos": 12932474670720.0, "grad_norm": 2.772135000453805, "language_loss": 0.87076163, "learning_rate": 3.268306696121816e-06, "loss": 0.89094019, "num_input_tokens_seen": 108289725, "router_z_loss_clip": 3.68945312, "router_z_loss_mlp": 0.3503418, "step": 5038, "time_per_iteration": 2.6879334449768066 }, { "auxiliary_loss_clip": 0.01704447, "auxiliary_loss_mlp": 0.00281648, "balance_loss_clip": 1.34840775, "balance_loss_mlp": 0.24638617, "epoch": 0.3029610701938975, "flos": 25916443488000.0, "grad_norm": 15.279938817951566, "language_loss": 0.8045398, "learning_rate": 3.2680055374607804e-06, "loss": 0.82440078, "num_input_tokens_seen": 108310690, "router_z_loss_clip": 3.55859375, "router_z_loss_mlp": 0.3527832, "step": 5039, "time_per_iteration": 4.296742677688599 }, { "auxiliary_loss_clip": 0.01674228, "auxiliary_loss_mlp": 0.0024772, "balance_loss_clip": 1.32457376, "balance_loss_mlp": 0.216392, "epoch": 0.3030211934465655, "flos": 21980993679360.0, "grad_norm": 9.597897944337266, "language_loss": 0.85101956, "learning_rate": 3.267704330716847e-06, "loss": 0.87023902, "num_input_tokens_seen": 108328905, "router_z_loss_clip": 3.49609375, "router_z_loss_mlp": 0.31323242, "step": 5040, "time_per_iteration": 2.6590168476104736 }, { "auxiliary_loss_clip": 0.01698168, "auxiliary_loss_mlp": 0.00274268, "balance_loss_clip": 1.3369025, "balance_loss_mlp": 0.23871985, "epoch": 0.30308131669923344, "flos": 20991977625600.0, "grad_norm": 104.00263289305478, "language_loss": 0.87291014, "learning_rate": 3.267403075901438e-06, "loss": 0.89263451, "num_input_tokens_seen": 108346680, "router_z_loss_clip": 3.609375, "router_z_loss_mlp": 0.35546875, "step": 5041, "time_per_iteration": 2.661561965942383 }, { "auxiliary_loss_clip": 0.01754313, "auxiliary_loss_mlp": 0.00050635, "balance_loss_clip": 1.50181675, "balance_loss_mlp": 0.03547137, "epoch": 0.3031414399519014, "flos": 60548875827840.0, "grad_norm": 3.0495101800397357, "language_loss": 0.59305155, "learning_rate": 3.267101773025978e-06, "loss": 0.61110103, "num_input_tokens_seen": 108413885, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.15136719, "step": 5042, "time_per_iteration": 4.6341187953948975 }, { "auxiliary_loss_clip": 0.01717315, "auxiliary_loss_mlp": 0.0026997, "balance_loss_clip": 1.35316348, "balance_loss_mlp": 0.23756903, "epoch": 0.30320156320456937, "flos": 21907664064000.0, "grad_norm": 4.481557276230008, "language_loss": 0.78915846, "learning_rate": 3.266800422101892e-06, "loss": 0.80903137, "num_input_tokens_seen": 108433640, "router_z_loss_clip": 3.64257812, "router_z_loss_mlp": 0.32397461, "step": 5043, "time_per_iteration": 2.6353514194488525 }, { "auxiliary_loss_clip": 0.01704955, "auxiliary_loss_mlp": 0.00270294, "balance_loss_clip": 1.35057855, "balance_loss_mlp": 0.23529419, "epoch": 0.30326168645723733, "flos": 21652770176640.0, "grad_norm": 11.824032253737897, "language_loss": 0.76046658, "learning_rate": 3.266499023140606e-06, "loss": 0.78021908, "num_input_tokens_seen": 108452640, "router_z_loss_clip": 3.54492188, "router_z_loss_mlp": 0.35009766, "step": 5044, "time_per_iteration": 2.6414406299591064 }, { "auxiliary_loss_clip": 0.01705091, "auxiliary_loss_mlp": 0.00275346, "balance_loss_clip": 1.35046339, "balance_loss_mlp": 0.24129997, "epoch": 0.3033218097099053, "flos": 21871286565120.0, "grad_norm": 22.806482039488614, "language_loss": 0.82081127, "learning_rate": 3.2661975761535513e-06, "loss": 0.84061575, "num_input_tokens_seen": 108472470, "router_z_loss_clip": 3.546875, "router_z_loss_mlp": 0.34057617, "step": 5045, "time_per_iteration": 4.039073705673218 }, { "auxiliary_loss_clip": 0.0169761, "auxiliary_loss_mlp": 0.00251017, "balance_loss_clip": 1.34342873, "balance_loss_mlp": 0.21499217, "epoch": 0.30338193296257326, "flos": 27089717333760.0, "grad_norm": 13.59011574760537, "language_loss": 0.78183222, "learning_rate": 3.2658960811521564e-06, "loss": 0.80131853, "num_input_tokens_seen": 108493025, "router_z_loss_clip": 3.54101562, "router_z_loss_mlp": 0.36010742, "step": 5046, "time_per_iteration": 2.710190534591675 }, { "auxiliary_loss_clip": 0.0173002, "auxiliary_loss_mlp": 0.00268963, "balance_loss_clip": 1.35574532, "balance_loss_mlp": 0.23114944, "epoch": 0.30344205621524123, "flos": 19534363718400.0, "grad_norm": 12.832081918181148, "language_loss": 0.8762337, "learning_rate": 3.2655945381478564e-06, "loss": 0.89622355, "num_input_tokens_seen": 108513480, "router_z_loss_clip": 3.74023438, "router_z_loss_mlp": 0.37792969, "step": 5047, "time_per_iteration": 2.800795555114746 }, { "auxiliary_loss_clip": 0.01725061, "auxiliary_loss_mlp": 0.00266837, "balance_loss_clip": 1.36229467, "balance_loss_mlp": 0.2345078, "epoch": 0.3035021794679092, "flos": 23910976368000.0, "grad_norm": 2.061800633387559, "language_loss": 0.77328014, "learning_rate": 3.265292947152084e-06, "loss": 0.79319906, "num_input_tokens_seen": 108533155, "router_z_loss_clip": 3.62695312, "router_z_loss_mlp": 0.32287598, "step": 5048, "time_per_iteration": 2.684461832046509 }, { "auxiliary_loss_clip": 0.01689925, "auxiliary_loss_mlp": 0.00253282, "balance_loss_clip": 1.33996153, "balance_loss_mlp": 0.21926013, "epoch": 0.30356230272057716, "flos": 16143606725760.0, "grad_norm": 17.458672167188546, "language_loss": 0.81157267, "learning_rate": 3.2649913081762763e-06, "loss": 0.83100474, "num_input_tokens_seen": 108551900, "router_z_loss_clip": 3.49609375, "router_z_loss_mlp": 0.34033203, "step": 5049, "time_per_iteration": 2.7144885063171387 }, { "auxiliary_loss_clip": 0.01703953, "auxiliary_loss_mlp": 0.00271666, "balance_loss_clip": 1.34200525, "balance_loss_mlp": 0.23552197, "epoch": 0.3036224259732452, "flos": 28914697589760.0, "grad_norm": 386.2512468907635, "language_loss": 0.87627423, "learning_rate": 3.2646896212318717e-06, "loss": 0.89603043, "num_input_tokens_seen": 108574005, "router_z_loss_clip": 3.61914062, "router_z_loss_mlp": 0.36132812, "step": 5050, "time_per_iteration": 2.741415023803711 }, { "auxiliary_loss_clip": 0.0175467, "auxiliary_loss_mlp": 0.00258078, "balance_loss_clip": 1.37837243, "balance_loss_mlp": 0.21902472, "epoch": 0.30368254922591315, "flos": 21105599322240.0, "grad_norm": 2.7022949184552862, "language_loss": 0.80814093, "learning_rate": 3.2643878863303106e-06, "loss": 0.82826841, "num_input_tokens_seen": 108592715, "router_z_loss_clip": 3.765625, "router_z_loss_mlp": 0.39038086, "step": 5051, "time_per_iteration": 2.7059969902038574 }, { "auxiliary_loss_clip": 0.01706168, "auxiliary_loss_mlp": 0.00272332, "balance_loss_clip": 1.34525895, "balance_loss_mlp": 0.23847628, "epoch": 0.3037426724785811, "flos": 23002293081600.0, "grad_norm": 22.57106117504861, "language_loss": 0.81524903, "learning_rate": 3.264086103483033e-06, "loss": 0.83503401, "num_input_tokens_seen": 108611770, "router_z_loss_clip": 3.61132812, "router_z_loss_mlp": 0.33886719, "step": 5052, "time_per_iteration": 2.6755216121673584 }, { "auxiliary_loss_clip": 0.01727333, "auxiliary_loss_mlp": 0.00300306, "balance_loss_clip": 1.35398507, "balance_loss_mlp": 0.26432893, "epoch": 0.3038027957312491, "flos": 15632705629440.0, "grad_norm": 45.46222419954672, "language_loss": 0.90903413, "learning_rate": 3.2637842727014836e-06, "loss": 0.92931056, "num_input_tokens_seen": 108629070, "router_z_loss_clip": 3.734375, "router_z_loss_mlp": 0.35986328, "step": 5053, "time_per_iteration": 4.099026203155518 }, { "auxiliary_loss_clip": 0.01708596, "auxiliary_loss_mlp": 0.00271575, "balance_loss_clip": 1.34312212, "balance_loss_mlp": 0.23376164, "epoch": 0.30386291898391704, "flos": 12713994195840.0, "grad_norm": 2.697546057335955, "language_loss": 0.768924, "learning_rate": 3.2634823939971083e-06, "loss": 0.78872561, "num_input_tokens_seen": 108646315, "router_z_loss_clip": 3.65429688, "router_z_loss_mlp": 0.37792969, "step": 5054, "time_per_iteration": 2.614152431488037 }, { "auxiliary_loss_clip": 0.01718269, "auxiliary_loss_mlp": 0.00285457, "balance_loss_clip": 1.35242319, "balance_loss_mlp": 0.25019479, "epoch": 0.303923042236585, "flos": 26359437922560.0, "grad_norm": 75.505274193201, "language_loss": 0.74808049, "learning_rate": 3.2631804673813545e-06, "loss": 0.76811779, "num_input_tokens_seen": 108665920, "router_z_loss_clip": 3.65820312, "router_z_loss_mlp": 0.35253906, "step": 5055, "time_per_iteration": 2.6913764476776123 }, { "auxiliary_loss_clip": 0.01695379, "auxiliary_loss_mlp": 0.00279837, "balance_loss_clip": 1.34272397, "balance_loss_mlp": 0.24738786, "epoch": 0.30398316548925297, "flos": 19719232041600.0, "grad_norm": 26.476816939715214, "language_loss": 0.74013674, "learning_rate": 3.2628784928656707e-06, "loss": 0.75988889, "num_input_tokens_seen": 108683485, "router_z_loss_clip": 3.52734375, "router_z_loss_mlp": 0.32446289, "step": 5056, "time_per_iteration": 2.645218849182129 }, { "auxiliary_loss_clip": 0.01692734, "auxiliary_loss_mlp": 0.00267353, "balance_loss_clip": 1.33769035, "balance_loss_mlp": 0.23192434, "epoch": 0.30404328874192094, "flos": 24239846315520.0, "grad_norm": 350.99599910570345, "language_loss": 0.86252248, "learning_rate": 3.262576470461507e-06, "loss": 0.88212335, "num_input_tokens_seen": 108702700, "router_z_loss_clip": 3.55078125, "router_z_loss_mlp": 0.35449219, "step": 5057, "time_per_iteration": 2.675706624984741 }, { "auxiliary_loss_clip": 0.01702663, "auxiliary_loss_mlp": 0.00285542, "balance_loss_clip": 1.34175372, "balance_loss_mlp": 0.2489206, "epoch": 0.3041034119945889, "flos": 24498942094080.0, "grad_norm": 35.33363619159725, "language_loss": 0.94384646, "learning_rate": 3.2622744001803176e-06, "loss": 0.96372855, "num_input_tokens_seen": 108721860, "router_z_loss_clip": 3.609375, "router_z_loss_mlp": 0.3659668, "step": 5058, "time_per_iteration": 2.7421422004699707 }, { "auxiliary_loss_clip": 0.01673422, "auxiliary_loss_mlp": 0.00276341, "balance_loss_clip": 1.32050133, "balance_loss_mlp": 0.23988676, "epoch": 0.30416353524725687, "flos": 28288881907200.0, "grad_norm": 86.6072241297052, "language_loss": 0.78847158, "learning_rate": 3.2619722820335564e-06, "loss": 0.80796921, "num_input_tokens_seen": 108743215, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.36425781, "step": 5059, "time_per_iteration": 2.681610107421875 }, { "auxiliary_loss_clip": 0.01671024, "auxiliary_loss_mlp": 0.00275782, "balance_loss_clip": 1.31723785, "balance_loss_mlp": 0.24016264, "epoch": 0.30422365849992483, "flos": 23660392112640.0, "grad_norm": 3.168197006093855, "language_loss": 0.77937061, "learning_rate": 3.26167011603268e-06, "loss": 0.79883868, "num_input_tokens_seen": 108765505, "router_z_loss_clip": 3.53710938, "router_z_loss_mlp": 0.35644531, "step": 5060, "time_per_iteration": 2.7104897499084473 }, { "auxiliary_loss_clip": 0.01671868, "auxiliary_loss_mlp": 0.0028499, "balance_loss_clip": 1.32042551, "balance_loss_mlp": 0.24801157, "epoch": 0.3042837817525928, "flos": 22998773548800.0, "grad_norm": 76.19916930633615, "language_loss": 0.8245827, "learning_rate": 3.2613679021891463e-06, "loss": 0.84415132, "num_input_tokens_seen": 108783370, "router_z_loss_clip": 3.515625, "router_z_loss_mlp": 0.36987305, "step": 5061, "time_per_iteration": 2.6696934700012207 }, { "auxiliary_loss_clip": 0.01701769, "auxiliary_loss_mlp": 0.00304697, "balance_loss_clip": 1.33029675, "balance_loss_mlp": 0.26430863, "epoch": 0.30434390500526076, "flos": 22082332924800.0, "grad_norm": 11.810597480865635, "language_loss": 0.89286911, "learning_rate": 3.261065640514415e-06, "loss": 0.91293377, "num_input_tokens_seen": 108797430, "router_z_loss_clip": 3.7109375, "router_z_loss_mlp": 0.40405273, "step": 5062, "time_per_iteration": 2.630356550216675 }, { "auxiliary_loss_clip": 0.01667679, "auxiliary_loss_mlp": 0.0028771, "balance_loss_clip": 1.31301808, "balance_loss_mlp": 0.2517091, "epoch": 0.3044040282579287, "flos": 25483504861440.0, "grad_norm": 8.293970348350475, "language_loss": 0.82513773, "learning_rate": 3.2607633310199483e-06, "loss": 0.84469163, "num_input_tokens_seen": 108816945, "router_z_loss_clip": 3.54492188, "router_z_loss_mlp": 0.35986328, "step": 5063, "time_per_iteration": 2.7010204792022705 }, { "auxiliary_loss_clip": 0.01679065, "auxiliary_loss_mlp": 0.00284683, "balance_loss_clip": 1.31701529, "balance_loss_mlp": 0.2477282, "epoch": 0.30446415151059675, "flos": 21945478106880.0, "grad_norm": 41.497354615023326, "language_loss": 0.89978385, "learning_rate": 3.26046097371721e-06, "loss": 0.91942132, "num_input_tokens_seen": 108836615, "router_z_loss_clip": 3.61914062, "router_z_loss_mlp": 0.36987305, "step": 5064, "time_per_iteration": 2.688836097717285 }, { "auxiliary_loss_clip": 0.01658797, "auxiliary_loss_mlp": 0.0027288, "balance_loss_clip": 1.30101669, "balance_loss_mlp": 0.23799953, "epoch": 0.3045242747632647, "flos": 16435416816000.0, "grad_norm": 15.004709595720298, "language_loss": 0.83435589, "learning_rate": 3.2601585686176655e-06, "loss": 0.85367262, "num_input_tokens_seen": 108855165, "router_z_loss_clip": 3.57617188, "router_z_loss_mlp": 0.34863281, "step": 5065, "time_per_iteration": 2.874338150024414 }, { "auxiliary_loss_clip": 0.01683736, "auxiliary_loss_mlp": 0.00281032, "balance_loss_clip": 1.31594324, "balance_loss_mlp": 0.24464929, "epoch": 0.3045843980159327, "flos": 31540341957120.0, "grad_norm": 245.76264539240603, "language_loss": 0.69777107, "learning_rate": 3.2598561157327814e-06, "loss": 0.71741867, "num_input_tokens_seen": 108874690, "router_z_loss_clip": 3.67773438, "router_z_loss_mlp": 0.36425781, "step": 5066, "time_per_iteration": 2.797149181365967 }, { "auxiliary_loss_clip": 0.0170354, "auxiliary_loss_mlp": 0.00282225, "balance_loss_clip": 1.32695138, "balance_loss_mlp": 0.24674864, "epoch": 0.30464452126860064, "flos": 17853636481920.0, "grad_norm": 25.331825483445815, "language_loss": 0.90886879, "learning_rate": 3.2595536150740265e-06, "loss": 0.92872649, "num_input_tokens_seen": 108893140, "router_z_loss_clip": 3.76757812, "router_z_loss_mlp": 0.35473633, "step": 5067, "time_per_iteration": 2.6480965614318848 }, { "auxiliary_loss_clip": 0.01669987, "auxiliary_loss_mlp": 0.00314031, "balance_loss_clip": 1.30796981, "balance_loss_mlp": 0.27669472, "epoch": 0.3047046445212686, "flos": 20631398947200.0, "grad_norm": 14.283558712056534, "language_loss": 0.69276083, "learning_rate": 3.259251066652873e-06, "loss": 0.71260095, "num_input_tokens_seen": 108911880, "router_z_loss_clip": 3.61523438, "router_z_loss_mlp": 0.37353516, "step": 5068, "time_per_iteration": 2.606750965118408 }, { "auxiliary_loss_clip": 0.01664318, "auxiliary_loss_mlp": 0.00276996, "balance_loss_clip": 1.31033111, "balance_loss_mlp": 0.23815808, "epoch": 0.3047647677739366, "flos": 21287594557440.0, "grad_norm": 406.99789753427166, "language_loss": 0.82798809, "learning_rate": 3.258948470480793e-06, "loss": 0.84740126, "num_input_tokens_seen": 108930440, "router_z_loss_clip": 3.5390625, "router_z_loss_mlp": 0.38818359, "step": 5069, "time_per_iteration": 2.692472457885742 }, { "auxiliary_loss_clip": 0.01679176, "auxiliary_loss_mlp": 0.003009, "balance_loss_clip": 1.31773758, "balance_loss_mlp": 0.26463646, "epoch": 0.30482489102660454, "flos": 20995928121600.0, "grad_norm": 3.2504784757531575, "language_loss": 0.82782066, "learning_rate": 3.258645826569261e-06, "loss": 0.84762144, "num_input_tokens_seen": 108949125, "router_z_loss_clip": 3.61328125, "router_z_loss_mlp": 0.36279297, "step": 5070, "time_per_iteration": 2.642348289489746 }, { "auxiliary_loss_clip": 0.01711694, "auxiliary_loss_mlp": 0.00316045, "balance_loss_clip": 1.32983994, "balance_loss_mlp": 0.27804118, "epoch": 0.3048850142792725, "flos": 26290812988800.0, "grad_norm": 2.375550364663542, "language_loss": 0.87931466, "learning_rate": 3.2583431349297527e-06, "loss": 0.89959204, "num_input_tokens_seen": 108972190, "router_z_loss_clip": 3.8203125, "router_z_loss_mlp": 0.37963867, "step": 5071, "time_per_iteration": 2.7070114612579346 }, { "auxiliary_loss_clip": 0.01685969, "auxiliary_loss_mlp": 0.00315429, "balance_loss_clip": 1.31780839, "balance_loss_mlp": 0.27649528, "epoch": 0.30494513753194047, "flos": 22346241125760.0, "grad_norm": 3.016786335465832, "language_loss": 0.84018189, "learning_rate": 3.2580403955737467e-06, "loss": 0.86019588, "num_input_tokens_seen": 108990325, "router_z_loss_clip": 3.68164062, "router_z_loss_mlp": 0.3894043, "step": 5072, "time_per_iteration": 2.63873291015625 }, { "auxiliary_loss_clip": 0.01691306, "auxiliary_loss_mlp": 0.00270172, "balance_loss_clip": 1.31983995, "balance_loss_mlp": 0.2312142, "epoch": 0.30500526078460843, "flos": 19537667769600.0, "grad_norm": 8.899947370532892, "language_loss": 0.78552371, "learning_rate": 3.257737608512723e-06, "loss": 0.80513847, "num_input_tokens_seen": 109009505, "router_z_loss_clip": 3.71679688, "router_z_loss_mlp": 0.3894043, "step": 5073, "time_per_iteration": 2.653093099594116 }, { "auxiliary_loss_clip": 0.01714172, "auxiliary_loss_mlp": 0.00307423, "balance_loss_clip": 1.33085001, "balance_loss_mlp": 0.27247104, "epoch": 0.3050653840372764, "flos": 14465321614080.0, "grad_norm": 16.737960431523828, "language_loss": 0.85949981, "learning_rate": 3.257434773758163e-06, "loss": 0.8797158, "num_input_tokens_seen": 109026350, "router_z_loss_clip": 3.83203125, "router_z_loss_mlp": 0.34936523, "step": 5074, "time_per_iteration": 2.585664987564087 }, { "auxiliary_loss_clip": 0.01702124, "auxiliary_loss_mlp": 0.00299576, "balance_loss_clip": 1.33506703, "balance_loss_mlp": 0.26302597, "epoch": 0.30512550728994436, "flos": 24243796811520.0, "grad_norm": 22.300047326971367, "language_loss": 0.82098347, "learning_rate": 3.25713189132155e-06, "loss": 0.84100056, "num_input_tokens_seen": 109044165, "router_z_loss_clip": 3.671875, "router_z_loss_mlp": 0.36572266, "step": 5075, "time_per_iteration": 2.696277379989624 }, { "auxiliary_loss_clip": 0.01715402, "auxiliary_loss_mlp": 0.00291142, "balance_loss_clip": 1.33064282, "balance_loss_mlp": 0.2517074, "epoch": 0.30518563054261233, "flos": 16360542915840.0, "grad_norm": 3.0853863224596223, "language_loss": 0.81179559, "learning_rate": 3.2568289612143703e-06, "loss": 0.83186102, "num_input_tokens_seen": 109060665, "router_z_loss_clip": 3.8515625, "router_z_loss_mlp": 0.39428711, "step": 5076, "time_per_iteration": 2.6479859352111816 }, { "auxiliary_loss_clip": 0.01719766, "auxiliary_loss_mlp": 0.00321408, "balance_loss_clip": 1.34309614, "balance_loss_mlp": 0.28416663, "epoch": 0.30524575379528035, "flos": 21579584215680.0, "grad_norm": 7.991304867853234, "language_loss": 0.84550244, "learning_rate": 3.25652598344811e-06, "loss": 0.86591411, "num_input_tokens_seen": 109080035, "router_z_loss_clip": 3.765625, "router_z_loss_mlp": 0.37280273, "step": 5077, "time_per_iteration": 2.6670756340026855 }, { "auxiliary_loss_clip": 0.01672514, "auxiliary_loss_mlp": 0.00261067, "balance_loss_clip": 1.31552649, "balance_loss_mlp": 0.22580467, "epoch": 0.3053058770479483, "flos": 16545231671040.0, "grad_norm": 5.445529076744947, "language_loss": 0.81052649, "learning_rate": 3.256222958034259e-06, "loss": 0.8298623, "num_input_tokens_seen": 109097385, "router_z_loss_clip": 3.56835938, "router_z_loss_mlp": 0.35253906, "step": 5078, "time_per_iteration": 2.636361837387085 }, { "auxiliary_loss_clip": 0.01713392, "auxiliary_loss_mlp": 0.00336122, "balance_loss_clip": 1.3356818, "balance_loss_mlp": 0.29752186, "epoch": 0.3053660003006163, "flos": 12312907954560.0, "grad_norm": 55.95493812940133, "language_loss": 0.72402191, "learning_rate": 3.255919884984307e-06, "loss": 0.74451709, "num_input_tokens_seen": 109115495, "router_z_loss_clip": 3.77539062, "router_z_loss_mlp": 0.38623047, "step": 5079, "time_per_iteration": 2.6536014080047607 }, { "auxiliary_loss_clip": 0.01701681, "auxiliary_loss_mlp": 0.00306907, "balance_loss_clip": 1.33269727, "balance_loss_mlp": 0.27076262, "epoch": 0.30542612355328425, "flos": 23112287504640.0, "grad_norm": 5.166235749431431, "language_loss": 0.84887159, "learning_rate": 3.2556167643097477e-06, "loss": 0.86895746, "num_input_tokens_seen": 109134235, "router_z_loss_clip": 3.68945312, "router_z_loss_mlp": 0.36132812, "step": 5080, "time_per_iteration": 2.6893739700317383 }, { "auxiliary_loss_clip": 0.01695917, "auxiliary_loss_mlp": 0.00307081, "balance_loss_clip": 1.32928658, "balance_loss_mlp": 0.27003068, "epoch": 0.3054862468059522, "flos": 24389450461440.0, "grad_norm": 36.95896810730087, "language_loss": 0.87477231, "learning_rate": 3.255313596022074e-06, "loss": 0.89480233, "num_input_tokens_seen": 109152760, "router_z_loss_clip": 3.66601562, "router_z_loss_mlp": 0.37036133, "step": 5081, "time_per_iteration": 2.722252368927002 }, { "auxiliary_loss_clip": 0.01723213, "auxiliary_loss_mlp": 0.00319343, "balance_loss_clip": 1.34751892, "balance_loss_mlp": 0.28198236, "epoch": 0.3055463700586202, "flos": 29386096704000.0, "grad_norm": 3.1307730869605113, "language_loss": 0.76509255, "learning_rate": 3.255010380132783e-06, "loss": 0.78551811, "num_input_tokens_seen": 109173925, "router_z_loss_clip": 3.75390625, "router_z_loss_mlp": 0.37353516, "step": 5082, "time_per_iteration": 4.126389026641846 }, { "auxiliary_loss_clip": 0.0172507, "auxiliary_loss_mlp": 0.00329156, "balance_loss_clip": 1.32928216, "balance_loss_mlp": 0.28872061, "epoch": 0.30560649331128814, "flos": 25591775431680.0, "grad_norm": 6.473991210979472, "language_loss": 0.78777373, "learning_rate": 3.2547071166533736e-06, "loss": 0.80831605, "num_input_tokens_seen": 109192510, "router_z_loss_clip": 3.95703125, "router_z_loss_mlp": 0.40429688, "step": 5083, "time_per_iteration": 2.6821818351745605 }, { "auxiliary_loss_clip": 0.01694221, "auxiliary_loss_mlp": 0.00299193, "balance_loss_clip": 1.32723653, "balance_loss_mlp": 0.26188082, "epoch": 0.3056666165639561, "flos": 19128321400320.0, "grad_norm": 34.90890507258392, "language_loss": 0.76504773, "learning_rate": 3.254403805595344e-06, "loss": 0.78498185, "num_input_tokens_seen": 109210885, "router_z_loss_clip": 3.66992188, "router_z_loss_mlp": 0.37329102, "step": 5084, "time_per_iteration": 4.0594282150268555 }, { "auxiliary_loss_clip": 0.01727525, "auxiliary_loss_mlp": 0.00309902, "balance_loss_clip": 1.34201479, "balance_loss_mlp": 0.27311438, "epoch": 0.30572673981662407, "flos": 15523860441600.0, "grad_norm": 2.7260888768982308, "language_loss": 0.87684566, "learning_rate": 3.2541004469701962e-06, "loss": 0.89721996, "num_input_tokens_seen": 109229180, "router_z_loss_clip": 3.85742188, "router_z_loss_mlp": 0.36767578, "step": 5085, "time_per_iteration": 2.636312484741211 }, { "auxiliary_loss_clip": 0.01691782, "auxiliary_loss_mlp": 0.0031813, "balance_loss_clip": 1.32588232, "balance_loss_mlp": 0.28267673, "epoch": 0.30578686306929204, "flos": 21506541909120.0, "grad_norm": 6.500023640293035, "language_loss": 0.82808828, "learning_rate": 3.2537970407894342e-06, "loss": 0.84818739, "num_input_tokens_seen": 109249510, "router_z_loss_clip": 3.66601562, "router_z_loss_mlp": 0.35449219, "step": 5086, "time_per_iteration": 2.6758785247802734 }, { "auxiliary_loss_clip": 0.01682534, "auxiliary_loss_mlp": 0.00306814, "balance_loss_clip": 1.32837868, "balance_loss_mlp": 0.27019301, "epoch": 0.30584698632196, "flos": 20954271323520.0, "grad_norm": 2.5704224874904678, "language_loss": 0.83394074, "learning_rate": 3.253493587064563e-06, "loss": 0.85383421, "num_input_tokens_seen": 109268200, "router_z_loss_clip": 3.54296875, "router_z_loss_mlp": 0.36621094, "step": 5087, "time_per_iteration": 2.724257230758667 }, { "auxiliary_loss_clip": 0.01699365, "auxiliary_loss_mlp": 0.00323417, "balance_loss_clip": 1.3261106, "balance_loss_mlp": 0.28531796, "epoch": 0.30590710957462797, "flos": 24681116897280.0, "grad_norm": 3.2426920509454904, "language_loss": 0.7722404, "learning_rate": 3.2531900858070885e-06, "loss": 0.79246819, "num_input_tokens_seen": 109288370, "router_z_loss_clip": 3.734375, "router_z_loss_mlp": 0.38134766, "step": 5088, "time_per_iteration": 4.275417804718018 }, { "auxiliary_loss_clip": 0.01725886, "auxiliary_loss_mlp": 0.00329005, "balance_loss_clip": 1.33700526, "balance_loss_mlp": 0.29054773, "epoch": 0.30596723282729593, "flos": 17086907744640.0, "grad_norm": 10.361691111790876, "language_loss": 0.88181674, "learning_rate": 3.252886537028521e-06, "loss": 0.90236568, "num_input_tokens_seen": 109306730, "router_z_loss_clip": 3.89453125, "router_z_loss_mlp": 0.38476562, "step": 5089, "time_per_iteration": 2.641275644302368 }, { "auxiliary_loss_clip": 0.01694911, "auxiliary_loss_mlp": 0.00333918, "balance_loss_clip": 1.33178568, "balance_loss_mlp": 0.29322028, "epoch": 0.30602735607996395, "flos": 22857106308480.0, "grad_norm": 3.163428747918257, "language_loss": 0.84071654, "learning_rate": 3.2525829407403703e-06, "loss": 0.86100483, "num_input_tokens_seen": 109327360, "router_z_loss_clip": 3.62890625, "router_z_loss_mlp": 0.40698242, "step": 5090, "time_per_iteration": 2.645031213760376 }, { "auxiliary_loss_clip": 0.01721682, "auxiliary_loss_mlp": 0.00326472, "balance_loss_clip": 1.33654332, "balance_loss_mlp": 0.28880173, "epoch": 0.3060874793326319, "flos": 29861482227840.0, "grad_norm": 25.83811927150604, "language_loss": 0.83572197, "learning_rate": 3.2522792969541488e-06, "loss": 0.85620356, "num_input_tokens_seen": 109348135, "router_z_loss_clip": 3.8515625, "router_z_loss_mlp": 0.37695312, "step": 5091, "time_per_iteration": 2.7386362552642822 }, { "auxiliary_loss_clip": 0.01690512, "auxiliary_loss_mlp": 0.00302911, "balance_loss_clip": 1.32667708, "balance_loss_mlp": 0.26607549, "epoch": 0.3061476025852999, "flos": 20448577699200.0, "grad_norm": 4.897062763175659, "language_loss": 0.77490127, "learning_rate": 3.2519756056813705e-06, "loss": 0.79483551, "num_input_tokens_seen": 109366220, "router_z_loss_clip": 3.63867188, "router_z_loss_mlp": 0.36816406, "step": 5092, "time_per_iteration": 2.6400818824768066 }, { "auxiliary_loss_clip": 0.01716013, "auxiliary_loss_mlp": 0.00296124, "balance_loss_clip": 1.35041046, "balance_loss_mlp": 0.2581917, "epoch": 0.30620772583796785, "flos": 19391475415680.0, "grad_norm": 4.944158568787182, "language_loss": 0.88317454, "learning_rate": 3.2516718669335522e-06, "loss": 0.90329599, "num_input_tokens_seen": 109385260, "router_z_loss_clip": 3.66015625, "router_z_loss_mlp": 0.37939453, "step": 5093, "time_per_iteration": 2.719830274581909 }, { "auxiliary_loss_clip": 0.01692347, "auxiliary_loss_mlp": 0.00283086, "balance_loss_clip": 1.3330152, "balance_loss_mlp": 0.24589245, "epoch": 0.3062678490906358, "flos": 24024562151040.0, "grad_norm": 4.162216387482296, "language_loss": 0.80938834, "learning_rate": 3.2513680807222114e-06, "loss": 0.82914269, "num_input_tokens_seen": 109405025, "router_z_loss_clip": 3.59179688, "router_z_loss_mlp": 0.37182617, "step": 5094, "time_per_iteration": 2.709756851196289 }, { "auxiliary_loss_clip": 0.0168095, "auxiliary_loss_mlp": 0.00266596, "balance_loss_clip": 1.32101798, "balance_loss_mlp": 0.23016523, "epoch": 0.3063279723433038, "flos": 19754639873280.0, "grad_norm": 31.474036936437027, "language_loss": 0.83043337, "learning_rate": 3.251064247058868e-06, "loss": 0.84990877, "num_input_tokens_seen": 109422465, "router_z_loss_clip": 3.59960938, "router_z_loss_mlp": 0.36450195, "step": 5095, "time_per_iteration": 2.7167506217956543 }, { "auxiliary_loss_clip": 0.016783, "auxiliary_loss_mlp": 0.00277495, "balance_loss_clip": 1.33114707, "balance_loss_mlp": 0.24273348, "epoch": 0.30638809559597174, "flos": 22450022496000.0, "grad_norm": 4.928067460159174, "language_loss": 0.84571767, "learning_rate": 3.250760365955042e-06, "loss": 0.86527556, "num_input_tokens_seen": 109440575, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.34741211, "step": 5096, "time_per_iteration": 4.038203001022339 }, { "auxiliary_loss_clip": 0.01687233, "auxiliary_loss_mlp": 0.00291821, "balance_loss_clip": 1.32913005, "balance_loss_mlp": 0.25531876, "epoch": 0.3064482188486397, "flos": 17165157523200.0, "grad_norm": 5.901843330278196, "language_loss": 0.90392905, "learning_rate": 3.250456437422258e-06, "loss": 0.92371964, "num_input_tokens_seen": 109459050, "router_z_loss_clip": 3.578125, "router_z_loss_mlp": 0.36474609, "step": 5097, "time_per_iteration": 2.640777349472046 }, { "auxiliary_loss_clip": 0.01677908, "auxiliary_loss_mlp": 0.00287418, "balance_loss_clip": 1.32882905, "balance_loss_mlp": 0.24919964, "epoch": 0.3065083421013077, "flos": 23768483114880.0, "grad_norm": 22.44866442075845, "language_loss": 0.83746088, "learning_rate": 3.250152461472041e-06, "loss": 0.8571142, "num_input_tokens_seen": 109475860, "router_z_loss_clip": 3.49023438, "router_z_loss_mlp": 0.38208008, "step": 5098, "time_per_iteration": 2.7022149562835693 }, { "auxiliary_loss_clip": 0.01681234, "auxiliary_loss_mlp": 0.00269237, "balance_loss_clip": 1.33530664, "balance_loss_mlp": 0.23628798, "epoch": 0.30656846535397564, "flos": 26431833784320.0, "grad_norm": 36.98887344761161, "language_loss": 0.90828222, "learning_rate": 3.249848438115917e-06, "loss": 0.92778695, "num_input_tokens_seen": 109494760, "router_z_loss_clip": 3.45898438, "router_z_loss_mlp": 0.32958984, "step": 5099, "time_per_iteration": 2.712043285369873 }, { "auxiliary_loss_clip": 0.01688284, "auxiliary_loss_mlp": 0.00280329, "balance_loss_clip": 1.33199334, "balance_loss_mlp": 0.24413671, "epoch": 0.3066285886066436, "flos": 26651786716800.0, "grad_norm": 9.716576941818325, "language_loss": 0.90909195, "learning_rate": 3.2495443673654148e-06, "loss": 0.92877811, "num_input_tokens_seen": 109516480, "router_z_loss_clip": 3.5625, "router_z_loss_mlp": 0.36181641, "step": 5100, "time_per_iteration": 2.7526309490203857 }, { "auxiliary_loss_clip": 0.01692205, "auxiliary_loss_mlp": 0.00286398, "balance_loss_clip": 1.33869076, "balance_loss_mlp": 0.24930041, "epoch": 0.30668871185931157, "flos": 15049947375360.0, "grad_norm": 40.76826934534667, "language_loss": 0.85410166, "learning_rate": 3.249240249232065e-06, "loss": 0.87388766, "num_input_tokens_seen": 109534615, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.37060547, "step": 5101, "time_per_iteration": 2.646016836166382 }, { "auxiliary_loss_clip": 0.01707477, "auxiliary_loss_mlp": 0.00276771, "balance_loss_clip": 1.35041666, "balance_loss_mlp": 0.23776546, "epoch": 0.30674883511197953, "flos": 20082109190400.0, "grad_norm": 4.7545075824236465, "language_loss": 0.86594069, "learning_rate": 3.2489360837273998e-06, "loss": 0.8857832, "num_input_tokens_seen": 109554040, "router_z_loss_clip": 3.5703125, "router_z_loss_mlp": 0.39013672, "step": 5102, "time_per_iteration": 2.7391293048858643 }, { "auxiliary_loss_clip": 0.01714825, "auxiliary_loss_mlp": 0.00300446, "balance_loss_clip": 1.35660493, "balance_loss_mlp": 0.26208395, "epoch": 0.30680895836464755, "flos": 22893807029760.0, "grad_norm": 43.40294235697135, "language_loss": 0.94077504, "learning_rate": 3.2486318708629532e-06, "loss": 0.96092778, "num_input_tokens_seen": 109574345, "router_z_loss_clip": 3.58398438, "router_z_loss_mlp": 0.38354492, "step": 5103, "time_per_iteration": 2.6569769382476807 }, { "auxiliary_loss_clip": 0.01725857, "auxiliary_loss_mlp": 0.00271356, "balance_loss_clip": 1.36440659, "balance_loss_mlp": 0.23432994, "epoch": 0.3068690816173155, "flos": 23696159080320.0, "grad_norm": 10.667197228311705, "language_loss": 0.80320418, "learning_rate": 3.2483276106502607e-06, "loss": 0.82317632, "num_input_tokens_seen": 109593670, "router_z_loss_clip": 3.61132812, "router_z_loss_mlp": 0.37036133, "step": 5104, "time_per_iteration": 2.6730387210845947 }, { "auxiliary_loss_clip": 0.01710581, "auxiliary_loss_mlp": 0.00304496, "balance_loss_clip": 1.35323596, "balance_loss_mlp": 0.26570508, "epoch": 0.3069292048699835, "flos": 23551044134400.0, "grad_norm": 2.3597249638049496, "language_loss": 0.7898289, "learning_rate": 3.2480233031008605e-06, "loss": 0.80997968, "num_input_tokens_seen": 109613385, "router_z_loss_clip": 3.578125, "router_z_loss_mlp": 0.38793945, "step": 5105, "time_per_iteration": 2.6618165969848633 }, { "auxiliary_loss_clip": 0.01726919, "auxiliary_loss_mlp": 0.00315844, "balance_loss_clip": 1.3666656, "balance_loss_mlp": 0.27841267, "epoch": 0.30698932812265145, "flos": 24531656405760.0, "grad_norm": 2.8067233311305397, "language_loss": 0.91137588, "learning_rate": 3.2477189482262916e-06, "loss": 0.93180352, "num_input_tokens_seen": 109632395, "router_z_loss_clip": 3.60546875, "router_z_loss_mlp": 0.37402344, "step": 5106, "time_per_iteration": 2.6973822116851807 }, { "auxiliary_loss_clip": 0.01722154, "auxiliary_loss_mlp": 0.00333417, "balance_loss_clip": 1.3556633, "balance_loss_mlp": 0.29331517, "epoch": 0.3070494513753194, "flos": 20996430912000.0, "grad_norm": 23.17094655009606, "language_loss": 0.80417478, "learning_rate": 3.2474145460380945e-06, "loss": 0.82473052, "num_input_tokens_seen": 109651380, "router_z_loss_clip": 3.66601562, "router_z_loss_mlp": 0.40112305, "step": 5107, "time_per_iteration": 2.6867480278015137 }, { "auxiliary_loss_clip": 0.01723163, "auxiliary_loss_mlp": 0.00289949, "balance_loss_clip": 1.36108613, "balance_loss_mlp": 0.25220746, "epoch": 0.3071095746279874, "flos": 19025940660480.0, "grad_norm": 2.6588765349276544, "language_loss": 0.78392768, "learning_rate": 3.247110096547814e-06, "loss": 0.80405879, "num_input_tokens_seen": 109670240, "router_z_loss_clip": 3.625, "router_z_loss_mlp": 0.37744141, "step": 5108, "time_per_iteration": 2.7106289863586426 }, { "auxiliary_loss_clip": 0.01722373, "auxiliary_loss_mlp": 0.003255, "balance_loss_clip": 1.36247325, "balance_loss_mlp": 0.28711426, "epoch": 0.30716969788065535, "flos": 21215521918080.0, "grad_norm": 2.802297815703217, "language_loss": 0.92712289, "learning_rate": 3.2468055997669926e-06, "loss": 0.94760162, "num_input_tokens_seen": 109690810, "router_z_loss_clip": 3.59960938, "router_z_loss_mlp": 0.3840332, "step": 5109, "time_per_iteration": 2.743403434753418 }, { "auxiliary_loss_clip": 0.01718532, "auxiliary_loss_mlp": 0.0030689, "balance_loss_clip": 1.35807228, "balance_loss_mlp": 0.27200904, "epoch": 0.3072298211333233, "flos": 25772765086080.0, "grad_norm": 11.500764534145157, "language_loss": 0.73212767, "learning_rate": 3.2465010557071788e-06, "loss": 0.7523818, "num_input_tokens_seen": 109711145, "router_z_loss_clip": 3.60351562, "router_z_loss_mlp": 0.34912109, "step": 5110, "time_per_iteration": 2.6858537197113037 }, { "auxiliary_loss_clip": 0.01714163, "auxiliary_loss_mlp": 0.00278667, "balance_loss_clip": 1.36410332, "balance_loss_mlp": 0.24426308, "epoch": 0.3072899443859913, "flos": 25848931875840.0, "grad_norm": 8.903527690591732, "language_loss": 0.81461811, "learning_rate": 3.246196464379919e-06, "loss": 0.83454645, "num_input_tokens_seen": 109731425, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.34399414, "step": 5111, "time_per_iteration": 2.6853127479553223 }, { "auxiliary_loss_clip": 0.01743217, "auxiliary_loss_mlp": 0.00306142, "balance_loss_clip": 1.37481213, "balance_loss_mlp": 0.26961631, "epoch": 0.30735006763865924, "flos": 25922800195200.0, "grad_norm": 3.9521624984920822, "language_loss": 0.74848413, "learning_rate": 3.245891825796765e-06, "loss": 0.76897764, "num_input_tokens_seen": 109752720, "router_z_loss_clip": 3.68945312, "router_z_loss_mlp": 0.36523438, "step": 5112, "time_per_iteration": 2.801832914352417 }, { "auxiliary_loss_clip": 0.01748283, "auxiliary_loss_mlp": 0.0033869, "balance_loss_clip": 1.37925255, "balance_loss_mlp": 0.30016136, "epoch": 0.3074101908913272, "flos": 30917004312960.0, "grad_norm": 5.099620436919375, "language_loss": 0.86822814, "learning_rate": 3.2455871399692678e-06, "loss": 0.88909787, "num_input_tokens_seen": 109772840, "router_z_loss_clip": 3.68945312, "router_z_loss_mlp": 0.38549805, "step": 5113, "time_per_iteration": 2.833498001098633 }, { "auxiliary_loss_clip": 0.01734361, "auxiliary_loss_mlp": 0.00312205, "balance_loss_clip": 1.37279224, "balance_loss_mlp": 0.27355707, "epoch": 0.30747031414399517, "flos": 18401058731520.0, "grad_norm": 11.802789216737423, "language_loss": 0.85477757, "learning_rate": 3.2452824069089815e-06, "loss": 0.87524319, "num_input_tokens_seen": 109790150, "router_z_loss_clip": 3.61523438, "router_z_loss_mlp": 0.38623047, "step": 5114, "time_per_iteration": 2.6279284954071045 }, { "auxiliary_loss_clip": 0.01717352, "auxiliary_loss_mlp": 0.00294676, "balance_loss_clip": 1.36131525, "balance_loss_mlp": 0.25614792, "epoch": 0.30753043739666314, "flos": 22633166966400.0, "grad_norm": 4.407737924458952, "language_loss": 0.68319571, "learning_rate": 3.2449776266274623e-06, "loss": 0.70331597, "num_input_tokens_seen": 109807985, "router_z_loss_clip": 3.56054688, "router_z_loss_mlp": 0.38500977, "step": 5115, "time_per_iteration": 2.6493804454803467 }, { "auxiliary_loss_clip": 0.01724827, "auxiliary_loss_mlp": 0.00326839, "balance_loss_clip": 1.36300075, "balance_loss_mlp": 0.29055128, "epoch": 0.3075905606493311, "flos": 27344072517120.0, "grad_norm": 331.60580209235957, "language_loss": 0.89157343, "learning_rate": 3.2446727991362657e-06, "loss": 0.91209006, "num_input_tokens_seen": 109825920, "router_z_loss_clip": 3.6171875, "router_z_loss_mlp": 0.36303711, "step": 5116, "time_per_iteration": 2.6803503036499023 }, { "auxiliary_loss_clip": 0.01730502, "auxiliary_loss_mlp": 0.00306304, "balance_loss_clip": 1.36754358, "balance_loss_mlp": 0.2652964, "epoch": 0.3076506839019991, "flos": 22090808534400.0, "grad_norm": 6.374467127224982, "language_loss": 0.81918931, "learning_rate": 3.244367924446952e-06, "loss": 0.83955741, "num_input_tokens_seen": 109846220, "router_z_loss_clip": 3.63085938, "router_z_loss_mlp": 0.41015625, "step": 5117, "time_per_iteration": 2.647113561630249 }, { "auxiliary_loss_clip": 0.0175631, "auxiliary_loss_mlp": 0.00317915, "balance_loss_clip": 1.38645005, "balance_loss_mlp": 0.27986386, "epoch": 0.3077108071546671, "flos": 21289533891840.0, "grad_norm": 15.287581150150427, "language_loss": 0.77623105, "learning_rate": 3.2440630025710826e-06, "loss": 0.79697323, "num_input_tokens_seen": 109863870, "router_z_loss_clip": 3.69921875, "router_z_loss_mlp": 0.38061523, "step": 5118, "time_per_iteration": 2.633878231048584 }, { "auxiliary_loss_clip": 0.01721429, "auxiliary_loss_mlp": 0.00295961, "balance_loss_clip": 1.3628056, "balance_loss_mlp": 0.25790966, "epoch": 0.30777093040733505, "flos": 21430985650560.0, "grad_norm": 20.600950440471088, "language_loss": 0.80117202, "learning_rate": 3.243758033520219e-06, "loss": 0.82134593, "num_input_tokens_seen": 109883500, "router_z_loss_clip": 3.58398438, "router_z_loss_mlp": 0.38061523, "step": 5119, "time_per_iteration": 2.6808576583862305 }, { "auxiliary_loss_clip": 0.01723997, "auxiliary_loss_mlp": 0.00291679, "balance_loss_clip": 1.36920762, "balance_loss_mlp": 0.25391322, "epoch": 0.307831053660003, "flos": 23149275534720.0, "grad_norm": 17.77802738036293, "language_loss": 0.85141927, "learning_rate": 3.243453017305926e-06, "loss": 0.87157607, "num_input_tokens_seen": 109904620, "router_z_loss_clip": 3.54492188, "router_z_loss_mlp": 0.37792969, "step": 5120, "time_per_iteration": 2.6771743297576904 }, { "auxiliary_loss_clip": 0.01721656, "auxiliary_loss_mlp": 0.0029615, "balance_loss_clip": 1.37153077, "balance_loss_mlp": 0.26270011, "epoch": 0.307891176912671, "flos": 17019755268480.0, "grad_norm": 870.4327912939941, "language_loss": 0.85909259, "learning_rate": 3.24314795393977e-06, "loss": 0.87927067, "num_input_tokens_seen": 109922275, "router_z_loss_clip": 3.5, "router_z_loss_mlp": 0.33447266, "step": 5121, "time_per_iteration": 2.662991523742676 }, { "auxiliary_loss_clip": 0.01727957, "auxiliary_loss_mlp": 0.00283856, "balance_loss_clip": 1.37971151, "balance_loss_mlp": 0.2463046, "epoch": 0.30795130016533895, "flos": 27705046245120.0, "grad_norm": 14.935648000163184, "language_loss": 0.87636626, "learning_rate": 3.242842843433319e-06, "loss": 0.89648438, "num_input_tokens_seen": 109944265, "router_z_loss_clip": 3.48242188, "router_z_loss_mlp": 0.37548828, "step": 5122, "time_per_iteration": 2.769547462463379 }, { "auxiliary_loss_clip": 0.01900421, "auxiliary_loss_mlp": 0.00172307, "balance_loss_clip": 1.63681865, "balance_loss_mlp": 0.16367589, "epoch": 0.3080114234180069, "flos": 69058699591680.0, "grad_norm": 0.7581088488885888, "language_loss": 0.58735389, "learning_rate": 3.242537685798143e-06, "loss": 0.60808116, "num_input_tokens_seen": 110014160, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.08642578, "step": 5123, "time_per_iteration": 3.2759780883789062 }, { "auxiliary_loss_clip": 0.01750344, "auxiliary_loss_mlp": 0.00325877, "balance_loss_clip": 1.38352156, "balance_loss_mlp": 0.28446397, "epoch": 0.3080715466706749, "flos": 24060221377920.0, "grad_norm": 1.9334779220168143, "language_loss": 0.90086889, "learning_rate": 3.242232481045813e-06, "loss": 0.92163116, "num_input_tokens_seen": 110034865, "router_z_loss_clip": 3.671875, "router_z_loss_mlp": 0.41455078, "step": 5124, "time_per_iteration": 4.185925006866455 }, { "auxiliary_loss_clip": 0.01744978, "auxiliary_loss_mlp": 0.00268946, "balance_loss_clip": 1.38044572, "balance_loss_mlp": 0.2312282, "epoch": 0.30813166992334284, "flos": 25848680480640.0, "grad_norm": 2.231218683316119, "language_loss": 0.86506063, "learning_rate": 3.2419272291879035e-06, "loss": 0.8851999, "num_input_tokens_seen": 110052930, "router_z_loss_clip": 3.64257812, "router_z_loss_mlp": 0.37719727, "step": 5125, "time_per_iteration": 2.7389976978302 }, { "auxiliary_loss_clip": 0.01750572, "auxiliary_loss_mlp": 0.00306991, "balance_loss_clip": 1.38209462, "balance_loss_mlp": 0.26910591, "epoch": 0.3081917931760108, "flos": 20449619193600.0, "grad_norm": 4.695553451887266, "language_loss": 0.72740835, "learning_rate": 3.241621930235989e-06, "loss": 0.74798399, "num_input_tokens_seen": 110071765, "router_z_loss_clip": 3.6875, "router_z_loss_mlp": 0.37890625, "step": 5126, "time_per_iteration": 4.077983617782593 }, { "auxiliary_loss_clip": 0.01711025, "auxiliary_loss_mlp": 0.00253475, "balance_loss_clip": 1.36658001, "balance_loss_mlp": 0.21749774, "epoch": 0.3082519164286788, "flos": 22166257052160.0, "grad_norm": 7.92998448785644, "language_loss": 0.91114366, "learning_rate": 3.241316584201646e-06, "loss": 0.93078864, "num_input_tokens_seen": 110092660, "router_z_loss_clip": 3.4453125, "router_z_loss_mlp": 0.35961914, "step": 5127, "time_per_iteration": 2.713705539703369 }, { "auxiliary_loss_clip": 0.01720969, "auxiliary_loss_mlp": 0.00270282, "balance_loss_clip": 1.37477243, "balance_loss_mlp": 0.23418507, "epoch": 0.30831203968134674, "flos": 28913404700160.0, "grad_norm": 1.791776631169313, "language_loss": 0.73701656, "learning_rate": 3.2410111910964538e-06, "loss": 0.7569291, "num_input_tokens_seen": 110114960, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.36108398, "step": 5128, "time_per_iteration": 2.7165515422821045 }, { "auxiliary_loss_clip": 0.01727063, "auxiliary_loss_mlp": 0.00289679, "balance_loss_clip": 1.36832058, "balance_loss_mlp": 0.25024456, "epoch": 0.3083721629340147, "flos": 25667726739840.0, "grad_norm": 3.089468405968999, "language_loss": 0.77942169, "learning_rate": 3.240705750931993e-06, "loss": 0.7995891, "num_input_tokens_seen": 110135750, "router_z_loss_clip": 3.59179688, "router_z_loss_mlp": 0.39477539, "step": 5129, "time_per_iteration": 2.7070820331573486 }, { "auxiliary_loss_clip": 0.0174413, "auxiliary_loss_mlp": 0.00048075, "balance_loss_clip": 1.4826715, "balance_loss_mlp": 0.04199563, "epoch": 0.3084322861866827, "flos": 68212679581440.0, "grad_norm": 0.8616940327018665, "language_loss": 0.58646894, "learning_rate": 3.240400263719846e-06, "loss": 0.60439098, "num_input_tokens_seen": 110189480, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.06079102, "step": 5130, "time_per_iteration": 4.513909816741943 }, { "auxiliary_loss_clip": 0.01734287, "auxiliary_loss_mlp": 0.00299217, "balance_loss_clip": 1.3803854, "balance_loss_mlp": 0.26123667, "epoch": 0.3084924094393507, "flos": 20296495514880.0, "grad_norm": 17.528225827700425, "language_loss": 0.82093966, "learning_rate": 3.2400947294715957e-06, "loss": 0.84127474, "num_input_tokens_seen": 110206445, "router_z_loss_clip": 3.53515625, "router_z_loss_mlp": 0.37988281, "step": 5131, "time_per_iteration": 2.6802375316619873 }, { "auxiliary_loss_clip": 0.01722317, "auxiliary_loss_mlp": 0.00267519, "balance_loss_clip": 1.3807224, "balance_loss_mlp": 0.23270977, "epoch": 0.30855253269201866, "flos": 23949831905280.0, "grad_norm": 10.39632345769879, "language_loss": 0.76543266, "learning_rate": 3.2397891481988303e-06, "loss": 0.78533101, "num_input_tokens_seen": 110226845, "router_z_loss_clip": 3.41796875, "router_z_loss_mlp": 0.34814453, "step": 5132, "time_per_iteration": 2.6738364696502686 }, { "auxiliary_loss_clip": 0.01691777, "auxiliary_loss_mlp": 0.00253405, "balance_loss_clip": 1.35606635, "balance_loss_mlp": 0.21666446, "epoch": 0.3086126559446866, "flos": 19281876042240.0, "grad_norm": 7.272241563070787, "language_loss": 0.95952082, "learning_rate": 3.239483519913136e-06, "loss": 0.97897261, "num_input_tokens_seen": 110244095, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.3671875, "step": 5133, "time_per_iteration": 2.637441396713257 }, { "auxiliary_loss_clip": 0.0172505, "auxiliary_loss_mlp": 0.00306087, "balance_loss_clip": 1.37955475, "balance_loss_mlp": 0.26734376, "epoch": 0.3086727791973546, "flos": 33760770019200.0, "grad_norm": 3.6752641222841596, "language_loss": 0.74254131, "learning_rate": 3.239177844626102e-06, "loss": 0.76285267, "num_input_tokens_seen": 110264240, "router_z_loss_clip": 3.453125, "router_z_loss_mlp": 0.38745117, "step": 5134, "time_per_iteration": 2.771911382675171 }, { "auxiliary_loss_clip": 0.01712991, "auxiliary_loss_mlp": 0.00294925, "balance_loss_clip": 1.36710286, "balance_loss_mlp": 0.25482309, "epoch": 0.30873290245002255, "flos": 16034151006720.0, "grad_norm": 12.484188002938083, "language_loss": 0.89189434, "learning_rate": 3.2388721223493197e-06, "loss": 0.91197354, "num_input_tokens_seen": 110282450, "router_z_loss_clip": 3.4609375, "router_z_loss_mlp": 0.40063477, "step": 5135, "time_per_iteration": 2.6252338886260986 }, { "auxiliary_loss_clip": 0.01679923, "auxiliary_loss_mlp": 0.00036712, "balance_loss_clip": 1.43073547, "balance_loss_mlp": 0.03063202, "epoch": 0.3087930257026905, "flos": 65048304055680.0, "grad_norm": 0.7113941476861222, "language_loss": 0.55281466, "learning_rate": 3.2385663530943824e-06, "loss": 0.56998098, "num_input_tokens_seen": 110343715, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.06079102, "step": 5136, "time_per_iteration": 3.164055824279785 }, { "auxiliary_loss_clip": 0.01698614, "auxiliary_loss_mlp": 0.00264234, "balance_loss_clip": 1.36618018, "balance_loss_mlp": 0.22887659, "epoch": 0.3088531489553585, "flos": 74738829824640.0, "grad_norm": 2.8413403338439247, "language_loss": 0.83033901, "learning_rate": 3.2382605368728852e-06, "loss": 0.84996748, "num_input_tokens_seen": 110368430, "router_z_loss_clip": 3.32617188, "router_z_loss_mlp": 0.35351562, "step": 5137, "time_per_iteration": 3.0946240425109863 }, { "auxiliary_loss_clip": 0.01701646, "auxiliary_loss_mlp": 0.00272031, "balance_loss_clip": 1.36454618, "balance_loss_mlp": 0.23531494, "epoch": 0.30891327220802645, "flos": 21142300043520.0, "grad_norm": 3.673364483915258, "language_loss": 0.85371965, "learning_rate": 3.237954673696424e-06, "loss": 0.87345642, "num_input_tokens_seen": 110386735, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.36694336, "step": 5138, "time_per_iteration": 4.117339849472046 }, { "auxiliary_loss_clip": 0.01712896, "auxiliary_loss_mlp": 0.00262564, "balance_loss_clip": 1.37028193, "balance_loss_mlp": 0.22486958, "epoch": 0.3089733954606944, "flos": 25664494515840.0, "grad_norm": 1.9689633144243122, "language_loss": 0.86087477, "learning_rate": 3.2376487635765983e-06, "loss": 0.88062936, "num_input_tokens_seen": 110406820, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.37695312, "step": 5139, "time_per_iteration": 2.742020606994629 }, { "auxiliary_loss_clip": 0.01708624, "auxiliary_loss_mlp": 0.00315562, "balance_loss_clip": 1.36384761, "balance_loss_mlp": 0.27727178, "epoch": 0.3090335187133624, "flos": 19427350124160.0, "grad_norm": 9.301213739218095, "language_loss": 0.84695053, "learning_rate": 3.2373428065250067e-06, "loss": 0.86719239, "num_input_tokens_seen": 110424225, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.3828125, "step": 5140, "time_per_iteration": 2.629746198654175 }, { "auxiliary_loss_clip": 0.01670424, "auxiliary_loss_mlp": 0.00268781, "balance_loss_clip": 1.34625053, "balance_loss_mlp": 0.23483041, "epoch": 0.30909364196603034, "flos": 20011329440640.0, "grad_norm": 6.467792144708172, "language_loss": 0.84575069, "learning_rate": 3.237036802553252e-06, "loss": 0.8651427, "num_input_tokens_seen": 110443310, "router_z_loss_clip": 3.24023438, "router_z_loss_mlp": 0.33959961, "step": 5141, "time_per_iteration": 2.6718130111694336 }, { "auxiliary_loss_clip": 0.01681438, "auxiliary_loss_mlp": 0.00281472, "balance_loss_clip": 1.34319866, "balance_loss_mlp": 0.24404068, "epoch": 0.3091537652186983, "flos": 19677575243520.0, "grad_norm": 51.26661462855091, "language_loss": 0.9709568, "learning_rate": 3.2367307516729377e-06, "loss": 0.99058592, "num_input_tokens_seen": 110460215, "router_z_loss_clip": 3.38085938, "router_z_loss_mlp": 0.37426758, "step": 5142, "time_per_iteration": 2.6694140434265137 }, { "auxiliary_loss_clip": 0.01701137, "auxiliary_loss_mlp": 0.00308066, "balance_loss_clip": 1.36219311, "balance_loss_mlp": 0.27378103, "epoch": 0.3092138884713663, "flos": 17020042577280.0, "grad_norm": 11.3142667866755, "language_loss": 0.87914574, "learning_rate": 3.23642465389567e-06, "loss": 0.89923775, "num_input_tokens_seen": 110479385, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.34277344, "step": 5143, "time_per_iteration": 2.6344077587127686 }, { "auxiliary_loss_clip": 0.01676726, "auxiliary_loss_mlp": 0.00284008, "balance_loss_clip": 1.34655321, "balance_loss_mlp": 0.24717201, "epoch": 0.3092740117240343, "flos": 25009986844800.0, "grad_norm": 28.41120477578561, "language_loss": 0.80043495, "learning_rate": 3.236118509233055e-06, "loss": 0.82004225, "num_input_tokens_seen": 110499885, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.3684082, "step": 5144, "time_per_iteration": 2.734423875808716 }, { "auxiliary_loss_clip": 0.01699619, "auxiliary_loss_mlp": 0.00320813, "balance_loss_clip": 1.35201406, "balance_loss_mlp": 0.28466839, "epoch": 0.30933413497670226, "flos": 25590410714880.0, "grad_norm": 1.9558723523603616, "language_loss": 0.81329709, "learning_rate": 3.235812317696702e-06, "loss": 0.8335014, "num_input_tokens_seen": 110519690, "router_z_loss_clip": 3.47460938, "router_z_loss_mlp": 0.36132812, "step": 5145, "time_per_iteration": 2.6887567043304443 }, { "auxiliary_loss_clip": 0.01682319, "auxiliary_loss_mlp": 0.00329417, "balance_loss_clip": 1.34293497, "balance_loss_mlp": 0.29231885, "epoch": 0.3093942582293702, "flos": 24389665943040.0, "grad_norm": 6.887443098805595, "language_loss": 0.82231766, "learning_rate": 3.2355060792982224e-06, "loss": 0.842435, "num_input_tokens_seen": 110540520, "router_z_loss_clip": 3.39648438, "router_z_loss_mlp": 0.37133789, "step": 5146, "time_per_iteration": 2.6934635639190674 }, { "auxiliary_loss_clip": 0.01664858, "auxiliary_loss_mlp": 0.00276614, "balance_loss_clip": 1.34359527, "balance_loss_mlp": 0.24373591, "epoch": 0.3094543814820382, "flos": 19646441130240.0, "grad_norm": 3.4363251750098702, "language_loss": 0.73902059, "learning_rate": 3.2351997940492286e-06, "loss": 0.75843537, "num_input_tokens_seen": 110557950, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.32861328, "step": 5147, "time_per_iteration": 2.6320059299468994 }, { "auxiliary_loss_clip": 0.01682623, "auxiliary_loss_mlp": 0.00332583, "balance_loss_clip": 1.35002899, "balance_loss_mlp": 0.29901367, "epoch": 0.30951450473470615, "flos": 25663812157440.0, "grad_norm": 5.079801987492071, "language_loss": 0.82594693, "learning_rate": 3.2348934619613346e-06, "loss": 0.84609896, "num_input_tokens_seen": 110578215, "router_z_loss_clip": 3.32421875, "router_z_loss_mlp": 0.33569336, "step": 5148, "time_per_iteration": 2.692263603210449 }, { "auxiliary_loss_clip": 0.01699424, "auxiliary_loss_mlp": 0.0032528, "balance_loss_clip": 1.35282755, "balance_loss_mlp": 0.2871803, "epoch": 0.3095746279873741, "flos": 12020415505920.0, "grad_norm": 12.798213015925347, "language_loss": 0.82503432, "learning_rate": 3.2345870830461567e-06, "loss": 0.84528136, "num_input_tokens_seen": 110592990, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.38085938, "step": 5149, "time_per_iteration": 2.5879576206207275 }, { "auxiliary_loss_clip": 0.01692506, "auxiliary_loss_mlp": 0.00359366, "balance_loss_clip": 1.3533442, "balance_loss_mlp": 0.32214871, "epoch": 0.3096347512400421, "flos": 23623044946560.0, "grad_norm": 6.674509396184529, "language_loss": 0.91234744, "learning_rate": 3.2342806573153132e-06, "loss": 0.9328661, "num_input_tokens_seen": 110612130, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.37231445, "step": 5150, "time_per_iteration": 2.770207405090332 }, { "auxiliary_loss_clip": 0.01689195, "auxiliary_loss_mlp": 0.00315887, "balance_loss_clip": 1.36401629, "balance_loss_mlp": 0.28474927, "epoch": 0.30969487449271005, "flos": 22529313768960.0, "grad_norm": 2.246831319575793, "language_loss": 0.8434037, "learning_rate": 3.233974184780424e-06, "loss": 0.86345446, "num_input_tokens_seen": 110632045, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.31152344, "step": 5151, "time_per_iteration": 2.779845714569092 }, { "auxiliary_loss_clip": 0.01671145, "auxiliary_loss_mlp": 0.00306576, "balance_loss_clip": 1.33753908, "balance_loss_mlp": 0.2705276, "epoch": 0.309754997745378, "flos": 15267925059840.0, "grad_norm": 26.119431109916697, "language_loss": 0.77106792, "learning_rate": 3.2336676654531084e-06, "loss": 0.7908451, "num_input_tokens_seen": 110649340, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.36035156, "step": 5152, "time_per_iteration": 2.6572108268737793 }, { "auxiliary_loss_clip": 0.01681082, "auxiliary_loss_mlp": 0.00344223, "balance_loss_clip": 1.34221697, "balance_loss_mlp": 0.30805531, "epoch": 0.309815120998046, "flos": 26979291947520.0, "grad_norm": 8.652298710963008, "language_loss": 0.89523554, "learning_rate": 3.2333610993449926e-06, "loss": 0.9154886, "num_input_tokens_seen": 110668450, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.36108398, "step": 5153, "time_per_iteration": 2.709210157394409 }, { "auxiliary_loss_clip": 0.01676355, "auxiliary_loss_mlp": 0.00324756, "balance_loss_clip": 1.34514868, "balance_loss_mlp": 0.29073405, "epoch": 0.30987524425071394, "flos": 21143161969920.0, "grad_norm": 2.5005432572890562, "language_loss": 0.79992777, "learning_rate": 3.2330544864676997e-06, "loss": 0.8199389, "num_input_tokens_seen": 110689410, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.34008789, "step": 5154, "time_per_iteration": 2.665780544281006 }, { "auxiliary_loss_clip": 0.01657454, "auxiliary_loss_mlp": 0.00319961, "balance_loss_clip": 1.34164047, "balance_loss_mlp": 0.28546226, "epoch": 0.3099353675033819, "flos": 15268284195840.0, "grad_norm": 3.8003115844964, "language_loss": 0.82454014, "learning_rate": 3.232747826832858e-06, "loss": 0.84431428, "num_input_tokens_seen": 110707350, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.3449707, "step": 5155, "time_per_iteration": 2.5902154445648193 }, { "auxiliary_loss_clip": 0.01653074, "auxiliary_loss_mlp": 0.00308415, "balance_loss_clip": 1.32345366, "balance_loss_mlp": 0.27021989, "epoch": 0.30999549075604993, "flos": 15413794191360.0, "grad_norm": 706.755243542043, "language_loss": 0.87837803, "learning_rate": 3.232441120452094e-06, "loss": 0.89799285, "num_input_tokens_seen": 110724910, "router_z_loss_clip": 3.29882812, "router_z_loss_mlp": 0.38183594, "step": 5156, "time_per_iteration": 2.6600122451782227 }, { "auxiliary_loss_clip": 0.0163886, "auxiliary_loss_mlp": 0.00336619, "balance_loss_clip": 1.31697321, "balance_loss_mlp": 0.30083275, "epoch": 0.3100556140087179, "flos": 23184539712000.0, "grad_norm": 40.02308935830575, "language_loss": 0.80953228, "learning_rate": 3.23213436733704e-06, "loss": 0.82928717, "num_input_tokens_seen": 110744010, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.35791016, "step": 5157, "time_per_iteration": 2.6854069232940674 }, { "auxiliary_loss_clip": 0.01616162, "auxiliary_loss_mlp": 0.00283838, "balance_loss_clip": 1.29738736, "balance_loss_mlp": 0.25203249, "epoch": 0.31011573726138586, "flos": 25742169676800.0, "grad_norm": 315.7792588959449, "language_loss": 0.74506241, "learning_rate": 3.231827567499327e-06, "loss": 0.7640624, "num_input_tokens_seen": 110765835, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.31835938, "step": 5158, "time_per_iteration": 2.8443126678466797 }, { "auxiliary_loss_clip": 0.01618809, "auxiliary_loss_mlp": 0.00250836, "balance_loss_clip": 1.30112278, "balance_loss_mlp": 0.21841113, "epoch": 0.3101758605140538, "flos": 20011329440640.0, "grad_norm": 54.70326830024265, "language_loss": 0.90621269, "learning_rate": 3.2315207209505896e-06, "loss": 0.92490911, "num_input_tokens_seen": 110784655, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.32421875, "step": 5159, "time_per_iteration": 2.6315832138061523 }, { "auxiliary_loss_clip": 0.01625776, "auxiliary_loss_mlp": 0.00309697, "balance_loss_clip": 1.3055172, "balance_loss_mlp": 0.27515015, "epoch": 0.3102359837667218, "flos": 19135683688320.0, "grad_norm": 10.354334911713243, "language_loss": 0.90822387, "learning_rate": 3.231213827702462e-06, "loss": 0.92757869, "num_input_tokens_seen": 110802545, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.34545898, "step": 5160, "time_per_iteration": 2.5971248149871826 }, { "auxiliary_loss_clip": 0.01610218, "auxiliary_loss_mlp": 0.00248509, "balance_loss_clip": 1.30056322, "balance_loss_mlp": 0.21699011, "epoch": 0.31029610701938976, "flos": 22265405568000.0, "grad_norm": 5.864630786557637, "language_loss": 0.83546948, "learning_rate": 3.230906887766584e-06, "loss": 0.85405678, "num_input_tokens_seen": 110820265, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.31542969, "step": 5161, "time_per_iteration": 2.631803512573242 }, { "auxiliary_loss_clip": 0.01622129, "auxiliary_loss_mlp": 0.00274063, "balance_loss_clip": 1.30049276, "balance_loss_mlp": 0.23937286, "epoch": 0.3103562302720577, "flos": 20805349536000.0, "grad_norm": 2.371181334548887, "language_loss": 0.9048751, "learning_rate": 3.2305999011545924e-06, "loss": 0.92383707, "num_input_tokens_seen": 110836195, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.34643555, "step": 5162, "time_per_iteration": 2.622304916381836 }, { "auxiliary_loss_clip": 0.01593234, "auxiliary_loss_mlp": 0.00251677, "balance_loss_clip": 1.28286338, "balance_loss_mlp": 0.2203486, "epoch": 0.3104163535247257, "flos": 22344158136960.0, "grad_norm": 1.962247989747067, "language_loss": 0.86771071, "learning_rate": 3.2302928678781295e-06, "loss": 0.88615984, "num_input_tokens_seen": 110856420, "router_z_loss_clip": 3.10351562, "router_z_loss_mlp": 0.31298828, "step": 5163, "time_per_iteration": 2.640636682510376 }, { "auxiliary_loss_clip": 0.01611927, "auxiliary_loss_mlp": 0.00279225, "balance_loss_clip": 1.29241776, "balance_loss_mlp": 0.24479714, "epoch": 0.31047647677739365, "flos": 21689363157120.0, "grad_norm": 4.240850566900648, "language_loss": 0.82602823, "learning_rate": 3.2299857879488376e-06, "loss": 0.84493971, "num_input_tokens_seen": 110876650, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.34448242, "step": 5164, "time_per_iteration": 2.671548366546631 }, { "auxiliary_loss_clip": 0.0160132, "auxiliary_loss_mlp": 0.00256781, "balance_loss_clip": 1.28922963, "balance_loss_mlp": 0.22225833, "epoch": 0.3105366000300616, "flos": 18917275040640.0, "grad_norm": 77.73382144861004, "language_loss": 0.81160223, "learning_rate": 3.2296786613783626e-06, "loss": 0.83018327, "num_input_tokens_seen": 110894445, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.34521484, "step": 5165, "time_per_iteration": 2.6305410861968994 }, { "auxiliary_loss_clip": 0.01607762, "auxiliary_loss_mlp": 0.00262365, "balance_loss_clip": 1.2965076, "balance_loss_mlp": 0.23063183, "epoch": 0.3105967232827296, "flos": 18260397072000.0, "grad_norm": 1.652636143067467, "language_loss": 0.81607097, "learning_rate": 3.229371488178348e-06, "loss": 0.83477223, "num_input_tokens_seen": 110912855, "router_z_loss_clip": 3.11132812, "router_z_loss_mlp": 0.31713867, "step": 5166, "time_per_iteration": 4.0905468463897705 }, { "auxiliary_loss_clip": 0.0161492, "auxiliary_loss_mlp": 0.00258089, "balance_loss_clip": 1.30012631, "balance_loss_mlp": 0.22368567, "epoch": 0.31065684653539755, "flos": 17672144037120.0, "grad_norm": 2.6270812145756306, "language_loss": 0.81540453, "learning_rate": 3.229064268360444e-06, "loss": 0.83413464, "num_input_tokens_seen": 110928025, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.34423828, "step": 5167, "time_per_iteration": 2.6460793018341064 }, { "auxiliary_loss_clip": 0.01444654, "auxiliary_loss_mlp": 0.00094289, "balance_loss_clip": 1.2652992, "balance_loss_mlp": 0.08518146, "epoch": 0.3107169697880655, "flos": 68531996511360.0, "grad_norm": 0.7430316260024206, "language_loss": 0.53002322, "learning_rate": 3.2287570019362997e-06, "loss": 0.54541266, "num_input_tokens_seen": 110992215, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.09130859, "step": 5168, "time_per_iteration": 3.1841344833374023 }, { "auxiliary_loss_clip": 0.01593857, "auxiliary_loss_mlp": 0.00264806, "balance_loss_clip": 1.28233552, "balance_loss_mlp": 0.23059337, "epoch": 0.3107770930407335, "flos": 13188733274880.0, "grad_norm": 2.3700911615921973, "language_loss": 0.8606289, "learning_rate": 3.2284496889175668e-06, "loss": 0.87921554, "num_input_tokens_seen": 111010400, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.34204102, "step": 5169, "time_per_iteration": 4.010276556015015 }, { "auxiliary_loss_clip": 0.01622144, "auxiliary_loss_mlp": 0.00272231, "balance_loss_clip": 1.30175829, "balance_loss_mlp": 0.23813726, "epoch": 0.3108372162934015, "flos": 31580849520000.0, "grad_norm": 7.69615503459207, "language_loss": 0.70168376, "learning_rate": 3.2281423293158986e-06, "loss": 0.72062755, "num_input_tokens_seen": 111033960, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.34082031, "step": 5170, "time_per_iteration": 2.753401756286621 }, { "auxiliary_loss_clip": 0.01598244, "auxiliary_loss_mlp": 0.00255349, "balance_loss_clip": 1.29638076, "balance_loss_mlp": 0.22157723, "epoch": 0.31089733954606946, "flos": 28729829266560.0, "grad_norm": 5.918053560078943, "language_loss": 0.84496701, "learning_rate": 3.22783492314295e-06, "loss": 0.86350292, "num_input_tokens_seen": 111053265, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.33776855, "step": 5171, "time_per_iteration": 2.694481134414673 }, { "auxiliary_loss_clip": 0.01596969, "auxiliary_loss_mlp": 0.00261056, "balance_loss_clip": 1.2848289, "balance_loss_mlp": 0.22641338, "epoch": 0.3109574627987374, "flos": 19683249592320.0, "grad_norm": 16.99084222793687, "language_loss": 0.89268345, "learning_rate": 3.2275274704103785e-06, "loss": 0.9112637, "num_input_tokens_seen": 111071130, "router_z_loss_clip": 3.12304688, "router_z_loss_mlp": 0.34643555, "step": 5172, "time_per_iteration": 2.6495842933654785 }, { "auxiliary_loss_clip": 0.01595888, "auxiliary_loss_mlp": 0.00274884, "balance_loss_clip": 1.27819419, "balance_loss_mlp": 0.24155267, "epoch": 0.3110175860514054, "flos": 14683981656960.0, "grad_norm": 88.3177320268542, "language_loss": 0.93367279, "learning_rate": 3.227219971129842e-06, "loss": 0.95238054, "num_input_tokens_seen": 111089560, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.33349609, "step": 5173, "time_per_iteration": 4.041758060455322 }, { "auxiliary_loss_clip": 0.01582644, "auxiliary_loss_mlp": 0.00251323, "balance_loss_clip": 1.27724814, "balance_loss_mlp": 0.2194462, "epoch": 0.31107770930407336, "flos": 25739655724800.0, "grad_norm": 7.448617145915292, "language_loss": 0.88836068, "learning_rate": 3.226912425313001e-06, "loss": 0.90670037, "num_input_tokens_seen": 111109960, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.31884766, "step": 5174, "time_per_iteration": 2.6568233966827393 }, { "auxiliary_loss_clip": 0.01599766, "auxiliary_loss_mlp": 0.00280486, "balance_loss_clip": 1.28613877, "balance_loss_mlp": 0.24503322, "epoch": 0.3111378325567413, "flos": 19208259118080.0, "grad_norm": 165.09183700616617, "language_loss": 0.9080025, "learning_rate": 3.2266048329715183e-06, "loss": 0.92680502, "num_input_tokens_seen": 111127960, "router_z_loss_clip": 3.13867188, "router_z_loss_mlp": 0.35449219, "step": 5175, "time_per_iteration": 2.6088955402374268 }, { "auxiliary_loss_clip": 0.0157846, "auxiliary_loss_mlp": 0.00256432, "balance_loss_clip": 1.2823689, "balance_loss_mlp": 0.22760725, "epoch": 0.3111979558094093, "flos": 23696374561920.0, "grad_norm": 16.43425317141788, "language_loss": 0.90578938, "learning_rate": 3.2262971941170575e-06, "loss": 0.92413831, "num_input_tokens_seen": 111146730, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.28808594, "step": 5176, "time_per_iteration": 2.6939449310302734 }, { "auxiliary_loss_clip": 0.01556554, "auxiliary_loss_mlp": 0.00267915, "balance_loss_clip": 1.25739765, "balance_loss_mlp": 0.23317754, "epoch": 0.31125807906207725, "flos": 21033023892480.0, "grad_norm": 5.142249104474445, "language_loss": 0.87042367, "learning_rate": 3.2259895087612837e-06, "loss": 0.8886683, "num_input_tokens_seen": 111166295, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.34716797, "step": 5177, "time_per_iteration": 2.7073006629943848 }, { "auxiliary_loss_clip": 0.0156029, "auxiliary_loss_mlp": 0.00260576, "balance_loss_clip": 1.25952101, "balance_loss_mlp": 0.22624353, "epoch": 0.3113182023147452, "flos": 23076628277760.0, "grad_norm": 4.190502584350524, "language_loss": 0.88362288, "learning_rate": 3.2256817769158657e-06, "loss": 0.90183151, "num_input_tokens_seen": 111185665, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.34350586, "step": 5178, "time_per_iteration": 2.6452620029449463 }, { "auxiliary_loss_clip": 0.01568268, "auxiliary_loss_mlp": 0.00283455, "balance_loss_clip": 1.2657876, "balance_loss_mlp": 0.25060102, "epoch": 0.3113783255674132, "flos": 11838994888320.0, "grad_norm": 9.737249328263367, "language_loss": 0.89126682, "learning_rate": 3.225373998592471e-06, "loss": 0.90978408, "num_input_tokens_seen": 111201615, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.32861328, "step": 5179, "time_per_iteration": 2.68013596534729 }, { "auxiliary_loss_clip": 0.01550263, "auxiliary_loss_mlp": 0.00265424, "balance_loss_clip": 1.25832403, "balance_loss_mlp": 0.23376164, "epoch": 0.31143844882008115, "flos": 16289547684480.0, "grad_norm": 6.491229250897338, "language_loss": 0.85819787, "learning_rate": 3.2250661738027715e-06, "loss": 0.87635469, "num_input_tokens_seen": 111220515, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.31665039, "step": 5180, "time_per_iteration": 3.9969730377197266 }, { "auxiliary_loss_clip": 0.01566438, "auxiliary_loss_mlp": 0.00253022, "balance_loss_clip": 1.26688254, "balance_loss_mlp": 0.22090672, "epoch": 0.3114985720727491, "flos": 23217792727680.0, "grad_norm": 4.8323856041392075, "language_loss": 0.89711767, "learning_rate": 3.22475830255844e-06, "loss": 0.91531229, "num_input_tokens_seen": 111240395, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.32128906, "step": 5181, "time_per_iteration": 2.6742959022521973 }, { "auxiliary_loss_clip": 0.015453, "auxiliary_loss_mlp": 0.00235394, "balance_loss_clip": 1.25412965, "balance_loss_mlp": 0.2081185, "epoch": 0.3115586953254171, "flos": 30044626698240.0, "grad_norm": 3.27502853085985, "language_loss": 0.82114053, "learning_rate": 3.2244503848711516e-06, "loss": 0.83894742, "num_input_tokens_seen": 111261100, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.27270508, "step": 5182, "time_per_iteration": 2.704859733581543 }, { "auxiliary_loss_clip": 0.01585708, "auxiliary_loss_mlp": 0.00295127, "balance_loss_clip": 1.28105307, "balance_loss_mlp": 0.26184365, "epoch": 0.3116188185780851, "flos": 25666326109440.0, "grad_norm": 5.1830296074157145, "language_loss": 0.78651512, "learning_rate": 3.2241424207525815e-06, "loss": 0.80532348, "num_input_tokens_seen": 111281320, "router_z_loss_clip": 3.046875, "router_z_loss_mlp": 0.33276367, "step": 5183, "time_per_iteration": 2.685634136199951 }, { "auxiliary_loss_clip": 0.01535261, "auxiliary_loss_mlp": 0.00063168, "balance_loss_clip": 1.34594262, "balance_loss_mlp": 0.05572958, "epoch": 0.31167894183075306, "flos": 69510058917120.0, "grad_norm": 0.9303002617195502, "language_loss": 0.58773911, "learning_rate": 3.223834410214408e-06, "loss": 0.60372341, "num_input_tokens_seen": 111341405, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.07421875, "step": 5184, "time_per_iteration": 3.123053789138794 }, { "auxiliary_loss_clip": 0.0156757, "auxiliary_loss_mlp": 0.00304107, "balance_loss_clip": 1.26794672, "balance_loss_mlp": 0.2717773, "epoch": 0.31173906508342103, "flos": 14939845211520.0, "grad_norm": 9.75270172430182, "language_loss": 0.77644271, "learning_rate": 3.223526353268311e-06, "loss": 0.79515946, "num_input_tokens_seen": 111358975, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.32324219, "step": 5185, "time_per_iteration": 2.729487419128418 }, { "auxiliary_loss_clip": 0.01583509, "auxiliary_loss_mlp": 0.0028415, "balance_loss_clip": 1.27464485, "balance_loss_mlp": 0.25217772, "epoch": 0.311799188336089, "flos": 16176033728640.0, "grad_norm": 11.919077637816148, "language_loss": 0.72948694, "learning_rate": 3.2232182499259725e-06, "loss": 0.74816352, "num_input_tokens_seen": 111375845, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.31982422, "step": 5186, "time_per_iteration": 2.6438100337982178 }, { "auxiliary_loss_clip": 0.01582874, "auxiliary_loss_mlp": 0.00306536, "balance_loss_clip": 1.27580512, "balance_loss_mlp": 0.27203709, "epoch": 0.31185931158875696, "flos": 25009627708800.0, "grad_norm": 32.8214145283845, "language_loss": 0.94594038, "learning_rate": 3.2229101001990747e-06, "loss": 0.96483451, "num_input_tokens_seen": 111394150, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.34521484, "step": 5187, "time_per_iteration": 2.643606662750244 }, { "auxiliary_loss_clip": 0.01567998, "auxiliary_loss_mlp": 0.00287149, "balance_loss_clip": 1.27199888, "balance_loss_mlp": 0.25738221, "epoch": 0.3119194348414249, "flos": 37232901273600.0, "grad_norm": 24.407413303054483, "language_loss": 0.68841392, "learning_rate": 3.2226019040993036e-06, "loss": 0.70696539, "num_input_tokens_seen": 111418355, "router_z_loss_clip": 2.9609375, "router_z_loss_mlp": 0.29748535, "step": 5188, "time_per_iteration": 2.824032783508301 }, { "auxiliary_loss_clip": 0.0155469, "auxiliary_loss_mlp": 0.00274282, "balance_loss_clip": 1.26094031, "balance_loss_mlp": 0.24314414, "epoch": 0.3119795580940929, "flos": 15012779777280.0, "grad_norm": 5.081758967932617, "language_loss": 0.90469623, "learning_rate": 3.222293661638346e-06, "loss": 0.92298603, "num_input_tokens_seen": 111435445, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.3112793, "step": 5189, "time_per_iteration": 2.6271965503692627 }, { "auxiliary_loss_clip": 0.01538654, "auxiliary_loss_mlp": 0.00280739, "balance_loss_clip": 1.25239718, "balance_loss_mlp": 0.25172368, "epoch": 0.31203968134676086, "flos": 15998168557440.0, "grad_norm": 6.4056594205872734, "language_loss": 0.85207421, "learning_rate": 3.22198537282789e-06, "loss": 0.87026817, "num_input_tokens_seen": 111453430, "router_z_loss_clip": 2.86328125, "router_z_loss_mlp": 0.29003906, "step": 5190, "time_per_iteration": 2.6388747692108154 }, { "auxiliary_loss_clip": 0.01551084, "auxiliary_loss_mlp": 0.00280086, "balance_loss_clip": 1.2560811, "balance_loss_mlp": 0.24911526, "epoch": 0.3120998045994288, "flos": 23837359443840.0, "grad_norm": 1.761036811608107, "language_loss": 0.80473322, "learning_rate": 3.2216770376796262e-06, "loss": 0.8230449, "num_input_tokens_seen": 111475325, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.30932617, "step": 5191, "time_per_iteration": 2.6815295219421387 }, { "auxiliary_loss_clip": 0.01515095, "auxiliary_loss_mlp": 0.00046679, "balance_loss_clip": 1.33716488, "balance_loss_mlp": 0.04088529, "epoch": 0.3121599278520968, "flos": 69184205712000.0, "grad_norm": 0.871003740614933, "language_loss": 0.64012301, "learning_rate": 3.221368656205247e-06, "loss": 0.65574074, "num_input_tokens_seen": 111533960, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.05786133, "step": 5192, "time_per_iteration": 3.226191520690918 }, { "auxiliary_loss_clip": 0.01570018, "auxiliary_loss_mlp": 0.003037, "balance_loss_clip": 1.26493716, "balance_loss_mlp": 0.26989177, "epoch": 0.31222005110476475, "flos": 23806368984960.0, "grad_norm": 18.05986348928334, "language_loss": 0.86403871, "learning_rate": 3.221060228416446e-06, "loss": 0.8827759, "num_input_tokens_seen": 111554055, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.33813477, "step": 5193, "time_per_iteration": 2.6632587909698486 }, { "auxiliary_loss_clip": 0.01566551, "auxiliary_loss_mlp": 0.0029268, "balance_loss_clip": 1.26340222, "balance_loss_mlp": 0.25970715, "epoch": 0.3122801743574327, "flos": 25226132935680.0, "grad_norm": 2.5964703345403053, "language_loss": 0.80805731, "learning_rate": 3.2207517543249183e-06, "loss": 0.82664961, "num_input_tokens_seen": 111574305, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.32983398, "step": 5194, "time_per_iteration": 2.6951637268066406 }, { "auxiliary_loss_clip": 0.01575031, "auxiliary_loss_mlp": 0.00305042, "balance_loss_clip": 1.27567017, "balance_loss_mlp": 0.27400008, "epoch": 0.3123402976101007, "flos": 22966490200320.0, "grad_norm": 11.33058440580533, "language_loss": 0.81282032, "learning_rate": 3.2204432339423616e-06, "loss": 0.83162105, "num_input_tokens_seen": 111595680, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.3104248, "step": 5195, "time_per_iteration": 2.6969006061553955 }, { "auxiliary_loss_clip": 0.01573573, "auxiliary_loss_mlp": 0.00311501, "balance_loss_clip": 1.27441549, "balance_loss_mlp": 0.27688217, "epoch": 0.3124004208627687, "flos": 25192089820800.0, "grad_norm": 1.8002084226838486, "language_loss": 0.8360076, "learning_rate": 3.220134667280476e-06, "loss": 0.85485834, "num_input_tokens_seen": 111618135, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.34594727, "step": 5196, "time_per_iteration": 2.687711715698242 }, { "auxiliary_loss_clip": 0.01489779, "auxiliary_loss_mlp": 0.00045197, "balance_loss_clip": 1.31168723, "balance_loss_mlp": 0.03856891, "epoch": 0.31246054411543667, "flos": 67485165517440.0, "grad_norm": 0.7982790604436729, "language_loss": 0.54670632, "learning_rate": 3.2198260543509613e-06, "loss": 0.56205606, "num_input_tokens_seen": 111682220, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.06640625, "step": 5197, "time_per_iteration": 3.174241065979004 }, { "auxiliary_loss_clip": 0.01565524, "auxiliary_loss_mlp": 0.00299693, "balance_loss_clip": 1.2709527, "balance_loss_mlp": 0.2678169, "epoch": 0.31252066736810463, "flos": 17858520731520.0, "grad_norm": 98.707273646863, "language_loss": 0.73054367, "learning_rate": 3.21951739516552e-06, "loss": 0.74919581, "num_input_tokens_seen": 111700815, "router_z_loss_clip": 2.94726562, "router_z_loss_mlp": 0.31933594, "step": 5198, "time_per_iteration": 2.612720251083374 }, { "auxiliary_loss_clip": 0.0159603, "auxiliary_loss_mlp": 0.00361434, "balance_loss_clip": 1.28628087, "balance_loss_mlp": 0.32393116, "epoch": 0.3125807906207726, "flos": 18475034791680.0, "grad_norm": 32.72477416310552, "language_loss": 0.77300107, "learning_rate": 3.219208689735857e-06, "loss": 0.79257572, "num_input_tokens_seen": 111718195, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.375, "step": 5199, "time_per_iteration": 2.621884346008301 }, { "auxiliary_loss_clip": 0.01573598, "auxiliary_loss_mlp": 0.00330266, "balance_loss_clip": 1.26907074, "balance_loss_mlp": 0.2964586, "epoch": 0.31264091387344056, "flos": 18946541646720.0, "grad_norm": 3.240837301615117, "language_loss": 0.8448236, "learning_rate": 3.2188999380736785e-06, "loss": 0.86386216, "num_input_tokens_seen": 111734440, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.33813477, "step": 5200, "time_per_iteration": 2.6037113666534424 }, { "auxiliary_loss_clip": 0.01545537, "auxiliary_loss_mlp": 0.00282956, "balance_loss_clip": 1.25757027, "balance_loss_mlp": 0.25184235, "epoch": 0.3127010371261085, "flos": 21468512384640.0, "grad_norm": 9.097104911077793, "language_loss": 0.90731728, "learning_rate": 3.2185911401906917e-06, "loss": 0.92560214, "num_input_tokens_seen": 111751960, "router_z_loss_clip": 2.88085938, "router_z_loss_mlp": 0.31103516, "step": 5201, "time_per_iteration": 2.650531768798828 }, { "auxiliary_loss_clip": 0.01587552, "auxiliary_loss_mlp": 0.00345337, "balance_loss_clip": 1.27403831, "balance_loss_mlp": 0.30940688, "epoch": 0.3127611603787765, "flos": 15336047203200.0, "grad_norm": 57.814239026403655, "language_loss": 0.74651515, "learning_rate": 3.2182822960986072e-06, "loss": 0.76584399, "num_input_tokens_seen": 111769585, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.35961914, "step": 5202, "time_per_iteration": 2.6267590522766113 }, { "auxiliary_loss_clip": 0.01562566, "auxiliary_loss_mlp": 0.0034026, "balance_loss_clip": 1.25439453, "balance_loss_mlp": 0.30734628, "epoch": 0.31282128363144446, "flos": 17602980399360.0, "grad_norm": 40.98880632081559, "language_loss": 0.90654284, "learning_rate": 3.2179734058091358e-06, "loss": 0.92557114, "num_input_tokens_seen": 111787880, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.32897949, "step": 5203, "time_per_iteration": 2.608332633972168 }, { "auxiliary_loss_clip": 0.01585177, "auxiliary_loss_mlp": 0.00347937, "balance_loss_clip": 1.27731025, "balance_loss_mlp": 0.31501126, "epoch": 0.3128814068841124, "flos": 26756753235840.0, "grad_norm": 27.07678771875222, "language_loss": 0.67452812, "learning_rate": 3.2176644693339913e-06, "loss": 0.69385922, "num_input_tokens_seen": 111805950, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.32910156, "step": 5204, "time_per_iteration": 2.82592511177063 }, { "auxiliary_loss_clip": 0.01560347, "auxiliary_loss_mlp": 0.00336455, "balance_loss_clip": 1.26175284, "balance_loss_mlp": 0.30499542, "epoch": 0.3129415301367804, "flos": 22272372806400.0, "grad_norm": 5.878566828184419, "language_loss": 0.7148664, "learning_rate": 3.217355486684887e-06, "loss": 0.73383445, "num_input_tokens_seen": 111826135, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.3145752, "step": 5205, "time_per_iteration": 2.696436882019043 }, { "auxiliary_loss_clip": 0.0156741, "auxiliary_loss_mlp": 0.00351729, "balance_loss_clip": 1.25480759, "balance_loss_mlp": 0.31663379, "epoch": 0.31300165338944835, "flos": 26464907232000.0, "grad_norm": 236.6667865399722, "language_loss": 0.80516785, "learning_rate": 3.2170464578735414e-06, "loss": 0.82435924, "num_input_tokens_seen": 111844700, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.35107422, "step": 5206, "time_per_iteration": 2.6967172622680664 }, { "auxiliary_loss_clip": 0.01570888, "auxiliary_loss_mlp": 0.00323054, "balance_loss_clip": 1.27014577, "balance_loss_mlp": 0.29146352, "epoch": 0.3130617766421163, "flos": 21944652094080.0, "grad_norm": 66.44320519251114, "language_loss": 0.9093442, "learning_rate": 3.216737382911672e-06, "loss": 0.92828369, "num_input_tokens_seen": 111861585, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.31591797, "step": 5207, "time_per_iteration": 2.633481979370117 }, { "auxiliary_loss_clip": 0.01564815, "auxiliary_loss_mlp": 0.00319178, "balance_loss_clip": 1.25893354, "balance_loss_mlp": 0.28665805, "epoch": 0.3131218998947843, "flos": 23292774368640.0, "grad_norm": 2.0333511446016277, "language_loss": 0.76793182, "learning_rate": 3.216428261810999e-06, "loss": 0.78677177, "num_input_tokens_seen": 111882950, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.32495117, "step": 5208, "time_per_iteration": 4.243360996246338 }, { "auxiliary_loss_clip": 0.01576431, "auxiliary_loss_mlp": 0.00353737, "balance_loss_clip": 1.27273607, "balance_loss_mlp": 0.31876129, "epoch": 0.3131820231474523, "flos": 21139642437120.0, "grad_norm": 187.71046542974085, "language_loss": 0.80423236, "learning_rate": 3.2161190945832445e-06, "loss": 0.82353407, "num_input_tokens_seen": 111901640, "router_z_loss_clip": 3.03710938, "router_z_loss_mlp": 0.34960938, "step": 5209, "time_per_iteration": 2.6728968620300293 }, { "auxiliary_loss_clip": 0.01552862, "auxiliary_loss_mlp": 0.00331102, "balance_loss_clip": 1.2512486, "balance_loss_mlp": 0.30094165, "epoch": 0.31324214640012027, "flos": 23909863046400.0, "grad_norm": 9.54085365953534, "language_loss": 0.82424819, "learning_rate": 3.2158098812401325e-06, "loss": 0.84308779, "num_input_tokens_seen": 111919615, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.30175781, "step": 5210, "time_per_iteration": 2.7409324645996094 }, { "auxiliary_loss_clip": 0.01541549, "auxiliary_loss_mlp": 0.00326715, "balance_loss_clip": 1.24574471, "balance_loss_mlp": 0.2947669, "epoch": 0.31330226965278823, "flos": 22236929061120.0, "grad_norm": 72.44506270297991, "language_loss": 0.85031897, "learning_rate": 3.2155006217933874e-06, "loss": 0.86900163, "num_input_tokens_seen": 111938485, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.31958008, "step": 5211, "time_per_iteration": 4.0619893074035645 }, { "auxiliary_loss_clip": 0.01550021, "auxiliary_loss_mlp": 0.0034268, "balance_loss_clip": 1.24297941, "balance_loss_mlp": 0.30958709, "epoch": 0.3133623929054562, "flos": 19753993428480.0, "grad_norm": 17.983163519680392, "language_loss": 0.83994037, "learning_rate": 3.2151913162547367e-06, "loss": 0.85886741, "num_input_tokens_seen": 111956425, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.33081055, "step": 5212, "time_per_iteration": 2.65610408782959 }, { "auxiliary_loss_clip": 0.01588319, "auxiliary_loss_mlp": 0.00364967, "balance_loss_clip": 1.27769744, "balance_loss_mlp": 0.32963297, "epoch": 0.31342251615812416, "flos": 27162256849920.0, "grad_norm": 10.277286245749874, "language_loss": 0.79345858, "learning_rate": 3.2148819646359097e-06, "loss": 0.8129915, "num_input_tokens_seen": 111975915, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.35327148, "step": 5213, "time_per_iteration": 2.7141189575195312 }, { "auxiliary_loss_clip": 0.01583919, "auxiliary_loss_mlp": 0.00353294, "balance_loss_clip": 1.26442289, "balance_loss_mlp": 0.31927124, "epoch": 0.31348263941079213, "flos": 20229809915520.0, "grad_norm": 8.17063003571594, "language_loss": 0.84714413, "learning_rate": 3.2145725669486374e-06, "loss": 0.86651623, "num_input_tokens_seen": 111995055, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.34033203, "step": 5214, "time_per_iteration": 2.6915481090545654 }, { "auxiliary_loss_clip": 0.01561726, "auxiliary_loss_mlp": 0.00375722, "balance_loss_clip": 1.26056516, "balance_loss_mlp": 0.34355953, "epoch": 0.3135427626634601, "flos": 24607643627520.0, "grad_norm": 6.32314438572992, "language_loss": 0.88344622, "learning_rate": 3.2142631232046517e-06, "loss": 0.90282071, "num_input_tokens_seen": 112015830, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.3215332, "step": 5215, "time_per_iteration": 4.2121241092681885 }, { "auxiliary_loss_clip": 0.01565273, "auxiliary_loss_mlp": 0.00368261, "balance_loss_clip": 1.25413227, "balance_loss_mlp": 0.33164024, "epoch": 0.31360288591612806, "flos": 20959873845120.0, "grad_norm": 8.893307872385545, "language_loss": 0.85902059, "learning_rate": 3.213953633415686e-06, "loss": 0.87835598, "num_input_tokens_seen": 112035065, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.36621094, "step": 5216, "time_per_iteration": 2.6484603881835938 }, { "auxiliary_loss_clip": 0.0159409, "auxiliary_loss_mlp": 0.00411217, "balance_loss_clip": 1.27027833, "balance_loss_mlp": 0.37330818, "epoch": 0.313663009168796, "flos": 26980513009920.0, "grad_norm": 18.69557172570863, "language_loss": 0.74082577, "learning_rate": 3.213644097593477e-06, "loss": 0.76087892, "num_input_tokens_seen": 112058405, "router_z_loss_clip": 3.24023438, "router_z_loss_mlp": 0.37915039, "step": 5217, "time_per_iteration": 2.6815922260284424 }, { "auxiliary_loss_clip": 0.01596499, "auxiliary_loss_mlp": 0.00349597, "balance_loss_clip": 1.27640843, "balance_loss_mlp": 0.31745842, "epoch": 0.313723132421464, "flos": 18040911016320.0, "grad_norm": 11.705595127272716, "language_loss": 0.85644126, "learning_rate": 3.2133345157497624e-06, "loss": 0.87590218, "num_input_tokens_seen": 112076420, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.32128906, "step": 5218, "time_per_iteration": 2.6391115188598633 }, { "auxiliary_loss_clip": 0.01593821, "auxiliary_loss_mlp": 0.00395909, "balance_loss_clip": 1.27681339, "balance_loss_mlp": 0.35903746, "epoch": 0.31378325567413196, "flos": 22488913946880.0, "grad_norm": 15.874475603497084, "language_loss": 0.77620637, "learning_rate": 3.2130248878962813e-06, "loss": 0.79610372, "num_input_tokens_seen": 112090775, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.36877441, "step": 5219, "time_per_iteration": 2.6229169368743896 }, { "auxiliary_loss_clip": 0.016061, "auxiliary_loss_mlp": 0.00378652, "balance_loss_clip": 1.27729166, "balance_loss_mlp": 0.34517795, "epoch": 0.3138433789267999, "flos": 22419247518720.0, "grad_norm": 28.311932080484016, "language_loss": 0.85818166, "learning_rate": 3.2127152140447747e-06, "loss": 0.87802917, "num_input_tokens_seen": 112110980, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.3347168, "step": 5220, "time_per_iteration": 2.711939811706543 }, { "auxiliary_loss_clip": 0.01582905, "auxiliary_loss_mlp": 0.003944, "balance_loss_clip": 1.26456118, "balance_loss_mlp": 0.35842302, "epoch": 0.3139035021794679, "flos": 13005912026880.0, "grad_norm": 3.830634630104383, "language_loss": 0.7972911, "learning_rate": 3.212405494206986e-06, "loss": 0.81706417, "num_input_tokens_seen": 112129020, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.35986328, "step": 5221, "time_per_iteration": 2.6668009757995605 }, { "auxiliary_loss_clip": 0.0158065, "auxiliary_loss_mlp": 0.00342856, "balance_loss_clip": 1.26559639, "balance_loss_mlp": 0.31150359, "epoch": 0.31396362543213585, "flos": 16945994689920.0, "grad_norm": 15.914581925145113, "language_loss": 0.88432026, "learning_rate": 3.2120957283946588e-06, "loss": 0.90355539, "num_input_tokens_seen": 112147865, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.31347656, "step": 5222, "time_per_iteration": 3.961674213409424 }, { "auxiliary_loss_clip": 0.01605619, "auxiliary_loss_mlp": 0.00418937, "balance_loss_clip": 1.2729243, "balance_loss_mlp": 0.37990832, "epoch": 0.31402374868480387, "flos": 20156731695360.0, "grad_norm": 8.453588701086954, "language_loss": 0.77383476, "learning_rate": 3.2117859166195407e-06, "loss": 0.79408026, "num_input_tokens_seen": 112166745, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.39013672, "step": 5223, "time_per_iteration": 2.6180508136749268 }, { "auxiliary_loss_clip": 0.0157308, "auxiliary_loss_mlp": 0.00377336, "balance_loss_clip": 1.25338483, "balance_loss_mlp": 0.3448635, "epoch": 0.31408387193747184, "flos": 21251073404160.0, "grad_norm": 17.81485908202575, "language_loss": 0.85841262, "learning_rate": 3.211476058893379e-06, "loss": 0.87791681, "num_input_tokens_seen": 112185895, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.32446289, "step": 5224, "time_per_iteration": 2.6553549766540527 }, { "auxiliary_loss_clip": 0.01595538, "auxiliary_loss_mlp": 0.00436582, "balance_loss_clip": 1.26422858, "balance_loss_mlp": 0.39993745, "epoch": 0.3141439951901398, "flos": 27484267299840.0, "grad_norm": 13.449480919213634, "language_loss": 0.63997406, "learning_rate": 3.2111661552279243e-06, "loss": 0.66029525, "num_input_tokens_seen": 112204465, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.36645508, "step": 5225, "time_per_iteration": 2.7360916137695312 }, { "auxiliary_loss_clip": 0.01581466, "auxiliary_loss_mlp": 0.00378835, "balance_loss_clip": 1.26590276, "balance_loss_mlp": 0.34519389, "epoch": 0.31420411844280777, "flos": 17852235851520.0, "grad_norm": 4.747996672424668, "language_loss": 0.88170809, "learning_rate": 3.2108562056349273e-06, "loss": 0.90131104, "num_input_tokens_seen": 112221635, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.33618164, "step": 5226, "time_per_iteration": 2.7337799072265625 }, { "auxiliary_loss_clip": 0.0160751, "auxiliary_loss_mlp": 0.00389138, "balance_loss_clip": 1.27733278, "balance_loss_mlp": 0.35280341, "epoch": 0.31426424169547573, "flos": 21616967295360.0, "grad_norm": 24.80659790215287, "language_loss": 0.81203043, "learning_rate": 3.210546210126141e-06, "loss": 0.83199692, "num_input_tokens_seen": 112241240, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.36328125, "step": 5227, "time_per_iteration": 2.700890302658081 }, { "auxiliary_loss_clip": 0.01614096, "auxiliary_loss_mlp": 0.00414152, "balance_loss_clip": 1.28742194, "balance_loss_mlp": 0.37822258, "epoch": 0.3143243649481437, "flos": 30920631586560.0, "grad_norm": 2.9576866032803193, "language_loss": 0.74516571, "learning_rate": 3.2102361687133213e-06, "loss": 0.76544815, "num_input_tokens_seen": 112262350, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.359375, "step": 5228, "time_per_iteration": 2.719413995742798 }, { "auxiliary_loss_clip": 0.01597505, "auxiliary_loss_mlp": 0.00366744, "balance_loss_clip": 1.27437544, "balance_loss_mlp": 0.33501083, "epoch": 0.31438448820081166, "flos": 22821411168000.0, "grad_norm": 25.19554704668356, "language_loss": 0.86171788, "learning_rate": 3.2099260814082254e-06, "loss": 0.88136041, "num_input_tokens_seen": 112283710, "router_z_loss_clip": 3.234375, "router_z_loss_mlp": 0.31726074, "step": 5229, "time_per_iteration": 2.6470229625701904 }, { "auxiliary_loss_clip": 0.01574637, "auxiliary_loss_mlp": 0.00388222, "balance_loss_clip": 1.26099336, "balance_loss_mlp": 0.35570192, "epoch": 0.3144446114534796, "flos": 23292127923840.0, "grad_norm": 12.743496154067719, "language_loss": 0.75659299, "learning_rate": 3.209615948222611e-06, "loss": 0.77622151, "num_input_tokens_seen": 112304285, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.32519531, "step": 5230, "time_per_iteration": 2.6802783012390137 }, { "auxiliary_loss_clip": 0.01570047, "auxiliary_loss_mlp": 0.00407857, "balance_loss_clip": 1.24995947, "balance_loss_mlp": 0.37209386, "epoch": 0.3145047347061476, "flos": 31355976424320.0, "grad_norm": 62.039945714098884, "language_loss": 0.83981675, "learning_rate": 3.209305769168239e-06, "loss": 0.85959578, "num_input_tokens_seen": 112325110, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.35766602, "step": 5231, "time_per_iteration": 2.8813371658325195 }, { "auxiliary_loss_clip": 0.01588985, "auxiliary_loss_mlp": 0.00363853, "balance_loss_clip": 1.26517534, "balance_loss_mlp": 0.33123687, "epoch": 0.31456485795881556, "flos": 10889552643840.0, "grad_norm": 14.596569994018306, "language_loss": 0.9090485, "learning_rate": 3.2089955442568704e-06, "loss": 0.92857683, "num_input_tokens_seen": 112339855, "router_z_loss_clip": 3.23828125, "router_z_loss_mlp": 0.32617188, "step": 5232, "time_per_iteration": 2.7140023708343506 }, { "auxiliary_loss_clip": 0.01554126, "auxiliary_loss_mlp": 0.00342255, "balance_loss_clip": 1.23788214, "balance_loss_mlp": 0.30972308, "epoch": 0.3146249812114835, "flos": 17092438439040.0, "grad_norm": 5.590829153433804, "language_loss": 0.85758239, "learning_rate": 3.2086852735002692e-06, "loss": 0.87654614, "num_input_tokens_seen": 112358480, "router_z_loss_clip": 3.16601562, "router_z_loss_mlp": 0.32531738, "step": 5233, "time_per_iteration": 2.6448991298675537 }, { "auxiliary_loss_clip": 0.01608428, "auxiliary_loss_mlp": 0.00388972, "balance_loss_clip": 1.26924682, "balance_loss_mlp": 0.35392439, "epoch": 0.3146851044641515, "flos": 55291442889600.0, "grad_norm": 33.10692676712518, "language_loss": 0.77559513, "learning_rate": 3.2083749569102024e-06, "loss": 0.79556912, "num_input_tokens_seen": 112382350, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.3503418, "step": 5234, "time_per_iteration": 2.9477930068969727 }, { "auxiliary_loss_clip": 0.01582194, "auxiliary_loss_mlp": 0.00386387, "balance_loss_clip": 1.25375581, "balance_loss_mlp": 0.3520309, "epoch": 0.31474522771681945, "flos": 27015884928000.0, "grad_norm": 1.8445233677597146, "language_loss": 0.78115696, "learning_rate": 3.2080645944984356e-06, "loss": 0.8008427, "num_input_tokens_seen": 112400260, "router_z_loss_clip": 3.28515625, "router_z_loss_mlp": 0.34375, "step": 5235, "time_per_iteration": 2.7089805603027344 }, { "auxiliary_loss_clip": 0.0155209, "auxiliary_loss_mlp": 0.00356474, "balance_loss_clip": 1.23748267, "balance_loss_mlp": 0.32452568, "epoch": 0.3148053509694875, "flos": 21251935330560.0, "grad_norm": 114.95642334395872, "language_loss": 0.84551239, "learning_rate": 3.2077541862767384e-06, "loss": 0.86459804, "num_input_tokens_seen": 112419400, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.31933594, "step": 5236, "time_per_iteration": 2.6647696495056152 }, { "auxiliary_loss_clip": 0.01583318, "auxiliary_loss_mlp": 0.00402092, "balance_loss_clip": 1.25083733, "balance_loss_mlp": 0.36585268, "epoch": 0.31486547422215544, "flos": 31248675521280.0, "grad_norm": 34.92864178311655, "language_loss": 0.81239069, "learning_rate": 3.207443732256881e-06, "loss": 0.83224475, "num_input_tokens_seen": 112440825, "router_z_loss_clip": 3.32421875, "router_z_loss_mlp": 0.36230469, "step": 5237, "time_per_iteration": 2.725630521774292 }, { "auxiliary_loss_clip": 0.01540856, "auxiliary_loss_mlp": 0.00340787, "balance_loss_clip": 1.2398206, "balance_loss_mlp": 0.31000701, "epoch": 0.3149255974748234, "flos": 19828615933440.0, "grad_norm": 5.032117702576191, "language_loss": 0.84132707, "learning_rate": 3.2071332324506372e-06, "loss": 0.86014354, "num_input_tokens_seen": 112459180, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.30773926, "step": 5238, "time_per_iteration": 2.6392412185668945 }, { "auxiliary_loss_clip": 0.01566415, "auxiliary_loss_mlp": 0.00076315, "balance_loss_clip": 1.36894107, "balance_loss_mlp": 0.06372621, "epoch": 0.31498572072749137, "flos": 67683965339520.0, "grad_norm": 0.8304693781228705, "language_loss": 0.68196237, "learning_rate": 3.2068226868697795e-06, "loss": 0.69838965, "num_input_tokens_seen": 112516680, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.12597656, "step": 5239, "time_per_iteration": 3.1590588092803955 }, { "auxiliary_loss_clip": 0.01594141, "auxiliary_loss_mlp": 0.0041532, "balance_loss_clip": 1.26081502, "balance_loss_mlp": 0.37822217, "epoch": 0.31504584398015933, "flos": 19793136274560.0, "grad_norm": 10.161030349840265, "language_loss": 0.88503206, "learning_rate": 3.2065120955260846e-06, "loss": 0.90512669, "num_input_tokens_seen": 112535895, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.37109375, "step": 5240, "time_per_iteration": 2.6952526569366455 }, { "auxiliary_loss_clip": 0.01574235, "auxiliary_loss_mlp": 0.00390882, "balance_loss_clip": 1.25756848, "balance_loss_mlp": 0.3583374, "epoch": 0.3151059672328273, "flos": 26615409217920.0, "grad_norm": 22.17738384463229, "language_loss": 0.86047637, "learning_rate": 3.2062014584313302e-06, "loss": 0.88012755, "num_input_tokens_seen": 112557490, "router_z_loss_clip": 3.16601562, "router_z_loss_mlp": 0.32519531, "step": 5241, "time_per_iteration": 2.7122690677642822 }, { "auxiliary_loss_clip": 0.01567036, "auxiliary_loss_mlp": 0.00388371, "balance_loss_clip": 1.25479567, "balance_loss_mlp": 0.35441977, "epoch": 0.31516609048549526, "flos": 24204438483840.0, "grad_norm": 9.88145355077178, "language_loss": 0.80073357, "learning_rate": 3.2058907755972956e-06, "loss": 0.82028764, "num_input_tokens_seen": 112577075, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.33935547, "step": 5242, "time_per_iteration": 2.7351906299591064 }, { "auxiliary_loss_clip": 0.0155379, "auxiliary_loss_mlp": 0.00366329, "balance_loss_clip": 1.25166118, "balance_loss_mlp": 0.33423766, "epoch": 0.31522621373816323, "flos": 25958710817280.0, "grad_norm": 602.069171735207, "language_loss": 0.79684192, "learning_rate": 3.2055800470357626e-06, "loss": 0.81604314, "num_input_tokens_seen": 112597620, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.32104492, "step": 5243, "time_per_iteration": 2.682337522506714 }, { "auxiliary_loss_clip": 0.01585972, "auxiliary_loss_mlp": 0.0038423, "balance_loss_clip": 1.26024437, "balance_loss_mlp": 0.35161415, "epoch": 0.3152863369908312, "flos": 21908813299200.0, "grad_norm": 4.672459064291928, "language_loss": 0.71679115, "learning_rate": 3.205269272758513e-06, "loss": 0.73649323, "num_input_tokens_seen": 112617150, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.3260498, "step": 5244, "time_per_iteration": 2.634270668029785 }, { "auxiliary_loss_clip": 0.01589141, "auxiliary_loss_mlp": 0.00387039, "balance_loss_clip": 1.258991, "balance_loss_mlp": 0.35361266, "epoch": 0.31534646024349916, "flos": 16281072074880.0, "grad_norm": 4.733767879627094, "language_loss": 0.96493948, "learning_rate": 3.2049584527773313e-06, "loss": 0.98470116, "num_input_tokens_seen": 112631090, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.33398438, "step": 5245, "time_per_iteration": 2.5917203426361084 }, { "auxiliary_loss_clip": 0.01606719, "auxiliary_loss_mlp": 0.00410772, "balance_loss_clip": 1.27477503, "balance_loss_mlp": 0.37610599, "epoch": 0.3154065834961671, "flos": 24717243000960.0, "grad_norm": 5.6498106886401676, "language_loss": 0.80599135, "learning_rate": 3.2046475871040048e-06, "loss": 0.82616627, "num_input_tokens_seen": 112651220, "router_z_loss_clip": 3.31835938, "router_z_loss_mlp": 0.34643555, "step": 5246, "time_per_iteration": 2.662494659423828 }, { "auxiliary_loss_clip": 0.01592328, "auxiliary_loss_mlp": 0.0040225, "balance_loss_clip": 1.2627691, "balance_loss_mlp": 0.36639142, "epoch": 0.3154667067488351, "flos": 35371148469120.0, "grad_norm": 44.193181987117775, "language_loss": 0.66926587, "learning_rate": 3.204336675750321e-06, "loss": 0.68921167, "num_input_tokens_seen": 112671560, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.35864258, "step": 5247, "time_per_iteration": 2.7901716232299805 }, { "auxiliary_loss_clip": 0.01591615, "auxiliary_loss_mlp": 0.00397473, "balance_loss_clip": 1.26216781, "balance_loss_mlp": 0.36235428, "epoch": 0.31552683000150306, "flos": 17456464823040.0, "grad_norm": 3.2074012149165725, "language_loss": 0.89550936, "learning_rate": 3.2040257187280693e-06, "loss": 0.91540027, "num_input_tokens_seen": 112689790, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.35107422, "step": 5248, "time_per_iteration": 2.6421475410461426 }, { "auxiliary_loss_clip": 0.01589114, "auxiliary_loss_mlp": 0.00415813, "balance_loss_clip": 1.25962782, "balance_loss_mlp": 0.37957358, "epoch": 0.3155869532541711, "flos": 18405763413120.0, "grad_norm": 3.4197484465187937, "language_loss": 0.92576432, "learning_rate": 3.2037147160490423e-06, "loss": 0.9458136, "num_input_tokens_seen": 112708265, "router_z_loss_clip": 3.29492188, "router_z_loss_mlp": 0.36206055, "step": 5249, "time_per_iteration": 2.6396310329437256 }, { "auxiliary_loss_clip": 0.01595065, "auxiliary_loss_mlp": 0.00422429, "balance_loss_clip": 1.26636076, "balance_loss_mlp": 0.37989467, "epoch": 0.31564707650683904, "flos": 21579763783680.0, "grad_norm": 13.680312781915164, "language_loss": 0.91421288, "learning_rate": 3.2034036677250322e-06, "loss": 0.9343878, "num_input_tokens_seen": 112727820, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.42553711, "step": 5250, "time_per_iteration": 4.33744215965271 }, { "auxiliary_loss_clip": 0.01601618, "auxiliary_loss_mlp": 0.0038637, "balance_loss_clip": 1.2738483, "balance_loss_mlp": 0.35163224, "epoch": 0.315707199759507, "flos": 21030976817280.0, "grad_norm": 46.81800593724332, "language_loss": 0.78657603, "learning_rate": 3.203092573767835e-06, "loss": 0.80645591, "num_input_tokens_seen": 112743140, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.34741211, "step": 5251, "time_per_iteration": 2.701134443283081 }, { "auxiliary_loss_clip": 0.01615235, "auxiliary_loss_mlp": 0.00393191, "balance_loss_clip": 1.28328562, "balance_loss_mlp": 0.35928804, "epoch": 0.31576732301217497, "flos": 26828861788800.0, "grad_norm": 10.109807932609, "language_loss": 0.83079648, "learning_rate": 3.202781434189246e-06, "loss": 0.85088074, "num_input_tokens_seen": 112764705, "router_z_loss_clip": 3.31835938, "router_z_loss_mlp": 0.33886719, "step": 5252, "time_per_iteration": 2.7337260246276855 }, { "auxiliary_loss_clip": 0.01575154, "auxiliary_loss_mlp": 0.00340079, "balance_loss_clip": 1.26022959, "balance_loss_mlp": 0.30565113, "epoch": 0.31582744626484294, "flos": 22711165349760.0, "grad_norm": 678.54343437277, "language_loss": 0.78879476, "learning_rate": 3.202470249001066e-06, "loss": 0.80794716, "num_input_tokens_seen": 112785310, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.34448242, "step": 5253, "time_per_iteration": 4.194349527359009 }, { "auxiliary_loss_clip": 0.01580718, "auxiliary_loss_mlp": 0.00386436, "balance_loss_clip": 1.26355267, "balance_loss_mlp": 0.35246187, "epoch": 0.3158875695175109, "flos": 23951914894080.0, "grad_norm": 5.299734122347078, "language_loss": 0.79645002, "learning_rate": 3.2021590182150924e-06, "loss": 0.81612158, "num_input_tokens_seen": 112802905, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.33984375, "step": 5254, "time_per_iteration": 2.703737735748291 }, { "auxiliary_loss_clip": 0.01587681, "auxiliary_loss_mlp": 0.00396512, "balance_loss_clip": 1.26049352, "balance_loss_mlp": 0.36065397, "epoch": 0.31594769277017887, "flos": 13261883322240.0, "grad_norm": 91.39716424338937, "language_loss": 0.83851147, "learning_rate": 3.201847741843128e-06, "loss": 0.85835338, "num_input_tokens_seen": 112820305, "router_z_loss_clip": 3.26757812, "router_z_loss_mlp": 0.35839844, "step": 5255, "time_per_iteration": 2.706033229827881 }, { "auxiliary_loss_clip": 0.01583006, "auxiliary_loss_mlp": 0.00340587, "balance_loss_clip": 1.26113045, "balance_loss_mlp": 0.30637437, "epoch": 0.31600781602284683, "flos": 23368258800000.0, "grad_norm": 8.716197177860044, "language_loss": 0.8470614, "learning_rate": 3.2015364198969772e-06, "loss": 0.8662973, "num_input_tokens_seen": 112841185, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.34204102, "step": 5256, "time_per_iteration": 2.7022359371185303 }, { "auxiliary_loss_clip": 0.01574246, "auxiliary_loss_mlp": 0.00321235, "balance_loss_clip": 1.25856614, "balance_loss_mlp": 0.29007342, "epoch": 0.3160679392755148, "flos": 19828580019840.0, "grad_norm": 39.32699674244255, "language_loss": 0.75615382, "learning_rate": 3.2012250523884453e-06, "loss": 0.77510864, "num_input_tokens_seen": 112860570, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.31176758, "step": 5257, "time_per_iteration": 2.7090325355529785 }, { "auxiliary_loss_clip": 0.015866, "auxiliary_loss_mlp": 0.00349927, "balance_loss_clip": 1.26771903, "balance_loss_mlp": 0.31495139, "epoch": 0.31612806252818276, "flos": 20193216935040.0, "grad_norm": 92338.76991869432, "language_loss": 0.84649801, "learning_rate": 3.2009136393293393e-06, "loss": 0.86586332, "num_input_tokens_seen": 112877975, "router_z_loss_clip": 3.18945312, "router_z_loss_mlp": 0.34960938, "step": 5258, "time_per_iteration": 4.074587106704712 }, { "auxiliary_loss_clip": 0.01595256, "auxiliary_loss_mlp": 0.00346452, "balance_loss_clip": 1.27387345, "balance_loss_mlp": 0.31371751, "epoch": 0.31618818578085073, "flos": 24235967646720.0, "grad_norm": 25.684970184120022, "language_loss": 0.7941674, "learning_rate": 3.200602180731467e-06, "loss": 0.81358457, "num_input_tokens_seen": 112896170, "router_z_loss_clip": 3.21484375, "router_z_loss_mlp": 0.32763672, "step": 5259, "time_per_iteration": 2.63191294670105 }, { "auxiliary_loss_clip": 0.01598585, "auxiliary_loss_mlp": 0.00362275, "balance_loss_clip": 1.27042127, "balance_loss_mlp": 0.33101803, "epoch": 0.3162483090335187, "flos": 25081844002560.0, "grad_norm": 53.44371253774358, "language_loss": 0.72100997, "learning_rate": 3.20029067660664e-06, "loss": 0.74061859, "num_input_tokens_seen": 112916180, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.31225586, "step": 5260, "time_per_iteration": 2.635963201522827 }, { "auxiliary_loss_clip": 0.01581998, "auxiliary_loss_mlp": 0.0036435, "balance_loss_clip": 1.26760292, "balance_loss_mlp": 0.32977933, "epoch": 0.31630843228618666, "flos": 26323383646080.0, "grad_norm": 2.552057576980257, "language_loss": 0.78547484, "learning_rate": 3.1999791269666706e-06, "loss": 0.80493832, "num_input_tokens_seen": 112936745, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.34570312, "step": 5261, "time_per_iteration": 2.6803717613220215 }, { "auxiliary_loss_clip": 0.01576608, "auxiliary_loss_mlp": 0.00195705, "balance_loss_clip": 1.34191966, "balance_loss_mlp": 0.18592937, "epoch": 0.3163685555388547, "flos": 66758441552640.0, "grad_norm": 0.7501868637089333, "language_loss": 0.50677478, "learning_rate": 3.1996675318233716e-06, "loss": 0.52449793, "num_input_tokens_seen": 112994845, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.09765625, "step": 5262, "time_per_iteration": 3.128899097442627 }, { "auxiliary_loss_clip": 0.01648848, "auxiliary_loss_mlp": 0.00366361, "balance_loss_clip": 1.31014025, "balance_loss_mlp": 0.33312559, "epoch": 0.31642867879152264, "flos": 25995662933760.0, "grad_norm": 8.061371635901653, "language_loss": 0.89479172, "learning_rate": 3.19935589118856e-06, "loss": 0.91494381, "num_input_tokens_seen": 113015125, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.33203125, "step": 5263, "time_per_iteration": 2.8143084049224854 }, { "auxiliary_loss_clip": 0.01585175, "auxiliary_loss_mlp": 0.00334935, "balance_loss_clip": 1.27115452, "balance_loss_mlp": 0.30229563, "epoch": 0.3164888020441906, "flos": 25774955815680.0, "grad_norm": 14.246709033467031, "language_loss": 0.85983151, "learning_rate": 3.1990442050740535e-06, "loss": 0.87903261, "num_input_tokens_seen": 113035535, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.32641602, "step": 5264, "time_per_iteration": 4.075377464294434 }, { "auxiliary_loss_clip": 0.01623444, "auxiliary_loss_mlp": 0.00379761, "balance_loss_clip": 1.29460204, "balance_loss_mlp": 0.34170926, "epoch": 0.3165489252968586, "flos": 19756220071680.0, "grad_norm": 5.525708039561475, "language_loss": 0.86320871, "learning_rate": 3.19873247349167e-06, "loss": 0.8832407, "num_input_tokens_seen": 113052720, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.38037109, "step": 5265, "time_per_iteration": 2.6252501010894775 }, { "auxiliary_loss_clip": 0.01622758, "auxiliary_loss_mlp": 0.00384697, "balance_loss_clip": 1.29086566, "balance_loss_mlp": 0.34714568, "epoch": 0.31660904854952654, "flos": 23183929180800.0, "grad_norm": 1.8705618949704237, "language_loss": 0.82190269, "learning_rate": 3.1984206964532307e-06, "loss": 0.84197724, "num_input_tokens_seen": 113071435, "router_z_loss_clip": 3.31835938, "router_z_loss_mlp": 0.37573242, "step": 5266, "time_per_iteration": 2.6299796104431152 }, { "auxiliary_loss_clip": 0.01623832, "auxiliary_loss_mlp": 0.00354409, "balance_loss_clip": 1.29337978, "balance_loss_mlp": 0.32081628, "epoch": 0.3166691718021945, "flos": 20408501099520.0, "grad_norm": 690.1843622165735, "language_loss": 0.87858093, "learning_rate": 3.1981088739705585e-06, "loss": 0.89836341, "num_input_tokens_seen": 113088645, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.33569336, "step": 5267, "time_per_iteration": 2.665837049484253 }, { "auxiliary_loss_clip": 0.01577983, "auxiliary_loss_mlp": 0.001976, "balance_loss_clip": 1.34588742, "balance_loss_mlp": 0.18758661, "epoch": 0.31672929505486247, "flos": 70144781172480.0, "grad_norm": 0.7098952831152016, "language_loss": 0.57423961, "learning_rate": 3.197797006055478e-06, "loss": 0.59199548, "num_input_tokens_seen": 113152775, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.10009766, "step": 5268, "time_per_iteration": 3.1527059078216553 }, { "auxiliary_loss_clip": 0.01625006, "auxiliary_loss_mlp": 0.00359805, "balance_loss_clip": 1.29306412, "balance_loss_mlp": 0.32771403, "epoch": 0.31678941830753043, "flos": 14355758154240.0, "grad_norm": 4.599633895155689, "language_loss": 0.81322241, "learning_rate": 3.197485092719815e-06, "loss": 0.83307052, "num_input_tokens_seen": 113171410, "router_z_loss_clip": 3.32226562, "router_z_loss_mlp": 0.32128906, "step": 5269, "time_per_iteration": 2.6955034732818604 }, { "auxiliary_loss_clip": 0.01609805, "auxiliary_loss_mlp": 0.00354469, "balance_loss_clip": 1.28255999, "balance_loss_mlp": 0.31906432, "epoch": 0.3168495415601984, "flos": 22747722416640.0, "grad_norm": 38.39819485962703, "language_loss": 0.85427475, "learning_rate": 3.1971731339753973e-06, "loss": 0.87391746, "num_input_tokens_seen": 113189965, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.35424805, "step": 5270, "time_per_iteration": 2.671257257461548 }, { "auxiliary_loss_clip": 0.01665746, "auxiliary_loss_mlp": 0.00363081, "balance_loss_clip": 1.32581544, "balance_loss_mlp": 0.32660329, "epoch": 0.31690966481286637, "flos": 20115254465280.0, "grad_norm": 4.2860118000327105, "language_loss": 0.85172188, "learning_rate": 3.1968611298340545e-06, "loss": 0.87201011, "num_input_tokens_seen": 113206355, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.36450195, "step": 5271, "time_per_iteration": 2.6367621421813965 }, { "auxiliary_loss_clip": 0.01653668, "auxiliary_loss_mlp": 0.00389391, "balance_loss_clip": 1.31413519, "balance_loss_mlp": 0.35329473, "epoch": 0.31696978806553433, "flos": 21178928937600.0, "grad_norm": 3091.583119968142, "language_loss": 0.79578626, "learning_rate": 3.1965490803076173e-06, "loss": 0.81621683, "num_input_tokens_seen": 113225440, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.36083984, "step": 5272, "time_per_iteration": 2.652951240539551 }, { "auxiliary_loss_clip": 0.01640764, "auxiliary_loss_mlp": 0.0038642, "balance_loss_clip": 1.29505873, "balance_loss_mlp": 0.34736753, "epoch": 0.3170299113182023, "flos": 42997030439040.0, "grad_norm": 136.7137996689856, "language_loss": 0.76313818, "learning_rate": 3.1962369854079194e-06, "loss": 0.78341001, "num_input_tokens_seen": 113248840, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.39038086, "step": 5273, "time_per_iteration": 2.861022710800171 }, { "auxiliary_loss_clip": 0.0166874, "auxiliary_loss_mlp": 0.00356119, "balance_loss_clip": 1.32500875, "balance_loss_mlp": 0.32300264, "epoch": 0.31709003457087026, "flos": 24460158384000.0, "grad_norm": 44.34488931378684, "language_loss": 0.75279641, "learning_rate": 3.195924845146795e-06, "loss": 0.77304506, "num_input_tokens_seen": 113269630, "router_z_loss_clip": 3.43554688, "router_z_loss_mlp": 0.33105469, "step": 5274, "time_per_iteration": 2.7464914321899414 }, { "auxiliary_loss_clip": 0.01644439, "auxiliary_loss_mlp": 0.00351404, "balance_loss_clip": 1.31374955, "balance_loss_mlp": 0.31802493, "epoch": 0.3171501578235382, "flos": 24135310759680.0, "grad_norm": 2.348824893401691, "language_loss": 0.85444582, "learning_rate": 3.195612659536081e-06, "loss": 0.87440419, "num_input_tokens_seen": 113291200, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.33374023, "step": 5275, "time_per_iteration": 2.6825666427612305 }, { "auxiliary_loss_clip": 0.01683979, "auxiliary_loss_mlp": 0.00411433, "balance_loss_clip": 1.33297157, "balance_loss_mlp": 0.37385789, "epoch": 0.31721028107620625, "flos": 18879712392960.0, "grad_norm": 16.79348259869333, "language_loss": 0.79202342, "learning_rate": 3.1953004285876147e-06, "loss": 0.81297755, "num_input_tokens_seen": 113310170, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.37548828, "step": 5276, "time_per_iteration": 2.6985857486724854 }, { "auxiliary_loss_clip": 0.01685967, "auxiliary_loss_mlp": 0.00419788, "balance_loss_clip": 1.34236729, "balance_loss_mlp": 0.38548005, "epoch": 0.3172704043288742, "flos": 23147874904320.0, "grad_norm": 2.806606664930923, "language_loss": 0.83184206, "learning_rate": 3.194988152313236e-06, "loss": 0.85289967, "num_input_tokens_seen": 113331140, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.34326172, "step": 5277, "time_per_iteration": 2.880139112472534 }, { "auxiliary_loss_clip": 0.01689323, "auxiliary_loss_mlp": 0.00466422, "balance_loss_clip": 1.34633529, "balance_loss_mlp": 0.42901462, "epoch": 0.3173305275815422, "flos": 17858520731520.0, "grad_norm": 218.1388402175099, "language_loss": 0.85128176, "learning_rate": 3.1946758307247878e-06, "loss": 0.87283915, "num_input_tokens_seen": 113350030, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.37402344, "step": 5278, "time_per_iteration": 2.724992275238037 }, { "auxiliary_loss_clip": 0.01815472, "auxiliary_loss_mlp": 0.00214941, "balance_loss_clip": 1.59367609, "balance_loss_mlp": 0.20111278, "epoch": 0.31739065083421014, "flos": 59973476883840.0, "grad_norm": 0.8528982611080177, "language_loss": 0.62787187, "learning_rate": 3.1943634638341114e-06, "loss": 0.64817601, "num_input_tokens_seen": 113395820, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.13867188, "step": 5279, "time_per_iteration": 2.905529737472534 }, { "auxiliary_loss_clip": 0.01711148, "auxiliary_loss_mlp": 0.00455496, "balance_loss_clip": 1.35605621, "balance_loss_mlp": 0.41839767, "epoch": 0.3174507740868781, "flos": 23800981944960.0, "grad_norm": 3.968999841652131, "language_loss": 0.86500371, "learning_rate": 3.194051051653053e-06, "loss": 0.88667011, "num_input_tokens_seen": 113416835, "router_z_loss_clip": 3.5546875, "router_z_loss_mlp": 0.37109375, "step": 5280, "time_per_iteration": 2.729020357131958 }, { "auxiliary_loss_clip": 0.01695016, "auxiliary_loss_mlp": 0.00437171, "balance_loss_clip": 1.35099912, "balance_loss_mlp": 0.40367281, "epoch": 0.31751089733954607, "flos": 27638899349760.0, "grad_norm": 2.2694503997751907, "language_loss": 0.82888037, "learning_rate": 3.19373859419346e-06, "loss": 0.85020226, "num_input_tokens_seen": 113440850, "router_z_loss_clip": 3.43945312, "router_z_loss_mlp": 0.33496094, "step": 5281, "time_per_iteration": 2.7531557083129883 }, { "auxiliary_loss_clip": 0.01691487, "auxiliary_loss_mlp": 0.0047499, "balance_loss_clip": 1.34465456, "balance_loss_mlp": 0.43817836, "epoch": 0.31757102059221404, "flos": 23769273214080.0, "grad_norm": 2.876056723979226, "language_loss": 0.82677937, "learning_rate": 3.193426091467179e-06, "loss": 0.84844416, "num_input_tokens_seen": 113461000, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.36816406, "step": 5282, "time_per_iteration": 2.705524206161499 }, { "auxiliary_loss_clip": 0.01714014, "auxiliary_loss_mlp": 0.00479749, "balance_loss_clip": 1.35738599, "balance_loss_mlp": 0.44215065, "epoch": 0.317631143844882, "flos": 25264521596160.0, "grad_norm": 14.273216511553482, "language_loss": 0.7423743, "learning_rate": 3.193113543486061e-06, "loss": 0.76431191, "num_input_tokens_seen": 113480820, "router_z_loss_clip": 3.56640625, "router_z_loss_mlp": 0.3762207, "step": 5283, "time_per_iteration": 2.746896266937256 }, { "auxiliary_loss_clip": 0.01828924, "auxiliary_loss_mlp": 0.00204047, "balance_loss_clip": 1.59749889, "balance_loss_mlp": 0.19021861, "epoch": 0.31769126709754997, "flos": 55825939221120.0, "grad_norm": 0.7150292259334943, "language_loss": 0.52286828, "learning_rate": 3.192800950261958e-06, "loss": 0.54319799, "num_input_tokens_seen": 113536910, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.13867188, "step": 5284, "time_per_iteration": 3.142355442047119 }, { "auxiliary_loss_clip": 0.01729488, "auxiliary_loss_mlp": 0.00467404, "balance_loss_clip": 1.36336708, "balance_loss_mlp": 0.42963889, "epoch": 0.31775139035021793, "flos": 16690562098560.0, "grad_norm": 37.25394615742276, "language_loss": 0.77742279, "learning_rate": 3.1924883118067235e-06, "loss": 0.79939169, "num_input_tokens_seen": 113555480, "router_z_loss_clip": 3.66015625, "router_z_loss_mlp": 0.37768555, "step": 5285, "time_per_iteration": 2.730858564376831 }, { "auxiliary_loss_clip": 0.0185287, "auxiliary_loss_mlp": 0.00125159, "balance_loss_clip": 1.61714339, "balance_loss_mlp": 0.10656277, "epoch": 0.3178115136028859, "flos": 64227241019520.0, "grad_norm": 0.8552242604456926, "language_loss": 0.603935, "learning_rate": 3.1921756281322123e-06, "loss": 0.62371528, "num_input_tokens_seen": 113616790, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.18554688, "step": 5286, "time_per_iteration": 3.1397857666015625 }, { "auxiliary_loss_clip": 0.01715046, "auxiliary_loss_mlp": 0.00419703, "balance_loss_clip": 1.35504818, "balance_loss_mlp": 0.38107902, "epoch": 0.31787163685555386, "flos": 18697465762560.0, "grad_norm": 199.46601753811717, "language_loss": 0.78561401, "learning_rate": 3.1918628992502826e-06, "loss": 0.80696154, "num_input_tokens_seen": 113635320, "router_z_loss_clip": 3.60351562, "router_z_loss_mlp": 0.38647461, "step": 5287, "time_per_iteration": 2.6318671703338623 }, { "auxiliary_loss_clip": 0.01694603, "auxiliary_loss_mlp": 0.00452936, "balance_loss_clip": 1.33532333, "balance_loss_mlp": 0.41538453, "epoch": 0.31793176010822183, "flos": 21324762155520.0, "grad_norm": 46.598257215988475, "language_loss": 0.81964117, "learning_rate": 3.191550125172792e-06, "loss": 0.84111655, "num_input_tokens_seen": 113654000, "router_z_loss_clip": 3.58984375, "router_z_loss_mlp": 0.37548828, "step": 5288, "time_per_iteration": 2.666808605194092 }, { "auxiliary_loss_clip": 0.01698756, "auxiliary_loss_mlp": 0.00414086, "balance_loss_clip": 1.35089982, "balance_loss_mlp": 0.38092154, "epoch": 0.31799188336088985, "flos": 20958688696320.0, "grad_norm": 93.36377169583024, "language_loss": 0.93863559, "learning_rate": 3.1912373059116007e-06, "loss": 0.959764, "num_input_tokens_seen": 113672375, "router_z_loss_clip": 3.48046875, "router_z_loss_mlp": 0.33178711, "step": 5289, "time_per_iteration": 2.8286237716674805 }, { "auxiliary_loss_clip": 0.01726885, "auxiliary_loss_mlp": 0.00393542, "balance_loss_clip": 1.36424232, "balance_loss_mlp": 0.35863787, "epoch": 0.3180520066135578, "flos": 22491930689280.0, "grad_norm": 3.0956754457319957, "language_loss": 0.7274999, "learning_rate": 3.190924441478572e-06, "loss": 0.74870414, "num_input_tokens_seen": 113692385, "router_z_loss_clip": 3.62695312, "router_z_loss_mlp": 0.34887695, "step": 5290, "time_per_iteration": 2.7378129959106445 }, { "auxiliary_loss_clip": 0.01737961, "auxiliary_loss_mlp": 0.00484122, "balance_loss_clip": 1.36238325, "balance_loss_mlp": 0.44235075, "epoch": 0.3181121298662258, "flos": 27235335070080.0, "grad_norm": 13.925164534892899, "language_loss": 0.8578434, "learning_rate": 3.1906115318855687e-06, "loss": 0.88006425, "num_input_tokens_seen": 113712145, "router_z_loss_clip": 3.7578125, "router_z_loss_mlp": 0.41772461, "step": 5291, "time_per_iteration": 2.8088250160217285 }, { "auxiliary_loss_clip": 0.01718282, "auxiliary_loss_mlp": 0.00471605, "balance_loss_clip": 1.35678351, "balance_loss_mlp": 0.43240905, "epoch": 0.31817225311889374, "flos": 23180158252800.0, "grad_norm": 5.940563772213058, "language_loss": 0.86545694, "learning_rate": 3.1902985771444577e-06, "loss": 0.8873558, "num_input_tokens_seen": 113731435, "router_z_loss_clip": 3.61328125, "router_z_loss_mlp": 0.39233398, "step": 5292, "time_per_iteration": 2.7348055839538574 }, { "auxiliary_loss_clip": 0.01686985, "auxiliary_loss_mlp": 0.00388534, "balance_loss_clip": 1.34214163, "balance_loss_mlp": 0.35508353, "epoch": 0.3182323763715617, "flos": 23258803080960.0, "grad_norm": 5.13819635451938, "language_loss": 0.80813932, "learning_rate": 3.1899855772671043e-06, "loss": 0.8288945, "num_input_tokens_seen": 113750825, "router_z_loss_clip": 3.44921875, "router_z_loss_mlp": 0.33447266, "step": 5293, "time_per_iteration": 4.134828090667725 }, { "auxiliary_loss_clip": 0.01699339, "auxiliary_loss_mlp": 0.00415983, "balance_loss_clip": 1.34583592, "balance_loss_mlp": 0.38021994, "epoch": 0.3182924996242297, "flos": 29016683280000.0, "grad_norm": 6.668535299111072, "language_loss": 0.80571091, "learning_rate": 3.189672532265379e-06, "loss": 0.82686412, "num_input_tokens_seen": 113770010, "router_z_loss_clip": 3.53125, "router_z_loss_mlp": 0.35766602, "step": 5294, "time_per_iteration": 2.78254771232605 }, { "auxiliary_loss_clip": 0.01721351, "auxiliary_loss_mlp": 0.00459245, "balance_loss_clip": 1.35933638, "balance_loss_mlp": 0.42054972, "epoch": 0.31835262287689764, "flos": 20449188230400.0, "grad_norm": 10.762652059869128, "language_loss": 0.84346086, "learning_rate": 3.189359442151152e-06, "loss": 0.8652668, "num_input_tokens_seen": 113788640, "router_z_loss_clip": 3.6171875, "router_z_loss_mlp": 0.38696289, "step": 5295, "time_per_iteration": 4.361624240875244 }, { "auxiliary_loss_clip": 0.01748937, "auxiliary_loss_mlp": 0.0044543, "balance_loss_clip": 1.37637639, "balance_loss_mlp": 0.40918994, "epoch": 0.3184127461295656, "flos": 25119478477440.0, "grad_norm": 3.4555869258319993, "language_loss": 0.76394141, "learning_rate": 3.189046306936296e-06, "loss": 0.7858851, "num_input_tokens_seen": 113809515, "router_z_loss_clip": 3.72460938, "router_z_loss_mlp": 0.36230469, "step": 5296, "time_per_iteration": 2.8007869720458984 }, { "auxiliary_loss_clip": 0.017187, "auxiliary_loss_mlp": 0.00395452, "balance_loss_clip": 1.35629117, "balance_loss_mlp": 0.36076164, "epoch": 0.31847286938223357, "flos": 25551231955200.0, "grad_norm": 102.21779204433832, "language_loss": 0.83361399, "learning_rate": 3.1887331266326846e-06, "loss": 0.85475552, "num_input_tokens_seen": 113829770, "router_z_loss_clip": 3.625, "router_z_loss_mlp": 0.34667969, "step": 5297, "time_per_iteration": 2.8877604007720947 }, { "auxiliary_loss_clip": 0.01694658, "auxiliary_loss_mlp": 0.00391142, "balance_loss_clip": 1.34677148, "balance_loss_mlp": 0.35452074, "epoch": 0.31853299263490154, "flos": 27782470010880.0, "grad_norm": 209.81542942637273, "language_loss": 0.8476584, "learning_rate": 3.1884199012521942e-06, "loss": 0.86851645, "num_input_tokens_seen": 113849320, "router_z_loss_clip": 3.48046875, "router_z_loss_mlp": 0.3659668, "step": 5298, "time_per_iteration": 2.874307870864868 }, { "auxiliary_loss_clip": 0.0172742, "auxiliary_loss_mlp": 0.00418194, "balance_loss_clip": 1.36358809, "balance_loss_mlp": 0.38190711, "epoch": 0.3185931158875695, "flos": 22706747976960.0, "grad_norm": 3.6936767994557136, "language_loss": 0.79605472, "learning_rate": 3.1881066308067016e-06, "loss": 0.81751084, "num_input_tokens_seen": 113867860, "router_z_loss_clip": 3.63671875, "router_z_loss_mlp": 0.36303711, "step": 5299, "time_per_iteration": 2.673214912414551 }, { "auxiliary_loss_clip": 0.01725132, "auxiliary_loss_mlp": 0.004235, "balance_loss_clip": 1.35831928, "balance_loss_mlp": 0.3866643, "epoch": 0.31865323914023747, "flos": 24571517523840.0, "grad_norm": 2.659835608724678, "language_loss": 0.84340334, "learning_rate": 3.1877933153080873e-06, "loss": 0.86488962, "num_input_tokens_seen": 113886375, "router_z_loss_clip": 3.66796875, "router_z_loss_mlp": 0.36816406, "step": 5300, "time_per_iteration": 2.725107431411743 }, { "auxiliary_loss_clip": 0.01709414, "auxiliary_loss_mlp": 0.00396935, "balance_loss_clip": 1.35558712, "balance_loss_mlp": 0.36064738, "epoch": 0.31871336239290543, "flos": 18186564666240.0, "grad_norm": 5.157814773361325, "language_loss": 0.90324616, "learning_rate": 3.1874799547682304e-06, "loss": 0.92430961, "num_input_tokens_seen": 113904065, "router_z_loss_clip": 3.54101562, "router_z_loss_mlp": 0.36303711, "step": 5301, "time_per_iteration": 4.032884836196899 }, { "auxiliary_loss_clip": 0.01743598, "auxiliary_loss_mlp": 0.00484469, "balance_loss_clip": 1.3827188, "balance_loss_mlp": 0.44067162, "epoch": 0.31877348564557345, "flos": 21826756679040.0, "grad_norm": 7.246737242243414, "language_loss": 0.83416533, "learning_rate": 3.187166549199015e-06, "loss": 0.85644603, "num_input_tokens_seen": 113918415, "router_z_loss_clip": 3.61132812, "router_z_loss_mlp": 0.43798828, "step": 5302, "time_per_iteration": 2.61442494392395 }, { "auxiliary_loss_clip": 0.01713699, "auxiliary_loss_mlp": 0.00398935, "balance_loss_clip": 1.36358833, "balance_loss_mlp": 0.36300546, "epoch": 0.3188336088982414, "flos": 22015252275840.0, "grad_norm": 7.988477762781484, "language_loss": 0.85394502, "learning_rate": 3.1868530986123255e-06, "loss": 0.87507129, "num_input_tokens_seen": 113938135, "router_z_loss_clip": 3.5, "router_z_loss_mlp": 0.35913086, "step": 5303, "time_per_iteration": 2.636838912963867 }, { "auxiliary_loss_clip": 0.01732017, "auxiliary_loss_mlp": 0.00479411, "balance_loss_clip": 1.35822463, "balance_loss_mlp": 0.43690121, "epoch": 0.3188937321509094, "flos": 20047886507520.0, "grad_norm": 52.36716422601023, "language_loss": 0.80699813, "learning_rate": 3.186539603020047e-06, "loss": 0.82911241, "num_input_tokens_seen": 113957125, "router_z_loss_clip": 3.73632812, "router_z_loss_mlp": 0.42553711, "step": 5304, "time_per_iteration": 2.652920722961426 }, { "auxiliary_loss_clip": 0.01701611, "auxiliary_loss_mlp": 0.00454162, "balance_loss_clip": 1.35833609, "balance_loss_mlp": 0.41575235, "epoch": 0.31895385540357735, "flos": 25848105863040.0, "grad_norm": 2.481263795678358, "language_loss": 0.78354096, "learning_rate": 3.186226062434068e-06, "loss": 0.80509865, "num_input_tokens_seen": 113974875, "router_z_loss_clip": 3.43359375, "router_z_loss_mlp": 0.3840332, "step": 5305, "time_per_iteration": 2.693019390106201 }, { "auxiliary_loss_clip": 0.01718931, "auxiliary_loss_mlp": 0.00454895, "balance_loss_clip": 1.3646903, "balance_loss_mlp": 0.41641381, "epoch": 0.3190139786562453, "flos": 23477714519040.0, "grad_norm": 2.614960997721816, "language_loss": 0.69231534, "learning_rate": 3.1859124768662778e-06, "loss": 0.71405363, "num_input_tokens_seen": 113994450, "router_z_loss_clip": 3.546875, "router_z_loss_mlp": 0.38500977, "step": 5306, "time_per_iteration": 4.1589226722717285 }, { "auxiliary_loss_clip": 0.01739448, "auxiliary_loss_mlp": 0.00493905, "balance_loss_clip": 1.37364554, "balance_loss_mlp": 0.45015484, "epoch": 0.3190741019089133, "flos": 29095543589760.0, "grad_norm": 6.654300061910735, "language_loss": 0.84727508, "learning_rate": 3.1855988463285678e-06, "loss": 0.86960858, "num_input_tokens_seen": 114013945, "router_z_loss_clip": 3.65820312, "router_z_loss_mlp": 0.4375, "step": 5307, "time_per_iteration": 2.7024319171905518 }, { "auxiliary_loss_clip": 0.01718994, "auxiliary_loss_mlp": 0.00453631, "balance_loss_clip": 1.36287189, "balance_loss_mlp": 0.41016725, "epoch": 0.31913422516158124, "flos": 17129534209920.0, "grad_norm": 6.619124019128554, "language_loss": 0.84394681, "learning_rate": 3.1852851708328308e-06, "loss": 0.86567307, "num_input_tokens_seen": 114031375, "router_z_loss_clip": 3.5625, "router_z_loss_mlp": 0.43457031, "step": 5308, "time_per_iteration": 2.5934371948242188 }, { "auxiliary_loss_clip": 0.01772577, "auxiliary_loss_mlp": 0.0047594, "balance_loss_clip": 1.38983309, "balance_loss_mlp": 0.43154624, "epoch": 0.3191943484142492, "flos": 16069846147200.0, "grad_norm": 3.5987620811593817, "language_loss": 0.82845581, "learning_rate": 3.184971450390961e-06, "loss": 0.850941, "num_input_tokens_seen": 114048465, "router_z_loss_clip": 3.828125, "router_z_loss_mlp": 0.4440918, "step": 5309, "time_per_iteration": 2.6370866298675537 }, { "auxiliary_loss_clip": 0.01710817, "auxiliary_loss_mlp": 0.00490966, "balance_loss_clip": 1.3531692, "balance_loss_mlp": 0.45055419, "epoch": 0.3192544716669172, "flos": 22966166977920.0, "grad_norm": 24.05355927448271, "language_loss": 0.88434446, "learning_rate": 3.184657685014856e-06, "loss": 0.90636224, "num_input_tokens_seen": 114068415, "router_z_loss_clip": 3.57421875, "router_z_loss_mlp": 0.40380859, "step": 5310, "time_per_iteration": 2.6801815032958984 }, { "auxiliary_loss_clip": 0.01701893, "auxiliary_loss_mlp": 0.00503033, "balance_loss_clip": 1.34938145, "balance_loss_mlp": 0.46350345, "epoch": 0.31931459491958514, "flos": 26870339018880.0, "grad_norm": 2.4399319914904263, "language_loss": 0.81751913, "learning_rate": 3.184343874716412e-06, "loss": 0.83956838, "num_input_tokens_seen": 114088565, "router_z_loss_clip": 3.52539062, "router_z_loss_mlp": 0.39526367, "step": 5311, "time_per_iteration": 2.7080507278442383 }, { "auxiliary_loss_clip": 0.01726892, "auxiliary_loss_mlp": 0.00475049, "balance_loss_clip": 1.37104058, "balance_loss_mlp": 0.43663949, "epoch": 0.3193747181722531, "flos": 21836525178240.0, "grad_norm": 26.139032177883394, "language_loss": 0.90551031, "learning_rate": 3.1840300195075295e-06, "loss": 0.92752975, "num_input_tokens_seen": 114107160, "router_z_loss_clip": 3.55859375, "router_z_loss_mlp": 0.38427734, "step": 5312, "time_per_iteration": 2.6891252994537354 }, { "auxiliary_loss_clip": 0.01765996, "auxiliary_loss_mlp": 0.0055989, "balance_loss_clip": 1.3896544, "balance_loss_mlp": 0.51428097, "epoch": 0.31943484142492107, "flos": 18324999682560.0, "grad_norm": 16500.420739845704, "language_loss": 0.8763659, "learning_rate": 3.1837161194001102e-06, "loss": 0.89962476, "num_input_tokens_seen": 114123420, "router_z_loss_clip": 3.76757812, "router_z_loss_mlp": 0.45629883, "step": 5313, "time_per_iteration": 2.637005090713501 }, { "auxiliary_loss_clip": 0.0171895, "auxiliary_loss_mlp": 0.00503515, "balance_loss_clip": 1.35774231, "balance_loss_mlp": 0.45986021, "epoch": 0.31949496467758903, "flos": 21615818060160.0, "grad_norm": 27.505662907547688, "language_loss": 0.93054336, "learning_rate": 3.183402174406057e-06, "loss": 0.95276797, "num_input_tokens_seen": 114139230, "router_z_loss_clip": 3.61132812, "router_z_loss_mlp": 0.43652344, "step": 5314, "time_per_iteration": 2.6137049198150635 }, { "auxiliary_loss_clip": 0.01719111, "auxiliary_loss_mlp": 0.00449448, "balance_loss_clip": 1.35462117, "balance_loss_mlp": 0.40846366, "epoch": 0.31955508793025705, "flos": 21760214734080.0, "grad_norm": 7.091221261371386, "language_loss": 0.85859489, "learning_rate": 3.1830881845372747e-06, "loss": 0.88028049, "num_input_tokens_seen": 114159290, "router_z_loss_clip": 3.64257812, "router_z_loss_mlp": 0.40966797, "step": 5315, "time_per_iteration": 2.661275863647461 }, { "auxiliary_loss_clip": 0.01731629, "auxiliary_loss_mlp": 0.00546566, "balance_loss_clip": 1.36689258, "balance_loss_mlp": 0.50200564, "epoch": 0.319615211182925, "flos": 17164331510400.0, "grad_norm": 4.166853666189118, "language_loss": 0.72203255, "learning_rate": 3.18277414980567e-06, "loss": 0.74481452, "num_input_tokens_seen": 114177655, "router_z_loss_clip": 3.6484375, "router_z_loss_mlp": 0.44555664, "step": 5316, "time_per_iteration": 2.6852006912231445 }, { "auxiliary_loss_clip": 0.01708103, "auxiliary_loss_mlp": 0.00501932, "balance_loss_clip": 1.35315096, "balance_loss_mlp": 0.46304637, "epoch": 0.319675334435593, "flos": 28112812416000.0, "grad_norm": 3.105118985887463, "language_loss": 0.73223877, "learning_rate": 3.1824600702231515e-06, "loss": 0.7543391, "num_input_tokens_seen": 114200880, "router_z_loss_clip": 3.5546875, "router_z_loss_mlp": 0.38891602, "step": 5317, "time_per_iteration": 2.870189905166626 }, { "auxiliary_loss_clip": 0.0214604, "auxiliary_loss_mlp": 0.00332953, "balance_loss_clip": 1.88205564, "balance_loss_mlp": 0.31406975, "epoch": 0.31973545768826095, "flos": 69501119408640.0, "grad_norm": 0.723936095649714, "language_loss": 0.52444285, "learning_rate": 3.182145945801628e-06, "loss": 0.54923278, "num_input_tokens_seen": 114267145, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.18847656, "step": 5318, "time_per_iteration": 3.3257522583007812 }, { "auxiliary_loss_clip": 0.01728356, "auxiliary_loss_mlp": 0.00477608, "balance_loss_clip": 1.36918175, "balance_loss_mlp": 0.43598038, "epoch": 0.3197955809409289, "flos": 13699203408000.0, "grad_norm": 2.3196256640344064, "language_loss": 0.88436198, "learning_rate": 3.181831776553012e-06, "loss": 0.90642154, "num_input_tokens_seen": 114284630, "router_z_loss_clip": 3.59179688, "router_z_loss_mlp": 0.41650391, "step": 5319, "time_per_iteration": 2.6685307025909424 }, { "auxiliary_loss_clip": 0.01712763, "auxiliary_loss_mlp": 0.00506486, "balance_loss_clip": 1.35387349, "balance_loss_mlp": 0.46297482, "epoch": 0.3198557041935969, "flos": 33218124278400.0, "grad_norm": 4.666115024303236, "language_loss": 0.6920771, "learning_rate": 3.1815175624892165e-06, "loss": 0.71426964, "num_input_tokens_seen": 114305830, "router_z_loss_clip": 3.58789062, "router_z_loss_mlp": 0.43530273, "step": 5320, "time_per_iteration": 2.797147274017334 }, { "auxiliary_loss_clip": 0.01741138, "auxiliary_loss_mlp": 0.00513903, "balance_loss_clip": 1.36418307, "balance_loss_mlp": 0.46929485, "epoch": 0.31991582744626484, "flos": 23732033788800.0, "grad_norm": 12.957188868701222, "language_loss": 0.75577235, "learning_rate": 3.1812033036221567e-06, "loss": 0.77832282, "num_input_tokens_seen": 114325165, "router_z_loss_clip": 3.77148438, "router_z_loss_mlp": 0.44555664, "step": 5321, "time_per_iteration": 2.6608736515045166 }, { "auxiliary_loss_clip": 0.01768294, "auxiliary_loss_mlp": 0.00546411, "balance_loss_clip": 1.38516927, "balance_loss_mlp": 0.50087309, "epoch": 0.3199759506989328, "flos": 18550842445440.0, "grad_norm": 4.851887266712705, "language_loss": 0.91747189, "learning_rate": 3.180888999963749e-06, "loss": 0.94061893, "num_input_tokens_seen": 114341310, "router_z_loss_clip": 3.83007812, "router_z_loss_mlp": 0.45556641, "step": 5322, "time_per_iteration": 2.636070966720581 }, { "auxiliary_loss_clip": 0.01691942, "auxiliary_loss_mlp": 0.0046032, "balance_loss_clip": 1.33862114, "balance_loss_mlp": 0.42012292, "epoch": 0.3200360739516008, "flos": 22418888382720.0, "grad_norm": 20.741659168915366, "language_loss": 0.88024211, "learning_rate": 3.1805746515259123e-06, "loss": 0.90176475, "num_input_tokens_seen": 114360355, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.40185547, "step": 5323, "time_per_iteration": 2.722917318344116 }, { "auxiliary_loss_clip": 0.01699879, "auxiliary_loss_mlp": 0.00488897, "balance_loss_clip": 1.34733093, "balance_loss_mlp": 0.44316855, "epoch": 0.32009619720426874, "flos": 20595236929920.0, "grad_norm": 10.456138840422868, "language_loss": 0.834512, "learning_rate": 3.1802602583205663e-06, "loss": 0.85639977, "num_input_tokens_seen": 114379220, "router_z_loss_clip": 3.52539062, "router_z_loss_mlp": 0.45751953, "step": 5324, "time_per_iteration": 2.6193459033966064 }, { "auxiliary_loss_clip": 0.01700404, "auxiliary_loss_mlp": 0.00429234, "balance_loss_clip": 1.34624791, "balance_loss_mlp": 0.38968033, "epoch": 0.3201563204569367, "flos": 18147637301760.0, "grad_norm": 59.342111246778316, "language_loss": 0.85458195, "learning_rate": 3.1799458203596333e-06, "loss": 0.87587833, "num_input_tokens_seen": 114396365, "router_z_loss_clip": 3.5390625, "router_z_loss_mlp": 0.39550781, "step": 5325, "time_per_iteration": 2.5910449028015137 }, { "auxiliary_loss_clip": 0.01748158, "auxiliary_loss_mlp": 0.00491394, "balance_loss_clip": 1.37642813, "balance_loss_mlp": 0.44740576, "epoch": 0.32021644370960467, "flos": 31684235840640.0, "grad_norm": 2.531549607091387, "language_loss": 0.79877114, "learning_rate": 3.179631337655037e-06, "loss": 0.82116663, "num_input_tokens_seen": 114416780, "router_z_loss_clip": 3.71875, "router_z_loss_mlp": 0.43994141, "step": 5326, "time_per_iteration": 2.698843240737915 }, { "auxiliary_loss_clip": 0.01721269, "auxiliary_loss_mlp": 0.00479162, "balance_loss_clip": 1.36466336, "balance_loss_mlp": 0.43803519, "epoch": 0.32027656696227264, "flos": 26865921646080.0, "grad_norm": 325.05977496945695, "language_loss": 0.84618151, "learning_rate": 3.179316810218701e-06, "loss": 0.86818582, "num_input_tokens_seen": 114437405, "router_z_loss_clip": 3.56640625, "router_z_loss_mlp": 0.41113281, "step": 5327, "time_per_iteration": 2.6728932857513428 }, { "auxiliary_loss_clip": 0.01725154, "auxiliary_loss_mlp": 0.00468179, "balance_loss_clip": 1.3526727, "balance_loss_mlp": 0.42681384, "epoch": 0.32033669021494066, "flos": 24169928492160.0, "grad_norm": 6.152228868253943, "language_loss": 0.80736208, "learning_rate": 3.179002238062554e-06, "loss": 0.8292954, "num_input_tokens_seen": 114458505, "router_z_loss_clip": 3.72070312, "router_z_loss_mlp": 0.41357422, "step": 5328, "time_per_iteration": 2.727324962615967 }, { "auxiliary_loss_clip": 0.01723256, "auxiliary_loss_mlp": 0.00466248, "balance_loss_clip": 1.36527967, "balance_loss_mlp": 0.42132989, "epoch": 0.3203968134676086, "flos": 24460768915200.0, "grad_norm": 3.979778810289534, "language_loss": 0.79304135, "learning_rate": 3.178687621198524e-06, "loss": 0.8149364, "num_input_tokens_seen": 114479050, "router_z_loss_clip": 3.58203125, "router_z_loss_mlp": 0.44946289, "step": 5329, "time_per_iteration": 2.761971950531006 }, { "auxiliary_loss_clip": 0.01709156, "auxiliary_loss_mlp": 0.00450292, "balance_loss_clip": 1.35900331, "balance_loss_mlp": 0.41014215, "epoch": 0.3204569367202766, "flos": 18004713085440.0, "grad_norm": 5.423415900636356, "language_loss": 0.75235623, "learning_rate": 3.1783729596385415e-06, "loss": 0.7739507, "num_input_tokens_seen": 114497415, "router_z_loss_clip": 3.5, "router_z_loss_mlp": 0.40136719, "step": 5330, "time_per_iteration": 2.6481165885925293 }, { "auxiliary_loss_clip": 0.01713445, "auxiliary_loss_mlp": 0.00468089, "balance_loss_clip": 1.34898973, "balance_loss_mlp": 0.4263179, "epoch": 0.32051705997294455, "flos": 30589678650240.0, "grad_norm": 228.63663213000595, "language_loss": 0.85417676, "learning_rate": 3.1780582533945376e-06, "loss": 0.87599218, "num_input_tokens_seen": 114518785, "router_z_loss_clip": 3.64648438, "router_z_loss_mlp": 0.41772461, "step": 5331, "time_per_iteration": 2.7270965576171875 }, { "auxiliary_loss_clip": 0.0204186, "auxiliary_loss_mlp": 0.00184144, "balance_loss_clip": 1.78756797, "balance_loss_mlp": 0.16879018, "epoch": 0.3205771832256125, "flos": 68417979765120.0, "grad_norm": 0.8405055316079078, "language_loss": 0.57824594, "learning_rate": 3.177743502478447e-06, "loss": 0.60050589, "num_input_tokens_seen": 114577710, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.15332031, "step": 5332, "time_per_iteration": 3.150365114212036 }, { "auxiliary_loss_clip": 0.01755669, "auxiliary_loss_mlp": 0.00416776, "balance_loss_clip": 1.38321352, "balance_loss_mlp": 0.37562457, "epoch": 0.3206373064782805, "flos": 30443953173120.0, "grad_norm": 7.191721494461687, "language_loss": 0.7948072, "learning_rate": 3.177428706902205e-06, "loss": 0.81653172, "num_input_tokens_seen": 114598640, "router_z_loss_clip": 3.72460938, "router_z_loss_mlp": 0.41186523, "step": 5333, "time_per_iteration": 2.8208911418914795 }, { "auxiliary_loss_clip": 0.01703662, "auxiliary_loss_mlp": 0.00479265, "balance_loss_clip": 1.34517097, "balance_loss_mlp": 0.43768477, "epoch": 0.32069742973094845, "flos": 22054502862720.0, "grad_norm": 67.01280208774357, "language_loss": 0.77010524, "learning_rate": 3.1771138666777485e-06, "loss": 0.79193455, "num_input_tokens_seen": 114618780, "router_z_loss_clip": 3.5859375, "router_z_loss_mlp": 0.41601562, "step": 5334, "time_per_iteration": 2.657924175262451 }, { "auxiliary_loss_clip": 0.01697242, "auxiliary_loss_mlp": 0.00431344, "balance_loss_clip": 1.33943069, "balance_loss_mlp": 0.39310208, "epoch": 0.3207575529836164, "flos": 22054000072320.0, "grad_norm": 45.705353479362955, "language_loss": 0.82068557, "learning_rate": 3.1767989818170156e-06, "loss": 0.84197146, "num_input_tokens_seen": 114637525, "router_z_loss_clip": 3.578125, "router_z_loss_mlp": 0.38256836, "step": 5335, "time_per_iteration": 4.110499143600464 }, { "auxiliary_loss_clip": 0.01687736, "auxiliary_loss_mlp": 0.00430527, "balance_loss_clip": 1.33443999, "balance_loss_mlp": 0.39025843, "epoch": 0.3208176762362844, "flos": 34057536186240.0, "grad_norm": 82.03594537750911, "language_loss": 0.73619497, "learning_rate": 3.1764840523319477e-06, "loss": 0.75737756, "num_input_tokens_seen": 114659705, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.40283203, "step": 5336, "time_per_iteration": 2.8323209285736084 }, { "auxiliary_loss_clip": 0.01714047, "auxiliary_loss_mlp": 0.00453074, "balance_loss_clip": 1.35130465, "balance_loss_mlp": 0.41356763, "epoch": 0.32087779948895234, "flos": 21798711135360.0, "grad_norm": 19.206202026266023, "language_loss": 0.83321321, "learning_rate": 3.176169078234487e-06, "loss": 0.85488439, "num_input_tokens_seen": 114678340, "router_z_loss_clip": 3.62890625, "router_z_loss_mlp": 0.39526367, "step": 5337, "time_per_iteration": 4.083790063858032 }, { "auxiliary_loss_clip": 0.0170653, "auxiliary_loss_mlp": 0.00435685, "balance_loss_clip": 1.35603833, "balance_loss_mlp": 0.3976576, "epoch": 0.3209379227416203, "flos": 21434110133760.0, "grad_norm": 36.2260311016666, "language_loss": 0.79676151, "learning_rate": 3.1758540595365766e-06, "loss": 0.8181836, "num_input_tokens_seen": 114696980, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.38037109, "step": 5338, "time_per_iteration": 2.681302547454834 }, { "auxiliary_loss_clip": 0.01716521, "auxiliary_loss_mlp": 0.00457119, "balance_loss_clip": 1.34539962, "balance_loss_mlp": 0.41441774, "epoch": 0.3209980459942883, "flos": 25849075530240.0, "grad_norm": 29.985344490420072, "language_loss": 0.67065066, "learning_rate": 3.1755389962501626e-06, "loss": 0.6923871, "num_input_tokens_seen": 114717330, "router_z_loss_clip": 3.71289062, "router_z_loss_mlp": 0.42700195, "step": 5339, "time_per_iteration": 2.7811601161956787 }, { "auxiliary_loss_clip": 0.0173619, "auxiliary_loss_mlp": 0.00438312, "balance_loss_clip": 1.36323452, "balance_loss_mlp": 0.3958976, "epoch": 0.32105816924695624, "flos": 19099162535040.0, "grad_norm": 20.953203430334028, "language_loss": 0.88539886, "learning_rate": 3.175223888387192e-06, "loss": 0.90714389, "num_input_tokens_seen": 114736320, "router_z_loss_clip": 3.734375, "router_z_loss_mlp": 0.42407227, "step": 5340, "time_per_iteration": 2.7758517265319824 }, { "auxiliary_loss_clip": 0.01748346, "auxiliary_loss_mlp": 0.00430686, "balance_loss_clip": 1.37357712, "balance_loss_mlp": 0.39072728, "epoch": 0.3211182924996242, "flos": 16581860565120.0, "grad_norm": 9.29176722591997, "language_loss": 0.81435549, "learning_rate": 3.1749087359596137e-06, "loss": 0.83614582, "num_input_tokens_seen": 114754575, "router_z_loss_clip": 3.74804688, "router_z_loss_mlp": 0.3996582, "step": 5341, "time_per_iteration": 2.8800814151763916 }, { "auxiliary_loss_clip": 0.01726169, "auxiliary_loss_mlp": 0.00444642, "balance_loss_clip": 1.364223, "balance_loss_mlp": 0.40251321, "epoch": 0.3211784157522922, "flos": 22672202071680.0, "grad_norm": 9.24975323944955, "language_loss": 0.84109414, "learning_rate": 3.1745935389793786e-06, "loss": 0.86280221, "num_input_tokens_seen": 114773590, "router_z_loss_clip": 3.61914062, "router_z_loss_mlp": 0.42163086, "step": 5342, "time_per_iteration": 2.6589980125427246 }, { "auxiliary_loss_clip": 0.01738929, "auxiliary_loss_mlp": 0.00470275, "balance_loss_clip": 1.36383533, "balance_loss_mlp": 0.42781252, "epoch": 0.3212385390049602, "flos": 20558787603840.0, "grad_norm": 4.428488895852741, "language_loss": 0.8184011, "learning_rate": 3.174278297458438e-06, "loss": 0.8404932, "num_input_tokens_seen": 114790775, "router_z_loss_clip": 3.75195312, "router_z_loss_mlp": 0.42431641, "step": 5343, "time_per_iteration": 4.057276964187622 }, { "auxiliary_loss_clip": 0.01740952, "auxiliary_loss_mlp": 0.00445387, "balance_loss_clip": 1.36550665, "balance_loss_mlp": 0.40371096, "epoch": 0.32129866225762815, "flos": 24791147233920.0, "grad_norm": 16.521403325537324, "language_loss": 0.87509346, "learning_rate": 3.173963011408748e-06, "loss": 0.8969568, "num_input_tokens_seen": 114809835, "router_z_loss_clip": 3.75390625, "router_z_loss_mlp": 0.41699219, "step": 5344, "time_per_iteration": 2.7205429077148438 }, { "auxiliary_loss_clip": 0.01732332, "auxiliary_loss_mlp": 0.00428256, "balance_loss_clip": 1.35939765, "balance_loss_mlp": 0.3871524, "epoch": 0.3213587855102961, "flos": 18366871962240.0, "grad_norm": 5.333940563530704, "language_loss": 0.8709991, "learning_rate": 3.173647680842262e-06, "loss": 0.89260495, "num_input_tokens_seen": 114826505, "router_z_loss_clip": 3.734375, "router_z_loss_mlp": 0.41088867, "step": 5345, "time_per_iteration": 2.6423041820526123 }, { "auxiliary_loss_clip": 0.01741688, "auxiliary_loss_mlp": 0.00444918, "balance_loss_clip": 1.36232805, "balance_loss_mlp": 0.40402907, "epoch": 0.3214189087629641, "flos": 27015992668800.0, "grad_norm": 6.582095642625461, "language_loss": 0.88990968, "learning_rate": 3.1733323057709384e-06, "loss": 0.91177571, "num_input_tokens_seen": 114846140, "router_z_loss_clip": 3.79492188, "router_z_loss_mlp": 0.40893555, "step": 5346, "time_per_iteration": 2.6737165451049805 }, { "auxiliary_loss_clip": 0.01734172, "auxiliary_loss_mlp": 0.00461336, "balance_loss_clip": 1.36105418, "balance_loss_mlp": 0.41834933, "epoch": 0.32147903201563205, "flos": 23148269953920.0, "grad_norm": 147.1817099614968, "language_loss": 0.86252403, "learning_rate": 3.1730168862067366e-06, "loss": 0.88447917, "num_input_tokens_seen": 114866660, "router_z_loss_clip": 3.73242188, "router_z_loss_mlp": 0.42993164, "step": 5347, "time_per_iteration": 2.6493818759918213 }, { "auxiliary_loss_clip": 0.01758943, "auxiliary_loss_mlp": 0.00423824, "balance_loss_clip": 1.38428283, "balance_loss_mlp": 0.38319784, "epoch": 0.3215391552683, "flos": 16580747243520.0, "grad_norm": 997.8325512875963, "language_loss": 0.84685218, "learning_rate": 3.1727014221616164e-06, "loss": 0.86867988, "num_input_tokens_seen": 114882820, "router_z_loss_clip": 3.74804688, "router_z_loss_mlp": 0.40625, "step": 5348, "time_per_iteration": 3.971548318862915 }, { "auxiliary_loss_clip": 0.01758128, "auxiliary_loss_mlp": 0.00437387, "balance_loss_clip": 1.37611198, "balance_loss_mlp": 0.39723742, "epoch": 0.321599278520968, "flos": 17821820010240.0, "grad_norm": 6.230409482743181, "language_loss": 0.91577208, "learning_rate": 3.172385913647542e-06, "loss": 0.93772733, "num_input_tokens_seen": 114900745, "router_z_loss_clip": 3.82226562, "router_z_loss_mlp": 0.40161133, "step": 5349, "time_per_iteration": 2.624138355255127 }, { "auxiliary_loss_clip": 0.01766398, "auxiliary_loss_mlp": 0.00441154, "balance_loss_clip": 1.37653732, "balance_loss_mlp": 0.39859584, "epoch": 0.32165940177363594, "flos": 16251769555200.0, "grad_norm": 10.465101388754215, "language_loss": 0.853001, "learning_rate": 3.172070360676475e-06, "loss": 0.87507641, "num_input_tokens_seen": 114917940, "router_z_loss_clip": 3.90234375, "router_z_loss_mlp": 0.42553711, "step": 5350, "time_per_iteration": 2.6980488300323486 }, { "auxiliary_loss_clip": 0.01723951, "auxiliary_loss_mlp": 0.00402879, "balance_loss_clip": 1.35880721, "balance_loss_mlp": 0.36730716, "epoch": 0.3217195250263039, "flos": 27599900158080.0, "grad_norm": 2.7611779289012484, "language_loss": 0.83755827, "learning_rate": 3.1717547632603828e-06, "loss": 0.85882658, "num_input_tokens_seen": 114937735, "router_z_loss_clip": 3.65039062, "router_z_loss_mlp": 0.35595703, "step": 5351, "time_per_iteration": 2.707916736602783 }, { "auxiliary_loss_clip": 0.01761579, "auxiliary_loss_mlp": 0.00437328, "balance_loss_clip": 1.38087535, "balance_loss_mlp": 0.39336371, "epoch": 0.3217796482789719, "flos": 21470595373440.0, "grad_norm": 2.4694399934877644, "language_loss": 0.81983256, "learning_rate": 3.1714391214112326e-06, "loss": 0.84182167, "num_input_tokens_seen": 114956630, "router_z_loss_clip": 3.8046875, "router_z_loss_mlp": 0.43994141, "step": 5352, "time_per_iteration": 2.6870014667510986 }, { "auxiliary_loss_clip": 0.01736802, "auxiliary_loss_mlp": 0.00459245, "balance_loss_clip": 1.36315322, "balance_loss_mlp": 0.41799891, "epoch": 0.32183977153163984, "flos": 21215593745280.0, "grad_norm": 16.439411627145446, "language_loss": 0.87876016, "learning_rate": 3.1711234351409933e-06, "loss": 0.9007206, "num_input_tokens_seen": 114976470, "router_z_loss_clip": 3.73828125, "router_z_loss_mlp": 0.41235352, "step": 5353, "time_per_iteration": 2.6717095375061035 }, { "auxiliary_loss_clip": 0.01741641, "auxiliary_loss_mlp": 0.00442971, "balance_loss_clip": 1.37649751, "balance_loss_mlp": 0.40241641, "epoch": 0.3218998947843078, "flos": 24608182331520.0, "grad_norm": 7.804477434550239, "language_loss": 0.77912045, "learning_rate": 3.1708077044616365e-06, "loss": 0.80096662, "num_input_tokens_seen": 114996710, "router_z_loss_clip": 3.65234375, "router_z_loss_mlp": 0.40576172, "step": 5354, "time_per_iteration": 2.7204020023345947 }, { "auxiliary_loss_clip": 0.01765065, "auxiliary_loss_mlp": 0.00458198, "balance_loss_clip": 1.38737154, "balance_loss_mlp": 0.41411406, "epoch": 0.3219600180369758, "flos": 22270577126400.0, "grad_norm": 12.227088884031177, "language_loss": 0.88231307, "learning_rate": 3.1704919293851334e-06, "loss": 0.90454566, "num_input_tokens_seen": 115015775, "router_z_loss_clip": 3.77734375, "router_z_loss_mlp": 0.44042969, "step": 5355, "time_per_iteration": 2.736206531524658 }, { "auxiliary_loss_clip": 0.01761099, "auxiliary_loss_mlp": 0.00431673, "balance_loss_clip": 1.38397467, "balance_loss_mlp": 0.39130852, "epoch": 0.3220201412896438, "flos": 14939126939520.0, "grad_norm": 2.2280261303343267, "language_loss": 0.76718211, "learning_rate": 3.1701761099234597e-06, "loss": 0.78910983, "num_input_tokens_seen": 115034265, "router_z_loss_clip": 3.7734375, "router_z_loss_mlp": 0.40356445, "step": 5356, "time_per_iteration": 2.684670925140381 }, { "auxiliary_loss_clip": 0.01809588, "auxiliary_loss_mlp": 0.0045967, "balance_loss_clip": 1.39688182, "balance_loss_mlp": 0.41420311, "epoch": 0.32208026454231176, "flos": 22667389649280.0, "grad_norm": 5.717652274955718, "language_loss": 0.76678914, "learning_rate": 3.1698602460885903e-06, "loss": 0.78948176, "num_input_tokens_seen": 115051945, "router_z_loss_clip": 4.125, "router_z_loss_mlp": 0.45458984, "step": 5357, "time_per_iteration": 2.772672653198242 }, { "auxiliary_loss_clip": 0.02034832, "auxiliary_loss_mlp": 0.00110077, "balance_loss_clip": 1.78668559, "balance_loss_mlp": 0.09643945, "epoch": 0.3221403877949797, "flos": 64605130053120.0, "grad_norm": 0.7449338119182936, "language_loss": 0.58332777, "learning_rate": 3.1695443378925035e-06, "loss": 0.60477686, "num_input_tokens_seen": 115119090, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.13671875, "step": 5358, "time_per_iteration": 3.2751173973083496 }, { "auxiliary_loss_clip": 0.01753956, "auxiliary_loss_mlp": 0.00458005, "balance_loss_clip": 1.37029171, "balance_loss_mlp": 0.41418332, "epoch": 0.3222005110476477, "flos": 20157019004160.0, "grad_norm": 17.81984872717862, "language_loss": 0.88275611, "learning_rate": 3.1692283853471777e-06, "loss": 0.90487576, "num_input_tokens_seen": 115137755, "router_z_loss_clip": 3.83398438, "router_z_loss_mlp": 0.43847656, "step": 5359, "time_per_iteration": 2.649867057800293 }, { "auxiliary_loss_clip": 0.01742643, "auxiliary_loss_mlp": 0.00445392, "balance_loss_clip": 1.36888528, "balance_loss_mlp": 0.40438405, "epoch": 0.32226063430031565, "flos": 22674177319680.0, "grad_norm": 60.58402497155927, "language_loss": 0.84699863, "learning_rate": 3.168912388464595e-06, "loss": 0.86887896, "num_input_tokens_seen": 115158150, "router_z_loss_clip": 3.73828125, "router_z_loss_mlp": 0.41015625, "step": 5360, "time_per_iteration": 2.6827168464660645 }, { "auxiliary_loss_clip": 0.02021787, "auxiliary_loss_mlp": 0.00116668, "balance_loss_clip": 1.77722979, "balance_loss_mlp": 0.10527184, "epoch": 0.3223207575529836, "flos": 63828525075840.0, "grad_norm": 0.6424278133447734, "language_loss": 0.56201422, "learning_rate": 3.168596347256737e-06, "loss": 0.58339882, "num_input_tokens_seen": 115212755, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.11376953, "step": 5361, "time_per_iteration": 3.0779762268066406 }, { "auxiliary_loss_clip": 0.0176141, "auxiliary_loss_mlp": 0.00451613, "balance_loss_clip": 1.37866449, "balance_loss_mlp": 0.40648085, "epoch": 0.3223808808056516, "flos": 26870123537280.0, "grad_norm": 8.077084231627092, "language_loss": 0.77284688, "learning_rate": 3.168280261735588e-06, "loss": 0.79497707, "num_input_tokens_seen": 115233090, "router_z_loss_clip": 3.83007812, "router_z_loss_mlp": 0.45141602, "step": 5362, "time_per_iteration": 2.7750682830810547 }, { "auxiliary_loss_clip": 0.01760361, "auxiliary_loss_mlp": 0.00469238, "balance_loss_clip": 1.37322128, "balance_loss_mlp": 0.42334259, "epoch": 0.32244100405831955, "flos": 26761350176640.0, "grad_norm": 8.81325571355015, "language_loss": 0.78271919, "learning_rate": 3.167964131913135e-06, "loss": 0.80501521, "num_input_tokens_seen": 115252645, "router_z_loss_clip": 3.86914062, "router_z_loss_mlp": 0.45922852, "step": 5363, "time_per_iteration": 2.7448463439941406 }, { "auxiliary_loss_clip": 0.01762042, "auxiliary_loss_mlp": 0.00480878, "balance_loss_clip": 1.36235106, "balance_loss_mlp": 0.43524519, "epoch": 0.3225011273109875, "flos": 23803029020160.0, "grad_norm": 297.1902528303222, "language_loss": 0.80552548, "learning_rate": 3.167647957801365e-06, "loss": 0.82795471, "num_input_tokens_seen": 115269085, "router_z_loss_clip": 3.9921875, "router_z_loss_mlp": 0.45629883, "step": 5364, "time_per_iteration": 2.6377387046813965 }, { "auxiliary_loss_clip": 0.01768046, "auxiliary_loss_mlp": 0.00469002, "balance_loss_clip": 1.385427, "balance_loss_mlp": 0.42258191, "epoch": 0.3225612505636555, "flos": 17274505501440.0, "grad_norm": 3.9357790984981835, "language_loss": 0.81346393, "learning_rate": 3.1673317394122672e-06, "loss": 0.83583438, "num_input_tokens_seen": 115286470, "router_z_loss_clip": 3.83007812, "router_z_loss_mlp": 0.46411133, "step": 5365, "time_per_iteration": 2.5915067195892334 }, { "auxiliary_loss_clip": 0.01766733, "auxiliary_loss_mlp": 0.00487688, "balance_loss_clip": 1.37798953, "balance_loss_mlp": 0.442913, "epoch": 0.32262137381632344, "flos": 23366247638400.0, "grad_norm": 4.477916868887371, "language_loss": 0.81284535, "learning_rate": 3.1670154767578333e-06, "loss": 0.83538949, "num_input_tokens_seen": 115307000, "router_z_loss_clip": 3.890625, "router_z_loss_mlp": 0.44775391, "step": 5366, "time_per_iteration": 2.6553051471710205 }, { "auxiliary_loss_clip": 0.01756767, "auxiliary_loss_mlp": 0.00451975, "balance_loss_clip": 1.3754313, "balance_loss_mlp": 0.4084878, "epoch": 0.3226814970689914, "flos": 23258803080960.0, "grad_norm": 10.078397649601087, "language_loss": 0.78411591, "learning_rate": 3.166699169850055e-06, "loss": 0.80620337, "num_input_tokens_seen": 115325925, "router_z_loss_clip": 3.80859375, "router_z_loss_mlp": 0.43481445, "step": 5367, "time_per_iteration": 2.6642096042633057 }, { "auxiliary_loss_clip": 0.0177465, "auxiliary_loss_mlp": 0.00444261, "balance_loss_clip": 1.38732529, "balance_loss_mlp": 0.40225124, "epoch": 0.32274162032165943, "flos": 16395196561920.0, "grad_norm": 10.265467161621235, "language_loss": 0.7992003, "learning_rate": 3.1663828187009274e-06, "loss": 0.82138938, "num_input_tokens_seen": 115343705, "router_z_loss_clip": 3.87109375, "router_z_loss_mlp": 0.42016602, "step": 5368, "time_per_iteration": 2.6223528385162354 }, { "auxiliary_loss_clip": 0.01754651, "auxiliary_loss_mlp": 0.00463158, "balance_loss_clip": 1.3767519, "balance_loss_mlp": 0.42021933, "epoch": 0.3228017435743274, "flos": 27855081354240.0, "grad_norm": 5.634633015266428, "language_loss": 0.84334195, "learning_rate": 3.1660664233224467e-06, "loss": 0.86552006, "num_input_tokens_seen": 115364170, "router_z_loss_clip": 3.77734375, "router_z_loss_mlp": 0.42919922, "step": 5369, "time_per_iteration": 2.815669298171997 }, { "auxiliary_loss_clip": 0.01742166, "auxiliary_loss_mlp": 0.00451284, "balance_loss_clip": 1.37082553, "balance_loss_mlp": 0.40958509, "epoch": 0.32286186682699536, "flos": 19608770741760.0, "grad_norm": 28.087854046742503, "language_loss": 0.87376636, "learning_rate": 3.16574998372661e-06, "loss": 0.89570081, "num_input_tokens_seen": 115382495, "router_z_loss_clip": 3.71679688, "router_z_loss_mlp": 0.41723633, "step": 5370, "time_per_iteration": 2.733877658843994 }, { "auxiliary_loss_clip": 0.01756079, "auxiliary_loss_mlp": 0.00468385, "balance_loss_clip": 1.37749124, "balance_loss_mlp": 0.42537397, "epoch": 0.3229219900796633, "flos": 24134017870080.0, "grad_norm": 18.59988933227063, "language_loss": 0.87593722, "learning_rate": 3.1654334999254177e-06, "loss": 0.8981818, "num_input_tokens_seen": 115399450, "router_z_loss_clip": 3.78515625, "router_z_loss_mlp": 0.43041992, "step": 5371, "time_per_iteration": 2.714113473892212 }, { "auxiliary_loss_clip": 0.01758871, "auxiliary_loss_mlp": 0.00510389, "balance_loss_clip": 1.36895227, "balance_loss_mlp": 0.46311104, "epoch": 0.3229821133323313, "flos": 17748705876480.0, "grad_norm": 11.121726575004729, "language_loss": 0.94626832, "learning_rate": 3.1651169719308695e-06, "loss": 0.96896088, "num_input_tokens_seen": 115417700, "router_z_loss_clip": 3.89648438, "router_z_loss_mlp": 0.47338867, "step": 5372, "time_per_iteration": 2.7691824436187744 }, { "auxiliary_loss_clip": 0.01739104, "auxiliary_loss_mlp": 0.00488996, "balance_loss_clip": 1.36094296, "balance_loss_mlp": 0.4442451, "epoch": 0.32304223658499925, "flos": 22346025644160.0, "grad_norm": 3.397076080910925, "language_loss": 0.76769137, "learning_rate": 3.1648003997549694e-06, "loss": 0.78997231, "num_input_tokens_seen": 115435840, "router_z_loss_clip": 3.78125, "router_z_loss_mlp": 0.44775391, "step": 5373, "time_per_iteration": 2.664355993270874 }, { "auxiliary_loss_clip": 0.01733184, "auxiliary_loss_mlp": 0.0051101, "balance_loss_clip": 1.36162257, "balance_loss_mlp": 0.46640217, "epoch": 0.3231023598376672, "flos": 18478302929280.0, "grad_norm": 4.570110681734641, "language_loss": 0.84853977, "learning_rate": 3.1644837834097214e-06, "loss": 0.87098169, "num_input_tokens_seen": 115454210, "router_z_loss_clip": 3.71679688, "router_z_loss_mlp": 0.44628906, "step": 5374, "time_per_iteration": 2.627004861831665 }, { "auxiliary_loss_clip": 0.01734836, "auxiliary_loss_mlp": 0.00508167, "balance_loss_clip": 1.36226296, "balance_loss_mlp": 0.4635348, "epoch": 0.3231624830903352, "flos": 27636313570560.0, "grad_norm": 209.26717321525177, "language_loss": 0.92479718, "learning_rate": 3.1641671229071317e-06, "loss": 0.94722712, "num_input_tokens_seen": 115471785, "router_z_loss_clip": 3.72851562, "router_z_loss_mlp": 0.4465332, "step": 5375, "time_per_iteration": 2.7411937713623047 }, { "auxiliary_loss_clip": 0.01749977, "auxiliary_loss_mlp": 0.00497339, "balance_loss_clip": 1.35978281, "balance_loss_mlp": 0.44996512, "epoch": 0.32322260634300315, "flos": 21726423014400.0, "grad_norm": 243.092844268399, "language_loss": 0.80099702, "learning_rate": 3.1638504182592076e-06, "loss": 0.82347012, "num_input_tokens_seen": 115491405, "router_z_loss_clip": 3.90429688, "router_z_loss_mlp": 0.47387695, "step": 5376, "time_per_iteration": 2.6381583213806152 }, { "auxiliary_loss_clip": 0.01739849, "auxiliary_loss_mlp": 0.00488411, "balance_loss_clip": 1.35693693, "balance_loss_mlp": 0.44563907, "epoch": 0.3232827295956711, "flos": 22637656166400.0, "grad_norm": 62.53695825974241, "language_loss": 0.7101441, "learning_rate": 3.1635336694779594e-06, "loss": 0.7324267, "num_input_tokens_seen": 115511555, "router_z_loss_clip": 3.83007812, "router_z_loss_mlp": 0.42749023, "step": 5377, "time_per_iteration": 2.7161648273468018 }, { "auxiliary_loss_clip": 0.01744746, "auxiliary_loss_mlp": 0.00484623, "balance_loss_clip": 1.35918999, "balance_loss_mlp": 0.43655822, "epoch": 0.3233428528483391, "flos": 26322593546880.0, "grad_norm": 38.38556851047563, "language_loss": 0.77251601, "learning_rate": 3.1632168765753982e-06, "loss": 0.7948097, "num_input_tokens_seen": 115532860, "router_z_loss_clip": 3.85742188, "router_z_loss_mlp": 0.48046875, "step": 5378, "time_per_iteration": 4.16313099861145 }, { "auxiliary_loss_clip": 0.01767853, "auxiliary_loss_mlp": 0.00469403, "balance_loss_clip": 1.38380146, "balance_loss_mlp": 0.42722651, "epoch": 0.32340297610100704, "flos": 28585217111040.0, "grad_norm": 21.220625828874667, "language_loss": 0.88555533, "learning_rate": 3.1629000395635357e-06, "loss": 0.90792787, "num_input_tokens_seen": 115553850, "router_z_loss_clip": 3.83984375, "router_z_loss_mlp": 0.42211914, "step": 5379, "time_per_iteration": 2.8018434047698975 }, { "auxiliary_loss_clip": 0.01764341, "auxiliary_loss_mlp": 0.00509123, "balance_loss_clip": 1.36760855, "balance_loss_mlp": 0.46611181, "epoch": 0.323463099353675, "flos": 30773792787840.0, "grad_norm": 29.115757878277655, "language_loss": 0.82945299, "learning_rate": 3.162583158454388e-06, "loss": 0.85218763, "num_input_tokens_seen": 115575530, "router_z_loss_clip": 3.97265625, "router_z_loss_mlp": 0.43017578, "step": 5380, "time_per_iteration": 4.112062454223633 }, { "auxiliary_loss_clip": 0.01757828, "auxiliary_loss_mlp": 0.00509655, "balance_loss_clip": 1.36506498, "balance_loss_mlp": 0.46161401, "epoch": 0.32352322260634303, "flos": 25228610974080.0, "grad_norm": 190.96577913615303, "language_loss": 0.8286109, "learning_rate": 3.1622662332599697e-06, "loss": 0.8512857, "num_input_tokens_seen": 115594885, "router_z_loss_clip": 3.9296875, "router_z_loss_mlp": 0.48071289, "step": 5381, "time_per_iteration": 2.7150115966796875 }, { "auxiliary_loss_clip": 0.0173355, "auxiliary_loss_mlp": 0.00451775, "balance_loss_clip": 1.36081195, "balance_loss_mlp": 0.41036174, "epoch": 0.323583345859011, "flos": 23330480670720.0, "grad_norm": 7.090879928974393, "language_loss": 0.7803489, "learning_rate": 3.1619492639922998e-06, "loss": 0.80220217, "num_input_tokens_seen": 115614080, "router_z_loss_clip": 3.72460938, "router_z_loss_mlp": 0.4140625, "step": 5382, "time_per_iteration": 2.828624725341797 }, { "auxiliary_loss_clip": 0.01758019, "auxiliary_loss_mlp": 0.00508798, "balance_loss_clip": 1.36640906, "balance_loss_mlp": 0.464834, "epoch": 0.32364346911167896, "flos": 26207499392640.0, "grad_norm": 2.835303541111796, "language_loss": 0.78023958, "learning_rate": 3.1616322506633964e-06, "loss": 0.80290776, "num_input_tokens_seen": 115632820, "router_z_loss_clip": 3.91796875, "router_z_loss_mlp": 0.43969727, "step": 5383, "time_per_iteration": 2.8823087215423584 }, { "auxiliary_loss_clip": 0.01768518, "auxiliary_loss_mlp": 0.00455598, "balance_loss_clip": 1.38711429, "balance_loss_mlp": 0.41625881, "epoch": 0.3237035923643469, "flos": 23695764030720.0, "grad_norm": 29.59502525042219, "language_loss": 0.82999772, "learning_rate": 3.161315193285283e-06, "loss": 0.85223883, "num_input_tokens_seen": 115652860, "router_z_loss_clip": 3.8125, "router_z_loss_mlp": 0.39331055, "step": 5384, "time_per_iteration": 2.7238235473632812 }, { "auxiliary_loss_clip": 0.01762258, "auxiliary_loss_mlp": 0.00488322, "balance_loss_clip": 1.36154497, "balance_loss_mlp": 0.44202131, "epoch": 0.3237637156170149, "flos": 14428728633600.0, "grad_norm": 633.8875870087014, "language_loss": 0.82864565, "learning_rate": 3.16099809186998e-06, "loss": 0.85115147, "num_input_tokens_seen": 115670940, "router_z_loss_clip": 4.0078125, "router_z_loss_mlp": 0.46313477, "step": 5385, "time_per_iteration": 4.077510118484497 }, { "auxiliary_loss_clip": 0.01791221, "auxiliary_loss_mlp": 0.00473478, "balance_loss_clip": 1.39400315, "balance_loss_mlp": 0.42884657, "epoch": 0.32382383886968286, "flos": 31062981185280.0, "grad_norm": 3.8052127767330015, "language_loss": 0.78033972, "learning_rate": 3.1606809464295145e-06, "loss": 0.80298662, "num_input_tokens_seen": 115691155, "router_z_loss_clip": 3.97265625, "router_z_loss_mlp": 0.4465332, "step": 5386, "time_per_iteration": 2.8729071617126465 }, { "auxiliary_loss_clip": 0.01763044, "auxiliary_loss_mlp": 0.0050499, "balance_loss_clip": 1.36552119, "balance_loss_mlp": 0.45821202, "epoch": 0.3238839621223508, "flos": 23256935573760.0, "grad_norm": 21.79709856254866, "language_loss": 1.00606298, "learning_rate": 3.1603637569759095e-06, "loss": 1.02874327, "num_input_tokens_seen": 115710340, "router_z_loss_clip": 3.97265625, "router_z_loss_mlp": 0.4675293, "step": 5387, "time_per_iteration": 2.7540955543518066 }, { "auxiliary_loss_clip": 0.01785306, "auxiliary_loss_mlp": 0.0050496, "balance_loss_clip": 1.38349724, "balance_loss_mlp": 0.45446324, "epoch": 0.3239440853750188, "flos": 22964658606720.0, "grad_norm": 46.549576029520495, "language_loss": 0.84239417, "learning_rate": 3.1600465235211956e-06, "loss": 0.86529684, "num_input_tokens_seen": 115726745, "router_z_loss_clip": 4.01757812, "router_z_loss_mlp": 0.50512695, "step": 5388, "time_per_iteration": 2.641906976699829 }, { "auxiliary_loss_clip": 0.01753359, "auxiliary_loss_mlp": 0.0050422, "balance_loss_clip": 1.36329293, "balance_loss_mlp": 0.45925438, "epoch": 0.32400420862768675, "flos": 36246614653440.0, "grad_norm": 7.51814937407651, "language_loss": 0.77521807, "learning_rate": 3.1597292460774006e-06, "loss": 0.79779387, "num_input_tokens_seen": 115749385, "router_z_loss_clip": 3.8984375, "router_z_loss_mlp": 0.44970703, "step": 5389, "time_per_iteration": 2.7651724815368652 }, { "auxiliary_loss_clip": 0.0173168, "auxiliary_loss_mlp": 0.00483741, "balance_loss_clip": 1.35699928, "balance_loss_mlp": 0.44003853, "epoch": 0.3240643318803547, "flos": 21616500418560.0, "grad_norm": 426.323453519577, "language_loss": 0.86353457, "learning_rate": 3.159411924656557e-06, "loss": 0.88568884, "num_input_tokens_seen": 115768105, "router_z_loss_clip": 3.74414062, "router_z_loss_mlp": 0.43701172, "step": 5390, "time_per_iteration": 2.6310060024261475 }, { "auxiliary_loss_clip": 0.01782724, "auxiliary_loss_mlp": 0.00492853, "balance_loss_clip": 1.3941164, "balance_loss_mlp": 0.44595596, "epoch": 0.3241244551330227, "flos": 23295611543040.0, "grad_norm": 3.580452327747769, "language_loss": 0.78380203, "learning_rate": 3.1590945592706967e-06, "loss": 0.80655777, "num_input_tokens_seen": 115787340, "router_z_loss_clip": 3.88476562, "router_z_loss_mlp": 0.46899414, "step": 5391, "time_per_iteration": 4.105253219604492 }, { "auxiliary_loss_clip": 0.01738834, "auxiliary_loss_mlp": 0.00454613, "balance_loss_clip": 1.36244833, "balance_loss_mlp": 0.41277072, "epoch": 0.32418457838569065, "flos": 14097236993280.0, "grad_norm": 2.788761463833981, "language_loss": 0.82459867, "learning_rate": 3.158777149931855e-06, "loss": 0.84653312, "num_input_tokens_seen": 115805565, "router_z_loss_clip": 3.765625, "router_z_loss_mlp": 0.41821289, "step": 5392, "time_per_iteration": 2.6442453861236572 }, { "auxiliary_loss_clip": 0.01759943, "auxiliary_loss_mlp": 0.00467576, "balance_loss_clip": 1.37656784, "balance_loss_mlp": 0.42055973, "epoch": 0.3242447016383586, "flos": 29752672953600.0, "grad_norm": 5.006453255503665, "language_loss": 0.69378924, "learning_rate": 3.158459696652067e-06, "loss": 0.71606439, "num_input_tokens_seen": 115826725, "router_z_loss_clip": 3.83203125, "router_z_loss_mlp": 0.4699707, "step": 5393, "time_per_iteration": 2.7339701652526855 }, { "auxiliary_loss_clip": 0.01729172, "auxiliary_loss_mlp": 0.00437734, "balance_loss_clip": 1.35934377, "balance_loss_mlp": 0.39386538, "epoch": 0.3243048248910266, "flos": 24351205455360.0, "grad_norm": 10.878870564428043, "language_loss": 0.88646066, "learning_rate": 3.158142199443371e-06, "loss": 0.90812969, "num_input_tokens_seen": 115846955, "router_z_loss_clip": 3.70117188, "router_z_loss_mlp": 0.43847656, "step": 5394, "time_per_iteration": 2.737839937210083 }, { "auxiliary_loss_clip": 0.01730989, "auxiliary_loss_mlp": 0.00450533, "balance_loss_clip": 1.37263477, "balance_loss_mlp": 0.41167068, "epoch": 0.3243649481436946, "flos": 24353037048960.0, "grad_norm": 6.136108248963986, "language_loss": 0.88194394, "learning_rate": 3.1578246583178076e-06, "loss": 0.90375918, "num_input_tokens_seen": 115865975, "router_z_loss_clip": 3.58398438, "router_z_loss_mlp": 0.38842773, "step": 5395, "time_per_iteration": 2.801877975463867 }, { "auxiliary_loss_clip": 0.0174041, "auxiliary_loss_mlp": 0.00466956, "balance_loss_clip": 1.37642741, "balance_loss_mlp": 0.4230873, "epoch": 0.32442507139636256, "flos": 22925228451840.0, "grad_norm": 3.7057434064723282, "language_loss": 0.88345754, "learning_rate": 3.157507073287417e-06, "loss": 0.90553117, "num_input_tokens_seen": 115884950, "router_z_loss_clip": 3.63867188, "router_z_loss_mlp": 0.43847656, "step": 5396, "time_per_iteration": 2.681025981903076 }, { "auxiliary_loss_clip": 0.01763877, "auxiliary_loss_mlp": 0.00479246, "balance_loss_clip": 1.37925172, "balance_loss_mlp": 0.43227726, "epoch": 0.32448519464903053, "flos": 22200192426240.0, "grad_norm": 1.9847215204788546, "language_loss": 0.83170199, "learning_rate": 3.1571894443642414e-06, "loss": 0.85413325, "num_input_tokens_seen": 115904170, "router_z_loss_clip": 3.84765625, "router_z_loss_mlp": 0.46948242, "step": 5397, "time_per_iteration": 2.619535446166992 }, { "auxiliary_loss_clip": 0.01715311, "auxiliary_loss_mlp": 0.00461845, "balance_loss_clip": 1.35306799, "balance_loss_mlp": 0.41969228, "epoch": 0.3245453179016985, "flos": 18838450644480.0, "grad_norm": 75.3545319744597, "language_loss": 0.74081826, "learning_rate": 3.1568717715603263e-06, "loss": 0.76258987, "num_input_tokens_seen": 115919255, "router_z_loss_clip": 3.625, "router_z_loss_mlp": 0.42163086, "step": 5398, "time_per_iteration": 2.6087658405303955 }, { "auxiliary_loss_clip": 0.01755023, "auxiliary_loss_mlp": 0.00464051, "balance_loss_clip": 1.38191199, "balance_loss_mlp": 0.42096841, "epoch": 0.32460544115436646, "flos": 21178390233600.0, "grad_norm": 8.53060412121265, "language_loss": 0.78023934, "learning_rate": 3.156554054887718e-06, "loss": 0.80243003, "num_input_tokens_seen": 115938535, "router_z_loss_clip": 3.73242188, "router_z_loss_mlp": 0.4309082, "step": 5399, "time_per_iteration": 2.67522931098938 }, { "auxiliary_loss_clip": 0.01764904, "auxiliary_loss_mlp": 0.00457798, "balance_loss_clip": 1.38276327, "balance_loss_mlp": 0.41225994, "epoch": 0.3246655644070344, "flos": 21981137333760.0, "grad_norm": 4.419731305592811, "language_loss": 0.7683745, "learning_rate": 3.1562362943584645e-06, "loss": 0.79060155, "num_input_tokens_seen": 115955005, "router_z_loss_clip": 3.8203125, "router_z_loss_mlp": 0.45507812, "step": 5400, "time_per_iteration": 2.628851890563965 }, { "auxiliary_loss_clip": 0.01717169, "auxiliary_loss_mlp": 0.0047518, "balance_loss_clip": 1.34822524, "balance_loss_mlp": 0.43112031, "epoch": 0.3247256876597024, "flos": 32159729105280.0, "grad_norm": 58.467045960401705, "language_loss": 0.8635844, "learning_rate": 3.155918489984614e-06, "loss": 0.88550788, "num_input_tokens_seen": 115975305, "router_z_loss_clip": 3.68945312, "router_z_loss_mlp": 0.44042969, "step": 5401, "time_per_iteration": 2.7424001693725586 }, { "auxiliary_loss_clip": 0.01752992, "auxiliary_loss_mlp": 0.00447132, "balance_loss_clip": 1.37876058, "balance_loss_mlp": 0.40028319, "epoch": 0.32478581091237035, "flos": 20997544233600.0, "grad_norm": 4.95861621361043, "language_loss": 0.91762924, "learning_rate": 3.1556006417782196e-06, "loss": 0.93963051, "num_input_tokens_seen": 115994810, "router_z_loss_clip": 3.7421875, "router_z_loss_mlp": 0.46875, "step": 5402, "time_per_iteration": 2.653144359588623 }, { "auxiliary_loss_clip": 0.0168881, "auxiliary_loss_mlp": 0.00420563, "balance_loss_clip": 1.34411681, "balance_loss_mlp": 0.3817009, "epoch": 0.3248459341650383, "flos": 17924990849280.0, "grad_norm": 22.679003451762107, "language_loss": 0.89334285, "learning_rate": 3.155282749751332e-06, "loss": 0.91443658, "num_input_tokens_seen": 116011095, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.38867188, "step": 5403, "time_per_iteration": 2.6331961154937744 }, { "auxiliary_loss_clip": 0.01678712, "auxiliary_loss_mlp": 0.00415446, "balance_loss_clip": 1.34061766, "balance_loss_mlp": 0.3768456, "epoch": 0.3249060574177063, "flos": 24535606901760.0, "grad_norm": 108.43416284042576, "language_loss": 0.91988266, "learning_rate": 3.154964813916007e-06, "loss": 0.94082427, "num_input_tokens_seen": 116028805, "router_z_loss_clip": 3.3828125, "router_z_loss_mlp": 0.38623047, "step": 5404, "time_per_iteration": 2.819408655166626 }, { "auxiliary_loss_clip": 0.01681394, "auxiliary_loss_mlp": 0.00415338, "balance_loss_clip": 1.33690882, "balance_loss_mlp": 0.3778345, "epoch": 0.32496618067037425, "flos": 25994765093760.0, "grad_norm": 17.638890644193594, "language_loss": 0.7810849, "learning_rate": 3.1546468342843008e-06, "loss": 0.80205226, "num_input_tokens_seen": 116047765, "router_z_loss_clip": 3.4453125, "router_z_loss_mlp": 0.37524414, "step": 5405, "time_per_iteration": 2.7167999744415283 }, { "auxiliary_loss_clip": 0.01718972, "auxiliary_loss_mlp": 0.00413575, "balance_loss_clip": 1.36414552, "balance_loss_mlp": 0.37459368, "epoch": 0.3250263039230422, "flos": 19573757959680.0, "grad_norm": 8.457375672989178, "language_loss": 0.89439523, "learning_rate": 3.1543288108682707e-06, "loss": 0.9157207, "num_input_tokens_seen": 116068385, "router_z_loss_clip": 3.55273438, "router_z_loss_mlp": 0.38989258, "step": 5406, "time_per_iteration": 2.756756544113159 }, { "auxiliary_loss_clip": 0.01727502, "auxiliary_loss_mlp": 0.00476203, "balance_loss_clip": 1.37260473, "balance_loss_mlp": 0.43300164, "epoch": 0.3250864271757102, "flos": 16763640318720.0, "grad_norm": 3.4445528084286203, "language_loss": 0.9188754, "learning_rate": 3.1540107436799764e-06, "loss": 0.94091243, "num_input_tokens_seen": 116085350, "router_z_loss_clip": 3.54882812, "router_z_loss_mlp": 0.43188477, "step": 5407, "time_per_iteration": 2.661437749862671 }, { "auxiliary_loss_clip": 0.0170032, "auxiliary_loss_mlp": 0.00496175, "balance_loss_clip": 1.35150337, "balance_loss_mlp": 0.45199615, "epoch": 0.3251465504283782, "flos": 27819458040960.0, "grad_norm": 1.7576310552766574, "language_loss": 0.72780395, "learning_rate": 3.153692632731479e-06, "loss": 0.74976885, "num_input_tokens_seen": 116107560, "router_z_loss_clip": 3.49023438, "router_z_loss_mlp": 0.44189453, "step": 5408, "time_per_iteration": 2.735755443572998 }, { "auxiliary_loss_clip": 0.01699982, "auxiliary_loss_mlp": 0.00492609, "balance_loss_clip": 1.33868575, "balance_loss_mlp": 0.45026553, "epoch": 0.32520667368104617, "flos": 19063144172160.0, "grad_norm": 39.5961323176261, "language_loss": 0.83086157, "learning_rate": 3.153374478034841e-06, "loss": 0.85278749, "num_input_tokens_seen": 116125980, "router_z_loss_clip": 3.61328125, "router_z_loss_mlp": 0.42333984, "step": 5409, "time_per_iteration": 2.6374149322509766 }, { "auxiliary_loss_clip": 0.01682848, "auxiliary_loss_mlp": 0.00520323, "balance_loss_clip": 1.32887602, "balance_loss_mlp": 0.47797978, "epoch": 0.32526679693371413, "flos": 29382146208000.0, "grad_norm": 17.40190198297269, "language_loss": 0.87303042, "learning_rate": 3.1530562796021285e-06, "loss": 0.89506215, "num_input_tokens_seen": 116146530, "router_z_loss_clip": 3.53710938, "router_z_loss_mlp": 0.42358398, "step": 5410, "time_per_iteration": 2.6688005924224854 }, { "auxiliary_loss_clip": 0.01688417, "auxiliary_loss_mlp": 0.00467247, "balance_loss_clip": 1.35565615, "balance_loss_mlp": 0.42886174, "epoch": 0.3253269201863821, "flos": 20704513080960.0, "grad_norm": 1.8123045202325114, "language_loss": 0.76095158, "learning_rate": 3.152738037445405e-06, "loss": 0.78250825, "num_input_tokens_seen": 116165695, "router_z_loss_clip": 3.328125, "router_z_loss_mlp": 0.38378906, "step": 5411, "time_per_iteration": 2.6537535190582275 }, { "auxiliary_loss_clip": 0.01665221, "auxiliary_loss_mlp": 0.00483283, "balance_loss_clip": 1.32434416, "balance_loss_mlp": 0.4391993, "epoch": 0.32538704343905006, "flos": 29094142959360.0, "grad_norm": 338.8647445684552, "language_loss": 0.86297131, "learning_rate": 3.1524197515767403e-06, "loss": 0.88445634, "num_input_tokens_seen": 116185375, "router_z_loss_clip": 3.41210938, "router_z_loss_mlp": 0.44091797, "step": 5412, "time_per_iteration": 2.735797643661499 }, { "auxiliary_loss_clip": 0.01694534, "auxiliary_loss_mlp": 0.0052623, "balance_loss_clip": 1.34283745, "balance_loss_mlp": 0.48183632, "epoch": 0.325447166691718, "flos": 24676124906880.0, "grad_norm": 5.235351034014789, "language_loss": 0.85710263, "learning_rate": 3.152101422008203e-06, "loss": 0.87931025, "num_input_tokens_seen": 116204335, "router_z_loss_clip": 3.51757812, "router_z_loss_mlp": 0.44433594, "step": 5413, "time_per_iteration": 2.637565851211548 }, { "auxiliary_loss_clip": 0.01681003, "auxiliary_loss_mlp": 0.00526038, "balance_loss_clip": 1.33821619, "balance_loss_mlp": 0.48123962, "epoch": 0.325507289944386, "flos": 21543134889600.0, "grad_norm": 3.3411484425489193, "language_loss": 0.81992185, "learning_rate": 3.151783048751864e-06, "loss": 0.84199226, "num_input_tokens_seen": 116222840, "router_z_loss_clip": 3.42773438, "router_z_loss_mlp": 0.44799805, "step": 5414, "time_per_iteration": 2.614631175994873 }, { "auxiliary_loss_clip": 0.01972756, "auxiliary_loss_mlp": 0.00125667, "balance_loss_clip": 1.72370255, "balance_loss_mlp": 0.11613069, "epoch": 0.32556741319705396, "flos": 71518722347520.0, "grad_norm": 0.9168634537251126, "language_loss": 0.6382606, "learning_rate": 3.1514646318197965e-06, "loss": 0.65924489, "num_input_tokens_seen": 116274940, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.09521484, "step": 5415, "time_per_iteration": 3.1185145378112793 }, { "auxiliary_loss_clip": 0.01646899, "auxiliary_loss_mlp": 0.00549659, "balance_loss_clip": 1.31337404, "balance_loss_mlp": 0.50357282, "epoch": 0.3256275364497219, "flos": 23732428838400.0, "grad_norm": 4.952079817775966, "language_loss": 0.7929424, "learning_rate": 3.151146171224075e-06, "loss": 0.81490803, "num_input_tokens_seen": 116297300, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.46020508, "step": 5416, "time_per_iteration": 2.6879985332489014 }, { "auxiliary_loss_clip": 0.01913783, "auxiliary_loss_mlp": 0.00173828, "balance_loss_clip": 1.68449891, "balance_loss_mlp": 0.16300404, "epoch": 0.3256876597023899, "flos": 67289199891840.0, "grad_norm": 0.770712207219402, "language_loss": 0.5739345, "learning_rate": 3.1508276669767757e-06, "loss": 0.59481066, "num_input_tokens_seen": 116362370, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.10839844, "step": 5417, "time_per_iteration": 3.203908681869507 }, { "auxiliary_loss_clip": 0.01902927, "auxiliary_loss_mlp": 0.00111675, "balance_loss_clip": 1.66992319, "balance_loss_mlp": 0.10094664, "epoch": 0.32574778295505785, "flos": 71282323964160.0, "grad_norm": 0.8329424988425485, "language_loss": 0.63572907, "learning_rate": 3.150509119089975e-06, "loss": 0.65587509, "num_input_tokens_seen": 116430365, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.10742188, "step": 5418, "time_per_iteration": 3.252798318862915 }, { "auxiliary_loss_clip": 0.01659694, "auxiliary_loss_mlp": 0.00526432, "balance_loss_clip": 1.32567668, "balance_loss_mlp": 0.48296845, "epoch": 0.3258079062077258, "flos": 20776370238720.0, "grad_norm": 5.686045105507039, "language_loss": 0.75974709, "learning_rate": 3.1501905275757537e-06, "loss": 0.78160834, "num_input_tokens_seen": 116447525, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.43481445, "step": 5419, "time_per_iteration": 2.630409002304077 }, { "auxiliary_loss_clip": 0.01674391, "auxiliary_loss_mlp": 0.00565353, "balance_loss_clip": 1.32911122, "balance_loss_mlp": 0.51528478, "epoch": 0.3258680294603938, "flos": 22235456603520.0, "grad_norm": 130.49762058746728, "language_loss": 0.82562959, "learning_rate": 3.1498718924461926e-06, "loss": 0.84802711, "num_input_tokens_seen": 116466310, "router_z_loss_clip": 3.45117188, "router_z_loss_mlp": 0.50073242, "step": 5420, "time_per_iteration": 4.106637477874756 }, { "auxiliary_loss_clip": 0.01662306, "auxiliary_loss_mlp": 0.00555609, "balance_loss_clip": 1.3210659, "balance_loss_mlp": 0.50628042, "epoch": 0.3259281527130618, "flos": 26979974305920.0, "grad_norm": 9.557827752269842, "language_loss": 0.84624422, "learning_rate": 3.1495532137133736e-06, "loss": 0.86842334, "num_input_tokens_seen": 116487825, "router_z_loss_clip": 3.41601562, "router_z_loss_mlp": 0.49365234, "step": 5421, "time_per_iteration": 2.725907802581787 }, { "auxiliary_loss_clip": 0.01660167, "auxiliary_loss_mlp": 0.00611014, "balance_loss_clip": 1.33262229, "balance_loss_mlp": 0.55958766, "epoch": 0.32598827596572977, "flos": 26214251149440.0, "grad_norm": 45.49535577367021, "language_loss": 0.81002855, "learning_rate": 3.149234491389381e-06, "loss": 0.83274031, "num_input_tokens_seen": 116509950, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.51416016, "step": 5422, "time_per_iteration": 4.125760793685913 }, { "auxiliary_loss_clip": 0.01672455, "auxiliary_loss_mlp": 0.00596349, "balance_loss_clip": 1.33226192, "balance_loss_mlp": 0.54547042, "epoch": 0.32604839921839773, "flos": 17639752947840.0, "grad_norm": 4.678913275859054, "language_loss": 0.69772029, "learning_rate": 3.1489157254863026e-06, "loss": 0.72040832, "num_input_tokens_seen": 116527695, "router_z_loss_clip": 3.40039062, "router_z_loss_mlp": 0.50878906, "step": 5423, "time_per_iteration": 2.604306936264038 }, { "auxiliary_loss_clip": 0.01643395, "auxiliary_loss_mlp": 0.00539062, "balance_loss_clip": 1.31819844, "balance_loss_mlp": 0.4918313, "epoch": 0.3261085224710657, "flos": 23622721724160.0, "grad_norm": 80.00149126373606, "language_loss": 0.80341262, "learning_rate": 3.148596916016224e-06, "loss": 0.82523715, "num_input_tokens_seen": 116547800, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.47265625, "step": 5424, "time_per_iteration": 2.699673891067505 }, { "auxiliary_loss_clip": 0.01648276, "auxiliary_loss_mlp": 0.00610886, "balance_loss_clip": 1.31661916, "balance_loss_mlp": 0.5598166, "epoch": 0.32616864572373366, "flos": 23260455106560.0, "grad_norm": 5.896590109142663, "language_loss": 0.81319028, "learning_rate": 3.1482780629912355e-06, "loss": 0.83578193, "num_input_tokens_seen": 116568460, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.51098633, "step": 5425, "time_per_iteration": 2.7810757160186768 }, { "auxiliary_loss_clip": 0.01616663, "auxiliary_loss_mlp": 0.006086, "balance_loss_clip": 1.28477156, "balance_loss_mlp": 0.55526614, "epoch": 0.32622876897640163, "flos": 25593427457280.0, "grad_norm": 10.241206654472697, "language_loss": 0.85522681, "learning_rate": 3.147959166423428e-06, "loss": 0.87747943, "num_input_tokens_seen": 116588705, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.53393555, "step": 5426, "time_per_iteration": 2.7226595878601074 }, { "auxiliary_loss_clip": 0.01642244, "auxiliary_loss_mlp": 0.00601613, "balance_loss_clip": 1.3055824, "balance_loss_mlp": 0.5525229, "epoch": 0.3262888922290696, "flos": 22418996123520.0, "grad_norm": 11.950149187372197, "language_loss": 0.79343951, "learning_rate": 3.147640226324893e-06, "loss": 0.81587803, "num_input_tokens_seen": 116608845, "router_z_loss_clip": 3.3671875, "router_z_loss_mlp": 0.49121094, "step": 5427, "time_per_iteration": 4.085914134979248 }, { "auxiliary_loss_clip": 0.0165128, "auxiliary_loss_mlp": 0.00605383, "balance_loss_clip": 1.31187224, "balance_loss_mlp": 0.55483866, "epoch": 0.32634901548173756, "flos": 19718908819200.0, "grad_norm": 20.960401779377484, "language_loss": 0.84160268, "learning_rate": 3.1473212427077266e-06, "loss": 0.86416924, "num_input_tokens_seen": 116628145, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.50512695, "step": 5428, "time_per_iteration": 2.630246162414551 }, { "auxiliary_loss_clip": 0.01633431, "auxiliary_loss_mlp": 0.00623736, "balance_loss_clip": 1.30489945, "balance_loss_mlp": 0.56990147, "epoch": 0.3264091387344055, "flos": 16142924367360.0, "grad_norm": 11.152283468337448, "language_loss": 0.76853275, "learning_rate": 3.147002215584023e-06, "loss": 0.79110444, "num_input_tokens_seen": 116646920, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.53833008, "step": 5429, "time_per_iteration": 2.6388838291168213 }, { "auxiliary_loss_clip": 0.0165247, "auxiliary_loss_mlp": 0.00590701, "balance_loss_clip": 1.31648779, "balance_loss_mlp": 0.54215902, "epoch": 0.3264692619870735, "flos": 16399075230720.0, "grad_norm": 12.795348719742941, "language_loss": 0.82734609, "learning_rate": 3.146683144965881e-06, "loss": 0.84977776, "num_input_tokens_seen": 116665100, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.4855957, "step": 5430, "time_per_iteration": 2.634023666381836 }, { "auxiliary_loss_clip": 0.01685433, "auxiliary_loss_mlp": 0.00595522, "balance_loss_clip": 1.3449266, "balance_loss_mlp": 0.54268873, "epoch": 0.32652938523974145, "flos": 22382331315840.0, "grad_norm": 3.4534032476823793, "language_loss": 0.88504297, "learning_rate": 3.146364030865399e-06, "loss": 0.90785253, "num_input_tokens_seen": 116682205, "router_z_loss_clip": 3.40625, "router_z_loss_mlp": 0.52832031, "step": 5431, "time_per_iteration": 2.642017364501953 }, { "auxiliary_loss_clip": 0.01607699, "auxiliary_loss_mlp": 0.00549789, "balance_loss_clip": 1.28813279, "balance_loss_mlp": 0.50401282, "epoch": 0.3265895084924094, "flos": 21908059113600.0, "grad_norm": 16.52947118815553, "language_loss": 0.75243706, "learning_rate": 3.146044873294678e-06, "loss": 0.77401197, "num_input_tokens_seen": 116702575, "router_z_loss_clip": 3.1953125, "router_z_loss_mlp": 0.45703125, "step": 5432, "time_per_iteration": 2.8431761264801025 }, { "auxiliary_loss_clip": 0.01648722, "auxiliary_loss_mlp": 0.00557027, "balance_loss_clip": 1.31585526, "balance_loss_mlp": 0.50955814, "epoch": 0.3266496317450774, "flos": 16067152627200.0, "grad_norm": 57.016033323607175, "language_loss": 0.89353561, "learning_rate": 3.1457256722658203e-06, "loss": 0.91559303, "num_input_tokens_seen": 116720885, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.47436523, "step": 5433, "time_per_iteration": 4.076584100723267 }, { "auxiliary_loss_clip": 0.01619756, "auxiliary_loss_mlp": 0.00562617, "balance_loss_clip": 1.3001821, "balance_loss_mlp": 0.51426589, "epoch": 0.3267097549977454, "flos": 22528236360960.0, "grad_norm": 4.63628104358729, "language_loss": 0.89520562, "learning_rate": 3.145406427790931e-06, "loss": 0.91702932, "num_input_tokens_seen": 116740395, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.48339844, "step": 5434, "time_per_iteration": 2.687565326690674 }, { "auxiliary_loss_clip": 0.01647701, "auxiliary_loss_mlp": 0.00562506, "balance_loss_clip": 1.31622863, "balance_loss_mlp": 0.51289129, "epoch": 0.32676987825041337, "flos": 27270419679360.0, "grad_norm": 5.065249582473822, "language_loss": 0.93420547, "learning_rate": 3.1450871398821147e-06, "loss": 0.95630753, "num_input_tokens_seen": 116758870, "router_z_loss_clip": 3.31640625, "router_z_loss_mlp": 0.49633789, "step": 5435, "time_per_iteration": 2.6761550903320312 }, { "auxiliary_loss_clip": 0.01651288, "auxiliary_loss_mlp": 0.00568025, "balance_loss_clip": 1.31861997, "balance_loss_mlp": 0.51900643, "epoch": 0.32683000150308134, "flos": 11508257433600.0, "grad_norm": 156.20832864833798, "language_loss": 0.82516557, "learning_rate": 3.144767808551479e-06, "loss": 0.8473587, "num_input_tokens_seen": 116773440, "router_z_loss_clip": 3.32421875, "router_z_loss_mlp": 0.4909668, "step": 5436, "time_per_iteration": 2.645685911178589 }, { "auxiliary_loss_clip": 0.01633286, "auxiliary_loss_mlp": 0.00530462, "balance_loss_clip": 1.31273651, "balance_loss_mlp": 0.48304003, "epoch": 0.3268901247557493, "flos": 25630200005760.0, "grad_norm": 3.390763175484594, "language_loss": 0.75472844, "learning_rate": 3.144448433811134e-06, "loss": 0.77636588, "num_input_tokens_seen": 116794375, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.47436523, "step": 5437, "time_per_iteration": 2.7007668018341064 }, { "auxiliary_loss_clip": 0.01639102, "auxiliary_loss_mlp": 0.00582165, "balance_loss_clip": 1.30209446, "balance_loss_mlp": 0.52964121, "epoch": 0.32695024800841727, "flos": 24860849575680.0, "grad_norm": 93.24123748770873, "language_loss": 0.68825388, "learning_rate": 3.144129015673189e-06, "loss": 0.7104665, "num_input_tokens_seen": 116815095, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.52514648, "step": 5438, "time_per_iteration": 2.691960096359253 }, { "auxiliary_loss_clip": 0.01662544, "auxiliary_loss_mlp": 0.00496577, "balance_loss_clip": 1.33509636, "balance_loss_mlp": 0.45366174, "epoch": 0.32701037126108523, "flos": 28839249072000.0, "grad_norm": 10.487936625501808, "language_loss": 0.79824412, "learning_rate": 3.1438095541497576e-06, "loss": 0.81983542, "num_input_tokens_seen": 116836630, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.42944336, "step": 5439, "time_per_iteration": 2.73903489112854 }, { "auxiliary_loss_clip": 0.01632093, "auxiliary_loss_mlp": 0.00572777, "balance_loss_clip": 1.30852914, "balance_loss_mlp": 0.52175516, "epoch": 0.3270704945137532, "flos": 27965075777280.0, "grad_norm": 6.235625892106406, "language_loss": 0.78891122, "learning_rate": 3.1434900492529527e-06, "loss": 0.81095994, "num_input_tokens_seen": 116856880, "router_z_loss_clip": 3.23828125, "router_z_loss_mlp": 0.51025391, "step": 5440, "time_per_iteration": 2.7226908206939697 }, { "auxiliary_loss_clip": 0.01623689, "auxiliary_loss_mlp": 0.00552092, "balance_loss_clip": 1.30559421, "balance_loss_mlp": 0.5063867, "epoch": 0.32713061776642116, "flos": 23690700213120.0, "grad_norm": 16.533000287195666, "language_loss": 0.88637275, "learning_rate": 3.1431705009948914e-06, "loss": 0.90813053, "num_input_tokens_seen": 116873770, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.45678711, "step": 5441, "time_per_iteration": 2.6751458644866943 }, { "auxiliary_loss_clip": 0.01643225, "auxiliary_loss_mlp": 0.00547514, "balance_loss_clip": 1.30912137, "balance_loss_mlp": 0.49680212, "epoch": 0.3271907410190891, "flos": 22455625017600.0, "grad_norm": 35.81865654685802, "language_loss": 0.90856844, "learning_rate": 3.1428509093876897e-06, "loss": 0.93047583, "num_input_tokens_seen": 116891225, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.50708008, "step": 5442, "time_per_iteration": 2.701514959335327 }, { "auxiliary_loss_clip": 0.01627213, "auxiliary_loss_mlp": 0.0053528, "balance_loss_clip": 1.29930472, "balance_loss_mlp": 0.48816815, "epoch": 0.3272508642717571, "flos": 22820118278400.0, "grad_norm": 2.16330705364133, "language_loss": 0.81060863, "learning_rate": 3.1425312744434668e-06, "loss": 0.83223349, "num_input_tokens_seen": 116912300, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.47167969, "step": 5443, "time_per_iteration": 2.723113536834717 }, { "auxiliary_loss_clip": 0.01642482, "auxiliary_loss_mlp": 0.00560901, "balance_loss_clip": 1.31188703, "balance_loss_mlp": 0.5125016, "epoch": 0.32731098752442506, "flos": 11801360413440.0, "grad_norm": 37.67681278188588, "language_loss": 0.89205813, "learning_rate": 3.142211596174343e-06, "loss": 0.91409206, "num_input_tokens_seen": 116929425, "router_z_loss_clip": 3.30664062, "router_z_loss_mlp": 0.48413086, "step": 5444, "time_per_iteration": 2.6586766242980957 }, { "auxiliary_loss_clip": 0.01651855, "auxiliary_loss_mlp": 0.0058903, "balance_loss_clip": 1.31563413, "balance_loss_mlp": 0.53564793, "epoch": 0.327371110777093, "flos": 21027780506880.0, "grad_norm": 24.68992460405587, "language_loss": 0.63863891, "learning_rate": 3.1418918745924423e-06, "loss": 0.66104776, "num_input_tokens_seen": 116948255, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.53442383, "step": 5445, "time_per_iteration": 2.7059619426727295 }, { "auxiliary_loss_clip": 0.01620198, "auxiliary_loss_mlp": 0.00576149, "balance_loss_clip": 1.29869521, "balance_loss_mlp": 0.52748734, "epoch": 0.327431234029761, "flos": 19062102677760.0, "grad_norm": 2.3340214423567494, "language_loss": 0.9258154, "learning_rate": 3.1415721097098865e-06, "loss": 0.94777894, "num_input_tokens_seen": 116964905, "router_z_loss_clip": 3.21484375, "router_z_loss_mlp": 0.48632812, "step": 5446, "time_per_iteration": 2.6818618774414062 }, { "auxiliary_loss_clip": 0.01655124, "auxiliary_loss_mlp": 0.00601734, "balance_loss_clip": 1.31691337, "balance_loss_mlp": 0.54673052, "epoch": 0.32749135728242895, "flos": 25849219184640.0, "grad_norm": 7.056093630370844, "language_loss": 0.84576035, "learning_rate": 3.141252301538802e-06, "loss": 0.86832893, "num_input_tokens_seen": 116983650, "router_z_loss_clip": 3.3828125, "router_z_loss_mlp": 0.55029297, "step": 5447, "time_per_iteration": 2.781341314315796 }, { "auxiliary_loss_clip": 0.0162362, "auxiliary_loss_mlp": 0.00552771, "balance_loss_clip": 1.30433345, "balance_loss_mlp": 0.50420493, "epoch": 0.327551480535097, "flos": 20120533764480.0, "grad_norm": 3.559026544222916, "language_loss": 0.78469062, "learning_rate": 3.1409324500913157e-06, "loss": 0.80645454, "num_input_tokens_seen": 117003265, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.4855957, "step": 5448, "time_per_iteration": 2.6433005332946777 }, { "auxiliary_loss_clip": 0.01620868, "auxiliary_loss_mlp": 0.00583184, "balance_loss_clip": 1.30444264, "balance_loss_mlp": 0.52934897, "epoch": 0.32761160378776494, "flos": 28803553931520.0, "grad_norm": 1.5046887710947887, "language_loss": 0.71551991, "learning_rate": 3.1406125553795567e-06, "loss": 0.73756039, "num_input_tokens_seen": 117025370, "router_z_loss_clip": 3.16601562, "router_z_loss_mlp": 0.53808594, "step": 5449, "time_per_iteration": 2.7022290229797363 }, { "auxiliary_loss_clip": 0.01649799, "auxiliary_loss_mlp": 0.00513288, "balance_loss_clip": 1.3272239, "balance_loss_mlp": 0.46691555, "epoch": 0.3276717270404329, "flos": 26937778803840.0, "grad_norm": 76.71003442585912, "language_loss": 0.70480764, "learning_rate": 3.1402926174156556e-06, "loss": 0.72643852, "num_input_tokens_seen": 117044350, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.46386719, "step": 5450, "time_per_iteration": 2.667043447494507 }, { "auxiliary_loss_clip": 0.01635844, "auxiliary_loss_mlp": 0.00516066, "balance_loss_clip": 1.32088232, "balance_loss_mlp": 0.47083831, "epoch": 0.32773185029310087, "flos": 25338425829120.0, "grad_norm": 6.04832581832523, "language_loss": 0.82720625, "learning_rate": 3.1399726362117437e-06, "loss": 0.84872538, "num_input_tokens_seen": 117064450, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.45288086, "step": 5451, "time_per_iteration": 2.655608654022217 }, { "auxiliary_loss_clip": 0.01615307, "auxiliary_loss_mlp": 0.00537843, "balance_loss_clip": 1.29357922, "balance_loss_mlp": 0.4862496, "epoch": 0.32779197354576883, "flos": 26391721271040.0, "grad_norm": 4.527544759907656, "language_loss": 0.7626878, "learning_rate": 3.1396526117799555e-06, "loss": 0.78421932, "num_input_tokens_seen": 117083060, "router_z_loss_clip": 3.21679688, "router_z_loss_mlp": 0.51611328, "step": 5452, "time_per_iteration": 2.6630730628967285 }, { "auxiliary_loss_clip": 0.01623486, "auxiliary_loss_mlp": 0.00525537, "balance_loss_clip": 1.31421888, "balance_loss_mlp": 0.47830617, "epoch": 0.3278520967984368, "flos": 24899381890560.0, "grad_norm": 8.802770316652174, "language_loss": 0.83770895, "learning_rate": 3.1393325441324256e-06, "loss": 0.85919917, "num_input_tokens_seen": 117101860, "router_z_loss_clip": 3.09570312, "router_z_loss_mlp": 0.47241211, "step": 5453, "time_per_iteration": 2.669976234436035 }, { "auxiliary_loss_clip": 0.01606051, "auxiliary_loss_mlp": 0.00501477, "balance_loss_clip": 1.29318941, "balance_loss_mlp": 0.45727453, "epoch": 0.32791222005110476, "flos": 29752996176000.0, "grad_norm": 31.914984198998788, "language_loss": 0.82173002, "learning_rate": 3.1390124332812916e-06, "loss": 0.84280533, "num_input_tokens_seen": 117123100, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.44189453, "step": 5454, "time_per_iteration": 2.6846940517425537 }, { "auxiliary_loss_clip": 0.01581637, "auxiliary_loss_mlp": 0.00482101, "balance_loss_clip": 1.2870152, "balance_loss_mlp": 0.43885195, "epoch": 0.32797234330377273, "flos": 16508064072960.0, "grad_norm": 106.66614293658785, "language_loss": 0.82518023, "learning_rate": 3.1386922792386924e-06, "loss": 0.84581763, "num_input_tokens_seen": 117140515, "router_z_loss_clip": 2.94921875, "router_z_loss_mlp": 0.43237305, "step": 5455, "time_per_iteration": 2.6284332275390625 }, { "auxiliary_loss_clip": 0.01631514, "auxiliary_loss_mlp": 0.00542512, "balance_loss_clip": 1.31495082, "balance_loss_mlp": 0.49380302, "epoch": 0.3280324665564407, "flos": 26577918397440.0, "grad_norm": 433.21396879012076, "language_loss": 0.79043758, "learning_rate": 3.138372082016768e-06, "loss": 0.81217778, "num_input_tokens_seen": 117161485, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.48681641, "step": 5456, "time_per_iteration": 2.6728460788726807 }, { "auxiliary_loss_clip": 0.01626539, "auxiliary_loss_mlp": 0.00487053, "balance_loss_clip": 1.31768775, "balance_loss_mlp": 0.44387555, "epoch": 0.32809258980910866, "flos": 22929969047040.0, "grad_norm": 10.506549283485755, "language_loss": 0.81995082, "learning_rate": 3.1380518416276596e-06, "loss": 0.84108675, "num_input_tokens_seen": 117181870, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.43164062, "step": 5457, "time_per_iteration": 2.6595494747161865 }, { "auxiliary_loss_clip": 0.0162196, "auxiliary_loss_mlp": 0.00501698, "balance_loss_clip": 1.30670571, "balance_loss_mlp": 0.45554, "epoch": 0.3281527130617766, "flos": 22783848520320.0, "grad_norm": 4.34387686736333, "language_loss": 0.86209273, "learning_rate": 3.1377315580835115e-06, "loss": 0.88332927, "num_input_tokens_seen": 117201380, "router_z_loss_clip": 3.15234375, "router_z_loss_mlp": 0.46166992, "step": 5458, "time_per_iteration": 2.776191473007202 }, { "auxiliary_loss_clip": 0.01604777, "auxiliary_loss_mlp": 0.00502655, "balance_loss_clip": 1.30275309, "balance_loss_mlp": 0.45892856, "epoch": 0.3282128363144446, "flos": 21250678354560.0, "grad_norm": 54.033404419320306, "language_loss": 0.77799577, "learning_rate": 3.1374112313964686e-06, "loss": 0.79907006, "num_input_tokens_seen": 117221040, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.43725586, "step": 5459, "time_per_iteration": 2.7742886543273926 }, { "auxiliary_loss_clip": 0.01624414, "auxiliary_loss_mlp": 0.00508892, "balance_loss_clip": 1.31887293, "balance_loss_mlp": 0.46189988, "epoch": 0.32827295956711255, "flos": 30843064166400.0, "grad_norm": 2.719904375630614, "language_loss": 0.89343786, "learning_rate": 3.1370908615786783e-06, "loss": 0.91477096, "num_input_tokens_seen": 117241395, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.47045898, "step": 5460, "time_per_iteration": 2.825568199157715 }, { "auxiliary_loss_clip": 0.01628996, "auxiliary_loss_mlp": 0.00511817, "balance_loss_clip": 1.31663489, "balance_loss_mlp": 0.4648723, "epoch": 0.3283330828197806, "flos": 25915006944000.0, "grad_norm": 10.179875674757351, "language_loss": 0.82017279, "learning_rate": 3.136770448642288e-06, "loss": 0.84158099, "num_input_tokens_seen": 117259340, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.46972656, "step": 5461, "time_per_iteration": 2.778759241104126 }, { "auxiliary_loss_clip": 0.01615331, "auxiliary_loss_mlp": 0.00526603, "balance_loss_clip": 1.31076217, "balance_loss_mlp": 0.4775604, "epoch": 0.32839320607244854, "flos": 38582065042560.0, "grad_norm": 27.576833370170483, "language_loss": 0.67845285, "learning_rate": 3.1364499925994484e-06, "loss": 0.69987226, "num_input_tokens_seen": 117282375, "router_z_loss_clip": 3.046875, "router_z_loss_mlp": 0.49047852, "step": 5462, "time_per_iteration": 2.78997540473938 }, { "auxiliary_loss_clip": 0.01618542, "auxiliary_loss_mlp": 0.00510453, "balance_loss_clip": 1.32115889, "balance_loss_mlp": 0.46510607, "epoch": 0.3284533293251165, "flos": 26650888876800.0, "grad_norm": 15.632919898660074, "language_loss": 0.81877697, "learning_rate": 3.1361294934623115e-06, "loss": 0.84006691, "num_input_tokens_seen": 117303830, "router_z_loss_clip": 2.9765625, "router_z_loss_mlp": 0.453125, "step": 5463, "time_per_iteration": 4.147435188293457 }, { "auxiliary_loss_clip": 0.01623142, "auxiliary_loss_mlp": 0.00520211, "balance_loss_clip": 1.3189919, "balance_loss_mlp": 0.47288504, "epoch": 0.32851345257778447, "flos": 15304158904320.0, "grad_norm": 8.755544136688252, "language_loss": 0.75484043, "learning_rate": 3.1358089512430303e-06, "loss": 0.77627385, "num_input_tokens_seen": 117320665, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.47338867, "step": 5464, "time_per_iteration": 4.049316167831421 }, { "auxiliary_loss_clip": 0.01600657, "auxiliary_loss_mlp": 0.00507256, "balance_loss_clip": 1.30724907, "balance_loss_mlp": 0.4637689, "epoch": 0.32857357583045244, "flos": 23513732881920.0, "grad_norm": 1.972814613611793, "language_loss": 0.75301504, "learning_rate": 3.1354883659537594e-06, "loss": 0.77409422, "num_input_tokens_seen": 117339795, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.43505859, "step": 5465, "time_per_iteration": 2.643740177154541 }, { "auxiliary_loss_clip": 0.01593662, "auxiliary_loss_mlp": 0.0049515, "balance_loss_clip": 1.29574835, "balance_loss_mlp": 0.44989815, "epoch": 0.3286336990831204, "flos": 20995209849600.0, "grad_norm": 4.186885133424252, "language_loss": 0.86699128, "learning_rate": 3.1351677376066567e-06, "loss": 0.88787943, "num_input_tokens_seen": 117359525, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.45263672, "step": 5466, "time_per_iteration": 2.627324104309082 }, { "auxiliary_loss_clip": 0.01607737, "auxiliary_loss_mlp": 0.00536943, "balance_loss_clip": 1.30747724, "balance_loss_mlp": 0.48804289, "epoch": 0.32869382233578837, "flos": 23658811914240.0, "grad_norm": 7094.063653604693, "language_loss": 0.83882749, "learning_rate": 3.134847066213879e-06, "loss": 0.86027431, "num_input_tokens_seen": 117380320, "router_z_loss_clip": 3.00195312, "router_z_loss_mlp": 0.48876953, "step": 5467, "time_per_iteration": 2.654825210571289 }, { "auxiliary_loss_clip": 0.01595777, "auxiliary_loss_mlp": 0.00515924, "balance_loss_clip": 1.29259253, "balance_loss_mlp": 0.46769148, "epoch": 0.32875394558845633, "flos": 25336522408320.0, "grad_norm": 3135.1630590826926, "language_loss": 0.78245425, "learning_rate": 3.134526351787587e-06, "loss": 0.80357122, "num_input_tokens_seen": 117400695, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.48193359, "step": 5468, "time_per_iteration": 2.700373888015747 }, { "auxiliary_loss_clip": 0.01646242, "auxiliary_loss_mlp": 0.00538675, "balance_loss_clip": 1.32933569, "balance_loss_mlp": 0.48920363, "epoch": 0.3288140688411243, "flos": 14903108576640.0, "grad_norm": 296.3519248686756, "language_loss": 0.8575893, "learning_rate": 3.134205594339942e-06, "loss": 0.87943852, "num_input_tokens_seen": 117418800, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.49414062, "step": 5469, "time_per_iteration": 4.12693977355957 }, { "auxiliary_loss_clip": 0.01596988, "auxiliary_loss_mlp": 0.00519534, "balance_loss_clip": 1.29805183, "balance_loss_mlp": 0.47211242, "epoch": 0.32887419209379226, "flos": 18551345235840.0, "grad_norm": 3.6421076329059314, "language_loss": 0.87355548, "learning_rate": 3.133884793883107e-06, "loss": 0.89472073, "num_input_tokens_seen": 117438220, "router_z_loss_clip": 2.98828125, "router_z_loss_mlp": 0.47460938, "step": 5470, "time_per_iteration": 2.6616621017456055 }, { "auxiliary_loss_clip": 0.01607341, "auxiliary_loss_mlp": 0.00509271, "balance_loss_clip": 1.30606508, "balance_loss_mlp": 0.4622075, "epoch": 0.3289343153464602, "flos": 48105610439040.0, "grad_norm": 135.5674597582698, "language_loss": 0.74782592, "learning_rate": 3.1335639504292478e-06, "loss": 0.76899207, "num_input_tokens_seen": 117462560, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.47070312, "step": 5471, "time_per_iteration": 2.915369987487793 }, { "auxiliary_loss_clip": 0.0162953, "auxiliary_loss_mlp": 0.0053284, "balance_loss_clip": 1.31556022, "balance_loss_mlp": 0.48243839, "epoch": 0.3289944385991282, "flos": 27600295207680.0, "grad_norm": 13.04913683583274, "language_loss": 0.72062254, "learning_rate": 3.1332430639905288e-06, "loss": 0.74224627, "num_input_tokens_seen": 117483665, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.50415039, "step": 5472, "time_per_iteration": 2.734987497329712 }, { "auxiliary_loss_clip": 0.01631737, "auxiliary_loss_mlp": 0.00531692, "balance_loss_clip": 1.32386041, "balance_loss_mlp": 0.48176715, "epoch": 0.32905456185179616, "flos": 20120318282880.0, "grad_norm": 50.50330287303695, "language_loss": 0.94246316, "learning_rate": 3.13292213457912e-06, "loss": 0.96409738, "num_input_tokens_seen": 117503565, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.4987793, "step": 5473, "time_per_iteration": 2.6354663372039795 }, { "auxiliary_loss_clip": 0.01631888, "auxiliary_loss_mlp": 0.00507023, "balance_loss_clip": 1.32324898, "balance_loss_mlp": 0.45967346, "epoch": 0.3291146851044642, "flos": 23180230080000.0, "grad_norm": 3.1988385082058888, "language_loss": 0.84395707, "learning_rate": 3.1326011622071903e-06, "loss": 0.86534619, "num_input_tokens_seen": 117521460, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.47314453, "step": 5474, "time_per_iteration": 2.63146710395813 }, { "auxiliary_loss_clip": 0.01636391, "auxiliary_loss_mlp": 0.00247626, "balance_loss_clip": 1.46130013, "balance_loss_mlp": 0.23580031, "epoch": 0.32917480835713214, "flos": 67621912594560.0, "grad_norm": 0.7976136826564614, "language_loss": 0.59858251, "learning_rate": 3.132280146886911e-06, "loss": 0.6174227, "num_input_tokens_seen": 117580550, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.11816406, "step": 5475, "time_per_iteration": 4.548194885253906 }, { "auxiliary_loss_clip": 0.01623055, "auxiliary_loss_mlp": 0.00568209, "balance_loss_clip": 1.30919743, "balance_loss_mlp": 0.5195477, "epoch": 0.3292349316098001, "flos": 27964537073280.0, "grad_norm": 10.289917811681038, "language_loss": 0.82541609, "learning_rate": 3.131959088630455e-06, "loss": 0.84732878, "num_input_tokens_seen": 117600645, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.48681641, "step": 5476, "time_per_iteration": 2.6423418521881104 }, { "auxiliary_loss_clip": 0.01630151, "auxiliary_loss_mlp": 0.00542515, "balance_loss_clip": 1.3293407, "balance_loss_mlp": 0.49576131, "epoch": 0.3292950548624681, "flos": 20263673462400.0, "grad_norm": 13.3842024268705, "language_loss": 0.80286467, "learning_rate": 3.131637987449997e-06, "loss": 0.82459128, "num_input_tokens_seen": 117618880, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.4675293, "step": 5477, "time_per_iteration": 2.664680242538452 }, { "auxiliary_loss_clip": 0.01617347, "auxiliary_loss_mlp": 0.00500041, "balance_loss_clip": 1.32122719, "balance_loss_mlp": 0.45500416, "epoch": 0.32935517811513604, "flos": 20812999132800.0, "grad_norm": 162.99385028683753, "language_loss": 0.84574187, "learning_rate": 3.131316843357713e-06, "loss": 0.86691582, "num_input_tokens_seen": 117636445, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.45043945, "step": 5478, "time_per_iteration": 2.8050224781036377 }, { "auxiliary_loss_clip": 0.01590384, "auxiliary_loss_mlp": 0.0047953, "balance_loss_clip": 1.29756498, "balance_loss_mlp": 0.43694851, "epoch": 0.329415301367804, "flos": 18441853603200.0, "grad_norm": 8.126989221524642, "language_loss": 0.85436523, "learning_rate": 3.1309956563657807e-06, "loss": 0.87506437, "num_input_tokens_seen": 117653105, "router_z_loss_clip": 2.92578125, "router_z_loss_mlp": 0.42602539, "step": 5479, "time_per_iteration": 2.728492498397827 }, { "auxiliary_loss_clip": 0.01612055, "auxiliary_loss_mlp": 0.00134671, "balance_loss_clip": 1.43937945, "balance_loss_mlp": 0.12646905, "epoch": 0.32947542462047197, "flos": 66323024887680.0, "grad_norm": 0.7406366265279102, "language_loss": 0.55889201, "learning_rate": 3.1306744264863804e-06, "loss": 0.57635927, "num_input_tokens_seen": 117719225, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.08203125, "step": 5480, "time_per_iteration": 3.214444875717163 }, { "auxiliary_loss_clip": 0.01604065, "auxiliary_loss_mlp": 0.0054641, "balance_loss_clip": 1.30433571, "balance_loss_mlp": 0.49920267, "epoch": 0.32953554787313993, "flos": 23221599569280.0, "grad_norm": 3.764826295571361, "language_loss": 0.8279866, "learning_rate": 3.1303531537316915e-06, "loss": 0.84949136, "num_input_tokens_seen": 117738725, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.47216797, "step": 5481, "time_per_iteration": 2.67130708694458 }, { "auxiliary_loss_clip": 0.01599596, "auxiliary_loss_mlp": 0.00541607, "balance_loss_clip": 1.29340029, "balance_loss_mlp": 0.49211109, "epoch": 0.3295956711258079, "flos": 27009492307200.0, "grad_norm": 8.536843678225932, "language_loss": 0.8287257, "learning_rate": 3.130031838113899e-06, "loss": 0.85013771, "num_input_tokens_seen": 117757765, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.49536133, "step": 5482, "time_per_iteration": 2.7413885593414307 }, { "auxiliary_loss_clip": 0.01619527, "auxiliary_loss_mlp": 0.00521466, "balance_loss_clip": 1.30755019, "balance_loss_mlp": 0.47547522, "epoch": 0.32965579437847586, "flos": 19171702051200.0, "grad_norm": 48.81309756320408, "language_loss": 0.79440475, "learning_rate": 3.129710479645185e-06, "loss": 0.81581467, "num_input_tokens_seen": 117776810, "router_z_loss_clip": 3.12109375, "router_z_loss_mlp": 0.4597168, "step": 5483, "time_per_iteration": 2.6674184799194336 }, { "auxiliary_loss_clip": 0.01608972, "auxiliary_loss_mlp": 0.00487906, "balance_loss_clip": 1.30505037, "balance_loss_mlp": 0.44582477, "epoch": 0.32971591763114383, "flos": 30482521401600.0, "grad_norm": 10.711384805532505, "language_loss": 0.80911171, "learning_rate": 3.1293890783377366e-06, "loss": 0.83008051, "num_input_tokens_seen": 117797730, "router_z_loss_clip": 3.03710938, "router_z_loss_mlp": 0.4206543, "step": 5484, "time_per_iteration": 2.7445266246795654 }, { "auxiliary_loss_clip": 0.01606248, "auxiliary_loss_mlp": 0.00550867, "balance_loss_clip": 1.30385447, "balance_loss_mlp": 0.50065613, "epoch": 0.3297760408838118, "flos": 16289583598080.0, "grad_norm": 11.253346283219042, "language_loss": 0.7866739, "learning_rate": 3.129067634203742e-06, "loss": 0.80824494, "num_input_tokens_seen": 117815365, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.50170898, "step": 5485, "time_per_iteration": 2.6161720752716064 }, { "auxiliary_loss_clip": 0.01571646, "auxiliary_loss_mlp": 0.00485318, "balance_loss_clip": 1.28344166, "balance_loss_mlp": 0.44254529, "epoch": 0.32983616413647976, "flos": 29530924341120.0, "grad_norm": 25.771878319631295, "language_loss": 0.84877497, "learning_rate": 3.128746147255388e-06, "loss": 0.86934459, "num_input_tokens_seen": 117836095, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.42797852, "step": 5486, "time_per_iteration": 2.7425038814544678 }, { "auxiliary_loss_clip": 0.01592957, "auxiliary_loss_mlp": 0.00512824, "balance_loss_clip": 1.29898262, "balance_loss_mlp": 0.46661848, "epoch": 0.3298962873891478, "flos": 20631398947200.0, "grad_norm": 18.344522045575467, "language_loss": 0.89674795, "learning_rate": 3.1284246175048683e-06, "loss": 0.91780573, "num_input_tokens_seen": 117854655, "router_z_loss_clip": 2.94335938, "router_z_loss_mlp": 0.46264648, "step": 5487, "time_per_iteration": 2.6852545738220215 }, { "auxiliary_loss_clip": 0.01608036, "auxiliary_loss_mlp": 0.00521288, "balance_loss_clip": 1.30626392, "balance_loss_mlp": 0.47305632, "epoch": 0.32995641064181574, "flos": 14976007228800.0, "grad_norm": 36.49719195961084, "language_loss": 0.81011891, "learning_rate": 3.1281030449643735e-06, "loss": 0.8314122, "num_input_tokens_seen": 117873300, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.48217773, "step": 5488, "time_per_iteration": 2.6665873527526855 }, { "auxiliary_loss_clip": 0.01616369, "auxiliary_loss_mlp": 0.00520748, "balance_loss_clip": 1.31478834, "balance_loss_mlp": 0.47428018, "epoch": 0.3300165338944837, "flos": 18661447399680.0, "grad_norm": 45.32937127985059, "language_loss": 0.77830148, "learning_rate": 3.127781429646098e-06, "loss": 0.7996726, "num_input_tokens_seen": 117891540, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.46435547, "step": 5489, "time_per_iteration": 2.6557607650756836 }, { "auxiliary_loss_clip": 0.01594196, "auxiliary_loss_mlp": 0.00519297, "balance_loss_clip": 1.29829001, "balance_loss_mlp": 0.47397316, "epoch": 0.3300766571471517, "flos": 25583730785280.0, "grad_norm": 48.54707219330723, "language_loss": 0.95111138, "learning_rate": 3.127459771562238e-06, "loss": 0.97224629, "num_input_tokens_seen": 117907690, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.453125, "step": 5490, "time_per_iteration": 2.8021819591522217 }, { "auxiliary_loss_clip": 0.01566888, "auxiliary_loss_mlp": 0.00465372, "balance_loss_clip": 1.2749846, "balance_loss_mlp": 0.42386305, "epoch": 0.33013678039981964, "flos": 11363501623680.0, "grad_norm": 10.082812198299813, "language_loss": 0.88212633, "learning_rate": 3.1271380707249907e-06, "loss": 0.90244889, "num_input_tokens_seen": 117925640, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.4152832, "step": 5491, "time_per_iteration": 2.6597111225128174 }, { "auxiliary_loss_clip": 0.01596926, "auxiliary_loss_mlp": 0.0048842, "balance_loss_clip": 1.30161297, "balance_loss_mlp": 0.44319224, "epoch": 0.3301969036524876, "flos": 24821203939200.0, "grad_norm": 9.628339165638607, "language_loss": 0.81991971, "learning_rate": 3.126816327146554e-06, "loss": 0.84077322, "num_input_tokens_seen": 117944525, "router_z_loss_clip": 2.95117188, "router_z_loss_mlp": 0.45263672, "step": 5492, "time_per_iteration": 2.6714067459106445 }, { "auxiliary_loss_clip": 0.0160378, "auxiliary_loss_mlp": 0.00522506, "balance_loss_clip": 1.30173826, "balance_loss_mlp": 0.4735114, "epoch": 0.33025702690515557, "flos": 15961144613760.0, "grad_norm": 47.15062331655749, "language_loss": 0.83253074, "learning_rate": 3.12649454083913e-06, "loss": 0.85379362, "num_input_tokens_seen": 117962515, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.48999023, "step": 5493, "time_per_iteration": 2.633070230484009 }, { "auxiliary_loss_clip": 0.0160077, "auxiliary_loss_mlp": 0.00102162, "balance_loss_clip": 1.41763222, "balance_loss_mlp": 0.0947707, "epoch": 0.33031715015782354, "flos": 59416755989760.0, "grad_norm": 0.7633667836140986, "language_loss": 0.53691661, "learning_rate": 3.12617271181492e-06, "loss": 0.55394602, "num_input_tokens_seen": 118018780, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.07373047, "step": 5494, "time_per_iteration": 3.084963083267212 }, { "auxiliary_loss_clip": 0.01581346, "auxiliary_loss_mlp": 0.00492441, "balance_loss_clip": 1.28875721, "balance_loss_mlp": 0.44950211, "epoch": 0.3303772734104915, "flos": 23184360144000.0, "grad_norm": 18.116930325982445, "language_loss": 0.91413337, "learning_rate": 3.1258508400861276e-06, "loss": 0.9348712, "num_input_tokens_seen": 118038610, "router_z_loss_clip": 2.92578125, "router_z_loss_mlp": 0.42944336, "step": 5495, "time_per_iteration": 2.6988515853881836 }, { "auxiliary_loss_clip": 0.01600808, "auxiliary_loss_mlp": 0.00495981, "balance_loss_clip": 1.29974318, "balance_loss_mlp": 0.45087269, "epoch": 0.33043739666315947, "flos": 33071896010880.0, "grad_norm": 30.02817716433816, "language_loss": 0.78475273, "learning_rate": 3.1255289256649587e-06, "loss": 0.80572063, "num_input_tokens_seen": 118055905, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.45092773, "step": 5496, "time_per_iteration": 2.7362167835235596 }, { "auxiliary_loss_clip": 0.0158997, "auxiliary_loss_mlp": 0.00485261, "balance_loss_clip": 1.29513121, "balance_loss_mlp": 0.44158289, "epoch": 0.33049751991582743, "flos": 24895431394560.0, "grad_norm": 2.492586907083643, "language_loss": 0.7814883, "learning_rate": 3.1252069685636196e-06, "loss": 0.80224061, "num_input_tokens_seen": 118073695, "router_z_loss_clip": 2.9453125, "router_z_loss_mlp": 0.43725586, "step": 5497, "time_per_iteration": 2.694695472717285 }, { "auxiliary_loss_clip": 0.01564706, "auxiliary_loss_mlp": 0.00492391, "balance_loss_clip": 1.2771132, "balance_loss_mlp": 0.44997615, "epoch": 0.3305576431684954, "flos": 29460575554560.0, "grad_norm": 1469.4356700417886, "language_loss": 0.86169368, "learning_rate": 3.124884968794321e-06, "loss": 0.88226467, "num_input_tokens_seen": 118094030, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.42407227, "step": 5498, "time_per_iteration": 2.7140886783599854 }, { "auxiliary_loss_clip": 0.01583421, "auxiliary_loss_mlp": 0.00494, "balance_loss_clip": 1.28590345, "balance_loss_mlp": 0.44662669, "epoch": 0.33061776642116336, "flos": 22632305040000.0, "grad_norm": 7.507273359954162, "language_loss": 0.82498419, "learning_rate": 3.12456292636927e-06, "loss": 0.84575838, "num_input_tokens_seen": 118111665, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.47412109, "step": 5499, "time_per_iteration": 2.6105291843414307 }, { "auxiliary_loss_clip": 0.01564736, "auxiliary_loss_mlp": 0.00481426, "balance_loss_clip": 1.2739867, "balance_loss_mlp": 0.43822449, "epoch": 0.3306778896738313, "flos": 25776320532480.0, "grad_norm": 1.6427377415211089, "language_loss": 0.82627648, "learning_rate": 3.124240841300681e-06, "loss": 0.8467381, "num_input_tokens_seen": 118132435, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.43212891, "step": 5500, "time_per_iteration": 2.6986587047576904 }, { "auxiliary_loss_clip": 0.01599075, "auxiliary_loss_mlp": 0.00496313, "balance_loss_clip": 1.30399132, "balance_loss_mlp": 0.4507277, "epoch": 0.33073801292649935, "flos": 36940552479360.0, "grad_norm": 7.588102521362138, "language_loss": 0.71682125, "learning_rate": 3.1239187136007665e-06, "loss": 0.73777515, "num_input_tokens_seen": 118155255, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.45556641, "step": 5501, "time_per_iteration": 2.8177385330200195 }, { "auxiliary_loss_clip": 0.01587088, "auxiliary_loss_mlp": 0.0049537, "balance_loss_clip": 1.29485989, "balance_loss_mlp": 0.44954628, "epoch": 0.3307981361791673, "flos": 12967738848000.0, "grad_norm": 3.853689270145967, "language_loss": 0.85058421, "learning_rate": 3.1235965432817417e-06, "loss": 0.87140876, "num_input_tokens_seen": 118169865, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.45800781, "step": 5502, "time_per_iteration": 2.6845107078552246 }, { "auxiliary_loss_clip": 0.01604459, "auxiliary_loss_mlp": 0.00527821, "balance_loss_clip": 1.30696535, "balance_loss_mlp": 0.47734821, "epoch": 0.3308582594318353, "flos": 25374372364800.0, "grad_norm": 39.65166413517039, "language_loss": 0.77644861, "learning_rate": 3.123274330355824e-06, "loss": 0.79777145, "num_input_tokens_seen": 118190760, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.50488281, "step": 5503, "time_per_iteration": 2.7161052227020264 }, { "auxiliary_loss_clip": 0.0159011, "auxiliary_loss_mlp": 0.00443941, "balance_loss_clip": 1.29754448, "balance_loss_mlp": 0.40484035, "epoch": 0.33091838268450324, "flos": 26468570419200.0, "grad_norm": 6.72807496081418, "language_loss": 0.79734194, "learning_rate": 3.12295207483523e-06, "loss": 0.8176825, "num_input_tokens_seen": 118213620, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.390625, "step": 5504, "time_per_iteration": 2.7365355491638184 }, { "auxiliary_loss_clip": 0.0156763, "auxiliary_loss_mlp": 0.00466792, "balance_loss_clip": 1.27611661, "balance_loss_mlp": 0.42637983, "epoch": 0.3309785059371712, "flos": 24971167221120.0, "grad_norm": 24.071980567442708, "language_loss": 0.76233304, "learning_rate": 3.1226297767321816e-06, "loss": 0.78267729, "num_input_tokens_seen": 118235010, "router_z_loss_clip": 2.91601562, "router_z_loss_mlp": 0.40405273, "step": 5505, "time_per_iteration": 4.213175058364868 }, { "auxiliary_loss_clip": 0.01568654, "auxiliary_loss_mlp": 0.00520462, "balance_loss_clip": 1.27827704, "balance_loss_mlp": 0.47568724, "epoch": 0.3310386291898392, "flos": 20446710192000.0, "grad_norm": 7.513282307336765, "language_loss": 0.86778152, "learning_rate": 3.122307436058899e-06, "loss": 0.88867265, "num_input_tokens_seen": 118255820, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.44799805, "step": 5506, "time_per_iteration": 4.189249515533447 }, { "auxiliary_loss_clip": 0.01602595, "auxiliary_loss_mlp": 0.00466431, "balance_loss_clip": 1.30557442, "balance_loss_mlp": 0.42308614, "epoch": 0.33109875244250714, "flos": 23182672204800.0, "grad_norm": 3.3544234422992614, "language_loss": 0.8454318, "learning_rate": 3.121985052827606e-06, "loss": 0.86612207, "num_input_tokens_seen": 118274160, "router_z_loss_clip": 2.97070312, "router_z_loss_mlp": 0.43383789, "step": 5507, "time_per_iteration": 2.665292978286743 }, { "auxiliary_loss_clip": 0.01578458, "auxiliary_loss_mlp": 0.00526244, "balance_loss_clip": 1.28769743, "balance_loss_mlp": 0.48158786, "epoch": 0.3311588756951751, "flos": 24168384207360.0, "grad_norm": 3.737896871285951, "language_loss": 0.77929831, "learning_rate": 3.1216626270505274e-06, "loss": 0.8003453, "num_input_tokens_seen": 118294385, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.4465332, "step": 5508, "time_per_iteration": 2.7712974548339844 }, { "auxiliary_loss_clip": 0.01569202, "auxiliary_loss_mlp": 0.0046803, "balance_loss_clip": 1.28664875, "balance_loss_mlp": 0.42802307, "epoch": 0.33121899894784307, "flos": 28145742209280.0, "grad_norm": 5.375295266430183, "language_loss": 0.76988304, "learning_rate": 3.12134015873989e-06, "loss": 0.79025543, "num_input_tokens_seen": 118313105, "router_z_loss_clip": 2.83007812, "router_z_loss_mlp": 0.39990234, "step": 5509, "time_per_iteration": 2.7654919624328613 }, { "auxiliary_loss_clip": 0.01595682, "auxiliary_loss_mlp": 0.00494421, "balance_loss_clip": 1.30281687, "balance_loss_mlp": 0.45176768, "epoch": 0.33127912220051103, "flos": 29567660976000.0, "grad_norm": 8.225858482513216, "language_loss": 0.79477137, "learning_rate": 3.121017647907921e-06, "loss": 0.8156724, "num_input_tokens_seen": 118335250, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.42626953, "step": 5510, "time_per_iteration": 2.755457639694214 }, { "auxiliary_loss_clip": 0.01570833, "auxiliary_loss_mlp": 0.00481457, "balance_loss_clip": 1.28485131, "balance_loss_mlp": 0.43954289, "epoch": 0.331339245453179, "flos": 14428836374400.0, "grad_norm": 974.2229342526064, "language_loss": 0.94503003, "learning_rate": 3.1206950945668508e-06, "loss": 0.96555293, "num_input_tokens_seen": 118351470, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.41894531, "step": 5511, "time_per_iteration": 4.104701995849609 }, { "auxiliary_loss_clip": 0.01551707, "auxiliary_loss_mlp": 0.00474564, "balance_loss_clip": 1.27371931, "balance_loss_mlp": 0.43436646, "epoch": 0.33139936870584696, "flos": 20887118847360.0, "grad_norm": 11.219581751951605, "language_loss": 0.78327572, "learning_rate": 3.12037249872891e-06, "loss": 0.80353844, "num_input_tokens_seen": 118370970, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.40161133, "step": 5512, "time_per_iteration": 2.650683879852295 }, { "auxiliary_loss_clip": 0.0157338, "auxiliary_loss_mlp": 0.0048861, "balance_loss_clip": 1.28579867, "balance_loss_mlp": 0.4497481, "epoch": 0.33145949195851493, "flos": 36284356869120.0, "grad_norm": 48.40613141337527, "language_loss": 0.77676034, "learning_rate": 3.1200498604063317e-06, "loss": 0.79738021, "num_input_tokens_seen": 118393125, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.38891602, "step": 5513, "time_per_iteration": 2.7655158042907715 }, { "auxiliary_loss_clip": 0.01569343, "auxiliary_loss_mlp": 0.00514148, "balance_loss_clip": 1.27710509, "balance_loss_mlp": 0.46853822, "epoch": 0.33151961521118295, "flos": 14279735018880.0, "grad_norm": 15.249702380315565, "language_loss": 0.76081777, "learning_rate": 3.1197271796113507e-06, "loss": 0.78165263, "num_input_tokens_seen": 118410860, "router_z_loss_clip": 2.92578125, "router_z_loss_mlp": 0.45654297, "step": 5514, "time_per_iteration": 2.6223273277282715 }, { "auxiliary_loss_clip": 0.01604628, "auxiliary_loss_mlp": 0.0053409, "balance_loss_clip": 1.30755317, "balance_loss_mlp": 0.48895711, "epoch": 0.3315797384638509, "flos": 20774323163520.0, "grad_norm": 18.75448424433654, "language_loss": 0.73163176, "learning_rate": 3.1194044563562026e-06, "loss": 0.75301892, "num_input_tokens_seen": 118429570, "router_z_loss_clip": 2.96679688, "router_z_loss_mlp": 0.45141602, "step": 5515, "time_per_iteration": 2.6470863819122314 }, { "auxiliary_loss_clip": 0.01576537, "auxiliary_loss_mlp": 0.00530322, "balance_loss_clip": 1.28342676, "balance_loss_mlp": 0.48774058, "epoch": 0.3316398617165189, "flos": 24679464871680.0, "grad_norm": 16.28488196648176, "language_loss": 0.73499846, "learning_rate": 3.1190816906531257e-06, "loss": 0.75606704, "num_input_tokens_seen": 118450285, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.42578125, "step": 5516, "time_per_iteration": 2.6702098846435547 }, { "auxiliary_loss_clip": 0.01575217, "auxiliary_loss_mlp": 0.00546743, "balance_loss_clip": 1.27666104, "balance_loss_mlp": 0.49941662, "epoch": 0.33169998496918685, "flos": 18587974129920.0, "grad_norm": 22.676130120796145, "language_loss": 0.87242699, "learning_rate": 3.118758882514359e-06, "loss": 0.8936466, "num_input_tokens_seen": 118468270, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.47363281, "step": 5517, "time_per_iteration": 4.007158041000366 }, { "auxiliary_loss_clip": 0.01570445, "auxiliary_loss_mlp": 0.00514952, "balance_loss_clip": 1.28502011, "balance_loss_mlp": 0.47337186, "epoch": 0.3317601082218548, "flos": 20193647898240.0, "grad_norm": 19.545757000585017, "language_loss": 0.79362309, "learning_rate": 3.118436031952143e-06, "loss": 0.81447709, "num_input_tokens_seen": 118486615, "router_z_loss_clip": 2.85546875, "router_z_loss_mlp": 0.41577148, "step": 5518, "time_per_iteration": 2.6238152980804443 }, { "auxiliary_loss_clip": 0.01540833, "auxiliary_loss_mlp": 0.00160397, "balance_loss_clip": 1.34395957, "balance_loss_mlp": 0.15162329, "epoch": 0.3318202314745228, "flos": 68974703637120.0, "grad_norm": 0.6019499570280913, "language_loss": 0.53956068, "learning_rate": 3.1181131389787206e-06, "loss": 0.55657303, "num_input_tokens_seen": 118553580, "router_z_loss_clip": 1.96875, "router_z_loss_mlp": 0.08789062, "step": 5519, "time_per_iteration": 3.2296977043151855 }, { "auxiliary_loss_clip": 0.01573207, "auxiliary_loss_mlp": 0.00546561, "balance_loss_clip": 1.28171039, "balance_loss_mlp": 0.50169069, "epoch": 0.33188035472719074, "flos": 21500113374720.0, "grad_norm": 31.750921576731162, "language_loss": 0.84345376, "learning_rate": 3.117790203606336e-06, "loss": 0.86465144, "num_input_tokens_seen": 118570280, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.44873047, "step": 5520, "time_per_iteration": 2.667644500732422 }, { "auxiliary_loss_clip": 0.01573939, "auxiliary_loss_mlp": 0.00493025, "balance_loss_clip": 1.28422642, "balance_loss_mlp": 0.45065793, "epoch": 0.3319404779798587, "flos": 28870490926080.0, "grad_norm": 6.498715236426103, "language_loss": 0.80940819, "learning_rate": 3.1174672258472344e-06, "loss": 0.83007777, "num_input_tokens_seen": 118590455, "router_z_loss_clip": 2.90039062, "router_z_loss_mlp": 0.42407227, "step": 5521, "time_per_iteration": 2.739652633666992 }, { "auxiliary_loss_clip": 0.01578892, "auxiliary_loss_mlp": 0.00537682, "balance_loss_clip": 1.28021741, "balance_loss_mlp": 0.49123853, "epoch": 0.33200060123252667, "flos": 23076915586560.0, "grad_norm": 127.35680076721854, "language_loss": 0.77643085, "learning_rate": 3.117144205713664e-06, "loss": 0.79759657, "num_input_tokens_seen": 118609495, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.46386719, "step": 5522, "time_per_iteration": 2.7655816078186035 }, { "auxiliary_loss_clip": 0.01572213, "auxiliary_loss_mlp": 0.004873, "balance_loss_clip": 1.28385282, "balance_loss_mlp": 0.44664928, "epoch": 0.33206072448519464, "flos": 21142479611520.0, "grad_norm": 12.143204836526795, "language_loss": 0.80655718, "learning_rate": 3.1168211432178735e-06, "loss": 0.82715225, "num_input_tokens_seen": 118628720, "router_z_loss_clip": 2.88085938, "router_z_loss_mlp": 0.40649414, "step": 5523, "time_per_iteration": 2.737348794937134 }, { "auxiliary_loss_clip": 0.01580682, "auxiliary_loss_mlp": 0.00490841, "balance_loss_clip": 1.28779769, "balance_loss_mlp": 0.44942731, "epoch": 0.3321208477378626, "flos": 13079097987840.0, "grad_norm": 17.70192336304605, "language_loss": 0.86335361, "learning_rate": 3.116498038372114e-06, "loss": 0.88406885, "num_input_tokens_seen": 118645955, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.41430664, "step": 5524, "time_per_iteration": 2.726077079772949 }, { "auxiliary_loss_clip": 0.01578194, "auxiliary_loss_mlp": 0.00497624, "balance_loss_clip": 1.28742075, "balance_loss_mlp": 0.45821327, "epoch": 0.33218097099053057, "flos": 21215414177280.0, "grad_norm": 3.823614868994269, "language_loss": 0.89823604, "learning_rate": 3.116174891188636e-06, "loss": 0.91899425, "num_input_tokens_seen": 118665605, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.39379883, "step": 5525, "time_per_iteration": 2.6567418575286865 }, { "auxiliary_loss_clip": 0.01558659, "auxiliary_loss_mlp": 0.00157156, "balance_loss_clip": 1.35221314, "balance_loss_mlp": 0.148764, "epoch": 0.33224109424319853, "flos": 64348979189760.0, "grad_norm": 0.7521071899607129, "language_loss": 0.51975727, "learning_rate": 3.1158517016796945e-06, "loss": 0.53691542, "num_input_tokens_seen": 118728155, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.08398438, "step": 5526, "time_per_iteration": 3.1209027767181396 }, { "auxiliary_loss_clip": 0.0159959, "auxiliary_loss_mlp": 0.00544269, "balance_loss_clip": 1.29486334, "balance_loss_mlp": 0.49777743, "epoch": 0.33230121749586655, "flos": 17346003523200.0, "grad_norm": 44.16976911131203, "language_loss": 0.84170341, "learning_rate": 3.1155284698575445e-06, "loss": 0.86314201, "num_input_tokens_seen": 118743955, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.46484375, "step": 5527, "time_per_iteration": 2.651883602142334 }, { "auxiliary_loss_clip": 0.01580472, "auxiliary_loss_mlp": 0.00530901, "balance_loss_clip": 1.29042125, "balance_loss_mlp": 0.48901063, "epoch": 0.3323613407485345, "flos": 20997041443200.0, "grad_norm": 5.064860664836912, "language_loss": 0.78500509, "learning_rate": 3.1152051957344434e-06, "loss": 0.80611879, "num_input_tokens_seen": 118763275, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.41918945, "step": 5528, "time_per_iteration": 2.676032304763794 }, { "auxiliary_loss_clip": 0.01575552, "auxiliary_loss_mlp": 0.00511862, "balance_loss_clip": 1.28438246, "balance_loss_mlp": 0.46944773, "epoch": 0.3324214640012025, "flos": 13152535344000.0, "grad_norm": 3.8044523033866198, "language_loss": 0.88724518, "learning_rate": 3.1148818793226497e-06, "loss": 0.90811932, "num_input_tokens_seen": 118781110, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.42407227, "step": 5529, "time_per_iteration": 2.650164842605591 }, { "auxiliary_loss_clip": 0.01604846, "auxiliary_loss_mlp": 0.0053227, "balance_loss_clip": 1.30283022, "balance_loss_mlp": 0.48809063, "epoch": 0.33248158725387045, "flos": 22273522041600.0, "grad_norm": 15.485608236127607, "language_loss": 0.76649082, "learning_rate": 3.114558520634423e-06, "loss": 0.787862, "num_input_tokens_seen": 118800620, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.44213867, "step": 5530, "time_per_iteration": 2.737518072128296 }, { "auxiliary_loss_clip": 0.01591692, "auxiliary_loss_mlp": 0.00517289, "balance_loss_clip": 1.28805137, "balance_loss_mlp": 0.47644785, "epoch": 0.3325417105065384, "flos": 20740998320640.0, "grad_norm": 6.311591444167064, "language_loss": 0.82121098, "learning_rate": 3.1142351196820256e-06, "loss": 0.84230077, "num_input_tokens_seen": 118818725, "router_z_loss_clip": 3.03710938, "router_z_loss_mlp": 0.40869141, "step": 5531, "time_per_iteration": 2.6790318489074707 }, { "auxiliary_loss_clip": 0.01600055, "auxiliary_loss_mlp": 0.00539687, "balance_loss_clip": 1.30024648, "balance_loss_mlp": 0.49467313, "epoch": 0.3326018337592064, "flos": 24790536702720.0, "grad_norm": 69.33304113066231, "language_loss": 0.78527749, "learning_rate": 3.1139116764777206e-06, "loss": 0.80667484, "num_input_tokens_seen": 118839390, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.45019531, "step": 5532, "time_per_iteration": 2.6913888454437256 }, { "auxiliary_loss_clip": 0.01584453, "auxiliary_loss_mlp": 0.00502952, "balance_loss_clip": 1.28716898, "balance_loss_mlp": 0.45758128, "epoch": 0.33266195701187434, "flos": 14501699112960.0, "grad_norm": 14.834023333039529, "language_loss": 0.73751616, "learning_rate": 3.1135881910337735e-06, "loss": 0.75839019, "num_input_tokens_seen": 118856275, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.45336914, "step": 5533, "time_per_iteration": 2.7271888256073 }, { "auxiliary_loss_clip": 0.01566034, "auxiliary_loss_mlp": 0.00500704, "balance_loss_clip": 1.27180243, "balance_loss_mlp": 0.45795509, "epoch": 0.3327220802645423, "flos": 15304410299520.0, "grad_norm": 6.245691370170473, "language_loss": 0.77020419, "learning_rate": 3.113264663362451e-06, "loss": 0.79087162, "num_input_tokens_seen": 118873830, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.42724609, "step": 5534, "time_per_iteration": 2.698463201522827 }, { "auxiliary_loss_clip": 0.01583257, "auxiliary_loss_mlp": 0.00528854, "balance_loss_clip": 1.28748798, "balance_loss_mlp": 0.48541397, "epoch": 0.3327822035172103, "flos": 23477534951040.0, "grad_norm": 70.97305995391447, "language_loss": 0.7228334, "learning_rate": 3.1129410934760204e-06, "loss": 0.74395448, "num_input_tokens_seen": 118891560, "router_z_loss_clip": 2.95703125, "router_z_loss_mlp": 0.43408203, "step": 5535, "time_per_iteration": 2.692518711090088 }, { "auxiliary_loss_clip": 0.01571523, "auxiliary_loss_mlp": 0.00547771, "balance_loss_clip": 1.27563143, "balance_loss_mlp": 0.5010649, "epoch": 0.33284232676987824, "flos": 25374516019200.0, "grad_norm": 28.395863270228844, "language_loss": 0.779576, "learning_rate": 3.1126174813867517e-06, "loss": 0.80076897, "num_input_tokens_seen": 118910260, "router_z_loss_clip": 2.9609375, "router_z_loss_mlp": 0.46679688, "step": 5536, "time_per_iteration": 2.660176992416382 }, { "auxiliary_loss_clip": 0.01574601, "auxiliary_loss_mlp": 0.005375, "balance_loss_clip": 1.27547586, "balance_loss_mlp": 0.4915086, "epoch": 0.3329024500225462, "flos": 23694363400320.0, "grad_norm": 14.569887290546781, "language_loss": 0.87195206, "learning_rate": 3.112293827106917e-06, "loss": 0.89307308, "num_input_tokens_seen": 118929985, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.46020508, "step": 5537, "time_per_iteration": 2.6532106399536133 }, { "auxiliary_loss_clip": 0.01585102, "auxiliary_loss_mlp": 0.00511739, "balance_loss_clip": 1.28471637, "balance_loss_mlp": 0.46708292, "epoch": 0.33296257327521417, "flos": 31723163205120.0, "grad_norm": 5.754768013059788, "language_loss": 0.77808118, "learning_rate": 3.111970130648789e-06, "loss": 0.7990495, "num_input_tokens_seen": 118951355, "router_z_loss_clip": 3.00585938, "router_z_loss_mlp": 0.44677734, "step": 5538, "time_per_iteration": 2.7051334381103516 }, { "auxiliary_loss_clip": 0.01540243, "auxiliary_loss_mlp": 0.00476653, "balance_loss_clip": 1.25436044, "balance_loss_mlp": 0.43540639, "epoch": 0.33302269652788213, "flos": 22744705674240.0, "grad_norm": 28.202062898663343, "language_loss": 0.80048895, "learning_rate": 3.1116463920246424e-06, "loss": 0.82065797, "num_input_tokens_seen": 118970910, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.4128418, "step": 5539, "time_per_iteration": 2.6633849143981934 }, { "auxiliary_loss_clip": 0.01577545, "auxiliary_loss_mlp": 0.00515418, "balance_loss_clip": 1.27594006, "balance_loss_mlp": 0.4706912, "epoch": 0.33308281978055015, "flos": 11473747441920.0, "grad_norm": 5.322936735697194, "language_loss": 0.7732451, "learning_rate": 3.1113226112467527e-06, "loss": 0.79417473, "num_input_tokens_seen": 118989200, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.44750977, "step": 5540, "time_per_iteration": 2.63276743888855 }, { "auxiliary_loss_clip": 0.01554375, "auxiliary_loss_mlp": 0.00527425, "balance_loss_clip": 1.26103187, "balance_loss_mlp": 0.48355561, "epoch": 0.3331429430332181, "flos": 38213693112960.0, "grad_norm": 2.7294732945503584, "language_loss": 0.65095437, "learning_rate": 3.1109987883273983e-06, "loss": 0.67177236, "num_input_tokens_seen": 119011030, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.4387207, "step": 5541, "time_per_iteration": 2.765655279159546 }, { "auxiliary_loss_clip": 0.01534911, "auxiliary_loss_mlp": 0.00498972, "balance_loss_clip": 1.24259782, "balance_loss_mlp": 0.45684344, "epoch": 0.3332030662858861, "flos": 22528667324160.0, "grad_norm": 2.674965133707393, "language_loss": 0.75876474, "learning_rate": 3.1106749232788584e-06, "loss": 0.77910358, "num_input_tokens_seen": 119030620, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.42138672, "step": 5542, "time_per_iteration": 2.650214195251465 }, { "auxiliary_loss_clip": 0.01544128, "auxiliary_loss_mlp": 0.00535913, "balance_loss_clip": 1.24934721, "balance_loss_mlp": 0.49075603, "epoch": 0.33326318953855405, "flos": 15997773507840.0, "grad_norm": 38.439029142757065, "language_loss": 0.7938571, "learning_rate": 3.110351016113414e-06, "loss": 0.81465745, "num_input_tokens_seen": 119048015, "router_z_loss_clip": 2.94726562, "router_z_loss_mlp": 0.45166016, "step": 5543, "time_per_iteration": 2.6148736476898193 }, { "auxiliary_loss_clip": 0.01561131, "auxiliary_loss_mlp": 0.00517572, "balance_loss_clip": 1.26374698, "balance_loss_mlp": 0.47139078, "epoch": 0.333323312791222, "flos": 25593535198080.0, "grad_norm": 46.708099611856134, "language_loss": 0.80759609, "learning_rate": 3.110027066843348e-06, "loss": 0.82838321, "num_input_tokens_seen": 119066280, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.46142578, "step": 5544, "time_per_iteration": 2.70951247215271 }, { "auxiliary_loss_clip": 0.01536038, "auxiliary_loss_mlp": 0.00568343, "balance_loss_clip": 1.24336219, "balance_loss_mlp": 0.52115953, "epoch": 0.33338343604389, "flos": 25119550304640.0, "grad_norm": 52.51805925828019, "language_loss": 0.76017356, "learning_rate": 3.1097030754809456e-06, "loss": 0.78121734, "num_input_tokens_seen": 119087680, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.47167969, "step": 5545, "time_per_iteration": 2.732720375061035 }, { "auxiliary_loss_clip": 0.01545581, "auxiliary_loss_mlp": 0.0051878, "balance_loss_clip": 1.25494313, "balance_loss_mlp": 0.47393349, "epoch": 0.33344355929655795, "flos": 16947287579520.0, "grad_norm": 6.875822240436938, "language_loss": 0.74610537, "learning_rate": 3.1093790420384894e-06, "loss": 0.76674896, "num_input_tokens_seen": 119105820, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.44824219, "step": 5546, "time_per_iteration": 2.6604368686676025 }, { "auxiliary_loss_clip": 0.01520167, "auxiliary_loss_mlp": 0.00562627, "balance_loss_clip": 1.22705519, "balance_loss_mlp": 0.51568288, "epoch": 0.3335036825492259, "flos": 27889591345920.0, "grad_norm": 133.25581305137445, "language_loss": 0.69674468, "learning_rate": 3.1090549665282702e-06, "loss": 0.71757263, "num_input_tokens_seen": 119126630, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.46948242, "step": 5547, "time_per_iteration": 4.260615348815918 }, { "auxiliary_loss_clip": 0.01550578, "auxiliary_loss_mlp": 0.00526865, "balance_loss_clip": 1.25759506, "balance_loss_mlp": 0.48478419, "epoch": 0.3335638058018939, "flos": 16179553261440.0, "grad_norm": 19.687291209613875, "language_loss": 0.91841704, "learning_rate": 3.1087308489625742e-06, "loss": 0.93919146, "num_input_tokens_seen": 119143375, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.42089844, "step": 5548, "time_per_iteration": 4.02365255355835 }, { "auxiliary_loss_clip": 0.01541779, "auxiliary_loss_mlp": 0.00503082, "balance_loss_clip": 1.24198031, "balance_loss_mlp": 0.4568764, "epoch": 0.33362392905456184, "flos": 39896108288640.0, "grad_norm": 7.318126467127114, "language_loss": 0.7980876, "learning_rate": 3.1084066893536945e-06, "loss": 0.81853622, "num_input_tokens_seen": 119166450, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.46191406, "step": 5549, "time_per_iteration": 2.841622829437256 }, { "auxiliary_loss_clip": 0.01532116, "auxiliary_loss_mlp": 0.00497906, "balance_loss_clip": 1.23636389, "balance_loss_mlp": 0.45515773, "epoch": 0.3336840523072298, "flos": 44271212567040.0, "grad_norm": 13.162228822589661, "language_loss": 0.73314977, "learning_rate": 3.108082487713921e-06, "loss": 0.75345004, "num_input_tokens_seen": 119189645, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.42773438, "step": 5550, "time_per_iteration": 2.9085965156555176 }, { "auxiliary_loss_clip": 0.01546138, "auxiliary_loss_mlp": 0.00510713, "balance_loss_clip": 1.24776793, "balance_loss_mlp": 0.46841744, "epoch": 0.33374417555989777, "flos": 15085678429440.0, "grad_norm": 6.905128605925467, "language_loss": 0.6645838, "learning_rate": 3.1077582440555495e-06, "loss": 0.68515229, "num_input_tokens_seen": 119208045, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.42285156, "step": 5551, "time_per_iteration": 2.6311562061309814 }, { "auxiliary_loss_clip": 0.01525882, "auxiliary_loss_mlp": 0.00516762, "balance_loss_clip": 1.23643744, "balance_loss_mlp": 0.47201061, "epoch": 0.33380429881256574, "flos": 15849174942720.0, "grad_norm": 5.167904644467356, "language_loss": 0.75524777, "learning_rate": 3.1074339583908746e-06, "loss": 0.77567416, "num_input_tokens_seen": 119224910, "router_z_loss_clip": 2.89453125, "router_z_loss_mlp": 0.44750977, "step": 5552, "time_per_iteration": 2.797231435775757 }, { "auxiliary_loss_clip": 0.01513597, "auxiliary_loss_mlp": 0.00493994, "balance_loss_clip": 1.22580624, "balance_loss_mlp": 0.45181811, "epoch": 0.33386442206523376, "flos": 13480327883520.0, "grad_norm": 13.274758104406505, "language_loss": 0.8980478, "learning_rate": 3.107109630732192e-06, "loss": 0.91812372, "num_input_tokens_seen": 119243290, "router_z_loss_clip": 2.87695312, "router_z_loss_mlp": 0.42211914, "step": 5553, "time_per_iteration": 4.007889270782471 }, { "auxiliary_loss_clip": 0.01523478, "auxiliary_loss_mlp": 0.00471147, "balance_loss_clip": 1.23198164, "balance_loss_mlp": 0.43023473, "epoch": 0.3339245453179017, "flos": 16690669839360.0, "grad_norm": 25.027656995856034, "language_loss": 0.87479091, "learning_rate": 3.1067852610918017e-06, "loss": 0.89473712, "num_input_tokens_seen": 119261195, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.40942383, "step": 5554, "time_per_iteration": 2.7050328254699707 }, { "auxiliary_loss_clip": 0.01530255, "auxiliary_loss_mlp": 0.00474902, "balance_loss_clip": 1.23294389, "balance_loss_mlp": 0.43246388, "epoch": 0.3339846685705697, "flos": 24610624456320.0, "grad_norm": 20.86485701924114, "language_loss": 0.87275565, "learning_rate": 3.1064608494820032e-06, "loss": 0.89280719, "num_input_tokens_seen": 119282845, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.42456055, "step": 5555, "time_per_iteration": 2.7094626426696777 }, { "auxiliary_loss_clip": 0.01536364, "auxiliary_loss_mlp": 0.0049315, "balance_loss_clip": 1.24181604, "balance_loss_mlp": 0.44973415, "epoch": 0.33404479182323765, "flos": 30953812775040.0, "grad_norm": 11.884020416932232, "language_loss": 0.79386497, "learning_rate": 3.106136395915099e-06, "loss": 0.81416011, "num_input_tokens_seen": 119304430, "router_z_loss_clip": 2.9453125, "router_z_loss_mlp": 0.43457031, "step": 5556, "time_per_iteration": 2.732884407043457 }, { "auxiliary_loss_clip": 0.01527082, "auxiliary_loss_mlp": 0.00466273, "balance_loss_clip": 1.23810136, "balance_loss_mlp": 0.42559928, "epoch": 0.3341049150759056, "flos": 23513301918720.0, "grad_norm": 5.634430592156986, "language_loss": 0.86651528, "learning_rate": 3.105811900403391e-06, "loss": 0.88644886, "num_input_tokens_seen": 119323830, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.40649414, "step": 5557, "time_per_iteration": 2.698448896408081 }, { "auxiliary_loss_clip": 0.01531305, "auxiliary_loss_mlp": 0.00426581, "balance_loss_clip": 1.23852038, "balance_loss_mlp": 0.38960272, "epoch": 0.3341650383285736, "flos": 24026824707840.0, "grad_norm": 352.5943687907297, "language_loss": 0.8342644, "learning_rate": 3.1054873629591855e-06, "loss": 0.85384321, "num_input_tokens_seen": 119346340, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.36962891, "step": 5558, "time_per_iteration": 2.825575828552246 }, { "auxiliary_loss_clip": 0.01510948, "auxiliary_loss_mlp": 0.00432179, "balance_loss_clip": 1.21573627, "balance_loss_mlp": 0.39341214, "epoch": 0.33422516158124155, "flos": 24901967669760.0, "grad_norm": 2.1941763082298333, "language_loss": 0.87150031, "learning_rate": 3.105162783594788e-06, "loss": 0.89093161, "num_input_tokens_seen": 119367285, "router_z_loss_clip": 2.95117188, "router_z_loss_mlp": 0.38793945, "step": 5559, "time_per_iteration": 4.196875095367432 }, { "auxiliary_loss_clip": 0.01496717, "auxiliary_loss_mlp": 0.00392367, "balance_loss_clip": 1.21420026, "balance_loss_mlp": 0.35724795, "epoch": 0.3342852848339095, "flos": 18333403464960.0, "grad_norm": 6.05266729596821, "language_loss": 0.76116145, "learning_rate": 3.1048381623225074e-06, "loss": 0.7800523, "num_input_tokens_seen": 119385370, "router_z_loss_clip": 2.82617188, "router_z_loss_mlp": 0.35107422, "step": 5560, "time_per_iteration": 2.646061420440674 }, { "auxiliary_loss_clip": 0.01508851, "auxiliary_loss_mlp": 0.00423967, "balance_loss_clip": 1.21763921, "balance_loss_mlp": 0.38534313, "epoch": 0.3343454080865775, "flos": 30046530119040.0, "grad_norm": 4.716856440101298, "language_loss": 0.80786216, "learning_rate": 3.1045134991546526e-06, "loss": 0.82719034, "num_input_tokens_seen": 119409150, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.38623047, "step": 5561, "time_per_iteration": 2.7490670680999756 }, { "auxiliary_loss_clip": 0.0151983, "auxiliary_loss_mlp": 0.00420586, "balance_loss_clip": 1.2287029, "balance_loss_mlp": 0.3825103, "epoch": 0.33440553133924544, "flos": 16398823835520.0, "grad_norm": 216.9696923270397, "language_loss": 0.75399613, "learning_rate": 3.1041887941035355e-06, "loss": 0.77340031, "num_input_tokens_seen": 119426475, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.38085938, "step": 5562, "time_per_iteration": 2.639038324356079 }, { "auxiliary_loss_clip": 0.01506078, "auxiliary_loss_mlp": 0.00364154, "balance_loss_clip": 1.21729779, "balance_loss_mlp": 0.3308233, "epoch": 0.3344656545919134, "flos": 24242072958720.0, "grad_norm": 5.591433907291675, "language_loss": 0.70583355, "learning_rate": 3.1038640471814685e-06, "loss": 0.72453582, "num_input_tokens_seen": 119446900, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.33349609, "step": 5563, "time_per_iteration": 2.690009832382202 }, { "auxiliary_loss_clip": 0.01530579, "auxiliary_loss_mlp": 0.00419479, "balance_loss_clip": 1.23253655, "balance_loss_mlp": 0.38002074, "epoch": 0.3345257778445814, "flos": 52118843149440.0, "grad_norm": 19.87167114398471, "language_loss": 0.78265846, "learning_rate": 3.103539258400766e-06, "loss": 0.80215895, "num_input_tokens_seen": 119470945, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.39453125, "step": 5564, "time_per_iteration": 2.940110445022583 }, { "auxiliary_loss_clip": 0.01496691, "auxiliary_loss_mlp": 0.00073659, "balance_loss_clip": 1.29375601, "balance_loss_mlp": 0.06369306, "epoch": 0.33458590109724934, "flos": 68048602254720.0, "grad_norm": 0.8057517237810922, "language_loss": 0.54941928, "learning_rate": 3.103214427773745e-06, "loss": 0.56512272, "num_input_tokens_seen": 119529925, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.09960938, "step": 5565, "time_per_iteration": 3.1427042484283447 }, { "auxiliary_loss_clip": 0.01521776, "auxiliary_loss_mlp": 0.00394313, "balance_loss_clip": 1.22981298, "balance_loss_mlp": 0.35909843, "epoch": 0.3346460243499173, "flos": 37414788768000.0, "grad_norm": 79.65497083136118, "language_loss": 0.71349871, "learning_rate": 3.102889555312721e-06, "loss": 0.73265958, "num_input_tokens_seen": 119550700, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.35229492, "step": 5566, "time_per_iteration": 2.766519069671631 }, { "auxiliary_loss_clip": 0.015084, "auxiliary_loss_mlp": 0.00391326, "balance_loss_clip": 1.21862936, "balance_loss_mlp": 0.35587275, "epoch": 0.3347061476025853, "flos": 18697358021760.0, "grad_norm": 2.652467543853517, "language_loss": 0.82713145, "learning_rate": 3.102564641030016e-06, "loss": 0.84612876, "num_input_tokens_seen": 119569295, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.35473633, "step": 5567, "time_per_iteration": 2.665616989135742 }, { "auxiliary_loss_clip": 0.01511983, "auxiliary_loss_mlp": 0.00426049, "balance_loss_clip": 1.21596718, "balance_loss_mlp": 0.38678133, "epoch": 0.3347662708552533, "flos": 13917827537280.0, "grad_norm": 2.7348646788553763, "language_loss": 0.83195448, "learning_rate": 3.102239684937949e-06, "loss": 0.85133481, "num_input_tokens_seen": 119587375, "router_z_loss_clip": 2.95507812, "router_z_loss_mlp": 0.39306641, "step": 5568, "time_per_iteration": 2.6135306358337402 }, { "auxiliary_loss_clip": 0.01535037, "auxiliary_loss_mlp": 0.0041354, "balance_loss_clip": 1.23490989, "balance_loss_mlp": 0.37625092, "epoch": 0.33482639410792125, "flos": 19750402068480.0, "grad_norm": 76.17103427571024, "language_loss": 0.76378047, "learning_rate": 3.101914687048842e-06, "loss": 0.78326631, "num_input_tokens_seen": 119604530, "router_z_loss_clip": 3.00195312, "router_z_loss_mlp": 0.37280273, "step": 5569, "time_per_iteration": 2.718444585800171 }, { "auxiliary_loss_clip": 0.01520853, "auxiliary_loss_mlp": 0.0041368, "balance_loss_clip": 1.2216177, "balance_loss_mlp": 0.3769401, "epoch": 0.3348865173605892, "flos": 16102991422080.0, "grad_norm": 293.08048769402546, "language_loss": 0.95181799, "learning_rate": 3.10158964737502e-06, "loss": 0.97116333, "num_input_tokens_seen": 119621025, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.3671875, "step": 5570, "time_per_iteration": 2.862739086151123 }, { "auxiliary_loss_clip": 0.01515847, "auxiliary_loss_mlp": 0.00386704, "balance_loss_clip": 1.22176993, "balance_loss_mlp": 0.34920084, "epoch": 0.3349466406132572, "flos": 25008945350400.0, "grad_norm": 30.53975102390588, "language_loss": 0.85130179, "learning_rate": 3.101264565928808e-06, "loss": 0.87032729, "num_input_tokens_seen": 119641725, "router_z_loss_clip": 2.93945312, "router_z_loss_mlp": 0.37524414, "step": 5571, "time_per_iteration": 2.783130645751953 }, { "auxiliary_loss_clip": 0.01545861, "auxiliary_loss_mlp": 0.00230658, "balance_loss_clip": 1.33117783, "balance_loss_mlp": 0.21888056, "epoch": 0.33500676386592515, "flos": 54319991564160.0, "grad_norm": 1.0264118522767554, "language_loss": 0.55268157, "learning_rate": 3.1009394427225335e-06, "loss": 0.57044673, "num_input_tokens_seen": 119693560, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.11767578, "step": 5572, "time_per_iteration": 3.1498193740844727 }, { "auxiliary_loss_clip": 0.01534155, "auxiliary_loss_mlp": 0.00431359, "balance_loss_clip": 1.23993659, "balance_loss_mlp": 0.39411792, "epoch": 0.3350668871185931, "flos": 26797332625920.0, "grad_norm": 208.2353012507294, "language_loss": 0.85203719, "learning_rate": 3.1006142777685257e-06, "loss": 0.87169236, "num_input_tokens_seen": 119712935, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.37231445, "step": 5573, "time_per_iteration": 2.7048099040985107 }, { "auxiliary_loss_clip": 0.01523163, "auxiliary_loss_mlp": 0.00382255, "balance_loss_clip": 1.22708035, "balance_loss_mlp": 0.34746981, "epoch": 0.3351270103712611, "flos": 33510508986240.0, "grad_norm": 8.606320182678555, "language_loss": 0.80444169, "learning_rate": 3.1002890710791133e-06, "loss": 0.82349586, "num_input_tokens_seen": 119731680, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.34790039, "step": 5574, "time_per_iteration": 2.7470831871032715 }, { "auxiliary_loss_clip": 0.01540146, "auxiliary_loss_mlp": 0.00365377, "balance_loss_clip": 1.24823332, "balance_loss_mlp": 0.33156967, "epoch": 0.33518713362392905, "flos": 26506240807680.0, "grad_norm": 6.444732463716021, "language_loss": 0.92807651, "learning_rate": 3.0999638226666287e-06, "loss": 0.94713169, "num_input_tokens_seen": 119752155, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.33813477, "step": 5575, "time_per_iteration": 2.7024929523468018 }, { "auxiliary_loss_clip": 0.01583254, "auxiliary_loss_mlp": 0.00372733, "balance_loss_clip": 1.26940513, "balance_loss_mlp": 0.33606422, "epoch": 0.335247256876597, "flos": 17232345912960.0, "grad_norm": 4.1518286083812805, "language_loss": 0.87862605, "learning_rate": 3.0996385325434063e-06, "loss": 0.89818591, "num_input_tokens_seen": 119769195, "router_z_loss_clip": 3.13867188, "router_z_loss_mlp": 0.36645508, "step": 5576, "time_per_iteration": 2.639709711074829 }, { "auxiliary_loss_clip": 0.01534514, "auxiliary_loss_mlp": 0.0035317, "balance_loss_clip": 1.2368871, "balance_loss_mlp": 0.31595263, "epoch": 0.335307380129265, "flos": 25629373992960.0, "grad_norm": 1052.0087385056956, "language_loss": 0.78825974, "learning_rate": 3.0993132007217806e-06, "loss": 0.8071366, "num_input_tokens_seen": 119786810, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.37231445, "step": 5577, "time_per_iteration": 2.6559345722198486 }, { "auxiliary_loss_clip": 0.01561902, "auxiliary_loss_mlp": 0.0036954, "balance_loss_clip": 1.26151848, "balance_loss_mlp": 0.33263314, "epoch": 0.33536750338193294, "flos": 19680089195520.0, "grad_norm": 55.66494601609688, "language_loss": 0.85998583, "learning_rate": 3.0989878272140883e-06, "loss": 0.87930024, "num_input_tokens_seen": 119805395, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.36914062, "step": 5578, "time_per_iteration": 2.6454966068267822 }, { "auxiliary_loss_clip": 0.01533621, "auxiliary_loss_mlp": 0.0037571, "balance_loss_clip": 1.24074733, "balance_loss_mlp": 0.33911306, "epoch": 0.3354276266346009, "flos": 18332613365760.0, "grad_norm": 15.32926166381112, "language_loss": 0.80194861, "learning_rate": 3.0986624120326676e-06, "loss": 0.82104194, "num_input_tokens_seen": 119823135, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.36547852, "step": 5579, "time_per_iteration": 2.612842559814453 }, { "auxiliary_loss_clip": 0.01526289, "auxiliary_loss_mlp": 0.00382907, "balance_loss_clip": 1.23031151, "balance_loss_mlp": 0.34802598, "epoch": 0.3354877498872689, "flos": 17858556645120.0, "grad_norm": 21.226347619111102, "language_loss": 0.88099837, "learning_rate": 3.0983369551898573e-06, "loss": 0.90009034, "num_input_tokens_seen": 119842265, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.34887695, "step": 5580, "time_per_iteration": 2.632863759994507 }, { "auxiliary_loss_clip": 0.01543369, "auxiliary_loss_mlp": 0.0040766, "balance_loss_clip": 1.24537349, "balance_loss_mlp": 0.37020475, "epoch": 0.3355478731399369, "flos": 24717745791360.0, "grad_norm": 3.7471992009845363, "language_loss": 0.82669032, "learning_rate": 3.0980114566980003e-06, "loss": 0.84620064, "num_input_tokens_seen": 119862500, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.37426758, "step": 5581, "time_per_iteration": 2.68477725982666 }, { "auxiliary_loss_clip": 0.01569792, "auxiliary_loss_mlp": 0.00413438, "balance_loss_clip": 1.26225519, "balance_loss_mlp": 0.3743847, "epoch": 0.33560799639260486, "flos": 16873886136960.0, "grad_norm": 11.334767225374792, "language_loss": 0.8073982, "learning_rate": 3.0976859165694384e-06, "loss": 0.82723045, "num_input_tokens_seen": 119880160, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.39038086, "step": 5582, "time_per_iteration": 2.6129021644592285 }, { "auxiliary_loss_clip": 0.01551277, "auxiliary_loss_mlp": 0.00396031, "balance_loss_clip": 1.2513001, "balance_loss_mlp": 0.35845616, "epoch": 0.3356681196452728, "flos": 18333511205760.0, "grad_norm": 3299.54745304531, "language_loss": 0.88300955, "learning_rate": 3.0973603348165166e-06, "loss": 0.90248257, "num_input_tokens_seen": 119899040, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.37597656, "step": 5583, "time_per_iteration": 2.676743745803833 }, { "auxiliary_loss_clip": 0.01548624, "auxiliary_loss_mlp": 0.0036086, "balance_loss_clip": 1.2516669, "balance_loss_mlp": 0.32264179, "epoch": 0.3357282428979408, "flos": 34750612085760.0, "grad_norm": 35.37925294199629, "language_loss": 0.8425647, "learning_rate": 3.097034711451581e-06, "loss": 0.86165953, "num_input_tokens_seen": 119921120, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.38208008, "step": 5584, "time_per_iteration": 2.8251233100891113 }, { "auxiliary_loss_clip": 0.01550153, "auxiliary_loss_mlp": 0.00391207, "balance_loss_clip": 1.2517885, "balance_loss_mlp": 0.35425195, "epoch": 0.33578836615060875, "flos": 21580087006080.0, "grad_norm": 5.388423713421417, "language_loss": 0.82181233, "learning_rate": 3.0967090464869795e-06, "loss": 0.84122592, "num_input_tokens_seen": 119940165, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.36987305, "step": 5585, "time_per_iteration": 2.6921873092651367 }, { "auxiliary_loss_clip": 0.01576285, "auxiliary_loss_mlp": 0.00374348, "balance_loss_clip": 1.27838063, "balance_loss_mlp": 0.33779863, "epoch": 0.3358484894032767, "flos": 24530291688960.0, "grad_norm": 24.195852877666933, "language_loss": 0.82955867, "learning_rate": 3.0963833399350608e-06, "loss": 0.84906495, "num_input_tokens_seen": 119959730, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.36572266, "step": 5586, "time_per_iteration": 2.7588794231414795 }, { "auxiliary_loss_clip": 0.01588363, "auxiliary_loss_mlp": 0.00353321, "balance_loss_clip": 1.27562213, "balance_loss_mlp": 0.31410095, "epoch": 0.3359086126559447, "flos": 22455589104000.0, "grad_norm": 103.3704683249579, "language_loss": 0.8943662, "learning_rate": 3.0960575918081756e-06, "loss": 0.91378307, "num_input_tokens_seen": 119979315, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.39233398, "step": 5587, "time_per_iteration": 2.7122697830200195 }, { "auxiliary_loss_clip": 0.01589126, "auxiliary_loss_mlp": 0.00349829, "balance_loss_clip": 1.29490042, "balance_loss_mlp": 0.31568766, "epoch": 0.33596873590861265, "flos": 16543687386240.0, "grad_norm": 1194.232375840977, "language_loss": 0.73894513, "learning_rate": 3.095731802118677e-06, "loss": 0.75833464, "num_input_tokens_seen": 119996140, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.34179688, "step": 5588, "time_per_iteration": 2.685500383377075 }, { "auxiliary_loss_clip": 0.01591555, "auxiliary_loss_mlp": 0.0036385, "balance_loss_clip": 1.28324127, "balance_loss_mlp": 0.32515472, "epoch": 0.3360288591612806, "flos": 31175812782720.0, "grad_norm": 22.357301113988644, "language_loss": 0.75951552, "learning_rate": 3.095405970878919e-06, "loss": 0.77906954, "num_input_tokens_seen": 120017720, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.38720703, "step": 5589, "time_per_iteration": 4.156806468963623 }, { "auxiliary_loss_clip": 0.01587954, "auxiliary_loss_mlp": 0.00371514, "balance_loss_clip": 1.28618753, "balance_loss_mlp": 0.33608454, "epoch": 0.3360889824139486, "flos": 23696913265920.0, "grad_norm": 57.76248346447716, "language_loss": 0.72684246, "learning_rate": 3.0950800981012567e-06, "loss": 0.74643713, "num_input_tokens_seen": 120036335, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.35424805, "step": 5590, "time_per_iteration": 4.171026706695557 }, { "auxiliary_loss_clip": 0.01593126, "auxiliary_loss_mlp": 0.00337489, "balance_loss_clip": 1.29468179, "balance_loss_mlp": 0.30203605, "epoch": 0.33614910566661654, "flos": 19318109886720.0, "grad_norm": 3.0197056841907592, "language_loss": 0.81321573, "learning_rate": 3.094754183798047e-06, "loss": 0.83252186, "num_input_tokens_seen": 120056120, "router_z_loss_clip": 2.98828125, "router_z_loss_mlp": 0.35449219, "step": 5591, "time_per_iteration": 2.6838200092315674 }, { "auxiliary_loss_clip": 0.01591979, "auxiliary_loss_mlp": 0.00349122, "balance_loss_clip": 1.29555023, "balance_loss_mlp": 0.31207216, "epoch": 0.3362092289192845, "flos": 16472261191680.0, "grad_norm": 33.06302673028555, "language_loss": 0.76515383, "learning_rate": 3.0944282279816493e-06, "loss": 0.78456485, "num_input_tokens_seen": 120073650, "router_z_loss_clip": 2.96679688, "router_z_loss_mlp": 0.37084961, "step": 5592, "time_per_iteration": 2.6967225074768066 }, { "auxiliary_loss_clip": 0.01558369, "auxiliary_loss_mlp": 0.00347929, "balance_loss_clip": 1.26691604, "balance_loss_mlp": 0.31340617, "epoch": 0.33626935217195253, "flos": 24243581329920.0, "grad_norm": 27.44926009669974, "language_loss": 0.82425714, "learning_rate": 3.094102230664423e-06, "loss": 0.84332013, "num_input_tokens_seen": 120093260, "router_z_loss_clip": 2.91601562, "router_z_loss_mlp": 0.34545898, "step": 5593, "time_per_iteration": 2.6847188472747803 }, { "auxiliary_loss_clip": 0.01581792, "auxiliary_loss_mlp": 0.00362582, "balance_loss_clip": 1.27785945, "balance_loss_mlp": 0.32348162, "epoch": 0.3363294754246205, "flos": 19718765164800.0, "grad_norm": 87.56725031354604, "language_loss": 0.78874129, "learning_rate": 3.093776191858731e-06, "loss": 0.80818498, "num_input_tokens_seen": 120111830, "router_z_loss_clip": 3.03515625, "router_z_loss_mlp": 0.39111328, "step": 5594, "time_per_iteration": 2.6315743923187256 }, { "auxiliary_loss_clip": 0.01593111, "auxiliary_loss_mlp": 0.00367594, "balance_loss_clip": 1.29271281, "balance_loss_mlp": 0.32861251, "epoch": 0.33638959867728846, "flos": 22596286677120.0, "grad_norm": 25.581757089423586, "language_loss": 0.84858316, "learning_rate": 3.0934501115769363e-06, "loss": 0.86819023, "num_input_tokens_seen": 120130470, "router_z_loss_clip": 3.00390625, "router_z_loss_mlp": 0.38989258, "step": 5595, "time_per_iteration": 4.11200737953186 }, { "auxiliary_loss_clip": 0.01597624, "auxiliary_loss_mlp": 0.00324098, "balance_loss_clip": 1.30071568, "balance_loss_mlp": 0.2880972, "epoch": 0.3364497219299564, "flos": 20994742972800.0, "grad_norm": 19.125038599518827, "language_loss": 0.87747091, "learning_rate": 3.0931239898314037e-06, "loss": 0.8966881, "num_input_tokens_seen": 120150735, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.36010742, "step": 5596, "time_per_iteration": 2.712236166000366 }, { "auxiliary_loss_clip": 0.01591046, "auxiliary_loss_mlp": 0.00363684, "balance_loss_clip": 1.29055262, "balance_loss_mlp": 0.3294946, "epoch": 0.3365098451826244, "flos": 25228610974080.0, "grad_norm": 18.191507710314223, "language_loss": 0.80562866, "learning_rate": 3.0927978266344995e-06, "loss": 0.82517594, "num_input_tokens_seen": 120173230, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.34204102, "step": 5597, "time_per_iteration": 2.836308002471924 }, { "auxiliary_loss_clip": 0.01588202, "auxiliary_loss_mlp": 0.00371647, "balance_loss_clip": 1.29056537, "balance_loss_mlp": 0.33504951, "epoch": 0.33656996843529235, "flos": 24571697091840.0, "grad_norm": 3.6634150951222573, "language_loss": 0.85010904, "learning_rate": 3.0924716219985916e-06, "loss": 0.86970752, "num_input_tokens_seen": 120191860, "router_z_loss_clip": 2.9765625, "router_z_loss_mlp": 0.36572266, "step": 5598, "time_per_iteration": 2.6566855907440186 }, { "auxiliary_loss_clip": 0.01621995, "auxiliary_loss_mlp": 0.00402566, "balance_loss_clip": 1.30762208, "balance_loss_mlp": 0.36241624, "epoch": 0.3366300916879603, "flos": 44091120752640.0, "grad_norm": 14.016018202304966, "language_loss": 0.70498919, "learning_rate": 3.0921453759360514e-06, "loss": 0.72523481, "num_input_tokens_seen": 120219195, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.40136719, "step": 5599, "time_per_iteration": 2.821074962615967 }, { "auxiliary_loss_clip": 0.01582871, "auxiliary_loss_mlp": 0.00381205, "balance_loss_clip": 1.27528596, "balance_loss_mlp": 0.34022084, "epoch": 0.3366902149406283, "flos": 13879869840000.0, "grad_norm": 9.64623936925707, "language_loss": 0.90438521, "learning_rate": 3.091819088459249e-06, "loss": 0.92402595, "num_input_tokens_seen": 120232950, "router_z_loss_clip": 3.07617188, "router_z_loss_mlp": 0.40966797, "step": 5600, "time_per_iteration": 2.590841293334961 }, { "auxiliary_loss_clip": 0.01593971, "auxiliary_loss_mlp": 0.00347623, "balance_loss_clip": 1.28846407, "balance_loss_mlp": 0.31200367, "epoch": 0.33675033819329625, "flos": 16253098358400.0, "grad_norm": 123.89688377472987, "language_loss": 0.88376749, "learning_rate": 3.0914927595805573e-06, "loss": 0.90318346, "num_input_tokens_seen": 120248865, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.35644531, "step": 5601, "time_per_iteration": 3.998593330383301 }, { "auxiliary_loss_clip": 0.01611561, "auxiliary_loss_mlp": 0.00336867, "balance_loss_clip": 1.31644046, "balance_loss_mlp": 0.30346438, "epoch": 0.3368104614459642, "flos": 17055809544960.0, "grad_norm": 73.65743158595602, "language_loss": 0.89894515, "learning_rate": 3.0911663893123507e-06, "loss": 0.91842943, "num_input_tokens_seen": 120267820, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.33422852, "step": 5602, "time_per_iteration": 2.6501998901367188 }, { "auxiliary_loss_clip": 0.01599793, "auxiliary_loss_mlp": 0.00373667, "balance_loss_clip": 1.30186224, "balance_loss_mlp": 0.33709359, "epoch": 0.3368705846986322, "flos": 17858628472320.0, "grad_norm": 9.367974326772686, "language_loss": 0.75468409, "learning_rate": 3.0908399776670048e-06, "loss": 0.77441871, "num_input_tokens_seen": 120286540, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.3659668, "step": 5603, "time_per_iteration": 2.6472818851470947 }, { "auxiliary_loss_clip": 0.01629645, "auxiliary_loss_mlp": 0.00390888, "balance_loss_clip": 1.32413876, "balance_loss_mlp": 0.35266981, "epoch": 0.33693070795130015, "flos": 22929502170240.0, "grad_norm": 32.85631066989053, "language_loss": 0.87790245, "learning_rate": 3.090513524656898e-06, "loss": 0.89810777, "num_input_tokens_seen": 120307305, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.38208008, "step": 5604, "time_per_iteration": 2.6615848541259766 }, { "auxiliary_loss_clip": 0.01621022, "auxiliary_loss_mlp": 0.00379452, "balance_loss_clip": 1.31628835, "balance_loss_mlp": 0.34216344, "epoch": 0.3369908312039681, "flos": 22017443005440.0, "grad_norm": 2.08017111106845, "language_loss": 0.79698372, "learning_rate": 3.090187030294409e-06, "loss": 0.81698841, "num_input_tokens_seen": 120327845, "router_z_loss_clip": 3.046875, "router_z_loss_mlp": 0.37280273, "step": 5605, "time_per_iteration": 2.6683003902435303 }, { "auxiliary_loss_clip": 0.01643846, "auxiliary_loss_mlp": 0.00413838, "balance_loss_clip": 1.33239174, "balance_loss_mlp": 0.37385544, "epoch": 0.33705095445663613, "flos": 11801970944640.0, "grad_norm": 6.184367869605203, "language_loss": 0.89504039, "learning_rate": 3.089860494591919e-06, "loss": 0.91561717, "num_input_tokens_seen": 120343255, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.40014648, "step": 5606, "time_per_iteration": 2.627930164337158 }, { "auxiliary_loss_clip": 0.01630758, "auxiliary_loss_mlp": 0.00363268, "balance_loss_clip": 1.32452416, "balance_loss_mlp": 0.32409561, "epoch": 0.3371110777093041, "flos": 25046400257280.0, "grad_norm": 1.758199402811793, "language_loss": 0.73085803, "learning_rate": 3.089533917561809e-06, "loss": 0.75079823, "num_input_tokens_seen": 120361745, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.39160156, "step": 5607, "time_per_iteration": 2.7224080562591553 }, { "auxiliary_loss_clip": 0.01633906, "auxiliary_loss_mlp": 0.00427294, "balance_loss_clip": 1.32143724, "balance_loss_mlp": 0.38719207, "epoch": 0.33717120096197206, "flos": 26579031719040.0, "grad_norm": 5.635755917585983, "language_loss": 0.7832191, "learning_rate": 3.089207299216464e-06, "loss": 0.80383104, "num_input_tokens_seen": 120380565, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.40087891, "step": 5608, "time_per_iteration": 2.837798833847046 }, { "auxiliary_loss_clip": 0.01633129, "auxiliary_loss_mlp": 0.00415322, "balance_loss_clip": 1.32576799, "balance_loss_mlp": 0.37674558, "epoch": 0.33723132421464, "flos": 15158541168000.0, "grad_norm": 9.403551934723325, "language_loss": 0.8491286, "learning_rate": 3.088880639568269e-06, "loss": 0.86961311, "num_input_tokens_seen": 120399235, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.38574219, "step": 5609, "time_per_iteration": 2.6769025325775146 }, { "auxiliary_loss_clip": 0.0164015, "auxiliary_loss_mlp": 0.00452785, "balance_loss_clip": 1.32881725, "balance_loss_mlp": 0.40998852, "epoch": 0.337291447467308, "flos": 23436093634560.0, "grad_norm": 6.40304731037525, "language_loss": 0.87100255, "learning_rate": 3.0885539386296114e-06, "loss": 0.89193189, "num_input_tokens_seen": 120420095, "router_z_loss_clip": 3.11523438, "router_z_loss_mlp": 0.42773438, "step": 5610, "time_per_iteration": 2.6841881275177 }, { "auxiliary_loss_clip": 0.01652004, "auxiliary_loss_mlp": 0.0041201, "balance_loss_clip": 1.34475589, "balance_loss_mlp": 0.37286192, "epoch": 0.33735157071997596, "flos": 17238163916160.0, "grad_norm": 2.6111512218583526, "language_loss": 0.88976723, "learning_rate": 3.088227196412879e-06, "loss": 0.91040736, "num_input_tokens_seen": 120437690, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.3918457, "step": 5611, "time_per_iteration": 2.6425278186798096 }, { "auxiliary_loss_clip": 0.01635836, "auxiliary_loss_mlp": 0.00462604, "balance_loss_clip": 1.32646894, "balance_loss_mlp": 0.41878232, "epoch": 0.3374116939726439, "flos": 28257388657920.0, "grad_norm": 97.05374320654698, "language_loss": 0.84320545, "learning_rate": 3.0879004129304626e-06, "loss": 0.8641898, "num_input_tokens_seen": 120459240, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.43774414, "step": 5612, "time_per_iteration": 2.7237908840179443 }, { "auxiliary_loss_clip": 0.0164081, "auxiliary_loss_mlp": 0.00491291, "balance_loss_clip": 1.33052969, "balance_loss_mlp": 0.44789928, "epoch": 0.3374718172253119, "flos": 35919396731520.0, "grad_norm": 28.617111203699505, "language_loss": 0.75644296, "learning_rate": 3.087573588194753e-06, "loss": 0.77776396, "num_input_tokens_seen": 120481090, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.43432617, "step": 5613, "time_per_iteration": 2.7829782962799072 }, { "auxiliary_loss_clip": 0.01659478, "auxiliary_loss_mlp": 0.00461238, "balance_loss_clip": 1.34491527, "balance_loss_mlp": 0.41929978, "epoch": 0.33753194047797985, "flos": 18186672407040.0, "grad_norm": 18.727949652256477, "language_loss": 0.84484076, "learning_rate": 3.087246722218144e-06, "loss": 0.86604798, "num_input_tokens_seen": 120500045, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.41943359, "step": 5614, "time_per_iteration": 2.724947690963745 }, { "auxiliary_loss_clip": 0.01659461, "auxiliary_loss_mlp": 0.00508103, "balance_loss_clip": 1.34440458, "balance_loss_mlp": 0.46056187, "epoch": 0.3375920637306478, "flos": 23148916398720.0, "grad_norm": 10.083504187588652, "language_loss": 0.96133846, "learning_rate": 3.086919815013031e-06, "loss": 0.98301405, "num_input_tokens_seen": 120521125, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.47583008, "step": 5615, "time_per_iteration": 2.7331082820892334 }, { "auxiliary_loss_clip": 0.01646436, "auxiliary_loss_mlp": 0.0050217, "balance_loss_clip": 1.34010863, "balance_loss_mlp": 0.45958847, "epoch": 0.3376521869833158, "flos": 23112215677440.0, "grad_norm": 8.56628761434008, "language_loss": 0.85308164, "learning_rate": 3.086592866591809e-06, "loss": 0.87456769, "num_input_tokens_seen": 120539180, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.42553711, "step": 5616, "time_per_iteration": 2.737654447555542 }, { "auxiliary_loss_clip": 0.01666528, "auxiliary_loss_mlp": 0.00455815, "balance_loss_clip": 1.34549296, "balance_loss_mlp": 0.4128992, "epoch": 0.33771231023598375, "flos": 19274585581440.0, "grad_norm": 38.77886928773457, "language_loss": 0.89252031, "learning_rate": 3.0862658769668774e-06, "loss": 0.91374373, "num_input_tokens_seen": 120556280, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.42895508, "step": 5617, "time_per_iteration": 2.6170666217803955 }, { "auxiliary_loss_clip": 0.01674944, "auxiliary_loss_mlp": 0.0045434, "balance_loss_clip": 1.3621515, "balance_loss_mlp": 0.41101977, "epoch": 0.3377724334886517, "flos": 18150187167360.0, "grad_norm": 24.142870080227333, "language_loss": 0.84607404, "learning_rate": 3.0859388461506343e-06, "loss": 0.86736691, "num_input_tokens_seen": 120575395, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.43310547, "step": 5618, "time_per_iteration": 2.62235689163208 }, { "auxiliary_loss_clip": 0.01673312, "auxiliary_loss_mlp": 0.0050885, "balance_loss_clip": 1.35544503, "balance_loss_mlp": 0.46190548, "epoch": 0.3378325567413197, "flos": 25775997310080.0, "grad_norm": 24.18243213084034, "language_loss": 0.76862633, "learning_rate": 3.085611774155481e-06, "loss": 0.79044801, "num_input_tokens_seen": 120596075, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.46972656, "step": 5619, "time_per_iteration": 2.7528467178344727 }, { "auxiliary_loss_clip": 0.01691774, "auxiliary_loss_mlp": 0.00481935, "balance_loss_clip": 1.37267637, "balance_loss_mlp": 0.43623054, "epoch": 0.3378926799939877, "flos": 21317112558720.0, "grad_norm": 423.6003237403087, "language_loss": 0.76496571, "learning_rate": 3.085284660993821e-06, "loss": 0.78670275, "num_input_tokens_seen": 120614195, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.45727539, "step": 5620, "time_per_iteration": 2.697732448577881 }, { "auxiliary_loss_clip": 0.01656281, "auxiliary_loss_mlp": 0.00510883, "balance_loss_clip": 1.34566855, "balance_loss_mlp": 0.46520182, "epoch": 0.33795280324665566, "flos": 24900028335360.0, "grad_norm": 34.09951818416055, "language_loss": 0.75049305, "learning_rate": 3.084957506678058e-06, "loss": 0.77216464, "num_input_tokens_seen": 120634475, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.45703125, "step": 5621, "time_per_iteration": 2.7057900428771973 }, { "auxiliary_loss_clip": 0.01646339, "auxiliary_loss_mlp": 0.00493199, "balance_loss_clip": 1.34071016, "balance_loss_mlp": 0.45009333, "epoch": 0.33801292649932363, "flos": 24753943722240.0, "grad_norm": 5.487472678520415, "language_loss": 0.87219661, "learning_rate": 3.0846303112205975e-06, "loss": 0.893592, "num_input_tokens_seen": 120654980, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.43066406, "step": 5622, "time_per_iteration": 2.675394058227539 }, { "auxiliary_loss_clip": 0.01674453, "auxiliary_loss_mlp": 0.00491399, "balance_loss_clip": 1.36461687, "balance_loss_mlp": 0.44958037, "epoch": 0.3380730497519916, "flos": 26723967096960.0, "grad_norm": 9.238216684294697, "language_loss": 0.7780472, "learning_rate": 3.0843030746338464e-06, "loss": 0.79970574, "num_input_tokens_seen": 120676245, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.41796875, "step": 5623, "time_per_iteration": 2.6746809482574463 }, { "auxiliary_loss_clip": 0.02048074, "auxiliary_loss_mlp": 0.00264322, "balance_loss_clip": 1.84051514, "balance_loss_mlp": 0.24610667, "epoch": 0.33813317300465956, "flos": 70035756416640.0, "grad_norm": 0.7573395481446963, "language_loss": 0.54530442, "learning_rate": 3.083975796930215e-06, "loss": 0.5684284, "num_input_tokens_seen": 120741965, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.18261719, "step": 5624, "time_per_iteration": 3.243288040161133 }, { "auxiliary_loss_clip": 0.01677985, "auxiliary_loss_mlp": 0.00495723, "balance_loss_clip": 1.36337876, "balance_loss_mlp": 0.45094782, "epoch": 0.3381932962573275, "flos": 24097317148800.0, "grad_norm": 185.56917330617796, "language_loss": 0.79099125, "learning_rate": 3.083648478122111e-06, "loss": 0.81272829, "num_input_tokens_seen": 120760410, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.44775391, "step": 5625, "time_per_iteration": 2.6991119384765625 }, { "auxiliary_loss_clip": 0.01690921, "auxiliary_loss_mlp": 0.00487842, "balance_loss_clip": 1.37075865, "balance_loss_mlp": 0.44137439, "epoch": 0.3382534195099955, "flos": 19278248768640.0, "grad_norm": 6.169780062407367, "language_loss": 0.76683366, "learning_rate": 3.0833211182219497e-06, "loss": 0.78862131, "num_input_tokens_seen": 120777705, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.46508789, "step": 5626, "time_per_iteration": 2.63055157661438 }, { "auxiliary_loss_clip": 0.0168893, "auxiliary_loss_mlp": 0.00467423, "balance_loss_clip": 1.38139033, "balance_loss_mlp": 0.42453191, "epoch": 0.33831354276266346, "flos": 25226240676480.0, "grad_norm": 36.821120407016245, "language_loss": 0.84339124, "learning_rate": 3.0829937172421425e-06, "loss": 0.86495483, "num_input_tokens_seen": 120798660, "router_z_loss_clip": 3.07617188, "router_z_loss_mlp": 0.42871094, "step": 5627, "time_per_iteration": 2.68215274810791 }, { "auxiliary_loss_clip": 0.01670614, "auxiliary_loss_mlp": 0.00490446, "balance_loss_clip": 1.35621345, "balance_loss_mlp": 0.4462429, "epoch": 0.3383736660153314, "flos": 23112000195840.0, "grad_norm": 16.352295523515277, "language_loss": 0.86174428, "learning_rate": 3.0826662751951055e-06, "loss": 0.8833549, "num_input_tokens_seen": 120816705, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.44189453, "step": 5628, "time_per_iteration": 2.6627743244171143 }, { "auxiliary_loss_clip": 0.01690991, "auxiliary_loss_mlp": 0.0051162, "balance_loss_clip": 1.36639047, "balance_loss_mlp": 0.46338803, "epoch": 0.3384337892679994, "flos": 23477139901440.0, "grad_norm": 2.9168502097130826, "language_loss": 0.84676969, "learning_rate": 3.082338792093254e-06, "loss": 0.86879587, "num_input_tokens_seen": 120835375, "router_z_loss_clip": 3.24804688, "router_z_loss_mlp": 0.48193359, "step": 5629, "time_per_iteration": 2.7029573917388916 }, { "auxiliary_loss_clip": 0.01675221, "auxiliary_loss_mlp": 0.00544875, "balance_loss_clip": 1.35501289, "balance_loss_mlp": 0.49380583, "epoch": 0.33849391252066735, "flos": 19425805839360.0, "grad_norm": 6.2141078884506165, "language_loss": 0.90525162, "learning_rate": 3.0820112679490074e-06, "loss": 0.92745256, "num_input_tokens_seen": 120854260, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.51098633, "step": 5630, "time_per_iteration": 2.6588196754455566 }, { "auxiliary_loss_clip": 0.01677287, "auxiliary_loss_mlp": 0.00472389, "balance_loss_clip": 1.36633229, "balance_loss_mlp": 0.4299503, "epoch": 0.3385540357733353, "flos": 21064840364160.0, "grad_norm": 57.40580308839691, "language_loss": 0.78036541, "learning_rate": 3.0816837027747857e-06, "loss": 0.80186212, "num_input_tokens_seen": 120871590, "router_z_loss_clip": 3.11132812, "router_z_loss_mlp": 0.42431641, "step": 5631, "time_per_iteration": 4.1633875370025635 }, { "auxiliary_loss_clip": 0.01990598, "auxiliary_loss_mlp": 0.00153214, "balance_loss_clip": 1.79332829, "balance_loss_mlp": 0.14024439, "epoch": 0.3386141590260033, "flos": 69208013450880.0, "grad_norm": 0.9528878836206, "language_loss": 0.55906165, "learning_rate": 3.0813560965830084e-06, "loss": 0.58049977, "num_input_tokens_seen": 120925550, "router_z_loss_clip": 1.96875, "router_z_loss_mlp": 0.12988281, "step": 5632, "time_per_iteration": 3.205193042755127 }, { "auxiliary_loss_clip": 0.01670882, "auxiliary_loss_mlp": 0.00479629, "balance_loss_clip": 1.35524642, "balance_loss_mlp": 0.4360939, "epoch": 0.3386742822786713, "flos": 25519487310720.0, "grad_norm": 21.95724780594213, "language_loss": 0.84613764, "learning_rate": 3.0810284493861005e-06, "loss": 0.86764276, "num_input_tokens_seen": 120947620, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.43554688, "step": 5633, "time_per_iteration": 4.262273788452148 }, { "auxiliary_loss_clip": 0.0163763, "auxiliary_loss_mlp": 0.00441638, "balance_loss_clip": 1.32633138, "balance_loss_mlp": 0.40175086, "epoch": 0.33873440553133927, "flos": 23623116773760.0, "grad_norm": 4.003270799141251, "language_loss": 0.66261518, "learning_rate": 3.0807007611964855e-06, "loss": 0.6834079, "num_input_tokens_seen": 120965205, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.39892578, "step": 5634, "time_per_iteration": 2.723479986190796 }, { "auxiliary_loss_clip": 0.01648715, "auxiliary_loss_mlp": 0.00481733, "balance_loss_clip": 1.34060037, "balance_loss_mlp": 0.43881768, "epoch": 0.33879452878400723, "flos": 17088882992640.0, "grad_norm": 18.660766430528557, "language_loss": 0.9697445, "learning_rate": 3.080373032026589e-06, "loss": 0.99104899, "num_input_tokens_seen": 120983560, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.42895508, "step": 5635, "time_per_iteration": 2.683528184890747 }, { "auxiliary_loss_clip": 0.01650593, "auxiliary_loss_mlp": 0.00505514, "balance_loss_clip": 1.35081482, "balance_loss_mlp": 0.4600476, "epoch": 0.3388546520366752, "flos": 15742053607680.0, "grad_norm": 2.355476753450642, "language_loss": 0.80813694, "learning_rate": 3.0800452618888386e-06, "loss": 0.82969803, "num_input_tokens_seen": 121001400, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.4543457, "step": 5636, "time_per_iteration": 2.590348958969116 }, { "auxiliary_loss_clip": 0.0163409, "auxiliary_loss_mlp": 0.0050307, "balance_loss_clip": 1.33239055, "balance_loss_mlp": 0.45908174, "epoch": 0.33891477528934316, "flos": 22418744728320.0, "grad_norm": 17.96299111891339, "language_loss": 0.89739394, "learning_rate": 3.0797174507956637e-06, "loss": 0.91876554, "num_input_tokens_seen": 121021760, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.43969727, "step": 5637, "time_per_iteration": 2.652534008026123 }, { "auxiliary_loss_clip": 0.01643711, "auxiliary_loss_mlp": 0.00559935, "balance_loss_clip": 1.33469391, "balance_loss_mlp": 0.51239389, "epoch": 0.3389748985420111, "flos": 17274828723840.0, "grad_norm": 6.916622723291121, "language_loss": 0.76354319, "learning_rate": 3.079389598759495e-06, "loss": 0.78557962, "num_input_tokens_seen": 121041070, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.4753418, "step": 5638, "time_per_iteration": 4.059498310089111 }, { "auxiliary_loss_clip": 0.01611362, "auxiliary_loss_mlp": 0.00459081, "balance_loss_clip": 1.31303573, "balance_loss_mlp": 0.41635692, "epoch": 0.3390350217946791, "flos": 27744979190400.0, "grad_norm": 12.07334466484072, "language_loss": 0.86321962, "learning_rate": 3.079061705792765e-06, "loss": 0.88392407, "num_input_tokens_seen": 121060890, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.42700195, "step": 5639, "time_per_iteration": 2.7078371047973633 }, { "auxiliary_loss_clip": 0.016025, "auxiliary_loss_mlp": 0.00494724, "balance_loss_clip": 1.29636431, "balance_loss_mlp": 0.45064029, "epoch": 0.33909514504734706, "flos": 20339804338560.0, "grad_norm": 2.9447441467685485, "language_loss": 0.75569904, "learning_rate": 3.078733771907907e-06, "loss": 0.77667123, "num_input_tokens_seen": 121079135, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.44067383, "step": 5640, "time_per_iteration": 2.6862857341766357 }, { "auxiliary_loss_clip": 0.01605412, "auxiliary_loss_mlp": 0.00516295, "balance_loss_clip": 1.30424452, "balance_loss_mlp": 0.46985066, "epoch": 0.339155268300015, "flos": 14830030356480.0, "grad_norm": 30.987080591917014, "language_loss": 0.7501936, "learning_rate": 3.0784057971173554e-06, "loss": 0.77141064, "num_input_tokens_seen": 121097685, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.46484375, "step": 5641, "time_per_iteration": 2.608158588409424 }, { "auxiliary_loss_clip": 0.01606893, "auxiliary_loss_mlp": 0.00468371, "balance_loss_clip": 1.30313706, "balance_loss_mlp": 0.42710093, "epoch": 0.339215391552683, "flos": 26067951054720.0, "grad_norm": 263121.91431245895, "language_loss": 0.92944169, "learning_rate": 3.0780777814335483e-06, "loss": 0.95019436, "num_input_tokens_seen": 121115640, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.4128418, "step": 5642, "time_per_iteration": 2.6640493869781494 }, { "auxiliary_loss_clip": 0.01587595, "auxiliary_loss_mlp": 0.00454491, "balance_loss_clip": 1.30117512, "balance_loss_mlp": 0.41620094, "epoch": 0.33927551480535095, "flos": 14574705505920.0, "grad_norm": 20.852150725974038, "language_loss": 0.88211054, "learning_rate": 3.077749724868924e-06, "loss": 0.90253139, "num_input_tokens_seen": 121132485, "router_z_loss_clip": 2.86523438, "router_z_loss_mlp": 0.38305664, "step": 5643, "time_per_iteration": 4.0114524364471436 }, { "auxiliary_loss_clip": 0.01608968, "auxiliary_loss_mlp": 0.00445923, "balance_loss_clip": 1.31616831, "balance_loss_mlp": 0.40641725, "epoch": 0.3393356380580189, "flos": 23805578885760.0, "grad_norm": 15.191693125753208, "language_loss": 0.81951714, "learning_rate": 3.077421627435922e-06, "loss": 0.84006602, "num_input_tokens_seen": 121152935, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.39501953, "step": 5644, "time_per_iteration": 2.672614097595215 }, { "auxiliary_loss_clip": 0.01584404, "auxiliary_loss_mlp": 0.00472852, "balance_loss_clip": 1.2901907, "balance_loss_mlp": 0.43201119, "epoch": 0.3393957613106869, "flos": 17347871030400.0, "grad_norm": 4.357574613666112, "language_loss": 0.73070824, "learning_rate": 3.0770934891469832e-06, "loss": 0.75128084, "num_input_tokens_seen": 121169835, "router_z_loss_clip": 2.94335938, "router_z_loss_mlp": 0.40844727, "step": 5645, "time_per_iteration": 2.778305768966675 }, { "auxiliary_loss_clip": 0.01597758, "auxiliary_loss_mlp": 0.00434345, "balance_loss_clip": 1.3074007, "balance_loss_mlp": 0.39455256, "epoch": 0.3394558845633549, "flos": 28433960939520.0, "grad_norm": 79.87131542036131, "language_loss": 0.82893062, "learning_rate": 3.076765310014552e-06, "loss": 0.84925163, "num_input_tokens_seen": 121190290, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.39819336, "step": 5646, "time_per_iteration": 2.765425443649292 }, { "auxiliary_loss_clip": 0.01576168, "auxiliary_loss_mlp": 0.00466689, "balance_loss_clip": 1.27969992, "balance_loss_mlp": 0.42646772, "epoch": 0.33951600781602287, "flos": 22086929865600.0, "grad_norm": 7.433345002187825, "language_loss": 0.85192478, "learning_rate": 3.0764370900510727e-06, "loss": 0.87235332, "num_input_tokens_seen": 121209060, "router_z_loss_clip": 2.9609375, "router_z_loss_mlp": 0.40209961, "step": 5647, "time_per_iteration": 2.706400156021118 }, { "auxiliary_loss_clip": 0.0159833, "auxiliary_loss_mlp": 0.0043732, "balance_loss_clip": 1.30498028, "balance_loss_mlp": 0.3976706, "epoch": 0.33957613106869083, "flos": 23878262056320.0, "grad_norm": 94.7845844910692, "language_loss": 0.81488878, "learning_rate": 3.0761088292689904e-06, "loss": 0.83524525, "num_input_tokens_seen": 121227480, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.39624023, "step": 5648, "time_per_iteration": 2.689861536026001 }, { "auxiliary_loss_clip": 0.01839869, "auxiliary_loss_mlp": 0.00141911, "balance_loss_clip": 1.64635777, "balance_loss_mlp": 0.13075307, "epoch": 0.3396362543213588, "flos": 71242642414080.0, "grad_norm": 0.7996376082361883, "language_loss": 0.55675316, "learning_rate": 3.075780527680754e-06, "loss": 0.57657099, "num_input_tokens_seen": 121291305, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.11181641, "step": 5649, "time_per_iteration": 3.1723403930664062 }, { "auxiliary_loss_clip": 0.01581095, "auxiliary_loss_mlp": 0.00433917, "balance_loss_clip": 1.28648162, "balance_loss_mlp": 0.39743865, "epoch": 0.33969637757402676, "flos": 25921615046400.0, "grad_norm": 22.440707290410945, "language_loss": 0.89513218, "learning_rate": 3.0754521852988117e-06, "loss": 0.91528237, "num_input_tokens_seen": 121312740, "router_z_loss_clip": 2.9453125, "router_z_loss_mlp": 0.36499023, "step": 5650, "time_per_iteration": 2.725966453552246 }, { "auxiliary_loss_clip": 0.01584408, "auxiliary_loss_mlp": 0.00440437, "balance_loss_clip": 1.29048729, "balance_loss_mlp": 0.40257573, "epoch": 0.33975650082669473, "flos": 35261728663680.0, "grad_norm": 16.713692650544395, "language_loss": 0.75786531, "learning_rate": 3.0751238021356152e-06, "loss": 0.77811372, "num_input_tokens_seen": 121334220, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.37866211, "step": 5651, "time_per_iteration": 2.798872470855713 }, { "auxiliary_loss_clip": 0.0158869, "auxiliary_loss_mlp": 0.0044851, "balance_loss_clip": 1.29467571, "balance_loss_mlp": 0.4106968, "epoch": 0.3398166240793627, "flos": 16647001879680.0, "grad_norm": 5.373562700958699, "language_loss": 0.86477512, "learning_rate": 3.074795378203616e-06, "loss": 0.88514709, "num_input_tokens_seen": 121351870, "router_z_loss_clip": 2.93945312, "router_z_loss_mlp": 0.37792969, "step": 5652, "time_per_iteration": 2.6688833236694336 }, { "auxiliary_loss_clip": 0.01582948, "auxiliary_loss_mlp": 0.00450603, "balance_loss_clip": 1.28889322, "balance_loss_mlp": 0.41004759, "epoch": 0.33987674733203066, "flos": 24062196625920.0, "grad_norm": 3.193395553694335, "language_loss": 0.82840371, "learning_rate": 3.0744669135152685e-06, "loss": 0.84873915, "num_input_tokens_seen": 121373400, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.40576172, "step": 5653, "time_per_iteration": 2.725322961807251 }, { "auxiliary_loss_clip": 0.01582495, "auxiliary_loss_mlp": 0.00440518, "balance_loss_clip": 1.29144013, "balance_loss_mlp": 0.40258539, "epoch": 0.3399368705846986, "flos": 13250678279040.0, "grad_norm": 9.318497629239587, "language_loss": 0.94020462, "learning_rate": 3.0741384080830278e-06, "loss": 0.96043479, "num_input_tokens_seen": 121385225, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.37939453, "step": 5654, "time_per_iteration": 2.610999584197998 }, { "auxiliary_loss_clip": 0.01583397, "auxiliary_loss_mlp": 0.00443553, "balance_loss_clip": 1.29710507, "balance_loss_mlp": 0.40430921, "epoch": 0.3399969938373666, "flos": 27012832272000.0, "grad_norm": 5.722588244033212, "language_loss": 0.72302699, "learning_rate": 3.073809861919351e-06, "loss": 0.7432965, "num_input_tokens_seen": 121404735, "router_z_loss_clip": 2.86328125, "router_z_loss_mlp": 0.39233398, "step": 5655, "time_per_iteration": 2.7244269847869873 }, { "auxiliary_loss_clip": 0.01585103, "auxiliary_loss_mlp": 0.00428154, "balance_loss_clip": 1.29533494, "balance_loss_mlp": 0.39076969, "epoch": 0.34005711709003456, "flos": 28550096588160.0, "grad_norm": 24.31206323692729, "language_loss": 0.81125832, "learning_rate": 3.073481275036697e-06, "loss": 0.83139086, "num_input_tokens_seen": 121426780, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.37402344, "step": 5656, "time_per_iteration": 2.739196538925171 }, { "auxiliary_loss_clip": 0.01606974, "auxiliary_loss_mlp": 0.00418602, "balance_loss_clip": 1.30718315, "balance_loss_mlp": 0.3815276, "epoch": 0.3401172403427025, "flos": 21617003208960.0, "grad_norm": 6.4120001019126684, "language_loss": 0.89067924, "learning_rate": 3.073152647447525e-06, "loss": 0.91093498, "num_input_tokens_seen": 121447245, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.37060547, "step": 5657, "time_per_iteration": 2.6502573490142822 }, { "auxiliary_loss_clip": 0.01587622, "auxiliary_loss_mlp": 0.00445154, "balance_loss_clip": 1.30127001, "balance_loss_mlp": 0.40750772, "epoch": 0.3401773635953705, "flos": 25885776251520.0, "grad_norm": 2.7351105515731002, "language_loss": 0.90856087, "learning_rate": 3.0728239791642976e-06, "loss": 0.92888862, "num_input_tokens_seen": 121468165, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.37646484, "step": 5658, "time_per_iteration": 2.7455952167510986 }, { "auxiliary_loss_clip": 0.01817873, "auxiliary_loss_mlp": 0.00118, "balance_loss_clip": 1.61939216, "balance_loss_mlp": 0.1073669, "epoch": 0.3402374868480385, "flos": 65507995336320.0, "grad_norm": 0.8167290856380668, "language_loss": 0.5966351, "learning_rate": 3.072495270199477e-06, "loss": 0.61599386, "num_input_tokens_seen": 121523795, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.10644531, "step": 5659, "time_per_iteration": 3.109438896179199 }, { "auxiliary_loss_clip": 0.01591896, "auxiliary_loss_mlp": 0.00426343, "balance_loss_clip": 1.3081789, "balance_loss_mlp": 0.39012721, "epoch": 0.34029761010070647, "flos": 24060580513920.0, "grad_norm": 13.646799482092204, "language_loss": 0.74164832, "learning_rate": 3.0721665205655284e-06, "loss": 0.76183069, "num_input_tokens_seen": 121542950, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.36230469, "step": 5660, "time_per_iteration": 2.632992744445801 }, { "auxiliary_loss_clip": 0.01595012, "auxiliary_loss_mlp": 0.00423427, "balance_loss_clip": 1.30687273, "balance_loss_mlp": 0.38609052, "epoch": 0.34035773335337444, "flos": 27599720590080.0, "grad_norm": 2.376384653023225, "language_loss": 0.73044056, "learning_rate": 3.071837730274918e-06, "loss": 0.75062495, "num_input_tokens_seen": 121562765, "router_z_loss_clip": 2.87890625, "router_z_loss_mlp": 0.37304688, "step": 5661, "time_per_iteration": 2.748883008956909 }, { "auxiliary_loss_clip": 0.01589037, "auxiliary_loss_mlp": 0.00454272, "balance_loss_clip": 1.30540848, "balance_loss_mlp": 0.4161008, "epoch": 0.3404178566060424, "flos": 20812783651200.0, "grad_norm": 4.390980860594311, "language_loss": 0.85364997, "learning_rate": 3.071508899340113e-06, "loss": 0.87408304, "num_input_tokens_seen": 121581610, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.38183594, "step": 5662, "time_per_iteration": 2.85486102104187 }, { "auxiliary_loss_clip": 0.01592386, "auxiliary_loss_mlp": 0.00396281, "balance_loss_clip": 1.30384839, "balance_loss_mlp": 0.35968333, "epoch": 0.34047797985871037, "flos": 26833566470400.0, "grad_norm": 5.305578717321817, "language_loss": 0.79917634, "learning_rate": 3.0711800277735833e-06, "loss": 0.81906295, "num_input_tokens_seen": 121601885, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.36572266, "step": 5663, "time_per_iteration": 2.6761600971221924 }, { "auxiliary_loss_clip": 0.01615654, "auxiliary_loss_mlp": 0.00430076, "balance_loss_clip": 1.32961893, "balance_loss_mlp": 0.39555264, "epoch": 0.34053810311137833, "flos": 19682639061120.0, "grad_norm": 7.814468648217868, "language_loss": 0.91602862, "learning_rate": 3.0708511155877997e-06, "loss": 0.93648589, "num_input_tokens_seen": 121621335, "router_z_loss_clip": 2.86328125, "router_z_loss_mlp": 0.34521484, "step": 5664, "time_per_iteration": 2.6260929107666016 }, { "auxiliary_loss_clip": 0.01584264, "auxiliary_loss_mlp": 0.00426024, "balance_loss_clip": 1.29689229, "balance_loss_mlp": 0.39169195, "epoch": 0.3405982263640463, "flos": 21725740656000.0, "grad_norm": 15.068385101642036, "language_loss": 0.74817109, "learning_rate": 3.070522162795235e-06, "loss": 0.76827395, "num_input_tokens_seen": 121641310, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.34326172, "step": 5665, "time_per_iteration": 2.614964723587036 }, { "auxiliary_loss_clip": 0.01572904, "auxiliary_loss_mlp": 0.00406549, "balance_loss_clip": 1.28473353, "balance_loss_mlp": 0.36782986, "epoch": 0.34065834961671426, "flos": 18041629288320.0, "grad_norm": 6.576760242988177, "language_loss": 0.80109334, "learning_rate": 3.0701931694083626e-06, "loss": 0.82088792, "num_input_tokens_seen": 121659625, "router_z_loss_clip": 2.88085938, "router_z_loss_mlp": 0.38720703, "step": 5666, "time_per_iteration": 2.6227569580078125 }, { "auxiliary_loss_clip": 0.01577809, "auxiliary_loss_mlp": 0.00427108, "balance_loss_clip": 1.28695941, "balance_loss_mlp": 0.38900888, "epoch": 0.3407184728693822, "flos": 21397337585280.0, "grad_norm": 5.769210336590465, "language_loss": 0.78098953, "learning_rate": 3.0698641354396576e-06, "loss": 0.80103874, "num_input_tokens_seen": 121679205, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.38085938, "step": 5667, "time_per_iteration": 2.6269400119781494 }, { "auxiliary_loss_clip": 0.01705422, "auxiliary_loss_mlp": 0.00099449, "balance_loss_clip": 1.5060159, "balance_loss_mlp": 0.08900621, "epoch": 0.3407785961220502, "flos": 68688101018880.0, "grad_norm": 0.838233226978156, "language_loss": 0.63385999, "learning_rate": 3.069535060901597e-06, "loss": 0.65190876, "num_input_tokens_seen": 121751085, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.10449219, "step": 5668, "time_per_iteration": 3.3097074031829834 }, { "auxiliary_loss_clip": 0.01588457, "auxiliary_loss_mlp": 0.00428777, "balance_loss_clip": 1.29510403, "balance_loss_mlp": 0.39187014, "epoch": 0.34083871937471816, "flos": 14064379027200.0, "grad_norm": 12.122833531598696, "language_loss": 0.80025601, "learning_rate": 3.0692059458066596e-06, "loss": 0.82042837, "num_input_tokens_seen": 121768565, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.36889648, "step": 5669, "time_per_iteration": 2.612865686416626 }, { "auxiliary_loss_clip": 0.01564618, "auxiliary_loss_mlp": 0.00408196, "balance_loss_clip": 1.27129197, "balance_loss_mlp": 0.3715511, "epoch": 0.3408988426273861, "flos": 17085435287040.0, "grad_norm": 37.47167530738846, "language_loss": 0.85501486, "learning_rate": 3.0688767901673265e-06, "loss": 0.87474298, "num_input_tokens_seen": 121784925, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.36645508, "step": 5670, "time_per_iteration": 2.6444623470306396 }, { "auxiliary_loss_clip": 0.01577781, "auxiliary_loss_mlp": 0.00462251, "balance_loss_clip": 1.28130841, "balance_loss_mlp": 0.42295954, "epoch": 0.3409589658800541, "flos": 24024562151040.0, "grad_norm": 37.351444174063765, "language_loss": 0.81965142, "learning_rate": 3.068547593996078e-06, "loss": 0.84005171, "num_input_tokens_seen": 121804425, "router_z_loss_clip": 2.96679688, "router_z_loss_mlp": 0.39306641, "step": 5671, "time_per_iteration": 2.6785223484039307 }, { "auxiliary_loss_clip": 0.01566692, "auxiliary_loss_mlp": 0.00412247, "balance_loss_clip": 1.27676153, "balance_loss_mlp": 0.37231144, "epoch": 0.34101908913272205, "flos": 21142012734720.0, "grad_norm": 50.70823660410165, "language_loss": 0.79444927, "learning_rate": 3.0682183573053974e-06, "loss": 0.81423861, "num_input_tokens_seen": 121825145, "router_z_loss_clip": 2.90234375, "router_z_loss_mlp": 0.3996582, "step": 5672, "time_per_iteration": 2.7613494396209717 }, { "auxiliary_loss_clip": 0.01563639, "auxiliary_loss_mlp": 0.00382596, "balance_loss_clip": 1.27102828, "balance_loss_mlp": 0.34649983, "epoch": 0.3410792123853901, "flos": 15702012921600.0, "grad_norm": 6.003312795106123, "language_loss": 0.79127163, "learning_rate": 3.06788908010777e-06, "loss": 0.81073397, "num_input_tokens_seen": 121842185, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.36083984, "step": 5673, "time_per_iteration": 4.149132251739502 }, { "auxiliary_loss_clip": 0.01565389, "auxiliary_loss_mlp": 0.00399115, "balance_loss_clip": 1.27599072, "balance_loss_mlp": 0.36437771, "epoch": 0.34113933563805804, "flos": 23036012974080.0, "grad_norm": 295.4353289904276, "language_loss": 0.85384077, "learning_rate": 3.067559762415682e-06, "loss": 0.8734858, "num_input_tokens_seen": 121862260, "router_z_loss_clip": 2.89453125, "router_z_loss_mlp": 0.34741211, "step": 5674, "time_per_iteration": 2.688446044921875 }, { "auxiliary_loss_clip": 0.01628128, "auxiliary_loss_mlp": 0.00167691, "balance_loss_clip": 1.42623901, "balance_loss_mlp": 0.15805879, "epoch": 0.341199458890726, "flos": 69614235336960.0, "grad_norm": 0.8012025881704852, "language_loss": 0.56134033, "learning_rate": 3.0672304042416198e-06, "loss": 0.57929862, "num_input_tokens_seen": 121923560, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.09619141, "step": 5675, "time_per_iteration": 4.696690797805786 }, { "auxiliary_loss_clip": 0.01574558, "auxiliary_loss_mlp": 0.004101, "balance_loss_clip": 1.28344929, "balance_loss_mlp": 0.37545824, "epoch": 0.34125958214339397, "flos": 22346348866560.0, "grad_norm": 215.67085276824042, "language_loss": 0.84685135, "learning_rate": 3.0669010055980734e-06, "loss": 0.86669797, "num_input_tokens_seen": 121943515, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.34643555, "step": 5676, "time_per_iteration": 2.7509567737579346 }, { "auxiliary_loss_clip": 0.01568796, "auxiliary_loss_mlp": 0.00405564, "balance_loss_clip": 1.27323258, "balance_loss_mlp": 0.36844194, "epoch": 0.34131970539606193, "flos": 21871933009920.0, "grad_norm": 145.46440675917248, "language_loss": 0.90497494, "learning_rate": 3.0665715664975357e-06, "loss": 0.9247185, "num_input_tokens_seen": 121962540, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.37133789, "step": 5677, "time_per_iteration": 2.6524882316589355 }, { "auxiliary_loss_clip": 0.01563023, "auxiliary_loss_mlp": 0.00425114, "balance_loss_clip": 1.26987815, "balance_loss_mlp": 0.38582283, "epoch": 0.3413798286487299, "flos": 24935723475840.0, "grad_norm": 93.48290785521299, "language_loss": 0.85356069, "learning_rate": 3.0662420869524966e-06, "loss": 0.87344205, "num_input_tokens_seen": 121979830, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.39331055, "step": 5678, "time_per_iteration": 2.686438798904419 }, { "auxiliary_loss_clip": 0.01557729, "auxiliary_loss_mlp": 0.00428829, "balance_loss_clip": 1.26340222, "balance_loss_mlp": 0.39211208, "epoch": 0.34143995190139786, "flos": 25374372364800.0, "grad_norm": 20.115937004868083, "language_loss": 0.80775338, "learning_rate": 3.0659125669754506e-06, "loss": 0.82761896, "num_input_tokens_seen": 121999055, "router_z_loss_clip": 2.94335938, "router_z_loss_mlp": 0.3671875, "step": 5679, "time_per_iteration": 2.737489700317383 }, { "auxiliary_loss_clip": 0.01583695, "auxiliary_loss_mlp": 0.00193207, "balance_loss_clip": 1.39548206, "balance_loss_mlp": 0.18271689, "epoch": 0.34150007515406583, "flos": 67782578129280.0, "grad_norm": 1.4644571484273643, "language_loss": 0.59035873, "learning_rate": 3.0655830065788923e-06, "loss": 0.60812771, "num_input_tokens_seen": 122067015, "router_z_loss_clip": 1.8828125, "router_z_loss_mlp": 0.10498047, "step": 5680, "time_per_iteration": 4.592510938644409 }, { "auxiliary_loss_clip": 0.01551543, "auxiliary_loss_mlp": 0.00406052, "balance_loss_clip": 1.26681256, "balance_loss_mlp": 0.36964551, "epoch": 0.3415601984067338, "flos": 20302421258880.0, "grad_norm": 20.676773352818547, "language_loss": 0.78444523, "learning_rate": 3.0652534057753206e-06, "loss": 0.80402118, "num_input_tokens_seen": 122085295, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.36401367, "step": 5681, "time_per_iteration": 2.6316099166870117 }, { "auxiliary_loss_clip": 0.01547595, "auxiliary_loss_mlp": 0.0043015, "balance_loss_clip": 1.25914609, "balance_loss_mlp": 0.39164537, "epoch": 0.34162032165940176, "flos": 26031178506240.0, "grad_norm": 264.19066824515966, "language_loss": 0.77694106, "learning_rate": 3.064923764577233e-06, "loss": 0.79671848, "num_input_tokens_seen": 122104020, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.38476562, "step": 5682, "time_per_iteration": 2.704002618789673 }, { "auxiliary_loss_clip": 0.01554642, "auxiliary_loss_mlp": 0.00454688, "balance_loss_clip": 1.26387334, "balance_loss_mlp": 0.41670835, "epoch": 0.3416804449120697, "flos": 28803338449920.0, "grad_norm": 52.999884836344485, "language_loss": 0.88795102, "learning_rate": 3.0645940829971295e-06, "loss": 0.9080444, "num_input_tokens_seen": 122125080, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.37963867, "step": 5683, "time_per_iteration": 2.7831549644470215 }, { "auxiliary_loss_clip": 0.01574861, "auxiliary_loss_mlp": 0.00435873, "balance_loss_clip": 1.28095341, "balance_loss_mlp": 0.39970517, "epoch": 0.3417405681647377, "flos": 22601601889920.0, "grad_norm": 16.609237824179207, "language_loss": 0.76859319, "learning_rate": 3.0642643610475116e-06, "loss": 0.78870058, "num_input_tokens_seen": 122146350, "router_z_loss_clip": 2.93945312, "router_z_loss_mlp": 0.36181641, "step": 5684, "time_per_iteration": 2.6840338706970215 }, { "auxiliary_loss_clip": 0.01551699, "auxiliary_loss_mlp": 0.00439061, "balance_loss_clip": 1.26758051, "balance_loss_mlp": 0.40458584, "epoch": 0.34180069141740566, "flos": 24716237420160.0, "grad_norm": 7.533671066186242, "language_loss": 0.80096793, "learning_rate": 3.0639345987408823e-06, "loss": 0.82087553, "num_input_tokens_seen": 122168085, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.3449707, "step": 5685, "time_per_iteration": 2.6756787300109863 }, { "auxiliary_loss_clip": 0.01572446, "auxiliary_loss_mlp": 0.00404363, "balance_loss_clip": 1.28472972, "balance_loss_mlp": 0.36964914, "epoch": 0.3418608146700737, "flos": 30518755246080.0, "grad_norm": 5.544803339208702, "language_loss": 0.76664484, "learning_rate": 3.0636047960897468e-06, "loss": 0.78641289, "num_input_tokens_seen": 122191040, "router_z_loss_clip": 2.87695312, "router_z_loss_mlp": 0.34692383, "step": 5686, "time_per_iteration": 4.115495920181274 }, { "auxiliary_loss_clip": 0.0157048, "auxiliary_loss_mlp": 0.00431996, "balance_loss_clip": 1.27871513, "balance_loss_mlp": 0.39444527, "epoch": 0.34192093792274164, "flos": 15122343237120.0, "grad_norm": 2.200688185207488, "language_loss": 0.84181452, "learning_rate": 3.06327495310661e-06, "loss": 0.86183929, "num_input_tokens_seen": 122209225, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.37548828, "step": 5687, "time_per_iteration": 2.632516860961914 }, { "auxiliary_loss_clip": 0.01566522, "auxiliary_loss_mlp": 0.00444264, "balance_loss_clip": 1.27815259, "balance_loss_mlp": 0.40704656, "epoch": 0.3419810611754096, "flos": 13187799521280.0, "grad_norm": 300.5222692491193, "language_loss": 0.93339968, "learning_rate": 3.062945069803981e-06, "loss": 0.95350748, "num_input_tokens_seen": 122226160, "router_z_loss_clip": 2.87890625, "router_z_loss_mlp": 0.37207031, "step": 5688, "time_per_iteration": 2.6575570106506348 }, { "auxiliary_loss_clip": 0.01587754, "auxiliary_loss_mlp": 0.00424028, "balance_loss_clip": 1.28607833, "balance_loss_mlp": 0.38528526, "epoch": 0.34204118442807757, "flos": 19536267139200.0, "grad_norm": 10.832553455046009, "language_loss": 0.85442686, "learning_rate": 3.0626151461943684e-06, "loss": 0.87454474, "num_input_tokens_seen": 122243115, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.38745117, "step": 5689, "time_per_iteration": 2.6565048694610596 }, { "auxiliary_loss_clip": 0.01578265, "auxiliary_loss_mlp": 0.00429211, "balance_loss_clip": 1.28456426, "balance_loss_mlp": 0.38877556, "epoch": 0.34210130768074554, "flos": 15194846839680.0, "grad_norm": 3.9989992956146456, "language_loss": 0.79860556, "learning_rate": 3.0622851822902834e-06, "loss": 0.81868041, "num_input_tokens_seen": 122261105, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.40429688, "step": 5690, "time_per_iteration": 2.6812186241149902 }, { "auxiliary_loss_clip": 0.01556899, "auxiliary_loss_mlp": 0.00422181, "balance_loss_clip": 1.26918542, "balance_loss_mlp": 0.38722903, "epoch": 0.3421614309334135, "flos": 24936226266240.0, "grad_norm": 6.699531921127357, "language_loss": 0.81605667, "learning_rate": 3.061955178104237e-06, "loss": 0.8358475, "num_input_tokens_seen": 122279995, "router_z_loss_clip": 2.87695312, "router_z_loss_mlp": 0.34960938, "step": 5691, "time_per_iteration": 2.7449257373809814 }, { "auxiliary_loss_clip": 0.01567016, "auxiliary_loss_mlp": 0.00416806, "balance_loss_clip": 1.28132772, "balance_loss_mlp": 0.3800897, "epoch": 0.34222155418608147, "flos": 21908633731200.0, "grad_norm": 2.6334149039010524, "language_loss": 0.76185554, "learning_rate": 3.0616251336487447e-06, "loss": 0.78169382, "num_input_tokens_seen": 122299070, "router_z_loss_clip": 2.85546875, "router_z_loss_mlp": 0.3671875, "step": 5692, "time_per_iteration": 2.6840031147003174 }, { "auxiliary_loss_clip": 0.01580508, "auxiliary_loss_mlp": 0.00389414, "balance_loss_clip": 1.28679729, "balance_loss_mlp": 0.35212508, "epoch": 0.34228167743874943, "flos": 18114061063680.0, "grad_norm": 16.69465923614269, "language_loss": 0.805215, "learning_rate": 3.06129504893632e-06, "loss": 0.82491416, "num_input_tokens_seen": 122316800, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.37304688, "step": 5693, "time_per_iteration": 2.696392297744751 }, { "auxiliary_loss_clip": 0.01570045, "auxiliary_loss_mlp": 0.00401975, "balance_loss_clip": 1.28324866, "balance_loss_mlp": 0.36647415, "epoch": 0.3423418006914174, "flos": 21288600138240.0, "grad_norm": 4.102767930373659, "language_loss": 0.81528914, "learning_rate": 3.0609649239794813e-06, "loss": 0.83500934, "num_input_tokens_seen": 122335275, "router_z_loss_clip": 2.86132812, "router_z_loss_mlp": 0.35498047, "step": 5694, "time_per_iteration": 2.80635142326355 }, { "auxiliary_loss_clip": 0.01561946, "auxiliary_loss_mlp": 0.00379832, "balance_loss_clip": 1.2713114, "balance_loss_mlp": 0.34533259, "epoch": 0.34240192394408536, "flos": 19823480288640.0, "grad_norm": 359.6438447262561, "language_loss": 0.87385631, "learning_rate": 3.060634758790747e-06, "loss": 0.89327407, "num_input_tokens_seen": 122353215, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.3449707, "step": 5695, "time_per_iteration": 2.703368663787842 }, { "auxiliary_loss_clip": 0.01591864, "auxiliary_loss_mlp": 0.00375214, "balance_loss_clip": 1.29602659, "balance_loss_mlp": 0.3393802, "epoch": 0.3424620471967533, "flos": 24535535074560.0, "grad_norm": 7.480668955777455, "language_loss": 0.7887795, "learning_rate": 3.060304553382635e-06, "loss": 0.80845028, "num_input_tokens_seen": 122372495, "router_z_loss_clip": 2.95703125, "router_z_loss_mlp": 0.35839844, "step": 5696, "time_per_iteration": 2.783504009246826 }, { "auxiliary_loss_clip": 0.01581576, "auxiliary_loss_mlp": 0.00383439, "balance_loss_clip": 1.29212046, "balance_loss_mlp": 0.34698457, "epoch": 0.3425221704494213, "flos": 25848895962240.0, "grad_norm": 57.70019261654548, "language_loss": 0.76703048, "learning_rate": 3.0599743077676685e-06, "loss": 0.78668058, "num_input_tokens_seen": 122394600, "router_z_loss_clip": 2.89453125, "router_z_loss_mlp": 0.36450195, "step": 5697, "time_per_iteration": 2.7351551055908203 }, { "auxiliary_loss_clip": 0.01587068, "auxiliary_loss_mlp": 0.00375952, "balance_loss_clip": 1.29546118, "balance_loss_mlp": 0.34059423, "epoch": 0.34258229370208926, "flos": 21540513196800.0, "grad_norm": 36.06780918230688, "language_loss": 0.87722385, "learning_rate": 3.05964402195837e-06, "loss": 0.89685398, "num_input_tokens_seen": 122414700, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.35375977, "step": 5698, "time_per_iteration": 2.7074692249298096 }, { "auxiliary_loss_clip": 0.01569598, "auxiliary_loss_mlp": 0.0039138, "balance_loss_clip": 1.27978015, "balance_loss_mlp": 0.35447246, "epoch": 0.3426424169547573, "flos": 23652778429440.0, "grad_norm": 40.51167638912526, "language_loss": 0.7616595, "learning_rate": 3.0593136959672645e-06, "loss": 0.78126931, "num_input_tokens_seen": 122432760, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.36889648, "step": 5699, "time_per_iteration": 2.6597366333007812 }, { "auxiliary_loss_clip": 0.0156737, "auxiliary_loss_mlp": 0.00366281, "balance_loss_clip": 1.2772671, "balance_loss_mlp": 0.33237818, "epoch": 0.34270254020742524, "flos": 24644883052800.0, "grad_norm": 5.8526132044947135, "language_loss": 0.7803362, "learning_rate": 3.058983329806877e-06, "loss": 0.79967272, "num_input_tokens_seen": 122449105, "router_z_loss_clip": 2.90039062, "router_z_loss_mlp": 0.33886719, "step": 5700, "time_per_iteration": 2.673095226287842 }, { "auxiliary_loss_clip": 0.01574258, "auxiliary_loss_mlp": 0.00362797, "balance_loss_clip": 1.28505766, "balance_loss_mlp": 0.32886988, "epoch": 0.3427626634600932, "flos": 20996754134400.0, "grad_norm": 6.094389706459197, "language_loss": 0.89286065, "learning_rate": 3.0586529234897354e-06, "loss": 0.91223121, "num_input_tokens_seen": 122468700, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.33911133, "step": 5701, "time_per_iteration": 2.636474370956421 }, { "auxiliary_loss_clip": 0.01561999, "auxiliary_loss_mlp": 0.00322589, "balance_loss_clip": 1.27096295, "balance_loss_mlp": 0.28994972, "epoch": 0.3428227867127612, "flos": 21433786911360.0, "grad_norm": 35.8365731115393, "language_loss": 0.78259301, "learning_rate": 3.0583224770283694e-06, "loss": 0.80143887, "num_input_tokens_seen": 122488160, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.32641602, "step": 5702, "time_per_iteration": 2.6632611751556396 }, { "auxiliary_loss_clip": 0.01570299, "auxiliary_loss_mlp": 0.00174527, "balance_loss_clip": 1.34997857, "balance_loss_mlp": 0.16146214, "epoch": 0.34288290996542914, "flos": 55731782695680.0, "grad_norm": 0.861277875320857, "language_loss": 0.56101012, "learning_rate": 3.057991990435309e-06, "loss": 0.57845831, "num_input_tokens_seen": 122542890, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.13085938, "step": 5703, "time_per_iteration": 3.042109489440918 }, { "auxiliary_loss_clip": 0.01574025, "auxiliary_loss_mlp": 0.00409733, "balance_loss_clip": 1.28392315, "balance_loss_mlp": 0.36920193, "epoch": 0.3429430332180971, "flos": 20156803522560.0, "grad_norm": 73.1080590958927, "language_loss": 0.81009531, "learning_rate": 3.057661463723086e-06, "loss": 0.82993281, "num_input_tokens_seen": 122561770, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.40527344, "step": 5704, "time_per_iteration": 2.6462290287017822 }, { "auxiliary_loss_clip": 0.01582379, "auxiliary_loss_mlp": 0.00368482, "balance_loss_clip": 1.29340792, "balance_loss_mlp": 0.33472165, "epoch": 0.34300315647076507, "flos": 17965857548160.0, "grad_norm": 34.712633822719454, "language_loss": 0.76986551, "learning_rate": 3.0573308969042346e-06, "loss": 0.78937411, "num_input_tokens_seen": 122580580, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.33764648, "step": 5705, "time_per_iteration": 2.787940502166748 }, { "auxiliary_loss_clip": 0.0153817, "auxiliary_loss_mlp": 0.00336864, "balance_loss_clip": 1.25333333, "balance_loss_mlp": 0.30435535, "epoch": 0.34306327972343303, "flos": 22086822124800.0, "grad_norm": 9.846079843187336, "language_loss": 0.84611058, "learning_rate": 3.057000289991289e-06, "loss": 0.86486089, "num_input_tokens_seen": 122599810, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.32507324, "step": 5706, "time_per_iteration": 2.664628028869629 }, { "auxiliary_loss_clip": 0.0159436, "auxiliary_loss_mlp": 0.00343067, "balance_loss_clip": 1.29539454, "balance_loss_mlp": 0.30663651, "epoch": 0.343123402976101, "flos": 18442679616000.0, "grad_norm": 3.75549088660496, "language_loss": 0.90218323, "learning_rate": 3.056669642996787e-06, "loss": 0.92155755, "num_input_tokens_seen": 122616035, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.36425781, "step": 5707, "time_per_iteration": 2.7008376121520996 }, { "auxiliary_loss_clip": 0.01554917, "auxiliary_loss_mlp": 0.00359986, "balance_loss_clip": 1.2714889, "balance_loss_mlp": 0.32784748, "epoch": 0.34318352622876896, "flos": 17163685065600.0, "grad_norm": 2.3075675913491445, "language_loss": 0.81046379, "learning_rate": 3.056338955933266e-06, "loss": 0.82961285, "num_input_tokens_seen": 122633785, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.32128906, "step": 5708, "time_per_iteration": 2.834834575653076 }, { "auxiliary_loss_clip": 0.01531711, "auxiliary_loss_mlp": 0.00300609, "balance_loss_clip": 1.25019491, "balance_loss_mlp": 0.26846987, "epoch": 0.34324364948143693, "flos": 26688164215680.0, "grad_norm": 53.288824445915346, "language_loss": 0.86918616, "learning_rate": 3.0560082288132662e-06, "loss": 0.88750935, "num_input_tokens_seen": 122652100, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.32128906, "step": 5709, "time_per_iteration": 2.801910638809204 }, { "auxiliary_loss_clip": 0.01543266, "auxiliary_loss_mlp": 0.00348725, "balance_loss_clip": 1.25203133, "balance_loss_mlp": 0.31186581, "epoch": 0.3433037727341049, "flos": 21251576194560.0, "grad_norm": 13.567898002049748, "language_loss": 0.85988516, "learning_rate": 3.055677461649329e-06, "loss": 0.8788051, "num_input_tokens_seen": 122669720, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.36865234, "step": 5710, "time_per_iteration": 2.682270050048828 }, { "auxiliary_loss_clip": 0.01534662, "auxiliary_loss_mlp": 0.00321637, "balance_loss_clip": 1.24711812, "balance_loss_mlp": 0.28747198, "epoch": 0.34336389598677286, "flos": 20629423699200.0, "grad_norm": 5.022444693645233, "language_loss": 0.77214837, "learning_rate": 3.055346654453996e-06, "loss": 0.79071134, "num_input_tokens_seen": 122688715, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.34179688, "step": 5711, "time_per_iteration": 2.728151321411133 }, { "auxiliary_loss_clip": 0.01517625, "auxiliary_loss_mlp": 0.00358195, "balance_loss_clip": 1.23619246, "balance_loss_mlp": 0.32488748, "epoch": 0.3434240192394409, "flos": 14538579402240.0, "grad_norm": 4.219602789569208, "language_loss": 0.74137044, "learning_rate": 3.055015807239812e-06, "loss": 0.76012868, "num_input_tokens_seen": 122706970, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.33325195, "step": 5712, "time_per_iteration": 2.658104419708252 }, { "auxiliary_loss_clip": 0.01564025, "auxiliary_loss_mlp": 0.00162828, "balance_loss_clip": 1.3556931, "balance_loss_mlp": 0.15114501, "epoch": 0.34348414249210885, "flos": 58051538841600.0, "grad_norm": 0.8518648594124081, "language_loss": 0.57910168, "learning_rate": 3.0546849200193226e-06, "loss": 0.59637022, "num_input_tokens_seen": 122758095, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.11669922, "step": 5713, "time_per_iteration": 3.12113356590271 }, { "auxiliary_loss_clip": 0.01502989, "auxiliary_loss_mlp": 0.00320144, "balance_loss_clip": 1.21905029, "balance_loss_mlp": 0.28674155, "epoch": 0.3435442657447768, "flos": 20704441253760.0, "grad_norm": 20.23782584353987, "language_loss": 0.87569547, "learning_rate": 3.054353992805076e-06, "loss": 0.8939268, "num_input_tokens_seen": 122777815, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.33398438, "step": 5714, "time_per_iteration": 2.692373037338257 }, { "auxiliary_loss_clip": 0.0150676, "auxiliary_loss_mlp": 0.00359094, "balance_loss_clip": 1.22317731, "balance_loss_mlp": 0.32361761, "epoch": 0.3436043889974448, "flos": 22930256355840.0, "grad_norm": 66.72656747330252, "language_loss": 0.77312183, "learning_rate": 3.05402302560962e-06, "loss": 0.79178035, "num_input_tokens_seen": 122797555, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.35498047, "step": 5715, "time_per_iteration": 2.669142723083496 }, { "auxiliary_loss_clip": 0.01510789, "auxiliary_loss_mlp": 0.00138312, "balance_loss_clip": 1.30575252, "balance_loss_mlp": 0.12629533, "epoch": 0.34366451225011274, "flos": 58403285752320.0, "grad_norm": 0.9168685104943242, "language_loss": 0.65305114, "learning_rate": 3.053692018445505e-06, "loss": 0.66954213, "num_input_tokens_seen": 122863955, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.12011719, "step": 5716, "time_per_iteration": 4.6164093017578125 }, { "auxiliary_loss_clip": 0.01497051, "auxiliary_loss_mlp": 0.00336527, "balance_loss_clip": 1.21951306, "balance_loss_mlp": 0.30391109, "epoch": 0.3437246355027807, "flos": 15596292216960.0, "grad_norm": 54.48080523912891, "language_loss": 0.81735402, "learning_rate": 3.0533609713252838e-06, "loss": 0.83568978, "num_input_tokens_seen": 122883000, "router_z_loss_clip": 2.77539062, "router_z_loss_mlp": 0.32592773, "step": 5717, "time_per_iteration": 4.09070086479187 }, { "auxiliary_loss_clip": 0.0148687, "auxiliary_loss_mlp": 0.00357891, "balance_loss_clip": 1.20976281, "balance_loss_mlp": 0.32498938, "epoch": 0.34378475875544867, "flos": 27672260106240.0, "grad_norm": 2.784537190086487, "language_loss": 0.82768649, "learning_rate": 3.0530298842615077e-06, "loss": 0.84613413, "num_input_tokens_seen": 122903265, "router_z_loss_clip": 2.77148438, "router_z_loss_mlp": 0.32885742, "step": 5718, "time_per_iteration": 2.783261299133301 }, { "auxiliary_loss_clip": 0.01502528, "auxiliary_loss_mlp": 0.00347925, "balance_loss_clip": 1.21751547, "balance_loss_mlp": 0.31046954, "epoch": 0.34384488200811664, "flos": 31431496769280.0, "grad_norm": 6.334651021516015, "language_loss": 0.72502893, "learning_rate": 3.052698757266734e-06, "loss": 0.74353349, "num_input_tokens_seen": 122923860, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.37426758, "step": 5719, "time_per_iteration": 2.8150084018707275 }, { "auxiliary_loss_clip": 0.0150933, "auxiliary_loss_mlp": 0.00321305, "balance_loss_clip": 1.2222389, "balance_loss_mlp": 0.28809369, "epoch": 0.3439050052607846, "flos": 24899920594560.0, "grad_norm": 2.5367988712795824, "language_loss": 0.81472707, "learning_rate": 3.0523675903535183e-06, "loss": 0.83303344, "num_input_tokens_seen": 122945305, "router_z_loss_clip": 2.87109375, "router_z_loss_mlp": 0.33203125, "step": 5720, "time_per_iteration": 2.760312080383301 }, { "auxiliary_loss_clip": 0.01510465, "auxiliary_loss_mlp": 0.00323401, "balance_loss_clip": 1.22564292, "balance_loss_mlp": 0.29126251, "epoch": 0.34396512851345257, "flos": 18150079426560.0, "grad_norm": 7.483712972715063, "language_loss": 0.80822957, "learning_rate": 3.0520363835344173e-06, "loss": 0.82656825, "num_input_tokens_seen": 122962535, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.32141113, "step": 5721, "time_per_iteration": 2.6796512603759766 }, { "auxiliary_loss_clip": 0.01495017, "auxiliary_loss_mlp": 0.00346112, "balance_loss_clip": 1.20986307, "balance_loss_mlp": 0.31220847, "epoch": 0.34402525176612053, "flos": 16034438315520.0, "grad_norm": 85.07560403086474, "language_loss": 0.86785245, "learning_rate": 3.051705136821992e-06, "loss": 0.88626379, "num_input_tokens_seen": 122979750, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.33898926, "step": 5722, "time_per_iteration": 4.0104169845581055 }, { "auxiliary_loss_clip": 0.01495529, "auxiliary_loss_mlp": 0.00335356, "balance_loss_clip": 1.21648848, "balance_loss_mlp": 0.30340827, "epoch": 0.3440853750187885, "flos": 21178641628800.0, "grad_norm": 34.23610427145381, "language_loss": 0.86571622, "learning_rate": 3.051373850228801e-06, "loss": 0.88402498, "num_input_tokens_seen": 122998955, "router_z_loss_clip": 2.79296875, "router_z_loss_mlp": 0.3190918, "step": 5723, "time_per_iteration": 2.6350176334381104 }, { "auxiliary_loss_clip": 0.01492861, "auxiliary_loss_mlp": 0.00329978, "balance_loss_clip": 1.20958507, "balance_loss_mlp": 0.29693329, "epoch": 0.34414549827145646, "flos": 12677868092160.0, "grad_norm": 5.144927802474237, "language_loss": 0.9158352, "learning_rate": 3.0510425237674096e-06, "loss": 0.93406361, "num_input_tokens_seen": 123016165, "router_z_loss_clip": 2.83007812, "router_z_loss_mlp": 0.33007812, "step": 5724, "time_per_iteration": 2.6324737071990967 }, { "auxiliary_loss_clip": 0.0149856, "auxiliary_loss_mlp": 0.0034294, "balance_loss_clip": 1.21911883, "balance_loss_mlp": 0.3091327, "epoch": 0.3442056215241244, "flos": 31284514316160.0, "grad_norm": 47.45832382797063, "language_loss": 0.75688672, "learning_rate": 3.05071115745038e-06, "loss": 0.77530175, "num_input_tokens_seen": 123036900, "router_z_loss_clip": 2.79492188, "router_z_loss_mlp": 0.33789062, "step": 5725, "time_per_iteration": 2.725569248199463 }, { "auxiliary_loss_clip": 0.01480928, "auxiliary_loss_mlp": 0.00344616, "balance_loss_clip": 1.20019186, "balance_loss_mlp": 0.30978256, "epoch": 0.34426574477679245, "flos": 23367289132800.0, "grad_norm": 31.800106306525294, "language_loss": 0.7700808, "learning_rate": 3.0503797512902773e-06, "loss": 0.78833628, "num_input_tokens_seen": 123057480, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.34814453, "step": 5726, "time_per_iteration": 2.7133121490478516 }, { "auxiliary_loss_clip": 0.01473965, "auxiliary_loss_mlp": 0.00346251, "balance_loss_clip": 1.19575739, "balance_loss_mlp": 0.31337279, "epoch": 0.3443258680294604, "flos": 24535427333760.0, "grad_norm": 14.77160130146188, "language_loss": 0.80160868, "learning_rate": 3.0500483052996703e-06, "loss": 0.81981087, "num_input_tokens_seen": 123076890, "router_z_loss_clip": 2.78320312, "router_z_loss_mlp": 0.32910156, "step": 5727, "time_per_iteration": 2.6744582653045654 }, { "auxiliary_loss_clip": 0.01482508, "auxiliary_loss_mlp": 0.00324609, "balance_loss_clip": 1.20267367, "balance_loss_mlp": 0.29384089, "epoch": 0.3443859912821284, "flos": 20230133137920.0, "grad_norm": 4.111812841656273, "language_loss": 0.95251906, "learning_rate": 3.0497168194911257e-06, "loss": 0.97059023, "num_input_tokens_seen": 123092530, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.30773926, "step": 5728, "time_per_iteration": 3.9685022830963135 }, { "auxiliary_loss_clip": 0.01478046, "auxiliary_loss_mlp": 0.00307246, "balance_loss_clip": 1.19748163, "balance_loss_mlp": 0.27587023, "epoch": 0.34444611453479634, "flos": 24316515895680.0, "grad_norm": 82.65261771544166, "language_loss": 0.77305734, "learning_rate": 3.0493852938772143e-06, "loss": 0.79091024, "num_input_tokens_seen": 123110560, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.3137207, "step": 5729, "time_per_iteration": 2.714808225631714 }, { "auxiliary_loss_clip": 0.01474056, "auxiliary_loss_mlp": 0.00341891, "balance_loss_clip": 1.1994195, "balance_loss_mlp": 0.30994296, "epoch": 0.3445062377874643, "flos": 16983413683200.0, "grad_norm": 30.858303907379426, "language_loss": 0.81893325, "learning_rate": 3.0490537284705078e-06, "loss": 0.83709276, "num_input_tokens_seen": 123128655, "router_z_loss_clip": 2.74804688, "router_z_loss_mlp": 0.31958008, "step": 5730, "time_per_iteration": 2.640746831893921 }, { "auxiliary_loss_clip": 0.01462637, "auxiliary_loss_mlp": 0.00285989, "balance_loss_clip": 1.18847108, "balance_loss_mlp": 0.25721151, "epoch": 0.3445663610401323, "flos": 20302708567680.0, "grad_norm": 4.172468225623624, "language_loss": 0.8534981, "learning_rate": 3.048722123283578e-06, "loss": 0.87098438, "num_input_tokens_seen": 123145130, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.28759766, "step": 5731, "time_per_iteration": 2.6816492080688477 }, { "auxiliary_loss_clip": 0.01483135, "auxiliary_loss_mlp": 0.00305319, "balance_loss_clip": 1.20170212, "balance_loss_mlp": 0.27306104, "epoch": 0.34462648429280024, "flos": 15888102307200.0, "grad_norm": 7.230164810219902, "language_loss": 0.86092865, "learning_rate": 3.0483904783290006e-06, "loss": 0.87881321, "num_input_tokens_seen": 123162265, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.32250977, "step": 5732, "time_per_iteration": 2.8207433223724365 }, { "auxiliary_loss_clip": 0.01412623, "auxiliary_loss_mlp": 0.00044003, "balance_loss_clip": 1.21572137, "balance_loss_mlp": 0.03532484, "epoch": 0.3446866075454682, "flos": 59311035285120.0, "grad_norm": 0.811757024503234, "language_loss": 0.53482616, "learning_rate": 3.0480587936193505e-06, "loss": 0.54939246, "num_input_tokens_seen": 123218620, "router_z_loss_clip": 1.96875, "router_z_loss_mlp": 0.08691406, "step": 5733, "time_per_iteration": 3.2104363441467285 }, { "auxiliary_loss_clip": 0.0148772, "auxiliary_loss_mlp": 0.00302331, "balance_loss_clip": 1.20548391, "balance_loss_mlp": 0.27152756, "epoch": 0.34474673079813617, "flos": 22343799000960.0, "grad_norm": 1.9937726138617244, "language_loss": 0.89376205, "learning_rate": 3.047727069167207e-06, "loss": 0.91166258, "num_input_tokens_seen": 123237325, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.30822754, "step": 5734, "time_per_iteration": 2.741135835647583 }, { "auxiliary_loss_clip": 0.01448141, "auxiliary_loss_mlp": 0.0029349, "balance_loss_clip": 1.17453349, "balance_loss_mlp": 0.26237667, "epoch": 0.34480685405080413, "flos": 27670141203840.0, "grad_norm": 21.92828647146351, "language_loss": 0.98964858, "learning_rate": 3.0473953049851478e-06, "loss": 1.00706494, "num_input_tokens_seen": 123258650, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.31079102, "step": 5735, "time_per_iteration": 2.7265405654907227 }, { "auxiliary_loss_clip": 0.01484354, "auxiliary_loss_mlp": 0.0030914, "balance_loss_clip": 1.20451069, "balance_loss_mlp": 0.27701306, "epoch": 0.3448669773034721, "flos": 22456020067200.0, "grad_norm": 9.022410963146363, "language_loss": 0.82859653, "learning_rate": 3.0470635010857533e-06, "loss": 0.84653145, "num_input_tokens_seen": 123277155, "router_z_loss_clip": 2.79882812, "router_z_loss_mlp": 0.32128906, "step": 5736, "time_per_iteration": 2.7656447887420654 }, { "auxiliary_loss_clip": 0.01470008, "auxiliary_loss_mlp": 0.00312366, "balance_loss_clip": 1.19352162, "balance_loss_mlp": 0.28137201, "epoch": 0.34492710055614006, "flos": 24936190352640.0, "grad_norm": 12.832306219283502, "language_loss": 0.83581579, "learning_rate": 3.0467316574816064e-06, "loss": 0.85363948, "num_input_tokens_seen": 123297640, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 0.31005859, "step": 5737, "time_per_iteration": 2.7277369499206543 }, { "auxiliary_loss_clip": 0.01490773, "auxiliary_loss_mlp": 0.0033163, "balance_loss_clip": 1.20534468, "balance_loss_mlp": 0.29756019, "epoch": 0.34498722380880803, "flos": 20120821073280.0, "grad_norm": 7.592716308160046, "language_loss": 0.78516495, "learning_rate": 3.0463997741852893e-06, "loss": 0.80338907, "num_input_tokens_seen": 123314370, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.34082031, "step": 5738, "time_per_iteration": 2.698808193206787 }, { "auxiliary_loss_clip": 0.01476561, "auxiliary_loss_mlp": 0.00287715, "balance_loss_clip": 1.19699097, "balance_loss_mlp": 0.25569549, "epoch": 0.34504734706147605, "flos": 28438126917120.0, "grad_norm": 5.119614151673911, "language_loss": 0.88862932, "learning_rate": 3.046067851209389e-06, "loss": 0.90627205, "num_input_tokens_seen": 123336085, "router_z_loss_clip": 2.79882812, "router_z_loss_mlp": 0.3203125, "step": 5739, "time_per_iteration": 2.7606801986694336 }, { "auxiliary_loss_clip": 0.01479735, "auxiliary_loss_mlp": 0.00280434, "balance_loss_clip": 1.19854069, "balance_loss_mlp": 0.25103691, "epoch": 0.345107470314144, "flos": 22674464628480.0, "grad_norm": 2.792326305748787, "language_loss": 0.90410084, "learning_rate": 3.0457358885664898e-06, "loss": 0.9217025, "num_input_tokens_seen": 123354460, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.29394531, "step": 5740, "time_per_iteration": 2.6929519176483154 }, { "auxiliary_loss_clip": 0.01468263, "auxiliary_loss_mlp": 0.00292265, "balance_loss_clip": 1.19530141, "balance_loss_mlp": 0.26401246, "epoch": 0.345167593566812, "flos": 20630716588800.0, "grad_norm": 7.216468605083782, "language_loss": 0.84466958, "learning_rate": 3.045403886269181e-06, "loss": 0.86227483, "num_input_tokens_seen": 123373420, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.28271484, "step": 5741, "time_per_iteration": 2.714667797088623 }, { "auxiliary_loss_clip": 0.01476076, "auxiliary_loss_mlp": 0.00281152, "balance_loss_clip": 1.19462144, "balance_loss_mlp": 0.25057489, "epoch": 0.34522771681947995, "flos": 26214358890240.0, "grad_norm": 20.616411120897467, "language_loss": 0.83737683, "learning_rate": 3.045071844330053e-06, "loss": 0.85494912, "num_input_tokens_seen": 123394730, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.3059082, "step": 5742, "time_per_iteration": 2.686887264251709 }, { "auxiliary_loss_clip": 0.01444545, "auxiliary_loss_mlp": 0.00282844, "balance_loss_clip": 1.17438817, "balance_loss_mlp": 0.25449604, "epoch": 0.3452878400721479, "flos": 19062354072960.0, "grad_norm": 5.369101887745796, "language_loss": 0.85610509, "learning_rate": 3.0447397627616955e-06, "loss": 0.87337899, "num_input_tokens_seen": 123412895, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.28356934, "step": 5743, "time_per_iteration": 2.627516984939575 }, { "auxiliary_loss_clip": 0.014633, "auxiliary_loss_mlp": 0.00283147, "balance_loss_clip": 1.18889976, "balance_loss_mlp": 0.25317782, "epoch": 0.3453479633248159, "flos": 27929739772800.0, "grad_norm": 8.282143787549998, "language_loss": 0.76090682, "learning_rate": 3.0444076415767016e-06, "loss": 0.77837133, "num_input_tokens_seen": 123432320, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.29980469, "step": 5744, "time_per_iteration": 2.692866802215576 }, { "auxiliary_loss_clip": 0.01453356, "auxiliary_loss_mlp": 0.002612, "balance_loss_clip": 1.18397784, "balance_loss_mlp": 0.23258999, "epoch": 0.34540808657748384, "flos": 19606113135360.0, "grad_norm": 6.034156227042831, "language_loss": 0.87267113, "learning_rate": 3.044075480787665e-06, "loss": 0.88981676, "num_input_tokens_seen": 123450980, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.28601074, "step": 5745, "time_per_iteration": 2.623452663421631 }, { "auxiliary_loss_clip": 0.01470004, "auxiliary_loss_mlp": 0.00263793, "balance_loss_clip": 1.19267261, "balance_loss_mlp": 0.23544472, "epoch": 0.3454682098301518, "flos": 20411661496320.0, "grad_norm": 88.7722177677256, "language_loss": 0.96527982, "learning_rate": 3.043743280407182e-06, "loss": 0.9826178, "num_input_tokens_seen": 123469365, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.28369141, "step": 5746, "time_per_iteration": 2.7449681758880615 }, { "auxiliary_loss_clip": 0.0147041, "auxiliary_loss_mlp": 0.00283393, "balance_loss_clip": 1.19039989, "balance_loss_mlp": 0.25235093, "epoch": 0.34552833308281977, "flos": 21325121291520.0, "grad_norm": 3.8004040649364703, "language_loss": 0.74602044, "learning_rate": 3.043411040447849e-06, "loss": 0.76355839, "num_input_tokens_seen": 123489425, "router_z_loss_clip": 2.79882812, "router_z_loss_mlp": 0.31054688, "step": 5747, "time_per_iteration": 2.754140853881836 }, { "auxiliary_loss_clip": 0.01466816, "auxiliary_loss_mlp": 0.00242507, "balance_loss_clip": 1.19182158, "balance_loss_mlp": 0.2146001, "epoch": 0.34558845633548774, "flos": 36243633824640.0, "grad_norm": 9.135358857490374, "language_loss": 0.78507245, "learning_rate": 3.043078760922264e-06, "loss": 0.80216563, "num_input_tokens_seen": 123509970, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.27893066, "step": 5748, "time_per_iteration": 2.8253400325775146 }, { "auxiliary_loss_clip": 0.0145459, "auxiliary_loss_mlp": 0.00305742, "balance_loss_clip": 1.18009329, "balance_loss_mlp": 0.27803731, "epoch": 0.3456485795881557, "flos": 22450561200000.0, "grad_norm": 8.795812113704747, "language_loss": 0.82074356, "learning_rate": 3.042746441843029e-06, "loss": 0.83834684, "num_input_tokens_seen": 123531055, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.27709961, "step": 5749, "time_per_iteration": 2.6852948665618896 }, { "auxiliary_loss_clip": 0.01438559, "auxiliary_loss_mlp": 0.00084476, "balance_loss_clip": 1.25078773, "balance_loss_mlp": 0.07384302, "epoch": 0.34570870284082367, "flos": 62004299005440.0, "grad_norm": 1.2823536824308277, "language_loss": 0.6277225, "learning_rate": 3.0424140832227437e-06, "loss": 0.6429528, "num_input_tokens_seen": 123584720, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.10644531, "step": 5750, "time_per_iteration": 3.0461132526397705 }, { "auxiliary_loss_clip": 0.01456432, "auxiliary_loss_mlp": 0.00248062, "balance_loss_clip": 1.18508816, "balance_loss_mlp": 0.21995258, "epoch": 0.34576882609349163, "flos": 22782196494720.0, "grad_norm": 20.520041659815462, "language_loss": 0.85950589, "learning_rate": 3.042081685074012e-06, "loss": 0.87655079, "num_input_tokens_seen": 123604465, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.28088379, "step": 5751, "time_per_iteration": 2.6738550662994385 }, { "auxiliary_loss_clip": 0.01469247, "auxiliary_loss_mlp": 0.0026591, "balance_loss_clip": 1.19457614, "balance_loss_mlp": 0.23715663, "epoch": 0.34582894934615965, "flos": 12348818576640.0, "grad_norm": 5.6742831812274455, "language_loss": 0.91813672, "learning_rate": 3.041749247409439e-06, "loss": 0.93548822, "num_input_tokens_seen": 123622320, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.2878418, "step": 5752, "time_per_iteration": 2.684209108352661 }, { "auxiliary_loss_clip": 0.01414592, "auxiliary_loss_mlp": 0.00086809, "balance_loss_clip": 1.23035455, "balance_loss_mlp": 0.07827319, "epoch": 0.3458890725988276, "flos": 70167691071360.0, "grad_norm": 0.7311763753685301, "language_loss": 0.62770212, "learning_rate": 3.0414167702416296e-06, "loss": 0.64271617, "num_input_tokens_seen": 123678010, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.08544922, "step": 5753, "time_per_iteration": 3.0959017276763916 }, { "auxiliary_loss_clip": 0.01479278, "auxiliary_loss_mlp": 0.00290814, "balance_loss_clip": 1.20185769, "balance_loss_mlp": 0.25974816, "epoch": 0.3459491958514956, "flos": 17092582093440.0, "grad_norm": 65.28241818629962, "language_loss": 0.78571439, "learning_rate": 3.0410842535831914e-06, "loss": 0.8034153, "num_input_tokens_seen": 123696830, "router_z_loss_clip": 2.77148438, "router_z_loss_mlp": 0.31091309, "step": 5754, "time_per_iteration": 2.7241389751434326 }, { "auxiliary_loss_clip": 0.01466154, "auxiliary_loss_mlp": 0.00265608, "balance_loss_clip": 1.19003689, "balance_loss_mlp": 0.23833291, "epoch": 0.34600931910416355, "flos": 16650952375680.0, "grad_norm": 18.65530035461238, "language_loss": 0.80181044, "learning_rate": 3.0407516974467343e-06, "loss": 0.8191281, "num_input_tokens_seen": 123714360, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.27319336, "step": 5755, "time_per_iteration": 2.6638662815093994 }, { "auxiliary_loss_clip": 0.01466723, "auxiliary_loss_mlp": 0.00262467, "balance_loss_clip": 1.19378376, "balance_loss_mlp": 0.23345178, "epoch": 0.3460694423568315, "flos": 38546190334080.0, "grad_norm": 2.447677879106579, "language_loss": 0.78710055, "learning_rate": 3.040419101844869e-06, "loss": 0.80439246, "num_input_tokens_seen": 123739250, "router_z_loss_clip": 2.72851562, "router_z_loss_mlp": 0.29016113, "step": 5756, "time_per_iteration": 2.8843610286712646 }, { "auxiliary_loss_clip": 0.01401326, "auxiliary_loss_mlp": 0.00105339, "balance_loss_clip": 1.21756053, "balance_loss_mlp": 0.09704244, "epoch": 0.3461295656094995, "flos": 72081479704320.0, "grad_norm": 0.8640733316014463, "language_loss": 0.61767936, "learning_rate": 3.040086466790207e-06, "loss": 0.63274604, "num_input_tokens_seen": 123802845, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.08300781, "step": 5757, "time_per_iteration": 3.1898863315582275 }, { "auxiliary_loss_clip": 0.01392471, "auxiliary_loss_mlp": 0.00112283, "balance_loss_clip": 1.21551335, "balance_loss_mlp": 0.10327029, "epoch": 0.34618968886216744, "flos": 65460089571840.0, "grad_norm": 0.8093034737188053, "language_loss": 0.58715409, "learning_rate": 3.039753792295362e-06, "loss": 0.60220164, "num_input_tokens_seen": 123861805, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.09033203, "step": 5758, "time_per_iteration": 4.599436044692993 }, { "auxiliary_loss_clip": 0.01462018, "auxiliary_loss_mlp": 0.00293925, "balance_loss_clip": 1.19192624, "balance_loss_mlp": 0.26648262, "epoch": 0.3462498121148354, "flos": 23472542960640.0, "grad_norm": 113.26262265731994, "language_loss": 0.77860415, "learning_rate": 3.0394210783729487e-06, "loss": 0.79616362, "num_input_tokens_seen": 123881820, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.2746582, "step": 5759, "time_per_iteration": 4.1634767055511475 }, { "auxiliary_loss_clip": 0.01435428, "auxiliary_loss_mlp": 0.00269524, "balance_loss_clip": 1.16921639, "balance_loss_mlp": 0.24022238, "epoch": 0.3463099353675034, "flos": 24170790418560.0, "grad_norm": 4.953173030745065, "language_loss": 0.88824356, "learning_rate": 3.0390883250355836e-06, "loss": 0.90529311, "num_input_tokens_seen": 123903700, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.29296875, "step": 5760, "time_per_iteration": 2.676464319229126 }, { "auxiliary_loss_clip": 0.01396916, "auxiliary_loss_mlp": 0.00073291, "balance_loss_clip": 1.21560526, "balance_loss_mlp": 0.0652801, "epoch": 0.34637005862017134, "flos": 63700609766400.0, "grad_norm": 0.7821886416323237, "language_loss": 0.56404281, "learning_rate": 3.0387555322958865e-06, "loss": 0.57874489, "num_input_tokens_seen": 123960075, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.08007812, "step": 5761, "time_per_iteration": 3.1754019260406494 }, { "auxiliary_loss_clip": 0.01443206, "auxiliary_loss_mlp": 0.00239477, "balance_loss_clip": 1.17942691, "balance_loss_mlp": 0.21245226, "epoch": 0.3464301818728393, "flos": 13145532192000.0, "grad_norm": 5.105558622624866, "language_loss": 1.0246675, "learning_rate": 3.038422700166474e-06, "loss": 1.04149437, "num_input_tokens_seen": 123975805, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.2701416, "step": 5762, "time_per_iteration": 2.633002281188965 }, { "auxiliary_loss_clip": 0.01454811, "auxiliary_loss_mlp": 0.00294878, "balance_loss_clip": 1.1854496, "balance_loss_mlp": 0.26295424, "epoch": 0.34649030512550727, "flos": 29315173299840.0, "grad_norm": 27.388763208134897, "language_loss": 0.75957525, "learning_rate": 3.0380898286599692e-06, "loss": 0.77707207, "num_input_tokens_seen": 123997530, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.31933594, "step": 5763, "time_per_iteration": 2.709540605545044 }, { "auxiliary_loss_clip": 0.01447573, "auxiliary_loss_mlp": 0.00294117, "balance_loss_clip": 1.17755699, "balance_loss_mlp": 0.26400489, "epoch": 0.34655042837817523, "flos": 23730884553600.0, "grad_norm": 28.521733272912325, "language_loss": 0.90312946, "learning_rate": 3.0377569177889945e-06, "loss": 0.92054629, "num_input_tokens_seen": 124016375, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.30126953, "step": 5764, "time_per_iteration": 4.0606207847595215 }, { "auxiliary_loss_clip": 0.01446801, "auxiliary_loss_mlp": 0.00280922, "balance_loss_clip": 1.18428946, "balance_loss_mlp": 0.25263375, "epoch": 0.34661055163084326, "flos": 22054215553920.0, "grad_norm": 2.8025743421765825, "language_loss": 0.75958824, "learning_rate": 3.0374239675661722e-06, "loss": 0.77686554, "num_input_tokens_seen": 124033975, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.28308105, "step": 5765, "time_per_iteration": 2.6045303344726562 }, { "auxiliary_loss_clip": 0.01463132, "auxiliary_loss_mlp": 0.00242365, "balance_loss_clip": 1.19905698, "balance_loss_mlp": 0.21505415, "epoch": 0.3466706748835112, "flos": 21799213925760.0, "grad_norm": 10.632592324246923, "language_loss": 0.83198357, "learning_rate": 3.03709097800413e-06, "loss": 0.84903854, "num_input_tokens_seen": 124051930, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.27319336, "step": 5766, "time_per_iteration": 2.6247313022613525 }, { "auxiliary_loss_clip": 0.01445257, "auxiliary_loss_mlp": 0.00280441, "balance_loss_clip": 1.181633, "balance_loss_mlp": 0.25196177, "epoch": 0.3467307981361792, "flos": 19461680547840.0, "grad_norm": 5.863973596172745, "language_loss": 0.7841149, "learning_rate": 3.0367579491154943e-06, "loss": 0.80137187, "num_input_tokens_seen": 124071220, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.28479004, "step": 5767, "time_per_iteration": 2.6007909774780273 }, { "auxiliary_loss_clip": 0.01435317, "auxiliary_loss_mlp": 0.00272416, "balance_loss_clip": 1.17419803, "balance_loss_mlp": 0.24380605, "epoch": 0.34679092138884715, "flos": 24827452905600.0, "grad_norm": 103.80233222253996, "language_loss": 0.86324215, "learning_rate": 3.036424880912893e-06, "loss": 0.88031948, "num_input_tokens_seen": 124090140, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.28637695, "step": 5768, "time_per_iteration": 2.695274591445923 }, { "auxiliary_loss_clip": 0.01384951, "auxiliary_loss_mlp": 0.00081343, "balance_loss_clip": 1.21646476, "balance_loss_mlp": 0.07247426, "epoch": 0.3468510446415151, "flos": 63236070149760.0, "grad_norm": 0.8643138272967363, "language_loss": 0.57181668, "learning_rate": 3.036091773408956e-06, "loss": 0.58647954, "num_input_tokens_seen": 124152025, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.08886719, "step": 5769, "time_per_iteration": 3.17423939704895 }, { "auxiliary_loss_clip": 0.01469068, "auxiliary_loss_mlp": 0.00285971, "balance_loss_clip": 1.19308281, "balance_loss_mlp": 0.2543326, "epoch": 0.3469111678941831, "flos": 12120713256960.0, "grad_norm": 25.311508584086187, "language_loss": 0.95229203, "learning_rate": 3.0357586266163154e-06, "loss": 0.96984243, "num_input_tokens_seen": 124165795, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.31640625, "step": 5770, "time_per_iteration": 4.026670694351196 }, { "auxiliary_loss_clip": 0.01380134, "auxiliary_loss_mlp": 0.00079637, "balance_loss_clip": 1.21078992, "balance_loss_mlp": 0.07095837, "epoch": 0.34697129114685105, "flos": 65934110378880.0, "grad_norm": 0.779600433788468, "language_loss": 0.59489465, "learning_rate": 3.0354254405476036e-06, "loss": 0.60949236, "num_input_tokens_seen": 124222925, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.08691406, "step": 5771, "time_per_iteration": 2.9256889820098877 }, { "auxiliary_loss_clip": 0.01467532, "auxiliary_loss_mlp": 0.00301813, "balance_loss_clip": 1.19912231, "balance_loss_mlp": 0.27204615, "epoch": 0.347031414399519, "flos": 34454205054720.0, "grad_norm": 21.870951673600278, "language_loss": 0.78543895, "learning_rate": 3.0350922152154557e-06, "loss": 0.80313247, "num_input_tokens_seen": 124240915, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.29748535, "step": 5772, "time_per_iteration": 2.7882466316223145 }, { "auxiliary_loss_clip": 0.01450608, "auxiliary_loss_mlp": 0.00330017, "balance_loss_clip": 1.18280315, "balance_loss_mlp": 0.29642382, "epoch": 0.347091537652187, "flos": 26944135511040.0, "grad_norm": 15.899152559726218, "language_loss": 0.82194114, "learning_rate": 3.034758950632507e-06, "loss": 0.83974737, "num_input_tokens_seen": 124262770, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.33569336, "step": 5773, "time_per_iteration": 2.71340012550354 }, { "auxiliary_loss_clip": 0.01455008, "auxiliary_loss_mlp": 0.00309165, "balance_loss_clip": 1.18349767, "balance_loss_mlp": 0.27721652, "epoch": 0.34715166090485494, "flos": 21142228216320.0, "grad_norm": 14.773437514371732, "language_loss": 0.7896964, "learning_rate": 3.034425646811396e-06, "loss": 0.80733812, "num_input_tokens_seen": 124280950, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.31970215, "step": 5774, "time_per_iteration": 2.6807193756103516 }, { "auxiliary_loss_clip": 0.01457083, "auxiliary_loss_mlp": 0.00323087, "balance_loss_clip": 1.19344509, "balance_loss_mlp": 0.29161549, "epoch": 0.3472117841575229, "flos": 23478001827840.0, "grad_norm": 88.39551330393498, "language_loss": 0.8151989, "learning_rate": 3.0340923037647602e-06, "loss": 0.83300054, "num_input_tokens_seen": 124299540, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.31494141, "step": 5775, "time_per_iteration": 2.7095096111297607 }, { "auxiliary_loss_clip": 0.01479329, "auxiliary_loss_mlp": 0.00332796, "balance_loss_clip": 1.20361257, "balance_loss_mlp": 0.30070442, "epoch": 0.34727190741019087, "flos": 17492806408320.0, "grad_norm": 3.528692534846297, "language_loss": 0.89002311, "learning_rate": 3.0337589215052404e-06, "loss": 0.90814435, "num_input_tokens_seen": 124316285, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.32080078, "step": 5776, "time_per_iteration": 2.645292282104492 }, { "auxiliary_loss_clip": 0.01404982, "auxiliary_loss_mlp": 0.00095069, "balance_loss_clip": 1.24075174, "balance_loss_mlp": 0.08462617, "epoch": 0.34733203066285884, "flos": 65265491640960.0, "grad_norm": 0.8547911464793351, "language_loss": 0.63281393, "learning_rate": 3.033425500045478e-06, "loss": 0.64781445, "num_input_tokens_seen": 124376650, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.10449219, "step": 5777, "time_per_iteration": 3.1997790336608887 }, { "auxiliary_loss_clip": 0.01456148, "auxiliary_loss_mlp": 0.00302339, "balance_loss_clip": 1.19011045, "balance_loss_mlp": 0.27082044, "epoch": 0.3473921539155268, "flos": 28658726294400.0, "grad_norm": 9.752263434950216, "language_loss": 0.72273779, "learning_rate": 3.033092039398119e-06, "loss": 0.74032265, "num_input_tokens_seen": 124396475, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.31518555, "step": 5778, "time_per_iteration": 2.7698099613189697 }, { "auxiliary_loss_clip": 0.01493507, "auxiliary_loss_mlp": 0.00290564, "balance_loss_clip": 1.21767998, "balance_loss_mlp": 0.2593548, "epoch": 0.3474522771681948, "flos": 40836895355520.0, "grad_norm": 11.001242041406563, "language_loss": 0.80609274, "learning_rate": 3.0327585395758046e-06, "loss": 0.82393348, "num_input_tokens_seen": 124416480, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.31225586, "step": 5779, "time_per_iteration": 2.839808464050293 }, { "auxiliary_loss_clip": 0.01499976, "auxiliary_loss_mlp": 0.00318879, "balance_loss_clip": 1.21871924, "balance_loss_mlp": 0.28728819, "epoch": 0.3475124004208628, "flos": 24608577381120.0, "grad_norm": 30.78492690117864, "language_loss": 0.71843857, "learning_rate": 3.0324250005911837e-06, "loss": 0.7366271, "num_input_tokens_seen": 124435950, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.31591797, "step": 5780, "time_per_iteration": 2.762274980545044 }, { "auxiliary_loss_clip": 0.01481142, "auxiliary_loss_mlp": 0.00310512, "balance_loss_clip": 1.20963943, "balance_loss_mlp": 0.27971995, "epoch": 0.34757252367353075, "flos": 22711309004160.0, "grad_norm": 10.041901054169669, "language_loss": 0.78291565, "learning_rate": 3.0320914224569033e-06, "loss": 0.80083215, "num_input_tokens_seen": 124455410, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.30773926, "step": 5781, "time_per_iteration": 2.6990506649017334 }, { "auxiliary_loss_clip": 0.01481783, "auxiliary_loss_mlp": 0.00329622, "balance_loss_clip": 1.20937657, "balance_loss_mlp": 0.29714966, "epoch": 0.3476326469261987, "flos": 19828184970240.0, "grad_norm": 179.57107119191863, "language_loss": 0.87637818, "learning_rate": 3.031757805185612e-06, "loss": 0.89449215, "num_input_tokens_seen": 124474870, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.32470703, "step": 5782, "time_per_iteration": 2.739673376083374 }, { "auxiliary_loss_clip": 0.01500035, "auxiliary_loss_mlp": 0.00371364, "balance_loss_clip": 1.22606385, "balance_loss_mlp": 0.33755642, "epoch": 0.3476927701788667, "flos": 19938107566080.0, "grad_norm": 183.9967570929765, "language_loss": 0.71102989, "learning_rate": 3.0314241487899622e-06, "loss": 0.7297439, "num_input_tokens_seen": 124494105, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.33789062, "step": 5783, "time_per_iteration": 2.8160147666931152 }, { "auxiliary_loss_clip": 0.01481514, "auxiliary_loss_mlp": 0.00374101, "balance_loss_clip": 1.21817207, "balance_loss_mlp": 0.33955395, "epoch": 0.34775289343153465, "flos": 20735108490240.0, "grad_norm": 669.4162450256633, "language_loss": 0.92523789, "learning_rate": 3.031090453282605e-06, "loss": 0.94379401, "num_input_tokens_seen": 124512030, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.34570312, "step": 5784, "time_per_iteration": 2.6665399074554443 }, { "auxiliary_loss_clip": 0.01489836, "auxiliary_loss_mlp": 0.00336213, "balance_loss_clip": 1.22203481, "balance_loss_mlp": 0.30233353, "epoch": 0.3478130166842026, "flos": 19354846521600.0, "grad_norm": 50.90848396056713, "language_loss": 0.87908626, "learning_rate": 3.0307567186761946e-06, "loss": 0.89734674, "num_input_tokens_seen": 124530980, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.33862305, "step": 5785, "time_per_iteration": 2.6187832355499268 }, { "auxiliary_loss_clip": 0.0151043, "auxiliary_loss_mlp": 0.00316126, "balance_loss_clip": 1.237988, "balance_loss_mlp": 0.28653827, "epoch": 0.3478731399368706, "flos": 22051198811520.0, "grad_norm": 4.765072393733737, "language_loss": 0.85768551, "learning_rate": 3.0304229449833862e-06, "loss": 0.87595105, "num_input_tokens_seen": 124549330, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.29589844, "step": 5786, "time_per_iteration": 2.672747850418091 }, { "auxiliary_loss_clip": 0.0150415, "auxiliary_loss_mlp": 0.00287256, "balance_loss_clip": 1.23568368, "balance_loss_mlp": 0.25645196, "epoch": 0.34793326318953854, "flos": 18041449720320.0, "grad_norm": 3.3337188605499346, "language_loss": 0.80718493, "learning_rate": 3.030089132216836e-06, "loss": 0.82509899, "num_input_tokens_seen": 124567200, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.30810547, "step": 5787, "time_per_iteration": 2.651751756668091 }, { "auxiliary_loss_clip": 0.01522117, "auxiliary_loss_mlp": 0.00338003, "balance_loss_clip": 1.24406695, "balance_loss_mlp": 0.30545866, "epoch": 0.3479933864422065, "flos": 29314670509440.0, "grad_norm": 3.2158077699744694, "language_loss": 0.87289566, "learning_rate": 3.029755280389203e-06, "loss": 0.8914969, "num_input_tokens_seen": 124587025, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.32519531, "step": 5788, "time_per_iteration": 2.703883647918701 }, { "auxiliary_loss_clip": 0.01559011, "auxiliary_loss_mlp": 0.00321905, "balance_loss_clip": 1.27043462, "balance_loss_mlp": 0.28645253, "epoch": 0.3480535096948745, "flos": 20120713332480.0, "grad_norm": 6.589139409848463, "language_loss": 0.93800181, "learning_rate": 3.029421389513147e-06, "loss": 0.95681095, "num_input_tokens_seen": 124605860, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.35424805, "step": 5789, "time_per_iteration": 2.6454269886016846 }, { "auxiliary_loss_clip": 0.01532953, "auxiliary_loss_mlp": 0.0034973, "balance_loss_clip": 1.2520138, "balance_loss_mlp": 0.31659013, "epoch": 0.34811363294754244, "flos": 18548974938240.0, "grad_norm": 3.6190364738353202, "language_loss": 0.90084207, "learning_rate": 3.029087459601328e-06, "loss": 0.91966891, "num_input_tokens_seen": 124624270, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.33129883, "step": 5790, "time_per_iteration": 2.7314038276672363 }, { "auxiliary_loss_clip": 0.01535951, "auxiliary_loss_mlp": 0.00343922, "balance_loss_clip": 1.25908113, "balance_loss_mlp": 0.30977997, "epoch": 0.3481737562002104, "flos": 26870303105280.0, "grad_norm": 3.9284966007552398, "language_loss": 0.9047823, "learning_rate": 3.0287534906664097e-06, "loss": 0.923581, "num_input_tokens_seen": 124644005, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.34130859, "step": 5791, "time_per_iteration": 2.6898274421691895 }, { "auxiliary_loss_clip": 0.01564545, "auxiliary_loss_mlp": 0.00324988, "balance_loss_clip": 1.27674079, "balance_loss_mlp": 0.2902976, "epoch": 0.3482338794528784, "flos": 28908664104960.0, "grad_norm": 32.78251315853265, "language_loss": 0.83618534, "learning_rate": 3.028419482721056e-06, "loss": 0.8550806, "num_input_tokens_seen": 124663020, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.34716797, "step": 5792, "time_per_iteration": 2.7686691284179688 }, { "auxiliary_loss_clip": 0.01543542, "auxiliary_loss_mlp": 0.00290432, "balance_loss_clip": 1.26131034, "balance_loss_mlp": 0.25748298, "epoch": 0.3482940027055464, "flos": 22200767043840.0, "grad_norm": 18.141416934945124, "language_loss": 0.87461501, "learning_rate": 3.0280854357779325e-06, "loss": 0.89295477, "num_input_tokens_seen": 124682975, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.32958984, "step": 5793, "time_per_iteration": 2.681962251663208 }, { "auxiliary_loss_clip": 0.01579516, "auxiliary_loss_mlp": 0.0034467, "balance_loss_clip": 1.28601408, "balance_loss_mlp": 0.30807275, "epoch": 0.34835412595821436, "flos": 20302708567680.0, "grad_norm": 4.496433465467231, "language_loss": 0.84319329, "learning_rate": 3.027751349849706e-06, "loss": 0.86243522, "num_input_tokens_seen": 124701340, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.3659668, "step": 5794, "time_per_iteration": 2.739572525024414 }, { "auxiliary_loss_clip": 0.01554853, "auxiliary_loss_mlp": 0.00296552, "balance_loss_clip": 1.27188873, "balance_loss_mlp": 0.26295891, "epoch": 0.3484142492108823, "flos": 20449691020800.0, "grad_norm": 18.55495528512756, "language_loss": 0.63496387, "learning_rate": 3.0274172249490456e-06, "loss": 0.65347803, "num_input_tokens_seen": 124719165, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.33581543, "step": 5795, "time_per_iteration": 2.7755885124206543 }, { "auxiliary_loss_clip": 0.01547898, "auxiliary_loss_mlp": 0.0028252, "balance_loss_clip": 1.26247358, "balance_loss_mlp": 0.24985689, "epoch": 0.3484743724635503, "flos": 24352929308160.0, "grad_norm": 18.757703204072016, "language_loss": 0.88422692, "learning_rate": 3.0270830610886213e-06, "loss": 0.90253115, "num_input_tokens_seen": 124738670, "router_z_loss_clip": 2.85546875, "router_z_loss_mlp": 0.32641602, "step": 5796, "time_per_iteration": 2.7252728939056396 }, { "auxiliary_loss_clip": 0.01545913, "auxiliary_loss_mlp": 0.00238657, "balance_loss_clip": 1.26746607, "balance_loss_mlp": 0.20883131, "epoch": 0.34853449571621825, "flos": 24353001135360.0, "grad_norm": 10.824327737783955, "language_loss": 0.90600526, "learning_rate": 3.0267488582811033e-06, "loss": 0.92385095, "num_input_tokens_seen": 124758760, "router_z_loss_clip": 2.78515625, "router_z_loss_mlp": 0.29821777, "step": 5797, "time_per_iteration": 2.7263407707214355 }, { "auxiliary_loss_clip": 0.01571532, "auxiliary_loss_mlp": 0.00258269, "balance_loss_clip": 1.28195977, "balance_loss_mlp": 0.2246995, "epoch": 0.3485946189688862, "flos": 27267690245760.0, "grad_norm": 66.07733483051561, "language_loss": 0.78040153, "learning_rate": 3.026414616539167e-06, "loss": 0.7986995, "num_input_tokens_seen": 124777765, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.33569336, "step": 5798, "time_per_iteration": 2.8640477657318115 }, { "auxiliary_loss_clip": 0.01570104, "auxiliary_loss_mlp": 0.00265474, "balance_loss_clip": 1.27898872, "balance_loss_mlp": 0.23259631, "epoch": 0.3486547422215542, "flos": 20156695781760.0, "grad_norm": 120.60373065257113, "language_loss": 0.84767115, "learning_rate": 3.026080335875485e-06, "loss": 0.866027, "num_input_tokens_seen": 124796775, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.32885742, "step": 5799, "time_per_iteration": 2.6920862197875977 }, { "auxiliary_loss_clip": 0.01555081, "auxiliary_loss_mlp": 0.00242522, "balance_loss_clip": 1.26748073, "balance_loss_mlp": 0.21326782, "epoch": 0.34871486547422215, "flos": 20230348619520.0, "grad_norm": 977.9232727681581, "language_loss": 0.84634674, "learning_rate": 3.025746016302734e-06, "loss": 0.86432278, "num_input_tokens_seen": 124815825, "router_z_loss_clip": 2.87890625, "router_z_loss_mlp": 0.29248047, "step": 5800, "time_per_iteration": 2.6642909049987793 }, { "auxiliary_loss_clip": 0.01582317, "auxiliary_loss_mlp": 0.00275409, "balance_loss_clip": 1.28611696, "balance_loss_mlp": 0.24036191, "epoch": 0.3487749887268901, "flos": 44053234882560.0, "grad_norm": 41.17505015642335, "language_loss": 0.7422772, "learning_rate": 3.025411657833591e-06, "loss": 0.76085448, "num_input_tokens_seen": 124838420, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.35058594, "step": 5801, "time_per_iteration": 5.75596284866333 }, { "auxiliary_loss_clip": 0.01593735, "auxiliary_loss_mlp": 0.00256901, "balance_loss_clip": 1.29995561, "balance_loss_mlp": 0.22247359, "epoch": 0.3488351119795581, "flos": 23295144666240.0, "grad_norm": 5.259347864654754, "language_loss": 0.83467793, "learning_rate": 3.025077260480735e-06, "loss": 0.85318434, "num_input_tokens_seen": 124857320, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.34423828, "step": 5802, "time_per_iteration": 2.6848833560943604 }, { "auxiliary_loss_clip": 0.01573586, "auxiliary_loss_mlp": 0.00226288, "balance_loss_clip": 1.28922021, "balance_loss_mlp": 0.19717684, "epoch": 0.34889523523222604, "flos": 19934839428480.0, "grad_norm": 23.57779255107615, "language_loss": 0.8548311, "learning_rate": 3.0247428242568474e-06, "loss": 0.87282985, "num_input_tokens_seen": 124875685, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.29125977, "step": 5803, "time_per_iteration": 2.656851053237915 }, { "auxiliary_loss_clip": 0.01551959, "auxiliary_loss_mlp": 0.00251914, "balance_loss_clip": 1.2583735, "balance_loss_mlp": 0.21767694, "epoch": 0.348955358484894, "flos": 30446179816320.0, "grad_norm": 29.324733621419128, "language_loss": 0.75280428, "learning_rate": 3.0244083491746085e-06, "loss": 0.77084303, "num_input_tokens_seen": 124895960, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.34179688, "step": 5804, "time_per_iteration": 2.7073211669921875 }, { "auxiliary_loss_clip": 0.01575134, "auxiliary_loss_mlp": 0.0024435, "balance_loss_clip": 1.28935063, "balance_loss_mlp": 0.21247301, "epoch": 0.349015481737562, "flos": 17999972490240.0, "grad_norm": 8.797189168446717, "language_loss": 0.84269536, "learning_rate": 3.024073835246702e-06, "loss": 0.86089021, "num_input_tokens_seen": 124914140, "router_z_loss_clip": 2.86132812, "router_z_loss_mlp": 0.31884766, "step": 5805, "time_per_iteration": 2.6225640773773193 }, { "auxiliary_loss_clip": 0.01559894, "auxiliary_loss_mlp": 0.00230483, "balance_loss_clip": 1.26963663, "balance_loss_mlp": 0.1970565, "epoch": 0.34907560499023, "flos": 27198490694400.0, "grad_norm": 146.9946221316487, "language_loss": 0.7654227, "learning_rate": 3.023739282485814e-06, "loss": 0.78332645, "num_input_tokens_seen": 124934180, "router_z_loss_clip": 2.90234375, "router_z_loss_mlp": 0.33447266, "step": 5806, "time_per_iteration": 4.117645978927612 }, { "auxiliary_loss_clip": 0.01590583, "auxiliary_loss_mlp": 0.00260473, "balance_loss_clip": 1.29432178, "balance_loss_mlp": 0.22594967, "epoch": 0.34913572824289796, "flos": 30226873328640.0, "grad_norm": 1.9751375308200427, "language_loss": 0.77537239, "learning_rate": 3.023404690904629e-06, "loss": 0.79388297, "num_input_tokens_seen": 124956060, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.34521484, "step": 5807, "time_per_iteration": 2.7855167388916016 }, { "auxiliary_loss_clip": 0.01561004, "auxiliary_loss_mlp": 0.00257091, "balance_loss_clip": 1.26917839, "balance_loss_mlp": 0.2213755, "epoch": 0.3491958514955659, "flos": 29971907614080.0, "grad_norm": 73.4048830562558, "language_loss": 0.82050192, "learning_rate": 3.0230700605158364e-06, "loss": 0.83868289, "num_input_tokens_seen": 124976070, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.35717773, "step": 5808, "time_per_iteration": 2.7306065559387207 }, { "auxiliary_loss_clip": 0.01566895, "auxiliary_loss_mlp": 0.00238818, "balance_loss_clip": 1.28053749, "balance_loss_mlp": 0.2056303, "epoch": 0.3492559747482339, "flos": 22783273902720.0, "grad_norm": 19.558915230599123, "language_loss": 0.89057976, "learning_rate": 3.0227353913321238e-06, "loss": 0.90863693, "num_input_tokens_seen": 124996995, "router_z_loss_clip": 2.86523438, "router_z_loss_mlp": 0.33227539, "step": 5809, "time_per_iteration": 2.654219627380371 }, { "auxiliary_loss_clip": 0.01560349, "auxiliary_loss_mlp": 0.00247631, "balance_loss_clip": 1.27748251, "balance_loss_mlp": 0.21296453, "epoch": 0.34931609800090185, "flos": 26068022881920.0, "grad_norm": 37.28032634310127, "language_loss": 0.87392956, "learning_rate": 3.0224006833661835e-06, "loss": 0.89200932, "num_input_tokens_seen": 125015600, "router_z_loss_clip": 2.82617188, "router_z_loss_mlp": 0.34692383, "step": 5810, "time_per_iteration": 2.655759572982788 }, { "auxiliary_loss_clip": 0.01569706, "auxiliary_loss_mlp": 0.00239939, "balance_loss_clip": 1.27663732, "balance_loss_mlp": 0.20486797, "epoch": 0.3493762212535698, "flos": 29242023252480.0, "grad_norm": 39.19042257785873, "language_loss": 0.82668775, "learning_rate": 3.0220659366307057e-06, "loss": 0.8447842, "num_input_tokens_seen": 125035290, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.35058594, "step": 5811, "time_per_iteration": 2.7703630924224854 }, { "auxiliary_loss_clip": 0.01564741, "auxiliary_loss_mlp": 0.0023128, "balance_loss_clip": 1.27751386, "balance_loss_mlp": 0.19592279, "epoch": 0.3494363445062378, "flos": 27126058919040.0, "grad_norm": 6.36716662143538, "language_loss": 0.85958666, "learning_rate": 3.021731151138386e-06, "loss": 0.87754685, "num_input_tokens_seen": 125057130, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.35351562, "step": 5812, "time_per_iteration": 2.70796537399292 }, { "auxiliary_loss_clip": 0.01566501, "auxiliary_loss_mlp": 0.00247519, "balance_loss_clip": 1.27272129, "balance_loss_mlp": 0.21106461, "epoch": 0.34949646775890575, "flos": 12276207233280.0, "grad_norm": 23.846968409879793, "language_loss": 0.7837404, "learning_rate": 3.021396326901918e-06, "loss": 0.8018806, "num_input_tokens_seen": 125073720, "router_z_loss_clip": 2.94335938, "router_z_loss_mlp": 0.36450195, "step": 5813, "time_per_iteration": 4.004144906997681 }, { "auxiliary_loss_clip": 0.01576528, "auxiliary_loss_mlp": 0.00227537, "balance_loss_clip": 1.2868855, "balance_loss_mlp": 0.19258484, "epoch": 0.3495565910115737, "flos": 17165516659200.0, "grad_norm": 9.027985365000973, "language_loss": 0.83727551, "learning_rate": 3.0210614639339998e-06, "loss": 0.85531616, "num_input_tokens_seen": 125090635, "router_z_loss_clip": 2.89453125, "router_z_loss_mlp": 0.34960938, "step": 5814, "time_per_iteration": 2.67761492729187 }, { "auxiliary_loss_clip": 0.01572333, "auxiliary_loss_mlp": 0.00242412, "balance_loss_clip": 1.2755301, "balance_loss_mlp": 0.20874743, "epoch": 0.3496167142642417, "flos": 26465661417600.0, "grad_norm": 230.01527896364266, "language_loss": 0.90555334, "learning_rate": 3.020726562247328e-06, "loss": 0.92370075, "num_input_tokens_seen": 125110070, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 0.33691406, "step": 5815, "time_per_iteration": 2.744168758392334 }, { "auxiliary_loss_clip": 0.01576726, "auxiliary_loss_mlp": 0.00239533, "balance_loss_clip": 1.28114867, "balance_loss_mlp": 0.20603502, "epoch": 0.34967683751690964, "flos": 17414843938560.0, "grad_norm": 8.759685359859752, "language_loss": 0.83783603, "learning_rate": 3.0203916218546024e-06, "loss": 0.85599864, "num_input_tokens_seen": 125125730, "router_z_loss_clip": 2.95703125, "router_z_loss_mlp": 0.33496094, "step": 5816, "time_per_iteration": 2.632697820663452 }, { "auxiliary_loss_clip": 0.01594198, "auxiliary_loss_mlp": 0.00242727, "balance_loss_clip": 1.29634237, "balance_loss_mlp": 0.2026249, "epoch": 0.3497369607695776, "flos": 22600021691520.0, "grad_norm": 12.255862808411512, "language_loss": 0.6873104, "learning_rate": 3.0200566427685246e-06, "loss": 0.7056796, "num_input_tokens_seen": 125146195, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.40112305, "step": 5817, "time_per_iteration": 2.7047250270843506 }, { "auxiliary_loss_clip": 0.01460708, "auxiliary_loss_mlp": 0.0007659, "balance_loss_clip": 1.28176117, "balance_loss_mlp": 0.06409672, "epoch": 0.34979708402224563, "flos": 68529374818560.0, "grad_norm": 0.9047523949974388, "language_loss": 0.60090649, "learning_rate": 3.0197216250017975e-06, "loss": 0.61627948, "num_input_tokens_seen": 125207790, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.125, "step": 5818, "time_per_iteration": 3.2595417499542236 }, { "auxiliary_loss_clip": 0.01575064, "auxiliary_loss_mlp": 0.00225528, "balance_loss_clip": 1.28214061, "balance_loss_mlp": 0.18778639, "epoch": 0.3498572072749136, "flos": 18989634988800.0, "grad_norm": 26.728886683583188, "language_loss": 0.89554662, "learning_rate": 3.019386568567123e-06, "loss": 0.91355252, "num_input_tokens_seen": 125226220, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.37768555, "step": 5819, "time_per_iteration": 2.6883039474487305 }, { "auxiliary_loss_clip": 0.01580991, "auxiliary_loss_mlp": 0.00212275, "balance_loss_clip": 1.28462315, "balance_loss_mlp": 0.17851481, "epoch": 0.34991733052758156, "flos": 27818883423360.0, "grad_norm": 3.1635689709523374, "language_loss": 0.75444341, "learning_rate": 3.0190514734772083e-06, "loss": 0.77237606, "num_input_tokens_seen": 125247485, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.33764648, "step": 5820, "time_per_iteration": 2.7443721294403076 }, { "auxiliary_loss_clip": 0.01577249, "auxiliary_loss_mlp": 0.00230196, "balance_loss_clip": 1.2770927, "balance_loss_mlp": 0.19498135, "epoch": 0.3499774537802495, "flos": 33584197737600.0, "grad_norm": 69.96375477888168, "language_loss": 0.73808169, "learning_rate": 3.018716339744759e-06, "loss": 0.75615609, "num_input_tokens_seen": 125268625, "router_z_loss_clip": 3.00195312, "router_z_loss_mlp": 0.35229492, "step": 5821, "time_per_iteration": 2.790905237197876 }, { "auxiliary_loss_clip": 0.015748, "auxiliary_loss_mlp": 0.00247696, "balance_loss_clip": 1.27156758, "balance_loss_mlp": 0.21128994, "epoch": 0.3500375770329175, "flos": 23476744851840.0, "grad_norm": 9.35806671139538, "language_loss": 0.81627595, "learning_rate": 3.0183811673824842e-06, "loss": 0.83450091, "num_input_tokens_seen": 125287530, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.36401367, "step": 5822, "time_per_iteration": 2.700282096862793 }, { "auxiliary_loss_clip": 0.01567257, "auxiliary_loss_mlp": 0.00247892, "balance_loss_clip": 1.26854515, "balance_loss_mlp": 0.21155696, "epoch": 0.35009770028558546, "flos": 19026048401280.0, "grad_norm": 16.64389630615954, "language_loss": 0.8540414, "learning_rate": 3.018045956403094e-06, "loss": 0.87219286, "num_input_tokens_seen": 125307020, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.36328125, "step": 5823, "time_per_iteration": 2.661284923553467 }, { "auxiliary_loss_clip": 0.01412952, "auxiliary_loss_mlp": 0.00093582, "balance_loss_clip": 1.23731685, "balance_loss_mlp": 0.08175655, "epoch": 0.3501578235382534, "flos": 68351868783360.0, "grad_norm": 0.7643539865106257, "language_loss": 0.58663636, "learning_rate": 3.017710706819298e-06, "loss": 0.60170168, "num_input_tokens_seen": 125370445, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.11816406, "step": 5824, "time_per_iteration": 3.1621673107147217 }, { "auxiliary_loss_clip": 0.01551744, "auxiliary_loss_mlp": 0.00217395, "balance_loss_clip": 1.259848, "balance_loss_mlp": 0.18125057, "epoch": 0.3502179467909214, "flos": 21250893836160.0, "grad_norm": 2.402137245768028, "language_loss": 0.91707826, "learning_rate": 3.017375418643811e-06, "loss": 0.93476963, "num_input_tokens_seen": 125388900, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.36132812, "step": 5825, "time_per_iteration": 2.635638952255249 }, { "auxiliary_loss_clip": 0.01577927, "auxiliary_loss_mlp": 0.00232612, "balance_loss_clip": 1.28528404, "balance_loss_mlp": 0.19646822, "epoch": 0.35027807004358935, "flos": 11942955826560.0, "grad_norm": 8.540964697798524, "language_loss": 0.92130703, "learning_rate": 3.0170400918893464e-06, "loss": 0.93941247, "num_input_tokens_seen": 125402675, "router_z_loss_clip": 2.92382812, "router_z_loss_mlp": 0.36181641, "step": 5826, "time_per_iteration": 2.5988311767578125 }, { "auxiliary_loss_clip": 0.0155829, "auxiliary_loss_mlp": 0.00230896, "balance_loss_clip": 1.26758587, "balance_loss_mlp": 0.19525287, "epoch": 0.3503381932962573, "flos": 21470918595840.0, "grad_norm": 25.662655212803255, "language_loss": 0.86769378, "learning_rate": 3.0167047265686186e-06, "loss": 0.88558567, "num_input_tokens_seen": 125421360, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.35644531, "step": 5827, "time_per_iteration": 2.6705381870269775 }, { "auxiliary_loss_clip": 0.0153217, "auxiliary_loss_mlp": 0.00204343, "balance_loss_clip": 1.24856234, "balance_loss_mlp": 0.16967644, "epoch": 0.3503983165489253, "flos": 21251109317760.0, "grad_norm": 334.3822124394979, "language_loss": 0.80842608, "learning_rate": 3.0163693226943467e-06, "loss": 0.82579124, "num_input_tokens_seen": 125440000, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.34667969, "step": 5828, "time_per_iteration": 2.638786554336548 }, { "auxiliary_loss_clip": 0.0152519, "auxiliary_loss_mlp": 0.00235543, "balance_loss_clip": 1.23996758, "balance_loss_mlp": 0.19846895, "epoch": 0.35045843980159325, "flos": 27815723026560.0, "grad_norm": 22.696204966671583, "language_loss": 0.85539079, "learning_rate": 3.016033880279248e-06, "loss": 0.87299812, "num_input_tokens_seen": 125460390, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.37060547, "step": 5829, "time_per_iteration": 2.6921236515045166 }, { "auxiliary_loss_clip": 0.01538918, "auxiliary_loss_mlp": 0.00239069, "balance_loss_clip": 1.24559188, "balance_loss_mlp": 0.20251977, "epoch": 0.3505185630542612, "flos": 25921148169600.0, "grad_norm": 10.008271779218335, "language_loss": 0.81343091, "learning_rate": 3.0156983993360417e-06, "loss": 0.83121079, "num_input_tokens_seen": 125478410, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.36547852, "step": 5830, "time_per_iteration": 2.6528921127319336 }, { "auxiliary_loss_clip": 0.01541855, "auxiliary_loss_mlp": 0.00204128, "balance_loss_clip": 1.2530725, "balance_loss_mlp": 0.16922313, "epoch": 0.35057868630692923, "flos": 20521763660160.0, "grad_norm": 25.480547908884514, "language_loss": 0.97221619, "learning_rate": 3.0153628798774513e-06, "loss": 0.98967606, "num_input_tokens_seen": 125495975, "router_z_loss_clip": 2.88476562, "router_z_loss_mlp": 0.34887695, "step": 5831, "time_per_iteration": 2.6487741470336914 }, { "auxiliary_loss_clip": 0.01526114, "auxiliary_loss_mlp": 0.00201496, "balance_loss_clip": 1.24182379, "balance_loss_mlp": 0.16311058, "epoch": 0.3506388095595972, "flos": 20448649526400.0, "grad_norm": 8.421436539511818, "language_loss": 0.87504363, "learning_rate": 3.0150273219161985e-06, "loss": 0.89231968, "num_input_tokens_seen": 125515035, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.38378906, "step": 5832, "time_per_iteration": 2.6634533405303955 }, { "auxiliary_loss_clip": 0.01540337, "auxiliary_loss_mlp": 0.00240114, "balance_loss_clip": 1.24602151, "balance_loss_mlp": 0.2017765, "epoch": 0.35069893281226516, "flos": 23109665811840.0, "grad_norm": 93.01636380491672, "language_loss": 0.78382868, "learning_rate": 3.014691725465008e-06, "loss": 0.80163324, "num_input_tokens_seen": 125535555, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.38330078, "step": 5833, "time_per_iteration": 2.7585129737854004 }, { "auxiliary_loss_clip": 0.01517197, "auxiliary_loss_mlp": 0.00183191, "balance_loss_clip": 1.23564792, "balance_loss_mlp": 0.15069476, "epoch": 0.35075905606493313, "flos": 27271999877760.0, "grad_norm": 19.68969705857964, "language_loss": 0.85490882, "learning_rate": 3.014356090536606e-06, "loss": 0.87191272, "num_input_tokens_seen": 125558195, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.32519531, "step": 5834, "time_per_iteration": 2.78979754447937 }, { "auxiliary_loss_clip": 0.01523248, "auxiliary_loss_mlp": 0.00211907, "balance_loss_clip": 1.23934579, "balance_loss_mlp": 0.17697826, "epoch": 0.3508191793176011, "flos": 19128608709120.0, "grad_norm": 11.685140299256252, "language_loss": 0.92696583, "learning_rate": 3.0140204171437183e-06, "loss": 0.94431734, "num_input_tokens_seen": 125575375, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.34960938, "step": 5835, "time_per_iteration": 2.6622815132141113 }, { "auxiliary_loss_clip": 0.01522393, "auxiliary_loss_mlp": 0.00220481, "balance_loss_clip": 1.24239779, "balance_loss_mlp": 0.18977252, "epoch": 0.35087930257026906, "flos": 25557588662400.0, "grad_norm": 2.2939299686992864, "language_loss": 0.81347227, "learning_rate": 3.0136847052990754e-06, "loss": 0.83090103, "num_input_tokens_seen": 125596745, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.30688477, "step": 5836, "time_per_iteration": 2.7081809043884277 }, { "auxiliary_loss_clip": 0.01508775, "auxiliary_loss_mlp": 0.00199258, "balance_loss_clip": 1.23258793, "balance_loss_mlp": 0.16683304, "epoch": 0.350939425822937, "flos": 18004246208640.0, "grad_norm": 94.30165913685752, "language_loss": 0.85731959, "learning_rate": 3.0133489550154074e-06, "loss": 0.8743999, "num_input_tokens_seen": 125613980, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.32397461, "step": 5837, "time_per_iteration": 2.6494076251983643 }, { "auxiliary_loss_clip": 0.01510179, "auxiliary_loss_mlp": 0.00242485, "balance_loss_clip": 1.22746921, "balance_loss_mlp": 0.20922586, "epoch": 0.350999549075605, "flos": 22273198819200.0, "grad_norm": 47.5923821009948, "language_loss": 0.76175797, "learning_rate": 3.0130131663054442e-06, "loss": 0.7792846, "num_input_tokens_seen": 125632100, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.33239746, "step": 5838, "time_per_iteration": 2.677680730819702 }, { "auxiliary_loss_clip": 0.01489503, "auxiliary_loss_mlp": 0.00184986, "balance_loss_clip": 1.21836066, "balance_loss_mlp": 0.14950943, "epoch": 0.35105967232827295, "flos": 14392279307520.0, "grad_norm": 3.7660874424612287, "language_loss": 0.91749126, "learning_rate": 3.0126773391819215e-06, "loss": 0.93423617, "num_input_tokens_seen": 125649190, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.35473633, "step": 5839, "time_per_iteration": 2.6309454441070557 }, { "auxiliary_loss_clip": 0.01495712, "auxiliary_loss_mlp": 0.00233021, "balance_loss_clip": 1.21534646, "balance_loss_mlp": 0.19864115, "epoch": 0.3511197955809409, "flos": 25082346792960.0, "grad_norm": 7.830237553035223, "language_loss": 0.68513894, "learning_rate": 3.012341473657572e-06, "loss": 0.70242625, "num_input_tokens_seen": 125668680, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.34350586, "step": 5840, "time_per_iteration": 2.6954710483551025 }, { "auxiliary_loss_clip": 0.01495855, "auxiliary_loss_mlp": 0.00213708, "balance_loss_clip": 1.22218299, "balance_loss_mlp": 0.18004367, "epoch": 0.3511799188336089, "flos": 25884160139520.0, "grad_norm": 194.77949142691438, "language_loss": 0.95750022, "learning_rate": 3.0120055697451322e-06, "loss": 0.97459579, "num_input_tokens_seen": 125686935, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.33618164, "step": 5841, "time_per_iteration": 2.711456775665283 }, { "auxiliary_loss_clip": 0.01495523, "auxiliary_loss_mlp": 0.00248739, "balance_loss_clip": 1.21783924, "balance_loss_mlp": 0.21283305, "epoch": 0.35124004208627685, "flos": 20083725302400.0, "grad_norm": 48.88818409016856, "language_loss": 0.8396796, "learning_rate": 3.0116696274573406e-06, "loss": 0.85712218, "num_input_tokens_seen": 125707180, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.35888672, "step": 5842, "time_per_iteration": 2.7118382453918457 }, { "auxiliary_loss_clip": 0.0150143, "auxiliary_loss_mlp": 0.00219394, "balance_loss_clip": 1.22603846, "balance_loss_mlp": 0.18270169, "epoch": 0.3513001653389448, "flos": 17783431349760.0, "grad_norm": 6.156129794892344, "language_loss": 0.81603575, "learning_rate": 3.0113336468069346e-06, "loss": 0.83324403, "num_input_tokens_seen": 125722780, "router_z_loss_clip": 2.75585938, "router_z_loss_mlp": 0.36694336, "step": 5843, "time_per_iteration": 4.196788311004639 }, { "auxiliary_loss_clip": 0.01488592, "auxiliary_loss_mlp": 0.00217674, "balance_loss_clip": 1.21699536, "balance_loss_mlp": 0.18186307, "epoch": 0.3513602885916128, "flos": 29387138198400.0, "grad_norm": 11.96732825421378, "language_loss": 0.74234068, "learning_rate": 3.010997627806655e-06, "loss": 0.75940329, "num_input_tokens_seen": 125742110, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.3581543, "step": 5844, "time_per_iteration": 2.9514596462249756 }, { "auxiliary_loss_clip": 0.01490819, "auxiliary_loss_mlp": 0.0019977, "balance_loss_clip": 1.22015929, "balance_loss_mlp": 0.16539007, "epoch": 0.3514204118442808, "flos": 16179876483840.0, "grad_norm": 23.08384677847394, "language_loss": 0.84317678, "learning_rate": 3.010661570469245e-06, "loss": 0.86008269, "num_input_tokens_seen": 125759980, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.34399414, "step": 5845, "time_per_iteration": 2.74468994140625 }, { "auxiliary_loss_clip": 0.0148671, "auxiliary_loss_mlp": 0.00205763, "balance_loss_clip": 1.2197783, "balance_loss_mlp": 0.17324279, "epoch": 0.35148053509694877, "flos": 23834665923840.0, "grad_norm": 12.007083258298238, "language_loss": 0.82987881, "learning_rate": 3.0103254748074465e-06, "loss": 0.84680355, "num_input_tokens_seen": 125772660, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.32495117, "step": 5846, "time_per_iteration": 2.701140880584717 }, { "auxiliary_loss_clip": 0.01492805, "auxiliary_loss_mlp": 0.00220642, "balance_loss_clip": 1.21779823, "balance_loss_mlp": 0.18917096, "epoch": 0.35154065834961673, "flos": 20991295267200.0, "grad_norm": 6.347185183749801, "language_loss": 0.80332637, "learning_rate": 3.0099893408340046e-06, "loss": 0.82046092, "num_input_tokens_seen": 125791935, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.31469727, "step": 5847, "time_per_iteration": 2.7477598190307617 }, { "auxiliary_loss_clip": 0.01506446, "auxiliary_loss_mlp": 0.00198281, "balance_loss_clip": 1.23107219, "balance_loss_mlp": 0.16685733, "epoch": 0.3516007816022847, "flos": 33255471444480.0, "grad_norm": 1346.510300176404, "language_loss": 0.80288112, "learning_rate": 3.009653168561666e-06, "loss": 0.81992835, "num_input_tokens_seen": 125813455, "router_z_loss_clip": 2.75976562, "router_z_loss_mlp": 0.31420898, "step": 5848, "time_per_iteration": 2.8120625019073486 }, { "auxiliary_loss_clip": 0.01501003, "auxiliary_loss_mlp": 0.00216329, "balance_loss_clip": 1.22664976, "balance_loss_mlp": 0.18237816, "epoch": 0.35166090485495266, "flos": 11726953390080.0, "grad_norm": 6.806242205544046, "language_loss": 0.99939144, "learning_rate": 3.009316958003178e-06, "loss": 1.01656473, "num_input_tokens_seen": 125827660, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.33959961, "step": 5849, "time_per_iteration": 4.193632364273071 }, { "auxiliary_loss_clip": 0.01510966, "auxiliary_loss_mlp": 0.00205963, "balance_loss_clip": 1.23978472, "balance_loss_mlp": 0.17506403, "epoch": 0.3517210281076206, "flos": 22638446265600.0, "grad_norm": 118.00643526874818, "language_loss": 0.85088789, "learning_rate": 3.0089807091712897e-06, "loss": 0.86805713, "num_input_tokens_seen": 125846655, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.30908203, "step": 5850, "time_per_iteration": 2.6801109313964844 }, { "auxiliary_loss_clip": 0.01506362, "auxiliary_loss_mlp": 0.0020993, "balance_loss_clip": 1.23525476, "balance_loss_mlp": 0.17690951, "epoch": 0.3517811513602886, "flos": 21322750993920.0, "grad_norm": 8.745930273480166, "language_loss": 0.82474875, "learning_rate": 3.0086444220787515e-06, "loss": 0.84191167, "num_input_tokens_seen": 125866290, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.33007812, "step": 5851, "time_per_iteration": 2.638716459274292 }, { "auxiliary_loss_clip": 0.01523238, "auxiliary_loss_mlp": 0.00197316, "balance_loss_clip": 1.2503643, "balance_loss_mlp": 0.16548759, "epoch": 0.35184127461295656, "flos": 21032880238080.0, "grad_norm": 15.513811154796475, "language_loss": 0.95824856, "learning_rate": 3.0083080967383165e-06, "loss": 0.97545409, "num_input_tokens_seen": 125884620, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.31811523, "step": 5852, "time_per_iteration": 2.6389896869659424 }, { "auxiliary_loss_clip": 0.01527077, "auxiliary_loss_mlp": 0.00214051, "balance_loss_clip": 1.24984908, "balance_loss_mlp": 0.18095855, "epoch": 0.3519013978656245, "flos": 22455265881600.0, "grad_norm": 33.05630572846214, "language_loss": 0.76525581, "learning_rate": 3.007971733162737e-06, "loss": 0.7826671, "num_input_tokens_seen": 125902430, "router_z_loss_clip": 2.77148438, "router_z_loss_mlp": 0.33105469, "step": 5853, "time_per_iteration": 2.713188648223877 }, { "auxiliary_loss_clip": 0.01522301, "auxiliary_loss_mlp": 0.00213125, "balance_loss_clip": 1.24175906, "balance_loss_mlp": 0.17967477, "epoch": 0.3519615211182925, "flos": 13115295918720.0, "grad_norm": 4.434978413776267, "language_loss": 0.89082646, "learning_rate": 3.0076353313647686e-06, "loss": 0.90818077, "num_input_tokens_seen": 125920570, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.3347168, "step": 5854, "time_per_iteration": 2.6654839515686035 }, { "auxiliary_loss_clip": 0.01528573, "auxiliary_loss_mlp": 0.00188993, "balance_loss_clip": 1.25624585, "balance_loss_mlp": 0.15811799, "epoch": 0.35202164437096045, "flos": 19135144984320.0, "grad_norm": 6.088950865676143, "language_loss": 0.8012383, "learning_rate": 3.0072988913571666e-06, "loss": 0.81841397, "num_input_tokens_seen": 125939800, "router_z_loss_clip": 2.72265625, "router_z_loss_mlp": 0.30859375, "step": 5855, "time_per_iteration": 4.1006903648376465 }, { "auxiliary_loss_clip": 0.01509289, "auxiliary_loss_mlp": 0.00197747, "balance_loss_clip": 1.23767233, "balance_loss_mlp": 0.16587012, "epoch": 0.3520817676236284, "flos": 26542187343360.0, "grad_norm": 8.206888669266723, "language_loss": 0.79963672, "learning_rate": 3.006962413152691e-06, "loss": 0.81670702, "num_input_tokens_seen": 125958720, "router_z_loss_clip": 2.72265625, "router_z_loss_mlp": 0.31884766, "step": 5856, "time_per_iteration": 2.7310144901275635 }, { "auxiliary_loss_clip": 0.01537528, "auxiliary_loss_mlp": 0.00218346, "balance_loss_clip": 1.25418568, "balance_loss_mlp": 0.1853247, "epoch": 0.3521418908762964, "flos": 44893472803200.0, "grad_norm": 24.942118903064955, "language_loss": 0.69330537, "learning_rate": 3.0066258967640987e-06, "loss": 0.71086413, "num_input_tokens_seen": 125984310, "router_z_loss_clip": 2.83203125, "router_z_loss_mlp": 0.33032227, "step": 5857, "time_per_iteration": 2.960798740386963 }, { "auxiliary_loss_clip": 0.01528123, "auxiliary_loss_mlp": 0.0022685, "balance_loss_clip": 1.24635923, "balance_loss_mlp": 0.19539064, "epoch": 0.3522020141289644, "flos": 20187398931840.0, "grad_norm": 17.226276319749353, "language_loss": 0.80056202, "learning_rate": 3.006289342204152e-06, "loss": 0.81811178, "num_input_tokens_seen": 126002410, "router_z_loss_clip": 2.81835938, "router_z_loss_mlp": 0.31481934, "step": 5858, "time_per_iteration": 2.7337229251861572 }, { "auxiliary_loss_clip": 0.01531838, "auxiliary_loss_mlp": 0.00207318, "balance_loss_clip": 1.25277114, "balance_loss_mlp": 0.17646638, "epoch": 0.35226213738163237, "flos": 27563917708800.0, "grad_norm": 32.10512889216305, "language_loss": 0.83192825, "learning_rate": 3.0059527494856126e-06, "loss": 0.84931982, "num_input_tokens_seen": 126022490, "router_z_loss_clip": 2.78710938, "router_z_loss_mlp": 0.30834961, "step": 5859, "time_per_iteration": 2.7709572315216064 }, { "auxiliary_loss_clip": 0.01538089, "auxiliary_loss_mlp": 0.00243789, "balance_loss_clip": 1.25578547, "balance_loss_mlp": 0.21200824, "epoch": 0.35232226063430033, "flos": 22966310632320.0, "grad_norm": 27.652407880978856, "language_loss": 0.80359197, "learning_rate": 3.0056161186212435e-06, "loss": 0.82141072, "num_input_tokens_seen": 126042895, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.31762695, "step": 5860, "time_per_iteration": 2.7094807624816895 }, { "auxiliary_loss_clip": 0.01528858, "auxiliary_loss_mlp": 0.00225652, "balance_loss_clip": 1.24485064, "balance_loss_mlp": 0.19315507, "epoch": 0.3523823838869683, "flos": 19168290259200.0, "grad_norm": 2.5149409271444694, "language_loss": 0.8035028, "learning_rate": 3.005279449623811e-06, "loss": 0.8210479, "num_input_tokens_seen": 126060130, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.32495117, "step": 5861, "time_per_iteration": 2.7008743286132812 }, { "auxiliary_loss_clip": 0.01551766, "auxiliary_loss_mlp": 0.00215468, "balance_loss_clip": 1.27121079, "balance_loss_mlp": 0.18216069, "epoch": 0.35244250713963626, "flos": 17930988420480.0, "grad_norm": 5.041665823191844, "language_loss": 0.74959791, "learning_rate": 3.0049427425060815e-06, "loss": 0.76727021, "num_input_tokens_seen": 126077850, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.33325195, "step": 5862, "time_per_iteration": 2.6826417446136475 }, { "auxiliary_loss_clip": 0.01545658, "auxiliary_loss_mlp": 0.00250366, "balance_loss_clip": 1.26412499, "balance_loss_mlp": 0.2173216, "epoch": 0.35250263039230423, "flos": 21432529935360.0, "grad_norm": 271.92291673649606, "language_loss": 0.84381956, "learning_rate": 3.0046059972808215e-06, "loss": 0.86177981, "num_input_tokens_seen": 126095985, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.33081055, "step": 5863, "time_per_iteration": 2.7712810039520264 }, { "auxiliary_loss_clip": 0.0153974, "auxiliary_loss_mlp": 0.00229304, "balance_loss_clip": 1.25994825, "balance_loss_mlp": 0.19747508, "epoch": 0.3525627536449722, "flos": 27416863428480.0, "grad_norm": 24.67685669205483, "language_loss": 0.81271005, "learning_rate": 3.0042692139608024e-06, "loss": 0.83040047, "num_input_tokens_seen": 126116070, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.31835938, "step": 5864, "time_per_iteration": 2.7455577850341797 }, { "auxiliary_loss_clip": 0.01547252, "auxiliary_loss_mlp": 0.00216485, "balance_loss_clip": 1.2666415, "balance_loss_mlp": 0.18646784, "epoch": 0.35262287689764016, "flos": 24789818430720.0, "grad_norm": 3.5033826666673367, "language_loss": 0.89233088, "learning_rate": 3.003932392558793e-06, "loss": 0.90996826, "num_input_tokens_seen": 126135205, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.30029297, "step": 5865, "time_per_iteration": 2.900442600250244 }, { "auxiliary_loss_clip": 0.01557253, "auxiliary_loss_mlp": 0.00265463, "balance_loss_clip": 1.26746023, "balance_loss_mlp": 0.23163143, "epoch": 0.3526830001503081, "flos": 17821604528640.0, "grad_norm": 3.0238925018260696, "language_loss": 0.89306295, "learning_rate": 3.0035955330875677e-06, "loss": 0.91129017, "num_input_tokens_seen": 126151895, "router_z_loss_clip": 2.90234375, "router_z_loss_mlp": 0.33813477, "step": 5866, "time_per_iteration": 2.6215198040008545 }, { "auxiliary_loss_clip": 0.01540923, "auxiliary_loss_mlp": 0.00258244, "balance_loss_clip": 1.25042629, "balance_loss_mlp": 0.22283903, "epoch": 0.3527431234029761, "flos": 18078114528000.0, "grad_norm": 14.40635147565815, "language_loss": 0.93465424, "learning_rate": 3.0032586355598986e-06, "loss": 0.9526459, "num_input_tokens_seen": 126168515, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.35424805, "step": 5867, "time_per_iteration": 2.6193461418151855 }, { "auxiliary_loss_clip": 0.01551169, "auxiliary_loss_mlp": 0.00240178, "balance_loss_clip": 1.26684093, "balance_loss_mlp": 0.21088813, "epoch": 0.35280324665564405, "flos": 19427350124160.0, "grad_norm": 1348.2564886775306, "language_loss": 0.81130272, "learning_rate": 3.0029216999885613e-06, "loss": 0.82921618, "num_input_tokens_seen": 126186460, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.29309082, "step": 5868, "time_per_iteration": 2.6162021160125732 }, { "auxiliary_loss_clip": 0.01562742, "auxiliary_loss_mlp": 0.00229255, "balance_loss_clip": 1.27585173, "balance_loss_mlp": 0.19752173, "epoch": 0.352863369908312, "flos": 21504027957120.0, "grad_norm": 18.565821139703353, "language_loss": 0.71407843, "learning_rate": 3.0025847263863327e-06, "loss": 0.73199832, "num_input_tokens_seen": 126206170, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 0.31738281, "step": 5869, "time_per_iteration": 2.636399984359741 }, { "auxiliary_loss_clip": 0.01540381, "auxiliary_loss_mlp": 0.00234544, "balance_loss_clip": 1.25932312, "balance_loss_mlp": 0.20385987, "epoch": 0.35292349316098, "flos": 22309504490880.0, "grad_norm": 179.72877583884124, "language_loss": 0.83734149, "learning_rate": 3.0022477147659917e-06, "loss": 0.85509074, "num_input_tokens_seen": 126225605, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.3067627, "step": 5870, "time_per_iteration": 2.659522294998169 }, { "auxiliary_loss_clip": 0.01537413, "auxiliary_loss_mlp": 0.00243821, "balance_loss_clip": 1.26205564, "balance_loss_mlp": 0.21263549, "epoch": 0.352983616413648, "flos": 33109745967360.0, "grad_norm": 4.104322903160396, "language_loss": 0.77605247, "learning_rate": 3.001910665140316e-06, "loss": 0.79386485, "num_input_tokens_seen": 126250230, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.31201172, "step": 5871, "time_per_iteration": 2.868439197540283 }, { "auxiliary_loss_clip": 0.01540613, "auxiliary_loss_mlp": 0.00206025, "balance_loss_clip": 1.26738369, "balance_loss_mlp": 0.17853522, "epoch": 0.35304373966631597, "flos": 18696603836160.0, "grad_norm": 261.4402412486613, "language_loss": 0.81121475, "learning_rate": 3.0015735775220873e-06, "loss": 0.82868111, "num_input_tokens_seen": 126268315, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.27514648, "step": 5872, "time_per_iteration": 2.627357244491577 }, { "auxiliary_loss_clip": 0.01548381, "auxiliary_loss_mlp": 0.00212955, "balance_loss_clip": 1.27127004, "balance_loss_mlp": 0.18265221, "epoch": 0.35310386291898394, "flos": 23364954748800.0, "grad_norm": 41.98583573571079, "language_loss": 0.87685698, "learning_rate": 3.001236451924089e-06, "loss": 0.89447033, "num_input_tokens_seen": 126288390, "router_z_loss_clip": 2.77539062, "router_z_loss_mlp": 0.30322266, "step": 5873, "time_per_iteration": 2.6473228931427 }, { "auxiliary_loss_clip": 0.01555399, "auxiliary_loss_mlp": 0.00265296, "balance_loss_clip": 1.27073741, "balance_loss_mlp": 0.23501715, "epoch": 0.3531639861716519, "flos": 24461954064000.0, "grad_norm": 111.65463896716041, "language_loss": 0.75061727, "learning_rate": 3.000899288359104e-06, "loss": 0.76882422, "num_input_tokens_seen": 126305750, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.30249023, "step": 5874, "time_per_iteration": 2.6582789421081543 }, { "auxiliary_loss_clip": 0.01548153, "auxiliary_loss_mlp": 0.00093008, "balance_loss_clip": 1.36006117, "balance_loss_mlp": 0.08418636, "epoch": 0.35322410942431987, "flos": 70312446881280.0, "grad_norm": 0.8025998236146415, "language_loss": 0.61204904, "learning_rate": 3.000562086839917e-06, "loss": 0.62846065, "num_input_tokens_seen": 126362495, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.08837891, "step": 5875, "time_per_iteration": 3.0798707008361816 }, { "auxiliary_loss_clip": 0.01557035, "auxiliary_loss_mlp": 0.00253245, "balance_loss_clip": 1.27313447, "balance_loss_mlp": 0.22351415, "epoch": 0.35328423267698783, "flos": 19820894509440.0, "grad_norm": 23.428945231621206, "language_loss": 0.83997917, "learning_rate": 3.0002248473793163e-06, "loss": 0.85808194, "num_input_tokens_seen": 126378320, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.29699707, "step": 5876, "time_per_iteration": 2.634537696838379 }, { "auxiliary_loss_clip": 0.01502852, "auxiliary_loss_mlp": 0.00073255, "balance_loss_clip": 1.32432067, "balance_loss_mlp": 0.06424253, "epoch": 0.3533443559296558, "flos": 60826356391680.0, "grad_norm": 0.6788376982672064, "language_loss": 0.56679016, "learning_rate": 2.999887569990088e-06, "loss": 0.58255124, "num_input_tokens_seen": 126442735, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.09033203, "step": 5877, "time_per_iteration": 3.2287144660949707 }, { "auxiliary_loss_clip": 0.01544909, "auxiliary_loss_mlp": 0.0022195, "balance_loss_clip": 1.2634449, "balance_loss_mlp": 0.19157571, "epoch": 0.35340447918232376, "flos": 24755775315840.0, "grad_norm": 373.40445332385326, "language_loss": 0.81697005, "learning_rate": 2.999550254685024e-06, "loss": 0.8346386, "num_input_tokens_seen": 126463090, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.30371094, "step": 5878, "time_per_iteration": 2.6868772506713867 }, { "auxiliary_loss_clip": 0.01541398, "auxiliary_loss_mlp": 0.00246637, "balance_loss_clip": 1.259866, "balance_loss_mlp": 0.21807402, "epoch": 0.3534646024349917, "flos": 21796304924160.0, "grad_norm": 6.831848092345733, "language_loss": 0.85729015, "learning_rate": 2.9992129014769136e-06, "loss": 0.87517047, "num_input_tokens_seen": 126482105, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.28588867, "step": 5879, "time_per_iteration": 2.673164129257202 }, { "auxiliary_loss_clip": 0.01544228, "auxiliary_loss_mlp": 0.00271621, "balance_loss_clip": 1.25687146, "balance_loss_mlp": 0.24007834, "epoch": 0.3535247256876597, "flos": 20012119539840.0, "grad_norm": 5.895197069495591, "language_loss": 0.76413691, "learning_rate": 2.9988755103785493e-06, "loss": 0.78229541, "num_input_tokens_seen": 126502125, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.31542969, "step": 5880, "time_per_iteration": 2.6836390495300293 }, { "auxiliary_loss_clip": 0.0153904, "auxiliary_loss_mlp": 0.00272894, "balance_loss_clip": 1.25292325, "balance_loss_mlp": 0.2423286, "epoch": 0.35358484894032766, "flos": 18187929383040.0, "grad_norm": 19.702092018180075, "language_loss": 0.75804782, "learning_rate": 2.998538081402727e-06, "loss": 0.77616715, "num_input_tokens_seen": 126521950, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.30566406, "step": 5881, "time_per_iteration": 2.6667232513427734 }, { "auxiliary_loss_clip": 0.01539635, "auxiliary_loss_mlp": 0.00241575, "balance_loss_clip": 1.26371336, "balance_loss_mlp": 0.21524119, "epoch": 0.3536449721929956, "flos": 22820369673600.0, "grad_norm": 26.630480695242866, "language_loss": 0.81761354, "learning_rate": 2.998200614562239e-06, "loss": 0.83542562, "num_input_tokens_seen": 126542445, "router_z_loss_clip": 2.75976562, "router_z_loss_mlp": 0.26330566, "step": 5882, "time_per_iteration": 2.6696343421936035 }, { "auxiliary_loss_clip": 0.01545333, "auxiliary_loss_mlp": 0.00269124, "balance_loss_clip": 1.26142883, "balance_loss_mlp": 0.23803386, "epoch": 0.3537050954456636, "flos": 26432336574720.0, "grad_norm": 44.5138297520715, "language_loss": 0.76595086, "learning_rate": 2.9978631098698847e-06, "loss": 0.78409541, "num_input_tokens_seen": 126560690, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.31079102, "step": 5883, "time_per_iteration": 2.7396907806396484 }, { "auxiliary_loss_clip": 0.01526528, "auxiliary_loss_mlp": 0.00271174, "balance_loss_clip": 1.24145472, "balance_loss_mlp": 0.23936914, "epoch": 0.3537652186983316, "flos": 17197153562880.0, "grad_norm": 193.92812762771175, "language_loss": 0.86477208, "learning_rate": 2.9975255673384614e-06, "loss": 0.88274914, "num_input_tokens_seen": 126577620, "router_z_loss_clip": 2.85351562, "router_z_loss_mlp": 0.31787109, "step": 5884, "time_per_iteration": 2.644519090652466 }, { "auxiliary_loss_clip": 0.0152835, "auxiliary_loss_mlp": 0.00236569, "balance_loss_clip": 1.24823546, "balance_loss_mlp": 0.20934188, "epoch": 0.3538253419509996, "flos": 19536769929600.0, "grad_norm": 75.71696170026736, "language_loss": 0.81557679, "learning_rate": 2.9971879869807673e-06, "loss": 0.83322597, "num_input_tokens_seen": 126596235, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.27233887, "step": 5885, "time_per_iteration": 4.2806055545806885 }, { "auxiliary_loss_clip": 0.01525607, "auxiliary_loss_mlp": 0.0025277, "balance_loss_clip": 1.24441087, "balance_loss_mlp": 0.22396913, "epoch": 0.35388546520366754, "flos": 12128578335360.0, "grad_norm": 26.906172481899688, "language_loss": 0.94384611, "learning_rate": 2.996850368809606e-06, "loss": 0.96162993, "num_input_tokens_seen": 126612830, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.2878418, "step": 5886, "time_per_iteration": 2.648134231567383 }, { "auxiliary_loss_clip": 0.01517451, "auxiliary_loss_mlp": 0.00252816, "balance_loss_clip": 1.24711728, "balance_loss_mlp": 0.22287108, "epoch": 0.3539455884563355, "flos": 19678149861120.0, "grad_norm": 3.1122315115114896, "language_loss": 0.86311507, "learning_rate": 2.9965127128377787e-06, "loss": 0.88081777, "num_input_tokens_seen": 126630910, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.29907227, "step": 5887, "time_per_iteration": 2.7064616680145264 }, { "auxiliary_loss_clip": 0.01514738, "auxiliary_loss_mlp": 0.00247199, "balance_loss_clip": 1.24121308, "balance_loss_mlp": 0.21761124, "epoch": 0.35400571170900347, "flos": 18072045129600.0, "grad_norm": 8.399843582822472, "language_loss": 0.72722477, "learning_rate": 2.996175019078089e-06, "loss": 0.74484414, "num_input_tokens_seen": 126648365, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.2956543, "step": 5888, "time_per_iteration": 2.6676909923553467 }, { "auxiliary_loss_clip": 0.01538843, "auxiliary_loss_mlp": 0.00284451, "balance_loss_clip": 1.26168489, "balance_loss_mlp": 0.25479209, "epoch": 0.35406583496167143, "flos": 26068058795520.0, "grad_norm": 7.2004686391379655, "language_loss": 0.84418434, "learning_rate": 2.9958372875433437e-06, "loss": 0.86241734, "num_input_tokens_seen": 126667500, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.296875, "step": 5889, "time_per_iteration": 2.6549150943756104 }, { "auxiliary_loss_clip": 0.0155145, "auxiliary_loss_mlp": 0.0027363, "balance_loss_clip": 1.27395797, "balance_loss_mlp": 0.24380338, "epoch": 0.3541259582143394, "flos": 19792453916160.0, "grad_norm": 13.660839148494652, "language_loss": 0.90698481, "learning_rate": 2.9954995182463478e-06, "loss": 0.92523557, "num_input_tokens_seen": 126686820, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.29821777, "step": 5890, "time_per_iteration": 2.715224027633667 }, { "auxiliary_loss_clip": 0.01533069, "auxiliary_loss_mlp": 0.00242878, "balance_loss_clip": 1.26127028, "balance_loss_mlp": 0.21643695, "epoch": 0.35418608146700736, "flos": 24022084112640.0, "grad_norm": 12.527501025519555, "language_loss": 0.86834407, "learning_rate": 2.99516171119991e-06, "loss": 0.88610357, "num_input_tokens_seen": 126706965, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.26391602, "step": 5891, "time_per_iteration": 4.073111534118652 }, { "auxiliary_loss_clip": 0.01524487, "auxiliary_loss_mlp": 0.00282846, "balance_loss_clip": 1.25589299, "balance_loss_mlp": 0.25393769, "epoch": 0.35424620471967533, "flos": 12385770693120.0, "grad_norm": 5.573846170037341, "language_loss": 0.79510474, "learning_rate": 2.9948238664168415e-06, "loss": 0.81317806, "num_input_tokens_seen": 126724015, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.2890625, "step": 5892, "time_per_iteration": 2.628737449645996 }, { "auxiliary_loss_clip": 0.01523981, "auxiliary_loss_mlp": 0.00265495, "balance_loss_clip": 1.2545495, "balance_loss_mlp": 0.23678914, "epoch": 0.3543063279723433, "flos": 19673624747520.0, "grad_norm": 11.305287881590573, "language_loss": 0.75058526, "learning_rate": 2.9944859839099518e-06, "loss": 0.76848, "num_input_tokens_seen": 126737565, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.28674316, "step": 5893, "time_per_iteration": 2.665471076965332 }, { "auxiliary_loss_clip": 0.0153122, "auxiliary_loss_mlp": 0.0031037, "balance_loss_clip": 1.26058686, "balance_loss_mlp": 0.27928066, "epoch": 0.35436645122501126, "flos": 21909208348800.0, "grad_norm": 47.87487034370348, "language_loss": 0.75794637, "learning_rate": 2.9941480636920533e-06, "loss": 0.7763623, "num_input_tokens_seen": 126756095, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.31103516, "step": 5894, "time_per_iteration": 2.666548252105713 }, { "auxiliary_loss_clip": 0.01540474, "auxiliary_loss_mlp": 0.00235802, "balance_loss_clip": 1.27321768, "balance_loss_mlp": 0.20915824, "epoch": 0.3544265744776792, "flos": 21719527603200.0, "grad_norm": 7.714899172362084, "language_loss": 0.80217361, "learning_rate": 2.9938101057759615e-06, "loss": 0.81993639, "num_input_tokens_seen": 126775455, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.26611328, "step": 5895, "time_per_iteration": 2.64473032951355 }, { "auxiliary_loss_clip": 0.01530376, "auxiliary_loss_mlp": 0.00287995, "balance_loss_clip": 1.26146483, "balance_loss_mlp": 0.26050571, "epoch": 0.3544866977303472, "flos": 21213223447680.0, "grad_norm": 49.095140557841226, "language_loss": 0.90311062, "learning_rate": 2.993472110174491e-06, "loss": 0.92129433, "num_input_tokens_seen": 126792320, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.27478027, "step": 5896, "time_per_iteration": 2.6386702060699463 }, { "auxiliary_loss_clip": 0.01522569, "auxiliary_loss_mlp": 0.00275233, "balance_loss_clip": 1.25777912, "balance_loss_mlp": 0.24968643, "epoch": 0.35454682098301515, "flos": 29311402371840.0, "grad_norm": 2.5200787110755978, "language_loss": 0.77765691, "learning_rate": 2.9931340769004576e-06, "loss": 0.79563498, "num_input_tokens_seen": 126813680, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.25561523, "step": 5897, "time_per_iteration": 4.147968530654907 }, { "auxiliary_loss_clip": 0.01505761, "auxiliary_loss_mlp": 0.00272337, "balance_loss_clip": 1.24113894, "balance_loss_mlp": 0.24522921, "epoch": 0.3546069442356832, "flos": 24316587722880.0, "grad_norm": 8.963501027497745, "language_loss": 0.87302935, "learning_rate": 2.9927960059666816e-06, "loss": 0.89081031, "num_input_tokens_seen": 126834395, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.27087402, "step": 5898, "time_per_iteration": 2.7752861976623535 }, { "auxiliary_loss_clip": 0.01488334, "auxiliary_loss_mlp": 0.0027671, "balance_loss_clip": 1.2290833, "balance_loss_mlp": 0.24869597, "epoch": 0.35466706748835114, "flos": 22857285876480.0, "grad_norm": 20.492287434901915, "language_loss": 0.81563127, "learning_rate": 2.9924578973859804e-06, "loss": 0.8332817, "num_input_tokens_seen": 126855145, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.28015137, "step": 5899, "time_per_iteration": 2.6861534118652344 }, { "auxiliary_loss_clip": 0.01492157, "auxiliary_loss_mlp": 0.00252968, "balance_loss_clip": 1.22877693, "balance_loss_mlp": 0.22532293, "epoch": 0.3547271907410191, "flos": 28330107742080.0, "grad_norm": 3.5163163291122976, "language_loss": 0.87161309, "learning_rate": 2.9921197511711763e-06, "loss": 0.88906437, "num_input_tokens_seen": 126873790, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.27636719, "step": 5900, "time_per_iteration": 2.7171592712402344 }, { "auxiliary_loss_clip": 0.01497941, "auxiliary_loss_mlp": 0.00296316, "balance_loss_clip": 1.23404193, "balance_loss_mlp": 0.26689562, "epoch": 0.35478731399368707, "flos": 23514092017920.0, "grad_norm": 26.47977309138114, "language_loss": 0.87827921, "learning_rate": 2.991781567335093e-06, "loss": 0.89622176, "num_input_tokens_seen": 126892865, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.29443359, "step": 5901, "time_per_iteration": 2.653489351272583 }, { "auxiliary_loss_clip": 0.014964, "auxiliary_loss_mlp": 0.00278671, "balance_loss_clip": 1.22946513, "balance_loss_mlp": 0.25141972, "epoch": 0.35484743724635504, "flos": 18624315715200.0, "grad_norm": 12.298874308908644, "language_loss": 0.82703853, "learning_rate": 2.9914433458905525e-06, "loss": 0.84478927, "num_input_tokens_seen": 126911935, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.27258301, "step": 5902, "time_per_iteration": 2.7079484462738037 }, { "auxiliary_loss_clip": 0.01481276, "auxiliary_loss_mlp": 0.00267821, "balance_loss_clip": 1.22052467, "balance_loss_mlp": 0.242167, "epoch": 0.354907560499023, "flos": 17384499924480.0, "grad_norm": 1.9705534189789042, "language_loss": 0.79117811, "learning_rate": 2.991105086850381e-06, "loss": 0.80866909, "num_input_tokens_seen": 126930040, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.25610352, "step": 5903, "time_per_iteration": 2.698075771331787 }, { "auxiliary_loss_clip": 0.01478117, "auxiliary_loss_mlp": 0.00312131, "balance_loss_clip": 1.21451163, "balance_loss_mlp": 0.2839019, "epoch": 0.35496768375169097, "flos": 19208546426880.0, "grad_norm": 8.04838683859432, "language_loss": 0.84454334, "learning_rate": 2.9907667902274053e-06, "loss": 0.86244583, "num_input_tokens_seen": 126948390, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.28222656, "step": 5904, "time_per_iteration": 2.650801420211792 }, { "auxiliary_loss_clip": 0.01483079, "auxiliary_loss_mlp": 0.00277839, "balance_loss_clip": 1.22035289, "balance_loss_mlp": 0.24998006, "epoch": 0.35502780700435893, "flos": 18332792933760.0, "grad_norm": 7.349125307122319, "language_loss": 0.85888022, "learning_rate": 2.9904284560344536e-06, "loss": 0.8764894, "num_input_tokens_seen": 126964905, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.27893066, "step": 5905, "time_per_iteration": 2.675139904022217 }, { "auxiliary_loss_clip": 0.01488305, "auxiliary_loss_mlp": 0.00282415, "balance_loss_clip": 1.23508906, "balance_loss_mlp": 0.25539032, "epoch": 0.3550879302570269, "flos": 15448555578240.0, "grad_norm": 34.00735744856272, "language_loss": 0.77426964, "learning_rate": 2.990090084284356e-06, "loss": 0.79197681, "num_input_tokens_seen": 126982000, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.2701416, "step": 5906, "time_per_iteration": 2.637699604034424 }, { "auxiliary_loss_clip": 0.01481577, "auxiliary_loss_mlp": 0.003166, "balance_loss_clip": 1.21661854, "balance_loss_mlp": 0.28683341, "epoch": 0.35514805350969486, "flos": 21979197999360.0, "grad_norm": 6413.6022757304345, "language_loss": 0.82373464, "learning_rate": 2.9897516749899426e-06, "loss": 0.84171635, "num_input_tokens_seen": 126998390, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.29736328, "step": 5907, "time_per_iteration": 2.667628288269043 }, { "auxiliary_loss_clip": 0.01495828, "auxiliary_loss_mlp": 0.00292535, "balance_loss_clip": 1.23082876, "balance_loss_mlp": 0.26578468, "epoch": 0.3552081767623628, "flos": 29861949104640.0, "grad_norm": 10.775576147939987, "language_loss": 0.81064343, "learning_rate": 2.989413228164047e-06, "loss": 0.82852709, "num_input_tokens_seen": 127020220, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.26733398, "step": 5908, "time_per_iteration": 2.8001832962036133 }, { "auxiliary_loss_clip": 0.01490273, "auxiliary_loss_mlp": 0.00319109, "balance_loss_clip": 1.22918749, "balance_loss_mlp": 0.29089257, "epoch": 0.3552683000150308, "flos": 26432264747520.0, "grad_norm": 3.821765736810609, "language_loss": 0.76175368, "learning_rate": 2.989074743819502e-06, "loss": 0.7798475, "num_input_tokens_seen": 127038585, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.28210449, "step": 5909, "time_per_iteration": 2.8730227947235107 }, { "auxiliary_loss_clip": 0.01492003, "auxiliary_loss_mlp": 0.00299544, "balance_loss_clip": 1.23322582, "balance_loss_mlp": 0.27371129, "epoch": 0.35532842326769876, "flos": 19785989468160.0, "grad_norm": 7.041222316521886, "language_loss": 0.8666569, "learning_rate": 2.988736221969144e-06, "loss": 0.88457239, "num_input_tokens_seen": 127056215, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.25866699, "step": 5910, "time_per_iteration": 2.6895179748535156 }, { "auxiliary_loss_clip": 0.01502458, "auxiliary_loss_mlp": 0.00352541, "balance_loss_clip": 1.23262894, "balance_loss_mlp": 0.32080752, "epoch": 0.3553885465203668, "flos": 17239277237760.0, "grad_norm": 2.049628425571731, "language_loss": 0.80072427, "learning_rate": 2.98839766262581e-06, "loss": 0.81927431, "num_input_tokens_seen": 127075825, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.31738281, "step": 5911, "time_per_iteration": 2.6407647132873535 }, { "auxiliary_loss_clip": 0.01492864, "auxiliary_loss_mlp": 0.00318901, "balance_loss_clip": 1.23284876, "balance_loss_mlp": 0.29172164, "epoch": 0.35544866977303474, "flos": 14934350430720.0, "grad_norm": 46.99077634321067, "language_loss": 0.94998634, "learning_rate": 2.9880590658023366e-06, "loss": 0.96810395, "num_input_tokens_seen": 127091205, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.27185059, "step": 5912, "time_per_iteration": 2.5882568359375 }, { "auxiliary_loss_clip": 0.01499189, "auxiliary_loss_mlp": 0.003145, "balance_loss_clip": 1.23575735, "balance_loss_mlp": 0.28596175, "epoch": 0.3555087930257027, "flos": 19756040503680.0, "grad_norm": 7.2213600463361125, "language_loss": 0.84271538, "learning_rate": 2.9877204315115646e-06, "loss": 0.86085224, "num_input_tokens_seen": 127109210, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.28552246, "step": 5913, "time_per_iteration": 2.637784481048584 }, { "auxiliary_loss_clip": 0.01506802, "auxiliary_loss_mlp": 0.00315498, "balance_loss_clip": 1.24323249, "balance_loss_mlp": 0.28698277, "epoch": 0.3555689162783707, "flos": 21068252156160.0, "grad_norm": 2.2518926297441815, "language_loss": 0.86649996, "learning_rate": 2.9873817597663353e-06, "loss": 0.88472301, "num_input_tokens_seen": 127128400, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.28515625, "step": 5914, "time_per_iteration": 2.625899314880371 }, { "auxiliary_loss_clip": 0.01509382, "auxiliary_loss_mlp": 0.00303621, "balance_loss_clip": 1.24217045, "balance_loss_mlp": 0.27638179, "epoch": 0.35562903953103864, "flos": 33069633454080.0, "grad_norm": 142.43572518494807, "language_loss": 0.79166162, "learning_rate": 2.98704305057949e-06, "loss": 0.80979168, "num_input_tokens_seen": 127149965, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.27258301, "step": 5915, "time_per_iteration": 2.737635612487793 }, { "auxiliary_loss_clip": 0.015052, "auxiliary_loss_mlp": 0.00310532, "balance_loss_clip": 1.24310589, "balance_loss_mlp": 0.28099185, "epoch": 0.3556891627837066, "flos": 20557853850240.0, "grad_norm": 7.124030078357069, "language_loss": 0.8246547, "learning_rate": 2.9867043039638737e-06, "loss": 0.84281206, "num_input_tokens_seen": 127169865, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.2956543, "step": 5916, "time_per_iteration": 2.6383414268493652 }, { "auxiliary_loss_clip": 0.01522837, "auxiliary_loss_mlp": 0.0032505, "balance_loss_clip": 1.25394535, "balance_loss_mlp": 0.29751289, "epoch": 0.35574928603637457, "flos": 20703327932160.0, "grad_norm": 116.98635109976172, "language_loss": 0.95379001, "learning_rate": 2.986365519932332e-06, "loss": 0.97226888, "num_input_tokens_seen": 127188075, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.27539062, "step": 5917, "time_per_iteration": 2.6645665168762207 }, { "auxiliary_loss_clip": 0.0151482, "auxiliary_loss_mlp": 0.00305985, "balance_loss_clip": 1.24826586, "balance_loss_mlp": 0.27689826, "epoch": 0.35580940928904253, "flos": 15194595444480.0, "grad_norm": 7.318126587713542, "language_loss": 0.85146189, "learning_rate": 2.98602669849771e-06, "loss": 0.86966997, "num_input_tokens_seen": 127206065, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.29089355, "step": 5918, "time_per_iteration": 2.672731637954712 }, { "auxiliary_loss_clip": 0.01463757, "auxiliary_loss_mlp": 0.00105356, "balance_loss_clip": 1.26793814, "balance_loss_mlp": 0.09615277, "epoch": 0.3558695325417105, "flos": 58639145431680.0, "grad_norm": 30.011511032784316, "language_loss": 0.63664269, "learning_rate": 2.985687839672857e-06, "loss": 0.65233386, "num_input_tokens_seen": 127257885, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.09179688, "step": 5919, "time_per_iteration": 2.904599189758301 }, { "auxiliary_loss_clip": 0.01518291, "auxiliary_loss_mlp": 0.00327551, "balance_loss_clip": 1.24954569, "balance_loss_mlp": 0.29870194, "epoch": 0.35592965579437846, "flos": 22018233104640.0, "grad_norm": 13.752898540590467, "language_loss": 0.82794708, "learning_rate": 2.9853489434706223e-06, "loss": 0.84640545, "num_input_tokens_seen": 127275550, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.28881836, "step": 5920, "time_per_iteration": 2.6655349731445312 }, { "auxiliary_loss_clip": 0.0152425, "auxiliary_loss_mlp": 0.00325258, "balance_loss_clip": 1.25616241, "balance_loss_mlp": 0.29640907, "epoch": 0.35598977904704643, "flos": 23367684182400.0, "grad_norm": 15145.141574004392, "language_loss": 0.84300882, "learning_rate": 2.985010009903857e-06, "loss": 0.8615039, "num_input_tokens_seen": 127295110, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.28857422, "step": 5921, "time_per_iteration": 2.755725383758545 }, { "auxiliary_loss_clip": 0.01525137, "auxiliary_loss_mlp": 0.00286204, "balance_loss_clip": 1.25587773, "balance_loss_mlp": 0.25587749, "epoch": 0.3560499022997144, "flos": 17785334770560.0, "grad_norm": 26.24091261076865, "language_loss": 0.77522993, "learning_rate": 2.9846710389854133e-06, "loss": 0.79334342, "num_input_tokens_seen": 127312865, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.30297852, "step": 5922, "time_per_iteration": 2.7211203575134277 }, { "auxiliary_loss_clip": 0.01518421, "auxiliary_loss_mlp": 0.00292639, "balance_loss_clip": 1.25347948, "balance_loss_mlp": 0.26443449, "epoch": 0.35611002555238236, "flos": 20740459616640.0, "grad_norm": 41.33323798568489, "language_loss": 0.85017145, "learning_rate": 2.9843320307281454e-06, "loss": 0.86828208, "num_input_tokens_seen": 127331710, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.28186035, "step": 5923, "time_per_iteration": 2.7301626205444336 }, { "auxiliary_loss_clip": 0.01515931, "auxiliary_loss_mlp": 0.002989, "balance_loss_clip": 1.25306797, "balance_loss_mlp": 0.27038473, "epoch": 0.3561701488050504, "flos": 19462219251840.0, "grad_norm": 25.991058511060764, "language_loss": 0.90702826, "learning_rate": 2.983992985144908e-06, "loss": 0.92517662, "num_input_tokens_seen": 127350950, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.28515625, "step": 5924, "time_per_iteration": 2.684992551803589 }, { "auxiliary_loss_clip": 0.01528924, "auxiliary_loss_mlp": 0.00290892, "balance_loss_clip": 1.25990152, "balance_loss_mlp": 0.26273513, "epoch": 0.35623027205771834, "flos": 30774942023040.0, "grad_norm": 2.2945950548981013, "language_loss": 0.85980755, "learning_rate": 2.9836539022485578e-06, "loss": 0.87800574, "num_input_tokens_seen": 127369385, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.28112793, "step": 5925, "time_per_iteration": 2.750714063644409 }, { "auxiliary_loss_clip": 0.01510387, "auxiliary_loss_mlp": 0.00312726, "balance_loss_clip": 1.24365389, "balance_loss_mlp": 0.28254217, "epoch": 0.3562903953103863, "flos": 16981079299200.0, "grad_norm": 23.117762382951124, "language_loss": 0.83298767, "learning_rate": 2.9833147820519535e-06, "loss": 0.85121882, "num_input_tokens_seen": 127386965, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.30187988, "step": 5926, "time_per_iteration": 2.774214744567871 }, { "auxiliary_loss_clip": 0.01515393, "auxiliary_loss_mlp": 0.00285606, "balance_loss_clip": 1.24815214, "balance_loss_mlp": 0.25738871, "epoch": 0.3563505185630543, "flos": 23839837482240.0, "grad_norm": 16.390731639523615, "language_loss": 0.77688658, "learning_rate": 2.9829756245679544e-06, "loss": 0.7948966, "num_input_tokens_seen": 127406075, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.28210449, "step": 5927, "time_per_iteration": 4.455687522888184 }, { "auxiliary_loss_clip": 0.01512291, "auxiliary_loss_mlp": 0.00292783, "balance_loss_clip": 1.2481842, "balance_loss_mlp": 0.26590082, "epoch": 0.35641064181572224, "flos": 22273450214400.0, "grad_norm": 35.281872906752476, "language_loss": 0.85092533, "learning_rate": 2.9826364298094212e-06, "loss": 0.86897612, "num_input_tokens_seen": 127425350, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.26867676, "step": 5928, "time_per_iteration": 4.368957281112671 }, { "auxiliary_loss_clip": 0.01508821, "auxiliary_loss_mlp": 0.002834, "balance_loss_clip": 1.24678826, "balance_loss_mlp": 0.25416991, "epoch": 0.3564707650683902, "flos": 23001251587200.0, "grad_norm": 4.077640733454962, "language_loss": 0.88193625, "learning_rate": 2.982297197789215e-06, "loss": 0.89985847, "num_input_tokens_seen": 127446335, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.29223633, "step": 5929, "time_per_iteration": 2.848938226699829 }, { "auxiliary_loss_clip": 0.01509039, "auxiliary_loss_mlp": 0.0025333, "balance_loss_clip": 1.24648726, "balance_loss_mlp": 0.22618556, "epoch": 0.35653088832105817, "flos": 14684268965760.0, "grad_norm": 24.713996412558508, "language_loss": 0.78719914, "learning_rate": 2.981957928520201e-06, "loss": 0.8048228, "num_input_tokens_seen": 127462795, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.27124023, "step": 5930, "time_per_iteration": 2.703510046005249 }, { "auxiliary_loss_clip": 0.01508001, "auxiliary_loss_mlp": 0.00286483, "balance_loss_clip": 1.2392621, "balance_loss_mlp": 0.25555986, "epoch": 0.35659101157372614, "flos": 23477068074240.0, "grad_norm": 36.41178767717122, "language_loss": 0.75772732, "learning_rate": 2.981618622015244e-06, "loss": 0.7756722, "num_input_tokens_seen": 127482675, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.3092041, "step": 5931, "time_per_iteration": 2.8686869144439697 }, { "auxiliary_loss_clip": 0.01488728, "auxiliary_loss_mlp": 0.00247636, "balance_loss_clip": 1.23041344, "balance_loss_mlp": 0.22191074, "epoch": 0.3566511348263941, "flos": 26578672583040.0, "grad_norm": 3.629516985500712, "language_loss": 0.74720758, "learning_rate": 2.981279278287211e-06, "loss": 0.76457125, "num_input_tokens_seen": 127502275, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.25708008, "step": 5932, "time_per_iteration": 2.7950844764709473 }, { "auxiliary_loss_clip": 0.01511044, "auxiliary_loss_mlp": 0.00274373, "balance_loss_clip": 1.24824548, "balance_loss_mlp": 0.24738401, "epoch": 0.35671125807906207, "flos": 13115008609920.0, "grad_norm": 515.1582088038905, "language_loss": 0.89982677, "learning_rate": 2.980939897348969e-06, "loss": 0.91768092, "num_input_tokens_seen": 127520195, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.27001953, "step": 5933, "time_per_iteration": 4.237018823623657 }, { "auxiliary_loss_clip": 0.01497943, "auxiliary_loss_mlp": 0.00295381, "balance_loss_clip": 1.23597634, "balance_loss_mlp": 0.26600745, "epoch": 0.35677138133173003, "flos": 33000577557120.0, "grad_norm": 10.426071311582003, "language_loss": 0.75285125, "learning_rate": 2.980600479213388e-06, "loss": 0.7707845, "num_input_tokens_seen": 127544495, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.29370117, "step": 5934, "time_per_iteration": 2.848538637161255 }, { "auxiliary_loss_clip": 0.01504644, "auxiliary_loss_mlp": 0.00277204, "balance_loss_clip": 1.23815691, "balance_loss_mlp": 0.24721113, "epoch": 0.356831504584398, "flos": 20777842696320.0, "grad_norm": 16.83274762527767, "language_loss": 0.80124712, "learning_rate": 2.9802610238933384e-06, "loss": 0.81906563, "num_input_tokens_seen": 127563810, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.29992676, "step": 5935, "time_per_iteration": 2.8459441661834717 }, { "auxiliary_loss_clip": 0.01493347, "auxiliary_loss_mlp": 0.00275082, "balance_loss_clip": 1.23364365, "balance_loss_mlp": 0.24649569, "epoch": 0.35689162783706596, "flos": 12165566365440.0, "grad_norm": 11.033408738761537, "language_loss": 0.86630678, "learning_rate": 2.979921531401692e-06, "loss": 0.88399112, "num_input_tokens_seen": 127579065, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.28564453, "step": 5936, "time_per_iteration": 2.85355544090271 }, { "auxiliary_loss_clip": 0.01491076, "auxiliary_loss_mlp": 0.00272285, "balance_loss_clip": 1.22679257, "balance_loss_mlp": 0.24514139, "epoch": 0.356951751089734, "flos": 23841489507840.0, "grad_norm": 8.822662366218662, "language_loss": 0.72147751, "learning_rate": 2.9795820017513242e-06, "loss": 0.73911113, "num_input_tokens_seen": 127599105, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.27160645, "step": 5937, "time_per_iteration": 2.6971676349639893 }, { "auxiliary_loss_clip": 0.01491748, "auxiliary_loss_mlp": 0.00254721, "balance_loss_clip": 1.22994924, "balance_loss_mlp": 0.2283276, "epoch": 0.35701187434240195, "flos": 11722176881280.0, "grad_norm": 299.6733573780944, "language_loss": 0.89531302, "learning_rate": 2.9792424349551073e-06, "loss": 0.91277772, "num_input_tokens_seen": 127614940, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.26379395, "step": 5938, "time_per_iteration": 2.6800901889801025 }, { "auxiliary_loss_clip": 0.01499365, "auxiliary_loss_mlp": 0.00247942, "balance_loss_clip": 1.23788393, "balance_loss_mlp": 0.22226456, "epoch": 0.3570719975950699, "flos": 24898879100160.0, "grad_norm": 35.35227962056127, "language_loss": 0.864519, "learning_rate": 2.9789028310259202e-06, "loss": 0.88199198, "num_input_tokens_seen": 127634960, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.25671387, "step": 5939, "time_per_iteration": 4.156993865966797 }, { "auxiliary_loss_clip": 0.0148571, "auxiliary_loss_mlp": 0.00295593, "balance_loss_clip": 1.21911299, "balance_loss_mlp": 0.26582688, "epoch": 0.3571321208477379, "flos": 25994836920960.0, "grad_norm": 8.680842744578293, "language_loss": 0.87455285, "learning_rate": 2.9785631899766395e-06, "loss": 0.89236593, "num_input_tokens_seen": 127654545, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.29772949, "step": 5940, "time_per_iteration": 2.6442272663116455 }, { "auxiliary_loss_clip": 0.01493829, "auxiliary_loss_mlp": 0.00326047, "balance_loss_clip": 1.22956014, "balance_loss_mlp": 0.29254931, "epoch": 0.35719224410040584, "flos": 14501663199360.0, "grad_norm": 18.897314772315582, "language_loss": 0.80665338, "learning_rate": 2.9782235118201443e-06, "loss": 0.82485211, "num_input_tokens_seen": 127672320, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.33496094, "step": 5941, "time_per_iteration": 2.6456472873687744 }, { "auxiliary_loss_clip": 0.01500958, "auxiliary_loss_mlp": 0.00253751, "balance_loss_clip": 1.23690462, "balance_loss_mlp": 0.22751309, "epoch": 0.3572523673530738, "flos": 31175453646720.0, "grad_norm": 29.30628975145058, "language_loss": 0.76966882, "learning_rate": 2.9778837965693154e-06, "loss": 0.78721589, "num_input_tokens_seen": 127693315, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.2623291, "step": 5942, "time_per_iteration": 2.6903724670410156 }, { "auxiliary_loss_clip": 0.01492028, "auxiliary_loss_mlp": 0.0027162, "balance_loss_clip": 1.22857308, "balance_loss_mlp": 0.24370095, "epoch": 0.3573124906057418, "flos": 15851976203520.0, "grad_norm": 32.0714690750336, "language_loss": 0.82299042, "learning_rate": 2.9775440442370354e-06, "loss": 0.8406269, "num_input_tokens_seen": 127711570, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.27880859, "step": 5943, "time_per_iteration": 2.604304313659668 }, { "auxiliary_loss_clip": 0.01496204, "auxiliary_loss_mlp": 0.00088257, "balance_loss_clip": 1.30519748, "balance_loss_mlp": 0.07748039, "epoch": 0.35737261385840974, "flos": 60822729118080.0, "grad_norm": 0.7988660221409314, "language_loss": 0.60460901, "learning_rate": 2.9772042548361867e-06, "loss": 0.6204536, "num_input_tokens_seen": 127772475, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.10791016, "step": 5944, "time_per_iteration": 3.2205090522766113 }, { "auxiliary_loss_clip": 0.01493226, "auxiliary_loss_mlp": 0.00271535, "balance_loss_clip": 1.23329306, "balance_loss_mlp": 0.24228139, "epoch": 0.3574327371110777, "flos": 18843765857280.0, "grad_norm": 31.55290877374383, "language_loss": 0.78670406, "learning_rate": 2.976864428379655e-06, "loss": 0.80435169, "num_input_tokens_seen": 127790940, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.29260254, "step": 5945, "time_per_iteration": 2.6056289672851562 }, { "auxiliary_loss_clip": 0.01501848, "auxiliary_loss_mlp": 0.00269551, "balance_loss_clip": 1.24379051, "balance_loss_mlp": 0.24033329, "epoch": 0.35749286036374567, "flos": 23549679417600.0, "grad_norm": 2.982838781620009, "language_loss": 0.85974407, "learning_rate": 2.976524564880326e-06, "loss": 0.87745804, "num_input_tokens_seen": 127808275, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.29187012, "step": 5946, "time_per_iteration": 2.6423346996307373 }, { "auxiliary_loss_clip": 0.01502911, "auxiliary_loss_mlp": 0.00255238, "balance_loss_clip": 1.23779571, "balance_loss_mlp": 0.22781941, "epoch": 0.35755298361641363, "flos": 21105491581440.0, "grad_norm": 4.986826558578387, "language_loss": 0.7515285, "learning_rate": 2.9761846643510882e-06, "loss": 0.76911002, "num_input_tokens_seen": 127828840, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.27429199, "step": 5947, "time_per_iteration": 2.65560245513916 }, { "auxiliary_loss_clip": 0.01502512, "auxiliary_loss_mlp": 0.00249347, "balance_loss_clip": 1.24339724, "balance_loss_mlp": 0.22227435, "epoch": 0.3576131068690816, "flos": 19245031666560.0, "grad_norm": 1757.1775610805526, "language_loss": 0.81522727, "learning_rate": 2.9758447268048297e-06, "loss": 0.83274585, "num_input_tokens_seen": 127846240, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.27087402, "step": 5948, "time_per_iteration": 2.669339656829834 }, { "auxiliary_loss_clip": 0.01500312, "auxiliary_loss_mlp": 0.00276958, "balance_loss_clip": 1.24172616, "balance_loss_mlp": 0.2490395, "epoch": 0.35767323012174956, "flos": 28654703971200.0, "grad_norm": 2.3119903943017728, "language_loss": 0.80437452, "learning_rate": 2.9755047522544415e-06, "loss": 0.82214725, "num_input_tokens_seen": 127866880, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.27929688, "step": 5949, "time_per_iteration": 2.7328648567199707 }, { "auxiliary_loss_clip": 0.0150381, "auxiliary_loss_mlp": 0.00271933, "balance_loss_clip": 1.24194205, "balance_loss_mlp": 0.24407417, "epoch": 0.35773335337441753, "flos": 17085363459840.0, "grad_norm": 23.091905183980128, "language_loss": 0.84887576, "learning_rate": 2.9751647407128154e-06, "loss": 0.86663318, "num_input_tokens_seen": 127883560, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.27832031, "step": 5950, "time_per_iteration": 2.6642158031463623 }, { "auxiliary_loss_clip": 0.01499182, "auxiliary_loss_mlp": 0.00276289, "balance_loss_clip": 1.23775434, "balance_loss_mlp": 0.24770282, "epoch": 0.35779347662708555, "flos": 15888605097600.0, "grad_norm": 213.36166011452516, "language_loss": 0.81946564, "learning_rate": 2.9748246921928445e-06, "loss": 0.83722031, "num_input_tokens_seen": 127902330, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.28588867, "step": 5951, "time_per_iteration": 2.6741888523101807 }, { "auxiliary_loss_clip": 0.01505622, "auxiliary_loss_mlp": 0.00275568, "balance_loss_clip": 1.23940623, "balance_loss_mlp": 0.24515802, "epoch": 0.3578535998797535, "flos": 28658834035200.0, "grad_norm": 299.2720347104936, "language_loss": 0.79243314, "learning_rate": 2.9744846067074236e-06, "loss": 0.81024504, "num_input_tokens_seen": 127922325, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.30407715, "step": 5952, "time_per_iteration": 2.8111579418182373 }, { "auxiliary_loss_clip": 0.0150193, "auxiliary_loss_mlp": 0.00269252, "balance_loss_clip": 1.24150717, "balance_loss_mlp": 0.24126138, "epoch": 0.3579137231324215, "flos": 37852432076160.0, "grad_norm": 765.0293157213974, "language_loss": 0.75548947, "learning_rate": 2.974144484269449e-06, "loss": 0.77320129, "num_input_tokens_seen": 127942635, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.28015137, "step": 5953, "time_per_iteration": 2.8341546058654785 }, { "auxiliary_loss_clip": 0.01498873, "auxiliary_loss_mlp": 0.00273169, "balance_loss_clip": 1.23718941, "balance_loss_mlp": 0.24581094, "epoch": 0.35797384638508944, "flos": 22346851656960.0, "grad_norm": 2.5389744417506694, "language_loss": 0.72711557, "learning_rate": 2.9738043248918175e-06, "loss": 0.74483603, "num_input_tokens_seen": 127962520, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.2734375, "step": 5954, "time_per_iteration": 2.7336812019348145 }, { "auxiliary_loss_clip": 0.01500985, "auxiliary_loss_mlp": 0.00258639, "balance_loss_clip": 1.24401653, "balance_loss_mlp": 0.2316974, "epoch": 0.3580339696377574, "flos": 13589711775360.0, "grad_norm": 9.547787503987697, "language_loss": 0.83188701, "learning_rate": 2.9734641285874282e-06, "loss": 0.84948313, "num_input_tokens_seen": 127981180, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.26940918, "step": 5955, "time_per_iteration": 2.6379752159118652 }, { "auxiliary_loss_clip": 0.01494611, "auxiliary_loss_mlp": 0.00250133, "balance_loss_clip": 1.23729324, "balance_loss_mlp": 0.22298923, "epoch": 0.3580940928904254, "flos": 23768231719680.0, "grad_norm": 63.083960422439176, "language_loss": 0.83031559, "learning_rate": 2.973123895369182e-06, "loss": 0.84776306, "num_input_tokens_seen": 127999725, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.27124023, "step": 5956, "time_per_iteration": 2.688478469848633 }, { "auxiliary_loss_clip": 0.01490946, "auxiliary_loss_mlp": 0.00221116, "balance_loss_clip": 1.23601735, "balance_loss_mlp": 0.19626084, "epoch": 0.35815421614309334, "flos": 19463871277440.0, "grad_norm": 6.554604328044441, "language_loss": 0.79953784, "learning_rate": 2.9727836252499805e-06, "loss": 0.81665844, "num_input_tokens_seen": 128018885, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.24841309, "step": 5957, "time_per_iteration": 2.6196229457855225 }, { "auxiliary_loss_clip": 0.0150473, "auxiliary_loss_mlp": 0.00272438, "balance_loss_clip": 1.24173033, "balance_loss_mlp": 0.24566346, "epoch": 0.3582143393957613, "flos": 23368186972800.0, "grad_norm": 23.446359030780407, "language_loss": 0.82114965, "learning_rate": 2.972443318242726e-06, "loss": 0.83892131, "num_input_tokens_seen": 128037875, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.2677002, "step": 5958, "time_per_iteration": 2.6661384105682373 }, { "auxiliary_loss_clip": 0.0148663, "auxiliary_loss_mlp": 0.00242682, "balance_loss_clip": 1.2306273, "balance_loss_mlp": 0.21924575, "epoch": 0.35827446264842927, "flos": 26323275905280.0, "grad_norm": 1.7229376919654875, "language_loss": 0.9365077, "learning_rate": 2.972102974360324e-06, "loss": 0.95380086, "num_input_tokens_seen": 128056045, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.23425293, "step": 5959, "time_per_iteration": 2.7513198852539062 }, { "auxiliary_loss_clip": 0.01482245, "auxiliary_loss_mlp": 0.00215151, "balance_loss_clip": 1.22549295, "balance_loss_mlp": 0.18981871, "epoch": 0.35833458590109724, "flos": 30446610779520.0, "grad_norm": 6.090088655960469, "language_loss": 0.65056479, "learning_rate": 2.971762593615679e-06, "loss": 0.66753876, "num_input_tokens_seen": 128077815, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.25354004, "step": 5960, "time_per_iteration": 2.7185463905334473 }, { "auxiliary_loss_clip": 0.01473336, "auxiliary_loss_mlp": 0.00237406, "balance_loss_clip": 1.2193476, "balance_loss_mlp": 0.20982131, "epoch": 0.3583947091537652, "flos": 14829886702080.0, "grad_norm": 11.16089893860679, "language_loss": 0.85702682, "learning_rate": 2.9714221760216993e-06, "loss": 0.8741343, "num_input_tokens_seen": 128095460, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.27587891, "step": 5961, "time_per_iteration": 2.621490001678467 }, { "auxiliary_loss_clip": 0.01469958, "auxiliary_loss_mlp": 0.00256717, "balance_loss_clip": 1.21490967, "balance_loss_mlp": 0.22981113, "epoch": 0.35845483240643317, "flos": 34240644743040.0, "grad_norm": 19.03233312902429, "language_loss": 0.78469002, "learning_rate": 2.971081721591294e-06, "loss": 0.80195677, "num_input_tokens_seen": 128118605, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.26904297, "step": 5962, "time_per_iteration": 2.768618106842041 }, { "auxiliary_loss_clip": 0.01465514, "auxiliary_loss_mlp": 0.00233435, "balance_loss_clip": 1.21321177, "balance_loss_mlp": 0.20667244, "epoch": 0.35851495565910113, "flos": 20960089326720.0, "grad_norm": 8.948067264379377, "language_loss": 0.81682509, "learning_rate": 2.9707412303373716e-06, "loss": 0.83381462, "num_input_tokens_seen": 128139205, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.26757812, "step": 5963, "time_per_iteration": 2.66129469871521 }, { "auxiliary_loss_clip": 0.01471584, "auxiliary_loss_mlp": 0.00251421, "balance_loss_clip": 1.21500874, "balance_loss_mlp": 0.22377646, "epoch": 0.35857507891176915, "flos": 22309863626880.0, "grad_norm": 17.870409051344918, "language_loss": 0.86086494, "learning_rate": 2.9704007022728447e-06, "loss": 0.87809503, "num_input_tokens_seen": 128158765, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.27636719, "step": 5964, "time_per_iteration": 2.7011122703552246 }, { "auxiliary_loss_clip": 0.01470987, "auxiliary_loss_mlp": 0.00269969, "balance_loss_clip": 1.21473098, "balance_loss_mlp": 0.24408835, "epoch": 0.3586352021644371, "flos": 23367863750400.0, "grad_norm": 211.42828195324296, "language_loss": 0.74672979, "learning_rate": 2.970060137410626e-06, "loss": 0.76413935, "num_input_tokens_seen": 128177850, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.25878906, "step": 5965, "time_per_iteration": 2.649482250213623 }, { "auxiliary_loss_clip": 0.01459605, "auxiliary_loss_mlp": 0.00253339, "balance_loss_clip": 1.20793462, "balance_loss_mlp": 0.22718433, "epoch": 0.3586953254171051, "flos": 27849227437440.0, "grad_norm": 18.671993238716933, "language_loss": 0.85685104, "learning_rate": 2.9697195357636294e-06, "loss": 0.87398046, "num_input_tokens_seen": 128196925, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.26147461, "step": 5966, "time_per_iteration": 2.680037498474121 }, { "auxiliary_loss_clip": 0.01458894, "auxiliary_loss_mlp": 0.00275665, "balance_loss_clip": 1.20482135, "balance_loss_mlp": 0.24650636, "epoch": 0.35875544866977305, "flos": 19500500171520.0, "grad_norm": 420.1194610653278, "language_loss": 0.9859603, "learning_rate": 2.9693788973447715e-06, "loss": 1.00330591, "num_input_tokens_seen": 128213955, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.29174805, "step": 5967, "time_per_iteration": 2.653076648712158 }, { "auxiliary_loss_clip": 0.01466207, "auxiliary_loss_mlp": 0.0025569, "balance_loss_clip": 1.21094632, "balance_loss_mlp": 0.22734228, "epoch": 0.358815571922441, "flos": 21471134077440.0, "grad_norm": 7.498684555916693, "language_loss": 0.88701344, "learning_rate": 2.9690382221669682e-06, "loss": 0.9042325, "num_input_tokens_seen": 128232980, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.28369141, "step": 5968, "time_per_iteration": 2.6468262672424316 }, { "auxiliary_loss_clip": 0.01456793, "auxiliary_loss_mlp": 0.00254175, "balance_loss_clip": 1.20683146, "balance_loss_mlp": 0.22577976, "epoch": 0.358875695175109, "flos": 21835411856640.0, "grad_norm": 87.28617008129291, "language_loss": 0.91435999, "learning_rate": 2.9686975102431384e-06, "loss": 0.93146968, "num_input_tokens_seen": 128252795, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.28417969, "step": 5969, "time_per_iteration": 4.13720178604126 }, { "auxiliary_loss_clip": 0.01458056, "auxiliary_loss_mlp": 0.00247614, "balance_loss_clip": 1.20897341, "balance_loss_mlp": 0.22153111, "epoch": 0.35893581842777694, "flos": 32011633330560.0, "grad_norm": 271.8719445895624, "language_loss": 0.79646909, "learning_rate": 2.968356761586202e-06, "loss": 0.8135258, "num_input_tokens_seen": 128273115, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.26086426, "step": 5970, "time_per_iteration": 4.14414381980896 }, { "auxiliary_loss_clip": 0.01454417, "auxiliary_loss_mlp": 0.00244193, "balance_loss_clip": 1.20511758, "balance_loss_mlp": 0.21538004, "epoch": 0.3589959416804449, "flos": 20485817124480.0, "grad_norm": 4.321338247277519, "language_loss": 0.87330472, "learning_rate": 2.9680159762090805e-06, "loss": 0.8902908, "num_input_tokens_seen": 128292220, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.2878418, "step": 5971, "time_per_iteration": 2.7066855430603027 }, { "auxiliary_loss_clip": 0.01445554, "auxiliary_loss_mlp": 0.00223705, "balance_loss_clip": 1.19537544, "balance_loss_mlp": 0.19534475, "epoch": 0.3590560649331129, "flos": 16180666583040.0, "grad_norm": 3.0395966567516086, "language_loss": 0.88180196, "learning_rate": 2.967675154124696e-06, "loss": 0.89849454, "num_input_tokens_seen": 128310305, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.28356934, "step": 5972, "time_per_iteration": 2.683870315551758 }, { "auxiliary_loss_clip": 0.01441351, "auxiliary_loss_mlp": 0.00230422, "balance_loss_clip": 1.19439769, "balance_loss_mlp": 0.20286115, "epoch": 0.35911618818578084, "flos": 20375391738240.0, "grad_norm": 19.878774657511613, "language_loss": 0.87281334, "learning_rate": 2.9673342953459722e-06, "loss": 0.88953114, "num_input_tokens_seen": 128328305, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.27587891, "step": 5973, "time_per_iteration": 2.7409756183624268 }, { "auxiliary_loss_clip": 0.01477931, "auxiliary_loss_mlp": 0.00140551, "balance_loss_clip": 1.30160785, "balance_loss_mlp": 0.13206363, "epoch": 0.3591763114384488, "flos": 41236691685120.0, "grad_norm": 0.8982706403941807, "language_loss": 0.562617, "learning_rate": 2.9669933998858355e-06, "loss": 0.57880181, "num_input_tokens_seen": 128378380, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.08496094, "step": 5974, "time_per_iteration": 3.059684991836548 }, { "auxiliary_loss_clip": 0.0144937, "auxiliary_loss_mlp": 0.00235933, "balance_loss_clip": 1.20262969, "balance_loss_mlp": 0.2057137, "epoch": 0.35923643469111677, "flos": 18695454600960.0, "grad_norm": 2.8182231748901643, "language_loss": 0.75663054, "learning_rate": 2.9666524677572114e-06, "loss": 0.77348363, "num_input_tokens_seen": 128394315, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.30212402, "step": 5975, "time_per_iteration": 4.127224922180176 }, { "auxiliary_loss_clip": 0.01437343, "auxiliary_loss_mlp": 0.00268085, "balance_loss_clip": 1.1963861, "balance_loss_mlp": 0.23769836, "epoch": 0.35929655794378473, "flos": 25009950931200.0, "grad_norm": 750.8530245636255, "language_loss": 0.85507011, "learning_rate": 2.96631149897303e-06, "loss": 0.87212443, "num_input_tokens_seen": 128414515, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.30383301, "step": 5976, "time_per_iteration": 2.6835696697235107 }, { "auxiliary_loss_clip": 0.01442306, "auxiliary_loss_mlp": 0.00272572, "balance_loss_clip": 1.19795275, "balance_loss_mlp": 0.24360384, "epoch": 0.35935668119645275, "flos": 14975576265600.0, "grad_norm": 2.2526362747170663, "language_loss": 0.86361778, "learning_rate": 2.9659704935462194e-06, "loss": 0.88076663, "num_input_tokens_seen": 128430615, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.28967285, "step": 5977, "time_per_iteration": 2.616544246673584 }, { "auxiliary_loss_clip": 0.01437029, "auxiliary_loss_mlp": 0.0032582, "balance_loss_clip": 1.19657159, "balance_loss_mlp": 0.29681674, "epoch": 0.3594168044491207, "flos": 21178138838400.0, "grad_norm": 2.537868397864208, "language_loss": 0.87699997, "learning_rate": 2.9656294514897102e-06, "loss": 0.89462841, "num_input_tokens_seen": 128449480, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.28991699, "step": 5978, "time_per_iteration": 2.6337997913360596 }, { "auxiliary_loss_clip": 0.01456825, "auxiliary_loss_mlp": 0.00443473, "balance_loss_clip": 1.20464134, "balance_loss_mlp": 0.40911663, "epoch": 0.3594769277017887, "flos": 27672152365440.0, "grad_norm": 30.32994243477531, "language_loss": 0.73066759, "learning_rate": 2.965288372816436e-06, "loss": 0.74967062, "num_input_tokens_seen": 128471465, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.34350586, "step": 5979, "time_per_iteration": 2.7411305904388428 }, { "auxiliary_loss_clip": 0.01447921, "auxiliary_loss_mlp": 0.00409836, "balance_loss_clip": 1.20448589, "balance_loss_mlp": 0.3793422, "epoch": 0.35953705095445665, "flos": 23002328995200.0, "grad_norm": 14.607376722504364, "language_loss": 0.76661503, "learning_rate": 2.9649472575393296e-06, "loss": 0.78519261, "num_input_tokens_seen": 128490645, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.30541992, "step": 5980, "time_per_iteration": 2.782780408859253 }, { "auxiliary_loss_clip": 0.01447622, "auxiliary_loss_mlp": 0.00446961, "balance_loss_clip": 1.19738662, "balance_loss_mlp": 0.41374975, "epoch": 0.3595971742071246, "flos": 25513992529920.0, "grad_norm": 3.5933004206174735, "language_loss": 0.77797341, "learning_rate": 2.964606105671327e-06, "loss": 0.79691929, "num_input_tokens_seen": 128510225, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.33203125, "step": 5981, "time_per_iteration": 4.209684610366821 }, { "auxiliary_loss_clip": 0.01457606, "auxiliary_loss_mlp": 0.00478424, "balance_loss_clip": 1.21102262, "balance_loss_mlp": 0.44189823, "epoch": 0.3596572974597926, "flos": 29862559635840.0, "grad_norm": 49.57559604493387, "language_loss": 0.78031814, "learning_rate": 2.9642649172253635e-06, "loss": 0.79967839, "num_input_tokens_seen": 128530195, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.36499023, "step": 5982, "time_per_iteration": 2.779730796813965 }, { "auxiliary_loss_clip": 0.01467411, "auxiliary_loss_mlp": 0.004891, "balance_loss_clip": 1.22065353, "balance_loss_mlp": 0.45612708, "epoch": 0.35971742071246054, "flos": 23112538899840.0, "grad_norm": 24.50571184040137, "language_loss": 0.83675921, "learning_rate": 2.9639236922143786e-06, "loss": 0.85632432, "num_input_tokens_seen": 128549990, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.32983398, "step": 5983, "time_per_iteration": 2.6947379112243652 }, { "auxiliary_loss_clip": 0.01466421, "auxiliary_loss_mlp": 0.00580815, "balance_loss_clip": 1.21533537, "balance_loss_mlp": 0.54421794, "epoch": 0.3597775439651285, "flos": 16725359399040.0, "grad_norm": 6.85544339058106, "language_loss": 0.83839709, "learning_rate": 2.96358243065131e-06, "loss": 0.85886943, "num_input_tokens_seen": 128567925, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.3659668, "step": 5984, "time_per_iteration": 2.7049572467803955 }, { "auxiliary_loss_clip": 0.01475383, "auxiliary_loss_mlp": 0.00504354, "balance_loss_clip": 1.22670627, "balance_loss_mlp": 0.46842414, "epoch": 0.3598376672177965, "flos": 19719483436800.0, "grad_norm": 10.35794576928377, "language_loss": 0.93930376, "learning_rate": 2.9632411325490993e-06, "loss": 0.95910108, "num_input_tokens_seen": 128585655, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.359375, "step": 5985, "time_per_iteration": 2.756159782409668 }, { "auxiliary_loss_clip": 0.01466476, "auxiliary_loss_mlp": 0.00606955, "balance_loss_clip": 1.22168982, "balance_loss_mlp": 0.56914186, "epoch": 0.35989779047046444, "flos": 17311529445120.0, "grad_norm": 10.913314829950103, "language_loss": 0.77984178, "learning_rate": 2.9628997979206884e-06, "loss": 0.80057609, "num_input_tokens_seen": 128604820, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.37817383, "step": 5986, "time_per_iteration": 2.6435656547546387 }, { "auxiliary_loss_clip": 0.01478391, "auxiliary_loss_mlp": 0.00681699, "balance_loss_clip": 1.22277117, "balance_loss_mlp": 0.6417399, "epoch": 0.3599579137231324, "flos": 22711237176960.0, "grad_norm": 616.0371758615881, "language_loss": 0.8111676, "learning_rate": 2.9625584267790204e-06, "loss": 0.8327685, "num_input_tokens_seen": 128623070, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.39941406, "step": 5987, "time_per_iteration": 2.627520799636841 }, { "auxiliary_loss_clip": 0.01475145, "auxiliary_loss_mlp": 0.00569503, "balance_loss_clip": 1.22474849, "balance_loss_mlp": 0.53133243, "epoch": 0.36001803697580037, "flos": 20959873845120.0, "grad_norm": 7.74713646479869, "language_loss": 0.79452705, "learning_rate": 2.9622170191370404e-06, "loss": 0.81497353, "num_input_tokens_seen": 128642430, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.38183594, "step": 5988, "time_per_iteration": 2.669175386428833 }, { "auxiliary_loss_clip": 0.01484149, "auxiliary_loss_mlp": 0.00576155, "balance_loss_clip": 1.22833347, "balance_loss_mlp": 0.53920054, "epoch": 0.36007816022846834, "flos": 20485565729280.0, "grad_norm": 2.9823452109621846, "language_loss": 0.7886796, "learning_rate": 2.9618755750076953e-06, "loss": 0.80928266, "num_input_tokens_seen": 128661285, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.36938477, "step": 5989, "time_per_iteration": 2.651118040084839 }, { "auxiliary_loss_clip": 0.01485887, "auxiliary_loss_mlp": 0.00608664, "balance_loss_clip": 1.23681366, "balance_loss_mlp": 0.56977773, "epoch": 0.36013828348113636, "flos": 28001237794560.0, "grad_norm": 44.114346120020464, "language_loss": 0.85283935, "learning_rate": 2.961534094403931e-06, "loss": 0.87378484, "num_input_tokens_seen": 128682210, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.38891602, "step": 5990, "time_per_iteration": 2.6938037872314453 }, { "auxiliary_loss_clip": 0.01506956, "auxiliary_loss_mlp": 0.00622558, "balance_loss_clip": 1.24646914, "balance_loss_mlp": 0.58510262, "epoch": 0.3601984067338043, "flos": 20082181017600.0, "grad_norm": 4.432396963158662, "language_loss": 0.9016664, "learning_rate": 2.961192577338698e-06, "loss": 0.92296159, "num_input_tokens_seen": 128700445, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.37451172, "step": 5991, "time_per_iteration": 2.623351573944092 }, { "auxiliary_loss_clip": 0.01501121, "auxiliary_loss_mlp": 0.00651128, "balance_loss_clip": 1.24173498, "balance_loss_mlp": 0.61071575, "epoch": 0.3602585299864723, "flos": 18617599872000.0, "grad_norm": 130.53086995873204, "language_loss": 0.83211821, "learning_rate": 2.9608510238249463e-06, "loss": 0.85364068, "num_input_tokens_seen": 128716855, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.40405273, "step": 5992, "time_per_iteration": 2.66521954536438 }, { "auxiliary_loss_clip": 0.01509263, "auxiliary_loss_mlp": 0.00588935, "balance_loss_clip": 1.25550175, "balance_loss_mlp": 0.55348194, "epoch": 0.36031865323914025, "flos": 19573003774080.0, "grad_norm": 10.267843307743744, "language_loss": 0.83855551, "learning_rate": 2.960509433875627e-06, "loss": 0.85953748, "num_input_tokens_seen": 128735835, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.35473633, "step": 5993, "time_per_iteration": 2.697218179702759 }, { "auxiliary_loss_clip": 0.01525208, "auxiliary_loss_mlp": 0.00612614, "balance_loss_clip": 1.26287818, "balance_loss_mlp": 0.57501519, "epoch": 0.3603787764918082, "flos": 17490615678720.0, "grad_norm": 2.9915786741589985, "language_loss": 0.82189655, "learning_rate": 2.9601678075036943e-06, "loss": 0.84327483, "num_input_tokens_seen": 128752465, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.37597656, "step": 5994, "time_per_iteration": 2.63128924369812 }, { "auxiliary_loss_clip": 0.01525728, "auxiliary_loss_mlp": 0.00522424, "balance_loss_clip": 1.26028979, "balance_loss_mlp": 0.48670846, "epoch": 0.3604388997444762, "flos": 15523393564800.0, "grad_norm": 7.152655347051845, "language_loss": 0.77152443, "learning_rate": 2.9598261447221024e-06, "loss": 0.7920059, "num_input_tokens_seen": 128770865, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.35717773, "step": 5995, "time_per_iteration": 2.759296178817749 }, { "auxiliary_loss_clip": 0.01523572, "auxiliary_loss_mlp": 0.00566933, "balance_loss_clip": 1.25840259, "balance_loss_mlp": 0.52663982, "epoch": 0.36049902299714415, "flos": 17310883000320.0, "grad_norm": 17.25714603952194, "language_loss": 0.89827234, "learning_rate": 2.9594844455438057e-06, "loss": 0.91917741, "num_input_tokens_seen": 128789730, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.40258789, "step": 5996, "time_per_iteration": 2.673665761947632 }, { "auxiliary_loss_clip": 0.01548599, "auxiliary_loss_mlp": 0.0048644, "balance_loss_clip": 1.28160429, "balance_loss_mlp": 0.45012897, "epoch": 0.3605591462498121, "flos": 17056025026560.0, "grad_norm": 5.234161895306228, "language_loss": 0.79485351, "learning_rate": 2.959142709981763e-06, "loss": 0.81520391, "num_input_tokens_seen": 128806610, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.36303711, "step": 5997, "time_per_iteration": 2.7516837120056152 }, { "auxiliary_loss_clip": 0.01571245, "auxiliary_loss_mlp": 0.00477125, "balance_loss_clip": 1.30147147, "balance_loss_mlp": 0.44317403, "epoch": 0.3606192695024801, "flos": 16836862193280.0, "grad_norm": 12.324358754249818, "language_loss": 0.79117143, "learning_rate": 2.9588009380489337e-06, "loss": 0.8116551, "num_input_tokens_seen": 128824830, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.33959961, "step": 5998, "time_per_iteration": 2.7042171955108643 }, { "auxiliary_loss_clip": 0.0157524, "auxiliary_loss_mlp": 0.00513208, "balance_loss_clip": 1.30546796, "balance_loss_mlp": 0.47849444, "epoch": 0.36067939275514804, "flos": 12129655743360.0, "grad_norm": 23.314273873431844, "language_loss": 0.86643004, "learning_rate": 2.9584591297582758e-06, "loss": 0.88731456, "num_input_tokens_seen": 128838170, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.34692383, "step": 5999, "time_per_iteration": 2.7084171772003174 }, { "auxiliary_loss_clip": 0.01571568, "auxiliary_loss_mlp": 0.00486167, "balance_loss_clip": 1.29719996, "balance_loss_mlp": 0.44809139, "epoch": 0.360739516007816, "flos": 18041449720320.0, "grad_norm": 13.791425027694288, "language_loss": 0.85908961, "learning_rate": 2.9581172851227516e-06, "loss": 0.87966692, "num_input_tokens_seen": 128855625, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.38061523, "step": 6000, "time_per_iteration": 2.616788148880005 }, { "auxiliary_loss_clip": 0.01571421, "auxiliary_loss_mlp": 0.00457672, "balance_loss_clip": 1.30142689, "balance_loss_mlp": 0.41971558, "epoch": 0.360799639260484, "flos": 18549800951040.0, "grad_norm": 7.272441247229841, "language_loss": 0.84922516, "learning_rate": 2.9577754041553243e-06, "loss": 0.86951607, "num_input_tokens_seen": 128873540, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.37963867, "step": 6001, "time_per_iteration": 2.6695752143859863 }, { "auxiliary_loss_clip": 0.01587504, "auxiliary_loss_mlp": 0.0045106, "balance_loss_clip": 1.31589615, "balance_loss_mlp": 0.41234055, "epoch": 0.36085976251315194, "flos": 19682028529920.0, "grad_norm": 9.033496741404514, "language_loss": 0.90491855, "learning_rate": 2.9574334868689575e-06, "loss": 0.92530417, "num_input_tokens_seen": 128889925, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.38720703, "step": 6002, "time_per_iteration": 2.718067169189453 }, { "auxiliary_loss_clip": 0.01600011, "auxiliary_loss_mlp": 0.00405641, "balance_loss_clip": 1.32606959, "balance_loss_mlp": 0.37321591, "epoch": 0.3609198857658199, "flos": 24198943703040.0, "grad_norm": 121.66778833329406, "language_loss": 0.96084952, "learning_rate": 2.9570915332766165e-06, "loss": 0.98090613, "num_input_tokens_seen": 128906890, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.32421875, "step": 6003, "time_per_iteration": 2.6218903064727783 }, { "auxiliary_loss_clip": 0.01623032, "auxiliary_loss_mlp": 0.00101253, "balance_loss_clip": 1.41944599, "balance_loss_mlp": 0.08995207, "epoch": 0.3609800090184879, "flos": 57115995160320.0, "grad_norm": 0.8881867025641624, "language_loss": 0.53644717, "learning_rate": 2.9567495433912693e-06, "loss": 0.55369008, "num_input_tokens_seen": 128965940, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.11279297, "step": 6004, "time_per_iteration": 3.062074899673462 }, { "auxiliary_loss_clip": 0.01571227, "auxiliary_loss_mlp": 0.00461396, "balance_loss_clip": 1.29589438, "balance_loss_mlp": 0.4240123, "epoch": 0.3610401322711559, "flos": 20811239366400.0, "grad_norm": 19.25488589458657, "language_loss": 0.85258913, "learning_rate": 2.956407517225883e-06, "loss": 0.87291533, "num_input_tokens_seen": 128985835, "router_z_loss_clip": 2.75976562, "router_z_loss_mlp": 0.3737793, "step": 6005, "time_per_iteration": 2.6486454010009766 }, { "auxiliary_loss_clip": 0.01561748, "auxiliary_loss_mlp": 0.00411312, "balance_loss_clip": 1.2900908, "balance_loss_mlp": 0.37912515, "epoch": 0.36110025552382385, "flos": 13699167494400.0, "grad_norm": 2.392218970734875, "language_loss": 0.85937792, "learning_rate": 2.956065454793429e-06, "loss": 0.87910855, "num_input_tokens_seen": 129003120, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.32202148, "step": 6006, "time_per_iteration": 2.5957164764404297 }, { "auxiliary_loss_clip": 0.01582305, "auxiliary_loss_mlp": 0.00393469, "balance_loss_clip": 1.30115807, "balance_loss_mlp": 0.35932714, "epoch": 0.3611603787764918, "flos": 22455014486400.0, "grad_norm": 11.576546223568087, "language_loss": 0.91087222, "learning_rate": 2.955723356106876e-06, "loss": 0.93062997, "num_input_tokens_seen": 129021645, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.34130859, "step": 6007, "time_per_iteration": 2.6875240802764893 }, { "auxiliary_loss_clip": 0.01575331, "auxiliary_loss_mlp": 0.00409532, "balance_loss_clip": 1.29301703, "balance_loss_mlp": 0.37302965, "epoch": 0.3612205020291598, "flos": 20886651970560.0, "grad_norm": 30.80999999615213, "language_loss": 0.82335556, "learning_rate": 2.955381221179198e-06, "loss": 0.8432042, "num_input_tokens_seen": 129038375, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.36523438, "step": 6008, "time_per_iteration": 2.6471405029296875 }, { "auxiliary_loss_clip": 0.01550504, "auxiliary_loss_mlp": 0.00401012, "balance_loss_clip": 1.27660906, "balance_loss_mlp": 0.36682263, "epoch": 0.36128062528182775, "flos": 15741981780480.0, "grad_norm": 16.76778529119698, "language_loss": 0.90638536, "learning_rate": 2.955039050023368e-06, "loss": 0.92590046, "num_input_tokens_seen": 129056235, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.34204102, "step": 6009, "time_per_iteration": 2.627875566482544 }, { "auxiliary_loss_clip": 0.01566299, "auxiliary_loss_mlp": 0.00418733, "balance_loss_clip": 1.28883493, "balance_loss_mlp": 0.38380447, "epoch": 0.3613407485344957, "flos": 16764502245120.0, "grad_norm": 41.46415358412408, "language_loss": 0.83563399, "learning_rate": 2.954696842652362e-06, "loss": 0.85548437, "num_input_tokens_seen": 129072405, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.34887695, "step": 6010, "time_per_iteration": 2.632220983505249 }, { "auxiliary_loss_clip": 0.01560358, "auxiliary_loss_mlp": 0.00385214, "balance_loss_clip": 1.28236091, "balance_loss_mlp": 0.35076249, "epoch": 0.3614008717871637, "flos": 20371189847040.0, "grad_norm": 182.62770585650384, "language_loss": 0.88974273, "learning_rate": 2.9543545990791554e-06, "loss": 0.9091984, "num_input_tokens_seen": 129090225, "router_z_loss_clip": 2.78320312, "router_z_loss_mlp": 0.34448242, "step": 6011, "time_per_iteration": 2.7324817180633545 }, { "auxiliary_loss_clip": 0.01557919, "auxiliary_loss_mlp": 0.003927, "balance_loss_clip": 1.27881765, "balance_loss_mlp": 0.3560794, "epoch": 0.36146099503983165, "flos": 22776665800320.0, "grad_norm": 10.895251530840905, "language_loss": 0.69909334, "learning_rate": 2.954012319316727e-06, "loss": 0.71859956, "num_input_tokens_seen": 129107685, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.36621094, "step": 6012, "time_per_iteration": 5.611877679824829 }, { "auxiliary_loss_clip": 0.01565606, "auxiliary_loss_mlp": 0.00422277, "balance_loss_clip": 1.28958988, "balance_loss_mlp": 0.38708678, "epoch": 0.3615211182924996, "flos": 22996654646400.0, "grad_norm": 10.04041095155924, "language_loss": 0.88257194, "learning_rate": 2.9536700033780565e-06, "loss": 0.90245074, "num_input_tokens_seen": 129125315, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.35180664, "step": 6013, "time_per_iteration": 2.6535143852233887 }, { "auxiliary_loss_clip": 0.01574581, "auxiliary_loss_mlp": 0.0037804, "balance_loss_clip": 1.29443622, "balance_loss_mlp": 0.3451618, "epoch": 0.3615812415451676, "flos": 16648079287680.0, "grad_norm": 10.629891709509527, "language_loss": 0.96783549, "learning_rate": 2.9533276512761228e-06, "loss": 0.98736161, "num_input_tokens_seen": 129141600, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.32861328, "step": 6014, "time_per_iteration": 2.6206252574920654 }, { "auxiliary_loss_clip": 0.01566284, "auxiliary_loss_mlp": 0.00345777, "balance_loss_clip": 1.29080331, "balance_loss_mlp": 0.31430614, "epoch": 0.36164136479783554, "flos": 21320093387520.0, "grad_norm": 2.25173743572339, "language_loss": 0.82212436, "learning_rate": 2.95298526302391e-06, "loss": 0.84124494, "num_input_tokens_seen": 129160665, "router_z_loss_clip": 2.75976562, "router_z_loss_mlp": 0.31420898, "step": 6015, "time_per_iteration": 2.7504706382751465 }, { "auxiliary_loss_clip": 0.0156255, "auxiliary_loss_mlp": 0.0038474, "balance_loss_clip": 1.28922129, "balance_loss_mlp": 0.3503367, "epoch": 0.3617014880505035, "flos": 24169569356160.0, "grad_norm": 9.505567128948794, "language_loss": 0.73483276, "learning_rate": 2.9526428386344e-06, "loss": 0.75430572, "num_input_tokens_seen": 129179220, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.34423828, "step": 6016, "time_per_iteration": 2.703514814376831 }, { "auxiliary_loss_clip": 0.01561267, "auxiliary_loss_mlp": 0.0040924, "balance_loss_clip": 1.28399336, "balance_loss_mlp": 0.37333381, "epoch": 0.3617616113031715, "flos": 39014824101120.0, "grad_norm": 17.817182573751168, "language_loss": 0.79627407, "learning_rate": 2.9523003781205785e-06, "loss": 0.81597912, "num_input_tokens_seen": 129200385, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.35913086, "step": 6017, "time_per_iteration": 4.234957695007324 }, { "auxiliary_loss_clip": 0.01552538, "auxiliary_loss_mlp": 0.00396329, "balance_loss_clip": 1.2788676, "balance_loss_mlp": 0.36218786, "epoch": 0.3618217345558395, "flos": 12130840892160.0, "grad_norm": 7.473210046626886, "language_loss": 0.82488948, "learning_rate": 2.9519578814954307e-06, "loss": 0.84437811, "num_input_tokens_seen": 129217395, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.34130859, "step": 6018, "time_per_iteration": 2.6358540058135986 }, { "auxiliary_loss_clip": 0.01566652, "auxiliary_loss_mlp": 0.00409365, "balance_loss_clip": 1.29718375, "balance_loss_mlp": 0.37474674, "epoch": 0.36188185780850746, "flos": 24935005203840.0, "grad_norm": 96.84059335966258, "language_loss": 0.74407041, "learning_rate": 2.9516153487719448e-06, "loss": 0.7638306, "num_input_tokens_seen": 129238940, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.34619141, "step": 6019, "time_per_iteration": 2.743962287902832 }, { "auxiliary_loss_clip": 0.01556169, "auxiliary_loss_mlp": 0.00352976, "balance_loss_clip": 1.27954578, "balance_loss_mlp": 0.31833419, "epoch": 0.3619419810611754, "flos": 20958832350720.0, "grad_norm": 64.92320878496048, "language_loss": 0.83585328, "learning_rate": 2.95127277996311e-06, "loss": 0.85494471, "num_input_tokens_seen": 129258240, "router_z_loss_clip": 2.76367188, "router_z_loss_mlp": 0.34643555, "step": 6020, "time_per_iteration": 2.7030398845672607 }, { "auxiliary_loss_clip": 0.0157056, "auxiliary_loss_mlp": 0.00388557, "balance_loss_clip": 1.29170156, "balance_loss_mlp": 0.34919381, "epoch": 0.3620021043138434, "flos": 22528882805760.0, "grad_norm": 2.5606583574109494, "language_loss": 0.7928865, "learning_rate": 2.9509301750819156e-06, "loss": 0.81247771, "num_input_tokens_seen": 129279040, "router_z_loss_clip": 2.78710938, "router_z_loss_mlp": 0.39331055, "step": 6021, "time_per_iteration": 2.6987974643707275 }, { "auxiliary_loss_clip": 0.01564868, "auxiliary_loss_mlp": 0.0042541, "balance_loss_clip": 1.29002798, "balance_loss_mlp": 0.38816914, "epoch": 0.36206222756651135, "flos": 15596687266560.0, "grad_norm": 4.126709381123415, "language_loss": 0.88092053, "learning_rate": 2.9505875341413533e-06, "loss": 0.90082335, "num_input_tokens_seen": 129295415, "router_z_loss_clip": 2.74804688, "router_z_loss_mlp": 0.37255859, "step": 6022, "time_per_iteration": 2.6822407245635986 }, { "auxiliary_loss_clip": 0.01570873, "auxiliary_loss_mlp": 0.00350529, "balance_loss_clip": 1.30009174, "balance_loss_mlp": 0.31810379, "epoch": 0.3621223508191793, "flos": 23587170238080.0, "grad_norm": 5.872172287596771, "language_loss": 0.87495214, "learning_rate": 2.950244857154417e-06, "loss": 0.89416611, "num_input_tokens_seen": 129312620, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.32421875, "step": 6023, "time_per_iteration": 2.678739547729492 }, { "auxiliary_loss_clip": 0.01566326, "auxiliary_loss_mlp": 0.00369541, "balance_loss_clip": 1.29046392, "balance_loss_mlp": 0.33494651, "epoch": 0.3621824740718473, "flos": 22309899540480.0, "grad_norm": 15.701285692911613, "language_loss": 0.86442024, "learning_rate": 2.9499021441341e-06, "loss": 0.88377893, "num_input_tokens_seen": 129331825, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.34619141, "step": 6024, "time_per_iteration": 4.062086343765259 }, { "auxiliary_loss_clip": 0.01559448, "auxiliary_loss_mlp": 0.00325427, "balance_loss_clip": 1.29257035, "balance_loss_mlp": 0.29445601, "epoch": 0.36224259732451525, "flos": 16763640318720.0, "grad_norm": 2.4829029832255936, "language_loss": 0.79927242, "learning_rate": 2.9495593950933997e-06, "loss": 0.81812114, "num_input_tokens_seen": 129350400, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.30957031, "step": 6025, "time_per_iteration": 2.672111749649048 }, { "auxiliary_loss_clip": 0.01552127, "auxiliary_loss_mlp": 0.00359088, "balance_loss_clip": 1.28442824, "balance_loss_mlp": 0.32499412, "epoch": 0.3623027205771832, "flos": 23149742411520.0, "grad_norm": 108.1789619862585, "language_loss": 0.80559886, "learning_rate": 2.9492166100453107e-06, "loss": 0.82471102, "num_input_tokens_seen": 129371155, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.34106445, "step": 6026, "time_per_iteration": 2.6549315452575684 }, { "auxiliary_loss_clip": 0.0155792, "auxiliary_loss_mlp": 0.00355704, "balance_loss_clip": 1.2794255, "balance_loss_mlp": 0.31934479, "epoch": 0.3623628438298512, "flos": 28549162834560.0, "grad_norm": 24.737237337347604, "language_loss": 0.84967756, "learning_rate": 2.948873789002833e-06, "loss": 0.86881381, "num_input_tokens_seen": 129391230, "router_z_loss_clip": 2.78320312, "router_z_loss_mlp": 0.36376953, "step": 6027, "time_per_iteration": 2.9171323776245117 }, { "auxiliary_loss_clip": 0.01562717, "auxiliary_loss_mlp": 0.00369456, "balance_loss_clip": 1.28490686, "balance_loss_mlp": 0.33645871, "epoch": 0.36242296708251914, "flos": 25484941405440.0, "grad_norm": 49.96178791332562, "language_loss": 0.75763428, "learning_rate": 2.9485309319789667e-06, "loss": 0.77695596, "num_input_tokens_seen": 129410065, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.32983398, "step": 6028, "time_per_iteration": 2.770338296890259 }, { "auxiliary_loss_clip": 0.01549793, "auxiliary_loss_mlp": 0.00362525, "balance_loss_clip": 1.28507447, "balance_loss_mlp": 0.32995656, "epoch": 0.3624830903351871, "flos": 16290373697280.0, "grad_norm": 5.6372337311069, "language_loss": 0.92120337, "learning_rate": 2.9481880389867117e-06, "loss": 0.94032657, "num_input_tokens_seen": 129428655, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.32568359, "step": 6029, "time_per_iteration": 2.602210283279419 }, { "auxiliary_loss_clip": 0.01565588, "auxiliary_loss_mlp": 0.00355964, "balance_loss_clip": 1.29303741, "balance_loss_mlp": 0.32167923, "epoch": 0.36254321358785513, "flos": 18296307694080.0, "grad_norm": 24.86369056098275, "language_loss": 0.80560058, "learning_rate": 2.9478451100390714e-06, "loss": 0.82481611, "num_input_tokens_seen": 129447845, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.34277344, "step": 6030, "time_per_iteration": 2.6217291355133057 }, { "auxiliary_loss_clip": 0.01557945, "auxiliary_loss_mlp": 0.00383341, "balance_loss_clip": 1.2782011, "balance_loss_mlp": 0.34879434, "epoch": 0.3626033368405231, "flos": 14865294533760.0, "grad_norm": 10.012168292311188, "language_loss": 0.84469539, "learning_rate": 2.94750214514905e-06, "loss": 0.86410826, "num_input_tokens_seen": 129463275, "router_z_loss_clip": 2.79492188, "router_z_loss_mlp": 0.34545898, "step": 6031, "time_per_iteration": 2.675849437713623 }, { "auxiliary_loss_clip": 0.0155207, "auxiliary_loss_mlp": 0.00313494, "balance_loss_clip": 1.28141904, "balance_loss_mlp": 0.28102177, "epoch": 0.36266346009319106, "flos": 22306595489280.0, "grad_norm": 8.126727217100383, "language_loss": 0.79599476, "learning_rate": 2.9471591443296516e-06, "loss": 0.81465036, "num_input_tokens_seen": 129483205, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.32470703, "step": 6032, "time_per_iteration": 2.666625499725342 }, { "auxiliary_loss_clip": 0.01549457, "auxiliary_loss_mlp": 0.00355325, "balance_loss_clip": 1.27783847, "balance_loss_mlp": 0.32144541, "epoch": 0.362723583345859, "flos": 18222331633920.0, "grad_norm": 2948.9069252747677, "language_loss": 0.84910756, "learning_rate": 2.946816107593884e-06, "loss": 0.86815536, "num_input_tokens_seen": 129499885, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.33886719, "step": 6033, "time_per_iteration": 2.6463539600372314 }, { "auxiliary_loss_clip": 0.01498357, "auxiliary_loss_mlp": 0.00068281, "balance_loss_clip": 1.29960823, "balance_loss_mlp": 0.06079457, "epoch": 0.362783706598527, "flos": 68499174458880.0, "grad_norm": 0.8647830140674183, "language_loss": 0.64785033, "learning_rate": 2.9464730349547547e-06, "loss": 0.66351676, "num_input_tokens_seen": 129561885, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.07470703, "step": 6034, "time_per_iteration": 3.226513624191284 }, { "auxiliary_loss_clip": 0.01552477, "auxiliary_loss_mlp": 0.00342545, "balance_loss_clip": 1.28550816, "balance_loss_mlp": 0.31002429, "epoch": 0.36284382985119495, "flos": 26576589594240.0, "grad_norm": 5.031916114779958, "language_loss": 0.94570994, "learning_rate": 2.946129926425273e-06, "loss": 0.96466017, "num_input_tokens_seen": 129582325, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.32519531, "step": 6035, "time_per_iteration": 2.7005274295806885 }, { "auxiliary_loss_clip": 0.01544031, "auxiliary_loss_mlp": 0.00374262, "balance_loss_clip": 1.27188206, "balance_loss_mlp": 0.33766437, "epoch": 0.3629039531038629, "flos": 20156767608960.0, "grad_norm": 2.331189902669245, "language_loss": 0.81335664, "learning_rate": 2.9457867820184496e-06, "loss": 0.8325395, "num_input_tokens_seen": 129600350, "router_z_loss_clip": 2.72265625, "router_z_loss_mlp": 0.36572266, "step": 6036, "time_per_iteration": 2.7455461025238037 }, { "auxiliary_loss_clip": 0.01545841, "auxiliary_loss_mlp": 0.00323678, "balance_loss_clip": 1.26803374, "balance_loss_mlp": 0.28834435, "epoch": 0.3629640763565309, "flos": 18625716345600.0, "grad_norm": 3.418602757420063, "language_loss": 0.84120011, "learning_rate": 2.945443601747297e-06, "loss": 0.85989535, "num_input_tokens_seen": 129618425, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.35302734, "step": 6037, "time_per_iteration": 2.6517624855041504 }, { "auxiliary_loss_clip": 0.01559843, "auxiliary_loss_mlp": 0.00339353, "balance_loss_clip": 1.29196334, "balance_loss_mlp": 0.30750021, "epoch": 0.36302419960919885, "flos": 19571459489280.0, "grad_norm": 57.127477765106825, "language_loss": 0.84305787, "learning_rate": 2.945100385624828e-06, "loss": 0.86204982, "num_input_tokens_seen": 129636750, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.31811523, "step": 6038, "time_per_iteration": 2.660201072692871 }, { "auxiliary_loss_clip": 0.01486884, "auxiliary_loss_mlp": 0.0008896, "balance_loss_clip": 1.29702294, "balance_loss_mlp": 0.08080637, "epoch": 0.3630843228618668, "flos": 63797606444160.0, "grad_norm": 9.722116845312845, "language_loss": 0.63052177, "learning_rate": 2.9447571336640573e-06, "loss": 0.64628023, "num_input_tokens_seen": 129699030, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.08154297, "step": 6039, "time_per_iteration": 3.281809091567993 }, { "auxiliary_loss_clip": 0.01545419, "auxiliary_loss_mlp": 0.00326267, "balance_loss_clip": 1.27742648, "balance_loss_mlp": 0.29205436, "epoch": 0.3631444461145348, "flos": 21835160461440.0, "grad_norm": 21.431710342890515, "language_loss": 0.80669737, "learning_rate": 2.944413845878002e-06, "loss": 0.82541424, "num_input_tokens_seen": 129717135, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.34179688, "step": 6040, "time_per_iteration": 2.719456195831299 }, { "auxiliary_loss_clip": 0.01553489, "auxiliary_loss_mlp": 0.00318265, "balance_loss_clip": 1.27782297, "balance_loss_mlp": 0.28429055, "epoch": 0.36320456936720275, "flos": 21722041555200.0, "grad_norm": 8.639956117444767, "language_loss": 0.87951946, "learning_rate": 2.9440705222796783e-06, "loss": 0.89823711, "num_input_tokens_seen": 129735940, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.33935547, "step": 6041, "time_per_iteration": 2.6764333248138428 }, { "auxiliary_loss_clip": 0.01547202, "auxiliary_loss_mlp": 0.00327366, "balance_loss_clip": 1.27180123, "balance_loss_mlp": 0.29475045, "epoch": 0.3632646926198707, "flos": 17019072910080.0, "grad_norm": 16.759068417091296, "language_loss": 0.92318761, "learning_rate": 2.943727162882107e-06, "loss": 0.94193327, "num_input_tokens_seen": 129752790, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.32617188, "step": 6042, "time_per_iteration": 2.6406478881835938 }, { "auxiliary_loss_clip": 0.01543629, "auxiliary_loss_mlp": 0.00334175, "balance_loss_clip": 1.27222824, "balance_loss_mlp": 0.30113071, "epoch": 0.36332481587253873, "flos": 23331163029120.0, "grad_norm": 3.366033856670862, "language_loss": 0.83911991, "learning_rate": 2.9433837676983064e-06, "loss": 0.85789806, "num_input_tokens_seen": 129773655, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.33032227, "step": 6043, "time_per_iteration": 2.6817820072174072 }, { "auxiliary_loss_clip": 0.01556206, "auxiliary_loss_mlp": 0.00331807, "balance_loss_clip": 1.28384447, "balance_loss_mlp": 0.29625866, "epoch": 0.3633849391252067, "flos": 10743539857920.0, "grad_norm": 17.396910573955882, "language_loss": 0.73740447, "learning_rate": 2.943040336741298e-06, "loss": 0.75628459, "num_input_tokens_seen": 129791605, "router_z_loss_clip": 2.72265625, "router_z_loss_mlp": 0.35546875, "step": 6044, "time_per_iteration": 2.6208152770996094 }, { "auxiliary_loss_clip": 0.01535808, "auxiliary_loss_mlp": 0.00282862, "balance_loss_clip": 1.26920629, "balance_loss_mlp": 0.25026995, "epoch": 0.36344506237787466, "flos": 25849147357440.0, "grad_norm": 62.4315337409331, "language_loss": 0.87255275, "learning_rate": 2.9426968700241066e-06, "loss": 0.89073944, "num_input_tokens_seen": 129811075, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.32592773, "step": 6045, "time_per_iteration": 2.662590980529785 }, { "auxiliary_loss_clip": 0.0155335, "auxiliary_loss_mlp": 0.00334483, "balance_loss_clip": 1.27927375, "balance_loss_mlp": 0.2994352, "epoch": 0.3635051856305426, "flos": 30154046503680.0, "grad_norm": 1282.7793389069172, "language_loss": 0.72630942, "learning_rate": 2.942353367559755e-06, "loss": 0.7451877, "num_input_tokens_seen": 129833755, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.35058594, "step": 6046, "time_per_iteration": 2.7174036502838135 }, { "auxiliary_loss_clip": 0.01559059, "auxiliary_loss_mlp": 0.0033658, "balance_loss_clip": 1.28992498, "balance_loss_mlp": 0.30315381, "epoch": 0.3635653088832106, "flos": 22198396746240.0, "grad_norm": 43.358894418547685, "language_loss": 0.84388423, "learning_rate": 2.9420098293612692e-06, "loss": 0.86284065, "num_input_tokens_seen": 129854475, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.33447266, "step": 6047, "time_per_iteration": 2.673017740249634 }, { "auxiliary_loss_clip": 0.0154374, "auxiliary_loss_mlp": 0.00305775, "balance_loss_clip": 1.26702213, "balance_loss_mlp": 0.271467, "epoch": 0.36362543213587856, "flos": 24787053083520.0, "grad_norm": 2.2171641734891914, "language_loss": 0.85775459, "learning_rate": 2.9416662554416767e-06, "loss": 0.87624979, "num_input_tokens_seen": 129873530, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 0.34301758, "step": 6048, "time_per_iteration": 2.6680619716644287 }, { "auxiliary_loss_clip": 0.01480543, "auxiliary_loss_mlp": 0.00087433, "balance_loss_clip": 1.28737175, "balance_loss_mlp": 0.07846881, "epoch": 0.3636855553885465, "flos": 62526369231360.0, "grad_norm": 1.2850555927511922, "language_loss": 0.52147937, "learning_rate": 2.9413226458140054e-06, "loss": 0.53715914, "num_input_tokens_seen": 129940400, "router_z_loss_clip": 1.9296875, "router_z_loss_mlp": 0.08984375, "step": 6049, "time_per_iteration": 3.2505764961242676 }, { "auxiliary_loss_clip": 0.01543851, "auxiliary_loss_mlp": 0.00284502, "balance_loss_clip": 1.27441239, "balance_loss_mlp": 0.25081366, "epoch": 0.3637456786412145, "flos": 24060652341120.0, "grad_norm": 73.65272320622037, "language_loss": 0.93735886, "learning_rate": 2.9409790004912845e-06, "loss": 0.9556424, "num_input_tokens_seen": 129958635, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.33691406, "step": 6050, "time_per_iteration": 2.699235439300537 }, { "auxiliary_loss_clip": 0.01546982, "auxiliary_loss_mlp": 0.00281301, "balance_loss_clip": 1.27558184, "balance_loss_mlp": 0.24889965, "epoch": 0.36380580189388245, "flos": 16691495852160.0, "grad_norm": 122.5069751169658, "language_loss": 0.87201411, "learning_rate": 2.940635319486546e-06, "loss": 0.89029694, "num_input_tokens_seen": 129977685, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.32397461, "step": 6051, "time_per_iteration": 2.663992166519165 }, { "auxiliary_loss_clip": 0.01537657, "auxiliary_loss_mlp": 0.00293684, "balance_loss_clip": 1.27002966, "balance_loss_mlp": 0.25961429, "epoch": 0.3638659251465504, "flos": 25114091437440.0, "grad_norm": 7.482870452908569, "language_loss": 0.8813442, "learning_rate": 2.940291602812822e-06, "loss": 0.89965761, "num_input_tokens_seen": 129997530, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.34057617, "step": 6052, "time_per_iteration": 2.670405626296997 }, { "auxiliary_loss_clip": 0.01530144, "auxiliary_loss_mlp": 0.00284544, "balance_loss_clip": 1.26943386, "balance_loss_mlp": 0.2526671, "epoch": 0.3639260483992184, "flos": 23003011353600.0, "grad_norm": 18.230122114809408, "language_loss": 0.78178877, "learning_rate": 2.939947850483145e-06, "loss": 0.7999357, "num_input_tokens_seen": 130017955, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.31835938, "step": 6053, "time_per_iteration": 2.6551151275634766 }, { "auxiliary_loss_clip": 0.0145458, "auxiliary_loss_mlp": 0.00075108, "balance_loss_clip": 1.26205635, "balance_loss_mlp": 0.06638142, "epoch": 0.36398617165188635, "flos": 70716011160960.0, "grad_norm": 0.7675708904970995, "language_loss": 0.60843146, "learning_rate": 2.9396040625105532e-06, "loss": 0.62372839, "num_input_tokens_seen": 130074275, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.08740234, "step": 6054, "time_per_iteration": 6.004218101501465 }, { "auxiliary_loss_clip": 0.01539285, "auxiliary_loss_mlp": 0.00258252, "balance_loss_clip": 1.26764131, "balance_loss_mlp": 0.2241343, "epoch": 0.3640462949045543, "flos": 22235456603520.0, "grad_norm": 9.175135137288539, "language_loss": 0.83680958, "learning_rate": 2.9392602389080802e-06, "loss": 0.85478497, "num_input_tokens_seen": 130091375, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.34130859, "step": 6055, "time_per_iteration": 2.672990083694458 }, { "auxiliary_loss_clip": 0.01549911, "auxiliary_loss_mlp": 0.00266756, "balance_loss_clip": 1.27609515, "balance_loss_mlp": 0.23175579, "epoch": 0.3641064181572223, "flos": 21543529939200.0, "grad_norm": 7.3657432637445845, "language_loss": 0.81512678, "learning_rate": 2.938916379688765e-06, "loss": 0.8332935, "num_input_tokens_seen": 130111595, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.35009766, "step": 6056, "time_per_iteration": 2.639315366744995 }, { "auxiliary_loss_clip": 0.01535121, "auxiliary_loss_mlp": 0.00281203, "balance_loss_clip": 1.26775837, "balance_loss_mlp": 0.2480152, "epoch": 0.3641665414098903, "flos": 22273306560000.0, "grad_norm": 5.450955794797858, "language_loss": 0.8985287, "learning_rate": 2.9385724848656468e-06, "loss": 0.9166919, "num_input_tokens_seen": 130131440, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.33203125, "step": 6057, "time_per_iteration": 2.635474920272827 }, { "auxiliary_loss_clip": 0.01526313, "auxiliary_loss_mlp": 0.00290605, "balance_loss_clip": 1.26011467, "balance_loss_mlp": 0.25810874, "epoch": 0.36422666466255826, "flos": 28329676778880.0, "grad_norm": 10.497085435589678, "language_loss": 0.88319182, "learning_rate": 2.9382285544517647e-06, "loss": 0.90136099, "num_input_tokens_seen": 130151375, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.32495117, "step": 6058, "time_per_iteration": 2.6934762001037598 }, { "auxiliary_loss_clip": 0.0151503, "auxiliary_loss_mlp": 0.00281663, "balance_loss_clip": 1.24927914, "balance_loss_mlp": 0.24854672, "epoch": 0.36428678791522623, "flos": 24170503109760.0, "grad_norm": 104.44578032286712, "language_loss": 0.93046069, "learning_rate": 2.9378845884601636e-06, "loss": 0.94842762, "num_input_tokens_seen": 130169960, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.33105469, "step": 6059, "time_per_iteration": 2.685561180114746 }, { "auxiliary_loss_clip": 0.01516576, "auxiliary_loss_mlp": 0.00306564, "balance_loss_clip": 1.25279284, "balance_loss_mlp": 0.27289894, "epoch": 0.3643469111678942, "flos": 22528451842560.0, "grad_norm": 24.546411651395186, "language_loss": 0.94290125, "learning_rate": 2.937540586903884e-06, "loss": 0.96113271, "num_input_tokens_seen": 130189800, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.33642578, "step": 6060, "time_per_iteration": 4.129201889038086 }, { "auxiliary_loss_clip": 0.01526675, "auxiliary_loss_mlp": 0.00284838, "balance_loss_clip": 1.25496173, "balance_loss_mlp": 0.249385, "epoch": 0.36440703442056216, "flos": 19426595938560.0, "grad_norm": 4.374903796558651, "language_loss": 0.74731898, "learning_rate": 2.937196549795971e-06, "loss": 0.76543409, "num_input_tokens_seen": 130206370, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.35449219, "step": 6061, "time_per_iteration": 2.698676586151123 }, { "auxiliary_loss_clip": 0.01522825, "auxiliary_loss_mlp": 0.00297349, "balance_loss_clip": 1.25429523, "balance_loss_mlp": 0.26492378, "epoch": 0.3644671576732301, "flos": 18040515966720.0, "grad_norm": 50.76919287406095, "language_loss": 0.86151415, "learning_rate": 2.9368524771494718e-06, "loss": 0.87971586, "num_input_tokens_seen": 130224445, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.32421875, "step": 6062, "time_per_iteration": 2.66644287109375 }, { "auxiliary_loss_clip": 0.01523473, "auxiliary_loss_mlp": 0.00268574, "balance_loss_clip": 1.26201522, "balance_loss_mlp": 0.23574382, "epoch": 0.3645272809258981, "flos": 21542811667200.0, "grad_norm": 406.2891628326871, "language_loss": 0.79788399, "learning_rate": 2.936508368977432e-06, "loss": 0.81580448, "num_input_tokens_seen": 130245380, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.328125, "step": 6063, "time_per_iteration": 2.69675350189209 }, { "auxiliary_loss_clip": 0.01525665, "auxiliary_loss_mlp": 0.00285572, "balance_loss_clip": 1.25983405, "balance_loss_mlp": 0.25145429, "epoch": 0.36458740417856605, "flos": 22746860490240.0, "grad_norm": 48.359611196208725, "language_loss": 0.74985898, "learning_rate": 2.936164225292901e-06, "loss": 0.7679714, "num_input_tokens_seen": 130265575, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.34094238, "step": 6064, "time_per_iteration": 2.6221470832824707 }, { "auxiliary_loss_clip": 0.0152708, "auxiliary_loss_mlp": 0.00261971, "balance_loss_clip": 1.26068163, "balance_loss_mlp": 0.23131019, "epoch": 0.364647527431234, "flos": 26140670138880.0, "grad_norm": 62.59034005758984, "language_loss": 0.83088374, "learning_rate": 2.9358200461089297e-06, "loss": 0.84877419, "num_input_tokens_seen": 130286195, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.30639648, "step": 6065, "time_per_iteration": 2.687039613723755 }, { "auxiliary_loss_clip": 0.01533501, "auxiliary_loss_mlp": 0.00291792, "balance_loss_clip": 1.26465452, "balance_loss_mlp": 0.25707871, "epoch": 0.364707650683902, "flos": 31029907737600.0, "grad_norm": 15.817982801443712, "language_loss": 0.83524036, "learning_rate": 2.9354758314385676e-06, "loss": 0.85349321, "num_input_tokens_seen": 130306095, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.34692383, "step": 6066, "time_per_iteration": 4.131505966186523 }, { "auxiliary_loss_clip": 0.0153238, "auxiliary_loss_mlp": 0.00279198, "balance_loss_clip": 1.26846087, "balance_loss_mlp": 0.24720195, "epoch": 0.36476777393656995, "flos": 19572896033280.0, "grad_norm": 2546.902450036903, "language_loss": 0.85629672, "learning_rate": 2.9351315812948684e-06, "loss": 0.87441248, "num_input_tokens_seen": 130324685, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.31982422, "step": 6067, "time_per_iteration": 2.642425537109375 }, { "auxiliary_loss_clip": 0.01538912, "auxiliary_loss_mlp": 0.00244993, "balance_loss_clip": 1.27447522, "balance_loss_mlp": 0.21499977, "epoch": 0.3648278971892379, "flos": 17748849530880.0, "grad_norm": 10.654989292663304, "language_loss": 0.81576514, "learning_rate": 2.934787295690886e-06, "loss": 0.83360416, "num_input_tokens_seen": 130343855, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.29980469, "step": 6068, "time_per_iteration": 2.6299734115600586 }, { "auxiliary_loss_clip": 0.01544501, "auxiliary_loss_mlp": 0.00293013, "balance_loss_clip": 1.27190506, "balance_loss_mlp": 0.26075488, "epoch": 0.3648880204419059, "flos": 17931167988480.0, "grad_norm": 35.92315833391162, "language_loss": 0.82798523, "learning_rate": 2.9344429746396755e-06, "loss": 0.84636033, "num_input_tokens_seen": 130362320, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.32275391, "step": 6069, "time_per_iteration": 2.6106553077697754 }, { "auxiliary_loss_clip": 0.01544859, "auxiliary_loss_mlp": 0.00307898, "balance_loss_clip": 1.27584243, "balance_loss_mlp": 0.27554488, "epoch": 0.3649481436945739, "flos": 22638266697600.0, "grad_norm": 4.011972269425396, "language_loss": 0.73193288, "learning_rate": 2.9340986181542945e-06, "loss": 0.75046039, "num_input_tokens_seen": 130383165, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.32324219, "step": 6070, "time_per_iteration": 2.6845672130584717 }, { "auxiliary_loss_clip": 0.01557875, "auxiliary_loss_mlp": 0.00281759, "balance_loss_clip": 1.28910065, "balance_loss_mlp": 0.25119343, "epoch": 0.36500826694724187, "flos": 21579656042880.0, "grad_norm": 2.8208110705131384, "language_loss": 0.82751405, "learning_rate": 2.9337542262477994e-06, "loss": 0.84591043, "num_input_tokens_seen": 130402425, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.30566406, "step": 6071, "time_per_iteration": 2.656904458999634 }, { "auxiliary_loss_clip": 0.01558068, "auxiliary_loss_mlp": 0.00280647, "balance_loss_clip": 1.28663445, "balance_loss_mlp": 0.24691066, "epoch": 0.36506839019990983, "flos": 13772533023360.0, "grad_norm": 89.08184755943006, "language_loss": 0.96525317, "learning_rate": 2.9334097989332506e-06, "loss": 0.98364031, "num_input_tokens_seen": 130419440, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.33740234, "step": 6072, "time_per_iteration": 2.6069490909576416 }, { "auxiliary_loss_clip": 0.01562151, "auxiliary_loss_mlp": 0.00256279, "balance_loss_clip": 1.29117668, "balance_loss_mlp": 0.22421132, "epoch": 0.3651285134525778, "flos": 17274972378240.0, "grad_norm": 37.133775943062844, "language_loss": 0.81402969, "learning_rate": 2.9330653362237094e-06, "loss": 0.832214, "num_input_tokens_seen": 130438495, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.32080078, "step": 6073, "time_per_iteration": 2.753474235534668 }, { "auxiliary_loss_clip": 0.01561402, "auxiliary_loss_mlp": 0.00266173, "balance_loss_clip": 1.28750324, "balance_loss_mlp": 0.23248425, "epoch": 0.36518863670524576, "flos": 21907987286400.0, "grad_norm": 4.221161573272919, "language_loss": 0.76495266, "learning_rate": 2.932720838132236e-06, "loss": 0.78322846, "num_input_tokens_seen": 130455575, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.33691406, "step": 6074, "time_per_iteration": 2.8941547870635986 }, { "auxiliary_loss_clip": 0.01555319, "auxiliary_loss_mlp": 0.00245827, "balance_loss_clip": 1.28450274, "balance_loss_mlp": 0.21397406, "epoch": 0.3652487599579137, "flos": 27122180250240.0, "grad_norm": 5.64875876279917, "language_loss": 0.78984416, "learning_rate": 2.9323763046718954e-06, "loss": 0.80785561, "num_input_tokens_seen": 130476385, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.31860352, "step": 6075, "time_per_iteration": 2.7034752368927 }, { "auxiliary_loss_clip": 0.01557123, "auxiliary_loss_mlp": 0.00242868, "balance_loss_clip": 1.28375387, "balance_loss_mlp": 0.21063364, "epoch": 0.3653088832105817, "flos": 19755573626880.0, "grad_norm": 3.546096196920162, "language_loss": 0.95922494, "learning_rate": 2.9320317358557524e-06, "loss": 0.97722483, "num_input_tokens_seen": 130493630, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.32226562, "step": 6076, "time_per_iteration": 2.6391875743865967 }, { "auxiliary_loss_clip": 0.01561317, "auxiliary_loss_mlp": 0.00279743, "balance_loss_clip": 1.29081345, "balance_loss_mlp": 0.24476665, "epoch": 0.36536900646324966, "flos": 13115008609920.0, "grad_norm": 46.31162874402638, "language_loss": 0.75861573, "learning_rate": 2.931687131696872e-06, "loss": 0.77702636, "num_input_tokens_seen": 130510735, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.34985352, "step": 6077, "time_per_iteration": 2.7347724437713623 }, { "auxiliary_loss_clip": 0.01462601, "auxiliary_loss_mlp": 0.00114287, "balance_loss_clip": 1.26961398, "balance_loss_mlp": 0.10556089, "epoch": 0.3654291297159176, "flos": 71100472383360.0, "grad_norm": 3.0550123453188296, "language_loss": 0.6141417, "learning_rate": 2.9313424922083224e-06, "loss": 0.62991059, "num_input_tokens_seen": 130577050, "router_z_loss_clip": 1.9296875, "router_z_loss_mlp": 0.08740234, "step": 6078, "time_per_iteration": 3.2844815254211426 }, { "auxiliary_loss_clip": 0.01581453, "auxiliary_loss_mlp": 0.00282046, "balance_loss_clip": 1.30668855, "balance_loss_mlp": 0.24852422, "epoch": 0.3654892529685856, "flos": 23617478338560.0, "grad_norm": 24.101224160338298, "language_loss": 0.84300232, "learning_rate": 2.930997817403173e-06, "loss": 0.86163735, "num_input_tokens_seen": 130593780, "router_z_loss_clip": 2.74804688, "router_z_loss_mlp": 0.33520508, "step": 6079, "time_per_iteration": 2.6630890369415283 }, { "auxiliary_loss_clip": 0.01580291, "auxiliary_loss_mlp": 0.00301506, "balance_loss_clip": 1.30358315, "balance_loss_mlp": 0.2691288, "epoch": 0.36554937622125355, "flos": 43470799850880.0, "grad_norm": 4.212144363598613, "language_loss": 0.70282459, "learning_rate": 2.9306531072944913e-06, "loss": 0.72164255, "num_input_tokens_seen": 130615510, "router_z_loss_clip": 2.76757812, "router_z_loss_mlp": 0.32373047, "step": 6080, "time_per_iteration": 2.837479829788208 }, { "auxiliary_loss_clip": 0.01587271, "auxiliary_loss_mlp": 0.00317569, "balance_loss_clip": 1.30594134, "balance_loss_mlp": 0.28154406, "epoch": 0.3656094994739215, "flos": 23294641875840.0, "grad_norm": 589.4720887562161, "language_loss": 0.78429329, "learning_rate": 2.930308361895352e-06, "loss": 0.80334169, "num_input_tokens_seen": 130635410, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.3605957, "step": 6081, "time_per_iteration": 2.6781325340270996 }, { "auxiliary_loss_clip": 0.01593103, "auxiliary_loss_mlp": 0.00303687, "balance_loss_clip": 1.30926728, "balance_loss_mlp": 0.2705231, "epoch": 0.3656696227265895, "flos": 24571984400640.0, "grad_norm": 5.626836758181694, "language_loss": 0.7937969, "learning_rate": 2.9299635812188257e-06, "loss": 0.81276482, "num_input_tokens_seen": 130657725, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.33178711, "step": 6082, "time_per_iteration": 2.8027541637420654 }, { "auxiliary_loss_clip": 0.01610725, "auxiliary_loss_mlp": 0.00281395, "balance_loss_clip": 1.32741046, "balance_loss_mlp": 0.24801643, "epoch": 0.3657297459792575, "flos": 27928375056000.0, "grad_norm": 41.489731275449884, "language_loss": 0.89300424, "learning_rate": 2.929618765277987e-06, "loss": 0.91192544, "num_input_tokens_seen": 130678360, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.33398438, "step": 6083, "time_per_iteration": 2.710726022720337 }, { "auxiliary_loss_clip": 0.01618784, "auxiliary_loss_mlp": 0.00130371, "balance_loss_clip": 1.40661621, "balance_loss_mlp": 0.11907019, "epoch": 0.36578986923192547, "flos": 67392622126080.0, "grad_norm": 0.8110750155270271, "language_loss": 0.58786154, "learning_rate": 2.9292739140859125e-06, "loss": 0.60535306, "num_input_tokens_seen": 130742110, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.11279297, "step": 6084, "time_per_iteration": 3.326092481613159 }, { "auxiliary_loss_clip": 0.01615214, "auxiliary_loss_mlp": 0.00286156, "balance_loss_clip": 1.33249462, "balance_loss_mlp": 0.25430301, "epoch": 0.36584999248459343, "flos": 20227511445120.0, "grad_norm": 2.588025660733764, "language_loss": 0.79208302, "learning_rate": 2.9289290276556767e-06, "loss": 0.81109673, "num_input_tokens_seen": 130759870, "router_z_loss_clip": 2.82617188, "router_z_loss_mlp": 0.31835938, "step": 6085, "time_per_iteration": 2.709677219390869 }, { "auxiliary_loss_clip": 0.01634338, "auxiliary_loss_mlp": 0.00315471, "balance_loss_clip": 1.34321284, "balance_loss_mlp": 0.28183046, "epoch": 0.3659101157372614, "flos": 19062461813760.0, "grad_norm": 53.541159927667394, "language_loss": 0.84978461, "learning_rate": 2.9285841060003604e-06, "loss": 0.86928266, "num_input_tokens_seen": 130778510, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.33642578, "step": 6086, "time_per_iteration": 2.6481964588165283 }, { "auxiliary_loss_clip": 0.01631416, "auxiliary_loss_mlp": 0.00302168, "balance_loss_clip": 1.34824133, "balance_loss_mlp": 0.26979101, "epoch": 0.36597023898992936, "flos": 30810708990720.0, "grad_norm": 57.84596972249884, "language_loss": 0.85064363, "learning_rate": 2.9282391491330416e-06, "loss": 0.8699795, "num_input_tokens_seen": 130798535, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.32373047, "step": 6087, "time_per_iteration": 2.753568649291992 }, { "auxiliary_loss_clip": 0.01634526, "auxiliary_loss_mlp": 0.00350714, "balance_loss_clip": 1.33934951, "balance_loss_mlp": 0.31418782, "epoch": 0.36603036224259733, "flos": 20521799573760.0, "grad_norm": 28.568795364350212, "language_loss": 0.80449772, "learning_rate": 2.9278941570668002e-06, "loss": 0.82435012, "num_input_tokens_seen": 130816655, "router_z_loss_clip": 2.95507812, "router_z_loss_mlp": 0.36523438, "step": 6088, "time_per_iteration": 2.6144182682037354 }, { "auxiliary_loss_clip": 0.01634974, "auxiliary_loss_mlp": 0.00318195, "balance_loss_clip": 1.33167541, "balance_loss_mlp": 0.2818836, "epoch": 0.3660904854952653, "flos": 38329397798400.0, "grad_norm": 4.919271960365891, "language_loss": 0.87104511, "learning_rate": 2.92754912981472e-06, "loss": 0.89057684, "num_input_tokens_seen": 130841225, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.36303711, "step": 6089, "time_per_iteration": 2.7797272205352783 }, { "auxiliary_loss_clip": 0.01654192, "auxiliary_loss_mlp": 0.00338518, "balance_loss_clip": 1.36207843, "balance_loss_mlp": 0.30141979, "epoch": 0.36615060874793326, "flos": 21835555511040.0, "grad_norm": 14.005567487824953, "language_loss": 0.78157586, "learning_rate": 2.927204067389884e-06, "loss": 0.80150294, "num_input_tokens_seen": 130861050, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.37060547, "step": 6090, "time_per_iteration": 2.621188163757324 }, { "auxiliary_loss_clip": 0.01661474, "auxiliary_loss_mlp": 0.00333503, "balance_loss_clip": 1.36517477, "balance_loss_mlp": 0.30002916, "epoch": 0.3662107320006012, "flos": 16581537342720.0, "grad_norm": 163.62107897288536, "language_loss": 0.80666095, "learning_rate": 2.9268589698053763e-06, "loss": 0.82661068, "num_input_tokens_seen": 130879775, "router_z_loss_clip": 2.9609375, "router_z_loss_mlp": 0.33496094, "step": 6091, "time_per_iteration": 2.611182928085327 }, { "auxiliary_loss_clip": 0.01654078, "auxiliary_loss_mlp": 0.00291181, "balance_loss_clip": 1.36181128, "balance_loss_mlp": 0.25780213, "epoch": 0.3662708552532692, "flos": 20958365473920.0, "grad_norm": 21.849799775193116, "language_loss": 0.81032979, "learning_rate": 2.926513837074284e-06, "loss": 0.82978231, "num_input_tokens_seen": 130898070, "router_z_loss_clip": 2.92382812, "router_z_loss_mlp": 0.33398438, "step": 6092, "time_per_iteration": 2.6020522117614746 }, { "auxiliary_loss_clip": 0.01652464, "auxiliary_loss_mlp": 0.00306308, "balance_loss_clip": 1.35506821, "balance_loss_mlp": 0.2684229, "epoch": 0.36633097850593715, "flos": 21902707987200.0, "grad_norm": 9.122981889426043, "language_loss": 0.8719418, "learning_rate": 2.9261686692096942e-06, "loss": 0.89152956, "num_input_tokens_seen": 130915250, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.37866211, "step": 6093, "time_per_iteration": 2.650583028793335 }, { "auxiliary_loss_clip": 0.01659137, "auxiliary_loss_mlp": 0.00322975, "balance_loss_clip": 1.36031866, "balance_loss_mlp": 0.2865451, "epoch": 0.3663911017586051, "flos": 32854133808000.0, "grad_norm": 3.269023043334289, "language_loss": 0.80578983, "learning_rate": 2.925823466224696e-06, "loss": 0.82561094, "num_input_tokens_seen": 130936995, "router_z_loss_clip": 2.98828125, "router_z_loss_mlp": 0.36450195, "step": 6094, "time_per_iteration": 2.7127485275268555 }, { "auxiliary_loss_clip": 0.01649706, "auxiliary_loss_mlp": 0.00297215, "balance_loss_clip": 1.35539865, "balance_loss_mlp": 0.2616908, "epoch": 0.3664512250112731, "flos": 27271748482560.0, "grad_norm": 3.672337963270092, "language_loss": 0.85190296, "learning_rate": 2.9254782281323785e-06, "loss": 0.87137216, "num_input_tokens_seen": 130957970, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.35534668, "step": 6095, "time_per_iteration": 2.68685245513916 }, { "auxiliary_loss_clip": 0.01641562, "auxiliary_loss_mlp": 0.0030452, "balance_loss_clip": 1.34191084, "balance_loss_mlp": 0.26866212, "epoch": 0.3665113482639411, "flos": 17784436930560.0, "grad_norm": 19.080317955024817, "language_loss": 0.8532604, "learning_rate": 2.925132954945834e-06, "loss": 0.87272125, "num_input_tokens_seen": 130974915, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.35839844, "step": 6096, "time_per_iteration": 4.085319757461548 }, { "auxiliary_loss_clip": 0.01655309, "auxiliary_loss_mlp": 0.00311297, "balance_loss_clip": 1.35524929, "balance_loss_mlp": 0.27529615, "epoch": 0.36657147151660907, "flos": 27854614477440.0, "grad_norm": 62.52904187131912, "language_loss": 0.74879795, "learning_rate": 2.924787646678155e-06, "loss": 0.76846403, "num_input_tokens_seen": 130995745, "router_z_loss_clip": 3.00390625, "router_z_loss_mlp": 0.35986328, "step": 6097, "time_per_iteration": 4.157190799713135 }, { "auxiliary_loss_clip": 0.01653087, "auxiliary_loss_mlp": 0.00338079, "balance_loss_clip": 1.35605359, "balance_loss_mlp": 0.30322173, "epoch": 0.36663159476927704, "flos": 25374013228800.0, "grad_norm": 7.791021071818948, "language_loss": 0.8341344, "learning_rate": 2.9244423033424365e-06, "loss": 0.85404605, "num_input_tokens_seen": 131015545, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.34863281, "step": 6098, "time_per_iteration": 2.7568814754486084 }, { "auxiliary_loss_clip": 0.01658424, "auxiliary_loss_mlp": 0.0029034, "balance_loss_clip": 1.36070538, "balance_loss_mlp": 0.25724721, "epoch": 0.366691718021945, "flos": 21357225072000.0, "grad_norm": 4.2181137947259355, "language_loss": 0.80546516, "learning_rate": 2.9240969249517723e-06, "loss": 0.82495284, "num_input_tokens_seen": 131033990, "router_z_loss_clip": 2.9765625, "router_z_loss_mlp": 0.33105469, "step": 6099, "time_per_iteration": 2.681466579437256 }, { "auxiliary_loss_clip": 0.01654652, "auxiliary_loss_mlp": 0.00281056, "balance_loss_clip": 1.35878944, "balance_loss_mlp": 0.24538897, "epoch": 0.36675184127461297, "flos": 16800376953600.0, "grad_norm": 15.058799386051357, "language_loss": 0.91622061, "learning_rate": 2.9237515115192602e-06, "loss": 0.93557769, "num_input_tokens_seen": 131050710, "router_z_loss_clip": 2.95507812, "router_z_loss_mlp": 0.35668945, "step": 6100, "time_per_iteration": 2.692492723464966 }, { "auxiliary_loss_clip": 0.0164273, "auxiliary_loss_mlp": 0.00330367, "balance_loss_clip": 1.3429991, "balance_loss_mlp": 0.29257739, "epoch": 0.36681196452728093, "flos": 21906514828800.0, "grad_norm": 20.172228681170047, "language_loss": 0.79219627, "learning_rate": 2.9234060630579992e-06, "loss": 0.8119272, "num_input_tokens_seen": 131071435, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.37792969, "step": 6101, "time_per_iteration": 2.671046018600464 }, { "auxiliary_loss_clip": 0.01666463, "auxiliary_loss_mlp": 0.00301017, "balance_loss_clip": 1.36272252, "balance_loss_mlp": 0.26775753, "epoch": 0.3668720877799489, "flos": 17712436118400.0, "grad_norm": 4.060481140914331, "language_loss": 0.85496509, "learning_rate": 2.9230605795810865e-06, "loss": 0.87463987, "num_input_tokens_seen": 131088775, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.33251953, "step": 6102, "time_per_iteration": 4.033511638641357 }, { "auxiliary_loss_clip": 0.01657969, "auxiliary_loss_mlp": 0.00301341, "balance_loss_clip": 1.35543835, "balance_loss_mlp": 0.26340821, "epoch": 0.36693221103261686, "flos": 47045455499520.0, "grad_norm": 211.77160381442087, "language_loss": 0.78094637, "learning_rate": 2.922715061101625e-06, "loss": 0.80053937, "num_input_tokens_seen": 131112800, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.37915039, "step": 6103, "time_per_iteration": 2.909019708633423 }, { "auxiliary_loss_clip": 0.01650752, "auxiliary_loss_mlp": 0.00318334, "balance_loss_clip": 1.35172701, "balance_loss_mlp": 0.28481275, "epoch": 0.3669923342852848, "flos": 15960929132160.0, "grad_norm": 32.804376977435695, "language_loss": 0.80321783, "learning_rate": 2.922369507632716e-06, "loss": 0.82290876, "num_input_tokens_seen": 131131150, "router_z_loss_clip": 2.99023438, "router_z_loss_mlp": 0.33544922, "step": 6104, "time_per_iteration": 2.623243570327759 }, { "auxiliary_loss_clip": 0.01660307, "auxiliary_loss_mlp": 0.0029579, "balance_loss_clip": 1.35749435, "balance_loss_mlp": 0.2601698, "epoch": 0.3670524575379528, "flos": 19974485064960.0, "grad_norm": 113.0315206940846, "language_loss": 0.89020491, "learning_rate": 2.9220239191874617e-06, "loss": 0.90976584, "num_input_tokens_seen": 131150365, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.35644531, "step": 6105, "time_per_iteration": 2.6984660625457764 }, { "auxiliary_loss_clip": 0.01646013, "auxiliary_loss_mlp": 0.00322827, "balance_loss_clip": 1.34136462, "balance_loss_mlp": 0.28653991, "epoch": 0.36711258079062076, "flos": 25702955003520.0, "grad_norm": 3.257065789417559, "language_loss": 0.87883806, "learning_rate": 2.9216782957789692e-06, "loss": 0.89852643, "num_input_tokens_seen": 131169310, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.36279297, "step": 6106, "time_per_iteration": 2.6917905807495117 }, { "auxiliary_loss_clip": 0.0169502, "auxiliary_loss_mlp": 0.00128248, "balance_loss_clip": 1.44733262, "balance_loss_mlp": 0.11852089, "epoch": 0.3671727040432887, "flos": 60772743342720.0, "grad_norm": 5.766417196467903, "language_loss": 0.58992648, "learning_rate": 2.9213326374203426e-06, "loss": 0.60815918, "num_input_tokens_seen": 131232900, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.09716797, "step": 6107, "time_per_iteration": 3.219575881958008 }, { "auxiliary_loss_clip": 0.01659753, "auxiliary_loss_mlp": 0.00291199, "balance_loss_clip": 1.3665055, "balance_loss_mlp": 0.25638992, "epoch": 0.3672328272959567, "flos": 18661303745280.0, "grad_norm": 2.383125876257265, "language_loss": 0.80668378, "learning_rate": 2.92098694412469e-06, "loss": 0.82619327, "num_input_tokens_seen": 131250920, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.34790039, "step": 6108, "time_per_iteration": 4.030089378356934 }, { "auxiliary_loss_clip": 0.0166168, "auxiliary_loss_mlp": 0.0031021, "balance_loss_clip": 1.35587168, "balance_loss_mlp": 0.27594921, "epoch": 0.3672929505486247, "flos": 15049049535360.0, "grad_norm": 29.02903292230155, "language_loss": 0.83583999, "learning_rate": 2.9206412159051213e-06, "loss": 0.85555887, "num_input_tokens_seen": 131267910, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.34228516, "step": 6109, "time_per_iteration": 2.579540491104126 }, { "auxiliary_loss_clip": 0.01648261, "auxiliary_loss_mlp": 0.00314217, "balance_loss_clip": 1.35537887, "balance_loss_mlp": 0.28031406, "epoch": 0.3673530738012927, "flos": 20589347099520.0, "grad_norm": 107.21459818008167, "language_loss": 0.60468501, "learning_rate": 2.920295452774744e-06, "loss": 0.62430984, "num_input_tokens_seen": 131287150, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.33886719, "step": 6110, "time_per_iteration": 2.6516590118408203 }, { "auxiliary_loss_clip": 0.01671875, "auxiliary_loss_mlp": 0.00317994, "balance_loss_clip": 1.37172866, "balance_loss_mlp": 0.28125349, "epoch": 0.36741319705396064, "flos": 21689830033920.0, "grad_norm": 5.047102067877273, "language_loss": 0.85373867, "learning_rate": 2.919949654746672e-06, "loss": 0.87363732, "num_input_tokens_seen": 131308225, "router_z_loss_clip": 3.00390625, "router_z_loss_mlp": 0.36767578, "step": 6111, "time_per_iteration": 2.6954715251922607 }, { "auxiliary_loss_clip": 0.01659555, "auxiliary_loss_mlp": 0.00311551, "balance_loss_clip": 1.36380005, "balance_loss_mlp": 0.276456, "epoch": 0.3674733203066286, "flos": 29862200499840.0, "grad_norm": 8.45879770281599, "language_loss": 0.78262222, "learning_rate": 2.9196038218340163e-06, "loss": 0.80233324, "num_input_tokens_seen": 131332115, "router_z_loss_clip": 2.95703125, "router_z_loss_mlp": 0.35083008, "step": 6112, "time_per_iteration": 2.7311718463897705 }, { "auxiliary_loss_clip": 0.01658533, "auxiliary_loss_mlp": 0.00317767, "balance_loss_clip": 1.36111975, "balance_loss_mlp": 0.2858668, "epoch": 0.36753344355929657, "flos": 18257021193600.0, "grad_norm": 25.358928071103804, "language_loss": 0.90774941, "learning_rate": 2.919257954049892e-06, "loss": 0.92751241, "num_input_tokens_seen": 131351885, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 0.3190918, "step": 6113, "time_per_iteration": 2.717071294784546 }, { "auxiliary_loss_clip": 0.01671641, "auxiliary_loss_mlp": 0.00343802, "balance_loss_clip": 1.36965334, "balance_loss_mlp": 0.30932629, "epoch": 0.36759356681196453, "flos": 25301150490240.0, "grad_norm": 2.2173369351011214, "language_loss": 0.88578683, "learning_rate": 2.918912051407413e-06, "loss": 0.90594125, "num_input_tokens_seen": 131370245, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.34472656, "step": 6114, "time_per_iteration": 2.6451780796051025 }, { "auxiliary_loss_clip": 0.01657791, "auxiliary_loss_mlp": 0.00314294, "balance_loss_clip": 1.35778821, "balance_loss_mlp": 0.27898377, "epoch": 0.3676536900646325, "flos": 21032952065280.0, "grad_norm": 2.1155994279674406, "language_loss": 0.76314777, "learning_rate": 2.918566113919698e-06, "loss": 0.78286862, "num_input_tokens_seen": 131388115, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.35327148, "step": 6115, "time_per_iteration": 2.627621650695801 }, { "auxiliary_loss_clip": 0.01683332, "auxiliary_loss_mlp": 0.00322721, "balance_loss_clip": 1.38622904, "balance_loss_mlp": 0.28927055, "epoch": 0.36771381331730046, "flos": 16288506190080.0, "grad_norm": 61.14663255603954, "language_loss": 0.85632932, "learning_rate": 2.9182201415998636e-06, "loss": 0.87638986, "num_input_tokens_seen": 131404595, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 0.33447266, "step": 6116, "time_per_iteration": 2.5789105892181396 }, { "auxiliary_loss_clip": 0.01685298, "auxiliary_loss_mlp": 0.00300148, "balance_loss_clip": 1.38188279, "balance_loss_mlp": 0.26553011, "epoch": 0.36777393656996843, "flos": 22309971367680.0, "grad_norm": 2.2397869133562254, "language_loss": 0.70879138, "learning_rate": 2.9178741344610286e-06, "loss": 0.72864586, "num_input_tokens_seen": 131423760, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.34643555, "step": 6117, "time_per_iteration": 2.728520154953003 }, { "auxiliary_loss_clip": 0.01675387, "auxiliary_loss_mlp": 0.00312139, "balance_loss_clip": 1.37553155, "balance_loss_mlp": 0.2806673, "epoch": 0.3678340598226364, "flos": 26834069260800.0, "grad_norm": 6.193472498482061, "language_loss": 0.81954849, "learning_rate": 2.9175280925163156e-06, "loss": 0.83942378, "num_input_tokens_seen": 131444955, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.31469727, "step": 6118, "time_per_iteration": 2.745534896850586 }, { "auxiliary_loss_clip": 0.01689399, "auxiliary_loss_mlp": 0.00322835, "balance_loss_clip": 1.38776326, "balance_loss_mlp": 0.2882883, "epoch": 0.36789418307530436, "flos": 21761723105280.0, "grad_norm": 9.901698974419121, "language_loss": 0.79580975, "learning_rate": 2.9171820157788445e-06, "loss": 0.8159321, "num_input_tokens_seen": 131465720, "router_z_loss_clip": 3.01953125, "router_z_loss_mlp": 0.34570312, "step": 6119, "time_per_iteration": 2.7610905170440674 }, { "auxiliary_loss_clip": 0.0170244, "auxiliary_loss_mlp": 0.00299706, "balance_loss_clip": 1.40154111, "balance_loss_mlp": 0.26608884, "epoch": 0.3679543063279723, "flos": 15924192497280.0, "grad_norm": 47.909885142599734, "language_loss": 0.87013829, "learning_rate": 2.9168359042617404e-06, "loss": 0.89015973, "num_input_tokens_seen": 131483080, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.3359375, "step": 6120, "time_per_iteration": 2.7355473041534424 }, { "auxiliary_loss_clip": 0.01706873, "auxiliary_loss_mlp": 0.00301846, "balance_loss_clip": 1.40236592, "balance_loss_mlp": 0.26985043, "epoch": 0.3680144295806403, "flos": 24275541456000.0, "grad_norm": 10.971688026495075, "language_loss": 0.74156022, "learning_rate": 2.916489757978126e-06, "loss": 0.76164746, "num_input_tokens_seen": 131502545, "router_z_loss_clip": 3.046875, "router_z_loss_mlp": 0.31982422, "step": 6121, "time_per_iteration": 2.672490358352661 }, { "auxiliary_loss_clip": 0.01714428, "auxiliary_loss_mlp": 0.00324305, "balance_loss_clip": 1.40910709, "balance_loss_mlp": 0.29288173, "epoch": 0.36807455283330826, "flos": 26104148985600.0, "grad_norm": 2.2297940306770565, "language_loss": 0.80173624, "learning_rate": 2.9161435769411286e-06, "loss": 0.82212359, "num_input_tokens_seen": 131522155, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.31420898, "step": 6122, "time_per_iteration": 2.6889796257019043 }, { "auxiliary_loss_clip": 0.01703796, "auxiliary_loss_mlp": 0.00282232, "balance_loss_clip": 1.40663457, "balance_loss_mlp": 0.24768497, "epoch": 0.3681346760859763, "flos": 24644990793600.0, "grad_norm": 117.88835064706055, "language_loss": 0.74764729, "learning_rate": 2.915797361163875e-06, "loss": 0.76750755, "num_input_tokens_seen": 131543865, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 0.34545898, "step": 6123, "time_per_iteration": 2.770402669906616 }, { "auxiliary_loss_clip": 0.01707917, "auxiliary_loss_mlp": 0.00317882, "balance_loss_clip": 1.39919376, "balance_loss_mlp": 0.28307313, "epoch": 0.36819479933864424, "flos": 23878369797120.0, "grad_norm": 16.707481766684513, "language_loss": 0.83276492, "learning_rate": 2.9154511106594933e-06, "loss": 0.85302293, "num_input_tokens_seen": 131562155, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.34814453, "step": 6124, "time_per_iteration": 2.7541017532348633 }, { "auxiliary_loss_clip": 0.01718555, "auxiliary_loss_mlp": 0.00325106, "balance_loss_clip": 1.40814495, "balance_loss_mlp": 0.28984356, "epoch": 0.3682549225913122, "flos": 25553997302400.0, "grad_norm": 35.97742056864481, "language_loss": 0.82276833, "learning_rate": 2.915104825441114e-06, "loss": 0.84320498, "num_input_tokens_seen": 131581695, "router_z_loss_clip": 3.10351562, "router_z_loss_mlp": 0.35253906, "step": 6125, "time_per_iteration": 2.7553911209106445 }, { "auxiliary_loss_clip": 0.01735141, "auxiliary_loss_mlp": 0.00346759, "balance_loss_clip": 1.41891909, "balance_loss_mlp": 0.31085366, "epoch": 0.36831504584398017, "flos": 16946605221120.0, "grad_norm": 8.875758677434353, "language_loss": 0.86264956, "learning_rate": 2.9147585055218686e-06, "loss": 0.88346851, "num_input_tokens_seen": 131599465, "router_z_loss_clip": 3.1640625, "router_z_loss_mlp": 0.35913086, "step": 6126, "time_per_iteration": 2.6563873291015625 }, { "auxiliary_loss_clip": 0.01738052, "auxiliary_loss_mlp": 0.00371324, "balance_loss_clip": 1.41075718, "balance_loss_mlp": 0.32929081, "epoch": 0.36837516909664814, "flos": 19865065259520.0, "grad_norm": 12.522907600333244, "language_loss": 0.73676234, "learning_rate": 2.914412150914888e-06, "loss": 0.75785613, "num_input_tokens_seen": 131618330, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.42016602, "step": 6127, "time_per_iteration": 2.6750481128692627 }, { "auxiliary_loss_clip": 0.0172939, "auxiliary_loss_mlp": 0.00278603, "balance_loss_clip": 1.41553569, "balance_loss_mlp": 0.24586849, "epoch": 0.3684352923493161, "flos": 37626984362880.0, "grad_norm": 5.175152485821013, "language_loss": 0.7703169, "learning_rate": 2.9140657616333074e-06, "loss": 0.79039681, "num_input_tokens_seen": 131638960, "router_z_loss_clip": 3.13867188, "router_z_loss_mlp": 0.32739258, "step": 6128, "time_per_iteration": 2.8093948364257812 }, { "auxiliary_loss_clip": 0.01761005, "auxiliary_loss_mlp": 0.00301462, "balance_loss_clip": 1.44244432, "balance_loss_mlp": 0.26801193, "epoch": 0.36849541560198407, "flos": 14465501182080.0, "grad_norm": 3.8763915596427903, "language_loss": 0.837071, "learning_rate": 2.9137193376902614e-06, "loss": 0.85769564, "num_input_tokens_seen": 131657440, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.33447266, "step": 6129, "time_per_iteration": 2.590507984161377 }, { "auxiliary_loss_clip": 0.0174311, "auxiliary_loss_mlp": 0.00304209, "balance_loss_clip": 1.41975641, "balance_loss_mlp": 0.26987666, "epoch": 0.36855553885465203, "flos": 25770753924480.0, "grad_norm": 2.3938025259093307, "language_loss": 0.90971732, "learning_rate": 2.9133728790988868e-06, "loss": 0.93019044, "num_input_tokens_seen": 131678035, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.34301758, "step": 6130, "time_per_iteration": 2.6983652114868164 }, { "auxiliary_loss_clip": 0.01897998, "auxiliary_loss_mlp": 0.001831, "balance_loss_clip": 1.67588043, "balance_loss_mlp": 0.17012998, "epoch": 0.36861566210732, "flos": 65049417377280.0, "grad_norm": 0.8215704052041939, "language_loss": 0.60114229, "learning_rate": 2.913026385872321e-06, "loss": 0.62195325, "num_input_tokens_seen": 131742470, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.12988281, "step": 6131, "time_per_iteration": 3.2208354473114014 }, { "auxiliary_loss_clip": 0.0175852, "auxiliary_loss_mlp": 0.00310124, "balance_loss_clip": 1.43795156, "balance_loss_mlp": 0.27793741, "epoch": 0.36867578535998796, "flos": 30954495133440.0, "grad_norm": 1.7503215313541118, "language_loss": 0.80153954, "learning_rate": 2.9126798580237034e-06, "loss": 0.82222593, "num_input_tokens_seen": 131764570, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.32128906, "step": 6132, "time_per_iteration": 2.6898601055145264 }, { "auxiliary_loss_clip": 0.0174936, "auxiliary_loss_mlp": 0.00308445, "balance_loss_clip": 1.41876554, "balance_loss_mlp": 0.27373156, "epoch": 0.3687359086126559, "flos": 28837956182400.0, "grad_norm": 2.5584278386217303, "language_loss": 0.81444955, "learning_rate": 2.9123332955661736e-06, "loss": 0.83502758, "num_input_tokens_seen": 131785720, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.34692383, "step": 6133, "time_per_iteration": 2.6741058826446533 }, { "auxiliary_loss_clip": 0.01762654, "auxiliary_loss_mlp": 0.00351775, "balance_loss_clip": 1.44205284, "balance_loss_mlp": 0.31386608, "epoch": 0.3687960318653239, "flos": 21396798881280.0, "grad_norm": 4.773291754773896, "language_loss": 0.77258563, "learning_rate": 2.911986698512874e-06, "loss": 0.7937299, "num_input_tokens_seen": 131804430, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.37939453, "step": 6134, "time_per_iteration": 2.618229866027832 }, { "auxiliary_loss_clip": 0.01756156, "auxiliary_loss_mlp": 0.0033886, "balance_loss_clip": 1.43598557, "balance_loss_mlp": 0.30414668, "epoch": 0.36885615511799186, "flos": 20266043760000.0, "grad_norm": 5.488995254832189, "language_loss": 0.81492507, "learning_rate": 2.9116400668769477e-06, "loss": 0.83587521, "num_input_tokens_seen": 131822060, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.34667969, "step": 6135, "time_per_iteration": 2.638878583908081 }, { "auxiliary_loss_clip": 0.01918003, "auxiliary_loss_mlp": 0.00113538, "balance_loss_clip": 1.68269551, "balance_loss_mlp": 0.0999006, "epoch": 0.3689162783706599, "flos": 63088836301440.0, "grad_norm": 0.8152603977945808, "language_loss": 0.58329588, "learning_rate": 2.9112934006715376e-06, "loss": 0.60361129, "num_input_tokens_seen": 131880715, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.13671875, "step": 6136, "time_per_iteration": 3.1016945838928223 }, { "auxiliary_loss_clip": 0.0174602, "auxiliary_loss_mlp": 0.00332967, "balance_loss_clip": 1.42749131, "balance_loss_mlp": 0.29770535, "epoch": 0.36897640162332784, "flos": 10961984419200.0, "grad_norm": 3.5509601470041514, "language_loss": 0.86275268, "learning_rate": 2.9109466999097918e-06, "loss": 0.88354254, "num_input_tokens_seen": 131895850, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.35253906, "step": 6137, "time_per_iteration": 2.6219449043273926 }, { "auxiliary_loss_clip": 0.01774063, "auxiliary_loss_mlp": 0.00298483, "balance_loss_clip": 1.4493165, "balance_loss_mlp": 0.26376879, "epoch": 0.3690365248759958, "flos": 20704297599360.0, "grad_norm": 13.251590657794914, "language_loss": 0.81466615, "learning_rate": 2.9105999646048552e-06, "loss": 0.83539158, "num_input_tokens_seen": 131915775, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.34741211, "step": 6138, "time_per_iteration": 4.08752179145813 }, { "auxiliary_loss_clip": 0.01752022, "auxiliary_loss_mlp": 0.00320311, "balance_loss_clip": 1.42813492, "balance_loss_mlp": 0.28605026, "epoch": 0.3690966481286638, "flos": 31826369957760.0, "grad_norm": 27.626187106028116, "language_loss": 0.74382615, "learning_rate": 2.9102531947698764e-06, "loss": 0.76454949, "num_input_tokens_seen": 131935715, "router_z_loss_clip": 3.2421875, "router_z_loss_mlp": 0.34228516, "step": 6139, "time_per_iteration": 4.250002145767212 }, { "auxiliary_loss_clip": 0.01772901, "auxiliary_loss_mlp": 0.00304036, "balance_loss_clip": 1.45500207, "balance_loss_mlp": 0.27225491, "epoch": 0.36915677138133174, "flos": 13114936782720.0, "grad_norm": 2.911845193258864, "language_loss": 0.78942263, "learning_rate": 2.909906390418006e-06, "loss": 0.81019199, "num_input_tokens_seen": 131954120, "router_z_loss_clip": 3.1796875, "router_z_loss_mlp": 0.31787109, "step": 6140, "time_per_iteration": 2.660627603530884 }, { "auxiliary_loss_clip": 0.01933827, "auxiliary_loss_mlp": 0.00130663, "balance_loss_clip": 1.68699896, "balance_loss_mlp": 0.11550003, "epoch": 0.3692168946339997, "flos": 68686879956480.0, "grad_norm": 0.8013563662870526, "language_loss": 0.58813548, "learning_rate": 2.9095595515623934e-06, "loss": 0.60878038, "num_input_tokens_seen": 132017485, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.15136719, "step": 6141, "time_per_iteration": 3.247819423675537 }, { "auxiliary_loss_clip": 0.01764191, "auxiliary_loss_mlp": 0.0032418, "balance_loss_clip": 1.44484687, "balance_loss_mlp": 0.28867972, "epoch": 0.36927701788666767, "flos": 22017873968640.0, "grad_norm": 6.339878994636131, "language_loss": 0.81235325, "learning_rate": 2.909212678216192e-06, "loss": 0.83323693, "num_input_tokens_seen": 132036760, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.35498047, "step": 6142, "time_per_iteration": 2.6446585655212402 }, { "auxiliary_loss_clip": 0.01751588, "auxiliary_loss_mlp": 0.00301659, "balance_loss_clip": 1.43795538, "balance_loss_mlp": 0.26878121, "epoch": 0.36933714113933563, "flos": 21835591424640.0, "grad_norm": 17.251072900497764, "language_loss": 0.82707155, "learning_rate": 2.908865770392555e-06, "loss": 0.84760404, "num_input_tokens_seen": 132056935, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.32885742, "step": 6143, "time_per_iteration": 2.6618659496307373 }, { "auxiliary_loss_clip": 0.01749336, "auxiliary_loss_mlp": 0.00290607, "balance_loss_clip": 1.43761051, "balance_loss_mlp": 0.25672776, "epoch": 0.3693972643920036, "flos": 23691705793920.0, "grad_norm": 4.385277260438461, "language_loss": 0.88941097, "learning_rate": 2.9085188281046364e-06, "loss": 0.90981036, "num_input_tokens_seen": 132077285, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.33837891, "step": 6144, "time_per_iteration": 4.066816329956055 }, { "auxiliary_loss_clip": 0.01752516, "auxiliary_loss_mlp": 0.00304028, "balance_loss_clip": 1.43957043, "balance_loss_mlp": 0.27241322, "epoch": 0.36945738764467156, "flos": 22856747172480.0, "grad_norm": 8.011343558712973, "language_loss": 0.87254429, "learning_rate": 2.908171851365593e-06, "loss": 0.89310968, "num_input_tokens_seen": 132095520, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.31665039, "step": 6145, "time_per_iteration": 2.689098834991455 }, { "auxiliary_loss_clip": 0.01747047, "auxiliary_loss_mlp": 0.00308123, "balance_loss_clip": 1.43371511, "balance_loss_mlp": 0.27431512, "epoch": 0.36951751089733953, "flos": 16615939593600.0, "grad_norm": 3.2463909344333253, "language_loss": 0.85730052, "learning_rate": 2.9078248401885815e-06, "loss": 0.8778522, "num_input_tokens_seen": 132112810, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.33764648, "step": 6146, "time_per_iteration": 2.625507354736328 }, { "auxiliary_loss_clip": 0.01751913, "auxiliary_loss_mlp": 0.00308492, "balance_loss_clip": 1.43687212, "balance_loss_mlp": 0.27554274, "epoch": 0.3695776341500075, "flos": 18914545607040.0, "grad_norm": 46.67677066868991, "language_loss": 0.90241617, "learning_rate": 2.907477794586761e-06, "loss": 0.92302024, "num_input_tokens_seen": 132131615, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.3293457, "step": 6147, "time_per_iteration": 2.6742210388183594 }, { "auxiliary_loss_clip": 0.01757006, "auxiliary_loss_mlp": 0.00334457, "balance_loss_clip": 1.43992543, "balance_loss_mlp": 0.30129313, "epoch": 0.36963775740267546, "flos": 20808474019200.0, "grad_norm": 23.12108597144141, "language_loss": 0.90302539, "learning_rate": 2.9071307145732926e-06, "loss": 0.92394, "num_input_tokens_seen": 132149585, "router_z_loss_clip": 3.16992188, "router_z_loss_mlp": 0.33154297, "step": 6148, "time_per_iteration": 2.756828784942627 }, { "auxiliary_loss_clip": 0.01751927, "auxiliary_loss_mlp": 0.00289513, "balance_loss_clip": 1.44641066, "balance_loss_mlp": 0.25651574, "epoch": 0.3696978806553435, "flos": 26061881656320.0, "grad_norm": 27.66448168235438, "language_loss": 0.81981838, "learning_rate": 2.9067836001613357e-06, "loss": 0.84023273, "num_input_tokens_seen": 132165555, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.32983398, "step": 6149, "time_per_iteration": 2.7588748931884766 }, { "auxiliary_loss_clip": 0.01744027, "auxiliary_loss_mlp": 0.00315852, "balance_loss_clip": 1.43001151, "balance_loss_mlp": 0.27997014, "epoch": 0.36975800390801145, "flos": 26833925606400.0, "grad_norm": 55.72440744971489, "language_loss": 0.78007501, "learning_rate": 2.906436451364054e-06, "loss": 0.80067384, "num_input_tokens_seen": 132185100, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.35888672, "step": 6150, "time_per_iteration": 4.051138639450073 }, { "auxiliary_loss_clip": 0.01764409, "auxiliary_loss_mlp": 0.003234, "balance_loss_clip": 1.44772387, "balance_loss_mlp": 0.29276353, "epoch": 0.3698181271606794, "flos": 21142623265920.0, "grad_norm": 18.588103554450182, "language_loss": 0.88866532, "learning_rate": 2.906089268194611e-06, "loss": 0.9095434, "num_input_tokens_seen": 132203930, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.30639648, "step": 6151, "time_per_iteration": 2.639847993850708 }, { "auxiliary_loss_clip": 0.01879684, "auxiliary_loss_mlp": 0.00258936, "balance_loss_clip": 1.63921881, "balance_loss_mlp": 0.24758756, "epoch": 0.3698782504133474, "flos": 66742639568640.0, "grad_norm": 0.8913916021970856, "language_loss": 0.63081217, "learning_rate": 2.9057420506661726e-06, "loss": 0.65219837, "num_input_tokens_seen": 132263845, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.11328125, "step": 6152, "time_per_iteration": 3.2172608375549316 }, { "auxiliary_loss_clip": 0.01789433, "auxiliary_loss_mlp": 0.00310788, "balance_loss_clip": 1.48045397, "balance_loss_mlp": 0.28122383, "epoch": 0.36993837366601534, "flos": 24311523905280.0, "grad_norm": 13.27810593455708, "language_loss": 0.76209956, "learning_rate": 2.9053947987919044e-06, "loss": 0.7831018, "num_input_tokens_seen": 132282350, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.29589844, "step": 6153, "time_per_iteration": 2.6226654052734375 }, { "auxiliary_loss_clip": 0.01752663, "auxiliary_loss_mlp": 0.00328189, "balance_loss_clip": 1.43782234, "balance_loss_mlp": 0.29454845, "epoch": 0.3699984969186833, "flos": 24349194293760.0, "grad_norm": 19.958438161126388, "language_loss": 0.79401559, "learning_rate": 2.9050475125849755e-06, "loss": 0.8148241, "num_input_tokens_seen": 132301930, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.33666992, "step": 6154, "time_per_iteration": 2.6286425590515137 }, { "auxiliary_loss_clip": 0.01769702, "auxiliary_loss_mlp": 0.00325652, "balance_loss_clip": 1.45036948, "balance_loss_mlp": 0.29274982, "epoch": 0.37005862017135127, "flos": 19829154637440.0, "grad_norm": 2.3716957928688833, "language_loss": 0.75364166, "learning_rate": 2.9047001920585534e-06, "loss": 0.7745952, "num_input_tokens_seen": 132320915, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.32885742, "step": 6155, "time_per_iteration": 2.6308815479278564 }, { "auxiliary_loss_clip": 0.01754817, "auxiliary_loss_mlp": 0.00328395, "balance_loss_clip": 1.44308472, "balance_loss_mlp": 0.29749557, "epoch": 0.37011874342401924, "flos": 19573793873280.0, "grad_norm": 60.18763246504998, "language_loss": 0.75456631, "learning_rate": 2.9043528372258097e-06, "loss": 0.77539843, "num_input_tokens_seen": 132340415, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.30908203, "step": 6156, "time_per_iteration": 2.6738739013671875 }, { "auxiliary_loss_clip": 0.0175877, "auxiliary_loss_mlp": 0.00318529, "balance_loss_clip": 1.45406413, "balance_loss_mlp": 0.2875106, "epoch": 0.3701788666766872, "flos": 20374350243840.0, "grad_norm": 4.303218289626294, "language_loss": 0.8657515, "learning_rate": 2.904005448099916e-06, "loss": 0.88652444, "num_input_tokens_seen": 132358600, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.31018066, "step": 6157, "time_per_iteration": 2.6034724712371826 }, { "auxiliary_loss_clip": 0.01763096, "auxiliary_loss_mlp": 0.00380435, "balance_loss_clip": 1.43685758, "balance_loss_mlp": 0.34452897, "epoch": 0.37023898992935517, "flos": 15340931452800.0, "grad_norm": 14.637838314534807, "language_loss": 0.87401319, "learning_rate": 2.9036580246940444e-06, "loss": 0.89544845, "num_input_tokens_seen": 132373160, "router_z_loss_clip": 3.26367188, "router_z_loss_mlp": 0.35913086, "step": 6158, "time_per_iteration": 2.6095242500305176 }, { "auxiliary_loss_clip": 0.01740722, "auxiliary_loss_mlp": 0.00357847, "balance_loss_clip": 1.42744637, "balance_loss_mlp": 0.3245998, "epoch": 0.37029911318202313, "flos": 19573937527680.0, "grad_norm": 22.97231569900213, "language_loss": 0.78225696, "learning_rate": 2.9033105670213708e-06, "loss": 0.80324268, "num_input_tokens_seen": 132392345, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.33239746, "step": 6159, "time_per_iteration": 2.640918254852295 }, { "auxiliary_loss_clip": 0.01746209, "auxiliary_loss_mlp": 0.00341356, "balance_loss_clip": 1.43796206, "balance_loss_mlp": 0.30988428, "epoch": 0.3703592364346911, "flos": 26213353309440.0, "grad_norm": 162.18991399203838, "language_loss": 0.78623044, "learning_rate": 2.9029630750950697e-06, "loss": 0.80710614, "num_input_tokens_seen": 132412620, "router_z_loss_clip": 3.07617188, "router_z_loss_mlp": 0.31469727, "step": 6160, "time_per_iteration": 2.7621724605560303 }, { "auxiliary_loss_clip": 0.01745502, "auxiliary_loss_mlp": 0.00350425, "balance_loss_clip": 1.43910503, "balance_loss_mlp": 0.32031286, "epoch": 0.37041935968735906, "flos": 20048317470720.0, "grad_norm": 16.32692762629393, "language_loss": 0.84679055, "learning_rate": 2.9026155489283176e-06, "loss": 0.86774981, "num_input_tokens_seen": 132431570, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.30126953, "step": 6161, "time_per_iteration": 2.6703407764434814 }, { "auxiliary_loss_clip": 0.01762708, "auxiliary_loss_mlp": 0.00367626, "balance_loss_clip": 1.44721127, "balance_loss_mlp": 0.33365148, "epoch": 0.3704794829400271, "flos": 24133802388480.0, "grad_norm": 12.98149532345473, "language_loss": 0.84822619, "learning_rate": 2.902267988534295e-06, "loss": 0.8695296, "num_input_tokens_seen": 132451525, "router_z_loss_clip": 3.15429688, "router_z_loss_mlp": 0.33984375, "step": 6162, "time_per_iteration": 2.693572759628296 }, { "auxiliary_loss_clip": 0.01747583, "auxiliary_loss_mlp": 0.00379241, "balance_loss_clip": 1.43327248, "balance_loss_mlp": 0.34662554, "epoch": 0.37053960619269505, "flos": 14866874732160.0, "grad_norm": 9.534158556911878, "language_loss": 0.87700605, "learning_rate": 2.9019203939261783e-06, "loss": 0.8982743, "num_input_tokens_seen": 132469875, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.32629395, "step": 6163, "time_per_iteration": 2.6321518421173096 }, { "auxiliary_loss_clip": 0.01772637, "auxiliary_loss_mlp": 0.00388557, "balance_loss_clip": 1.45221591, "balance_loss_mlp": 0.35594136, "epoch": 0.370599729445363, "flos": 21361498790400.0, "grad_norm": 7.75312584962961, "language_loss": 0.75406301, "learning_rate": 2.9015727651171507e-06, "loss": 0.77567494, "num_input_tokens_seen": 132488360, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.32666016, "step": 6164, "time_per_iteration": 2.65812611579895 }, { "auxiliary_loss_clip": 0.01750714, "auxiliary_loss_mlp": 0.00389768, "balance_loss_clip": 1.435709, "balance_loss_mlp": 0.35464942, "epoch": 0.370659852698031, "flos": 26829041356800.0, "grad_norm": 90.83827626853744, "language_loss": 0.90762895, "learning_rate": 2.9012251021203935e-06, "loss": 0.92903382, "num_input_tokens_seen": 132508630, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.35107422, "step": 6165, "time_per_iteration": 2.818296432495117 }, { "auxiliary_loss_clip": 0.01761991, "auxiliary_loss_mlp": 0.00393814, "balance_loss_clip": 1.43625593, "balance_loss_mlp": 0.35776511, "epoch": 0.37071997595069894, "flos": 19099018880640.0, "grad_norm": 76.33734557143282, "language_loss": 0.77548897, "learning_rate": 2.9008774049490896e-06, "loss": 0.79704702, "num_input_tokens_seen": 132527465, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.3605957, "step": 6166, "time_per_iteration": 2.840646982192993 }, { "auxiliary_loss_clip": 0.01790764, "auxiliary_loss_mlp": 0.00182561, "balance_loss_clip": 1.558743, "balance_loss_mlp": 0.17397824, "epoch": 0.3707800992033669, "flos": 52178384920320.0, "grad_norm": 0.832023837952891, "language_loss": 0.55362785, "learning_rate": 2.9005296736164244e-06, "loss": 0.5733611, "num_input_tokens_seen": 132579940, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.0859375, "step": 6167, "time_per_iteration": 3.0273876190185547 }, { "auxiliary_loss_clip": 0.01767227, "auxiliary_loss_mlp": 0.0045365, "balance_loss_clip": 1.44164014, "balance_loss_mlp": 0.41524038, "epoch": 0.3708402224560349, "flos": 19901837808000.0, "grad_norm": 9.85019736137254, "language_loss": 0.8303836, "learning_rate": 2.900181908135584e-06, "loss": 0.85259235, "num_input_tokens_seen": 132598390, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.38452148, "step": 6168, "time_per_iteration": 2.821683883666992 }, { "auxiliary_loss_clip": 0.01775421, "auxiliary_loss_mlp": 0.00361162, "balance_loss_clip": 1.44361448, "balance_loss_mlp": 0.32764056, "epoch": 0.37090034570870284, "flos": 20007630339840.0, "grad_norm": 328.82768878188125, "language_loss": 0.80381024, "learning_rate": 2.899834108519755e-06, "loss": 0.82517606, "num_input_tokens_seen": 132616920, "router_z_loss_clip": 3.31835938, "router_z_loss_mlp": 0.33496094, "step": 6169, "time_per_iteration": 2.8078439235687256 }, { "auxiliary_loss_clip": 0.0178242, "auxiliary_loss_mlp": 0.00374225, "balance_loss_clip": 1.44995713, "balance_loss_mlp": 0.33870089, "epoch": 0.3709604689613708, "flos": 24134700228480.0, "grad_norm": 37.54496928817693, "language_loss": 0.85182667, "learning_rate": 2.899486274782127e-06, "loss": 0.87339312, "num_input_tokens_seen": 132637660, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.35522461, "step": 6170, "time_per_iteration": 2.681673288345337 }, { "auxiliary_loss_clip": 0.01772013, "auxiliary_loss_mlp": 0.00383421, "balance_loss_clip": 1.44255781, "balance_loss_mlp": 0.34799248, "epoch": 0.37102059221403877, "flos": 23876071326720.0, "grad_norm": 10.036151389541871, "language_loss": 0.82856262, "learning_rate": 2.8991384069358885e-06, "loss": 0.85011697, "num_input_tokens_seen": 132657635, "router_z_loss_clip": 3.29296875, "router_z_loss_mlp": 0.35424805, "step": 6171, "time_per_iteration": 2.684309959411621 }, { "auxiliary_loss_clip": 0.017816, "auxiliary_loss_mlp": 0.00374201, "balance_loss_clip": 1.44709361, "balance_loss_mlp": 0.33653051, "epoch": 0.37108071546670673, "flos": 14501268149760.0, "grad_norm": 2.831526408845845, "language_loss": 0.87951833, "learning_rate": 2.898790504994232e-06, "loss": 0.90107632, "num_input_tokens_seen": 132674455, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.37695312, "step": 6172, "time_per_iteration": 2.61558198928833 }, { "auxiliary_loss_clip": 0.01763208, "auxiliary_loss_mlp": 0.00385546, "balance_loss_clip": 1.42823839, "balance_loss_mlp": 0.34673142, "epoch": 0.3711408387193747, "flos": 34562619279360.0, "grad_norm": 11.103513257936944, "language_loss": 0.66689932, "learning_rate": 2.89844256897035e-06, "loss": 0.68838686, "num_input_tokens_seen": 132695140, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.38842773, "step": 6173, "time_per_iteration": 2.7604682445526123 }, { "auxiliary_loss_clip": 0.01761967, "auxiliary_loss_mlp": 0.00352768, "balance_loss_clip": 1.42835116, "balance_loss_mlp": 0.31652817, "epoch": 0.37120096197204266, "flos": 17310703432320.0, "grad_norm": 4.262962883782642, "language_loss": 0.87597746, "learning_rate": 2.898094598877435e-06, "loss": 0.89712477, "num_input_tokens_seen": 132712470, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.36254883, "step": 6174, "time_per_iteration": 2.6347172260284424 }, { "auxiliary_loss_clip": 0.01784005, "auxiliary_loss_mlp": 0.00373251, "balance_loss_clip": 1.44893241, "balance_loss_mlp": 0.33779779, "epoch": 0.37126108522471063, "flos": 30664049760000.0, "grad_norm": 5.770911542686874, "language_loss": 0.87380517, "learning_rate": 2.8977465947286826e-06, "loss": 0.89537764, "num_input_tokens_seen": 132732945, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.35449219, "step": 6175, "time_per_iteration": 2.7404541969299316 }, { "auxiliary_loss_clip": 0.017674, "auxiliary_loss_mlp": 0.003715, "balance_loss_clip": 1.43736863, "balance_loss_mlp": 0.33418775, "epoch": 0.37132120847737865, "flos": 25155640494720.0, "grad_norm": 57.764205605973174, "language_loss": 0.94989121, "learning_rate": 2.89739855653729e-06, "loss": 0.97128022, "num_input_tokens_seen": 132752470, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.37304688, "step": 6176, "time_per_iteration": 2.6623072624206543 }, { "auxiliary_loss_clip": 0.01758686, "auxiliary_loss_mlp": 0.00370177, "balance_loss_clip": 1.42801249, "balance_loss_mlp": 0.3329834, "epoch": 0.3713813317300466, "flos": 21213474842880.0, "grad_norm": 7.956622477997687, "language_loss": 0.79159445, "learning_rate": 2.8970504843164546e-06, "loss": 0.81288302, "num_input_tokens_seen": 132771485, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.37158203, "step": 6177, "time_per_iteration": 2.640137195587158 }, { "auxiliary_loss_clip": 0.01752415, "auxiliary_loss_mlp": 0.00345109, "balance_loss_clip": 1.41648507, "balance_loss_mlp": 0.30937022, "epoch": 0.3714414549827146, "flos": 21616644072960.0, "grad_norm": 3.8615859378096955, "language_loss": 0.83767933, "learning_rate": 2.896702378079374e-06, "loss": 0.85865462, "num_input_tokens_seen": 132791465, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.35717773, "step": 6178, "time_per_iteration": 2.614762306213379 }, { "auxiliary_loss_clip": 0.01765294, "auxiliary_loss_mlp": 0.00342274, "balance_loss_clip": 1.42872238, "balance_loss_mlp": 0.30653492, "epoch": 0.37150157823538255, "flos": 19972294335360.0, "grad_norm": 6.028753697064996, "language_loss": 0.78684485, "learning_rate": 2.8963542378392502e-06, "loss": 0.80792046, "num_input_tokens_seen": 132810160, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.35742188, "step": 6179, "time_per_iteration": 2.655958414077759 }, { "auxiliary_loss_clip": 0.01738338, "auxiliary_loss_mlp": 0.00344814, "balance_loss_clip": 1.40612221, "balance_loss_mlp": 0.30843115, "epoch": 0.3715617014880505, "flos": 24860562266880.0, "grad_norm": 11.325527399744889, "language_loss": 0.76684862, "learning_rate": 2.896006063609283e-06, "loss": 0.78768015, "num_input_tokens_seen": 132831265, "router_z_loss_clip": 3.32617188, "router_z_loss_mlp": 0.36376953, "step": 6180, "time_per_iteration": 2.706688404083252 }, { "auxiliary_loss_clip": 0.01771479, "auxiliary_loss_mlp": 0.0033329, "balance_loss_clip": 1.43146563, "balance_loss_mlp": 0.29795685, "epoch": 0.3716218247407185, "flos": 20449080489600.0, "grad_norm": 8.397526022519594, "language_loss": 0.82797611, "learning_rate": 2.8956578554026767e-06, "loss": 0.84902376, "num_input_tokens_seen": 132850005, "router_z_loss_clip": 3.39648438, "router_z_loss_mlp": 0.35351562, "step": 6181, "time_per_iteration": 4.1642632484436035 }, { "auxiliary_loss_clip": 0.01750873, "auxiliary_loss_mlp": 0.00340412, "balance_loss_clip": 1.41550696, "balance_loss_mlp": 0.3030276, "epoch": 0.37168194799338644, "flos": 24133479166080.0, "grad_norm": 3.895181375171825, "language_loss": 0.85404336, "learning_rate": 2.8953096132326343e-06, "loss": 0.87495625, "num_input_tokens_seen": 132865790, "router_z_loss_clip": 3.3515625, "router_z_loss_mlp": 0.3737793, "step": 6182, "time_per_iteration": 4.10425329208374 }, { "auxiliary_loss_clip": 0.01631747, "auxiliary_loss_mlp": 0.00101038, "balance_loss_clip": 1.43830562, "balance_loss_mlp": 0.09307515, "epoch": 0.3717420712460544, "flos": 67408926900480.0, "grad_norm": 0.7949177884181257, "language_loss": 0.57155257, "learning_rate": 2.894961337112362e-06, "loss": 0.58888042, "num_input_tokens_seen": 132921775, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.07958984, "step": 6183, "time_per_iteration": 3.138582468032837 }, { "auxiliary_loss_clip": 0.01751125, "auxiliary_loss_mlp": 0.00373311, "balance_loss_clip": 1.40011287, "balance_loss_mlp": 0.33306611, "epoch": 0.37180219449872237, "flos": 22376908362240.0, "grad_norm": 37.025864110619146, "language_loss": 0.83227932, "learning_rate": 2.894613027055066e-06, "loss": 0.85352373, "num_input_tokens_seen": 132941060, "router_z_loss_clip": 3.5078125, "router_z_loss_mlp": 0.40234375, "step": 6184, "time_per_iteration": 2.6824100017547607 }, { "auxiliary_loss_clip": 0.01756492, "auxiliary_loss_mlp": 0.00332097, "balance_loss_clip": 1.41195631, "balance_loss_mlp": 0.2960957, "epoch": 0.37186231775139034, "flos": 21869885934720.0, "grad_norm": 165.15424316820702, "language_loss": 0.7998184, "learning_rate": 2.894264683073954e-06, "loss": 0.82070428, "num_input_tokens_seen": 132961850, "router_z_loss_clip": 3.4453125, "router_z_loss_mlp": 0.35986328, "step": 6185, "time_per_iteration": 2.731783866882324 }, { "auxiliary_loss_clip": 0.01772659, "auxiliary_loss_mlp": 0.00336292, "balance_loss_clip": 1.42674398, "balance_loss_mlp": 0.30040956, "epoch": 0.3719224410040583, "flos": 22415225195520.0, "grad_norm": 6.4054323583099215, "language_loss": 0.8334865, "learning_rate": 2.8939163051822363e-06, "loss": 0.85457599, "num_input_tokens_seen": 132981625, "router_z_loss_clip": 3.45898438, "router_z_loss_mlp": 0.35888672, "step": 6186, "time_per_iteration": 4.179441928863525 }, { "auxiliary_loss_clip": 0.01756799, "auxiliary_loss_mlp": 0.0036431, "balance_loss_clip": 1.40731311, "balance_loss_mlp": 0.3264249, "epoch": 0.37198256425672627, "flos": 25151223121920.0, "grad_norm": 103.57731824628544, "language_loss": 0.91585267, "learning_rate": 2.8935678933931224e-06, "loss": 0.93706375, "num_input_tokens_seen": 133001225, "router_z_loss_clip": 3.49023438, "router_z_loss_mlp": 0.37841797, "step": 6187, "time_per_iteration": 2.707345962524414 }, { "auxiliary_loss_clip": 0.01740387, "auxiliary_loss_mlp": 0.00319625, "balance_loss_clip": 1.40747344, "balance_loss_mlp": 0.28333762, "epoch": 0.37204268750939423, "flos": 21138313633920.0, "grad_norm": 3.5221536598295806, "language_loss": 0.89998674, "learning_rate": 2.893219447719824e-06, "loss": 0.92058682, "num_input_tokens_seen": 133018820, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.36279297, "step": 6188, "time_per_iteration": 2.812798500061035 }, { "auxiliary_loss_clip": 0.01728977, "auxiliary_loss_mlp": 0.0034052, "balance_loss_clip": 1.39351559, "balance_loss_mlp": 0.30339789, "epoch": 0.37210281076206225, "flos": 21506829217920.0, "grad_norm": 6.470280606006512, "language_loss": 0.72186929, "learning_rate": 2.8928709681755548e-06, "loss": 0.7425642, "num_input_tokens_seen": 133040205, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.37158203, "step": 6189, "time_per_iteration": 2.701035976409912 }, { "auxiliary_loss_clip": 0.01751875, "auxiliary_loss_mlp": 0.00343385, "balance_loss_clip": 1.41285872, "balance_loss_mlp": 0.30781281, "epoch": 0.3721629340147302, "flos": 17347835116800.0, "grad_norm": 5.918363596605893, "language_loss": 0.91386658, "learning_rate": 2.8925224547735293e-06, "loss": 0.93481922, "num_input_tokens_seen": 133058095, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.35595703, "step": 6190, "time_per_iteration": 2.628676652908325 }, { "auxiliary_loss_clip": 0.0175073, "auxiliary_loss_mlp": 0.00363262, "balance_loss_clip": 1.40010583, "balance_loss_mlp": 0.32344574, "epoch": 0.3722230572673982, "flos": 16432400073600.0, "grad_norm": 14.869174185778954, "language_loss": 0.99507302, "learning_rate": 2.8921739075269633e-06, "loss": 1.01621282, "num_input_tokens_seen": 133071530, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.39819336, "step": 6191, "time_per_iteration": 2.6085667610168457 }, { "auxiliary_loss_clip": 0.01732405, "auxiliary_loss_mlp": 0.00360975, "balance_loss_clip": 1.38886809, "balance_loss_mlp": 0.32027724, "epoch": 0.37228318052006615, "flos": 22674716023680.0, "grad_norm": 28.059415817560293, "language_loss": 0.8035543, "learning_rate": 2.891825326449073e-06, "loss": 0.82448804, "num_input_tokens_seen": 133091410, "router_z_loss_clip": 3.4296875, "router_z_loss_mlp": 0.40698242, "step": 6192, "time_per_iteration": 2.8310558795928955 }, { "auxiliary_loss_clip": 0.01735686, "auxiliary_loss_mlp": 0.00386705, "balance_loss_clip": 1.39667344, "balance_loss_mlp": 0.34798574, "epoch": 0.3723433037727341, "flos": 25265491263360.0, "grad_norm": 60.21299109620226, "language_loss": 0.89144921, "learning_rate": 2.8914767115530766e-06, "loss": 0.91267312, "num_input_tokens_seen": 133110365, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.38720703, "step": 6193, "time_per_iteration": 4.088541269302368 }, { "auxiliary_loss_clip": 0.01726472, "auxiliary_loss_mlp": 0.00394257, "balance_loss_clip": 1.38630021, "balance_loss_mlp": 0.35436979, "epoch": 0.3724034270254021, "flos": 10524664333440.0, "grad_norm": 26.782036437731833, "language_loss": 0.93332297, "learning_rate": 2.891128062852194e-06, "loss": 0.9545303, "num_input_tokens_seen": 133128255, "router_z_loss_clip": 3.40429688, "router_z_loss_mlp": 0.39868164, "step": 6194, "time_per_iteration": 2.624647378921509 }, { "auxiliary_loss_clip": 0.01748852, "auxiliary_loss_mlp": 0.00348914, "balance_loss_clip": 1.40744138, "balance_loss_mlp": 0.31310368, "epoch": 0.37246355027807004, "flos": 20266223328000.0, "grad_norm": 23.296579631366935, "language_loss": 0.86816168, "learning_rate": 2.890779380359646e-06, "loss": 0.88913941, "num_input_tokens_seen": 133143975, "router_z_loss_clip": 3.41210938, "router_z_loss_mlp": 0.35766602, "step": 6195, "time_per_iteration": 2.640249013900757 }, { "auxiliary_loss_clip": 0.01726316, "auxiliary_loss_mlp": 0.00361546, "balance_loss_clip": 1.39563477, "balance_loss_mlp": 0.32490095, "epoch": 0.372523673530738, "flos": 19500571998720.0, "grad_norm": 4.443975525287679, "language_loss": 0.84632063, "learning_rate": 2.890430664088655e-06, "loss": 0.8671993, "num_input_tokens_seen": 133162935, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.36645508, "step": 6196, "time_per_iteration": 2.638850688934326 }, { "auxiliary_loss_clip": 0.01741795, "auxiliary_loss_mlp": 0.0034721, "balance_loss_clip": 1.40622759, "balance_loss_mlp": 0.31163746, "epoch": 0.372583796783406, "flos": 16764250849920.0, "grad_norm": 169.95378587711426, "language_loss": 0.92950213, "learning_rate": 2.890081914052443e-06, "loss": 0.95039213, "num_input_tokens_seen": 133181180, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.35546875, "step": 6197, "time_per_iteration": 2.701787233352661 }, { "auxiliary_loss_clip": 0.01727405, "auxiliary_loss_mlp": 0.00357686, "balance_loss_clip": 1.39933681, "balance_loss_mlp": 0.32077876, "epoch": 0.37264392003607394, "flos": 22637979388800.0, "grad_norm": 7.148352252324941, "language_loss": 0.72934955, "learning_rate": 2.889733130264237e-06, "loss": 0.75020045, "num_input_tokens_seen": 133199615, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.36889648, "step": 6198, "time_per_iteration": 2.7098886966705322 }, { "auxiliary_loss_clip": 0.01726346, "auxiliary_loss_mlp": 0.00310373, "balance_loss_clip": 1.39754152, "balance_loss_mlp": 0.27444309, "epoch": 0.3727040432887419, "flos": 19973120348160.0, "grad_norm": 844.2562329934984, "language_loss": 0.80675435, "learning_rate": 2.889384312737261e-06, "loss": 0.8271215, "num_input_tokens_seen": 133219650, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.35913086, "step": 6199, "time_per_iteration": 2.7864387035369873 }, { "auxiliary_loss_clip": 0.01751193, "auxiliary_loss_mlp": 0.00341318, "balance_loss_clip": 1.41590834, "balance_loss_mlp": 0.30557936, "epoch": 0.37276416654140987, "flos": 63899122279680.0, "grad_norm": 11.68775424684546, "language_loss": 0.87785619, "learning_rate": 2.889035461484742e-06, "loss": 0.8987813, "num_input_tokens_seen": 133245675, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.35742188, "step": 6200, "time_per_iteration": 3.062654495239258 }, { "auxiliary_loss_clip": 0.01730776, "auxiliary_loss_mlp": 0.00307638, "balance_loss_clip": 1.39593911, "balance_loss_mlp": 0.27006292, "epoch": 0.37282428979407783, "flos": 39785970211200.0, "grad_norm": 5.216280328096078, "language_loss": 0.66756642, "learning_rate": 2.88868657651991e-06, "loss": 0.68795061, "num_input_tokens_seen": 133266905, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.37573242, "step": 6201, "time_per_iteration": 2.8229570388793945 }, { "auxiliary_loss_clip": 0.01720933, "auxiliary_loss_mlp": 0.00349912, "balance_loss_clip": 1.38732576, "balance_loss_mlp": 0.3128618, "epoch": 0.37288441304674586, "flos": 22709046447360.0, "grad_norm": 6.938724169525751, "language_loss": 0.80283368, "learning_rate": 2.8883376578559934e-06, "loss": 0.82354218, "num_input_tokens_seen": 133286865, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.37036133, "step": 6202, "time_per_iteration": 2.6978414058685303 }, { "auxiliary_loss_clip": 0.01727644, "auxiliary_loss_mlp": 0.00326534, "balance_loss_clip": 1.39674366, "balance_loss_mlp": 0.28946, "epoch": 0.3729445362994138, "flos": 18770292587520.0, "grad_norm": 1163.5513950143343, "language_loss": 0.80674016, "learning_rate": 2.8879887055062243e-06, "loss": 0.82728195, "num_input_tokens_seen": 133305295, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.37084961, "step": 6203, "time_per_iteration": 2.678176164627075 }, { "auxiliary_loss_clip": 0.0171213, "auxiliary_loss_mlp": 0.00322028, "balance_loss_clip": 1.38334107, "balance_loss_mlp": 0.28638479, "epoch": 0.3730046595520818, "flos": 22456199635200.0, "grad_norm": 159.8452322186409, "language_loss": 0.860587, "learning_rate": 2.8876397194838353e-06, "loss": 0.88092858, "num_input_tokens_seen": 133324625, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.35620117, "step": 6204, "time_per_iteration": 2.695551633834839 }, { "auxiliary_loss_clip": 0.01704644, "auxiliary_loss_mlp": 0.00295229, "balance_loss_clip": 1.37738121, "balance_loss_mlp": 0.26034814, "epoch": 0.37306478280474975, "flos": 24316372241280.0, "grad_norm": 1.9237567676402594, "language_loss": 0.81729567, "learning_rate": 2.8872906998020577e-06, "loss": 0.83729434, "num_input_tokens_seen": 133344625, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.34838867, "step": 6205, "time_per_iteration": 2.6962597370147705 }, { "auxiliary_loss_clip": 0.01708088, "auxiliary_loss_mlp": 0.00332416, "balance_loss_clip": 1.38637316, "balance_loss_mlp": 0.29415029, "epoch": 0.3731249060574177, "flos": 15815167741440.0, "grad_norm": 124.67974357147908, "language_loss": 0.87893176, "learning_rate": 2.886941646474128e-06, "loss": 0.89933681, "num_input_tokens_seen": 133363605, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.38256836, "step": 6206, "time_per_iteration": 2.623389959335327 }, { "auxiliary_loss_clip": 0.01700995, "auxiliary_loss_mlp": 0.00294829, "balance_loss_clip": 1.38045382, "balance_loss_mlp": 0.25718307, "epoch": 0.3731850293100857, "flos": 19828077229440.0, "grad_norm": 21.299436800490437, "language_loss": 0.99193847, "learning_rate": 2.886592559513283e-06, "loss": 1.01189685, "num_input_tokens_seen": 133379405, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.3762207, "step": 6207, "time_per_iteration": 2.643613338470459 }, { "auxiliary_loss_clip": 0.0165991, "auxiliary_loss_mlp": 0.00277675, "balance_loss_clip": 1.34810495, "balance_loss_mlp": 0.24141175, "epoch": 0.37324515256275365, "flos": 19062354072960.0, "grad_norm": 83.53901267553387, "language_loss": 0.92375898, "learning_rate": 2.886243438932759e-06, "loss": 0.94313478, "num_input_tokens_seen": 133397585, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.36254883, "step": 6208, "time_per_iteration": 2.6359732151031494 }, { "auxiliary_loss_clip": 0.01673946, "auxiliary_loss_mlp": 0.00315116, "balance_loss_clip": 1.36272693, "balance_loss_mlp": 0.2784715, "epoch": 0.3733052758154216, "flos": 20704333512960.0, "grad_norm": 19.93076828162548, "language_loss": 0.80820835, "learning_rate": 2.8858942847457953e-06, "loss": 0.82809901, "num_input_tokens_seen": 133415365, "router_z_loss_clip": 3.11132812, "router_z_loss_mlp": 0.36645508, "step": 6209, "time_per_iteration": 2.6833019256591797 }, { "auxiliary_loss_clip": 0.01696813, "auxiliary_loss_mlp": 0.00301319, "balance_loss_clip": 1.37629461, "balance_loss_mlp": 0.26522291, "epoch": 0.3733653990680896, "flos": 20193504243840.0, "grad_norm": 1469.0156638428725, "language_loss": 0.79311258, "learning_rate": 2.8855450969656305e-06, "loss": 0.8130939, "num_input_tokens_seen": 133435700, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.36132812, "step": 6210, "time_per_iteration": 2.698287010192871 }, { "auxiliary_loss_clip": 0.01704777, "auxiliary_loss_mlp": 0.00312599, "balance_loss_clip": 1.38486719, "balance_loss_mlp": 0.27438077, "epoch": 0.37342552232075754, "flos": 20339660684160.0, "grad_norm": 75.2770940526914, "language_loss": 0.84696579, "learning_rate": 2.8851958756055073e-06, "loss": 0.86713958, "num_input_tokens_seen": 133455180, "router_z_loss_clip": 3.1953125, "router_z_loss_mlp": 0.38183594, "step": 6211, "time_per_iteration": 2.790957450866699 }, { "auxiliary_loss_clip": 0.01707597, "auxiliary_loss_mlp": 0.00317616, "balance_loss_clip": 1.38144886, "balance_loss_mlp": 0.28142402, "epoch": 0.3734856455734255, "flos": 35517879527040.0, "grad_norm": 46.80030827966551, "language_loss": 0.81229854, "learning_rate": 2.884846620678668e-06, "loss": 0.83255064, "num_input_tokens_seen": 133476715, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.36181641, "step": 6212, "time_per_iteration": 2.947744607925415 }, { "auxiliary_loss_clip": 0.01702778, "auxiliary_loss_mlp": 0.00350404, "balance_loss_clip": 1.37576818, "balance_loss_mlp": 0.31175661, "epoch": 0.37354576882609347, "flos": 21142300043520.0, "grad_norm": 102.29957469427833, "language_loss": 0.88364637, "learning_rate": 2.884497332198356e-06, "loss": 0.9041782, "num_input_tokens_seen": 133494550, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.38623047, "step": 6213, "time_per_iteration": 2.6485490798950195 }, { "auxiliary_loss_clip": 0.01744939, "auxiliary_loss_mlp": 0.00321103, "balance_loss_clip": 1.41530049, "balance_loss_mlp": 0.28307498, "epoch": 0.37360589207876144, "flos": 21506793304320.0, "grad_norm": 5.397452728992794, "language_loss": 0.86660707, "learning_rate": 2.8841480101778167e-06, "loss": 0.88726747, "num_input_tokens_seen": 133512640, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.38012695, "step": 6214, "time_per_iteration": 2.6463186740875244 }, { "auxiliary_loss_clip": 0.01770692, "auxiliary_loss_mlp": 0.00292075, "balance_loss_clip": 1.43599665, "balance_loss_mlp": 0.25609717, "epoch": 0.37366601533142946, "flos": 38435800861440.0, "grad_norm": 12.82699136741613, "language_loss": 0.9174602, "learning_rate": 2.883798654630296e-06, "loss": 0.93808794, "num_input_tokens_seen": 133535540, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.36010742, "step": 6215, "time_per_iteration": 2.8581056594848633 }, { "auxiliary_loss_clip": 0.01767993, "auxiliary_loss_mlp": 0.00315197, "balance_loss_clip": 1.4239552, "balance_loss_mlp": 0.27733612, "epoch": 0.3737261385840974, "flos": 18441171244800.0, "grad_norm": 329.43872232355596, "language_loss": 0.75696039, "learning_rate": 2.8834492655690423e-06, "loss": 0.77779227, "num_input_tokens_seen": 133555795, "router_z_loss_clip": 3.44140625, "router_z_loss_mlp": 0.37890625, "step": 6216, "time_per_iteration": 2.673511266708374 }, { "auxiliary_loss_clip": 0.01775131, "auxiliary_loss_mlp": 0.00287296, "balance_loss_clip": 1.436131, "balance_loss_mlp": 0.25129485, "epoch": 0.3737862618367654, "flos": 22929861306240.0, "grad_norm": 535.9028297087336, "language_loss": 0.76544935, "learning_rate": 2.883099843007303e-06, "loss": 0.78607357, "num_input_tokens_seen": 133575905, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.35986328, "step": 6217, "time_per_iteration": 2.6698174476623535 }, { "auxiliary_loss_clip": 0.0179696, "auxiliary_loss_mlp": 0.00301545, "balance_loss_clip": 1.44641304, "balance_loss_mlp": 0.26246798, "epoch": 0.37384638508943335, "flos": 15409664127360.0, "grad_norm": 100.86209809413465, "language_loss": 0.8634932, "learning_rate": 2.88275038695833e-06, "loss": 0.88447827, "num_input_tokens_seen": 133592585, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.39086914, "step": 6218, "time_per_iteration": 2.6003971099853516 }, { "auxiliary_loss_clip": 0.01809767, "auxiliary_loss_mlp": 0.00314998, "balance_loss_clip": 1.45789826, "balance_loss_mlp": 0.27759027, "epoch": 0.3739065083421013, "flos": 24280820755200.0, "grad_norm": 12.700535646905603, "language_loss": 0.84185207, "learning_rate": 2.8824008974353736e-06, "loss": 0.86309969, "num_input_tokens_seen": 133615070, "router_z_loss_clip": 3.5234375, "router_z_loss_mlp": 0.37402344, "step": 6219, "time_per_iteration": 2.7263286113739014 }, { "auxiliary_loss_clip": 0.01852811, "auxiliary_loss_mlp": 0.00321785, "balance_loss_clip": 1.48415279, "balance_loss_mlp": 0.28430516, "epoch": 0.3739666315947693, "flos": 23002831785600.0, "grad_norm": 77.72955449991898, "language_loss": 0.83332592, "learning_rate": 2.8820513744516866e-06, "loss": 0.8550719, "num_input_tokens_seen": 133633490, "router_z_loss_clip": 3.6875, "router_z_loss_mlp": 0.375, "step": 6220, "time_per_iteration": 2.629988193511963 }, { "auxiliary_loss_clip": 0.01864375, "auxiliary_loss_mlp": 0.00346924, "balance_loss_clip": 1.48142207, "balance_loss_mlp": 0.30458063, "epoch": 0.37402675484743725, "flos": 19391116279680.0, "grad_norm": 2.7785801449467717, "language_loss": 0.89500743, "learning_rate": 2.8817018180205235e-06, "loss": 0.91712046, "num_input_tokens_seen": 133653425, "router_z_loss_clip": 3.828125, "router_z_loss_mlp": 0.4230957, "step": 6221, "time_per_iteration": 2.6810359954833984 }, { "auxiliary_loss_clip": 0.01874178, "auxiliary_loss_mlp": 0.00330632, "balance_loss_clip": 1.49085593, "balance_loss_mlp": 0.29458317, "epoch": 0.3740868781001052, "flos": 17126158331520.0, "grad_norm": 3.156957643583962, "language_loss": 0.82266426, "learning_rate": 2.8813522281551387e-06, "loss": 0.84471238, "num_input_tokens_seen": 133670220, "router_z_loss_clip": 3.83203125, "router_z_loss_mlp": 0.36035156, "step": 6222, "time_per_iteration": 2.697625160217285 }, { "auxiliary_loss_clip": 0.01889468, "auxiliary_loss_mlp": 0.00333942, "balance_loss_clip": 1.50639224, "balance_loss_mlp": 0.29553324, "epoch": 0.3741470013527732, "flos": 20043505048320.0, "grad_norm": 6.583884574919026, "language_loss": 0.77513599, "learning_rate": 2.881002604868789e-06, "loss": 0.79737008, "num_input_tokens_seen": 133688910, "router_z_loss_clip": 3.83203125, "router_z_loss_mlp": 0.3840332, "step": 6223, "time_per_iteration": 4.179046154022217 }, { "auxiliary_loss_clip": 0.01876546, "auxiliary_loss_mlp": 0.00312015, "balance_loss_clip": 1.49775434, "balance_loss_mlp": 0.2749173, "epoch": 0.37420712460544114, "flos": 36897279569280.0, "grad_norm": 24.42932142784083, "language_loss": 0.75529999, "learning_rate": 2.8806529481747325e-06, "loss": 0.77718556, "num_input_tokens_seen": 133708690, "router_z_loss_clip": 3.78710938, "router_z_loss_mlp": 0.37133789, "step": 6224, "time_per_iteration": 4.267859697341919 }, { "auxiliary_loss_clip": 0.01903215, "auxiliary_loss_mlp": 0.00321637, "balance_loss_clip": 1.50710392, "balance_loss_mlp": 0.2822265, "epoch": 0.3742672478581091, "flos": 22201198007040.0, "grad_norm": 169.8052249715832, "language_loss": 0.75917518, "learning_rate": 2.880303258086228e-06, "loss": 0.78142369, "num_input_tokens_seen": 133728095, "router_z_loss_clip": 3.96289062, "router_z_loss_mlp": 0.39379883, "step": 6225, "time_per_iteration": 2.6787261962890625 }, { "auxiliary_loss_clip": 0.01903505, "auxiliary_loss_mlp": 0.00374613, "balance_loss_clip": 1.50762856, "balance_loss_mlp": 0.33160207, "epoch": 0.3743273711107771, "flos": 24681547860480.0, "grad_norm": 5.578502963536227, "language_loss": 0.86453331, "learning_rate": 2.879953534616536e-06, "loss": 0.8873145, "num_input_tokens_seen": 133745590, "router_z_loss_clip": 3.95117188, "router_z_loss_mlp": 0.42993164, "step": 6226, "time_per_iteration": 2.6953277587890625 }, { "auxiliary_loss_clip": 0.01879939, "auxiliary_loss_mlp": 0.00344279, "balance_loss_clip": 1.49191642, "balance_loss_mlp": 0.30720508, "epoch": 0.37438749436344504, "flos": 24459619680000.0, "grad_norm": 9.848099541722927, "language_loss": 0.7537905, "learning_rate": 2.879603777778917e-06, "loss": 0.77603269, "num_input_tokens_seen": 133766155, "router_z_loss_clip": 3.87695312, "router_z_loss_mlp": 0.37060547, "step": 6227, "time_per_iteration": 2.7418878078460693 }, { "auxiliary_loss_clip": 0.01872247, "auxiliary_loss_mlp": 0.00300065, "balance_loss_clip": 1.50206828, "balance_loss_mlp": 0.26310998, "epoch": 0.374447617616113, "flos": 21798747048960.0, "grad_norm": 205.3105839906934, "language_loss": 0.88903528, "learning_rate": 2.879253987586635e-06, "loss": 0.91075844, "num_input_tokens_seen": 133783185, "router_z_loss_clip": 3.69921875, "router_z_loss_mlp": 0.36938477, "step": 6228, "time_per_iteration": 4.1091930866241455 }, { "auxiliary_loss_clip": 0.01898255, "auxiliary_loss_mlp": 0.00338625, "balance_loss_clip": 1.50758064, "balance_loss_mlp": 0.30128872, "epoch": 0.374507740868781, "flos": 17968191932160.0, "grad_norm": 3.024899839641628, "language_loss": 0.80797374, "learning_rate": 2.8789041640529535e-06, "loss": 0.83034253, "num_input_tokens_seen": 133800975, "router_z_loss_clip": 3.90625, "router_z_loss_mlp": 0.37304688, "step": 6229, "time_per_iteration": 2.7096736431121826 }, { "auxiliary_loss_clip": 0.018801, "auxiliary_loss_mlp": 0.00336902, "balance_loss_clip": 1.49933648, "balance_loss_mlp": 0.2955364, "epoch": 0.374567864121449, "flos": 16105828596480.0, "grad_norm": 6.648066918118238, "language_loss": 0.90789127, "learning_rate": 2.8785543071911383e-06, "loss": 0.93006122, "num_input_tokens_seen": 133818020, "router_z_loss_clip": 3.80859375, "router_z_loss_mlp": 0.41381836, "step": 6230, "time_per_iteration": 2.6571669578552246 }, { "auxiliary_loss_clip": 0.01877623, "auxiliary_loss_mlp": 0.00319483, "balance_loss_clip": 1.49165916, "balance_loss_mlp": 0.27938092, "epoch": 0.37462798737411696, "flos": 25773160135680.0, "grad_norm": 2.3191735673029195, "language_loss": 0.81501675, "learning_rate": 2.878204417014456e-06, "loss": 0.83698779, "num_input_tokens_seen": 133840690, "router_z_loss_clip": 3.85546875, "router_z_loss_mlp": 0.40112305, "step": 6231, "time_per_iteration": 2.6997509002685547 }, { "auxiliary_loss_clip": 0.0184278, "auxiliary_loss_mlp": 0.00336784, "balance_loss_clip": 1.4769578, "balance_loss_mlp": 0.29658675, "epoch": 0.3746881106267849, "flos": 16654507822080.0, "grad_norm": 30.09528976023357, "language_loss": 0.81781572, "learning_rate": 2.8778544935361735e-06, "loss": 0.83961141, "num_input_tokens_seen": 133858350, "router_z_loss_clip": 3.65429688, "router_z_loss_mlp": 0.40185547, "step": 6232, "time_per_iteration": 2.6533267498016357 }, { "auxiliary_loss_clip": 0.01842189, "auxiliary_loss_mlp": 0.0033891, "balance_loss_clip": 1.4758178, "balance_loss_mlp": 0.29852194, "epoch": 0.3747482338794529, "flos": 26177981391360.0, "grad_norm": 13.09794378764582, "language_loss": 0.82242632, "learning_rate": 2.877504536769561e-06, "loss": 0.84423733, "num_input_tokens_seen": 133879775, "router_z_loss_clip": 3.66601562, "router_z_loss_mlp": 0.40380859, "step": 6233, "time_per_iteration": 2.6974451541900635 }, { "auxiliary_loss_clip": 0.01883734, "auxiliary_loss_mlp": 0.00335454, "balance_loss_clip": 1.50510311, "balance_loss_mlp": 0.2958048, "epoch": 0.37480835713212085, "flos": 12021061950720.0, "grad_norm": 6.411901264153556, "language_loss": 0.77728009, "learning_rate": 2.8771545467278883e-06, "loss": 0.79947197, "num_input_tokens_seen": 133898295, "router_z_loss_clip": 3.78710938, "router_z_loss_mlp": 0.39672852, "step": 6234, "time_per_iteration": 2.6605517864227295 }, { "auxiliary_loss_clip": 0.01866197, "auxiliary_loss_mlp": 0.0030326, "balance_loss_clip": 1.49676061, "balance_loss_mlp": 0.26504135, "epoch": 0.3748684803847888, "flos": 19679263182720.0, "grad_norm": 33.99818610903916, "language_loss": 0.89069664, "learning_rate": 2.8768045234244276e-06, "loss": 0.91239119, "num_input_tokens_seen": 133915230, "router_z_loss_clip": 3.69140625, "router_z_loss_mlp": 0.38232422, "step": 6235, "time_per_iteration": 4.040600776672363 }, { "auxiliary_loss_clip": 0.01895997, "auxiliary_loss_mlp": 0.00343683, "balance_loss_clip": 1.51793718, "balance_loss_mlp": 0.30520254, "epoch": 0.3749286036374568, "flos": 20521189042560.0, "grad_norm": 21.376432884972886, "language_loss": 0.85431904, "learning_rate": 2.8764544668724517e-06, "loss": 0.87671584, "num_input_tokens_seen": 133934110, "router_z_loss_clip": 3.78125, "router_z_loss_mlp": 0.38500977, "step": 6236, "time_per_iteration": 2.6919105052948 }, { "auxiliary_loss_clip": 0.01864658, "auxiliary_loss_mlp": 0.00327963, "balance_loss_clip": 1.48695481, "balance_loss_mlp": 0.28681225, "epoch": 0.37498872689012475, "flos": 20704620821760.0, "grad_norm": 5.5749971956454765, "language_loss": 0.82549107, "learning_rate": 2.876104377085234e-06, "loss": 0.84741735, "num_input_tokens_seen": 133952395, "router_z_loss_clip": 3.77734375, "router_z_loss_mlp": 0.41137695, "step": 6237, "time_per_iteration": 2.688678026199341 }, { "auxiliary_loss_clip": 0.01881091, "auxiliary_loss_mlp": 0.00368646, "balance_loss_clip": 1.49876022, "balance_loss_mlp": 0.3283532, "epoch": 0.3750488501427927, "flos": 21574843620480.0, "grad_norm": 5.0796658929377445, "language_loss": 0.98796141, "learning_rate": 2.8757542540760508e-06, "loss": 1.01045883, "num_input_tokens_seen": 133969635, "router_z_loss_clip": 3.83007812, "router_z_loss_mlp": 0.40283203, "step": 6238, "time_per_iteration": 2.6438536643981934 }, { "auxiliary_loss_clip": 0.01917394, "auxiliary_loss_mlp": 0.00358655, "balance_loss_clip": 1.52026224, "balance_loss_mlp": 0.31819487, "epoch": 0.3751089733954607, "flos": 15923869274880.0, "grad_norm": 3.651896369504434, "language_loss": 0.77606487, "learning_rate": 2.8754040978581777e-06, "loss": 0.79882538, "num_input_tokens_seen": 133987215, "router_z_loss_clip": 3.97851562, "router_z_loss_mlp": 0.40454102, "step": 6239, "time_per_iteration": 2.634272575378418 }, { "auxiliary_loss_clip": 0.01909892, "auxiliary_loss_mlp": 0.00332075, "balance_loss_clip": 1.52239239, "balance_loss_mlp": 0.29326081, "epoch": 0.37516909664812864, "flos": 36284644177920.0, "grad_norm": 60.086590523922126, "language_loss": 0.73454273, "learning_rate": 2.875053908444895e-06, "loss": 0.75696242, "num_input_tokens_seen": 134009250, "router_z_loss_clip": 3.87890625, "router_z_loss_mlp": 0.38818359, "step": 6240, "time_per_iteration": 2.7657978534698486 }, { "auxiliary_loss_clip": 0.01922686, "auxiliary_loss_mlp": 0.00310842, "balance_loss_clip": 1.52555978, "balance_loss_mlp": 0.27355331, "epoch": 0.3752292199007966, "flos": 13515915283200.0, "grad_norm": 199.7752184576111, "language_loss": 0.85524571, "learning_rate": 2.8747036858494795e-06, "loss": 0.877581, "num_input_tokens_seen": 134026875, "router_z_loss_clip": 3.97460938, "router_z_loss_mlp": 0.37280273, "step": 6241, "time_per_iteration": 2.6335947513580322 }, { "auxiliary_loss_clip": 0.01922911, "auxiliary_loss_mlp": 0.00363786, "balance_loss_clip": 1.52964377, "balance_loss_mlp": 0.32439965, "epoch": 0.3752893431534646, "flos": 27198095644800.0, "grad_norm": 3.309810333109818, "language_loss": 0.90710175, "learning_rate": 2.874353430085213e-06, "loss": 0.92996871, "num_input_tokens_seen": 134047185, "router_z_loss_clip": 3.93359375, "router_z_loss_mlp": 0.39379883, "step": 6242, "time_per_iteration": 2.699143886566162 }, { "auxiliary_loss_clip": 0.0192101, "auxiliary_loss_mlp": 0.00386548, "balance_loss_clip": 1.52584398, "balance_loss_mlp": 0.34821069, "epoch": 0.3753494664061326, "flos": 30007674581760.0, "grad_norm": 44.40093327694809, "language_loss": 0.76479769, "learning_rate": 2.8740031411653766e-06, "loss": 0.78787321, "num_input_tokens_seen": 134067330, "router_z_loss_clip": 3.9453125, "router_z_loss_mlp": 0.38354492, "step": 6243, "time_per_iteration": 2.7594821453094482 }, { "auxiliary_loss_clip": 0.01919027, "auxiliary_loss_mlp": 0.00344427, "balance_loss_clip": 1.5275892, "balance_loss_mlp": 0.30525458, "epoch": 0.37540958965880056, "flos": 24461954064000.0, "grad_norm": 11.099275734425364, "language_loss": 0.9187935, "learning_rate": 2.8736528191032535e-06, "loss": 0.94142801, "num_input_tokens_seen": 134085525, "router_z_loss_clip": 3.91210938, "router_z_loss_mlp": 0.3918457, "step": 6244, "time_per_iteration": 2.706972122192383 }, { "auxiliary_loss_clip": 0.01919294, "auxiliary_loss_mlp": 0.0037486, "balance_loss_clip": 1.53721941, "balance_loss_mlp": 0.33587897, "epoch": 0.3754697129114685, "flos": 16508387295360.0, "grad_norm": 8.00230730849742, "language_loss": 0.92057246, "learning_rate": 2.8733024639121277e-06, "loss": 0.94351399, "num_input_tokens_seen": 134101855, "router_z_loss_clip": 3.81640625, "router_z_loss_mlp": 0.39013672, "step": 6245, "time_per_iteration": 2.678317070007324 }, { "auxiliary_loss_clip": 0.01907928, "auxiliary_loss_mlp": 0.00360349, "balance_loss_clip": 1.52120686, "balance_loss_mlp": 0.32122433, "epoch": 0.3755298361641365, "flos": 19390900798080.0, "grad_norm": 34.42144106574157, "language_loss": 0.73644412, "learning_rate": 2.8729520756052853e-06, "loss": 0.75912684, "num_input_tokens_seen": 134119360, "router_z_loss_clip": 3.87109375, "router_z_loss_mlp": 0.39135742, "step": 6246, "time_per_iteration": 2.6878113746643066 }, { "auxiliary_loss_clip": 0.01922199, "auxiliary_loss_mlp": 0.00434356, "balance_loss_clip": 1.52981496, "balance_loss_mlp": 0.39217934, "epoch": 0.37558995941680445, "flos": 14720395069440.0, "grad_norm": 105.79631757964029, "language_loss": 0.82674861, "learning_rate": 2.8726016541960124e-06, "loss": 0.8503142, "num_input_tokens_seen": 134137475, "router_z_loss_clip": 3.91796875, "router_z_loss_mlp": 0.42163086, "step": 6247, "time_per_iteration": 2.6661903858184814 }, { "auxiliary_loss_clip": 0.01899771, "auxiliary_loss_mlp": 0.00427946, "balance_loss_clip": 1.51067817, "balance_loss_mlp": 0.38584161, "epoch": 0.3756500826694724, "flos": 21689901861120.0, "grad_norm": 42.35756937260307, "language_loss": 0.61449897, "learning_rate": 2.872251199697598e-06, "loss": 0.63777614, "num_input_tokens_seen": 134154580, "router_z_loss_clip": 3.88867188, "router_z_loss_mlp": 0.4206543, "step": 6248, "time_per_iteration": 2.696249008178711 }, { "auxiliary_loss_clip": 0.01926245, "auxiliary_loss_mlp": 0.00411943, "balance_loss_clip": 1.53763962, "balance_loss_mlp": 0.37084013, "epoch": 0.3757102059221404, "flos": 26505666190080.0, "grad_norm": 12.697953628671101, "language_loss": 0.90617585, "learning_rate": 2.8719007121233297e-06, "loss": 0.92955774, "num_input_tokens_seen": 134174285, "router_z_loss_clip": 3.88867188, "router_z_loss_mlp": 0.41113281, "step": 6249, "time_per_iteration": 2.7899115085601807 }, { "auxiliary_loss_clip": 0.01911216, "auxiliary_loss_mlp": 0.00397112, "balance_loss_clip": 1.52416396, "balance_loss_mlp": 0.35624677, "epoch": 0.37577032917480835, "flos": 37338083274240.0, "grad_norm": 14.403286407543535, "language_loss": 0.76266086, "learning_rate": 2.8715501914864993e-06, "loss": 0.78574419, "num_input_tokens_seen": 134195940, "router_z_loss_clip": 3.87109375, "router_z_loss_mlp": 0.40844727, "step": 6250, "time_per_iteration": 2.920860767364502 }, { "auxiliary_loss_clip": 0.0191818, "auxiliary_loss_mlp": 0.00417517, "balance_loss_clip": 1.53638887, "balance_loss_mlp": 0.38001367, "epoch": 0.3758304524274763, "flos": 21908597817600.0, "grad_norm": 82.07551195271114, "language_loss": 0.84240448, "learning_rate": 2.8711996378003987e-06, "loss": 0.86576152, "num_input_tokens_seen": 134212235, "router_z_loss_clip": 3.81835938, "router_z_loss_mlp": 0.37524414, "step": 6251, "time_per_iteration": 2.771466016769409 }, { "auxiliary_loss_clip": 0.01917488, "auxiliary_loss_mlp": 0.00377633, "balance_loss_clip": 1.54096746, "balance_loss_mlp": 0.34246644, "epoch": 0.3758905756801443, "flos": 36569343375360.0, "grad_norm": 57.597289568598285, "language_loss": 0.65622747, "learning_rate": 2.8708490510783203e-06, "loss": 0.67917871, "num_input_tokens_seen": 134233810, "router_z_loss_clip": 3.765625, "router_z_loss_mlp": 0.35180664, "step": 6252, "time_per_iteration": 2.845865488052368 }, { "auxiliary_loss_clip": 0.01918672, "auxiliary_loss_mlp": 0.00427225, "balance_loss_clip": 1.53209269, "balance_loss_mlp": 0.38407123, "epoch": 0.37595069893281224, "flos": 24528783317760.0, "grad_norm": 66.0750705566635, "language_loss": 0.9460175, "learning_rate": 2.8704984313335584e-06, "loss": 0.96947646, "num_input_tokens_seen": 134252020, "router_z_loss_clip": 3.86132812, "router_z_loss_mlp": 0.43164062, "step": 6253, "time_per_iteration": 2.701983690261841 }, { "auxiliary_loss_clip": 0.01925117, "auxiliary_loss_mlp": 0.00379594, "balance_loss_clip": 1.54306209, "balance_loss_mlp": 0.33984971, "epoch": 0.3760108221854802, "flos": 16435021766400.0, "grad_norm": 6.166453486783868, "language_loss": 0.86335933, "learning_rate": 2.8701477785794097e-06, "loss": 0.88640642, "num_input_tokens_seen": 134269495, "router_z_loss_clip": 3.81835938, "router_z_loss_mlp": 0.39746094, "step": 6254, "time_per_iteration": 2.8000218868255615 }, { "auxiliary_loss_clip": 0.0191129, "auxiliary_loss_mlp": 0.00438602, "balance_loss_clip": 1.52666545, "balance_loss_mlp": 0.39442289, "epoch": 0.37607094543814823, "flos": 13771742924160.0, "grad_norm": 3.9619038077928534, "language_loss": 0.711824, "learning_rate": 2.869797092829169e-06, "loss": 0.73532289, "num_input_tokens_seen": 134287035, "router_z_loss_clip": 3.84960938, "router_z_loss_mlp": 0.44165039, "step": 6255, "time_per_iteration": 2.634838819503784 }, { "auxiliary_loss_clip": 0.01900484, "auxiliary_loss_mlp": 0.00393473, "balance_loss_clip": 1.52691901, "balance_loss_mlp": 0.35487255, "epoch": 0.3761310686908162, "flos": 19857918453120.0, "grad_norm": 112.35511098297715, "language_loss": 0.81516039, "learning_rate": 2.869446374096135e-06, "loss": 0.83810002, "num_input_tokens_seen": 134304840, "router_z_loss_clip": 3.734375, "router_z_loss_mlp": 0.38598633, "step": 6256, "time_per_iteration": 2.6313912868499756 }, { "auxiliary_loss_clip": 0.01897294, "auxiliary_loss_mlp": 0.00432489, "balance_loss_clip": 1.52077293, "balance_loss_mlp": 0.39155272, "epoch": 0.37619119194348416, "flos": 12750802657920.0, "grad_norm": 131.21033826536137, "language_loss": 0.78961587, "learning_rate": 2.8690956223936088e-06, "loss": 0.81291372, "num_input_tokens_seen": 134323180, "router_z_loss_clip": 3.76757812, "router_z_loss_mlp": 0.40942383, "step": 6257, "time_per_iteration": 2.6628663539886475 }, { "auxiliary_loss_clip": 0.01901021, "auxiliary_loss_mlp": 0.00386117, "balance_loss_clip": 1.52443123, "balance_loss_mlp": 0.34663504, "epoch": 0.3762513151961521, "flos": 17530548624000.0, "grad_norm": 24.969304995305503, "language_loss": 0.90571475, "learning_rate": 2.868744837734889e-06, "loss": 0.92858613, "num_input_tokens_seen": 134341390, "router_z_loss_clip": 3.76757812, "router_z_loss_mlp": 0.39477539, "step": 6258, "time_per_iteration": 2.724742889404297 }, { "auxiliary_loss_clip": 0.01924873, "auxiliary_loss_mlp": 0.00422984, "balance_loss_clip": 1.54692984, "balance_loss_mlp": 0.38326323, "epoch": 0.3763114384488201, "flos": 23617406511360.0, "grad_norm": 9.53932641636149, "language_loss": 0.86272085, "learning_rate": 2.868394020133277e-06, "loss": 0.88619936, "num_input_tokens_seen": 134360425, "router_z_loss_clip": 3.78125, "router_z_loss_mlp": 0.39697266, "step": 6259, "time_per_iteration": 2.6500914096832275 }, { "auxiliary_loss_clip": 0.0194715, "auxiliary_loss_mlp": 0.0043783, "balance_loss_clip": 1.55209351, "balance_loss_mlp": 0.39086145, "epoch": 0.37637156170148806, "flos": 25406978935680.0, "grad_norm": 17.876668162632317, "language_loss": 0.78715622, "learning_rate": 2.8680431696020783e-06, "loss": 0.81100601, "num_input_tokens_seen": 134379775, "router_z_loss_clip": 3.94921875, "router_z_loss_mlp": 0.46972656, "step": 6260, "time_per_iteration": 2.6986477375030518 }, { "auxiliary_loss_clip": 0.01940847, "auxiliary_loss_mlp": 0.00389913, "balance_loss_clip": 1.55044723, "balance_loss_mlp": 0.34749818, "epoch": 0.376431684954156, "flos": 23440906056960.0, "grad_norm": 27.977444205653715, "language_loss": 0.85491079, "learning_rate": 2.867692286154594e-06, "loss": 0.87821841, "num_input_tokens_seen": 134400315, "router_z_loss_clip": 3.90820312, "router_z_loss_mlp": 0.42407227, "step": 6261, "time_per_iteration": 2.737029552459717 }, { "auxiliary_loss_clip": 0.01936204, "auxiliary_loss_mlp": 0.00422871, "balance_loss_clip": 1.55122674, "balance_loss_mlp": 0.38298365, "epoch": 0.376491808206824, "flos": 34204482725760.0, "grad_norm": 2.2088847308571857, "language_loss": 0.86401594, "learning_rate": 2.867341369804132e-06, "loss": 0.88760668, "num_input_tokens_seen": 134422875, "router_z_loss_clip": 3.84960938, "router_z_loss_mlp": 0.39892578, "step": 6262, "time_per_iteration": 2.758775234222412 }, { "auxiliary_loss_clip": 0.01944814, "auxiliary_loss_mlp": 0.00448658, "balance_loss_clip": 1.56329679, "balance_loss_mlp": 0.40767425, "epoch": 0.37655193145949195, "flos": 35185669614720.0, "grad_norm": 3.5780132671943448, "language_loss": 0.85952234, "learning_rate": 2.866990420563998e-06, "loss": 0.88345706, "num_input_tokens_seen": 134443025, "router_z_loss_clip": 3.81640625, "router_z_loss_mlp": 0.40966797, "step": 6263, "time_per_iteration": 2.7555785179138184 }, { "auxiliary_loss_clip": 0.019325, "auxiliary_loss_mlp": 0.00405261, "balance_loss_clip": 1.55567241, "balance_loss_mlp": 0.3624168, "epoch": 0.3766120547121599, "flos": 16761844638720.0, "grad_norm": 102.60994173008052, "language_loss": 0.88269889, "learning_rate": 2.866639438447501e-06, "loss": 0.90607655, "num_input_tokens_seen": 134460945, "router_z_loss_clip": 3.76757812, "router_z_loss_mlp": 0.42871094, "step": 6264, "time_per_iteration": 2.7224411964416504 }, { "auxiliary_loss_clip": 0.0192823, "auxiliary_loss_mlp": 0.00382629, "balance_loss_clip": 1.55286622, "balance_loss_mlp": 0.34226483, "epoch": 0.3766721779648279, "flos": 23550361776000.0, "grad_norm": 13.401158443692575, "language_loss": 0.81099808, "learning_rate": 2.8662884234679497e-06, "loss": 0.83410668, "num_input_tokens_seen": 134480440, "router_z_loss_clip": 3.75, "router_z_loss_mlp": 0.40356445, "step": 6265, "time_per_iteration": 4.1410229206085205 }, { "auxiliary_loss_clip": 0.01954951, "auxiliary_loss_mlp": 0.00368251, "balance_loss_clip": 1.58013511, "balance_loss_mlp": 0.33058068, "epoch": 0.37673230121749585, "flos": 29129191655040.0, "grad_norm": 79.97532432961876, "language_loss": 0.71765381, "learning_rate": 2.865937375638654e-06, "loss": 0.74088585, "num_input_tokens_seen": 134501110, "router_z_loss_clip": 3.74609375, "router_z_loss_mlp": 0.37719727, "step": 6266, "time_per_iteration": 2.7063207626342773 }, { "auxiliary_loss_clip": 0.01924775, "auxiliary_loss_mlp": 0.00410118, "balance_loss_clip": 1.54506183, "balance_loss_mlp": 0.36751226, "epoch": 0.3767924244701638, "flos": 28146783703680.0, "grad_norm": 4.526592905307364, "language_loss": 0.70813394, "learning_rate": 2.8655862949729264e-06, "loss": 0.73148286, "num_input_tokens_seen": 134522460, "router_z_loss_clip": 3.79882812, "router_z_loss_mlp": 0.42626953, "step": 6267, "time_per_iteration": 4.098716974258423 }, { "auxiliary_loss_clip": 0.02011626, "auxiliary_loss_mlp": 0.00405127, "balance_loss_clip": 1.763116, "balance_loss_mlp": 0.39034477, "epoch": 0.37685254772283183, "flos": 60797197526400.0, "grad_norm": 0.7475781923874779, "language_loss": 0.58947766, "learning_rate": 2.8652351814840795e-06, "loss": 0.6136452, "num_input_tokens_seen": 134589545, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.14746094, "step": 6268, "time_per_iteration": 3.2631325721740723 }, { "auxiliary_loss_clip": 0.01921774, "auxiliary_loss_mlp": 0.00392342, "balance_loss_clip": 1.55667663, "balance_loss_mlp": 0.3545289, "epoch": 0.3769126709754998, "flos": 26032543223040.0, "grad_norm": 6.189676524820148, "language_loss": 0.70643169, "learning_rate": 2.8648840351854283e-06, "loss": 0.72957283, "num_input_tokens_seen": 134610550, "router_z_loss_clip": 3.65039062, "router_z_loss_mlp": 0.37817383, "step": 6269, "time_per_iteration": 2.698833703994751 }, { "auxiliary_loss_clip": 0.01914414, "auxiliary_loss_mlp": 0.00364811, "balance_loss_clip": 1.55721176, "balance_loss_mlp": 0.32783186, "epoch": 0.37697279422816776, "flos": 23579879777280.0, "grad_norm": 30.232906005912522, "language_loss": 0.77553439, "learning_rate": 2.8645328560902874e-06, "loss": 0.79832667, "num_input_tokens_seen": 134630485, "router_z_loss_clip": 3.57226562, "router_z_loss_mlp": 0.37011719, "step": 6270, "time_per_iteration": 2.657090663909912 }, { "auxiliary_loss_clip": 0.01941315, "auxiliary_loss_mlp": 0.00267897, "balance_loss_clip": 1.71488154, "balance_loss_mlp": 0.25445032, "epoch": 0.3770329174808357, "flos": 64745935367040.0, "grad_norm": 0.717428500404105, "language_loss": 0.56058997, "learning_rate": 2.8641816442119746e-06, "loss": 0.58268213, "num_input_tokens_seen": 134693510, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.13476562, "step": 6271, "time_per_iteration": 4.5436851978302 }, { "auxiliary_loss_clip": 0.01896872, "auxiliary_loss_mlp": 0.00365614, "balance_loss_clip": 1.54343414, "balance_loss_mlp": 0.32281822, "epoch": 0.3770930407335037, "flos": 21835304115840.0, "grad_norm": 4.402440366136075, "language_loss": 0.85404819, "learning_rate": 2.8638303995638066e-06, "loss": 0.8766731, "num_input_tokens_seen": 134713115, "router_z_loss_clip": 3.53515625, "router_z_loss_mlp": 0.42797852, "step": 6272, "time_per_iteration": 2.6630859375 }, { "auxiliary_loss_clip": 0.01858111, "auxiliary_loss_mlp": 0.00365946, "balance_loss_clip": 1.50870681, "balance_loss_mlp": 0.3298493, "epoch": 0.37715316398617166, "flos": 22747901984640.0, "grad_norm": 331.6858930783031, "language_loss": 0.79920292, "learning_rate": 2.863479122159103e-06, "loss": 0.82144356, "num_input_tokens_seen": 134732635, "router_z_loss_clip": 3.49023438, "router_z_loss_mlp": 0.36083984, "step": 6273, "time_per_iteration": 2.6434078216552734 }, { "auxiliary_loss_clip": 0.01881123, "auxiliary_loss_mlp": 0.00367709, "balance_loss_clip": 1.53484547, "balance_loss_mlp": 0.32922816, "epoch": 0.3772132872388396, "flos": 18914581520640.0, "grad_norm": 6.894333195275504, "language_loss": 0.77053726, "learning_rate": 2.8631278120111858e-06, "loss": 0.79302561, "num_input_tokens_seen": 134750695, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.38452148, "step": 6274, "time_per_iteration": 2.6399075984954834 }, { "auxiliary_loss_clip": 0.01855099, "auxiliary_loss_mlp": 0.00351244, "balance_loss_clip": 1.50401592, "balance_loss_mlp": 0.31405061, "epoch": 0.3772734104915076, "flos": 17346219004800.0, "grad_norm": 28.69803074862732, "language_loss": 0.91633308, "learning_rate": 2.8627764691333742e-06, "loss": 0.93839657, "num_input_tokens_seen": 134768935, "router_z_loss_clip": 3.50976562, "router_z_loss_mlp": 0.37158203, "step": 6275, "time_per_iteration": 2.752255439758301 }, { "auxiliary_loss_clip": 0.01856299, "auxiliary_loss_mlp": 0.00327014, "balance_loss_clip": 1.51735699, "balance_loss_mlp": 0.29048866, "epoch": 0.37733353374417555, "flos": 32342370785280.0, "grad_norm": 5.350188949454512, "language_loss": 0.79056835, "learning_rate": 2.8624250935389935e-06, "loss": 0.81240153, "num_input_tokens_seen": 134791260, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.36523438, "step": 6276, "time_per_iteration": 2.782949924468994 }, { "auxiliary_loss_clip": 0.0185671, "auxiliary_loss_mlp": 0.00352035, "balance_loss_clip": 1.5153687, "balance_loss_mlp": 0.31245792, "epoch": 0.3773936569968435, "flos": 23360681030400.0, "grad_norm": 16.256338316858308, "language_loss": 0.91296548, "learning_rate": 2.862073685241366e-06, "loss": 0.93505299, "num_input_tokens_seen": 134808350, "router_z_loss_clip": 3.41992188, "router_z_loss_mlp": 0.39599609, "step": 6277, "time_per_iteration": 4.02819561958313 }, { "auxiliary_loss_clip": 0.01824626, "auxiliary_loss_mlp": 0.00387982, "balance_loss_clip": 1.49706626, "balance_loss_mlp": 0.34752226, "epoch": 0.3774537802495115, "flos": 21466788531840.0, "grad_norm": 16.729840689338406, "language_loss": 0.83216047, "learning_rate": 2.861722244253818e-06, "loss": 0.85428655, "num_input_tokens_seen": 134826005, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.4050293, "step": 6278, "time_per_iteration": 2.6059980392456055 }, { "auxiliary_loss_clip": 0.01831186, "auxiliary_loss_mlp": 0.00424008, "balance_loss_clip": 1.48937809, "balance_loss_mlp": 0.38218969, "epoch": 0.37751390350217945, "flos": 24973717086720.0, "grad_norm": 41.48152221048386, "language_loss": 0.90985274, "learning_rate": 2.8613707705896767e-06, "loss": 0.93240464, "num_input_tokens_seen": 134844995, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.41821289, "step": 6279, "time_per_iteration": 2.7499871253967285 }, { "auxiliary_loss_clip": 0.01808258, "auxiliary_loss_mlp": 0.00338236, "balance_loss_clip": 1.47714269, "balance_loss_mlp": 0.30094689, "epoch": 0.3775740267548474, "flos": 27819098904960.0, "grad_norm": 79.84066725775578, "language_loss": 0.81239927, "learning_rate": 2.861019264262269e-06, "loss": 0.83386421, "num_input_tokens_seen": 134865285, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.37304688, "step": 6280, "time_per_iteration": 2.6716761589050293 }, { "auxiliary_loss_clip": 0.01844457, "auxiliary_loss_mlp": 0.00392318, "balance_loss_clip": 1.50813663, "balance_loss_mlp": 0.34883031, "epoch": 0.3776341500075154, "flos": 22565224391040.0, "grad_norm": 2.4562968593805463, "language_loss": 0.79573143, "learning_rate": 2.8606677252849242e-06, "loss": 0.81809914, "num_input_tokens_seen": 134886535, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.43505859, "step": 6281, "time_per_iteration": 2.646883964538574 }, { "auxiliary_loss_clip": 0.01807167, "auxiliary_loss_mlp": 0.00378813, "balance_loss_clip": 1.4801383, "balance_loss_mlp": 0.3399511, "epoch": 0.3776942732601834, "flos": 23077238808960.0, "grad_norm": 13.516866568311812, "language_loss": 0.87936985, "learning_rate": 2.860316153670974e-06, "loss": 0.90122962, "num_input_tokens_seen": 134907435, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.38842773, "step": 6282, "time_per_iteration": 2.6472721099853516 }, { "auxiliary_loss_clip": 0.01804149, "auxiliary_loss_mlp": 0.00411838, "balance_loss_clip": 1.47764087, "balance_loss_mlp": 0.37040126, "epoch": 0.37775439651285136, "flos": 21724411852800.0, "grad_norm": 11.901492485475334, "language_loss": 0.75809646, "learning_rate": 2.8599645494337484e-06, "loss": 0.78025639, "num_input_tokens_seen": 134925360, "router_z_loss_clip": 3.265625, "router_z_loss_mlp": 0.41455078, "step": 6283, "time_per_iteration": 2.6179730892181396 }, { "auxiliary_loss_clip": 0.01833217, "auxiliary_loss_mlp": 0.00408684, "balance_loss_clip": 1.50414264, "balance_loss_mlp": 0.36712807, "epoch": 0.37781451976551933, "flos": 23987753688960.0, "grad_norm": 35.10569096213132, "language_loss": 0.84272927, "learning_rate": 2.859612912586581e-06, "loss": 0.86514831, "num_input_tokens_seen": 134944205, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.41577148, "step": 6284, "time_per_iteration": 2.673311710357666 }, { "auxiliary_loss_clip": 0.01811814, "auxiliary_loss_mlp": 0.00416936, "balance_loss_clip": 1.47777176, "balance_loss_mlp": 0.37227991, "epoch": 0.3778746430181873, "flos": 13727967223680.0, "grad_norm": 7.687931518994306, "language_loss": 0.95356166, "learning_rate": 2.8592612431428055e-06, "loss": 0.97584915, "num_input_tokens_seen": 134960255, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.44628906, "step": 6285, "time_per_iteration": 2.633985757827759 }, { "auxiliary_loss_clip": 0.01821531, "auxiliary_loss_mlp": 0.00381419, "balance_loss_clip": 1.4865669, "balance_loss_mlp": 0.33733505, "epoch": 0.37793476627085526, "flos": 19460495399040.0, "grad_norm": 45.57120108941556, "language_loss": 0.90695649, "learning_rate": 2.858909541115758e-06, "loss": 0.92898601, "num_input_tokens_seen": 134978605, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.44091797, "step": 6286, "time_per_iteration": 2.6958253383636475 }, { "auxiliary_loss_clip": 0.01807288, "auxiliary_loss_mlp": 0.0038074, "balance_loss_clip": 1.47609806, "balance_loss_mlp": 0.34104371, "epoch": 0.3779948895235232, "flos": 10707018704640.0, "grad_norm": 46.246363363631126, "language_loss": 0.89423126, "learning_rate": 2.858557806518775e-06, "loss": 0.91611159, "num_input_tokens_seen": 134995020, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.39697266, "step": 6287, "time_per_iteration": 2.778470277786255 }, { "auxiliary_loss_clip": 0.01829541, "auxiliary_loss_mlp": 0.00419007, "balance_loss_clip": 1.49035168, "balance_loss_mlp": 0.37504238, "epoch": 0.3780550127761912, "flos": 22310007281280.0, "grad_norm": 118.00212758282379, "language_loss": 0.8003307, "learning_rate": 2.8582060393651927e-06, "loss": 0.82281625, "num_input_tokens_seen": 135012620, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.43945312, "step": 6288, "time_per_iteration": 2.6491541862487793 }, { "auxiliary_loss_clip": 0.01813873, "auxiliary_loss_mlp": 0.00411331, "balance_loss_clip": 1.48393202, "balance_loss_mlp": 0.36691338, "epoch": 0.37811513602885916, "flos": 28950644125440.0, "grad_norm": 4.284310419794487, "language_loss": 0.8124699, "learning_rate": 2.857854239668352e-06, "loss": 0.83472192, "num_input_tokens_seen": 135033365, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.44433594, "step": 6289, "time_per_iteration": 2.760009765625 }, { "auxiliary_loss_clip": 0.01824548, "auxiliary_loss_mlp": 0.00343275, "balance_loss_clip": 1.49053729, "balance_loss_mlp": 0.30465129, "epoch": 0.3781752592815271, "flos": 23112933949440.0, "grad_norm": 2.9193772959218527, "language_loss": 0.81848073, "learning_rate": 2.857502407441593e-06, "loss": 0.84015888, "num_input_tokens_seen": 135052185, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.38623047, "step": 6290, "time_per_iteration": 2.666259288787842 }, { "auxiliary_loss_clip": 0.01818464, "auxiliary_loss_mlp": 0.00401302, "balance_loss_clip": 1.47980738, "balance_loss_mlp": 0.35631213, "epoch": 0.3782353825341951, "flos": 19755932762880.0, "grad_norm": 6.286185403262449, "language_loss": 0.87763238, "learning_rate": 2.8571505426982566e-06, "loss": 0.89983004, "num_input_tokens_seen": 135070425, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.44995117, "step": 6291, "time_per_iteration": 2.6776504516601562 }, { "auxiliary_loss_clip": 0.01797993, "auxiliary_loss_mlp": 0.00351438, "balance_loss_clip": 1.46805871, "balance_loss_mlp": 0.31298056, "epoch": 0.37829550578686305, "flos": 22050839675520.0, "grad_norm": 6.864759132992324, "language_loss": 0.84684652, "learning_rate": 2.8567986454516854e-06, "loss": 0.86834079, "num_input_tokens_seen": 135090525, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.38476562, "step": 6292, "time_per_iteration": 2.6722164154052734 }, { "auxiliary_loss_clip": 0.01767498, "auxiliary_loss_mlp": 0.003859, "balance_loss_clip": 1.44269049, "balance_loss_mlp": 0.34329432, "epoch": 0.378355629039531, "flos": 16470357770880.0, "grad_norm": 5.2067709036577465, "language_loss": 0.77076286, "learning_rate": 2.856446715715224e-06, "loss": 0.79229683, "num_input_tokens_seen": 135109575, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.42626953, "step": 6293, "time_per_iteration": 2.6429967880249023 }, { "auxiliary_loss_clip": 0.01797719, "auxiliary_loss_mlp": 0.00363774, "balance_loss_clip": 1.47208595, "balance_loss_mlp": 0.32448274, "epoch": 0.378415752292199, "flos": 19974844200960.0, "grad_norm": 12.746481480265993, "language_loss": 0.78763044, "learning_rate": 2.8560947535022173e-06, "loss": 0.80924535, "num_input_tokens_seen": 135127000, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.39282227, "step": 6294, "time_per_iteration": 2.6479086875915527 }, { "auxiliary_loss_clip": 0.01788647, "auxiliary_loss_mlp": 0.00367829, "balance_loss_clip": 1.44802964, "balance_loss_mlp": 0.32820341, "epoch": 0.378475875544867, "flos": 14647388676480.0, "grad_norm": 351.038399729336, "language_loss": 0.90710562, "learning_rate": 2.855742758826011e-06, "loss": 0.92867035, "num_input_tokens_seen": 135145285, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.39648438, "step": 6295, "time_per_iteration": 2.633073568344116 }, { "auxiliary_loss_clip": 0.01787173, "auxiliary_loss_mlp": 0.00368951, "balance_loss_clip": 1.45999885, "balance_loss_mlp": 0.3293969, "epoch": 0.37853599879753497, "flos": 26650996617600.0, "grad_norm": 5.80580948617567, "language_loss": 0.78779566, "learning_rate": 2.8553907316999547e-06, "loss": 0.80935693, "num_input_tokens_seen": 135165240, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.39550781, "step": 6296, "time_per_iteration": 2.693225860595703 }, { "auxiliary_loss_clip": 0.01781476, "auxiliary_loss_mlp": 0.00355107, "balance_loss_clip": 1.45962369, "balance_loss_mlp": 0.31915379, "epoch": 0.37859612205020293, "flos": 17311960408320.0, "grad_norm": 7.619778088351546, "language_loss": 0.82903957, "learning_rate": 2.855038672137396e-06, "loss": 0.85040545, "num_input_tokens_seen": 135184045, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.35961914, "step": 6297, "time_per_iteration": 2.769942045211792 }, { "auxiliary_loss_clip": 0.01791135, "auxiliary_loss_mlp": 0.00391366, "balance_loss_clip": 1.45889497, "balance_loss_mlp": 0.35274225, "epoch": 0.3786562453028709, "flos": 18220392299520.0, "grad_norm": 5.802611979452548, "language_loss": 0.85055107, "learning_rate": 2.854686580151684e-06, "loss": 0.87237602, "num_input_tokens_seen": 135202365, "router_z_loss_clip": 3.31835938, "router_z_loss_mlp": 0.38647461, "step": 6298, "time_per_iteration": 2.6130568981170654 }, { "auxiliary_loss_clip": 0.01780885, "auxiliary_loss_mlp": 0.00343004, "balance_loss_clip": 1.4518652, "balance_loss_mlp": 0.30511913, "epoch": 0.37871636855553886, "flos": 21214875473280.0, "grad_norm": 11.416922275552885, "language_loss": 0.90747482, "learning_rate": 2.8543344557561722e-06, "loss": 0.92871368, "num_input_tokens_seen": 135220955, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.37890625, "step": 6299, "time_per_iteration": 2.670567035675049 }, { "auxiliary_loss_clip": 0.01760086, "auxiliary_loss_mlp": 0.00352653, "balance_loss_clip": 1.43906975, "balance_loss_mlp": 0.31576994, "epoch": 0.3787764918082068, "flos": 20952727038720.0, "grad_norm": 55.102193475118, "language_loss": 0.86510414, "learning_rate": 2.8539822989642116e-06, "loss": 0.88623154, "num_input_tokens_seen": 135239715, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.36889648, "step": 6300, "time_per_iteration": 2.6362504959106445 }, { "auxiliary_loss_clip": 0.01778714, "auxiliary_loss_mlp": 0.00372247, "balance_loss_clip": 1.44841385, "balance_loss_mlp": 0.33164459, "epoch": 0.3788366150608748, "flos": 17308009912320.0, "grad_norm": 8.882171563133577, "language_loss": 0.90434849, "learning_rate": 2.8536301097891577e-06, "loss": 0.92585808, "num_input_tokens_seen": 135257035, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.40649414, "step": 6301, "time_per_iteration": 2.6532063484191895 }, { "auxiliary_loss_clip": 0.01779786, "auxiliary_loss_mlp": 0.00353343, "balance_loss_clip": 1.45490885, "balance_loss_mlp": 0.31557792, "epoch": 0.37889673831354276, "flos": 24311092942080.0, "grad_norm": 4.851674300595305, "language_loss": 0.75261676, "learning_rate": 2.8532778882443636e-06, "loss": 0.77394801, "num_input_tokens_seen": 135275720, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.37768555, "step": 6302, "time_per_iteration": 2.7103302478790283 }, { "auxiliary_loss_clip": 0.0176499, "auxiliary_loss_mlp": 0.00353385, "balance_loss_clip": 1.45115542, "balance_loss_mlp": 0.3161442, "epoch": 0.3789568615662107, "flos": 26683603188480.0, "grad_norm": 5.139995373532578, "language_loss": 0.74396276, "learning_rate": 2.8529256343431867e-06, "loss": 0.76514649, "num_input_tokens_seen": 135294140, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.37231445, "step": 6303, "time_per_iteration": 2.746009111404419 }, { "auxiliary_loss_clip": 0.01753601, "auxiliary_loss_mlp": 0.00359827, "balance_loss_clip": 1.43499422, "balance_loss_mlp": 0.32172805, "epoch": 0.3790169848188787, "flos": 23585194990080.0, "grad_norm": 17.457892913544267, "language_loss": 0.83951557, "learning_rate": 2.8525733480989846e-06, "loss": 0.86064982, "num_input_tokens_seen": 135314845, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.38085938, "step": 6304, "time_per_iteration": 2.8070883750915527 }, { "auxiliary_loss_clip": 0.01779289, "auxiliary_loss_mlp": 0.00347768, "balance_loss_clip": 1.45452499, "balance_loss_mlp": 0.30857199, "epoch": 0.37907710807154665, "flos": 18437436230400.0, "grad_norm": 39.90711869085266, "language_loss": 0.88943326, "learning_rate": 2.8522210295251146e-06, "loss": 0.9107039, "num_input_tokens_seen": 135333055, "router_z_loss_clip": 3.2421875, "router_z_loss_mlp": 0.39208984, "step": 6305, "time_per_iteration": 2.6936378479003906 }, { "auxiliary_loss_clip": 0.01572347, "auxiliary_loss_mlp": 0.00197845, "balance_loss_clip": 1.3916533, "balance_loss_mlp": 0.18878534, "epoch": 0.3791372313242146, "flos": 50107165954560.0, "grad_norm": 0.9569122233755102, "language_loss": 0.64270067, "learning_rate": 2.8518686786349387e-06, "loss": 0.66040266, "num_input_tokens_seen": 135387865, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.09082031, "step": 6306, "time_per_iteration": 3.010441303253174 }, { "auxiliary_loss_clip": 0.01764967, "auxiliary_loss_mlp": 0.00325326, "balance_loss_clip": 1.45511866, "balance_loss_mlp": 0.28770322, "epoch": 0.3791973545768826, "flos": 24316551809280.0, "grad_norm": 13.402067579599324, "language_loss": 0.7917679, "learning_rate": 2.851516295441817e-06, "loss": 0.81267077, "num_input_tokens_seen": 135409095, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.37597656, "step": 6307, "time_per_iteration": 4.103034734725952 }, { "auxiliary_loss_clip": 0.01752623, "auxiliary_loss_mlp": 0.0037607, "balance_loss_clip": 1.43864441, "balance_loss_mlp": 0.33856678, "epoch": 0.3792574778295506, "flos": 21579907438080.0, "grad_norm": 7.06689290896619, "language_loss": 0.84348178, "learning_rate": 2.851163879959112e-06, "loss": 0.86476868, "num_input_tokens_seen": 135429585, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.37475586, "step": 6308, "time_per_iteration": 2.7256312370300293 }, { "auxiliary_loss_clip": 0.0173956, "auxiliary_loss_mlp": 0.00323675, "balance_loss_clip": 1.42890167, "balance_loss_mlp": 0.28550416, "epoch": 0.37931760108221857, "flos": 22272731942400.0, "grad_norm": 29.9546703251187, "language_loss": 0.81198275, "learning_rate": 2.8508114322001876e-06, "loss": 0.83261514, "num_input_tokens_seen": 135446320, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.38183594, "step": 6309, "time_per_iteration": 4.099500894546509 }, { "auxiliary_loss_clip": 0.01747561, "auxiliary_loss_mlp": 0.00350681, "balance_loss_clip": 1.44051909, "balance_loss_mlp": 0.31317803, "epoch": 0.37937772433488653, "flos": 19682998197120.0, "grad_norm": 17.039936598968172, "language_loss": 0.85809851, "learning_rate": 2.8504589521784083e-06, "loss": 0.87908089, "num_input_tokens_seen": 135465720, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.37475586, "step": 6310, "time_per_iteration": 2.6909570693969727 }, { "auxiliary_loss_clip": 0.01748546, "auxiliary_loss_mlp": 0.00329769, "balance_loss_clip": 1.44409502, "balance_loss_mlp": 0.29379156, "epoch": 0.3794378475875545, "flos": 19099378016640.0, "grad_norm": 7.0298053179181546, "language_loss": 0.83692122, "learning_rate": 2.8501064399071403e-06, "loss": 0.8577044, "num_input_tokens_seen": 135485155, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.35986328, "step": 6311, "time_per_iteration": 2.659982681274414 }, { "auxiliary_loss_clip": 0.01726732, "auxiliary_loss_mlp": 0.00331949, "balance_loss_clip": 1.4218725, "balance_loss_mlp": 0.29392144, "epoch": 0.37949797084022246, "flos": 20339660684160.0, "grad_norm": 86.74826596780385, "language_loss": 0.76141596, "learning_rate": 2.8497538953997504e-06, "loss": 0.78200269, "num_input_tokens_seen": 135502675, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.38037109, "step": 6312, "time_per_iteration": 2.6650490760803223 }, { "auxiliary_loss_clip": 0.01594735, "auxiliary_loss_mlp": 0.00211282, "balance_loss_clip": 1.40998733, "balance_loss_mlp": 0.19735818, "epoch": 0.37955809409289043, "flos": 63972203477760.0, "grad_norm": 0.7484418696017325, "language_loss": 0.55355513, "learning_rate": 2.849401318669608e-06, "loss": 0.57161528, "num_input_tokens_seen": 135562005, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.13964844, "step": 6313, "time_per_iteration": 4.594098806381226 }, { "auxiliary_loss_clip": 0.01745494, "auxiliary_loss_mlp": 0.00345152, "balance_loss_clip": 1.4380163, "balance_loss_mlp": 0.30733895, "epoch": 0.3796182173455584, "flos": 31540665179520.0, "grad_norm": 3.293529400187782, "language_loss": 0.77518559, "learning_rate": 2.849048709730083e-06, "loss": 0.79609203, "num_input_tokens_seen": 135582600, "router_z_loss_clip": 3.07617188, "router_z_loss_mlp": 0.37817383, "step": 6314, "time_per_iteration": 2.7508485317230225 }, { "auxiliary_loss_clip": 0.01724268, "auxiliary_loss_mlp": 0.00364108, "balance_loss_clip": 1.41995263, "balance_loss_mlp": 0.32534137, "epoch": 0.37967834059822636, "flos": 12130804978560.0, "grad_norm": 6.0027224160187815, "language_loss": 0.80808234, "learning_rate": 2.848696068594545e-06, "loss": 0.82896608, "num_input_tokens_seen": 135600280, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.38769531, "step": 6315, "time_per_iteration": 2.732398748397827 }, { "auxiliary_loss_clip": 0.01710741, "auxiliary_loss_mlp": 0.00300886, "balance_loss_clip": 1.41599309, "balance_loss_mlp": 0.26633912, "epoch": 0.3797384638508943, "flos": 39348578298240.0, "grad_norm": 23.67779095354767, "language_loss": 0.78999102, "learning_rate": 2.8483433952763677e-06, "loss": 0.81010729, "num_input_tokens_seen": 135621560, "router_z_loss_clip": 2.95117188, "router_z_loss_mlp": 0.34545898, "step": 6316, "time_per_iteration": 2.861340045928955 }, { "auxiliary_loss_clip": 0.01722798, "auxiliary_loss_mlp": 0.00332929, "balance_loss_clip": 1.42141974, "balance_loss_mlp": 0.29642695, "epoch": 0.3797985871035623, "flos": 34054016653440.0, "grad_norm": 2.3623849488220716, "language_loss": 0.72991347, "learning_rate": 2.847990689788923e-06, "loss": 0.75047076, "num_input_tokens_seen": 135641745, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.36499023, "step": 6317, "time_per_iteration": 2.7673027515411377 }, { "auxiliary_loss_clip": 0.01702372, "auxiliary_loss_mlp": 0.0033525, "balance_loss_clip": 1.40615058, "balance_loss_mlp": 0.29800946, "epoch": 0.37985871035623026, "flos": 23222174186880.0, "grad_norm": 9.862311346420181, "language_loss": 0.93124503, "learning_rate": 2.8476379521455877e-06, "loss": 0.95162123, "num_input_tokens_seen": 135660650, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.37231445, "step": 6318, "time_per_iteration": 2.7520792484283447 }, { "auxiliary_loss_clip": 0.01701736, "auxiliary_loss_mlp": 0.00357897, "balance_loss_clip": 1.40478742, "balance_loss_mlp": 0.31798545, "epoch": 0.3799188336088982, "flos": 18114958903680.0, "grad_norm": 55.36256524096329, "language_loss": 0.86600429, "learning_rate": 2.8472851823597354e-06, "loss": 0.88660061, "num_input_tokens_seen": 135679980, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.39892578, "step": 6319, "time_per_iteration": 4.062528848648071 }, { "auxiliary_loss_clip": 0.01716187, "auxiliary_loss_mlp": 0.00333954, "balance_loss_clip": 1.41962206, "balance_loss_mlp": 0.29823905, "epoch": 0.3799789568615662, "flos": 21871897096320.0, "grad_norm": 4.079888687741842, "language_loss": 0.71026945, "learning_rate": 2.846932380444744e-06, "loss": 0.73077095, "num_input_tokens_seen": 135699400, "router_z_loss_clip": 2.96679688, "router_z_loss_mlp": 0.35693359, "step": 6320, "time_per_iteration": 2.6409599781036377 }, { "auxiliary_loss_clip": 0.01675601, "auxiliary_loss_mlp": 0.0036625, "balance_loss_clip": 1.38168609, "balance_loss_mlp": 0.32657754, "epoch": 0.3800390801142342, "flos": 32962943082240.0, "grad_norm": 9.118692507694329, "language_loss": 0.76827043, "learning_rate": 2.846579546413992e-06, "loss": 0.78868896, "num_input_tokens_seen": 135723455, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.39648438, "step": 6321, "time_per_iteration": 2.7763257026672363 }, { "auxiliary_loss_clip": 0.01677069, "auxiliary_loss_mlp": 0.00361072, "balance_loss_clip": 1.38552272, "balance_loss_mlp": 0.3219949, "epoch": 0.38009920336690217, "flos": 26907075653760.0, "grad_norm": 25.980472287639238, "language_loss": 0.82597303, "learning_rate": 2.846226680280859e-06, "loss": 0.84635454, "num_input_tokens_seen": 135744335, "router_z_loss_clip": 2.91601562, "router_z_loss_mlp": 0.39086914, "step": 6322, "time_per_iteration": 2.667598247528076 }, { "auxiliary_loss_clip": 0.01677753, "auxiliary_loss_mlp": 0.00341984, "balance_loss_clip": 1.38567162, "balance_loss_mlp": 0.30374187, "epoch": 0.38015932661957014, "flos": 22488913946880.0, "grad_norm": 10.995790896409527, "language_loss": 0.90483999, "learning_rate": 2.845873782058725e-06, "loss": 0.92503732, "num_input_tokens_seen": 135761440, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.38232422, "step": 6323, "time_per_iteration": 2.6958930492401123 }, { "auxiliary_loss_clip": 0.01657554, "auxiliary_loss_mlp": 0.00351482, "balance_loss_clip": 1.36703849, "balance_loss_mlp": 0.31393123, "epoch": 0.3802194498722381, "flos": 21980993679360.0, "grad_norm": 58096.21168933838, "language_loss": 0.815786, "learning_rate": 2.845520851760973e-06, "loss": 0.83587635, "num_input_tokens_seen": 135779955, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.37548828, "step": 6324, "time_per_iteration": 2.615281105041504 }, { "auxiliary_loss_clip": 0.01657504, "auxiliary_loss_mlp": 0.00345755, "balance_loss_clip": 1.3691324, "balance_loss_mlp": 0.3092289, "epoch": 0.38027957312490607, "flos": 21324869896320.0, "grad_norm": 4.8748114815731975, "language_loss": 0.92800522, "learning_rate": 2.8451678894009847e-06, "loss": 0.94803774, "num_input_tokens_seen": 135799840, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.36523438, "step": 6325, "time_per_iteration": 2.6678097248077393 }, { "auxiliary_loss_clip": 0.01679979, "auxiliary_loss_mlp": 0.0037471, "balance_loss_clip": 1.39034772, "balance_loss_mlp": 0.3356812, "epoch": 0.38033969637757403, "flos": 16691244456960.0, "grad_norm": 106.75011144654844, "language_loss": 0.86331761, "learning_rate": 2.8448148949921465e-06, "loss": 0.88386446, "num_input_tokens_seen": 135817880, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.39013672, "step": 6326, "time_per_iteration": 2.6468002796173096 }, { "auxiliary_loss_clip": 0.01670417, "auxiliary_loss_mlp": 0.0031065, "balance_loss_clip": 1.38464427, "balance_loss_mlp": 0.27488685, "epoch": 0.380399819630242, "flos": 36210847685760.0, "grad_norm": 250.08127878542467, "language_loss": 0.78477168, "learning_rate": 2.844461868547842e-06, "loss": 0.80458236, "num_input_tokens_seen": 135838940, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.35766602, "step": 6327, "time_per_iteration": 2.8125650882720947 }, { "auxiliary_loss_clip": 0.01668131, "auxiliary_loss_mlp": 0.003371, "balance_loss_clip": 1.38008904, "balance_loss_mlp": 0.299263, "epoch": 0.38045994288290996, "flos": 21288851533440.0, "grad_norm": 34.43407094339074, "language_loss": 0.90350306, "learning_rate": 2.844108810081459e-06, "loss": 0.92355537, "num_input_tokens_seen": 135858325, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.37817383, "step": 6328, "time_per_iteration": 2.6929678916931152 }, { "auxiliary_loss_clip": 0.01642996, "auxiliary_loss_mlp": 0.00391242, "balance_loss_clip": 1.35751915, "balance_loss_mlp": 0.35330975, "epoch": 0.38052006613557793, "flos": 20922885815040.0, "grad_norm": 12.032188848619578, "language_loss": 0.67222941, "learning_rate": 2.843755719606385e-06, "loss": 0.69257176, "num_input_tokens_seen": 135878430, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.37939453, "step": 6329, "time_per_iteration": 2.86405611038208 }, { "auxiliary_loss_clip": 0.01653699, "auxiliary_loss_mlp": 0.00347674, "balance_loss_clip": 1.36908555, "balance_loss_mlp": 0.31291246, "epoch": 0.3805801893882459, "flos": 20990720649600.0, "grad_norm": 3.0827356038337665, "language_loss": 0.63162988, "learning_rate": 2.8434025971360104e-06, "loss": 0.65164363, "num_input_tokens_seen": 135894755, "router_z_loss_clip": 2.84570312, "router_z_loss_mlp": 0.34765625, "step": 6330, "time_per_iteration": 2.6362199783325195 }, { "auxiliary_loss_clip": 0.01660003, "auxiliary_loss_mlp": 0.00319157, "balance_loss_clip": 1.38163209, "balance_loss_mlp": 0.28358537, "epoch": 0.38064031264091386, "flos": 25558594243200.0, "grad_norm": 77.1129716540159, "language_loss": 0.71788412, "learning_rate": 2.8430494426837243e-06, "loss": 0.73767573, "num_input_tokens_seen": 135918275, "router_z_loss_clip": 2.78515625, "router_z_loss_mlp": 0.35571289, "step": 6331, "time_per_iteration": 2.767843723297119 }, { "auxiliary_loss_clip": 0.01662141, "auxiliary_loss_mlp": 0.00345219, "balance_loss_clip": 1.37312746, "balance_loss_mlp": 0.3075732, "epoch": 0.3807004358935818, "flos": 15085857997440.0, "grad_norm": 191.80767360205004, "language_loss": 0.82617444, "learning_rate": 2.842696256262919e-06, "loss": 0.84624803, "num_input_tokens_seen": 135937430, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.37646484, "step": 6332, "time_per_iteration": 2.6633622646331787 }, { "auxiliary_loss_clip": 0.01646805, "auxiliary_loss_mlp": 0.00357192, "balance_loss_clip": 1.3578546, "balance_loss_mlp": 0.31568378, "epoch": 0.3807605591462498, "flos": 16399398453120.0, "grad_norm": 149.71814529522095, "language_loss": 0.89260828, "learning_rate": 2.842343037886987e-06, "loss": 0.9126482, "num_input_tokens_seen": 135954210, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.41479492, "step": 6333, "time_per_iteration": 2.6423985958099365 }, { "auxiliary_loss_clip": 0.01645006, "auxiliary_loss_mlp": 0.00327805, "balance_loss_clip": 1.35713148, "balance_loss_mlp": 0.29142195, "epoch": 0.3808206823989178, "flos": 29057083102080.0, "grad_norm": 23.273082800797216, "language_loss": 0.90227681, "learning_rate": 2.8419897875693226e-06, "loss": 0.92200488, "num_input_tokens_seen": 135974425, "router_z_loss_clip": 2.87890625, "router_z_loss_mlp": 0.36376953, "step": 6334, "time_per_iteration": 2.744326591491699 }, { "auxiliary_loss_clip": 0.01643671, "auxiliary_loss_mlp": 0.00377337, "balance_loss_clip": 1.35406148, "balance_loss_mlp": 0.33904654, "epoch": 0.3808808056515858, "flos": 15705855676800.0, "grad_norm": 2.7673171741302083, "language_loss": 0.86890757, "learning_rate": 2.841636505323321e-06, "loss": 0.88911772, "num_input_tokens_seen": 135991985, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.38305664, "step": 6335, "time_per_iteration": 2.6367666721343994 }, { "auxiliary_loss_clip": 0.01642752, "auxiliary_loss_mlp": 0.00386126, "balance_loss_clip": 1.3521632, "balance_loss_mlp": 0.34459355, "epoch": 0.38094092890425374, "flos": 20704584908160.0, "grad_norm": 4.887053105690929, "language_loss": 0.8132481, "learning_rate": 2.8412831911623795e-06, "loss": 0.83353686, "num_input_tokens_seen": 136010015, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.4152832, "step": 6336, "time_per_iteration": 2.639522075653076 }, { "auxiliary_loss_clip": 0.01616877, "auxiliary_loss_mlp": 0.00346382, "balance_loss_clip": 1.33122849, "balance_loss_mlp": 0.3098799, "epoch": 0.3810010521569217, "flos": 20667956014080.0, "grad_norm": 19.731351704334546, "language_loss": 0.76143926, "learning_rate": 2.840929845099894e-06, "loss": 0.7810719, "num_input_tokens_seen": 136028440, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.36523438, "step": 6337, "time_per_iteration": 2.6626675128936768 }, { "auxiliary_loss_clip": 0.01618131, "auxiliary_loss_mlp": 0.00378607, "balance_loss_clip": 1.32861519, "balance_loss_mlp": 0.33991128, "epoch": 0.38106117540958967, "flos": 31827626933760.0, "grad_norm": 22.740821330893887, "language_loss": 0.69900227, "learning_rate": 2.8405764671492652e-06, "loss": 0.71896964, "num_input_tokens_seen": 136048360, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.38696289, "step": 6338, "time_per_iteration": 2.738293409347534 }, { "auxiliary_loss_clip": 0.0162191, "auxiliary_loss_mlp": 0.00307624, "balance_loss_clip": 1.33269012, "balance_loss_mlp": 0.27102634, "epoch": 0.38112129866225763, "flos": 16902757693440.0, "grad_norm": 51.366522824283635, "language_loss": 0.78202629, "learning_rate": 2.8402230573238923e-06, "loss": 0.80132163, "num_input_tokens_seen": 136065500, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.36572266, "step": 6339, "time_per_iteration": 2.7502620220184326 }, { "auxiliary_loss_clip": 0.01617708, "auxiliary_loss_mlp": 0.00340545, "balance_loss_clip": 1.3351903, "balance_loss_mlp": 0.30406725, "epoch": 0.3811814219149256, "flos": 20887226588160.0, "grad_norm": 10.22970083205903, "language_loss": 0.77072072, "learning_rate": 2.839869615637177e-06, "loss": 0.79030323, "num_input_tokens_seen": 136084060, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.36474609, "step": 6340, "time_per_iteration": 2.7073283195495605 }, { "auxiliary_loss_clip": 0.01611596, "auxiliary_loss_mlp": 0.0032431, "balance_loss_clip": 1.32688892, "balance_loss_mlp": 0.28749776, "epoch": 0.38124154516759357, "flos": 16690813493760.0, "grad_norm": 2.962139574331783, "language_loss": 0.99508452, "learning_rate": 2.839516142102522e-06, "loss": 1.01444364, "num_input_tokens_seen": 136102310, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.36816406, "step": 6341, "time_per_iteration": 2.702725887298584 }, { "auxiliary_loss_clip": 0.01604112, "auxiliary_loss_mlp": 0.00319418, "balance_loss_clip": 1.3188436, "balance_loss_mlp": 0.28050843, "epoch": 0.38130166842026153, "flos": 19681956702720.0, "grad_norm": 15.44616999737883, "language_loss": 0.82259417, "learning_rate": 2.83916263673333e-06, "loss": 0.84182942, "num_input_tokens_seen": 136120725, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.38891602, "step": 6342, "time_per_iteration": 2.6628644466400146 }, { "auxiliary_loss_clip": 0.01610023, "auxiliary_loss_mlp": 0.00339371, "balance_loss_clip": 1.32598853, "balance_loss_mlp": 0.30267861, "epoch": 0.3813617916729295, "flos": 22198432659840.0, "grad_norm": 5.86366953193857, "language_loss": 0.88495713, "learning_rate": 2.838809099543007e-06, "loss": 0.90445113, "num_input_tokens_seen": 136139105, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.36669922, "step": 6343, "time_per_iteration": 2.6732633113861084 }, { "auxiliary_loss_clip": 0.01596852, "auxiliary_loss_mlp": 0.0032061, "balance_loss_clip": 1.31398034, "balance_loss_mlp": 0.28541934, "epoch": 0.38142191492559746, "flos": 19096899978240.0, "grad_norm": 8.386416740801145, "language_loss": 0.82893658, "learning_rate": 2.838455530544959e-06, "loss": 0.84811127, "num_input_tokens_seen": 136158265, "router_z_loss_clip": 2.82617188, "router_z_loss_mlp": 0.35205078, "step": 6344, "time_per_iteration": 2.64619517326355 }, { "auxiliary_loss_clip": 0.01599836, "auxiliary_loss_mlp": 0.00335868, "balance_loss_clip": 1.31566691, "balance_loss_mlp": 0.29867435, "epoch": 0.3814820381782654, "flos": 24097748112000.0, "grad_norm": 28.982781865800654, "language_loss": 0.8146891, "learning_rate": 2.838101929752593e-06, "loss": 0.83404613, "num_input_tokens_seen": 136176100, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.37182617, "step": 6345, "time_per_iteration": 2.6838085651397705 }, { "auxiliary_loss_clip": 0.01606472, "auxiliary_loss_mlp": 0.00325936, "balance_loss_clip": 1.32523167, "balance_loss_mlp": 0.28938615, "epoch": 0.3815421614309334, "flos": 15778502933760.0, "grad_norm": 7.594039798954289, "language_loss": 0.7811954, "learning_rate": 2.8377482971793187e-06, "loss": 0.80051947, "num_input_tokens_seen": 136195125, "router_z_loss_clip": 2.81054688, "router_z_loss_mlp": 0.36523438, "step": 6346, "time_per_iteration": 2.7067387104034424 }, { "auxiliary_loss_clip": 0.01602548, "auxiliary_loss_mlp": 0.00326646, "balance_loss_clip": 1.3160274, "balance_loss_mlp": 0.28890437, "epoch": 0.38160228468360136, "flos": 19899754819200.0, "grad_norm": 27.33825746462555, "language_loss": 0.82789379, "learning_rate": 2.8373946328385437e-06, "loss": 0.84718573, "num_input_tokens_seen": 136213885, "router_z_loss_clip": 2.86328125, "router_z_loss_mlp": 0.37719727, "step": 6347, "time_per_iteration": 2.662125825881958 }, { "auxiliary_loss_clip": 0.01594731, "auxiliary_loss_mlp": 0.00306102, "balance_loss_clip": 1.31177163, "balance_loss_mlp": 0.27274776, "epoch": 0.3816624079362694, "flos": 19281050029440.0, "grad_norm": 3.3280869493578953, "language_loss": 0.81904149, "learning_rate": 2.8370409367436813e-06, "loss": 0.83804977, "num_input_tokens_seen": 136232700, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.33349609, "step": 6348, "time_per_iteration": 2.6169116497039795 }, { "auxiliary_loss_clip": 0.01599514, "auxiliary_loss_mlp": 0.00315178, "balance_loss_clip": 1.31415939, "balance_loss_mlp": 0.27517158, "epoch": 0.38172253118893734, "flos": 21177564220800.0, "grad_norm": 6.599754956347613, "language_loss": 0.9526394, "learning_rate": 2.836687208908142e-06, "loss": 0.97178632, "num_input_tokens_seen": 136248975, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.40014648, "step": 6349, "time_per_iteration": 4.099695920944214 }, { "auxiliary_loss_clip": 0.01617193, "auxiliary_loss_mlp": 0.00284771, "balance_loss_clip": 1.3280381, "balance_loss_mlp": 0.24893647, "epoch": 0.3817826544416053, "flos": 17529219820800.0, "grad_norm": 12.100676716755896, "language_loss": 0.8647033, "learning_rate": 2.836333449345341e-06, "loss": 0.8837229, "num_input_tokens_seen": 136266710, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.35839844, "step": 6350, "time_per_iteration": 2.6415741443634033 }, { "auxiliary_loss_clip": 0.01594355, "auxiliary_loss_mlp": 0.00315247, "balance_loss_clip": 1.31365752, "balance_loss_mlp": 0.2742388, "epoch": 0.38184277769427327, "flos": 16326535714560.0, "grad_norm": 4.066549234458273, "language_loss": 0.83770609, "learning_rate": 2.8359796580686907e-06, "loss": 0.85680211, "num_input_tokens_seen": 136284445, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.40966797, "step": 6351, "time_per_iteration": 4.143386602401733 }, { "auxiliary_loss_clip": 0.01620492, "auxiliary_loss_mlp": 0.00318301, "balance_loss_clip": 1.33287787, "balance_loss_mlp": 0.27798429, "epoch": 0.38190290094694124, "flos": 30443450382720.0, "grad_norm": 8.89597882618044, "language_loss": 0.84861881, "learning_rate": 2.8356258350916085e-06, "loss": 0.86800665, "num_input_tokens_seen": 136305730, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.40332031, "step": 6352, "time_per_iteration": 2.819288969039917 }, { "auxiliary_loss_clip": 0.01619417, "auxiliary_loss_mlp": 0.00331966, "balance_loss_clip": 1.33261323, "balance_loss_mlp": 0.29431999, "epoch": 0.3819630241996092, "flos": 14209924936320.0, "grad_norm": 26.5925386651671, "language_loss": 0.70450819, "learning_rate": 2.8352719804275104e-06, "loss": 0.72402203, "num_input_tokens_seen": 136323850, "router_z_loss_clip": 2.87109375, "router_z_loss_mlp": 0.3762207, "step": 6353, "time_per_iteration": 2.697115182876587 }, { "auxiliary_loss_clip": 0.01603932, "auxiliary_loss_mlp": 0.00284783, "balance_loss_clip": 1.31660986, "balance_loss_mlp": 0.24873419, "epoch": 0.38202314745227717, "flos": 25009699536000.0, "grad_norm": 4.642029143390403, "language_loss": 0.89097846, "learning_rate": 2.834918094089816e-06, "loss": 0.90986562, "num_input_tokens_seen": 136344880, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.3605957, "step": 6354, "time_per_iteration": 2.674713134765625 }, { "auxiliary_loss_clip": 0.01626034, "auxiliary_loss_mlp": 0.00277559, "balance_loss_clip": 1.34139276, "balance_loss_mlp": 0.24384655, "epoch": 0.38208327070494513, "flos": 20814507504000.0, "grad_norm": 1.8706555521131765, "language_loss": 0.88374752, "learning_rate": 2.834564176091943e-06, "loss": 0.90278345, "num_input_tokens_seen": 136366060, "router_z_loss_clip": 2.84570312, "router_z_loss_mlp": 0.33691406, "step": 6355, "time_per_iteration": 4.093441724777222 }, { "auxiliary_loss_clip": 0.01611108, "auxiliary_loss_mlp": 0.00302688, "balance_loss_clip": 1.32657981, "balance_loss_mlp": 0.2684277, "epoch": 0.3821433939576131, "flos": 22637727993600.0, "grad_norm": 1065.2635182942765, "language_loss": 0.82591498, "learning_rate": 2.8342102264473125e-06, "loss": 0.84505296, "num_input_tokens_seen": 136385625, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.34228516, "step": 6356, "time_per_iteration": 2.6782469749450684 }, { "auxiliary_loss_clip": 0.01627385, "auxiliary_loss_mlp": 0.00319317, "balance_loss_clip": 1.33355713, "balance_loss_mlp": 0.2811704, "epoch": 0.38220351721028106, "flos": 26869872142080.0, "grad_norm": 17.92427179665703, "language_loss": 0.8797105, "learning_rate": 2.833856245169348e-06, "loss": 0.89917755, "num_input_tokens_seen": 136405750, "router_z_loss_clip": 2.93945312, "router_z_loss_mlp": 0.3815918, "step": 6357, "time_per_iteration": 2.717782735824585 }, { "auxiliary_loss_clip": 0.01638929, "auxiliary_loss_mlp": 0.00311389, "balance_loss_clip": 1.34712601, "balance_loss_mlp": 0.27624598, "epoch": 0.38226364046294903, "flos": 23367468700800.0, "grad_norm": 21.557444268876438, "language_loss": 0.87017787, "learning_rate": 2.8335022322714695e-06, "loss": 0.88968104, "num_input_tokens_seen": 136426085, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.35131836, "step": 6358, "time_per_iteration": 2.6921043395996094 }, { "auxiliary_loss_clip": 0.01635267, "auxiliary_loss_mlp": 0.00287146, "balance_loss_clip": 1.33668435, "balance_loss_mlp": 0.25119299, "epoch": 0.382323763715617, "flos": 19646225648640.0, "grad_norm": 36.700007592076034, "language_loss": 0.85840839, "learning_rate": 2.8331481877671036e-06, "loss": 0.8776325, "num_input_tokens_seen": 136442670, "router_z_loss_clip": 2.99023438, "router_z_loss_mlp": 0.35961914, "step": 6359, "time_per_iteration": 2.6756958961486816 }, { "auxiliary_loss_clip": 0.0163617, "auxiliary_loss_mlp": 0.00310689, "balance_loss_clip": 1.34665358, "balance_loss_mlp": 0.27371031, "epoch": 0.38238388696828496, "flos": 54124741232640.0, "grad_norm": 6.757895651590891, "language_loss": 0.76579475, "learning_rate": 2.8327941116696754e-06, "loss": 0.78526336, "num_input_tokens_seen": 136465730, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.36962891, "step": 6360, "time_per_iteration": 2.935215711593628 }, { "auxiliary_loss_clip": 0.01644196, "auxiliary_loss_mlp": 0.00342146, "balance_loss_clip": 1.35449314, "balance_loss_mlp": 0.3038322, "epoch": 0.382444010220953, "flos": 24936190352640.0, "grad_norm": 183.3862611858086, "language_loss": 0.84157073, "learning_rate": 2.83244000399261e-06, "loss": 0.8614341, "num_input_tokens_seen": 136487215, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.3828125, "step": 6361, "time_per_iteration": 4.188424587249756 }, { "auxiliary_loss_clip": 0.01631754, "auxiliary_loss_mlp": 0.00310187, "balance_loss_clip": 1.33984244, "balance_loss_mlp": 0.27163455, "epoch": 0.38250413347362094, "flos": 42337351209600.0, "grad_norm": 4.75359859340562, "language_loss": 0.71291703, "learning_rate": 2.832085864749337e-06, "loss": 0.73233646, "num_input_tokens_seen": 136510365, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.38574219, "step": 6362, "time_per_iteration": 2.8097174167633057 }, { "auxiliary_loss_clip": 0.01633669, "auxiliary_loss_mlp": 0.00337431, "balance_loss_clip": 1.33474171, "balance_loss_mlp": 0.29818696, "epoch": 0.3825642567262889, "flos": 16289224462080.0, "grad_norm": 230.2022076347913, "language_loss": 0.89253169, "learning_rate": 2.8317316939532848e-06, "loss": 0.91224265, "num_input_tokens_seen": 136527100, "router_z_loss_clip": 2.98828125, "router_z_loss_mlp": 0.39282227, "step": 6363, "time_per_iteration": 2.654904365539551 }, { "auxiliary_loss_clip": 0.01642755, "auxiliary_loss_mlp": 0.00299767, "balance_loss_clip": 1.35172915, "balance_loss_mlp": 0.26560172, "epoch": 0.3826243799789569, "flos": 45654778586880.0, "grad_norm": 3.1390254233277077, "language_loss": 0.67328572, "learning_rate": 2.8313774916178825e-06, "loss": 0.692711, "num_input_tokens_seen": 136550870, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.34204102, "step": 6364, "time_per_iteration": 2.921494722366333 }, { "auxiliary_loss_clip": 0.01633247, "auxiliary_loss_mlp": 0.00323678, "balance_loss_clip": 1.33395076, "balance_loss_mlp": 0.28481627, "epoch": 0.38268450323162484, "flos": 25301581453440.0, "grad_norm": 9.333599662354064, "language_loss": 0.76151007, "learning_rate": 2.8310232577565635e-06, "loss": 0.78107935, "num_input_tokens_seen": 136569895, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.38842773, "step": 6365, "time_per_iteration": 2.712049722671509 }, { "auxiliary_loss_clip": 0.01642703, "auxiliary_loss_mlp": 0.00331412, "balance_loss_clip": 1.34180737, "balance_loss_mlp": 0.2928119, "epoch": 0.3827446264842928, "flos": 21836022387840.0, "grad_norm": 2.6915263514773535, "language_loss": 0.79895067, "learning_rate": 2.830668992382758e-06, "loss": 0.81869185, "num_input_tokens_seen": 136588585, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.38598633, "step": 6366, "time_per_iteration": 2.6547906398773193 }, { "auxiliary_loss_clip": 0.01652466, "auxiliary_loss_mlp": 0.0030324, "balance_loss_clip": 1.35244131, "balance_loss_mlp": 0.26731074, "epoch": 0.38280474973696077, "flos": 25734591907200.0, "grad_norm": 3.2746922353969063, "language_loss": 0.78030682, "learning_rate": 2.830314695509902e-06, "loss": 0.79986382, "num_input_tokens_seen": 136606640, "router_z_loss_clip": 3.00195312, "router_z_loss_mlp": 0.359375, "step": 6367, "time_per_iteration": 2.7022924423217773 }, { "auxiliary_loss_clip": 0.01645579, "auxiliary_loss_mlp": 0.00308976, "balance_loss_clip": 1.35439682, "balance_loss_mlp": 0.27225989, "epoch": 0.38286487298962874, "flos": 24895934184960.0, "grad_norm": 29.008406633294673, "language_loss": 0.71184838, "learning_rate": 2.82996036715143e-06, "loss": 0.73139393, "num_input_tokens_seen": 136624940, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.36743164, "step": 6368, "time_per_iteration": 2.6671314239501953 }, { "auxiliary_loss_clip": 0.0164398, "auxiliary_loss_mlp": 0.00249546, "balance_loss_clip": 1.35303164, "balance_loss_mlp": 0.21366408, "epoch": 0.3829249962422967, "flos": 28543703967360.0, "grad_norm": 6.804965912593796, "language_loss": 0.75027627, "learning_rate": 2.8296060073207763e-06, "loss": 0.76921153, "num_input_tokens_seen": 136645540, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.35864258, "step": 6369, "time_per_iteration": 2.7387173175811768 }, { "auxiliary_loss_clip": 0.01644063, "auxiliary_loss_mlp": 0.00280656, "balance_loss_clip": 1.34890759, "balance_loss_mlp": 0.24632323, "epoch": 0.38298511949496467, "flos": 21471205904640.0, "grad_norm": 4.112067191658734, "language_loss": 0.84173089, "learning_rate": 2.8292516160313804e-06, "loss": 0.86097807, "num_input_tokens_seen": 136664530, "router_z_loss_clip": 2.95117188, "router_z_loss_mlp": 0.34350586, "step": 6370, "time_per_iteration": 2.671417236328125 }, { "auxiliary_loss_clip": 0.01640708, "auxiliary_loss_mlp": 0.00295478, "balance_loss_clip": 1.34462953, "balance_loss_mlp": 0.26031083, "epoch": 0.38304524274763263, "flos": 31679998035840.0, "grad_norm": 13.542197684908267, "language_loss": 0.7150318, "learning_rate": 2.8288971932966805e-06, "loss": 0.73439366, "num_input_tokens_seen": 136682315, "router_z_loss_clip": 2.9609375, "router_z_loss_mlp": 0.35131836, "step": 6371, "time_per_iteration": 2.8249351978302 }, { "auxiliary_loss_clip": 0.01629305, "auxiliary_loss_mlp": 0.0031426, "balance_loss_clip": 1.32985497, "balance_loss_mlp": 0.27825874, "epoch": 0.3831053660003006, "flos": 25076816098560.0, "grad_norm": 2.100183189851737, "language_loss": 0.79272449, "learning_rate": 2.8285427391301155e-06, "loss": 0.81216019, "num_input_tokens_seen": 136701185, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.35986328, "step": 6372, "time_per_iteration": 2.7687911987304688 }, { "auxiliary_loss_clip": 0.01634512, "auxiliary_loss_mlp": 0.00311852, "balance_loss_clip": 1.33286881, "balance_loss_mlp": 0.27606487, "epoch": 0.38316548925296856, "flos": 23259018562560.0, "grad_norm": 11.00943733144634, "language_loss": 0.91037482, "learning_rate": 2.8281882535451266e-06, "loss": 0.92983842, "num_input_tokens_seen": 136721265, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.35791016, "step": 6373, "time_per_iteration": 2.733586072921753 }, { "auxiliary_loss_clip": 0.01633817, "auxiliary_loss_mlp": 0.00322382, "balance_loss_clip": 1.33856297, "balance_loss_mlp": 0.28618985, "epoch": 0.3832256125056366, "flos": 34423465991040.0, "grad_norm": 5.080349468038354, "language_loss": 0.81788421, "learning_rate": 2.8278337365551567e-06, "loss": 0.83744615, "num_input_tokens_seen": 136741885, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.36181641, "step": 6374, "time_per_iteration": 2.7729270458221436 }, { "auxiliary_loss_clip": 0.01641953, "auxiliary_loss_mlp": 0.00315158, "balance_loss_clip": 1.34198666, "balance_loss_mlp": 0.27934742, "epoch": 0.38328573575830455, "flos": 21762764599680.0, "grad_norm": 18.077195179627996, "language_loss": 0.83782071, "learning_rate": 2.8274791881736485e-06, "loss": 0.85739183, "num_input_tokens_seen": 136760905, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.3581543, "step": 6375, "time_per_iteration": 2.6978530883789062 }, { "auxiliary_loss_clip": 0.0161977, "auxiliary_loss_mlp": 0.00298048, "balance_loss_clip": 1.32249069, "balance_loss_mlp": 0.26209489, "epoch": 0.3833458590109725, "flos": 17380010724480.0, "grad_norm": 54.25424411282349, "language_loss": 0.80408758, "learning_rate": 2.8271246084140457e-06, "loss": 0.82326579, "num_input_tokens_seen": 136777240, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 0.35961914, "step": 6376, "time_per_iteration": 2.6696083545684814 }, { "auxiliary_loss_clip": 0.01624487, "auxiliary_loss_mlp": 0.00303517, "balance_loss_clip": 1.33263206, "balance_loss_mlp": 0.26765925, "epoch": 0.3834059822636405, "flos": 29424557191680.0, "grad_norm": 43.3541374108664, "language_loss": 0.72407168, "learning_rate": 2.826769997289796e-06, "loss": 0.7433517, "num_input_tokens_seen": 136801040, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.35839844, "step": 6377, "time_per_iteration": 2.8000590801239014 }, { "auxiliary_loss_clip": 0.01619972, "auxiliary_loss_mlp": 0.00319812, "balance_loss_clip": 1.32066298, "balance_loss_mlp": 0.28288147, "epoch": 0.38346610551630844, "flos": 21470739027840.0, "grad_norm": 1419.7694736385138, "language_loss": 0.82079792, "learning_rate": 2.826415354814344e-06, "loss": 0.84019578, "num_input_tokens_seen": 136819495, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.36889648, "step": 6378, "time_per_iteration": 2.6820626258850098 }, { "auxiliary_loss_clip": 0.01636608, "auxiliary_loss_mlp": 0.00327475, "balance_loss_clip": 1.33576155, "balance_loss_mlp": 0.29140192, "epoch": 0.3835262287689764, "flos": 27561224188800.0, "grad_norm": 43.77223103004055, "language_loss": 0.74262512, "learning_rate": 2.8260606810011396e-06, "loss": 0.76226592, "num_input_tokens_seen": 136838840, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.36083984, "step": 6379, "time_per_iteration": 2.7266478538513184 }, { "auxiliary_loss_clip": 0.01631655, "auxiliary_loss_mlp": 0.00295413, "balance_loss_clip": 1.33536875, "balance_loss_mlp": 0.26122427, "epoch": 0.3835863520216444, "flos": 15523716787200.0, "grad_norm": 3.53089952463034, "language_loss": 0.88467366, "learning_rate": 2.8257059758636315e-06, "loss": 0.90394425, "num_input_tokens_seen": 136854425, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.34204102, "step": 6380, "time_per_iteration": 2.7315142154693604 }, { "auxiliary_loss_clip": 0.01624299, "auxiliary_loss_mlp": 0.00324617, "balance_loss_clip": 1.32844687, "balance_loss_mlp": 0.28782916, "epoch": 0.38364647527431234, "flos": 21904934630400.0, "grad_norm": 12.648454522273104, "language_loss": 0.85923922, "learning_rate": 2.8253512394152697e-06, "loss": 0.87872839, "num_input_tokens_seen": 136874355, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.36816406, "step": 6381, "time_per_iteration": 2.6952576637268066 }, { "auxiliary_loss_clip": 0.01586094, "auxiliary_loss_mlp": 0.00228604, "balance_loss_clip": 1.40374136, "balance_loss_mlp": 0.21830411, "epoch": 0.3837065985269803, "flos": 65534927558400.0, "grad_norm": 0.8122089837339881, "language_loss": 0.59734744, "learning_rate": 2.8249964716695068e-06, "loss": 0.61549443, "num_input_tokens_seen": 136937475, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.10302734, "step": 6382, "time_per_iteration": 3.1304779052734375 }, { "auxiliary_loss_clip": 0.01618217, "auxiliary_loss_mlp": 0.0029267, "balance_loss_clip": 1.32110715, "balance_loss_mlp": 0.25614429, "epoch": 0.38376672177964827, "flos": 28256598558720.0, "grad_norm": 31.01702469523618, "language_loss": 0.75027978, "learning_rate": 2.824641672639794e-06, "loss": 0.76938868, "num_input_tokens_seen": 136955805, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.36523438, "step": 6383, "time_per_iteration": 2.7845327854156494 }, { "auxiliary_loss_clip": 0.01634254, "auxiliary_loss_mlp": 0.00296492, "balance_loss_clip": 1.33144569, "balance_loss_mlp": 0.26173091, "epoch": 0.38382684503231623, "flos": 20631363033600.0, "grad_norm": 12.106975601486083, "language_loss": 0.82265091, "learning_rate": 2.824286842339587e-06, "loss": 0.8419584, "num_input_tokens_seen": 136975240, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.34741211, "step": 6384, "time_per_iteration": 2.7124667167663574 }, { "auxiliary_loss_clip": 0.01651498, "auxiliary_loss_mlp": 0.00279031, "balance_loss_clip": 1.34730005, "balance_loss_mlp": 0.24651103, "epoch": 0.3838869682849842, "flos": 19605825826560.0, "grad_norm": 40.14352114451263, "language_loss": 0.80459905, "learning_rate": 2.823931980782341e-06, "loss": 0.82390434, "num_input_tokens_seen": 136994985, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.32495117, "step": 6385, "time_per_iteration": 2.764244794845581 }, { "auxiliary_loss_clip": 0.01555741, "auxiliary_loss_mlp": 0.00213322, "balance_loss_clip": 1.37314105, "balance_loss_mlp": 0.20159169, "epoch": 0.38394709153765216, "flos": 56556110891520.0, "grad_norm": 1.1764234130688362, "language_loss": 0.678231, "learning_rate": 2.82357708798151e-06, "loss": 0.69592154, "num_input_tokens_seen": 137046290, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.1171875, "step": 6386, "time_per_iteration": 3.00028657913208 }, { "auxiliary_loss_clip": 0.0164764, "auxiliary_loss_mlp": 0.00314766, "balance_loss_clip": 1.3471427, "balance_loss_mlp": 0.27998027, "epoch": 0.3840072147903202, "flos": 15888748752000.0, "grad_norm": 9.539811422810631, "language_loss": 0.79170305, "learning_rate": 2.8232221639505547e-06, "loss": 0.8113271, "num_input_tokens_seen": 137064725, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.34765625, "step": 6387, "time_per_iteration": 2.683128595352173 }, { "auxiliary_loss_clip": 0.01650924, "auxiliary_loss_mlp": 0.00302356, "balance_loss_clip": 1.35045171, "balance_loss_mlp": 0.26969314, "epoch": 0.38406733804298815, "flos": 28218030330240.0, "grad_norm": 2.926624253721753, "language_loss": 0.87572628, "learning_rate": 2.822867208702932e-06, "loss": 0.89525914, "num_input_tokens_seen": 137086030, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.32666016, "step": 6388, "time_per_iteration": 2.7151131629943848 }, { "auxiliary_loss_clip": 0.0165121, "auxiliary_loss_mlp": 0.00288919, "balance_loss_clip": 1.34033775, "balance_loss_mlp": 0.25356135, "epoch": 0.3841274612956561, "flos": 18223588609920.0, "grad_norm": 13.983126721764119, "language_loss": 0.82510334, "learning_rate": 2.8225122222521026e-06, "loss": 0.84450459, "num_input_tokens_seen": 137105400, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.35375977, "step": 6389, "time_per_iteration": 2.742051839828491 }, { "auxiliary_loss_clip": 0.0166683, "auxiliary_loss_mlp": 0.00320901, "balance_loss_clip": 1.34844732, "balance_loss_mlp": 0.28289717, "epoch": 0.3841875845483241, "flos": 19792884879360.0, "grad_norm": 4.778524176391641, "language_loss": 0.8391583, "learning_rate": 2.8221572046115273e-06, "loss": 0.85903561, "num_input_tokens_seen": 137124985, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.37988281, "step": 6390, "time_per_iteration": 2.6780333518981934 }, { "auxiliary_loss_clip": 0.01658868, "auxiliary_loss_mlp": 0.00288224, "balance_loss_clip": 1.34294629, "balance_loss_mlp": 0.25367731, "epoch": 0.38424770780099204, "flos": 29898829393920.0, "grad_norm": 262.4528700326799, "language_loss": 0.76295197, "learning_rate": 2.821802155794668e-06, "loss": 0.78242284, "num_input_tokens_seen": 137146745, "router_z_loss_clip": 3.16015625, "router_z_loss_mlp": 0.34545898, "step": 6391, "time_per_iteration": 2.7532753944396973 }, { "auxiliary_loss_clip": 0.01649161, "auxiliary_loss_mlp": 0.00331622, "balance_loss_clip": 1.33381319, "balance_loss_mlp": 0.29354697, "epoch": 0.38430783105366, "flos": 20813717404800.0, "grad_norm": 8.951828855598835, "language_loss": 0.91793633, "learning_rate": 2.8214470758149884e-06, "loss": 0.93774408, "num_input_tokens_seen": 137163195, "router_z_loss_clip": 3.15429688, "router_z_loss_mlp": 0.38061523, "step": 6392, "time_per_iteration": 4.065686225891113 }, { "auxiliary_loss_clip": 0.01655674, "auxiliary_loss_mlp": 0.00317393, "balance_loss_clip": 1.33980393, "balance_loss_mlp": 0.28258431, "epoch": 0.384367954306328, "flos": 10998577399680.0, "grad_norm": 44.87928977372603, "language_loss": 0.69636261, "learning_rate": 2.8210919646859536e-06, "loss": 0.7160933, "num_input_tokens_seen": 137179330, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.34790039, "step": 6393, "time_per_iteration": 2.67919921875 }, { "auxiliary_loss_clip": 0.0165279, "auxiliary_loss_mlp": 0.00304554, "balance_loss_clip": 1.33456767, "balance_loss_mlp": 0.26778981, "epoch": 0.38442807755899594, "flos": 25338030779520.0, "grad_norm": 6.9178031105268785, "language_loss": 0.80512524, "learning_rate": 2.820736822421029e-06, "loss": 0.82469863, "num_input_tokens_seen": 137198655, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.3671875, "step": 6394, "time_per_iteration": 4.246353387832642 }, { "auxiliary_loss_clip": 0.01657222, "auxiliary_loss_mlp": 0.00298556, "balance_loss_clip": 1.33776689, "balance_loss_mlp": 0.26358047, "epoch": 0.3844882008116639, "flos": 21069760527360.0, "grad_norm": 38.01514094757584, "language_loss": 0.89542091, "learning_rate": 2.8203816490336822e-06, "loss": 0.91497874, "num_input_tokens_seen": 137217120, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.34985352, "step": 6395, "time_per_iteration": 2.8561062812805176 }, { "auxiliary_loss_clip": 0.01671022, "auxiliary_loss_mlp": 0.00296386, "balance_loss_clip": 1.34647012, "balance_loss_mlp": 0.26346076, "epoch": 0.38454832406433187, "flos": 17963235855360.0, "grad_norm": 29.2097131755875, "language_loss": 0.80332494, "learning_rate": 2.8200264445373813e-06, "loss": 0.82299906, "num_input_tokens_seen": 137234410, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.3293457, "step": 6396, "time_per_iteration": 2.7471776008605957 }, { "auxiliary_loss_clip": 0.01474214, "auxiliary_loss_mlp": 0.00106547, "balance_loss_clip": 1.30212903, "balance_loss_mlp": 0.09681967, "epoch": 0.38460844731699984, "flos": 67924999555200.0, "grad_norm": 0.9973468574081054, "language_loss": 0.59828007, "learning_rate": 2.8196712089455954e-06, "loss": 0.6140877, "num_input_tokens_seen": 137294940, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.09716797, "step": 6397, "time_per_iteration": 4.663430213928223 }, { "auxiliary_loss_clip": 0.01665425, "auxiliary_loss_mlp": 0.002565, "balance_loss_clip": 1.35379732, "balance_loss_mlp": 0.2251482, "epoch": 0.3846685705696678, "flos": 25849075530240.0, "grad_norm": 683068.9376245758, "language_loss": 0.92615199, "learning_rate": 2.819315942271794e-06, "loss": 0.94537127, "num_input_tokens_seen": 137315035, "router_z_loss_clip": 3.11523438, "router_z_loss_mlp": 0.31323242, "step": 6398, "time_per_iteration": 2.8191897869110107 }, { "auxiliary_loss_clip": 0.01647086, "auxiliary_loss_mlp": 0.00293524, "balance_loss_clip": 1.33587265, "balance_loss_mlp": 0.26198125, "epoch": 0.38472869382233577, "flos": 16290194129280.0, "grad_norm": 4.797798387559952, "language_loss": 0.87718755, "learning_rate": 2.8189606445294515e-06, "loss": 0.89659369, "num_input_tokens_seen": 137333155, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.31542969, "step": 6399, "time_per_iteration": 2.652078151702881 }, { "auxiliary_loss_clip": 0.01629405, "auxiliary_loss_mlp": 0.0027606, "balance_loss_clip": 1.32162011, "balance_loss_mlp": 0.24010614, "epoch": 0.38478881707500373, "flos": 19353122668800.0, "grad_norm": 105.21784984775711, "language_loss": 0.75932962, "learning_rate": 2.818605315732038e-06, "loss": 0.77838427, "num_input_tokens_seen": 137351515, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.35961914, "step": 6400, "time_per_iteration": 2.826495409011841 }, { "auxiliary_loss_clip": 0.01643348, "auxiliary_loss_mlp": 0.00290036, "balance_loss_clip": 1.33457339, "balance_loss_mlp": 0.25780219, "epoch": 0.38484894032767175, "flos": 24860849575680.0, "grad_norm": 6.583121368690919, "language_loss": 0.7875967, "learning_rate": 2.81824995589303e-06, "loss": 0.80693054, "num_input_tokens_seen": 137371255, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.32202148, "step": 6401, "time_per_iteration": 2.7704579830169678 }, { "auxiliary_loss_clip": 0.0165261, "auxiliary_loss_mlp": 0.00277772, "balance_loss_clip": 1.33622897, "balance_loss_mlp": 0.24115115, "epoch": 0.3849090635803397, "flos": 14501806853760.0, "grad_norm": 4.698122172961729, "language_loss": 0.80882818, "learning_rate": 2.8178945650259012e-06, "loss": 0.82813191, "num_input_tokens_seen": 137388980, "router_z_loss_clip": 3.1640625, "router_z_loss_mlp": 0.3659668, "step": 6402, "time_per_iteration": 2.711061477661133 }, { "auxiliary_loss_clip": 0.01636417, "auxiliary_loss_mlp": 0.00268352, "balance_loss_clip": 1.32975483, "balance_loss_mlp": 0.23318522, "epoch": 0.3849691868330077, "flos": 18515865576960.0, "grad_norm": 12.770514564255796, "language_loss": 0.90521371, "learning_rate": 2.817539143144128e-06, "loss": 0.92426139, "num_input_tokens_seen": 137406885, "router_z_loss_clip": 3.0703125, "router_z_loss_mlp": 0.3515625, "step": 6403, "time_per_iteration": 2.6125285625457764 }, { "auxiliary_loss_clip": 0.01618506, "auxiliary_loss_mlp": 0.0029043, "balance_loss_clip": 1.32121253, "balance_loss_mlp": 0.25881618, "epoch": 0.38502931008567565, "flos": 21616392677760.0, "grad_norm": 522.9589302879966, "language_loss": 0.90498936, "learning_rate": 2.817183690261189e-06, "loss": 0.9240787, "num_input_tokens_seen": 137425535, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.31616211, "step": 6404, "time_per_iteration": 4.086002826690674 }, { "auxiliary_loss_clip": 0.01614214, "auxiliary_loss_mlp": 0.00283081, "balance_loss_clip": 1.31210184, "balance_loss_mlp": 0.24932085, "epoch": 0.3850894333383436, "flos": 25415346804480.0, "grad_norm": 2.4525566263436973, "language_loss": 0.75457811, "learning_rate": 2.816828206390563e-06, "loss": 0.77355111, "num_input_tokens_seen": 137447700, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.33740234, "step": 6405, "time_per_iteration": 2.7340383529663086 }, { "auxiliary_loss_clip": 0.01618282, "auxiliary_loss_mlp": 0.00269863, "balance_loss_clip": 1.31931412, "balance_loss_mlp": 0.23805785, "epoch": 0.3851495565910116, "flos": 20227870581120.0, "grad_norm": 109.48243481597194, "language_loss": 0.86628896, "learning_rate": 2.816472691545729e-06, "loss": 0.8851704, "num_input_tokens_seen": 137462245, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.31811523, "step": 6406, "time_per_iteration": 2.6327297687530518 }, { "auxiliary_loss_clip": 0.01622763, "auxiliary_loss_mlp": 0.00282586, "balance_loss_clip": 1.31411695, "balance_loss_mlp": 0.24746734, "epoch": 0.38520967984367954, "flos": 16508459122560.0, "grad_norm": 9.009872721962736, "language_loss": 0.92503709, "learning_rate": 2.8161171457401694e-06, "loss": 0.9440906, "num_input_tokens_seen": 137476455, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.35107422, "step": 6407, "time_per_iteration": 2.60701060295105 }, { "auxiliary_loss_clip": 0.01407908, "auxiliary_loss_mlp": 0.00097652, "balance_loss_clip": 1.23090756, "balance_loss_mlp": 0.09006993, "epoch": 0.3852698030963475, "flos": 61313772971520.0, "grad_norm": 0.7701428187108796, "language_loss": 0.64861035, "learning_rate": 2.815761568987365e-06, "loss": 0.66366595, "num_input_tokens_seen": 137539845, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.07568359, "step": 6408, "time_per_iteration": 3.21704363822937 }, { "auxiliary_loss_clip": 0.01623172, "auxiliary_loss_mlp": 0.00307297, "balance_loss_clip": 1.32007957, "balance_loss_mlp": 0.27339351, "epoch": 0.3853299263490155, "flos": 22893016930560.0, "grad_norm": 22.150362727447924, "language_loss": 0.78805512, "learning_rate": 2.8154059613008e-06, "loss": 0.80735981, "num_input_tokens_seen": 137559880, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.33911133, "step": 6409, "time_per_iteration": 2.6574177742004395 }, { "auxiliary_loss_clip": 0.01623192, "auxiliary_loss_mlp": 0.00288355, "balance_loss_clip": 1.3140552, "balance_loss_mlp": 0.25359327, "epoch": 0.38539004960168344, "flos": 20047491457920.0, "grad_norm": 120.7775109532317, "language_loss": 0.8092851, "learning_rate": 2.81505032269396e-06, "loss": 0.82840061, "num_input_tokens_seen": 137578225, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.34741211, "step": 6410, "time_per_iteration": 2.67203688621521 }, { "auxiliary_loss_clip": 0.01413846, "auxiliary_loss_mlp": 0.00091077, "balance_loss_clip": 1.23758411, "balance_loss_mlp": 0.08220824, "epoch": 0.3854501728543514, "flos": 68730691570560.0, "grad_norm": 2.0941226392176775, "language_loss": 0.59496307, "learning_rate": 2.81469465318033e-06, "loss": 0.61001229, "num_input_tokens_seen": 137645770, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.08886719, "step": 6411, "time_per_iteration": 3.242995023727417 }, { "auxiliary_loss_clip": 0.01616793, "auxiliary_loss_mlp": 0.00246114, "balance_loss_clip": 1.30892897, "balance_loss_mlp": 0.21333119, "epoch": 0.38551029610701937, "flos": 20485027025280.0, "grad_norm": 6.35446635716123, "language_loss": 0.85746861, "learning_rate": 2.814338952773397e-06, "loss": 0.87609762, "num_input_tokens_seen": 137664090, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.32739258, "step": 6412, "time_per_iteration": 2.7130844593048096 }, { "auxiliary_loss_clip": 0.01618982, "auxiliary_loss_mlp": 0.00275395, "balance_loss_clip": 1.30901361, "balance_loss_mlp": 0.2427558, "epoch": 0.38557041935968733, "flos": 23471788775040.0, "grad_norm": 2.549919299423259, "language_loss": 0.87172139, "learning_rate": 2.8139832214866493e-06, "loss": 0.89066517, "num_input_tokens_seen": 137683190, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.32666016, "step": 6413, "time_per_iteration": 2.6942784786224365 }, { "auxiliary_loss_clip": 0.01417563, "auxiliary_loss_mlp": 0.0007404, "balance_loss_clip": 1.24129415, "balance_loss_mlp": 0.06540932, "epoch": 0.38563054261235535, "flos": 63966636869760.0, "grad_norm": 0.8104589740427719, "language_loss": 0.61041033, "learning_rate": 2.813627459333576e-06, "loss": 0.6253264, "num_input_tokens_seen": 137737315, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.08642578, "step": 6414, "time_per_iteration": 3.077376365661621 }, { "auxiliary_loss_clip": 0.01621711, "auxiliary_loss_mlp": 0.00251389, "balance_loss_clip": 1.30936754, "balance_loss_mlp": 0.21581739, "epoch": 0.3856906658650233, "flos": 23987789602560.0, "grad_norm": 5.457801374787279, "language_loss": 0.86835825, "learning_rate": 2.8132716663276685e-06, "loss": 0.88708919, "num_input_tokens_seen": 137753535, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.35546875, "step": 6415, "time_per_iteration": 2.7062904834747314 }, { "auxiliary_loss_clip": 0.01636942, "auxiliary_loss_mlp": 0.00247308, "balance_loss_clip": 1.32462645, "balance_loss_mlp": 0.21342859, "epoch": 0.3857507891176913, "flos": 25007436979200.0, "grad_norm": 8.493384525836746, "language_loss": 0.84398359, "learning_rate": 2.8129158424824173e-06, "loss": 0.86282605, "num_input_tokens_seen": 137773405, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.33862305, "step": 6416, "time_per_iteration": 2.7539241313934326 }, { "auxiliary_loss_clip": 0.01621486, "auxiliary_loss_mlp": 0.00275288, "balance_loss_clip": 1.31088769, "balance_loss_mlp": 0.24431768, "epoch": 0.38581091237035925, "flos": 21536778182400.0, "grad_norm": 7.877570523578659, "language_loss": 0.85020435, "learning_rate": 2.8125599878113155e-06, "loss": 0.8691721, "num_input_tokens_seen": 137790810, "router_z_loss_clip": 3.10351562, "router_z_loss_mlp": 0.30981445, "step": 6417, "time_per_iteration": 2.681366205215454 }, { "auxiliary_loss_clip": 0.01622608, "auxiliary_loss_mlp": 0.00254775, "balance_loss_clip": 1.3100996, "balance_loss_mlp": 0.2227788, "epoch": 0.3858710356230272, "flos": 17383889393280.0, "grad_norm": 13.549240707663827, "language_loss": 0.8904106, "learning_rate": 2.8122041023278583e-06, "loss": 0.9091844, "num_input_tokens_seen": 137810265, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.32006836, "step": 6418, "time_per_iteration": 2.726663589477539 }, { "auxiliary_loss_clip": 0.01619538, "auxiliary_loss_mlp": 0.00248069, "balance_loss_clip": 1.31333435, "balance_loss_mlp": 0.21705124, "epoch": 0.3859311588756952, "flos": 20339588856960.0, "grad_norm": 7.2652202216629105, "language_loss": 0.87159479, "learning_rate": 2.8118481860455407e-06, "loss": 0.89027083, "num_input_tokens_seen": 137828580, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.31030273, "step": 6419, "time_per_iteration": 2.705132484436035 }, { "auxiliary_loss_clip": 0.01635078, "auxiliary_loss_mlp": 0.00248218, "balance_loss_clip": 1.31754112, "balance_loss_mlp": 0.21302727, "epoch": 0.38599128212836314, "flos": 26321157002880.0, "grad_norm": 37.02581083666262, "language_loss": 0.75878537, "learning_rate": 2.8114922389778573e-06, "loss": 0.77761835, "num_input_tokens_seen": 137846145, "router_z_loss_clip": 3.17773438, "router_z_loss_mlp": 0.35180664, "step": 6420, "time_per_iteration": 2.750032424926758 }, { "auxiliary_loss_clip": 0.01637871, "auxiliary_loss_mlp": 0.00264549, "balance_loss_clip": 1.32441509, "balance_loss_mlp": 0.22952518, "epoch": 0.3860514053810311, "flos": 13553837066880.0, "grad_norm": 5.876358523652723, "language_loss": 0.88083565, "learning_rate": 2.8111362611383076e-06, "loss": 0.89985991, "num_input_tokens_seen": 137863705, "router_z_loss_clip": 3.13867188, "router_z_loss_mlp": 0.3503418, "step": 6421, "time_per_iteration": 2.6665453910827637 }, { "auxiliary_loss_clip": 0.01638165, "auxiliary_loss_mlp": 0.0026978, "balance_loss_clip": 1.3198241, "balance_loss_mlp": 0.23516153, "epoch": 0.3861115286336991, "flos": 20954271323520.0, "grad_norm": 4.590073067070618, "language_loss": 0.80771947, "learning_rate": 2.8107802525403886e-06, "loss": 0.82679886, "num_input_tokens_seen": 137880285, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.34643555, "step": 6422, "time_per_iteration": 2.6321959495544434 }, { "auxiliary_loss_clip": 0.01651381, "auxiliary_loss_mlp": 0.00246392, "balance_loss_clip": 1.33118415, "balance_loss_mlp": 0.21370503, "epoch": 0.38617165188636704, "flos": 16362697731840.0, "grad_norm": 2.3463799525801123, "language_loss": 0.74307883, "learning_rate": 2.8104242131976025e-06, "loss": 0.76205659, "num_input_tokens_seen": 137898335, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.3269043, "step": 6423, "time_per_iteration": 2.6652989387512207 }, { "auxiliary_loss_clip": 0.01624951, "auxiliary_loss_mlp": 0.00259142, "balance_loss_clip": 1.30840778, "balance_loss_mlp": 0.22352204, "epoch": 0.386231775139035, "flos": 34787276893440.0, "grad_norm": 3.774284105856269, "language_loss": 0.77174854, "learning_rate": 2.810068143123449e-06, "loss": 0.79058945, "num_input_tokens_seen": 137918605, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.35620117, "step": 6424, "time_per_iteration": 2.8043863773345947 }, { "auxiliary_loss_clip": 0.01628187, "auxiliary_loss_mlp": 0.00260154, "balance_loss_clip": 1.31270862, "balance_loss_mlp": 0.22846854, "epoch": 0.38629189839170297, "flos": 21726171619200.0, "grad_norm": 18.498667601603845, "language_loss": 0.77568841, "learning_rate": 2.809712042331429e-06, "loss": 0.79457182, "num_input_tokens_seen": 137938245, "router_z_loss_clip": 3.15429688, "router_z_loss_mlp": 0.31689453, "step": 6425, "time_per_iteration": 2.7228190898895264 }, { "auxiliary_loss_clip": 0.01623246, "auxiliary_loss_mlp": 0.00262252, "balance_loss_clip": 1.30741155, "balance_loss_mlp": 0.22873083, "epoch": 0.38635202164437094, "flos": 27923634460800.0, "grad_norm": 581.4834736584602, "language_loss": 0.86199278, "learning_rate": 2.8093559108350484e-06, "loss": 0.88084781, "num_input_tokens_seen": 137956770, "router_z_loss_clip": 3.16015625, "router_z_loss_mlp": 0.33520508, "step": 6426, "time_per_iteration": 2.825049638748169 }, { "auxiliary_loss_clip": 0.01635088, "auxiliary_loss_mlp": 0.00291914, "balance_loss_clip": 1.31853437, "balance_loss_mlp": 0.25875026, "epoch": 0.38641214489703896, "flos": 23586631534080.0, "grad_norm": 25.322865339239797, "language_loss": 0.82649434, "learning_rate": 2.80899974864781e-06, "loss": 0.84576434, "num_input_tokens_seen": 137977040, "router_z_loss_clip": 3.16601562, "router_z_loss_mlp": 0.33178711, "step": 6427, "time_per_iteration": 2.73966121673584 }, { "auxiliary_loss_clip": 0.01627626, "auxiliary_loss_mlp": 0.00264892, "balance_loss_clip": 1.31166708, "balance_loss_mlp": 0.23122761, "epoch": 0.3864722681497069, "flos": 12641239198080.0, "grad_norm": 5.178065698152898, "language_loss": 0.76156753, "learning_rate": 2.8086435557832203e-06, "loss": 0.78049272, "num_input_tokens_seen": 137993545, "router_z_loss_clip": 3.16015625, "router_z_loss_mlp": 0.33642578, "step": 6428, "time_per_iteration": 2.670247793197632 }, { "auxiliary_loss_clip": 0.01620404, "auxiliary_loss_mlp": 0.00279659, "balance_loss_clip": 1.30646241, "balance_loss_mlp": 0.24525538, "epoch": 0.3865323914023749, "flos": 17598922162560.0, "grad_norm": 5.168169471143829, "language_loss": 0.89987695, "learning_rate": 2.8082873322547863e-06, "loss": 0.9188776, "num_input_tokens_seen": 138010140, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.34423828, "step": 6429, "time_per_iteration": 2.665508270263672 }, { "auxiliary_loss_clip": 0.01611354, "auxiliary_loss_mlp": 0.0030184, "balance_loss_clip": 1.30057049, "balance_loss_mlp": 0.26922432, "epoch": 0.38659251465504285, "flos": 18478949374080.0, "grad_norm": 143.53212572378067, "language_loss": 0.90953016, "learning_rate": 2.807931078076015e-06, "loss": 0.92866206, "num_input_tokens_seen": 138028880, "router_z_loss_clip": 3.11132812, "router_z_loss_mlp": 0.32592773, "step": 6430, "time_per_iteration": 2.7065279483795166 }, { "auxiliary_loss_clip": 0.01426963, "auxiliary_loss_mlp": 0.00162404, "balance_loss_clip": 1.25524187, "balance_loss_mlp": 0.15186636, "epoch": 0.3866526379077108, "flos": 64165726978560.0, "grad_norm": 1.3511975586301592, "language_loss": 0.58661735, "learning_rate": 2.807574793260416e-06, "loss": 0.60251099, "num_input_tokens_seen": 138098090, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.10546875, "step": 6431, "time_per_iteration": 3.2014658451080322 }, { "auxiliary_loss_clip": 0.0160859, "auxiliary_loss_mlp": 0.00291007, "balance_loss_clip": 1.29696488, "balance_loss_mlp": 0.25719944, "epoch": 0.3867127611603788, "flos": 14388292897920.0, "grad_norm": 67.67262516554253, "language_loss": 0.8722856, "learning_rate": 2.8072184778215004e-06, "loss": 0.89128155, "num_input_tokens_seen": 138114735, "router_z_loss_clip": 3.12109375, "router_z_loss_mlp": 0.33789062, "step": 6432, "time_per_iteration": 2.6436076164245605 }, { "auxiliary_loss_clip": 0.01609774, "auxiliary_loss_mlp": 0.00306545, "balance_loss_clip": 1.29203081, "balance_loss_mlp": 0.27195033, "epoch": 0.38677288441304675, "flos": 20010754823040.0, "grad_norm": 15.029161225752137, "language_loss": 0.90068674, "learning_rate": 2.806862131772779e-06, "loss": 0.91984999, "num_input_tokens_seen": 138130480, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.34594727, "step": 6433, "time_per_iteration": 2.7007317543029785 }, { "auxiliary_loss_clip": 0.01597732, "auxiliary_loss_mlp": 0.00252346, "balance_loss_clip": 1.28952813, "balance_loss_mlp": 0.2170601, "epoch": 0.3868330076657147, "flos": 22236893147520.0, "grad_norm": 226.3857086724911, "language_loss": 0.78739607, "learning_rate": 2.806505755127765e-06, "loss": 0.80589688, "num_input_tokens_seen": 138150640, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.3527832, "step": 6434, "time_per_iteration": 4.0895140171051025 }, { "auxiliary_loss_clip": 0.01596896, "auxiliary_loss_mlp": 0.00292639, "balance_loss_clip": 1.27986789, "balance_loss_mlp": 0.25916541, "epoch": 0.3868931309183827, "flos": 16727442387840.0, "grad_norm": 6.82635709548058, "language_loss": 0.87253559, "learning_rate": 2.806149347899972e-06, "loss": 0.89143097, "num_input_tokens_seen": 138169700, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.33496094, "step": 6435, "time_per_iteration": 2.6448957920074463 }, { "auxiliary_loss_clip": 0.01584944, "auxiliary_loss_mlp": 0.00290784, "balance_loss_clip": 1.28063023, "balance_loss_mlp": 0.25816792, "epoch": 0.38695325417105064, "flos": 22674716023680.0, "grad_norm": 123.73818269155582, "language_loss": 0.85604489, "learning_rate": 2.805792910102915e-06, "loss": 0.87480217, "num_input_tokens_seen": 138185835, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.32592773, "step": 6436, "time_per_iteration": 4.135608434677124 }, { "auxiliary_loss_clip": 0.01573145, "auxiliary_loss_mlp": 0.00284238, "balance_loss_clip": 1.27835107, "balance_loss_mlp": 0.25240889, "epoch": 0.3870133774237186, "flos": 23112036109440.0, "grad_norm": 2.5520577152469612, "language_loss": 0.82380152, "learning_rate": 2.8054364417501093e-06, "loss": 0.8423754, "num_input_tokens_seen": 138204080, "router_z_loss_clip": 2.94726562, "router_z_loss_mlp": 0.31835938, "step": 6437, "time_per_iteration": 2.682001829147339 }, { "auxiliary_loss_clip": 0.01579868, "auxiliary_loss_mlp": 0.0028608, "balance_loss_clip": 1.28154874, "balance_loss_mlp": 0.25396538, "epoch": 0.3870735006763866, "flos": 17675699483520.0, "grad_norm": 18.879839620250902, "language_loss": 0.89580423, "learning_rate": 2.805079942855074e-06, "loss": 0.9144637, "num_input_tokens_seen": 138220710, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.32128906, "step": 6438, "time_per_iteration": 2.666818857192993 }, { "auxiliary_loss_clip": 0.01588133, "auxiliary_loss_mlp": 0.00296544, "balance_loss_clip": 1.28598094, "balance_loss_mlp": 0.26376134, "epoch": 0.38713362392905454, "flos": 23295791111040.0, "grad_norm": 11.106264734864858, "language_loss": 0.81345201, "learning_rate": 2.804723413431326e-06, "loss": 0.83229882, "num_input_tokens_seen": 138241720, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.32788086, "step": 6439, "time_per_iteration": 4.177370309829712 }, { "auxiliary_loss_clip": 0.0158238, "auxiliary_loss_mlp": 0.00265458, "balance_loss_clip": 1.28490436, "balance_loss_mlp": 0.23606052, "epoch": 0.38719374718172256, "flos": 21031192298880.0, "grad_norm": 2698.8115049238877, "language_loss": 0.79856002, "learning_rate": 2.8043668534923855e-06, "loss": 0.81703842, "num_input_tokens_seen": 138261885, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 0.29394531, "step": 6440, "time_per_iteration": 2.8905861377716064 }, { "auxiliary_loss_clip": 0.01582491, "auxiliary_loss_mlp": 0.00295182, "balance_loss_clip": 1.27567124, "balance_loss_mlp": 0.26037312, "epoch": 0.3872538704343905, "flos": 19609776322560.0, "grad_norm": 19.98204487847169, "language_loss": 0.89891291, "learning_rate": 2.804010263051774e-06, "loss": 0.91768968, "num_input_tokens_seen": 138280255, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.34814453, "step": 6441, "time_per_iteration": 2.7068021297454834 }, { "auxiliary_loss_clip": 0.01579005, "auxiliary_loss_mlp": 0.00284305, "balance_loss_clip": 1.28051805, "balance_loss_mlp": 0.25183189, "epoch": 0.3873139936870585, "flos": 17530045833600.0, "grad_norm": 30.846464127102777, "language_loss": 0.89172351, "learning_rate": 2.8036536421230118e-06, "loss": 0.91035664, "num_input_tokens_seen": 138296675, "router_z_loss_clip": 2.99023438, "router_z_loss_mlp": 0.32446289, "step": 6442, "time_per_iteration": 2.63509464263916 }, { "auxiliary_loss_clip": 0.01578801, "auxiliary_loss_mlp": 0.00279398, "balance_loss_clip": 1.27762032, "balance_loss_mlp": 0.2464962, "epoch": 0.38737411693972645, "flos": 17786555832960.0, "grad_norm": 6.033118273186808, "language_loss": 0.91615719, "learning_rate": 2.803296990719624e-06, "loss": 0.93473911, "num_input_tokens_seen": 138314985, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.32910156, "step": 6443, "time_per_iteration": 2.7195403575897217 }, { "auxiliary_loss_clip": 0.01370931, "auxiliary_loss_mlp": 0.00102426, "balance_loss_clip": 1.20647013, "balance_loss_mlp": 0.09451038, "epoch": 0.3874342401923944, "flos": 58304637048960.0, "grad_norm": 0.8260066586418422, "language_loss": 0.5030992, "learning_rate": 2.8029403088551327e-06, "loss": 0.51783276, "num_input_tokens_seen": 138373275, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.07910156, "step": 6444, "time_per_iteration": 3.1782565116882324 }, { "auxiliary_loss_clip": 0.01564572, "auxiliary_loss_mlp": 0.00276169, "balance_loss_clip": 1.27235365, "balance_loss_mlp": 0.24453072, "epoch": 0.3874943634450624, "flos": 17711933328000.0, "grad_norm": 5.132340380019124, "language_loss": 0.84929276, "learning_rate": 2.802583596543065e-06, "loss": 0.86770022, "num_input_tokens_seen": 138391145, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.31640625, "step": 6445, "time_per_iteration": 2.6092607975006104 }, { "auxiliary_loss_clip": 0.01549872, "auxiliary_loss_mlp": 0.00289635, "balance_loss_clip": 1.25933111, "balance_loss_mlp": 0.2563042, "epoch": 0.38755448669773035, "flos": 19244852098560.0, "grad_norm": 30.034406786312978, "language_loss": 0.88520896, "learning_rate": 2.8022268537969474e-06, "loss": 0.90360403, "num_input_tokens_seen": 138409875, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.33349609, "step": 6446, "time_per_iteration": 4.07696795463562 }, { "auxiliary_loss_clip": 0.01546041, "auxiliary_loss_mlp": 0.00280965, "balance_loss_clip": 1.25525045, "balance_loss_mlp": 0.24730068, "epoch": 0.3876146099503983, "flos": 20594267262720.0, "grad_norm": 6.204593487446583, "language_loss": 0.84951806, "learning_rate": 2.801870080630306e-06, "loss": 0.86778808, "num_input_tokens_seen": 138428965, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.33691406, "step": 6447, "time_per_iteration": 2.6207103729248047 }, { "auxiliary_loss_clip": 0.01544966, "auxiliary_loss_mlp": 0.00301008, "balance_loss_clip": 1.2588743, "balance_loss_mlp": 0.26934642, "epoch": 0.3876747332030663, "flos": 19281121856640.0, "grad_norm": 10.832703160301248, "language_loss": 0.83466738, "learning_rate": 2.801513277056671e-06, "loss": 0.85312712, "num_input_tokens_seen": 138448090, "router_z_loss_clip": 2.86132812, "router_z_loss_mlp": 0.31640625, "step": 6448, "time_per_iteration": 2.6414225101470947 }, { "auxiliary_loss_clip": 0.01534666, "auxiliary_loss_mlp": 0.00250908, "balance_loss_clip": 1.24921179, "balance_loss_mlp": 0.22184475, "epoch": 0.38773485645573424, "flos": 18945895201920.0, "grad_norm": 107.80117643773491, "language_loss": 0.80577034, "learning_rate": 2.8011564430895725e-06, "loss": 0.82362616, "num_input_tokens_seen": 138466105, "router_z_loss_clip": 2.85546875, "router_z_loss_mlp": 0.29077148, "step": 6449, "time_per_iteration": 2.6069836616516113 }, { "auxiliary_loss_clip": 0.01502054, "auxiliary_loss_mlp": 0.00277053, "balance_loss_clip": 1.21885681, "balance_loss_mlp": 0.24341182, "epoch": 0.3877949797084022, "flos": 23071348978560.0, "grad_norm": 150128.5785214675, "language_loss": 0.86373401, "learning_rate": 2.800799578742542e-06, "loss": 0.88152504, "num_input_tokens_seen": 138485160, "router_z_loss_clip": 2.83007812, "router_z_loss_mlp": 0.33618164, "step": 6450, "time_per_iteration": 2.703974962234497 }, { "auxiliary_loss_clip": 0.01509763, "auxiliary_loss_mlp": 0.00276708, "balance_loss_clip": 1.22497845, "balance_loss_mlp": 0.24434228, "epoch": 0.3878551029610702, "flos": 29095543589760.0, "grad_norm": 23.490848056333736, "language_loss": 0.83431888, "learning_rate": 2.8004426840291106e-06, "loss": 0.8521837, "num_input_tokens_seen": 138504135, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.3236084, "step": 6451, "time_per_iteration": 2.7084262371063232 }, { "auxiliary_loss_clip": 0.01475519, "auxiliary_loss_mlp": 0.00260851, "balance_loss_clip": 1.2064693, "balance_loss_mlp": 0.22940361, "epoch": 0.38791522621373814, "flos": 20996394998400.0, "grad_norm": 275.6890235506067, "language_loss": 0.84946412, "learning_rate": 2.800085758962812e-06, "loss": 0.86682785, "num_input_tokens_seen": 138523955, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.31420898, "step": 6452, "time_per_iteration": 2.7136781215667725 }, { "auxiliary_loss_clip": 0.01483229, "auxiliary_loss_mlp": 0.0028688, "balance_loss_clip": 1.21269083, "balance_loss_mlp": 0.25488454, "epoch": 0.3879753494664061, "flos": 15486836497920.0, "grad_norm": 2.2685780771495714, "language_loss": 0.85421801, "learning_rate": 2.799728803557182e-06, "loss": 0.87191916, "num_input_tokens_seen": 138541655, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.31958008, "step": 6453, "time_per_iteration": 2.6290056705474854 }, { "auxiliary_loss_clip": 0.01483115, "auxiliary_loss_mlp": 0.00289799, "balance_loss_clip": 1.20601773, "balance_loss_mlp": 0.25904316, "epoch": 0.3880354727190741, "flos": 22053964158720.0, "grad_norm": 9.443094203125678, "language_loss": 0.78644931, "learning_rate": 2.7993718178257555e-06, "loss": 0.80417842, "num_input_tokens_seen": 138560860, "router_z_loss_clip": 2.77539062, "router_z_loss_mlp": 0.30761719, "step": 6454, "time_per_iteration": 2.714240550994873 }, { "auxiliary_loss_clip": 0.01478986, "auxiliary_loss_mlp": 0.00321217, "balance_loss_clip": 1.20665097, "balance_loss_mlp": 0.28745663, "epoch": 0.3880955959717421, "flos": 20340307128960.0, "grad_norm": 3.0111833756054516, "language_loss": 0.86041421, "learning_rate": 2.7990148017820694e-06, "loss": 0.87841618, "num_input_tokens_seen": 138580200, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.33764648, "step": 6455, "time_per_iteration": 2.6649327278137207 }, { "auxiliary_loss_clip": 0.01469797, "auxiliary_loss_mlp": 0.00300665, "balance_loss_clip": 1.20012069, "balance_loss_mlp": 0.26561701, "epoch": 0.38815571922441006, "flos": 23075407215360.0, "grad_norm": 2.473031468023308, "language_loss": 0.82208121, "learning_rate": 2.798657755439662e-06, "loss": 0.83978587, "num_input_tokens_seen": 138598315, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.35058594, "step": 6456, "time_per_iteration": 2.6733102798461914 }, { "auxiliary_loss_clip": 0.01460819, "auxiliary_loss_mlp": 0.00289582, "balance_loss_clip": 1.19465506, "balance_loss_mlp": 0.25708562, "epoch": 0.388215842477078, "flos": 20776944856320.0, "grad_norm": 5.718243065887521, "language_loss": 0.71112895, "learning_rate": 2.7983006788120726e-06, "loss": 0.72863293, "num_input_tokens_seen": 138615695, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.32495117, "step": 6457, "time_per_iteration": 2.691025495529175 }, { "auxiliary_loss_clip": 0.01460693, "auxiliary_loss_mlp": 0.00295313, "balance_loss_clip": 1.1918987, "balance_loss_mlp": 0.26148093, "epoch": 0.388275965729746, "flos": 20448182649600.0, "grad_norm": 5.693040832832106, "language_loss": 0.90197134, "learning_rate": 2.797943571912841e-06, "loss": 0.91953135, "num_input_tokens_seen": 138633180, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.33837891, "step": 6458, "time_per_iteration": 2.6768531799316406 }, { "auxiliary_loss_clip": 0.01456689, "auxiliary_loss_mlp": 0.00282294, "balance_loss_clip": 1.19195271, "balance_loss_mlp": 0.25184757, "epoch": 0.38833608898241395, "flos": 27892392606720.0, "grad_norm": 19.50981090791896, "language_loss": 0.87809575, "learning_rate": 2.797586434755509e-06, "loss": 0.89548552, "num_input_tokens_seen": 138654785, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.30444336, "step": 6459, "time_per_iteration": 2.7084097862243652 }, { "auxiliary_loss_clip": 0.01447782, "auxiliary_loss_mlp": 0.00291323, "balance_loss_clip": 1.18515897, "balance_loss_mlp": 0.26135367, "epoch": 0.3883962122350819, "flos": 18076390675200.0, "grad_norm": 13.490096370367871, "language_loss": 0.69605803, "learning_rate": 2.7972292673536202e-06, "loss": 0.71344912, "num_input_tokens_seen": 138673330, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.29956055, "step": 6460, "time_per_iteration": 2.657027244567871 }, { "auxiliary_loss_clip": 0.01444232, "auxiliary_loss_mlp": 0.00272149, "balance_loss_clip": 1.18625951, "balance_loss_mlp": 0.24163094, "epoch": 0.3884563354877499, "flos": 23622254847360.0, "grad_norm": 5.654545742733878, "language_loss": 0.91951966, "learning_rate": 2.796872069720717e-06, "loss": 0.93668342, "num_input_tokens_seen": 138694185, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.30517578, "step": 6461, "time_per_iteration": 2.704819440841675 }, { "auxiliary_loss_clip": 0.01432322, "auxiliary_loss_mlp": 0.00290806, "balance_loss_clip": 1.17508864, "balance_loss_mlp": 0.25902432, "epoch": 0.38851645874041785, "flos": 27453528236160.0, "grad_norm": 19.76220759751131, "language_loss": 0.8195641, "learning_rate": 2.7965148418703456e-06, "loss": 0.83679533, "num_input_tokens_seen": 138714625, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.31738281, "step": 6462, "time_per_iteration": 2.7301876544952393 }, { "auxiliary_loss_clip": 0.01437326, "auxiliary_loss_mlp": 0.00274832, "balance_loss_clip": 1.17747688, "balance_loss_mlp": 0.24343227, "epoch": 0.3885765819930858, "flos": 25228072270080.0, "grad_norm": 38.02205846940823, "language_loss": 0.84303319, "learning_rate": 2.796157583816052e-06, "loss": 0.86015475, "num_input_tokens_seen": 138733585, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.31396484, "step": 6463, "time_per_iteration": 2.655244827270508 }, { "auxiliary_loss_clip": 0.01458901, "auxiliary_loss_mlp": 0.00303213, "balance_loss_clip": 1.18895221, "balance_loss_mlp": 0.26876202, "epoch": 0.3886367052457538, "flos": 16946605221120.0, "grad_norm": 15.80373461753734, "language_loss": 0.80689025, "learning_rate": 2.795800295571382e-06, "loss": 0.82451129, "num_input_tokens_seen": 138752335, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.34448242, "step": 6464, "time_per_iteration": 2.646345853805542 }, { "auxiliary_loss_clip": 0.01434175, "auxiliary_loss_mlp": 0.00286145, "balance_loss_clip": 1.17891002, "balance_loss_mlp": 0.25455415, "epoch": 0.38869682849842174, "flos": 27154140376320.0, "grad_norm": 3.6812497429866804, "language_loss": 0.75412452, "learning_rate": 2.7954429771498858e-06, "loss": 0.77132773, "num_input_tokens_seen": 138768450, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.31567383, "step": 6465, "time_per_iteration": 2.660212516784668 }, { "auxiliary_loss_clip": 0.01434381, "auxiliary_loss_mlp": 0.00303956, "balance_loss_clip": 1.17466354, "balance_loss_mlp": 0.26974303, "epoch": 0.3887569517510897, "flos": 21063619301760.0, "grad_norm": 24.065312226753637, "language_loss": 0.84232205, "learning_rate": 2.7950856285651117e-06, "loss": 0.85970545, "num_input_tokens_seen": 138786775, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.34204102, "step": 6466, "time_per_iteration": 2.62174129486084 }, { "auxiliary_loss_clip": 0.01426984, "auxiliary_loss_mlp": 0.00282846, "balance_loss_clip": 1.16610289, "balance_loss_mlp": 0.25182778, "epoch": 0.38881707500375773, "flos": 29497384016640.0, "grad_norm": 390.7304449220014, "language_loss": 0.75837481, "learning_rate": 2.794728249830611e-06, "loss": 0.77547312, "num_input_tokens_seen": 138810100, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.31005859, "step": 6467, "time_per_iteration": 2.6900274753570557 }, { "auxiliary_loss_clip": 0.01421467, "auxiliary_loss_mlp": 0.00274472, "balance_loss_clip": 1.16308141, "balance_loss_mlp": 0.2435368, "epoch": 0.3888771982564257, "flos": 17488281294720.0, "grad_norm": 7.253526309738377, "language_loss": 0.92756289, "learning_rate": 2.794370840959936e-06, "loss": 0.94452232, "num_input_tokens_seen": 138825140, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.3092041, "step": 6468, "time_per_iteration": 2.614870071411133 }, { "auxiliary_loss_clip": 0.01406229, "auxiliary_loss_mlp": 0.00270576, "balance_loss_clip": 1.15689349, "balance_loss_mlp": 0.24078512, "epoch": 0.38893732150909366, "flos": 21942425450880.0, "grad_norm": 84.70971845098208, "language_loss": 0.91145504, "learning_rate": 2.7940134019666383e-06, "loss": 0.92822313, "num_input_tokens_seen": 138844115, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.29772949, "step": 6469, "time_per_iteration": 2.659018039703369 }, { "auxiliary_loss_clip": 0.01428588, "auxiliary_loss_mlp": 0.00254136, "balance_loss_clip": 1.17310143, "balance_loss_mlp": 0.22399974, "epoch": 0.3889974447617616, "flos": 24276367468800.0, "grad_norm": 316.4226848799966, "language_loss": 0.80768013, "learning_rate": 2.793655932864273e-06, "loss": 0.82450736, "num_input_tokens_seen": 138860860, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.30151367, "step": 6470, "time_per_iteration": 2.6421797275543213 }, { "auxiliary_loss_clip": 0.01411961, "auxiliary_loss_mlp": 0.0026701, "balance_loss_clip": 1.1607244, "balance_loss_mlp": 0.23601538, "epoch": 0.3890575680144296, "flos": 25667116208640.0, "grad_norm": 122.90887690607924, "language_loss": 0.8293916, "learning_rate": 2.7932984336663953e-06, "loss": 0.84618127, "num_input_tokens_seen": 138881910, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.30969238, "step": 6471, "time_per_iteration": 2.6981358528137207 }, { "auxiliary_loss_clip": 0.0141125, "auxiliary_loss_mlp": 0.00267756, "balance_loss_clip": 1.1581924, "balance_loss_mlp": 0.23583147, "epoch": 0.38911769126709755, "flos": 22855274714880.0, "grad_norm": 3.6447668409396865, "language_loss": 0.72261649, "learning_rate": 2.792940904386562e-06, "loss": 0.73940659, "num_input_tokens_seen": 138900975, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.3190918, "step": 6472, "time_per_iteration": 2.6294267177581787 }, { "auxiliary_loss_clip": 0.01411366, "auxiliary_loss_mlp": 0.00288541, "balance_loss_clip": 1.15668678, "balance_loss_mlp": 0.25582981, "epoch": 0.3891778145197655, "flos": 25447522412160.0, "grad_norm": 12.411221239543886, "language_loss": 0.82560021, "learning_rate": 2.7925833450383293e-06, "loss": 0.84259927, "num_input_tokens_seen": 138920795, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.32763672, "step": 6473, "time_per_iteration": 2.6768908500671387 }, { "auxiliary_loss_clip": 0.01427805, "auxiliary_loss_mlp": 0.00273685, "balance_loss_clip": 1.1692313, "balance_loss_mlp": 0.24235702, "epoch": 0.3892379377724335, "flos": 14027965614720.0, "grad_norm": 100.8966622166671, "language_loss": 0.81948251, "learning_rate": 2.792225755635257e-06, "loss": 0.83649743, "num_input_tokens_seen": 138938770, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.31323242, "step": 6474, "time_per_iteration": 2.635230541229248 }, { "auxiliary_loss_clip": 0.01410296, "auxiliary_loss_mlp": 0.00233525, "balance_loss_clip": 1.15518653, "balance_loss_mlp": 0.20520091, "epoch": 0.38929806102510145, "flos": 20157449967360.0, "grad_norm": 6.052489268162658, "language_loss": 0.74852842, "learning_rate": 2.7918681361909046e-06, "loss": 0.76496661, "num_input_tokens_seen": 138958880, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.28369141, "step": 6475, "time_per_iteration": 2.7633635997772217 }, { "auxiliary_loss_clip": 0.01436909, "auxiliary_loss_mlp": 0.00295619, "balance_loss_clip": 1.17230105, "balance_loss_mlp": 0.2630986, "epoch": 0.3893581842777694, "flos": 22163958581760.0, "grad_norm": 6.947620539027206, "language_loss": 0.81704456, "learning_rate": 2.7915104867188332e-06, "loss": 0.83436984, "num_input_tokens_seen": 138977240, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.32568359, "step": 6476, "time_per_iteration": 4.141119480133057 }, { "auxiliary_loss_clip": 0.01335984, "auxiliary_loss_mlp": 0.00234733, "balance_loss_clip": 1.17702293, "balance_loss_mlp": 0.22348003, "epoch": 0.3894183075304374, "flos": 67301877392640.0, "grad_norm": 0.8503068609658352, "language_loss": 0.57724059, "learning_rate": 2.7911528072326055e-06, "loss": 0.59294784, "num_input_tokens_seen": 139039035, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.11230469, "step": 6477, "time_per_iteration": 3.147035837173462 }, { "auxiliary_loss_clip": 0.01438575, "auxiliary_loss_mlp": 0.0025765, "balance_loss_clip": 1.17559969, "balance_loss_mlp": 0.22503397, "epoch": 0.38947843078310534, "flos": 18547502480640.0, "grad_norm": 87.85736850978014, "language_loss": 0.85254198, "learning_rate": 2.7907950977457832e-06, "loss": 0.86950421, "num_input_tokens_seen": 139055560, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.32617188, "step": 6478, "time_per_iteration": 2.653440475463867 }, { "auxiliary_loss_clip": 0.01424014, "auxiliary_loss_mlp": 0.00269686, "balance_loss_clip": 1.16432643, "balance_loss_mlp": 0.23761836, "epoch": 0.3895385540357733, "flos": 14605875532800.0, "grad_norm": 112.00906850600344, "language_loss": 0.9098047, "learning_rate": 2.7904373582719317e-06, "loss": 0.92674172, "num_input_tokens_seen": 139071865, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.32055664, "step": 6479, "time_per_iteration": 4.125814914703369 }, { "auxiliary_loss_clip": 0.01429806, "auxiliary_loss_mlp": 0.00278053, "balance_loss_clip": 1.16953826, "balance_loss_mlp": 0.24885869, "epoch": 0.38959867728844133, "flos": 19975203336960.0, "grad_norm": 3.990272310302621, "language_loss": 0.87720668, "learning_rate": 2.790079588824617e-06, "loss": 0.89428532, "num_input_tokens_seen": 139089640, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.29199219, "step": 6480, "time_per_iteration": 2.62803053855896 }, { "auxiliary_loss_clip": 0.0142196, "auxiliary_loss_mlp": 0.00276557, "balance_loss_clip": 1.16519094, "balance_loss_mlp": 0.24729146, "epoch": 0.3896588005411093, "flos": 22672130244480.0, "grad_norm": 167.76552286998233, "language_loss": 0.89203322, "learning_rate": 2.7897217894174038e-06, "loss": 0.9090184, "num_input_tokens_seen": 139109365, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.2923584, "step": 6481, "time_per_iteration": 4.086307048797607 }, { "auxiliary_loss_clip": 0.01427086, "auxiliary_loss_mlp": 0.002531, "balance_loss_clip": 1.16730785, "balance_loss_mlp": 0.22384559, "epoch": 0.38971892379377726, "flos": 20996035862400.0, "grad_norm": 64.53872556247816, "language_loss": 0.81756747, "learning_rate": 2.789363960063863e-06, "loss": 0.8343693, "num_input_tokens_seen": 139128260, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.29248047, "step": 6482, "time_per_iteration": 2.6340091228485107 }, { "auxiliary_loss_clip": 0.01424119, "auxiliary_loss_mlp": 0.00282219, "balance_loss_clip": 1.16346622, "balance_loss_mlp": 0.25196365, "epoch": 0.3897790470464452, "flos": 22528487756160.0, "grad_norm": 9.087926678013908, "language_loss": 0.86136615, "learning_rate": 2.78900610077756e-06, "loss": 0.87842953, "num_input_tokens_seen": 139147315, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.30285645, "step": 6483, "time_per_iteration": 2.6710658073425293 }, { "auxiliary_loss_clip": 0.01429467, "auxiliary_loss_mlp": 0.00272453, "balance_loss_clip": 1.16353536, "balance_loss_mlp": 0.2389079, "epoch": 0.3898391702991132, "flos": 26209905603840.0, "grad_norm": 10.662940204439261, "language_loss": 0.83993655, "learning_rate": 2.788648211572067e-06, "loss": 0.85695577, "num_input_tokens_seen": 139167270, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.33544922, "step": 6484, "time_per_iteration": 2.6716723442077637 }, { "auxiliary_loss_clip": 0.01438534, "auxiliary_loss_mlp": 0.00266349, "balance_loss_clip": 1.17478621, "balance_loss_mlp": 0.23366177, "epoch": 0.38989929355178116, "flos": 21065558636160.0, "grad_norm": 5.089714065552633, "language_loss": 0.88113916, "learning_rate": 2.7882902924609557e-06, "loss": 0.89818799, "num_input_tokens_seen": 139185970, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.3269043, "step": 6485, "time_per_iteration": 2.6687333583831787 }, { "auxiliary_loss_clip": 0.01425281, "auxiliary_loss_mlp": 0.00289624, "balance_loss_clip": 1.16135454, "balance_loss_mlp": 0.25731796, "epoch": 0.3899594168044491, "flos": 25484115392640.0, "grad_norm": 23.169141006593062, "language_loss": 0.93261385, "learning_rate": 2.7879323434577965e-06, "loss": 0.94976294, "num_input_tokens_seen": 139203730, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.32299805, "step": 6486, "time_per_iteration": 2.8332126140594482 }, { "auxiliary_loss_clip": 0.0143308, "auxiliary_loss_mlp": 0.00280427, "balance_loss_clip": 1.16514921, "balance_loss_mlp": 0.24876457, "epoch": 0.3900195400571171, "flos": 31139363456640.0, "grad_norm": 3.837766818699953, "language_loss": 0.92742109, "learning_rate": 2.7875743645761645e-06, "loss": 0.94455612, "num_input_tokens_seen": 139222560, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.31665039, "step": 6487, "time_per_iteration": 2.7519614696502686 }, { "auxiliary_loss_clip": 0.01416208, "auxiliary_loss_mlp": 0.00267963, "balance_loss_clip": 1.15517938, "balance_loss_mlp": 0.23656285, "epoch": 0.39007966330978505, "flos": 20229917656320.0, "grad_norm": 2.2524855853662618, "language_loss": 0.8090263, "learning_rate": 2.787216355829633e-06, "loss": 0.82586801, "num_input_tokens_seen": 139242165, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.31396484, "step": 6488, "time_per_iteration": 4.089753150939941 }, { "auxiliary_loss_clip": 0.01449672, "auxiliary_loss_mlp": 0.00326326, "balance_loss_clip": 1.17767644, "balance_loss_mlp": 0.29208916, "epoch": 0.390139786562453, "flos": 22528739151360.0, "grad_norm": 3.0131082360640455, "language_loss": 0.7688266, "learning_rate": 2.786858317231779e-06, "loss": 0.78658664, "num_input_tokens_seen": 139262525, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.34277344, "step": 6489, "time_per_iteration": 2.8891658782958984 }, { "auxiliary_loss_clip": 0.01427137, "auxiliary_loss_mlp": 0.00272821, "balance_loss_clip": 1.16010761, "balance_loss_mlp": 0.24366215, "epoch": 0.390199909815121, "flos": 26432911192320.0, "grad_norm": 5.212522280218004, "language_loss": 0.87238598, "learning_rate": 2.7865002487961788e-06, "loss": 0.88938558, "num_input_tokens_seen": 139282835, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.29150391, "step": 6490, "time_per_iteration": 2.7837557792663574 }, { "auxiliary_loss_clip": 0.0142147, "auxiliary_loss_mlp": 0.00303798, "balance_loss_clip": 1.1573689, "balance_loss_mlp": 0.27232683, "epoch": 0.39026003306778895, "flos": 17274577328640.0, "grad_norm": 2.848701300704279, "language_loss": 0.98184299, "learning_rate": 2.7861421505364104e-06, "loss": 0.99909568, "num_input_tokens_seen": 139299490, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.31494141, "step": 6491, "time_per_iteration": 2.6340012550354004 }, { "auxiliary_loss_clip": 0.01434014, "auxiliary_loss_mlp": 0.00267586, "balance_loss_clip": 1.16238439, "balance_loss_mlp": 0.23597193, "epoch": 0.3903201563204569, "flos": 24532841554560.0, "grad_norm": 30.924248528339277, "language_loss": 0.84281451, "learning_rate": 2.7857840224660523e-06, "loss": 0.8598305, "num_input_tokens_seen": 139317865, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.31591797, "step": 6492, "time_per_iteration": 2.6429078578948975 }, { "auxiliary_loss_clip": 0.01420595, "auxiliary_loss_mlp": 0.00265425, "balance_loss_clip": 1.15781558, "balance_loss_mlp": 0.2339298, "epoch": 0.39038027957312493, "flos": 23767944410880.0, "grad_norm": 10.59488395269669, "language_loss": 0.7906003, "learning_rate": 2.7854258645986857e-06, "loss": 0.80746055, "num_input_tokens_seen": 139339840, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.31506348, "step": 6493, "time_per_iteration": 2.7049787044525146 }, { "auxiliary_loss_clip": 0.01454511, "auxiliary_loss_mlp": 0.00302614, "balance_loss_clip": 1.17862797, "balance_loss_mlp": 0.26914036, "epoch": 0.3904404028257929, "flos": 14100612871680.0, "grad_norm": 18.362015275945453, "language_loss": 0.8483628, "learning_rate": 2.7850676769478916e-06, "loss": 0.86593407, "num_input_tokens_seen": 139357555, "router_z_loss_clip": 2.75585938, "router_z_loss_mlp": 0.3347168, "step": 6494, "time_per_iteration": 2.613114595413208 }, { "auxiliary_loss_clip": 0.01446611, "auxiliary_loss_mlp": 0.00334195, "balance_loss_clip": 1.1702013, "balance_loss_mlp": 0.29733506, "epoch": 0.39050052607846086, "flos": 16910048154240.0, "grad_norm": 72.04328146162683, "language_loss": 0.84588099, "learning_rate": 2.7847094595272525e-06, "loss": 0.86368906, "num_input_tokens_seen": 139374455, "router_z_loss_clip": 2.76367188, "router_z_loss_mlp": 0.36865234, "step": 6495, "time_per_iteration": 2.660688877105713 }, { "auxiliary_loss_clip": 0.01441295, "auxiliary_loss_mlp": 0.00284918, "balance_loss_clip": 1.17385423, "balance_loss_mlp": 0.25156337, "epoch": 0.39056064933112883, "flos": 25915761129600.0, "grad_norm": 89.95493582881869, "language_loss": 0.75954127, "learning_rate": 2.784351212350352e-06, "loss": 0.77680331, "num_input_tokens_seen": 139394770, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.33374023, "step": 6496, "time_per_iteration": 2.740514039993286 }, { "auxiliary_loss_clip": 0.01369322, "auxiliary_loss_mlp": 0.00056358, "balance_loss_clip": 1.15555072, "balance_loss_mlp": 0.04748853, "epoch": 0.3906207725837968, "flos": 60028421713920.0, "grad_norm": 0.6654237805044204, "language_loss": 0.53697115, "learning_rate": 2.783992935430775e-06, "loss": 0.55122793, "num_input_tokens_seen": 139454760, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.08886719, "step": 6497, "time_per_iteration": 3.2274329662323 }, { "auxiliary_loss_clip": 0.01431043, "auxiliary_loss_mlp": 0.0027213, "balance_loss_clip": 1.16201425, "balance_loss_mlp": 0.24037269, "epoch": 0.39068089583646476, "flos": 21068683119360.0, "grad_norm": 15.18587398265637, "language_loss": 0.76631069, "learning_rate": 2.7836346287821068e-06, "loss": 0.78334248, "num_input_tokens_seen": 139472645, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.31787109, "step": 6498, "time_per_iteration": 2.680330514907837 }, { "auxiliary_loss_clip": 0.01382766, "auxiliary_loss_mlp": 0.00055918, "balance_loss_clip": 1.17798579, "balance_loss_mlp": 0.04571389, "epoch": 0.3907410190891327, "flos": 70445677403520.0, "grad_norm": 0.7232880396868521, "language_loss": 0.51553869, "learning_rate": 2.783276292417936e-06, "loss": 0.52992558, "num_input_tokens_seen": 139536730, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.10205078, "step": 6499, "time_per_iteration": 3.1817221641540527 }, { "auxiliary_loss_clip": 0.01429801, "auxiliary_loss_mlp": 0.00313089, "balance_loss_clip": 1.15923917, "balance_loss_mlp": 0.27632487, "epoch": 0.3908011423418007, "flos": 27962454084480.0, "grad_norm": 6.102377875234599, "language_loss": 0.79938775, "learning_rate": 2.7829179263518487e-06, "loss": 0.81681669, "num_input_tokens_seen": 139557540, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.36791992, "step": 6500, "time_per_iteration": 2.7219269275665283 }, { "auxiliary_loss_clip": 0.01457985, "auxiliary_loss_mlp": 0.00295205, "balance_loss_clip": 1.18387222, "balance_loss_mlp": 0.26313788, "epoch": 0.39086126559446865, "flos": 24462097718400.0, "grad_norm": 58.00850906669905, "language_loss": 0.78002572, "learning_rate": 2.7825595305974354e-06, "loss": 0.79755765, "num_input_tokens_seen": 139576875, "router_z_loss_clip": 2.74609375, "router_z_loss_mlp": 0.32006836, "step": 6501, "time_per_iteration": 2.7087833881378174 }, { "auxiliary_loss_clip": 0.01441248, "auxiliary_loss_mlp": 0.0027396, "balance_loss_clip": 1.17107558, "balance_loss_mlp": 0.24046238, "epoch": 0.3909213888471366, "flos": 16941541403520.0, "grad_norm": 36.803316140432315, "language_loss": 0.84421343, "learning_rate": 2.782201105168287e-06, "loss": 0.86136544, "num_input_tokens_seen": 139594295, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.3347168, "step": 6502, "time_per_iteration": 2.711869716644287 }, { "auxiliary_loss_clip": 0.01421627, "auxiliary_loss_mlp": 0.0025604, "balance_loss_clip": 1.15802431, "balance_loss_mlp": 0.22633335, "epoch": 0.3909815120998046, "flos": 29278400751360.0, "grad_norm": 6.2762859649316605, "language_loss": 0.8820107, "learning_rate": 2.7818426500779932e-06, "loss": 0.89878738, "num_input_tokens_seen": 139614080, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.29736328, "step": 6503, "time_per_iteration": 2.786430835723877 }, { "auxiliary_loss_clip": 0.01424345, "auxiliary_loss_mlp": 0.00273259, "balance_loss_clip": 1.16169, "balance_loss_mlp": 0.24233589, "epoch": 0.39104163535247255, "flos": 18951246328320.0, "grad_norm": 4.301953823803494, "language_loss": 0.80164802, "learning_rate": 2.7814841653401485e-06, "loss": 0.81862414, "num_input_tokens_seen": 139632755, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.30908203, "step": 6504, "time_per_iteration": 2.692943811416626 }, { "auxiliary_loss_clip": 0.01415772, "auxiliary_loss_mlp": 0.00261054, "balance_loss_clip": 1.15145242, "balance_loss_mlp": 0.22741327, "epoch": 0.3911017586051405, "flos": 26323347732480.0, "grad_norm": 258.71196887508677, "language_loss": 0.88998842, "learning_rate": 2.7811256509683454e-06, "loss": 0.90675676, "num_input_tokens_seen": 139654205, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.33642578, "step": 6505, "time_per_iteration": 2.7082085609436035 }, { "auxiliary_loss_clip": 0.01426775, "auxiliary_loss_mlp": 0.00259905, "balance_loss_clip": 1.16017079, "balance_loss_mlp": 0.22724116, "epoch": 0.3911618818578085, "flos": 21835770992640.0, "grad_norm": 1981.0411405048003, "language_loss": 0.81235278, "learning_rate": 2.7807671069761797e-06, "loss": 0.82921958, "num_input_tokens_seen": 139673595, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.3269043, "step": 6506, "time_per_iteration": 2.630178928375244 }, { "auxiliary_loss_clip": 0.01426376, "auxiliary_loss_mlp": 0.00259324, "balance_loss_clip": 1.16509426, "balance_loss_mlp": 0.22832938, "epoch": 0.3912220051104765, "flos": 16359680989440.0, "grad_norm": 9.768795334173719, "language_loss": 0.82468629, "learning_rate": 2.7804085333772477e-06, "loss": 0.84154326, "num_input_tokens_seen": 139690565, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.30981445, "step": 6507, "time_per_iteration": 2.6447837352752686 }, { "auxiliary_loss_clip": 0.01346502, "auxiliary_loss_mlp": 0.00062402, "balance_loss_clip": 1.16921723, "balance_loss_mlp": 0.05667965, "epoch": 0.39128212836314447, "flos": 71050986420480.0, "grad_norm": 0.7460888361265731, "language_loss": 0.56519979, "learning_rate": 2.7800499301851446e-06, "loss": 0.57928878, "num_input_tokens_seen": 139749420, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.05712891, "step": 6508, "time_per_iteration": 3.286494016647339 }, { "auxiliary_loss_clip": 0.01417888, "auxiliary_loss_mlp": 0.00265005, "balance_loss_clip": 1.15842569, "balance_loss_mlp": 0.23212677, "epoch": 0.39134225161581243, "flos": 20331975173760.0, "grad_norm": 87.34585527786857, "language_loss": 0.83616745, "learning_rate": 2.779691297413471e-06, "loss": 0.85299641, "num_input_tokens_seen": 139766265, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.32885742, "step": 6509, "time_per_iteration": 2.6110756397247314 }, { "auxiliary_loss_clip": 0.01450047, "auxiliary_loss_mlp": 0.00276972, "balance_loss_clip": 1.18093109, "balance_loss_mlp": 0.24221063, "epoch": 0.3914023748684804, "flos": 17018390551680.0, "grad_norm": 151.77149016247262, "language_loss": 0.92613661, "learning_rate": 2.779332635075825e-06, "loss": 0.94340682, "num_input_tokens_seen": 139782400, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.34741211, "step": 6510, "time_per_iteration": 2.586942195892334 }, { "auxiliary_loss_clip": 0.01430193, "auxiliary_loss_mlp": 0.00268615, "balance_loss_clip": 1.16456056, "balance_loss_mlp": 0.23604709, "epoch": 0.39146249812114836, "flos": 18405224709120.0, "grad_norm": 10.31553965610553, "language_loss": 0.86050063, "learning_rate": 2.7789739431858073e-06, "loss": 0.87748873, "num_input_tokens_seen": 139801435, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.32568359, "step": 6511, "time_per_iteration": 2.6577961444854736 }, { "auxiliary_loss_clip": 0.01379936, "auxiliary_loss_mlp": 0.00074487, "balance_loss_clip": 1.18845034, "balance_loss_mlp": 0.06828818, "epoch": 0.3915226213738163, "flos": 67637355442560.0, "grad_norm": 0.7146889134694348, "language_loss": 0.57758832, "learning_rate": 2.7786152217570196e-06, "loss": 0.59213257, "num_input_tokens_seen": 139869700, "router_z_loss_clip": 1.9140625, "router_z_loss_mlp": 0.06176758, "step": 6512, "time_per_iteration": 3.220501184463501 }, { "auxiliary_loss_clip": 0.01445659, "auxiliary_loss_mlp": 0.00297532, "balance_loss_clip": 1.17698479, "balance_loss_mlp": 0.26498795, "epoch": 0.3915827446264843, "flos": 26359330181760.0, "grad_norm": 34.293633224994416, "language_loss": 0.76649588, "learning_rate": 2.7782564708030647e-06, "loss": 0.7839278, "num_input_tokens_seen": 139890140, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.32519531, "step": 6513, "time_per_iteration": 2.6691160202026367 }, { "auxiliary_loss_clip": 0.01452168, "auxiliary_loss_mlp": 0.00291835, "balance_loss_clip": 1.17996407, "balance_loss_mlp": 0.25828913, "epoch": 0.39164286787915226, "flos": 21943897908480.0, "grad_norm": 25.710143733997185, "language_loss": 0.87688476, "learning_rate": 2.7778976903375464e-06, "loss": 0.89432478, "num_input_tokens_seen": 139908020, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.33544922, "step": 6514, "time_per_iteration": 2.620985984802246 }, { "auxiliary_loss_clip": 0.01431598, "auxiliary_loss_mlp": 0.00297411, "balance_loss_clip": 1.16681159, "balance_loss_mlp": 0.26548663, "epoch": 0.3917029911318202, "flos": 16399829416320.0, "grad_norm": 144.8244040139509, "language_loss": 0.85529077, "learning_rate": 2.7775388803740693e-06, "loss": 0.87258089, "num_input_tokens_seen": 139926180, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.31933594, "step": 6515, "time_per_iteration": 2.632024049758911 }, { "auxiliary_loss_clip": 0.01428844, "auxiliary_loss_mlp": 0.0027775, "balance_loss_clip": 1.1639452, "balance_loss_mlp": 0.24661213, "epoch": 0.3917631143844882, "flos": 26211701283840.0, "grad_norm": 91.25896661887427, "language_loss": 0.84391081, "learning_rate": 2.7771800409262406e-06, "loss": 0.86097682, "num_input_tokens_seen": 139947420, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.31152344, "step": 6516, "time_per_iteration": 2.7095232009887695 }, { "auxiliary_loss_clip": 0.01434446, "auxiliary_loss_mlp": 0.0028292, "balance_loss_clip": 1.16695213, "balance_loss_mlp": 0.24823022, "epoch": 0.39182323763715615, "flos": 18548364407040.0, "grad_norm": 3.233505049969333, "language_loss": 0.78115308, "learning_rate": 2.7768211720076665e-06, "loss": 0.79832673, "num_input_tokens_seen": 139965800, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.34692383, "step": 6517, "time_per_iteration": 2.677799701690674 }, { "auxiliary_loss_clip": 0.0144167, "auxiliary_loss_mlp": 0.00262708, "balance_loss_clip": 1.16999602, "balance_loss_mlp": 0.23128425, "epoch": 0.3918833608898241, "flos": 34313543395200.0, "grad_norm": 4.530292872335274, "language_loss": 0.78265655, "learning_rate": 2.776462273631956e-06, "loss": 0.79970032, "num_input_tokens_seen": 139988140, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.31420898, "step": 6518, "time_per_iteration": 2.7896158695220947 }, { "auxiliary_loss_clip": 0.01436714, "auxiliary_loss_mlp": 0.00322228, "balance_loss_clip": 1.16398478, "balance_loss_mlp": 0.28927898, "epoch": 0.3919434841424921, "flos": 36939582812160.0, "grad_norm": 37.623656679511306, "language_loss": 0.69133204, "learning_rate": 2.7761033458127177e-06, "loss": 0.70892143, "num_input_tokens_seen": 140010060, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.32958984, "step": 6519, "time_per_iteration": 4.216265439987183 }, { "auxiliary_loss_clip": 0.01461738, "auxiliary_loss_mlp": 0.00305299, "balance_loss_clip": 1.18351269, "balance_loss_mlp": 0.27153906, "epoch": 0.3920036073951601, "flos": 23508956373120.0, "grad_norm": 2.1985370864055387, "language_loss": 0.76498532, "learning_rate": 2.775744388563563e-06, "loss": 0.78265572, "num_input_tokens_seen": 140029400, "router_z_loss_clip": 2.78515625, "router_z_loss_mlp": 0.33740234, "step": 6520, "time_per_iteration": 2.652175188064575 }, { "auxiliary_loss_clip": 0.01428722, "auxiliary_loss_mlp": 0.00296707, "balance_loss_clip": 1.15534854, "balance_loss_mlp": 0.26340038, "epoch": 0.39206373064782807, "flos": 18406086635520.0, "grad_norm": 30.59593200708812, "language_loss": 0.86679435, "learning_rate": 2.775385401898104e-06, "loss": 0.8840487, "num_input_tokens_seen": 140048940, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.33325195, "step": 6521, "time_per_iteration": 4.0971901416778564 }, { "auxiliary_loss_clip": 0.01450627, "auxiliary_loss_mlp": 0.0031023, "balance_loss_clip": 1.17324686, "balance_loss_mlp": 0.2769464, "epoch": 0.39212385390049603, "flos": 12313051608960.0, "grad_norm": 6689.509652100534, "language_loss": 0.81988013, "learning_rate": 2.775026385829952e-06, "loss": 0.83748871, "num_input_tokens_seen": 140066380, "router_z_loss_clip": 2.77539062, "router_z_loss_mlp": 0.33276367, "step": 6522, "time_per_iteration": 2.7357699871063232 }, { "auxiliary_loss_clip": 0.01436551, "auxiliary_loss_mlp": 0.00264785, "balance_loss_clip": 1.16215634, "balance_loss_mlp": 0.23317116, "epoch": 0.392183977153164, "flos": 19719160214400.0, "grad_norm": 38.426515997916674, "language_loss": 0.8577925, "learning_rate": 2.774667340372722e-06, "loss": 0.87480581, "num_input_tokens_seen": 140085275, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.31591797, "step": 6523, "time_per_iteration": 2.676544427871704 }, { "auxiliary_loss_clip": 0.01432615, "auxiliary_loss_mlp": 0.00292457, "balance_loss_clip": 1.1558938, "balance_loss_mlp": 0.26050854, "epoch": 0.39224410040583196, "flos": 33144902403840.0, "grad_norm": 123.4313016308746, "language_loss": 0.6863445, "learning_rate": 2.7743082655400293e-06, "loss": 0.70359522, "num_input_tokens_seen": 140105105, "router_z_loss_clip": 2.76757812, "router_z_loss_mlp": 0.31958008, "step": 6524, "time_per_iteration": 4.184070110321045 }, { "auxiliary_loss_clip": 0.01417853, "auxiliary_loss_mlp": 0.00267724, "balance_loss_clip": 1.14641094, "balance_loss_mlp": 0.23608588, "epoch": 0.39230422365849993, "flos": 27782434097280.0, "grad_norm": 23.94354305669703, "language_loss": 0.81767762, "learning_rate": 2.773949161345489e-06, "loss": 0.83453333, "num_input_tokens_seen": 140125645, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.31640625, "step": 6525, "time_per_iteration": 2.761019229888916 }, { "auxiliary_loss_clip": 0.01417274, "auxiliary_loss_mlp": 0.00286827, "balance_loss_clip": 1.14545739, "balance_loss_mlp": 0.25344807, "epoch": 0.3923643469111679, "flos": 17931634865280.0, "grad_norm": 37.40970133362971, "language_loss": 0.91495073, "learning_rate": 2.773590027802719e-06, "loss": 0.93199176, "num_input_tokens_seen": 140141925, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.33349609, "step": 6526, "time_per_iteration": 2.6442313194274902 }, { "auxiliary_loss_clip": 0.01427545, "auxiliary_loss_mlp": 0.00283874, "balance_loss_clip": 1.14865661, "balance_loss_mlp": 0.24813552, "epoch": 0.39242447016383586, "flos": 24059539019520.0, "grad_norm": 8.164651092983313, "language_loss": 0.75390399, "learning_rate": 2.7732308649253383e-06, "loss": 0.77101827, "num_input_tokens_seen": 140160965, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.35742188, "step": 6527, "time_per_iteration": 2.7507286071777344 }, { "auxiliary_loss_clip": 0.01420099, "auxiliary_loss_mlp": 0.00265863, "balance_loss_clip": 1.1503917, "balance_loss_mlp": 0.23279426, "epoch": 0.3924845934165038, "flos": 10664069016960.0, "grad_norm": 66.87487419637978, "language_loss": 0.92293787, "learning_rate": 2.772871672726965e-06, "loss": 0.93979752, "num_input_tokens_seen": 140177780, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.33081055, "step": 6528, "time_per_iteration": 2.5909855365753174 }, { "auxiliary_loss_clip": 0.01429192, "auxiliary_loss_mlp": 0.00290365, "balance_loss_clip": 1.15609157, "balance_loss_mlp": 0.25813118, "epoch": 0.3925447166691718, "flos": 31245910174080.0, "grad_norm": 23.54490098592932, "language_loss": 0.75150943, "learning_rate": 2.7725124512212205e-06, "loss": 0.76870507, "num_input_tokens_seen": 140201660, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.32250977, "step": 6529, "time_per_iteration": 2.7485172748565674 }, { "auxiliary_loss_clip": 0.01436062, "auxiliary_loss_mlp": 0.00266717, "balance_loss_clip": 1.15602481, "balance_loss_mlp": 0.23450664, "epoch": 0.39260483992183975, "flos": 29415040087680.0, "grad_norm": 5.791513910477593, "language_loss": 0.90039241, "learning_rate": 2.7721532004217267e-06, "loss": 0.91742015, "num_input_tokens_seen": 140218585, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.32226562, "step": 6530, "time_per_iteration": 4.060192346572876 }, { "auxiliary_loss_clip": 0.01425392, "auxiliary_loss_mlp": 0.00270597, "balance_loss_clip": 1.15267015, "balance_loss_mlp": 0.23874465, "epoch": 0.3926649631745077, "flos": 22857788666880.0, "grad_norm": 23.30479821152368, "language_loss": 0.83972913, "learning_rate": 2.7717939203421063e-06, "loss": 0.85668898, "num_input_tokens_seen": 140239905, "router_z_loss_clip": 2.72851562, "router_z_loss_mlp": 0.31835938, "step": 6531, "time_per_iteration": 2.7624094486236572 }, { "auxiliary_loss_clip": 0.01370532, "auxiliary_loss_mlp": 0.00132083, "balance_loss_clip": 1.20368159, "balance_loss_mlp": 0.12349989, "epoch": 0.3927250864271757, "flos": 63893881872000.0, "grad_norm": 0.7982653442877826, "language_loss": 0.60050786, "learning_rate": 2.7714346109959822e-06, "loss": 0.61553407, "num_input_tokens_seen": 140293820, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.0859375, "step": 6532, "time_per_iteration": 3.081346035003662 }, { "auxiliary_loss_clip": 0.01365831, "auxiliary_loss_mlp": 0.00101024, "balance_loss_clip": 1.19655418, "balance_loss_mlp": 0.09296565, "epoch": 0.3927852096798437, "flos": 68909741890560.0, "grad_norm": 0.7761626056907693, "language_loss": 0.55118781, "learning_rate": 2.771075272396981e-06, "loss": 0.56585634, "num_input_tokens_seen": 140360420, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.08056641, "step": 6533, "time_per_iteration": 3.2230589389801025 }, { "auxiliary_loss_clip": 0.01441094, "auxiliary_loss_mlp": 0.00295873, "balance_loss_clip": 1.16206026, "balance_loss_mlp": 0.26359132, "epoch": 0.39284533293251167, "flos": 29715972232320.0, "grad_norm": 3.5640933702118356, "language_loss": 0.84653342, "learning_rate": 2.7707159045587284e-06, "loss": 0.86390305, "num_input_tokens_seen": 140381950, "router_z_loss_clip": 2.79101562, "router_z_loss_mlp": 0.32275391, "step": 6534, "time_per_iteration": 2.785506010055542 }, { "auxiliary_loss_clip": 0.01439923, "auxiliary_loss_mlp": 0.00288363, "balance_loss_clip": 1.16409302, "balance_loss_mlp": 0.25436485, "epoch": 0.39290545618517964, "flos": 18552027594240.0, "grad_norm": 13.62896249023886, "language_loss": 0.87015021, "learning_rate": 2.770356507494851e-06, "loss": 0.88743311, "num_input_tokens_seen": 140399410, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.34008789, "step": 6535, "time_per_iteration": 2.6079249382019043 }, { "auxiliary_loss_clip": 0.01448496, "auxiliary_loss_mlp": 0.00264818, "balance_loss_clip": 1.16683626, "balance_loss_mlp": 0.23131996, "epoch": 0.3929655794378476, "flos": 26249479413120.0, "grad_norm": 16.60652397766833, "language_loss": 0.75984657, "learning_rate": 2.769997081218978e-06, "loss": 0.77697968, "num_input_tokens_seen": 140419055, "router_z_loss_clip": 2.81835938, "router_z_loss_mlp": 0.33496094, "step": 6536, "time_per_iteration": 2.6889560222625732 }, { "auxiliary_loss_clip": 0.01428521, "auxiliary_loss_mlp": 0.00262186, "balance_loss_clip": 1.16369009, "balance_loss_mlp": 0.22902176, "epoch": 0.39302570269051557, "flos": 29277933874560.0, "grad_norm": 21.757021968956256, "language_loss": 0.78238869, "learning_rate": 2.769637625744738e-06, "loss": 0.79929578, "num_input_tokens_seen": 140438800, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.33178711, "step": 6537, "time_per_iteration": 2.7091174125671387 }, { "auxiliary_loss_clip": 0.01453486, "auxiliary_loss_mlp": 0.0029833, "balance_loss_clip": 1.18069005, "balance_loss_mlp": 0.26540443, "epoch": 0.39308582594318353, "flos": 17347440067200.0, "grad_norm": 41.042522464405856, "language_loss": 0.84697461, "learning_rate": 2.769278141085763e-06, "loss": 0.86449289, "num_input_tokens_seen": 140456880, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.3293457, "step": 6538, "time_per_iteration": 2.6474804878234863 }, { "auxiliary_loss_clip": 0.01364784, "auxiliary_loss_mlp": 0.00117196, "balance_loss_clip": 1.19734502, "balance_loss_mlp": 0.10956663, "epoch": 0.3931459491958515, "flos": 61007094650880.0, "grad_norm": 0.7962885061833448, "language_loss": 0.61770701, "learning_rate": 2.768918627255683e-06, "loss": 0.63252681, "num_input_tokens_seen": 140507510, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.07617188, "step": 6539, "time_per_iteration": 2.9980828762054443 }, { "auxiliary_loss_clip": 0.01447626, "auxiliary_loss_mlp": 0.00295416, "balance_loss_clip": 1.17612219, "balance_loss_mlp": 0.26175079, "epoch": 0.39320607244851946, "flos": 39016009249920.0, "grad_norm": 3.0693651550290584, "language_loss": 0.77767396, "learning_rate": 2.7685590842681315e-06, "loss": 0.79510438, "num_input_tokens_seen": 140528740, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.33691406, "step": 6540, "time_per_iteration": 2.8119516372680664 }, { "auxiliary_loss_clip": 0.01431625, "auxiliary_loss_mlp": 0.00268295, "balance_loss_clip": 1.16298294, "balance_loss_mlp": 0.23572677, "epoch": 0.3932661957011874, "flos": 24679752180480.0, "grad_norm": 301.4849158121515, "language_loss": 0.77489507, "learning_rate": 2.7681995121367433e-06, "loss": 0.7918942, "num_input_tokens_seen": 140547560, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.32568359, "step": 6541, "time_per_iteration": 2.675463914871216 }, { "auxiliary_loss_clip": 0.01367901, "auxiliary_loss_mlp": 0.00079105, "balance_loss_clip": 1.20077634, "balance_loss_mlp": 0.07257282, "epoch": 0.3933263189538554, "flos": 70096552185600.0, "grad_norm": 0.8719265393698116, "language_loss": 0.60150695, "learning_rate": 2.7678399108751516e-06, "loss": 0.61597705, "num_input_tokens_seen": 140601175, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.06542969, "step": 6542, "time_per_iteration": 2.9672024250030518 }, { "auxiliary_loss_clip": 0.01450789, "auxiliary_loss_mlp": 0.00296263, "balance_loss_clip": 1.17743373, "balance_loss_mlp": 0.26274127, "epoch": 0.39338644220652336, "flos": 22929071207040.0, "grad_norm": 282.23728344924524, "language_loss": 0.8878926, "learning_rate": 2.7674802804969947e-06, "loss": 0.90536308, "num_input_tokens_seen": 140622200, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.33544922, "step": 6543, "time_per_iteration": 2.7156713008880615 }, { "auxiliary_loss_clip": 0.01444345, "auxiliary_loss_mlp": 0.00259673, "balance_loss_clip": 1.17411041, "balance_loss_mlp": 0.22662845, "epoch": 0.3934465654591913, "flos": 30848163897600.0, "grad_norm": 8213.254453606947, "language_loss": 0.75395089, "learning_rate": 2.767120621015908e-06, "loss": 0.77099109, "num_input_tokens_seen": 140643125, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.33032227, "step": 6544, "time_per_iteration": 2.755950689315796 }, { "auxiliary_loss_clip": 0.01450987, "auxiliary_loss_mlp": 0.00299661, "balance_loss_clip": 1.17780244, "balance_loss_mlp": 0.26492333, "epoch": 0.3935066887118593, "flos": 29236528471680.0, "grad_norm": 1533.3521857635988, "language_loss": 0.83851123, "learning_rate": 2.76676093244553e-06, "loss": 0.85601771, "num_input_tokens_seen": 140662500, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.34716797, "step": 6545, "time_per_iteration": 2.7264750003814697 }, { "auxiliary_loss_clip": 0.01442905, "auxiliary_loss_mlp": 0.00293099, "balance_loss_clip": 1.17785144, "balance_loss_mlp": 0.26363087, "epoch": 0.3935668119645273, "flos": 19135288638720.0, "grad_norm": 26.63001862673322, "language_loss": 0.80359697, "learning_rate": 2.7664012147995015e-06, "loss": 0.82095701, "num_input_tokens_seen": 140681960, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.29443359, "step": 6546, "time_per_iteration": 2.7560856342315674 }, { "auxiliary_loss_clip": 0.01463693, "auxiliary_loss_mlp": 0.00308226, "balance_loss_clip": 1.18726349, "balance_loss_mlp": 0.27484781, "epoch": 0.3936269352171953, "flos": 18516116972160.0, "grad_norm": 20.268399174732618, "language_loss": 0.88509858, "learning_rate": 2.7660414680914617e-06, "loss": 0.90281773, "num_input_tokens_seen": 140699170, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.33374023, "step": 6547, "time_per_iteration": 2.6412556171417236 }, { "auxiliary_loss_clip": 0.01461365, "auxiliary_loss_mlp": 0.00290301, "balance_loss_clip": 1.1919843, "balance_loss_mlp": 0.25971201, "epoch": 0.39368705846986324, "flos": 15632813370240.0, "grad_norm": 6.656900410407221, "language_loss": 0.91279399, "learning_rate": 2.7656816923350525e-06, "loss": 0.93031067, "num_input_tokens_seen": 140714920, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.30578613, "step": 6548, "time_per_iteration": 2.60237455368042 }, { "auxiliary_loss_clip": 0.01443874, "auxiliary_loss_mlp": 0.00284244, "balance_loss_clip": 1.17235947, "balance_loss_mlp": 0.25038883, "epoch": 0.3937471817225312, "flos": 21325839563520.0, "grad_norm": 3.3976293513354343, "language_loss": 0.79872191, "learning_rate": 2.7653218875439174e-06, "loss": 0.81600308, "num_input_tokens_seen": 140734595, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.33837891, "step": 6549, "time_per_iteration": 2.638240098953247 }, { "auxiliary_loss_clip": 0.01462782, "auxiliary_loss_mlp": 0.00307709, "balance_loss_clip": 1.18395627, "balance_loss_mlp": 0.27437758, "epoch": 0.39380730497519917, "flos": 20776693461120.0, "grad_norm": 12.241719105453654, "language_loss": 0.82534313, "learning_rate": 2.764962053731699e-06, "loss": 0.84304798, "num_input_tokens_seen": 140754050, "router_z_loss_clip": 2.78515625, "router_z_loss_mlp": 0.33325195, "step": 6550, "time_per_iteration": 2.6687657833099365 }, { "auxiliary_loss_clip": 0.01458101, "auxiliary_loss_mlp": 0.00264144, "balance_loss_clip": 1.18369114, "balance_loss_mlp": 0.23186165, "epoch": 0.39386742822786713, "flos": 21609784575360.0, "grad_norm": 102.76955842770397, "language_loss": 0.8826319, "learning_rate": 2.7646021909120434e-06, "loss": 0.8998543, "num_input_tokens_seen": 140771440, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.32275391, "step": 6551, "time_per_iteration": 2.641793727874756 }, { "auxiliary_loss_clip": 0.014429, "auxiliary_loss_mlp": 0.003066, "balance_loss_clip": 1.16955173, "balance_loss_mlp": 0.27527168, "epoch": 0.3939275514805351, "flos": 12414642249600.0, "grad_norm": 23.87603264415071, "language_loss": 0.87840211, "learning_rate": 2.764242299098596e-06, "loss": 0.89589715, "num_input_tokens_seen": 140786715, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.31323242, "step": 6552, "time_per_iteration": 2.6510274410247803 }, { "auxiliary_loss_clip": 0.01471307, "auxiliary_loss_mlp": 0.00284277, "balance_loss_clip": 1.19182491, "balance_loss_mlp": 0.25178012, "epoch": 0.39398767473320306, "flos": 18552027594240.0, "grad_norm": 15.030878362827929, "language_loss": 0.79224551, "learning_rate": 2.763882378305003e-06, "loss": 0.80980134, "num_input_tokens_seen": 140804950, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.32446289, "step": 6553, "time_per_iteration": 2.599310874938965 }, { "auxiliary_loss_clip": 0.01442168, "auxiliary_loss_mlp": 0.00304383, "balance_loss_clip": 1.17155182, "balance_loss_mlp": 0.27047959, "epoch": 0.39404779798587103, "flos": 29308888419840.0, "grad_norm": 13.057009463717566, "language_loss": 0.69924867, "learning_rate": 2.7635224285449144e-06, "loss": 0.71671426, "num_input_tokens_seen": 140822800, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.33935547, "step": 6554, "time_per_iteration": 2.7854840755462646 }, { "auxiliary_loss_clip": 0.01445225, "auxiliary_loss_mlp": 0.00288174, "balance_loss_clip": 1.17604494, "balance_loss_mlp": 0.25810945, "epoch": 0.394107921238539, "flos": 34897055834880.0, "grad_norm": 33.50418287192239, "language_loss": 0.85169888, "learning_rate": 2.7631624498319796e-06, "loss": 0.8690328, "num_input_tokens_seen": 140842940, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.30053711, "step": 6555, "time_per_iteration": 2.744800567626953 }, { "auxiliary_loss_clip": 0.01446735, "auxiliary_loss_mlp": 0.0030625, "balance_loss_clip": 1.17473757, "balance_loss_mlp": 0.27134585, "epoch": 0.39416804449120696, "flos": 25081413039360.0, "grad_norm": 188.00228802860536, "language_loss": 0.81112993, "learning_rate": 2.7628024421798473e-06, "loss": 0.82865983, "num_input_tokens_seen": 140863060, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.34936523, "step": 6556, "time_per_iteration": 2.67191219329834 }, { "auxiliary_loss_clip": 0.01431449, "auxiliary_loss_mlp": 0.00277701, "balance_loss_clip": 1.16224802, "balance_loss_mlp": 0.2455858, "epoch": 0.3942281677438749, "flos": 32306639731200.0, "grad_norm": 1965.3989433742058, "language_loss": 0.91709203, "learning_rate": 2.7624424056021705e-06, "loss": 0.9341836, "num_input_tokens_seen": 140883795, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.32104492, "step": 6557, "time_per_iteration": 2.752310037612915 }, { "auxiliary_loss_clip": 0.01439304, "auxiliary_loss_mlp": 0.00293336, "balance_loss_clip": 1.16969204, "balance_loss_mlp": 0.26250881, "epoch": 0.3942882909965429, "flos": 24936621315840.0, "grad_norm": 5.383269719480459, "language_loss": 0.89588606, "learning_rate": 2.7620823401126004e-06, "loss": 0.91321254, "num_input_tokens_seen": 140903055, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.30859375, "step": 6558, "time_per_iteration": 2.6710662841796875 }, { "auxiliary_loss_clip": 0.01438069, "auxiliary_loss_mlp": 0.00285201, "balance_loss_clip": 1.16857207, "balance_loss_mlp": 0.25401634, "epoch": 0.39434841424921085, "flos": 11874797769600.0, "grad_norm": 141.37443708330932, "language_loss": 0.79682308, "learning_rate": 2.761722245724792e-06, "loss": 0.8140558, "num_input_tokens_seen": 140920685, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.31176758, "step": 6559, "time_per_iteration": 2.630338191986084 }, { "auxiliary_loss_clip": 0.01431556, "auxiliary_loss_mlp": 0.00310766, "balance_loss_clip": 1.15931058, "balance_loss_mlp": 0.27641034, "epoch": 0.3944085375018789, "flos": 16361620323840.0, "grad_norm": 77.29299085577408, "language_loss": 0.90033048, "learning_rate": 2.7613621224524003e-06, "loss": 0.9177537, "num_input_tokens_seen": 140937320, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.34399414, "step": 6560, "time_per_iteration": 2.66190505027771 }, { "auxiliary_loss_clip": 0.01429887, "auxiliary_loss_mlp": 0.00270415, "balance_loss_clip": 1.1616559, "balance_loss_mlp": 0.23938489, "epoch": 0.39446866075454684, "flos": 10633365866880.0, "grad_norm": 170.54426822832397, "language_loss": 0.91034508, "learning_rate": 2.7610019703090803e-06, "loss": 0.92734808, "num_input_tokens_seen": 140954855, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.31018066, "step": 6561, "time_per_iteration": 4.005646467208862 }, { "auxiliary_loss_clip": 0.01408277, "auxiliary_loss_mlp": 0.00273251, "balance_loss_clip": 1.14674497, "balance_loss_mlp": 0.2429478, "epoch": 0.3945287840072148, "flos": 18187498419840.0, "grad_norm": 19.211562299695338, "language_loss": 0.90970433, "learning_rate": 2.7606417893084887e-06, "loss": 0.92651963, "num_input_tokens_seen": 140973250, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.30273438, "step": 6562, "time_per_iteration": 2.6686630249023438 }, { "auxiliary_loss_clip": 0.01425627, "auxiliary_loss_mlp": 0.00282197, "balance_loss_clip": 1.15965474, "balance_loss_mlp": 0.25176334, "epoch": 0.39458890725988277, "flos": 23039891642880.0, "grad_norm": 145.07200769373895, "language_loss": 0.88867176, "learning_rate": 2.7602815794642853e-06, "loss": 0.90574998, "num_input_tokens_seen": 140993050, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.30407715, "step": 6563, "time_per_iteration": 4.094133615493774 }, { "auxiliary_loss_clip": 0.01416145, "auxiliary_loss_mlp": 0.00263428, "balance_loss_clip": 1.15165377, "balance_loss_mlp": 0.23362595, "epoch": 0.39464903051255074, "flos": 17159052211200.0, "grad_norm": 15.636818747717083, "language_loss": 0.78939164, "learning_rate": 2.759921340790127e-06, "loss": 0.80618739, "num_input_tokens_seen": 141010815, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.2980957, "step": 6564, "time_per_iteration": 2.6878697872161865 }, { "auxiliary_loss_clip": 0.01420173, "auxiliary_loss_mlp": 0.00272662, "balance_loss_clip": 1.15339255, "balance_loss_mlp": 0.24536322, "epoch": 0.3947091537652187, "flos": 15889000147200.0, "grad_norm": 15.713864791117048, "language_loss": 0.92184889, "learning_rate": 2.759561073299676e-06, "loss": 0.93877721, "num_input_tokens_seen": 141028720, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.27319336, "step": 6565, "time_per_iteration": 2.6649186611175537 }, { "auxiliary_loss_clip": 0.01418355, "auxiliary_loss_mlp": 0.00274513, "balance_loss_clip": 1.15602338, "balance_loss_mlp": 0.24347107, "epoch": 0.39476927701788667, "flos": 18545491319040.0, "grad_norm": 145.5760049280142, "language_loss": 0.89096653, "learning_rate": 2.7592007770065937e-06, "loss": 0.90789527, "num_input_tokens_seen": 141046025, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.31054688, "step": 6566, "time_per_iteration": 4.0582075119018555 }, { "auxiliary_loss_clip": 0.01423048, "auxiliary_loss_mlp": 0.00297433, "balance_loss_clip": 1.15371466, "balance_loss_mlp": 0.26476923, "epoch": 0.39482940027055463, "flos": 22275712771200.0, "grad_norm": 36.21142995920963, "language_loss": 0.86941469, "learning_rate": 2.7588404519245403e-06, "loss": 0.88661945, "num_input_tokens_seen": 141066865, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.32641602, "step": 6567, "time_per_iteration": 2.6643660068511963 }, { "auxiliary_loss_clip": 0.01397572, "auxiliary_loss_mlp": 0.00247242, "balance_loss_clip": 1.14239275, "balance_loss_mlp": 0.21681963, "epoch": 0.3948895235232226, "flos": 14757634494720.0, "grad_norm": 6.002472420997199, "language_loss": 0.86320817, "learning_rate": 2.758480098067182e-06, "loss": 0.87965631, "num_input_tokens_seen": 141084210, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.30444336, "step": 6568, "time_per_iteration": 2.6569736003875732 }, { "auxiliary_loss_clip": 0.01409584, "auxiliary_loss_mlp": 0.0027427, "balance_loss_clip": 1.15071428, "balance_loss_mlp": 0.24399111, "epoch": 0.39494964677589056, "flos": 22565763095040.0, "grad_norm": 2.01701162719074, "language_loss": 0.91243976, "learning_rate": 2.7581197154481816e-06, "loss": 0.92927831, "num_input_tokens_seen": 141103895, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.30297852, "step": 6569, "time_per_iteration": 2.719325542449951 }, { "auxiliary_loss_clip": 0.01417429, "auxiliary_loss_mlp": 0.00264811, "balance_loss_clip": 1.15939426, "balance_loss_mlp": 0.23436448, "epoch": 0.3950097700285585, "flos": 22963186149120.0, "grad_norm": 6.481766312967979, "language_loss": 0.83505666, "learning_rate": 2.7577593040812066e-06, "loss": 0.85187912, "num_input_tokens_seen": 141124000, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.3046875, "step": 6570, "time_per_iteration": 2.7094762325286865 }, { "auxiliary_loss_clip": 0.01398087, "auxiliary_loss_mlp": 0.00276705, "balance_loss_clip": 1.14084125, "balance_loss_mlp": 0.24501893, "epoch": 0.3950698932812265, "flos": 20595236929920.0, "grad_norm": 8.998410709131717, "language_loss": 0.87205184, "learning_rate": 2.757398863979922e-06, "loss": 0.88879979, "num_input_tokens_seen": 141142535, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.31640625, "step": 6571, "time_per_iteration": 2.646961212158203 }, { "auxiliary_loss_clip": 0.01409815, "auxiliary_loss_mlp": 0.00286389, "balance_loss_clip": 1.15004635, "balance_loss_mlp": 0.25575206, "epoch": 0.39513001653389446, "flos": 20375786787840.0, "grad_norm": 6.646359102661631, "language_loss": 0.84016967, "learning_rate": 2.757038395157997e-06, "loss": 0.85713172, "num_input_tokens_seen": 141161575, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.30688477, "step": 6572, "time_per_iteration": 4.085606813430786 }, { "auxiliary_loss_clip": 0.0140634, "auxiliary_loss_mlp": 0.00258736, "balance_loss_clip": 1.14728999, "balance_loss_mlp": 0.22917181, "epoch": 0.3951901397865625, "flos": 26463650256000.0, "grad_norm": 3.007711280469517, "language_loss": 0.80815035, "learning_rate": 2.7566778976291002e-06, "loss": 0.82480109, "num_input_tokens_seen": 141181150, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.29614258, "step": 6573, "time_per_iteration": 2.7038486003875732 }, { "auxiliary_loss_clip": 0.01404877, "auxiliary_loss_mlp": 0.00279555, "balance_loss_clip": 1.14648676, "balance_loss_mlp": 0.24979997, "epoch": 0.39525026303923044, "flos": 43838345767680.0, "grad_norm": 134.8651861437548, "language_loss": 0.73037958, "learning_rate": 2.7563173714069017e-06, "loss": 0.74722397, "num_input_tokens_seen": 141206310, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.29748535, "step": 6574, "time_per_iteration": 2.8859164714813232 }, { "auxiliary_loss_clip": 0.01399962, "auxiliary_loss_mlp": 0.00243113, "balance_loss_clip": 1.14101756, "balance_loss_mlp": 0.21334614, "epoch": 0.3953103862918984, "flos": 18040803275520.0, "grad_norm": 212.0853240633193, "language_loss": 0.83929467, "learning_rate": 2.755956816505072e-06, "loss": 0.85572541, "num_input_tokens_seen": 141223925, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.29748535, "step": 6575, "time_per_iteration": 2.6955862045288086 }, { "auxiliary_loss_clip": 0.01421858, "auxiliary_loss_mlp": 0.00273402, "balance_loss_clip": 1.15792036, "balance_loss_mlp": 0.24325444, "epoch": 0.3953705095445664, "flos": 16976015481600.0, "grad_norm": 13.040253677510428, "language_loss": 0.81536126, "learning_rate": 2.7555962329372845e-06, "loss": 0.83231384, "num_input_tokens_seen": 141239010, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.30114746, "step": 6576, "time_per_iteration": 2.6759207248687744 }, { "auxiliary_loss_clip": 0.01410418, "auxiliary_loss_mlp": 0.00263775, "balance_loss_clip": 1.14720142, "balance_loss_mlp": 0.23249468, "epoch": 0.39543063279723434, "flos": 17411144837760.0, "grad_norm": 124.61536411379016, "language_loss": 0.89663017, "learning_rate": 2.7552356207172124e-06, "loss": 0.9133721, "num_input_tokens_seen": 141252255, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.31286621, "step": 6577, "time_per_iteration": 2.8384056091308594 }, { "auxiliary_loss_clip": 0.01420595, "auxiliary_loss_mlp": 0.00268407, "balance_loss_clip": 1.15957284, "balance_loss_mlp": 0.23781793, "epoch": 0.3954907560499023, "flos": 22784207656320.0, "grad_norm": 4.049372801190956, "language_loss": 0.98261309, "learning_rate": 2.75487497985853e-06, "loss": 0.99950308, "num_input_tokens_seen": 141269325, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.3059082, "step": 6578, "time_per_iteration": 2.771533727645874 }, { "auxiliary_loss_clip": 0.01409034, "auxiliary_loss_mlp": 0.00278974, "balance_loss_clip": 1.1449194, "balance_loss_mlp": 0.24681097, "epoch": 0.39555087930257027, "flos": 21944400698880.0, "grad_norm": 5.013963497486403, "language_loss": 0.86058342, "learning_rate": 2.7545143103749117e-06, "loss": 0.87746352, "num_input_tokens_seen": 141288505, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.3215332, "step": 6579, "time_per_iteration": 2.680816411972046 }, { "auxiliary_loss_clip": 0.01430232, "auxiliary_loss_mlp": 0.00327342, "balance_loss_clip": 1.16132402, "balance_loss_mlp": 0.29644245, "epoch": 0.39561100255523823, "flos": 20404622430720.0, "grad_norm": 1348.5707027409603, "language_loss": 0.77095306, "learning_rate": 2.754153612280037e-06, "loss": 0.7885288, "num_input_tokens_seen": 141303680, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.30883789, "step": 6580, "time_per_iteration": 2.677456855773926 }, { "auxiliary_loss_clip": 0.01415054, "auxiliary_loss_mlp": 0.00291996, "balance_loss_clip": 1.15438223, "balance_loss_mlp": 0.26145491, "epoch": 0.3956711258079062, "flos": 27964572986880.0, "grad_norm": 406.7111051026534, "language_loss": 0.66092908, "learning_rate": 2.7537928855875797e-06, "loss": 0.67799962, "num_input_tokens_seen": 141324090, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.30541992, "step": 6581, "time_per_iteration": 2.757558584213257 }, { "auxiliary_loss_clip": 0.01424815, "auxiliary_loss_mlp": 0.0030109, "balance_loss_clip": 1.15961647, "balance_loss_mlp": 0.26985714, "epoch": 0.39573124906057416, "flos": 14428297670400.0, "grad_norm": 8.61748472767752, "language_loss": 0.79038829, "learning_rate": 2.7534321303112224e-06, "loss": 0.80764735, "num_input_tokens_seen": 141342235, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.31225586, "step": 6582, "time_per_iteration": 2.640465259552002 }, { "auxiliary_loss_clip": 0.01421074, "auxiliary_loss_mlp": 0.00297255, "balance_loss_clip": 1.15716839, "balance_loss_mlp": 0.26783454, "epoch": 0.39579137231324213, "flos": 18733699607040.0, "grad_norm": 43.65190332982038, "language_loss": 0.85502481, "learning_rate": 2.753071346464642e-06, "loss": 0.87220812, "num_input_tokens_seen": 141361195, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.29418945, "step": 6583, "time_per_iteration": 2.6321802139282227 }, { "auxiliary_loss_clip": 0.01428628, "auxiliary_loss_mlp": 0.00296532, "balance_loss_clip": 1.16214907, "balance_loss_mlp": 0.2663008, "epoch": 0.3958514955659101, "flos": 17676417755520.0, "grad_norm": 255.30788589818252, "language_loss": 0.728769, "learning_rate": 2.7527105340615207e-06, "loss": 0.74602062, "num_input_tokens_seen": 141378275, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.30224609, "step": 6584, "time_per_iteration": 2.654461622238159 }, { "auxiliary_loss_clip": 0.0143314, "auxiliary_loss_mlp": 0.00295148, "balance_loss_clip": 1.16761637, "balance_loss_mlp": 0.26572716, "epoch": 0.39591161881857806, "flos": 29309103901440.0, "grad_norm": 53.885190221646035, "language_loss": 0.81635392, "learning_rate": 2.7523496931155413e-06, "loss": 0.83363682, "num_input_tokens_seen": 141396960, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.29418945, "step": 6585, "time_per_iteration": 2.7365570068359375 }, { "auxiliary_loss_clip": 0.01424185, "auxiliary_loss_mlp": 0.00289858, "balance_loss_clip": 1.15895009, "balance_loss_mlp": 0.25717053, "epoch": 0.3959717420712461, "flos": 25771831332480.0, "grad_norm": 29.03455149142605, "language_loss": 0.80495203, "learning_rate": 2.7519888236403856e-06, "loss": 0.82209241, "num_input_tokens_seen": 141417320, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.32641602, "step": 6586, "time_per_iteration": 2.7751080989837646 }, { "auxiliary_loss_clip": 0.01431758, "auxiliary_loss_mlp": 0.00324288, "balance_loss_clip": 1.16909635, "balance_loss_mlp": 0.2933417, "epoch": 0.39603186532391405, "flos": 20923783655040.0, "grad_norm": 9.876764816217014, "language_loss": 0.78014368, "learning_rate": 2.7516279256497382e-06, "loss": 0.79770416, "num_input_tokens_seen": 141435985, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.30957031, "step": 6587, "time_per_iteration": 2.6830945014953613 }, { "auxiliary_loss_clip": 0.01386326, "auxiliary_loss_mlp": 0.00102037, "balance_loss_clip": 1.22113037, "balance_loss_mlp": 0.09221432, "epoch": 0.396091988576582, "flos": 54880986176640.0, "grad_norm": 1.0075464053311867, "language_loss": 0.60485756, "learning_rate": 2.751266999157285e-06, "loss": 0.6197412, "num_input_tokens_seen": 141486075, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.09814453, "step": 6588, "time_per_iteration": 2.932032585144043 }, { "auxiliary_loss_clip": 0.01439902, "auxiliary_loss_mlp": 0.00302335, "balance_loss_clip": 1.17207146, "balance_loss_mlp": 0.27007681, "epoch": 0.39615211182925, "flos": 20702896968960.0, "grad_norm": 36.15881620293012, "language_loss": 0.86410224, "learning_rate": 2.7509060441767115e-06, "loss": 0.88152468, "num_input_tokens_seen": 141505280, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.32275391, "step": 6589, "time_per_iteration": 2.801392078399658 }, { "auxiliary_loss_clip": 0.01450213, "auxiliary_loss_mlp": 0.00334964, "balance_loss_clip": 1.18199205, "balance_loss_mlp": 0.30141824, "epoch": 0.39621223508191794, "flos": 20994312009600.0, "grad_norm": 10738.582826645055, "language_loss": 0.80863452, "learning_rate": 2.7505450607217057e-06, "loss": 0.82648623, "num_input_tokens_seen": 141523930, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.33569336, "step": 6590, "time_per_iteration": 2.7860207557678223 }, { "auxiliary_loss_clip": 0.01445303, "auxiliary_loss_mlp": 0.00332556, "balance_loss_clip": 1.1812135, "balance_loss_mlp": 0.30447015, "epoch": 0.3962723583345859, "flos": 23368833417600.0, "grad_norm": 122.96299840950465, "language_loss": 0.82287681, "learning_rate": 2.750184048805956e-06, "loss": 0.84065539, "num_input_tokens_seen": 141541320, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.28100586, "step": 6591, "time_per_iteration": 2.66439151763916 }, { "auxiliary_loss_clip": 0.01451189, "auxiliary_loss_mlp": 0.00321698, "balance_loss_clip": 1.18539739, "balance_loss_mlp": 0.28996408, "epoch": 0.39633248158725387, "flos": 25115599808640.0, "grad_norm": 39.863807833461806, "language_loss": 0.84396195, "learning_rate": 2.749823008443152e-06, "loss": 0.86169082, "num_input_tokens_seen": 141561880, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.31750488, "step": 6592, "time_per_iteration": 2.7012786865234375 }, { "auxiliary_loss_clip": 0.01442417, "auxiliary_loss_mlp": 0.00307075, "balance_loss_clip": 1.18493748, "balance_loss_mlp": 0.27721289, "epoch": 0.39639260483992184, "flos": 39787622236800.0, "grad_norm": 19.860812413244226, "language_loss": 0.75221479, "learning_rate": 2.7494619396469843e-06, "loss": 0.76970971, "num_input_tokens_seen": 141586460, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.29870605, "step": 6593, "time_per_iteration": 2.8126957416534424 }, { "auxiliary_loss_clip": 0.01444319, "auxiliary_loss_mlp": 0.00335003, "balance_loss_clip": 1.18146133, "balance_loss_mlp": 0.30322173, "epoch": 0.3964527280925898, "flos": 17347045017600.0, "grad_norm": 7.646989274719978, "language_loss": 0.844661, "learning_rate": 2.7491008424311452e-06, "loss": 0.86245418, "num_input_tokens_seen": 141605955, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.31750488, "step": 6594, "time_per_iteration": 2.6207823753356934 }, { "auxiliary_loss_clip": 0.01409041, "auxiliary_loss_mlp": 0.00101313, "balance_loss_clip": 1.23718381, "balance_loss_mlp": 0.09034582, "epoch": 0.39651285134525777, "flos": 71717848369920.0, "grad_norm": 0.9148173921094871, "language_loss": 0.62724513, "learning_rate": 2.7487397168093265e-06, "loss": 0.64234865, "num_input_tokens_seen": 141673140, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.10986328, "step": 6595, "time_per_iteration": 3.2069408893585205 }, { "auxiliary_loss_clip": 0.01453579, "auxiliary_loss_mlp": 0.00321813, "balance_loss_clip": 1.18700457, "balance_loss_mlp": 0.29034221, "epoch": 0.39657297459792573, "flos": 25775710001280.0, "grad_norm": 51.57020044260727, "language_loss": 0.70727694, "learning_rate": 2.748378562795223e-06, "loss": 0.72503084, "num_input_tokens_seen": 141692955, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.31469727, "step": 6596, "time_per_iteration": 2.718285322189331 }, { "auxiliary_loss_clip": 0.01444205, "auxiliary_loss_mlp": 0.00313308, "balance_loss_clip": 1.18505526, "balance_loss_mlp": 0.28028703, "epoch": 0.3966330978505937, "flos": 20266115587200.0, "grad_norm": 5.841454282379107, "language_loss": 0.85343158, "learning_rate": 2.7480173804025293e-06, "loss": 0.87100673, "num_input_tokens_seen": 141710680, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.33007812, "step": 6597, "time_per_iteration": 2.7122414112091064 }, { "auxiliary_loss_clip": 0.01463095, "auxiliary_loss_mlp": 0.00353183, "balance_loss_clip": 1.19359565, "balance_loss_mlp": 0.32087752, "epoch": 0.39669322110326166, "flos": 20631183465600.0, "grad_norm": 35.06936872258178, "language_loss": 0.78126585, "learning_rate": 2.747656169644941e-06, "loss": 0.7994287, "num_input_tokens_seen": 141729860, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.32299805, "step": 6598, "time_per_iteration": 2.638516902923584 }, { "auxiliary_loss_clip": 0.01466133, "auxiliary_loss_mlp": 0.00333251, "balance_loss_clip": 1.19583845, "balance_loss_mlp": 0.30287653, "epoch": 0.3967533443559297, "flos": 21726063878400.0, "grad_norm": 85.32559724749366, "language_loss": 0.87254673, "learning_rate": 2.747294930536157e-06, "loss": 0.8905406, "num_input_tokens_seen": 141749060, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.30395508, "step": 6599, "time_per_iteration": 2.7062056064605713 }, { "auxiliary_loss_clip": 0.01464651, "auxiliary_loss_mlp": 0.00328627, "balance_loss_clip": 1.1979928, "balance_loss_mlp": 0.2948668, "epoch": 0.39681346760859765, "flos": 25484151306240.0, "grad_norm": 59.32231263797697, "language_loss": 0.8086468, "learning_rate": 2.7469336630898737e-06, "loss": 0.82657957, "num_input_tokens_seen": 141769860, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.33764648, "step": 6600, "time_per_iteration": 2.688436508178711 }, { "auxiliary_loss_clip": 0.01450144, "auxiliary_loss_mlp": 0.00347921, "balance_loss_clip": 1.18456268, "balance_loss_mlp": 0.31566256, "epoch": 0.3968735908612656, "flos": 20959586536320.0, "grad_norm": 52.61490389638629, "language_loss": 0.93874621, "learning_rate": 2.746572367319791e-06, "loss": 0.95672685, "num_input_tokens_seen": 141788465, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.32226562, "step": 6601, "time_per_iteration": 2.6468327045440674 }, { "auxiliary_loss_clip": 0.01464525, "auxiliary_loss_mlp": 0.00319675, "balance_loss_clip": 1.19204259, "balance_loss_mlp": 0.28438896, "epoch": 0.3969337141139336, "flos": 10707090531840.0, "grad_norm": 36.357774473413095, "language_loss": 0.79414678, "learning_rate": 2.7462110432396095e-06, "loss": 0.81198877, "num_input_tokens_seen": 141804955, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.35253906, "step": 6602, "time_per_iteration": 2.6900947093963623 }, { "auxiliary_loss_clip": 0.01460651, "auxiliary_loss_mlp": 0.00343899, "balance_loss_clip": 1.1913588, "balance_loss_mlp": 0.31168818, "epoch": 0.39699383736660154, "flos": 17593714690560.0, "grad_norm": 6.758090078760853, "language_loss": 0.9446522, "learning_rate": 2.7458496908630305e-06, "loss": 0.96269763, "num_input_tokens_seen": 141820025, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.32177734, "step": 6603, "time_per_iteration": 4.041489124298096 }, { "auxiliary_loss_clip": 0.01457492, "auxiliary_loss_mlp": 0.00308739, "balance_loss_clip": 1.18986773, "balance_loss_mlp": 0.27755341, "epoch": 0.3970539606192695, "flos": 17785945301760.0, "grad_norm": 20.327464045279093, "language_loss": 0.78147912, "learning_rate": 2.7454883102037563e-06, "loss": 0.79914141, "num_input_tokens_seen": 141838735, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.31176758, "step": 6604, "time_per_iteration": 2.691746950149536 }, { "auxiliary_loss_clip": 0.01441717, "auxiliary_loss_mlp": 0.0031995, "balance_loss_clip": 1.18542552, "balance_loss_mlp": 0.29014802, "epoch": 0.3971140838719375, "flos": 24789495208320.0, "grad_norm": 2.9958794009406757, "language_loss": 0.87171471, "learning_rate": 2.745126901275491e-06, "loss": 0.88933134, "num_input_tokens_seen": 141858090, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.2980957, "step": 6605, "time_per_iteration": 4.17153263092041 }, { "auxiliary_loss_clip": 0.01445959, "auxiliary_loss_mlp": 0.00315674, "balance_loss_clip": 1.18218863, "balance_loss_mlp": 0.2844885, "epoch": 0.39717420712460544, "flos": 24243581329920.0, "grad_norm": 5.633861172853512, "language_loss": 0.81780934, "learning_rate": 2.7447654640919383e-06, "loss": 0.83542573, "num_input_tokens_seen": 141877540, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.31176758, "step": 6606, "time_per_iteration": 2.7407524585723877 }, { "auxiliary_loss_clip": 0.01447786, "auxiliary_loss_mlp": 0.00306291, "balance_loss_clip": 1.17906094, "balance_loss_mlp": 0.27400905, "epoch": 0.3972343303772734, "flos": 25884698843520.0, "grad_norm": 647.2578146304963, "language_loss": 0.82660061, "learning_rate": 2.744403998666805e-06, "loss": 0.84414136, "num_input_tokens_seen": 141897315, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.32299805, "step": 6607, "time_per_iteration": 2.712916374206543 }, { "auxiliary_loss_clip": 0.01459948, "auxiliary_loss_mlp": 0.00326274, "balance_loss_clip": 1.18592715, "balance_loss_mlp": 0.29287139, "epoch": 0.39729445362994137, "flos": 45623716300800.0, "grad_norm": 41.18693102897316, "language_loss": 0.74784374, "learning_rate": 2.744042505013797e-06, "loss": 0.76570594, "num_input_tokens_seen": 141919580, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.33422852, "step": 6608, "time_per_iteration": 4.2569053173065186 }, { "auxiliary_loss_clip": 0.01454733, "auxiliary_loss_mlp": 0.0032029, "balance_loss_clip": 1.1767025, "balance_loss_mlp": 0.28621998, "epoch": 0.39735457688260933, "flos": 20193971120640.0, "grad_norm": 37.61526746075015, "language_loss": 0.80947298, "learning_rate": 2.7436809831466233e-06, "loss": 0.82722318, "num_input_tokens_seen": 141937045, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.34057617, "step": 6609, "time_per_iteration": 2.7021679878234863 }, { "auxiliary_loss_clip": 0.01459376, "auxiliary_loss_mlp": 0.00303747, "balance_loss_clip": 1.18882799, "balance_loss_mlp": 0.2751132, "epoch": 0.3974147001352773, "flos": 23331163029120.0, "grad_norm": 22.68483436622926, "language_loss": 0.78472281, "learning_rate": 2.7433194330789927e-06, "loss": 0.80235404, "num_input_tokens_seen": 141956695, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.28637695, "step": 6610, "time_per_iteration": 2.724433422088623 }, { "auxiliary_loss_clip": 0.01434319, "auxiliary_loss_mlp": 0.00301356, "balance_loss_clip": 1.17060912, "balance_loss_mlp": 0.27126712, "epoch": 0.39747482338794526, "flos": 21688644885120.0, "grad_norm": 11.217874178177599, "language_loss": 0.85828096, "learning_rate": 2.7429578548246133e-06, "loss": 0.87563765, "num_input_tokens_seen": 141975935, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.30102539, "step": 6611, "time_per_iteration": 2.702512502670288 }, { "auxiliary_loss_clip": 0.0144383, "auxiliary_loss_mlp": 0.00319483, "balance_loss_clip": 1.17484486, "balance_loss_mlp": 0.2899186, "epoch": 0.3975349466406133, "flos": 30988717816320.0, "grad_norm": 4.937033423925227, "language_loss": 0.85068709, "learning_rate": 2.7425962483971985e-06, "loss": 0.86832023, "num_input_tokens_seen": 141995750, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.2956543, "step": 6612, "time_per_iteration": 2.740370273590088 }, { "auxiliary_loss_clip": 0.01372928, "auxiliary_loss_mlp": 0.00091849, "balance_loss_clip": 1.19603014, "balance_loss_mlp": 0.08283718, "epoch": 0.39759506989328125, "flos": 63683948833920.0, "grad_norm": 1.6223500196745415, "language_loss": 0.64426947, "learning_rate": 2.742234613810459e-06, "loss": 0.65891719, "num_input_tokens_seen": 142057655, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.09033203, "step": 6613, "time_per_iteration": 3.0801045894622803 }, { "auxiliary_loss_clip": 0.01444668, "auxiliary_loss_mlp": 0.00326448, "balance_loss_clip": 1.17209983, "balance_loss_mlp": 0.29442862, "epoch": 0.3976551931459492, "flos": 23695835857920.0, "grad_norm": 40.27073122333036, "language_loss": 0.79188287, "learning_rate": 2.741872951078109e-06, "loss": 0.80959404, "num_input_tokens_seen": 142076020, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.3203125, "step": 6614, "time_per_iteration": 4.091978073120117 }, { "auxiliary_loss_clip": 0.01448709, "auxiliary_loss_mlp": 0.00330526, "balance_loss_clip": 1.17526054, "balance_loss_mlp": 0.29864931, "epoch": 0.3977153163986172, "flos": 15669657745920.0, "grad_norm": 8.929224907298694, "language_loss": 0.88506466, "learning_rate": 2.741511260213862e-06, "loss": 0.90285707, "num_input_tokens_seen": 142093790, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.31848145, "step": 6615, "time_per_iteration": 2.6483981609344482 }, { "auxiliary_loss_clip": 0.01442522, "auxiliary_loss_mlp": 0.00297081, "balance_loss_clip": 1.16961598, "balance_loss_mlp": 0.26706421, "epoch": 0.39777543965128515, "flos": 14064702249600.0, "grad_norm": 18.5493636005327, "language_loss": 0.75566256, "learning_rate": 2.741149541231434e-06, "loss": 0.77305853, "num_input_tokens_seen": 142110545, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.30004883, "step": 6616, "time_per_iteration": 2.6497044563293457 }, { "auxiliary_loss_clip": 0.01434443, "auxiliary_loss_mlp": 0.00306447, "balance_loss_clip": 1.16162825, "balance_loss_mlp": 0.2754285, "epoch": 0.3978355629039531, "flos": 23367468700800.0, "grad_norm": 11.441666977512993, "language_loss": 0.92286038, "learning_rate": 2.740787794144541e-06, "loss": 0.94026929, "num_input_tokens_seen": 142128695, "router_z_loss_clip": 2.72851562, "router_z_loss_mlp": 0.31005859, "step": 6617, "time_per_iteration": 2.666437864303589 }, { "auxiliary_loss_clip": 0.01428162, "auxiliary_loss_mlp": 0.003224, "balance_loss_clip": 1.16489398, "balance_loss_mlp": 0.29240742, "epoch": 0.3978956861566211, "flos": 19062785036160.0, "grad_norm": 13.248516196794126, "language_loss": 0.78301913, "learning_rate": 2.7404260189669e-06, "loss": 0.80052477, "num_input_tokens_seen": 142148375, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.30004883, "step": 6618, "time_per_iteration": 2.774120807647705 }, { "auxiliary_loss_clip": 0.01430982, "auxiliary_loss_mlp": 0.00306926, "balance_loss_clip": 1.16032994, "balance_loss_mlp": 0.2740719, "epoch": 0.39795580940928904, "flos": 30227699341440.0, "grad_norm": 13232.615980711544, "language_loss": 0.7311151, "learning_rate": 2.740064215712231e-06, "loss": 0.74849427, "num_input_tokens_seen": 142169735, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.32861328, "step": 6619, "time_per_iteration": 2.723377227783203 }, { "auxiliary_loss_clip": 0.01312799, "auxiliary_loss_mlp": 0.00116189, "balance_loss_clip": 1.1450057, "balance_loss_mlp": 0.1056508, "epoch": 0.398015932661957, "flos": 69847224906240.0, "grad_norm": 0.7797192127842372, "language_loss": 0.58145726, "learning_rate": 2.7397023843942527e-06, "loss": 0.59574711, "num_input_tokens_seen": 142229520, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.10546875, "step": 6620, "time_per_iteration": 3.128671646118164 }, { "auxiliary_loss_clip": 0.01416184, "auxiliary_loss_mlp": 0.00288683, "balance_loss_clip": 1.15199637, "balance_loss_mlp": 0.25950027, "epoch": 0.39807605591462497, "flos": 20157773189760.0, "grad_norm": 4.3630488559990965, "language_loss": 0.85580772, "learning_rate": 2.739340525026686e-06, "loss": 0.87285638, "num_input_tokens_seen": 142247660, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.29199219, "step": 6621, "time_per_iteration": 2.6746280193328857 }, { "auxiliary_loss_clip": 0.01403275, "auxiliary_loss_mlp": 0.00299201, "balance_loss_clip": 1.14354146, "balance_loss_mlp": 0.26908833, "epoch": 0.39813617916729294, "flos": 21141761339520.0, "grad_norm": 88.32599439427958, "language_loss": 0.84977299, "learning_rate": 2.738978637623252e-06, "loss": 0.86679775, "num_input_tokens_seen": 142266990, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.30114746, "step": 6622, "time_per_iteration": 2.7428510189056396 }, { "auxiliary_loss_clip": 0.01420062, "auxiliary_loss_mlp": 0.003027, "balance_loss_clip": 1.1535821, "balance_loss_mlp": 0.27051368, "epoch": 0.3981963024199609, "flos": 18988485753600.0, "grad_norm": 37.03750149571474, "language_loss": 0.81946898, "learning_rate": 2.738616722197674e-06, "loss": 0.83669662, "num_input_tokens_seen": 142287170, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.32177734, "step": 6623, "time_per_iteration": 2.949141263961792 }, { "auxiliary_loss_clip": 0.01405941, "auxiliary_loss_mlp": 0.00303421, "balance_loss_clip": 1.14476037, "balance_loss_mlp": 0.27352381, "epoch": 0.39825642567262887, "flos": 16575108808320.0, "grad_norm": 4.141039167794411, "language_loss": 0.88257802, "learning_rate": 2.7382547787636766e-06, "loss": 0.89967167, "num_input_tokens_seen": 142305405, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.29931641, "step": 6624, "time_per_iteration": 2.7515249252319336 }, { "auxiliary_loss_clip": 0.01408457, "auxiliary_loss_mlp": 0.00313572, "balance_loss_clip": 1.14137101, "balance_loss_mlp": 0.28033623, "epoch": 0.39831654892529683, "flos": 22199833290240.0, "grad_norm": 18.791346370814626, "language_loss": 0.94176853, "learning_rate": 2.7378928073349832e-06, "loss": 0.95898879, "num_input_tokens_seen": 142322710, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.33227539, "step": 6625, "time_per_iteration": 2.7426493167877197 }, { "auxiliary_loss_clip": 0.01404575, "auxiliary_loss_mlp": 0.00313379, "balance_loss_clip": 1.14202654, "balance_loss_mlp": 0.28008386, "epoch": 0.39837667217796485, "flos": 10487963612160.0, "grad_norm": 25.13003114444521, "language_loss": 0.93801785, "learning_rate": 2.737530807925321e-06, "loss": 0.95519745, "num_input_tokens_seen": 142338535, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.33288574, "step": 6626, "time_per_iteration": 2.767620325088501 }, { "auxiliary_loss_clip": 0.01401424, "auxiliary_loss_mlp": 0.00321382, "balance_loss_clip": 1.13880324, "balance_loss_mlp": 0.29053044, "epoch": 0.3984367954306328, "flos": 17965282930560.0, "grad_norm": 12.457675447490075, "language_loss": 0.91886425, "learning_rate": 2.737168780548417e-06, "loss": 0.93609226, "num_input_tokens_seen": 142354570, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.30859375, "step": 6627, "time_per_iteration": 2.692538022994995 }, { "auxiliary_loss_clip": 0.01390105, "auxiliary_loss_mlp": 0.00285516, "balance_loss_clip": 1.12744558, "balance_loss_mlp": 0.25754941, "epoch": 0.3984969186833008, "flos": 22711057608960.0, "grad_norm": 82.3527991664246, "language_loss": 0.87918675, "learning_rate": 2.736806725217998e-06, "loss": 0.89594293, "num_input_tokens_seen": 142374395, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.27978516, "step": 6628, "time_per_iteration": 2.7396485805511475 }, { "auxiliary_loss_clip": 0.01402356, "auxiliary_loss_mlp": 0.00319773, "balance_loss_clip": 1.13619447, "balance_loss_mlp": 0.28879064, "epoch": 0.39855704193596875, "flos": 23405785534080.0, "grad_norm": 25.495820308451297, "language_loss": 0.79075933, "learning_rate": 2.7364446419477945e-06, "loss": 0.80798066, "num_input_tokens_seen": 142396040, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.30969238, "step": 6629, "time_per_iteration": 2.7774717807769775 }, { "auxiliary_loss_clip": 0.01376663, "auxiliary_loss_mlp": 0.00307652, "balance_loss_clip": 1.12301397, "balance_loss_mlp": 0.27734894, "epoch": 0.3986171651886367, "flos": 21251935330560.0, "grad_norm": 88.71973461865431, "language_loss": 0.866207, "learning_rate": 2.7360825307515366e-06, "loss": 0.8830502, "num_input_tokens_seen": 142415495, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.30273438, "step": 6630, "time_per_iteration": 2.6875522136688232 }, { "auxiliary_loss_clip": 0.01383175, "auxiliary_loss_mlp": 0.0030915, "balance_loss_clip": 1.12665105, "balance_loss_mlp": 0.27906132, "epoch": 0.3986772884413047, "flos": 12458705258880.0, "grad_norm": 110.56265094446617, "language_loss": 0.82033134, "learning_rate": 2.7357203916429555e-06, "loss": 0.83725464, "num_input_tokens_seen": 142431865, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.30078125, "step": 6631, "time_per_iteration": 2.6572818756103516 }, { "auxiliary_loss_clip": 0.01378731, "auxiliary_loss_mlp": 0.00315112, "balance_loss_clip": 1.11865807, "balance_loss_mlp": 0.28285396, "epoch": 0.39873741169397264, "flos": 19646117907840.0, "grad_norm": 7.279050144644968, "language_loss": 0.80259985, "learning_rate": 2.735358224635783e-06, "loss": 0.8195383, "num_input_tokens_seen": 142450595, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.32299805, "step": 6632, "time_per_iteration": 2.7034547328948975 }, { "auxiliary_loss_clip": 0.01372968, "auxiliary_loss_mlp": 0.0029664, "balance_loss_clip": 1.12027812, "balance_loss_mlp": 0.26900721, "epoch": 0.3987975349466406, "flos": 21684766216320.0, "grad_norm": 137.2862486288265, "language_loss": 0.81928241, "learning_rate": 2.7349960297437533e-06, "loss": 0.83597857, "num_input_tokens_seen": 142466650, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.27624512, "step": 6633, "time_per_iteration": 2.6177995204925537 }, { "auxiliary_loss_clip": 0.01374964, "auxiliary_loss_mlp": 0.0031687, "balance_loss_clip": 1.11884892, "balance_loss_mlp": 0.28694803, "epoch": 0.3988576581993086, "flos": 23914064937600.0, "grad_norm": 17.620894463998365, "language_loss": 0.87070286, "learning_rate": 2.7346338069806e-06, "loss": 0.88762128, "num_input_tokens_seen": 142486165, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.29882812, "step": 6634, "time_per_iteration": 2.6756954193115234 }, { "auxiliary_loss_clip": 0.01383833, "auxiliary_loss_mlp": 0.00311126, "balance_loss_clip": 1.12639713, "balance_loss_mlp": 0.28056037, "epoch": 0.39891778145197654, "flos": 18149899858560.0, "grad_norm": 102.77095001274809, "language_loss": 0.82448971, "learning_rate": 2.7342715563600597e-06, "loss": 0.84143925, "num_input_tokens_seen": 142505035, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.3059082, "step": 6635, "time_per_iteration": 2.6367125511169434 }, { "auxiliary_loss_clip": 0.01384114, "auxiliary_loss_mlp": 0.002839, "balance_loss_clip": 1.12501073, "balance_loss_mlp": 0.25575459, "epoch": 0.3989779047046445, "flos": 22595281096320.0, "grad_norm": 15.110911951521603, "language_loss": 0.75469685, "learning_rate": 2.733909277895868e-06, "loss": 0.77137709, "num_input_tokens_seen": 142521870, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.28137207, "step": 6636, "time_per_iteration": 2.70432448387146 }, { "auxiliary_loss_clip": 0.01366071, "auxiliary_loss_mlp": 0.00309814, "balance_loss_clip": 1.11778951, "balance_loss_mlp": 0.28048861, "epoch": 0.39903802795731247, "flos": 18077216688000.0, "grad_norm": 7.517464780227158, "language_loss": 0.88892484, "learning_rate": 2.733546971601763e-06, "loss": 0.90568376, "num_input_tokens_seen": 142540455, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.29321289, "step": 6637, "time_per_iteration": 2.600917100906372 }, { "auxiliary_loss_clip": 0.01295164, "auxiliary_loss_mlp": 0.00106355, "balance_loss_clip": 1.11816454, "balance_loss_mlp": 0.09739094, "epoch": 0.39909815120998043, "flos": 70441367771520.0, "grad_norm": 0.7053648572337116, "language_loss": 0.53017914, "learning_rate": 2.733184637491484e-06, "loss": 0.54419434, "num_input_tokens_seen": 142599665, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.08984375, "step": 6638, "time_per_iteration": 3.20635986328125 }, { "auxiliary_loss_clip": 0.01375218, "auxiliary_loss_mlp": 0.00319186, "balance_loss_clip": 1.12449133, "balance_loss_mlp": 0.28771514, "epoch": 0.39915827446264845, "flos": 18549262247040.0, "grad_norm": 47.8180856261843, "language_loss": 0.81303912, "learning_rate": 2.732822275578769e-06, "loss": 0.82998317, "num_input_tokens_seen": 142618845, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.31469727, "step": 6639, "time_per_iteration": 2.665512800216675 }, { "auxiliary_loss_clip": 0.01385418, "auxiliary_loss_mlp": 0.00312752, "balance_loss_clip": 1.13449609, "balance_loss_mlp": 0.28151876, "epoch": 0.3992183977153164, "flos": 29897249195520.0, "grad_norm": 5.190464538460886, "language_loss": 0.82404244, "learning_rate": 2.7324598858773603e-06, "loss": 0.84102404, "num_input_tokens_seen": 142640885, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.31176758, "step": 6640, "time_per_iteration": 2.809905529022217 }, { "auxiliary_loss_clip": 0.01376402, "auxiliary_loss_mlp": 0.00312988, "balance_loss_clip": 1.1237061, "balance_loss_mlp": 0.28356695, "epoch": 0.3992785209679844, "flos": 22565080736640.0, "grad_norm": 11.915435124975641, "language_loss": 0.89323068, "learning_rate": 2.7320974684009996e-06, "loss": 0.9101246, "num_input_tokens_seen": 142659340, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.29418945, "step": 6641, "time_per_iteration": 2.6601693630218506 }, { "auxiliary_loss_clip": 0.01398189, "auxiliary_loss_mlp": 0.00316749, "balance_loss_clip": 1.14343357, "balance_loss_mlp": 0.28664857, "epoch": 0.39933864422065235, "flos": 19682674974720.0, "grad_norm": 6.191067955207903, "language_loss": 0.91691715, "learning_rate": 2.7317350231634288e-06, "loss": 0.93406653, "num_input_tokens_seen": 142677085, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.30078125, "step": 6642, "time_per_iteration": 2.6519598960876465 }, { "auxiliary_loss_clip": 0.0140225, "auxiliary_loss_mlp": 0.00331697, "balance_loss_clip": 1.14379168, "balance_loss_mlp": 0.30017799, "epoch": 0.3993987674733203, "flos": 23038491012480.0, "grad_norm": 22.886459462787073, "language_loss": 0.81036353, "learning_rate": 2.731372550178393e-06, "loss": 0.827703, "num_input_tokens_seen": 142694595, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.31518555, "step": 6643, "time_per_iteration": 2.650790214538574 }, { "auxiliary_loss_clip": 0.01405133, "auxiliary_loss_mlp": 0.00331413, "balance_loss_clip": 1.148826, "balance_loss_mlp": 0.29927409, "epoch": 0.3994588907259883, "flos": 19390828970880.0, "grad_norm": 3.3518122566900606, "language_loss": 0.72937787, "learning_rate": 2.7310100494596375e-06, "loss": 0.74674332, "num_input_tokens_seen": 142714175, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.3215332, "step": 6644, "time_per_iteration": 2.674323320388794 }, { "auxiliary_loss_clip": 0.01395466, "auxiliary_loss_mlp": 0.00325959, "balance_loss_clip": 1.14032328, "balance_loss_mlp": 0.29399937, "epoch": 0.39951901397865625, "flos": 13734395758080.0, "grad_norm": 13.74551547148261, "language_loss": 0.85918379, "learning_rate": 2.730647521020907e-06, "loss": 0.87639797, "num_input_tokens_seen": 142730955, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.31945801, "step": 6645, "time_per_iteration": 4.142471075057983 }, { "auxiliary_loss_clip": 0.01403043, "auxiliary_loss_mlp": 0.003398, "balance_loss_clip": 1.14820182, "balance_loss_mlp": 0.30849546, "epoch": 0.3995791372313242, "flos": 23586451966080.0, "grad_norm": 3.62687885603037, "language_loss": 0.76616895, "learning_rate": 2.73028496487595e-06, "loss": 0.78359735, "num_input_tokens_seen": 142751200, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.31323242, "step": 6646, "time_per_iteration": 2.6528072357177734 }, { "auxiliary_loss_clip": 0.01410247, "auxiliary_loss_mlp": 0.00320419, "balance_loss_clip": 1.15130424, "balance_loss_mlp": 0.28825676, "epoch": 0.3996392604839922, "flos": 21355896268800.0, "grad_norm": 154.03353386673538, "language_loss": 0.78757524, "learning_rate": 2.729922381038513e-06, "loss": 0.80488193, "num_input_tokens_seen": 142770170, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.32177734, "step": 6647, "time_per_iteration": 4.121215581893921 }, { "auxiliary_loss_clip": 0.01403891, "auxiliary_loss_mlp": 0.00327951, "balance_loss_clip": 1.15548003, "balance_loss_mlp": 0.29798174, "epoch": 0.39969938373666014, "flos": 26032255914240.0, "grad_norm": 18.175798040939558, "language_loss": 0.80103505, "learning_rate": 2.7295597695223463e-06, "loss": 0.81835347, "num_input_tokens_seen": 142792680, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.29980469, "step": 6648, "time_per_iteration": 2.722133159637451 }, { "auxiliary_loss_clip": 0.01418745, "auxiliary_loss_mlp": 0.00329834, "balance_loss_clip": 1.16373694, "balance_loss_mlp": 0.30094942, "epoch": 0.3997595069893281, "flos": 20116367786880.0, "grad_norm": 34.835189834158186, "language_loss": 0.71308076, "learning_rate": 2.7291971303412006e-06, "loss": 0.7305665, "num_input_tokens_seen": 142810510, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.28894043, "step": 6649, "time_per_iteration": 2.637627124786377 }, { "auxiliary_loss_clip": 0.01414264, "auxiliary_loss_mlp": 0.00365837, "balance_loss_clip": 1.16130447, "balance_loss_mlp": 0.33365071, "epoch": 0.39981963024199607, "flos": 27783403764480.0, "grad_norm": 49.179026632357356, "language_loss": 0.83642936, "learning_rate": 2.728834463508826e-06, "loss": 0.85423034, "num_input_tokens_seen": 142832455, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.32202148, "step": 6650, "time_per_iteration": 4.136972665786743 }, { "auxiliary_loss_clip": 0.01414165, "auxiliary_loss_mlp": 0.00333303, "balance_loss_clip": 1.1599586, "balance_loss_mlp": 0.30345285, "epoch": 0.39987975349466404, "flos": 21944436612480.0, "grad_norm": 21.414485971490652, "language_loss": 0.77655619, "learning_rate": 2.728471769038975e-06, "loss": 0.79403085, "num_input_tokens_seen": 142852590, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.29858398, "step": 6651, "time_per_iteration": 2.674011468887329 }, { "auxiliary_loss_clip": 0.01427629, "auxiliary_loss_mlp": 0.00393604, "balance_loss_clip": 1.16704762, "balance_loss_mlp": 0.35929531, "epoch": 0.39993987674733206, "flos": 20704405340160.0, "grad_norm": 284.23651455035167, "language_loss": 0.81215632, "learning_rate": 2.728109046945403e-06, "loss": 0.83036864, "num_input_tokens_seen": 142870595, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.34301758, "step": 6652, "time_per_iteration": 2.696906566619873 }, { "auxiliary_loss_clip": 0.01350404, "auxiliary_loss_mlp": 0.00147955, "balance_loss_clip": 1.17307448, "balance_loss_mlp": 0.13746475, "epoch": 0.4, "flos": 61525429862400.0, "grad_norm": 0.8600863865416475, "language_loss": 0.59982121, "learning_rate": 2.727746297241862e-06, "loss": 0.61480474, "num_input_tokens_seen": 142925805, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.10498047, "step": 6653, "time_per_iteration": 3.049694061279297 }, { "auxiliary_loss_clip": 0.01419922, "auxiliary_loss_mlp": 0.00300915, "balance_loss_clip": 1.17410111, "balance_loss_mlp": 0.27309182, "epoch": 0.400060123252668, "flos": 14502309644160.0, "grad_norm": 114.96780173480907, "language_loss": 0.74397117, "learning_rate": 2.7273835199421085e-06, "loss": 0.76117957, "num_input_tokens_seen": 142943145, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.27856445, "step": 6654, "time_per_iteration": 2.674602508544922 }, { "auxiliary_loss_clip": 0.01411783, "auxiliary_loss_mlp": 0.00370713, "balance_loss_clip": 1.1609714, "balance_loss_mlp": 0.3370955, "epoch": 0.40012024650533595, "flos": 19093308618240.0, "grad_norm": 11.9482588240532, "language_loss": 0.96239978, "learning_rate": 2.7270207150599e-06, "loss": 0.98022473, "num_input_tokens_seen": 142956925, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.33569336, "step": 6655, "time_per_iteration": 2.6430020332336426 }, { "auxiliary_loss_clip": 0.0140492, "auxiliary_loss_mlp": 0.00324711, "balance_loss_clip": 1.15845597, "balance_loss_mlp": 0.29526627, "epoch": 0.4001803697580039, "flos": 29351012094720.0, "grad_norm": 28.430699216117382, "language_loss": 0.7774235, "learning_rate": 2.7266578826089917e-06, "loss": 0.79471982, "num_input_tokens_seen": 142978040, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.29443359, "step": 6656, "time_per_iteration": 2.696978807449341 }, { "auxiliary_loss_clip": 0.01424392, "auxiliary_loss_mlp": 0.00334661, "balance_loss_clip": 1.16948462, "balance_loss_mlp": 0.30271333, "epoch": 0.4002404930106719, "flos": 20920048640640.0, "grad_norm": 6.892117674552533, "language_loss": 0.79036832, "learning_rate": 2.726295022603144e-06, "loss": 0.80795884, "num_input_tokens_seen": 142998390, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.31933594, "step": 6657, "time_per_iteration": 2.8217923641204834 }, { "auxiliary_loss_clip": 0.01435378, "auxiliary_loss_mlp": 0.00329581, "balance_loss_clip": 1.17848647, "balance_loss_mlp": 0.29560632, "epoch": 0.40030061626333985, "flos": 28405735827840.0, "grad_norm": 2.2639142427127723, "language_loss": 0.85588044, "learning_rate": 2.725932135056117e-06, "loss": 0.87353003, "num_input_tokens_seen": 143021505, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.33959961, "step": 6658, "time_per_iteration": 2.8139142990112305 }, { "auxiliary_loss_clip": 0.0142651, "auxiliary_loss_mlp": 0.00311404, "balance_loss_clip": 1.17005885, "balance_loss_mlp": 0.28126782, "epoch": 0.4003607395160078, "flos": 25921615046400.0, "grad_norm": 149.36320769837982, "language_loss": 0.83419865, "learning_rate": 2.72556921998167e-06, "loss": 0.85157776, "num_input_tokens_seen": 143041375, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.30151367, "step": 6659, "time_per_iteration": 2.799391031265259 }, { "auxiliary_loss_clip": 0.01395655, "auxiliary_loss_mlp": 0.00321009, "balance_loss_clip": 1.1496253, "balance_loss_mlp": 0.29073024, "epoch": 0.4004208627686758, "flos": 20768648814720.0, "grad_norm": 265.2062231912479, "language_loss": 0.78212911, "learning_rate": 2.7252062773935662e-06, "loss": 0.79929578, "num_input_tokens_seen": 143058725, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.30297852, "step": 6660, "time_per_iteration": 2.662999391555786 }, { "auxiliary_loss_clip": 0.01420219, "auxiliary_loss_mlp": 0.00319355, "balance_loss_clip": 1.16595411, "balance_loss_mlp": 0.2887181, "epoch": 0.40048098602134374, "flos": 24681224638080.0, "grad_norm": 45.82129070569097, "language_loss": 0.76853389, "learning_rate": 2.7248433073055674e-06, "loss": 0.78592962, "num_input_tokens_seen": 143076995, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.30664062, "step": 6661, "time_per_iteration": 2.6707546710968018 }, { "auxiliary_loss_clip": 0.01426213, "auxiliary_loss_mlp": 0.00327603, "balance_loss_clip": 1.16617596, "balance_loss_mlp": 0.29665661, "epoch": 0.4005411092740117, "flos": 23185688947200.0, "grad_norm": 7.733232329230433, "language_loss": 0.81258756, "learning_rate": 2.724480309731437e-06, "loss": 0.83012569, "num_input_tokens_seen": 143096780, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.30957031, "step": 6662, "time_per_iteration": 2.6551225185394287 }, { "auxiliary_loss_clip": 0.01423402, "auxiliary_loss_mlp": 0.00296868, "balance_loss_clip": 1.16408062, "balance_loss_mlp": 0.26701784, "epoch": 0.4006012325266797, "flos": 17522324409600.0, "grad_norm": 11.204425431374172, "language_loss": 0.73853654, "learning_rate": 2.7241172846849417e-06, "loss": 0.75573921, "num_input_tokens_seen": 143112590, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.29858398, "step": 6663, "time_per_iteration": 2.806070566177368 }, { "auxiliary_loss_clip": 0.01426618, "auxiliary_loss_mlp": 0.00334622, "balance_loss_clip": 1.17014968, "balance_loss_mlp": 0.30233964, "epoch": 0.40066135577934764, "flos": 19857200181120.0, "grad_norm": 33.4272767852, "language_loss": 0.9389137, "learning_rate": 2.7237542321798455e-06, "loss": 0.95652616, "num_input_tokens_seen": 143130220, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.32299805, "step": 6664, "time_per_iteration": 2.746793746948242 }, { "auxiliary_loss_clip": 0.01437669, "auxiliary_loss_mlp": 0.00339765, "balance_loss_clip": 1.17356253, "balance_loss_mlp": 0.3081508, "epoch": 0.40072147903201566, "flos": 18150007599360.0, "grad_norm": 6.2736858869562075, "language_loss": 0.91643441, "learning_rate": 2.723391152229917e-06, "loss": 0.93420875, "num_input_tokens_seen": 143147160, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.31640625, "step": 6665, "time_per_iteration": 2.7651543617248535 }, { "auxiliary_loss_clip": 0.01445403, "auxiliary_loss_mlp": 0.00327571, "balance_loss_clip": 1.1842494, "balance_loss_mlp": 0.29474038, "epoch": 0.4007816022846836, "flos": 18661267831680.0, "grad_norm": 29.68208446402844, "language_loss": 0.83507073, "learning_rate": 2.7230280448489236e-06, "loss": 0.85280049, "num_input_tokens_seen": 143164605, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.32836914, "step": 6666, "time_per_iteration": 2.689385414123535 }, { "auxiliary_loss_clip": 0.01446935, "auxiliary_loss_mlp": 0.0034218, "balance_loss_clip": 1.18698883, "balance_loss_mlp": 0.31047022, "epoch": 0.4008417255373516, "flos": 25703170485120.0, "grad_norm": 4316.617313332409, "language_loss": 0.82483011, "learning_rate": 2.7226649100506333e-06, "loss": 0.84272128, "num_input_tokens_seen": 143183965, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.31689453, "step": 6667, "time_per_iteration": 2.6882448196411133 }, { "auxiliary_loss_clip": 0.0143969, "auxiliary_loss_mlp": 0.00344426, "balance_loss_clip": 1.17990434, "balance_loss_mlp": 0.31440958, "epoch": 0.40090184879001955, "flos": 22858614679680.0, "grad_norm": 4.373300696033577, "language_loss": 0.82265091, "learning_rate": 2.7223017478488183e-06, "loss": 0.84049207, "num_input_tokens_seen": 143204965, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.30004883, "step": 6668, "time_per_iteration": 2.700854778289795 }, { "auxiliary_loss_clip": 0.01442209, "auxiliary_loss_mlp": 0.00303681, "balance_loss_clip": 1.18458986, "balance_loss_mlp": 0.27652559, "epoch": 0.4009619720426875, "flos": 29059848449280.0, "grad_norm": 10.204492118611837, "language_loss": 0.90986276, "learning_rate": 2.721938558257248e-06, "loss": 0.92732167, "num_input_tokens_seen": 143225015, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.27148438, "step": 6669, "time_per_iteration": 2.7382924556732178 }, { "auxiliary_loss_clip": 0.01411253, "auxiliary_loss_mlp": 0.00213896, "balance_loss_clip": 1.22700298, "balance_loss_mlp": 0.20092554, "epoch": 0.4010220952953555, "flos": 66059763131520.0, "grad_norm": 0.7066692509830752, "language_loss": 0.53076714, "learning_rate": 2.721575341289695e-06, "loss": 0.54701865, "num_input_tokens_seen": 143294925, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.12988281, "step": 6670, "time_per_iteration": 3.362952709197998 }, { "auxiliary_loss_clip": 0.01450848, "auxiliary_loss_mlp": 0.00369222, "balance_loss_clip": 1.19475329, "balance_loss_mlp": 0.33667746, "epoch": 0.40108221854802345, "flos": 29642822184960.0, "grad_norm": 3.7389537580788934, "language_loss": 0.93595088, "learning_rate": 2.7212120969599333e-06, "loss": 0.95415151, "num_input_tokens_seen": 143314170, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.32543945, "step": 6671, "time_per_iteration": 2.738396644592285 }, { "auxiliary_loss_clip": 0.01453968, "auxiliary_loss_mlp": 0.00375575, "balance_loss_clip": 1.18824339, "balance_loss_mlp": 0.34174341, "epoch": 0.4011423418006914, "flos": 19929560129280.0, "grad_norm": 4.275056493317617, "language_loss": 0.84315956, "learning_rate": 2.720848825281736e-06, "loss": 0.86145496, "num_input_tokens_seen": 143330050, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.33862305, "step": 6672, "time_per_iteration": 2.6326746940612793 }, { "auxiliary_loss_clip": 0.01450859, "auxiliary_loss_mlp": 0.00391226, "balance_loss_clip": 1.18891537, "balance_loss_mlp": 0.35720408, "epoch": 0.4012024650533594, "flos": 20084299920000.0, "grad_norm": 10.646086222082266, "language_loss": 0.73296821, "learning_rate": 2.72048552626888e-06, "loss": 0.75138903, "num_input_tokens_seen": 143348650, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.34033203, "step": 6673, "time_per_iteration": 2.6665260791778564 }, { "auxiliary_loss_clip": 0.01454296, "auxiliary_loss_mlp": 0.0035133, "balance_loss_clip": 1.18906081, "balance_loss_mlp": 0.32124203, "epoch": 0.40126258830602735, "flos": 21695719864320.0, "grad_norm": 14.9682207660407, "language_loss": 0.85422671, "learning_rate": 2.7201221999351402e-06, "loss": 0.87228292, "num_input_tokens_seen": 143370275, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.30041504, "step": 6674, "time_per_iteration": 2.7273263931274414 }, { "auxiliary_loss_clip": 0.01461739, "auxiliary_loss_mlp": 0.0036171, "balance_loss_clip": 1.19502616, "balance_loss_mlp": 0.32668591, "epoch": 0.4013227115586953, "flos": 12020379592320.0, "grad_norm": 48.78687381316823, "language_loss": 0.9236744, "learning_rate": 2.719758846294294e-06, "loss": 0.94190896, "num_input_tokens_seen": 143385390, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.35009766, "step": 6675, "time_per_iteration": 2.6525962352752686 }, { "auxiliary_loss_clip": 0.01460111, "auxiliary_loss_mlp": 0.0037383, "balance_loss_clip": 1.19473743, "balance_loss_mlp": 0.33711314, "epoch": 0.4013828348113633, "flos": 25447522412160.0, "grad_norm": 533.4624324695498, "language_loss": 0.99072212, "learning_rate": 2.71939546536012e-06, "loss": 1.00906146, "num_input_tokens_seen": 143404215, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.36694336, "step": 6676, "time_per_iteration": 2.699455976486206 }, { "auxiliary_loss_clip": 0.01455672, "auxiliary_loss_mlp": 0.00375843, "balance_loss_clip": 1.18951321, "balance_loss_mlp": 0.34296465, "epoch": 0.40144295806403124, "flos": 18582946225920.0, "grad_norm": 40.12522483833791, "language_loss": 0.87387913, "learning_rate": 2.719032057146399e-06, "loss": 0.89219421, "num_input_tokens_seen": 143422245, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.32861328, "step": 6677, "time_per_iteration": 2.649977684020996 }, { "auxiliary_loss_clip": 0.01464133, "auxiliary_loss_mlp": 0.00368141, "balance_loss_clip": 1.20064533, "balance_loss_mlp": 0.33552498, "epoch": 0.4015030813166992, "flos": 22930220442240.0, "grad_norm": 12.139587983986777, "language_loss": 0.91062999, "learning_rate": 2.71866862166691e-06, "loss": 0.92895269, "num_input_tokens_seen": 143443130, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.32617188, "step": 6678, "time_per_iteration": 2.6780178546905518 }, { "auxiliary_loss_clip": 0.01459226, "auxiliary_loss_mlp": 0.00379906, "balance_loss_clip": 1.19568157, "balance_loss_mlp": 0.3462415, "epoch": 0.4015632045693672, "flos": 20595057361920.0, "grad_norm": 16.082019177064296, "language_loss": 0.71167862, "learning_rate": 2.718305158935434e-06, "loss": 0.73006994, "num_input_tokens_seen": 143461385, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.33691406, "step": 6679, "time_per_iteration": 2.700611114501953 }, { "auxiliary_loss_clip": 0.01448398, "auxiliary_loss_mlp": 0.00386181, "balance_loss_clip": 1.18775129, "balance_loss_mlp": 0.3522298, "epoch": 0.4016233278220352, "flos": 23438930808960.0, "grad_norm": 3.4443152844304032, "language_loss": 0.83007312, "learning_rate": 2.7179416689657554e-06, "loss": 0.84841889, "num_input_tokens_seen": 143481750, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.33959961, "step": 6680, "time_per_iteration": 2.795255661010742 }, { "auxiliary_loss_clip": 0.01468904, "auxiliary_loss_mlp": 0.00445482, "balance_loss_clip": 1.19747734, "balance_loss_mlp": 0.40921855, "epoch": 0.40168345107470316, "flos": 21431057477760.0, "grad_norm": 7086.790220004393, "language_loss": 0.8086884, "learning_rate": 2.7175781517716556e-06, "loss": 0.82783222, "num_input_tokens_seen": 143501540, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.36230469, "step": 6681, "time_per_iteration": 2.7298166751861572 }, { "auxiliary_loss_clip": 0.014768, "auxiliary_loss_mlp": 0.00395029, "balance_loss_clip": 1.20774913, "balance_loss_mlp": 0.36176962, "epoch": 0.4017435743273711, "flos": 22857214049280.0, "grad_norm": 3.2897154929116974, "language_loss": 0.69885671, "learning_rate": 2.7172146073669213e-06, "loss": 0.71757495, "num_input_tokens_seen": 143520530, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.33251953, "step": 6682, "time_per_iteration": 2.684739828109741 }, { "auxiliary_loss_clip": 0.01457244, "auxiliary_loss_mlp": 0.00398262, "balance_loss_clip": 1.18862021, "balance_loss_mlp": 0.36156917, "epoch": 0.4018036975800391, "flos": 28622312881920.0, "grad_norm": 16.645088347547606, "language_loss": 0.79805189, "learning_rate": 2.716851035765337e-06, "loss": 0.816607, "num_input_tokens_seen": 143540210, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.36694336, "step": 6683, "time_per_iteration": 2.752300500869751 }, { "auxiliary_loss_clip": 0.0146536, "auxiliary_loss_mlp": 0.00369742, "balance_loss_clip": 1.20181322, "balance_loss_mlp": 0.33650681, "epoch": 0.40186382083270705, "flos": 26651212099200.0, "grad_norm": 4.66712898342189, "language_loss": 0.79784226, "learning_rate": 2.7164874369806896e-06, "loss": 0.81619328, "num_input_tokens_seen": 143560940, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.33251953, "step": 6684, "time_per_iteration": 2.7237842082977295 }, { "auxiliary_loss_clip": 0.01429243, "auxiliary_loss_mlp": 0.00167097, "balance_loss_clip": 1.24041212, "balance_loss_mlp": 0.15632053, "epoch": 0.401923944085375, "flos": 59259969123840.0, "grad_norm": 0.821013106554311, "language_loss": 0.6035012, "learning_rate": 2.716123811026767e-06, "loss": 0.61946464, "num_input_tokens_seen": 143624015, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.10791016, "step": 6685, "time_per_iteration": 3.268524408340454 }, { "auxiliary_loss_clip": 0.0147827, "auxiliary_loss_mlp": 0.00414803, "balance_loss_clip": 1.20789099, "balance_loss_mlp": 0.37772942, "epoch": 0.401984067338043, "flos": 16982803152000.0, "grad_norm": 5.200403744280799, "language_loss": 0.76771611, "learning_rate": 2.715760157917357e-06, "loss": 0.78664684, "num_input_tokens_seen": 143642750, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.37109375, "step": 6686, "time_per_iteration": 2.6494410037994385 }, { "auxiliary_loss_clip": 0.01470808, "auxiliary_loss_mlp": 0.004187, "balance_loss_clip": 1.20573604, "balance_loss_mlp": 0.38091075, "epoch": 0.40204419059071095, "flos": 24972496024320.0, "grad_norm": 6.422060180195437, "language_loss": 0.78713012, "learning_rate": 2.7153964776662504e-06, "loss": 0.80602521, "num_input_tokens_seen": 143664515, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.37792969, "step": 6687, "time_per_iteration": 4.083427429199219 }, { "auxiliary_loss_clip": 0.01491521, "auxiliary_loss_mlp": 0.00370042, "balance_loss_clip": 1.22193193, "balance_loss_mlp": 0.33780771, "epoch": 0.4021043138433789, "flos": 23477463123840.0, "grad_norm": 10.961693552894461, "language_loss": 0.78154343, "learning_rate": 2.7150327702872385e-06, "loss": 0.8001591, "num_input_tokens_seen": 143683135, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.32250977, "step": 6688, "time_per_iteration": 2.6573517322540283 }, { "auxiliary_loss_clip": 0.01489211, "auxiliary_loss_mlp": 0.00404849, "balance_loss_clip": 1.21692395, "balance_loss_mlp": 0.37006402, "epoch": 0.4021644370960469, "flos": 25995806588160.0, "grad_norm": 76.27810719981487, "language_loss": 0.71521473, "learning_rate": 2.7146690357941112e-06, "loss": 0.7341553, "num_input_tokens_seen": 143703985, "router_z_loss_clip": 2.72265625, "router_z_loss_mlp": 0.34814453, "step": 6689, "time_per_iteration": 4.141737937927246 }, { "auxiliary_loss_clip": 0.01471368, "auxiliary_loss_mlp": 0.00386521, "balance_loss_clip": 1.20389831, "balance_loss_mlp": 0.35276085, "epoch": 0.40222456034871484, "flos": 13587987922560.0, "grad_norm": 2.4827325937360496, "language_loss": 0.8363809, "learning_rate": 2.7143052742006632e-06, "loss": 0.85495973, "num_input_tokens_seen": 143719245, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.33764648, "step": 6690, "time_per_iteration": 2.619150400161743 }, { "auxiliary_loss_clip": 0.0146848, "auxiliary_loss_mlp": 0.00390267, "balance_loss_clip": 1.20108652, "balance_loss_mlp": 0.35786611, "epoch": 0.4022846836013828, "flos": 24278019494400.0, "grad_norm": 5.828977487423062, "language_loss": 0.79029441, "learning_rate": 2.7139414855206872e-06, "loss": 0.80888194, "num_input_tokens_seen": 143739575, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.32373047, "step": 6691, "time_per_iteration": 2.896336078643799 }, { "auxiliary_loss_clip": 0.01493335, "auxiliary_loss_mlp": 0.00425754, "balance_loss_clip": 1.22038972, "balance_loss_mlp": 0.38908547, "epoch": 0.40234480685405083, "flos": 20151596050560.0, "grad_norm": 342.3760096461946, "language_loss": 0.79363972, "learning_rate": 2.7135776697679785e-06, "loss": 0.81283063, "num_input_tokens_seen": 143758515, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.36694336, "step": 6692, "time_per_iteration": 4.122026205062866 }, { "auxiliary_loss_clip": 0.01484326, "auxiliary_loss_mlp": 0.00373456, "balance_loss_clip": 1.21582806, "balance_loss_mlp": 0.3408401, "epoch": 0.4024049301067188, "flos": 22930220442240.0, "grad_norm": 9.009617884570192, "language_loss": 0.89231551, "learning_rate": 2.7132138269563333e-06, "loss": 0.91089332, "num_input_tokens_seen": 143776770, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.32617188, "step": 6693, "time_per_iteration": 2.7413792610168457 }, { "auxiliary_loss_clip": 0.01463214, "auxiliary_loss_mlp": 0.00394513, "balance_loss_clip": 1.19876027, "balance_loss_mlp": 0.36044317, "epoch": 0.40246505335938676, "flos": 36028421487360.0, "grad_norm": 6.9012941077829, "language_loss": 0.76762056, "learning_rate": 2.7128499570995483e-06, "loss": 0.7861979, "num_input_tokens_seen": 143798450, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.34082031, "step": 6694, "time_per_iteration": 2.800140380859375 }, { "auxiliary_loss_clip": 0.01466753, "auxiliary_loss_mlp": 0.00422609, "balance_loss_clip": 1.19825959, "balance_loss_mlp": 0.38455772, "epoch": 0.4025251766120547, "flos": 20594303176320.0, "grad_norm": 125.93784531855822, "language_loss": 0.74607491, "learning_rate": 2.7124860602114212e-06, "loss": 0.76496851, "num_input_tokens_seen": 143816995, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.38061523, "step": 6695, "time_per_iteration": 2.7324469089508057 }, { "auxiliary_loss_clip": 0.0149442, "auxiliary_loss_mlp": 0.00389331, "balance_loss_clip": 1.22594237, "balance_loss_mlp": 0.35521311, "epoch": 0.4025852998647227, "flos": 64523932381440.0, "grad_norm": 21.377019976099415, "language_loss": 0.84527493, "learning_rate": 2.7121221363057515e-06, "loss": 0.86411238, "num_input_tokens_seen": 143842090, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.34106445, "step": 6696, "time_per_iteration": 3.1059353351593018 }, { "auxiliary_loss_clip": 0.01508314, "auxiliary_loss_mlp": 0.00389015, "balance_loss_clip": 1.23620629, "balance_loss_mlp": 0.35454008, "epoch": 0.40264542311739066, "flos": 20886292834560.0, "grad_norm": 9.678502759484006, "language_loss": 0.78572112, "learning_rate": 2.7117581853963393e-06, "loss": 0.80469441, "num_input_tokens_seen": 143860800, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.34472656, "step": 6697, "time_per_iteration": 2.677260398864746 }, { "auxiliary_loss_clip": 0.01491943, "auxiliary_loss_mlp": 0.00380919, "balance_loss_clip": 1.22416735, "balance_loss_mlp": 0.34898257, "epoch": 0.4027055463700586, "flos": 26250197685120.0, "grad_norm": 52.99144119650553, "language_loss": 0.69154251, "learning_rate": 2.711394207496984e-06, "loss": 0.71027118, "num_input_tokens_seen": 143878950, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.31933594, "step": 6698, "time_per_iteration": 4.08188533782959 }, { "auxiliary_loss_clip": 0.01473172, "auxiliary_loss_mlp": 0.00372046, "balance_loss_clip": 1.2021749, "balance_loss_mlp": 0.33747542, "epoch": 0.4027656696227266, "flos": 20631398947200.0, "grad_norm": 47.030044969251925, "language_loss": 0.83571857, "learning_rate": 2.711030202621491e-06, "loss": 0.8541708, "num_input_tokens_seen": 143898385, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.34594727, "step": 6699, "time_per_iteration": 2.689856767654419 }, { "auxiliary_loss_clip": 0.01470235, "auxiliary_loss_mlp": 0.00375643, "balance_loss_clip": 1.20642424, "balance_loss_mlp": 0.34276491, "epoch": 0.40282579287539455, "flos": 22346277039360.0, "grad_norm": 4.6228051963933865, "language_loss": 0.86069524, "learning_rate": 2.7106661707836605e-06, "loss": 0.87915409, "num_input_tokens_seen": 143918795, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.32885742, "step": 6700, "time_per_iteration": 2.695662260055542 }, { "auxiliary_loss_clip": 0.01494606, "auxiliary_loss_mlp": 0.00397701, "balance_loss_clip": 1.22229147, "balance_loss_mlp": 0.36236775, "epoch": 0.4028859161280625, "flos": 29274988959360.0, "grad_norm": 8.326410180345835, "language_loss": 0.8067289, "learning_rate": 2.7103021119972977e-06, "loss": 0.825652, "num_input_tokens_seen": 143938245, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.35302734, "step": 6701, "time_per_iteration": 2.769742488861084 }, { "auxiliary_loss_clip": 0.01480766, "auxiliary_loss_mlp": 0.0039235, "balance_loss_clip": 1.21656156, "balance_loss_mlp": 0.35813719, "epoch": 0.4029460393807305, "flos": 28622312881920.0, "grad_norm": 28.770585010787787, "language_loss": 0.72181368, "learning_rate": 2.709938026276208e-06, "loss": 0.7405448, "num_input_tokens_seen": 143960995, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.34179688, "step": 6702, "time_per_iteration": 2.719691038131714 }, { "auxiliary_loss_clip": 0.01474534, "auxiliary_loss_mlp": 0.00419183, "balance_loss_clip": 1.20574117, "balance_loss_mlp": 0.38187078, "epoch": 0.40300616263339845, "flos": 22601925112320.0, "grad_norm": 5.35777577246246, "language_loss": 0.73481929, "learning_rate": 2.7095739136341964e-06, "loss": 0.7537564, "num_input_tokens_seen": 143979910, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.37329102, "step": 6703, "time_per_iteration": 2.6427388191223145 }, { "auxiliary_loss_clip": 0.01481572, "auxiliary_loss_mlp": 0.00380648, "balance_loss_clip": 1.21657455, "balance_loss_mlp": 0.34648287, "epoch": 0.4030662858860664, "flos": 25520313323520.0, "grad_norm": 3.6364737316512596, "language_loss": 0.89118028, "learning_rate": 2.709209774085071e-06, "loss": 0.90980244, "num_input_tokens_seen": 144000095, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.34179688, "step": 6704, "time_per_iteration": 2.6714224815368652 }, { "auxiliary_loss_clip": 0.01484946, "auxiliary_loss_mlp": 0.00393896, "balance_loss_clip": 1.21625876, "balance_loss_mlp": 0.35806212, "epoch": 0.40312640913873443, "flos": 23586703361280.0, "grad_norm": 38.40954931359415, "language_loss": 0.79901922, "learning_rate": 2.7088456076426407e-06, "loss": 0.81780767, "num_input_tokens_seen": 144019695, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.3581543, "step": 6705, "time_per_iteration": 2.6568474769592285 }, { "auxiliary_loss_clip": 0.01474063, "auxiliary_loss_mlp": 0.00351898, "balance_loss_clip": 1.21252418, "balance_loss_mlp": 0.32296601, "epoch": 0.4031865323914024, "flos": 20011042131840.0, "grad_norm": 8.282837421762546, "language_loss": 0.74519861, "learning_rate": 2.708481414320713e-06, "loss": 0.76345813, "num_input_tokens_seen": 144038525, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.28942871, "step": 6706, "time_per_iteration": 2.6590330600738525 }, { "auxiliary_loss_clip": 0.01472385, "auxiliary_loss_mlp": 0.0034447, "balance_loss_clip": 1.20882738, "balance_loss_mlp": 0.3121163, "epoch": 0.40324665564407036, "flos": 21871430219520.0, "grad_norm": 12.727149188840825, "language_loss": 0.77210832, "learning_rate": 2.7081171941330992e-06, "loss": 0.79027689, "num_input_tokens_seen": 144059485, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.32324219, "step": 6707, "time_per_iteration": 2.693007230758667 }, { "auxiliary_loss_clip": 0.01462576, "auxiliary_loss_mlp": 0.00380152, "balance_loss_clip": 1.20304823, "balance_loss_mlp": 0.34567702, "epoch": 0.4033067788967383, "flos": 23878728933120.0, "grad_norm": 5.838110902743076, "language_loss": 0.84703636, "learning_rate": 2.707752947093611e-06, "loss": 0.86546361, "num_input_tokens_seen": 144080265, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.34472656, "step": 6708, "time_per_iteration": 2.777224063873291 }, { "auxiliary_loss_clip": 0.01472782, "auxiliary_loss_mlp": 0.00377603, "balance_loss_clip": 1.20186353, "balance_loss_mlp": 0.34377131, "epoch": 0.4033669021494063, "flos": 17419907756160.0, "grad_norm": 14.545672426968789, "language_loss": 0.91280609, "learning_rate": 2.70738867321606e-06, "loss": 0.93130994, "num_input_tokens_seen": 144098040, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.33813477, "step": 6709, "time_per_iteration": 2.6831109523773193 }, { "auxiliary_loss_clip": 0.01477374, "auxiliary_loss_mlp": 0.00375543, "balance_loss_clip": 1.20828676, "balance_loss_mlp": 0.33808753, "epoch": 0.40342702540207426, "flos": 29600554855680.0, "grad_norm": 7.400719998605195, "language_loss": 0.76727676, "learning_rate": 2.70702437251426e-06, "loss": 0.78580594, "num_input_tokens_seen": 144118265, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.37451172, "step": 6710, "time_per_iteration": 2.707368850708008 }, { "auxiliary_loss_clip": 0.01458482, "auxiliary_loss_mlp": 0.00369178, "balance_loss_clip": 1.19946265, "balance_loss_mlp": 0.3368485, "epoch": 0.4034871486547422, "flos": 11284605400320.0, "grad_norm": 96.61545440226658, "language_loss": 0.91979402, "learning_rate": 2.7066600450020236e-06, "loss": 0.9380706, "num_input_tokens_seen": 144133865, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.32324219, "step": 6711, "time_per_iteration": 2.731245517730713 }, { "auxiliary_loss_clip": 0.01466584, "auxiliary_loss_mlp": 0.00388444, "balance_loss_clip": 1.20214367, "balance_loss_mlp": 0.35520822, "epoch": 0.4035472719074102, "flos": 15552839738880.0, "grad_norm": 8.704247306731228, "language_loss": 0.8663286, "learning_rate": 2.706295690693168e-06, "loss": 0.88487893, "num_input_tokens_seen": 144150125, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.33251953, "step": 6712, "time_per_iteration": 2.667625904083252 }, { "auxiliary_loss_clip": 0.01464676, "auxiliary_loss_mlp": 0.00344806, "balance_loss_clip": 1.20195246, "balance_loss_mlp": 0.3146221, "epoch": 0.40360739516007815, "flos": 24674365140480.0, "grad_norm": 8.975785919379025, "language_loss": 0.86529541, "learning_rate": 2.7059313096015096e-06, "loss": 0.88339019, "num_input_tokens_seen": 144169295, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.30151367, "step": 6713, "time_per_iteration": 2.7855679988861084 }, { "auxiliary_loss_clip": 0.01456064, "auxiliary_loss_mlp": 0.00360148, "balance_loss_clip": 1.19131255, "balance_loss_mlp": 0.32821208, "epoch": 0.4036675184127461, "flos": 17304095329920.0, "grad_norm": 3.8201444208081656, "language_loss": 0.93352389, "learning_rate": 2.705566901740865e-06, "loss": 0.95168591, "num_input_tokens_seen": 144185790, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.31970215, "step": 6714, "time_per_iteration": 2.6568803787231445 }, { "auxiliary_loss_clip": 0.0146434, "auxiliary_loss_mlp": 0.00343136, "balance_loss_clip": 1.20074773, "balance_loss_mlp": 0.31197473, "epoch": 0.4037276416654141, "flos": 19864023765120.0, "grad_norm": 7.459855224006649, "language_loss": 0.75255251, "learning_rate": 2.7052024671250527e-06, "loss": 0.77062726, "num_input_tokens_seen": 144205190, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.31152344, "step": 6715, "time_per_iteration": 2.6723573207855225 }, { "auxiliary_loss_clip": 0.01460441, "auxiliary_loss_mlp": 0.00376124, "balance_loss_clip": 1.1933161, "balance_loss_mlp": 0.34214956, "epoch": 0.40378776491808205, "flos": 18296271780480.0, "grad_norm": 27.603476457479417, "language_loss": 0.8615436, "learning_rate": 2.704838005767892e-06, "loss": 0.87990922, "num_input_tokens_seen": 144222705, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.33959961, "step": 6716, "time_per_iteration": 2.6693716049194336 }, { "auxiliary_loss_clip": 0.01451289, "auxiliary_loss_mlp": 0.00348216, "balance_loss_clip": 1.19065261, "balance_loss_mlp": 0.31835425, "epoch": 0.40384788817075, "flos": 15049372757760.0, "grad_norm": 11.639201971165813, "language_loss": 0.84078652, "learning_rate": 2.7044735176832037e-06, "loss": 0.85878158, "num_input_tokens_seen": 144239545, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.29846191, "step": 6717, "time_per_iteration": 2.644970178604126 }, { "auxiliary_loss_clip": 0.01419245, "auxiliary_loss_mlp": 0.00171111, "balance_loss_clip": 1.20799589, "balance_loss_mlp": 0.16038249, "epoch": 0.40390801142341803, "flos": 61929927895680.0, "grad_norm": 0.9297737048799892, "language_loss": 0.60116249, "learning_rate": 2.7041090028848084e-06, "loss": 0.61706609, "num_input_tokens_seen": 144288145, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.10742188, "step": 6718, "time_per_iteration": 3.066592216491699 }, { "auxiliary_loss_clip": 0.01451009, "auxiliary_loss_mlp": 0.00346466, "balance_loss_clip": 1.18766308, "balance_loss_mlp": 0.31385064, "epoch": 0.403968134676086, "flos": 22738779930240.0, "grad_norm": 29.649458709233436, "language_loss": 0.83880913, "learning_rate": 2.7037444613865306e-06, "loss": 0.85678387, "num_input_tokens_seen": 144302315, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.32592773, "step": 6719, "time_per_iteration": 2.710273027420044 }, { "auxiliary_loss_clip": 0.0144983, "auxiliary_loss_mlp": 0.00336038, "balance_loss_clip": 1.18562055, "balance_loss_mlp": 0.30673641, "epoch": 0.40402825792875396, "flos": 19784409269760.0, "grad_norm": 41.16388494930088, "language_loss": 0.88199651, "learning_rate": 2.7033798932021906e-06, "loss": 0.89985514, "num_input_tokens_seen": 144318990, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.29296875, "step": 6720, "time_per_iteration": 2.693999767303467 }, { "auxiliary_loss_clip": 0.01460185, "auxiliary_loss_mlp": 0.00316102, "balance_loss_clip": 1.19891953, "balance_loss_mlp": 0.28625238, "epoch": 0.40408838118142193, "flos": 19609273532160.0, "grad_norm": 7.234617141246806, "language_loss": 0.83039612, "learning_rate": 2.7030152983456153e-06, "loss": 0.84815896, "num_input_tokens_seen": 144335765, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.29907227, "step": 6721, "time_per_iteration": 2.7311298847198486 }, { "auxiliary_loss_clip": 0.01448539, "auxiliary_loss_mlp": 0.00342457, "balance_loss_clip": 1.19039941, "balance_loss_mlp": 0.31291723, "epoch": 0.4041485044340899, "flos": 24426043441920.0, "grad_norm": 4.580903513682932, "language_loss": 0.79784369, "learning_rate": 2.7026506768306304e-06, "loss": 0.81575364, "num_input_tokens_seen": 144355825, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.29516602, "step": 6722, "time_per_iteration": 2.754469633102417 }, { "auxiliary_loss_clip": 0.01460391, "auxiliary_loss_mlp": 0.00351336, "balance_loss_clip": 1.19945359, "balance_loss_mlp": 0.32259476, "epoch": 0.40420862768675786, "flos": 16760192613120.0, "grad_norm": 13.966833719856666, "language_loss": 0.7400192, "learning_rate": 2.7022860286710602e-06, "loss": 0.75813651, "num_input_tokens_seen": 144374320, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.28723145, "step": 6723, "time_per_iteration": 2.6448826789855957 }, { "auxiliary_loss_clip": 0.01477293, "auxiliary_loss_mlp": 0.00375498, "balance_loss_clip": 1.21032298, "balance_loss_mlp": 0.3449443, "epoch": 0.4042687509394258, "flos": 22491571553280.0, "grad_norm": 3.5127932657150884, "language_loss": 0.79296339, "learning_rate": 2.701921353880734e-06, "loss": 0.81149137, "num_input_tokens_seen": 144394325, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.30554199, "step": 6724, "time_per_iteration": 2.729971408843994 }, { "auxiliary_loss_clip": 0.0146901, "auxiliary_loss_mlp": 0.00296254, "balance_loss_clip": 1.21154344, "balance_loss_mlp": 0.27040988, "epoch": 0.4043288741920938, "flos": 30336149479680.0, "grad_norm": 117.31355339402123, "language_loss": 0.80633658, "learning_rate": 2.7015566524734787e-06, "loss": 0.82398927, "num_input_tokens_seen": 144412765, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.25854492, "step": 6725, "time_per_iteration": 2.693812370300293 }, { "auxiliary_loss_clip": 0.01478348, "auxiliary_loss_mlp": 0.0031719, "balance_loss_clip": 1.21773958, "balance_loss_mlp": 0.28619576, "epoch": 0.40438899744476176, "flos": 46348321363200.0, "grad_norm": 56.58077520271252, "language_loss": 0.84094298, "learning_rate": 2.701191924463126e-06, "loss": 0.8588984, "num_input_tokens_seen": 144435400, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.31030273, "step": 6726, "time_per_iteration": 2.8658411502838135 }, { "auxiliary_loss_clip": 0.01476604, "auxiliary_loss_mlp": 0.003292, "balance_loss_clip": 1.21178198, "balance_loss_mlp": 0.29965967, "epoch": 0.4044491206974297, "flos": 13333524998400.0, "grad_norm": 11.340058153241813, "language_loss": 0.90064538, "learning_rate": 2.7008271698635054e-06, "loss": 0.91870338, "num_input_tokens_seen": 144452925, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.29553223, "step": 6727, "time_per_iteration": 2.633463144302368 }, { "auxiliary_loss_clip": 0.01458095, "auxiliary_loss_mlp": 0.00390146, "balance_loss_clip": 1.19538248, "balance_loss_mlp": 0.3551459, "epoch": 0.4045092439500977, "flos": 12093745121280.0, "grad_norm": 3.93880585749848, "language_loss": 0.93249273, "learning_rate": 2.700462388688447e-06, "loss": 0.95097506, "num_input_tokens_seen": 144470195, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.34960938, "step": 6728, "time_per_iteration": 2.641984224319458 }, { "auxiliary_loss_clip": 0.01499655, "auxiliary_loss_mlp": 0.0029627, "balance_loss_clip": 1.2341888, "balance_loss_mlp": 0.2668018, "epoch": 0.40456936720276565, "flos": 21179683123200.0, "grad_norm": 31.614348256228457, "language_loss": 0.89779794, "learning_rate": 2.700097580951786e-06, "loss": 0.91575724, "num_input_tokens_seen": 144490320, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.29455566, "step": 6729, "time_per_iteration": 4.080347061157227 }, { "auxiliary_loss_clip": 0.01496838, "auxiliary_loss_mlp": 0.00344103, "balance_loss_clip": 1.22956908, "balance_loss_mlp": 0.31412208, "epoch": 0.4046294904554336, "flos": 23915286000000.0, "grad_norm": 12.146428529095132, "language_loss": 0.80208987, "learning_rate": 2.6997327466673533e-06, "loss": 0.8204993, "num_input_tokens_seen": 144508990, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.29943848, "step": 6730, "time_per_iteration": 2.685918092727661 }, { "auxiliary_loss_clip": 0.01495317, "auxiliary_loss_mlp": 0.00363963, "balance_loss_clip": 1.22912633, "balance_loss_mlp": 0.33249205, "epoch": 0.4046896137081016, "flos": 38071235773440.0, "grad_norm": 7.860589373445715, "language_loss": 0.74498904, "learning_rate": 2.699367885848985e-06, "loss": 0.76358187, "num_input_tokens_seen": 144529550, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.31469727, "step": 6731, "time_per_iteration": 4.246072053909302 }, { "auxiliary_loss_clip": 0.01514146, "auxiliary_loss_mlp": 0.00319061, "balance_loss_clip": 1.24335885, "balance_loss_mlp": 0.29184508, "epoch": 0.4047497369607696, "flos": 23617262856960.0, "grad_norm": 132.27790671849317, "language_loss": 0.8154794, "learning_rate": 2.699002998510517e-06, "loss": 0.83381146, "num_input_tokens_seen": 144549310, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.27172852, "step": 6732, "time_per_iteration": 2.754749059677124 }, { "auxiliary_loss_clip": 0.01493763, "auxiliary_loss_mlp": 0.00302768, "balance_loss_clip": 1.22658563, "balance_loss_mlp": 0.27380013, "epoch": 0.40480986021343757, "flos": 12823593569280.0, "grad_norm": 24.878703102153786, "language_loss": 0.8319633, "learning_rate": 2.6986380846657852e-06, "loss": 0.84992862, "num_input_tokens_seen": 144567430, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.28955078, "step": 6733, "time_per_iteration": 2.7015082836151123 }, { "auxiliary_loss_clip": 0.01495914, "auxiliary_loss_mlp": 0.00322802, "balance_loss_clip": 1.2248292, "balance_loss_mlp": 0.29166433, "epoch": 0.40486998346610553, "flos": 23768770423680.0, "grad_norm": 9.205305763089699, "language_loss": 0.8338747, "learning_rate": 2.698273144328627e-06, "loss": 0.85206187, "num_input_tokens_seen": 144585975, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.31176758, "step": 6734, "time_per_iteration": 4.189352035522461 }, { "auxiliary_loss_clip": 0.01495226, "auxiliary_loss_mlp": 0.00333896, "balance_loss_clip": 1.22307789, "balance_loss_mlp": 0.30392689, "epoch": 0.4049301067187735, "flos": 22856818999680.0, "grad_norm": 99.5913024898432, "language_loss": 0.72657657, "learning_rate": 2.6979081775128805e-06, "loss": 0.74486768, "num_input_tokens_seen": 144605225, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.29980469, "step": 6735, "time_per_iteration": 2.6793932914733887 }, { "auxiliary_loss_clip": 0.01509127, "auxiliary_loss_mlp": 0.0033703, "balance_loss_clip": 1.23716354, "balance_loss_mlp": 0.30541611, "epoch": 0.40499022997144146, "flos": 22783992174720.0, "grad_norm": 21.073024092642754, "language_loss": 0.90719157, "learning_rate": 2.697543184232387e-06, "loss": 0.92565316, "num_input_tokens_seen": 144624145, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.31640625, "step": 6736, "time_per_iteration": 2.7064034938812256 }, { "auxiliary_loss_clip": 0.01536711, "auxiliary_loss_mlp": 0.00294531, "balance_loss_clip": 1.25911009, "balance_loss_mlp": 0.26544404, "epoch": 0.4050503532241094, "flos": 23039352938880.0, "grad_norm": 5.531064693265709, "language_loss": 0.82502425, "learning_rate": 2.6971781645009863e-06, "loss": 0.8433367, "num_input_tokens_seen": 144644470, "router_z_loss_clip": 2.78320312, "router_z_loss_mlp": 0.29125977, "step": 6737, "time_per_iteration": 2.6819875240325928 }, { "auxiliary_loss_clip": 0.01524797, "auxiliary_loss_mlp": 0.00310127, "balance_loss_clip": 1.24852502, "balance_loss_mlp": 0.28012195, "epoch": 0.4051104764767774, "flos": 16647756065280.0, "grad_norm": 4.13578699849256, "language_loss": 0.79232538, "learning_rate": 2.696813118332519e-06, "loss": 0.81067455, "num_input_tokens_seen": 144661055, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.29992676, "step": 6738, "time_per_iteration": 2.6623728275299072 }, { "auxiliary_loss_clip": 0.01530194, "auxiliary_loss_mlp": 0.00306928, "balance_loss_clip": 1.25512505, "balance_loss_mlp": 0.27935538, "epoch": 0.40517059972944536, "flos": 16358962717440.0, "grad_norm": 3.7303338138509403, "language_loss": 0.80624533, "learning_rate": 2.696448045740828e-06, "loss": 0.82461655, "num_input_tokens_seen": 144677935, "router_z_loss_clip": 2.74804688, "router_z_loss_mlp": 0.27539062, "step": 6739, "time_per_iteration": 2.6798486709594727 }, { "auxiliary_loss_clip": 0.01538907, "auxiliary_loss_mlp": 0.00313718, "balance_loss_clip": 1.25755537, "balance_loss_mlp": 0.28175816, "epoch": 0.4052307229821133, "flos": 28803374363520.0, "grad_norm": 112.64144795195084, "language_loss": 0.81721765, "learning_rate": 2.6960829467397576e-06, "loss": 0.8357439, "num_input_tokens_seen": 144697725, "router_z_loss_clip": 2.81054688, "router_z_loss_mlp": 0.32006836, "step": 6740, "time_per_iteration": 4.149216413497925 }, { "auxiliary_loss_clip": 0.01544728, "auxiliary_loss_mlp": 0.00338092, "balance_loss_clip": 1.26436162, "balance_loss_mlp": 0.30690706, "epoch": 0.4052908462347813, "flos": 21397876289280.0, "grad_norm": 2.7489668566366943, "language_loss": 0.81692386, "learning_rate": 2.695717821343153e-06, "loss": 0.83575201, "num_input_tokens_seen": 144718805, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.31164551, "step": 6741, "time_per_iteration": 2.705186605453491 }, { "auxiliary_loss_clip": 0.01554451, "auxiliary_loss_mlp": 0.00320389, "balance_loss_clip": 1.2710309, "balance_loss_mlp": 0.28901282, "epoch": 0.40535096948744925, "flos": 22419067950720.0, "grad_norm": 12.592968668982827, "language_loss": 0.79064417, "learning_rate": 2.6953526695648577e-06, "loss": 0.80939257, "num_input_tokens_seen": 144737105, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.31396484, "step": 6742, "time_per_iteration": 2.675943374633789 }, { "auxiliary_loss_clip": 0.015526, "auxiliary_loss_mlp": 0.00337729, "balance_loss_clip": 1.26510966, "balance_loss_mlp": 0.30659151, "epoch": 0.4054110927401172, "flos": 17010776868480.0, "grad_norm": 145.01146389896232, "language_loss": 0.82547414, "learning_rate": 2.6949874914187202e-06, "loss": 0.84437752, "num_input_tokens_seen": 144751350, "router_z_loss_clip": 2.87695312, "router_z_loss_mlp": 0.31152344, "step": 6743, "time_per_iteration": 2.6258978843688965 }, { "auxiliary_loss_clip": 0.01561552, "auxiliary_loss_mlp": 0.00327606, "balance_loss_clip": 1.27303481, "balance_loss_mlp": 0.29675463, "epoch": 0.4054712159927852, "flos": 21614848392960.0, "grad_norm": 446.2569263109992, "language_loss": 0.80140448, "learning_rate": 2.694622286918588e-06, "loss": 0.82029605, "num_input_tokens_seen": 144770030, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.30834961, "step": 6744, "time_per_iteration": 2.686690330505371 }, { "auxiliary_loss_clip": 0.01558444, "auxiliary_loss_mlp": 0.00320028, "balance_loss_clip": 1.27364433, "balance_loss_mlp": 0.2897487, "epoch": 0.4055313392454532, "flos": 25812554376960.0, "grad_norm": 5.332799232174399, "language_loss": 0.87102199, "learning_rate": 2.6942570560783076e-06, "loss": 0.88980675, "num_input_tokens_seen": 144790965, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.30273438, "step": 6745, "time_per_iteration": 2.703606605529785 }, { "auxiliary_loss_clip": 0.01565119, "auxiliary_loss_mlp": 0.00285489, "balance_loss_clip": 1.2811029, "balance_loss_mlp": 0.25513861, "epoch": 0.40559146249812117, "flos": 14137098111360.0, "grad_norm": 7.976023589863802, "language_loss": 0.73456419, "learning_rate": 2.693891798911731e-06, "loss": 0.75307029, "num_input_tokens_seen": 144807755, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.30371094, "step": 6746, "time_per_iteration": 2.633772373199463 }, { "auxiliary_loss_clip": 0.01561539, "auxiliary_loss_mlp": 0.00316552, "balance_loss_clip": 1.27823925, "balance_loss_mlp": 0.28571215, "epoch": 0.40565158575078913, "flos": 41355481962240.0, "grad_norm": 1336.5692058061634, "language_loss": 0.63964665, "learning_rate": 2.6935265154327075e-06, "loss": 0.6584276, "num_input_tokens_seen": 144832405, "router_z_loss_clip": 2.83203125, "router_z_loss_mlp": 0.30871582, "step": 6747, "time_per_iteration": 2.827512502670288 }, { "auxiliary_loss_clip": 0.01566432, "auxiliary_loss_mlp": 0.00313086, "balance_loss_clip": 1.28280294, "balance_loss_mlp": 0.28364116, "epoch": 0.4057117090034571, "flos": 28544529980160.0, "grad_norm": 4.3549221760317804, "language_loss": 0.89947295, "learning_rate": 2.693161205655089e-06, "loss": 0.91826808, "num_input_tokens_seen": 144853890, "router_z_loss_clip": 2.83203125, "router_z_loss_mlp": 0.29418945, "step": 6748, "time_per_iteration": 2.702265977859497 }, { "auxiliary_loss_clip": 0.0157916, "auxiliary_loss_mlp": 0.00286059, "balance_loss_clip": 1.2920835, "balance_loss_mlp": 0.25736529, "epoch": 0.40577183225612506, "flos": 18004066640640.0, "grad_norm": 9.083302049614716, "language_loss": 0.90592444, "learning_rate": 2.6927958695927287e-06, "loss": 0.92457664, "num_input_tokens_seen": 144871395, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.28674316, "step": 6749, "time_per_iteration": 2.622791051864624 }, { "auxiliary_loss_clip": 0.0158749, "auxiliary_loss_mlp": 0.00310343, "balance_loss_clip": 1.30014312, "balance_loss_mlp": 0.28248394, "epoch": 0.40583195550879303, "flos": 19536734016000.0, "grad_norm": 5.374074943162313, "language_loss": 0.83260369, "learning_rate": 2.6924305072594784e-06, "loss": 0.85158205, "num_input_tokens_seen": 144890975, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 0.27893066, "step": 6750, "time_per_iteration": 2.6600613594055176 }, { "auxiliary_loss_clip": 0.01572796, "auxiliary_loss_mlp": 0.00314119, "balance_loss_clip": 1.28079939, "balance_loss_mlp": 0.28355402, "epoch": 0.405892078761461, "flos": 22309468577280.0, "grad_norm": 411.25369979242834, "language_loss": 0.82539648, "learning_rate": 2.692065118669195e-06, "loss": 0.84426558, "num_input_tokens_seen": 144908170, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.30566406, "step": 6751, "time_per_iteration": 2.681941270828247 }, { "auxiliary_loss_clip": 0.01575802, "auxiliary_loss_mlp": 0.00318924, "balance_loss_clip": 1.28785777, "balance_loss_mlp": 0.28727439, "epoch": 0.40595220201412896, "flos": 25484402701440.0, "grad_norm": 39.43246084256196, "language_loss": 0.75842267, "learning_rate": 2.6916997038357326e-06, "loss": 0.77736992, "num_input_tokens_seen": 144928020, "router_z_loss_clip": 2.87890625, "router_z_loss_mlp": 0.31640625, "step": 6752, "time_per_iteration": 2.7259552478790283 }, { "auxiliary_loss_clip": 0.01594617, "auxiliary_loss_mlp": 0.00300257, "balance_loss_clip": 1.30441236, "balance_loss_mlp": 0.27062112, "epoch": 0.4060123252667969, "flos": 49856004103680.0, "grad_norm": 3.6476343793506274, "language_loss": 0.77784359, "learning_rate": 2.691334262772948e-06, "loss": 0.79679239, "num_input_tokens_seen": 144951240, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.29638672, "step": 6753, "time_per_iteration": 2.9413504600524902 }, { "auxiliary_loss_clip": 0.01578945, "auxiliary_loss_mlp": 0.00305382, "balance_loss_clip": 1.29578018, "balance_loss_mlp": 0.27577028, "epoch": 0.4060724485194649, "flos": 21135476459520.0, "grad_norm": 3.230957506678049, "language_loss": 0.79457772, "learning_rate": 2.690968795494699e-06, "loss": 0.81342101, "num_input_tokens_seen": 144969100, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.29589844, "step": 6754, "time_per_iteration": 2.7523746490478516 }, { "auxiliary_loss_clip": 0.01582788, "auxiliary_loss_mlp": 0.00316547, "balance_loss_clip": 1.29718697, "balance_loss_mlp": 0.2857551, "epoch": 0.40613257177213286, "flos": 21758059918080.0, "grad_norm": 38.10390500768634, "language_loss": 0.88838184, "learning_rate": 2.690603302014844e-06, "loss": 0.90737522, "num_input_tokens_seen": 144987065, "router_z_loss_clip": 2.85546875, "router_z_loss_mlp": 0.30786133, "step": 6755, "time_per_iteration": 2.9120874404907227 }, { "auxiliary_loss_clip": 0.0155892, "auxiliary_loss_mlp": 0.00303093, "balance_loss_clip": 1.27711272, "balance_loss_mlp": 0.27220559, "epoch": 0.4061926950248008, "flos": 25555074710400.0, "grad_norm": 97.27585014819508, "language_loss": 0.76871479, "learning_rate": 2.6902377823472426e-06, "loss": 0.78733492, "num_input_tokens_seen": 145007310, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.3092041, "step": 6756, "time_per_iteration": 2.7961766719818115 }, { "auxiliary_loss_clip": 0.01582334, "auxiliary_loss_mlp": 0.00294772, "balance_loss_clip": 1.29831004, "balance_loss_mlp": 0.26291886, "epoch": 0.4062528182774688, "flos": 23695799944320.0, "grad_norm": 5.3198607251868735, "language_loss": 0.85062915, "learning_rate": 2.689872236505755e-06, "loss": 0.8694002, "num_input_tokens_seen": 145026210, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.31860352, "step": 6757, "time_per_iteration": 2.7748465538024902 }, { "auxiliary_loss_clip": 0.0158906, "auxiliary_loss_mlp": 0.00289667, "balance_loss_clip": 1.3052851, "balance_loss_mlp": 0.26162881, "epoch": 0.4063129415301368, "flos": 21726027964800.0, "grad_norm": 8.274125632832591, "language_loss": 0.85944366, "learning_rate": 2.6895066645042437e-06, "loss": 0.87823099, "num_input_tokens_seen": 145045475, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.28039551, "step": 6758, "time_per_iteration": 2.723299980163574 }, { "auxiliary_loss_clip": 0.01572013, "auxiliary_loss_mlp": 0.00312442, "balance_loss_clip": 1.29310298, "balance_loss_mlp": 0.28265128, "epoch": 0.40637306478280477, "flos": 12787575206400.0, "grad_norm": 9.214785681582102, "language_loss": 0.97004449, "learning_rate": 2.6891410663565703e-06, "loss": 0.98888904, "num_input_tokens_seen": 145062260, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.2980957, "step": 6759, "time_per_iteration": 2.716917037963867 }, { "auxiliary_loss_clip": 0.0154746, "auxiliary_loss_mlp": 0.00313047, "balance_loss_clip": 1.27094507, "balance_loss_mlp": 0.28350663, "epoch": 0.40643318803547274, "flos": 24024490323840.0, "grad_norm": 17.708391285908302, "language_loss": 0.73640561, "learning_rate": 2.688775442076598e-06, "loss": 0.75501072, "num_input_tokens_seen": 145082470, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.29541016, "step": 6760, "time_per_iteration": 2.6811389923095703 }, { "auxiliary_loss_clip": 0.01538625, "auxiliary_loss_mlp": 0.00326836, "balance_loss_clip": 1.26478958, "balance_loss_mlp": 0.29544771, "epoch": 0.4064933112881407, "flos": 25592421876480.0, "grad_norm": 2.241828305502943, "language_loss": 0.80971187, "learning_rate": 2.688409791678193e-06, "loss": 0.82836652, "num_input_tokens_seen": 145105685, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.31384277, "step": 6761, "time_per_iteration": 2.755098342895508 }, { "auxiliary_loss_clip": 0.01519084, "auxiliary_loss_mlp": 0.00267972, "balance_loss_clip": 1.25458157, "balance_loss_mlp": 0.24129303, "epoch": 0.40655343454080867, "flos": 22054323294720.0, "grad_norm": 37.12825306709907, "language_loss": 0.75707704, "learning_rate": 2.6880441151752185e-06, "loss": 0.77494764, "num_input_tokens_seen": 145125590, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.26660156, "step": 6762, "time_per_iteration": 2.669874429702759 }, { "auxiliary_loss_clip": 0.01530642, "auxiliary_loss_mlp": 0.00296014, "balance_loss_clip": 1.26277995, "balance_loss_mlp": 0.2670103, "epoch": 0.40661355779347663, "flos": 26468893641600.0, "grad_norm": 1.9431259555165223, "language_loss": 0.80180109, "learning_rate": 2.6876784125815433e-06, "loss": 0.82006764, "num_input_tokens_seen": 145146810, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.29016113, "step": 6763, "time_per_iteration": 2.7438178062438965 }, { "auxiliary_loss_clip": 0.01520405, "auxiliary_loss_mlp": 0.00331809, "balance_loss_clip": 1.25567508, "balance_loss_mlp": 0.30114844, "epoch": 0.4066736810461446, "flos": 13261129136640.0, "grad_norm": 104.24700861893749, "language_loss": 0.7419554, "learning_rate": 2.687312683911033e-06, "loss": 0.76047754, "num_input_tokens_seen": 145163130, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.3067627, "step": 6764, "time_per_iteration": 2.705596685409546 }, { "auxiliary_loss_clip": 0.01539347, "auxiliary_loss_mlp": 0.00374118, "balance_loss_clip": 1.2674458, "balance_loss_mlp": 0.3381173, "epoch": 0.40673380429881256, "flos": 28803625758720.0, "grad_norm": 40.884844692211296, "language_loss": 0.97201705, "learning_rate": 2.686946929177557e-06, "loss": 0.99115169, "num_input_tokens_seen": 145181420, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.36010742, "step": 6765, "time_per_iteration": 2.7028183937072754 }, { "auxiliary_loss_clip": 0.01529678, "auxiliary_loss_mlp": 0.00311909, "balance_loss_clip": 1.26159656, "balance_loss_mlp": 0.28120095, "epoch": 0.4067939275514805, "flos": 12495334152960.0, "grad_norm": 7.563426840829584, "language_loss": 0.9011035, "learning_rate": 2.6865811483949855e-06, "loss": 0.91951936, "num_input_tokens_seen": 145198545, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.3067627, "step": 6766, "time_per_iteration": 2.667046070098877 }, { "auxiliary_loss_clip": 0.01507279, "auxiliary_loss_mlp": 0.00297221, "balance_loss_clip": 1.24745846, "balance_loss_mlp": 0.26760927, "epoch": 0.4068540508041485, "flos": 18770508069120.0, "grad_norm": 15.281632834157598, "language_loss": 0.83554733, "learning_rate": 2.6862153415771867e-06, "loss": 0.8535924, "num_input_tokens_seen": 145215835, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.29614258, "step": 6767, "time_per_iteration": 2.6536810398101807 }, { "auxiliary_loss_clip": 0.01517725, "auxiliary_loss_mlp": 0.00314803, "balance_loss_clip": 1.25830531, "balance_loss_mlp": 0.28554934, "epoch": 0.40691417405681646, "flos": 28512821249280.0, "grad_norm": 34.4928877755483, "language_loss": 0.8401593, "learning_rate": 2.685849508738034e-06, "loss": 0.85848457, "num_input_tokens_seen": 145236555, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.29284668, "step": 6768, "time_per_iteration": 2.7089619636535645 }, { "auxiliary_loss_clip": 0.01499062, "auxiliary_loss_mlp": 0.00306725, "balance_loss_clip": 1.24115491, "balance_loss_mlp": 0.27718529, "epoch": 0.4069742973094844, "flos": 20814040627200.0, "grad_norm": 68.35475911304923, "language_loss": 0.93186259, "learning_rate": 2.6854836498913995e-06, "loss": 0.94992054, "num_input_tokens_seen": 145254595, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.2956543, "step": 6769, "time_per_iteration": 2.6208348274230957 }, { "auxiliary_loss_clip": 0.01487721, "auxiliary_loss_mlp": 0.00299353, "balance_loss_clip": 1.2376219, "balance_loss_mlp": 0.27096966, "epoch": 0.4070344205621524, "flos": 21470272151040.0, "grad_norm": 15.192417870740572, "language_loss": 0.85938948, "learning_rate": 2.685117765051156e-06, "loss": 0.87726021, "num_input_tokens_seen": 145274005, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.28393555, "step": 6770, "time_per_iteration": 2.6757125854492188 }, { "auxiliary_loss_clip": 0.01478831, "auxiliary_loss_mlp": 0.00336904, "balance_loss_clip": 1.22309852, "balance_loss_mlp": 0.30544454, "epoch": 0.4070945438148204, "flos": 26830046937600.0, "grad_norm": 108.34020628884224, "language_loss": 0.84924603, "learning_rate": 2.6847518542311783e-06, "loss": 0.86740339, "num_input_tokens_seen": 145294850, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.3145752, "step": 6771, "time_per_iteration": 2.7132208347320557 }, { "auxiliary_loss_clip": 0.01472326, "auxiliary_loss_mlp": 0.00322287, "balance_loss_clip": 1.22135472, "balance_loss_mlp": 0.29329485, "epoch": 0.4071546670674884, "flos": 26354158623360.0, "grad_norm": 3.3188086155513976, "language_loss": 0.80647713, "learning_rate": 2.6843859174453417e-06, "loss": 0.82442325, "num_input_tokens_seen": 145317050, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.28979492, "step": 6772, "time_per_iteration": 4.101114273071289 }, { "auxiliary_loss_clip": 0.01479502, "auxiliary_loss_mlp": 0.00312281, "balance_loss_clip": 1.22833037, "balance_loss_mlp": 0.28468412, "epoch": 0.40721479032015634, "flos": 17895401020800.0, "grad_norm": 2.2749283779985428, "language_loss": 0.87895954, "learning_rate": 2.6840199547075218e-06, "loss": 0.89687735, "num_input_tokens_seen": 145334480, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.27612305, "step": 6773, "time_per_iteration": 4.040104150772095 }, { "auxiliary_loss_clip": 0.01458882, "auxiliary_loss_mlp": 0.00176156, "balance_loss_clip": 1.25665689, "balance_loss_mlp": 0.16447356, "epoch": 0.4072749135728243, "flos": 49854570537600.0, "grad_norm": 0.9538996705580078, "language_loss": 0.63950688, "learning_rate": 2.683653966031597e-06, "loss": 0.65585726, "num_input_tokens_seen": 145388695, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.11669922, "step": 6774, "time_per_iteration": 3.062551498413086 }, { "auxiliary_loss_clip": 0.0145883, "auxiliary_loss_mlp": 0.00334277, "balance_loss_clip": 1.20739186, "balance_loss_mlp": 0.30423644, "epoch": 0.40733503682549227, "flos": 27563630400000.0, "grad_norm": 670.2471096212143, "language_loss": 0.79800463, "learning_rate": 2.683287951431446e-06, "loss": 0.81593573, "num_input_tokens_seen": 145408240, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.30029297, "step": 6775, "time_per_iteration": 2.7410616874694824 }, { "auxiliary_loss_clip": 0.01453087, "auxiliary_loss_mlp": 0.0030407, "balance_loss_clip": 1.20730567, "balance_loss_mlp": 0.27896422, "epoch": 0.40739516007816023, "flos": 22126970551680.0, "grad_norm": 3.8144484008095456, "language_loss": 0.83623427, "learning_rate": 2.6829219109209474e-06, "loss": 0.8538059, "num_input_tokens_seen": 145428395, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.2512207, "step": 6776, "time_per_iteration": 4.1398024559021 }, { "auxiliary_loss_clip": 0.01460667, "auxiliary_loss_mlp": 0.00322831, "balance_loss_clip": 1.20723629, "balance_loss_mlp": 0.29467386, "epoch": 0.4074552833308282, "flos": 23842243693440.0, "grad_norm": 141.33446925520673, "language_loss": 0.86476725, "learning_rate": 2.682555844513981e-06, "loss": 0.88260221, "num_input_tokens_seen": 145448290, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.28173828, "step": 6777, "time_per_iteration": 2.7067837715148926 }, { "auxiliary_loss_clip": 0.01441553, "auxiliary_loss_mlp": 0.0011949, "balance_loss_clip": 1.24342704, "balance_loss_mlp": 0.11042972, "epoch": 0.40751540658349616, "flos": 58000008781440.0, "grad_norm": 0.6906222850432487, "language_loss": 0.52778316, "learning_rate": 2.6821897522244286e-06, "loss": 0.54339361, "num_input_tokens_seen": 145509785, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.09082031, "step": 6778, "time_per_iteration": 3.1699235439300537 }, { "auxiliary_loss_clip": 0.01449776, "auxiliary_loss_mlp": 0.00315085, "balance_loss_clip": 1.20340133, "balance_loss_mlp": 0.28490168, "epoch": 0.40757552983616413, "flos": 21214659991680.0, "grad_norm": 9.189414732807958, "language_loss": 0.89898354, "learning_rate": 2.6818236340661718e-06, "loss": 0.91663218, "num_input_tokens_seen": 145528620, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.30175781, "step": 6779, "time_per_iteration": 2.704439640045166 }, { "auxiliary_loss_clip": 0.01449063, "auxiliary_loss_mlp": 0.00318083, "balance_loss_clip": 1.20374405, "balance_loss_mlp": 0.28990149, "epoch": 0.4076356530888321, "flos": 26833530556800.0, "grad_norm": 2.7171844606275397, "language_loss": 0.81922996, "learning_rate": 2.6814574900530957e-06, "loss": 0.83690143, "num_input_tokens_seen": 145547775, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.28186035, "step": 6780, "time_per_iteration": 2.7270734310150146 }, { "auxiliary_loss_clip": 0.01419485, "auxiliary_loss_mlp": 0.00302061, "balance_loss_clip": 1.17994237, "balance_loss_mlp": 0.27523845, "epoch": 0.40769577634150006, "flos": 12203021272320.0, "grad_norm": 8.699434267212615, "language_loss": 0.74082041, "learning_rate": 2.6810913201990827e-06, "loss": 0.75803584, "num_input_tokens_seen": 145564465, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.26855469, "step": 6781, "time_per_iteration": 2.6750071048736572 }, { "auxiliary_loss_clip": 0.01434185, "auxiliary_loss_mlp": 0.00297185, "balance_loss_clip": 1.19087541, "balance_loss_mlp": 0.27031541, "epoch": 0.407755899594168, "flos": 33655264796160.0, "grad_norm": 11.385036462673098, "language_loss": 0.75818801, "learning_rate": 2.6807251245180183e-06, "loss": 0.77550173, "num_input_tokens_seen": 145585965, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.26904297, "step": 6782, "time_per_iteration": 4.233702898025513 }, { "auxiliary_loss_clip": 0.01422815, "auxiliary_loss_mlp": 0.00318948, "balance_loss_clip": 1.17777419, "balance_loss_mlp": 0.28988475, "epoch": 0.407816022846836, "flos": 20157342226560.0, "grad_norm": 28.645444860725227, "language_loss": 0.88865501, "learning_rate": 2.6803589030237897e-06, "loss": 0.90607262, "num_input_tokens_seen": 145605000, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.29052734, "step": 6783, "time_per_iteration": 2.675356388092041 }, { "auxiliary_loss_clip": 0.01433986, "auxiliary_loss_mlp": 0.00325751, "balance_loss_clip": 1.19033313, "balance_loss_mlp": 0.29571062, "epoch": 0.40787614609950396, "flos": 21178821196800.0, "grad_norm": 135.06827649318106, "language_loss": 0.85841531, "learning_rate": 2.679992655730283e-06, "loss": 0.87601268, "num_input_tokens_seen": 145623740, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.30078125, "step": 6784, "time_per_iteration": 2.7047250270843506 }, { "auxiliary_loss_clip": 0.01432615, "auxiliary_loss_mlp": 0.00346272, "balance_loss_clip": 1.18189824, "balance_loss_mlp": 0.31735188, "epoch": 0.407936269352172, "flos": 20520650338560.0, "grad_norm": 178.39465157413716, "language_loss": 0.74218822, "learning_rate": 2.679626382651386e-06, "loss": 0.7599771, "num_input_tokens_seen": 145643515, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.2890625, "step": 6785, "time_per_iteration": 2.6575913429260254 }, { "auxiliary_loss_clip": 0.01422453, "auxiliary_loss_mlp": 0.00317611, "balance_loss_clip": 1.17666578, "balance_loss_mlp": 0.28894094, "epoch": 0.40799639260483994, "flos": 20118809911680.0, "grad_norm": 52.26521977229502, "language_loss": 0.861678, "learning_rate": 2.679260083800989e-06, "loss": 0.87907863, "num_input_tokens_seen": 145660890, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.28662109, "step": 6786, "time_per_iteration": 2.63122296333313 }, { "auxiliary_loss_clip": 0.01420612, "auxiliary_loss_mlp": 0.0033702, "balance_loss_clip": 1.17646134, "balance_loss_mlp": 0.30774194, "epoch": 0.4080565158575079, "flos": 20997328752000.0, "grad_norm": 25.042325898533264, "language_loss": 0.8535701, "learning_rate": 2.678893759192982e-06, "loss": 0.87114644, "num_input_tokens_seen": 145680070, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.29272461, "step": 6787, "time_per_iteration": 2.6502041816711426 }, { "auxiliary_loss_clip": 0.01431033, "auxiliary_loss_mlp": 0.00309714, "balance_loss_clip": 1.18302917, "balance_loss_mlp": 0.27944639, "epoch": 0.40811663911017587, "flos": 19317714837120.0, "grad_norm": 5.590155406613061, "language_loss": 0.74787468, "learning_rate": 2.678527408841255e-06, "loss": 0.76528215, "num_input_tokens_seen": 145698010, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.30273438, "step": 6788, "time_per_iteration": 2.6242079734802246 }, { "auxiliary_loss_clip": 0.01424582, "auxiliary_loss_mlp": 0.00330959, "balance_loss_clip": 1.17350447, "balance_loss_mlp": 0.3012045, "epoch": 0.40817676236284384, "flos": 40625382119040.0, "grad_norm": 3.998006126211897, "language_loss": 0.73041987, "learning_rate": 2.678161032759701e-06, "loss": 0.74797529, "num_input_tokens_seen": 145722215, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.29772949, "step": 6789, "time_per_iteration": 2.8281960487365723 }, { "auxiliary_loss_clip": 0.01420182, "auxiliary_loss_mlp": 0.00316132, "balance_loss_clip": 1.17461812, "balance_loss_mlp": 0.28865415, "epoch": 0.4082368856155118, "flos": 20522086882560.0, "grad_norm": 495.7123768406948, "language_loss": 0.68257302, "learning_rate": 2.6777946309622123e-06, "loss": 0.69993615, "num_input_tokens_seen": 145741090, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.2746582, "step": 6790, "time_per_iteration": 2.68023419380188 }, { "auxiliary_loss_clip": 0.01418619, "auxiliary_loss_mlp": 0.0031173, "balance_loss_clip": 1.17458785, "balance_loss_mlp": 0.28297696, "epoch": 0.40829700886817977, "flos": 11427745098240.0, "grad_norm": 6.786210880521379, "language_loss": 0.77530897, "learning_rate": 2.677428203462683e-06, "loss": 0.79261243, "num_input_tokens_seen": 145754985, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.28771973, "step": 6791, "time_per_iteration": 2.6076364517211914 }, { "auxiliary_loss_clip": 0.01398636, "auxiliary_loss_mlp": 0.00118979, "balance_loss_clip": 1.19688904, "balance_loss_mlp": 0.1111115, "epoch": 0.40835713212084773, "flos": 67330677121920.0, "grad_norm": 0.7511841296424061, "language_loss": 0.5870502, "learning_rate": 2.6770617502750093e-06, "loss": 0.60222626, "num_input_tokens_seen": 145815260, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.07861328, "step": 6792, "time_per_iteration": 3.1058013439178467 }, { "auxiliary_loss_clip": 0.01425544, "auxiliary_loss_mlp": 0.00303047, "balance_loss_clip": 1.17533422, "balance_loss_mlp": 0.27475911, "epoch": 0.4084172553735157, "flos": 21762010414080.0, "grad_norm": 57.05819781834104, "language_loss": 0.88781142, "learning_rate": 2.6766952714130857e-06, "loss": 0.90509737, "num_input_tokens_seen": 145832665, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.28320312, "step": 6793, "time_per_iteration": 2.648909568786621 }, { "auxiliary_loss_clip": 0.01420688, "auxiliary_loss_mlp": 0.00315491, "balance_loss_clip": 1.17161632, "balance_loss_mlp": 0.288037, "epoch": 0.40847737862618366, "flos": 27417258478080.0, "grad_norm": 3.8952726669834905, "language_loss": 0.8985123, "learning_rate": 2.6763287668908094e-06, "loss": 0.91587412, "num_input_tokens_seen": 145850240, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.27441406, "step": 6794, "time_per_iteration": 2.703150749206543 }, { "auxiliary_loss_clip": 0.01425667, "auxiliary_loss_mlp": 0.0032723, "balance_loss_clip": 1.18079185, "balance_loss_mlp": 0.29665291, "epoch": 0.4085375018788516, "flos": 18587255857920.0, "grad_norm": 24.255096477170014, "language_loss": 0.85074157, "learning_rate": 2.6759622367220788e-06, "loss": 0.86827058, "num_input_tokens_seen": 145869545, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.30603027, "step": 6795, "time_per_iteration": 2.751018762588501 }, { "auxiliary_loss_clip": 0.01425513, "auxiliary_loss_mlp": 0.00351638, "balance_loss_clip": 1.17620921, "balance_loss_mlp": 0.31984472, "epoch": 0.4085976251315196, "flos": 15411783029760.0, "grad_norm": 14.832434759903762, "language_loss": 0.77459288, "learning_rate": 2.675595680920792e-06, "loss": 0.79236436, "num_input_tokens_seen": 145884025, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.31787109, "step": 6796, "time_per_iteration": 2.727681875228882 }, { "auxiliary_loss_clip": 0.01407151, "auxiliary_loss_mlp": 0.0031401, "balance_loss_clip": 1.16498554, "balance_loss_mlp": 0.28578159, "epoch": 0.40865774838418756, "flos": 21252222639360.0, "grad_norm": 4.085642572026513, "language_loss": 0.84516752, "learning_rate": 2.6752290995008498e-06, "loss": 0.86237913, "num_input_tokens_seen": 145903210, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.28186035, "step": 6797, "time_per_iteration": 2.719212532043457 }, { "auxiliary_loss_clip": 0.01410036, "auxiliary_loss_mlp": 0.0031248, "balance_loss_clip": 1.1642741, "balance_loss_mlp": 0.28556237, "epoch": 0.4087178716368556, "flos": 13772245714560.0, "grad_norm": 4.819275065620166, "language_loss": 0.93486345, "learning_rate": 2.6748624924761523e-06, "loss": 0.95208859, "num_input_tokens_seen": 145920985, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.26940918, "step": 6798, "time_per_iteration": 2.6685783863067627 }, { "auxiliary_loss_clip": 0.01403765, "auxiliary_loss_mlp": 0.00289684, "balance_loss_clip": 1.1642592, "balance_loss_mlp": 0.2636008, "epoch": 0.40877799488952354, "flos": 23621752056960.0, "grad_norm": 6.112343132647309, "language_loss": 0.88858753, "learning_rate": 2.674495859860601e-06, "loss": 0.90552199, "num_input_tokens_seen": 145940350, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.26074219, "step": 6799, "time_per_iteration": 2.7127511501312256 }, { "auxiliary_loss_clip": 0.01414383, "auxiliary_loss_mlp": 0.00301418, "balance_loss_clip": 1.17079067, "balance_loss_mlp": 0.27241391, "epoch": 0.4088381181421915, "flos": 20918791664640.0, "grad_norm": 3.1807175602522517, "language_loss": 0.91733694, "learning_rate": 2.6741292016681e-06, "loss": 0.93449497, "num_input_tokens_seen": 145957460, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.28979492, "step": 6800, "time_per_iteration": 2.7614519596099854 }, { "auxiliary_loss_clip": 0.01408615, "auxiliary_loss_mlp": 0.00317968, "balance_loss_clip": 1.16809773, "balance_loss_mlp": 0.28849977, "epoch": 0.4088982413948595, "flos": 13297578462720.0, "grad_norm": 41.67330139276372, "language_loss": 0.8301903, "learning_rate": 2.6737625179125514e-06, "loss": 0.8474561, "num_input_tokens_seen": 145975285, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.29492188, "step": 6801, "time_per_iteration": 2.840203046798706 }, { "auxiliary_loss_clip": 0.01413092, "auxiliary_loss_mlp": 0.00331093, "balance_loss_clip": 1.17181158, "balance_loss_mlp": 0.30092132, "epoch": 0.40895836464752744, "flos": 15267673664640.0, "grad_norm": 9.522323583402022, "language_loss": 0.85277259, "learning_rate": 2.673395808607861e-06, "loss": 0.87021446, "num_input_tokens_seen": 145989150, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.30187988, "step": 6802, "time_per_iteration": 2.7383153438568115 }, { "auxiliary_loss_clip": 0.01420188, "auxiliary_loss_mlp": 0.00313388, "balance_loss_clip": 1.17681289, "balance_loss_mlp": 0.28493294, "epoch": 0.4090184879001954, "flos": 14501411804160.0, "grad_norm": 2.3148682118257735, "language_loss": 0.86598706, "learning_rate": 2.673029073767934e-06, "loss": 0.88332283, "num_input_tokens_seen": 146006980, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.2845459, "step": 6803, "time_per_iteration": 2.724520683288574 }, { "auxiliary_loss_clip": 0.01397995, "auxiliary_loss_mlp": 0.00308527, "balance_loss_clip": 1.16109467, "balance_loss_mlp": 0.2796545, "epoch": 0.40907861115286337, "flos": 13881593692800.0, "grad_norm": 8.815198984959101, "language_loss": 0.87467027, "learning_rate": 2.6726623134066764e-06, "loss": 0.89173549, "num_input_tokens_seen": 146025125, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.28845215, "step": 6804, "time_per_iteration": 2.749619722366333 }, { "auxiliary_loss_clip": 0.0141133, "auxiliary_loss_mlp": 0.00319309, "balance_loss_clip": 1.16601336, "balance_loss_mlp": 0.29173556, "epoch": 0.40913873440553133, "flos": 28037615293440.0, "grad_norm": 12.766313498358565, "language_loss": 0.83376354, "learning_rate": 2.672295527537998e-06, "loss": 0.85106993, "num_input_tokens_seen": 146044990, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.27600098, "step": 6805, "time_per_iteration": 2.714515209197998 }, { "auxiliary_loss_clip": 0.01412461, "auxiliary_loss_mlp": 0.0030439, "balance_loss_clip": 1.16723299, "balance_loss_mlp": 0.27861732, "epoch": 0.4091988576581993, "flos": 21618188357760.0, "grad_norm": 100.53948754952127, "language_loss": 0.83534288, "learning_rate": 2.671928716175804e-06, "loss": 0.85251141, "num_input_tokens_seen": 146066045, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.2578125, "step": 6806, "time_per_iteration": 2.7058799266815186 }, { "auxiliary_loss_clip": 0.0141715, "auxiliary_loss_mlp": 0.00266109, "balance_loss_clip": 1.17403281, "balance_loss_mlp": 0.24047898, "epoch": 0.40925898091086726, "flos": 25224085860480.0, "grad_norm": 10.092206488863786, "language_loss": 0.78772718, "learning_rate": 2.671561879334007e-06, "loss": 0.80455971, "num_input_tokens_seen": 146086280, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.25646973, "step": 6807, "time_per_iteration": 2.69642972946167 }, { "auxiliary_loss_clip": 0.01376918, "auxiliary_loss_mlp": 0.00070372, "balance_loss_clip": 1.18582416, "balance_loss_mlp": 0.06269466, "epoch": 0.40931910416353523, "flos": 68930568800640.0, "grad_norm": 0.8083750418312338, "language_loss": 0.58671606, "learning_rate": 2.6711950170265155e-06, "loss": 0.6011889, "num_input_tokens_seen": 146148840, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.07666016, "step": 6808, "time_per_iteration": 3.271242380142212 }, { "auxiliary_loss_clip": 0.0141503, "auxiliary_loss_mlp": 0.00264727, "balance_loss_clip": 1.17321908, "balance_loss_mlp": 0.23777393, "epoch": 0.4093792274162032, "flos": 20189553747840.0, "grad_norm": 4.811014592016072, "language_loss": 0.61023426, "learning_rate": 2.670828129267242e-06, "loss": 0.6270318, "num_input_tokens_seen": 146166195, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.26953125, "step": 6809, "time_per_iteration": 2.736030340194702 }, { "auxiliary_loss_clip": 0.01403656, "auxiliary_loss_mlp": 0.0024995, "balance_loss_clip": 1.16175425, "balance_loss_mlp": 0.22341384, "epoch": 0.40943935066887116, "flos": 25228754628480.0, "grad_norm": 4.1303496077500474, "language_loss": 0.89504886, "learning_rate": 2.6704612160700983e-06, "loss": 0.91158485, "num_input_tokens_seen": 146185045, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.26538086, "step": 6810, "time_per_iteration": 2.762766122817993 }, { "auxiliary_loss_clip": 0.01421885, "auxiliary_loss_mlp": 0.00297817, "balance_loss_clip": 1.17324483, "balance_loss_mlp": 0.26951653, "epoch": 0.4094994739215392, "flos": 23255319461760.0, "grad_norm": 13.53074461078268, "language_loss": 0.86132354, "learning_rate": 2.670094277448999e-06, "loss": 0.87852055, "num_input_tokens_seen": 146204655, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.28283691, "step": 6811, "time_per_iteration": 2.6620559692382812 }, { "auxiliary_loss_clip": 0.01408114, "auxiliary_loss_mlp": 0.00259819, "balance_loss_clip": 1.1623522, "balance_loss_mlp": 0.23162647, "epoch": 0.40955959717420715, "flos": 17382165540480.0, "grad_norm": 13.969822797946932, "language_loss": 0.78892291, "learning_rate": 2.669727313417857e-06, "loss": 0.80560225, "num_input_tokens_seen": 146222000, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.28186035, "step": 6812, "time_per_iteration": 2.6516098976135254 }, { "auxiliary_loss_clip": 0.0140659, "auxiliary_loss_mlp": 0.00292961, "balance_loss_clip": 1.16393733, "balance_loss_mlp": 0.26380223, "epoch": 0.4096197204268751, "flos": 25082418620160.0, "grad_norm": 10.121506947891838, "language_loss": 0.72998559, "learning_rate": 2.6693603239905872e-06, "loss": 0.74698114, "num_input_tokens_seen": 146242630, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.29174805, "step": 6813, "time_per_iteration": 2.6722145080566406 }, { "auxiliary_loss_clip": 0.01392996, "auxiliary_loss_mlp": 0.00265, "balance_loss_clip": 1.15061617, "balance_loss_mlp": 0.23784381, "epoch": 0.4096798436795431, "flos": 30586769648640.0, "grad_norm": 58.23938987657569, "language_loss": 0.79769772, "learning_rate": 2.6689933091811087e-06, "loss": 0.81427765, "num_input_tokens_seen": 146263070, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.27160645, "step": 6814, "time_per_iteration": 4.083324909210205 }, { "auxiliary_loss_clip": 0.01422885, "auxiliary_loss_mlp": 0.00285642, "balance_loss_clip": 1.17134345, "balance_loss_mlp": 0.25791392, "epoch": 0.40973996693221104, "flos": 24133622820480.0, "grad_norm": 11.844929396767144, "language_loss": 0.75474429, "learning_rate": 2.6686262690033357e-06, "loss": 0.77182961, "num_input_tokens_seen": 146282890, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.27734375, "step": 6815, "time_per_iteration": 4.076141595840454 }, { "auxiliary_loss_clip": 0.01409669, "auxiliary_loss_mlp": 0.00300109, "balance_loss_clip": 1.16366625, "balance_loss_mlp": 0.27208331, "epoch": 0.409800090184879, "flos": 23988974751360.0, "grad_norm": 9.965380070753442, "language_loss": 0.82390344, "learning_rate": 2.668259203471188e-06, "loss": 0.84100127, "num_input_tokens_seen": 146301755, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.28027344, "step": 6816, "time_per_iteration": 2.6798224449157715 }, { "auxiliary_loss_clip": 0.01410227, "auxiliary_loss_mlp": 0.00291995, "balance_loss_clip": 1.16391468, "balance_loss_mlp": 0.26390889, "epoch": 0.40986021343754697, "flos": 16143678552960.0, "grad_norm": 10.471479753651506, "language_loss": 0.87743217, "learning_rate": 2.6678921125985843e-06, "loss": 0.89445436, "num_input_tokens_seen": 146316835, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.28088379, "step": 6817, "time_per_iteration": 2.611795425415039 }, { "auxiliary_loss_clip": 0.01420968, "auxiliary_loss_mlp": 0.0030629, "balance_loss_clip": 1.16272807, "balance_loss_mlp": 0.2763682, "epoch": 0.40992033669021494, "flos": 24790824011520.0, "grad_norm": 144.60074343327943, "language_loss": 0.88582212, "learning_rate": 2.667524996399444e-06, "loss": 0.90309465, "num_input_tokens_seen": 146336650, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.2989502, "step": 6818, "time_per_iteration": 4.118330240249634 }, { "auxiliary_loss_clip": 0.01409618, "auxiliary_loss_mlp": 0.00262611, "balance_loss_clip": 1.16307425, "balance_loss_mlp": 0.2365759, "epoch": 0.4099804599428829, "flos": 29641888431360.0, "grad_norm": 36.24206301136326, "language_loss": 0.71398568, "learning_rate": 2.66715785488769e-06, "loss": 0.73070794, "num_input_tokens_seen": 146357640, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.26037598, "step": 6819, "time_per_iteration": 2.8902831077575684 }, { "auxiliary_loss_clip": 0.0142173, "auxiliary_loss_mlp": 0.00303273, "balance_loss_clip": 1.16474926, "balance_loss_mlp": 0.27324465, "epoch": 0.41004058319555087, "flos": 24826590979200.0, "grad_norm": 4.778699194793992, "language_loss": 0.90895122, "learning_rate": 2.6667906880772428e-06, "loss": 0.92620128, "num_input_tokens_seen": 146379325, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.29992676, "step": 6820, "time_per_iteration": 2.7267448902130127 }, { "auxiliary_loss_clip": 0.0139687, "auxiliary_loss_mlp": 0.00296921, "balance_loss_clip": 1.15297258, "balance_loss_mlp": 0.26794127, "epoch": 0.41010070644821883, "flos": 25737464995200.0, "grad_norm": 115.08346938310278, "language_loss": 0.78367758, "learning_rate": 2.6664234959820256e-06, "loss": 0.80061549, "num_input_tokens_seen": 146398635, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.29016113, "step": 6821, "time_per_iteration": 2.6953489780426025 }, { "auxiliary_loss_clip": 0.01394558, "auxiliary_loss_mlp": 0.00264358, "balance_loss_clip": 1.14665759, "balance_loss_mlp": 0.23703498, "epoch": 0.4101608297008868, "flos": 22346061557760.0, "grad_norm": 33.47381231574748, "language_loss": 0.7986939, "learning_rate": 2.6660562786159634e-06, "loss": 0.81528306, "num_input_tokens_seen": 146417585, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.2734375, "step": 6822, "time_per_iteration": 2.626279830932617 }, { "auxiliary_loss_clip": 0.01400868, "auxiliary_loss_mlp": 0.00296292, "balance_loss_clip": 1.14896214, "balance_loss_mlp": 0.26713338, "epoch": 0.41022095295355476, "flos": 21945083057280.0, "grad_norm": 7.559985696240839, "language_loss": 0.83649391, "learning_rate": 2.6656890359929796e-06, "loss": 0.8534655, "num_input_tokens_seen": 146437035, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.29125977, "step": 6823, "time_per_iteration": 2.6830451488494873 }, { "auxiliary_loss_clip": 0.01405575, "auxiliary_loss_mlp": 0.00269099, "balance_loss_clip": 1.15286946, "balance_loss_mlp": 0.23943949, "epoch": 0.4102810762062228, "flos": 27450511493760.0, "grad_norm": 23.84103159984935, "language_loss": 0.80702579, "learning_rate": 2.665321768127001e-06, "loss": 0.82377255, "num_input_tokens_seen": 146457370, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.29663086, "step": 6824, "time_per_iteration": 4.104139089584351 }, { "auxiliary_loss_clip": 0.01396113, "auxiliary_loss_mlp": 0.0027708, "balance_loss_clip": 1.14309335, "balance_loss_mlp": 0.24754041, "epoch": 0.41034119945889075, "flos": 24499265316480.0, "grad_norm": 11.853762083682092, "language_loss": 0.79436564, "learning_rate": 2.6649544750319548e-06, "loss": 0.81109756, "num_input_tokens_seen": 146478105, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.29528809, "step": 6825, "time_per_iteration": 2.795856475830078 }, { "auxiliary_loss_clip": 0.01389948, "auxiliary_loss_mlp": 0.00244421, "balance_loss_clip": 1.13980389, "balance_loss_mlp": 0.21795669, "epoch": 0.4104013227115587, "flos": 24352641999360.0, "grad_norm": 30.18353569223568, "language_loss": 0.92310631, "learning_rate": 2.664587156721768e-06, "loss": 0.93944997, "num_input_tokens_seen": 146497835, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.26477051, "step": 6826, "time_per_iteration": 2.7050585746765137 }, { "auxiliary_loss_clip": 0.01396126, "auxiliary_loss_mlp": 0.00256865, "balance_loss_clip": 1.14929187, "balance_loss_mlp": 0.22918504, "epoch": 0.4104614459642267, "flos": 23729340268800.0, "grad_norm": 7.085910192077398, "language_loss": 0.75289237, "learning_rate": 2.6642198132103696e-06, "loss": 0.76942229, "num_input_tokens_seen": 146517735, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.2767334, "step": 6827, "time_per_iteration": 2.642277479171753 }, { "auxiliary_loss_clip": 0.01379609, "auxiliary_loss_mlp": 0.00246598, "balance_loss_clip": 1.13375759, "balance_loss_mlp": 0.22040814, "epoch": 0.41052156921689464, "flos": 22127976132480.0, "grad_norm": 20.346235489964542, "language_loss": 0.78887111, "learning_rate": 2.663852444511689e-06, "loss": 0.80513316, "num_input_tokens_seen": 146537640, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.26171875, "step": 6828, "time_per_iteration": 2.6456539630889893 }, { "auxiliary_loss_clip": 0.01393769, "auxiliary_loss_mlp": 0.00257275, "balance_loss_clip": 1.13956225, "balance_loss_mlp": 0.22725785, "epoch": 0.4105816924695626, "flos": 20084371747200.0, "grad_norm": 6.903336998367901, "language_loss": 0.90364802, "learning_rate": 2.6634850506396574e-06, "loss": 0.92015839, "num_input_tokens_seen": 146554695, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.3001709, "step": 6829, "time_per_iteration": 2.6480817794799805 }, { "auxiliary_loss_clip": 0.01377395, "auxiliary_loss_mlp": 0.00257214, "balance_loss_clip": 1.12631166, "balance_loss_mlp": 0.22880657, "epoch": 0.4106418157222306, "flos": 18076785724800.0, "grad_norm": 9.192076690317803, "language_loss": 0.94873154, "learning_rate": 2.663117631608206e-06, "loss": 0.96507764, "num_input_tokens_seen": 146573740, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.28393555, "step": 6830, "time_per_iteration": 2.7050697803497314 }, { "auxiliary_loss_clip": 0.01384301, "auxiliary_loss_mlp": 0.00236132, "balance_loss_clip": 1.13148761, "balance_loss_mlp": 0.20658, "epoch": 0.41070193897489854, "flos": 21647850013440.0, "grad_norm": 8.386839145773173, "language_loss": 0.73360085, "learning_rate": 2.662750187431268e-06, "loss": 0.74980521, "num_input_tokens_seen": 146592885, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.2956543, "step": 6831, "time_per_iteration": 2.664151668548584 }, { "auxiliary_loss_clip": 0.01386073, "auxiliary_loss_mlp": 0.00245289, "balance_loss_clip": 1.1339618, "balance_loss_mlp": 0.2164408, "epoch": 0.4107620622275665, "flos": 26648195356800.0, "grad_norm": 2.837288790692594, "language_loss": 0.77262795, "learning_rate": 2.662382718122776e-06, "loss": 0.78894162, "num_input_tokens_seen": 146611995, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.28820801, "step": 6832, "time_per_iteration": 2.7238142490386963 }, { "auxiliary_loss_clip": 0.01382918, "auxiliary_loss_mlp": 0.00264687, "balance_loss_clip": 1.13129473, "balance_loss_mlp": 0.23742396, "epoch": 0.41082218548023447, "flos": 18734310138240.0, "grad_norm": 5.493127954347516, "language_loss": 0.84241086, "learning_rate": 2.662015223696666e-06, "loss": 0.85888696, "num_input_tokens_seen": 146628045, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.27246094, "step": 6833, "time_per_iteration": 2.640706777572632 }, { "auxiliary_loss_clip": 0.01394455, "auxiliary_loss_mlp": 0.00262507, "balance_loss_clip": 1.1387434, "balance_loss_mlp": 0.23302685, "epoch": 0.41088230873290243, "flos": 22893771116160.0, "grad_norm": 5.3259668843236, "language_loss": 0.81012678, "learning_rate": 2.6616477041668713e-06, "loss": 0.8266964, "num_input_tokens_seen": 146648355, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.29443359, "step": 6834, "time_per_iteration": 2.6702494621276855 }, { "auxiliary_loss_clip": 0.01383621, "auxiliary_loss_mlp": 0.00240392, "balance_loss_clip": 1.13051116, "balance_loss_mlp": 0.21274699, "epoch": 0.4109424319855704, "flos": 24276978000000.0, "grad_norm": 42.1509701045469, "language_loss": 0.77931553, "learning_rate": 2.661280159547329e-06, "loss": 0.79555571, "num_input_tokens_seen": 146668370, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.27636719, "step": 6835, "time_per_iteration": 2.6571192741394043 }, { "auxiliary_loss_clip": 0.01384163, "auxiliary_loss_mlp": 0.00222979, "balance_loss_clip": 1.13197458, "balance_loss_mlp": 0.1974439, "epoch": 0.41100255523823837, "flos": 12969139478400.0, "grad_norm": 12.698165255873297, "language_loss": 0.98045063, "learning_rate": 2.660912589851978e-06, "loss": 0.99652201, "num_input_tokens_seen": 146686665, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.25561523, "step": 6836, "time_per_iteration": 2.653738498687744 }, { "auxiliary_loss_clip": 0.01369643, "auxiliary_loss_mlp": 0.00233054, "balance_loss_clip": 1.1243217, "balance_loss_mlp": 0.20624372, "epoch": 0.4110626784909064, "flos": 23145648261120.0, "grad_norm": 16.944712182581654, "language_loss": 0.7451061, "learning_rate": 2.6605449950947547e-06, "loss": 0.76113307, "num_input_tokens_seen": 146706570, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.26831055, "step": 6837, "time_per_iteration": 2.695446491241455 }, { "auxiliary_loss_clip": 0.01388967, "auxiliary_loss_mlp": 0.00242718, "balance_loss_clip": 1.1342566, "balance_loss_mlp": 0.21541885, "epoch": 0.41112280174357435, "flos": 22747399194240.0, "grad_norm": 10.887020802883303, "language_loss": 0.84264171, "learning_rate": 2.660177375289599e-06, "loss": 0.8589586, "num_input_tokens_seen": 146723425, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.27307129, "step": 6838, "time_per_iteration": 2.7097127437591553 }, { "auxiliary_loss_clip": 0.01390225, "auxiliary_loss_mlp": 0.00227454, "balance_loss_clip": 1.13860083, "balance_loss_mlp": 0.19964266, "epoch": 0.4111829249962423, "flos": 21102403011840.0, "grad_norm": 9.464108747440203, "language_loss": 0.91437048, "learning_rate": 2.659809730450451e-06, "loss": 0.9305473, "num_input_tokens_seen": 146741640, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.27844238, "step": 6839, "time_per_iteration": 2.7113709449768066 }, { "auxiliary_loss_clip": 0.01378316, "auxiliary_loss_mlp": 0.00218634, "balance_loss_clip": 1.13079798, "balance_loss_mlp": 0.19153813, "epoch": 0.4112430482489103, "flos": 21505787723520.0, "grad_norm": 175.44311889823615, "language_loss": 0.88768125, "learning_rate": 2.6594420605912523e-06, "loss": 0.90365076, "num_input_tokens_seen": 146759195, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.27062988, "step": 6840, "time_per_iteration": 2.674197196960449 }, { "auxiliary_loss_clip": 0.01379069, "auxiliary_loss_mlp": 0.00241447, "balance_loss_clip": 1.12616038, "balance_loss_mlp": 0.21394551, "epoch": 0.41130317150157825, "flos": 19570022945280.0, "grad_norm": 143.2282223427411, "language_loss": 0.76193899, "learning_rate": 2.6590743657259442e-06, "loss": 0.77814412, "num_input_tokens_seen": 146774990, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.27490234, "step": 6841, "time_per_iteration": 2.7424862384796143 }, { "auxiliary_loss_clip": 0.01319223, "auxiliary_loss_mlp": 0.0007177, "balance_loss_clip": 1.1459825, "balance_loss_mlp": 0.06566673, "epoch": 0.4113632947542462, "flos": 62383157706240.0, "grad_norm": 3.9857311708039527, "language_loss": 0.59311676, "learning_rate": 2.65870664586847e-06, "loss": 0.6070267, "num_input_tokens_seen": 146839610, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.06103516, "step": 6842, "time_per_iteration": 3.2852349281311035 }, { "auxiliary_loss_clip": 0.01380711, "auxiliary_loss_mlp": 0.00227698, "balance_loss_clip": 1.1345154, "balance_loss_mlp": 0.20327184, "epoch": 0.4114234180069142, "flos": 13918617636480.0, "grad_norm": 174.89712729263888, "language_loss": 0.78022474, "learning_rate": 2.6583389010327742e-06, "loss": 0.79630882, "num_input_tokens_seen": 146857360, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.2442627, "step": 6843, "time_per_iteration": 2.6498069763183594 }, { "auxiliary_loss_clip": 0.01375027, "auxiliary_loss_mlp": 0.00051615, "balance_loss_clip": 1.19147015, "balance_loss_mlp": 0.04651245, "epoch": 0.41148354125958214, "flos": 64928505219840.0, "grad_norm": 0.7035343193442615, "language_loss": 0.5339672, "learning_rate": 2.6579711312328013e-06, "loss": 0.54823357, "num_input_tokens_seen": 146917055, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.05102539, "step": 6844, "time_per_iteration": 3.1321163177490234 }, { "auxiliary_loss_clip": 0.01391776, "auxiliary_loss_mlp": 0.00235322, "balance_loss_clip": 1.1423595, "balance_loss_mlp": 0.20972812, "epoch": 0.4115436645122501, "flos": 18728779443840.0, "grad_norm": 3.343607194329203, "language_loss": 0.73096937, "learning_rate": 2.6576033364824967e-06, "loss": 0.74724036, "num_input_tokens_seen": 146935215, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.25610352, "step": 6845, "time_per_iteration": 2.688157081604004 }, { "auxiliary_loss_clip": 0.01394422, "auxiliary_loss_mlp": 0.00227827, "balance_loss_clip": 1.14090765, "balance_loss_mlp": 0.20166066, "epoch": 0.41160378776491807, "flos": 16252918790400.0, "grad_norm": 14.113990501804645, "language_loss": 0.77456033, "learning_rate": 2.657235516795808e-06, "loss": 0.79078287, "num_input_tokens_seen": 146951970, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.26171875, "step": 6846, "time_per_iteration": 2.786271810531616 }, { "auxiliary_loss_clip": 0.0139122, "auxiliary_loss_mlp": 0.00231816, "balance_loss_clip": 1.14440072, "balance_loss_mlp": 0.20721135, "epoch": 0.41166391101758604, "flos": 27970031854080.0, "grad_norm": 2.010351342107659, "language_loss": 0.71490949, "learning_rate": 2.6568676721866826e-06, "loss": 0.73113984, "num_input_tokens_seen": 146975615, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.24584961, "step": 6847, "time_per_iteration": 2.8369603157043457 }, { "auxiliary_loss_clip": 0.01382517, "auxiliary_loss_mlp": 0.00235988, "balance_loss_clip": 1.13674903, "balance_loss_mlp": 0.21014363, "epoch": 0.411724034270254, "flos": 34131296764800.0, "grad_norm": 8.071896012635584, "language_loss": 0.77612972, "learning_rate": 2.656499802669069e-06, "loss": 0.79231477, "num_input_tokens_seen": 146998855, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.25842285, "step": 6848, "time_per_iteration": 2.906960964202881 }, { "auxiliary_loss_clip": 0.01400022, "auxiliary_loss_mlp": 0.00088995, "balance_loss_clip": 1.22302866, "balance_loss_mlp": 0.08122301, "epoch": 0.41178415752292197, "flos": 67923670752000.0, "grad_norm": 0.8583809841367851, "language_loss": 0.5612607, "learning_rate": 2.6561319082569174e-06, "loss": 0.57615089, "num_input_tokens_seen": 147062710, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.07763672, "step": 6849, "time_per_iteration": 3.2435426712036133 }, { "auxiliary_loss_clip": 0.01387665, "auxiliary_loss_mlp": 0.00237723, "balance_loss_clip": 1.14508986, "balance_loss_mlp": 0.2122006, "epoch": 0.41184428077558993, "flos": 34313938444800.0, "grad_norm": 34.15032277529469, "language_loss": 0.82197285, "learning_rate": 2.6557639889641783e-06, "loss": 0.83822668, "num_input_tokens_seen": 147086075, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.25524902, "step": 6850, "time_per_iteration": 2.771185874938965 }, { "auxiliary_loss_clip": 0.01387592, "auxiliary_loss_mlp": 0.00230334, "balance_loss_clip": 1.14226627, "balance_loss_mlp": 0.20535985, "epoch": 0.41190440402825795, "flos": 35444118948480.0, "grad_norm": 172.4770081162339, "language_loss": 0.73212814, "learning_rate": 2.6553960448048025e-06, "loss": 0.74830735, "num_input_tokens_seen": 147107590, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.24975586, "step": 6851, "time_per_iteration": 2.7946598529815674 }, { "auxiliary_loss_clip": 0.0139376, "auxiliary_loss_mlp": 0.00247927, "balance_loss_clip": 1.14550495, "balance_loss_mlp": 0.22214177, "epoch": 0.4119645272809259, "flos": 20849879422080.0, "grad_norm": 22.0826243554903, "language_loss": 0.91643381, "learning_rate": 2.655028075792743e-06, "loss": 0.93285072, "num_input_tokens_seen": 147123715, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.2578125, "step": 6852, "time_per_iteration": 2.6828012466430664 }, { "auxiliary_loss_clip": 0.01393613, "auxiliary_loss_mlp": 0.00261798, "balance_loss_clip": 1.14317441, "balance_loss_mlp": 0.23360489, "epoch": 0.4120246505335939, "flos": 27562050201600.0, "grad_norm": 83.78229117855604, "language_loss": 0.86788905, "learning_rate": 2.6546600819419537e-06, "loss": 0.88444316, "num_input_tokens_seen": 147144290, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.28222656, "step": 6853, "time_per_iteration": 2.7457242012023926 }, { "auxiliary_loss_clip": 0.01398907, "auxiliary_loss_mlp": 0.00248002, "balance_loss_clip": 1.14604843, "balance_loss_mlp": 0.22021399, "epoch": 0.41208477378626185, "flos": 37815444046080.0, "grad_norm": 194.8128496835626, "language_loss": 0.74310017, "learning_rate": 2.6542920632663883e-06, "loss": 0.75956929, "num_input_tokens_seen": 147166340, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.27807617, "step": 6854, "time_per_iteration": 2.7854151725769043 }, { "auxiliary_loss_clip": 0.01376686, "auxiliary_loss_mlp": 0.00235256, "balance_loss_clip": 1.13032532, "balance_loss_mlp": 0.2068364, "epoch": 0.4121448970389298, "flos": 23440762402560.0, "grad_norm": 19.853306788143822, "language_loss": 0.89654177, "learning_rate": 2.6539240197800023e-06, "loss": 0.91266119, "num_input_tokens_seen": 147184025, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.28417969, "step": 6855, "time_per_iteration": 2.697131872177124 }, { "auxiliary_loss_clip": 0.01375473, "auxiliary_loss_mlp": 0.00265195, "balance_loss_clip": 1.12829232, "balance_loss_mlp": 0.23786062, "epoch": 0.4122050202915978, "flos": 21325300859520.0, "grad_norm": 2.502196563589322, "language_loss": 0.84900296, "learning_rate": 2.6535559514967517e-06, "loss": 0.86540961, "num_input_tokens_seen": 147202730, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.27282715, "step": 6856, "time_per_iteration": 4.06578803062439 }, { "auxiliary_loss_clip": 0.0138383, "auxiliary_loss_mlp": 0.00244204, "balance_loss_clip": 1.13554108, "balance_loss_mlp": 0.21958686, "epoch": 0.41226514354426574, "flos": 17306286059520.0, "grad_norm": 65.01585731279155, "language_loss": 0.88163078, "learning_rate": 2.6531878584305935e-06, "loss": 0.89791107, "num_input_tokens_seen": 147215315, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.24645996, "step": 6857, "time_per_iteration": 4.119785785675049 }, { "auxiliary_loss_clip": 0.01375555, "auxiliary_loss_mlp": 0.00252225, "balance_loss_clip": 1.13053417, "balance_loss_mlp": 0.2241874, "epoch": 0.4123252667969337, "flos": 17638855107840.0, "grad_norm": 270.6058027295092, "language_loss": 0.78699946, "learning_rate": 2.6528197405954873e-06, "loss": 0.80327725, "num_input_tokens_seen": 147233330, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.28027344, "step": 6858, "time_per_iteration": 2.6855404376983643 }, { "auxiliary_loss_clip": 0.0137771, "auxiliary_loss_mlp": 0.00242702, "balance_loss_clip": 1.12928975, "balance_loss_mlp": 0.2150453, "epoch": 0.4123853900496017, "flos": 46424811375360.0, "grad_norm": 1400.1943083889316, "language_loss": 0.6777904, "learning_rate": 2.652451598005391e-06, "loss": 0.69399446, "num_input_tokens_seen": 147257780, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.27624512, "step": 6859, "time_per_iteration": 2.9023327827453613 }, { "auxiliary_loss_clip": 0.01377419, "auxiliary_loss_mlp": 0.00257916, "balance_loss_clip": 1.12513161, "balance_loss_mlp": 0.22940102, "epoch": 0.41244551330226964, "flos": 17675160779520.0, "grad_norm": 33.10806642056471, "language_loss": 0.83793586, "learning_rate": 2.652083430674264e-06, "loss": 0.85428917, "num_input_tokens_seen": 147276055, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.28540039, "step": 6860, "time_per_iteration": 4.051973581314087 }, { "auxiliary_loss_clip": 0.01365001, "auxiliary_loss_mlp": 0.00254451, "balance_loss_clip": 1.11949801, "balance_loss_mlp": 0.2269018, "epoch": 0.4125056365549376, "flos": 18693730748160.0, "grad_norm": 70.49784343601358, "language_loss": 0.79660231, "learning_rate": 2.651715238616068e-06, "loss": 0.81279683, "num_input_tokens_seen": 147293200, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.27539062, "step": 6861, "time_per_iteration": 2.6070330142974854 }, { "auxiliary_loss_clip": 0.01369028, "auxiliary_loss_mlp": 0.00236809, "balance_loss_clip": 1.12726009, "balance_loss_mlp": 0.21172784, "epoch": 0.41256575980760557, "flos": 17895293280000.0, "grad_norm": 16.91097659552207, "language_loss": 0.87084889, "learning_rate": 2.651347021844765e-06, "loss": 0.88690728, "num_input_tokens_seen": 147310640, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.25109863, "step": 6862, "time_per_iteration": 2.766947031021118 }, { "auxiliary_loss_clip": 0.01376949, "auxiliary_loss_mlp": 0.00256692, "balance_loss_clip": 1.1300869, "balance_loss_mlp": 0.23116903, "epoch": 0.41262588306027354, "flos": 21981316901760.0, "grad_norm": 8.294575461556992, "language_loss": 0.84003568, "learning_rate": 2.650978780374318e-06, "loss": 0.85637212, "num_input_tokens_seen": 147329435, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.25561523, "step": 6863, "time_per_iteration": 2.7042076587677 }, { "auxiliary_loss_clip": 0.01330087, "auxiliary_loss_mlp": 0.00133784, "balance_loss_clip": 1.14601922, "balance_loss_mlp": 0.1255822, "epoch": 0.41268600631294156, "flos": 53350006740480.0, "grad_norm": 0.7140864231224561, "language_loss": 0.52524936, "learning_rate": 2.650610514218691e-06, "loss": 0.53988808, "num_input_tokens_seen": 147385805, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.08203125, "step": 6864, "time_per_iteration": 3.1213865280151367 }, { "auxiliary_loss_clip": 0.01374198, "auxiliary_loss_mlp": 0.00267969, "balance_loss_clip": 1.1244235, "balance_loss_mlp": 0.23921563, "epoch": 0.4127461295656095, "flos": 24385356311040.0, "grad_norm": 61.405046451703825, "language_loss": 0.79265809, "learning_rate": 2.6502422233918468e-06, "loss": 0.80907977, "num_input_tokens_seen": 147405160, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.28759766, "step": 6865, "time_per_iteration": 2.7147769927978516 }, { "auxiliary_loss_clip": 0.01325251, "auxiliary_loss_mlp": 0.00164871, "balance_loss_clip": 1.13663197, "balance_loss_mlp": 0.15619248, "epoch": 0.4128062528182775, "flos": 71705242696320.0, "grad_norm": 0.9063399141356253, "language_loss": 0.65737087, "learning_rate": 2.649873907907753e-06, "loss": 0.67227209, "num_input_tokens_seen": 147460245, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.08691406, "step": 6866, "time_per_iteration": 4.487993478775024 }, { "auxiliary_loss_clip": 0.01365727, "auxiliary_loss_mlp": 0.00262338, "balance_loss_clip": 1.11700201, "balance_loss_mlp": 0.23439582, "epoch": 0.41286637607094545, "flos": 17849111368320.0, "grad_norm": 21.742137038483026, "language_loss": 0.90596306, "learning_rate": 2.649505567780375e-06, "loss": 0.92224371, "num_input_tokens_seen": 147476200, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.27941895, "step": 6867, "time_per_iteration": 2.7160019874572754 }, { "auxiliary_loss_clip": 0.01369456, "auxiliary_loss_mlp": 0.00255284, "balance_loss_clip": 1.11826217, "balance_loss_mlp": 0.22858125, "epoch": 0.4129264993236134, "flos": 25549544016000.0, "grad_norm": 6.9580949431459915, "language_loss": 0.86797035, "learning_rate": 2.6491372030236815e-06, "loss": 0.88421774, "num_input_tokens_seen": 147494315, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.26696777, "step": 6868, "time_per_iteration": 2.743624210357666 }, { "auxiliary_loss_clip": 0.01316827, "auxiliary_loss_mlp": 0.00079728, "balance_loss_clip": 1.14182758, "balance_loss_mlp": 0.07381497, "epoch": 0.4129866225762814, "flos": 65414446364160.0, "grad_norm": 0.8663249964161274, "language_loss": 0.57348049, "learning_rate": 2.64876881365164e-06, "loss": 0.58744597, "num_input_tokens_seen": 147543665, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.05908203, "step": 6869, "time_per_iteration": 2.9005041122436523 }, { "auxiliary_loss_clip": 0.01364384, "auxiliary_loss_mlp": 0.0027761, "balance_loss_clip": 1.11902881, "balance_loss_mlp": 0.25009698, "epoch": 0.41304674582894935, "flos": 28876991287680.0, "grad_norm": 11.427010929111855, "language_loss": 0.81383312, "learning_rate": 2.64840039967822e-06, "loss": 0.83025312, "num_input_tokens_seen": 147564870, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.27526855, "step": 6870, "time_per_iteration": 2.6914758682250977 }, { "auxiliary_loss_clip": 0.01363545, "auxiliary_loss_mlp": 0.00263336, "balance_loss_clip": 1.11613846, "balance_loss_mlp": 0.23434404, "epoch": 0.4131068690816173, "flos": 22891975436160.0, "grad_norm": 121.48837658952212, "language_loss": 0.88649124, "learning_rate": 2.6480319611173912e-06, "loss": 0.90276003, "num_input_tokens_seen": 147584840, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.29003906, "step": 6871, "time_per_iteration": 2.67029070854187 }, { "auxiliary_loss_clip": 0.01375895, "auxiliary_loss_mlp": 0.00246737, "balance_loss_clip": 1.12967157, "balance_loss_mlp": 0.22052297, "epoch": 0.4131669923342853, "flos": 26065185707520.0, "grad_norm": 4991.675422938932, "language_loss": 0.76976281, "learning_rate": 2.6476634979831263e-06, "loss": 0.78598917, "num_input_tokens_seen": 147604635, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.26220703, "step": 6872, "time_per_iteration": 2.6754255294799805 }, { "auxiliary_loss_clip": 0.01360708, "auxiliary_loss_mlp": 0.002428, "balance_loss_clip": 1.11754704, "balance_loss_mlp": 0.21399873, "epoch": 0.41322711558695324, "flos": 19244564789760.0, "grad_norm": 10.066074504951425, "language_loss": 0.85081756, "learning_rate": 2.6472950102893964e-06, "loss": 0.86685264, "num_input_tokens_seen": 147620700, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.28820801, "step": 6873, "time_per_iteration": 2.681732416152954 }, { "auxiliary_loss_clip": 0.01375686, "auxiliary_loss_mlp": 0.00274958, "balance_loss_clip": 1.12345064, "balance_loss_mlp": 0.24341479, "epoch": 0.4132872388396212, "flos": 22674464628480.0, "grad_norm": 249.3106096018696, "language_loss": 0.90470809, "learning_rate": 2.6469264980501746e-06, "loss": 0.92121452, "num_input_tokens_seen": 147639490, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.31567383, "step": 6874, "time_per_iteration": 2.6835896968841553 }, { "auxiliary_loss_clip": 0.01372468, "auxiliary_loss_mlp": 0.00276642, "balance_loss_clip": 1.1265223, "balance_loss_mlp": 0.24813868, "epoch": 0.4133473620922892, "flos": 20150195420160.0, "grad_norm": 16.55573828341059, "language_loss": 0.79203951, "learning_rate": 2.646557961279436e-06, "loss": 0.80853057, "num_input_tokens_seen": 147657205, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.28503418, "step": 6875, "time_per_iteration": 2.6380765438079834 }, { "auxiliary_loss_clip": 0.01365801, "auxiliary_loss_mlp": 0.00255205, "balance_loss_clip": 1.12616348, "balance_loss_mlp": 0.22975381, "epoch": 0.41340748534495714, "flos": 24242755317120.0, "grad_norm": 3.895502565426921, "language_loss": 0.86526716, "learning_rate": 2.646189399991154e-06, "loss": 0.88147724, "num_input_tokens_seen": 147677005, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.2545166, "step": 6876, "time_per_iteration": 2.6835334300994873 }, { "auxiliary_loss_clip": 0.01377222, "auxiliary_loss_mlp": 0.00299211, "balance_loss_clip": 1.12901592, "balance_loss_mlp": 0.26931328, "epoch": 0.41346760859762516, "flos": 14392171566720.0, "grad_norm": 7.756798175932521, "language_loss": 0.7719788, "learning_rate": 2.6458208141993048e-06, "loss": 0.78874314, "num_input_tokens_seen": 147693435, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.29870605, "step": 6877, "time_per_iteration": 2.6251327991485596 }, { "auxiliary_loss_clip": 0.0137954, "auxiliary_loss_mlp": 0.00262634, "balance_loss_clip": 1.13552129, "balance_loss_mlp": 0.23620489, "epoch": 0.4135277318502931, "flos": 22492002516480.0, "grad_norm": 8.781838527347082, "language_loss": 0.83666706, "learning_rate": 2.6454522039178668e-06, "loss": 0.85308874, "num_input_tokens_seen": 147714000, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.26452637, "step": 6878, "time_per_iteration": 2.6512234210968018 }, { "auxiliary_loss_clip": 0.01373161, "auxiliary_loss_mlp": 0.00265212, "balance_loss_clip": 1.12665224, "balance_loss_mlp": 0.23787682, "epoch": 0.4135878551029611, "flos": 22418744728320.0, "grad_norm": 10.180778754940595, "language_loss": 0.88431752, "learning_rate": 2.6450835691608154e-06, "loss": 0.90070128, "num_input_tokens_seen": 147731010, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.27294922, "step": 6879, "time_per_iteration": 2.738079071044922 }, { "auxiliary_loss_clip": 0.01382991, "auxiliary_loss_mlp": 0.00270991, "balance_loss_clip": 1.1371423, "balance_loss_mlp": 0.24295273, "epoch": 0.41364797835562905, "flos": 27053232094080.0, "grad_norm": 322.00457011732294, "language_loss": 0.91000116, "learning_rate": 2.6447149099421315e-06, "loss": 0.92654097, "num_input_tokens_seen": 147750880, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.28063965, "step": 6880, "time_per_iteration": 2.73940110206604 }, { "auxiliary_loss_clip": 0.01389934, "auxiliary_loss_mlp": 0.00292547, "balance_loss_clip": 1.14084363, "balance_loss_mlp": 0.26307893, "epoch": 0.413708101608297, "flos": 22967603521920.0, "grad_norm": 3.2722765019582885, "language_loss": 0.77203977, "learning_rate": 2.6443462262757927e-06, "loss": 0.78886461, "num_input_tokens_seen": 147771360, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.29467773, "step": 6881, "time_per_iteration": 2.689289093017578 }, { "auxiliary_loss_clip": 0.01384324, "auxiliary_loss_mlp": 0.0029237, "balance_loss_clip": 1.1444329, "balance_loss_mlp": 0.26472518, "epoch": 0.413768224860965, "flos": 13333991875200.0, "grad_norm": 6.447575893323677, "language_loss": 0.87007403, "learning_rate": 2.6439775181757805e-06, "loss": 0.88684094, "num_input_tokens_seen": 147787440, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.27661133, "step": 6882, "time_per_iteration": 2.7379395961761475 }, { "auxiliary_loss_clip": 0.01396516, "auxiliary_loss_mlp": 0.00299916, "balance_loss_clip": 1.14301074, "balance_loss_mlp": 0.26746699, "epoch": 0.41382834811363295, "flos": 20813968800000.0, "grad_norm": 57.01543474718828, "language_loss": 0.81385452, "learning_rate": 2.643608785656077e-06, "loss": 0.83081883, "num_input_tokens_seen": 147805720, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.32446289, "step": 6883, "time_per_iteration": 2.705965757369995 }, { "auxiliary_loss_clip": 0.01386805, "auxiliary_loss_mlp": 0.00303944, "balance_loss_clip": 1.13799429, "balance_loss_mlp": 0.27337891, "epoch": 0.4138884713663009, "flos": 20667130001280.0, "grad_norm": 16.640973123443906, "language_loss": 0.80755293, "learning_rate": 2.643240028730663e-06, "loss": 0.82446039, "num_input_tokens_seen": 147824605, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.3059082, "step": 6884, "time_per_iteration": 2.763960123062134 }, { "auxiliary_loss_clip": 0.01387997, "auxiliary_loss_mlp": 0.0030527, "balance_loss_clip": 1.13740361, "balance_loss_mlp": 0.27694577, "epoch": 0.4139485946189689, "flos": 29056616225280.0, "grad_norm": 10.374167131057625, "language_loss": 0.80864847, "learning_rate": 2.642871247413523e-06, "loss": 0.82558113, "num_input_tokens_seen": 147845445, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.28295898, "step": 6885, "time_per_iteration": 2.720151662826538 }, { "auxiliary_loss_clip": 0.01391806, "auxiliary_loss_mlp": 0.00293357, "balance_loss_clip": 1.14549804, "balance_loss_mlp": 0.26434177, "epoch": 0.41400871787163684, "flos": 24425720219520.0, "grad_norm": 123.72668436118754, "language_loss": 0.76831591, "learning_rate": 2.6425024417186414e-06, "loss": 0.78516752, "num_input_tokens_seen": 147865580, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.2902832, "step": 6886, "time_per_iteration": 2.7026567459106445 }, { "auxiliary_loss_clip": 0.01400741, "auxiliary_loss_mlp": 0.00328908, "balance_loss_clip": 1.15302086, "balance_loss_mlp": 0.29933217, "epoch": 0.4140688411243048, "flos": 19464050845440.0, "grad_norm": 16.52039714209627, "language_loss": 0.82146859, "learning_rate": 2.642133611660002e-06, "loss": 0.83876514, "num_input_tokens_seen": 147885230, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.29553223, "step": 6887, "time_per_iteration": 2.67100191116333 }, { "auxiliary_loss_clip": 0.01387094, "auxiliary_loss_mlp": 0.00281285, "balance_loss_clip": 1.14418244, "balance_loss_mlp": 0.25400996, "epoch": 0.4141289643769728, "flos": 19313656600320.0, "grad_norm": 4.175942878105828, "language_loss": 0.78027201, "learning_rate": 2.641764757251592e-06, "loss": 0.7969557, "num_input_tokens_seen": 147903035, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.27307129, "step": 6888, "time_per_iteration": 2.6471431255340576 }, { "auxiliary_loss_clip": 0.01395771, "auxiliary_loss_mlp": 0.00307472, "balance_loss_clip": 1.14742494, "balance_loss_mlp": 0.27421248, "epoch": 0.41418908762964074, "flos": 16726903683840.0, "grad_norm": 15.773086233838354, "language_loss": 0.81180644, "learning_rate": 2.6413958785073976e-06, "loss": 0.82883894, "num_input_tokens_seen": 147918745, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.33276367, "step": 6889, "time_per_iteration": 2.6389358043670654 }, { "auxiliary_loss_clip": 0.0140254, "auxiliary_loss_mlp": 0.00295556, "balance_loss_clip": 1.15922737, "balance_loss_mlp": 0.26736349, "epoch": 0.41424921088230876, "flos": 25296840858240.0, "grad_norm": 90.5282382617966, "language_loss": 0.84150988, "learning_rate": 2.6410269754414074e-06, "loss": 0.85849082, "num_input_tokens_seen": 147938265, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.28186035, "step": 6890, "time_per_iteration": 2.730959415435791 }, { "auxiliary_loss_clip": 0.01409133, "auxiliary_loss_mlp": 0.00315238, "balance_loss_clip": 1.16443098, "balance_loss_mlp": 0.28476763, "epoch": 0.4143093341349767, "flos": 20960520289920.0, "grad_norm": 7.231242666209297, "language_loss": 0.82899022, "learning_rate": 2.6406580480676113e-06, "loss": 0.8462339, "num_input_tokens_seen": 147957320, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.30480957, "step": 6891, "time_per_iteration": 2.6533048152923584 }, { "auxiliary_loss_clip": 0.01417454, "auxiliary_loss_mlp": 0.0033727, "balance_loss_clip": 1.16275692, "balance_loss_mlp": 0.30186486, "epoch": 0.4143694573876447, "flos": 22017694400640.0, "grad_norm": 57.03982148594572, "language_loss": 0.91406876, "learning_rate": 2.6402890963999963e-06, "loss": 0.93161595, "num_input_tokens_seen": 147977045, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.35424805, "step": 6892, "time_per_iteration": 2.830307722091675 }, { "auxiliary_loss_clip": 0.01412734, "auxiliary_loss_mlp": 0.0031509, "balance_loss_clip": 1.16872597, "balance_loss_mlp": 0.28555, "epoch": 0.41442958064031266, "flos": 35697396723840.0, "grad_norm": 4.3836858450997385, "language_loss": 0.75051278, "learning_rate": 2.6399201204525554e-06, "loss": 0.76779097, "num_input_tokens_seen": 147996905, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.29528809, "step": 6893, "time_per_iteration": 2.8227481842041016 }, { "auxiliary_loss_clip": 0.01410084, "auxiliary_loss_mlp": 0.00271688, "balance_loss_clip": 1.16535878, "balance_loss_mlp": 0.24455574, "epoch": 0.4144897038929806, "flos": 28293766156800.0, "grad_norm": 7.945131252390588, "language_loss": 0.79575306, "learning_rate": 2.639551120239279e-06, "loss": 0.81257081, "num_input_tokens_seen": 148017875, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.27111816, "step": 6894, "time_per_iteration": 2.765350818634033 }, { "auxiliary_loss_clip": 0.01409427, "auxiliary_loss_mlp": 0.00298748, "balance_loss_clip": 1.16187751, "balance_loss_mlp": 0.27004203, "epoch": 0.4145498271456486, "flos": 11648093080320.0, "grad_norm": 47.98093130562892, "language_loss": 0.73765868, "learning_rate": 2.63918209577416e-06, "loss": 0.75474042, "num_input_tokens_seen": 148032300, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.2869873, "step": 6895, "time_per_iteration": 2.6956286430358887 }, { "auxiliary_loss_clip": 0.01425207, "auxiliary_loss_mlp": 0.00284777, "balance_loss_clip": 1.1766603, "balance_loss_mlp": 0.25601187, "epoch": 0.41460995039831655, "flos": 27235622378880.0, "grad_norm": 35.05907851796279, "language_loss": 0.77858269, "learning_rate": 2.638813047071192e-06, "loss": 0.79568255, "num_input_tokens_seen": 148053260, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.28759766, "step": 6896, "time_per_iteration": 2.706022262573242 }, { "auxiliary_loss_clip": 0.0141703, "auxiliary_loss_mlp": 0.00288178, "balance_loss_clip": 1.16847444, "balance_loss_mlp": 0.25844732, "epoch": 0.4146700736509845, "flos": 25922369232000.0, "grad_norm": 89.9941491827171, "language_loss": 0.8050226, "learning_rate": 2.6384439741443696e-06, "loss": 0.82207477, "num_input_tokens_seen": 148072965, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.29736328, "step": 6897, "time_per_iteration": 2.7457377910614014 }, { "auxiliary_loss_clip": 0.01418136, "auxiliary_loss_mlp": 0.002664, "balance_loss_clip": 1.16742969, "balance_loss_mlp": 0.23788533, "epoch": 0.4147301969036525, "flos": 26833243248000.0, "grad_norm": 16.399679196959884, "language_loss": 0.89235574, "learning_rate": 2.6380748770076873e-06, "loss": 0.90920109, "num_input_tokens_seen": 148093240, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.28515625, "step": 6898, "time_per_iteration": 4.137167930603027 }, { "auxiliary_loss_clip": 0.01419264, "auxiliary_loss_mlp": 0.00301022, "balance_loss_clip": 1.17082787, "balance_loss_mlp": 0.27222151, "epoch": 0.41479032015632045, "flos": 20298291194880.0, "grad_norm": 4.111884843117439, "language_loss": 0.82652336, "learning_rate": 2.6377057556751416e-06, "loss": 0.84372622, "num_input_tokens_seen": 148110925, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.28808594, "step": 6899, "time_per_iteration": 2.627286911010742 }, { "auxiliary_loss_clip": 0.01430901, "auxiliary_loss_mlp": 0.00302707, "balance_loss_clip": 1.17804313, "balance_loss_mlp": 0.27290505, "epoch": 0.4148504434089884, "flos": 25264988472960.0, "grad_norm": 7.717498037437445, "language_loss": 0.84085858, "learning_rate": 2.6373366101607306e-06, "loss": 0.85819465, "num_input_tokens_seen": 148130670, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.29833984, "step": 6900, "time_per_iteration": 4.130227565765381 }, { "auxiliary_loss_clip": 0.0142158, "auxiliary_loss_mlp": 0.00290893, "balance_loss_clip": 1.17300439, "balance_loss_mlp": 0.26070929, "epoch": 0.4149105666616564, "flos": 12822300679680.0, "grad_norm": 9.539217033895834, "language_loss": 0.89910436, "learning_rate": 2.6369674404784503e-06, "loss": 0.91622913, "num_input_tokens_seen": 148148350, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.30200195, "step": 6901, "time_per_iteration": 2.6954026222229004 }, { "auxiliary_loss_clip": 0.01417709, "auxiliary_loss_mlp": 0.00296057, "balance_loss_clip": 1.17033899, "balance_loss_mlp": 0.26898485, "epoch": 0.41497068991432434, "flos": 16763891713920.0, "grad_norm": 86.0779626034437, "language_loss": 0.76113397, "learning_rate": 2.6365982466423014e-06, "loss": 0.77827168, "num_input_tokens_seen": 148167550, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.27050781, "step": 6902, "time_per_iteration": 2.6913087368011475 }, { "auxiliary_loss_clip": 0.01421343, "auxiliary_loss_mlp": 0.00319199, "balance_loss_clip": 1.17376876, "balance_loss_mlp": 0.2895399, "epoch": 0.4150308131669923, "flos": 18000906243840.0, "grad_norm": 9.974795896836959, "language_loss": 0.89724463, "learning_rate": 2.6362290286662834e-06, "loss": 0.91465002, "num_input_tokens_seen": 148184740, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.29663086, "step": 6903, "time_per_iteration": 4.060417175292969 }, { "auxiliary_loss_clip": 0.01430007, "auxiliary_loss_mlp": 0.00334914, "balance_loss_clip": 1.18031871, "balance_loss_mlp": 0.30325222, "epoch": 0.41509093641966033, "flos": 30044770352640.0, "grad_norm": 29.867990266370757, "language_loss": 0.76617265, "learning_rate": 2.6358597865643968e-06, "loss": 0.78382194, "num_input_tokens_seen": 148204605, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.31665039, "step": 6904, "time_per_iteration": 2.7394914627075195 }, { "auxiliary_loss_clip": 0.01419672, "auxiliary_loss_mlp": 0.00308027, "balance_loss_clip": 1.16605687, "balance_loss_mlp": 0.2774142, "epoch": 0.4151510596723283, "flos": 24279994742400.0, "grad_norm": 56.50463090246851, "language_loss": 0.8424046, "learning_rate": 2.635490520350643e-06, "loss": 0.85968155, "num_input_tokens_seen": 148224675, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.30615234, "step": 6905, "time_per_iteration": 2.7467727661132812 }, { "auxiliary_loss_clip": 0.01424329, "auxiliary_loss_mlp": 0.00306072, "balance_loss_clip": 1.17144489, "balance_loss_mlp": 0.27588773, "epoch": 0.41521118292499626, "flos": 23476206147840.0, "grad_norm": 6.4955057855308915, "language_loss": 0.75759661, "learning_rate": 2.635121230039025e-06, "loss": 0.77490056, "num_input_tokens_seen": 148243375, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.30175781, "step": 6906, "time_per_iteration": 2.7187771797180176 }, { "auxiliary_loss_clip": 0.01413975, "auxiliary_loss_mlp": 0.00309773, "balance_loss_clip": 1.16665006, "balance_loss_mlp": 0.2817831, "epoch": 0.4152713061776642, "flos": 22125498094080.0, "grad_norm": 19.465344134684504, "language_loss": 0.76674199, "learning_rate": 2.6347519156435467e-06, "loss": 0.78397954, "num_input_tokens_seen": 148261140, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.27966309, "step": 6907, "time_per_iteration": 2.6816256046295166 }, { "auxiliary_loss_clip": 0.01413873, "auxiliary_loss_mlp": 0.00294103, "balance_loss_clip": 1.16547084, "balance_loss_mlp": 0.26483703, "epoch": 0.4153314294303322, "flos": 21251396626560.0, "grad_norm": 5.835388374345328, "language_loss": 0.84120166, "learning_rate": 2.6343825771782123e-06, "loss": 0.85828149, "num_input_tokens_seen": 148279655, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.29260254, "step": 6908, "time_per_iteration": 2.680659294128418 }, { "auxiliary_loss_clip": 0.01480656, "auxiliary_loss_mlp": 0.00078762, "balance_loss_clip": 1.27692783, "balance_loss_mlp": 0.07284941, "epoch": 0.41539155268300015, "flos": 57920681594880.0, "grad_norm": 0.8208912612936905, "language_loss": 0.64511979, "learning_rate": 2.634013214657026e-06, "loss": 0.66071403, "num_input_tokens_seen": 148339005, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.05908203, "step": 6909, "time_per_iteration": 4.5079309940338135 }, { "auxiliary_loss_clip": 0.01404949, "auxiliary_loss_mlp": 0.002628, "balance_loss_clip": 1.15990829, "balance_loss_mlp": 0.23650268, "epoch": 0.4154516759356681, "flos": 21903677654400.0, "grad_norm": 407.25151211267945, "language_loss": 0.92874128, "learning_rate": 2.633643828093996e-06, "loss": 0.94541872, "num_input_tokens_seen": 148358715, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.26269531, "step": 6910, "time_per_iteration": 2.665701389312744 }, { "auxiliary_loss_clip": 0.01475224, "auxiliary_loss_mlp": 0.00094293, "balance_loss_clip": 1.27256799, "balance_loss_mlp": 0.08776008, "epoch": 0.4155117991883361, "flos": 67833677226240.0, "grad_norm": 0.8206379664202867, "language_loss": 0.61926419, "learning_rate": 2.633274417503128e-06, "loss": 0.6349594, "num_input_tokens_seen": 148417280, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.06542969, "step": 6911, "time_per_iteration": 3.1090869903564453 }, { "auxiliary_loss_clip": 0.01427063, "auxiliary_loss_mlp": 0.00304637, "balance_loss_clip": 1.17120159, "balance_loss_mlp": 0.27423894, "epoch": 0.41557192244100405, "flos": 14282679934080.0, "grad_norm": 160.56101352808557, "language_loss": 0.95819759, "learning_rate": 2.6329049828984312e-06, "loss": 0.97551465, "num_input_tokens_seen": 148432610, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.30419922, "step": 6912, "time_per_iteration": 2.5963642597198486 }, { "auxiliary_loss_clip": 0.01396684, "auxiliary_loss_mlp": 0.00279699, "balance_loss_clip": 1.14974916, "balance_loss_mlp": 0.25312755, "epoch": 0.415632045693672, "flos": 24461954064000.0, "grad_norm": 16.481301082363263, "language_loss": 0.72058785, "learning_rate": 2.632535524293914e-06, "loss": 0.73735166, "num_input_tokens_seen": 148451510, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.26574707, "step": 6913, "time_per_iteration": 2.6894800662994385 }, { "auxiliary_loss_clip": 0.01406694, "auxiliary_loss_mlp": 0.00280085, "balance_loss_clip": 1.16217899, "balance_loss_mlp": 0.25342953, "epoch": 0.41569216894634, "flos": 20115290378880.0, "grad_norm": 57.609545133359646, "language_loss": 0.82184291, "learning_rate": 2.632166041703586e-06, "loss": 0.83871067, "num_input_tokens_seen": 148469945, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.26660156, "step": 6914, "time_per_iteration": 2.6344432830810547 }, { "auxiliary_loss_clip": 0.01399325, "auxiliary_loss_mlp": 0.00310993, "balance_loss_clip": 1.15152073, "balance_loss_mlp": 0.28104812, "epoch": 0.41575229219900794, "flos": 23798827128960.0, "grad_norm": 5.203095873600688, "language_loss": 0.91901076, "learning_rate": 2.631796535141458e-06, "loss": 0.93611395, "num_input_tokens_seen": 148486655, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.29968262, "step": 6915, "time_per_iteration": 2.6857364177703857 }, { "auxiliary_loss_clip": 0.01401701, "auxiliary_loss_mlp": 0.00283399, "balance_loss_clip": 1.15303254, "balance_loss_mlp": 0.25638592, "epoch": 0.4158124154516759, "flos": 23108229267840.0, "grad_norm": 4.199973616042434, "language_loss": 0.79899567, "learning_rate": 2.6314270046215426e-06, "loss": 0.81584668, "num_input_tokens_seen": 148505035, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.27038574, "step": 6916, "time_per_iteration": 2.6822328567504883 }, { "auxiliary_loss_clip": 0.01414249, "auxiliary_loss_mlp": 0.00265951, "balance_loss_clip": 1.1622622, "balance_loss_mlp": 0.23971298, "epoch": 0.41587253870434393, "flos": 24242970798720.0, "grad_norm": 48.329791837155696, "language_loss": 0.7876575, "learning_rate": 2.631057450157852e-06, "loss": 0.80445951, "num_input_tokens_seen": 148525575, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.26269531, "step": 6917, "time_per_iteration": 2.6766164302825928 }, { "auxiliary_loss_clip": 0.01400876, "auxiliary_loss_mlp": 0.00273781, "balance_loss_clip": 1.15534019, "balance_loss_mlp": 0.24608861, "epoch": 0.4159326619570119, "flos": 23881602021120.0, "grad_norm": 21.195004444646862, "language_loss": 0.85482609, "learning_rate": 2.6306878717643988e-06, "loss": 0.87157261, "num_input_tokens_seen": 148547270, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.27661133, "step": 6918, "time_per_iteration": 2.7356514930725098 }, { "auxiliary_loss_clip": 0.01391594, "auxiliary_loss_mlp": 0.00264522, "balance_loss_clip": 1.14303803, "balance_loss_mlp": 0.23643655, "epoch": 0.41599278520967986, "flos": 40626531354240.0, "grad_norm": 4756.340074586925, "language_loss": 0.75695086, "learning_rate": 2.6303182694551995e-06, "loss": 0.77351201, "num_input_tokens_seen": 148572100, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.28112793, "step": 6919, "time_per_iteration": 2.8362205028533936 }, { "auxiliary_loss_clip": 0.01394121, "auxiliary_loss_mlp": 0.0029232, "balance_loss_clip": 1.14736223, "balance_loss_mlp": 0.26385325, "epoch": 0.4160529084623478, "flos": 18222942165120.0, "grad_norm": 12.61788047787155, "language_loss": 0.90149683, "learning_rate": 2.6299486432442677e-06, "loss": 0.91836131, "num_input_tokens_seen": 148591245, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.28442383, "step": 6920, "time_per_iteration": 2.624249219894409 }, { "auxiliary_loss_clip": 0.01392386, "auxiliary_loss_mlp": 0.00288514, "balance_loss_clip": 1.14725995, "balance_loss_mlp": 0.25968921, "epoch": 0.4161130317150158, "flos": 13661963982720.0, "grad_norm": 298.5545293473413, "language_loss": 0.75963706, "learning_rate": 2.6295789931456195e-06, "loss": 0.77644604, "num_input_tokens_seen": 148607980, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.28808594, "step": 6921, "time_per_iteration": 2.638244390487671 }, { "auxiliary_loss_clip": 0.01396201, "auxiliary_loss_mlp": 0.002745, "balance_loss_clip": 1.15310454, "balance_loss_mlp": 0.24639031, "epoch": 0.41617315496768376, "flos": 16178511767040.0, "grad_norm": 32.51561619184545, "language_loss": 0.87114376, "learning_rate": 2.629209319173274e-06, "loss": 0.88785076, "num_input_tokens_seen": 148624490, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.28112793, "step": 6922, "time_per_iteration": 2.668532371520996 }, { "auxiliary_loss_clip": 0.01400967, "auxiliary_loss_mlp": 0.00284341, "balance_loss_clip": 1.15360737, "balance_loss_mlp": 0.25706571, "epoch": 0.4162332782203517, "flos": 26213317395840.0, "grad_norm": 20.136738606444762, "language_loss": 0.72727156, "learning_rate": 2.628839621341247e-06, "loss": 0.74412465, "num_input_tokens_seen": 148646490, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.27270508, "step": 6923, "time_per_iteration": 2.710132360458374 }, { "auxiliary_loss_clip": 0.0140425, "auxiliary_loss_mlp": 0.00282596, "balance_loss_clip": 1.15486956, "balance_loss_mlp": 0.25369972, "epoch": 0.4162934014730197, "flos": 28183987215360.0, "grad_norm": 15.36921349991564, "language_loss": 0.83700281, "learning_rate": 2.6284698996635593e-06, "loss": 0.85387135, "num_input_tokens_seen": 148668580, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.28881836, "step": 6924, "time_per_iteration": 2.735969066619873 }, { "auxiliary_loss_clip": 0.01382983, "auxiliary_loss_mlp": 0.00286095, "balance_loss_clip": 1.13617921, "balance_loss_mlp": 0.25844994, "epoch": 0.41635352472568765, "flos": 19865316654720.0, "grad_norm": 6.09792317195744, "language_loss": 0.81456631, "learning_rate": 2.62810015415423e-06, "loss": 0.8312571, "num_input_tokens_seen": 148688410, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.27661133, "step": 6925, "time_per_iteration": 2.6510465145111084 }, { "auxiliary_loss_clip": 0.01381665, "auxiliary_loss_mlp": 0.00276589, "balance_loss_clip": 1.13158774, "balance_loss_mlp": 0.24838394, "epoch": 0.4164136479783556, "flos": 14935356011520.0, "grad_norm": 25.457186437008172, "language_loss": 0.89619493, "learning_rate": 2.6277303848272792e-06, "loss": 0.91277748, "num_input_tokens_seen": 148704855, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.28234863, "step": 6926, "time_per_iteration": 2.634174108505249 }, { "auxiliary_loss_clip": 0.01381164, "auxiliary_loss_mlp": 0.00289522, "balance_loss_clip": 1.13645625, "balance_loss_mlp": 0.2618421, "epoch": 0.4164737712310236, "flos": 21757593041280.0, "grad_norm": 11.798890301882237, "language_loss": 0.91444969, "learning_rate": 2.6273605916967302e-06, "loss": 0.93115658, "num_input_tokens_seen": 148723065, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.27697754, "step": 6927, "time_per_iteration": 2.678067207336426 }, { "auxiliary_loss_clip": 0.01388844, "auxiliary_loss_mlp": 0.00282082, "balance_loss_clip": 1.14089894, "balance_loss_mlp": 0.254807, "epoch": 0.41653389448369155, "flos": 20740136394240.0, "grad_norm": 63.6932402658284, "language_loss": 0.84077799, "learning_rate": 2.626990774776604e-06, "loss": 0.8574872, "num_input_tokens_seen": 148741780, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.27282715, "step": 6928, "time_per_iteration": 2.675041437149048 }, { "auxiliary_loss_clip": 0.01395655, "auxiliary_loss_mlp": 0.00277244, "balance_loss_clip": 1.14684463, "balance_loss_mlp": 0.24876487, "epoch": 0.4165940177363595, "flos": 24972891073920.0, "grad_norm": 2.6898988410614484, "language_loss": 0.85603917, "learning_rate": 2.6266209340809254e-06, "loss": 0.8727681, "num_input_tokens_seen": 148759795, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.28466797, "step": 6929, "time_per_iteration": 2.6775903701782227 }, { "auxiliary_loss_clip": 0.0139315, "auxiliary_loss_mlp": 0.00271433, "balance_loss_clip": 1.14514947, "balance_loss_mlp": 0.24439679, "epoch": 0.41665414098902753, "flos": 20521727746560.0, "grad_norm": 26.891299676050043, "language_loss": 0.7855249, "learning_rate": 2.6262510696237182e-06, "loss": 0.80217069, "num_input_tokens_seen": 148778680, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.27050781, "step": 6930, "time_per_iteration": 2.768950939178467 }, { "auxiliary_loss_clip": 0.01394668, "auxiliary_loss_mlp": 0.00258956, "balance_loss_clip": 1.14597368, "balance_loss_mlp": 0.23128766, "epoch": 0.4167142642416955, "flos": 19682926369920.0, "grad_norm": 121.69783760742101, "language_loss": 0.89496481, "learning_rate": 2.625881181419007e-06, "loss": 0.91150105, "num_input_tokens_seen": 148796470, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.27661133, "step": 6931, "time_per_iteration": 2.6845648288726807 }, { "auxiliary_loss_clip": 0.01380372, "auxiliary_loss_mlp": 0.00293841, "balance_loss_clip": 1.13205576, "balance_loss_mlp": 0.26461077, "epoch": 0.41677438749436346, "flos": 23763742519680.0, "grad_norm": 11.896540857713473, "language_loss": 0.83901042, "learning_rate": 2.6255112694808193e-06, "loss": 0.85575259, "num_input_tokens_seen": 148815300, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.29248047, "step": 6932, "time_per_iteration": 2.6822924613952637 }, { "auxiliary_loss_clip": 0.01390834, "auxiliary_loss_mlp": 0.00261153, "balance_loss_clip": 1.14037979, "balance_loss_mlp": 0.23336571, "epoch": 0.41683451074703143, "flos": 30410053712640.0, "grad_norm": 3.3979900530566614, "language_loss": 0.8957845, "learning_rate": 2.6251413338231813e-06, "loss": 0.91230434, "num_input_tokens_seen": 148834315, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.27807617, "step": 6933, "time_per_iteration": 2.7461166381835938 }, { "auxiliary_loss_clip": 0.0140444, "auxiliary_loss_mlp": 0.00301029, "balance_loss_clip": 1.1514914, "balance_loss_mlp": 0.27070186, "epoch": 0.4168946339996994, "flos": 21506757390720.0, "grad_norm": 7.8485192326911575, "language_loss": 0.85273981, "learning_rate": 2.624771374460121e-06, "loss": 0.86979449, "num_input_tokens_seen": 148852420, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.3034668, "step": 6934, "time_per_iteration": 2.716526746749878 }, { "auxiliary_loss_clip": 0.01402213, "auxiliary_loss_mlp": 0.00276557, "balance_loss_clip": 1.15552425, "balance_loss_mlp": 0.24751809, "epoch": 0.41695475725236736, "flos": 17638675539840.0, "grad_norm": 3.6413087545155087, "language_loss": 0.71973777, "learning_rate": 2.624401391405668e-06, "loss": 0.73652542, "num_input_tokens_seen": 148869305, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.29077148, "step": 6935, "time_per_iteration": 2.622572898864746 }, { "auxiliary_loss_clip": 0.01398195, "auxiliary_loss_mlp": 0.0027434, "balance_loss_clip": 1.14978862, "balance_loss_mlp": 0.2485548, "epoch": 0.4170148805050353, "flos": 15668903560320.0, "grad_norm": 21.588169750366593, "language_loss": 0.83757108, "learning_rate": 2.6240313846738513e-06, "loss": 0.85429645, "num_input_tokens_seen": 148886395, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.25793457, "step": 6936, "time_per_iteration": 2.6314873695373535 }, { "auxiliary_loss_clip": 0.01404527, "auxiliary_loss_mlp": 0.00269567, "balance_loss_clip": 1.15774989, "balance_loss_mlp": 0.24411595, "epoch": 0.4170750037577033, "flos": 15159151699200.0, "grad_norm": 26.677717110261423, "language_loss": 0.85108984, "learning_rate": 2.6236613542787024e-06, "loss": 0.86783075, "num_input_tokens_seen": 148905235, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.25476074, "step": 6937, "time_per_iteration": 2.630932092666626 }, { "auxiliary_loss_clip": 0.01399354, "auxiliary_loss_mlp": 0.00258644, "balance_loss_clip": 1.15074348, "balance_loss_mlp": 0.23219176, "epoch": 0.41713512701037125, "flos": 28768289754240.0, "grad_norm": 3.027721137814871, "language_loss": 0.87493813, "learning_rate": 2.6232913002342518e-06, "loss": 0.89151818, "num_input_tokens_seen": 148928130, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.26452637, "step": 6938, "time_per_iteration": 2.8802576065063477 }, { "auxiliary_loss_clip": 0.01407287, "auxiliary_loss_mlp": 0.00295685, "balance_loss_clip": 1.15513361, "balance_loss_mlp": 0.26515502, "epoch": 0.4171952502630392, "flos": 28256993608320.0, "grad_norm": 156.82569621425893, "language_loss": 0.82252491, "learning_rate": 2.6229212225545334e-06, "loss": 0.83955461, "num_input_tokens_seen": 148948790, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.30505371, "step": 6939, "time_per_iteration": 3.0412657260894775 }, { "auxiliary_loss_clip": 0.01410029, "auxiliary_loss_mlp": 0.002755, "balance_loss_clip": 1.1600529, "balance_loss_mlp": 0.24864177, "epoch": 0.4172553735157072, "flos": 24571697091840.0, "grad_norm": 3.6744773569727722, "language_loss": 0.8215735, "learning_rate": 2.622551121253579e-06, "loss": 0.83842885, "num_input_tokens_seen": 148967690, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.26843262, "step": 6940, "time_per_iteration": 4.510461091995239 }, { "auxiliary_loss_clip": 0.01401623, "auxiliary_loss_mlp": 0.00289598, "balance_loss_clip": 1.15080333, "balance_loss_mlp": 0.26022524, "epoch": 0.41731549676837515, "flos": 27045797978880.0, "grad_norm": 47.41873454611108, "language_loss": 0.77581072, "learning_rate": 2.622180996345424e-06, "loss": 0.79272294, "num_input_tokens_seen": 148987150, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.29382324, "step": 6941, "time_per_iteration": 2.9065158367156982 }, { "auxiliary_loss_clip": 0.01415969, "auxiliary_loss_mlp": 0.00293216, "balance_loss_clip": 1.165254, "balance_loss_mlp": 0.26517817, "epoch": 0.4173756200210431, "flos": 28394063907840.0, "grad_norm": 4.477494542680823, "language_loss": 0.81601989, "learning_rate": 2.621810847844104e-06, "loss": 0.83311176, "num_input_tokens_seen": 149004895, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.28076172, "step": 6942, "time_per_iteration": 4.153656244277954 }, { "auxiliary_loss_clip": 0.01417453, "auxiliary_loss_mlp": 0.00296634, "balance_loss_clip": 1.16063595, "balance_loss_mlp": 0.26606908, "epoch": 0.41743574327371114, "flos": 22521556431360.0, "grad_norm": 1374.6016460061212, "language_loss": 0.82325387, "learning_rate": 2.6214406757636534e-06, "loss": 0.84039474, "num_input_tokens_seen": 149020970, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.30554199, "step": 6943, "time_per_iteration": 2.6974587440490723 }, { "auxiliary_loss_clip": 0.01396745, "auxiliary_loss_mlp": 0.00299755, "balance_loss_clip": 1.15162075, "balance_loss_mlp": 0.27106109, "epoch": 0.4174958665263791, "flos": 30113431200000.0, "grad_norm": 38.536577322544105, "language_loss": 0.69601953, "learning_rate": 2.621070480118111e-06, "loss": 0.71298456, "num_input_tokens_seen": 149041795, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.28723145, "step": 6944, "time_per_iteration": 2.706611394882202 }, { "auxiliary_loss_clip": 0.01408176, "auxiliary_loss_mlp": 0.00297886, "balance_loss_clip": 1.15834737, "balance_loss_mlp": 0.26814297, "epoch": 0.41755598977904707, "flos": 25263444188160.0, "grad_norm": 1.947677275148861, "language_loss": 0.76900673, "learning_rate": 2.620700260921513e-06, "loss": 0.78606737, "num_input_tokens_seen": 149063700, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.29736328, "step": 6945, "time_per_iteration": 4.092973709106445 }, { "auxiliary_loss_clip": 0.01404733, "auxiliary_loss_mlp": 0.00278167, "balance_loss_clip": 1.15957081, "balance_loss_mlp": 0.25138062, "epoch": 0.41761611303171503, "flos": 19828580019840.0, "grad_norm": 5.2044171368214265, "language_loss": 0.87406111, "learning_rate": 2.620330018187899e-06, "loss": 0.89089012, "num_input_tokens_seen": 149082410, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.2677002, "step": 6946, "time_per_iteration": 2.6575326919555664 }, { "auxiliary_loss_clip": 0.01391528, "auxiliary_loss_mlp": 0.00268514, "balance_loss_clip": 1.14465284, "balance_loss_mlp": 0.24293125, "epoch": 0.417676236284383, "flos": 15523249910400.0, "grad_norm": 15.780180007200576, "language_loss": 0.86150169, "learning_rate": 2.6199597519313086e-06, "loss": 0.87810212, "num_input_tokens_seen": 149098745, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.25610352, "step": 6947, "time_per_iteration": 2.6387410163879395 }, { "auxiliary_loss_clip": 0.01408894, "auxiliary_loss_mlp": 0.00274426, "balance_loss_clip": 1.16335344, "balance_loss_mlp": 0.24836637, "epoch": 0.41773635953705096, "flos": 32524473761280.0, "grad_norm": 6.621913335776044, "language_loss": 0.77546638, "learning_rate": 2.6195894621657825e-06, "loss": 0.79229957, "num_input_tokens_seen": 149122255, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.26037598, "step": 6948, "time_per_iteration": 2.8217642307281494 }, { "auxiliary_loss_clip": 0.01397553, "auxiliary_loss_mlp": 0.00308357, "balance_loss_clip": 1.15095651, "balance_loss_mlp": 0.28028369, "epoch": 0.4177964827897189, "flos": 23440941970560.0, "grad_norm": 120.6682190865351, "language_loss": 0.82701254, "learning_rate": 2.619219148905362e-06, "loss": 0.84407163, "num_input_tokens_seen": 149142845, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.28088379, "step": 6949, "time_per_iteration": 2.6534557342529297 }, { "auxiliary_loss_clip": 0.01411456, "auxiliary_loss_mlp": 0.00278491, "balance_loss_clip": 1.15941644, "balance_loss_mlp": 0.25054818, "epoch": 0.4178566060423869, "flos": 22748907565440.0, "grad_norm": 62.212861935808725, "language_loss": 0.89109039, "learning_rate": 2.6188488121640888e-06, "loss": 0.90798986, "num_input_tokens_seen": 149163375, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.27905273, "step": 6950, "time_per_iteration": 2.679577350616455 }, { "auxiliary_loss_clip": 0.01402087, "auxiliary_loss_mlp": 0.00294109, "balance_loss_clip": 1.15576982, "balance_loss_mlp": 0.26601154, "epoch": 0.41791672929505486, "flos": 26032794618240.0, "grad_norm": 378.8632240265386, "language_loss": 0.8092593, "learning_rate": 2.618478451956007e-06, "loss": 0.82622123, "num_input_tokens_seen": 149185610, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.28125, "step": 6951, "time_per_iteration": 4.145330429077148 }, { "auxiliary_loss_clip": 0.01406921, "auxiliary_loss_mlp": 0.00281624, "balance_loss_clip": 1.15475965, "balance_loss_mlp": 0.25225046, "epoch": 0.4179768525477228, "flos": 19568694142080.0, "grad_norm": 3.640682426703686, "language_loss": 0.81593341, "learning_rate": 2.61810806829516e-06, "loss": 0.83281887, "num_input_tokens_seen": 149203990, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.2935791, "step": 6952, "time_per_iteration": 2.6274261474609375 }, { "auxiliary_loss_clip": 0.01397021, "auxiliary_loss_mlp": 0.00284076, "balance_loss_clip": 1.14987445, "balance_loss_mlp": 0.25458384, "epoch": 0.4180369758003908, "flos": 17783826399360.0, "grad_norm": 54.84474017678979, "language_loss": 0.81936049, "learning_rate": 2.617737661195593e-06, "loss": 0.83617145, "num_input_tokens_seen": 149221385, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.29492188, "step": 6953, "time_per_iteration": 2.6211225986480713 }, { "auxiliary_loss_clip": 0.01396797, "auxiliary_loss_mlp": 0.00255549, "balance_loss_clip": 1.15254974, "balance_loss_mlp": 0.2293587, "epoch": 0.41809709905305875, "flos": 20960663944320.0, "grad_norm": 14.649428435264896, "language_loss": 0.83543956, "learning_rate": 2.617367230671353e-06, "loss": 0.85196298, "num_input_tokens_seen": 149241175, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.26184082, "step": 6954, "time_per_iteration": 2.644941806793213 }, { "auxiliary_loss_clip": 0.01394663, "auxiliary_loss_mlp": 0.00297997, "balance_loss_clip": 1.14635873, "balance_loss_mlp": 0.27012634, "epoch": 0.4181572223057267, "flos": 22017622573440.0, "grad_norm": 8.495035976497672, "language_loss": 0.92649448, "learning_rate": 2.616996776736485e-06, "loss": 0.94342113, "num_input_tokens_seen": 149259115, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.27868652, "step": 6955, "time_per_iteration": 2.6728146076202393 }, { "auxiliary_loss_clip": 0.01387492, "auxiliary_loss_mlp": 0.00266201, "balance_loss_clip": 1.14456415, "balance_loss_mlp": 0.23911692, "epoch": 0.4182173455583947, "flos": 26245528917120.0, "grad_norm": 86.17167004225261, "language_loss": 0.88313472, "learning_rate": 2.616626299405037e-06, "loss": 0.89967167, "num_input_tokens_seen": 149278705, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.27087402, "step": 6956, "time_per_iteration": 2.7048206329345703 }, { "auxiliary_loss_clip": 0.01393895, "auxiliary_loss_mlp": 0.00302301, "balance_loss_clip": 1.14287901, "balance_loss_mlp": 0.27035287, "epoch": 0.4182774688110627, "flos": 14791605782400.0, "grad_norm": 8.820644894132002, "language_loss": 0.77697027, "learning_rate": 2.616255798691059e-06, "loss": 0.7939322, "num_input_tokens_seen": 149294040, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.3190918, "step": 6957, "time_per_iteration": 2.607893943786621 }, { "auxiliary_loss_clip": 0.01387133, "auxiliary_loss_mlp": 0.00292062, "balance_loss_clip": 1.139781, "balance_loss_mlp": 0.26513273, "epoch": 0.41833759206373067, "flos": 20412020632320.0, "grad_norm": 1.9968317935379543, "language_loss": 0.83461332, "learning_rate": 2.6158852746085982e-06, "loss": 0.85140532, "num_input_tokens_seen": 149310385, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.26928711, "step": 6958, "time_per_iteration": 2.600341320037842 }, { "auxiliary_loss_clip": 0.01395535, "auxiliary_loss_mlp": 0.00302396, "balance_loss_clip": 1.14875436, "balance_loss_mlp": 0.27363086, "epoch": 0.41839771531639863, "flos": 23656333875840.0, "grad_norm": 54.98658370676171, "language_loss": 0.84276313, "learning_rate": 2.6155147271717066e-06, "loss": 0.85974252, "num_input_tokens_seen": 149328235, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.28735352, "step": 6959, "time_per_iteration": 2.634221076965332 }, { "auxiliary_loss_clip": 0.01388075, "auxiliary_loss_mlp": 0.00274319, "balance_loss_clip": 1.14374375, "balance_loss_mlp": 0.24899891, "epoch": 0.4184578385690666, "flos": 19754137082880.0, "grad_norm": 155.7319269232263, "language_loss": 0.84119254, "learning_rate": 2.6151441563944347e-06, "loss": 0.85781652, "num_input_tokens_seen": 149347465, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.25305176, "step": 6960, "time_per_iteration": 2.6327178478240967 }, { "auxiliary_loss_clip": 0.01378348, "auxiliary_loss_mlp": 0.00280514, "balance_loss_clip": 1.13627434, "balance_loss_mlp": 0.25366804, "epoch": 0.41851796182173456, "flos": 20193396503040.0, "grad_norm": 14.777030057016153, "language_loss": 0.83524811, "learning_rate": 2.614773562290835e-06, "loss": 0.85183668, "num_input_tokens_seen": 149366685, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.26867676, "step": 6961, "time_per_iteration": 2.637571334838867 }, { "auxiliary_loss_clip": 0.01428276, "auxiliary_loss_mlp": 0.00064516, "balance_loss_clip": 1.21457481, "balance_loss_mlp": 0.05922337, "epoch": 0.41857808507440253, "flos": 59018794231680.0, "grad_norm": 0.819582694514353, "language_loss": 0.54965931, "learning_rate": 2.61440294487496e-06, "loss": 0.56458724, "num_input_tokens_seen": 149422925, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.05297852, "step": 6962, "time_per_iteration": 3.038231611251831 }, { "auxiliary_loss_clip": 0.01399998, "auxiliary_loss_mlp": 0.00285574, "balance_loss_clip": 1.1537807, "balance_loss_mlp": 0.25623703, "epoch": 0.4186382083270705, "flos": 18478805719680.0, "grad_norm": 14.313941057372123, "language_loss": 0.92687929, "learning_rate": 2.614032304160864e-06, "loss": 0.943735, "num_input_tokens_seen": 149440820, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.2935791, "step": 6963, "time_per_iteration": 2.63814640045166 }, { "auxiliary_loss_clip": 0.01379497, "auxiliary_loss_mlp": 0.00303447, "balance_loss_clip": 1.13993716, "balance_loss_mlp": 0.27571923, "epoch": 0.41869833157973846, "flos": 21578758202880.0, "grad_norm": 61.728169343546185, "language_loss": 0.75134158, "learning_rate": 2.6136616401626014e-06, "loss": 0.76817101, "num_input_tokens_seen": 149461060, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.27722168, "step": 6964, "time_per_iteration": 2.6312990188598633 }, { "auxiliary_loss_clip": 0.01379373, "auxiliary_loss_mlp": 0.0027356, "balance_loss_clip": 1.13597286, "balance_loss_mlp": 0.24526006, "epoch": 0.4187584548324064, "flos": 35517412650240.0, "grad_norm": 55.56342786075924, "language_loss": 0.76988226, "learning_rate": 2.6132909528942273e-06, "loss": 0.78641158, "num_input_tokens_seen": 149483115, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.28308105, "step": 6965, "time_per_iteration": 2.7698938846588135 }, { "auxiliary_loss_clip": 0.01407066, "auxiliary_loss_mlp": 0.00253873, "balance_loss_clip": 1.16369641, "balance_loss_mlp": 0.2287195, "epoch": 0.4188185780850744, "flos": 18655880791680.0, "grad_norm": 4.741886584453144, "language_loss": 0.77632904, "learning_rate": 2.6129202423697997e-06, "loss": 0.79293847, "num_input_tokens_seen": 149501495, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.25158691, "step": 6966, "time_per_iteration": 2.6276042461395264 }, { "auxiliary_loss_clip": 0.01411362, "auxiliary_loss_mlp": 0.00321692, "balance_loss_clip": 1.16454315, "balance_loss_mlp": 0.29272392, "epoch": 0.41887870133774235, "flos": 40333428374400.0, "grad_norm": 98.81761987729494, "language_loss": 0.79783463, "learning_rate": 2.612549508603375e-06, "loss": 0.81516516, "num_input_tokens_seen": 149523170, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.28967285, "step": 6967, "time_per_iteration": 2.786262035369873 }, { "auxiliary_loss_clip": 0.01495916, "auxiliary_loss_mlp": 0.00116291, "balance_loss_clip": 1.24758351, "balance_loss_mlp": 0.10904338, "epoch": 0.4189388245904103, "flos": 61371336516480.0, "grad_norm": 0.6731396751671859, "language_loss": 0.45763588, "learning_rate": 2.612178751609011e-06, "loss": 0.47375798, "num_input_tokens_seen": 149583955, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.07226562, "step": 6968, "time_per_iteration": 3.1143362522125244 }, { "auxiliary_loss_clip": 0.01421123, "auxiliary_loss_mlp": 0.00331717, "balance_loss_clip": 1.17354941, "balance_loss_mlp": 0.30098501, "epoch": 0.4189989478430783, "flos": 28215624119040.0, "grad_norm": 141.38760378086602, "language_loss": 0.81777465, "learning_rate": 2.6118079714007685e-06, "loss": 0.83530307, "num_input_tokens_seen": 149604440, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.30712891, "step": 6969, "time_per_iteration": 2.7423510551452637 }, { "auxiliary_loss_clip": 0.01399777, "auxiliary_loss_mlp": 0.00294558, "balance_loss_clip": 1.15526903, "balance_loss_mlp": 0.26814103, "epoch": 0.4190590710957463, "flos": 24565879088640.0, "grad_norm": 3.7279118361254073, "language_loss": 0.87247849, "learning_rate": 2.611437167992705e-06, "loss": 0.88942182, "num_input_tokens_seen": 149623745, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.26416016, "step": 6970, "time_per_iteration": 2.751967668533325 }, { "auxiliary_loss_clip": 0.01417904, "auxiliary_loss_mlp": 0.00279834, "balance_loss_clip": 1.17062306, "balance_loss_mlp": 0.25263017, "epoch": 0.41911919434841427, "flos": 21726027964800.0, "grad_norm": 18.003447499216506, "language_loss": 0.90226209, "learning_rate": 2.6110663413988835e-06, "loss": 0.91923946, "num_input_tokens_seen": 149643025, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.27233887, "step": 6971, "time_per_iteration": 2.645559072494507 }, { "auxiliary_loss_clip": 0.01412203, "auxiliary_loss_mlp": 0.00286609, "balance_loss_clip": 1.16924381, "balance_loss_mlp": 0.25954878, "epoch": 0.41917931760108224, "flos": 17601543855360.0, "grad_norm": 15.091608473164822, "language_loss": 0.82293242, "learning_rate": 2.6106954916333648e-06, "loss": 0.83992052, "num_input_tokens_seen": 149660695, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.27026367, "step": 6972, "time_per_iteration": 2.6032323837280273 }, { "auxiliary_loss_clip": 0.01397484, "auxiliary_loss_mlp": 0.00263684, "balance_loss_clip": 1.15323758, "balance_loss_mlp": 0.23769622, "epoch": 0.4192394408537502, "flos": 37816701022080.0, "grad_norm": 12.299131419066777, "language_loss": 0.79472899, "learning_rate": 2.610324618710212e-06, "loss": 0.81134063, "num_input_tokens_seen": 149682040, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.26000977, "step": 6973, "time_per_iteration": 2.793625593185425 }, { "auxiliary_loss_clip": 0.01418839, "auxiliary_loss_mlp": 0.00315711, "balance_loss_clip": 1.16711807, "balance_loss_mlp": 0.28776792, "epoch": 0.41929956410641817, "flos": 23107726477440.0, "grad_norm": 9.838583653093744, "language_loss": 0.82372606, "learning_rate": 2.609953722643489e-06, "loss": 0.84107161, "num_input_tokens_seen": 149700855, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.27966309, "step": 6974, "time_per_iteration": 2.6352128982543945 }, { "auxiliary_loss_clip": 0.01397893, "auxiliary_loss_mlp": 0.00282221, "balance_loss_clip": 1.15298915, "balance_loss_mlp": 0.25337261, "epoch": 0.41935968735908613, "flos": 22524537260160.0, "grad_norm": 11.84107087449541, "language_loss": 0.79654515, "learning_rate": 2.609582803447259e-06, "loss": 0.81334633, "num_input_tokens_seen": 149717360, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.28894043, "step": 6975, "time_per_iteration": 2.649394989013672 }, { "auxiliary_loss_clip": 0.01418982, "auxiliary_loss_mlp": 0.00294534, "balance_loss_clip": 1.1723423, "balance_loss_mlp": 0.26406416, "epoch": 0.4194198106117541, "flos": 26870446759680.0, "grad_norm": 3.216604026392508, "language_loss": 0.87081254, "learning_rate": 2.6092118611355885e-06, "loss": 0.88794774, "num_input_tokens_seen": 149738975, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.30480957, "step": 6976, "time_per_iteration": 2.6880359649658203 }, { "auxiliary_loss_clip": 0.01403301, "auxiliary_loss_mlp": 0.00301561, "balance_loss_clip": 1.15480399, "balance_loss_mlp": 0.27148438, "epoch": 0.41947993386442206, "flos": 19902412425600.0, "grad_norm": 12.560392564886028, "language_loss": 0.76659954, "learning_rate": 2.6088408957225425e-06, "loss": 0.78364813, "num_input_tokens_seen": 149757055, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.30090332, "step": 6977, "time_per_iteration": 2.625437021255493 }, { "auxiliary_loss_clip": 0.01397847, "auxiliary_loss_mlp": 0.00294266, "balance_loss_clip": 1.15225506, "balance_loss_mlp": 0.26648986, "epoch": 0.41954005711709, "flos": 17383889393280.0, "grad_norm": 66.8612716523086, "language_loss": 0.89866853, "learning_rate": 2.6084699072221898e-06, "loss": 0.91558969, "num_input_tokens_seen": 149772885, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.27807617, "step": 6978, "time_per_iteration": 2.601032018661499 }, { "auxiliary_loss_clip": 0.01393963, "auxiliary_loss_mlp": 0.00307281, "balance_loss_clip": 1.14541471, "balance_loss_mlp": 0.27685863, "epoch": 0.419600180369758, "flos": 25003306915200.0, "grad_norm": 29.997030808879643, "language_loss": 0.89235413, "learning_rate": 2.6080988956485964e-06, "loss": 0.90936661, "num_input_tokens_seen": 149791515, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.30444336, "step": 6979, "time_per_iteration": 2.763108491897583 }, { "auxiliary_loss_clip": 0.01393358, "auxiliary_loss_mlp": 0.00300284, "balance_loss_clip": 1.14529562, "balance_loss_mlp": 0.27193597, "epoch": 0.41966030362242596, "flos": 17383781652480.0, "grad_norm": 95.501535833024, "language_loss": 0.89732766, "learning_rate": 2.6077278610158325e-06, "loss": 0.91426408, "num_input_tokens_seen": 149807250, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.28344727, "step": 6980, "time_per_iteration": 2.582070827484131 }, { "auxiliary_loss_clip": 0.01396271, "auxiliary_loss_mlp": 0.00294612, "balance_loss_clip": 1.14698827, "balance_loss_mlp": 0.26740807, "epoch": 0.4197204268750939, "flos": 22156165330560.0, "grad_norm": 15.274055354555063, "language_loss": 0.87431908, "learning_rate": 2.6073568033379665e-06, "loss": 0.8912279, "num_input_tokens_seen": 149821640, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.2722168, "step": 6981, "time_per_iteration": 2.674773693084717 }, { "auxiliary_loss_clip": 0.0138888, "auxiliary_loss_mlp": 0.00304004, "balance_loss_clip": 1.14364552, "balance_loss_mlp": 0.27336678, "epoch": 0.4197805501277619, "flos": 22084128604800.0, "grad_norm": 2.9678213696537776, "language_loss": 0.88784075, "learning_rate": 2.6069857226290696e-06, "loss": 0.90476954, "num_input_tokens_seen": 149840545, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.30664062, "step": 6982, "time_per_iteration": 4.1124348640441895 }, { "auxiliary_loss_clip": 0.01413061, "auxiliary_loss_mlp": 0.00293805, "balance_loss_clip": 1.16005158, "balance_loss_mlp": 0.263955, "epoch": 0.4198406733804299, "flos": 26432192920320.0, "grad_norm": 55.68742550031249, "language_loss": 0.63410515, "learning_rate": 2.606614618903214e-06, "loss": 0.65117377, "num_input_tokens_seen": 149860375, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.29858398, "step": 6983, "time_per_iteration": 2.742246627807617 }, { "auxiliary_loss_clip": 0.01398007, "auxiliary_loss_mlp": 0.00288559, "balance_loss_clip": 1.15369773, "balance_loss_mlp": 0.26086712, "epoch": 0.4199007966330979, "flos": 12531029293440.0, "grad_norm": 23.6938109511939, "language_loss": 0.91331506, "learning_rate": 2.606243492174471e-06, "loss": 0.93018073, "num_input_tokens_seen": 149877850, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.27697754, "step": 6984, "time_per_iteration": 4.2667236328125 }, { "auxiliary_loss_clip": 0.0139857, "auxiliary_loss_mlp": 0.00311179, "balance_loss_clip": 1.14990699, "balance_loss_mlp": 0.28131711, "epoch": 0.41996091988576584, "flos": 21762944167680.0, "grad_norm": 16.431129393106286, "language_loss": 0.85130715, "learning_rate": 2.605872342456914e-06, "loss": 0.86840463, "num_input_tokens_seen": 149896110, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.29858398, "step": 6985, "time_per_iteration": 2.735740900039673 }, { "auxiliary_loss_clip": 0.0140579, "auxiliary_loss_mlp": 0.00314293, "balance_loss_clip": 1.15562189, "balance_loss_mlp": 0.28351364, "epoch": 0.4200210431384338, "flos": 26541935948160.0, "grad_norm": 22.643717018381516, "language_loss": 0.86312437, "learning_rate": 2.6055011697646173e-06, "loss": 0.88032526, "num_input_tokens_seen": 149916495, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.30786133, "step": 6986, "time_per_iteration": 2.725522994995117 }, { "auxiliary_loss_clip": 0.01388664, "auxiliary_loss_mlp": 0.00279093, "balance_loss_clip": 1.14565897, "balance_loss_mlp": 0.252736, "epoch": 0.42008116639110177, "flos": 26795824254720.0, "grad_norm": 19.847595422236644, "language_loss": 0.77194262, "learning_rate": 2.605129974111655e-06, "loss": 0.78862023, "num_input_tokens_seen": 149936445, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.26342773, "step": 6987, "time_per_iteration": 4.0937418937683105 }, { "auxiliary_loss_clip": 0.01404148, "auxiliary_loss_mlp": 0.00320791, "balance_loss_clip": 1.15540552, "balance_loss_mlp": 0.29131073, "epoch": 0.42014128964376973, "flos": 32087333243520.0, "grad_norm": 120.11840018951423, "language_loss": 0.80181563, "learning_rate": 2.604758755512104e-06, "loss": 0.81906503, "num_input_tokens_seen": 149959430, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.29504395, "step": 6988, "time_per_iteration": 2.7679593563079834 }, { "auxiliary_loss_clip": 0.01413027, "auxiliary_loss_mlp": 0.00321972, "balance_loss_clip": 1.1592623, "balance_loss_mlp": 0.29159701, "epoch": 0.4202014128964377, "flos": 26467133875200.0, "grad_norm": 24.353150288886187, "language_loss": 0.79759848, "learning_rate": 2.60438751398004e-06, "loss": 0.81494844, "num_input_tokens_seen": 149980365, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.30395508, "step": 6989, "time_per_iteration": 2.6546518802642822 }, { "auxiliary_loss_clip": 0.01404608, "auxiliary_loss_mlp": 0.00322412, "balance_loss_clip": 1.15213799, "balance_loss_mlp": 0.29193032, "epoch": 0.42026153614910566, "flos": 13401216178560.0, "grad_norm": 11.939689604556325, "language_loss": 0.78708148, "learning_rate": 2.6040162495295404e-06, "loss": 0.80435169, "num_input_tokens_seen": 149997375, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.30505371, "step": 6990, "time_per_iteration": 2.623218297958374 }, { "auxiliary_loss_clip": 0.01409418, "auxiliary_loss_mlp": 0.00073407, "balance_loss_clip": 1.18596649, "balance_loss_mlp": 0.06799515, "epoch": 0.42032165940177363, "flos": 60250457635200.0, "grad_norm": 0.7989934828384163, "language_loss": 0.60169262, "learning_rate": 2.603644962174685e-06, "loss": 0.61652088, "num_input_tokens_seen": 150051230, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.05419922, "step": 6991, "time_per_iteration": 3.0360424518585205 }, { "auxiliary_loss_clip": 0.0140426, "auxiliary_loss_mlp": 0.00330981, "balance_loss_clip": 1.15332222, "balance_loss_mlp": 0.30110675, "epoch": 0.4203817826544416, "flos": 24535211852160.0, "grad_norm": 1.9771824882115414, "language_loss": 0.88525808, "learning_rate": 2.6032736519295517e-06, "loss": 0.90261054, "num_input_tokens_seen": 150071135, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.29858398, "step": 6992, "time_per_iteration": 2.716872215270996 }, { "auxiliary_loss_clip": 0.01433838, "auxiliary_loss_mlp": 0.0016312, "balance_loss_clip": 1.20808411, "balance_loss_mlp": 0.15467969, "epoch": 0.42044190590710956, "flos": 58820781530880.0, "grad_norm": 0.7825280547208252, "language_loss": 0.64943033, "learning_rate": 2.6029023188082217e-06, "loss": 0.66539991, "num_input_tokens_seen": 150125220, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.08447266, "step": 6993, "time_per_iteration": 4.616387844085693 }, { "auxiliary_loss_clip": 0.01401909, "auxiliary_loss_mlp": 0.00372182, "balance_loss_clip": 1.14893675, "balance_loss_mlp": 0.33703953, "epoch": 0.4205020291597775, "flos": 16436063260800.0, "grad_norm": 44.846025833106076, "language_loss": 0.92247772, "learning_rate": 2.6025309628247746e-06, "loss": 0.94021857, "num_input_tokens_seen": 150142300, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.3515625, "step": 6994, "time_per_iteration": 2.6520767211914062 }, { "auxiliary_loss_clip": 0.01395343, "auxiliary_loss_mlp": 0.00335965, "balance_loss_clip": 1.1454711, "balance_loss_mlp": 0.30513734, "epoch": 0.4205621524124455, "flos": 18405655672320.0, "grad_norm": 1246.7931771403446, "language_loss": 0.85594851, "learning_rate": 2.6021595839932934e-06, "loss": 0.87326157, "num_input_tokens_seen": 150161345, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.30834961, "step": 6995, "time_per_iteration": 2.627835512161255 }, { "auxiliary_loss_clip": 0.01406332, "auxiliary_loss_mlp": 0.00314731, "balance_loss_clip": 1.1608423, "balance_loss_mlp": 0.28610915, "epoch": 0.4206222756651135, "flos": 25520097841920.0, "grad_norm": 96.85502334264243, "language_loss": 0.8543756, "learning_rate": 2.60178818232786e-06, "loss": 0.87158626, "num_input_tokens_seen": 150182420, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.28601074, "step": 6996, "time_per_iteration": 2.675568103790283 }, { "auxiliary_loss_clip": 0.01401937, "auxiliary_loss_mlp": 0.00323409, "balance_loss_clip": 1.15435767, "balance_loss_mlp": 0.29441774, "epoch": 0.4206823989177815, "flos": 15304338472320.0, "grad_norm": 79.78981743468857, "language_loss": 0.84555036, "learning_rate": 2.601416757842559e-06, "loss": 0.86280382, "num_input_tokens_seen": 150200175, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.29003906, "step": 6997, "time_per_iteration": 2.6223039627075195 }, { "auxiliary_loss_clip": 0.0140097, "auxiliary_loss_mlp": 0.00332794, "balance_loss_clip": 1.15358591, "balance_loss_mlp": 0.30010653, "epoch": 0.42074252217044944, "flos": 15554096714880.0, "grad_norm": 27.056681687905463, "language_loss": 0.81924701, "learning_rate": 2.6010453105514743e-06, "loss": 0.83658469, "num_input_tokens_seen": 150217100, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.32666016, "step": 6998, "time_per_iteration": 2.6041550636291504 }, { "auxiliary_loss_clip": 0.01406523, "auxiliary_loss_mlp": 0.00360963, "balance_loss_clip": 1.15590358, "balance_loss_mlp": 0.32952759, "epoch": 0.4208026454231174, "flos": 26145877610880.0, "grad_norm": 37.84941859269665, "language_loss": 0.81414574, "learning_rate": 2.60067384046869e-06, "loss": 0.83182061, "num_input_tokens_seen": 150239830, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.3145752, "step": 6999, "time_per_iteration": 2.741089344024658 }, { "auxiliary_loss_clip": 0.01407861, "auxiliary_loss_mlp": 0.00325112, "balance_loss_clip": 1.15836167, "balance_loss_mlp": 0.29736009, "epoch": 0.42086276867578537, "flos": 23550110380800.0, "grad_norm": 3.944722112470355, "language_loss": 0.70837021, "learning_rate": 2.600302347608295e-06, "loss": 0.72569996, "num_input_tokens_seen": 150260690, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.27734375, "step": 7000, "time_per_iteration": 2.65169095993042 }, { "auxiliary_loss_clip": 0.01413955, "auxiliary_loss_mlp": 0.00340853, "balance_loss_clip": 1.16400337, "balance_loss_mlp": 0.30852318, "epoch": 0.42092289192845334, "flos": 18113414618880.0, "grad_norm": 53.470024713024245, "language_loss": 0.81361246, "learning_rate": 2.5999308319843743e-06, "loss": 0.83116055, "num_input_tokens_seen": 150279885, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.32324219, "step": 7001, "time_per_iteration": 2.6489367485046387 }, { "auxiliary_loss_clip": 0.01400481, "auxiliary_loss_mlp": 0.00335471, "balance_loss_clip": 1.15604448, "balance_loss_mlp": 0.30644402, "epoch": 0.4209830151811213, "flos": 20006588845440.0, "grad_norm": 8.853984429643754, "language_loss": 0.91605806, "learning_rate": 2.5995592936110154e-06, "loss": 0.93341762, "num_input_tokens_seen": 150297390, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.29003906, "step": 7002, "time_per_iteration": 2.6673898696899414 }, { "auxiliary_loss_clip": 0.01407873, "auxiliary_loss_mlp": 0.00331568, "balance_loss_clip": 1.16457713, "balance_loss_mlp": 0.30358934, "epoch": 0.42104313843378927, "flos": 21978946604160.0, "grad_norm": 10.596916093339283, "language_loss": 0.76364112, "learning_rate": 2.5991877325023096e-06, "loss": 0.78103554, "num_input_tokens_seen": 150317390, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.27990723, "step": 7003, "time_per_iteration": 2.639894962310791 }, { "auxiliary_loss_clip": 0.01419205, "auxiliary_loss_mlp": 0.00378027, "balance_loss_clip": 1.16633105, "balance_loss_mlp": 0.34402823, "epoch": 0.42110326168645723, "flos": 25443966965760.0, "grad_norm": 3279.401439038277, "language_loss": 0.84465528, "learning_rate": 2.598816148672344e-06, "loss": 0.86262763, "num_input_tokens_seen": 150337455, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.33984375, "step": 7004, "time_per_iteration": 2.6789510250091553 }, { "auxiliary_loss_clip": 0.01407496, "auxiliary_loss_mlp": 0.00309773, "balance_loss_clip": 1.16792572, "balance_loss_mlp": 0.28161579, "epoch": 0.4211633849391252, "flos": 17822574195840.0, "grad_norm": 13.109695933785492, "language_loss": 0.76336384, "learning_rate": 2.59844454213521e-06, "loss": 0.78053653, "num_input_tokens_seen": 150355385, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.28137207, "step": 7005, "time_per_iteration": 2.6840157508850098 }, { "auxiliary_loss_clip": 0.01418304, "auxiliary_loss_mlp": 0.00340822, "balance_loss_clip": 1.17183852, "balance_loss_mlp": 0.31128147, "epoch": 0.42122350819179316, "flos": 16282436791680.0, "grad_norm": 12.38088264340069, "language_loss": 0.80179429, "learning_rate": 2.5980729129049994e-06, "loss": 0.81938553, "num_input_tokens_seen": 150371750, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.2956543, "step": 7006, "time_per_iteration": 2.6740567684173584 }, { "auxiliary_loss_clip": 0.0140488, "auxiliary_loss_mlp": 0.00324401, "balance_loss_clip": 1.15990591, "balance_loss_mlp": 0.29651845, "epoch": 0.4212836314444611, "flos": 19645866512640.0, "grad_norm": 119.55455430097157, "language_loss": 0.77977371, "learning_rate": 2.5977012609958033e-06, "loss": 0.79706657, "num_input_tokens_seen": 150389955, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.27893066, "step": 7007, "time_per_iteration": 2.7012012004852295 }, { "auxiliary_loss_clip": 0.014071, "auxiliary_loss_mlp": 0.00379316, "balance_loss_clip": 1.15825057, "balance_loss_mlp": 0.34662923, "epoch": 0.4213437546971291, "flos": 18369026778240.0, "grad_norm": 3.5442971695055427, "language_loss": 0.88490415, "learning_rate": 2.5973295864217166e-06, "loss": 0.90276831, "num_input_tokens_seen": 150405780, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.32714844, "step": 7008, "time_per_iteration": 2.6798882484436035 }, { "auxiliary_loss_clip": 0.01412628, "auxiliary_loss_mlp": 0.00333099, "balance_loss_clip": 1.16426778, "balance_loss_mlp": 0.30369031, "epoch": 0.42140387794979706, "flos": 27704507541120.0, "grad_norm": 16.49818934891253, "language_loss": 0.78136957, "learning_rate": 2.596957889196831e-06, "loss": 0.79882687, "num_input_tokens_seen": 150425615, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.29382324, "step": 7009, "time_per_iteration": 2.697206497192383 }, { "auxiliary_loss_clip": 0.01412818, "auxiliary_loss_mlp": 0.00330822, "balance_loss_clip": 1.16879487, "balance_loss_mlp": 0.3012816, "epoch": 0.4214640012024651, "flos": 28147071012480.0, "grad_norm": 66.52399836981196, "language_loss": 0.7296893, "learning_rate": 2.596586169335243e-06, "loss": 0.74712574, "num_input_tokens_seen": 150445765, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.29553223, "step": 7010, "time_per_iteration": 2.7349796295166016 }, { "auxiliary_loss_clip": 0.01395101, "auxiliary_loss_mlp": 0.0033696, "balance_loss_clip": 1.14777899, "balance_loss_mlp": 0.30637109, "epoch": 0.42152412445513304, "flos": 22997265177600.0, "grad_norm": 6.092665050321106, "language_loss": 0.77918839, "learning_rate": 2.5962144268510477e-06, "loss": 0.79650903, "num_input_tokens_seen": 150464405, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.3059082, "step": 7011, "time_per_iteration": 2.6376020908355713 }, { "auxiliary_loss_clip": 0.01486388, "auxiliary_loss_mlp": 0.00100417, "balance_loss_clip": 1.2777915, "balance_loss_mlp": 0.0944089, "epoch": 0.421584247707801, "flos": 63749592938880.0, "grad_norm": 0.8154257226532128, "language_loss": 0.54356313, "learning_rate": 2.5958426617583417e-06, "loss": 0.5594312, "num_input_tokens_seen": 150520430, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.06005859, "step": 7012, "time_per_iteration": 3.028329372406006 }, { "auxiliary_loss_clip": 0.0141893, "auxiliary_loss_mlp": 0.00350458, "balance_loss_clip": 1.17246556, "balance_loss_mlp": 0.31960636, "epoch": 0.421644370960469, "flos": 24314612474880.0, "grad_norm": 2.8926606846689067, "language_loss": 0.83471751, "learning_rate": 2.5954708740712215e-06, "loss": 0.85241139, "num_input_tokens_seen": 150542610, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.30822754, "step": 7013, "time_per_iteration": 2.750278949737549 }, { "auxiliary_loss_clip": 0.01407704, "auxiliary_loss_mlp": 0.00342244, "balance_loss_clip": 1.16163409, "balance_loss_mlp": 0.31141639, "epoch": 0.42170449421313694, "flos": 23440690575360.0, "grad_norm": 181.06737857378798, "language_loss": 0.86528707, "learning_rate": 2.595099063803787e-06, "loss": 0.88278657, "num_input_tokens_seen": 150560970, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.30810547, "step": 7014, "time_per_iteration": 2.6806061267852783 }, { "auxiliary_loss_clip": 0.01410196, "auxiliary_loss_mlp": 0.00327491, "balance_loss_clip": 1.16590893, "balance_loss_mlp": 0.29702136, "epoch": 0.4217646174658049, "flos": 23695476721920.0, "grad_norm": 17.44443335873867, "language_loss": 0.83317375, "learning_rate": 2.5947272309701354e-06, "loss": 0.85055059, "num_input_tokens_seen": 150582615, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.30444336, "step": 7015, "time_per_iteration": 2.71368408203125 }, { "auxiliary_loss_clip": 0.01418428, "auxiliary_loss_mlp": 0.00342149, "balance_loss_clip": 1.17098033, "balance_loss_mlp": 0.31070173, "epoch": 0.42182474071847287, "flos": 24971562270720.0, "grad_norm": 2.5126521159015387, "language_loss": 0.86660939, "learning_rate": 2.594355375584368e-06, "loss": 0.88421518, "num_input_tokens_seen": 150603640, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.31469727, "step": 7016, "time_per_iteration": 2.7161731719970703 }, { "auxiliary_loss_clip": 0.01410581, "auxiliary_loss_mlp": 0.00344016, "balance_loss_clip": 1.1653856, "balance_loss_mlp": 0.31455976, "epoch": 0.42188486397114083, "flos": 22856639431680.0, "grad_norm": 2.4228969618505847, "language_loss": 0.7519784, "learning_rate": 2.593983497660586e-06, "loss": 0.76952434, "num_input_tokens_seen": 150622490, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.29431152, "step": 7017, "time_per_iteration": 2.615173101425171 }, { "auxiliary_loss_clip": 0.01504532, "auxiliary_loss_mlp": 0.00110709, "balance_loss_clip": 1.30962205, "balance_loss_mlp": 0.1042244, "epoch": 0.4219449872238088, "flos": 66975700965120.0, "grad_norm": 0.687034993337925, "language_loss": 0.5888046, "learning_rate": 2.5936115972128895e-06, "loss": 0.60495698, "num_input_tokens_seen": 150689545, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.06494141, "step": 7018, "time_per_iteration": 3.2052643299102783 }, { "auxiliary_loss_clip": 0.01408601, "auxiliary_loss_mlp": 0.00368319, "balance_loss_clip": 1.16158724, "balance_loss_mlp": 0.33547676, "epoch": 0.42200511047647676, "flos": 13115367745920.0, "grad_norm": 12.338018356073793, "language_loss": 0.8332637, "learning_rate": 2.593239674255382e-06, "loss": 0.85103285, "num_input_tokens_seen": 150707610, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.32861328, "step": 7019, "time_per_iteration": 2.666208505630493 }, { "auxiliary_loss_clip": 0.01418058, "auxiliary_loss_mlp": 0.00348452, "balance_loss_clip": 1.17117691, "balance_loss_mlp": 0.31626546, "epoch": 0.42206523372914473, "flos": 13991193066240.0, "grad_norm": 9.7976148644771, "language_loss": 0.75132555, "learning_rate": 2.592867728802166e-06, "loss": 0.76899064, "num_input_tokens_seen": 150724530, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.32177734, "step": 7020, "time_per_iteration": 2.66951847076416 }, { "auxiliary_loss_clip": 0.01400855, "auxiliary_loss_mlp": 0.00325253, "balance_loss_clip": 1.16091657, "balance_loss_mlp": 0.29807335, "epoch": 0.4221253569818127, "flos": 21942317710080.0, "grad_norm": 11.68084707779413, "language_loss": 0.86072445, "learning_rate": 2.592495760867347e-06, "loss": 0.87798554, "num_input_tokens_seen": 150742870, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.27197266, "step": 7021, "time_per_iteration": 2.6172702312469482 }, { "auxiliary_loss_clip": 0.01413094, "auxiliary_loss_mlp": 0.00347244, "balance_loss_clip": 1.16788566, "balance_loss_mlp": 0.31729853, "epoch": 0.42218548023448066, "flos": 32192587071360.0, "grad_norm": 56.97159980192243, "language_loss": 0.7547183, "learning_rate": 2.5921237704650293e-06, "loss": 0.77232164, "num_input_tokens_seen": 150765500, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.29956055, "step": 7022, "time_per_iteration": 2.711921215057373 }, { "auxiliary_loss_clip": 0.01388677, "auxiliary_loss_mlp": 0.00324446, "balance_loss_clip": 1.15381241, "balance_loss_mlp": 0.29662222, "epoch": 0.4222456034871487, "flos": 30118961894400.0, "grad_norm": 9.030402505194479, "language_loss": 0.72143304, "learning_rate": 2.5917517576093188e-06, "loss": 0.73856431, "num_input_tokens_seen": 150784945, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.27807617, "step": 7023, "time_per_iteration": 2.711137294769287 }, { "auxiliary_loss_clip": 0.01392477, "auxiliary_loss_mlp": 0.00333597, "balance_loss_clip": 1.15494895, "balance_loss_mlp": 0.30534407, "epoch": 0.42230572673981664, "flos": 22127904305280.0, "grad_norm": 13.741545198409115, "language_loss": 0.75034094, "learning_rate": 2.591379722314322e-06, "loss": 0.76760161, "num_input_tokens_seen": 150803120, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.28271484, "step": 7024, "time_per_iteration": 4.072700500488281 }, { "auxiliary_loss_clip": 0.01414983, "auxiliary_loss_mlp": 0.00362583, "balance_loss_clip": 1.17116153, "balance_loss_mlp": 0.33139741, "epoch": 0.4223658499924846, "flos": 22055077480320.0, "grad_norm": 16.47478471187675, "language_loss": 0.82766598, "learning_rate": 2.591007664594147e-06, "loss": 0.84544158, "num_input_tokens_seen": 150823135, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.31201172, "step": 7025, "time_per_iteration": 2.6456146240234375 }, { "auxiliary_loss_clip": 0.01400754, "auxiliary_loss_mlp": 0.00345455, "balance_loss_clip": 1.1571908, "balance_loss_mlp": 0.31543773, "epoch": 0.4224259732451526, "flos": 20410727742720.0, "grad_norm": 3.9350329020392825, "language_loss": 0.84068704, "learning_rate": 2.5906355844629024e-06, "loss": 0.85814917, "num_input_tokens_seen": 150842070, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.29992676, "step": 7026, "time_per_iteration": 4.188737154006958 }, { "auxiliary_loss_clip": 0.01453036, "auxiliary_loss_mlp": 0.00142252, "balance_loss_clip": 1.2599715, "balance_loss_mlp": 0.13452683, "epoch": 0.42248609649782054, "flos": 62846655828480.0, "grad_norm": 0.738475364992954, "language_loss": 0.61370313, "learning_rate": 2.5902634819346966e-06, "loss": 0.62965608, "num_input_tokens_seen": 150907450, "router_z_loss_clip": 1.9296875, "router_z_loss_mlp": 0.07714844, "step": 7027, "time_per_iteration": 3.226504325866699 }, { "auxiliary_loss_clip": 0.01398887, "auxiliary_loss_mlp": 0.00341603, "balance_loss_clip": 1.15707183, "balance_loss_mlp": 0.31163374, "epoch": 0.4225462197504885, "flos": 26249946289920.0, "grad_norm": 8.598887719107832, "language_loss": 0.8029151, "learning_rate": 2.5898913570236414e-06, "loss": 0.82032001, "num_input_tokens_seen": 150928040, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.29943848, "step": 7028, "time_per_iteration": 2.6800248622894287 }, { "auxiliary_loss_clip": 0.01407471, "auxiliary_loss_mlp": 0.00366171, "balance_loss_clip": 1.16347444, "balance_loss_mlp": 0.33524835, "epoch": 0.42260634300315647, "flos": 20521943228160.0, "grad_norm": 4.243404318521858, "language_loss": 0.87732816, "learning_rate": 2.589519209743846e-06, "loss": 0.89506459, "num_input_tokens_seen": 150945760, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.30932617, "step": 7029, "time_per_iteration": 4.263251066207886 }, { "auxiliary_loss_clip": 0.01404407, "auxiliary_loss_mlp": 0.00387176, "balance_loss_clip": 1.15627789, "balance_loss_mlp": 0.35498905, "epoch": 0.42266646625582444, "flos": 24316731377280.0, "grad_norm": 92.06952006916518, "language_loss": 0.8384577, "learning_rate": 2.589147040109424e-06, "loss": 0.85637355, "num_input_tokens_seen": 150965665, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.32177734, "step": 7030, "time_per_iteration": 2.7506067752838135 }, { "auxiliary_loss_clip": 0.01393657, "auxiliary_loss_mlp": 0.00346533, "balance_loss_clip": 1.14805627, "balance_loss_mlp": 0.31620657, "epoch": 0.4227265895084924, "flos": 24204151175040.0, "grad_norm": 26.728414379514415, "language_loss": 0.92521214, "learning_rate": 2.588774848134486e-06, "loss": 0.94261408, "num_input_tokens_seen": 150982260, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.3034668, "step": 7031, "time_per_iteration": 2.7920291423797607 }, { "auxiliary_loss_clip": 0.01393853, "auxiliary_loss_mlp": 0.00355551, "balance_loss_clip": 1.14904141, "balance_loss_mlp": 0.32508063, "epoch": 0.42278671276116037, "flos": 16909760845440.0, "grad_norm": 81.79040567171914, "language_loss": 0.8240158, "learning_rate": 2.5884026338331473e-06, "loss": 0.84150982, "num_input_tokens_seen": 150999990, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.3046875, "step": 7032, "time_per_iteration": 2.6725149154663086 }, { "auxiliary_loss_clip": 0.0140085, "auxiliary_loss_mlp": 0.0036816, "balance_loss_clip": 1.15164971, "balance_loss_mlp": 0.33735663, "epoch": 0.42284683601382833, "flos": 25411073086080.0, "grad_norm": 16.288765770409206, "language_loss": 0.76311171, "learning_rate": 2.5880303972195222e-06, "loss": 0.78080183, "num_input_tokens_seen": 151021105, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.30810547, "step": 7033, "time_per_iteration": 2.7467665672302246 }, { "auxiliary_loss_clip": 0.01394904, "auxiliary_loss_mlp": 0.00338284, "balance_loss_clip": 1.14784598, "balance_loss_mlp": 0.30845815, "epoch": 0.4229069592664963, "flos": 23040322606080.0, "grad_norm": 43.39029057791129, "language_loss": 0.96225417, "learning_rate": 2.5876581383077256e-06, "loss": 0.97958606, "num_input_tokens_seen": 151040665, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.29797363, "step": 7034, "time_per_iteration": 2.7108068466186523 }, { "auxiliary_loss_clip": 0.01391188, "auxiliary_loss_mlp": 0.00349771, "balance_loss_clip": 1.14847279, "balance_loss_mlp": 0.32126793, "epoch": 0.42296708251916426, "flos": 26067448264320.0, "grad_norm": 64.07688715213025, "language_loss": 0.82515216, "learning_rate": 2.5872858571118723e-06, "loss": 0.84256178, "num_input_tokens_seen": 151061240, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.28503418, "step": 7035, "time_per_iteration": 4.138630390167236 }, { "auxiliary_loss_clip": 0.01395273, "auxiliary_loss_mlp": 0.00345899, "balance_loss_clip": 1.14803219, "balance_loss_mlp": 0.31635937, "epoch": 0.4230272057718323, "flos": 19458376496640.0, "grad_norm": 3.025555389520627, "language_loss": 0.88232583, "learning_rate": 2.5869135536460817e-06, "loss": 0.89973754, "num_input_tokens_seen": 151076870, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.29541016, "step": 7036, "time_per_iteration": 2.5825741291046143 }, { "auxiliary_loss_clip": 0.01403289, "auxiliary_loss_mlp": 0.00333412, "balance_loss_clip": 1.15601242, "balance_loss_mlp": 0.30417013, "epoch": 0.42308732902450025, "flos": 22383300983040.0, "grad_norm": 10.90265579531404, "language_loss": 0.76484537, "learning_rate": 2.58654122792447e-06, "loss": 0.78221238, "num_input_tokens_seen": 151095110, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.29211426, "step": 7037, "time_per_iteration": 2.6833889484405518 }, { "auxiliary_loss_clip": 0.01391184, "auxiliary_loss_mlp": 0.00347056, "balance_loss_clip": 1.14338875, "balance_loss_mlp": 0.31726563, "epoch": 0.4231474522771682, "flos": 20995425331200.0, "grad_norm": 3.9712554378754983, "language_loss": 0.82589096, "learning_rate": 2.586168879961155e-06, "loss": 0.84327328, "num_input_tokens_seen": 151114355, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.29797363, "step": 7038, "time_per_iteration": 2.628981351852417 }, { "auxiliary_loss_clip": 0.01407536, "auxiliary_loss_mlp": 0.00369684, "balance_loss_clip": 1.15485716, "balance_loss_mlp": 0.33771199, "epoch": 0.4232075755298362, "flos": 14975863574400.0, "grad_norm": 16.43576097153784, "language_loss": 0.78408831, "learning_rate": 2.585796509770259e-06, "loss": 0.80186045, "num_input_tokens_seen": 151131505, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.31958008, "step": 7039, "time_per_iteration": 2.6214776039123535 }, { "auxiliary_loss_clip": 0.01407483, "auxiliary_loss_mlp": 0.00367706, "balance_loss_clip": 1.15396738, "balance_loss_mlp": 0.33556765, "epoch": 0.42326769878250414, "flos": 24532661986560.0, "grad_norm": 6.000809359018324, "language_loss": 0.81865311, "learning_rate": 2.5854241173658996e-06, "loss": 0.83640504, "num_input_tokens_seen": 151151555, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.3215332, "step": 7040, "time_per_iteration": 2.679781913757324 }, { "auxiliary_loss_clip": 0.01406899, "auxiliary_loss_mlp": 0.00343879, "balance_loss_clip": 1.15571618, "balance_loss_mlp": 0.31312326, "epoch": 0.4233278220351721, "flos": 26870303105280.0, "grad_norm": 2.7112307930834123, "language_loss": 0.73493063, "learning_rate": 2.5850517027621996e-06, "loss": 0.75243843, "num_input_tokens_seen": 151172385, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.30761719, "step": 7041, "time_per_iteration": 2.739536762237549 }, { "auxiliary_loss_clip": 0.01403426, "auxiliary_loss_mlp": 0.00369218, "balance_loss_clip": 1.15200031, "balance_loss_mlp": 0.33731806, "epoch": 0.4233879452878401, "flos": 42814927463040.0, "grad_norm": 109.37951003377444, "language_loss": 0.81714797, "learning_rate": 2.5846792659732803e-06, "loss": 0.83487439, "num_input_tokens_seen": 151194930, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.3190918, "step": 7042, "time_per_iteration": 2.854063034057617 }, { "auxiliary_loss_clip": 0.01397742, "auxiliary_loss_mlp": 0.00357605, "balance_loss_clip": 1.15091598, "balance_loss_mlp": 0.32634804, "epoch": 0.42344806854050804, "flos": 25229006023680.0, "grad_norm": 6287.29371473536, "language_loss": 0.86194026, "learning_rate": 2.5843068070132643e-06, "loss": 0.87949371, "num_input_tokens_seen": 151217905, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.31274414, "step": 7043, "time_per_iteration": 2.839526414871216 }, { "auxiliary_loss_clip": 0.01414349, "auxiliary_loss_mlp": 0.00335228, "balance_loss_clip": 1.16031849, "balance_loss_mlp": 0.30629581, "epoch": 0.423508191793176, "flos": 22778820616320.0, "grad_norm": 115.21024899474041, "language_loss": 0.72332454, "learning_rate": 2.5839343258962763e-06, "loss": 0.74082029, "num_input_tokens_seen": 151234580, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.28930664, "step": 7044, "time_per_iteration": 2.7001419067382812 }, { "auxiliary_loss_clip": 0.01424623, "auxiliary_loss_mlp": 0.00353406, "balance_loss_clip": 1.16677213, "balance_loss_mlp": 0.32245949, "epoch": 0.42356831504584397, "flos": 34637493179520.0, "grad_norm": 2.251152208885742, "language_loss": 0.81987572, "learning_rate": 2.5835618226364393e-06, "loss": 0.83765596, "num_input_tokens_seen": 151254765, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.30957031, "step": 7045, "time_per_iteration": 2.7705023288726807 }, { "auxiliary_loss_clip": 0.01404598, "auxiliary_loss_mlp": 0.00361308, "balance_loss_clip": 1.15517735, "balance_loss_mlp": 0.33056363, "epoch": 0.42362843829851193, "flos": 17596767346560.0, "grad_norm": 5.048372195889732, "language_loss": 0.8750062, "learning_rate": 2.5831892972478797e-06, "loss": 0.89266527, "num_input_tokens_seen": 151269045, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.30725098, "step": 7046, "time_per_iteration": 2.5764427185058594 }, { "auxiliary_loss_clip": 0.01414133, "auxiliary_loss_mlp": 0.00367441, "balance_loss_clip": 1.16200256, "balance_loss_mlp": 0.33654237, "epoch": 0.4236885615511799, "flos": 22565691267840.0, "grad_norm": 9.329389801788729, "language_loss": 0.84010392, "learning_rate": 2.5828167497447242e-06, "loss": 0.85791969, "num_input_tokens_seen": 151287530, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.30908203, "step": 7047, "time_per_iteration": 2.726040840148926 }, { "auxiliary_loss_clip": 0.01409731, "auxiliary_loss_mlp": 0.0033811, "balance_loss_clip": 1.15710926, "balance_loss_mlp": 0.31069148, "epoch": 0.42374868480384786, "flos": 26469216864000.0, "grad_norm": 6.322836622323462, "language_loss": 0.73876965, "learning_rate": 2.582444180141098e-06, "loss": 0.75624806, "num_input_tokens_seen": 151308905, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.27404785, "step": 7048, "time_per_iteration": 2.702104091644287 }, { "auxiliary_loss_clip": 0.01405348, "auxiliary_loss_mlp": 0.00364712, "balance_loss_clip": 1.15336978, "balance_loss_mlp": 0.33476698, "epoch": 0.4238088080565159, "flos": 20370220179840.0, "grad_norm": 88.16953413774394, "language_loss": 0.84270364, "learning_rate": 2.5820715884511307e-06, "loss": 0.86040425, "num_input_tokens_seen": 151326525, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.29980469, "step": 7049, "time_per_iteration": 2.661215305328369 }, { "auxiliary_loss_clip": 0.01414399, "auxiliary_loss_mlp": 0.00374935, "balance_loss_clip": 1.16111994, "balance_loss_mlp": 0.34334409, "epoch": 0.42386893130918385, "flos": 21172105353600.0, "grad_norm": 59.239729154205186, "language_loss": 0.89955914, "learning_rate": 2.5816989746889504e-06, "loss": 0.91745245, "num_input_tokens_seen": 151344675, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.31591797, "step": 7050, "time_per_iteration": 2.640963077545166 }, { "auxiliary_loss_clip": 0.01419401, "auxiliary_loss_mlp": 0.00349305, "balance_loss_clip": 1.16496599, "balance_loss_mlp": 0.32069468, "epoch": 0.4239290545618518, "flos": 17675627656320.0, "grad_norm": 8.175843985748632, "language_loss": 0.79979885, "learning_rate": 2.581326338868687e-06, "loss": 0.81748593, "num_input_tokens_seen": 151360730, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.28637695, "step": 7051, "time_per_iteration": 2.657200336456299 }, { "auxiliary_loss_clip": 0.01421312, "auxiliary_loss_mlp": 0.00373423, "balance_loss_clip": 1.16572762, "balance_loss_mlp": 0.34195143, "epoch": 0.4239891778145198, "flos": 24314504734080.0, "grad_norm": 1.969717553487127, "language_loss": 0.89712262, "learning_rate": 2.5809536810044706e-06, "loss": 0.91507006, "num_input_tokens_seen": 151380445, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.31469727, "step": 7052, "time_per_iteration": 2.684546709060669 }, { "auxiliary_loss_clip": 0.01420311, "auxiliary_loss_mlp": 0.00362365, "balance_loss_clip": 1.16628337, "balance_loss_mlp": 0.33240771, "epoch": 0.42404930106718774, "flos": 20558428467840.0, "grad_norm": 24.418148188926832, "language_loss": 0.79200667, "learning_rate": 2.5805810011104323e-06, "loss": 0.80983341, "num_input_tokens_seen": 151399325, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.29968262, "step": 7053, "time_per_iteration": 2.6592884063720703 }, { "auxiliary_loss_clip": 0.0141822, "auxiliary_loss_mlp": 0.00408179, "balance_loss_clip": 1.16363239, "balance_loss_mlp": 0.37518165, "epoch": 0.4241094243198557, "flos": 22308067946880.0, "grad_norm": 11.179326950373449, "language_loss": 0.88196695, "learning_rate": 2.580208299200704e-06, "loss": 0.90023094, "num_input_tokens_seen": 151417240, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.32983398, "step": 7054, "time_per_iteration": 2.6578078269958496 }, { "auxiliary_loss_clip": 0.01439298, "auxiliary_loss_mlp": 0.00103853, "balance_loss_clip": 1.21063828, "balance_loss_mlp": 0.09631884, "epoch": 0.4241695475725237, "flos": 70612445272320.0, "grad_norm": 0.783878877975846, "language_loss": 0.59930241, "learning_rate": 2.5798355752894183e-06, "loss": 0.61473393, "num_input_tokens_seen": 151476015, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.07519531, "step": 7055, "time_per_iteration": 3.1099839210510254 }, { "auxiliary_loss_clip": 0.01420617, "auxiliary_loss_mlp": 0.00439572, "balance_loss_clip": 1.16465616, "balance_loss_mlp": 0.40333229, "epoch": 0.42422967082519164, "flos": 14027462824320.0, "grad_norm": 40.77952822752872, "language_loss": 0.83783579, "learning_rate": 2.5794628293907107e-06, "loss": 0.85643774, "num_input_tokens_seen": 151492035, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.36230469, "step": 7056, "time_per_iteration": 2.8975815773010254 }, { "auxiliary_loss_clip": 0.01414406, "auxiliary_loss_mlp": 0.00368358, "balance_loss_clip": 1.16002798, "balance_loss_mlp": 0.33776915, "epoch": 0.4242897940778596, "flos": 22345522853760.0, "grad_norm": 20.5604942440064, "language_loss": 0.91936177, "learning_rate": 2.579090061518714e-06, "loss": 0.93718946, "num_input_tokens_seen": 151508970, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.30615234, "step": 7057, "time_per_iteration": 2.7375526428222656 }, { "auxiliary_loss_clip": 0.01431775, "auxiliary_loss_mlp": 0.00401485, "balance_loss_clip": 1.17230201, "balance_loss_mlp": 0.36789209, "epoch": 0.42434991733052757, "flos": 22595855713920.0, "grad_norm": 430.7323277254274, "language_loss": 0.90532506, "learning_rate": 2.5787172716875642e-06, "loss": 0.9236576, "num_input_tokens_seen": 151525295, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.33618164, "step": 7058, "time_per_iteration": 2.657898426055908 }, { "auxiliary_loss_clip": 0.01409677, "auxiliary_loss_mlp": 0.00403368, "balance_loss_clip": 1.15714848, "balance_loss_mlp": 0.3718487, "epoch": 0.42441004058319554, "flos": 20011437181440.0, "grad_norm": 4.5054161234514645, "language_loss": 0.86342907, "learning_rate": 2.5783444599113973e-06, "loss": 0.88155949, "num_input_tokens_seen": 151544435, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.31530762, "step": 7059, "time_per_iteration": 2.6707375049591064 }, { "auxiliary_loss_clip": 0.01425318, "auxiliary_loss_mlp": 0.00408675, "balance_loss_clip": 1.16495407, "balance_loss_mlp": 0.37539208, "epoch": 0.4244701638358635, "flos": 11144985235200.0, "grad_norm": 67.7120471329731, "language_loss": 0.77166039, "learning_rate": 2.57797162620435e-06, "loss": 0.79000032, "num_input_tokens_seen": 151559520, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.33300781, "step": 7060, "time_per_iteration": 2.5967650413513184 }, { "auxiliary_loss_clip": 0.01421616, "auxiliary_loss_mlp": 0.00411327, "balance_loss_clip": 1.16482353, "balance_loss_mlp": 0.3781155, "epoch": 0.42453028708853147, "flos": 23987753688960.0, "grad_norm": 1.9557006456251689, "language_loss": 0.80884159, "learning_rate": 2.577598770580562e-06, "loss": 0.82717097, "num_input_tokens_seen": 151579790, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.33251953, "step": 7061, "time_per_iteration": 2.660928249359131 }, { "auxiliary_loss_clip": 0.01416548, "auxiliary_loss_mlp": 0.00409846, "balance_loss_clip": 1.16170168, "balance_loss_mlp": 0.37677786, "epoch": 0.42459041034119943, "flos": 18406338030720.0, "grad_norm": 5.4638284568682405, "language_loss": 0.79978579, "learning_rate": 2.5772258930541693e-06, "loss": 0.81804973, "num_input_tokens_seen": 151598285, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.33056641, "step": 7062, "time_per_iteration": 2.6273653507232666 }, { "auxiliary_loss_clip": 0.01411537, "auxiliary_loss_mlp": 0.00432956, "balance_loss_clip": 1.15521276, "balance_loss_mlp": 0.39943418, "epoch": 0.42465053359386745, "flos": 20958006337920.0, "grad_norm": 18.690729353610113, "language_loss": 0.71740055, "learning_rate": 2.5768529936393137e-06, "loss": 0.73584545, "num_input_tokens_seen": 151615430, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.33520508, "step": 7063, "time_per_iteration": 2.6274514198303223 }, { "auxiliary_loss_clip": 0.01406699, "auxiliary_loss_mlp": 0.00439205, "balance_loss_clip": 1.15617597, "balance_loss_mlp": 0.40532535, "epoch": 0.4247106568465354, "flos": 33106190520960.0, "grad_norm": 33.674217186002245, "language_loss": 0.83170831, "learning_rate": 2.5764800723501354e-06, "loss": 0.85016739, "num_input_tokens_seen": 151637030, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.33837891, "step": 7064, "time_per_iteration": 2.722407817840576 }, { "auxiliary_loss_clip": 0.0143168, "auxiliary_loss_mlp": 0.00422377, "balance_loss_clip": 1.17526412, "balance_loss_mlp": 0.38840234, "epoch": 0.4247707800992034, "flos": 20046916840320.0, "grad_norm": 6.299896229815424, "language_loss": 0.8216387, "learning_rate": 2.5761071292007736e-06, "loss": 0.84017932, "num_input_tokens_seen": 151655745, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.33959961, "step": 7065, "time_per_iteration": 2.6464180946350098 }, { "auxiliary_loss_clip": 0.01409129, "auxiliary_loss_mlp": 0.00381299, "balance_loss_clip": 1.155936, "balance_loss_mlp": 0.34920788, "epoch": 0.42483090335187135, "flos": 22385132576640.0, "grad_norm": 5.770808670881057, "language_loss": 0.77688229, "learning_rate": 2.5757341642053725e-06, "loss": 0.79478657, "num_input_tokens_seen": 151678040, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.32104492, "step": 7066, "time_per_iteration": 2.7271697521209717 }, { "auxiliary_loss_clip": 0.01421567, "auxiliary_loss_mlp": 0.00391722, "balance_loss_clip": 1.16055167, "balance_loss_mlp": 0.35727021, "epoch": 0.4248910266045393, "flos": 21356830022400.0, "grad_norm": 4.524254433099811, "language_loss": 0.84674609, "learning_rate": 2.5753611773780745e-06, "loss": 0.86487901, "num_input_tokens_seen": 151696410, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.34472656, "step": 7067, "time_per_iteration": 4.044067859649658 }, { "auxiliary_loss_clip": 0.01434102, "auxiliary_loss_mlp": 0.00187049, "balance_loss_clip": 1.21414089, "balance_loss_mlp": 0.17636745, "epoch": 0.4249511498572073, "flos": 64008114099840.0, "grad_norm": 0.9228856361388713, "language_loss": 0.63214529, "learning_rate": 2.574988168733022e-06, "loss": 0.6483568, "num_input_tokens_seen": 151756365, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.10693359, "step": 7068, "time_per_iteration": 3.062466859817505 }, { "auxiliary_loss_clip": 0.01417567, "auxiliary_loss_mlp": 0.00420271, "balance_loss_clip": 1.15930939, "balance_loss_mlp": 0.38517612, "epoch": 0.42501127310987524, "flos": 19607046888960.0, "grad_norm": 4.04570142203525, "language_loss": 0.79802954, "learning_rate": 2.574615138284361e-06, "loss": 0.81640798, "num_input_tokens_seen": 151775165, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.35083008, "step": 7069, "time_per_iteration": 4.198902606964111 }, { "auxiliary_loss_clip": 0.01420369, "auxiliary_loss_mlp": 0.00402971, "balance_loss_clip": 1.16115117, "balance_loss_mlp": 0.36964005, "epoch": 0.4250713963625432, "flos": 19462326992640.0, "grad_norm": 3.3956677893796128, "language_loss": 0.87870383, "learning_rate": 2.5742420860462364e-06, "loss": 0.89693725, "num_input_tokens_seen": 151792620, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.33325195, "step": 7070, "time_per_iteration": 2.714399576187134 }, { "auxiliary_loss_clip": 0.01413121, "auxiliary_loss_mlp": 0.00402039, "balance_loss_clip": 1.15589249, "balance_loss_mlp": 0.36880374, "epoch": 0.4251315196152112, "flos": 25337707557120.0, "grad_norm": 7.083134151918011, "language_loss": 0.78046149, "learning_rate": 2.573869012032795e-06, "loss": 0.79861307, "num_input_tokens_seen": 151812850, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.33227539, "step": 7071, "time_per_iteration": 4.239865779876709 }, { "auxiliary_loss_clip": 0.0140975, "auxiliary_loss_mlp": 0.0040947, "balance_loss_clip": 1.15378642, "balance_loss_mlp": 0.37625861, "epoch": 0.42519164286787914, "flos": 26359186527360.0, "grad_norm": 18.165206309767985, "language_loss": 0.78889763, "learning_rate": 2.5734959162581824e-06, "loss": 0.80708981, "num_input_tokens_seen": 151831785, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.33203125, "step": 7072, "time_per_iteration": 2.7525768280029297 }, { "auxiliary_loss_clip": 0.01428042, "auxiliary_loss_mlp": 0.00458569, "balance_loss_clip": 1.16729999, "balance_loss_mlp": 0.42526174, "epoch": 0.4252517661205471, "flos": 26031070765440.0, "grad_norm": 25.156022539665997, "language_loss": 0.87491769, "learning_rate": 2.5731227987365475e-06, "loss": 0.89378381, "num_input_tokens_seen": 151853885, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.33300781, "step": 7073, "time_per_iteration": 2.727867603302002 }, { "auxiliary_loss_clip": 0.01422898, "auxiliary_loss_mlp": 0.00395975, "balance_loss_clip": 1.165887, "balance_loss_mlp": 0.36297768, "epoch": 0.42531188937321507, "flos": 12713635059840.0, "grad_norm": 5.359643168855741, "language_loss": 0.97983694, "learning_rate": 2.5727496594820386e-06, "loss": 0.99802566, "num_input_tokens_seen": 151871780, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.33007812, "step": 7074, "time_per_iteration": 2.7776386737823486 }, { "auxiliary_loss_clip": 0.01423661, "auxiliary_loss_mlp": 0.00434757, "balance_loss_clip": 1.16456485, "balance_loss_mlp": 0.39754039, "epoch": 0.42537201262588303, "flos": 22091670460800.0, "grad_norm": 99.29199151043449, "language_loss": 0.71463478, "learning_rate": 2.572376498508805e-06, "loss": 0.73321903, "num_input_tokens_seen": 151891600, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.37207031, "step": 7075, "time_per_iteration": 2.820573568344116 }, { "auxiliary_loss_clip": 0.01411675, "auxiliary_loss_mlp": 0.0041971, "balance_loss_clip": 1.15840483, "balance_loss_mlp": 0.38361356, "epoch": 0.42543213587855105, "flos": 23003119094400.0, "grad_norm": 2.939130345623063, "language_loss": 0.80642271, "learning_rate": 2.5720033158309973e-06, "loss": 0.8247366, "num_input_tokens_seen": 151911330, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.36132812, "step": 7076, "time_per_iteration": 2.7364649772644043 }, { "auxiliary_loss_clip": 0.01416465, "auxiliary_loss_mlp": 0.00407401, "balance_loss_clip": 1.16061056, "balance_loss_mlp": 0.37330744, "epoch": 0.425492259131219, "flos": 25082454533760.0, "grad_norm": 9.4542672574176, "language_loss": 0.86138558, "learning_rate": 2.571630111462766e-06, "loss": 0.87962419, "num_input_tokens_seen": 151930355, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.34130859, "step": 7077, "time_per_iteration": 4.164709806442261 }, { "auxiliary_loss_clip": 0.01391275, "auxiliary_loss_mlp": 0.00387149, "balance_loss_clip": 1.14192951, "balance_loss_mlp": 0.35546267, "epoch": 0.425552382383887, "flos": 22816850140800.0, "grad_norm": 3.1004698977765646, "language_loss": 0.78188288, "learning_rate": 2.571256885418265e-06, "loss": 0.79966712, "num_input_tokens_seen": 151949695, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.31689453, "step": 7078, "time_per_iteration": 2.6744744777679443 }, { "auxiliary_loss_clip": 0.01394464, "auxiliary_loss_mlp": 0.00385122, "balance_loss_clip": 1.14503825, "balance_loss_mlp": 0.3553319, "epoch": 0.42561250563655495, "flos": 13553585671680.0, "grad_norm": 38.62576693636197, "language_loss": 0.88035893, "learning_rate": 2.5708836377116445e-06, "loss": 0.89815474, "num_input_tokens_seen": 151967640, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.29797363, "step": 7079, "time_per_iteration": 2.707059383392334 }, { "auxiliary_loss_clip": 0.01405688, "auxiliary_loss_mlp": 0.00395251, "balance_loss_clip": 1.15015721, "balance_loss_mlp": 0.36338606, "epoch": 0.4256726288892229, "flos": 46978303023360.0, "grad_norm": 20.26361127047626, "language_loss": 0.76820254, "learning_rate": 2.5705103683570592e-06, "loss": 0.78621185, "num_input_tokens_seen": 151994020, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.31848145, "step": 7080, "time_per_iteration": 2.9031472206115723 }, { "auxiliary_loss_clip": 0.0139931, "auxiliary_loss_mlp": 0.0036475, "balance_loss_clip": 1.14541459, "balance_loss_mlp": 0.33370799, "epoch": 0.4257327521418909, "flos": 23586451966080.0, "grad_norm": 209.502591449781, "language_loss": 0.86926007, "learning_rate": 2.5701370773686646e-06, "loss": 0.88690066, "num_input_tokens_seen": 152013415, "router_z_loss_clip": 2.5390625, "router_z_loss_mlp": 0.31030273, "step": 7081, "time_per_iteration": 2.6974878311157227 }, { "auxiliary_loss_clip": 0.01401786, "auxiliary_loss_mlp": 0.0035438, "balance_loss_clip": 1.14948106, "balance_loss_mlp": 0.32565013, "epoch": 0.42579287539455885, "flos": 18989994124800.0, "grad_norm": 2.1770485920419342, "language_loss": 0.86429495, "learning_rate": 2.5697637647606138e-06, "loss": 0.88185662, "num_input_tokens_seen": 152030860, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.28710938, "step": 7082, "time_per_iteration": 2.656346321105957 }, { "auxiliary_loss_clip": 0.01416107, "auxiliary_loss_mlp": 0.00395987, "balance_loss_clip": 1.15988708, "balance_loss_mlp": 0.36377689, "epoch": 0.4258529986472268, "flos": 25191910252800.0, "grad_norm": 4.887749288680173, "language_loss": 0.74958283, "learning_rate": 2.569390430547065e-06, "loss": 0.76770377, "num_input_tokens_seen": 152050395, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.32202148, "step": 7083, "time_per_iteration": 2.7276597023010254 }, { "auxiliary_loss_clip": 0.01497326, "auxiliary_loss_mlp": 0.00058948, "balance_loss_clip": 1.26920176, "balance_loss_mlp": 0.053441, "epoch": 0.4259131218998948, "flos": 69968280718080.0, "grad_norm": 0.8635307169596059, "language_loss": 0.66719294, "learning_rate": 2.569017074742173e-06, "loss": 0.68275571, "num_input_tokens_seen": 152113555, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.05517578, "step": 7084, "time_per_iteration": 3.19527006149292 }, { "auxiliary_loss_clip": 0.01400857, "auxiliary_loss_mlp": 0.00393158, "balance_loss_clip": 1.14882898, "balance_loss_mlp": 0.35970774, "epoch": 0.42597324515256274, "flos": 18004964480640.0, "grad_norm": 3.8833637590354577, "language_loss": 0.85745114, "learning_rate": 2.5686436973600964e-06, "loss": 0.8753913, "num_input_tokens_seen": 152131575, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.33447266, "step": 7085, "time_per_iteration": 2.7272191047668457 }, { "auxiliary_loss_clip": 0.01418357, "auxiliary_loss_mlp": 0.00421861, "balance_loss_clip": 1.1598227, "balance_loss_mlp": 0.38590765, "epoch": 0.4260333684052307, "flos": 15158792563200.0, "grad_norm": 30.19460991369448, "language_loss": 0.83874309, "learning_rate": 2.568270298414995e-06, "loss": 0.85714531, "num_input_tokens_seen": 152149435, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.35961914, "step": 7086, "time_per_iteration": 2.6548244953155518 }, { "auxiliary_loss_clip": 0.01410948, "auxiliary_loss_mlp": 0.00396453, "balance_loss_clip": 1.15664172, "balance_loss_mlp": 0.36240673, "epoch": 0.42609349165789867, "flos": 14939342421120.0, "grad_norm": 22.8630901443389, "language_loss": 0.86680496, "learning_rate": 2.5678968779210255e-06, "loss": 0.88487899, "num_input_tokens_seen": 152166860, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.34033203, "step": 7087, "time_per_iteration": 2.6134378910064697 }, { "auxiliary_loss_clip": 0.01398251, "auxiliary_loss_mlp": 0.00368361, "balance_loss_clip": 1.14412689, "balance_loss_mlp": 0.3354829, "epoch": 0.42615361491056664, "flos": 23731961961600.0, "grad_norm": 3.9391706068798484, "language_loss": 0.72434986, "learning_rate": 2.5675234358923505e-06, "loss": 0.74201596, "num_input_tokens_seen": 152187475, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.32910156, "step": 7088, "time_per_iteration": 2.7009191513061523 }, { "auxiliary_loss_clip": 0.01402199, "auxiliary_loss_mlp": 0.00364435, "balance_loss_clip": 1.14968824, "balance_loss_mlp": 0.33485952, "epoch": 0.42621373816323466, "flos": 24936441747840.0, "grad_norm": 40.466226989375436, "language_loss": 0.75387019, "learning_rate": 2.56714997234313e-06, "loss": 0.77153653, "num_input_tokens_seen": 152207235, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.29614258, "step": 7089, "time_per_iteration": 2.6758017539978027 }, { "auxiliary_loss_clip": 0.01412013, "auxiliary_loss_mlp": 0.003632, "balance_loss_clip": 1.15267622, "balance_loss_mlp": 0.33278951, "epoch": 0.4262738614159026, "flos": 13552975140480.0, "grad_norm": 8.244903373502643, "language_loss": 0.85426104, "learning_rate": 2.566776487287525e-06, "loss": 0.87201321, "num_input_tokens_seen": 152224240, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.30407715, "step": 7090, "time_per_iteration": 2.607390880584717 }, { "auxiliary_loss_clip": 0.01416142, "auxiliary_loss_mlp": 0.00386983, "balance_loss_clip": 1.1610049, "balance_loss_mlp": 0.35512996, "epoch": 0.4263339846685706, "flos": 29748794284800.0, "grad_norm": 13.5519638916595, "language_loss": 0.8239764, "learning_rate": 2.5664029807396994e-06, "loss": 0.84200764, "num_input_tokens_seen": 152242595, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.31835938, "step": 7091, "time_per_iteration": 2.6710665225982666 }, { "auxiliary_loss_clip": 0.01397077, "auxiliary_loss_mlp": 0.00326761, "balance_loss_clip": 1.14741564, "balance_loss_mlp": 0.3003318, "epoch": 0.42639410792123855, "flos": 16834204586880.0, "grad_norm": 2.878859623396974, "language_loss": 0.87425387, "learning_rate": 2.5660294527138156e-06, "loss": 0.89149219, "num_input_tokens_seen": 152260840, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.26452637, "step": 7092, "time_per_iteration": 2.6310858726501465 }, { "auxiliary_loss_clip": 0.01431116, "auxiliary_loss_mlp": 0.00368486, "balance_loss_clip": 1.17149508, "balance_loss_mlp": 0.33708656, "epoch": 0.4264542311739065, "flos": 28763118195840.0, "grad_norm": 6.19154391013008, "language_loss": 0.80158645, "learning_rate": 2.565655903224038e-06, "loss": 0.81958246, "num_input_tokens_seen": 152280580, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.31396484, "step": 7093, "time_per_iteration": 2.7532901763916016 }, { "auxiliary_loss_clip": 0.01416471, "auxiliary_loss_mlp": 0.00364359, "balance_loss_clip": 1.16148305, "balance_loss_mlp": 0.33360285, "epoch": 0.4265143544265745, "flos": 24713615727360.0, "grad_norm": 3.0292948186109414, "language_loss": 0.78922015, "learning_rate": 2.565282332284532e-06, "loss": 0.80702847, "num_input_tokens_seen": 152298455, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.30786133, "step": 7094, "time_per_iteration": 2.726991653442383 }, { "auxiliary_loss_clip": 0.01415365, "auxiliary_loss_mlp": 0.00375771, "balance_loss_clip": 1.16125464, "balance_loss_mlp": 0.34494352, "epoch": 0.42657447767924245, "flos": 21865971352320.0, "grad_norm": 3.1622595541733114, "language_loss": 0.87619758, "learning_rate": 2.564908739909464e-06, "loss": 0.89410895, "num_input_tokens_seen": 152316995, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.30822754, "step": 7095, "time_per_iteration": 2.6824605464935303 }, { "auxiliary_loss_clip": 0.01423262, "auxiliary_loss_mlp": 0.00379352, "balance_loss_clip": 1.16196132, "balance_loss_mlp": 0.34492433, "epoch": 0.4266346009319104, "flos": 21470236237440.0, "grad_norm": 265.81176341117725, "language_loss": 0.86560452, "learning_rate": 2.5645351261129996e-06, "loss": 0.88363063, "num_input_tokens_seen": 152334800, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.34448242, "step": 7096, "time_per_iteration": 2.614403486251831 }, { "auxiliary_loss_clip": 0.01423522, "auxiliary_loss_mlp": 0.00398116, "balance_loss_clip": 1.16249204, "balance_loss_mlp": 0.36371207, "epoch": 0.4266947241845784, "flos": 25519379569920.0, "grad_norm": 29.8312919627977, "language_loss": 0.72514117, "learning_rate": 2.5641614909093066e-06, "loss": 0.74335748, "num_input_tokens_seen": 152355175, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.34399414, "step": 7097, "time_per_iteration": 2.7040884494781494 }, { "auxiliary_loss_clip": 0.01419776, "auxiliary_loss_mlp": 0.00406808, "balance_loss_clip": 1.16535163, "balance_loss_mlp": 0.37114084, "epoch": 0.42675484743724634, "flos": 26541217676160.0, "grad_norm": 142.66206135988708, "language_loss": 0.77949226, "learning_rate": 2.5637878343125535e-06, "loss": 0.7977581, "num_input_tokens_seen": 152377245, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.35668945, "step": 7098, "time_per_iteration": 2.722876787185669 }, { "auxiliary_loss_clip": 0.01408194, "auxiliary_loss_mlp": 0.00329873, "balance_loss_clip": 1.15585971, "balance_loss_mlp": 0.30104855, "epoch": 0.4268149706899143, "flos": 23112718467840.0, "grad_norm": 9.794667177124698, "language_loss": 0.83039337, "learning_rate": 2.5634141563369086e-06, "loss": 0.84777409, "num_input_tokens_seen": 152396985, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.28808594, "step": 7099, "time_per_iteration": 2.786961555480957 }, { "auxiliary_loss_clip": 0.0143666, "auxiliary_loss_mlp": 0.00402118, "balance_loss_clip": 1.17615056, "balance_loss_mlp": 0.36752313, "epoch": 0.4268750939425823, "flos": 22706532495360.0, "grad_norm": 5.6723561035206895, "language_loss": 0.89025789, "learning_rate": 2.5630404569965432e-06, "loss": 0.90864563, "num_input_tokens_seen": 152415590, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.34570312, "step": 7100, "time_per_iteration": 2.6879327297210693 }, { "auxiliary_loss_clip": 0.01415028, "auxiliary_loss_mlp": 0.00373507, "balance_loss_clip": 1.15719438, "balance_loss_mlp": 0.34117746, "epoch": 0.42693521719525024, "flos": 25374875155200.0, "grad_norm": 155.51916507715336, "language_loss": 0.85453486, "learning_rate": 2.562666736305627e-06, "loss": 0.87242019, "num_input_tokens_seen": 152436735, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.32348633, "step": 7101, "time_per_iteration": 2.6918084621429443 }, { "auxiliary_loss_clip": 0.01428274, "auxiliary_loss_mlp": 0.00334089, "balance_loss_clip": 1.16772509, "balance_loss_mlp": 0.30273682, "epoch": 0.42699534044791826, "flos": 18150689957760.0, "grad_norm": 54.30121474816975, "language_loss": 0.817581, "learning_rate": 2.5622929942783314e-06, "loss": 0.8352046, "num_input_tokens_seen": 152455685, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.31347656, "step": 7102, "time_per_iteration": 2.6055197715759277 }, { "auxiliary_loss_clip": 0.01427344, "auxiliary_loss_mlp": 0.00370857, "balance_loss_clip": 1.16895199, "balance_loss_mlp": 0.3371926, "epoch": 0.4270554637005862, "flos": 13698413308800.0, "grad_norm": 2.923394909874939, "language_loss": 0.90458316, "learning_rate": 2.5619192309288297e-06, "loss": 0.92256516, "num_input_tokens_seen": 152473500, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.33691406, "step": 7103, "time_per_iteration": 2.630558490753174 }, { "auxiliary_loss_clip": 0.0144554, "auxiliary_loss_mlp": 0.00387063, "balance_loss_clip": 1.18290222, "balance_loss_mlp": 0.35494819, "epoch": 0.4271155869532542, "flos": 17493596507520.0, "grad_norm": 98.16483274470569, "language_loss": 0.81559449, "learning_rate": 2.561545446271294e-06, "loss": 0.8339206, "num_input_tokens_seen": 152491320, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.32128906, "step": 7104, "time_per_iteration": 2.612170457839966 }, { "auxiliary_loss_clip": 0.01416768, "auxiliary_loss_mlp": 0.00351537, "balance_loss_clip": 1.15799034, "balance_loss_mlp": 0.31977963, "epoch": 0.42717571020592215, "flos": 32452293381120.0, "grad_norm": 33.02472438677648, "language_loss": 0.81852025, "learning_rate": 2.5611716403198987e-06, "loss": 0.83620328, "num_input_tokens_seen": 152511970, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.31787109, "step": 7105, "time_per_iteration": 2.733327865600586 }, { "auxiliary_loss_clip": 0.01425931, "auxiliary_loss_mlp": 0.00394648, "balance_loss_clip": 1.16689456, "balance_loss_mlp": 0.36286643, "epoch": 0.4272358334585901, "flos": 16253062444800.0, "grad_norm": 3.08512592859961, "language_loss": 0.83767956, "learning_rate": 2.560797813088819e-06, "loss": 0.85588533, "num_input_tokens_seen": 152530515, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.31811523, "step": 7106, "time_per_iteration": 2.616262912750244 }, { "auxiliary_loss_clip": 0.01420513, "auxiliary_loss_mlp": 0.00378164, "balance_loss_clip": 1.16562033, "balance_loss_mlp": 0.34554875, "epoch": 0.4272959567112581, "flos": 24200092938240.0, "grad_norm": 5.685595190721689, "language_loss": 0.86794019, "learning_rate": 2.560423964592229e-06, "loss": 0.88592696, "num_input_tokens_seen": 152549295, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.32641602, "step": 7107, "time_per_iteration": 2.7270443439483643 }, { "auxiliary_loss_clip": 0.0141869, "auxiliary_loss_mlp": 0.00355921, "balance_loss_clip": 1.16153562, "balance_loss_mlp": 0.32332903, "epoch": 0.42735607996392605, "flos": 27963495578880.0, "grad_norm": 10.343565547137722, "language_loss": 0.73328978, "learning_rate": 2.5600500948443075e-06, "loss": 0.75103593, "num_input_tokens_seen": 152570725, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.32592773, "step": 7108, "time_per_iteration": 2.732841968536377 }, { "auxiliary_loss_clip": 0.01431841, "auxiliary_loss_mlp": 0.00344748, "balance_loss_clip": 1.16935658, "balance_loss_mlp": 0.31475472, "epoch": 0.427416203216594, "flos": 20295597674880.0, "grad_norm": 4.296132567966364, "language_loss": 0.77926135, "learning_rate": 2.5596762038592294e-06, "loss": 0.79702729, "num_input_tokens_seen": 152588950, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.30004883, "step": 7109, "time_per_iteration": 4.053720474243164 }, { "auxiliary_loss_clip": 0.01426937, "auxiliary_loss_mlp": 0.00364327, "balance_loss_clip": 1.16631806, "balance_loss_mlp": 0.33111569, "epoch": 0.427476326469262, "flos": 26943955943040.0, "grad_norm": 110.89258479463155, "language_loss": 0.72124416, "learning_rate": 2.559302291651174e-06, "loss": 0.73915684, "num_input_tokens_seen": 152608965, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.33203125, "step": 7110, "time_per_iteration": 2.7289819717407227 }, { "auxiliary_loss_clip": 0.01424068, "auxiliary_loss_mlp": 0.00384755, "balance_loss_clip": 1.16573644, "balance_loss_mlp": 0.35099465, "epoch": 0.42753644972192995, "flos": 25702847262720.0, "grad_norm": 2.5529291880468077, "language_loss": 0.80387592, "learning_rate": 2.5589283582343197e-06, "loss": 0.82196409, "num_input_tokens_seen": 152630220, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.33789062, "step": 7111, "time_per_iteration": 4.133579254150391 }, { "auxiliary_loss_clip": 0.01416812, "auxiliary_loss_mlp": 0.0036484, "balance_loss_clip": 1.15908217, "balance_loss_mlp": 0.33160397, "epoch": 0.4275965729745979, "flos": 18767419499520.0, "grad_norm": 17.287950289421392, "language_loss": 0.79847765, "learning_rate": 2.558554403622845e-06, "loss": 0.81629419, "num_input_tokens_seen": 152648835, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.33203125, "step": 7112, "time_per_iteration": 2.638350248336792 }, { "auxiliary_loss_clip": 0.01410891, "auxiliary_loss_mlp": 0.00349501, "balance_loss_clip": 1.15359735, "balance_loss_mlp": 0.32039055, "epoch": 0.4276566962272659, "flos": 23764424878080.0, "grad_norm": 6.050212207839433, "language_loss": 0.77391958, "learning_rate": 2.5581804278309323e-06, "loss": 0.79152346, "num_input_tokens_seen": 152668375, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.29089355, "step": 7113, "time_per_iteration": 4.084723234176636 }, { "auxiliary_loss_clip": 0.01426134, "auxiliary_loss_mlp": 0.00368544, "balance_loss_clip": 1.16219068, "balance_loss_mlp": 0.33224446, "epoch": 0.42771681947993384, "flos": 22492505306880.0, "grad_norm": 6.102963321493748, "language_loss": 0.67436475, "learning_rate": 2.5578064308727617e-06, "loss": 0.69231153, "num_input_tokens_seen": 152689725, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.36315918, "step": 7114, "time_per_iteration": 2.7031309604644775 }, { "auxiliary_loss_clip": 0.01440624, "auxiliary_loss_mlp": 0.0038289, "balance_loss_clip": 1.17052007, "balance_loss_mlp": 0.34719905, "epoch": 0.42777694273260186, "flos": 25044712318080.0, "grad_norm": 3.626198137318398, "language_loss": 0.70537508, "learning_rate": 2.5574324127625153e-06, "loss": 0.72361028, "num_input_tokens_seen": 152709375, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.35717773, "step": 7115, "time_per_iteration": 2.7031381130218506 }, { "auxiliary_loss_clip": 0.01420908, "auxiliary_loss_mlp": 0.00382235, "balance_loss_clip": 1.15865076, "balance_loss_mlp": 0.34640023, "epoch": 0.4278370659852698, "flos": 18661519226880.0, "grad_norm": 188.68052512244083, "language_loss": 0.79595053, "learning_rate": 2.5570583735143753e-06, "loss": 0.81398201, "num_input_tokens_seen": 152727510, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.3581543, "step": 7116, "time_per_iteration": 2.618859052658081 }, { "auxiliary_loss_clip": 0.01402048, "auxiliary_loss_mlp": 0.00338143, "balance_loss_clip": 1.14755774, "balance_loss_mlp": 0.30807853, "epoch": 0.4278971892379378, "flos": 27308269635840.0, "grad_norm": 94.34338349206381, "language_loss": 0.74346513, "learning_rate": 2.5566843131425275e-06, "loss": 0.760867, "num_input_tokens_seen": 152746670, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.30053711, "step": 7117, "time_per_iteration": 2.757567882537842 }, { "auxiliary_loss_clip": 0.01416203, "auxiliary_loss_mlp": 0.00332413, "balance_loss_clip": 1.15549803, "balance_loss_mlp": 0.30087003, "epoch": 0.42795731249060576, "flos": 12888698970240.0, "grad_norm": 3.41413914386177, "language_loss": 0.78429741, "learning_rate": 2.5563102316611536e-06, "loss": 0.80178356, "num_input_tokens_seen": 152760545, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.31542969, "step": 7118, "time_per_iteration": 2.6834030151367188 }, { "auxiliary_loss_clip": 0.01419761, "auxiliary_loss_mlp": 0.00329067, "balance_loss_clip": 1.15931344, "balance_loss_mlp": 0.297095, "epoch": 0.4280174357432737, "flos": 33401448316800.0, "grad_norm": 40.547738016411365, "language_loss": 0.81698263, "learning_rate": 2.55593612908444e-06, "loss": 0.83447087, "num_input_tokens_seen": 152780970, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.31982422, "step": 7119, "time_per_iteration": 4.2000038623809814 }, { "auxiliary_loss_clip": 0.01429617, "auxiliary_loss_mlp": 0.00360164, "balance_loss_clip": 1.16612267, "balance_loss_mlp": 0.32586703, "epoch": 0.4280775589959417, "flos": 18259104182400.0, "grad_norm": 2.9240873254572697, "language_loss": 0.81675106, "learning_rate": 2.555562005426573e-06, "loss": 0.83464885, "num_input_tokens_seen": 152798475, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.34289551, "step": 7120, "time_per_iteration": 2.818265914916992 }, { "auxiliary_loss_clip": 0.01418028, "auxiliary_loss_mlp": 0.00356131, "balance_loss_clip": 1.1580807, "balance_loss_mlp": 0.32508919, "epoch": 0.42813768224860965, "flos": 21471277731840.0, "grad_norm": 21.54018365248761, "language_loss": 0.8280611, "learning_rate": 2.5551878607017385e-06, "loss": 0.84580266, "num_input_tokens_seen": 152817555, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.31030273, "step": 7121, "time_per_iteration": 2.7444581985473633 }, { "auxiliary_loss_clip": 0.01398916, "auxiliary_loss_mlp": 0.00346533, "balance_loss_clip": 1.14227247, "balance_loss_mlp": 0.31522882, "epoch": 0.4281978055012776, "flos": 15669262696320.0, "grad_norm": 10.17972335458286, "language_loss": 0.92185867, "learning_rate": 2.554813694924126e-06, "loss": 0.93931317, "num_input_tokens_seen": 152836295, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.31298828, "step": 7122, "time_per_iteration": 2.702604055404663 }, { "auxiliary_loss_clip": 0.0141432, "auxiliary_loss_mlp": 0.00359047, "balance_loss_clip": 1.15234661, "balance_loss_mlp": 0.32643163, "epoch": 0.4282579287539456, "flos": 17712005155200.0, "grad_norm": 3.4714436003293856, "language_loss": 0.86373281, "learning_rate": 2.554439508107921e-06, "loss": 0.88146651, "num_input_tokens_seen": 152854950, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.32617188, "step": 7123, "time_per_iteration": 2.638716459274292 }, { "auxiliary_loss_clip": 0.01420004, "auxiliary_loss_mlp": 0.00336133, "balance_loss_clip": 1.1573925, "balance_loss_mlp": 0.30265915, "epoch": 0.42831805200661355, "flos": 19281157770240.0, "grad_norm": 14.631478987922854, "language_loss": 0.86919457, "learning_rate": 2.5540653002673153e-06, "loss": 0.88675594, "num_input_tokens_seen": 152873995, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.33496094, "step": 7124, "time_per_iteration": 2.65563702583313 }, { "auxiliary_loss_clip": 0.01413484, "auxiliary_loss_mlp": 0.00373909, "balance_loss_clip": 1.15175796, "balance_loss_mlp": 0.34326008, "epoch": 0.4283781752592815, "flos": 19792633484160.0, "grad_norm": 12.657679224617253, "language_loss": 0.86787498, "learning_rate": 2.553691071416498e-06, "loss": 0.88574892, "num_input_tokens_seen": 152892925, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.30639648, "step": 7125, "time_per_iteration": 2.6675937175750732 }, { "auxiliary_loss_clip": 0.01409652, "auxiliary_loss_mlp": 0.00321433, "balance_loss_clip": 1.15110636, "balance_loss_mlp": 0.29076016, "epoch": 0.4284382985119495, "flos": 16508064072960.0, "grad_norm": 13.603846000274487, "language_loss": 0.80978644, "learning_rate": 2.553316821569659e-06, "loss": 0.8270973, "num_input_tokens_seen": 152910935, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.3067627, "step": 7126, "time_per_iteration": 2.624483585357666 }, { "auxiliary_loss_clip": 0.0141142, "auxiliary_loss_mlp": 0.00330731, "balance_loss_clip": 1.15143859, "balance_loss_mlp": 0.2999984, "epoch": 0.42849842176461744, "flos": 23330767979520.0, "grad_norm": 5.12288237762217, "language_loss": 0.88156629, "learning_rate": 2.5529425507409913e-06, "loss": 0.89898777, "num_input_tokens_seen": 152931030, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.30737305, "step": 7127, "time_per_iteration": 2.661252975463867 }, { "auxiliary_loss_clip": 0.01419943, "auxiliary_loss_mlp": 0.00369014, "balance_loss_clip": 1.15551567, "balance_loss_mlp": 0.33391884, "epoch": 0.4285585450172854, "flos": 17274433674240.0, "grad_norm": 4.865906856089036, "language_loss": 0.82652295, "learning_rate": 2.5525682589446867e-06, "loss": 0.84441245, "num_input_tokens_seen": 152948085, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.35107422, "step": 7128, "time_per_iteration": 2.637255907058716 }, { "auxiliary_loss_clip": 0.01415149, "auxiliary_loss_mlp": 0.00365995, "balance_loss_clip": 1.15215516, "balance_loss_mlp": 0.33345109, "epoch": 0.42861866826995343, "flos": 24279599692800.0, "grad_norm": 6.234008216743909, "language_loss": 0.81713194, "learning_rate": 2.552193946194937e-06, "loss": 0.83494341, "num_input_tokens_seen": 152966265, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.32519531, "step": 7129, "time_per_iteration": 2.722487688064575 }, { "auxiliary_loss_clip": 0.0142647, "auxiliary_loss_mlp": 0.00400576, "balance_loss_clip": 1.16672802, "balance_loss_mlp": 0.36629128, "epoch": 0.4286787915226214, "flos": 24353108876160.0, "grad_norm": 199.6344674330613, "language_loss": 0.83966064, "learning_rate": 2.5518196125059394e-06, "loss": 0.85793108, "num_input_tokens_seen": 152986775, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.34277344, "step": 7130, "time_per_iteration": 2.7026095390319824 }, { "auxiliary_loss_clip": 0.0143577, "auxiliary_loss_mlp": 0.00363254, "balance_loss_clip": 1.17070735, "balance_loss_mlp": 0.33142531, "epoch": 0.42873891477528936, "flos": 15449992122240.0, "grad_norm": 6.947103758152384, "language_loss": 0.80768561, "learning_rate": 2.551445257891886e-06, "loss": 0.82567585, "num_input_tokens_seen": 153003595, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.31860352, "step": 7131, "time_per_iteration": 2.71919322013855 }, { "auxiliary_loss_clip": 0.01419645, "auxiliary_loss_mlp": 0.0036422, "balance_loss_clip": 1.15137815, "balance_loss_mlp": 0.32891029, "epoch": 0.4287990380279573, "flos": 17639573379840.0, "grad_norm": 3.394510288494608, "language_loss": 0.85560411, "learning_rate": 2.551070882366973e-06, "loss": 0.87344277, "num_input_tokens_seen": 153021960, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.3527832, "step": 7132, "time_per_iteration": 2.687697649002075 }, { "auxiliary_loss_clip": 0.01421183, "auxiliary_loss_mlp": 0.00379937, "balance_loss_clip": 1.15865028, "balance_loss_mlp": 0.34560466, "epoch": 0.4288591612806253, "flos": 27162328677120.0, "grad_norm": 7.313473470550568, "language_loss": 0.83171403, "learning_rate": 2.550696485945397e-06, "loss": 0.84972525, "num_input_tokens_seen": 153042110, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.34338379, "step": 7133, "time_per_iteration": 2.754601240158081 }, { "auxiliary_loss_clip": 0.01423565, "auxiliary_loss_mlp": 0.00344598, "balance_loss_clip": 1.15988207, "balance_loss_mlp": 0.31623814, "epoch": 0.42891928453329325, "flos": 17163182275200.0, "grad_norm": 16.464505855743376, "language_loss": 0.81177384, "learning_rate": 2.550322068641355e-06, "loss": 0.82945549, "num_input_tokens_seen": 153058925, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.28381348, "step": 7134, "time_per_iteration": 2.665741205215454 }, { "auxiliary_loss_clip": 0.01415487, "auxiliary_loss_mlp": 0.00356868, "balance_loss_clip": 1.15252614, "balance_loss_mlp": 0.32360855, "epoch": 0.4289794077859612, "flos": 18187031543040.0, "grad_norm": 3.440554725946858, "language_loss": 0.91357017, "learning_rate": 2.5499476304690455e-06, "loss": 0.93129373, "num_input_tokens_seen": 153078070, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.33276367, "step": 7135, "time_per_iteration": 2.699946641921997 }, { "auxiliary_loss_clip": 0.01402866, "auxiliary_loss_mlp": 0.00332513, "balance_loss_clip": 1.1439743, "balance_loss_mlp": 0.30166167, "epoch": 0.4290395310386292, "flos": 28256885867520.0, "grad_norm": 47.46720110848756, "language_loss": 0.83409321, "learning_rate": 2.549573171442666e-06, "loss": 0.85144699, "num_input_tokens_seen": 153096680, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.30834961, "step": 7136, "time_per_iteration": 2.721072196960449 }, { "auxiliary_loss_clip": 0.0142529, "auxiliary_loss_mlp": 0.00368166, "balance_loss_clip": 1.15947664, "balance_loss_mlp": 0.33688551, "epoch": 0.42909965429129715, "flos": 16216074414720.0, "grad_norm": 91.00852505679585, "language_loss": 0.85933012, "learning_rate": 2.5491986915764175e-06, "loss": 0.87726462, "num_input_tokens_seen": 153113305, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.31274414, "step": 7137, "time_per_iteration": 2.589449405670166 }, { "auxiliary_loss_clip": 0.01424556, "auxiliary_loss_mlp": 0.00378908, "balance_loss_clip": 1.16131032, "balance_loss_mlp": 0.34705564, "epoch": 0.4291597775439651, "flos": 23112862122240.0, "grad_norm": 5.203940834573685, "language_loss": 0.83732575, "learning_rate": 2.548824190884499e-06, "loss": 0.85536039, "num_input_tokens_seen": 153132735, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.3182373, "step": 7138, "time_per_iteration": 2.665797233581543 }, { "auxiliary_loss_clip": 0.01518022, "auxiliary_loss_mlp": 0.0007882, "balance_loss_clip": 1.27543378, "balance_loss_mlp": 0.06985509, "epoch": 0.4292199007966331, "flos": 67546212681600.0, "grad_norm": 0.9156107643159812, "language_loss": 0.55721092, "learning_rate": 2.548449669381113e-06, "loss": 0.57317936, "num_input_tokens_seen": 153187925, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.08984375, "step": 7139, "time_per_iteration": 2.967646598815918 }, { "auxiliary_loss_clip": 0.01404396, "auxiliary_loss_mlp": 0.00349085, "balance_loss_clip": 1.14280927, "balance_loss_mlp": 0.31921107, "epoch": 0.42928002404930105, "flos": 22999850956800.0, "grad_norm": 19.875443109773613, "language_loss": 0.87468398, "learning_rate": 2.5480751270804595e-06, "loss": 0.89221883, "num_input_tokens_seen": 153206990, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.29858398, "step": 7140, "time_per_iteration": 2.707534074783325 }, { "auxiliary_loss_clip": 0.01427916, "auxiliary_loss_mlp": 0.00371825, "balance_loss_clip": 1.15912676, "balance_loss_mlp": 0.33978167, "epoch": 0.429340147301969, "flos": 11544922241280.0, "grad_norm": 89.82264770161768, "language_loss": 0.88910127, "learning_rate": 2.5477005639967424e-06, "loss": 0.90709865, "num_input_tokens_seen": 153222345, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.3203125, "step": 7141, "time_per_iteration": 2.6473805904388428 }, { "auxiliary_loss_clip": 0.01424592, "auxiliary_loss_mlp": 0.00390321, "balance_loss_clip": 1.15483022, "balance_loss_mlp": 0.35834938, "epoch": 0.42940027055463703, "flos": 25264988472960.0, "grad_norm": 34.15608585997064, "language_loss": 0.91637003, "learning_rate": 2.547325980144166e-06, "loss": 0.93451917, "num_input_tokens_seen": 153240570, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.31958008, "step": 7142, "time_per_iteration": 2.703907012939453 }, { "auxiliary_loss_clip": 0.01409552, "auxiliary_loss_mlp": 0.00346976, "balance_loss_clip": 1.14609432, "balance_loss_mlp": 0.31569499, "epoch": 0.429460393807305, "flos": 23805004268160.0, "grad_norm": 2.4168408941088697, "language_loss": 0.86364543, "learning_rate": 2.5469513755369323e-06, "loss": 0.88121068, "num_input_tokens_seen": 153259575, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.3125, "step": 7143, "time_per_iteration": 2.7025365829467773 }, { "auxiliary_loss_clip": 0.01436212, "auxiliary_loss_mlp": 0.00340501, "balance_loss_clip": 1.1688385, "balance_loss_mlp": 0.31150907, "epoch": 0.42952051705997296, "flos": 13918294414080.0, "grad_norm": 16.223084978859546, "language_loss": 0.83573204, "learning_rate": 2.5465767501892484e-06, "loss": 0.85349917, "num_input_tokens_seen": 153276650, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.28979492, "step": 7144, "time_per_iteration": 2.616328716278076 }, { "auxiliary_loss_clip": 0.01421029, "auxiliary_loss_mlp": 0.00389635, "balance_loss_clip": 1.15618563, "balance_loss_mlp": 0.35794911, "epoch": 0.4295806403126409, "flos": 26760380509440.0, "grad_norm": 2.0096398810518457, "language_loss": 0.80170739, "learning_rate": 2.54620210411532e-06, "loss": 0.81981409, "num_input_tokens_seen": 153298025, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.31665039, "step": 7145, "time_per_iteration": 2.7041823863983154 }, { "auxiliary_loss_clip": 0.01426079, "auxiliary_loss_mlp": 0.00388062, "balance_loss_clip": 1.15897238, "balance_loss_mlp": 0.35394403, "epoch": 0.4296407635653089, "flos": 20952619297920.0, "grad_norm": 5.8101328660861356, "language_loss": 0.86735916, "learning_rate": 2.545827437329352e-06, "loss": 0.88550055, "num_input_tokens_seen": 153315775, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.34106445, "step": 7146, "time_per_iteration": 2.6698684692382812 }, { "auxiliary_loss_clip": 0.014135, "auxiliary_loss_mlp": 0.0037979, "balance_loss_clip": 1.15168607, "balance_loss_mlp": 0.34855732, "epoch": 0.42970088681797686, "flos": 15852335339520.0, "grad_norm": 6.281410613121259, "language_loss": 0.9041661, "learning_rate": 2.5454527498455532e-06, "loss": 0.92209899, "num_input_tokens_seen": 153332765, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.31225586, "step": 7147, "time_per_iteration": 2.6322779655456543 }, { "auxiliary_loss_clip": 0.01425265, "auxiliary_loss_mlp": 0.00357451, "balance_loss_clip": 1.16081333, "balance_loss_mlp": 0.32650429, "epoch": 0.4297610100706448, "flos": 22382618624640.0, "grad_norm": 9.653840365409316, "language_loss": 0.92852342, "learning_rate": 2.545078041678131e-06, "loss": 0.94635057, "num_input_tokens_seen": 153350760, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.30957031, "step": 7148, "time_per_iteration": 2.647468328475952 }, { "auxiliary_loss_clip": 0.01426505, "auxiliary_loss_mlp": 0.00356843, "balance_loss_clip": 1.16213918, "balance_loss_mlp": 0.32797006, "epoch": 0.4298211333233128, "flos": 27925681536000.0, "grad_norm": 36.50740341213853, "language_loss": 0.83949488, "learning_rate": 2.5447033128412957e-06, "loss": 0.8573283, "num_input_tokens_seen": 153370765, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.28857422, "step": 7149, "time_per_iteration": 2.7169911861419678 }, { "auxiliary_loss_clip": 0.01405787, "auxiliary_loss_mlp": 0.00332195, "balance_loss_clip": 1.14414692, "balance_loss_mlp": 0.30131957, "epoch": 0.42988125657598075, "flos": 24425612478720.0, "grad_norm": 55.9217417785159, "language_loss": 0.84991521, "learning_rate": 2.544328563349256e-06, "loss": 0.86729503, "num_input_tokens_seen": 153390725, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.30883789, "step": 7150, "time_per_iteration": 2.690232992172241 }, { "auxiliary_loss_clip": 0.01437225, "auxiliary_loss_mlp": 0.00378892, "balance_loss_clip": 1.16565275, "balance_loss_mlp": 0.34565622, "epoch": 0.4299413798286487, "flos": 15850180523520.0, "grad_norm": 377.10358430178076, "language_loss": 0.83169556, "learning_rate": 2.5439537932162222e-06, "loss": 0.84985673, "num_input_tokens_seen": 153408010, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.33251953, "step": 7151, "time_per_iteration": 4.069535970687866 }, { "auxiliary_loss_clip": 0.01439547, "auxiliary_loss_mlp": 0.00400937, "balance_loss_clip": 1.16874015, "balance_loss_mlp": 0.36784431, "epoch": 0.4300015030813167, "flos": 22309504490880.0, "grad_norm": 6.827970250233639, "language_loss": 0.7777983, "learning_rate": 2.543579002456406e-06, "loss": 0.79620314, "num_input_tokens_seen": 153426865, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.33093262, "step": 7152, "time_per_iteration": 2.628608226776123 }, { "auxiliary_loss_clip": 0.01414489, "auxiliary_loss_mlp": 0.00373576, "balance_loss_clip": 1.14765453, "balance_loss_mlp": 0.33982787, "epoch": 0.43006162633398465, "flos": 34897666366080.0, "grad_norm": 2.725355718024374, "language_loss": 0.77548903, "learning_rate": 2.54320419108402e-06, "loss": 0.79336965, "num_input_tokens_seen": 153449410, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.33752441, "step": 7153, "time_per_iteration": 4.2659454345703125 }, { "auxiliary_loss_clip": 0.01424192, "auxiliary_loss_mlp": 0.00379727, "balance_loss_clip": 1.16157126, "balance_loss_mlp": 0.34937626, "epoch": 0.4301217495866526, "flos": 15961575576960.0, "grad_norm": 19.66966547461353, "language_loss": 0.86775249, "learning_rate": 2.542829359113276e-06, "loss": 0.88579166, "num_input_tokens_seen": 153467910, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.3034668, "step": 7154, "time_per_iteration": 2.708641767501831 }, { "auxiliary_loss_clip": 0.01424284, "auxiliary_loss_mlp": 0.00372995, "balance_loss_clip": 1.1605978, "balance_loss_mlp": 0.34238189, "epoch": 0.43018187283932063, "flos": 18770364414720.0, "grad_norm": 111.73189539329825, "language_loss": 0.84537005, "learning_rate": 2.542454506558389e-06, "loss": 0.86334288, "num_input_tokens_seen": 153487100, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.3059082, "step": 7155, "time_per_iteration": 2.623826026916504 }, { "auxiliary_loss_clip": 0.01420391, "auxiliary_loss_mlp": 0.00376074, "balance_loss_clip": 1.15669847, "balance_loss_mlp": 0.34517458, "epoch": 0.4302419960919886, "flos": 20151703791360.0, "grad_norm": 39.48521671961109, "language_loss": 0.96495032, "learning_rate": 2.5420796334335723e-06, "loss": 0.98291498, "num_input_tokens_seen": 153505565, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.30859375, "step": 7156, "time_per_iteration": 4.05840277671814 }, { "auxiliary_loss_clip": 0.01435762, "auxiliary_loss_mlp": 0.00390469, "balance_loss_clip": 1.16839933, "balance_loss_mlp": 0.35594553, "epoch": 0.43030211934465656, "flos": 26432731624320.0, "grad_norm": 2.107980655766426, "language_loss": 0.90770817, "learning_rate": 2.541704739753042e-06, "loss": 0.92597044, "num_input_tokens_seen": 153526130, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.34533691, "step": 7157, "time_per_iteration": 2.701692581176758 }, { "auxiliary_loss_clip": 0.01461755, "auxiliary_loss_mlp": 0.0037226, "balance_loss_clip": 1.18858123, "balance_loss_mlp": 0.34105131, "epoch": 0.43036224259732453, "flos": 24389234979840.0, "grad_norm": 12.710639356319104, "language_loss": 0.80839097, "learning_rate": 2.5413298255310132e-06, "loss": 0.82673115, "num_input_tokens_seen": 153546370, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.31225586, "step": 7158, "time_per_iteration": 2.6990506649017334 }, { "auxiliary_loss_clip": 0.01438446, "auxiliary_loss_mlp": 0.00382121, "balance_loss_clip": 1.17134213, "balance_loss_mlp": 0.35198486, "epoch": 0.4304223658499925, "flos": 17201714590080.0, "grad_norm": 26.253594870742596, "language_loss": 0.89170933, "learning_rate": 2.5409548907817034e-06, "loss": 0.90991503, "num_input_tokens_seen": 153562800, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.30151367, "step": 7159, "time_per_iteration": 2.622833490371704 }, { "auxiliary_loss_clip": 0.01442834, "auxiliary_loss_mlp": 0.00365905, "balance_loss_clip": 1.170542, "balance_loss_mlp": 0.33486319, "epoch": 0.43048248910266046, "flos": 14903000835840.0, "grad_norm": 12.647159439448382, "language_loss": 0.90836459, "learning_rate": 2.54057993551933e-06, "loss": 0.92645192, "num_input_tokens_seen": 153578395, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.31079102, "step": 7160, "time_per_iteration": 2.6387205123901367 }, { "auxiliary_loss_clip": 0.01443624, "auxiliary_loss_mlp": 0.00366723, "balance_loss_clip": 1.17059445, "balance_loss_mlp": 0.3309606, "epoch": 0.4305426123553284, "flos": 21579835610880.0, "grad_norm": 82.39527751287787, "language_loss": 0.85798943, "learning_rate": 2.5402049597581116e-06, "loss": 0.87609291, "num_input_tokens_seen": 153596880, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.35742188, "step": 7161, "time_per_iteration": 4.123345375061035 }, { "auxiliary_loss_clip": 0.01442817, "auxiliary_loss_mlp": 0.00351967, "balance_loss_clip": 1.17238212, "balance_loss_mlp": 0.31935117, "epoch": 0.4306027356079964, "flos": 22601278667520.0, "grad_norm": 20.116261959792606, "language_loss": 0.79212642, "learning_rate": 2.5398299635122662e-06, "loss": 0.81007433, "num_input_tokens_seen": 153616570, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.32592773, "step": 7162, "time_per_iteration": 2.715836524963379 }, { "auxiliary_loss_clip": 0.01490764, "auxiliary_loss_mlp": 0.00090424, "balance_loss_clip": 1.22327089, "balance_loss_mlp": 0.08207938, "epoch": 0.43066285886066435, "flos": 70672091806080.0, "grad_norm": 0.7837524237061969, "language_loss": 0.58379459, "learning_rate": 2.5394549467960147e-06, "loss": 0.59960651, "num_input_tokens_seen": 153671450, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.08349609, "step": 7163, "time_per_iteration": 3.0688247680664062 }, { "auxiliary_loss_clip": 0.01437003, "auxiliary_loss_mlp": 0.00349624, "balance_loss_clip": 1.16851091, "balance_loss_mlp": 0.31791425, "epoch": 0.4307229821133323, "flos": 26720591218560.0, "grad_norm": 3.44819501423517, "language_loss": 0.85399044, "learning_rate": 2.5390799096235783e-06, "loss": 0.87185669, "num_input_tokens_seen": 153691405, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.31713867, "step": 7164, "time_per_iteration": 2.7044460773468018 }, { "auxiliary_loss_clip": 0.0144059, "auxiliary_loss_mlp": 0.00366558, "balance_loss_clip": 1.16990185, "balance_loss_mlp": 0.33346543, "epoch": 0.4307831053660003, "flos": 26177119464960.0, "grad_norm": 11.63726781041606, "language_loss": 0.77364558, "learning_rate": 2.538704852009177e-06, "loss": 0.79171705, "num_input_tokens_seen": 153711555, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.33081055, "step": 7165, "time_per_iteration": 2.706228256225586 }, { "auxiliary_loss_clip": 0.01440678, "auxiliary_loss_mlp": 0.0035404, "balance_loss_clip": 1.16952014, "balance_loss_mlp": 0.32149562, "epoch": 0.43084322861866825, "flos": 18910343715840.0, "grad_norm": 6.747080834053999, "language_loss": 0.82079709, "learning_rate": 2.538329773967034e-06, "loss": 0.83874422, "num_input_tokens_seen": 153730095, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.32568359, "step": 7166, "time_per_iteration": 2.634289026260376 }, { "auxiliary_loss_clip": 0.01429059, "auxiliary_loss_mlp": 0.00267218, "balance_loss_clip": 1.16503322, "balance_loss_mlp": 0.23579484, "epoch": 0.4309033518713362, "flos": 26432911192320.0, "grad_norm": 10.921742262901743, "language_loss": 0.79380149, "learning_rate": 2.537954675511372e-06, "loss": 0.81076431, "num_input_tokens_seen": 153749320, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.31445312, "step": 7167, "time_per_iteration": 2.822505474090576 }, { "auxiliary_loss_clip": 0.01430486, "auxiliary_loss_mlp": 0.00304959, "balance_loss_clip": 1.16670132, "balance_loss_mlp": 0.27210492, "epoch": 0.43096347512400424, "flos": 21213295274880.0, "grad_norm": 3.8994790281789085, "language_loss": 0.84082699, "learning_rate": 2.537579556656414e-06, "loss": 0.85818148, "num_input_tokens_seen": 153767825, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.32849121, "step": 7168, "time_per_iteration": 2.736447811126709 }, { "auxiliary_loss_clip": 0.01435901, "auxiliary_loss_mlp": 0.00285231, "balance_loss_clip": 1.17114377, "balance_loss_mlp": 0.2539984, "epoch": 0.4310235983766722, "flos": 16540131939840.0, "grad_norm": 113.98137751020259, "language_loss": 0.91951072, "learning_rate": 2.537204417416387e-06, "loss": 0.93672198, "num_input_tokens_seen": 153785350, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.3125, "step": 7169, "time_per_iteration": 2.7616074085235596 }, { "auxiliary_loss_clip": 0.01449768, "auxiliary_loss_mlp": 0.00071009, "balance_loss_clip": 1.19874334, "balance_loss_mlp": 0.06118573, "epoch": 0.43108372162934017, "flos": 64775704763520.0, "grad_norm": 0.6702405346718439, "language_loss": 0.60768765, "learning_rate": 2.5368292578055132e-06, "loss": 0.62289542, "num_input_tokens_seen": 153856400, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.09814453, "step": 7170, "time_per_iteration": 3.2945749759674072 }, { "auxiliary_loss_clip": 0.01432567, "auxiliary_loss_mlp": 0.00296215, "balance_loss_clip": 1.16662598, "balance_loss_mlp": 0.26500595, "epoch": 0.43114384488200813, "flos": 13444094039040.0, "grad_norm": 4.348649547854899, "language_loss": 0.85631067, "learning_rate": 2.536454077838021e-06, "loss": 0.87359846, "num_input_tokens_seen": 153875230, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.31201172, "step": 7171, "time_per_iteration": 2.665846347808838 }, { "auxiliary_loss_clip": 0.01452716, "auxiliary_loss_mlp": 0.00269101, "balance_loss_clip": 1.18826067, "balance_loss_mlp": 0.23762953, "epoch": 0.4312039681346761, "flos": 26286682924800.0, "grad_norm": 10.019786500727852, "language_loss": 0.82775021, "learning_rate": 2.5360788775281357e-06, "loss": 0.84496838, "num_input_tokens_seen": 153894740, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.31445312, "step": 7172, "time_per_iteration": 2.6803464889526367 }, { "auxiliary_loss_clip": 0.01457939, "auxiliary_loss_mlp": 0.00297945, "balance_loss_clip": 1.18952763, "balance_loss_mlp": 0.26294494, "epoch": 0.43126409138734406, "flos": 20376684627840.0, "grad_norm": 52.46277923140172, "language_loss": 0.85039186, "learning_rate": 2.535703656890086e-06, "loss": 0.86795068, "num_input_tokens_seen": 153913230, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.35009766, "step": 7173, "time_per_iteration": 2.6240451335906982 }, { "auxiliary_loss_clip": 0.01438341, "auxiliary_loss_mlp": 0.00266428, "balance_loss_clip": 1.17634463, "balance_loss_mlp": 0.23388404, "epoch": 0.431324214640012, "flos": 22123091882880.0, "grad_norm": 9.050033533898832, "language_loss": 0.82667494, "learning_rate": 2.5353284159381e-06, "loss": 0.84372264, "num_input_tokens_seen": 153933250, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.32568359, "step": 7174, "time_per_iteration": 2.72456955909729 }, { "auxiliary_loss_clip": 0.01449324, "auxiliary_loss_mlp": 0.00294285, "balance_loss_clip": 1.18470168, "balance_loss_mlp": 0.26216999, "epoch": 0.43138433789268, "flos": 15231008856960.0, "grad_norm": 4.979708424583863, "language_loss": 0.88370699, "learning_rate": 2.534953154686407e-06, "loss": 0.90114313, "num_input_tokens_seen": 153951325, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.32104492, "step": 7175, "time_per_iteration": 2.6341567039489746 }, { "auxiliary_loss_clip": 0.01455703, "auxiliary_loss_mlp": 0.0027519, "balance_loss_clip": 1.18239677, "balance_loss_mlp": 0.23959459, "epoch": 0.43144446114534796, "flos": 18150294908160.0, "grad_norm": 14.545115892762864, "language_loss": 0.83099043, "learning_rate": 2.5345778731492366e-06, "loss": 0.84829938, "num_input_tokens_seen": 153966975, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.35620117, "step": 7176, "time_per_iteration": 2.6232852935791016 }, { "auxiliary_loss_clip": 0.0146042, "auxiliary_loss_mlp": 0.00277729, "balance_loss_clip": 1.19216311, "balance_loss_mlp": 0.24334925, "epoch": 0.4315045843980159, "flos": 22929861306240.0, "grad_norm": 3.001195058673707, "language_loss": 0.80342984, "learning_rate": 2.534202571340819e-06, "loss": 0.82081133, "num_input_tokens_seen": 153986695, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.34375, "step": 7177, "time_per_iteration": 2.671656608581543 }, { "auxiliary_loss_clip": 0.01469312, "auxiliary_loss_mlp": 0.00306334, "balance_loss_clip": 1.19654763, "balance_loss_mlp": 0.27026165, "epoch": 0.4315647076506839, "flos": 22126862810880.0, "grad_norm": 15.483687929846662, "language_loss": 0.87780255, "learning_rate": 2.533827249275387e-06, "loss": 0.89555901, "num_input_tokens_seen": 154004710, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.3605957, "step": 7178, "time_per_iteration": 2.665947914123535 }, { "auxiliary_loss_clip": 0.01457767, "auxiliary_loss_mlp": 0.0022861, "balance_loss_clip": 1.20025349, "balance_loss_mlp": 0.19449246, "epoch": 0.43162483090335185, "flos": 26871129118080.0, "grad_norm": 2.7518927682637573, "language_loss": 0.88790548, "learning_rate": 2.5334519069671725e-06, "loss": 0.90476918, "num_input_tokens_seen": 154024320, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.34106445, "step": 7179, "time_per_iteration": 2.7014245986938477 }, { "auxiliary_loss_clip": 0.01452272, "auxiliary_loss_mlp": 0.00273042, "balance_loss_clip": 1.1895169, "balance_loss_mlp": 0.23975897, "epoch": 0.4316849541560198, "flos": 13913122855680.0, "grad_norm": 7.463791004743366, "language_loss": 0.82816553, "learning_rate": 2.5330765444304075e-06, "loss": 0.84541869, "num_input_tokens_seen": 154041755, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.33276367, "step": 7180, "time_per_iteration": 2.666935920715332 }, { "auxiliary_loss_clip": 0.01466563, "auxiliary_loss_mlp": 0.00280125, "balance_loss_clip": 1.19995236, "balance_loss_mlp": 0.24603131, "epoch": 0.4317450774086878, "flos": 16435165420800.0, "grad_norm": 2.584692010453137, "language_loss": 0.86806178, "learning_rate": 2.5327011616793274e-06, "loss": 0.88552868, "num_input_tokens_seen": 154056775, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.34130859, "step": 7181, "time_per_iteration": 2.6034936904907227 }, { "auxiliary_loss_clip": 0.01471696, "auxiliary_loss_mlp": 0.00286972, "balance_loss_clip": 1.2046386, "balance_loss_mlp": 0.25018382, "epoch": 0.4318052006613558, "flos": 20554980762240.0, "grad_norm": 6.050514979341648, "language_loss": 0.93978953, "learning_rate": 2.532325758728165e-06, "loss": 0.95737618, "num_input_tokens_seen": 154075015, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.36816406, "step": 7182, "time_per_iteration": 2.6644339561462402 }, { "auxiliary_loss_clip": 0.01462542, "auxiliary_loss_mlp": 0.00253502, "balance_loss_clip": 1.19770265, "balance_loss_mlp": 0.21971811, "epoch": 0.43186532391402377, "flos": 22820046451200.0, "grad_norm": 4.754953774570806, "language_loss": 0.83497411, "learning_rate": 2.5319503355911566e-06, "loss": 0.85213453, "num_input_tokens_seen": 154095170, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.33789062, "step": 7183, "time_per_iteration": 2.6813955307006836 }, { "auxiliary_loss_clip": 0.0148252, "auxiliary_loss_mlp": 0.00282145, "balance_loss_clip": 1.21071219, "balance_loss_mlp": 0.24786082, "epoch": 0.43192544716669173, "flos": 25556583081600.0, "grad_norm": 4.120138537035195, "language_loss": 0.85567272, "learning_rate": 2.5315748922825393e-06, "loss": 0.87331939, "num_input_tokens_seen": 154116895, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.3425293, "step": 7184, "time_per_iteration": 2.7311789989471436 }, { "auxiliary_loss_clip": 0.01446807, "auxiliary_loss_mlp": 0.00293422, "balance_loss_clip": 1.18532193, "balance_loss_mlp": 0.25959027, "epoch": 0.4319855704193597, "flos": 30954674701440.0, "grad_norm": 4.60213892927688, "language_loss": 0.78348213, "learning_rate": 2.5311994288165474e-06, "loss": 0.80088449, "num_input_tokens_seen": 154138395, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.33837891, "step": 7185, "time_per_iteration": 2.731214761734009 }, { "auxiliary_loss_clip": 0.01456488, "auxiliary_loss_mlp": 0.00283047, "balance_loss_clip": 1.19026899, "balance_loss_mlp": 0.24518618, "epoch": 0.43204569367202766, "flos": 24238732993920.0, "grad_norm": 79.95896151293607, "language_loss": 0.84374297, "learning_rate": 2.530823945207421e-06, "loss": 0.86113834, "num_input_tokens_seen": 154156775, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.37841797, "step": 7186, "time_per_iteration": 2.7694082260131836 }, { "auxiliary_loss_clip": 0.01454381, "auxiliary_loss_mlp": 0.00266866, "balance_loss_clip": 1.18760681, "balance_loss_mlp": 0.23541833, "epoch": 0.43210581692469563, "flos": 18406948561920.0, "grad_norm": 5.447344360697795, "language_loss": 0.82219481, "learning_rate": 2.5304484414693962e-06, "loss": 0.83940727, "num_input_tokens_seen": 154177500, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.31445312, "step": 7187, "time_per_iteration": 2.73030948638916 }, { "auxiliary_loss_clip": 0.01395332, "auxiliary_loss_mlp": 0.00073325, "balance_loss_clip": 1.17715478, "balance_loss_mlp": 0.05749412, "epoch": 0.4321659401773636, "flos": 49832378910720.0, "grad_norm": 0.842075485169368, "language_loss": 0.67770946, "learning_rate": 2.530072917616714e-06, "loss": 0.69239604, "num_input_tokens_seen": 154237110, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.15820312, "step": 7188, "time_per_iteration": 3.1595420837402344 }, { "auxiliary_loss_clip": 0.01435027, "auxiliary_loss_mlp": 0.00253924, "balance_loss_clip": 1.17219186, "balance_loss_mlp": 0.22171409, "epoch": 0.43222606343003156, "flos": 17128564542720.0, "grad_norm": 441.2249633359303, "language_loss": 0.84926099, "learning_rate": 2.529697373663614e-06, "loss": 0.86615056, "num_input_tokens_seen": 154253910, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.32226562, "step": 7189, "time_per_iteration": 2.6554195880889893 }, { "auxiliary_loss_clip": 0.01461649, "auxiliary_loss_mlp": 0.00287315, "balance_loss_clip": 1.19228899, "balance_loss_mlp": 0.25081325, "epoch": 0.4322861866826995, "flos": 22749949059840.0, "grad_norm": 9.447335287226995, "language_loss": 0.79178882, "learning_rate": 2.5293218096243364e-06, "loss": 0.80927849, "num_input_tokens_seen": 154274770, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.36523438, "step": 7190, "time_per_iteration": 2.679560422897339 }, { "auxiliary_loss_clip": 0.01426201, "auxiliary_loss_mlp": 0.00248833, "balance_loss_clip": 1.16558385, "balance_loss_mlp": 0.21724211, "epoch": 0.4323463099353675, "flos": 27891925729920.0, "grad_norm": 11.168723045770053, "language_loss": 0.86012751, "learning_rate": 2.5289462255131223e-06, "loss": 0.87687784, "num_input_tokens_seen": 154295035, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.31567383, "step": 7191, "time_per_iteration": 2.690704584121704 }, { "auxiliary_loss_clip": 0.0143051, "auxiliary_loss_mlp": 0.00253697, "balance_loss_clip": 1.16996622, "balance_loss_mlp": 0.22041389, "epoch": 0.43240643318803546, "flos": 21614740652160.0, "grad_norm": 40.65194698389577, "language_loss": 0.82995749, "learning_rate": 2.5285706213442146e-06, "loss": 0.84679955, "num_input_tokens_seen": 154314905, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.33251953, "step": 7192, "time_per_iteration": 2.687068462371826 }, { "auxiliary_loss_clip": 0.01456987, "auxiliary_loss_mlp": 0.00279588, "balance_loss_clip": 1.1932745, "balance_loss_mlp": 0.24396834, "epoch": 0.4324665564407034, "flos": 17558378686080.0, "grad_norm": 600.5018024235204, "language_loss": 0.86684823, "learning_rate": 2.5281949971318557e-06, "loss": 0.88421398, "num_input_tokens_seen": 154331740, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.35620117, "step": 7193, "time_per_iteration": 4.009444952011108 }, { "auxiliary_loss_clip": 0.01425806, "auxiliary_loss_mlp": 0.00268568, "balance_loss_clip": 1.16502035, "balance_loss_mlp": 0.2329479, "epoch": 0.4325266796933714, "flos": 18402423448320.0, "grad_norm": 26.484595985183503, "language_loss": 0.84366411, "learning_rate": 2.5278193528902897e-06, "loss": 0.86060786, "num_input_tokens_seen": 154348740, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.35668945, "step": 7194, "time_per_iteration": 2.6324357986450195 }, { "auxiliary_loss_clip": 0.01462997, "auxiliary_loss_mlp": 0.00275779, "balance_loss_clip": 1.19741321, "balance_loss_mlp": 0.2437351, "epoch": 0.4325868029460394, "flos": 22564793427840.0, "grad_norm": 20.19416897130306, "language_loss": 0.68506193, "learning_rate": 2.5274436886337613e-06, "loss": 0.70244968, "num_input_tokens_seen": 154368835, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.32055664, "step": 7195, "time_per_iteration": 4.13252067565918 }, { "auxiliary_loss_clip": 0.01455698, "auxiliary_loss_mlp": 0.00272196, "balance_loss_clip": 1.18673968, "balance_loss_mlp": 0.23986597, "epoch": 0.43264692619870737, "flos": 14605516396800.0, "grad_norm": 13.828701609799227, "language_loss": 0.76482379, "learning_rate": 2.527068004376515e-06, "loss": 0.7821027, "num_input_tokens_seen": 154384620, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.32324219, "step": 7196, "time_per_iteration": 2.6613595485687256 }, { "auxiliary_loss_clip": 0.01453102, "auxiliary_loss_mlp": 0.00288667, "balance_loss_clip": 1.18631709, "balance_loss_mlp": 0.25419188, "epoch": 0.43270704945137534, "flos": 21501657659520.0, "grad_norm": 5.8836820624291315, "language_loss": 0.78690773, "learning_rate": 2.526692300132797e-06, "loss": 0.80432546, "num_input_tokens_seen": 154402865, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.34472656, "step": 7197, "time_per_iteration": 2.673870325088501 }, { "auxiliary_loss_clip": 0.01441989, "auxiliary_loss_mlp": 0.00272137, "balance_loss_clip": 1.18103766, "balance_loss_mlp": 0.23809057, "epoch": 0.4327671727040433, "flos": 25155891889920.0, "grad_norm": 7.060082757211363, "language_loss": 0.78107178, "learning_rate": 2.5263165759168547e-06, "loss": 0.79821301, "num_input_tokens_seen": 154423625, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.34057617, "step": 7198, "time_per_iteration": 4.0738301277160645 }, { "auxiliary_loss_clip": 0.01436076, "auxiliary_loss_mlp": 0.00259264, "balance_loss_clip": 1.17446291, "balance_loss_mlp": 0.22698174, "epoch": 0.43282729595671127, "flos": 25447163276160.0, "grad_norm": 9.56450311146034, "language_loss": 0.86200112, "learning_rate": 2.525940831742934e-06, "loss": 0.87895453, "num_input_tokens_seen": 154444775, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.32250977, "step": 7199, "time_per_iteration": 2.704907178878784 }, { "auxiliary_loss_clip": 0.01449737, "auxiliary_loss_mlp": 0.00240935, "balance_loss_clip": 1.18876767, "balance_loss_mlp": 0.21034542, "epoch": 0.43288741920937923, "flos": 24126116878080.0, "grad_norm": 27.181625789653122, "language_loss": 0.75269544, "learning_rate": 2.525565067625286e-06, "loss": 0.76960212, "num_input_tokens_seen": 154460815, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.3059082, "step": 7200, "time_per_iteration": 2.6408040523529053 }, { "auxiliary_loss_clip": 0.01460342, "auxiliary_loss_mlp": 0.00260752, "balance_loss_clip": 1.19230592, "balance_loss_mlp": 0.22415429, "epoch": 0.4329475424620472, "flos": 19204955066880.0, "grad_norm": 4.29949919949412, "language_loss": 0.9463104, "learning_rate": 2.525189283578157e-06, "loss": 0.9635213, "num_input_tokens_seen": 154479145, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.36621094, "step": 7201, "time_per_iteration": 2.6874680519104004 }, { "auxiliary_loss_clip": 0.01467047, "auxiliary_loss_mlp": 0.00294465, "balance_loss_clip": 1.19434047, "balance_loss_mlp": 0.25619882, "epoch": 0.43300766571471516, "flos": 22638374438400.0, "grad_norm": 3.6340710272290484, "language_loss": 0.72850275, "learning_rate": 2.5248134796157974e-06, "loss": 0.74611795, "num_input_tokens_seen": 154498905, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.38305664, "step": 7202, "time_per_iteration": 2.677262783050537 }, { "auxiliary_loss_clip": 0.01466043, "auxiliary_loss_mlp": 0.00280255, "balance_loss_clip": 1.20251703, "balance_loss_mlp": 0.24735323, "epoch": 0.4330677889673831, "flos": 22121080721280.0, "grad_norm": 8.422063697389142, "language_loss": 0.87548876, "learning_rate": 2.5244376557524586e-06, "loss": 0.89295173, "num_input_tokens_seen": 154517270, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.32861328, "step": 7203, "time_per_iteration": 4.102859735488892 }, { "auxiliary_loss_clip": 0.0145518, "auxiliary_loss_mlp": 0.00268621, "balance_loss_clip": 1.18508315, "balance_loss_mlp": 0.23254836, "epoch": 0.4331279122200511, "flos": 23221527742080.0, "grad_norm": 4.536724385730094, "language_loss": 0.89160109, "learning_rate": 2.5240618120023912e-06, "loss": 0.90883911, "num_input_tokens_seen": 154535945, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.36083984, "step": 7204, "time_per_iteration": 2.6761093139648438 }, { "auxiliary_loss_clip": 0.01451892, "auxiliary_loss_mlp": 0.00276898, "balance_loss_clip": 1.18587911, "balance_loss_mlp": 0.24306688, "epoch": 0.43318803547271906, "flos": 18259750627200.0, "grad_norm": 6.755103397887986, "language_loss": 0.82254851, "learning_rate": 2.5236859483798468e-06, "loss": 0.83983648, "num_input_tokens_seen": 154554935, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.33813477, "step": 7205, "time_per_iteration": 2.598172426223755 }, { "auxiliary_loss_clip": 0.01455564, "auxiliary_loss_mlp": 0.0025219, "balance_loss_clip": 1.19678903, "balance_loss_mlp": 0.22021803, "epoch": 0.433248158725387, "flos": 27418407713280.0, "grad_norm": 2.4844686035839265, "language_loss": 0.8093369, "learning_rate": 2.5233100648990803e-06, "loss": 0.82641447, "num_input_tokens_seen": 154576065, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.31933594, "step": 7206, "time_per_iteration": 2.769469976425171 }, { "auxiliary_loss_clip": 0.01448681, "auxiliary_loss_mlp": 0.00250931, "balance_loss_clip": 1.1844753, "balance_loss_mlp": 0.21874449, "epoch": 0.433308281978055, "flos": 23218008209280.0, "grad_norm": 14.900468760717507, "language_loss": 0.85908628, "learning_rate": 2.522934161574342e-06, "loss": 0.87608242, "num_input_tokens_seen": 154595110, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.32226562, "step": 7207, "time_per_iteration": 2.730645179748535 }, { "auxiliary_loss_clip": 0.01475425, "auxiliary_loss_mlp": 0.00276438, "balance_loss_clip": 1.19880009, "balance_loss_mlp": 0.24086568, "epoch": 0.433368405230723, "flos": 15852407166720.0, "grad_norm": 4.286138229054606, "language_loss": 0.87131894, "learning_rate": 2.5225582384198888e-06, "loss": 0.88883758, "num_input_tokens_seen": 154612255, "router_z_loss_clip": 2.76757812, "router_z_loss_mlp": 0.35571289, "step": 7208, "time_per_iteration": 2.6528282165527344 }, { "auxiliary_loss_clip": 0.01461605, "auxiliary_loss_mlp": 0.00249399, "balance_loss_clip": 1.19468713, "balance_loss_mlp": 0.21709329, "epoch": 0.433428528483391, "flos": 19026084314880.0, "grad_norm": 8.182674416468378, "language_loss": 0.80217075, "learning_rate": 2.5221822954499744e-06, "loss": 0.8192808, "num_input_tokens_seen": 154630440, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.32299805, "step": 7209, "time_per_iteration": 2.619515895843506 }, { "auxiliary_loss_clip": 0.01463935, "auxiliary_loss_mlp": 0.00261633, "balance_loss_clip": 1.19801235, "balance_loss_mlp": 0.2267528, "epoch": 0.43348865173605894, "flos": 24718248581760.0, "grad_norm": 2.957464644205263, "language_loss": 0.87027657, "learning_rate": 2.5218063326788557e-06, "loss": 0.88753223, "num_input_tokens_seen": 154652515, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.34912109, "step": 7210, "time_per_iteration": 2.7165539264678955 }, { "auxiliary_loss_clip": 0.01465441, "auxiliary_loss_mlp": 0.0024686, "balance_loss_clip": 1.20093036, "balance_loss_mlp": 0.2131478, "epoch": 0.4335487749887269, "flos": 22090664880000.0, "grad_norm": 15.842570537534623, "language_loss": 0.87691689, "learning_rate": 2.5214303501207885e-06, "loss": 0.89403993, "num_input_tokens_seen": 154670965, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.33666992, "step": 7211, "time_per_iteration": 2.632728338241577 }, { "auxiliary_loss_clip": 0.01466509, "auxiliary_loss_mlp": 0.00260437, "balance_loss_clip": 1.19606304, "balance_loss_mlp": 0.22619966, "epoch": 0.43360889824139487, "flos": 22382941847040.0, "grad_norm": 6.346530041723591, "language_loss": 0.82880175, "learning_rate": 2.521054347790029e-06, "loss": 0.84607118, "num_input_tokens_seen": 154689980, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.34228516, "step": 7212, "time_per_iteration": 2.693483352661133 }, { "auxiliary_loss_clip": 0.01470597, "auxiliary_loss_mlp": 0.00279345, "balance_loss_clip": 1.20031476, "balance_loss_mlp": 0.24506073, "epoch": 0.43366902149406283, "flos": 17528286067200.0, "grad_norm": 55.27549230758034, "language_loss": 0.81984901, "learning_rate": 2.5206783257008375e-06, "loss": 0.83734846, "num_input_tokens_seen": 154706570, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.34301758, "step": 7213, "time_per_iteration": 2.7501580715179443 }, { "auxiliary_loss_clip": 0.01473034, "auxiliary_loss_mlp": 0.00271124, "balance_loss_clip": 1.20248342, "balance_loss_mlp": 0.23641074, "epoch": 0.4337291447467308, "flos": 19022672522880.0, "grad_norm": 7.19928023727967, "language_loss": 0.69863546, "learning_rate": 2.520302283867471e-06, "loss": 0.71607709, "num_input_tokens_seen": 154725210, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.34741211, "step": 7214, "time_per_iteration": 2.7224783897399902 }, { "auxiliary_loss_clip": 0.01468273, "auxiliary_loss_mlp": 0.00261182, "balance_loss_clip": 1.20382583, "balance_loss_mlp": 0.22789858, "epoch": 0.43378926799939876, "flos": 27234042180480.0, "grad_norm": 28.011533508938165, "language_loss": 0.76946074, "learning_rate": 2.519926222304191e-06, "loss": 0.78675526, "num_input_tokens_seen": 154745945, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.33276367, "step": 7215, "time_per_iteration": 2.8568344116210938 }, { "auxiliary_loss_clip": 0.01472735, "auxiliary_loss_mlp": 0.00234974, "balance_loss_clip": 1.20533586, "balance_loss_mlp": 0.2014288, "epoch": 0.43384939125206673, "flos": 15961108700160.0, "grad_norm": 248.30100575089963, "language_loss": 0.82158458, "learning_rate": 2.519550141025255e-06, "loss": 0.83866167, "num_input_tokens_seen": 154763580, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.33544922, "step": 7216, "time_per_iteration": 2.6499781608581543 }, { "auxiliary_loss_clip": 0.01499883, "auxiliary_loss_mlp": 0.00285019, "balance_loss_clip": 1.2198478, "balance_loss_mlp": 0.24837378, "epoch": 0.4339095145047347, "flos": 21793216354560.0, "grad_norm": 20.793992988877946, "language_loss": 0.86248839, "learning_rate": 2.519174040044927e-06, "loss": 0.88033742, "num_input_tokens_seen": 154776825, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.36645508, "step": 7217, "time_per_iteration": 2.63392972946167 }, { "auxiliary_loss_clip": 0.01472926, "auxiliary_loss_mlp": 0.00249057, "balance_loss_clip": 1.19776368, "balance_loss_mlp": 0.21517786, "epoch": 0.43396963775740266, "flos": 14209853109120.0, "grad_norm": 14.574209304261995, "language_loss": 0.81362033, "learning_rate": 2.5187979193774664e-06, "loss": 0.83084011, "num_input_tokens_seen": 154794025, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.33862305, "step": 7218, "time_per_iteration": 2.6557343006134033 }, { "auxiliary_loss_clip": 0.01485908, "auxiliary_loss_mlp": 0.00281997, "balance_loss_clip": 1.20737696, "balance_loss_mlp": 0.24647222, "epoch": 0.4340297610100706, "flos": 19719052473600.0, "grad_norm": 9.556305237400233, "language_loss": 0.78871155, "learning_rate": 2.5184217790371367e-06, "loss": 0.80639064, "num_input_tokens_seen": 154813105, "router_z_loss_clip": 2.78710938, "router_z_loss_mlp": 0.35498047, "step": 7219, "time_per_iteration": 2.668442487716675 }, { "auxiliary_loss_clip": 0.01464874, "auxiliary_loss_mlp": 0.0026, "balance_loss_clip": 1.19609332, "balance_loss_mlp": 0.22380808, "epoch": 0.4340898842627386, "flos": 18953508885120.0, "grad_norm": 4.257626854209452, "language_loss": 0.8202225, "learning_rate": 2.518045619038202e-06, "loss": 0.83747119, "num_input_tokens_seen": 154833525, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.36230469, "step": 7220, "time_per_iteration": 2.6939616203308105 }, { "auxiliary_loss_clip": 0.0147538, "auxiliary_loss_mlp": 0.0025398, "balance_loss_clip": 1.20238256, "balance_loss_mlp": 0.22219884, "epoch": 0.4341500075154066, "flos": 22018304931840.0, "grad_norm": 50.7191474547803, "language_loss": 0.78486532, "learning_rate": 2.5176694393949243e-06, "loss": 0.80215895, "num_input_tokens_seen": 154853090, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.31762695, "step": 7221, "time_per_iteration": 2.690606117248535 }, { "auxiliary_loss_clip": 0.01496177, "auxiliary_loss_mlp": 0.00232631, "balance_loss_clip": 1.2193737, "balance_loss_mlp": 0.196081, "epoch": 0.4342101307680746, "flos": 23582465556480.0, "grad_norm": 34.841689521888284, "language_loss": 0.72317195, "learning_rate": 2.51729324012157e-06, "loss": 0.74046004, "num_input_tokens_seen": 154872055, "router_z_loss_clip": 2.76757812, "router_z_loss_mlp": 0.36547852, "step": 7222, "time_per_iteration": 2.6492228507995605 }, { "auxiliary_loss_clip": 0.01476984, "auxiliary_loss_mlp": 0.00265703, "balance_loss_clip": 1.20488727, "balance_loss_mlp": 0.23079875, "epoch": 0.43427025402074254, "flos": 17967976450560.0, "grad_norm": 69.84269914708423, "language_loss": 0.81120008, "learning_rate": 2.5169170212324053e-06, "loss": 0.82862687, "num_input_tokens_seen": 154886645, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.34887695, "step": 7223, "time_per_iteration": 2.6126503944396973 }, { "auxiliary_loss_clip": 0.01488453, "auxiliary_loss_mlp": 0.00281887, "balance_loss_clip": 1.21061158, "balance_loss_mlp": 0.24583842, "epoch": 0.4343303772734105, "flos": 26286395616000.0, "grad_norm": 24.675850933108137, "language_loss": 1.0111773, "learning_rate": 2.516540782741694e-06, "loss": 1.02888072, "num_input_tokens_seen": 154906775, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.36083984, "step": 7224, "time_per_iteration": 2.6850273609161377 }, { "auxiliary_loss_clip": 0.01472357, "auxiliary_loss_mlp": 0.00248247, "balance_loss_clip": 1.20032048, "balance_loss_mlp": 0.21522591, "epoch": 0.43439050052607847, "flos": 26833961520000.0, "grad_norm": 3.0448093039055943, "language_loss": 0.65397775, "learning_rate": 2.5161645246637056e-06, "loss": 0.67118382, "num_input_tokens_seen": 154926990, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.33032227, "step": 7225, "time_per_iteration": 2.6925857067108154 }, { "auxiliary_loss_clip": 0.01488514, "auxiliary_loss_mlp": 0.00293365, "balance_loss_clip": 1.21083188, "balance_loss_mlp": 0.25831753, "epoch": 0.43445062377874644, "flos": 21397660807680.0, "grad_norm": 36.17282102409196, "language_loss": 0.86051691, "learning_rate": 2.5157882470127054e-06, "loss": 0.87833571, "num_input_tokens_seen": 154946210, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.35009766, "step": 7226, "time_per_iteration": 2.6582775115966797 }, { "auxiliary_loss_clip": 0.01469929, "auxiliary_loss_mlp": 0.00236799, "balance_loss_clip": 1.20092559, "balance_loss_mlp": 0.20468396, "epoch": 0.4345107470314144, "flos": 19901945548800.0, "grad_norm": 2.606903186525605, "language_loss": 0.9113239, "learning_rate": 2.515411949802964e-06, "loss": 0.92839116, "num_input_tokens_seen": 154964995, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.32104492, "step": 7227, "time_per_iteration": 2.671858072280884 }, { "auxiliary_loss_clip": 0.01484427, "auxiliary_loss_mlp": 0.00275489, "balance_loss_clip": 1.21354377, "balance_loss_mlp": 0.24200308, "epoch": 0.43457087028408237, "flos": 26432623883520.0, "grad_norm": 1297.8232711124074, "language_loss": 0.86026657, "learning_rate": 2.5150356330487498e-06, "loss": 0.87786573, "num_input_tokens_seen": 154984775, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.33508301, "step": 7228, "time_per_iteration": 2.696002244949341 }, { "auxiliary_loss_clip": 0.01481781, "auxiliary_loss_mlp": 0.00260911, "balance_loss_clip": 1.21212244, "balance_loss_mlp": 0.22748516, "epoch": 0.43463099353675033, "flos": 31868816855040.0, "grad_norm": 156.97652655914985, "language_loss": 0.87416697, "learning_rate": 2.5146592967643324e-06, "loss": 0.89159387, "num_input_tokens_seen": 155008125, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.33422852, "step": 7229, "time_per_iteration": 2.7644529342651367 }, { "auxiliary_loss_clip": 0.01506573, "auxiliary_loss_mlp": 0.00274352, "balance_loss_clip": 1.22629845, "balance_loss_mlp": 0.23866047, "epoch": 0.4346911167894183, "flos": 24571266128640.0, "grad_norm": 3.2121100152395723, "language_loss": 0.89152098, "learning_rate": 2.5142829409639834e-06, "loss": 0.90933025, "num_input_tokens_seen": 155027885, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.35693359, "step": 7230, "time_per_iteration": 2.691286087036133 }, { "auxiliary_loss_clip": 0.01503795, "auxiliary_loss_mlp": 0.00288568, "balance_loss_clip": 1.2214179, "balance_loss_mlp": 0.25349662, "epoch": 0.43475124004208626, "flos": 17090678672640.0, "grad_norm": 25.874421708878987, "language_loss": 0.84650004, "learning_rate": 2.513906565661973e-06, "loss": 0.86442363, "num_input_tokens_seen": 155043375, "router_z_loss_clip": 2.82617188, "router_z_loss_mlp": 0.35058594, "step": 7231, "time_per_iteration": 2.5903193950653076 }, { "auxiliary_loss_clip": 0.01479272, "auxiliary_loss_mlp": 0.00274155, "balance_loss_clip": 1.20797491, "balance_loss_mlp": 0.2431131, "epoch": 0.4348113632947542, "flos": 26104615862400.0, "grad_norm": 136.3340014625959, "language_loss": 0.744156, "learning_rate": 2.513530170872575e-06, "loss": 0.76169026, "num_input_tokens_seen": 155062930, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.31005859, "step": 7232, "time_per_iteration": 2.726602792739868 }, { "auxiliary_loss_clip": 0.01503802, "auxiliary_loss_mlp": 0.00263931, "balance_loss_clip": 1.22659302, "balance_loss_mlp": 0.22764358, "epoch": 0.4348714865474222, "flos": 34200496316160.0, "grad_norm": 40.60027612009066, "language_loss": 0.77433813, "learning_rate": 2.5131537566100605e-06, "loss": 0.79201543, "num_input_tokens_seen": 155084980, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.36303711, "step": 7233, "time_per_iteration": 2.7750394344329834 }, { "auxiliary_loss_clip": 0.0147136, "auxiliary_loss_mlp": 0.00272621, "balance_loss_clip": 1.19568217, "balance_loss_mlp": 0.23776408, "epoch": 0.43493160980009016, "flos": 31537468869120.0, "grad_norm": 4.181142762232191, "language_loss": 0.80146086, "learning_rate": 2.5127773228887053e-06, "loss": 0.8189007, "num_input_tokens_seen": 155107260, "router_z_loss_clip": 2.75585938, "router_z_loss_mlp": 0.34814453, "step": 7234, "time_per_iteration": 2.740543842315674 }, { "auxiliary_loss_clip": 0.01503171, "auxiliary_loss_mlp": 0.00308792, "balance_loss_clip": 1.21785498, "balance_loss_mlp": 0.27403039, "epoch": 0.4349917330527582, "flos": 24061334699520.0, "grad_norm": 2.558584721870834, "language_loss": 0.68066752, "learning_rate": 2.512400869722782e-06, "loss": 0.69878715, "num_input_tokens_seen": 155126720, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.34765625, "step": 7235, "time_per_iteration": 4.086630582809448 }, { "auxiliary_loss_clip": 0.01498897, "auxiliary_loss_mlp": 0.0027903, "balance_loss_clip": 1.22350717, "balance_loss_mlp": 0.24660493, "epoch": 0.43505185630542614, "flos": 30519329863680.0, "grad_norm": 5.518399378105568, "language_loss": 0.8128767, "learning_rate": 2.512024397126566e-06, "loss": 0.83065605, "num_input_tokens_seen": 155148640, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.32421875, "step": 7236, "time_per_iteration": 2.708411455154419 }, { "auxiliary_loss_clip": 0.01508223, "auxiliary_loss_mlp": 0.00271105, "balance_loss_clip": 1.22948146, "balance_loss_mlp": 0.2359619, "epoch": 0.4351119795580941, "flos": 15735158196480.0, "grad_norm": 6.932034468039975, "language_loss": 0.87028337, "learning_rate": 2.5116479051143345e-06, "loss": 0.88807666, "num_input_tokens_seen": 155165870, "router_z_loss_clip": 2.78515625, "router_z_loss_mlp": 0.35180664, "step": 7237, "time_per_iteration": 4.101562738418579 }, { "auxiliary_loss_clip": 0.01505258, "auxiliary_loss_mlp": 0.00271948, "balance_loss_clip": 1.22842813, "balance_loss_mlp": 0.23926088, "epoch": 0.4351721028107621, "flos": 18731760272640.0, "grad_norm": 23.14813408420282, "language_loss": 0.70289534, "learning_rate": 2.5112713937003623e-06, "loss": 0.72066742, "num_input_tokens_seen": 155185315, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.3269043, "step": 7238, "time_per_iteration": 2.6839544773101807 }, { "auxiliary_loss_clip": 0.01518874, "auxiliary_loss_mlp": 0.00271367, "balance_loss_clip": 1.23987007, "balance_loss_mlp": 0.23720147, "epoch": 0.43523222606343004, "flos": 25226887121280.0, "grad_norm": 5.957520849855805, "language_loss": 0.89955866, "learning_rate": 2.510894862898928e-06, "loss": 0.91746104, "num_input_tokens_seen": 155205790, "router_z_loss_clip": 2.79101562, "router_z_loss_mlp": 0.34179688, "step": 7239, "time_per_iteration": 2.6861603260040283 }, { "auxiliary_loss_clip": 0.01520621, "auxiliary_loss_mlp": 0.00277219, "balance_loss_clip": 1.23740447, "balance_loss_mlp": 0.23964468, "epoch": 0.435292349316098, "flos": 22709190101760.0, "grad_norm": 7.837466372449419, "language_loss": 0.79394931, "learning_rate": 2.510518312724309e-06, "loss": 0.81192774, "num_input_tokens_seen": 155226475, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.37573242, "step": 7240, "time_per_iteration": 4.116159677505493 }, { "auxiliary_loss_clip": 0.01514598, "auxiliary_loss_mlp": 0.00294589, "balance_loss_clip": 1.22996032, "balance_loss_mlp": 0.25715679, "epoch": 0.43535247256876597, "flos": 25775889569280.0, "grad_norm": 2.6922513441178477, "language_loss": 0.88963515, "learning_rate": 2.5101417431907842e-06, "loss": 0.907727, "num_input_tokens_seen": 155247110, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.37426758, "step": 7241, "time_per_iteration": 2.682924747467041 }, { "auxiliary_loss_clip": 0.01513721, "auxiliary_loss_mlp": 0.00293031, "balance_loss_clip": 1.22815514, "balance_loss_mlp": 0.2580786, "epoch": 0.43541259582143393, "flos": 17528142412800.0, "grad_norm": 8.282638733423934, "language_loss": 0.88402253, "learning_rate": 2.5097651543126345e-06, "loss": 0.90209007, "num_input_tokens_seen": 155261335, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.34912109, "step": 7242, "time_per_iteration": 2.6380200386047363 }, { "auxiliary_loss_clip": 0.01513703, "auxiliary_loss_mlp": 0.00274172, "balance_loss_clip": 1.22941363, "balance_loss_mlp": 0.23986372, "epoch": 0.4354727190741019, "flos": 15195205975680.0, "grad_norm": 70.54150896605418, "language_loss": 0.81026554, "learning_rate": 2.509388546104138e-06, "loss": 0.82814431, "num_input_tokens_seen": 155278510, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.34301758, "step": 7243, "time_per_iteration": 2.6175479888916016 }, { "auxiliary_loss_clip": 0.01505607, "auxiliary_loss_mlp": 0.00282084, "balance_loss_clip": 1.22935665, "balance_loss_mlp": 0.24718, "epoch": 0.43553284232676986, "flos": 16649264436480.0, "grad_norm": 32.31693551133028, "language_loss": 0.88335305, "learning_rate": 2.5090119185795766e-06, "loss": 0.90122998, "num_input_tokens_seen": 155296450, "router_z_loss_clip": 2.75976562, "router_z_loss_mlp": 0.34936523, "step": 7244, "time_per_iteration": 2.609377145767212 }, { "auxiliary_loss_clip": 0.01504406, "auxiliary_loss_mlp": 0.00254358, "balance_loss_clip": 1.22746611, "balance_loss_mlp": 0.21838032, "epoch": 0.43559296557943783, "flos": 23400865370880.0, "grad_norm": 8.250903309492404, "language_loss": 0.79400063, "learning_rate": 2.508635271753234e-06, "loss": 0.81158823, "num_input_tokens_seen": 155316080, "router_z_loss_clip": 2.77148438, "router_z_loss_mlp": 0.36010742, "step": 7245, "time_per_iteration": 4.110187292098999 }, { "auxiliary_loss_clip": 0.01531994, "auxiliary_loss_mlp": 0.00260223, "balance_loss_clip": 1.24657714, "balance_loss_mlp": 0.22693995, "epoch": 0.4356530888321058, "flos": 22419067950720.0, "grad_norm": 57.313039612845586, "language_loss": 0.84871042, "learning_rate": 2.508258605639389e-06, "loss": 0.86663258, "num_input_tokens_seen": 155336765, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.33251953, "step": 7246, "time_per_iteration": 2.6899514198303223 }, { "auxiliary_loss_clip": 0.01533918, "auxiliary_loss_mlp": 0.00299713, "balance_loss_clip": 1.24891615, "balance_loss_mlp": 0.26554808, "epoch": 0.43571321208477376, "flos": 21616141282560.0, "grad_norm": 5.100915358455112, "language_loss": 0.91922832, "learning_rate": 2.5078819202523275e-06, "loss": 0.93756473, "num_input_tokens_seen": 155356440, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.34155273, "step": 7247, "time_per_iteration": 2.650034189224243 }, { "auxiliary_loss_clip": 0.01519627, "auxiliary_loss_mlp": 0.00302286, "balance_loss_clip": 1.23838544, "balance_loss_mlp": 0.26845443, "epoch": 0.4357733353374418, "flos": 23987358639360.0, "grad_norm": 6.435234475624503, "language_loss": 0.77858168, "learning_rate": 2.507505215606333e-06, "loss": 0.79680085, "num_input_tokens_seen": 155377070, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.33813477, "step": 7248, "time_per_iteration": 2.677650213241577 }, { "auxiliary_loss_clip": 0.01520872, "auxiliary_loss_mlp": 0.00263984, "balance_loss_clip": 1.24211109, "balance_loss_mlp": 0.23096293, "epoch": 0.43583345859010975, "flos": 25264737077760.0, "grad_norm": 13.010605532088857, "language_loss": 0.92192972, "learning_rate": 2.5071284917156893e-06, "loss": 0.93977833, "num_input_tokens_seen": 155398415, "router_z_loss_clip": 2.78710938, "router_z_loss_mlp": 0.33007812, "step": 7249, "time_per_iteration": 2.6895663738250732 }, { "auxiliary_loss_clip": 0.01512996, "auxiliary_loss_mlp": 0.00254698, "balance_loss_clip": 1.22958016, "balance_loss_mlp": 0.22224891, "epoch": 0.4358935818427777, "flos": 23696302734720.0, "grad_norm": 6.216413917901755, "language_loss": 0.89062822, "learning_rate": 2.506751748594683e-06, "loss": 0.90830511, "num_input_tokens_seen": 155415625, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.32446289, "step": 7250, "time_per_iteration": 2.677631378173828 }, { "auxiliary_loss_clip": 0.01546394, "auxiliary_loss_mlp": 0.00286702, "balance_loss_clip": 1.26123714, "balance_loss_mlp": 0.25325203, "epoch": 0.4359537050954457, "flos": 29532827761920.0, "grad_norm": 28.486354221003324, "language_loss": 0.90837705, "learning_rate": 2.5063749862575988e-06, "loss": 0.92670798, "num_input_tokens_seen": 155435505, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.33422852, "step": 7251, "time_per_iteration": 2.692474603652954 }, { "auxiliary_loss_clip": 0.0151345, "auxiliary_loss_mlp": 0.00279011, "balance_loss_clip": 1.23255098, "balance_loss_mlp": 0.24355796, "epoch": 0.43601382834811364, "flos": 22711273090560.0, "grad_norm": 3.241834562229884, "language_loss": 0.77468431, "learning_rate": 2.5059982047187245e-06, "loss": 0.79260886, "num_input_tokens_seen": 155455425, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.35449219, "step": 7252, "time_per_iteration": 2.664320230484009 }, { "auxiliary_loss_clip": 0.01512727, "auxiliary_loss_mlp": 0.00270705, "balance_loss_clip": 1.23766303, "balance_loss_mlp": 0.23949555, "epoch": 0.4360739516007816, "flos": 19098731571840.0, "grad_norm": 7.254449351736594, "language_loss": 0.89561605, "learning_rate": 2.505621403992348e-06, "loss": 0.91345042, "num_input_tokens_seen": 155474250, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.31225586, "step": 7253, "time_per_iteration": 2.610624313354492 }, { "auxiliary_loss_clip": 0.01535851, "auxiliary_loss_mlp": 0.00312509, "balance_loss_clip": 1.25598645, "balance_loss_mlp": 0.27812928, "epoch": 0.43613407485344957, "flos": 23404420817280.0, "grad_norm": 5.115141591930763, "language_loss": 0.76777911, "learning_rate": 2.505244584092757e-06, "loss": 0.78626263, "num_input_tokens_seen": 155494685, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.34375, "step": 7254, "time_per_iteration": 2.6885359287261963 }, { "auxiliary_loss_clip": 0.0152871, "auxiliary_loss_mlp": 0.00260231, "balance_loss_clip": 1.25181103, "balance_loss_mlp": 0.22785407, "epoch": 0.43619419810611754, "flos": 22637799820800.0, "grad_norm": 2.9097816177130134, "language_loss": 0.88666588, "learning_rate": 2.5048677450342406e-06, "loss": 0.90455532, "num_input_tokens_seen": 155513040, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 0.32348633, "step": 7255, "time_per_iteration": 2.6551365852355957 }, { "auxiliary_loss_clip": 0.01517011, "auxiliary_loss_mlp": 0.0026512, "balance_loss_clip": 1.23852384, "balance_loss_mlp": 0.23207489, "epoch": 0.4362543213587855, "flos": 20047958334720.0, "grad_norm": 18.587280079041605, "language_loss": 0.83430344, "learning_rate": 2.504490886831089e-06, "loss": 0.85212475, "num_input_tokens_seen": 155530100, "router_z_loss_clip": 2.78710938, "router_z_loss_mlp": 0.33056641, "step": 7256, "time_per_iteration": 2.652520179748535 }, { "auxiliary_loss_clip": 0.01520719, "auxiliary_loss_mlp": 0.0024724, "balance_loss_clip": 1.24568009, "balance_loss_mlp": 0.21619831, "epoch": 0.43631444461145347, "flos": 21361319222400.0, "grad_norm": 1.9664094373166843, "language_loss": 0.80770373, "learning_rate": 2.5041140094975922e-06, "loss": 0.82538337, "num_input_tokens_seen": 155549375, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.31054688, "step": 7257, "time_per_iteration": 2.6535627841949463 }, { "auxiliary_loss_clip": 0.01504386, "auxiliary_loss_mlp": 0.00271387, "balance_loss_clip": 1.22478795, "balance_loss_mlp": 0.2393198, "epoch": 0.43637456786412143, "flos": 22418529246720.0, "grad_norm": 4.489947651059835, "language_loss": 0.7957086, "learning_rate": 2.5037371130480417e-06, "loss": 0.81346631, "num_input_tokens_seen": 155569395, "router_z_loss_clip": 2.79492188, "router_z_loss_mlp": 0.32067871, "step": 7258, "time_per_iteration": 2.7075915336608887 }, { "auxiliary_loss_clip": 0.01525623, "auxiliary_loss_mlp": 0.00282058, "balance_loss_clip": 1.24497831, "balance_loss_mlp": 0.24822673, "epoch": 0.4364346911167894, "flos": 28548839612160.0, "grad_norm": 10.902797817972038, "language_loss": 0.84451306, "learning_rate": 2.5033601974967297e-06, "loss": 0.86258984, "num_input_tokens_seen": 155589090, "router_z_loss_clip": 2.81054688, "router_z_loss_mlp": 0.33837891, "step": 7259, "time_per_iteration": 2.7714943885803223 }, { "auxiliary_loss_clip": 0.01345186, "auxiliary_loss_mlp": 0.0004557, "balance_loss_clip": 1.18528497, "balance_loss_mlp": 0.03298127, "epoch": 0.43649481436945736, "flos": 62659345380480.0, "grad_norm": 0.7454392780255781, "language_loss": 0.56662267, "learning_rate": 2.5029832628579483e-06, "loss": 0.58053023, "num_input_tokens_seen": 155648660, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.12597656, "step": 7260, "time_per_iteration": 3.188816547393799 }, { "auxiliary_loss_clip": 0.01511669, "auxiliary_loss_mlp": 0.00286916, "balance_loss_clip": 1.23499584, "balance_loss_mlp": 0.25556371, "epoch": 0.4365549376221254, "flos": 30592120775040.0, "grad_norm": 14.114565907798262, "language_loss": 0.78855294, "learning_rate": 2.5026063091459907e-06, "loss": 0.80653882, "num_input_tokens_seen": 155669945, "router_z_loss_clip": 2.76757812, "router_z_loss_mlp": 0.31323242, "step": 7261, "time_per_iteration": 2.810563325881958 }, { "auxiliary_loss_clip": 0.01508482, "auxiliary_loss_mlp": 0.00283391, "balance_loss_clip": 1.23472762, "balance_loss_mlp": 0.25161031, "epoch": 0.43661506087479335, "flos": 17165875795200.0, "grad_norm": 2.8906681741860307, "language_loss": 0.77294886, "learning_rate": 2.5022293363751522e-06, "loss": 0.79086757, "num_input_tokens_seen": 155688555, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.31762695, "step": 7262, "time_per_iteration": 2.7119932174682617 }, { "auxiliary_loss_clip": 0.01492337, "auxiliary_loss_mlp": 0.00249654, "balance_loss_clip": 1.22523427, "balance_loss_mlp": 0.21870753, "epoch": 0.4366751841274613, "flos": 22047499710720.0, "grad_norm": 16.76718528442695, "language_loss": 0.83609217, "learning_rate": 2.501852344559726e-06, "loss": 0.85351205, "num_input_tokens_seen": 155705370, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.30957031, "step": 7263, "time_per_iteration": 2.6913533210754395 }, { "auxiliary_loss_clip": 0.01488983, "auxiliary_loss_mlp": 0.00284352, "balance_loss_clip": 1.2174983, "balance_loss_mlp": 0.24928066, "epoch": 0.4367353073801293, "flos": 15997306631040.0, "grad_norm": 10.782451335371709, "language_loss": 0.82616782, "learning_rate": 2.50147533371401e-06, "loss": 0.84390116, "num_input_tokens_seen": 155721890, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.35107422, "step": 7264, "time_per_iteration": 2.6102354526519775 }, { "auxiliary_loss_clip": 0.01485007, "auxiliary_loss_mlp": 0.00274405, "balance_loss_clip": 1.21253645, "balance_loss_mlp": 0.2432676, "epoch": 0.43679543063279724, "flos": 38217535868160.0, "grad_norm": 32.6996138664104, "language_loss": 0.68359804, "learning_rate": 2.501098303852298e-06, "loss": 0.70119214, "num_input_tokens_seen": 155743970, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.31115723, "step": 7265, "time_per_iteration": 2.8027150630950928 }, { "auxiliary_loss_clip": 0.0149115, "auxiliary_loss_mlp": 0.00286431, "balance_loss_clip": 1.21961212, "balance_loss_mlp": 0.25388664, "epoch": 0.4368555538854652, "flos": 15193230727680.0, "grad_norm": 5.316876045689238, "language_loss": 0.80501473, "learning_rate": 2.5007212549888884e-06, "loss": 0.8227905, "num_input_tokens_seen": 155761830, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.32519531, "step": 7266, "time_per_iteration": 2.611496925354004 }, { "auxiliary_loss_clip": 0.01490231, "auxiliary_loss_mlp": 0.00260645, "balance_loss_clip": 1.21868157, "balance_loss_mlp": 0.2295074, "epoch": 0.4369156771381332, "flos": 23069086421760.0, "grad_norm": 2.7154888398745154, "language_loss": 0.8993457, "learning_rate": 2.5003441871380794e-06, "loss": 0.91685444, "num_input_tokens_seen": 155779610, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.31176758, "step": 7267, "time_per_iteration": 2.6958420276641846 }, { "auxiliary_loss_clip": 0.0149317, "auxiliary_loss_mlp": 0.00261818, "balance_loss_clip": 1.22146869, "balance_loss_mlp": 0.2296792, "epoch": 0.43697580039080114, "flos": 23441085624960.0, "grad_norm": 21.148714826681662, "language_loss": 0.80731624, "learning_rate": 2.4999671003141674e-06, "loss": 0.82486618, "num_input_tokens_seen": 155798765, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.32104492, "step": 7268, "time_per_iteration": 2.6992156505584717 }, { "auxiliary_loss_clip": 0.01497152, "auxiliary_loss_mlp": 0.00290048, "balance_loss_clip": 1.22430682, "balance_loss_mlp": 0.25731331, "epoch": 0.4370359236434691, "flos": 18514680428160.0, "grad_norm": 6.663624550309342, "language_loss": 0.86558902, "learning_rate": 2.499589994531454e-06, "loss": 0.883461, "num_input_tokens_seen": 155817750, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.32714844, "step": 7269, "time_per_iteration": 2.618189811706543 }, { "auxiliary_loss_clip": 0.01504256, "auxiliary_loss_mlp": 0.00260528, "balance_loss_clip": 1.23202801, "balance_loss_mlp": 0.22929515, "epoch": 0.43709604689613707, "flos": 23222497409280.0, "grad_norm": 1.6587667698471402, "language_loss": 0.81733519, "learning_rate": 2.499212869804237e-06, "loss": 0.83498299, "num_input_tokens_seen": 155836490, "router_z_loss_clip": 2.72265625, "router_z_loss_mlp": 0.31201172, "step": 7270, "time_per_iteration": 2.666328191757202 }, { "auxiliary_loss_clip": 0.014988, "auxiliary_loss_mlp": 0.00284071, "balance_loss_clip": 1.2235806, "balance_loss_mlp": 0.25212246, "epoch": 0.43715617014880503, "flos": 23803711378560.0, "grad_norm": 4.546199207249762, "language_loss": 0.85591465, "learning_rate": 2.4988357261468182e-06, "loss": 0.87374341, "num_input_tokens_seen": 155856225, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.31933594, "step": 7271, "time_per_iteration": 2.6562395095825195 }, { "auxiliary_loss_clip": 0.0133265, "auxiliary_loss_mlp": 0.00031349, "balance_loss_clip": 1.17139101, "balance_loss_mlp": 0.02166962, "epoch": 0.437216293401473, "flos": 61941204766080.0, "grad_norm": 0.6784669955250194, "language_loss": 0.54440761, "learning_rate": 2.4984585635734993e-06, "loss": 0.55804753, "num_input_tokens_seen": 155916770, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.09667969, "step": 7272, "time_per_iteration": 3.214798927307129 }, { "auxiliary_loss_clip": 0.0152108, "auxiliary_loss_mlp": 0.00318094, "balance_loss_clip": 1.23633742, "balance_loss_mlp": 0.2828314, "epoch": 0.43727641665414096, "flos": 21982250655360.0, "grad_norm": 5.762135047405825, "language_loss": 0.7625035, "learning_rate": 2.498081382098581e-06, "loss": 0.78089529, "num_input_tokens_seen": 155936490, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.35253906, "step": 7273, "time_per_iteration": 2.6737070083618164 }, { "auxiliary_loss_clip": 0.01511272, "auxiliary_loss_mlp": 0.00298177, "balance_loss_clip": 1.22979748, "balance_loss_mlp": 0.2654188, "epoch": 0.437336539906809, "flos": 39530860842240.0, "grad_norm": 7.518891985093172, "language_loss": 0.8462472, "learning_rate": 2.497704181736367e-06, "loss": 0.86434174, "num_input_tokens_seen": 155957595, "router_z_loss_clip": 2.81835938, "router_z_loss_mlp": 0.32763672, "step": 7274, "time_per_iteration": 2.8244316577911377 }, { "auxiliary_loss_clip": 0.01505229, "auxiliary_loss_mlp": 0.00293676, "balance_loss_clip": 1.22582042, "balance_loss_mlp": 0.26301566, "epoch": 0.43739666315947695, "flos": 17457147181440.0, "grad_norm": 10.474375631164808, "language_loss": 0.85969657, "learning_rate": 2.49732696250116e-06, "loss": 0.87768561, "num_input_tokens_seen": 155975710, "router_z_loss_clip": 2.79492188, "router_z_loss_mlp": 0.30664062, "step": 7275, "time_per_iteration": 2.6503450870513916 }, { "auxiliary_loss_clip": 0.01517457, "auxiliary_loss_mlp": 0.00292008, "balance_loss_clip": 1.2395997, "balance_loss_mlp": 0.25877213, "epoch": 0.4374567864121449, "flos": 16358747235840.0, "grad_norm": 4.463723825872533, "language_loss": 0.88430536, "learning_rate": 2.496949724407266e-06, "loss": 0.90239996, "num_input_tokens_seen": 155993090, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.33227539, "step": 7276, "time_per_iteration": 2.6709163188934326 }, { "auxiliary_loss_clip": 0.01535981, "auxiliary_loss_mlp": 0.00327924, "balance_loss_clip": 1.2443378, "balance_loss_mlp": 0.29259008, "epoch": 0.4375169096648129, "flos": 30587523834240.0, "grad_norm": 278.13720161080033, "language_loss": 0.79978293, "learning_rate": 2.496572467468988e-06, "loss": 0.81842196, "num_input_tokens_seen": 156013685, "router_z_loss_clip": 2.91601562, "router_z_loss_mlp": 0.35351562, "step": 7277, "time_per_iteration": 2.748324155807495 }, { "auxiliary_loss_clip": 0.0152106, "auxiliary_loss_mlp": 0.00318091, "balance_loss_clip": 1.23927402, "balance_loss_mlp": 0.28378224, "epoch": 0.43757703291748085, "flos": 30555599621760.0, "grad_norm": 5.683042869170781, "language_loss": 0.80325711, "learning_rate": 2.4961951917006317e-06, "loss": 0.8216486, "num_input_tokens_seen": 156034300, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.34301758, "step": 7278, "time_per_iteration": 4.117890119552612 }, { "auxiliary_loss_clip": 0.01523763, "auxiliary_loss_mlp": 0.00303091, "balance_loss_clip": 1.2429378, "balance_loss_mlp": 0.27052286, "epoch": 0.4376371561701488, "flos": 21397373498880.0, "grad_norm": 4.5497035946452575, "language_loss": 0.73232996, "learning_rate": 2.4958178971165046e-06, "loss": 0.75059849, "num_input_tokens_seen": 156053805, "router_z_loss_clip": 2.81054688, "router_z_loss_mlp": 0.32592773, "step": 7279, "time_per_iteration": 2.691434144973755 }, { "auxiliary_loss_clip": 0.01539127, "auxiliary_loss_mlp": 0.00311321, "balance_loss_clip": 1.24872923, "balance_loss_mlp": 0.27856246, "epoch": 0.4376972794228168, "flos": 23404384903680.0, "grad_norm": 2.845805592339726, "language_loss": 0.91148233, "learning_rate": 2.4954405837309126e-06, "loss": 0.92998683, "num_input_tokens_seen": 156073295, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.32763672, "step": 7280, "time_per_iteration": 4.09071159362793 }, { "auxiliary_loss_clip": 0.01534922, "auxiliary_loss_mlp": 0.00294054, "balance_loss_clip": 1.24984848, "balance_loss_mlp": 0.26073527, "epoch": 0.43775740267548474, "flos": 22892945103360.0, "grad_norm": 10.660237855630115, "language_loss": 0.82912064, "learning_rate": 2.4950632515581653e-06, "loss": 0.84741044, "num_input_tokens_seen": 156094540, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.33337402, "step": 7281, "time_per_iteration": 2.6758289337158203 }, { "auxiliary_loss_clip": 0.01543715, "auxiliary_loss_mlp": 0.00317866, "balance_loss_clip": 1.25458765, "balance_loss_mlp": 0.28570288, "epoch": 0.4378175259281527, "flos": 23294390480640.0, "grad_norm": 241457.6099359091, "language_loss": 0.82250702, "learning_rate": 2.494685900612569e-06, "loss": 0.84112275, "num_input_tokens_seen": 156114070, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.32177734, "step": 7282, "time_per_iteration": 4.1027069091796875 }, { "auxiliary_loss_clip": 0.01546891, "auxiliary_loss_mlp": 0.00303404, "balance_loss_clip": 1.25378633, "balance_loss_mlp": 0.26888067, "epoch": 0.43787764918082067, "flos": 23876897339520.0, "grad_norm": 30.441478872590594, "language_loss": 0.90600628, "learning_rate": 2.4943085309084333e-06, "loss": 0.92450929, "num_input_tokens_seen": 156132130, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.34545898, "step": 7283, "time_per_iteration": 2.723203659057617 }, { "auxiliary_loss_clip": 0.0155018, "auxiliary_loss_mlp": 0.00302765, "balance_loss_clip": 1.25396073, "balance_loss_mlp": 0.26621589, "epoch": 0.43793777243348864, "flos": 23988148738560.0, "grad_norm": 19.147280233016474, "language_loss": 0.86293912, "learning_rate": 2.49393114246007e-06, "loss": 0.88146853, "num_input_tokens_seen": 156150820, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.36547852, "step": 7284, "time_per_iteration": 2.679199695587158 }, { "auxiliary_loss_clip": 0.01550097, "auxiliary_loss_mlp": 0.00298849, "balance_loss_clip": 1.25915706, "balance_loss_mlp": 0.26392022, "epoch": 0.4379978956861566, "flos": 18624064320000.0, "grad_norm": 21.375284320250195, "language_loss": 0.88187939, "learning_rate": 2.493553735281787e-06, "loss": 0.90036881, "num_input_tokens_seen": 156170125, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.34936523, "step": 7285, "time_per_iteration": 2.648146152496338 }, { "auxiliary_loss_clip": 0.01558836, "auxiliary_loss_mlp": 0.00295229, "balance_loss_clip": 1.26257873, "balance_loss_mlp": 0.2614688, "epoch": 0.43805801893882457, "flos": 21981388728960.0, "grad_norm": 65.45806803913442, "language_loss": 0.81937474, "learning_rate": 2.493176309387897e-06, "loss": 0.8379153, "num_input_tokens_seen": 156187320, "router_z_loss_clip": 2.9609375, "router_z_loss_mlp": 0.33740234, "step": 7286, "time_per_iteration": 2.6661813259124756 }, { "auxiliary_loss_clip": 0.01549852, "auxiliary_loss_mlp": 0.00322984, "balance_loss_clip": 1.25651002, "balance_loss_mlp": 0.28769809, "epoch": 0.43811814219149253, "flos": 26393337383040.0, "grad_norm": 7.853077936822833, "language_loss": 0.7923547, "learning_rate": 2.492798864792712e-06, "loss": 0.81108314, "num_input_tokens_seen": 156207455, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.35253906, "step": 7287, "time_per_iteration": 2.70443058013916 }, { "auxiliary_loss_clip": 0.01553847, "auxiliary_loss_mlp": 0.0032759, "balance_loss_clip": 1.26114535, "balance_loss_mlp": 0.29154089, "epoch": 0.43817826544416055, "flos": 17493309198720.0, "grad_norm": 26.653766958800315, "language_loss": 0.88644314, "learning_rate": 2.492421401510545e-06, "loss": 0.90525746, "num_input_tokens_seen": 156226560, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.3605957, "step": 7288, "time_per_iteration": 4.021993160247803 }, { "auxiliary_loss_clip": 0.01552554, "auxiliary_loss_mlp": 0.00317365, "balance_loss_clip": 1.25394797, "balance_loss_mlp": 0.28098226, "epoch": 0.4382383886968285, "flos": 21581020759680.0, "grad_norm": 32.24217088701585, "language_loss": 0.88468814, "learning_rate": 2.4920439195557093e-06, "loss": 0.90338731, "num_input_tokens_seen": 156246740, "router_z_loss_clip": 2.98828125, "router_z_loss_mlp": 0.36376953, "step": 7289, "time_per_iteration": 2.6590895652770996 }, { "auxiliary_loss_clip": 0.01566113, "auxiliary_loss_mlp": 0.00309269, "balance_loss_clip": 1.26968074, "balance_loss_mlp": 0.27543738, "epoch": 0.4382985119494965, "flos": 27923742201600.0, "grad_norm": 4.805620157790787, "language_loss": 0.83923745, "learning_rate": 2.4916664189425183e-06, "loss": 0.85799128, "num_input_tokens_seen": 156266440, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.33837891, "step": 7290, "time_per_iteration": 2.753173351287842 }, { "auxiliary_loss_clip": 0.01581593, "auxiliary_loss_mlp": 0.00301694, "balance_loss_clip": 1.28450489, "balance_loss_mlp": 0.26793444, "epoch": 0.43835863520216445, "flos": 24936836797440.0, "grad_norm": 5.75773157593334, "language_loss": 0.84362745, "learning_rate": 2.491288899685288e-06, "loss": 0.86246032, "num_input_tokens_seen": 156286900, "router_z_loss_clip": 2.97070312, "router_z_loss_mlp": 0.33764648, "step": 7291, "time_per_iteration": 2.6935582160949707 }, { "auxiliary_loss_clip": 0.01562464, "auxiliary_loss_mlp": 0.00304626, "balance_loss_clip": 1.2670449, "balance_loss_mlp": 0.26955485, "epoch": 0.4384187584548324, "flos": 33510293504640.0, "grad_norm": 4.4250744065213405, "language_loss": 0.72744036, "learning_rate": 2.4909113617983325e-06, "loss": 0.74611115, "num_input_tokens_seen": 156307690, "router_z_loss_clip": 2.95117188, "router_z_loss_mlp": 0.35107422, "step": 7292, "time_per_iteration": 2.7406883239746094 }, { "auxiliary_loss_clip": 0.01563919, "auxiliary_loss_mlp": 0.00291227, "balance_loss_clip": 1.2686286, "balance_loss_mlp": 0.25558299, "epoch": 0.4384788817075004, "flos": 23951052967680.0, "grad_norm": 4.101440134559566, "language_loss": 0.80379295, "learning_rate": 2.49053380529597e-06, "loss": 0.82234442, "num_input_tokens_seen": 156326620, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.35668945, "step": 7293, "time_per_iteration": 2.6975927352905273 }, { "auxiliary_loss_clip": 0.01562134, "auxiliary_loss_mlp": 0.00347378, "balance_loss_clip": 1.26395857, "balance_loss_mlp": 0.3070378, "epoch": 0.43853900496016834, "flos": 19098516090240.0, "grad_norm": 28.558922192532496, "language_loss": 0.86101842, "learning_rate": 2.490156230192516e-06, "loss": 0.8801136, "num_input_tokens_seen": 156345495, "router_z_loss_clip": 2.98242188, "router_z_loss_mlp": 0.40332031, "step": 7294, "time_per_iteration": 2.671438694000244 }, { "auxiliary_loss_clip": 0.01577819, "auxiliary_loss_mlp": 0.00338692, "balance_loss_clip": 1.28073621, "balance_loss_mlp": 0.30271441, "epoch": 0.4385991282128363, "flos": 13225362168960.0, "grad_norm": 15.588606604803557, "language_loss": 0.79558486, "learning_rate": 2.4897786365022883e-06, "loss": 0.81474996, "num_input_tokens_seen": 156363155, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 0.35986328, "step": 7295, "time_per_iteration": 2.618680715560913 }, { "auxiliary_loss_clip": 0.01577631, "auxiliary_loss_mlp": 0.00349534, "balance_loss_clip": 1.27516842, "balance_loss_mlp": 0.31129181, "epoch": 0.4386592514655043, "flos": 14319883445760.0, "grad_norm": 28.530577035163915, "language_loss": 0.82249624, "learning_rate": 2.4894010242396063e-06, "loss": 0.84176785, "num_input_tokens_seen": 156380940, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.3828125, "step": 7296, "time_per_iteration": 2.6362862586975098 }, { "auxiliary_loss_clip": 0.01564214, "auxiliary_loss_mlp": 0.0030522, "balance_loss_clip": 1.26978123, "balance_loss_mlp": 0.27057767, "epoch": 0.43871937471817224, "flos": 22784423137920.0, "grad_norm": 5.112930681967207, "language_loss": 0.75932246, "learning_rate": 2.4890233934187873e-06, "loss": 0.77801681, "num_input_tokens_seen": 156400415, "router_z_loss_clip": 2.9453125, "router_z_loss_mlp": 0.34643555, "step": 7297, "time_per_iteration": 2.645451784133911 }, { "auxiliary_loss_clip": 0.01566597, "auxiliary_loss_mlp": 0.0032753, "balance_loss_clip": 1.26916885, "balance_loss_mlp": 0.29133767, "epoch": 0.4387794979708402, "flos": 28072304853120.0, "grad_norm": 28.644485211810167, "language_loss": 0.75280404, "learning_rate": 2.4886457440541535e-06, "loss": 0.77174532, "num_input_tokens_seen": 156421120, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.36206055, "step": 7298, "time_per_iteration": 2.6820783615112305 }, { "auxiliary_loss_clip": 0.01556614, "auxiliary_loss_mlp": 0.00268093, "balance_loss_clip": 1.26379299, "balance_loss_mlp": 0.23409463, "epoch": 0.43883962122350817, "flos": 26249551240320.0, "grad_norm": 5.136832876619168, "language_loss": 0.7844345, "learning_rate": 2.4882680761600238e-06, "loss": 0.80268157, "num_input_tokens_seen": 156441535, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.34008789, "step": 7299, "time_per_iteration": 2.7084038257598877 }, { "auxiliary_loss_clip": 0.01551189, "auxiliary_loss_mlp": 0.00313202, "balance_loss_clip": 1.2517997, "balance_loss_mlp": 0.27512604, "epoch": 0.43889974447617613, "flos": 25883765089920.0, "grad_norm": 2.3872987763151783, "language_loss": 0.85005844, "learning_rate": 2.487890389750719e-06, "loss": 0.86870235, "num_input_tokens_seen": 156462015, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.38085938, "step": 7300, "time_per_iteration": 2.6784064769744873 }, { "auxiliary_loss_clip": 0.01549006, "auxiliary_loss_mlp": 0.00309412, "balance_loss_clip": 1.25429797, "balance_loss_mlp": 0.2763437, "epoch": 0.43895986772884416, "flos": 25046615738880.0, "grad_norm": 59.73336525927075, "language_loss": 0.78624815, "learning_rate": 2.4875126848405626e-06, "loss": 0.80483234, "num_input_tokens_seen": 156482165, "router_z_loss_clip": 2.9453125, "router_z_loss_mlp": 0.33032227, "step": 7301, "time_per_iteration": 2.696646213531494 }, { "auxiliary_loss_clip": 0.01578821, "auxiliary_loss_mlp": 0.00311924, "balance_loss_clip": 1.27942002, "balance_loss_mlp": 0.27492177, "epoch": 0.4390199909815121, "flos": 25994585525760.0, "grad_norm": 14.460226045007607, "language_loss": 0.77757812, "learning_rate": 2.4871349614438757e-06, "loss": 0.79648554, "num_input_tokens_seen": 156503170, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.37011719, "step": 7302, "time_per_iteration": 2.7109427452087402 }, { "auxiliary_loss_clip": 0.01558, "auxiliary_loss_mlp": 0.00337389, "balance_loss_clip": 1.26590192, "balance_loss_mlp": 0.30036274, "epoch": 0.4390801142341801, "flos": 29022249888000.0, "grad_norm": 9.024055898858883, "language_loss": 0.87334687, "learning_rate": 2.486757219574983e-06, "loss": 0.89230084, "num_input_tokens_seen": 156523005, "router_z_loss_clip": 2.92382812, "router_z_loss_mlp": 0.37060547, "step": 7303, "time_per_iteration": 2.7572505474090576 }, { "auxiliary_loss_clip": 0.01553499, "auxiliary_loss_mlp": 0.00349994, "balance_loss_clip": 1.24893665, "balance_loss_mlp": 0.31411192, "epoch": 0.43914023748684805, "flos": 33438544087680.0, "grad_norm": 192.69158567096125, "language_loss": 0.77474165, "learning_rate": 2.4863794592482067e-06, "loss": 0.79377663, "num_input_tokens_seen": 156544440, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.35888672, "step": 7304, "time_per_iteration": 2.760329008102417 }, { "auxiliary_loss_clip": 0.01536786, "auxiliary_loss_mlp": 0.00296921, "balance_loss_clip": 1.25187135, "balance_loss_mlp": 0.26466262, "epoch": 0.439200360739516, "flos": 34531844302080.0, "grad_norm": 3.473020966646635, "language_loss": 0.83989298, "learning_rate": 2.486001680477873e-06, "loss": 0.85822999, "num_input_tokens_seen": 156565410, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.32250977, "step": 7305, "time_per_iteration": 2.8378231525421143 }, { "auxiliary_loss_clip": 0.01546679, "auxiliary_loss_mlp": 0.00320428, "balance_loss_clip": 1.25580978, "balance_loss_mlp": 0.28609604, "epoch": 0.439260483992184, "flos": 21907843632000.0, "grad_norm": 14.687439679925065, "language_loss": 0.76640475, "learning_rate": 2.485623883278308e-06, "loss": 0.78507584, "num_input_tokens_seen": 156584210, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.34350586, "step": 7306, "time_per_iteration": 2.786677837371826 }, { "auxiliary_loss_clip": 0.01543953, "auxiliary_loss_mlp": 0.00311433, "balance_loss_clip": 1.2528801, "balance_loss_mlp": 0.27521703, "epoch": 0.43932060724485195, "flos": 20996430912000.0, "grad_norm": 18.330930905870922, "language_loss": 0.67159784, "learning_rate": 2.4852460676638344e-06, "loss": 0.69015169, "num_input_tokens_seen": 156602730, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.36230469, "step": 7307, "time_per_iteration": 2.701305389404297 }, { "auxiliary_loss_clip": 0.01551944, "auxiliary_loss_mlp": 0.00331665, "balance_loss_clip": 1.25563598, "balance_loss_mlp": 0.29759496, "epoch": 0.4393807304975199, "flos": 17747053850880.0, "grad_norm": 65.304704960893, "language_loss": 0.80823249, "learning_rate": 2.4848682336487828e-06, "loss": 0.82706869, "num_input_tokens_seen": 156619405, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.34057617, "step": 7308, "time_per_iteration": 2.709374189376831 }, { "auxiliary_loss_clip": 0.01539029, "auxiliary_loss_mlp": 0.00321215, "balance_loss_clip": 1.24081016, "balance_loss_mlp": 0.28478497, "epoch": 0.4394408537501879, "flos": 22528523669760.0, "grad_norm": 28.75117434182302, "language_loss": 0.83405286, "learning_rate": 2.4844903812474787e-06, "loss": 0.85265529, "num_input_tokens_seen": 156638165, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.36425781, "step": 7309, "time_per_iteration": 2.7496745586395264 }, { "auxiliary_loss_clip": 0.01522786, "auxiliary_loss_mlp": 0.00281543, "balance_loss_clip": 1.24145532, "balance_loss_mlp": 0.24654339, "epoch": 0.43950097700285584, "flos": 23440654661760.0, "grad_norm": 49.44882984726687, "language_loss": 0.78576541, "learning_rate": 2.484112510474251e-06, "loss": 0.80380869, "num_input_tokens_seen": 156658845, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.34985352, "step": 7310, "time_per_iteration": 2.6623010635375977 }, { "auxiliary_loss_clip": 0.01543378, "auxiliary_loss_mlp": 0.00320718, "balance_loss_clip": 1.24369621, "balance_loss_mlp": 0.28433508, "epoch": 0.4395611002555238, "flos": 23180696956800.0, "grad_norm": 5.18026141527971, "language_loss": 0.83372056, "learning_rate": 2.483734621343429e-06, "loss": 0.8523615, "num_input_tokens_seen": 156677275, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.36376953, "step": 7311, "time_per_iteration": 2.6556038856506348 }, { "auxiliary_loss_clip": 0.01556185, "auxiliary_loss_mlp": 0.00333907, "balance_loss_clip": 1.25737572, "balance_loss_mlp": 0.29728639, "epoch": 0.43962122350819177, "flos": 22127365601280.0, "grad_norm": 2.516936596764019, "language_loss": 0.8843258, "learning_rate": 2.483356713869341e-06, "loss": 0.90322667, "num_input_tokens_seen": 156695815, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.36621094, "step": 7312, "time_per_iteration": 2.655214309692383 }, { "auxiliary_loss_clip": 0.01538168, "auxiliary_loss_mlp": 0.00348241, "balance_loss_clip": 1.24677253, "balance_loss_mlp": 0.31262159, "epoch": 0.43968134676085974, "flos": 17420554200960.0, "grad_norm": 3.7601726452914948, "language_loss": 0.95431167, "learning_rate": 2.482978788066318e-06, "loss": 0.97317576, "num_input_tokens_seen": 156714385, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.35595703, "step": 7313, "time_per_iteration": 2.6299421787261963 }, { "auxiliary_loss_clip": 0.01534808, "auxiliary_loss_mlp": 0.00293514, "balance_loss_clip": 1.2412889, "balance_loss_mlp": 0.25765589, "epoch": 0.43974147001352776, "flos": 18952646958720.0, "grad_norm": 3.776489252415717, "language_loss": 0.73890805, "learning_rate": 2.4826008439486904e-06, "loss": 0.75719124, "num_input_tokens_seen": 156732615, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.35864258, "step": 7314, "time_per_iteration": 2.6484973430633545 }, { "auxiliary_loss_clip": 0.01544181, "auxiliary_loss_mlp": 0.00345807, "balance_loss_clip": 1.24486959, "balance_loss_mlp": 0.30730239, "epoch": 0.4398015932661957, "flos": 18953508885120.0, "grad_norm": 10.997981563041034, "language_loss": 0.81936389, "learning_rate": 2.4822228815307915e-06, "loss": 0.83826375, "num_input_tokens_seen": 156750920, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.38525391, "step": 7315, "time_per_iteration": 2.6383233070373535 }, { "auxiliary_loss_clip": 0.01522672, "auxiliary_loss_mlp": 0.0034849, "balance_loss_clip": 1.2355212, "balance_loss_mlp": 0.31382418, "epoch": 0.4398617165188637, "flos": 24199913370240.0, "grad_norm": 34.9719195240347, "language_loss": 0.80322981, "learning_rate": 2.4818449008269523e-06, "loss": 0.82194144, "num_input_tokens_seen": 156768520, "router_z_loss_clip": 2.87695312, "router_z_loss_mlp": 0.34643555, "step": 7316, "time_per_iteration": 2.695347309112549 }, { "auxiliary_loss_clip": 0.01549352, "auxiliary_loss_mlp": 0.00338582, "balance_loss_clip": 1.25641739, "balance_loss_mlp": 0.301126, "epoch": 0.43992183977153165, "flos": 22236677665920.0, "grad_norm": 134.60177124067337, "language_loss": 0.72891074, "learning_rate": 2.481466901851506e-06, "loss": 0.7477901, "num_input_tokens_seen": 156788700, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.37475586, "step": 7317, "time_per_iteration": 2.6342244148254395 }, { "auxiliary_loss_clip": 0.01542764, "auxiliary_loss_mlp": 0.00348068, "balance_loss_clip": 1.24902129, "balance_loss_mlp": 0.31108928, "epoch": 0.4399819630241996, "flos": 18697465762560.0, "grad_norm": 16.49235410700712, "language_loss": 0.86800778, "learning_rate": 2.4810888846187865e-06, "loss": 0.8869161, "num_input_tokens_seen": 156806470, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.36987305, "step": 7318, "time_per_iteration": 2.609905481338501 }, { "auxiliary_loss_clip": 0.01546172, "auxiliary_loss_mlp": 0.0032894, "balance_loss_clip": 1.24929321, "balance_loss_mlp": 0.29312921, "epoch": 0.4400420862768676, "flos": 23879375377920.0, "grad_norm": 2.5147463973278827, "language_loss": 0.85490572, "learning_rate": 2.4807108491431283e-06, "loss": 0.87365675, "num_input_tokens_seen": 156825895, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.3581543, "step": 7319, "time_per_iteration": 2.7004446983337402 }, { "auxiliary_loss_clip": 0.01540377, "auxiliary_loss_mlp": 0.0034473, "balance_loss_clip": 1.25013888, "balance_loss_mlp": 0.30818063, "epoch": 0.44010220952953555, "flos": 28037615293440.0, "grad_norm": 2.0879463915305436, "language_loss": 0.85607237, "learning_rate": 2.4803327954388667e-06, "loss": 0.87492347, "num_input_tokens_seen": 156845990, "router_z_loss_clip": 2.90234375, "router_z_loss_mlp": 0.36547852, "step": 7320, "time_per_iteration": 4.1041319370269775 }, { "auxiliary_loss_clip": 0.0154807, "auxiliary_loss_mlp": 0.0033763, "balance_loss_clip": 1.25382936, "balance_loss_mlp": 0.30172423, "epoch": 0.4401623327822035, "flos": 23768985905280.0, "grad_norm": 2.291638169327599, "language_loss": 0.77274644, "learning_rate": 2.4799547235203376e-06, "loss": 0.79160345, "num_input_tokens_seen": 156866685, "router_z_loss_clip": 2.93945312, "router_z_loss_mlp": 0.35888672, "step": 7321, "time_per_iteration": 2.6922073364257812 }, { "auxiliary_loss_clip": 0.01437022, "auxiliary_loss_mlp": 0.00080958, "balance_loss_clip": 1.27714455, "balance_loss_mlp": 0.07270916, "epoch": 0.4402224560348715, "flos": 70774583264640.0, "grad_norm": 0.8699970720707796, "language_loss": 0.56529969, "learning_rate": 2.4795766334018763e-06, "loss": 0.5804795, "num_input_tokens_seen": 156923450, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.08251953, "step": 7322, "time_per_iteration": 4.636983633041382 }, { "auxiliary_loss_clip": 0.0154145, "auxiliary_loss_mlp": 0.00334003, "balance_loss_clip": 1.25238466, "balance_loss_mlp": 0.29950401, "epoch": 0.44028257928753944, "flos": 22891795868160.0, "grad_norm": 138.98245912683544, "language_loss": 0.81155896, "learning_rate": 2.479198525097822e-06, "loss": 0.83031344, "num_input_tokens_seen": 156944795, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.34545898, "step": 7323, "time_per_iteration": 2.6831393241882324 }, { "auxiliary_loss_clip": 0.01538763, "auxiliary_loss_mlp": 0.00357122, "balance_loss_clip": 1.24758422, "balance_loss_mlp": 0.32081044, "epoch": 0.4403427025402074, "flos": 17895760156800.0, "grad_norm": 6.437398626839328, "language_loss": 0.86548162, "learning_rate": 2.478820398622511e-06, "loss": 0.88444042, "num_input_tokens_seen": 156962755, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.36328125, "step": 7324, "time_per_iteration": 4.074963569641113 }, { "auxiliary_loss_clip": 0.01419153, "auxiliary_loss_mlp": 0.00087511, "balance_loss_clip": 1.26271987, "balance_loss_mlp": 0.07964273, "epoch": 0.4404028257928754, "flos": 69562525708800.0, "grad_norm": 0.6565235410273872, "language_loss": 0.54317701, "learning_rate": 2.478442253990283e-06, "loss": 0.55824363, "num_input_tokens_seen": 157028095, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.07861328, "step": 7325, "time_per_iteration": 3.1279892921447754 }, { "auxiliary_loss_clip": 0.01544707, "auxiliary_loss_mlp": 0.00353763, "balance_loss_clip": 1.25531101, "balance_loss_mlp": 0.31907317, "epoch": 0.44046294904554334, "flos": 20923675914240.0, "grad_norm": 55.035535785201496, "language_loss": 0.75939983, "learning_rate": 2.4780640912154766e-06, "loss": 0.77838457, "num_input_tokens_seen": 157048365, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.34692383, "step": 7326, "time_per_iteration": 2.7075376510620117 }, { "auxiliary_loss_clip": 0.01538203, "auxiliary_loss_mlp": 0.00343779, "balance_loss_clip": 1.25200713, "balance_loss_mlp": 0.3090651, "epoch": 0.44052307229821136, "flos": 23623475909760.0, "grad_norm": 64.46216369620049, "language_loss": 0.81886971, "learning_rate": 2.477685910312432e-06, "loss": 0.83768952, "num_input_tokens_seen": 157069130, "router_z_loss_clip": 2.86328125, "router_z_loss_mlp": 0.34741211, "step": 7327, "time_per_iteration": 2.7055118083953857 }, { "auxiliary_loss_clip": 0.0152341, "auxiliary_loss_mlp": 0.00393152, "balance_loss_clip": 1.2396512, "balance_loss_mlp": 0.35593477, "epoch": 0.4405831955508793, "flos": 17597665186560.0, "grad_norm": 17.09153034629179, "language_loss": 0.88521338, "learning_rate": 2.4773077112954897e-06, "loss": 0.90437895, "num_input_tokens_seen": 157084940, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.37182617, "step": 7328, "time_per_iteration": 2.5965161323547363 }, { "auxiliary_loss_clip": 0.01519469, "auxiliary_loss_mlp": 0.00341063, "balance_loss_clip": 1.2391398, "balance_loss_mlp": 0.30806598, "epoch": 0.4406433188035473, "flos": 21463376739840.0, "grad_norm": 9.698399129101826, "language_loss": 0.8562246, "learning_rate": 2.4769294941789908e-06, "loss": 0.87482995, "num_input_tokens_seen": 157102770, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.32983398, "step": 7329, "time_per_iteration": 2.6797726154327393 }, { "auxiliary_loss_clip": 0.01530292, "auxiliary_loss_mlp": 0.00338837, "balance_loss_clip": 1.23951864, "balance_loss_mlp": 0.30510056, "epoch": 0.44070344205621526, "flos": 22673566788480.0, "grad_norm": 62.40137517806673, "language_loss": 0.79472202, "learning_rate": 2.476551258977278e-06, "loss": 0.81341332, "num_input_tokens_seen": 157122035, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.33740234, "step": 7330, "time_per_iteration": 4.043255567550659 }, { "auxiliary_loss_clip": 0.01553263, "auxiliary_loss_mlp": 0.00386712, "balance_loss_clip": 1.26395559, "balance_loss_mlp": 0.35121137, "epoch": 0.4407635653088832, "flos": 23441193365760.0, "grad_norm": 1.7445270537585265, "language_loss": 0.80211246, "learning_rate": 2.4761730057046936e-06, "loss": 0.82151222, "num_input_tokens_seen": 157142800, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.35473633, "step": 7331, "time_per_iteration": 2.6802334785461426 }, { "auxiliary_loss_clip": 0.01536332, "auxiliary_loss_mlp": 0.00401415, "balance_loss_clip": 1.24977326, "balance_loss_mlp": 0.36670136, "epoch": 0.4408236885615512, "flos": 24021294013440.0, "grad_norm": 516.245605651051, "language_loss": 0.79580885, "learning_rate": 2.475794734375581e-06, "loss": 0.81518626, "num_input_tokens_seen": 157163295, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.34692383, "step": 7332, "time_per_iteration": 2.695683002471924 }, { "auxiliary_loss_clip": 0.01526361, "auxiliary_loss_mlp": 0.00365931, "balance_loss_clip": 1.23728776, "balance_loss_mlp": 0.33066869, "epoch": 0.44088381181421915, "flos": 12676826597760.0, "grad_norm": 2.4686148674785464, "language_loss": 0.80965769, "learning_rate": 2.475416445004285e-06, "loss": 0.82858062, "num_input_tokens_seen": 157180890, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.3527832, "step": 7333, "time_per_iteration": 2.6496803760528564 }, { "auxiliary_loss_clip": 0.01530287, "auxiliary_loss_mlp": 0.00365599, "balance_loss_clip": 1.25465608, "balance_loss_mlp": 0.33233982, "epoch": 0.4409439350668871, "flos": 24569865498240.0, "grad_norm": 10.929291481449408, "language_loss": 0.84965599, "learning_rate": 2.4750381376051493e-06, "loss": 0.86861479, "num_input_tokens_seen": 157200580, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.33251953, "step": 7334, "time_per_iteration": 2.71000337600708 }, { "auxiliary_loss_clip": 0.01551054, "auxiliary_loss_mlp": 0.00402663, "balance_loss_clip": 1.25259805, "balance_loss_mlp": 0.3646825, "epoch": 0.4410040583195551, "flos": 22668574798080.0, "grad_norm": 4.2696264953422265, "language_loss": 0.83417726, "learning_rate": 2.47465981219252e-06, "loss": 0.85371447, "num_input_tokens_seen": 157218345, "router_z_loss_clip": 2.98242188, "router_z_loss_mlp": 0.37963867, "step": 7335, "time_per_iteration": 2.6681854724884033 }, { "auxiliary_loss_clip": 0.01526602, "auxiliary_loss_mlp": 0.00359668, "balance_loss_clip": 1.2367146, "balance_loss_mlp": 0.32280898, "epoch": 0.44106418157222305, "flos": 10852528700160.0, "grad_norm": 48.90201854758476, "language_loss": 0.79071516, "learning_rate": 2.4742814687807423e-06, "loss": 0.80957788, "num_input_tokens_seen": 157234395, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.3684082, "step": 7336, "time_per_iteration": 2.5969319343566895 }, { "auxiliary_loss_clip": 0.01533157, "auxiliary_loss_mlp": 0.00407182, "balance_loss_clip": 1.23979831, "balance_loss_mlp": 0.36619779, "epoch": 0.441124304824891, "flos": 21726710323200.0, "grad_norm": 16.112672596976136, "language_loss": 0.71734911, "learning_rate": 2.473903107384165e-06, "loss": 0.73675251, "num_input_tokens_seen": 157254805, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.40991211, "step": 7337, "time_per_iteration": 2.6690475940704346 }, { "auxiliary_loss_clip": 0.01411648, "auxiliary_loss_mlp": 0.00068199, "balance_loss_clip": 1.23981726, "balance_loss_mlp": 0.06047434, "epoch": 0.441184428077559, "flos": 63220486625280.0, "grad_norm": 0.7346676939699418, "language_loss": 0.52580285, "learning_rate": 2.473524728017134e-06, "loss": 0.54060125, "num_input_tokens_seen": 157317870, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.07714844, "step": 7338, "time_per_iteration": 3.2402782440185547 }, { "auxiliary_loss_clip": 0.01527824, "auxiliary_loss_mlp": 0.00439321, "balance_loss_clip": 1.23418951, "balance_loss_mlp": 0.39774066, "epoch": 0.44124455133022694, "flos": 21177959270400.0, "grad_norm": 7.972563300668592, "language_loss": 0.78128791, "learning_rate": 2.473146330693997e-06, "loss": 0.80095935, "num_input_tokens_seen": 157336505, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.41552734, "step": 7339, "time_per_iteration": 2.629051685333252 }, { "auxiliary_loss_clip": 0.01528178, "auxiliary_loss_mlp": 0.00359734, "balance_loss_clip": 1.25029492, "balance_loss_mlp": 0.32728487, "epoch": 0.4413046745828949, "flos": 17457865453440.0, "grad_norm": 4.3224456975866135, "language_loss": 0.75865269, "learning_rate": 2.472767915429105e-06, "loss": 0.77753174, "num_input_tokens_seen": 157354995, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.32470703, "step": 7340, "time_per_iteration": 2.6086714267730713 }, { "auxiliary_loss_clip": 0.0142935, "auxiliary_loss_mlp": 0.00071986, "balance_loss_clip": 1.24921811, "balance_loss_mlp": 0.06469015, "epoch": 0.4413647978355629, "flos": 61586153804160.0, "grad_norm": 0.8994122847059501, "language_loss": 0.64050949, "learning_rate": 2.4723894822368054e-06, "loss": 0.65552282, "num_input_tokens_seen": 157404260, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.07275391, "step": 7341, "time_per_iteration": 2.9027256965637207 }, { "auxiliary_loss_clip": 0.01533075, "auxiliary_loss_mlp": 0.00402685, "balance_loss_clip": 1.24456191, "balance_loss_mlp": 0.3653723, "epoch": 0.4414249210882309, "flos": 27527001505920.0, "grad_norm": 11.92352291447366, "language_loss": 0.80343819, "learning_rate": 2.47201103113145e-06, "loss": 0.82279581, "num_input_tokens_seen": 157423045, "router_z_loss_clip": 2.88476562, "router_z_loss_mlp": 0.37329102, "step": 7342, "time_per_iteration": 2.6988160610198975 }, { "auxiliary_loss_clip": 0.01512971, "auxiliary_loss_mlp": 0.00380435, "balance_loss_clip": 1.23045039, "balance_loss_mlp": 0.34431446, "epoch": 0.44148504434089886, "flos": 23513984277120.0, "grad_norm": 161.36027056968615, "language_loss": 0.86088336, "learning_rate": 2.4716325621273886e-06, "loss": 0.87981743, "num_input_tokens_seen": 157441815, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.36108398, "step": 7343, "time_per_iteration": 2.683457136154175 }, { "auxiliary_loss_clip": 0.01505003, "auxiliary_loss_mlp": 0.00375361, "balance_loss_clip": 1.22456598, "balance_loss_mlp": 0.34071916, "epoch": 0.4415451675935668, "flos": 21580589796480.0, "grad_norm": 40.03970946119173, "language_loss": 0.82178903, "learning_rate": 2.4712540752389725e-06, "loss": 0.84059268, "num_input_tokens_seen": 157460470, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.34594727, "step": 7344, "time_per_iteration": 2.652653217315674 }, { "auxiliary_loss_clip": 0.01375238, "auxiliary_loss_mlp": 0.00067613, "balance_loss_clip": 1.21316767, "balance_loss_mlp": 0.05998376, "epoch": 0.4416052908462348, "flos": 59006368126080.0, "grad_norm": 0.8153351678240132, "language_loss": 0.63365436, "learning_rate": 2.470875570480556e-06, "loss": 0.64808297, "num_input_tokens_seen": 157512655, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.07617188, "step": 7345, "time_per_iteration": 2.88912296295166 }, { "auxiliary_loss_clip": 0.01509689, "auxiliary_loss_mlp": 0.00360786, "balance_loss_clip": 1.22632587, "balance_loss_mlp": 0.32831299, "epoch": 0.44166541409890275, "flos": 26357642242560.0, "grad_norm": 9.923220876445336, "language_loss": 0.90665513, "learning_rate": 2.470497047866489e-06, "loss": 0.92535985, "num_input_tokens_seen": 157533700, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.32446289, "step": 7346, "time_per_iteration": 2.6994500160217285 }, { "auxiliary_loss_clip": 0.01505441, "auxiliary_loss_mlp": 0.00343893, "balance_loss_clip": 1.22021806, "balance_loss_mlp": 0.30886891, "epoch": 0.4417255373515707, "flos": 20192678231040.0, "grad_norm": 442.18172346590893, "language_loss": 0.86228466, "learning_rate": 2.470118507411128e-06, "loss": 0.88077796, "num_input_tokens_seen": 157551105, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.35009766, "step": 7347, "time_per_iteration": 2.6699962615966797 }, { "auxiliary_loss_clip": 0.01502968, "auxiliary_loss_mlp": 0.00375922, "balance_loss_clip": 1.2237674, "balance_loss_mlp": 0.34130365, "epoch": 0.4417856606042387, "flos": 17887895078400.0, "grad_norm": 15.243434358332195, "language_loss": 0.89793408, "learning_rate": 2.4697399491288263e-06, "loss": 0.91672301, "num_input_tokens_seen": 157568285, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.34619141, "step": 7348, "time_per_iteration": 2.7058019638061523 }, { "auxiliary_loss_clip": 0.01514252, "auxiliary_loss_mlp": 0.0036732, "balance_loss_clip": 1.23036826, "balance_loss_mlp": 0.33079404, "epoch": 0.44184578385690665, "flos": 27964034282880.0, "grad_norm": 6.458077647624142, "language_loss": 0.7716949, "learning_rate": 2.469361373033938e-06, "loss": 0.79051059, "num_input_tokens_seen": 157590405, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.36547852, "step": 7349, "time_per_iteration": 2.733731508255005 }, { "auxiliary_loss_clip": 0.01502865, "auxiliary_loss_mlp": 0.00367298, "balance_loss_clip": 1.2182219, "balance_loss_mlp": 0.33113015, "epoch": 0.4419059071095746, "flos": 23367899664000.0, "grad_norm": 111.23803766849683, "language_loss": 0.8183924, "learning_rate": 2.468982779140819e-06, "loss": 0.83709395, "num_input_tokens_seen": 157607420, "router_z_loss_clip": 2.84570312, "router_z_loss_mlp": 0.36206055, "step": 7350, "time_per_iteration": 2.658177137374878 }, { "auxiliary_loss_clip": 0.01502889, "auxiliary_loss_mlp": 0.00373219, "balance_loss_clip": 1.21902871, "balance_loss_mlp": 0.33807606, "epoch": 0.4419660303622426, "flos": 15012169246080.0, "grad_norm": 8.672458713026966, "language_loss": 0.8970421, "learning_rate": 2.468604167463827e-06, "loss": 0.91580319, "num_input_tokens_seen": 157624990, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.3515625, "step": 7351, "time_per_iteration": 2.659651517868042 }, { "auxiliary_loss_clip": 0.01490363, "auxiliary_loss_mlp": 0.00340327, "balance_loss_clip": 1.21915388, "balance_loss_mlp": 0.30823612, "epoch": 0.44202615361491054, "flos": 25371750672000.0, "grad_norm": 9.620445212722936, "language_loss": 0.78070331, "learning_rate": 2.4682255380173176e-06, "loss": 0.79901028, "num_input_tokens_seen": 157645300, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.32080078, "step": 7352, "time_per_iteration": 2.681013584136963 }, { "auxiliary_loss_clip": 0.01524818, "auxiliary_loss_mlp": 0.00370489, "balance_loss_clip": 1.24181533, "balance_loss_mlp": 0.33520317, "epoch": 0.4420862768675785, "flos": 24681116897280.0, "grad_norm": 150.26796168536202, "language_loss": 0.92828298, "learning_rate": 2.467846890815649e-06, "loss": 0.94723606, "num_input_tokens_seen": 157664060, "router_z_loss_clip": 2.83007812, "router_z_loss_mlp": 0.35253906, "step": 7353, "time_per_iteration": 2.7944436073303223 }, { "auxiliary_loss_clip": 0.01493554, "auxiliary_loss_mlp": 0.00352704, "balance_loss_clip": 1.21494699, "balance_loss_mlp": 0.3179189, "epoch": 0.44214640012024653, "flos": 19528437974400.0, "grad_norm": 3.9125179937377697, "language_loss": 0.82754374, "learning_rate": 2.4674682258731795e-06, "loss": 0.84600627, "num_input_tokens_seen": 157680905, "router_z_loss_clip": 2.78710938, "router_z_loss_mlp": 0.34765625, "step": 7354, "time_per_iteration": 2.7497243881225586 }, { "auxiliary_loss_clip": 0.01501181, "auxiliary_loss_mlp": 0.00328136, "balance_loss_clip": 1.22831643, "balance_loss_mlp": 0.29594964, "epoch": 0.4422065233729145, "flos": 47557434003840.0, "grad_norm": 4.804159145189643, "language_loss": 0.71200371, "learning_rate": 2.467089543204268e-06, "loss": 0.73029685, "num_input_tokens_seen": 157701980, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.32202148, "step": 7355, "time_per_iteration": 2.9092941284179688 }, { "auxiliary_loss_clip": 0.01499564, "auxiliary_loss_mlp": 0.00348828, "balance_loss_clip": 1.21140432, "balance_loss_mlp": 0.31177801, "epoch": 0.44226664662558246, "flos": 19281050029440.0, "grad_norm": 34.1465890863076, "language_loss": 0.86182636, "learning_rate": 2.466710842823274e-06, "loss": 0.8803103, "num_input_tokens_seen": 157720555, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.37084961, "step": 7356, "time_per_iteration": 2.6166698932647705 }, { "auxiliary_loss_clip": 0.01508047, "auxiliary_loss_mlp": 0.00379722, "balance_loss_clip": 1.22579157, "balance_loss_mlp": 0.34207559, "epoch": 0.4423267698782504, "flos": 17821820010240.0, "grad_norm": 3.172619049800396, "language_loss": 0.85102379, "learning_rate": 2.4663321247445577e-06, "loss": 0.86990148, "num_input_tokens_seen": 157739160, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.37670898, "step": 7357, "time_per_iteration": 2.607311248779297 }, { "auxiliary_loss_clip": 0.01509354, "auxiliary_loss_mlp": 0.00333683, "balance_loss_clip": 1.23131323, "balance_loss_mlp": 0.29854006, "epoch": 0.4423868931309184, "flos": 29204424691200.0, "grad_norm": 25.891093085903112, "language_loss": 0.79032338, "learning_rate": 2.465953388982481e-06, "loss": 0.80875373, "num_input_tokens_seen": 157760020, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.35107422, "step": 7358, "time_per_iteration": 2.708587884902954 }, { "auxiliary_loss_clip": 0.0150873, "auxiliary_loss_mlp": 0.00342297, "balance_loss_clip": 1.22904205, "balance_loss_mlp": 0.3077026, "epoch": 0.44244701638358636, "flos": 29713135057920.0, "grad_norm": 2.5364229223633092, "language_loss": 0.82089138, "learning_rate": 2.465574635551405e-06, "loss": 0.83940166, "num_input_tokens_seen": 157780435, "router_z_loss_clip": 2.79492188, "router_z_loss_mlp": 0.34619141, "step": 7359, "time_per_iteration": 2.7162482738494873 }, { "auxiliary_loss_clip": 0.01500945, "auxiliary_loss_mlp": 0.00318356, "balance_loss_clip": 1.22462523, "balance_loss_mlp": 0.28485852, "epoch": 0.4425071396362543, "flos": 22930040874240.0, "grad_norm": 24.69154530114858, "language_loss": 0.76055861, "learning_rate": 2.4651958644656923e-06, "loss": 0.77875167, "num_input_tokens_seen": 157799420, "router_z_loss_clip": 2.75976562, "router_z_loss_mlp": 0.33496094, "step": 7360, "time_per_iteration": 2.662841558456421 }, { "auxiliary_loss_clip": 0.01492665, "auxiliary_loss_mlp": 0.00335542, "balance_loss_clip": 1.2135874, "balance_loss_mlp": 0.30130535, "epoch": 0.4425672628889223, "flos": 19792346175360.0, "grad_norm": 89.01171471225564, "language_loss": 0.76572299, "learning_rate": 2.4648170757397053e-06, "loss": 0.78400499, "num_input_tokens_seen": 157817025, "router_z_loss_clip": 2.79101562, "router_z_loss_mlp": 0.3425293, "step": 7361, "time_per_iteration": 2.6488630771636963 }, { "auxiliary_loss_clip": 0.01510994, "auxiliary_loss_mlp": 0.0034053, "balance_loss_clip": 1.22738039, "balance_loss_mlp": 0.30438614, "epoch": 0.44262738614159025, "flos": 13662215377920.0, "grad_norm": 45.68039025799055, "language_loss": 0.9239856, "learning_rate": 2.464438269387809e-06, "loss": 0.94250083, "num_input_tokens_seen": 157834345, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.36157227, "step": 7362, "time_per_iteration": 4.074936628341675 }, { "auxiliary_loss_clip": 0.01514538, "auxiliary_loss_mlp": 0.0036916, "balance_loss_clip": 1.22385216, "balance_loss_mlp": 0.33072704, "epoch": 0.4426875093942582, "flos": 14210212245120.0, "grad_norm": 65.72490082506302, "language_loss": 0.827806, "learning_rate": 2.464059445424366e-06, "loss": 0.84664303, "num_input_tokens_seen": 157852290, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.38476562, "step": 7363, "time_per_iteration": 2.6360108852386475 }, { "auxiliary_loss_clip": 0.01318671, "auxiliary_loss_mlp": 0.000587, "balance_loss_clip": 1.15640724, "balance_loss_mlp": 0.05178603, "epoch": 0.4427476326469262, "flos": 70117525728000.0, "grad_norm": 0.7090862645362096, "language_loss": 0.55368328, "learning_rate": 2.463680603863743e-06, "loss": 0.56745696, "num_input_tokens_seen": 157923060, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.06933594, "step": 7364, "time_per_iteration": 3.2610929012298584 }, { "auxiliary_loss_clip": 0.01494995, "auxiliary_loss_mlp": 0.00303709, "balance_loss_clip": 1.21985149, "balance_loss_mlp": 0.27171323, "epoch": 0.44280775589959415, "flos": 25445080287360.0, "grad_norm": 9.595097590887256, "language_loss": 0.8025229, "learning_rate": 2.463301744720305e-06, "loss": 0.82050991, "num_input_tokens_seen": 157944110, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.31982422, "step": 7365, "time_per_iteration": 4.156264066696167 }, { "auxiliary_loss_clip": 0.0149048, "auxiliary_loss_mlp": 0.00316386, "balance_loss_clip": 1.21844745, "balance_loss_mlp": 0.28281641, "epoch": 0.4428678791522621, "flos": 22857214049280.0, "grad_norm": 18.96025807532063, "language_loss": 0.8075214, "learning_rate": 2.4629228680084184e-06, "loss": 0.82559007, "num_input_tokens_seen": 157964295, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.33569336, "step": 7366, "time_per_iteration": 2.671660900115967 }, { "auxiliary_loss_clip": 0.01491695, "auxiliary_loss_mlp": 0.00322703, "balance_loss_clip": 1.21698737, "balance_loss_mlp": 0.28865695, "epoch": 0.44292800240493013, "flos": 25812446636160.0, "grad_norm": 38.48023088836553, "language_loss": 0.81476223, "learning_rate": 2.46254397374245e-06, "loss": 0.83290625, "num_input_tokens_seen": 157983970, "router_z_loss_clip": 2.74609375, "router_z_loss_mlp": 0.34057617, "step": 7367, "time_per_iteration": 4.116817474365234 }, { "auxiliary_loss_clip": 0.01489024, "auxiliary_loss_mlp": 0.00330533, "balance_loss_clip": 1.21298957, "balance_loss_mlp": 0.29565281, "epoch": 0.4429881256575981, "flos": 32416885549440.0, "grad_norm": 12.751806792996721, "language_loss": 0.79056692, "learning_rate": 2.4621650619367677e-06, "loss": 0.80876243, "num_input_tokens_seen": 158006515, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.34863281, "step": 7368, "time_per_iteration": 2.771739959716797 }, { "auxiliary_loss_clip": 0.01492146, "auxiliary_loss_mlp": 0.00337218, "balance_loss_clip": 1.21599019, "balance_loss_mlp": 0.30300462, "epoch": 0.44304824891026606, "flos": 22163707186560.0, "grad_norm": 3.650854378386452, "language_loss": 0.85836583, "learning_rate": 2.4617861326057403e-06, "loss": 0.87665945, "num_input_tokens_seen": 158025565, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.34228516, "step": 7369, "time_per_iteration": 2.7243847846984863 }, { "auxiliary_loss_clip": 0.01479017, "auxiliary_loss_mlp": 0.00317219, "balance_loss_clip": 1.20832276, "balance_loss_mlp": 0.28355405, "epoch": 0.443108372162934, "flos": 25338569483520.0, "grad_norm": 3.8420618089989302, "language_loss": 0.80180967, "learning_rate": 2.461407185763737e-06, "loss": 0.81977201, "num_input_tokens_seen": 158045620, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.33666992, "step": 7370, "time_per_iteration": 2.6775269508361816 }, { "auxiliary_loss_clip": 0.01476482, "auxiliary_loss_mlp": 0.00296891, "balance_loss_clip": 1.20603943, "balance_loss_mlp": 0.26322651, "epoch": 0.443168495415602, "flos": 23330947547520.0, "grad_norm": 119.09855689354859, "language_loss": 0.77017558, "learning_rate": 2.461028221425126e-06, "loss": 0.78790927, "num_input_tokens_seen": 158063505, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.33691406, "step": 7371, "time_per_iteration": 2.948523998260498 }, { "auxiliary_loss_clip": 0.01486214, "auxiliary_loss_mlp": 0.00305856, "balance_loss_clip": 1.21381164, "balance_loss_mlp": 0.27245343, "epoch": 0.44322861866826996, "flos": 21871502046720.0, "grad_norm": 92.36824571186473, "language_loss": 0.76118982, "learning_rate": 2.4606492396042786e-06, "loss": 0.77911055, "num_input_tokens_seen": 158080335, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.33398438, "step": 7372, "time_per_iteration": 4.055301189422607 }, { "auxiliary_loss_clip": 0.01497385, "auxiliary_loss_mlp": 0.00347631, "balance_loss_clip": 1.21368384, "balance_loss_mlp": 0.30943596, "epoch": 0.4432887419209379, "flos": 20084407660800.0, "grad_norm": 69.1974292053158, "language_loss": 0.89591253, "learning_rate": 2.4602702403155664e-06, "loss": 0.91436267, "num_input_tokens_seen": 158098955, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.38232422, "step": 7373, "time_per_iteration": 2.640072822570801 }, { "auxiliary_loss_clip": 0.01326175, "auxiliary_loss_mlp": 0.00040732, "balance_loss_clip": 1.15817428, "balance_loss_mlp": 0.03224437, "epoch": 0.4433488651736059, "flos": 70035540935040.0, "grad_norm": 0.7562299881503296, "language_loss": 0.5503521, "learning_rate": 2.4598912235733604e-06, "loss": 0.56402111, "num_input_tokens_seen": 158164110, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.08496094, "step": 7374, "time_per_iteration": 3.1987414360046387 }, { "auxiliary_loss_clip": 0.01470297, "auxiliary_loss_mlp": 0.00322289, "balance_loss_clip": 1.19920063, "balance_loss_mlp": 0.28788489, "epoch": 0.44340898842627385, "flos": 16282472705280.0, "grad_norm": 15.352091416458023, "language_loss": 0.90782213, "learning_rate": 2.4595121893920327e-06, "loss": 0.92574799, "num_input_tokens_seen": 158179850, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.34399414, "step": 7375, "time_per_iteration": 2.691990613937378 }, { "auxiliary_loss_clip": 0.01499474, "auxiliary_loss_mlp": 0.00330804, "balance_loss_clip": 1.21986115, "balance_loss_mlp": 0.29492235, "epoch": 0.4434691116789418, "flos": 16611989097600.0, "grad_norm": 8.286118871483028, "language_loss": 0.88943923, "learning_rate": 2.4591331377859578e-06, "loss": 0.90774202, "num_input_tokens_seen": 158196590, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.35888672, "step": 7376, "time_per_iteration": 2.666522741317749 }, { "auxiliary_loss_clip": 0.0148802, "auxiliary_loss_mlp": 0.00321001, "balance_loss_clip": 1.21283746, "balance_loss_mlp": 0.28640643, "epoch": 0.4435292349316098, "flos": 19063251912960.0, "grad_norm": 13.80232097569648, "language_loss": 0.84935772, "learning_rate": 2.4587540687695077e-06, "loss": 0.86744797, "num_input_tokens_seen": 158216355, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.34570312, "step": 7377, "time_per_iteration": 2.651468515396118 }, { "auxiliary_loss_clip": 0.0148892, "auxiliary_loss_mlp": 0.00268492, "balance_loss_clip": 1.22175419, "balance_loss_mlp": 0.23554239, "epoch": 0.44358935818427775, "flos": 21251324799360.0, "grad_norm": 20.54575649723188, "language_loss": 0.83723557, "learning_rate": 2.458374982357057e-06, "loss": 0.85480964, "num_input_tokens_seen": 158235825, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.32958984, "step": 7378, "time_per_iteration": 2.6757638454437256 }, { "auxiliary_loss_clip": 0.01488314, "auxiliary_loss_mlp": 0.00317525, "balance_loss_clip": 1.21110821, "balance_loss_mlp": 0.28254941, "epoch": 0.4436494814369457, "flos": 12495298239360.0, "grad_norm": 59.64743616829395, "language_loss": 0.78745347, "learning_rate": 2.457995878562982e-06, "loss": 0.80551189, "num_input_tokens_seen": 158254230, "router_z_loss_clip": 2.77539062, "router_z_loss_mlp": 0.34960938, "step": 7379, "time_per_iteration": 2.6910157203674316 }, { "auxiliary_loss_clip": 0.01507539, "auxiliary_loss_mlp": 0.00298936, "balance_loss_clip": 1.23374844, "balance_loss_mlp": 0.26338822, "epoch": 0.44370960468961373, "flos": 23659853408640.0, "grad_norm": 7.368121162923069, "language_loss": 0.80060875, "learning_rate": 2.457616757401656e-06, "loss": 0.81867349, "num_input_tokens_seen": 158273400, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.35546875, "step": 7380, "time_per_iteration": 2.68861722946167 }, { "auxiliary_loss_clip": 0.01500587, "auxiliary_loss_mlp": 0.00302631, "balance_loss_clip": 1.22417188, "balance_loss_mlp": 0.26779824, "epoch": 0.4437697279422817, "flos": 32416849635840.0, "grad_norm": 39.158366468956, "language_loss": 0.70083582, "learning_rate": 2.457237618887458e-06, "loss": 0.71886808, "num_input_tokens_seen": 158296840, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.34838867, "step": 7381, "time_per_iteration": 2.7239747047424316 }, { "auxiliary_loss_clip": 0.01528094, "auxiliary_loss_mlp": 0.00332867, "balance_loss_clip": 1.25050306, "balance_loss_mlp": 0.29796267, "epoch": 0.44382985119494966, "flos": 18112875914880.0, "grad_norm": 4.939166385335575, "language_loss": 0.87970662, "learning_rate": 2.456858463034763e-06, "loss": 0.8983162, "num_input_tokens_seen": 158314935, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.34887695, "step": 7382, "time_per_iteration": 2.638709545135498 }, { "auxiliary_loss_clip": 0.01514627, "auxiliary_loss_mlp": 0.00326213, "balance_loss_clip": 1.2362349, "balance_loss_mlp": 0.2898308, "epoch": 0.44388997444761763, "flos": 30774151923840.0, "grad_norm": 145.57163664713198, "language_loss": 0.72082424, "learning_rate": 2.456479289857949e-06, "loss": 0.73923266, "num_input_tokens_seen": 158334620, "router_z_loss_clip": 2.78320312, "router_z_loss_mlp": 0.36401367, "step": 7383, "time_per_iteration": 2.747565746307373 }, { "auxiliary_loss_clip": 0.01507398, "auxiliary_loss_mlp": 0.00341351, "balance_loss_clip": 1.22645998, "balance_loss_mlp": 0.30689931, "epoch": 0.4439500977002856, "flos": 20339157893760.0, "grad_norm": 8.413238925963759, "language_loss": 0.85080254, "learning_rate": 2.4561000993713953e-06, "loss": 0.86929005, "num_input_tokens_seen": 158350550, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.34448242, "step": 7384, "time_per_iteration": 2.657088041305542 }, { "auxiliary_loss_clip": 0.01506824, "auxiliary_loss_mlp": 0.00350098, "balance_loss_clip": 1.22325039, "balance_loss_mlp": 0.31273752, "epoch": 0.44401022095295356, "flos": 20371225760640.0, "grad_norm": 4.069574597396487, "language_loss": 0.86839998, "learning_rate": 2.4557208915894796e-06, "loss": 0.88696921, "num_input_tokens_seen": 158369555, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.37353516, "step": 7385, "time_per_iteration": 2.7047667503356934 }, { "auxiliary_loss_clip": 0.01504405, "auxiliary_loss_mlp": 0.00312655, "balance_loss_clip": 1.22924984, "balance_loss_mlp": 0.27410299, "epoch": 0.4440703442056215, "flos": 20230635928320.0, "grad_norm": 7.131566844388839, "language_loss": 0.8773967, "learning_rate": 2.455341666526582e-06, "loss": 0.8955673, "num_input_tokens_seen": 158388045, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.38574219, "step": 7386, "time_per_iteration": 2.647714614868164 }, { "auxiliary_loss_clip": 0.01528697, "auxiliary_loss_mlp": 0.00361292, "balance_loss_clip": 1.23668694, "balance_loss_mlp": 0.32357419, "epoch": 0.4441304674582895, "flos": 39494698824960.0, "grad_norm": 198.9616305560583, "language_loss": 0.77455866, "learning_rate": 2.4549624241970832e-06, "loss": 0.79345864, "num_input_tokens_seen": 158410115, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.37719727, "step": 7387, "time_per_iteration": 2.8029348850250244 }, { "auxiliary_loss_clip": 0.0150889, "auxiliary_loss_mlp": 0.00334017, "balance_loss_clip": 1.23124552, "balance_loss_mlp": 0.29801536, "epoch": 0.44419059071095746, "flos": 14829671220480.0, "grad_norm": 4.966003801294005, "language_loss": 0.77591938, "learning_rate": 2.4545831646153628e-06, "loss": 0.79434848, "num_input_tokens_seen": 158427765, "router_z_loss_clip": 2.77539062, "router_z_loss_mlp": 0.35961914, "step": 7388, "time_per_iteration": 2.6688618659973145 }, { "auxiliary_loss_clip": 0.01501353, "auxiliary_loss_mlp": 0.00327151, "balance_loss_clip": 1.22067046, "balance_loss_mlp": 0.29031533, "epoch": 0.4442507139636254, "flos": 22637835734400.0, "grad_norm": 3.565729918194898, "language_loss": 0.76574779, "learning_rate": 2.4542038877958044e-06, "loss": 0.78403282, "num_input_tokens_seen": 158446375, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.3684082, "step": 7389, "time_per_iteration": 2.673527717590332 }, { "auxiliary_loss_clip": 0.01513838, "auxiliary_loss_mlp": 0.00311363, "balance_loss_clip": 1.23733866, "balance_loss_mlp": 0.2766971, "epoch": 0.4443108372162934, "flos": 38290721829120.0, "grad_norm": 4.35627762966236, "language_loss": 0.82130247, "learning_rate": 2.453824593752788e-06, "loss": 0.83955443, "num_input_tokens_seen": 158467260, "router_z_loss_clip": 2.76757812, "router_z_loss_mlp": 0.34667969, "step": 7390, "time_per_iteration": 2.811337471008301 }, { "auxiliary_loss_clip": 0.01510639, "auxiliary_loss_mlp": 0.00292695, "balance_loss_clip": 1.23486364, "balance_loss_mlp": 0.2584815, "epoch": 0.44437096046896135, "flos": 17748993185280.0, "grad_norm": 26.370569215225114, "language_loss": 0.89795029, "learning_rate": 2.4534452825006988e-06, "loss": 0.91598356, "num_input_tokens_seen": 158486720, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.34228516, "step": 7391, "time_per_iteration": 2.7378737926483154 }, { "auxiliary_loss_clip": 0.01510541, "auxiliary_loss_mlp": 0.00298344, "balance_loss_clip": 1.2380724, "balance_loss_mlp": 0.26472709, "epoch": 0.4444310837216293, "flos": 13732348682880.0, "grad_norm": 12.316481854077338, "language_loss": 0.80677849, "learning_rate": 2.4530659540539185e-06, "loss": 0.82486737, "num_input_tokens_seen": 158502530, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.3359375, "step": 7392, "time_per_iteration": 2.628227472305298 }, { "auxiliary_loss_clip": 0.01498352, "auxiliary_loss_mlp": 0.00277137, "balance_loss_clip": 1.22894454, "balance_loss_mlp": 0.245094, "epoch": 0.44449120697429734, "flos": 25010238240000.0, "grad_norm": 17.470729747963926, "language_loss": 0.84760112, "learning_rate": 2.4526866084268313e-06, "loss": 0.86535603, "num_input_tokens_seen": 158522715, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.3203125, "step": 7393, "time_per_iteration": 2.752321481704712 }, { "auxiliary_loss_clip": 0.01520651, "auxiliary_loss_mlp": 0.00342303, "balance_loss_clip": 1.24248588, "balance_loss_mlp": 0.30446592, "epoch": 0.4445513302269653, "flos": 32671707609600.0, "grad_norm": 15.426973792955925, "language_loss": 0.8767969, "learning_rate": 2.4523072456338226e-06, "loss": 0.89542645, "num_input_tokens_seen": 158543615, "router_z_loss_clip": 2.78515625, "router_z_loss_mlp": 0.37817383, "step": 7394, "time_per_iteration": 2.7762956619262695 }, { "auxiliary_loss_clip": 0.01503101, "auxiliary_loss_mlp": 0.0028533, "balance_loss_clip": 1.22847271, "balance_loss_mlp": 0.25109342, "epoch": 0.44461145347963327, "flos": 11655814504320.0, "grad_norm": 2.3983766870555203, "language_loss": 0.88372791, "learning_rate": 2.4519278656892785e-06, "loss": 0.90161222, "num_input_tokens_seen": 158560330, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.34228516, "step": 7395, "time_per_iteration": 2.6172335147857666 }, { "auxiliary_loss_clip": 0.01508623, "auxiliary_loss_mlp": 0.00305501, "balance_loss_clip": 1.2343421, "balance_loss_mlp": 0.26976186, "epoch": 0.44467157673230123, "flos": 20886759711360.0, "grad_norm": 4.643988596067947, "language_loss": 0.78440225, "learning_rate": 2.451548468607584e-06, "loss": 0.80254352, "num_input_tokens_seen": 158579735, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.35742188, "step": 7396, "time_per_iteration": 2.6516571044921875 }, { "auxiliary_loss_clip": 0.01511281, "auxiliary_loss_mlp": 0.00330208, "balance_loss_clip": 1.23558497, "balance_loss_mlp": 0.29291981, "epoch": 0.4447316999849692, "flos": 18546137763840.0, "grad_norm": 5.157042301305465, "language_loss": 0.87367821, "learning_rate": 2.451169054403126e-06, "loss": 0.89209312, "num_input_tokens_seen": 158597075, "router_z_loss_clip": 2.75585938, "router_z_loss_mlp": 0.37304688, "step": 7397, "time_per_iteration": 2.6834349632263184 }, { "auxiliary_loss_clip": 0.01520881, "auxiliary_loss_mlp": 0.00310523, "balance_loss_clip": 1.248703, "balance_loss_mlp": 0.27538007, "epoch": 0.44479182323763716, "flos": 23769057732480.0, "grad_norm": 15.378007665757304, "language_loss": 0.70271599, "learning_rate": 2.450789623090293e-06, "loss": 0.72103, "num_input_tokens_seen": 158616650, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.3515625, "step": 7398, "time_per_iteration": 2.7403244972229004 }, { "auxiliary_loss_clip": 0.01510861, "auxiliary_loss_mlp": 0.0031469, "balance_loss_clip": 1.23742545, "balance_loss_mlp": 0.28011954, "epoch": 0.44485194649030513, "flos": 16543831040640.0, "grad_norm": 2.383289388184021, "language_loss": 0.77566087, "learning_rate": 2.450410174683472e-06, "loss": 0.79391646, "num_input_tokens_seen": 158634515, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.34594727, "step": 7399, "time_per_iteration": 2.8116514682769775 }, { "auxiliary_loss_clip": 0.01488286, "auxiliary_loss_mlp": 0.00290113, "balance_loss_clip": 1.2194314, "balance_loss_mlp": 0.25599501, "epoch": 0.4449120697429731, "flos": 22600955445120.0, "grad_norm": 39.10449124341703, "language_loss": 0.78327364, "learning_rate": 2.4500307091970514e-06, "loss": 0.80105758, "num_input_tokens_seen": 158653760, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.34106445, "step": 7400, "time_per_iteration": 2.726386547088623 }, { "auxiliary_loss_clip": 0.01497152, "auxiliary_loss_mlp": 0.00289243, "balance_loss_clip": 1.22755849, "balance_loss_mlp": 0.25488681, "epoch": 0.44497219299564106, "flos": 20004864992640.0, "grad_norm": 2.5466734175810553, "language_loss": 0.91669905, "learning_rate": 2.449651226645422e-06, "loss": 0.93456298, "num_input_tokens_seen": 158672190, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.34350586, "step": 7401, "time_per_iteration": 2.677379608154297 }, { "auxiliary_loss_clip": 0.01499935, "auxiliary_loss_mlp": 0.00289602, "balance_loss_clip": 1.22988105, "balance_loss_mlp": 0.25474498, "epoch": 0.445032316248309, "flos": 25594253470080.0, "grad_norm": 25.57362821901347, "language_loss": 0.87796652, "learning_rate": 2.449271727042973e-06, "loss": 0.89586186, "num_input_tokens_seen": 158694115, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.34863281, "step": 7402, "time_per_iteration": 2.865997076034546 }, { "auxiliary_loss_clip": 0.01514107, "auxiliary_loss_mlp": 0.00290931, "balance_loss_clip": 1.23669362, "balance_loss_mlp": 0.25817251, "epoch": 0.445092439500977, "flos": 21250426959360.0, "grad_norm": 10.174356754244922, "language_loss": 0.83075035, "learning_rate": 2.4488922104040947e-06, "loss": 0.84880066, "num_input_tokens_seen": 158711000, "router_z_loss_clip": 2.77148438, "router_z_loss_mlp": 0.32788086, "step": 7403, "time_per_iteration": 2.6538796424865723 }, { "auxiliary_loss_clip": 0.01350537, "auxiliary_loss_mlp": 0.00029565, "balance_loss_clip": 1.19559813, "balance_loss_mlp": 0.02203143, "epoch": 0.44515256275364495, "flos": 57764900309760.0, "grad_norm": 0.7711229171873077, "language_loss": 0.59594095, "learning_rate": 2.4485126767431793e-06, "loss": 0.60974193, "num_input_tokens_seen": 158769675, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.07519531, "step": 7404, "time_per_iteration": 4.52139139175415 }, { "auxiliary_loss_clip": 0.01522804, "auxiliary_loss_mlp": 0.00336375, "balance_loss_clip": 1.23962986, "balance_loss_mlp": 0.29796591, "epoch": 0.4452126860063129, "flos": 15596004908160.0, "grad_norm": 20.017628006201114, "language_loss": 0.88069642, "learning_rate": 2.4481331260746177e-06, "loss": 0.89928824, "num_input_tokens_seen": 158788215, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.38427734, "step": 7405, "time_per_iteration": 2.713597297668457 }, { "auxiliary_loss_clip": 0.01500241, "auxiliary_loss_mlp": 0.0031142, "balance_loss_clip": 1.2289257, "balance_loss_mlp": 0.27625346, "epoch": 0.4452728092589809, "flos": 21617398258560.0, "grad_norm": 1.8867227193685747, "language_loss": 0.81147265, "learning_rate": 2.4477535584128036e-06, "loss": 0.82958925, "num_input_tokens_seen": 158809090, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.3515625, "step": 7406, "time_per_iteration": 2.7076852321624756 }, { "auxiliary_loss_clip": 0.01494115, "auxiliary_loss_mlp": 0.00279263, "balance_loss_clip": 1.22892833, "balance_loss_mlp": 0.2476006, "epoch": 0.4453329325116489, "flos": 29497491757440.0, "grad_norm": 46.767230273914, "language_loss": 0.7193138, "learning_rate": 2.447373973772129e-06, "loss": 0.73704761, "num_input_tokens_seen": 158828320, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.31665039, "step": 7407, "time_per_iteration": 4.203291893005371 }, { "auxiliary_loss_clip": 0.01509004, "auxiliary_loss_mlp": 0.00308427, "balance_loss_clip": 1.23525667, "balance_loss_mlp": 0.27333191, "epoch": 0.44539305576431687, "flos": 21361139654400.0, "grad_norm": 2.272648608768517, "language_loss": 0.7383877, "learning_rate": 2.4469943721669887e-06, "loss": 0.75656199, "num_input_tokens_seen": 158847040, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.35107422, "step": 7408, "time_per_iteration": 2.666595935821533 }, { "auxiliary_loss_clip": 0.01487257, "auxiliary_loss_mlp": 0.00307466, "balance_loss_clip": 1.21389294, "balance_loss_mlp": 0.27389702, "epoch": 0.44545317901698483, "flos": 41427626428800.0, "grad_norm": 151.53298610785535, "language_loss": 0.76463622, "learning_rate": 2.4466147536117776e-06, "loss": 0.78258348, "num_input_tokens_seen": 158870490, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.33569336, "step": 7409, "time_per_iteration": 4.267614364624023 }, { "auxiliary_loss_clip": 0.01488998, "auxiliary_loss_mlp": 0.00324289, "balance_loss_clip": 1.21500993, "balance_loss_mlp": 0.2879782, "epoch": 0.4455133022696528, "flos": 22055005653120.0, "grad_norm": 6.256165030796798, "language_loss": 0.72950041, "learning_rate": 2.4462351181208895e-06, "loss": 0.74763334, "num_input_tokens_seen": 158889920, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.36303711, "step": 7410, "time_per_iteration": 2.7130978107452393 }, { "auxiliary_loss_clip": 0.01502566, "auxiliary_loss_mlp": 0.00308787, "balance_loss_clip": 1.22576189, "balance_loss_mlp": 0.27345306, "epoch": 0.44557342552232077, "flos": 23476960333440.0, "grad_norm": 10.020503099638624, "language_loss": 0.82790095, "learning_rate": 2.4458554657087217e-06, "loss": 0.8460145, "num_input_tokens_seen": 158909580, "router_z_loss_clip": 2.76757812, "router_z_loss_mlp": 0.35327148, "step": 7411, "time_per_iteration": 2.6906332969665527 }, { "auxiliary_loss_clip": 0.01494753, "auxiliary_loss_mlp": 0.00271125, "balance_loss_clip": 1.23452771, "balance_loss_mlp": 0.24008319, "epoch": 0.44563354877498873, "flos": 19134678107520.0, "grad_norm": 7.9365424996563805, "language_loss": 0.83879495, "learning_rate": 2.4454757963896695e-06, "loss": 0.85645372, "num_input_tokens_seen": 158924600, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.31054688, "step": 7412, "time_per_iteration": 2.774928092956543 }, { "auxiliary_loss_clip": 0.01493047, "auxiliary_loss_mlp": 0.00309967, "balance_loss_clip": 1.21845067, "balance_loss_mlp": 0.2764934, "epoch": 0.4456936720276567, "flos": 13621420506240.0, "grad_norm": 8.954484164143432, "language_loss": 0.87865651, "learning_rate": 2.4450961101781304e-06, "loss": 0.89668667, "num_input_tokens_seen": 158939345, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.33496094, "step": 7413, "time_per_iteration": 2.6444928646087646 }, { "auxiliary_loss_clip": 0.01482935, "auxiliary_loss_mlp": 0.00294409, "balance_loss_clip": 1.21605968, "balance_loss_mlp": 0.26207942, "epoch": 0.44575379528032466, "flos": 14713715139840.0, "grad_norm": 3.4000939208585796, "language_loss": 0.82686448, "learning_rate": 2.4447164070885026e-06, "loss": 0.84463787, "num_input_tokens_seen": 158955855, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.32299805, "step": 7414, "time_per_iteration": 2.7273757457733154 }, { "auxiliary_loss_clip": 0.01486806, "auxiliary_loss_mlp": 0.00309577, "balance_loss_clip": 1.21533298, "balance_loss_mlp": 0.27410069, "epoch": 0.4458139185329926, "flos": 24170682677760.0, "grad_norm": 24.388500680742577, "language_loss": 0.90118062, "learning_rate": 2.4443366871351837e-06, "loss": 0.91914445, "num_input_tokens_seen": 158976315, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.35473633, "step": 7415, "time_per_iteration": 4.043541669845581 }, { "auxiliary_loss_clip": 0.01471152, "auxiliary_loss_mlp": 0.00322041, "balance_loss_clip": 1.20430565, "balance_loss_mlp": 0.28789955, "epoch": 0.4458740417856606, "flos": 21762225895680.0, "grad_norm": 1424.2173673358109, "language_loss": 0.90007806, "learning_rate": 2.4439569503325732e-06, "loss": 0.91801, "num_input_tokens_seen": 158996725, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.34155273, "step": 7416, "time_per_iteration": 2.6401963233947754 }, { "auxiliary_loss_clip": 0.01486639, "auxiliary_loss_mlp": 0.00297241, "balance_loss_clip": 1.21649837, "balance_loss_mlp": 0.2631467, "epoch": 0.44593416503832856, "flos": 21068790860160.0, "grad_norm": 9.741006924283475, "language_loss": 0.86838007, "learning_rate": 2.4435771966950706e-06, "loss": 0.88621891, "num_input_tokens_seen": 159017255, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.34106445, "step": 7417, "time_per_iteration": 2.67580509185791 }, { "auxiliary_loss_clip": 0.01478979, "auxiliary_loss_mlp": 0.00300563, "balance_loss_clip": 1.21126294, "balance_loss_mlp": 0.26570579, "epoch": 0.4459942882909965, "flos": 22600488568320.0, "grad_norm": 489.7816659726153, "language_loss": 0.88014054, "learning_rate": 2.443197426237077e-06, "loss": 0.89793599, "num_input_tokens_seen": 159035010, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.34863281, "step": 7418, "time_per_iteration": 2.6474437713623047 }, { "auxiliary_loss_clip": 0.01463523, "auxiliary_loss_mlp": 0.00281815, "balance_loss_clip": 1.19676268, "balance_loss_mlp": 0.24877053, "epoch": 0.4460544115436645, "flos": 26505486622080.0, "grad_norm": 24.57223788297306, "language_loss": 0.83319336, "learning_rate": 2.442817638972991e-06, "loss": 0.85064679, "num_input_tokens_seen": 159055345, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.33056641, "step": 7419, "time_per_iteration": 2.6933679580688477 }, { "auxiliary_loss_clip": 0.01476646, "auxiliary_loss_mlp": 0.00291642, "balance_loss_clip": 1.21332252, "balance_loss_mlp": 0.25890678, "epoch": 0.4461145347963325, "flos": 17604021893760.0, "grad_norm": 1016.6495534940369, "language_loss": 0.79712129, "learning_rate": 2.4424378349172176e-06, "loss": 0.8148042, "num_input_tokens_seen": 159074225, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.32714844, "step": 7420, "time_per_iteration": 2.656338691711426 }, { "auxiliary_loss_clip": 0.01464258, "auxiliary_loss_mlp": 0.00292172, "balance_loss_clip": 1.20570576, "balance_loss_mlp": 0.25993797, "epoch": 0.44617465804900047, "flos": 27268193036160.0, "grad_norm": 97.22186273074672, "language_loss": 0.80467856, "learning_rate": 2.442058014084156e-06, "loss": 0.82224286, "num_input_tokens_seen": 159095415, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.32250977, "step": 7421, "time_per_iteration": 2.7221546173095703 }, { "auxiliary_loss_clip": 0.0146213, "auxiliary_loss_mlp": 0.00289005, "balance_loss_clip": 1.20213056, "balance_loss_mlp": 0.25514966, "epoch": 0.44623478130166844, "flos": 17786412178560.0, "grad_norm": 7.559844451373558, "language_loss": 0.82265633, "learning_rate": 2.44167817648821e-06, "loss": 0.8401677, "num_input_tokens_seen": 159114615, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.33862305, "step": 7422, "time_per_iteration": 2.740950107574463 }, { "auxiliary_loss_clip": 0.01473291, "auxiliary_loss_mlp": 0.00336373, "balance_loss_clip": 1.21033967, "balance_loss_mlp": 0.30192155, "epoch": 0.4462949045543364, "flos": 23003011353600.0, "grad_norm": 13.341818276430095, "language_loss": 0.71352625, "learning_rate": 2.441298322143784e-06, "loss": 0.73162293, "num_input_tokens_seen": 159134370, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.34423828, "step": 7423, "time_per_iteration": 2.679807424545288 }, { "auxiliary_loss_clip": 0.01477897, "auxiliary_loss_mlp": 0.00283502, "balance_loss_clip": 1.21842194, "balance_loss_mlp": 0.251196, "epoch": 0.44635502780700437, "flos": 17820096157440.0, "grad_norm": 10.45270643034853, "language_loss": 0.86103964, "learning_rate": 2.4409184510652807e-06, "loss": 0.87865365, "num_input_tokens_seen": 159152540, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.32324219, "step": 7424, "time_per_iteration": 2.6748740673065186 }, { "auxiliary_loss_clip": 0.01469351, "auxiliary_loss_mlp": 0.00264821, "balance_loss_clip": 1.21004069, "balance_loss_mlp": 0.23225351, "epoch": 0.44641515105967233, "flos": 26688020561280.0, "grad_norm": 44.161158694753276, "language_loss": 0.8496142, "learning_rate": 2.4405385632671063e-06, "loss": 0.86695588, "num_input_tokens_seen": 159173425, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.32543945, "step": 7425, "time_per_iteration": 2.6980209350585938 }, { "auxiliary_loss_clip": 0.01482623, "auxiliary_loss_mlp": 0.00300455, "balance_loss_clip": 1.22046971, "balance_loss_mlp": 0.26762491, "epoch": 0.4464752743123403, "flos": 18913324544640.0, "grad_norm": 6.124757451645669, "language_loss": 0.83841813, "learning_rate": 2.4401586587636655e-06, "loss": 0.85624892, "num_input_tokens_seen": 159191210, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.328125, "step": 7426, "time_per_iteration": 2.6671652793884277 }, { "auxiliary_loss_clip": 0.01467181, "auxiliary_loss_mlp": 0.00301377, "balance_loss_clip": 1.20143282, "balance_loss_mlp": 0.26718742, "epoch": 0.44653539756500826, "flos": 29570318582400.0, "grad_norm": 4.986109324702479, "language_loss": 0.72466481, "learning_rate": 2.4397787375693634e-06, "loss": 0.7423504, "num_input_tokens_seen": 159211755, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.34228516, "step": 7427, "time_per_iteration": 2.7839462757110596 }, { "auxiliary_loss_clip": 0.01485677, "auxiliary_loss_mlp": 0.00280261, "balance_loss_clip": 1.21926713, "balance_loss_mlp": 0.2485995, "epoch": 0.44659552081767623, "flos": 21468979261440.0, "grad_norm": 15.569917997483254, "language_loss": 0.82604766, "learning_rate": 2.439398799698608e-06, "loss": 0.84370697, "num_input_tokens_seen": 159230315, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.31689453, "step": 7428, "time_per_iteration": 2.6447269916534424 }, { "auxiliary_loss_clip": 0.01470636, "auxiliary_loss_mlp": 0.00275336, "balance_loss_clip": 1.20658839, "balance_loss_mlp": 0.2432927, "epoch": 0.4466556440703442, "flos": 17931886260480.0, "grad_norm": 6.122190114463814, "language_loss": 0.84697551, "learning_rate": 2.439018845165806e-06, "loss": 0.8644352, "num_input_tokens_seen": 159249810, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.32055664, "step": 7429, "time_per_iteration": 2.647575855255127 }, { "auxiliary_loss_clip": 0.01482255, "auxiliary_loss_mlp": 0.00298919, "balance_loss_clip": 1.21587789, "balance_loss_mlp": 0.26656556, "epoch": 0.44671576732301216, "flos": 21107430915840.0, "grad_norm": 88.82640156945236, "language_loss": 0.96352601, "learning_rate": 2.438638873985366e-06, "loss": 0.98133773, "num_input_tokens_seen": 159271715, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.32348633, "step": 7430, "time_per_iteration": 2.6819207668304443 }, { "auxiliary_loss_clip": 0.01480554, "auxiliary_loss_mlp": 0.00304873, "balance_loss_clip": 1.2056936, "balance_loss_mlp": 0.27003992, "epoch": 0.4467758905756801, "flos": 23508920459520.0, "grad_norm": 16.725276519696877, "language_loss": 0.86794865, "learning_rate": 2.4382588861716954e-06, "loss": 0.88580292, "num_input_tokens_seen": 159290690, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.34814453, "step": 7431, "time_per_iteration": 2.7003397941589355 }, { "auxiliary_loss_clip": 0.01470638, "auxiliary_loss_mlp": 0.00320046, "balance_loss_clip": 1.20309615, "balance_loss_mlp": 0.28671461, "epoch": 0.4468360138283481, "flos": 18734022829440.0, "grad_norm": 38.08134803682482, "language_loss": 0.8815254, "learning_rate": 2.437878881739204e-06, "loss": 0.89943218, "num_input_tokens_seen": 159309400, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.33325195, "step": 7432, "time_per_iteration": 2.6457695960998535 }, { "auxiliary_loss_clip": 0.01478511, "auxiliary_loss_mlp": 0.00338146, "balance_loss_clip": 1.20631719, "balance_loss_mlp": 0.30402794, "epoch": 0.4468961370810161, "flos": 23477139901440.0, "grad_norm": 72.9796394552312, "language_loss": 0.83264697, "learning_rate": 2.437498860702301e-06, "loss": 0.85081351, "num_input_tokens_seen": 159327425, "router_z_loss_clip": 2.72265625, "router_z_loss_mlp": 0.34106445, "step": 7433, "time_per_iteration": 2.685288429260254 }, { "auxiliary_loss_clip": 0.01454655, "auxiliary_loss_mlp": 0.00305147, "balance_loss_clip": 1.19464946, "balance_loss_mlp": 0.27361631, "epoch": 0.4469562603336841, "flos": 30075042539520.0, "grad_norm": 19.36407299560044, "language_loss": 0.81925935, "learning_rate": 2.437118823075398e-06, "loss": 0.83685738, "num_input_tokens_seen": 159345805, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.31530762, "step": 7434, "time_per_iteration": 2.7955877780914307 }, { "auxiliary_loss_clip": 0.01486899, "auxiliary_loss_mlp": 0.00307303, "balance_loss_clip": 1.2166779, "balance_loss_mlp": 0.27535483, "epoch": 0.44701638358635204, "flos": 22456415116800.0, "grad_norm": 19.164467893325792, "language_loss": 0.70754939, "learning_rate": 2.436738768872905e-06, "loss": 0.7254914, "num_input_tokens_seen": 159364595, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.31933594, "step": 7435, "time_per_iteration": 2.760272741317749 }, { "auxiliary_loss_clip": 0.01481342, "auxiliary_loss_mlp": 0.00329247, "balance_loss_clip": 1.2162807, "balance_loss_mlp": 0.29369858, "epoch": 0.44707650683902, "flos": 24057851080320.0, "grad_norm": 7.724560927777531, "language_loss": 0.88970578, "learning_rate": 2.4363586981092346e-06, "loss": 0.90781164, "num_input_tokens_seen": 159385265, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.35510254, "step": 7436, "time_per_iteration": 2.7431564331054688 }, { "auxiliary_loss_clip": 0.01478467, "auxiliary_loss_mlp": 0.00330418, "balance_loss_clip": 1.21021736, "balance_loss_mlp": 0.29639524, "epoch": 0.44713663009168797, "flos": 23766938830080.0, "grad_norm": 3.394299363880444, "language_loss": 0.85253584, "learning_rate": 2.435978610798798e-06, "loss": 0.87062466, "num_input_tokens_seen": 159405080, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.34057617, "step": 7437, "time_per_iteration": 2.6747028827667236 }, { "auxiliary_loss_clip": 0.01448289, "auxiliary_loss_mlp": 0.00332243, "balance_loss_clip": 1.18335271, "balance_loss_mlp": 0.29783887, "epoch": 0.44719675334435594, "flos": 24499265316480.0, "grad_norm": 3.5628081820448956, "language_loss": 0.79214978, "learning_rate": 2.435598506956009e-06, "loss": 0.80995506, "num_input_tokens_seen": 159424595, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.34399414, "step": 7438, "time_per_iteration": 2.703549385070801 }, { "auxiliary_loss_clip": 0.01473098, "auxiliary_loss_mlp": 0.00350737, "balance_loss_clip": 1.20341074, "balance_loss_mlp": 0.31340015, "epoch": 0.4472568765970239, "flos": 29781759991680.0, "grad_norm": 59.52281381311678, "language_loss": 0.7316069, "learning_rate": 2.4352183865952808e-06, "loss": 0.74984521, "num_input_tokens_seen": 159443865, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.37402344, "step": 7439, "time_per_iteration": 2.7008261680603027 }, { "auxiliary_loss_clip": 0.01470714, "auxiliary_loss_mlp": 0.00333041, "balance_loss_clip": 1.20484662, "balance_loss_mlp": 0.29758775, "epoch": 0.44731699984969187, "flos": 24643123286400.0, "grad_norm": 30.238184673007975, "language_loss": 0.80663764, "learning_rate": 2.4348382497310285e-06, "loss": 0.8246752, "num_input_tokens_seen": 159464525, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.35473633, "step": 7440, "time_per_iteration": 2.731043577194214 }, { "auxiliary_loss_clip": 0.01459904, "auxiliary_loss_mlp": 0.00344911, "balance_loss_clip": 1.19654751, "balance_loss_mlp": 0.31234285, "epoch": 0.44737712310235983, "flos": 29455691304960.0, "grad_norm": 58.29658356100729, "language_loss": 0.78632855, "learning_rate": 2.4344580963776655e-06, "loss": 0.80437672, "num_input_tokens_seen": 159486385, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.32592773, "step": 7441, "time_per_iteration": 2.7037012577056885 }, { "auxiliary_loss_clip": 0.01480856, "auxiliary_loss_mlp": 0.00341674, "balance_loss_clip": 1.21195424, "balance_loss_mlp": 0.30684125, "epoch": 0.4474372463550278, "flos": 24896832024960.0, "grad_norm": 16.087457795837025, "language_loss": 0.81776762, "learning_rate": 2.4340779265496082e-06, "loss": 0.83599293, "num_input_tokens_seen": 159503880, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.34863281, "step": 7442, "time_per_iteration": 2.6757938861846924 }, { "auxiliary_loss_clip": 0.01471521, "auxiliary_loss_mlp": 0.00362269, "balance_loss_clip": 1.19427586, "balance_loss_mlp": 0.32300138, "epoch": 0.44749736960769576, "flos": 33181603125120.0, "grad_norm": 10.077058138133498, "language_loss": 0.82652885, "learning_rate": 2.433697740261273e-06, "loss": 0.84486675, "num_input_tokens_seen": 159522980, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.39233398, "step": 7443, "time_per_iteration": 2.7371933460235596 }, { "auxiliary_loss_clip": 0.01460673, "auxiliary_loss_mlp": 0.00366833, "balance_loss_clip": 1.1953696, "balance_loss_mlp": 0.32851934, "epoch": 0.4475574928603637, "flos": 21071807602560.0, "grad_norm": 6.280644856268834, "language_loss": 0.83550978, "learning_rate": 2.4333175375270748e-06, "loss": 0.8537848, "num_input_tokens_seen": 159543340, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.38256836, "step": 7444, "time_per_iteration": 2.7022743225097656 }, { "auxiliary_loss_clip": 0.01461365, "auxiliary_loss_mlp": 0.00373384, "balance_loss_clip": 1.19513321, "balance_loss_mlp": 0.33839637, "epoch": 0.4476176161130317, "flos": 21862523646720.0, "grad_norm": 5.264781221726737, "language_loss": 0.93751848, "learning_rate": 2.4329373183614333e-06, "loss": 0.95586598, "num_input_tokens_seen": 159558210, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.34997559, "step": 7445, "time_per_iteration": 2.9154083728790283 }, { "auxiliary_loss_clip": 0.01477246, "auxiliary_loss_mlp": 0.00358149, "balance_loss_clip": 1.19934797, "balance_loss_mlp": 0.32124197, "epoch": 0.4476777393656997, "flos": 22528667324160.0, "grad_norm": 96.31092930432146, "language_loss": 0.70201838, "learning_rate": 2.432557082778765e-06, "loss": 0.72037232, "num_input_tokens_seen": 159577920, "router_z_loss_clip": 2.77539062, "router_z_loss_mlp": 0.36889648, "step": 7446, "time_per_iteration": 2.807722330093384 }, { "auxiliary_loss_clip": 0.01312838, "auxiliary_loss_mlp": 0.00103539, "balance_loss_clip": 1.16209769, "balance_loss_mlp": 0.09600464, "epoch": 0.4477378626183677, "flos": 49017133877760.0, "grad_norm": 0.7381774216104406, "language_loss": 0.49614519, "learning_rate": 2.4321768307934884e-06, "loss": 0.51030898, "num_input_tokens_seen": 159632295, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.07519531, "step": 7447, "time_per_iteration": 4.464433670043945 }, { "auxiliary_loss_clip": 0.01311884, "auxiliary_loss_mlp": 0.00100424, "balance_loss_clip": 1.16093194, "balance_loss_mlp": 0.09293745, "epoch": 0.44779798587103564, "flos": 56542179392640.0, "grad_norm": 0.7548482771092092, "language_loss": 0.59052163, "learning_rate": 2.4317965624200235e-06, "loss": 0.60464472, "num_input_tokens_seen": 159698435, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.07470703, "step": 7448, "time_per_iteration": 3.182490587234497 }, { "auxiliary_loss_clip": 0.01461229, "auxiliary_loss_mlp": 0.00381195, "balance_loss_clip": 1.19305682, "balance_loss_mlp": 0.34419209, "epoch": 0.4478581091237036, "flos": 46498536040320.0, "grad_norm": 20.55464656896347, "language_loss": 0.63888967, "learning_rate": 2.431416277672789e-06, "loss": 0.65731388, "num_input_tokens_seen": 159722150, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.37011719, "step": 7449, "time_per_iteration": 2.9274513721466064 }, { "auxiliary_loss_clip": 0.01460766, "auxiliary_loss_mlp": 0.00358691, "balance_loss_clip": 1.19399107, "balance_loss_mlp": 0.32464465, "epoch": 0.4479182323763716, "flos": 20814363849600.0, "grad_norm": 97.99762856236228, "language_loss": 0.8615579, "learning_rate": 2.4310359765662065e-06, "loss": 0.8797524, "num_input_tokens_seen": 159740550, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.33984375, "step": 7450, "time_per_iteration": 4.125848770141602 }, { "auxiliary_loss_clip": 0.01460012, "auxiliary_loss_mlp": 0.00361934, "balance_loss_clip": 1.1946311, "balance_loss_mlp": 0.32834107, "epoch": 0.44797835562903954, "flos": 14245979212800.0, "grad_norm": 50.41735296176141, "language_loss": 0.86416191, "learning_rate": 2.430655659114697e-06, "loss": 0.88238132, "num_input_tokens_seen": 159758245, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.3359375, "step": 7451, "time_per_iteration": 4.1474199295043945 }, { "auxiliary_loss_clip": 0.01296531, "auxiliary_loss_mlp": 0.00085469, "balance_loss_clip": 1.14349639, "balance_loss_mlp": 0.07741013, "epoch": 0.4480384788817075, "flos": 63534560169600.0, "grad_norm": 0.7874529869222724, "language_loss": 0.62678277, "learning_rate": 2.430275325332681e-06, "loss": 0.64060277, "num_input_tokens_seen": 159826790, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.08056641, "step": 7452, "time_per_iteration": 3.1976380348205566 }, { "auxiliary_loss_clip": 0.01461244, "auxiliary_loss_mlp": 0.0035897, "balance_loss_clip": 1.19445682, "balance_loss_mlp": 0.32122791, "epoch": 0.44809860213437547, "flos": 21652626522240.0, "grad_norm": 49.82353570708272, "language_loss": 0.69614166, "learning_rate": 2.429894975234582e-06, "loss": 0.71434385, "num_input_tokens_seen": 159845805, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.37744141, "step": 7453, "time_per_iteration": 2.6838250160217285 }, { "auxiliary_loss_clip": 0.01285948, "auxiliary_loss_mlp": 0.00083153, "balance_loss_clip": 1.13753915, "balance_loss_mlp": 0.07557151, "epoch": 0.44815872538704343, "flos": 69190634246400.0, "grad_norm": 0.7455263833094983, "language_loss": 0.56545293, "learning_rate": 2.4295146088348224e-06, "loss": 0.57914388, "num_input_tokens_seen": 159898860, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.07568359, "step": 7454, "time_per_iteration": 3.038912773132324 }, { "auxiliary_loss_clip": 0.0147079, "auxiliary_loss_mlp": 0.00391457, "balance_loss_clip": 1.20340157, "balance_loss_mlp": 0.35629028, "epoch": 0.4482188486397114, "flos": 12598289510400.0, "grad_norm": 17.569077880092745, "language_loss": 0.83320236, "learning_rate": 2.4291342261478255e-06, "loss": 0.85182488, "num_input_tokens_seen": 159911555, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.3515625, "step": 7455, "time_per_iteration": 2.6444125175476074 }, { "auxiliary_loss_clip": 0.01471203, "auxiliary_loss_mlp": 0.00413167, "balance_loss_clip": 1.20556402, "balance_loss_mlp": 0.37754756, "epoch": 0.44827897189237936, "flos": 34058182631040.0, "grad_norm": 6.073098752184318, "language_loss": 0.81782562, "learning_rate": 2.428753827188016e-06, "loss": 0.83666933, "num_input_tokens_seen": 159931470, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.35620117, "step": 7456, "time_per_iteration": 2.82081937789917 }, { "auxiliary_loss_clip": 0.01470682, "auxiliary_loss_mlp": 0.00371629, "balance_loss_clip": 1.20887876, "balance_loss_mlp": 0.33643857, "epoch": 0.44833909514504733, "flos": 25147416280320.0, "grad_norm": 203.7911659090859, "language_loss": 0.8230325, "learning_rate": 2.428373411969818e-06, "loss": 0.84145558, "num_input_tokens_seen": 159946115, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.35180664, "step": 7457, "time_per_iteration": 4.0529937744140625 }, { "auxiliary_loss_clip": 0.01483048, "auxiliary_loss_mlp": 0.00378061, "balance_loss_clip": 1.21153426, "balance_loss_mlp": 0.34065294, "epoch": 0.4483992183977153, "flos": 16179984224640.0, "grad_norm": 29.502583909489754, "language_loss": 0.74985576, "learning_rate": 2.4279929805076576e-06, "loss": 0.76846689, "num_input_tokens_seen": 159963915, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.37426758, "step": 7458, "time_per_iteration": 2.598811388015747 }, { "auxiliary_loss_clip": 0.01466666, "auxiliary_loss_mlp": 0.00371207, "balance_loss_clip": 1.19592643, "balance_loss_mlp": 0.3332265, "epoch": 0.44845934165038326, "flos": 17746048270080.0, "grad_norm": 60.16075183345928, "language_loss": 0.77445912, "learning_rate": 2.427612532815961e-06, "loss": 0.7928378, "num_input_tokens_seen": 159982140, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.37939453, "step": 7459, "time_per_iteration": 2.6401870250701904 }, { "auxiliary_loss_clip": 0.01465556, "auxiliary_loss_mlp": 0.0038724, "balance_loss_clip": 1.20160508, "balance_loss_mlp": 0.3524074, "epoch": 0.4485194649030513, "flos": 21835914647040.0, "grad_norm": 4.7532873951715855, "language_loss": 0.76829302, "learning_rate": 2.427232068909154e-06, "loss": 0.78682101, "num_input_tokens_seen": 160002280, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.34814453, "step": 7460, "time_per_iteration": 2.67160964012146 }, { "auxiliary_loss_clip": 0.01476462, "auxiliary_loss_mlp": 0.00378163, "balance_loss_clip": 1.21043491, "balance_loss_mlp": 0.34132779, "epoch": 0.44857958815571924, "flos": 20084515401600.0, "grad_norm": 5.025969382765271, "language_loss": 0.84554195, "learning_rate": 2.4268515888016635e-06, "loss": 0.86408824, "num_input_tokens_seen": 160020260, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.3684082, "step": 7461, "time_per_iteration": 2.6317391395568848 }, { "auxiliary_loss_clip": 0.01488663, "auxiliary_loss_mlp": 0.00380186, "balance_loss_clip": 1.22012997, "balance_loss_mlp": 0.34563917, "epoch": 0.4486397114083872, "flos": 27053519402880.0, "grad_norm": 13.831304449131821, "language_loss": 0.760234, "learning_rate": 2.4264710925079184e-06, "loss": 0.77892244, "num_input_tokens_seen": 160040240, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.34545898, "step": 7462, "time_per_iteration": 2.6861188411712646 }, { "auxiliary_loss_clip": 0.01287849, "auxiliary_loss_mlp": 0.00072211, "balance_loss_clip": 1.14508581, "balance_loss_mlp": 0.06613083, "epoch": 0.4486998346610552, "flos": 67321195931520.0, "grad_norm": 130.90990620902886, "language_loss": 0.54425406, "learning_rate": 2.4260905800423462e-06, "loss": 0.55785465, "num_input_tokens_seen": 160093865, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.06079102, "step": 7463, "time_per_iteration": 3.173318386077881 }, { "auxiliary_loss_clip": 0.01462789, "auxiliary_loss_mlp": 0.0036211, "balance_loss_clip": 1.2021699, "balance_loss_mlp": 0.32734907, "epoch": 0.44875995791372314, "flos": 27636816360960.0, "grad_norm": 5.157265468428393, "language_loss": 0.83557308, "learning_rate": 2.4257100514193775e-06, "loss": 0.85382199, "num_input_tokens_seen": 160113590, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.34741211, "step": 7464, "time_per_iteration": 2.724362850189209 }, { "auxiliary_loss_clip": 0.01482427, "auxiliary_loss_mlp": 0.00404388, "balance_loss_clip": 1.2191478, "balance_loss_mlp": 0.36869708, "epoch": 0.4488200811663911, "flos": 13005947940480.0, "grad_norm": 247.59052934626763, "language_loss": 0.82255793, "learning_rate": 2.425329506653441e-06, "loss": 0.84142613, "num_input_tokens_seen": 160131795, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.35693359, "step": 7465, "time_per_iteration": 2.675835132598877 }, { "auxiliary_loss_clip": 0.01487134, "auxiliary_loss_mlp": 0.00372984, "balance_loss_clip": 1.21570563, "balance_loss_mlp": 0.33619612, "epoch": 0.44888020441905907, "flos": 27489977562240.0, "grad_norm": 84.13552375343625, "language_loss": 0.87205803, "learning_rate": 2.424948945758966e-06, "loss": 0.89065921, "num_input_tokens_seen": 160150635, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.36791992, "step": 7466, "time_per_iteration": 2.6737465858459473 }, { "auxiliary_loss_clip": 0.0149113, "auxiliary_loss_mlp": 0.00411267, "balance_loss_clip": 1.22329164, "balance_loss_mlp": 0.37321538, "epoch": 0.44894032767172704, "flos": 18259678800000.0, "grad_norm": 3.1547837413849975, "language_loss": 0.88896871, "learning_rate": 2.4245683687503844e-06, "loss": 0.90799272, "num_input_tokens_seen": 160168615, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.38037109, "step": 7467, "time_per_iteration": 2.6458473205566406 }, { "auxiliary_loss_clip": 0.0147808, "auxiliary_loss_mlp": 0.00362272, "balance_loss_clip": 1.21832037, "balance_loss_mlp": 0.32791549, "epoch": 0.449000450924395, "flos": 21579835610880.0, "grad_norm": 2.396717090190205, "language_loss": 0.80675042, "learning_rate": 2.424187775642129e-06, "loss": 0.82515395, "num_input_tokens_seen": 160187295, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.34350586, "step": 7468, "time_per_iteration": 2.653432846069336 }, { "auxiliary_loss_clip": 0.01473165, "auxiliary_loss_mlp": 0.00388037, "balance_loss_clip": 1.2128818, "balance_loss_mlp": 0.35246503, "epoch": 0.44906057417706297, "flos": 17967904623360.0, "grad_norm": 2.046762795399905, "language_loss": 0.75616097, "learning_rate": 2.4238071664486297e-06, "loss": 0.774773, "num_input_tokens_seen": 160205115, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.35571289, "step": 7469, "time_per_iteration": 2.6947221755981445 }, { "auxiliary_loss_clip": 0.01490235, "auxiliary_loss_mlp": 0.00434977, "balance_loss_clip": 1.22095859, "balance_loss_mlp": 0.39702114, "epoch": 0.44912069742973093, "flos": 20047347803520.0, "grad_norm": 94.97121555276874, "language_loss": 0.79709053, "learning_rate": 2.4234265411843203e-06, "loss": 0.81634265, "num_input_tokens_seen": 160222580, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.37963867, "step": 7470, "time_per_iteration": 2.662874460220337 }, { "auxiliary_loss_clip": 0.01497089, "auxiliary_loss_mlp": 0.00428762, "balance_loss_clip": 1.22967303, "balance_loss_mlp": 0.39021, "epoch": 0.4491808206823989, "flos": 21033526682880.0, "grad_norm": 19.25646427737763, "language_loss": 0.82551146, "learning_rate": 2.423045899863634e-06, "loss": 0.84476995, "num_input_tokens_seen": 160241520, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.38549805, "step": 7471, "time_per_iteration": 2.755434513092041 }, { "auxiliary_loss_clip": 0.01481205, "auxiliary_loss_mlp": 0.00387431, "balance_loss_clip": 1.21649706, "balance_loss_mlp": 0.35293192, "epoch": 0.44924094393506686, "flos": 22967136645120.0, "grad_norm": 24.428145653159667, "language_loss": 0.79806006, "learning_rate": 2.4226652425010048e-06, "loss": 0.81674635, "num_input_tokens_seen": 160261815, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.34472656, "step": 7472, "time_per_iteration": 2.72841477394104 }, { "auxiliary_loss_clip": 0.01312539, "auxiliary_loss_mlp": 0.00064419, "balance_loss_clip": 1.15586293, "balance_loss_mlp": 0.05817212, "epoch": 0.4493010671877349, "flos": 59233467864960.0, "grad_norm": 0.7462234388992085, "language_loss": 0.61240864, "learning_rate": 2.4222845691108676e-06, "loss": 0.62617826, "num_input_tokens_seen": 160317070, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.06225586, "step": 7473, "time_per_iteration": 3.1195058822631836 }, { "auxiliary_loss_clip": 0.01467029, "auxiliary_loss_mlp": 0.00374926, "balance_loss_clip": 1.2031579, "balance_loss_mlp": 0.33894837, "epoch": 0.44936119044040285, "flos": 18004892653440.0, "grad_norm": 11.474318667643908, "language_loss": 0.83699203, "learning_rate": 2.421903879707657e-06, "loss": 0.85541153, "num_input_tokens_seen": 160334980, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.35986328, "step": 7474, "time_per_iteration": 2.694025993347168 }, { "auxiliary_loss_clip": 0.01457879, "auxiliary_loss_mlp": 0.00400177, "balance_loss_clip": 1.2038492, "balance_loss_mlp": 0.36236334, "epoch": 0.4494213136930708, "flos": 21251827589760.0, "grad_norm": 16.81968240610641, "language_loss": 0.7809236, "learning_rate": 2.4215231743058086e-06, "loss": 0.7995041, "num_input_tokens_seen": 160354500, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.37817383, "step": 7475, "time_per_iteration": 2.6869421005249023 }, { "auxiliary_loss_clip": 0.01478831, "auxiliary_loss_mlp": 0.00409957, "balance_loss_clip": 1.21833873, "balance_loss_mlp": 0.37243032, "epoch": 0.4494814369457388, "flos": 27418695022080.0, "grad_norm": 3.4462419034633984, "language_loss": 0.81627828, "learning_rate": 2.4211424529197594e-06, "loss": 0.83516622, "num_input_tokens_seen": 160373650, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.37548828, "step": 7476, "time_per_iteration": 2.7476422786712646 }, { "auxiliary_loss_clip": 0.01489992, "auxiliary_loss_mlp": 0.00411073, "balance_loss_clip": 1.2232002, "balance_loss_mlp": 0.37204385, "epoch": 0.44954156019840674, "flos": 22854053652480.0, "grad_norm": 4.067284329814961, "language_loss": 0.78345358, "learning_rate": 2.4207617155639464e-06, "loss": 0.80246425, "num_input_tokens_seen": 160393430, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.390625, "step": 7477, "time_per_iteration": 2.7331507205963135 }, { "auxiliary_loss_clip": 0.01485658, "auxiliary_loss_mlp": 0.00412539, "balance_loss_clip": 1.21920848, "balance_loss_mlp": 0.37606141, "epoch": 0.4496016834510747, "flos": 17201570935680.0, "grad_norm": 7.328608796810543, "language_loss": 0.75639987, "learning_rate": 2.4203809622528062e-06, "loss": 0.7753818, "num_input_tokens_seen": 160410545, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.36474609, "step": 7478, "time_per_iteration": 2.66679048538208 }, { "auxiliary_loss_clip": 0.01478615, "auxiliary_loss_mlp": 0.00411763, "balance_loss_clip": 1.21849847, "balance_loss_mlp": 0.37485555, "epoch": 0.4496618067037427, "flos": 18916628595840.0, "grad_norm": 27.498393372806614, "language_loss": 0.95482105, "learning_rate": 2.420000193000779e-06, "loss": 0.97372484, "num_input_tokens_seen": 160428105, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.36914062, "step": 7479, "time_per_iteration": 2.645629644393921 }, { "auxiliary_loss_clip": 0.01490819, "auxiliary_loss_mlp": 0.00366118, "balance_loss_clip": 1.22890806, "balance_loss_mlp": 0.3317377, "epoch": 0.44972192995641064, "flos": 21031659175680.0, "grad_norm": 2.7517930100223245, "language_loss": 0.81954038, "learning_rate": 2.419619407822302e-06, "loss": 0.83810973, "num_input_tokens_seen": 160448815, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.34399414, "step": 7480, "time_per_iteration": 2.6951487064361572 }, { "auxiliary_loss_clip": 0.01494175, "auxiliary_loss_mlp": 0.00415687, "balance_loss_clip": 1.22193503, "balance_loss_mlp": 0.37582308, "epoch": 0.4497820532090786, "flos": 20777088510720.0, "grad_norm": 38.890330236248786, "language_loss": 0.87008381, "learning_rate": 2.419238606731815e-06, "loss": 0.88918245, "num_input_tokens_seen": 160465940, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.3984375, "step": 7481, "time_per_iteration": 2.6476986408233643 }, { "auxiliary_loss_clip": 0.01464775, "auxiliary_loss_mlp": 0.00408111, "balance_loss_clip": 1.20961607, "balance_loss_mlp": 0.37239635, "epoch": 0.44984217646174657, "flos": 33802606385280.0, "grad_norm": 4.389572901827345, "language_loss": 0.74960154, "learning_rate": 2.418857789743758e-06, "loss": 0.76833034, "num_input_tokens_seen": 160486710, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.35693359, "step": 7482, "time_per_iteration": 2.7309529781341553 }, { "auxiliary_loss_clip": 0.01493414, "auxiliary_loss_mlp": 0.00412186, "balance_loss_clip": 1.2308495, "balance_loss_mlp": 0.37511152, "epoch": 0.44990229971441453, "flos": 15518365660800.0, "grad_norm": 4.3712954188511075, "language_loss": 0.92708015, "learning_rate": 2.418476956872571e-06, "loss": 0.94613612, "num_input_tokens_seen": 160503405, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.37060547, "step": 7483, "time_per_iteration": 2.6495511531829834 }, { "auxiliary_loss_clip": 0.01494244, "auxiliary_loss_mlp": 0.00434111, "balance_loss_clip": 1.22623491, "balance_loss_mlp": 0.39582103, "epoch": 0.4499624229670825, "flos": 29861913191040.0, "grad_norm": 11.454507948617767, "language_loss": 0.87045693, "learning_rate": 2.4180961081326967e-06, "loss": 0.88974047, "num_input_tokens_seen": 160525080, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.3828125, "step": 7484, "time_per_iteration": 2.702760934829712 }, { "auxiliary_loss_clip": 0.01496906, "auxiliary_loss_mlp": 0.00415222, "balance_loss_clip": 1.22484136, "balance_loss_mlp": 0.37697947, "epoch": 0.45002254621975046, "flos": 18513674847360.0, "grad_norm": 20.522675581111905, "language_loss": 0.83614498, "learning_rate": 2.4177152435385754e-06, "loss": 0.85526621, "num_input_tokens_seen": 160540895, "router_z_loss_clip": 2.72265625, "router_z_loss_mlp": 0.38256836, "step": 7485, "time_per_iteration": 2.65505051612854 }, { "auxiliary_loss_clip": 0.01289927, "auxiliary_loss_mlp": 0.00103313, "balance_loss_clip": 1.13416386, "balance_loss_mlp": 0.095779, "epoch": 0.4500826694724185, "flos": 70420394229120.0, "grad_norm": 0.7759483698122989, "language_loss": 0.57781494, "learning_rate": 2.4173343631046504e-06, "loss": 0.59174734, "num_input_tokens_seen": 160598270, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.07519531, "step": 7486, "time_per_iteration": 3.181135892868042 }, { "auxiliary_loss_clip": 0.01499241, "auxiliary_loss_mlp": 0.00451262, "balance_loss_clip": 1.23894858, "balance_loss_mlp": 0.41306704, "epoch": 0.45014279272508645, "flos": 15778897983360.0, "grad_norm": 4.881530364962694, "language_loss": 0.89302063, "learning_rate": 2.4169534668453654e-06, "loss": 0.91252565, "num_input_tokens_seen": 160614720, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.38232422, "step": 7487, "time_per_iteration": 2.677926540374756 }, { "auxiliary_loss_clip": 0.01467769, "auxiliary_loss_mlp": 0.00455451, "balance_loss_clip": 1.2092036, "balance_loss_mlp": 0.41582593, "epoch": 0.4502029159777544, "flos": 21799573061760.0, "grad_norm": 53.3339332803007, "language_loss": 0.81919253, "learning_rate": 2.4165725547751622e-06, "loss": 0.83842468, "num_input_tokens_seen": 160635170, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.39575195, "step": 7488, "time_per_iteration": 2.6708250045776367 }, { "auxiliary_loss_clip": 0.01497966, "auxiliary_loss_mlp": 0.00516076, "balance_loss_clip": 1.22172642, "balance_loss_mlp": 0.46987098, "epoch": 0.4502630392304224, "flos": 28767966531840.0, "grad_norm": 8.776764810422007, "language_loss": 0.80074447, "learning_rate": 2.4161916269084858e-06, "loss": 0.82088488, "num_input_tokens_seen": 160654490, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 0.46191406, "step": 7489, "time_per_iteration": 4.215796709060669 }, { "auxiliary_loss_clip": 0.0147708, "auxiliary_loss_mlp": 0.00476846, "balance_loss_clip": 1.21194506, "balance_loss_mlp": 0.4347418, "epoch": 0.45032316248309034, "flos": 15844182952320.0, "grad_norm": 116.77583091429432, "language_loss": 0.76504862, "learning_rate": 2.4158106832597817e-06, "loss": 0.78458786, "num_input_tokens_seen": 160669400, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.42114258, "step": 7490, "time_per_iteration": 2.7430613040924072 }, { "auxiliary_loss_clip": 0.01277455, "auxiliary_loss_mlp": 0.00122553, "balance_loss_clip": 1.11866963, "balance_loss_mlp": 0.11497115, "epoch": 0.4503832857357583, "flos": 57853600945920.0, "grad_norm": 0.7095109165091306, "language_loss": 0.56278205, "learning_rate": 2.415429723843495e-06, "loss": 0.57678211, "num_input_tokens_seen": 160733820, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.07568359, "step": 7491, "time_per_iteration": 3.1857144832611084 }, { "auxiliary_loss_clip": 0.01470814, "auxiliary_loss_mlp": 0.00467535, "balance_loss_clip": 1.21131408, "balance_loss_mlp": 0.4261691, "epoch": 0.4504434089884263, "flos": 23878082488320.0, "grad_norm": 5.538800604064262, "language_loss": 0.84404755, "learning_rate": 2.4150487486740713e-06, "loss": 0.86343098, "num_input_tokens_seen": 160753175, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.4140625, "step": 7492, "time_per_iteration": 4.207965850830078 }, { "auxiliary_loss_clip": 0.01483391, "auxiliary_loss_mlp": 0.00529762, "balance_loss_clip": 1.20953846, "balance_loss_mlp": 0.48181558, "epoch": 0.45050353224109424, "flos": 17785083375360.0, "grad_norm": 37.008021802070274, "language_loss": 0.98745, "learning_rate": 2.4146677577659573e-06, "loss": 1.00758147, "num_input_tokens_seen": 160768310, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.47973633, "step": 7493, "time_per_iteration": 4.109467029571533 }, { "auxiliary_loss_clip": 0.01277748, "auxiliary_loss_mlp": 0.00224122, "balance_loss_clip": 1.11543012, "balance_loss_mlp": 0.21253526, "epoch": 0.4505636554937622, "flos": 65063420703360.0, "grad_norm": 0.8023044342828962, "language_loss": 0.62642491, "learning_rate": 2.4142867511336e-06, "loss": 0.64144361, "num_input_tokens_seen": 160827370, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.11572266, "step": 7494, "time_per_iteration": 3.176154136657715 }, { "auxiliary_loss_clip": 0.01471404, "auxiliary_loss_mlp": 0.00427204, "balance_loss_clip": 1.21364045, "balance_loss_mlp": 0.38974851, "epoch": 0.45062377874643017, "flos": 22200084685440.0, "grad_norm": 5.004121165894214, "language_loss": 0.85451603, "learning_rate": 2.4139057287914484e-06, "loss": 0.87350214, "num_input_tokens_seen": 160849140, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.37475586, "step": 7495, "time_per_iteration": 2.666616916656494 }, { "auxiliary_loss_clip": 0.01478305, "auxiliary_loss_mlp": 0.00456105, "balance_loss_clip": 1.2186625, "balance_loss_mlp": 0.41755289, "epoch": 0.45068390199909814, "flos": 37670293186560.0, "grad_norm": 3.5629823577327673, "language_loss": 0.90688062, "learning_rate": 2.41352469075395e-06, "loss": 0.92622471, "num_input_tokens_seen": 160871280, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.38574219, "step": 7496, "time_per_iteration": 2.8481857776641846 }, { "auxiliary_loss_clip": 0.01455916, "auxiliary_loss_mlp": 0.00496519, "balance_loss_clip": 1.19397879, "balance_loss_mlp": 0.45381844, "epoch": 0.4507440252517661, "flos": 22302501338880.0, "grad_norm": 71.16624384024739, "language_loss": 0.81495905, "learning_rate": 2.4131436370355534e-06, "loss": 0.83448339, "num_input_tokens_seen": 160888625, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.42724609, "step": 7497, "time_per_iteration": 2.710631847381592 }, { "auxiliary_loss_clip": 0.014907, "auxiliary_loss_mlp": 0.00488897, "balance_loss_clip": 1.22252119, "balance_loss_mlp": 0.44676861, "epoch": 0.45080414850443407, "flos": 13188374138880.0, "grad_norm": 795.6308100234744, "language_loss": 0.80969131, "learning_rate": 2.4127625676507088e-06, "loss": 0.82948726, "num_input_tokens_seen": 160907040, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.42114258, "step": 7498, "time_per_iteration": 2.659393787384033 }, { "auxiliary_loss_clip": 0.01458455, "auxiliary_loss_mlp": 0.00500552, "balance_loss_clip": 1.19818544, "balance_loss_mlp": 0.45623037, "epoch": 0.4508642717571021, "flos": 21944939402880.0, "grad_norm": 30.265372858302353, "language_loss": 0.78685528, "learning_rate": 2.4123814826138663e-06, "loss": 0.80644536, "num_input_tokens_seen": 160927115, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.44287109, "step": 7499, "time_per_iteration": 4.107067108154297 }, { "auxiliary_loss_clip": 0.01471278, "auxiliary_loss_mlp": 0.00487467, "balance_loss_clip": 1.20841885, "balance_loss_mlp": 0.44486207, "epoch": 0.45092439500977005, "flos": 23367468700800.0, "grad_norm": 2.573831718910498, "language_loss": 0.83176351, "learning_rate": 2.412000381939477e-06, "loss": 0.85135096, "num_input_tokens_seen": 160944405, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.42602539, "step": 7500, "time_per_iteration": 2.6663076877593994 }, { "auxiliary_loss_clip": 0.01467985, "auxiliary_loss_mlp": 0.0049867, "balance_loss_clip": 1.21129942, "balance_loss_mlp": 0.45673257, "epoch": 0.450984518262438, "flos": 20772958446720.0, "grad_norm": 15.983119202980113, "language_loss": 0.68105257, "learning_rate": 2.411619265641992e-06, "loss": 0.70071912, "num_input_tokens_seen": 160961345, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.41918945, "step": 7501, "time_per_iteration": 2.6267151832580566 }, { "auxiliary_loss_clip": 0.01466661, "auxiliary_loss_mlp": 0.00538413, "balance_loss_clip": 1.202981, "balance_loss_mlp": 0.49232626, "epoch": 0.451044641515106, "flos": 17707372300800.0, "grad_norm": 6004.64005155851, "language_loss": 0.91013551, "learning_rate": 2.411238133735863e-06, "loss": 0.93018615, "num_input_tokens_seen": 160977330, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.46044922, "step": 7502, "time_per_iteration": 2.6102423667907715 }, { "auxiliary_loss_clip": 0.01455353, "auxiliary_loss_mlp": 0.0048575, "balance_loss_clip": 1.19530535, "balance_loss_mlp": 0.44226211, "epoch": 0.45110476476777395, "flos": 20594698225920.0, "grad_norm": 99.71261367464086, "language_loss": 0.8405953, "learning_rate": 2.4108569862355418e-06, "loss": 0.86000633, "num_input_tokens_seen": 160997280, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.43481445, "step": 7503, "time_per_iteration": 2.6892707347869873 }, { "auxiliary_loss_clip": 0.01461933, "auxiliary_loss_mlp": 0.00480859, "balance_loss_clip": 1.20709729, "balance_loss_mlp": 0.43803942, "epoch": 0.4511648880204419, "flos": 16034043265920.0, "grad_norm": 21.505093549496127, "language_loss": 0.86038566, "learning_rate": 2.410475823155484e-06, "loss": 0.87981355, "num_input_tokens_seen": 161014235, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.42822266, "step": 7504, "time_per_iteration": 2.6092941761016846 }, { "auxiliary_loss_clip": 0.01462738, "auxiliary_loss_mlp": 0.00455315, "balance_loss_clip": 1.2061007, "balance_loss_mlp": 0.41566646, "epoch": 0.4512250112731099, "flos": 23978811202560.0, "grad_norm": 1142.7614735418501, "language_loss": 0.68527067, "learning_rate": 2.4100946445101405e-06, "loss": 0.7044512, "num_input_tokens_seen": 161032360, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.39624023, "step": 7505, "time_per_iteration": 2.7746388912200928 }, { "auxiliary_loss_clip": 0.01296747, "auxiliary_loss_mlp": 0.00207381, "balance_loss_clip": 1.14290166, "balance_loss_mlp": 0.19441062, "epoch": 0.45128513452577784, "flos": 71462308037760.0, "grad_norm": 0.8286048787889418, "language_loss": 0.58171731, "learning_rate": 2.409713450313968e-06, "loss": 0.5967586, "num_input_tokens_seen": 161091360, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.12988281, "step": 7506, "time_per_iteration": 3.2007458209991455 }, { "auxiliary_loss_clip": 0.0144665, "auxiliary_loss_mlp": 0.00466567, "balance_loss_clip": 1.19158792, "balance_loss_mlp": 0.42543989, "epoch": 0.4513452577784458, "flos": 22090844448000.0, "grad_norm": 8974.037925662034, "language_loss": 0.85388678, "learning_rate": 2.40933224058142e-06, "loss": 0.87301898, "num_input_tokens_seen": 161110825, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.41137695, "step": 7507, "time_per_iteration": 2.769765853881836 }, { "auxiliary_loss_clip": 0.01458525, "auxiliary_loss_mlp": 0.00524318, "balance_loss_clip": 1.20041013, "balance_loss_mlp": 0.48142684, "epoch": 0.4514053810311138, "flos": 24276403382400.0, "grad_norm": 4.3062512411657785, "language_loss": 0.77664435, "learning_rate": 2.4089510153269526e-06, "loss": 0.79647279, "num_input_tokens_seen": 161130685, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.42871094, "step": 7508, "time_per_iteration": 2.7502026557922363 }, { "auxiliary_loss_clip": 0.01451241, "auxiliary_loss_mlp": 0.00484165, "balance_loss_clip": 1.19875956, "balance_loss_mlp": 0.4434666, "epoch": 0.45146550428378174, "flos": 17886781756800.0, "grad_norm": 389.93109793939385, "language_loss": 0.84837031, "learning_rate": 2.4085697745650217e-06, "loss": 0.86772436, "num_input_tokens_seen": 161147555, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.40722656, "step": 7509, "time_per_iteration": 2.6461970806121826 }, { "auxiliary_loss_clip": 0.01455257, "auxiliary_loss_mlp": 0.00527927, "balance_loss_clip": 1.20416057, "balance_loss_mlp": 0.48660907, "epoch": 0.4515256275364497, "flos": 24243437675520.0, "grad_norm": 136.32840443492984, "language_loss": 0.81406367, "learning_rate": 2.4081885183100837e-06, "loss": 0.83389544, "num_input_tokens_seen": 161166255, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.4128418, "step": 7510, "time_per_iteration": 2.7284348011016846 }, { "auxiliary_loss_clip": 0.01458881, "auxiliary_loss_mlp": 0.0049615, "balance_loss_clip": 1.19507456, "balance_loss_mlp": 0.45483187, "epoch": 0.45158575078911767, "flos": 20631039811200.0, "grad_norm": 25.39697404901466, "language_loss": 0.84070843, "learning_rate": 2.4078072465765964e-06, "loss": 0.8602587, "num_input_tokens_seen": 161184720, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.41308594, "step": 7511, "time_per_iteration": 2.691373825073242 }, { "auxiliary_loss_clip": 0.01460029, "auxiliary_loss_mlp": 0.00534845, "balance_loss_clip": 1.19848752, "balance_loss_mlp": 0.49083275, "epoch": 0.45164587404178563, "flos": 23327751237120.0, "grad_norm": 86.86396456971815, "language_loss": 0.84718108, "learning_rate": 2.4074259593790174e-06, "loss": 0.8671298, "num_input_tokens_seen": 161204360, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.44018555, "step": 7512, "time_per_iteration": 2.6669094562530518 }, { "auxiliary_loss_clip": 0.01471007, "auxiliary_loss_mlp": 0.00559451, "balance_loss_clip": 1.20063484, "balance_loss_mlp": 0.51474756, "epoch": 0.45170599729445365, "flos": 23805973935360.0, "grad_norm": 4.02544483168645, "language_loss": 0.94234109, "learning_rate": 2.4070446567318053e-06, "loss": 0.96264565, "num_input_tokens_seen": 161223575, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.44677734, "step": 7513, "time_per_iteration": 2.656541347503662 }, { "auxiliary_loss_clip": 0.01455672, "auxiliary_loss_mlp": 0.00499236, "balance_loss_clip": 1.20204341, "balance_loss_mlp": 0.46058795, "epoch": 0.4517661205471216, "flos": 23512942782720.0, "grad_norm": 4.100147097341956, "language_loss": 0.72470766, "learning_rate": 2.406663338649419e-06, "loss": 0.74425673, "num_input_tokens_seen": 161243805, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.38647461, "step": 7514, "time_per_iteration": 2.73429274559021 }, { "auxiliary_loss_clip": 0.01463987, "auxiliary_loss_mlp": 0.00514819, "balance_loss_clip": 1.20116043, "balance_loss_mlp": 0.46911398, "epoch": 0.4518262437997896, "flos": 23513948363520.0, "grad_norm": 12.163853724226005, "language_loss": 0.7611106, "learning_rate": 2.406282005146318e-06, "loss": 0.78089869, "num_input_tokens_seen": 161261450, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.45678711, "step": 7515, "time_per_iteration": 2.6980865001678467 }, { "auxiliary_loss_clip": 0.0145394, "auxiliary_loss_mlp": 0.00533801, "balance_loss_clip": 1.18881714, "balance_loss_mlp": 0.48699921, "epoch": 0.45188636705245755, "flos": 14568061489920.0, "grad_norm": 86.48580090087346, "language_loss": 0.87829012, "learning_rate": 2.405900656236963e-06, "loss": 0.89816761, "num_input_tokens_seen": 161276965, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.4675293, "step": 7516, "time_per_iteration": 2.6605420112609863 }, { "auxiliary_loss_clip": 0.01452676, "auxiliary_loss_mlp": 0.00564029, "balance_loss_clip": 1.19445252, "balance_loss_mlp": 0.52104235, "epoch": 0.4519464903051255, "flos": 19901550499200.0, "grad_norm": 281.8224133383948, "language_loss": 0.7133621, "learning_rate": 2.4055192919358137e-06, "loss": 0.73352909, "num_input_tokens_seen": 161295375, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.4296875, "step": 7517, "time_per_iteration": 2.643141746520996 }, { "auxiliary_loss_clip": 0.01444932, "auxiliary_loss_mlp": 0.00537137, "balance_loss_clip": 1.19107556, "balance_loss_mlp": 0.49667746, "epoch": 0.4520066135577935, "flos": 18844376388480.0, "grad_norm": 20.876764787029927, "language_loss": 0.69063777, "learning_rate": 2.405137912257333e-06, "loss": 0.71045852, "num_input_tokens_seen": 161313010, "router_z_loss_clip": 2.5390625, "router_z_loss_mlp": 0.40454102, "step": 7518, "time_per_iteration": 2.657953977584839 }, { "auxiliary_loss_clip": 0.01445525, "auxiliary_loss_mlp": 0.00524502, "balance_loss_clip": 1.18937874, "balance_loss_mlp": 0.48194411, "epoch": 0.45206673681046144, "flos": 48214419713280.0, "grad_norm": 18.982833534625225, "language_loss": 0.65105522, "learning_rate": 2.404756517215982e-06, "loss": 0.67075551, "num_input_tokens_seen": 161336690, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.42553711, "step": 7519, "time_per_iteration": 2.929690361022949 }, { "auxiliary_loss_clip": 0.01439528, "auxiliary_loss_mlp": 0.00574457, "balance_loss_clip": 1.18589807, "balance_loss_mlp": 0.52860928, "epoch": 0.4521268600631294, "flos": 23842171866240.0, "grad_norm": 8.416398524811147, "language_loss": 0.78193098, "learning_rate": 2.404375106826223e-06, "loss": 0.80207086, "num_input_tokens_seen": 161357845, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.45825195, "step": 7520, "time_per_iteration": 2.6856133937835693 }, { "auxiliary_loss_clip": 0.01442677, "auxiliary_loss_mlp": 0.00556601, "balance_loss_clip": 1.18295217, "balance_loss_mlp": 0.51041889, "epoch": 0.4521869833157974, "flos": 18843622202880.0, "grad_norm": 98.090375144064, "language_loss": 0.81437778, "learning_rate": 2.4039936811025194e-06, "loss": 0.83437061, "num_input_tokens_seen": 161375160, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.46166992, "step": 7521, "time_per_iteration": 2.6849207878112793 }, { "auxiliary_loss_clip": 0.01443208, "auxiliary_loss_mlp": 0.00547526, "balance_loss_clip": 1.18076861, "balance_loss_mlp": 0.50163031, "epoch": 0.45224710656846534, "flos": 19788072456960.0, "grad_norm": 3.0805928684025505, "language_loss": 0.74709237, "learning_rate": 2.4036122400593343e-06, "loss": 0.76699972, "num_input_tokens_seen": 161393690, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.45922852, "step": 7522, "time_per_iteration": 2.6507527828216553 }, { "auxiliary_loss_clip": 0.01433387, "auxiliary_loss_mlp": 0.00544078, "balance_loss_clip": 1.17788196, "balance_loss_mlp": 0.5000658, "epoch": 0.4523072298211333, "flos": 28256131681920.0, "grad_norm": 7.825423562791712, "language_loss": 0.65770197, "learning_rate": 2.403230783711134e-06, "loss": 0.6774767, "num_input_tokens_seen": 161415015, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.43994141, "step": 7523, "time_per_iteration": 2.729522466659546 }, { "auxiliary_loss_clip": 0.01449169, "auxiliary_loss_mlp": 0.00529769, "balance_loss_clip": 1.18739295, "balance_loss_mlp": 0.48585242, "epoch": 0.45236735307380127, "flos": 11181039511680.0, "grad_norm": 146.22176085431983, "language_loss": 0.84126812, "learning_rate": 2.4028493120723813e-06, "loss": 0.86105752, "num_input_tokens_seen": 161432940, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.43920898, "step": 7524, "time_per_iteration": 2.6658365726470947 }, { "auxiliary_loss_clip": 0.01438266, "auxiliary_loss_mlp": 0.00508763, "balance_loss_clip": 1.18234932, "balance_loss_mlp": 0.46637216, "epoch": 0.45242747632646924, "flos": 22601386408320.0, "grad_norm": 7.211583913064756, "language_loss": 0.69649941, "learning_rate": 2.4024678251575417e-06, "loss": 0.71596974, "num_input_tokens_seen": 161452215, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.42358398, "step": 7525, "time_per_iteration": 2.665422201156616 }, { "auxiliary_loss_clip": 0.01432074, "auxiliary_loss_mlp": 0.00524651, "balance_loss_clip": 1.18326569, "balance_loss_mlp": 0.48428667, "epoch": 0.45248759957913726, "flos": 18256267008000.0, "grad_norm": 9.559409504466883, "language_loss": 0.83540034, "learning_rate": 2.402086322981083e-06, "loss": 0.85496759, "num_input_tokens_seen": 161469520, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.40332031, "step": 7526, "time_per_iteration": 2.6207313537597656 }, { "auxiliary_loss_clip": 0.01438249, "auxiliary_loss_mlp": 0.00533536, "balance_loss_clip": 1.18473566, "balance_loss_mlp": 0.49305257, "epoch": 0.4525477228318052, "flos": 22450094323200.0, "grad_norm": 5.8178366277881794, "language_loss": 0.8651908, "learning_rate": 2.40170480555747e-06, "loss": 0.88490868, "num_input_tokens_seen": 161487335, "router_z_loss_clip": 2.5390625, "router_z_loss_mlp": 0.40454102, "step": 7527, "time_per_iteration": 2.660675287246704 }, { "auxiliary_loss_clip": 0.01430624, "auxiliary_loss_mlp": 0.00510972, "balance_loss_clip": 1.17722619, "balance_loss_mlp": 0.47003552, "epoch": 0.4526078460844732, "flos": 29644869260160.0, "grad_norm": 7.661119786866352, "language_loss": 0.70648575, "learning_rate": 2.4013232729011706e-06, "loss": 0.72590172, "num_input_tokens_seen": 161510095, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.40966797, "step": 7528, "time_per_iteration": 2.7197182178497314 }, { "auxiliary_loss_clip": 0.0142655, "auxiliary_loss_mlp": 0.00545898, "balance_loss_clip": 1.17402637, "balance_loss_mlp": 0.50281578, "epoch": 0.45266796933714115, "flos": 23039747988480.0, "grad_norm": 18.280232616044394, "language_loss": 0.80112791, "learning_rate": 2.4009417250266525e-06, "loss": 0.8208524, "num_input_tokens_seen": 161528725, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.43066406, "step": 7529, "time_per_iteration": 2.658329963684082 }, { "auxiliary_loss_clip": 0.01430568, "auxiliary_loss_mlp": 0.00538483, "balance_loss_clip": 1.17482674, "balance_loss_mlp": 0.4952817, "epoch": 0.4527280925898091, "flos": 14428405411200.0, "grad_norm": 11.51695800436085, "language_loss": 0.80425882, "learning_rate": 2.400560161948384e-06, "loss": 0.82394934, "num_input_tokens_seen": 161547195, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.43212891, "step": 7530, "time_per_iteration": 2.6455655097961426 }, { "auxiliary_loss_clip": 0.01429415, "auxiliary_loss_mlp": 0.00549924, "balance_loss_clip": 1.17740345, "balance_loss_mlp": 0.50760448, "epoch": 0.4527882158424771, "flos": 22925515760640.0, "grad_norm": 23.694580062295593, "language_loss": 0.81326473, "learning_rate": 2.400178583680834e-06, "loss": 0.83305812, "num_input_tokens_seen": 161565565, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.4230957, "step": 7531, "time_per_iteration": 4.113640785217285 }, { "auxiliary_loss_clip": 0.01419408, "auxiliary_loss_mlp": 0.0053095, "balance_loss_clip": 1.17212045, "balance_loss_mlp": 0.49058539, "epoch": 0.45284833909514505, "flos": 25555326105600.0, "grad_norm": 754.6669815258074, "language_loss": 0.72552276, "learning_rate": 2.3997969902384717e-06, "loss": 0.74502629, "num_input_tokens_seen": 161586630, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.40380859, "step": 7532, "time_per_iteration": 2.736219644546509 }, { "auxiliary_loss_clip": 0.01418382, "auxiliary_loss_mlp": 0.00522368, "balance_loss_clip": 1.16963577, "balance_loss_mlp": 0.47976264, "epoch": 0.452908462347813, "flos": 18150007599360.0, "grad_norm": 10.251837705333461, "language_loss": 0.83094585, "learning_rate": 2.399415381635768e-06, "loss": 0.8503533, "num_input_tokens_seen": 161603815, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.42578125, "step": 7533, "time_per_iteration": 2.639467477798462 }, { "auxiliary_loss_clip": 0.01421638, "auxiliary_loss_mlp": 0.00563247, "balance_loss_clip": 1.16276085, "balance_loss_mlp": 0.51682675, "epoch": 0.452968585600481, "flos": 19062749122560.0, "grad_norm": 15.323347894748178, "language_loss": 0.90231979, "learning_rate": 2.3990337578871927e-06, "loss": 0.92216861, "num_input_tokens_seen": 161622900, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.46459961, "step": 7534, "time_per_iteration": 2.6816413402557373 }, { "auxiliary_loss_clip": 0.01405087, "auxiliary_loss_mlp": 0.00535945, "balance_loss_clip": 1.15587425, "balance_loss_mlp": 0.49095502, "epoch": 0.45302870885314894, "flos": 22051737515520.0, "grad_norm": 787.74803932213, "language_loss": 0.81196213, "learning_rate": 2.3986521190072176e-06, "loss": 0.83137238, "num_input_tokens_seen": 161641700, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.44995117, "step": 7535, "time_per_iteration": 5.7490270137786865 }, { "auxiliary_loss_clip": 0.0140971, "auxiliary_loss_mlp": 0.00492333, "balance_loss_clip": 1.16248465, "balance_loss_mlp": 0.45301783, "epoch": 0.4530888321058169, "flos": 20376217751040.0, "grad_norm": 134.97468164615978, "language_loss": 0.86859691, "learning_rate": 2.3982704650103138e-06, "loss": 0.88761735, "num_input_tokens_seen": 161661955, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.39306641, "step": 7536, "time_per_iteration": 2.7371819019317627 }, { "auxiliary_loss_clip": 0.01399777, "auxiliary_loss_mlp": 0.00489656, "balance_loss_clip": 1.14674997, "balance_loss_mlp": 0.44964981, "epoch": 0.4531489553584849, "flos": 14830425406080.0, "grad_norm": 20.55438820063087, "language_loss": 0.81005776, "learning_rate": 2.3978887959109544e-06, "loss": 0.82895207, "num_input_tokens_seen": 161679245, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.40014648, "step": 7537, "time_per_iteration": 2.639756441116333 }, { "auxiliary_loss_clip": 0.01396801, "auxiliary_loss_mlp": 0.00501212, "balance_loss_clip": 1.14890122, "balance_loss_mlp": 0.4621594, "epoch": 0.45320907861115284, "flos": 21944975316480.0, "grad_norm": 21.591274845369494, "language_loss": 0.8207773, "learning_rate": 2.3975071117236118e-06, "loss": 0.83975738, "num_input_tokens_seen": 161698795, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.390625, "step": 7538, "time_per_iteration": 2.652264356613159 }, { "auxiliary_loss_clip": 0.01294905, "auxiliary_loss_mlp": 0.00266763, "balance_loss_clip": 1.1333648, "balance_loss_mlp": 0.25360271, "epoch": 0.45326920186382086, "flos": 66251455038720.0, "grad_norm": 0.7957771001938359, "language_loss": 0.62082815, "learning_rate": 2.3971254124627593e-06, "loss": 0.63644481, "num_input_tokens_seen": 161761980, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.13183594, "step": 7539, "time_per_iteration": 3.1907784938812256 }, { "auxiliary_loss_clip": 0.01401923, "auxiliary_loss_mlp": 0.00464857, "balance_loss_clip": 1.1578629, "balance_loss_mlp": 0.42589915, "epoch": 0.4533293251164888, "flos": 14684233052160.0, "grad_norm": 18416.959974929934, "language_loss": 0.70765531, "learning_rate": 2.396743698142872e-06, "loss": 0.72632313, "num_input_tokens_seen": 161779455, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.38964844, "step": 7540, "time_per_iteration": 2.6831774711608887 }, { "auxiliary_loss_clip": 0.01409725, "auxiliary_loss_mlp": 0.0048348, "balance_loss_clip": 1.15302467, "balance_loss_mlp": 0.44142273, "epoch": 0.4533894483691568, "flos": 22601206840320.0, "grad_norm": 15.968056610546975, "language_loss": 0.92417252, "learning_rate": 2.396361968778424e-06, "loss": 0.94310462, "num_input_tokens_seen": 161798980, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.42041016, "step": 7541, "time_per_iteration": 4.136256694793701 }, { "auxiliary_loss_clip": 0.01392485, "auxiliary_loss_mlp": 0.00500176, "balance_loss_clip": 1.14500976, "balance_loss_mlp": 0.45852441, "epoch": 0.45344957162182475, "flos": 34751617666560.0, "grad_norm": 12.880920318805352, "language_loss": 0.81849569, "learning_rate": 2.395980224383889e-06, "loss": 0.83742225, "num_input_tokens_seen": 161819745, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.41625977, "step": 7542, "time_per_iteration": 2.732893943786621 }, { "auxiliary_loss_clip": 0.01390305, "auxiliary_loss_mlp": 0.00451728, "balance_loss_clip": 1.14602089, "balance_loss_mlp": 0.41224596, "epoch": 0.4535096948744927, "flos": 23550218121600.0, "grad_norm": 37.4049304772582, "language_loss": 0.8511759, "learning_rate": 2.395598464973746e-06, "loss": 0.86959624, "num_input_tokens_seen": 161838575, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.39526367, "step": 7543, "time_per_iteration": 2.6855037212371826 }, { "auxiliary_loss_clip": 0.01391011, "auxiliary_loss_mlp": 0.00477236, "balance_loss_clip": 1.14344168, "balance_loss_mlp": 0.43799269, "epoch": 0.4535698181271607, "flos": 25557552748800.0, "grad_norm": 53.16560567948009, "language_loss": 0.81230628, "learning_rate": 2.395216690562469e-06, "loss": 0.83098876, "num_input_tokens_seen": 161858590, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.39233398, "step": 7544, "time_per_iteration": 2.6775143146514893 }, { "auxiliary_loss_clip": 0.01398373, "auxiliary_loss_mlp": 0.00504708, "balance_loss_clip": 1.15055275, "balance_loss_mlp": 0.46286604, "epoch": 0.45362994137982865, "flos": 24864117713280.0, "grad_norm": 72.50017988571514, "language_loss": 0.839185, "learning_rate": 2.3948349011645355e-06, "loss": 0.85821581, "num_input_tokens_seen": 161878390, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.41870117, "step": 7545, "time_per_iteration": 2.6845386028289795 }, { "auxiliary_loss_clip": 0.01390863, "auxiliary_loss_mlp": 0.00434031, "balance_loss_clip": 1.14682555, "balance_loss_mlp": 0.39724261, "epoch": 0.4536900646324966, "flos": 30806794408320.0, "grad_norm": 17.6388814798412, "language_loss": 0.78258109, "learning_rate": 2.394453096794423e-06, "loss": 0.80083001, "num_input_tokens_seen": 161898610, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.36791992, "step": 7546, "time_per_iteration": 2.7356560230255127 }, { "auxiliary_loss_clip": 0.01413974, "auxiliary_loss_mlp": 0.0048157, "balance_loss_clip": 1.15938473, "balance_loss_mlp": 0.44261217, "epoch": 0.4537501878851646, "flos": 23404313076480.0, "grad_norm": 9.253442624818158, "language_loss": 0.82599688, "learning_rate": 2.394071277466609e-06, "loss": 0.84495229, "num_input_tokens_seen": 161918210, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.3894043, "step": 7547, "time_per_iteration": 2.712801694869995 }, { "auxiliary_loss_clip": 0.01396621, "auxiliary_loss_mlp": 0.00478534, "balance_loss_clip": 1.14558351, "balance_loss_mlp": 0.43843168, "epoch": 0.45381031113783254, "flos": 18149289327360.0, "grad_norm": 24.151522752542984, "language_loss": 0.76080489, "learning_rate": 2.393689443195573e-06, "loss": 0.77955645, "num_input_tokens_seen": 161936950, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.40063477, "step": 7548, "time_per_iteration": 2.6751809120178223 }, { "auxiliary_loss_clip": 0.01377627, "auxiliary_loss_mlp": 0.00445521, "balance_loss_clip": 1.13646996, "balance_loss_mlp": 0.4095439, "epoch": 0.4538704343905005, "flos": 25336666062720.0, "grad_norm": 80.48224068872496, "language_loss": 0.79872072, "learning_rate": 2.393307593995794e-06, "loss": 0.81695217, "num_input_tokens_seen": 161955550, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.35961914, "step": 7549, "time_per_iteration": 2.7075366973876953 }, { "auxiliary_loss_clip": 0.01394545, "auxiliary_loss_mlp": 0.00450403, "balance_loss_clip": 1.14384425, "balance_loss_mlp": 0.41468805, "epoch": 0.4539305576431685, "flos": 28731445378560.0, "grad_norm": 20.49449267955831, "language_loss": 0.71582103, "learning_rate": 2.392925729881751e-06, "loss": 0.73427045, "num_input_tokens_seen": 161976760, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.35717773, "step": 7550, "time_per_iteration": 2.7168548107147217 }, { "auxiliary_loss_clip": 0.01379851, "auxiliary_loss_mlp": 0.0046496, "balance_loss_clip": 1.13488519, "balance_loss_mlp": 0.42671788, "epoch": 0.45399068089583644, "flos": 22492397566080.0, "grad_norm": 4.25066675935797, "language_loss": 0.74873912, "learning_rate": 2.3925438508679263e-06, "loss": 0.76718724, "num_input_tokens_seen": 161996120, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.38232422, "step": 7551, "time_per_iteration": 2.7430081367492676 }, { "auxiliary_loss_clip": 0.01398153, "auxiliary_loss_mlp": 0.0047589, "balance_loss_clip": 1.14182544, "balance_loss_mlp": 0.4340471, "epoch": 0.45405080414850446, "flos": 12893403651840.0, "grad_norm": 7.398487771690533, "language_loss": 0.86284363, "learning_rate": 2.392161956968798e-06, "loss": 0.88158405, "num_input_tokens_seen": 162011125, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.41821289, "step": 7552, "time_per_iteration": 2.784442186355591 }, { "auxiliary_loss_clip": 0.01315759, "auxiliary_loss_mlp": 0.00107542, "balance_loss_clip": 1.14286697, "balance_loss_mlp": 0.0985302, "epoch": 0.4541109274011724, "flos": 59766919724160.0, "grad_norm": 0.810311917073569, "language_loss": 0.57880372, "learning_rate": 2.39178004819885e-06, "loss": 0.59303671, "num_input_tokens_seen": 162068705, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.09033203, "step": 7553, "time_per_iteration": 3.109130859375 }, { "auxiliary_loss_clip": 0.01387491, "auxiliary_loss_mlp": 0.00459154, "balance_loss_clip": 1.14137888, "balance_loss_mlp": 0.4219842, "epoch": 0.4541710506538404, "flos": 28511743841280.0, "grad_norm": 7.914112514850036, "language_loss": 0.81288075, "learning_rate": 2.3913981245725626e-06, "loss": 0.83134723, "num_input_tokens_seen": 162089655, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.37182617, "step": 7554, "time_per_iteration": 2.7354743480682373 }, { "auxiliary_loss_clip": 0.01403639, "auxiliary_loss_mlp": 0.00434468, "balance_loss_clip": 1.14491749, "balance_loss_mlp": 0.39663115, "epoch": 0.45423117390650836, "flos": 17675591742720.0, "grad_norm": 41.15643693443292, "language_loss": 0.84826684, "learning_rate": 2.3910161861044194e-06, "loss": 0.86664796, "num_input_tokens_seen": 162108465, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.37817383, "step": 7555, "time_per_iteration": 2.6685259342193604 }, { "auxiliary_loss_clip": 0.01386654, "auxiliary_loss_mlp": 0.00449438, "balance_loss_clip": 1.13840151, "balance_loss_mlp": 0.41052771, "epoch": 0.4542912971591763, "flos": 28072556248320.0, "grad_norm": 104.69087040068001, "language_loss": 0.77133596, "learning_rate": 2.390634232808903e-06, "loss": 0.78969681, "num_input_tokens_seen": 162129910, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.38916016, "step": 7556, "time_per_iteration": 2.761812210083008 }, { "auxiliary_loss_clip": 0.01409186, "auxiliary_loss_mlp": 0.00497142, "balance_loss_clip": 1.1503942, "balance_loss_mlp": 0.45451325, "epoch": 0.4543514204118443, "flos": 22671771108480.0, "grad_norm": 5.315675269143128, "language_loss": 0.6902771, "learning_rate": 2.3902522647004982e-06, "loss": 0.70934039, "num_input_tokens_seen": 162148840, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.42675781, "step": 7557, "time_per_iteration": 2.7455177307128906 }, { "auxiliary_loss_clip": 0.0130167, "auxiliary_loss_mlp": 0.0008219, "balance_loss_clip": 1.13097239, "balance_loss_mlp": 0.07413148, "epoch": 0.45441154366451225, "flos": 58216549921920.0, "grad_norm": 0.6656941516779946, "language_loss": 0.57159871, "learning_rate": 2.3898702817936875e-06, "loss": 0.5854373, "num_input_tokens_seen": 162208500, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.08056641, "step": 7558, "time_per_iteration": 3.132261037826538 }, { "auxiliary_loss_clip": 0.01411365, "auxiliary_loss_mlp": 0.00466145, "balance_loss_clip": 1.14999199, "balance_loss_mlp": 0.42549437, "epoch": 0.4544716669171802, "flos": 16764286763520.0, "grad_norm": 16.99316737911206, "language_loss": 0.64894962, "learning_rate": 2.3894882841029573e-06, "loss": 0.66772467, "num_input_tokens_seen": 162224650, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.40625, "step": 7559, "time_per_iteration": 2.757943868637085 }, { "auxiliary_loss_clip": 0.01405027, "auxiliary_loss_mlp": 0.0048778, "balance_loss_clip": 1.15123892, "balance_loss_mlp": 0.44472212, "epoch": 0.4545317901698482, "flos": 15925233991680.0, "grad_norm": 57.144020300106995, "language_loss": 0.78336072, "learning_rate": 2.389106271642792e-06, "loss": 0.80228877, "num_input_tokens_seen": 162242930, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.4309082, "step": 7560, "time_per_iteration": 2.6788315773010254 }, { "auxiliary_loss_clip": 0.01418926, "auxiliary_loss_mlp": 0.00439861, "balance_loss_clip": 1.15712512, "balance_loss_mlp": 0.39985472, "epoch": 0.45459191342251615, "flos": 17639752947840.0, "grad_norm": 8.45195234352107, "language_loss": 0.78098845, "learning_rate": 2.3887242444276775e-06, "loss": 0.7995764, "num_input_tokens_seen": 162261455, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.40014648, "step": 7561, "time_per_iteration": 2.671323537826538 }, { "auxiliary_loss_clip": 0.01402502, "auxiliary_loss_mlp": 0.00428371, "balance_loss_clip": 1.15416074, "balance_loss_mlp": 0.3908917, "epoch": 0.4546520366751841, "flos": 16176608346240.0, "grad_norm": 30.13400509643176, "language_loss": 0.90198398, "learning_rate": 2.3883422024721015e-06, "loss": 0.92029274, "num_input_tokens_seen": 162279725, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.375, "step": 7562, "time_per_iteration": 2.653895378112793 }, { "auxiliary_loss_clip": 0.01406511, "auxiliary_loss_mlp": 0.00395332, "balance_loss_clip": 1.157758, "balance_loss_mlp": 0.36064172, "epoch": 0.4547121599278521, "flos": 19751443562880.0, "grad_norm": 4.088856829581359, "language_loss": 0.94425738, "learning_rate": 2.38796014579055e-06, "loss": 0.96227586, "num_input_tokens_seen": 162297865, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.34692383, "step": 7563, "time_per_iteration": 2.7290005683898926 }, { "auxiliary_loss_clip": 0.01413271, "auxiliary_loss_mlp": 0.0043893, "balance_loss_clip": 1.15521264, "balance_loss_mlp": 0.39842236, "epoch": 0.45477228318052004, "flos": 19937461121280.0, "grad_norm": 12.988193882778504, "language_loss": 0.78602695, "learning_rate": 2.3875780743975097e-06, "loss": 0.80454898, "num_input_tokens_seen": 162316010, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.40478516, "step": 7564, "time_per_iteration": 2.6878821849823 }, { "auxiliary_loss_clip": 0.01403159, "auxiliary_loss_mlp": 0.00447452, "balance_loss_clip": 1.1421392, "balance_loss_mlp": 0.40866095, "epoch": 0.454832406433188, "flos": 21288312829440.0, "grad_norm": 2.8353115838090113, "language_loss": 0.73216462, "learning_rate": 2.3871959883074713e-06, "loss": 0.75067073, "num_input_tokens_seen": 162336115, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.38818359, "step": 7565, "time_per_iteration": 2.7408156394958496 }, { "auxiliary_loss_clip": 0.01397724, "auxiliary_loss_mlp": 0.00434289, "balance_loss_clip": 1.14721608, "balance_loss_mlp": 0.39704764, "epoch": 0.45489252968585603, "flos": 24498726612480.0, "grad_norm": 55.9976426084516, "language_loss": 0.85288984, "learning_rate": 2.386813887534922e-06, "loss": 0.87120998, "num_input_tokens_seen": 162355705, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.37231445, "step": 7566, "time_per_iteration": 2.7086780071258545 }, { "auxiliary_loss_clip": 0.01407017, "auxiliary_loss_mlp": 0.00417981, "balance_loss_clip": 1.14889073, "balance_loss_mlp": 0.37909546, "epoch": 0.454952652938524, "flos": 17092474352640.0, "grad_norm": 35.557495629215744, "language_loss": 0.79703265, "learning_rate": 2.3864317720943508e-06, "loss": 0.8152827, "num_input_tokens_seen": 162374055, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.38891602, "step": 7567, "time_per_iteration": 2.6516432762145996 }, { "auxiliary_loss_clip": 0.01412723, "auxiliary_loss_mlp": 0.00421675, "balance_loss_clip": 1.15768218, "balance_loss_mlp": 0.3847678, "epoch": 0.45501277619119196, "flos": 27630387826560.0, "grad_norm": 244.17996734995018, "language_loss": 0.85779965, "learning_rate": 2.386049642000249e-06, "loss": 0.87614357, "num_input_tokens_seen": 162393560, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.36914062, "step": 7568, "time_per_iteration": 2.7208304405212402 }, { "auxiliary_loss_clip": 0.0143528, "auxiliary_loss_mlp": 0.00395743, "balance_loss_clip": 1.16716266, "balance_loss_mlp": 0.35874087, "epoch": 0.4550728994438599, "flos": 19974664632960.0, "grad_norm": 4.044342095200329, "language_loss": 0.88582605, "learning_rate": 2.3856674972671055e-06, "loss": 0.90413624, "num_input_tokens_seen": 162413170, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.37036133, "step": 7569, "time_per_iteration": 2.779404640197754 }, { "auxiliary_loss_clip": 0.01422708, "auxiliary_loss_mlp": 0.00433889, "balance_loss_clip": 1.1581099, "balance_loss_mlp": 0.3954556, "epoch": 0.4551330226965279, "flos": 26066873646720.0, "grad_norm": 46.18699037321896, "language_loss": 0.80460036, "learning_rate": 2.385285337909412e-06, "loss": 0.82316625, "num_input_tokens_seen": 162434080, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.3840332, "step": 7570, "time_per_iteration": 2.7331037521362305 }, { "auxiliary_loss_clip": 0.01418785, "auxiliary_loss_mlp": 0.00411083, "balance_loss_clip": 1.1626277, "balance_loss_mlp": 0.37601143, "epoch": 0.45519314594919585, "flos": 32781091501440.0, "grad_norm": 6.2685219474004645, "language_loss": 0.80413347, "learning_rate": 2.3849031639416596e-06, "loss": 0.82243216, "num_input_tokens_seen": 162455445, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.35058594, "step": 7571, "time_per_iteration": 2.7777576446533203 }, { "auxiliary_loss_clip": 0.01411661, "auxiliary_loss_mlp": 0.00418647, "balance_loss_clip": 1.1612978, "balance_loss_mlp": 0.38257474, "epoch": 0.4552532692018638, "flos": 19172671718400.0, "grad_norm": 5.956624558436569, "language_loss": 0.84781361, "learning_rate": 2.3845209753783414e-06, "loss": 0.86611676, "num_input_tokens_seen": 162474940, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.36108398, "step": 7572, "time_per_iteration": 2.658423662185669 }, { "auxiliary_loss_clip": 0.01430694, "auxiliary_loss_mlp": 0.00469057, "balance_loss_clip": 1.16699076, "balance_loss_mlp": 0.42995673, "epoch": 0.4553133924545318, "flos": 26027156183040.0, "grad_norm": 13.60266532844963, "language_loss": 0.79508913, "learning_rate": 2.3841387722339486e-06, "loss": 0.81408668, "num_input_tokens_seen": 162493340, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.39086914, "step": 7573, "time_per_iteration": 4.17199182510376 }, { "auxiliary_loss_clip": 0.01442062, "auxiliary_loss_mlp": 0.00479091, "balance_loss_clip": 1.17490005, "balance_loss_mlp": 0.43436348, "epoch": 0.45537351570719975, "flos": 30661535808000.0, "grad_norm": 26.218731854409505, "language_loss": 0.80182374, "learning_rate": 2.3837565545229748e-06, "loss": 0.82103521, "num_input_tokens_seen": 162514360, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.44677734, "step": 7574, "time_per_iteration": 2.7613720893859863 }, { "auxiliary_loss_clip": 0.01424319, "auxiliary_loss_mlp": 0.00421441, "balance_loss_clip": 1.16540742, "balance_loss_mlp": 0.38503399, "epoch": 0.4554336389598677, "flos": 24353396184960.0, "grad_norm": 41.071127179054244, "language_loss": 0.77046835, "learning_rate": 2.383374322259915e-06, "loss": 0.78892601, "num_input_tokens_seen": 162535240, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.36401367, "step": 7575, "time_per_iteration": 2.7275354862213135 }, { "auxiliary_loss_clip": 0.01410835, "auxiliary_loss_mlp": 0.00434531, "balance_loss_clip": 1.15661216, "balance_loss_mlp": 0.3966462, "epoch": 0.4554937622125357, "flos": 20557925677440.0, "grad_norm": 6.813289329247483, "language_loss": 0.79669189, "learning_rate": 2.3829920754592617e-06, "loss": 0.81514555, "num_input_tokens_seen": 162553880, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.37890625, "step": 7576, "time_per_iteration": 2.666214942932129 }, { "auxiliary_loss_clip": 0.01418329, "auxiliary_loss_mlp": 0.00385098, "balance_loss_clip": 1.16381645, "balance_loss_mlp": 0.35069418, "epoch": 0.45555388546520365, "flos": 22820764723200.0, "grad_norm": 10.984060172419102, "language_loss": 0.72564638, "learning_rate": 2.382609814135511e-06, "loss": 0.74368066, "num_input_tokens_seen": 162574485, "router_z_loss_clip": 2.5390625, "router_z_loss_mlp": 0.34399414, "step": 7577, "time_per_iteration": 5.586750745773315 }, { "auxiliary_loss_clip": 0.01435723, "auxiliary_loss_mlp": 0.00417617, "balance_loss_clip": 1.17260623, "balance_loss_mlp": 0.37832606, "epoch": 0.4556140087178716, "flos": 21725992051200.0, "grad_norm": 7.574604678329882, "language_loss": 0.80285048, "learning_rate": 2.382227538303157e-06, "loss": 0.82138395, "num_input_tokens_seen": 162595130, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.39306641, "step": 7578, "time_per_iteration": 2.7509186267852783 }, { "auxiliary_loss_clip": 0.01416855, "auxiliary_loss_mlp": 0.00403712, "balance_loss_clip": 1.16311026, "balance_loss_mlp": 0.37097734, "epoch": 0.45567413197053963, "flos": 25994513698560.0, "grad_norm": 157.1544504619399, "language_loss": 0.76598823, "learning_rate": 2.381845247976697e-06, "loss": 0.78419387, "num_input_tokens_seen": 162615720, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.32714844, "step": 7579, "time_per_iteration": 2.7198898792266846 }, { "auxiliary_loss_clip": 0.01426924, "auxiliary_loss_mlp": 0.00394864, "balance_loss_clip": 1.16614199, "balance_loss_mlp": 0.36093736, "epoch": 0.4557342552232076, "flos": 21537604195200.0, "grad_norm": 24.99644659561302, "language_loss": 0.8406052, "learning_rate": 2.381462943170627e-06, "loss": 0.85882306, "num_input_tokens_seen": 162635825, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.33959961, "step": 7580, "time_per_iteration": 2.898420572280884 }, { "auxiliary_loss_clip": 0.01429634, "auxiliary_loss_mlp": 0.00385785, "balance_loss_clip": 1.17552328, "balance_loss_mlp": 0.35145292, "epoch": 0.45579437847587556, "flos": 40001972647680.0, "grad_norm": 912.9577948483096, "language_loss": 0.74337888, "learning_rate": 2.381080623899444e-06, "loss": 0.76153314, "num_input_tokens_seen": 162659130, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.34326172, "step": 7581, "time_per_iteration": 2.8408920764923096 }, { "auxiliary_loss_clip": 0.0141966, "auxiliary_loss_mlp": 0.00384075, "balance_loss_clip": 1.16976428, "balance_loss_mlp": 0.34990925, "epoch": 0.4558545017285435, "flos": 31138501530240.0, "grad_norm": 15.408074537333219, "language_loss": 0.77553368, "learning_rate": 2.3806982901776455e-06, "loss": 0.793571, "num_input_tokens_seen": 162681665, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.34179688, "step": 7582, "time_per_iteration": 2.8271267414093018 }, { "auxiliary_loss_clip": 0.0143103, "auxiliary_loss_mlp": 0.00418981, "balance_loss_clip": 1.16626561, "balance_loss_mlp": 0.37859261, "epoch": 0.4559146249812115, "flos": 21725776569600.0, "grad_norm": 68.31996912781864, "language_loss": 0.7857877, "learning_rate": 2.380315942019729e-06, "loss": 0.80428779, "num_input_tokens_seen": 162702040, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.40356445, "step": 7583, "time_per_iteration": 2.7957241535186768 }, { "auxiliary_loss_clip": 0.01435093, "auxiliary_loss_mlp": 0.00417258, "balance_loss_clip": 1.17078447, "balance_loss_mlp": 0.38128081, "epoch": 0.45597474823387946, "flos": 23805973935360.0, "grad_norm": 19.080165849807504, "language_loss": 0.79956806, "learning_rate": 2.379933579440195e-06, "loss": 0.81809163, "num_input_tokens_seen": 162722375, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.35986328, "step": 7584, "time_per_iteration": 4.11084771156311 }, { "auxiliary_loss_clip": 0.01438141, "auxiliary_loss_mlp": 0.00432536, "balance_loss_clip": 1.18032241, "balance_loss_mlp": 0.39357847, "epoch": 0.4560348714865474, "flos": 31905661230720.0, "grad_norm": 4.470858374547008, "language_loss": 0.72675538, "learning_rate": 2.379551202453541e-06, "loss": 0.74546212, "num_input_tokens_seen": 162746095, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.38964844, "step": 7585, "time_per_iteration": 2.774897336959839 }, { "auxiliary_loss_clip": 0.01433223, "auxiliary_loss_mlp": 0.00421434, "balance_loss_clip": 1.17362821, "balance_loss_mlp": 0.38507539, "epoch": 0.4560949947392154, "flos": 22048828513920.0, "grad_norm": 7.378395418962431, "language_loss": 0.81955588, "learning_rate": 2.379168811074267e-06, "loss": 0.8381024, "num_input_tokens_seen": 162766330, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.36352539, "step": 7586, "time_per_iteration": 2.6550371646881104 }, { "auxiliary_loss_clip": 0.01418912, "auxiliary_loss_mlp": 0.00381223, "balance_loss_clip": 1.16655779, "balance_loss_mlp": 0.34770164, "epoch": 0.45615511799188335, "flos": 24571804832640.0, "grad_norm": 1112.6829661335764, "language_loss": 0.83850008, "learning_rate": 2.3787864053168747e-06, "loss": 0.85650134, "num_input_tokens_seen": 162784755, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.33569336, "step": 7587, "time_per_iteration": 2.7632453441619873 }, { "auxiliary_loss_clip": 0.01429045, "auxiliary_loss_mlp": 0.00466046, "balance_loss_clip": 1.16668189, "balance_loss_mlp": 0.42699328, "epoch": 0.4562152412445513, "flos": 18330709944960.0, "grad_norm": 74.23460514733344, "language_loss": 0.77569294, "learning_rate": 2.378403985195863e-06, "loss": 0.79464388, "num_input_tokens_seen": 162803850, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.39038086, "step": 7588, "time_per_iteration": 2.6846091747283936 }, { "auxiliary_loss_clip": 0.01416529, "auxiliary_loss_mlp": 0.00413839, "balance_loss_clip": 1.16688204, "balance_loss_mlp": 0.37855312, "epoch": 0.4562753644972193, "flos": 13516525814400.0, "grad_norm": 18.099436050125565, "language_loss": 0.8491661, "learning_rate": 2.378021550725735e-06, "loss": 0.86746979, "num_input_tokens_seen": 162820775, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.35253906, "step": 7589, "time_per_iteration": 2.6110925674438477 }, { "auxiliary_loss_clip": 0.01407464, "auxiliary_loss_mlp": 0.00386201, "balance_loss_clip": 1.15674376, "balance_loss_mlp": 0.35277507, "epoch": 0.45633548774988725, "flos": 29639697701760.0, "grad_norm": 53.42490418097034, "language_loss": 0.70740712, "learning_rate": 2.377639101920992e-06, "loss": 0.72534376, "num_input_tokens_seen": 162839695, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.33422852, "step": 7590, "time_per_iteration": 2.7345635890960693 }, { "auxiliary_loss_clip": 0.01420415, "auxiliary_loss_mlp": 0.00402423, "balance_loss_clip": 1.16492796, "balance_loss_mlp": 0.369748, "epoch": 0.4563956110025552, "flos": 22233409528320.0, "grad_norm": 10.686067809921528, "language_loss": 0.78167713, "learning_rate": 2.377256638796135e-06, "loss": 0.79990554, "num_input_tokens_seen": 162856095, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.32678223, "step": 7591, "time_per_iteration": 2.654724359512329 }, { "auxiliary_loss_clip": 0.01421847, "auxiliary_loss_mlp": 0.00435046, "balance_loss_clip": 1.16855955, "balance_loss_mlp": 0.39518288, "epoch": 0.45645573425522323, "flos": 17092043389440.0, "grad_norm": 42.46724754714922, "language_loss": 0.84599721, "learning_rate": 2.3768741613656695e-06, "loss": 0.86456609, "num_input_tokens_seen": 162874070, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.39892578, "step": 7592, "time_per_iteration": 2.6879401206970215 }, { "auxiliary_loss_clip": 0.01417616, "auxiliary_loss_mlp": 0.00388466, "balance_loss_clip": 1.1633141, "balance_loss_mlp": 0.35248888, "epoch": 0.4565158575078912, "flos": 20332334309760.0, "grad_norm": 45.9361923495634, "language_loss": 0.75669312, "learning_rate": 2.376491669644098e-06, "loss": 0.77475399, "num_input_tokens_seen": 162891000, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.35986328, "step": 7593, "time_per_iteration": 2.7698769569396973 }, { "auxiliary_loss_clip": 0.01409487, "auxiliary_loss_mlp": 0.00416676, "balance_loss_clip": 1.15869141, "balance_loss_mlp": 0.38098449, "epoch": 0.45657598076055916, "flos": 23983013093760.0, "grad_norm": 2.487200039177754, "language_loss": 0.89371818, "learning_rate": 2.3761091636459248e-06, "loss": 0.91197979, "num_input_tokens_seen": 162910120, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.35717773, "step": 7594, "time_per_iteration": 2.754897356033325 }, { "auxiliary_loss_clip": 0.0130142, "auxiliary_loss_mlp": 0.00094047, "balance_loss_clip": 1.1364665, "balance_loss_mlp": 0.08555911, "epoch": 0.45663610401322713, "flos": 69364297526400.0, "grad_norm": 0.8048262517647888, "language_loss": 0.52502918, "learning_rate": 2.375726643385654e-06, "loss": 0.53898382, "num_input_tokens_seen": 162963720, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.08496094, "step": 7595, "time_per_iteration": 3.160919666290283 }, { "auxiliary_loss_clip": 0.01420011, "auxiliary_loss_mlp": 0.00425359, "balance_loss_clip": 1.16171622, "balance_loss_mlp": 0.38771248, "epoch": 0.4566962272658951, "flos": 15149095891200.0, "grad_norm": 6.062413196030684, "language_loss": 0.93539107, "learning_rate": 2.3753441088777915e-06, "loss": 0.95384479, "num_input_tokens_seen": 162975760, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.37670898, "step": 7596, "time_per_iteration": 2.647336721420288 }, { "auxiliary_loss_clip": 0.01416795, "auxiliary_loss_mlp": 0.00408983, "balance_loss_clip": 1.16559267, "balance_loss_mlp": 0.3734107, "epoch": 0.45675635051856306, "flos": 18697465762560.0, "grad_norm": 2.2004467664045926, "language_loss": 0.83527803, "learning_rate": 2.374961560136843e-06, "loss": 0.85353577, "num_input_tokens_seen": 162994865, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.35546875, "step": 7597, "time_per_iteration": 2.6661739349365234 }, { "auxiliary_loss_clip": 0.01423718, "auxiliary_loss_mlp": 0.00412637, "balance_loss_clip": 1.16514909, "balance_loss_mlp": 0.37527674, "epoch": 0.456816473771231, "flos": 19098300608640.0, "grad_norm": 52.87763053245632, "language_loss": 0.84305143, "learning_rate": 2.374578997177314e-06, "loss": 0.86141497, "num_input_tokens_seen": 163014730, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.37353516, "step": 7598, "time_per_iteration": 2.6853725910186768 }, { "auxiliary_loss_clip": 0.01406618, "auxiliary_loss_mlp": 0.00394877, "balance_loss_clip": 1.15545285, "balance_loss_mlp": 0.36002019, "epoch": 0.456876597023899, "flos": 28950069507840.0, "grad_norm": 6.110755640848057, "language_loss": 0.76814032, "learning_rate": 2.374196420013712e-06, "loss": 0.78615534, "num_input_tokens_seen": 163033405, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.34863281, "step": 7599, "time_per_iteration": 2.7635538578033447 }, { "auxiliary_loss_clip": 0.01400631, "auxiliary_loss_mlp": 0.00406503, "balance_loss_clip": 1.15278018, "balance_loss_mlp": 0.36947656, "epoch": 0.45693672027656695, "flos": 23289470317440.0, "grad_norm": 9.423401603051845, "language_loss": 0.75909317, "learning_rate": 2.373813828660544e-06, "loss": 0.77716452, "num_input_tokens_seen": 163051400, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.37036133, "step": 7600, "time_per_iteration": 2.690176010131836 }, { "auxiliary_loss_clip": 0.01425173, "auxiliary_loss_mlp": 0.00428418, "balance_loss_clip": 1.1676867, "balance_loss_mlp": 0.39146292, "epoch": 0.4569968435292349, "flos": 20558212986240.0, "grad_norm": 87.01000443008517, "language_loss": 0.84548783, "learning_rate": 2.373431223132319e-06, "loss": 0.86402375, "num_input_tokens_seen": 163069250, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.36962891, "step": 7601, "time_per_iteration": 2.665968656539917 }, { "auxiliary_loss_clip": 0.01416148, "auxiliary_loss_mlp": 0.00382778, "balance_loss_clip": 1.16175556, "balance_loss_mlp": 0.35021019, "epoch": 0.4570569667819029, "flos": 41282619223680.0, "grad_norm": 15.46872564753309, "language_loss": 0.78987145, "learning_rate": 2.3730486034435448e-06, "loss": 0.80786073, "num_input_tokens_seen": 163091755, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.32568359, "step": 7602, "time_per_iteration": 2.847581624984741 }, { "auxiliary_loss_clip": 0.01424181, "auxiliary_loss_mlp": 0.00411097, "balance_loss_clip": 1.16571736, "balance_loss_mlp": 0.37304568, "epoch": 0.45711709003457085, "flos": 26031573555840.0, "grad_norm": 2.389255051961173, "language_loss": 0.81465137, "learning_rate": 2.372665969608729e-06, "loss": 0.83300424, "num_input_tokens_seen": 163111600, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.38012695, "step": 7603, "time_per_iteration": 2.8188705444335938 }, { "auxiliary_loss_clip": 0.01395767, "auxiliary_loss_mlp": 0.00391232, "balance_loss_clip": 1.14890003, "balance_loss_mlp": 0.35716161, "epoch": 0.4571772132872388, "flos": 22158068751360.0, "grad_norm": 5.924937198503358, "language_loss": 0.86805904, "learning_rate": 2.372283321642383e-06, "loss": 0.88592899, "num_input_tokens_seen": 163127350, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.34082031, "step": 7604, "time_per_iteration": 2.7042856216430664 }, { "auxiliary_loss_clip": 0.01414793, "auxiliary_loss_mlp": 0.00390175, "balance_loss_clip": 1.15830827, "balance_loss_mlp": 0.35527027, "epoch": 0.45723733653990684, "flos": 23878872587520.0, "grad_norm": 18.50935827562512, "language_loss": 0.94087529, "learning_rate": 2.371900659559016e-06, "loss": 0.95892489, "num_input_tokens_seen": 163145855, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.34887695, "step": 7605, "time_per_iteration": 2.7235891819000244 }, { "auxiliary_loss_clip": 0.01404949, "auxiliary_loss_mlp": 0.00419005, "balance_loss_clip": 1.15479064, "balance_loss_mlp": 0.38290796, "epoch": 0.4572974597925748, "flos": 16871803148160.0, "grad_norm": 564.1555297609431, "language_loss": 0.79339451, "learning_rate": 2.371517983373138e-06, "loss": 0.81163406, "num_input_tokens_seen": 163163830, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.36083984, "step": 7606, "time_per_iteration": 2.6096878051757812 }, { "auxiliary_loss_clip": 0.01407618, "auxiliary_loss_mlp": 0.00417513, "balance_loss_clip": 1.15461814, "balance_loss_mlp": 0.38077304, "epoch": 0.45735758304524277, "flos": 13771491528960.0, "grad_norm": 80.8582275914916, "language_loss": 0.86439431, "learning_rate": 2.371135293099262e-06, "loss": 0.88264561, "num_input_tokens_seen": 163180700, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.36743164, "step": 7607, "time_per_iteration": 2.6782703399658203 }, { "auxiliary_loss_clip": 0.0140279, "auxiliary_loss_mlp": 0.00374666, "balance_loss_clip": 1.15300965, "balance_loss_mlp": 0.34123927, "epoch": 0.45741770629791073, "flos": 21100750986240.0, "grad_norm": 3.093280331836067, "language_loss": 0.86123782, "learning_rate": 2.3707525887518982e-06, "loss": 0.87901235, "num_input_tokens_seen": 163199450, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.33398438, "step": 7608, "time_per_iteration": 2.68945574760437 }, { "auxiliary_loss_clip": 0.01404451, "auxiliary_loss_mlp": 0.00376128, "balance_loss_clip": 1.15754879, "balance_loss_mlp": 0.34147358, "epoch": 0.4574778295505787, "flos": 23112898035840.0, "grad_norm": 17.531496126162345, "language_loss": 0.75335377, "learning_rate": 2.370369870345559e-06, "loss": 0.77115953, "num_input_tokens_seen": 163217875, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.34655762, "step": 7609, "time_per_iteration": 2.6796412467956543 }, { "auxiliary_loss_clip": 0.01404628, "auxiliary_loss_mlp": 0.00397746, "balance_loss_clip": 1.15371585, "balance_loss_mlp": 0.36405766, "epoch": 0.45753795280324666, "flos": 24352929308160.0, "grad_norm": 2.888809037105175, "language_loss": 0.86616755, "learning_rate": 2.369987137894757e-06, "loss": 0.88419127, "num_input_tokens_seen": 163237430, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.33666992, "step": 7610, "time_per_iteration": 2.7877793312072754 }, { "auxiliary_loss_clip": 0.01402017, "auxiliary_loss_mlp": 0.00399276, "balance_loss_clip": 1.15378487, "balance_loss_mlp": 0.36358476, "epoch": 0.4575980760559146, "flos": 16653789550080.0, "grad_norm": 38.262123412470764, "language_loss": 0.89153421, "learning_rate": 2.3696043914140057e-06, "loss": 0.90954721, "num_input_tokens_seen": 163253905, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.35693359, "step": 7611, "time_per_iteration": 2.7945940494537354 }, { "auxiliary_loss_clip": 0.01413853, "auxiliary_loss_mlp": 0.00390762, "balance_loss_clip": 1.16269588, "balance_loss_mlp": 0.35809833, "epoch": 0.4576581993085826, "flos": 35911423912320.0, "grad_norm": 424.54078627136573, "language_loss": 0.80471337, "learning_rate": 2.369221630917819e-06, "loss": 0.82275951, "num_input_tokens_seen": 163274285, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.3269043, "step": 7612, "time_per_iteration": 2.772831916809082 }, { "auxiliary_loss_clip": 0.01379826, "auxiliary_loss_mlp": 0.00391869, "balance_loss_clip": 1.13786685, "balance_loss_mlp": 0.35667843, "epoch": 0.45771832256125056, "flos": 20080421251200.0, "grad_norm": 8.354411700478733, "language_loss": 0.90118754, "learning_rate": 2.368838856420711e-06, "loss": 0.91890448, "num_input_tokens_seen": 163293150, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.35180664, "step": 7613, "time_per_iteration": 2.6236801147460938 }, { "auxiliary_loss_clip": 0.0139502, "auxiliary_loss_mlp": 0.00401874, "balance_loss_clip": 1.15037251, "balance_loss_mlp": 0.36496711, "epoch": 0.4577784458139185, "flos": 10744329957120.0, "grad_norm": 117.69026218656552, "language_loss": 0.83040088, "learning_rate": 2.3684560679371965e-06, "loss": 0.84836984, "num_input_tokens_seen": 163310065, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.36938477, "step": 7614, "time_per_iteration": 2.6219818592071533 }, { "auxiliary_loss_clip": 0.01386079, "auxiliary_loss_mlp": 0.0038923, "balance_loss_clip": 1.14610839, "balance_loss_mlp": 0.35427749, "epoch": 0.4578385690665865, "flos": 21907269014400.0, "grad_norm": 3.8957738357923772, "language_loss": 0.78743303, "learning_rate": 2.368073265481791e-06, "loss": 0.80518609, "num_input_tokens_seen": 163329415, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.34960938, "step": 7615, "time_per_iteration": 4.1506030559539795 }, { "auxiliary_loss_clip": 0.01298201, "auxiliary_loss_mlp": 0.00112547, "balance_loss_clip": 1.13094378, "balance_loss_mlp": 0.10463192, "epoch": 0.45789869231925445, "flos": 64758286667520.0, "grad_norm": 0.7508989200259062, "language_loss": 0.57603586, "learning_rate": 2.3676904490690105e-06, "loss": 0.59014332, "num_input_tokens_seen": 163385875, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.07910156, "step": 7616, "time_per_iteration": 3.069803237915039 }, { "auxiliary_loss_clip": 0.01398456, "auxiliary_loss_mlp": 0.00425503, "balance_loss_clip": 1.15115666, "balance_loss_mlp": 0.38780886, "epoch": 0.4579588155719224, "flos": 16144001775360.0, "grad_norm": 5.597852264832207, "language_loss": 0.78961629, "learning_rate": 2.3673076187133704e-06, "loss": 0.80785584, "num_input_tokens_seen": 163405170, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.37695312, "step": 7617, "time_per_iteration": 2.6174890995025635 }, { "auxiliary_loss_clip": 0.01406461, "auxiliary_loss_mlp": 0.00388861, "balance_loss_clip": 1.15941072, "balance_loss_mlp": 0.35305023, "epoch": 0.45801893882459044, "flos": 21395541905280.0, "grad_norm": 5.657423201595948, "language_loss": 0.84205055, "learning_rate": 2.36692477442939e-06, "loss": 0.86000371, "num_input_tokens_seen": 163423155, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.35839844, "step": 7618, "time_per_iteration": 2.636505603790283 }, { "auxiliary_loss_clip": 0.01400332, "auxiliary_loss_mlp": 0.00359676, "balance_loss_clip": 1.15319943, "balance_loss_mlp": 0.3277514, "epoch": 0.4580790620772584, "flos": 19536554448000.0, "grad_norm": 45.57102968009295, "language_loss": 0.84998304, "learning_rate": 2.366541916231585e-06, "loss": 0.86758316, "num_input_tokens_seen": 163442450, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.3190918, "step": 7619, "time_per_iteration": 4.100680828094482 }, { "auxiliary_loss_clip": 0.01399194, "auxiliary_loss_mlp": 0.00359632, "balance_loss_clip": 1.15649581, "balance_loss_mlp": 0.32796967, "epoch": 0.45813918532992637, "flos": 16581070465920.0, "grad_norm": 42.31836650330557, "language_loss": 0.7755028, "learning_rate": 2.366159044134473e-06, "loss": 0.793091, "num_input_tokens_seen": 163459810, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.31640625, "step": 7620, "time_per_iteration": 4.15244722366333 }, { "auxiliary_loss_clip": 0.01384733, "auxiliary_loss_mlp": 0.00364139, "balance_loss_clip": 1.14474607, "balance_loss_mlp": 0.3330735, "epoch": 0.45819930858259433, "flos": 42230301701760.0, "grad_norm": 2.6368067834573488, "language_loss": 0.81949276, "learning_rate": 2.3657761581525748e-06, "loss": 0.83698148, "num_input_tokens_seen": 163482970, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.31054688, "step": 7621, "time_per_iteration": 2.8252310752868652 }, { "auxiliary_loss_clip": 0.01304226, "auxiliary_loss_mlp": 0.00137324, "balance_loss_clip": 1.13917351, "balance_loss_mlp": 0.126261, "epoch": 0.4582594318352623, "flos": 63714795638400.0, "grad_norm": 0.7581395888896513, "language_loss": 0.64390123, "learning_rate": 2.3653932583004063e-06, "loss": 0.65831673, "num_input_tokens_seen": 163545330, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.11083984, "step": 7622, "time_per_iteration": 3.174578905105591 }, { "auxiliary_loss_clip": 0.01392196, "auxiliary_loss_mlp": 0.00382, "balance_loss_clip": 1.1489861, "balance_loss_mlp": 0.34745342, "epoch": 0.45831955508793026, "flos": 26869979882880.0, "grad_norm": 8.797827543306832, "language_loss": 0.8566317, "learning_rate": 2.3650103445924903e-06, "loss": 0.87437367, "num_input_tokens_seen": 163564620, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.3449707, "step": 7623, "time_per_iteration": 2.7331736087799072 }, { "auxiliary_loss_clip": 0.01398788, "auxiliary_loss_mlp": 0.00422184, "balance_loss_clip": 1.15296674, "balance_loss_mlp": 0.38398927, "epoch": 0.45837967834059823, "flos": 18733951002240.0, "grad_norm": 60.033737322821736, "language_loss": 0.77292979, "learning_rate": 2.3646274170433452e-06, "loss": 0.79113948, "num_input_tokens_seen": 163581010, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.38183594, "step": 7624, "time_per_iteration": 2.626006603240967 }, { "auxiliary_loss_clip": 0.01392563, "auxiliary_loss_mlp": 0.0038674, "balance_loss_clip": 1.14533865, "balance_loss_mlp": 0.34933221, "epoch": 0.4584398015932662, "flos": 21178102924800.0, "grad_norm": 64.40303583925996, "language_loss": 0.80717987, "learning_rate": 2.364244475667491e-06, "loss": 0.82497287, "num_input_tokens_seen": 163599955, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.37402344, "step": 7625, "time_per_iteration": 2.8010692596435547 }, { "auxiliary_loss_clip": 0.01412987, "auxiliary_loss_mlp": 0.00390852, "balance_loss_clip": 1.16021883, "balance_loss_mlp": 0.35661548, "epoch": 0.45849992484593416, "flos": 19790047704960.0, "grad_norm": 65.43783362323063, "language_loss": 0.84088516, "learning_rate": 2.363861520479451e-06, "loss": 0.85892355, "num_input_tokens_seen": 163618545, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.3425293, "step": 7626, "time_per_iteration": 4.094861745834351 }, { "auxiliary_loss_clip": 0.01403375, "auxiliary_loss_mlp": 0.00379905, "balance_loss_clip": 1.1563406, "balance_loss_mlp": 0.34652635, "epoch": 0.4585600480986021, "flos": 18223265387520.0, "grad_norm": 64.42590879749145, "language_loss": 0.9018538, "learning_rate": 2.3634785514937445e-06, "loss": 0.91968662, "num_input_tokens_seen": 163636055, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.33361816, "step": 7627, "time_per_iteration": 2.605846881866455 }, { "auxiliary_loss_clip": 0.01413129, "auxiliary_loss_mlp": 0.00412138, "balance_loss_clip": 1.15885901, "balance_loss_mlp": 0.37589836, "epoch": 0.4586201713512701, "flos": 29022213974400.0, "grad_norm": 15.435810044534561, "language_loss": 0.75807273, "learning_rate": 2.3630955687248953e-06, "loss": 0.77632535, "num_input_tokens_seen": 163657485, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.36230469, "step": 7628, "time_per_iteration": 2.7360129356384277 }, { "auxiliary_loss_clip": 0.0139258, "auxiliary_loss_mlp": 0.00401369, "balance_loss_clip": 1.14685345, "balance_loss_mlp": 0.36431855, "epoch": 0.45868029460393805, "flos": 23404600385280.0, "grad_norm": 267.8529474441672, "language_loss": 0.83394051, "learning_rate": 2.3627125721874265e-06, "loss": 0.85187995, "num_input_tokens_seen": 163676030, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.37036133, "step": 7629, "time_per_iteration": 2.661344528198242 }, { "auxiliary_loss_clip": 0.01412301, "auxiliary_loss_mlp": 0.00407876, "balance_loss_clip": 1.15810561, "balance_loss_mlp": 0.3715167, "epoch": 0.458740417856606, "flos": 18221972497920.0, "grad_norm": 6.649513197919966, "language_loss": 0.87807107, "learning_rate": 2.3623295618958595e-06, "loss": 0.89627278, "num_input_tokens_seen": 163694490, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.36328125, "step": 7630, "time_per_iteration": 2.6195521354675293 }, { "auxiliary_loss_clip": 0.01398115, "auxiliary_loss_mlp": 0.00400203, "balance_loss_clip": 1.1494664, "balance_loss_mlp": 0.36308098, "epoch": 0.458800541109274, "flos": 34568760504960.0, "grad_norm": 119.78780803390171, "language_loss": 0.78440535, "learning_rate": 2.3619465378647198e-06, "loss": 0.80238855, "num_input_tokens_seen": 163717035, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.37133789, "step": 7631, "time_per_iteration": 2.7584073543548584 }, { "auxiliary_loss_clip": 0.01410489, "auxiliary_loss_mlp": 0.00392315, "balance_loss_clip": 1.15532565, "balance_loss_mlp": 0.35602784, "epoch": 0.458860664361942, "flos": 17712112896000.0, "grad_norm": 16.083993480445585, "language_loss": 0.7876749, "learning_rate": 2.361563500108531e-06, "loss": 0.80570292, "num_input_tokens_seen": 163734525, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.36303711, "step": 7632, "time_per_iteration": 2.662785530090332 }, { "auxiliary_loss_clip": 0.01409401, "auxiliary_loss_mlp": 0.00378864, "balance_loss_clip": 1.15235853, "balance_loss_mlp": 0.34360141, "epoch": 0.45892078761460997, "flos": 18441889516800.0, "grad_norm": 61.38493079348406, "language_loss": 0.7987175, "learning_rate": 2.3611804486418178e-06, "loss": 0.8166002, "num_input_tokens_seen": 163752860, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.3527832, "step": 7633, "time_per_iteration": 2.619844436645508 }, { "auxiliary_loss_clip": 0.01415245, "auxiliary_loss_mlp": 0.00394482, "balance_loss_clip": 1.15716362, "balance_loss_mlp": 0.35736066, "epoch": 0.45898091086727794, "flos": 22672956257280.0, "grad_norm": 22.867255500106467, "language_loss": 0.86587274, "learning_rate": 2.3607973834791062e-06, "loss": 0.88397002, "num_input_tokens_seen": 163772495, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.37109375, "step": 7634, "time_per_iteration": 2.7244346141815186 }, { "auxiliary_loss_clip": 0.01424334, "auxiliary_loss_mlp": 0.00394173, "balance_loss_clip": 1.16675675, "balance_loss_mlp": 0.35864913, "epoch": 0.4590410341199459, "flos": 21652949744640.0, "grad_norm": 55.873485187732385, "language_loss": 0.87893659, "learning_rate": 2.3604143046349216e-06, "loss": 0.89712167, "num_input_tokens_seen": 163791475, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.35546875, "step": 7635, "time_per_iteration": 2.7165777683258057 }, { "auxiliary_loss_clip": 0.01405686, "auxiliary_loss_mlp": 0.00394315, "balance_loss_clip": 1.15523458, "balance_loss_mlp": 0.36053133, "epoch": 0.45910115737261387, "flos": 36535372087680.0, "grad_norm": 31.07156326904667, "language_loss": 0.69742262, "learning_rate": 2.3600312121237905e-06, "loss": 0.71542263, "num_input_tokens_seen": 163812995, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.33813477, "step": 7636, "time_per_iteration": 2.90412974357605 }, { "auxiliary_loss_clip": 0.01406306, "auxiliary_loss_mlp": 0.00351961, "balance_loss_clip": 1.15810466, "balance_loss_mlp": 0.31967962, "epoch": 0.45916128062528183, "flos": 24419866302720.0, "grad_norm": 4.7072646142261565, "language_loss": 0.85498923, "learning_rate": 2.3596481059602395e-06, "loss": 0.87257189, "num_input_tokens_seen": 163833945, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.32275391, "step": 7637, "time_per_iteration": 2.7241384983062744 }, { "auxiliary_loss_clip": 0.01412305, "auxiliary_loss_mlp": 0.00420664, "balance_loss_clip": 1.15700269, "balance_loss_mlp": 0.38225436, "epoch": 0.4592214038779498, "flos": 23221958705280.0, "grad_norm": 35.14661808670471, "language_loss": 0.80389798, "learning_rate": 2.3592649861587965e-06, "loss": 0.8222276, "num_input_tokens_seen": 163853885, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.3840332, "step": 7638, "time_per_iteration": 2.7056353092193604 }, { "auxiliary_loss_clip": 0.01408054, "auxiliary_loss_mlp": 0.00369637, "balance_loss_clip": 1.15575051, "balance_loss_mlp": 0.33506668, "epoch": 0.45928152713061776, "flos": 19172133014400.0, "grad_norm": 42.215785917712054, "language_loss": 0.80214727, "learning_rate": 2.358881852733989e-06, "loss": 0.81992418, "num_input_tokens_seen": 163871855, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.34594727, "step": 7639, "time_per_iteration": 2.658640146255493 }, { "auxiliary_loss_clip": 0.01416394, "auxiliary_loss_mlp": 0.00391644, "balance_loss_clip": 1.16204286, "balance_loss_mlp": 0.35607231, "epoch": 0.4593416503832857, "flos": 22414686491520.0, "grad_norm": 5.425272501572054, "language_loss": 0.74108684, "learning_rate": 2.358498705700346e-06, "loss": 0.75916731, "num_input_tokens_seen": 163891450, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.35571289, "step": 7640, "time_per_iteration": 2.712205410003662 }, { "auxiliary_loss_clip": 0.0141406, "auxiliary_loss_mlp": 0.0038902, "balance_loss_clip": 1.15803695, "balance_loss_mlp": 0.35399598, "epoch": 0.4594017736359537, "flos": 18880215183360.0, "grad_norm": 3.112982251641233, "language_loss": 0.80893743, "learning_rate": 2.3581155450723958e-06, "loss": 0.82696819, "num_input_tokens_seen": 163909345, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.3503418, "step": 7641, "time_per_iteration": 2.6795477867126465 }, { "auxiliary_loss_clip": 0.01417265, "auxiliary_loss_mlp": 0.00374303, "balance_loss_clip": 1.16257465, "balance_loss_mlp": 0.33947054, "epoch": 0.45946189688862166, "flos": 20518567349760.0, "grad_norm": 6.518634142432164, "language_loss": 0.80651093, "learning_rate": 2.357732370864668e-06, "loss": 0.82442665, "num_input_tokens_seen": 163926940, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.34814453, "step": 7642, "time_per_iteration": 2.7242443561553955 }, { "auxiliary_loss_clip": 0.01284609, "auxiliary_loss_mlp": 0.00109907, "balance_loss_clip": 1.11587667, "balance_loss_mlp": 0.09960775, "epoch": 0.4595220201412896, "flos": 61405990162560.0, "grad_norm": 0.8219408616513796, "language_loss": 0.58071291, "learning_rate": 2.357349183091694e-06, "loss": 0.59465814, "num_input_tokens_seen": 163977785, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.10302734, "step": 7643, "time_per_iteration": 2.9433772563934326 }, { "auxiliary_loss_clip": 0.01418196, "auxiliary_loss_mlp": 0.00437828, "balance_loss_clip": 1.1587882, "balance_loss_mlp": 0.39894193, "epoch": 0.4595821433939576, "flos": 23330947547520.0, "grad_norm": 29.061702155960898, "language_loss": 0.97294396, "learning_rate": 2.3569659817680016e-06, "loss": 0.99150419, "num_input_tokens_seen": 163996630, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.38891602, "step": 7644, "time_per_iteration": 2.7192814350128174 }, { "auxiliary_loss_clip": 0.01420085, "auxiliary_loss_mlp": 0.00420515, "balance_loss_clip": 1.15949941, "balance_loss_mlp": 0.38270217, "epoch": 0.4596422666466256, "flos": 14282356711680.0, "grad_norm": 90.52420204394812, "language_loss": 0.90003538, "learning_rate": 2.3565827669081243e-06, "loss": 0.91844136, "num_input_tokens_seen": 164013190, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.37817383, "step": 7645, "time_per_iteration": 2.679596424102783 }, { "auxiliary_loss_clip": 0.01295455, "auxiliary_loss_mlp": 0.00102974, "balance_loss_clip": 1.12287021, "balance_loss_mlp": 0.09338997, "epoch": 0.4597023898992936, "flos": 65727337737600.0, "grad_norm": 0.756015204819982, "language_loss": 0.59569842, "learning_rate": 2.356199538526593e-06, "loss": 0.60968274, "num_input_tokens_seen": 164074030, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.09570312, "step": 7646, "time_per_iteration": 3.0933046340942383 }, { "auxiliary_loss_clip": 0.01411092, "auxiliary_loss_mlp": 0.00436004, "balance_loss_clip": 1.15578568, "balance_loss_mlp": 0.39909667, "epoch": 0.45976251315196154, "flos": 26907075653760.0, "grad_norm": 8.797714011015263, "language_loss": 0.78574979, "learning_rate": 2.355816296637939e-06, "loss": 0.80422074, "num_input_tokens_seen": 164095515, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.36914062, "step": 7647, "time_per_iteration": 2.736733913421631 }, { "auxiliary_loss_clip": 0.01426281, "auxiliary_loss_mlp": 0.00437826, "balance_loss_clip": 1.16865742, "balance_loss_mlp": 0.39889181, "epoch": 0.4598226364046295, "flos": 26618066824320.0, "grad_norm": 163.12288546755235, "language_loss": 0.7147249, "learning_rate": 2.3554330412566957e-06, "loss": 0.73336595, "num_input_tokens_seen": 164117270, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.3894043, "step": 7648, "time_per_iteration": 2.7669765949249268 }, { "auxiliary_loss_clip": 0.01417901, "auxiliary_loss_mlp": 0.00400482, "balance_loss_clip": 1.15684223, "balance_loss_mlp": 0.36481482, "epoch": 0.45988275965729747, "flos": 24387762522240.0, "grad_norm": 23.94743072795577, "language_loss": 0.8291266, "learning_rate": 2.3550497723973953e-06, "loss": 0.84731042, "num_input_tokens_seen": 164137850, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.35668945, "step": 7649, "time_per_iteration": 2.792159080505371 }, { "auxiliary_loss_clip": 0.01402675, "auxiliary_loss_mlp": 0.00433253, "balance_loss_clip": 1.15311289, "balance_loss_mlp": 0.39505798, "epoch": 0.45994288290996543, "flos": 24535822383360.0, "grad_norm": 179.88935262819012, "language_loss": 0.75491273, "learning_rate": 2.3546664900745726e-06, "loss": 0.77327204, "num_input_tokens_seen": 164157960, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.38183594, "step": 7650, "time_per_iteration": 2.7706310749053955 }, { "auxiliary_loss_clip": 0.01425332, "auxiliary_loss_mlp": 0.00431637, "balance_loss_clip": 1.16198564, "balance_loss_mlp": 0.39272755, "epoch": 0.4600030061626334, "flos": 14830245838080.0, "grad_norm": 33.906157063075156, "language_loss": 0.91948307, "learning_rate": 2.354283194302761e-06, "loss": 0.93805277, "num_input_tokens_seen": 164174590, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.38916016, "step": 7651, "time_per_iteration": 2.646437883377075 }, { "auxiliary_loss_clip": 0.01419995, "auxiliary_loss_mlp": 0.00455002, "balance_loss_clip": 1.16496539, "balance_loss_mlp": 0.41773677, "epoch": 0.46006312941530136, "flos": 18113845582080.0, "grad_norm": 416.76154526912745, "language_loss": 0.81158423, "learning_rate": 2.3538998850964948e-06, "loss": 0.83033419, "num_input_tokens_seen": 164192935, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.37280273, "step": 7652, "time_per_iteration": 2.659301996231079 }, { "auxiliary_loss_clip": 0.01411728, "auxiliary_loss_mlp": 0.00462651, "balance_loss_clip": 1.15732861, "balance_loss_mlp": 0.4234314, "epoch": 0.46012325266796933, "flos": 21976468565760.0, "grad_norm": 6.91571312408346, "language_loss": 0.80849749, "learning_rate": 2.3535165624703097e-06, "loss": 0.8272413, "num_input_tokens_seen": 164213160, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.39257812, "step": 7653, "time_per_iteration": 2.7195560932159424 }, { "auxiliary_loss_clip": 0.01457607, "auxiliary_loss_mlp": 0.00451385, "balance_loss_clip": 1.18125272, "balance_loss_mlp": 0.41063946, "epoch": 0.4601833759206373, "flos": 15268068714240.0, "grad_norm": 6.922089269269168, "language_loss": 0.76031899, "learning_rate": 2.353133226438741e-06, "loss": 0.77940893, "num_input_tokens_seen": 164229330, "router_z_loss_clip": 2.76367188, "router_z_loss_mlp": 0.40771484, "step": 7654, "time_per_iteration": 2.6520228385925293 }, { "auxiliary_loss_clip": 0.01418536, "auxiliary_loss_mlp": 0.00427229, "balance_loss_clip": 1.16306973, "balance_loss_mlp": 0.39091796, "epoch": 0.46024349917330526, "flos": 27088999061760.0, "grad_norm": 184.31580219056374, "language_loss": 0.85111797, "learning_rate": 2.3527498770163248e-06, "loss": 0.86957562, "num_input_tokens_seen": 164248240, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.36328125, "step": 7655, "time_per_iteration": 2.6922903060913086 }, { "auxiliary_loss_clip": 0.01423646, "auxiliary_loss_mlp": 0.00429717, "balance_loss_clip": 1.16976261, "balance_loss_mlp": 0.39545619, "epoch": 0.4603036224259732, "flos": 24462923731200.0, "grad_norm": 39.01183208546183, "language_loss": 0.74080592, "learning_rate": 2.3523665142175985e-06, "loss": 0.75933957, "num_input_tokens_seen": 164268020, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.34277344, "step": 7656, "time_per_iteration": 2.7271382808685303 }, { "auxiliary_loss_clip": 0.01424101, "auxiliary_loss_mlp": 0.00432882, "balance_loss_clip": 1.16436577, "balance_loss_mlp": 0.39535519, "epoch": 0.4603637456786412, "flos": 28109292883200.0, "grad_norm": 47.836194236884126, "language_loss": 0.84598786, "learning_rate": 2.351983138057098e-06, "loss": 0.86455774, "num_input_tokens_seen": 164287305, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.37548828, "step": 7657, "time_per_iteration": 4.133780002593994 }, { "auxiliary_loss_clip": 0.01434766, "auxiliary_loss_mlp": 0.00451718, "balance_loss_clip": 1.17121363, "balance_loss_mlp": 0.41094875, "epoch": 0.4604238689313092, "flos": 24348942898560.0, "grad_norm": 22.640477771635723, "language_loss": 0.75804561, "learning_rate": 2.3515997485493623e-06, "loss": 0.77691042, "num_input_tokens_seen": 164306835, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.40795898, "step": 7658, "time_per_iteration": 2.7236971855163574 }, { "auxiliary_loss_clip": 0.01326072, "auxiliary_loss_mlp": 0.0016458, "balance_loss_clip": 1.15932584, "balance_loss_mlp": 0.15447119, "epoch": 0.4604839921839772, "flos": 53606229431040.0, "grad_norm": 0.9149183973938942, "language_loss": 0.61645919, "learning_rate": 2.351216345708928e-06, "loss": 0.63136572, "num_input_tokens_seen": 164367095, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.10107422, "step": 7659, "time_per_iteration": 3.252645254135132 }, { "auxiliary_loss_clip": 0.01420867, "auxiliary_loss_mlp": 0.00409523, "balance_loss_clip": 1.16725111, "balance_loss_mlp": 0.37495205, "epoch": 0.46054411543664514, "flos": 31248424126080.0, "grad_norm": 761.9066906063895, "language_loss": 0.73894989, "learning_rate": 2.350832929550336e-06, "loss": 0.75725383, "num_input_tokens_seen": 164388895, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.34570312, "step": 7660, "time_per_iteration": 2.9009156227111816 }, { "auxiliary_loss_clip": 0.01442155, "auxiliary_loss_mlp": 0.00431743, "balance_loss_clip": 1.17654169, "balance_loss_mlp": 0.39278561, "epoch": 0.4606042386893131, "flos": 24092863862400.0, "grad_norm": 19.132640112170566, "language_loss": 0.82577896, "learning_rate": 2.3504495000881227e-06, "loss": 0.84451795, "num_input_tokens_seen": 164409080, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.38989258, "step": 7661, "time_per_iteration": 4.181703805923462 }, { "auxiliary_loss_clip": 0.01423002, "auxiliary_loss_mlp": 0.0044356, "balance_loss_clip": 1.17159665, "balance_loss_mlp": 0.40758234, "epoch": 0.46066436194198107, "flos": 26578457101440.0, "grad_norm": 50.83738502684465, "language_loss": 0.81191289, "learning_rate": 2.3500660573368305e-06, "loss": 0.83057845, "num_input_tokens_seen": 164427585, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.35961914, "step": 7662, "time_per_iteration": 4.255288124084473 }, { "auxiliary_loss_clip": 0.0144692, "auxiliary_loss_mlp": 0.00475552, "balance_loss_clip": 1.17697227, "balance_loss_mlp": 0.43406728, "epoch": 0.46072448519464904, "flos": 17775602184960.0, "grad_norm": 56.336187658634515, "language_loss": 0.91839188, "learning_rate": 2.349682601310998e-06, "loss": 0.93761659, "num_input_tokens_seen": 164438455, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.41503906, "step": 7663, "time_per_iteration": 2.67997407913208 }, { "auxiliary_loss_clip": 0.01418007, "auxiliary_loss_mlp": 0.00387754, "balance_loss_clip": 1.16582024, "balance_loss_mlp": 0.35423261, "epoch": 0.460784608447317, "flos": 15086109392640.0, "grad_norm": 308.46594112592453, "language_loss": 0.81574726, "learning_rate": 2.3492991320251653e-06, "loss": 0.83380485, "num_input_tokens_seen": 164456830, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.33520508, "step": 7664, "time_per_iteration": 2.6950490474700928 }, { "auxiliary_loss_clip": 0.01427978, "auxiliary_loss_mlp": 0.00430453, "balance_loss_clip": 1.16886473, "balance_loss_mlp": 0.39256808, "epoch": 0.46084473169998497, "flos": 18588261438720.0, "grad_norm": 437.11682837334945, "language_loss": 0.78801358, "learning_rate": 2.3489156494938753e-06, "loss": 0.80659789, "num_input_tokens_seen": 164475375, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.37890625, "step": 7665, "time_per_iteration": 2.736499071121216 }, { "auxiliary_loss_clip": 0.01429483, "auxiliary_loss_mlp": 0.00403446, "balance_loss_clip": 1.17017698, "balance_loss_mlp": 0.36880365, "epoch": 0.46090485495265293, "flos": 19494789909120.0, "grad_norm": 15.43448059294349, "language_loss": 0.82580984, "learning_rate": 2.348532153731669e-06, "loss": 0.8441391, "num_input_tokens_seen": 164492040, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.34643555, "step": 7666, "time_per_iteration": 2.6448476314544678 }, { "auxiliary_loss_clip": 0.01414419, "auxiliary_loss_mlp": 0.00401251, "balance_loss_clip": 1.16194987, "balance_loss_mlp": 0.36622697, "epoch": 0.4609649782053209, "flos": 33364927163520.0, "grad_norm": 5.85884153169361, "language_loss": 0.78765929, "learning_rate": 2.348148644753088e-06, "loss": 0.80581594, "num_input_tokens_seen": 164513665, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.35009766, "step": 7667, "time_per_iteration": 2.761565685272217 }, { "auxiliary_loss_clip": 0.01421575, "auxiliary_loss_mlp": 0.00437222, "balance_loss_clip": 1.16374457, "balance_loss_mlp": 0.40198374, "epoch": 0.46102510145798886, "flos": 23769165473280.0, "grad_norm": 52.01780472547862, "language_loss": 0.81259978, "learning_rate": 2.347765122572676e-06, "loss": 0.83118773, "num_input_tokens_seen": 164533890, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.35229492, "step": 7668, "time_per_iteration": 4.123117446899414 }, { "auxiliary_loss_clip": 0.0142333, "auxiliary_loss_mlp": 0.00395543, "balance_loss_clip": 1.17324185, "balance_loss_mlp": 0.36180654, "epoch": 0.4610852247106568, "flos": 23294821443840.0, "grad_norm": 3.8022482270983513, "language_loss": 0.82566965, "learning_rate": 2.347381587204975e-06, "loss": 0.84385842, "num_input_tokens_seen": 164553815, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.33740234, "step": 7669, "time_per_iteration": 2.6602416038513184 }, { "auxiliary_loss_clip": 0.01423384, "auxiliary_loss_mlp": 0.00430031, "balance_loss_clip": 1.1680789, "balance_loss_mlp": 0.39250407, "epoch": 0.4611453479633248, "flos": 25447450584960.0, "grad_norm": 14.470089380725394, "language_loss": 0.8784622, "learning_rate": 2.34699803866453e-06, "loss": 0.89699638, "num_input_tokens_seen": 164573125, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.37548828, "step": 7670, "time_per_iteration": 2.834012746810913 }, { "auxiliary_loss_clip": 0.01402173, "auxiliary_loss_mlp": 0.00416277, "balance_loss_clip": 1.15754151, "balance_loss_mlp": 0.38101453, "epoch": 0.4612054712159928, "flos": 21139606523520.0, "grad_norm": 1.9358541032659373, "language_loss": 0.71394932, "learning_rate": 2.3466144769658845e-06, "loss": 0.73213375, "num_input_tokens_seen": 164592575, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.35253906, "step": 7671, "time_per_iteration": 2.832479476928711 }, { "auxiliary_loss_clip": 0.01311013, "auxiliary_loss_mlp": 0.00095025, "balance_loss_clip": 1.14859378, "balance_loss_mlp": 0.08515434, "epoch": 0.4612655944686608, "flos": 69959266404480.0, "grad_norm": 0.6758314884533831, "language_loss": 0.55472791, "learning_rate": 2.346230902123583e-06, "loss": 0.56878829, "num_input_tokens_seen": 164659795, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.09863281, "step": 7672, "time_per_iteration": 3.3173041343688965 }, { "auxiliary_loss_clip": 0.01426732, "auxiliary_loss_mlp": 0.00411956, "balance_loss_clip": 1.16505456, "balance_loss_mlp": 0.37550229, "epoch": 0.46132571772132874, "flos": 16837149502080.0, "grad_norm": 2.90609670688035, "language_loss": 0.79696512, "learning_rate": 2.3458473141521715e-06, "loss": 0.81535196, "num_input_tokens_seen": 164678735, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.36474609, "step": 7673, "time_per_iteration": 2.657597780227661 }, { "auxiliary_loss_clip": 0.01421134, "auxiliary_loss_mlp": 0.00395306, "balance_loss_clip": 1.16561759, "balance_loss_mlp": 0.36004403, "epoch": 0.4613858409739967, "flos": 35808935431680.0, "grad_norm": 4.98640644964533, "language_loss": 0.76121044, "learning_rate": 2.345463713066195e-06, "loss": 0.7793749, "num_input_tokens_seen": 164700885, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.35253906, "step": 7674, "time_per_iteration": 2.7828781604766846 }, { "auxiliary_loss_clip": 0.01422221, "auxiliary_loss_mlp": 0.00415904, "balance_loss_clip": 1.16629994, "balance_loss_mlp": 0.37737596, "epoch": 0.4614459642266647, "flos": 35266756567680.0, "grad_norm": 4.459583461006877, "language_loss": 0.71424055, "learning_rate": 2.3450800988801996e-06, "loss": 0.73262179, "num_input_tokens_seen": 164726960, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.38500977, "step": 7675, "time_per_iteration": 2.837834596633911 }, { "auxiliary_loss_clip": 0.01308792, "auxiliary_loss_mlp": 0.00144436, "balance_loss_clip": 1.14136374, "balance_loss_mlp": 0.13375497, "epoch": 0.46150608747933264, "flos": 66704610044160.0, "grad_norm": 0.7312986489482346, "language_loss": 0.58248788, "learning_rate": 2.3446964716087327e-06, "loss": 0.59702015, "num_input_tokens_seen": 164788525, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.10693359, "step": 7676, "time_per_iteration": 3.2074267864227295 }, { "auxiliary_loss_clip": 0.01296945, "auxiliary_loss_mlp": 0.00118374, "balance_loss_clip": 1.1280849, "balance_loss_mlp": 0.10769272, "epoch": 0.4615662107320006, "flos": 55830177025920.0, "grad_norm": 0.8021137254901085, "language_loss": 0.6281029, "learning_rate": 2.344312831266341e-06, "loss": 0.64225614, "num_input_tokens_seen": 164843525, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.10693359, "step": 7677, "time_per_iteration": 3.004438877105713 }, { "auxiliary_loss_clip": 0.01407697, "auxiliary_loss_mlp": 0.00379021, "balance_loss_clip": 1.15809536, "balance_loss_mlp": 0.34440216, "epoch": 0.46162633398466857, "flos": 15483245137920.0, "grad_norm": 52.70083872990648, "language_loss": 0.81212914, "learning_rate": 2.3439291778675718e-06, "loss": 0.82999623, "num_input_tokens_seen": 164859895, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.34594727, "step": 7678, "time_per_iteration": 2.7829742431640625 }, { "auxiliary_loss_clip": 0.01426812, "auxiliary_loss_mlp": 0.00406154, "balance_loss_clip": 1.16896057, "balance_loss_mlp": 0.3676967, "epoch": 0.46168645723733653, "flos": 20011437181440.0, "grad_norm": 71.78220200429496, "language_loss": 0.72637177, "learning_rate": 2.343545511426974e-06, "loss": 0.7447015, "num_input_tokens_seen": 164878030, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.38427734, "step": 7679, "time_per_iteration": 2.6852951049804688 }, { "auxiliary_loss_clip": 0.01422346, "auxiliary_loss_mlp": 0.00384651, "balance_loss_clip": 1.16848528, "balance_loss_mlp": 0.35008049, "epoch": 0.4617465804900045, "flos": 20298542590080.0, "grad_norm": 14.241988461934266, "language_loss": 0.78332591, "learning_rate": 2.3431618319590963e-06, "loss": 0.80139589, "num_input_tokens_seen": 164895710, "router_z_loss_clip": 2.5390625, "router_z_loss_mlp": 0.34521484, "step": 7680, "time_per_iteration": 2.6947596073150635 }, { "auxiliary_loss_clip": 0.01439022, "auxiliary_loss_mlp": 0.00397251, "balance_loss_clip": 1.17978716, "balance_loss_mlp": 0.36008161, "epoch": 0.46180670374267246, "flos": 22346312952960.0, "grad_norm": 14.047477096323238, "language_loss": 0.71064508, "learning_rate": 2.342778139478487e-06, "loss": 0.72900772, "num_input_tokens_seen": 164913365, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.37182617, "step": 7681, "time_per_iteration": 2.706242561340332 }, { "auxiliary_loss_clip": 0.01404311, "auxiliary_loss_mlp": 0.00370345, "balance_loss_clip": 1.15788853, "balance_loss_mlp": 0.33663306, "epoch": 0.46186682699534043, "flos": 19895696582400.0, "grad_norm": 55.2816947646057, "language_loss": 0.73146677, "learning_rate": 2.342394433999697e-06, "loss": 0.74921334, "num_input_tokens_seen": 164931620, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.3371582, "step": 7682, "time_per_iteration": 2.6594200134277344 }, { "auxiliary_loss_clip": 0.01418385, "auxiliary_loss_mlp": 0.00391755, "balance_loss_clip": 1.16709781, "balance_loss_mlp": 0.35708934, "epoch": 0.4619269502480084, "flos": 31503569408640.0, "grad_norm": 5.080691165220897, "language_loss": 0.8136245, "learning_rate": 2.342010715537275e-06, "loss": 0.8317259, "num_input_tokens_seen": 164950905, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.34692383, "step": 7683, "time_per_iteration": 2.7127525806427 }, { "auxiliary_loss_clip": 0.01417629, "auxiliary_loss_mlp": 0.00363357, "balance_loss_clip": 1.16740489, "balance_loss_mlp": 0.32773688, "epoch": 0.46198707350067636, "flos": 25009484054400.0, "grad_norm": 4.523250672332182, "language_loss": 0.83609378, "learning_rate": 2.3416269841057726e-06, "loss": 0.85390359, "num_input_tokens_seen": 164970950, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.35620117, "step": 7684, "time_per_iteration": 2.845942497253418 }, { "auxiliary_loss_clip": 0.01430359, "auxiliary_loss_mlp": 0.00392732, "balance_loss_clip": 1.1719346, "balance_loss_mlp": 0.35558608, "epoch": 0.4620471967533444, "flos": 18292357198080.0, "grad_norm": 8.314822474666215, "language_loss": 0.85683107, "learning_rate": 2.3412432397197412e-06, "loss": 0.87506199, "num_input_tokens_seen": 164989855, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.37158203, "step": 7685, "time_per_iteration": 2.6929025650024414 }, { "auxiliary_loss_clip": 0.01407419, "auxiliary_loss_mlp": 0.00364497, "balance_loss_clip": 1.16402149, "balance_loss_mlp": 0.33035558, "epoch": 0.46210732000601235, "flos": 33985104410880.0, "grad_norm": 67.2408405759557, "language_loss": 0.72826684, "learning_rate": 2.340859482393731e-06, "loss": 0.74598593, "num_input_tokens_seen": 165012290, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.34130859, "step": 7686, "time_per_iteration": 2.813401460647583 }, { "auxiliary_loss_clip": 0.01419455, "auxiliary_loss_mlp": 0.00414218, "balance_loss_clip": 1.16694427, "balance_loss_mlp": 0.37754953, "epoch": 0.4621674432586803, "flos": 25009412227200.0, "grad_norm": 38.09339677306485, "language_loss": 0.80695885, "learning_rate": 2.340475712142296e-06, "loss": 0.82529563, "num_input_tokens_seen": 165030810, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.36694336, "step": 7687, "time_per_iteration": 2.6821866035461426 }, { "auxiliary_loss_clip": 0.01411766, "auxiliary_loss_mlp": 0.00364796, "balance_loss_clip": 1.16621745, "balance_loss_mlp": 0.33156008, "epoch": 0.4622275665113483, "flos": 22014031213440.0, "grad_norm": 49.44629060259665, "language_loss": 0.81193942, "learning_rate": 2.3400919289799873e-06, "loss": 0.829705, "num_input_tokens_seen": 165050205, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.33251953, "step": 7688, "time_per_iteration": 2.6846508979797363 }, { "auxiliary_loss_clip": 0.01419707, "auxiliary_loss_mlp": 0.0038793, "balance_loss_clip": 1.16896439, "balance_loss_mlp": 0.35147557, "epoch": 0.46228768976401624, "flos": 24058820747520.0, "grad_norm": 2.893861544559389, "language_loss": 0.843499, "learning_rate": 2.3397081329213585e-06, "loss": 0.86157537, "num_input_tokens_seen": 165069370, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.36474609, "step": 7689, "time_per_iteration": 2.7051355838775635 }, { "auxiliary_loss_clip": 0.01427044, "auxiliary_loss_mlp": 0.00425036, "balance_loss_clip": 1.17378724, "balance_loss_mlp": 0.38746166, "epoch": 0.4623478130166842, "flos": 26651391667200.0, "grad_norm": 3.733555840509, "language_loss": 0.65876138, "learning_rate": 2.339324323980964e-06, "loss": 0.67728221, "num_input_tokens_seen": 165089610, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.37597656, "step": 7690, "time_per_iteration": 2.721898078918457 }, { "auxiliary_loss_clip": 0.01427349, "auxiliary_loss_mlp": 0.00402933, "balance_loss_clip": 1.16813874, "balance_loss_mlp": 0.36428511, "epoch": 0.46240793626935217, "flos": 20558428467840.0, "grad_norm": 175.4081222326952, "language_loss": 0.90848291, "learning_rate": 2.3389405021733562e-06, "loss": 0.92678571, "num_input_tokens_seen": 165109050, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.38647461, "step": 7691, "time_per_iteration": 2.6418731212615967 }, { "auxiliary_loss_clip": 0.01414891, "auxiliary_loss_mlp": 0.00384225, "balance_loss_clip": 1.16492701, "balance_loss_mlp": 0.34982151, "epoch": 0.46246805952202014, "flos": 22456055980800.0, "grad_norm": 1.7552019386906892, "language_loss": 0.80244654, "learning_rate": 2.338556667513091e-06, "loss": 0.82043767, "num_input_tokens_seen": 165130130, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.34423828, "step": 7692, "time_per_iteration": 2.6776273250579834 }, { "auxiliary_loss_clip": 0.01422059, "auxiliary_loss_mlp": 0.00392158, "balance_loss_clip": 1.17138708, "balance_loss_mlp": 0.35527515, "epoch": 0.4625281827746881, "flos": 35041308854400.0, "grad_norm": 39.39882064929038, "language_loss": 0.79742938, "learning_rate": 2.338172820014723e-06, "loss": 0.81557155, "num_input_tokens_seen": 165152685, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.36889648, "step": 7693, "time_per_iteration": 2.7708561420440674 }, { "auxiliary_loss_clip": 0.01412131, "auxiliary_loss_mlp": 0.00387506, "balance_loss_clip": 1.16916966, "balance_loss_mlp": 0.3515763, "epoch": 0.46258830602735607, "flos": 21068647205760.0, "grad_norm": 2.030859267439608, "language_loss": 0.90798646, "learning_rate": 2.337788959692808e-06, "loss": 0.92598283, "num_input_tokens_seen": 165173315, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.359375, "step": 7694, "time_per_iteration": 2.805649518966675 }, { "auxiliary_loss_clip": 0.0142617, "auxiliary_loss_mlp": 0.0039149, "balance_loss_clip": 1.17681599, "balance_loss_mlp": 0.35584652, "epoch": 0.46264842928002403, "flos": 26177227205760.0, "grad_norm": 10.970809656456423, "language_loss": 0.8682664, "learning_rate": 2.337405086561902e-06, "loss": 0.88644302, "num_input_tokens_seen": 165192395, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.35644531, "step": 7695, "time_per_iteration": 2.772348642349243 }, { "auxiliary_loss_clip": 0.0141947, "auxiliary_loss_mlp": 0.00383656, "balance_loss_clip": 1.17324042, "balance_loss_mlp": 0.34901336, "epoch": 0.462708552532692, "flos": 16764214936320.0, "grad_norm": 2.4249716555138754, "language_loss": 0.78983706, "learning_rate": 2.3370212006365606e-06, "loss": 0.8078683, "num_input_tokens_seen": 165211355, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.34643555, "step": 7696, "time_per_iteration": 2.7246146202087402 }, { "auxiliary_loss_clip": 0.01431018, "auxiliary_loss_mlp": 0.00418886, "balance_loss_clip": 1.17669034, "balance_loss_mlp": 0.38064414, "epoch": 0.46276867578535996, "flos": 15560453422080.0, "grad_norm": 11.29127185762115, "language_loss": 0.77574241, "learning_rate": 2.3366373019313423e-06, "loss": 0.79424143, "num_input_tokens_seen": 165229380, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.38269043, "step": 7697, "time_per_iteration": 2.724553108215332 }, { "auxiliary_loss_clip": 0.01416306, "auxiliary_loss_mlp": 0.00446172, "balance_loss_clip": 1.16825223, "balance_loss_mlp": 0.40609419, "epoch": 0.462828799038028, "flos": 22415404763520.0, "grad_norm": 13.720406181583867, "language_loss": 0.9059673, "learning_rate": 2.3362533904608025e-06, "loss": 0.92459208, "num_input_tokens_seen": 165247200, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.40063477, "step": 7698, "time_per_iteration": 2.70389986038208 }, { "auxiliary_loss_clip": 0.01410183, "auxiliary_loss_mlp": 0.00398666, "balance_loss_clip": 1.16769505, "balance_loss_mlp": 0.36128211, "epoch": 0.46288892229069595, "flos": 21069580959360.0, "grad_norm": 8.643926178970009, "language_loss": 0.77132785, "learning_rate": 2.335869466239502e-06, "loss": 0.78941637, "num_input_tokens_seen": 165265825, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.37353516, "step": 7699, "time_per_iteration": 4.115377902984619 }, { "auxiliary_loss_clip": 0.01409839, "auxiliary_loss_mlp": 0.00425983, "balance_loss_clip": 1.16239214, "balance_loss_mlp": 0.38731202, "epoch": 0.4629490455433639, "flos": 23185688947200.0, "grad_norm": 16.13636484296391, "language_loss": 0.77440262, "learning_rate": 2.335485529281996e-06, "loss": 0.79276091, "num_input_tokens_seen": 165284380, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.38671875, "step": 7700, "time_per_iteration": 2.8820698261260986 }, { "auxiliary_loss_clip": 0.01412222, "auxiliary_loss_mlp": 0.00417017, "balance_loss_clip": 1.16992509, "balance_loss_mlp": 0.37934706, "epoch": 0.4630091687960319, "flos": 18835541642880.0, "grad_norm": 6.3462644051765125, "language_loss": 0.79740834, "learning_rate": 2.3351015796028467e-06, "loss": 0.81570065, "num_input_tokens_seen": 165300320, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.37695312, "step": 7701, "time_per_iteration": 2.6560544967651367 }, { "auxiliary_loss_clip": 0.01427231, "auxiliary_loss_mlp": 0.00445456, "balance_loss_clip": 1.17462957, "balance_loss_mlp": 0.40547276, "epoch": 0.46306929204869984, "flos": 38907020407680.0, "grad_norm": 142.07563594554284, "language_loss": 0.7183789, "learning_rate": 2.3347176172166114e-06, "loss": 0.73710585, "num_input_tokens_seen": 165318130, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.3996582, "step": 7702, "time_per_iteration": 2.8315579891204834 }, { "auxiliary_loss_clip": 0.01424548, "auxiliary_loss_mlp": 0.00410468, "balance_loss_clip": 1.17913675, "balance_loss_mlp": 0.37451464, "epoch": 0.4631294153013678, "flos": 19644178573440.0, "grad_norm": 27.28107573906326, "language_loss": 0.79330218, "learning_rate": 2.33433364213785e-06, "loss": 0.8116523, "num_input_tokens_seen": 165336225, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.35961914, "step": 7703, "time_per_iteration": 4.10429048538208 }, { "auxiliary_loss_clip": 0.01441719, "auxiliary_loss_mlp": 0.00437498, "balance_loss_clip": 1.18662417, "balance_loss_mlp": 0.39823061, "epoch": 0.4631895385540358, "flos": 24608254158720.0, "grad_norm": 72.836611170974, "language_loss": 0.75916052, "learning_rate": 2.3339496543811243e-06, "loss": 0.77795273, "num_input_tokens_seen": 165355005, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.39233398, "step": 7704, "time_per_iteration": 2.751185417175293 }, { "auxiliary_loss_clip": 0.01426986, "auxiliary_loss_mlp": 0.00445394, "balance_loss_clip": 1.17973053, "balance_loss_mlp": 0.40464777, "epoch": 0.46324966180670374, "flos": 26320115508480.0, "grad_norm": 12.663077242579849, "language_loss": 0.8613109, "learning_rate": 2.3335656539609934e-06, "loss": 0.88003469, "num_input_tokens_seen": 165374910, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.40722656, "step": 7705, "time_per_iteration": 4.2530741691589355 }, { "auxiliary_loss_clip": 0.01432912, "auxiliary_loss_mlp": 0.00434299, "balance_loss_clip": 1.17744827, "balance_loss_mlp": 0.39586622, "epoch": 0.4633097850593717, "flos": 19240506552960.0, "grad_norm": 109.54109621358397, "language_loss": 0.83732355, "learning_rate": 2.3331816408920196e-06, "loss": 0.85599566, "num_input_tokens_seen": 165392590, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.38452148, "step": 7706, "time_per_iteration": 2.7141013145446777 }, { "auxiliary_loss_clip": 0.01415746, "auxiliary_loss_mlp": 0.00426407, "balance_loss_clip": 1.17342556, "balance_loss_mlp": 0.38947642, "epoch": 0.46336990831203967, "flos": 22783166161920.0, "grad_norm": 1736.7654943398804, "language_loss": 0.76201153, "learning_rate": 2.3327976151887654e-06, "loss": 0.78043306, "num_input_tokens_seen": 165411195, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.36962891, "step": 7707, "time_per_iteration": 2.712829113006592 }, { "auxiliary_loss_clip": 0.01424756, "auxiliary_loss_mlp": 0.00424524, "balance_loss_clip": 1.17866993, "balance_loss_mlp": 0.38706833, "epoch": 0.46343003156470763, "flos": 38210604543360.0, "grad_norm": 9.160193218583473, "language_loss": 0.67489898, "learning_rate": 2.332413576865791e-06, "loss": 0.6933918, "num_input_tokens_seen": 165430150, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.37402344, "step": 7708, "time_per_iteration": 2.934591293334961 }, { "auxiliary_loss_clip": 0.01428587, "auxiliary_loss_mlp": 0.00394022, "balance_loss_clip": 1.18012238, "balance_loss_mlp": 0.35718679, "epoch": 0.4634901548173756, "flos": 31938555110400.0, "grad_norm": 40.81718011669565, "language_loss": 0.83678699, "learning_rate": 2.3320295259376614e-06, "loss": 0.85501313, "num_input_tokens_seen": 165450595, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.3684082, "step": 7709, "time_per_iteration": 2.795398712158203 }, { "auxiliary_loss_clip": 0.01438393, "auxiliary_loss_mlp": 0.00413965, "balance_loss_clip": 1.18676066, "balance_loss_mlp": 0.37641412, "epoch": 0.46355027807004356, "flos": 20082540153600.0, "grad_norm": 200.41424176208054, "language_loss": 0.83756196, "learning_rate": 2.3316454624189385e-06, "loss": 0.8560856, "num_input_tokens_seen": 165469515, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.37548828, "step": 7710, "time_per_iteration": 2.77746319770813 }, { "auxiliary_loss_clip": 0.01458503, "auxiliary_loss_mlp": 0.00417426, "balance_loss_clip": 1.19831574, "balance_loss_mlp": 0.37734795, "epoch": 0.4636104013227116, "flos": 24061370613120.0, "grad_norm": 14.157243750468755, "language_loss": 0.79383904, "learning_rate": 2.3312613863241865e-06, "loss": 0.81259835, "num_input_tokens_seen": 165488125, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.40063477, "step": 7711, "time_per_iteration": 4.154359340667725 }, { "auxiliary_loss_clip": 0.01446629, "auxiliary_loss_mlp": 0.00410976, "balance_loss_clip": 1.19320631, "balance_loss_mlp": 0.37054002, "epoch": 0.46367052457537955, "flos": 23914639555200.0, "grad_norm": 25.02551509704164, "language_loss": 0.76868498, "learning_rate": 2.33087729766797e-06, "loss": 0.78726101, "num_input_tokens_seen": 165509225, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.40429688, "step": 7712, "time_per_iteration": 2.7194020748138428 }, { "auxiliary_loss_clip": 0.01444411, "auxiliary_loss_mlp": 0.00408447, "balance_loss_clip": 1.18732429, "balance_loss_mlp": 0.36791536, "epoch": 0.4637306478280475, "flos": 26396533693440.0, "grad_norm": 864.402012880036, "language_loss": 0.78041697, "learning_rate": 2.3304931964648524e-06, "loss": 0.79894555, "num_input_tokens_seen": 165529945, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.40527344, "step": 7713, "time_per_iteration": 2.6945700645446777 }, { "auxiliary_loss_clip": 0.01453187, "auxiliary_loss_mlp": 0.00459598, "balance_loss_clip": 1.19219196, "balance_loss_mlp": 0.41687325, "epoch": 0.4637907710807155, "flos": 21980706370560.0, "grad_norm": 3007.079594154705, "language_loss": 0.66409081, "learning_rate": 2.3301090827294e-06, "loss": 0.68321866, "num_input_tokens_seen": 165550690, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.42724609, "step": 7714, "time_per_iteration": 2.673614263534546 }, { "auxiliary_loss_clip": 0.01447075, "auxiliary_loss_mlp": 0.00446801, "balance_loss_clip": 1.1921221, "balance_loss_mlp": 0.40674675, "epoch": 0.46385089433338345, "flos": 12422291846400.0, "grad_norm": 19.513449427493267, "language_loss": 0.77381527, "learning_rate": 2.3297249564761784e-06, "loss": 0.79275405, "num_input_tokens_seen": 165567775, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.40039062, "step": 7715, "time_per_iteration": 2.646658420562744 }, { "auxiliary_loss_clip": 0.01466588, "auxiliary_loss_mlp": 0.00459826, "balance_loss_clip": 1.19881225, "balance_loss_mlp": 0.41881847, "epoch": 0.4639110175860514, "flos": 23915752876800.0, "grad_norm": 5.2668776853680805, "language_loss": 0.75331742, "learning_rate": 2.3293408177197527e-06, "loss": 0.77258158, "num_input_tokens_seen": 165587010, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.40991211, "step": 7716, "time_per_iteration": 2.804318428039551 }, { "auxiliary_loss_clip": 0.01454314, "auxiliary_loss_mlp": 0.00438331, "balance_loss_clip": 1.1985743, "balance_loss_mlp": 0.39710861, "epoch": 0.4639711408387194, "flos": 25300396304640.0, "grad_norm": 28.790624914316087, "language_loss": 0.86066508, "learning_rate": 2.328956666474691e-06, "loss": 0.87959158, "num_input_tokens_seen": 165607850, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.41235352, "step": 7717, "time_per_iteration": 2.79526424407959 }, { "auxiliary_loss_clip": 0.01448311, "auxiliary_loss_mlp": 0.00420342, "balance_loss_clip": 1.18811929, "balance_loss_mlp": 0.38117009, "epoch": 0.46403126409138734, "flos": 21211822817280.0, "grad_norm": 26829.22328203678, "language_loss": 0.77801311, "learning_rate": 2.3285725027555593e-06, "loss": 0.79669964, "num_input_tokens_seen": 165627175, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.39208984, "step": 7718, "time_per_iteration": 2.684689521789551 }, { "auxiliary_loss_clip": 0.01455389, "auxiliary_loss_mlp": 0.00414502, "balance_loss_clip": 1.19213688, "balance_loss_mlp": 0.37501994, "epoch": 0.4640913873440553, "flos": 35845564325760.0, "grad_norm": 18.876480461252797, "language_loss": 0.77176058, "learning_rate": 2.3281883265769254e-06, "loss": 0.79045951, "num_input_tokens_seen": 165648340, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.39453125, "step": 7719, "time_per_iteration": 2.8018598556518555 }, { "auxiliary_loss_clip": 0.01456879, "auxiliary_loss_mlp": 0.00428717, "balance_loss_clip": 1.19633961, "balance_loss_mlp": 0.38699359, "epoch": 0.46415151059672327, "flos": 19166207270400.0, "grad_norm": 10.275416954369046, "language_loss": 0.91317952, "learning_rate": 2.327804137953357e-06, "loss": 0.93203545, "num_input_tokens_seen": 165667195, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.41723633, "step": 7720, "time_per_iteration": 2.6592020988464355 }, { "auxiliary_loss_clip": 0.01315831, "auxiliary_loss_mlp": 0.00080575, "balance_loss_clip": 1.16608787, "balance_loss_mlp": 0.07289782, "epoch": 0.46421163384939124, "flos": 58912750304640.0, "grad_norm": 0.7352614559530523, "language_loss": 0.549025, "learning_rate": 2.3274199368994226e-06, "loss": 0.56298906, "num_input_tokens_seen": 165726760, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.07666016, "step": 7721, "time_per_iteration": 3.251382350921631 }, { "auxiliary_loss_clip": 0.01457637, "auxiliary_loss_mlp": 0.0044183, "balance_loss_clip": 1.20142722, "balance_loss_mlp": 0.40079796, "epoch": 0.4642717571020592, "flos": 20157342226560.0, "grad_norm": 6.920931130417418, "language_loss": 0.86377609, "learning_rate": 2.3270357234296918e-06, "loss": 0.88277078, "num_input_tokens_seen": 165745005, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.41040039, "step": 7722, "time_per_iteration": 2.678438425064087 }, { "auxiliary_loss_clip": 0.01462661, "auxiliary_loss_mlp": 0.00437757, "balance_loss_clip": 1.20068121, "balance_loss_mlp": 0.39889449, "epoch": 0.46433188035472717, "flos": 25046184775680.0, "grad_norm": 3954.402422050313, "language_loss": 0.84261507, "learning_rate": 2.3266514975587332e-06, "loss": 0.86161923, "num_input_tokens_seen": 165765750, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.38867188, "step": 7723, "time_per_iteration": 2.7246551513671875 }, { "auxiliary_loss_clip": 0.01444573, "auxiliary_loss_mlp": 0.00375307, "balance_loss_clip": 1.19104326, "balance_loss_mlp": 0.33804268, "epoch": 0.4643920036073952, "flos": 28075644817920.0, "grad_norm": 17.525975957085645, "language_loss": 0.74997866, "learning_rate": 2.326267259301118e-06, "loss": 0.76817745, "num_input_tokens_seen": 165787515, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.37231445, "step": 7724, "time_per_iteration": 2.7139244079589844 }, { "auxiliary_loss_clip": 0.01446711, "auxiliary_loss_mlp": 0.00401582, "balance_loss_clip": 1.19222164, "balance_loss_mlp": 0.36546111, "epoch": 0.46445212686006315, "flos": 18369350000640.0, "grad_norm": 106.840680668386, "language_loss": 0.7598269, "learning_rate": 2.325883008671415e-06, "loss": 0.77830982, "num_input_tokens_seen": 165806675, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.36108398, "step": 7725, "time_per_iteration": 2.6213905811309814 }, { "auxiliary_loss_clip": 0.01426797, "auxiliary_loss_mlp": 0.00376125, "balance_loss_clip": 1.18078482, "balance_loss_mlp": 0.3407203, "epoch": 0.4645122501127311, "flos": 31721618920320.0, "grad_norm": 47.71684316502801, "language_loss": 0.71391416, "learning_rate": 2.3254987456841955e-06, "loss": 0.73194337, "num_input_tokens_seen": 165829835, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.35400391, "step": 7726, "time_per_iteration": 2.7235848903656006 }, { "auxiliary_loss_clip": 0.01452021, "auxiliary_loss_mlp": 0.00396746, "balance_loss_clip": 1.19684517, "balance_loss_mlp": 0.35971934, "epoch": 0.4645723733653991, "flos": 23768806337280.0, "grad_norm": 12.331576323403942, "language_loss": 0.82643354, "learning_rate": 2.3251144703540307e-06, "loss": 0.84492123, "num_input_tokens_seen": 165849380, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.37036133, "step": 7727, "time_per_iteration": 2.691620111465454 }, { "auxiliary_loss_clip": 0.01447815, "auxiliary_loss_mlp": 0.00421724, "balance_loss_clip": 1.19208694, "balance_loss_mlp": 0.38326687, "epoch": 0.46463249661806705, "flos": 33145512935040.0, "grad_norm": 15.991635231022526, "language_loss": 0.84432292, "learning_rate": 2.3247301826954936e-06, "loss": 0.86301839, "num_input_tokens_seen": 165868620, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.38452148, "step": 7728, "time_per_iteration": 2.734421730041504 }, { "auxiliary_loss_clip": 0.01457099, "auxiliary_loss_mlp": 0.00413643, "balance_loss_clip": 1.19759822, "balance_loss_mlp": 0.37354115, "epoch": 0.464692619870735, "flos": 18296020385280.0, "grad_norm": 451.08135187199275, "language_loss": 0.83221209, "learning_rate": 2.324345882723155e-06, "loss": 0.85091949, "num_input_tokens_seen": 165885915, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.40136719, "step": 7729, "time_per_iteration": 2.659691095352173 }, { "auxiliary_loss_clip": 0.01452124, "auxiliary_loss_mlp": 0.00395968, "balance_loss_clip": 1.19600999, "balance_loss_mlp": 0.3563906, "epoch": 0.464752743123403, "flos": 22638051216000.0, "grad_norm": 18.007248942280597, "language_loss": 0.85960639, "learning_rate": 2.323961570451588e-06, "loss": 0.87808728, "num_input_tokens_seen": 165905465, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.39575195, "step": 7730, "time_per_iteration": 2.624544620513916 }, { "auxiliary_loss_clip": 0.01445795, "auxiliary_loss_mlp": 0.00433579, "balance_loss_clip": 1.19232225, "balance_loss_mlp": 0.39416814, "epoch": 0.46481286637607094, "flos": 20412128373120.0, "grad_norm": 9.464239707983618, "language_loss": 0.82211578, "learning_rate": 2.3235772458953655e-06, "loss": 0.84090954, "num_input_tokens_seen": 165924640, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.39379883, "step": 7731, "time_per_iteration": 2.673732280731201 }, { "auxiliary_loss_clip": 0.01436468, "auxiliary_loss_mlp": 0.00390088, "balance_loss_clip": 1.18792927, "balance_loss_mlp": 0.35408688, "epoch": 0.4648729896287389, "flos": 34275406129920.0, "grad_norm": 22.98631780535822, "language_loss": 0.72140205, "learning_rate": 2.323192909069061e-06, "loss": 0.73966759, "num_input_tokens_seen": 165945765, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.36010742, "step": 7732, "time_per_iteration": 2.7896337509155273 }, { "auxiliary_loss_clip": 0.01449157, "auxiliary_loss_mlp": 0.00439776, "balance_loss_clip": 1.18832564, "balance_loss_mlp": 0.39824307, "epoch": 0.4649331128814069, "flos": 21321781326720.0, "grad_norm": 257.32269837062074, "language_loss": 0.83509976, "learning_rate": 2.32280855998725e-06, "loss": 0.85398906, "num_input_tokens_seen": 165964025, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.41552734, "step": 7733, "time_per_iteration": 2.6461408138275146 }, { "auxiliary_loss_clip": 0.01283088, "auxiliary_loss_mlp": 0.00062311, "balance_loss_clip": 1.13068926, "balance_loss_mlp": 0.0530126, "epoch": 0.46499323613407484, "flos": 58308515717760.0, "grad_norm": 1.3314151244074708, "language_loss": 0.51812351, "learning_rate": 2.3224241986645057e-06, "loss": 0.53157747, "num_input_tokens_seen": 166021950, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.09277344, "step": 7734, "time_per_iteration": 3.0830087661743164 }, { "auxiliary_loss_clip": 0.01443613, "auxiliary_loss_mlp": 0.00389514, "balance_loss_clip": 1.18647301, "balance_loss_mlp": 0.35315472, "epoch": 0.4650533593867428, "flos": 10889660384640.0, "grad_norm": 65.08021082848143, "language_loss": 0.80919129, "learning_rate": 2.3220398251154035e-06, "loss": 0.82752252, "num_input_tokens_seen": 166039675, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.36376953, "step": 7735, "time_per_iteration": 2.613586664199829 }, { "auxiliary_loss_clip": 0.01423596, "auxiliary_loss_mlp": 0.00383243, "balance_loss_clip": 1.1765945, "balance_loss_mlp": 0.34831458, "epoch": 0.46511348263941077, "flos": 19974592805760.0, "grad_norm": 10.506670091082793, "language_loss": 0.76958919, "learning_rate": 2.321655439354519e-06, "loss": 0.78765756, "num_input_tokens_seen": 166057745, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.34960938, "step": 7736, "time_per_iteration": 2.667954444885254 }, { "auxiliary_loss_clip": 0.01415825, "auxiliary_loss_mlp": 0.00411051, "balance_loss_clip": 1.17529726, "balance_loss_mlp": 0.37316588, "epoch": 0.46517360589207873, "flos": 19678401256320.0, "grad_norm": 26.984914452782068, "language_loss": 0.76858497, "learning_rate": 2.321271041396427e-06, "loss": 0.78685373, "num_input_tokens_seen": 166076440, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.37890625, "step": 7737, "time_per_iteration": 2.689790964126587 }, { "auxiliary_loss_clip": 0.01445264, "auxiliary_loss_mlp": 0.00417275, "balance_loss_clip": 1.18663716, "balance_loss_mlp": 0.3798196, "epoch": 0.46523372914474675, "flos": 16872665074560.0, "grad_norm": 105.83708657193591, "language_loss": 0.89857507, "learning_rate": 2.3208866312557065e-06, "loss": 0.91720045, "num_input_tokens_seen": 166092520, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.37451172, "step": 7738, "time_per_iteration": 2.6479854583740234 }, { "auxiliary_loss_clip": 0.01273875, "auxiliary_loss_mlp": 0.00105539, "balance_loss_clip": 1.11770606, "balance_loss_mlp": 0.09657423, "epoch": 0.4652938523974147, "flos": 53439138339840.0, "grad_norm": 0.7318792301090837, "language_loss": 0.57337832, "learning_rate": 2.320502208946932e-06, "loss": 0.58717245, "num_input_tokens_seen": 166156285, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.08984375, "step": 7739, "time_per_iteration": 3.2470743656158447 }, { "auxiliary_loss_clip": 0.01435765, "auxiliary_loss_mlp": 0.00439372, "balance_loss_clip": 1.18104994, "balance_loss_mlp": 0.40024751, "epoch": 0.4653539756500827, "flos": 15231296165760.0, "grad_norm": 24.434183408119207, "language_loss": 0.91764903, "learning_rate": 2.3201177744846815e-06, "loss": 0.93640041, "num_input_tokens_seen": 166173455, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.39135742, "step": 7740, "time_per_iteration": 2.7231385707855225 }, { "auxiliary_loss_clip": 0.01424093, "auxiliary_loss_mlp": 0.00394025, "balance_loss_clip": 1.17633057, "balance_loss_mlp": 0.35838109, "epoch": 0.46541409890275065, "flos": 23732249270400.0, "grad_norm": 5.065779651035473, "language_loss": 0.80118906, "learning_rate": 2.3197333278835327e-06, "loss": 0.81937027, "num_input_tokens_seen": 166194370, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.35668945, "step": 7741, "time_per_iteration": 2.6854987144470215 }, { "auxiliary_loss_clip": 0.01445028, "auxiliary_loss_mlp": 0.0040582, "balance_loss_clip": 1.18408227, "balance_loss_mlp": 0.36748213, "epoch": 0.4654742221554186, "flos": 20847329556480.0, "grad_norm": 165.89102817568545, "language_loss": 0.86041266, "learning_rate": 2.319348869158064e-06, "loss": 0.87892115, "num_input_tokens_seen": 166213195, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.38330078, "step": 7742, "time_per_iteration": 4.091065168380737 }, { "auxiliary_loss_clip": 0.01436438, "auxiliary_loss_mlp": 0.00469377, "balance_loss_clip": 1.17718148, "balance_loss_mlp": 0.42450637, "epoch": 0.4655343454080866, "flos": 20704836303360.0, "grad_norm": 82.27316257214919, "language_loss": 0.7820918, "learning_rate": 2.3189643983228555e-06, "loss": 0.80114996, "num_input_tokens_seen": 166231350, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.44897461, "step": 7743, "time_per_iteration": 2.732997417449951 }, { "auxiliary_loss_clip": 0.01441552, "auxiliary_loss_mlp": 0.00392561, "balance_loss_clip": 1.18948913, "balance_loss_mlp": 0.35512969, "epoch": 0.46559446866075455, "flos": 18989850470400.0, "grad_norm": 39.174471569314306, "language_loss": 0.78417504, "learning_rate": 2.318579915392483e-06, "loss": 0.80251616, "num_input_tokens_seen": 166250530, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.37451172, "step": 7744, "time_per_iteration": 2.656733989715576 }, { "auxiliary_loss_clip": 0.01421037, "auxiliary_loss_mlp": 0.00402795, "balance_loss_clip": 1.17566752, "balance_loss_mlp": 0.36619794, "epoch": 0.4656545919134225, "flos": 34496364643200.0, "grad_norm": 3.9794070834128505, "language_loss": 0.8904649, "learning_rate": 2.31819542038153e-06, "loss": 0.90870321, "num_input_tokens_seen": 166272545, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.36572266, "step": 7745, "time_per_iteration": 4.2389068603515625 }, { "auxiliary_loss_clip": 0.01426851, "auxiliary_loss_mlp": 0.00415717, "balance_loss_clip": 1.17638707, "balance_loss_mlp": 0.37675947, "epoch": 0.4657147151660905, "flos": 24310554238080.0, "grad_norm": 6.73518516282038, "language_loss": 0.78965718, "learning_rate": 2.317810913304574e-06, "loss": 0.80808282, "num_input_tokens_seen": 166292135, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.38964844, "step": 7746, "time_per_iteration": 2.7008020877838135 }, { "auxiliary_loss_clip": 0.01413413, "auxiliary_loss_mlp": 0.00401003, "balance_loss_clip": 1.17025185, "balance_loss_mlp": 0.36554989, "epoch": 0.46577483841875844, "flos": 58795139220480.0, "grad_norm": 71.92145163746531, "language_loss": 0.74335587, "learning_rate": 2.3174263941761963e-06, "loss": 0.7615, "num_input_tokens_seen": 166316710, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.35473633, "step": 7747, "time_per_iteration": 4.534693479537964 }, { "auxiliary_loss_clip": 0.0144524, "auxiliary_loss_mlp": 0.00462153, "balance_loss_clip": 1.18941426, "balance_loss_mlp": 0.42000058, "epoch": 0.4658349616714264, "flos": 31321969223040.0, "grad_norm": 27.825016166031137, "language_loss": 0.72859341, "learning_rate": 2.317041863010978e-06, "loss": 0.74766731, "num_input_tokens_seen": 166338535, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.421875, "step": 7748, "time_per_iteration": 2.7369587421417236 }, { "auxiliary_loss_clip": 0.01435746, "auxiliary_loss_mlp": 0.0042408, "balance_loss_clip": 1.17749655, "balance_loss_mlp": 0.38419229, "epoch": 0.46589508492409437, "flos": 14860338456960.0, "grad_norm": 29.82758853779135, "language_loss": 0.72469628, "learning_rate": 2.3166573198235007e-06, "loss": 0.7432946, "num_input_tokens_seen": 166355540, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.39892578, "step": 7749, "time_per_iteration": 2.654207944869995 }, { "auxiliary_loss_clip": 0.01460232, "auxiliary_loss_mlp": 0.00404807, "balance_loss_clip": 1.19717073, "balance_loss_mlp": 0.36618268, "epoch": 0.46595520817676234, "flos": 12895989431040.0, "grad_norm": 21.794750896822336, "language_loss": 0.82635009, "learning_rate": 2.3162727646283456e-06, "loss": 0.84500051, "num_input_tokens_seen": 166372635, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.38623047, "step": 7750, "time_per_iteration": 2.630199670791626 }, { "auxiliary_loss_clip": 0.0144434, "auxiliary_loss_mlp": 0.00431543, "balance_loss_clip": 1.1866082, "balance_loss_mlp": 0.3932054, "epoch": 0.46601533142943036, "flos": 32854169721600.0, "grad_norm": 12.140458804098222, "language_loss": 0.7981205, "learning_rate": 2.3158881974400963e-06, "loss": 0.81687927, "num_input_tokens_seen": 166393175, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.38330078, "step": 7751, "time_per_iteration": 2.7569050788879395 }, { "auxiliary_loss_clip": 0.01458077, "auxiliary_loss_mlp": 0.0043385, "balance_loss_clip": 1.19356179, "balance_loss_mlp": 0.39369982, "epoch": 0.4660754546820983, "flos": 19967517826560.0, "grad_norm": 14.928418572502714, "language_loss": 0.81111604, "learning_rate": 2.3155036182733345e-06, "loss": 0.83003533, "num_input_tokens_seen": 166408630, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.40185547, "step": 7752, "time_per_iteration": 2.631706714630127 }, { "auxiliary_loss_clip": 0.01452043, "auxiliary_loss_mlp": 0.00456673, "balance_loss_clip": 1.1870743, "balance_loss_mlp": 0.41263741, "epoch": 0.4661355779347663, "flos": 26688164215680.0, "grad_norm": 42.264344633679116, "language_loss": 0.76987249, "learning_rate": 2.315119027142644e-06, "loss": 0.78895962, "num_input_tokens_seen": 166428170, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.44067383, "step": 7753, "time_per_iteration": 4.139386177062988 }, { "auxiliary_loss_clip": 0.0144396, "auxiliary_loss_mlp": 0.00396938, "balance_loss_clip": 1.18931925, "balance_loss_mlp": 0.35843322, "epoch": 0.46619570118743425, "flos": 20959442881920.0, "grad_norm": 82.74953852607187, "language_loss": 0.79190958, "learning_rate": 2.3147344240626076e-06, "loss": 0.81031859, "num_input_tokens_seen": 166446705, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.38500977, "step": 7754, "time_per_iteration": 2.659687042236328 }, { "auxiliary_loss_clip": 0.01438029, "auxiliary_loss_mlp": 0.0044216, "balance_loss_clip": 1.18249798, "balance_loss_mlp": 0.40141422, "epoch": 0.4662558244401022, "flos": 24426079355520.0, "grad_norm": 25.26114273646324, "language_loss": 0.84603399, "learning_rate": 2.3143498090478114e-06, "loss": 0.86483592, "num_input_tokens_seen": 166466750, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.40722656, "step": 7755, "time_per_iteration": 2.7454473972320557 }, { "auxiliary_loss_clip": 0.01423442, "auxiliary_loss_mlp": 0.0039272, "balance_loss_clip": 1.1746217, "balance_loss_mlp": 0.35605121, "epoch": 0.4663159476927702, "flos": 20595452411520.0, "grad_norm": 18.080392585663816, "language_loss": 0.78954303, "learning_rate": 2.3139651821128382e-06, "loss": 0.80770463, "num_input_tokens_seen": 166485400, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.36694336, "step": 7756, "time_per_iteration": 2.6340878009796143 }, { "auxiliary_loss_clip": 0.01427502, "auxiliary_loss_mlp": 0.00386744, "balance_loss_clip": 1.17763376, "balance_loss_mlp": 0.35093343, "epoch": 0.46637607094543815, "flos": 25661872823040.0, "grad_norm": 168.03210623792987, "language_loss": 0.83092386, "learning_rate": 2.313580543272274e-06, "loss": 0.84906638, "num_input_tokens_seen": 166505730, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.3581543, "step": 7757, "time_per_iteration": 2.6764800548553467 }, { "auxiliary_loss_clip": 0.01439701, "auxiliary_loss_mlp": 0.00406752, "balance_loss_clip": 1.18584228, "balance_loss_mlp": 0.36874834, "epoch": 0.4664361941981061, "flos": 24273853516800.0, "grad_norm": 20.720160713774675, "language_loss": 0.73450971, "learning_rate": 2.313195892540705e-06, "loss": 0.75297421, "num_input_tokens_seen": 166523770, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.37988281, "step": 7758, "time_per_iteration": 2.6738758087158203 }, { "auxiliary_loss_clip": 0.01426761, "auxiliary_loss_mlp": 0.00391066, "balance_loss_clip": 1.17910528, "balance_loss_mlp": 0.35575616, "epoch": 0.4664963174507741, "flos": 18405871153920.0, "grad_norm": 41.16133747119332, "language_loss": 0.82425374, "learning_rate": 2.3128112299327147e-06, "loss": 0.84243202, "num_input_tokens_seen": 166542935, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.3527832, "step": 7759, "time_per_iteration": 2.675135612487793 }, { "auxiliary_loss_clip": 0.0143591, "auxiliary_loss_mlp": 0.0039434, "balance_loss_clip": 1.18659902, "balance_loss_mlp": 0.35888726, "epoch": 0.46655644070344204, "flos": 22455122227200.0, "grad_norm": 95.18789073871861, "language_loss": 0.83296889, "learning_rate": 2.312426555462893e-06, "loss": 0.85127139, "num_input_tokens_seen": 166563935, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.35473633, "step": 7760, "time_per_iteration": 2.684521198272705 }, { "auxiliary_loss_clip": 0.01420899, "auxiliary_loss_mlp": 0.00395211, "balance_loss_clip": 1.17249632, "balance_loss_mlp": 0.35632476, "epoch": 0.46661656395611, "flos": 13808407731840.0, "grad_norm": 55.21596677400066, "language_loss": 0.80808401, "learning_rate": 2.3120418691458237e-06, "loss": 0.82624507, "num_input_tokens_seen": 166582175, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.38891602, "step": 7761, "time_per_iteration": 2.6639411449432373 }, { "auxiliary_loss_clip": 0.01457817, "auxiliary_loss_mlp": 0.00422589, "balance_loss_clip": 1.19637799, "balance_loss_mlp": 0.38165271, "epoch": 0.466676687208778, "flos": 21652159645440.0, "grad_norm": 72.8816268532683, "language_loss": 0.84562618, "learning_rate": 2.3116571709960956e-06, "loss": 0.86443019, "num_input_tokens_seen": 166601870, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.40966797, "step": 7762, "time_per_iteration": 2.7596004009246826 }, { "auxiliary_loss_clip": 0.01283176, "auxiliary_loss_mlp": 0.00031994, "balance_loss_clip": 1.12419784, "balance_loss_mlp": 0.02550852, "epoch": 0.46673681046144594, "flos": 68534259068160.0, "grad_norm": 0.7793218265170596, "language_loss": 0.59404838, "learning_rate": 2.311272461028297e-06, "loss": 0.60720003, "num_input_tokens_seen": 166668960, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.06494141, "step": 7763, "time_per_iteration": 3.2454962730407715 }, { "auxiliary_loss_clip": 0.01445977, "auxiliary_loss_mlp": 0.00422986, "balance_loss_clip": 1.1834147, "balance_loss_mlp": 0.38364673, "epoch": 0.46679693371411396, "flos": 15814449469440.0, "grad_norm": 5.520071961283715, "language_loss": 0.85127318, "learning_rate": 2.3108877392570146e-06, "loss": 0.86996287, "num_input_tokens_seen": 166686110, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.39331055, "step": 7764, "time_per_iteration": 2.6861584186553955 }, { "auxiliary_loss_clip": 0.0142958, "auxiliary_loss_mlp": 0.00386154, "balance_loss_clip": 1.18193245, "balance_loss_mlp": 0.35122615, "epoch": 0.4668570569667819, "flos": 18514572687360.0, "grad_norm": 43.410744560735786, "language_loss": 0.77407646, "learning_rate": 2.310503005696839e-06, "loss": 0.79223382, "num_input_tokens_seen": 166703930, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.34912109, "step": 7765, "time_per_iteration": 2.656803607940674 }, { "auxiliary_loss_clip": 0.01447154, "auxiliary_loss_mlp": 0.00413662, "balance_loss_clip": 1.18945527, "balance_loss_mlp": 0.37606293, "epoch": 0.4669171802194499, "flos": 19206643006080.0, "grad_norm": 3.034104927725337, "language_loss": 0.84553504, "learning_rate": 2.3101182603623576e-06, "loss": 0.86414313, "num_input_tokens_seen": 166719940, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.37597656, "step": 7766, "time_per_iteration": 2.67101788520813 }, { "auxiliary_loss_clip": 0.01437279, "auxiliary_loss_mlp": 0.0041344, "balance_loss_clip": 1.18542719, "balance_loss_mlp": 0.37586561, "epoch": 0.46697730347211786, "flos": 12276135406080.0, "grad_norm": 195.1313756225157, "language_loss": 0.71576643, "learning_rate": 2.3097335032681607e-06, "loss": 0.73427367, "num_input_tokens_seen": 166738285, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.37573242, "step": 7767, "time_per_iteration": 2.701338768005371 }, { "auxiliary_loss_clip": 0.01441795, "auxiliary_loss_mlp": 0.0040081, "balance_loss_clip": 1.19129992, "balance_loss_mlp": 0.3645463, "epoch": 0.4670374267247858, "flos": 23586739274880.0, "grad_norm": 3.6127496931601932, "language_loss": 0.81263363, "learning_rate": 2.3093487344288393e-06, "loss": 0.83105969, "num_input_tokens_seen": 166758170, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.36254883, "step": 7768, "time_per_iteration": 2.686739921569824 }, { "auxiliary_loss_clip": 0.01438902, "auxiliary_loss_mlp": 0.00368144, "balance_loss_clip": 1.18219829, "balance_loss_mlp": 0.33364481, "epoch": 0.4670975499774538, "flos": 15991093578240.0, "grad_norm": 10.13788530213485, "language_loss": 0.76795882, "learning_rate": 2.308963953858982e-06, "loss": 0.78602928, "num_input_tokens_seen": 166775750, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.3449707, "step": 7769, "time_per_iteration": 2.6551871299743652 }, { "auxiliary_loss_clip": 0.01435051, "auxiliary_loss_mlp": 0.00395389, "balance_loss_clip": 1.18213272, "balance_loss_mlp": 0.3588151, "epoch": 0.46715767323012175, "flos": 15377596260480.0, "grad_norm": 7.485193737442467, "language_loss": 0.87885875, "learning_rate": 2.3085791615731803e-06, "loss": 0.89716315, "num_input_tokens_seen": 166791720, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.36547852, "step": 7770, "time_per_iteration": 2.6290132999420166 }, { "auxiliary_loss_clip": 0.01295873, "auxiliary_loss_mlp": 0.00044487, "balance_loss_clip": 1.1279068, "balance_loss_mlp": 0.03638045, "epoch": 0.4672177964827897, "flos": 60252217401600.0, "grad_norm": 0.7820811156193783, "language_loss": 0.55486512, "learning_rate": 2.3081943575860265e-06, "loss": 0.56826878, "num_input_tokens_seen": 166856360, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.08105469, "step": 7771, "time_per_iteration": 3.2188403606414795 }, { "auxiliary_loss_clip": 0.01441436, "auxiliary_loss_mlp": 0.00398389, "balance_loss_clip": 1.19142199, "balance_loss_mlp": 0.36005154, "epoch": 0.4672779197354577, "flos": 27636134002560.0, "grad_norm": 62.87245783590284, "language_loss": 0.70592052, "learning_rate": 2.3078095419121117e-06, "loss": 0.7243188, "num_input_tokens_seen": 166875925, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.38330078, "step": 7772, "time_per_iteration": 2.7143852710723877 }, { "auxiliary_loss_clip": 0.01438465, "auxiliary_loss_mlp": 0.00385611, "balance_loss_clip": 1.18908381, "balance_loss_mlp": 0.34922856, "epoch": 0.46733804298812565, "flos": 31394257344000.0, "grad_norm": 5.014852761136276, "language_loss": 0.69705582, "learning_rate": 2.3074247145660283e-06, "loss": 0.71529663, "num_input_tokens_seen": 166896520, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.36401367, "step": 7773, "time_per_iteration": 2.7429630756378174 }, { "auxiliary_loss_clip": 0.01437215, "auxiliary_loss_mlp": 0.00401216, "balance_loss_clip": 1.18265319, "balance_loss_mlp": 0.36516708, "epoch": 0.4673981662407936, "flos": 19500607912320.0, "grad_norm": 15.722193378764258, "language_loss": 0.86925161, "learning_rate": 2.3070398755623685e-06, "loss": 0.88763589, "num_input_tokens_seen": 166915370, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.36035156, "step": 7774, "time_per_iteration": 2.6383397579193115 }, { "auxiliary_loss_clip": 0.01434598, "auxiliary_loss_mlp": 0.00379592, "balance_loss_clip": 1.1813693, "balance_loss_mlp": 0.34394801, "epoch": 0.4674582894934616, "flos": 20521835487360.0, "grad_norm": 37.70167498126349, "language_loss": 0.85320711, "learning_rate": 2.306655024915726e-06, "loss": 0.8713491, "num_input_tokens_seen": 166934875, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.35644531, "step": 7775, "time_per_iteration": 2.793421983718872 }, { "auxiliary_loss_clip": 0.01442875, "auxiliary_loss_mlp": 0.00398003, "balance_loss_clip": 1.18668818, "balance_loss_mlp": 0.36064285, "epoch": 0.46751841274612954, "flos": 22090952188800.0, "grad_norm": 45.3408244050632, "language_loss": 0.76118636, "learning_rate": 2.306270162640694e-06, "loss": 0.77959514, "num_input_tokens_seen": 166954285, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.3737793, "step": 7776, "time_per_iteration": 2.697019100189209 }, { "auxiliary_loss_clip": 0.01444888, "auxiliary_loss_mlp": 0.00427284, "balance_loss_clip": 1.19362473, "balance_loss_mlp": 0.39009118, "epoch": 0.46757853599879756, "flos": 26980082046720.0, "grad_norm": 31.159758343867043, "language_loss": 0.77547777, "learning_rate": 2.3058852887518678e-06, "loss": 0.79419947, "num_input_tokens_seen": 166975975, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.37182617, "step": 7777, "time_per_iteration": 2.697828531265259 }, { "auxiliary_loss_clip": 0.01449977, "auxiliary_loss_mlp": 0.00409575, "balance_loss_clip": 1.19199443, "balance_loss_mlp": 0.37283489, "epoch": 0.4676386592514655, "flos": 24134053783680.0, "grad_norm": 5.687839213548536, "language_loss": 0.77466881, "learning_rate": 2.3055004032638394e-06, "loss": 0.79326433, "num_input_tokens_seen": 166996140, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.3671875, "step": 7778, "time_per_iteration": 2.680717945098877 }, { "auxiliary_loss_clip": 0.014587, "auxiliary_loss_mlp": 0.00419169, "balance_loss_clip": 1.19759059, "balance_loss_mlp": 0.38073617, "epoch": 0.4676987825041335, "flos": 25483720343040.0, "grad_norm": 8.852565293322966, "language_loss": 0.79187357, "learning_rate": 2.305115506191206e-06, "loss": 0.81065226, "num_input_tokens_seen": 167016105, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.38427734, "step": 7779, "time_per_iteration": 2.7090423107147217 }, { "auxiliary_loss_clip": 0.01423804, "auxiliary_loss_mlp": 0.00389681, "balance_loss_clip": 1.17500973, "balance_loss_mlp": 0.35551548, "epoch": 0.46775890575680146, "flos": 21945298538880.0, "grad_norm": 10.61429086423236, "language_loss": 0.78287548, "learning_rate": 2.304730597548562e-06, "loss": 0.80101031, "num_input_tokens_seen": 167036185, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.34155273, "step": 7780, "time_per_iteration": 2.683276414871216 }, { "auxiliary_loss_clip": 0.01467227, "auxiliary_loss_mlp": 0.00413456, "balance_loss_clip": 1.20257473, "balance_loss_mlp": 0.37504712, "epoch": 0.4678190290094694, "flos": 25228395492480.0, "grad_norm": 2.0212249296481795, "language_loss": 0.80101168, "learning_rate": 2.3043456773505023e-06, "loss": 0.81981856, "num_input_tokens_seen": 167054515, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.38427734, "step": 7781, "time_per_iteration": 2.789621591567993 }, { "auxiliary_loss_clip": 0.01448525, "auxiliary_loss_mlp": 0.00392187, "balance_loss_clip": 1.19193625, "balance_loss_mlp": 0.35432598, "epoch": 0.4678791522621374, "flos": 32268358811520.0, "grad_norm": 10.961494141560085, "language_loss": 0.69609505, "learning_rate": 2.3039607456116252e-06, "loss": 0.71450222, "num_input_tokens_seen": 167077245, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.37841797, "step": 7782, "time_per_iteration": 2.797297954559326 }, { "auxiliary_loss_clip": 0.01461955, "auxiliary_loss_mlp": 0.00456747, "balance_loss_clip": 1.19847202, "balance_loss_mlp": 0.41731259, "epoch": 0.46793927551480535, "flos": 27046480337280.0, "grad_norm": 29.361322041357795, "language_loss": 0.69158518, "learning_rate": 2.3035758023465254e-06, "loss": 0.71077216, "num_input_tokens_seen": 167097235, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.39428711, "step": 7783, "time_per_iteration": 2.7048096656799316 }, { "auxiliary_loss_clip": 0.01466374, "auxiliary_loss_mlp": 0.00435004, "balance_loss_clip": 1.20336843, "balance_loss_mlp": 0.39189839, "epoch": 0.4679993987674733, "flos": 17457398576640.0, "grad_norm": 23.163898353038107, "language_loss": 0.76820886, "learning_rate": 2.303190847569801e-06, "loss": 0.78722262, "num_input_tokens_seen": 167113155, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.43066406, "step": 7784, "time_per_iteration": 4.176246643066406 }, { "auxiliary_loss_clip": 0.01447183, "auxiliary_loss_mlp": 0.00408092, "balance_loss_clip": 1.19166303, "balance_loss_mlp": 0.37097073, "epoch": 0.4680595220201413, "flos": 17165121609600.0, "grad_norm": 380.9269314595852, "language_loss": 0.91167063, "learning_rate": 2.3028058812960497e-06, "loss": 0.93022335, "num_input_tokens_seen": 167131765, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.37109375, "step": 7785, "time_per_iteration": 2.812243938446045 }, { "auxiliary_loss_clip": 0.01456938, "auxiliary_loss_mlp": 0.00411181, "balance_loss_clip": 1.20009625, "balance_loss_mlp": 0.37150824, "epoch": 0.46811964527280925, "flos": 11327591001600.0, "grad_norm": 127.35502072328538, "language_loss": 0.85427713, "learning_rate": 2.3024209035398678e-06, "loss": 0.8729583, "num_input_tokens_seen": 167149030, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.39697266, "step": 7786, "time_per_iteration": 2.6494979858398438 }, { "auxiliary_loss_clip": 0.01440418, "auxiliary_loss_mlp": 0.00405622, "balance_loss_clip": 1.19006538, "balance_loss_mlp": 0.36981148, "epoch": 0.4681797685254772, "flos": 24278809593600.0, "grad_norm": 49.93888078885759, "language_loss": 0.78925085, "learning_rate": 2.302035914315856e-06, "loss": 0.80771124, "num_input_tokens_seen": 167167375, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.35791016, "step": 7787, "time_per_iteration": 2.7078051567077637 }, { "auxiliary_loss_clip": 0.01461955, "auxiliary_loss_mlp": 0.00423475, "balance_loss_clip": 1.2053175, "balance_loss_mlp": 0.38532811, "epoch": 0.4682398917781452, "flos": 31650372293760.0, "grad_norm": 4.805151459007983, "language_loss": 0.70479012, "learning_rate": 2.3016509136386116e-06, "loss": 0.7236445, "num_input_tokens_seen": 167188065, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.3815918, "step": 7788, "time_per_iteration": 4.26589298248291 }, { "auxiliary_loss_clip": 0.01457247, "auxiliary_loss_mlp": 0.00429288, "balance_loss_clip": 1.20086157, "balance_loss_mlp": 0.391332, "epoch": 0.46830001503081314, "flos": 28110765340800.0, "grad_norm": 6.217519944010808, "language_loss": 0.70294929, "learning_rate": 2.3012659015227343e-06, "loss": 0.72181469, "num_input_tokens_seen": 167209675, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.37988281, "step": 7789, "time_per_iteration": 2.716214895248413 }, { "auxiliary_loss_clip": 0.01288613, "auxiliary_loss_mlp": 0.00147316, "balance_loss_clip": 1.12986171, "balance_loss_mlp": 0.13630083, "epoch": 0.4683601382834811, "flos": 57881718316800.0, "grad_norm": 0.6970365995358444, "language_loss": 0.61444992, "learning_rate": 2.300880877982825e-06, "loss": 0.62880915, "num_input_tokens_seen": 167273940, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.11035156, "step": 7790, "time_per_iteration": 4.664637088775635 }, { "auxiliary_loss_clip": 0.01458709, "auxiliary_loss_mlp": 0.00388147, "balance_loss_clip": 1.20294881, "balance_loss_mlp": 0.35190761, "epoch": 0.46842026153614913, "flos": 21871933009920.0, "grad_norm": 4.415124988310101, "language_loss": 0.83883727, "learning_rate": 2.3004958430334808e-06, "loss": 0.85730588, "num_input_tokens_seen": 167292730, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.36206055, "step": 7791, "time_per_iteration": 2.6695494651794434 }, { "auxiliary_loss_clip": 0.01452593, "auxiliary_loss_mlp": 0.00400854, "balance_loss_clip": 1.19764388, "balance_loss_mlp": 0.36544865, "epoch": 0.4684803847888171, "flos": 24900818434560.0, "grad_norm": 38.048629735113124, "language_loss": 0.81327927, "learning_rate": 2.3001107966893052e-06, "loss": 0.83181369, "num_input_tokens_seen": 167313460, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.35400391, "step": 7792, "time_per_iteration": 2.6945934295654297 }, { "auxiliary_loss_clip": 0.01446917, "auxiliary_loss_mlp": 0.00401942, "balance_loss_clip": 1.19537163, "balance_loss_mlp": 0.36434329, "epoch": 0.46854050804148506, "flos": 26251670142720.0, "grad_norm": 29.73433768821562, "language_loss": 0.71724916, "learning_rate": 2.299725738964898e-06, "loss": 0.7357378, "num_input_tokens_seen": 167335385, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.3762207, "step": 7793, "time_per_iteration": 2.7234768867492676 }, { "auxiliary_loss_clip": 0.0145597, "auxiliary_loss_mlp": 0.00432984, "balance_loss_clip": 1.20012641, "balance_loss_mlp": 0.39393115, "epoch": 0.468600631294153, "flos": 21579799697280.0, "grad_norm": 4.0867378310414315, "language_loss": 0.7821275, "learning_rate": 2.2993406698748607e-06, "loss": 0.80101705, "num_input_tokens_seen": 167353625, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.39013672, "step": 7794, "time_per_iteration": 2.6995718479156494 }, { "auxiliary_loss_clip": 0.01474568, "auxiliary_loss_mlp": 0.00466951, "balance_loss_clip": 1.21319938, "balance_loss_mlp": 0.42732573, "epoch": 0.468660754546821, "flos": 25885632597120.0, "grad_norm": 31.028921765185693, "language_loss": 0.7056154, "learning_rate": 2.2989555894337953e-06, "loss": 0.72503066, "num_input_tokens_seen": 167374565, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.39624023, "step": 7795, "time_per_iteration": 4.149586200714111 }, { "auxiliary_loss_clip": 0.01466171, "auxiliary_loss_mlp": 0.0042924, "balance_loss_clip": 1.20547867, "balance_loss_mlp": 0.39059252, "epoch": 0.46872087779948896, "flos": 35475001666560.0, "grad_norm": 29.374960783805825, "language_loss": 0.7326628, "learning_rate": 2.298570497656304e-06, "loss": 0.75161695, "num_input_tokens_seen": 167395010, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.38671875, "step": 7796, "time_per_iteration": 2.8017351627349854 }, { "auxiliary_loss_clip": 0.01467745, "auxiliary_loss_mlp": 0.00467562, "balance_loss_clip": 1.20361006, "balance_loss_mlp": 0.42710286, "epoch": 0.4687810010521569, "flos": 26396425952640.0, "grad_norm": 10.2453104258466, "language_loss": 0.75886524, "learning_rate": 2.2981853945569894e-06, "loss": 0.77821839, "num_input_tokens_seen": 167415285, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.40454102, "step": 7797, "time_per_iteration": 2.7213380336761475 }, { "auxiliary_loss_clip": 0.01490686, "auxiliary_loss_mlp": 0.00462925, "balance_loss_clip": 1.21982265, "balance_loss_mlp": 0.4194133, "epoch": 0.4688411243048249, "flos": 19972761212160.0, "grad_norm": 14.799809232309157, "language_loss": 0.73591065, "learning_rate": 2.297800280150454e-06, "loss": 0.75544667, "num_input_tokens_seen": 167432405, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.43481445, "step": 7798, "time_per_iteration": 2.6945462226867676 }, { "auxiliary_loss_clip": 0.01278313, "auxiliary_loss_mlp": 0.00113203, "balance_loss_clip": 1.11821961, "balance_loss_mlp": 0.10204468, "epoch": 0.46890124755749285, "flos": 63977015900160.0, "grad_norm": 0.9289118476299877, "language_loss": 0.64162815, "learning_rate": 2.2974151544513033e-06, "loss": 0.65554321, "num_input_tokens_seen": 167499365, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.11181641, "step": 7799, "time_per_iteration": 3.3190603256225586 }, { "auxiliary_loss_clip": 0.01463257, "auxiliary_loss_mlp": 0.00425281, "balance_loss_clip": 1.20286858, "balance_loss_mlp": 0.38541722, "epoch": 0.4689613708101608, "flos": 23768985905280.0, "grad_norm": 2.529461011174724, "language_loss": 0.77506757, "learning_rate": 2.2970300174741395e-06, "loss": 0.79395294, "num_input_tokens_seen": 167520390, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.3984375, "step": 7800, "time_per_iteration": 2.7100372314453125 }, { "auxiliary_loss_clip": 0.01455854, "auxiliary_loss_mlp": 0.00424466, "balance_loss_clip": 1.20162296, "balance_loss_mlp": 0.38789308, "epoch": 0.4690214940628288, "flos": 24788705109120.0, "grad_norm": 25.56981776709766, "language_loss": 0.78713387, "learning_rate": 2.296644869233568e-06, "loss": 0.80593705, "num_input_tokens_seen": 167539865, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.36547852, "step": 7801, "time_per_iteration": 2.7636430263519287 }, { "auxiliary_loss_clip": 0.01479692, "auxiliary_loss_mlp": 0.00427, "balance_loss_clip": 1.20791423, "balance_loss_mlp": 0.38725555, "epoch": 0.46908161731549675, "flos": 18077324428800.0, "grad_norm": 3.9076857218018906, "language_loss": 0.73116112, "learning_rate": 2.2962597097441936e-06, "loss": 0.75022805, "num_input_tokens_seen": 167558190, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.39770508, "step": 7802, "time_per_iteration": 2.6927106380462646 }, { "auxiliary_loss_clip": 0.01467789, "auxiliary_loss_mlp": 0.00414002, "balance_loss_clip": 1.20811868, "balance_loss_mlp": 0.37571204, "epoch": 0.4691417405681647, "flos": 25703350053120.0, "grad_norm": 12.859648919675832, "language_loss": 0.79194784, "learning_rate": 2.2958745390206206e-06, "loss": 0.81076574, "num_input_tokens_seen": 167577685, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.38330078, "step": 7803, "time_per_iteration": 2.7394142150878906 }, { "auxiliary_loss_clip": 0.0146441, "auxiliary_loss_mlp": 0.00436413, "balance_loss_clip": 1.20313883, "balance_loss_mlp": 0.39857548, "epoch": 0.46920186382083273, "flos": 17457039440640.0, "grad_norm": 4.9768416944098055, "language_loss": 0.83656478, "learning_rate": 2.2954893570774558e-06, "loss": 0.855573, "num_input_tokens_seen": 167596390, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.37817383, "step": 7804, "time_per_iteration": 2.717755079269409 }, { "auxiliary_loss_clip": 0.01467098, "auxiliary_loss_mlp": 0.00448371, "balance_loss_clip": 1.21070242, "balance_loss_mlp": 0.40700534, "epoch": 0.4692619870735007, "flos": 20339445202560.0, "grad_norm": 140.2737372182612, "language_loss": 0.83204317, "learning_rate": 2.295104163929305e-06, "loss": 0.85119784, "num_input_tokens_seen": 167614980, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.41357422, "step": 7805, "time_per_iteration": 2.708989381790161 }, { "auxiliary_loss_clip": 0.01479283, "auxiliary_loss_mlp": 0.00452569, "balance_loss_clip": 1.20636785, "balance_loss_mlp": 0.40922433, "epoch": 0.46932211032616866, "flos": 29496558003840.0, "grad_norm": 41.69797523629519, "language_loss": 0.88733947, "learning_rate": 2.2947189595907742e-06, "loss": 0.90665793, "num_input_tokens_seen": 167635895, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.43334961, "step": 7806, "time_per_iteration": 2.8007290363311768 }, { "auxiliary_loss_clip": 0.01471387, "auxiliary_loss_mlp": 0.00428697, "balance_loss_clip": 1.20839, "balance_loss_mlp": 0.39000127, "epoch": 0.4693822335788366, "flos": 36211242735360.0, "grad_norm": 84.22850946247445, "language_loss": 0.84398204, "learning_rate": 2.294333744076472e-06, "loss": 0.86298287, "num_input_tokens_seen": 167657440, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.38720703, "step": 7807, "time_per_iteration": 3.0040881633758545 }, { "auxiliary_loss_clip": 0.01489142, "auxiliary_loss_mlp": 0.00454694, "balance_loss_clip": 1.22140002, "balance_loss_mlp": 0.41463992, "epoch": 0.4694423568315046, "flos": 20338978325760.0, "grad_norm": 59.451023457042595, "language_loss": 0.56449503, "learning_rate": 2.2939485174010035e-06, "loss": 0.58393335, "num_input_tokens_seen": 167675025, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.40063477, "step": 7808, "time_per_iteration": 2.9206793308258057 }, { "auxiliary_loss_clip": 0.01239328, "auxiliary_loss_mlp": 0.0015407, "balance_loss_clip": 1.08474624, "balance_loss_mlp": 0.14438999, "epoch": 0.46950248008417256, "flos": 64326353621760.0, "grad_norm": 0.8740502522665015, "language_loss": 0.57437456, "learning_rate": 2.293563279578978e-06, "loss": 0.58830857, "num_input_tokens_seen": 167729635, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.09667969, "step": 7809, "time_per_iteration": 3.0720913410186768 }, { "auxiliary_loss_clip": 0.01489524, "auxiliary_loss_mlp": 0.00435648, "balance_loss_clip": 1.2147572, "balance_loss_mlp": 0.39585608, "epoch": 0.4695626033368405, "flos": 19200106730880.0, "grad_norm": 41.20605232510803, "language_loss": 0.78639609, "learning_rate": 2.2931780306250045e-06, "loss": 0.80564779, "num_input_tokens_seen": 167745135, "router_z_loss_clip": 2.74804688, "router_z_loss_mlp": 0.39770508, "step": 7810, "time_per_iteration": 2.717860698699951 }, { "auxiliary_loss_clip": 0.01476983, "auxiliary_loss_mlp": 0.00409289, "balance_loss_clip": 1.21154857, "balance_loss_mlp": 0.37273934, "epoch": 0.4696227265895085, "flos": 23002436736000.0, "grad_norm": 6.884402657050895, "language_loss": 0.87954104, "learning_rate": 2.29279277055369e-06, "loss": 0.89840376, "num_input_tokens_seen": 167763875, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.3659668, "step": 7811, "time_per_iteration": 2.684976577758789 }, { "auxiliary_loss_clip": 0.01475807, "auxiliary_loss_mlp": 0.00423594, "balance_loss_clip": 1.21359849, "balance_loss_mlp": 0.38692558, "epoch": 0.46968284984217645, "flos": 21870855601920.0, "grad_norm": 73.86186748629362, "language_loss": 0.85178673, "learning_rate": 2.292407499379644e-06, "loss": 0.87078083, "num_input_tokens_seen": 167784895, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.36669922, "step": 7812, "time_per_iteration": 2.7008771896362305 }, { "auxiliary_loss_clip": 0.01455417, "auxiliary_loss_mlp": 0.00432569, "balance_loss_clip": 1.20173693, "balance_loss_mlp": 0.39265794, "epoch": 0.4697429730948444, "flos": 19974987855360.0, "grad_norm": 96.69028869836833, "language_loss": 0.80835104, "learning_rate": 2.292022217117477e-06, "loss": 0.82723081, "num_input_tokens_seen": 167803185, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.39916992, "step": 7813, "time_per_iteration": 2.647787570953369 }, { "auxiliary_loss_clip": 0.01464689, "auxiliary_loss_mlp": 0.00399455, "balance_loss_clip": 1.20472288, "balance_loss_mlp": 0.36421651, "epoch": 0.4698030963475124, "flos": 15156206784000.0, "grad_norm": 66.36554249266958, "language_loss": 0.89311385, "learning_rate": 2.291636923781798e-06, "loss": 0.91175532, "num_input_tokens_seen": 167816550, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.35253906, "step": 7814, "time_per_iteration": 2.644508123397827 }, { "auxiliary_loss_clip": 0.01446771, "auxiliary_loss_mlp": 0.0040536, "balance_loss_clip": 1.19332361, "balance_loss_mlp": 0.36819023, "epoch": 0.46986321960018035, "flos": 15151178880000.0, "grad_norm": 25.39385611547622, "language_loss": 0.86834002, "learning_rate": 2.291251619387217e-06, "loss": 0.88686126, "num_input_tokens_seen": 167831845, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.37158203, "step": 7815, "time_per_iteration": 2.606029510498047 }, { "auxiliary_loss_clip": 0.014432, "auxiliary_loss_mlp": 0.00413023, "balance_loss_clip": 1.18846178, "balance_loss_mlp": 0.37489983, "epoch": 0.4699233428528483, "flos": 23108911626240.0, "grad_norm": 171.4972963373993, "language_loss": 0.8484109, "learning_rate": 2.2908663039483468e-06, "loss": 0.86697316, "num_input_tokens_seen": 167850360, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.38110352, "step": 7816, "time_per_iteration": 2.6986279487609863 }, { "auxiliary_loss_clip": 0.01230474, "auxiliary_loss_mlp": 0.00110387, "balance_loss_clip": 1.07830179, "balance_loss_mlp": 0.10175604, "epoch": 0.46998346610551633, "flos": 68105558246400.0, "grad_norm": 1.0338634183287236, "language_loss": 0.58002102, "learning_rate": 2.290480977479796e-06, "loss": 0.59342957, "num_input_tokens_seen": 167908660, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.08642578, "step": 7817, "time_per_iteration": 3.224261522293091 }, { "auxiliary_loss_clip": 0.01451094, "auxiliary_loss_mlp": 0.00411507, "balance_loss_clip": 1.20063722, "balance_loss_mlp": 0.3750295, "epoch": 0.4700435893581843, "flos": 24129456842880.0, "grad_norm": 18.55474279317209, "language_loss": 0.85427386, "learning_rate": 2.2900956399961775e-06, "loss": 0.87289989, "num_input_tokens_seen": 167927905, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.36499023, "step": 7818, "time_per_iteration": 2.7036659717559814 }, { "auxiliary_loss_clip": 0.01457641, "auxiliary_loss_mlp": 0.00388552, "balance_loss_clip": 1.20213056, "balance_loss_mlp": 0.35112035, "epoch": 0.47010371261085226, "flos": 20150518642560.0, "grad_norm": 11.470100991415269, "language_loss": 0.89259744, "learning_rate": 2.289710291512104e-06, "loss": 0.91105938, "num_input_tokens_seen": 167945995, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.37402344, "step": 7819, "time_per_iteration": 2.664466381072998 }, { "auxiliary_loss_clip": 0.01470309, "auxiliary_loss_mlp": 0.00435723, "balance_loss_clip": 1.20547223, "balance_loss_mlp": 0.39371318, "epoch": 0.47016383586352023, "flos": 15122199582720.0, "grad_norm": 38.385219327831905, "language_loss": 0.8493849, "learning_rate": 2.289324932042186e-06, "loss": 0.86844522, "num_input_tokens_seen": 167963380, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.42041016, "step": 7820, "time_per_iteration": 2.618699789047241 }, { "auxiliary_loss_clip": 0.01463591, "auxiliary_loss_mlp": 0.00405899, "balance_loss_clip": 1.207775, "balance_loss_mlp": 0.36858615, "epoch": 0.4702239591161882, "flos": 13552975140480.0, "grad_norm": 12.493897816419464, "language_loss": 0.80470532, "learning_rate": 2.288939561601039e-06, "loss": 0.8234002, "num_input_tokens_seen": 167981740, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.37304688, "step": 7821, "time_per_iteration": 2.643613338470459 }, { "auxiliary_loss_clip": 0.01443891, "auxiliary_loss_mlp": 0.00413872, "balance_loss_clip": 1.19509792, "balance_loss_mlp": 0.37601137, "epoch": 0.47028408236885616, "flos": 24276511123200.0, "grad_norm": 4.180727848891327, "language_loss": 0.93171835, "learning_rate": 2.2885541802032746e-06, "loss": 0.95029598, "num_input_tokens_seen": 167999380, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.37866211, "step": 7822, "time_per_iteration": 2.7382562160491943 }, { "auxiliary_loss_clip": 0.01465961, "auxiliary_loss_mlp": 0.00371088, "balance_loss_clip": 1.21037519, "balance_loss_mlp": 0.3367084, "epoch": 0.4703442056215241, "flos": 22856926740480.0, "grad_norm": 3.7481412917293846, "language_loss": 0.8566274, "learning_rate": 2.2881687878635055e-06, "loss": 0.87499791, "num_input_tokens_seen": 168018395, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.34375, "step": 7823, "time_per_iteration": 2.7065749168395996 }, { "auxiliary_loss_clip": 0.01250706, "auxiliary_loss_mlp": 0.00061239, "balance_loss_clip": 1.10268962, "balance_loss_mlp": 0.05236965, "epoch": 0.4704043288741921, "flos": 69240227950080.0, "grad_norm": 0.6825453663908594, "language_loss": 0.56216407, "learning_rate": 2.2877833845963487e-06, "loss": 0.57528353, "num_input_tokens_seen": 168084080, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.08886719, "step": 7824, "time_per_iteration": 3.268352746963501 }, { "auxiliary_loss_clip": 0.01477774, "auxiliary_loss_mlp": 0.00391941, "balance_loss_clip": 1.21764517, "balance_loss_mlp": 0.35589242, "epoch": 0.47046445212686006, "flos": 18041090584320.0, "grad_norm": 15.058240774090676, "language_loss": 0.87436771, "learning_rate": 2.2873979704164157e-06, "loss": 0.89306486, "num_input_tokens_seen": 168101555, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.36035156, "step": 7825, "time_per_iteration": 2.649292230606079 }, { "auxiliary_loss_clip": 0.01475621, "auxiliary_loss_mlp": 0.00413752, "balance_loss_clip": 1.2114749, "balance_loss_mlp": 0.37503269, "epoch": 0.470524575379528, "flos": 23951448017280.0, "grad_norm": 6.581030112453944, "language_loss": 0.74239069, "learning_rate": 2.287012545338324e-06, "loss": 0.76128447, "num_input_tokens_seen": 168121530, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.38720703, "step": 7826, "time_per_iteration": 2.6737594604492188 }, { "auxiliary_loss_clip": 0.01471336, "auxiliary_loss_mlp": 0.00399499, "balance_loss_clip": 1.20991099, "balance_loss_mlp": 0.35961205, "epoch": 0.470584698632196, "flos": 18113558273280.0, "grad_norm": 37.18756326212194, "language_loss": 0.89406043, "learning_rate": 2.2866271093766877e-06, "loss": 0.91276878, "num_input_tokens_seen": 168140335, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.39892578, "step": 7827, "time_per_iteration": 4.03376579284668 }, { "auxiliary_loss_clip": 0.01253844, "auxiliary_loss_mlp": 0.00087277, "balance_loss_clip": 1.10993648, "balance_loss_mlp": 0.07831293, "epoch": 0.47064482188486395, "flos": 57251916224640.0, "grad_norm": 0.8062469004772751, "language_loss": 0.55372816, "learning_rate": 2.286241662546122e-06, "loss": 0.56713939, "num_input_tokens_seen": 168200535, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.08984375, "step": 7828, "time_per_iteration": 3.159149408340454 }, { "auxiliary_loss_clip": 0.01459787, "auxiliary_loss_mlp": 0.00411742, "balance_loss_clip": 1.20705557, "balance_loss_mlp": 0.37264138, "epoch": 0.4707049451375319, "flos": 17895077798400.0, "grad_norm": 6.950037041245772, "language_loss": 0.86779493, "learning_rate": 2.285856204861245e-06, "loss": 0.88651025, "num_input_tokens_seen": 168219610, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.39086914, "step": 7829, "time_per_iteration": 2.627861499786377 }, { "auxiliary_loss_clip": 0.01466039, "auxiliary_loss_mlp": 0.00409418, "balance_loss_clip": 1.2092725, "balance_loss_mlp": 0.3726064, "epoch": 0.47076506839019994, "flos": 25232669210880.0, "grad_norm": 5.50269296828369, "language_loss": 0.79020333, "learning_rate": 2.2854707363366703e-06, "loss": 0.80895787, "num_input_tokens_seen": 168242505, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.36791992, "step": 7830, "time_per_iteration": 4.214278936386108 }, { "auxiliary_loss_clip": 0.01472202, "auxiliary_loss_mlp": 0.00404024, "balance_loss_clip": 1.22101021, "balance_loss_mlp": 0.36821377, "epoch": 0.4708251916428679, "flos": 13479681438720.0, "grad_norm": 8.719441195136728, "language_loss": 0.84859645, "learning_rate": 2.2850852569870177e-06, "loss": 0.86735868, "num_input_tokens_seen": 168260220, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.35839844, "step": 7831, "time_per_iteration": 2.667121171951294 }, { "auxiliary_loss_clip": 0.01493044, "auxiliary_loss_mlp": 0.00483011, "balance_loss_clip": 1.21996951, "balance_loss_mlp": 0.43887937, "epoch": 0.47088531489553587, "flos": 30147833450880.0, "grad_norm": 7.631637862719774, "language_loss": 0.80693394, "learning_rate": 2.2846997668269033e-06, "loss": 0.82669449, "num_input_tokens_seen": 168277360, "router_z_loss_clip": 2.72851562, "router_z_loss_mlp": 0.44140625, "step": 7832, "time_per_iteration": 2.728379487991333 }, { "auxiliary_loss_clip": 0.01453342, "auxiliary_loss_mlp": 0.0037502, "balance_loss_clip": 1.20774412, "balance_loss_mlp": 0.34218991, "epoch": 0.47094543814820383, "flos": 21798280172160.0, "grad_norm": 12.210899881691363, "language_loss": 0.79110014, "learning_rate": 2.2843142658709454e-06, "loss": 0.80938375, "num_input_tokens_seen": 168296605, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.328125, "step": 7833, "time_per_iteration": 4.078873157501221 }, { "auxiliary_loss_clip": 0.01459894, "auxiliary_loss_mlp": 0.00396677, "balance_loss_clip": 1.21137357, "balance_loss_mlp": 0.35931644, "epoch": 0.4710055614008718, "flos": 23003011353600.0, "grad_norm": 43.94572495304055, "language_loss": 0.82248807, "learning_rate": 2.283928754133762e-06, "loss": 0.84105378, "num_input_tokens_seen": 168316205, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.37353516, "step": 7834, "time_per_iteration": 2.6856770515441895 }, { "auxiliary_loss_clip": 0.01477678, "auxiliary_loss_mlp": 0.00401722, "balance_loss_clip": 1.22396827, "balance_loss_mlp": 0.36436194, "epoch": 0.47106568465353976, "flos": 42741346452480.0, "grad_norm": 53.76987002827452, "language_loss": 0.71289814, "learning_rate": 2.283543231629972e-06, "loss": 0.73169219, "num_input_tokens_seen": 168338935, "router_z_loss_clip": 2.5390625, "router_z_loss_mlp": 0.37329102, "step": 7835, "time_per_iteration": 2.837968111038208 }, { "auxiliary_loss_clip": 0.01274619, "auxiliary_loss_mlp": 0.00108789, "balance_loss_clip": 1.12391341, "balance_loss_mlp": 0.09853655, "epoch": 0.4711258079062077, "flos": 68554008570240.0, "grad_norm": 0.8561659686466754, "language_loss": 0.62017202, "learning_rate": 2.283157698374194e-06, "loss": 0.63400614, "num_input_tokens_seen": 168392800, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.10253906, "step": 7836, "time_per_iteration": 3.168658494949341 }, { "auxiliary_loss_clip": 0.01488744, "auxiliary_loss_mlp": 0.00421977, "balance_loss_clip": 1.22162557, "balance_loss_mlp": 0.38237557, "epoch": 0.4711859311588757, "flos": 25446588658560.0, "grad_norm": 4.933600474170642, "language_loss": 0.73476994, "learning_rate": 2.2827721543810475e-06, "loss": 0.75387716, "num_input_tokens_seen": 168412940, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.39599609, "step": 7837, "time_per_iteration": 4.160662889480591 }, { "auxiliary_loss_clip": 0.01475035, "auxiliary_loss_mlp": 0.00409985, "balance_loss_clip": 1.21566963, "balance_loss_mlp": 0.37100333, "epoch": 0.47124605441154366, "flos": 21981891519360.0, "grad_norm": 4.028700044527988, "language_loss": 0.72308433, "learning_rate": 2.282386599665153e-06, "loss": 0.74193448, "num_input_tokens_seen": 168431995, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.38964844, "step": 7838, "time_per_iteration": 2.6805319786071777 }, { "auxiliary_loss_clip": 0.01492023, "auxiliary_loss_mlp": 0.00433186, "balance_loss_clip": 1.22733545, "balance_loss_mlp": 0.39282158, "epoch": 0.4713061776642116, "flos": 25412689198080.0, "grad_norm": 69.69954370379695, "language_loss": 0.84886611, "learning_rate": 2.2820010342411304e-06, "loss": 0.86811817, "num_input_tokens_seen": 168454585, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.40332031, "step": 7839, "time_per_iteration": 2.734173536300659 }, { "auxiliary_loss_clip": 0.01478798, "auxiliary_loss_mlp": 0.00405964, "balance_loss_clip": 1.22485983, "balance_loss_mlp": 0.36917588, "epoch": 0.4713663009168796, "flos": 26542259170560.0, "grad_norm": 175.4179782364303, "language_loss": 0.79092836, "learning_rate": 2.2816154581235993e-06, "loss": 0.80977601, "num_input_tokens_seen": 168471265, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.36791992, "step": 7840, "time_per_iteration": 2.669721841812134 }, { "auxiliary_loss_clip": 0.01469973, "auxiliary_loss_mlp": 0.00425297, "balance_loss_clip": 1.21136975, "balance_loss_mlp": 0.3862682, "epoch": 0.47142642416954755, "flos": 23623583650560.0, "grad_norm": 4.327510380941089, "language_loss": 0.80823547, "learning_rate": 2.2812298713271833e-06, "loss": 0.82718813, "num_input_tokens_seen": 168491360, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.39038086, "step": 7841, "time_per_iteration": 2.686037302017212 }, { "auxiliary_loss_clip": 0.01481939, "auxiliary_loss_mlp": 0.00435994, "balance_loss_clip": 1.22248888, "balance_loss_mlp": 0.3955577, "epoch": 0.4714865474222155, "flos": 22310150935680.0, "grad_norm": 16.181053244316942, "language_loss": 0.76285231, "learning_rate": 2.280844273866501e-06, "loss": 0.7820316, "num_input_tokens_seen": 168511335, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.40429688, "step": 7842, "time_per_iteration": 2.6826794147491455 }, { "auxiliary_loss_clip": 0.01486469, "auxiliary_loss_mlp": 0.00417579, "balance_loss_clip": 1.22961342, "balance_loss_mlp": 0.37874073, "epoch": 0.4715466706748835, "flos": 17822430541440.0, "grad_norm": 5.636944181580286, "language_loss": 0.85315591, "learning_rate": 2.280458665756177e-06, "loss": 0.87219638, "num_input_tokens_seen": 168529920, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.38818359, "step": 7843, "time_per_iteration": 2.667193651199341 }, { "auxiliary_loss_clip": 0.01486405, "auxiliary_loss_mlp": 0.00399791, "balance_loss_clip": 1.22614312, "balance_loss_mlp": 0.36064303, "epoch": 0.4716067939275515, "flos": 23659530186240.0, "grad_norm": 54.57608820045155, "language_loss": 0.79478639, "learning_rate": 2.280073047010832e-06, "loss": 0.81364834, "num_input_tokens_seen": 168550595, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.3918457, "step": 7844, "time_per_iteration": 2.7039074897766113 }, { "auxiliary_loss_clip": 0.0146688, "auxiliary_loss_mlp": 0.00433042, "balance_loss_clip": 1.21186125, "balance_loss_mlp": 0.39267731, "epoch": 0.47166691718021947, "flos": 17930162407680.0, "grad_norm": 6.529173680573744, "language_loss": 0.84033608, "learning_rate": 2.279687417645088e-06, "loss": 0.8593353, "num_input_tokens_seen": 168569765, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.40332031, "step": 7845, "time_per_iteration": 2.6972994804382324 }, { "auxiliary_loss_clip": 0.01476425, "auxiliary_loss_mlp": 0.00418516, "balance_loss_clip": 1.21884727, "balance_loss_mlp": 0.3802734, "epoch": 0.47172704043288743, "flos": 26614583205120.0, "grad_norm": 39.63184368108871, "language_loss": 0.76899159, "learning_rate": 2.2793017776735703e-06, "loss": 0.78794092, "num_input_tokens_seen": 168591525, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.38256836, "step": 7846, "time_per_iteration": 2.7766265869140625 }, { "auxiliary_loss_clip": 0.01471575, "auxiliary_loss_mlp": 0.00390783, "balance_loss_clip": 1.22111511, "balance_loss_mlp": 0.35156298, "epoch": 0.4717871636855554, "flos": 27922700707200.0, "grad_norm": 1.9007371296621496, "language_loss": 0.78903252, "learning_rate": 2.2789161271109e-06, "loss": 0.80765611, "num_input_tokens_seen": 168611235, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.3918457, "step": 7847, "time_per_iteration": 2.7280147075653076 }, { "auxiliary_loss_clip": 0.01471769, "auxiliary_loss_mlp": 0.00418062, "balance_loss_clip": 1.21799815, "balance_loss_mlp": 0.37962919, "epoch": 0.47184728693822336, "flos": 14502237816960.0, "grad_norm": 4.112456381486064, "language_loss": 0.86687797, "learning_rate": 2.278530465971703e-06, "loss": 0.88577628, "num_input_tokens_seen": 168628710, "router_z_loss_clip": 2.5390625, "router_z_loss_mlp": 0.38427734, "step": 7848, "time_per_iteration": 2.6743545532226562 }, { "auxiliary_loss_clip": 0.01493329, "auxiliary_loss_mlp": 0.00430927, "balance_loss_clip": 1.23699069, "balance_loss_mlp": 0.39256537, "epoch": 0.47190741019089133, "flos": 17856545483520.0, "grad_norm": 8.832091236981126, "language_loss": 0.77561486, "learning_rate": 2.2781447942706032e-06, "loss": 0.79485738, "num_input_tokens_seen": 168645645, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.38378906, "step": 7849, "time_per_iteration": 2.6810929775238037 }, { "auxiliary_loss_clip": 0.01495149, "auxiliary_loss_mlp": 0.00446623, "balance_loss_clip": 1.23112261, "balance_loss_mlp": 0.4056623, "epoch": 0.4719675334435593, "flos": 17895472848000.0, "grad_norm": 176.22499556089, "language_loss": 0.76948887, "learning_rate": 2.277759112022224e-06, "loss": 0.78890657, "num_input_tokens_seen": 168664165, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.40942383, "step": 7850, "time_per_iteration": 2.738771438598633 }, { "auxiliary_loss_clip": 0.014916, "auxiliary_loss_mlp": 0.00420819, "balance_loss_clip": 1.23039484, "balance_loss_mlp": 0.38188541, "epoch": 0.47202765669622726, "flos": 20704369426560.0, "grad_norm": 10.58953269120672, "language_loss": 0.79338235, "learning_rate": 2.2773734192411916e-06, "loss": 0.8125065, "num_input_tokens_seen": 168681940, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.38916016, "step": 7851, "time_per_iteration": 2.69732666015625 }, { "auxiliary_loss_clip": 0.0147482, "auxiliary_loss_mlp": 0.00426612, "balance_loss_clip": 1.21645451, "balance_loss_mlp": 0.38672429, "epoch": 0.4720877799488952, "flos": 16360255607040.0, "grad_norm": 5.3629664942877255, "language_loss": 0.82682025, "learning_rate": 2.276987715942132e-06, "loss": 0.84583461, "num_input_tokens_seen": 168698830, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.39868164, "step": 7852, "time_per_iteration": 2.7819268703460693 }, { "auxiliary_loss_clip": 0.01465914, "auxiliary_loss_mlp": 0.00420155, "balance_loss_clip": 1.21467555, "balance_loss_mlp": 0.38339102, "epoch": 0.4721479032015632, "flos": 20668171495680.0, "grad_norm": 57.898719709826224, "language_loss": 0.74537647, "learning_rate": 2.2766020021396696e-06, "loss": 0.76423717, "num_input_tokens_seen": 168718305, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.36767578, "step": 7853, "time_per_iteration": 2.705368995666504 }, { "auxiliary_loss_clip": 0.01257695, "auxiliary_loss_mlp": 0.00103313, "balance_loss_clip": 1.11344647, "balance_loss_mlp": 0.09449109, "epoch": 0.47220802645423116, "flos": 67750438435200.0, "grad_norm": 0.6795151251061062, "language_loss": 0.50047308, "learning_rate": 2.276216277848432e-06, "loss": 0.51408315, "num_input_tokens_seen": 168782365, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.08837891, "step": 7854, "time_per_iteration": 3.308317184448242 }, { "auxiliary_loss_clip": 0.01484936, "auxiliary_loss_mlp": 0.00402665, "balance_loss_clip": 1.22325873, "balance_loss_mlp": 0.3640888, "epoch": 0.4722681497068991, "flos": 20921449271040.0, "grad_norm": 29.389327931366466, "language_loss": 0.70024347, "learning_rate": 2.2758305430830455e-06, "loss": 0.71911943, "num_input_tokens_seen": 168800485, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.38598633, "step": 7855, "time_per_iteration": 2.675990343093872 }, { "auxiliary_loss_clip": 0.01477858, "auxiliary_loss_mlp": 0.0039172, "balance_loss_clip": 1.22028363, "balance_loss_mlp": 0.35374039, "epoch": 0.4723282729595671, "flos": 28293083798400.0, "grad_norm": 40.889235721719835, "language_loss": 0.82916343, "learning_rate": 2.2754447978581376e-06, "loss": 0.8478592, "num_input_tokens_seen": 168818965, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.37988281, "step": 7856, "time_per_iteration": 2.7275588512420654 }, { "auxiliary_loss_clip": 0.01480281, "auxiliary_loss_mlp": 0.0037046, "balance_loss_clip": 1.22765267, "balance_loss_mlp": 0.33495915, "epoch": 0.4723883962122351, "flos": 27125053338240.0, "grad_norm": 5.7225511525833666, "language_loss": 0.80273378, "learning_rate": 2.2750590421883347e-06, "loss": 0.82124114, "num_input_tokens_seen": 168840355, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.35522461, "step": 7857, "time_per_iteration": 2.6991918087005615 }, { "auxiliary_loss_clip": 0.0147373, "auxiliary_loss_mlp": 0.00387299, "balance_loss_clip": 1.21912432, "balance_loss_mlp": 0.35344389, "epoch": 0.47244851946490307, "flos": 31537253387520.0, "grad_norm": 22.685355767284324, "language_loss": 0.69565558, "learning_rate": 2.2746732760882655e-06, "loss": 0.71426588, "num_input_tokens_seen": 168861765, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.33837891, "step": 7858, "time_per_iteration": 2.7661001682281494 }, { "auxiliary_loss_clip": 0.0146903, "auxiliary_loss_mlp": 0.00366781, "balance_loss_clip": 1.21758628, "balance_loss_mlp": 0.330971, "epoch": 0.47250864271757104, "flos": 20886544229760.0, "grad_norm": 49.63542552966917, "language_loss": 0.769741, "learning_rate": 2.2742874995725575e-06, "loss": 0.78809911, "num_input_tokens_seen": 168881310, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.35791016, "step": 7859, "time_per_iteration": 2.7397143840789795 }, { "auxiliary_loss_clip": 0.01482457, "auxiliary_loss_mlp": 0.00422365, "balance_loss_clip": 1.22166216, "balance_loss_mlp": 0.38457549, "epoch": 0.472568765970239, "flos": 20522086882560.0, "grad_norm": 4.712192687214576, "language_loss": 0.67898858, "learning_rate": 2.2739017126558413e-06, "loss": 0.69803685, "num_input_tokens_seen": 168899470, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.37768555, "step": 7860, "time_per_iteration": 2.711091995239258 }, { "auxiliary_loss_clip": 0.01469431, "auxiliary_loss_mlp": 0.00377309, "balance_loss_clip": 1.21424878, "balance_loss_mlp": 0.33994871, "epoch": 0.47262888922290697, "flos": 35805200417280.0, "grad_norm": 8.008743436764433, "language_loss": 0.7905817, "learning_rate": 2.2735159153527445e-06, "loss": 0.80904913, "num_input_tokens_seen": 168921495, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.37353516, "step": 7861, "time_per_iteration": 2.7667899131774902 }, { "auxiliary_loss_clip": 0.01471795, "auxiliary_loss_mlp": 0.00409377, "balance_loss_clip": 1.21785593, "balance_loss_mlp": 0.37225485, "epoch": 0.47268901247557493, "flos": 20667740532480.0, "grad_norm": 4.462786821928621, "language_loss": 0.90868723, "learning_rate": 2.273130107677896e-06, "loss": 0.92749894, "num_input_tokens_seen": 168940515, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.37109375, "step": 7862, "time_per_iteration": 2.6191375255584717 }, { "auxiliary_loss_clip": 0.0146643, "auxiliary_loss_mlp": 0.00398042, "balance_loss_clip": 1.21056795, "balance_loss_mlp": 0.36082506, "epoch": 0.4727491357282429, "flos": 19573291082880.0, "grad_norm": 43.922871871147095, "language_loss": 0.91488743, "learning_rate": 2.272744289645927e-06, "loss": 0.93353212, "num_input_tokens_seen": 168958340, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.37231445, "step": 7863, "time_per_iteration": 2.620615243911743 }, { "auxiliary_loss_clip": 0.01484446, "auxiliary_loss_mlp": 0.00399739, "balance_loss_clip": 1.22743154, "balance_loss_mlp": 0.36192632, "epoch": 0.47280925898091086, "flos": 18217231902720.0, "grad_norm": 4.060608326105938, "language_loss": 0.70900464, "learning_rate": 2.272358461271467e-06, "loss": 0.7278465, "num_input_tokens_seen": 168974850, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.37817383, "step": 7864, "time_per_iteration": 2.6454052925109863 }, { "auxiliary_loss_clip": 0.01476538, "auxiliary_loss_mlp": 0.00414471, "balance_loss_clip": 1.22146332, "balance_loss_mlp": 0.37656289, "epoch": 0.4728693822335788, "flos": 17821820010240.0, "grad_norm": 5.307374737624215, "language_loss": 0.74009812, "learning_rate": 2.271972622569147e-06, "loss": 0.75900817, "num_input_tokens_seen": 168992860, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.37915039, "step": 7865, "time_per_iteration": 2.6863861083984375 }, { "auxiliary_loss_clip": 0.01458723, "auxiliary_loss_mlp": 0.00381993, "balance_loss_clip": 1.21153212, "balance_loss_mlp": 0.34844732, "epoch": 0.4729295054862468, "flos": 20595057361920.0, "grad_norm": 165.0637526046751, "language_loss": 0.79977763, "learning_rate": 2.2715867735535976e-06, "loss": 0.81818485, "num_input_tokens_seen": 169010325, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.33520508, "step": 7866, "time_per_iteration": 2.7064027786254883 }, { "auxiliary_loss_clip": 0.01474948, "auxiliary_loss_mlp": 0.00387891, "balance_loss_clip": 1.21473658, "balance_loss_mlp": 0.35295045, "epoch": 0.47298962873891476, "flos": 23368079232000.0, "grad_norm": 16.76205737918974, "language_loss": 0.88943124, "learning_rate": 2.271200914239451e-06, "loss": 0.9080596, "num_input_tokens_seen": 169029840, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.34899902, "step": 7867, "time_per_iteration": 2.6942427158355713 }, { "auxiliary_loss_clip": 0.01468558, "auxiliary_loss_mlp": 0.00370495, "balance_loss_clip": 1.21683419, "balance_loss_mlp": 0.33623463, "epoch": 0.4730497519915827, "flos": 22052240305920.0, "grad_norm": 4.853846567655911, "language_loss": 0.83692634, "learning_rate": 2.2708150446413385e-06, "loss": 0.85531688, "num_input_tokens_seen": 169049975, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.34301758, "step": 7868, "time_per_iteration": 2.720332622528076 }, { "auxiliary_loss_clip": 0.01482792, "auxiliary_loss_mlp": 0.00409824, "balance_loss_clip": 1.21924555, "balance_loss_mlp": 0.37193894, "epoch": 0.4731098752442507, "flos": 21069724613760.0, "grad_norm": 28.089907517430625, "language_loss": 0.8447206, "learning_rate": 2.2704291647738915e-06, "loss": 0.86364675, "num_input_tokens_seen": 169069540, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.37890625, "step": 7869, "time_per_iteration": 4.063143014907837 }, { "auxiliary_loss_clip": 0.01487535, "auxiliary_loss_mlp": 0.0039011, "balance_loss_clip": 1.22392035, "balance_loss_mlp": 0.35465705, "epoch": 0.4731699984969187, "flos": 22528775064960.0, "grad_norm": 2.947993621596573, "language_loss": 0.78524083, "learning_rate": 2.2700432746517443e-06, "loss": 0.80401731, "num_input_tokens_seen": 169089940, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.35449219, "step": 7870, "time_per_iteration": 2.7365682125091553 }, { "auxiliary_loss_clip": 0.0149173, "auxiliary_loss_mlp": 0.00411584, "balance_loss_clip": 1.22610903, "balance_loss_mlp": 0.37017095, "epoch": 0.4732301217495867, "flos": 24898124914560.0, "grad_norm": 76.98143057527443, "language_loss": 0.86450285, "learning_rate": 2.2696573742895292e-06, "loss": 0.88353598, "num_input_tokens_seen": 169109650, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.4140625, "step": 7871, "time_per_iteration": 2.708550453186035 }, { "auxiliary_loss_clip": 0.01477391, "auxiliary_loss_mlp": 0.00362355, "balance_loss_clip": 1.21727765, "balance_loss_mlp": 0.32678336, "epoch": 0.47329024500225464, "flos": 22784423137920.0, "grad_norm": 7.103895322042403, "language_loss": 0.81741178, "learning_rate": 2.269271463701879e-06, "loss": 0.83580923, "num_input_tokens_seen": 169128990, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.35595703, "step": 7872, "time_per_iteration": 4.114868402481079 }, { "auxiliary_loss_clip": 0.01464803, "auxiliary_loss_mlp": 0.0036278, "balance_loss_clip": 1.20935345, "balance_loss_mlp": 0.3279469, "epoch": 0.4733503682549226, "flos": 38695902220800.0, "grad_norm": 101.55609736435034, "language_loss": 0.72602028, "learning_rate": 2.268885542903428e-06, "loss": 0.74429607, "num_input_tokens_seen": 169154645, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.34851074, "step": 7873, "time_per_iteration": 2.8550407886505127 }, { "auxiliary_loss_clip": 0.0147688, "auxiliary_loss_mlp": 0.00397162, "balance_loss_clip": 1.21590912, "balance_loss_mlp": 0.35706013, "epoch": 0.47341049150759057, "flos": 22966849336320.0, "grad_norm": 4.701281574790249, "language_loss": 0.78564143, "learning_rate": 2.26849961190881e-06, "loss": 0.80438185, "num_input_tokens_seen": 169174995, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.40112305, "step": 7874, "time_per_iteration": 2.6905524730682373 }, { "auxiliary_loss_clip": 0.01475596, "auxiliary_loss_mlp": 0.00364708, "balance_loss_clip": 1.21522915, "balance_loss_mlp": 0.32975596, "epoch": 0.47347061476025853, "flos": 14538471661440.0, "grad_norm": 11.00623129337569, "language_loss": 0.72401243, "learning_rate": 2.26811367073266e-06, "loss": 0.74241543, "num_input_tokens_seen": 169191815, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.34985352, "step": 7875, "time_per_iteration": 4.153144836425781 }, { "auxiliary_loss_clip": 0.01481276, "auxiliary_loss_mlp": 0.00371527, "balance_loss_clip": 1.22219157, "balance_loss_mlp": 0.33602655, "epoch": 0.4735307380129265, "flos": 30263250827520.0, "grad_norm": 17.563394379482737, "language_loss": 0.86922103, "learning_rate": 2.2677277193896125e-06, "loss": 0.88774908, "num_input_tokens_seen": 169210430, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.35498047, "step": 7876, "time_per_iteration": 2.71294903755188 }, { "auxiliary_loss_clip": 0.01470812, "auxiliary_loss_mlp": 0.00424956, "balance_loss_clip": 1.20834184, "balance_loss_mlp": 0.38640368, "epoch": 0.47359086126559446, "flos": 19391044452480.0, "grad_norm": 28.499461959028928, "language_loss": 0.8360014, "learning_rate": 2.267341757894304e-06, "loss": 0.85495913, "num_input_tokens_seen": 169229295, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.38525391, "step": 7877, "time_per_iteration": 2.6243770122528076 }, { "auxiliary_loss_clip": 0.01456611, "auxiliary_loss_mlp": 0.00369598, "balance_loss_clip": 1.20545125, "balance_loss_mlp": 0.33488435, "epoch": 0.47365098451826243, "flos": 21939408708480.0, "grad_norm": 6.351716821320333, "language_loss": 0.75870878, "learning_rate": 2.2669557862613685e-06, "loss": 0.77697086, "num_input_tokens_seen": 169247855, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.34716797, "step": 7878, "time_per_iteration": 2.647332191467285 }, { "auxiliary_loss_clip": 0.01472477, "auxiliary_loss_mlp": 0.00346859, "balance_loss_clip": 1.21629918, "balance_loss_mlp": 0.31040478, "epoch": 0.4737111077709304, "flos": 25845053207040.0, "grad_norm": 28.807219387070965, "language_loss": 0.80309826, "learning_rate": 2.2665698045054425e-06, "loss": 0.82129169, "num_input_tokens_seen": 169268860, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.36450195, "step": 7879, "time_per_iteration": 4.094977617263794 }, { "auxiliary_loss_clip": 0.01242894, "auxiliary_loss_mlp": 0.00105164, "balance_loss_clip": 1.09529841, "balance_loss_mlp": 0.09467403, "epoch": 0.47377123102359836, "flos": 67760886314880.0, "grad_norm": 0.7253507713251136, "language_loss": 0.61110008, "learning_rate": 2.266183812641164e-06, "loss": 0.62458068, "num_input_tokens_seen": 169331855, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10498047, "step": 7880, "time_per_iteration": 3.258176803588867 }, { "auxiliary_loss_clip": 0.01467797, "auxiliary_loss_mlp": 0.00422195, "balance_loss_clip": 1.21101356, "balance_loss_mlp": 0.38132966, "epoch": 0.4738313542762663, "flos": 24315977191680.0, "grad_norm": 353.84450565371293, "language_loss": 0.73660707, "learning_rate": 2.2657978106831675e-06, "loss": 0.75550705, "num_input_tokens_seen": 169352175, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.40820312, "step": 7881, "time_per_iteration": 2.7044827938079834 }, { "auxiliary_loss_clip": 0.01466611, "auxiliary_loss_mlp": 0.00368344, "balance_loss_clip": 1.21204484, "balance_loss_mlp": 0.33444053, "epoch": 0.4738914775289343, "flos": 20705339093760.0, "grad_norm": 86.93845546359643, "language_loss": 0.82333249, "learning_rate": 2.265411798646092e-06, "loss": 0.84168202, "num_input_tokens_seen": 169371215, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.33911133, "step": 7882, "time_per_iteration": 2.716411590576172 }, { "auxiliary_loss_clip": 0.01476462, "auxiliary_loss_mlp": 0.00387565, "balance_loss_clip": 1.21592712, "balance_loss_mlp": 0.35058624, "epoch": 0.4739516007816023, "flos": 25446337263360.0, "grad_norm": 3.483226769158877, "language_loss": 0.81283271, "learning_rate": 2.2650257765445747e-06, "loss": 0.83147299, "num_input_tokens_seen": 169391745, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.36938477, "step": 7883, "time_per_iteration": 2.7514121532440186 }, { "auxiliary_loss_clip": 0.01477343, "auxiliary_loss_mlp": 0.00386993, "balance_loss_clip": 1.21694231, "balance_loss_mlp": 0.34846413, "epoch": 0.4740117240342703, "flos": 19974341410560.0, "grad_norm": 2.8998737707929676, "language_loss": 0.79195082, "learning_rate": 2.2646397443932525e-06, "loss": 0.81059414, "num_input_tokens_seen": 169409845, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.38500977, "step": 7884, "time_per_iteration": 2.7267420291900635 }, { "auxiliary_loss_clip": 0.01482365, "auxiliary_loss_mlp": 0.00413029, "balance_loss_clip": 1.21843553, "balance_loss_mlp": 0.37445247, "epoch": 0.47407184728693824, "flos": 15661146222720.0, "grad_norm": 70.68909952578645, "language_loss": 0.88199836, "learning_rate": 2.2642537022067655e-06, "loss": 0.90095228, "num_input_tokens_seen": 169426085, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.38598633, "step": 7885, "time_per_iteration": 2.644940137863159 }, { "auxiliary_loss_clip": 0.01484634, "auxiliary_loss_mlp": 0.00386858, "balance_loss_clip": 1.22509408, "balance_loss_mlp": 0.34973666, "epoch": 0.4741319705396062, "flos": 18588800142720.0, "grad_norm": 27.34758926470861, "language_loss": 0.80249524, "learning_rate": 2.263867649999751e-06, "loss": 0.82121021, "num_input_tokens_seen": 169444705, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.37109375, "step": 7886, "time_per_iteration": 2.656519889831543 }, { "auxiliary_loss_clip": 0.01493647, "auxiliary_loss_mlp": 0.00427739, "balance_loss_clip": 1.22570968, "balance_loss_mlp": 0.38611135, "epoch": 0.47419209379227417, "flos": 13261093223040.0, "grad_norm": 122.16121340477392, "language_loss": 0.81817299, "learning_rate": 2.263481587786849e-06, "loss": 0.83738685, "num_input_tokens_seen": 169460850, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.41625977, "step": 7887, "time_per_iteration": 2.652818441390991 }, { "auxiliary_loss_clip": 0.01475867, "auxiliary_loss_mlp": 0.00382213, "balance_loss_clip": 1.21978498, "balance_loss_mlp": 0.34466159, "epoch": 0.47425221704494214, "flos": 20044043752320.0, "grad_norm": 43.95953607912842, "language_loss": 0.84145594, "learning_rate": 2.2630955155826993e-06, "loss": 0.86003673, "num_input_tokens_seen": 169478890, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.37597656, "step": 7888, "time_per_iteration": 2.637044668197632 }, { "auxiliary_loss_clip": 0.01492029, "auxiliary_loss_mlp": 0.00366113, "balance_loss_clip": 1.23010421, "balance_loss_mlp": 0.33096975, "epoch": 0.4743123402976101, "flos": 27271892136960.0, "grad_norm": 58.99003992234917, "language_loss": 0.78299844, "learning_rate": 2.2627094334019406e-06, "loss": 0.80157983, "num_input_tokens_seen": 169499690, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.35131836, "step": 7889, "time_per_iteration": 2.7386932373046875 }, { "auxiliary_loss_clip": 0.01244438, "auxiliary_loss_mlp": 0.00101283, "balance_loss_clip": 1.09834599, "balance_loss_mlp": 0.09107919, "epoch": 0.47437246355027807, "flos": 55393970261760.0, "grad_norm": 0.690094214832616, "language_loss": 0.55429256, "learning_rate": 2.262323341259214e-06, "loss": 0.56774974, "num_input_tokens_seen": 169560475, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.10205078, "step": 7890, "time_per_iteration": 3.2382590770721436 }, { "auxiliary_loss_clip": 0.01489811, "auxiliary_loss_mlp": 0.00388655, "balance_loss_clip": 1.22920299, "balance_loss_mlp": 0.35167634, "epoch": 0.47443258680294603, "flos": 23878477537920.0, "grad_norm": 12.263584876649997, "language_loss": 0.72206664, "learning_rate": 2.2619372391691605e-06, "loss": 0.74085128, "num_input_tokens_seen": 169580110, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.36987305, "step": 7891, "time_per_iteration": 2.708894968032837 }, { "auxiliary_loss_clip": 0.01493626, "auxiliary_loss_mlp": 0.00478729, "balance_loss_clip": 1.22594833, "balance_loss_mlp": 0.43378681, "epoch": 0.474492710055614, "flos": 21977761455360.0, "grad_norm": 10.99899324654528, "language_loss": 0.7645576, "learning_rate": 2.26155112714642e-06, "loss": 0.78428113, "num_input_tokens_seen": 169597510, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.44995117, "step": 7892, "time_per_iteration": 2.7353100776672363 }, { "auxiliary_loss_clip": 0.01272626, "auxiliary_loss_mlp": 0.00059128, "balance_loss_clip": 1.12193632, "balance_loss_mlp": 0.05035396, "epoch": 0.47455283330828196, "flos": 62557180122240.0, "grad_norm": 0.800498952198806, "language_loss": 0.58274984, "learning_rate": 2.2611650052056355e-06, "loss": 0.59606737, "num_input_tokens_seen": 169660010, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.08789062, "step": 7893, "time_per_iteration": 3.2548446655273438 }, { "auxiliary_loss_clip": 0.01472416, "auxiliary_loss_mlp": 0.00396363, "balance_loss_clip": 1.22087777, "balance_loss_mlp": 0.35747707, "epoch": 0.47461295656094993, "flos": 12093637380480.0, "grad_norm": 118.86307385584706, "language_loss": 0.85751647, "learning_rate": 2.2607788733614463e-06, "loss": 0.87620425, "num_input_tokens_seen": 169678485, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.38916016, "step": 7894, "time_per_iteration": 2.664578914642334 }, { "auxiliary_loss_clip": 0.01472147, "auxiliary_loss_mlp": 0.00378887, "balance_loss_clip": 1.21942925, "balance_loss_mlp": 0.34305245, "epoch": 0.4746730798136179, "flos": 20884568981760.0, "grad_norm": 113.38159480046279, "language_loss": 0.80353492, "learning_rate": 2.260392731628497e-06, "loss": 0.82204527, "num_input_tokens_seen": 169697335, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.3581543, "step": 7895, "time_per_iteration": 2.6829771995544434 }, { "auxiliary_loss_clip": 0.01474082, "auxiliary_loss_mlp": 0.0040124, "balance_loss_clip": 1.21713948, "balance_loss_mlp": 0.36402297, "epoch": 0.4747332030662859, "flos": 19974808287360.0, "grad_norm": 40.794977006583785, "language_loss": 0.88401151, "learning_rate": 2.260006580021429e-06, "loss": 0.9027648, "num_input_tokens_seen": 169715395, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.37231445, "step": 7896, "time_per_iteration": 2.6753251552581787 }, { "auxiliary_loss_clip": 0.01484214, "auxiliary_loss_mlp": 0.00410251, "balance_loss_clip": 1.23200417, "balance_loss_mlp": 0.37095967, "epoch": 0.4747933263189539, "flos": 16034186920320.0, "grad_norm": 1504.4129373934372, "language_loss": 0.83919775, "learning_rate": 2.259620418554886e-06, "loss": 0.85814238, "num_input_tokens_seen": 169733755, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.39306641, "step": 7897, "time_per_iteration": 2.6341514587402344 }, { "auxiliary_loss_clip": 0.01500749, "auxiliary_loss_mlp": 0.0040536, "balance_loss_clip": 1.23942757, "balance_loss_mlp": 0.36778519, "epoch": 0.47485344957162184, "flos": 13955102876160.0, "grad_norm": 72.06686125419053, "language_loss": 0.73253351, "learning_rate": 2.25923424724351e-06, "loss": 0.7515946, "num_input_tokens_seen": 169751390, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.37597656, "step": 7898, "time_per_iteration": 2.831624984741211 }, { "auxiliary_loss_clip": 0.01489171, "auxiliary_loss_mlp": 0.00400627, "balance_loss_clip": 1.22708058, "balance_loss_mlp": 0.3604297, "epoch": 0.4749135728242898, "flos": 20449080489600.0, "grad_norm": 5.34180848302132, "language_loss": 0.78304559, "learning_rate": 2.258848066101946e-06, "loss": 0.80194354, "num_input_tokens_seen": 169769500, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.40185547, "step": 7899, "time_per_iteration": 2.718524694442749 }, { "auxiliary_loss_clip": 0.01472721, "auxiliary_loss_mlp": 0.00381523, "balance_loss_clip": 1.22012532, "balance_loss_mlp": 0.34523612, "epoch": 0.4749736960769578, "flos": 28949961767040.0, "grad_norm": 14.052509037541208, "language_loss": 0.75778055, "learning_rate": 2.258461875144837e-06, "loss": 0.77632308, "num_input_tokens_seen": 169789215, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.36254883, "step": 7900, "time_per_iteration": 2.693476438522339 }, { "auxiliary_loss_clip": 0.01486713, "auxiliary_loss_mlp": 0.00379623, "balance_loss_clip": 1.23535955, "balance_loss_mlp": 0.34359813, "epoch": 0.47503381932962574, "flos": 31938770592000.0, "grad_norm": 96.54344656173453, "language_loss": 0.76282167, "learning_rate": 2.2580756743868273e-06, "loss": 0.78148502, "num_input_tokens_seen": 169808825, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.36035156, "step": 7901, "time_per_iteration": 2.712613344192505 }, { "auxiliary_loss_clip": 0.01468243, "auxiliary_loss_mlp": 0.00386314, "balance_loss_clip": 1.22053552, "balance_loss_mlp": 0.35031313, "epoch": 0.4750939425822937, "flos": 22127257860480.0, "grad_norm": 9.748858156597224, "language_loss": 0.79609293, "learning_rate": 2.2576894638425636e-06, "loss": 0.8146385, "num_input_tokens_seen": 169827590, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.36010742, "step": 7902, "time_per_iteration": 2.674734354019165 }, { "auxiliary_loss_clip": 0.01466043, "auxiliary_loss_mlp": 0.0038476, "balance_loss_clip": 1.2194283, "balance_loss_mlp": 0.34856835, "epoch": 0.47515406583496167, "flos": 20850094903680.0, "grad_norm": 258.17901454233356, "language_loss": 0.74600583, "learning_rate": 2.257303243526688e-06, "loss": 0.76451385, "num_input_tokens_seen": 169844925, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.36181641, "step": 7903, "time_per_iteration": 2.6800525188446045 }, { "auxiliary_loss_clip": 0.0145941, "auxiliary_loss_mlp": 0.00364207, "balance_loss_clip": 1.21545815, "balance_loss_mlp": 0.33104253, "epoch": 0.47521418908762963, "flos": 17524802448000.0, "grad_norm": 50.27372788755333, "language_loss": 0.77490342, "learning_rate": 2.256917013453848e-06, "loss": 0.79313958, "num_input_tokens_seen": 169862705, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.33154297, "step": 7904, "time_per_iteration": 2.678912878036499 }, { "auxiliary_loss_clip": 0.01471696, "auxiliary_loss_mlp": 0.00361809, "balance_loss_clip": 1.22706723, "balance_loss_mlp": 0.32943213, "epoch": 0.4752743123402976, "flos": 20559434048640.0, "grad_norm": 657.9215529698724, "language_loss": 0.90963525, "learning_rate": 2.25653077363869e-06, "loss": 0.92797029, "num_input_tokens_seen": 169880155, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.32373047, "step": 7905, "time_per_iteration": 2.6729140281677246 }, { "auxiliary_loss_clip": 0.01443034, "auxiliary_loss_mlp": 0.00356994, "balance_loss_clip": 1.20702004, "balance_loss_mlp": 0.32404417, "epoch": 0.47533443559296557, "flos": 26360623071360.0, "grad_norm": 42.23322761214558, "language_loss": 0.87223613, "learning_rate": 2.2561445240958583e-06, "loss": 0.89023638, "num_input_tokens_seen": 169901525, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.32958984, "step": 7906, "time_per_iteration": 2.742083787918091 }, { "auxiliary_loss_clip": 0.01267838, "auxiliary_loss_mlp": 0.00086478, "balance_loss_clip": 1.11801922, "balance_loss_mlp": 0.07808595, "epoch": 0.47539455884563353, "flos": 65949660967680.0, "grad_norm": 0.6540075695856886, "language_loss": 0.58837897, "learning_rate": 2.255758264840002e-06, "loss": 0.60192209, "num_input_tokens_seen": 169970345, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.08398438, "step": 7907, "time_per_iteration": 3.280851364135742 }, { "auxiliary_loss_clip": 0.01465906, "auxiliary_loss_mlp": 0.00377137, "balance_loss_clip": 1.21743774, "balance_loss_mlp": 0.34175602, "epoch": 0.4754546820983015, "flos": 17238128002560.0, "grad_norm": 25.93361595916039, "language_loss": 0.84792078, "learning_rate": 2.255371995885765e-06, "loss": 0.86635119, "num_input_tokens_seen": 169986440, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.35375977, "step": 7908, "time_per_iteration": 2.6822829246520996 }, { "auxiliary_loss_clip": 0.01480703, "auxiliary_loss_mlp": 0.00369246, "balance_loss_clip": 1.22524738, "balance_loss_mlp": 0.33393627, "epoch": 0.47551480535096946, "flos": 19825886499840.0, "grad_norm": 17.59412698468543, "language_loss": 0.79364657, "learning_rate": 2.254985717247797e-06, "loss": 0.81214607, "num_input_tokens_seen": 170005705, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.35302734, "step": 7909, "time_per_iteration": 2.733083486557007 }, { "auxiliary_loss_clip": 0.01470725, "auxiliary_loss_mlp": 0.00390542, "balance_loss_clip": 1.21751535, "balance_loss_mlp": 0.35444519, "epoch": 0.4755749286036375, "flos": 22163958581760.0, "grad_norm": 7074.607589873091, "language_loss": 0.81190681, "learning_rate": 2.2545994289407457e-06, "loss": 0.83051944, "num_input_tokens_seen": 170023415, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.36108398, "step": 7910, "time_per_iteration": 2.693406105041504 }, { "auxiliary_loss_clip": 0.01463949, "auxiliary_loss_mlp": 0.00345074, "balance_loss_clip": 1.2159642, "balance_loss_mlp": 0.31005013, "epoch": 0.47563505185630545, "flos": 21648280976640.0, "grad_norm": 6.990605563407297, "language_loss": 0.84091151, "learning_rate": 2.2542131309792577e-06, "loss": 0.85900176, "num_input_tokens_seen": 170042395, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.34985352, "step": 7911, "time_per_iteration": 4.1321141719818115 }, { "auxiliary_loss_clip": 0.0148042, "auxiliary_loss_mlp": 0.00373032, "balance_loss_clip": 1.22529781, "balance_loss_mlp": 0.33655423, "epoch": 0.4756951751089734, "flos": 20628777254400.0, "grad_norm": 146.46833913942828, "language_loss": 0.81752974, "learning_rate": 2.253826823377983e-06, "loss": 0.83606422, "num_input_tokens_seen": 170061610, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.36523438, "step": 7912, "time_per_iteration": 2.6414685249328613 }, { "auxiliary_loss_clip": 0.01491111, "auxiliary_loss_mlp": 0.0037147, "balance_loss_clip": 1.23660326, "balance_loss_mlp": 0.3346819, "epoch": 0.4757552983616414, "flos": 25848788221440.0, "grad_norm": 5.178928264074438, "language_loss": 0.80240464, "learning_rate": 2.253440506151569e-06, "loss": 0.8210305, "num_input_tokens_seen": 170083505, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.36767578, "step": 7913, "time_per_iteration": 2.7679078578948975 }, { "auxiliary_loss_clip": 0.01478411, "auxiliary_loss_mlp": 0.00358001, "balance_loss_clip": 1.22664988, "balance_loss_mlp": 0.32202399, "epoch": 0.47581542161430934, "flos": 18223013992320.0, "grad_norm": 11.049759794618758, "language_loss": 0.77667868, "learning_rate": 2.253054179314666e-06, "loss": 0.79504281, "num_input_tokens_seen": 170100690, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.35961914, "step": 7914, "time_per_iteration": 4.0894153118133545 }, { "auxiliary_loss_clip": 0.01479824, "auxiliary_loss_mlp": 0.00400683, "balance_loss_clip": 1.22530723, "balance_loss_mlp": 0.36134365, "epoch": 0.4758755448669773, "flos": 21579763783680.0, "grad_norm": 553.1658735214427, "language_loss": 0.69909495, "learning_rate": 2.2526678428819227e-06, "loss": 0.71790004, "num_input_tokens_seen": 170119240, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.39355469, "step": 7915, "time_per_iteration": 2.7089927196502686 }, { "auxiliary_loss_clip": 0.01455827, "auxiliary_loss_mlp": 0.00337098, "balance_loss_clip": 1.21302354, "balance_loss_mlp": 0.30083406, "epoch": 0.47593566811964527, "flos": 15231152511360.0, "grad_norm": 135.59031539480767, "language_loss": 0.82472551, "learning_rate": 2.2522814968679896e-06, "loss": 0.84265471, "num_input_tokens_seen": 170136450, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.36254883, "step": 7916, "time_per_iteration": 2.6232852935791016 }, { "auxiliary_loss_clip": 0.01461868, "auxiliary_loss_mlp": 0.00383478, "balance_loss_clip": 1.21604502, "balance_loss_mlp": 0.34704816, "epoch": 0.47599579137231324, "flos": 21543242630400.0, "grad_norm": 9.129893210784518, "language_loss": 0.70102048, "learning_rate": 2.2518951412875173e-06, "loss": 0.7194739, "num_input_tokens_seen": 170155295, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.36401367, "step": 7917, "time_per_iteration": 4.030153751373291 }, { "auxiliary_loss_clip": 0.01261172, "auxiliary_loss_mlp": 0.00104846, "balance_loss_clip": 1.11885035, "balance_loss_mlp": 0.0960724, "epoch": 0.4760559146249812, "flos": 64554602595840.0, "grad_norm": 0.8197203996311756, "language_loss": 0.64950013, "learning_rate": 2.2515087761551557e-06, "loss": 0.66316032, "num_input_tokens_seen": 170222325, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.08789062, "step": 7918, "time_per_iteration": 3.18410325050354 }, { "auxiliary_loss_clip": 0.01474058, "auxiliary_loss_mlp": 0.0035625, "balance_loss_clip": 1.22990561, "balance_loss_mlp": 0.3208214, "epoch": 0.47611603787764917, "flos": 22233876405120.0, "grad_norm": 106.93343411245026, "language_loss": 0.74576205, "learning_rate": 2.2511224014855563e-06, "loss": 0.76406515, "num_input_tokens_seen": 170241625, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.35449219, "step": 7919, "time_per_iteration": 2.6736016273498535 }, { "auxiliary_loss_clip": 0.01476998, "auxiliary_loss_mlp": 0.00374717, "balance_loss_clip": 1.22918916, "balance_loss_mlp": 0.34071854, "epoch": 0.47617616113031713, "flos": 22780005765120.0, "grad_norm": 15.085404893571033, "language_loss": 0.79300988, "learning_rate": 2.2507360172933694e-06, "loss": 0.81152707, "num_input_tokens_seen": 170262470, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.34008789, "step": 7920, "time_per_iteration": 2.80709171295166 }, { "auxiliary_loss_clip": 0.01499418, "auxiliary_loss_mlp": 0.00368197, "balance_loss_clip": 1.23724198, "balance_loss_mlp": 0.33193356, "epoch": 0.4762362843829851, "flos": 24133802388480.0, "grad_norm": 36.72210018245196, "language_loss": 0.83454841, "learning_rate": 2.2503496235932487e-06, "loss": 0.85322458, "num_input_tokens_seen": 170283460, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.36279297, "step": 7921, "time_per_iteration": 4.1018311977386475 }, { "auxiliary_loss_clip": 0.0149282, "auxiliary_loss_mlp": 0.0040209, "balance_loss_clip": 1.23331892, "balance_loss_mlp": 0.36177331, "epoch": 0.47629640763565306, "flos": 22452069571200.0, "grad_norm": 30.339427834161786, "language_loss": 0.8405937, "learning_rate": 2.249963220399845e-06, "loss": 0.85954285, "num_input_tokens_seen": 170304225, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.40283203, "step": 7922, "time_per_iteration": 2.7084150314331055 }, { "auxiliary_loss_clip": 0.01492085, "auxiliary_loss_mlp": 0.00415778, "balance_loss_clip": 1.23042154, "balance_loss_mlp": 0.37560427, "epoch": 0.4763565308883211, "flos": 11181398647680.0, "grad_norm": 31.207788877377848, "language_loss": 0.78718686, "learning_rate": 2.2495768077278104e-06, "loss": 0.80626547, "num_input_tokens_seen": 170322110, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.40161133, "step": 7923, "time_per_iteration": 2.6297779083251953 }, { "auxiliary_loss_clip": 0.01501033, "auxiliary_loss_mlp": 0.00374023, "balance_loss_clip": 1.23973012, "balance_loss_mlp": 0.34019113, "epoch": 0.47641665414098905, "flos": 22382151747840.0, "grad_norm": 75.71767587047846, "language_loss": 0.89364773, "learning_rate": 2.2491903855917992e-06, "loss": 0.91239834, "num_input_tokens_seen": 170340700, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.33837891, "step": 7924, "time_per_iteration": 2.670773506164551 }, { "auxiliary_loss_clip": 0.01503854, "auxiliary_loss_mlp": 0.00381872, "balance_loss_clip": 1.24276686, "balance_loss_mlp": 0.34250897, "epoch": 0.476476777393657, "flos": 25046148862080.0, "grad_norm": 5.236339153159506, "language_loss": 0.87515265, "learning_rate": 2.2488039540064626e-06, "loss": 0.89400989, "num_input_tokens_seen": 170359780, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.39355469, "step": 7925, "time_per_iteration": 2.7091190814971924 }, { "auxiliary_loss_clip": 0.01490043, "auxiliary_loss_mlp": 0.0037128, "balance_loss_clip": 1.22908115, "balance_loss_mlp": 0.33513576, "epoch": 0.476536900646325, "flos": 27269916888960.0, "grad_norm": 42.702892460510704, "language_loss": 0.77174425, "learning_rate": 2.2484175129864558e-06, "loss": 0.79035747, "num_input_tokens_seen": 170381260, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.36132812, "step": 7926, "time_per_iteration": 2.7371368408203125 }, { "auxiliary_loss_clip": 0.01513637, "auxiliary_loss_mlp": 0.00368293, "balance_loss_clip": 1.249681, "balance_loss_mlp": 0.33191052, "epoch": 0.47659702389899294, "flos": 25301401885440.0, "grad_norm": 672.1553513316936, "language_loss": 0.75502086, "learning_rate": 2.248031062546432e-06, "loss": 0.77384019, "num_input_tokens_seen": 170400595, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.36376953, "step": 7927, "time_per_iteration": 2.6901485919952393 }, { "auxiliary_loss_clip": 0.01498566, "auxiliary_loss_mlp": 0.00331157, "balance_loss_clip": 1.2448318, "balance_loss_mlp": 0.29878008, "epoch": 0.4766571471516609, "flos": 25992861672960.0, "grad_norm": 410.84781813548375, "language_loss": 0.74572927, "learning_rate": 2.247644602701045e-06, "loss": 0.76402646, "num_input_tokens_seen": 170421110, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.32373047, "step": 7928, "time_per_iteration": 2.7124111652374268 }, { "auxiliary_loss_clip": 0.01497448, "auxiliary_loss_mlp": 0.00386726, "balance_loss_clip": 1.23563075, "balance_loss_mlp": 0.34748268, "epoch": 0.4767172704043289, "flos": 16032211672320.0, "grad_norm": 124.59590960505045, "language_loss": 0.84820604, "learning_rate": 2.2472581334649496e-06, "loss": 0.86704779, "num_input_tokens_seen": 170436700, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.39257812, "step": 7929, "time_per_iteration": 2.6007707118988037 }, { "auxiliary_loss_clip": 0.01490446, "auxiliary_loss_mlp": 0.00353954, "balance_loss_clip": 1.23745513, "balance_loss_mlp": 0.32050419, "epoch": 0.47677739365699684, "flos": 39235351651200.0, "grad_norm": 7.0184957486387605, "language_loss": 0.71237284, "learning_rate": 2.2468716548528016e-06, "loss": 0.73081684, "num_input_tokens_seen": 170459555, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.3347168, "step": 7930, "time_per_iteration": 2.854783296585083 }, { "auxiliary_loss_clip": 0.01488165, "auxiliary_loss_mlp": 0.00350084, "balance_loss_clip": 1.23336697, "balance_loss_mlp": 0.31545329, "epoch": 0.4768375169096648, "flos": 24717781704960.0, "grad_norm": 10.687306088033697, "language_loss": 0.84141386, "learning_rate": 2.2464851668792555e-06, "loss": 0.85979629, "num_input_tokens_seen": 170479175, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.34655762, "step": 7931, "time_per_iteration": 2.672152042388916 }, { "auxiliary_loss_clip": 0.01498129, "auxiliary_loss_mlp": 0.00366364, "balance_loss_clip": 1.23591363, "balance_loss_mlp": 0.32981449, "epoch": 0.47689764016233277, "flos": 22528667324160.0, "grad_norm": 91.15972405894814, "language_loss": 0.84215975, "learning_rate": 2.2460986695589678e-06, "loss": 0.86080468, "num_input_tokens_seen": 170498450, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.36572266, "step": 7932, "time_per_iteration": 2.694103479385376 }, { "auxiliary_loss_clip": 0.01483787, "auxiliary_loss_mlp": 0.00366095, "balance_loss_clip": 1.22651112, "balance_loss_mlp": 0.33064198, "epoch": 0.47695776341500074, "flos": 15120619384320.0, "grad_norm": 17.58606081139129, "language_loss": 0.86514056, "learning_rate": 2.245712162906593e-06, "loss": 0.8836394, "num_input_tokens_seen": 170516255, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.35473633, "step": 7933, "time_per_iteration": 2.668574810028076 }, { "auxiliary_loss_clip": 0.01500405, "auxiliary_loss_mlp": 0.00392038, "balance_loss_clip": 1.23420811, "balance_loss_mlp": 0.35219878, "epoch": 0.4770178866676687, "flos": 14678917839360.0, "grad_norm": 981.2993484434162, "language_loss": 0.81326938, "learning_rate": 2.2453256469367888e-06, "loss": 0.83219379, "num_input_tokens_seen": 170532705, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.39819336, "step": 7934, "time_per_iteration": 2.7863800525665283 }, { "auxiliary_loss_clip": 0.01502284, "auxiliary_loss_mlp": 0.00393471, "balance_loss_clip": 1.23707008, "balance_loss_mlp": 0.35582522, "epoch": 0.47707800992033667, "flos": 22565583527040.0, "grad_norm": 23.92220183536478, "language_loss": 0.85460246, "learning_rate": 2.244939121664211e-06, "loss": 0.87356007, "num_input_tokens_seen": 170551925, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.37670898, "step": 7935, "time_per_iteration": 2.702028751373291 }, { "auxiliary_loss_clip": 0.01515232, "auxiliary_loss_mlp": 0.00375834, "balance_loss_clip": 1.24704742, "balance_loss_mlp": 0.33666167, "epoch": 0.4771381331730047, "flos": 30918225375360.0, "grad_norm": 7.554014315246816, "language_loss": 0.77676988, "learning_rate": 2.2445525871035177e-06, "loss": 0.79568052, "num_input_tokens_seen": 170572320, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.39160156, "step": 7936, "time_per_iteration": 2.719099760055542 }, { "auxiliary_loss_clip": 0.01491859, "auxiliary_loss_mlp": 0.0040647, "balance_loss_clip": 1.22910559, "balance_loss_mlp": 0.36782259, "epoch": 0.47719825642567265, "flos": 25738901539200.0, "grad_norm": 9.704111808801265, "language_loss": 0.75002253, "learning_rate": 2.2441660432693656e-06, "loss": 0.76900584, "num_input_tokens_seen": 170589470, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.38647461, "step": 7937, "time_per_iteration": 2.7146353721618652 }, { "auxiliary_loss_clip": 0.01236888, "auxiliary_loss_mlp": 0.00073228, "balance_loss_clip": 1.0968889, "balance_loss_mlp": 0.06383431, "epoch": 0.4772583796783406, "flos": 66355128668160.0, "grad_norm": 0.7018780505163268, "language_loss": 0.5606485, "learning_rate": 2.2437794901764128e-06, "loss": 0.57374966, "num_input_tokens_seen": 170662265, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09375, "step": 7938, "time_per_iteration": 3.3182828426361084 }, { "auxiliary_loss_clip": 0.01490248, "auxiliary_loss_mlp": 0.0035985, "balance_loss_clip": 1.23306453, "balance_loss_mlp": 0.32377684, "epoch": 0.4773185029310086, "flos": 22051091070720.0, "grad_norm": 18.527484706398152, "language_loss": 0.95756525, "learning_rate": 2.243392927839317e-06, "loss": 0.97606623, "num_input_tokens_seen": 170679680, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.36083984, "step": 7939, "time_per_iteration": 2.664494037628174 }, { "auxiliary_loss_clip": 0.01487499, "auxiliary_loss_mlp": 0.00340931, "balance_loss_clip": 1.22902441, "balance_loss_mlp": 0.30786252, "epoch": 0.47737862618367655, "flos": 16727801523840.0, "grad_norm": 3.9361390160149217, "language_loss": 0.84882951, "learning_rate": 2.2430063562727367e-06, "loss": 0.86711377, "num_input_tokens_seen": 170697340, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.33056641, "step": 7940, "time_per_iteration": 2.607713460922241 }, { "auxiliary_loss_clip": 0.01484236, "auxiliary_loss_mlp": 0.00371675, "balance_loss_clip": 1.2251482, "balance_loss_mlp": 0.33531615, "epoch": 0.4774387494363445, "flos": 19609453100160.0, "grad_norm": 2144.2318750676545, "language_loss": 0.8986693, "learning_rate": 2.2426197754913322e-06, "loss": 0.91722846, "num_input_tokens_seen": 170714905, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.36328125, "step": 7941, "time_per_iteration": 2.6414597034454346 }, { "auxiliary_loss_clip": 0.01496489, "auxiliary_loss_mlp": 0.00353551, "balance_loss_clip": 1.23057866, "balance_loss_mlp": 0.31752548, "epoch": 0.4774988726890125, "flos": 16653969118080.0, "grad_norm": 38.70923880914926, "language_loss": 0.83684224, "learning_rate": 2.24223318550976e-06, "loss": 0.85534263, "num_input_tokens_seen": 170731810, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.36035156, "step": 7942, "time_per_iteration": 2.619664192199707 }, { "auxiliary_loss_clip": 0.01500239, "auxiliary_loss_mlp": 0.00371806, "balance_loss_clip": 1.23530698, "balance_loss_mlp": 0.33582875, "epoch": 0.47755899594168044, "flos": 20485565729280.0, "grad_norm": 19.641876686318152, "language_loss": 0.6990037, "learning_rate": 2.241846586342682e-06, "loss": 0.7177242, "num_input_tokens_seen": 170750270, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.35986328, "step": 7943, "time_per_iteration": 2.630659580230713 }, { "auxiliary_loss_clip": 0.01508793, "auxiliary_loss_mlp": 0.0035668, "balance_loss_clip": 1.24239111, "balance_loss_mlp": 0.31979671, "epoch": 0.4776191191943484, "flos": 21652806090240.0, "grad_norm": 59.4367468103365, "language_loss": 0.81804383, "learning_rate": 2.2414599780047577e-06, "loss": 0.83669853, "num_input_tokens_seen": 170769015, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.36914062, "step": 7944, "time_per_iteration": 2.7299654483795166 }, { "auxiliary_loss_clip": 0.01500237, "auxiliary_loss_mlp": 0.0040927, "balance_loss_clip": 1.23628187, "balance_loss_mlp": 0.36902514, "epoch": 0.4776792424470164, "flos": 18770220760320.0, "grad_norm": 7.545380646945023, "language_loss": 0.75181776, "learning_rate": 2.2410733605106456e-06, "loss": 0.77091283, "num_input_tokens_seen": 170785725, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.40234375, "step": 7945, "time_per_iteration": 2.754721164703369 }, { "auxiliary_loss_clip": 0.0150314, "auxiliary_loss_mlp": 0.00363074, "balance_loss_clip": 1.24054384, "balance_loss_mlp": 0.32630998, "epoch": 0.47773936569968434, "flos": 29715828577920.0, "grad_norm": 7.611451676541132, "language_loss": 0.80280876, "learning_rate": 2.240686733875009e-06, "loss": 0.82147086, "num_input_tokens_seen": 170804600, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.36791992, "step": 7946, "time_per_iteration": 2.781416177749634 }, { "auxiliary_loss_clip": 0.01511077, "auxiliary_loss_mlp": 0.00401379, "balance_loss_clip": 1.24266219, "balance_loss_mlp": 0.362755, "epoch": 0.4777994889523523, "flos": 24791542283520.0, "grad_norm": 23.57957234281433, "language_loss": 0.85689759, "learning_rate": 2.240300098112506e-06, "loss": 0.8760221, "num_input_tokens_seen": 170824230, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.38647461, "step": 7947, "time_per_iteration": 2.817075729370117 }, { "auxiliary_loss_clip": 0.01514774, "auxiliary_loss_mlp": 0.00340227, "balance_loss_clip": 1.25310779, "balance_loss_mlp": 0.30670568, "epoch": 0.47785961220502027, "flos": 17858161595520.0, "grad_norm": 3.7599289472954873, "language_loss": 0.81924242, "learning_rate": 2.2399134532377998e-06, "loss": 0.83779246, "num_input_tokens_seen": 170843365, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.33520508, "step": 7948, "time_per_iteration": 2.659876823425293 }, { "auxiliary_loss_clip": 0.01508879, "auxiliary_loss_mlp": 0.00394177, "balance_loss_clip": 1.24285066, "balance_loss_mlp": 0.35758024, "epoch": 0.4779197354576883, "flos": 20266546550400.0, "grad_norm": 25.641523977154474, "language_loss": 0.83621192, "learning_rate": 2.2395267992655514e-06, "loss": 0.85524249, "num_input_tokens_seen": 170863515, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.36572266, "step": 7949, "time_per_iteration": 2.657289743423462 }, { "auxiliary_loss_clip": 0.01506469, "auxiliary_loss_mlp": 0.00370476, "balance_loss_clip": 1.24033356, "balance_loss_mlp": 0.33526191, "epoch": 0.47797985871035625, "flos": 17056599644160.0, "grad_norm": 28.023961181066625, "language_loss": 0.81898898, "learning_rate": 2.2391401362104227e-06, "loss": 0.83775848, "num_input_tokens_seen": 170881245, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.35205078, "step": 7950, "time_per_iteration": 2.657841205596924 }, { "auxiliary_loss_clip": 0.01510983, "auxiliary_loss_mlp": 0.00381252, "balance_loss_clip": 1.24523211, "balance_loss_mlp": 0.3433907, "epoch": 0.4780399819630242, "flos": 31358418549120.0, "grad_norm": 5.426149717606869, "language_loss": 0.80721754, "learning_rate": 2.2387534640870756e-06, "loss": 0.82613993, "num_input_tokens_seen": 170901285, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.37841797, "step": 7951, "time_per_iteration": 2.718863010406494 }, { "auxiliary_loss_clip": 0.01519389, "auxiliary_loss_mlp": 0.00372622, "balance_loss_clip": 1.2497741, "balance_loss_mlp": 0.33697852, "epoch": 0.4781001052156922, "flos": 24899597372160.0, "grad_norm": 2.4757283474738374, "language_loss": 0.87644666, "learning_rate": 2.238366782910174e-06, "loss": 0.89536679, "num_input_tokens_seen": 170919740, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.35620117, "step": 7952, "time_per_iteration": 2.713710069656372 }, { "auxiliary_loss_clip": 0.0152569, "auxiliary_loss_mlp": 0.00379062, "balance_loss_clip": 1.25298381, "balance_loss_mlp": 0.34449112, "epoch": 0.47816022846836015, "flos": 18697717157760.0, "grad_norm": 17.250299772750214, "language_loss": 0.85293925, "learning_rate": 2.23798009269438e-06, "loss": 0.87198675, "num_input_tokens_seen": 170938510, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.34570312, "step": 7953, "time_per_iteration": 4.084122896194458 }, { "auxiliary_loss_clip": 0.01538449, "auxiliary_loss_mlp": 0.00360758, "balance_loss_clip": 1.25518012, "balance_loss_mlp": 0.32430345, "epoch": 0.4782203517210281, "flos": 11977573559040.0, "grad_norm": 72.59771617782684, "language_loss": 0.89958978, "learning_rate": 2.2375933934543566e-06, "loss": 0.91858184, "num_input_tokens_seen": 170951170, "router_z_loss_clip": 2.83203125, "router_z_loss_mlp": 0.36425781, "step": 7954, "time_per_iteration": 2.6468727588653564 }, { "auxiliary_loss_clip": 0.01527637, "auxiliary_loss_mlp": 0.00339656, "balance_loss_clip": 1.25730443, "balance_loss_mlp": 0.30471608, "epoch": 0.4782804749736961, "flos": 20813501923200.0, "grad_norm": 55.596035773699356, "language_loss": 0.76294166, "learning_rate": 2.237206685204768e-06, "loss": 0.7816146, "num_input_tokens_seen": 170970990, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.3494873, "step": 7955, "time_per_iteration": 2.6679842472076416 }, { "auxiliary_loss_clip": 0.01539787, "auxiliary_loss_mlp": 0.00375674, "balance_loss_clip": 1.26122236, "balance_loss_mlp": 0.33788434, "epoch": 0.47834059822636404, "flos": 23840304359040.0, "grad_norm": 2.737056614874643, "language_loss": 0.87609172, "learning_rate": 2.2368199679602787e-06, "loss": 0.89524639, "num_input_tokens_seen": 170991215, "router_z_loss_clip": 2.78515625, "router_z_loss_mlp": 0.37792969, "step": 7956, "time_per_iteration": 4.2982707023620605 }, { "auxiliary_loss_clip": 0.0154273, "auxiliary_loss_mlp": 0.0038008, "balance_loss_clip": 1.2724061, "balance_loss_mlp": 0.34052598, "epoch": 0.478400721479032, "flos": 22633777497600.0, "grad_norm": 2.760342069819623, "language_loss": 0.89345896, "learning_rate": 2.2364332417355516e-06, "loss": 0.91268706, "num_input_tokens_seen": 171007325, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.39550781, "step": 7957, "time_per_iteration": 2.7042229175567627 }, { "auxiliary_loss_clip": 0.01534006, "auxiliary_loss_mlp": 0.00392449, "balance_loss_clip": 1.26313591, "balance_loss_mlp": 0.35525569, "epoch": 0.4784608447317, "flos": 19354954262400.0, "grad_norm": 245.16198727413382, "language_loss": 0.84551239, "learning_rate": 2.2360465065452527e-06, "loss": 0.86477691, "num_input_tokens_seen": 171025650, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.37207031, "step": 7958, "time_per_iteration": 2.6951968669891357 }, { "auxiliary_loss_clip": 0.01537505, "auxiliary_loss_mlp": 0.00383858, "balance_loss_clip": 1.26217866, "balance_loss_mlp": 0.34537745, "epoch": 0.47852096798436794, "flos": 24021114445440.0, "grad_norm": 5.215268833142549, "language_loss": 0.90414834, "learning_rate": 2.235659762404047e-06, "loss": 0.92336202, "num_input_tokens_seen": 171045045, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.38476562, "step": 7959, "time_per_iteration": 4.15187668800354 }, { "auxiliary_loss_clip": 0.01542545, "auxiliary_loss_mlp": 0.00381862, "balance_loss_clip": 1.27229881, "balance_loss_mlp": 0.34869757, "epoch": 0.4785810912370359, "flos": 25666433850240.0, "grad_norm": 6.566352879252062, "language_loss": 0.79528469, "learning_rate": 2.235273009326599e-06, "loss": 0.81452876, "num_input_tokens_seen": 171062910, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.33154297, "step": 7960, "time_per_iteration": 2.6815638542175293 }, { "auxiliary_loss_clip": 0.01543143, "auxiliary_loss_mlp": 0.00350808, "balance_loss_clip": 1.27076817, "balance_loss_mlp": 0.31628463, "epoch": 0.47864121448970387, "flos": 21432134885760.0, "grad_norm": 15.071282384913612, "language_loss": 0.8237772, "learning_rate": 2.2348862473275745e-06, "loss": 0.84271669, "num_input_tokens_seen": 171080875, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.3449707, "step": 7961, "time_per_iteration": 2.668024778366089 }, { "auxiliary_loss_clip": 0.01532743, "auxiliary_loss_mlp": 0.00350119, "balance_loss_clip": 1.261114, "balance_loss_mlp": 0.31395093, "epoch": 0.47870133774237184, "flos": 16143894034560.0, "grad_norm": 1.6334733512152482, "language_loss": 0.83620894, "learning_rate": 2.2344994764216405e-06, "loss": 0.85503751, "num_input_tokens_seen": 171099190, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.36181641, "step": 7962, "time_per_iteration": 2.6461713314056396 }, { "auxiliary_loss_clip": 0.01561349, "auxiliary_loss_mlp": 0.00407509, "balance_loss_clip": 1.27907205, "balance_loss_mlp": 0.37000579, "epoch": 0.47876146099503986, "flos": 26906788344960.0, "grad_norm": 7.256099939379373, "language_loss": 0.71949768, "learning_rate": 2.2341126966234635e-06, "loss": 0.73918629, "num_input_tokens_seen": 171119060, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.37463379, "step": 7963, "time_per_iteration": 2.7030234336853027 }, { "auxiliary_loss_clip": 0.01543461, "auxiliary_loss_mlp": 0.00362643, "balance_loss_clip": 1.26842093, "balance_loss_mlp": 0.32609302, "epoch": 0.4788215842477078, "flos": 45332085778560.0, "grad_norm": 602.3743080422138, "language_loss": 0.838525, "learning_rate": 2.2337259079477083e-06, "loss": 0.85758603, "num_input_tokens_seen": 171141900, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.36547852, "step": 7964, "time_per_iteration": 4.309806823730469 }, { "auxiliary_loss_clip": 0.015627, "auxiliary_loss_mlp": 0.00445501, "balance_loss_clip": 1.27562118, "balance_loss_mlp": 0.4012025, "epoch": 0.4788817075003758, "flos": 22237180456320.0, "grad_norm": 96.39978851310218, "language_loss": 0.82312608, "learning_rate": 2.233339110409044e-06, "loss": 0.84320807, "num_input_tokens_seen": 171161045, "router_z_loss_clip": 2.87109375, "router_z_loss_mlp": 0.44335938, "step": 7965, "time_per_iteration": 2.660370111465454 }, { "auxiliary_loss_clip": 0.01544187, "auxiliary_loss_mlp": 0.00359496, "balance_loss_clip": 1.26913548, "balance_loss_mlp": 0.32499719, "epoch": 0.47894183075304375, "flos": 16471183783680.0, "grad_norm": 2.8111059868059103, "language_loss": 0.82275474, "learning_rate": 2.232952304022137e-06, "loss": 0.84179151, "num_input_tokens_seen": 171179675, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.3449707, "step": 7966, "time_per_iteration": 2.774526596069336 }, { "auxiliary_loss_clip": 0.01533141, "auxiliary_loss_mlp": 0.00396732, "balance_loss_clip": 1.25837374, "balance_loss_mlp": 0.36077857, "epoch": 0.4790019540057117, "flos": 24282688262400.0, "grad_norm": 3.7460585811202134, "language_loss": 0.79114193, "learning_rate": 2.232565488801655e-06, "loss": 0.81044066, "num_input_tokens_seen": 171201175, "router_z_loss_clip": 2.74804688, "router_z_loss_mlp": 0.35913086, "step": 7967, "time_per_iteration": 2.7440385818481445 }, { "auxiliary_loss_clip": 0.01545487, "auxiliary_loss_mlp": 0.0037981, "balance_loss_clip": 1.2763679, "balance_loss_mlp": 0.3452397, "epoch": 0.4790620772583797, "flos": 25666469763840.0, "grad_norm": 31.637231555061174, "language_loss": 0.84861517, "learning_rate": 2.232178664762267e-06, "loss": 0.86786819, "num_input_tokens_seen": 171221750, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.34545898, "step": 7968, "time_per_iteration": 2.7573018074035645 }, { "auxiliary_loss_clip": 0.01393752, "auxiliary_loss_mlp": 0.00075148, "balance_loss_clip": 1.23661232, "balance_loss_mlp": 0.06594521, "epoch": 0.47912220051104765, "flos": 69428077102080.0, "grad_norm": 0.7417620644990021, "language_loss": 0.61611998, "learning_rate": 2.2317918319186408e-06, "loss": 0.63080895, "num_input_tokens_seen": 171292235, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.09179688, "step": 7969, "time_per_iteration": 3.290376663208008 }, { "auxiliary_loss_clip": 0.0155784, "auxiliary_loss_mlp": 0.00366604, "balance_loss_clip": 1.28010845, "balance_loss_mlp": 0.33019769, "epoch": 0.4791823237637156, "flos": 24168922911360.0, "grad_norm": 18.52981395364934, "language_loss": 0.81572437, "learning_rate": 2.2314049902854446e-06, "loss": 0.83496881, "num_input_tokens_seen": 171312215, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.36425781, "step": 7970, "time_per_iteration": 2.7636051177978516 }, { "auxiliary_loss_clip": 0.01546742, "auxiliary_loss_mlp": 0.00399042, "balance_loss_clip": 1.26683939, "balance_loss_mlp": 0.36232582, "epoch": 0.4792424470163836, "flos": 24751465683840.0, "grad_norm": 5.597738164226211, "language_loss": 0.75937128, "learning_rate": 2.231018139877349e-06, "loss": 0.77882922, "num_input_tokens_seen": 171332975, "router_z_loss_clip": 2.79882812, "router_z_loss_mlp": 0.36743164, "step": 7971, "time_per_iteration": 2.7697737216949463 }, { "auxiliary_loss_clip": 0.01533993, "auxiliary_loss_mlp": 0.0041152, "balance_loss_clip": 1.26043987, "balance_loss_mlp": 0.37256235, "epoch": 0.47930257026905154, "flos": 23257905240960.0, "grad_norm": 2.9541636755991587, "language_loss": 0.84265685, "learning_rate": 2.230631280709021e-06, "loss": 0.86211205, "num_input_tokens_seen": 171353880, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.3894043, "step": 7972, "time_per_iteration": 2.7200512886047363 }, { "auxiliary_loss_clip": 0.015468, "auxiliary_loss_mlp": 0.00419131, "balance_loss_clip": 1.26637852, "balance_loss_mlp": 0.37800363, "epoch": 0.4793626935217195, "flos": 14064091718400.0, "grad_norm": 17.181006537723253, "language_loss": 0.77395737, "learning_rate": 2.2302444127951327e-06, "loss": 0.79361665, "num_input_tokens_seen": 171370930, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.41113281, "step": 7973, "time_per_iteration": 2.64968204498291 }, { "auxiliary_loss_clip": 0.01557056, "auxiliary_loss_mlp": 0.00394317, "balance_loss_clip": 1.28194273, "balance_loss_mlp": 0.35814887, "epoch": 0.4794228167743875, "flos": 21798854789760.0, "grad_norm": 6.359660294550561, "language_loss": 0.8449862, "learning_rate": 2.2298575361503523e-06, "loss": 0.86449987, "num_input_tokens_seen": 171387575, "router_z_loss_clip": 2.74804688, "router_z_loss_mlp": 0.36157227, "step": 7974, "time_per_iteration": 2.6696622371673584 }, { "auxiliary_loss_clip": 0.01410806, "auxiliary_loss_mlp": 0.00114837, "balance_loss_clip": 1.25074744, "balance_loss_mlp": 0.10520476, "epoch": 0.47948294002705544, "flos": 66968805553920.0, "grad_norm": 0.7527598118688152, "language_loss": 0.53713906, "learning_rate": 2.2294706507893517e-06, "loss": 0.55239546, "num_input_tokens_seen": 171449980, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.09619141, "step": 7975, "time_per_iteration": 3.2051451206207275 }, { "auxiliary_loss_clip": 0.0155547, "auxiliary_loss_mlp": 0.00445196, "balance_loss_clip": 1.27001858, "balance_loss_mlp": 0.40423554, "epoch": 0.47954306327972346, "flos": 12422471414400.0, "grad_norm": 19.13541482651737, "language_loss": 0.9685809, "learning_rate": 2.2290837567268008e-06, "loss": 0.98858762, "num_input_tokens_seen": 171465290, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.40942383, "step": 7976, "time_per_iteration": 2.6276350021362305 }, { "auxiliary_loss_clip": 0.01565359, "auxiliary_loss_mlp": 0.00436133, "balance_loss_clip": 1.27544034, "balance_loss_mlp": 0.39605486, "epoch": 0.4796031865323914, "flos": 18361951799040.0, "grad_norm": 59.10197385788787, "language_loss": 0.81575656, "learning_rate": 2.2286968539773713e-06, "loss": 0.83577144, "num_input_tokens_seen": 171481130, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.40087891, "step": 7977, "time_per_iteration": 2.7268686294555664 }, { "auxiliary_loss_clip": 0.01548562, "auxiliary_loss_mlp": 0.00358728, "balance_loss_clip": 1.27250814, "balance_loss_mlp": 0.32534969, "epoch": 0.4796633097850594, "flos": 21835088634240.0, "grad_norm": 10.252436768332622, "language_loss": 0.83358824, "learning_rate": 2.228309942555734e-06, "loss": 0.85266119, "num_input_tokens_seen": 171501140, "router_z_loss_clip": 2.76367188, "router_z_loss_mlp": 0.33398438, "step": 7978, "time_per_iteration": 2.6384074687957764 }, { "auxiliary_loss_clip": 0.01562664, "auxiliary_loss_mlp": 0.00417919, "balance_loss_clip": 1.27654421, "balance_loss_mlp": 0.37829396, "epoch": 0.47972343303772735, "flos": 23437350610560.0, "grad_norm": 65.48422638290694, "language_loss": 0.95372599, "learning_rate": 2.22792302247656e-06, "loss": 0.97353184, "num_input_tokens_seen": 171519835, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.39624023, "step": 7979, "time_per_iteration": 2.6746113300323486 }, { "auxiliary_loss_clip": 0.01549043, "auxiliary_loss_mlp": 0.00417765, "balance_loss_clip": 1.2672379, "balance_loss_mlp": 0.37823552, "epoch": 0.4797835562903953, "flos": 24899776940160.0, "grad_norm": 3.6526745687868294, "language_loss": 0.82175314, "learning_rate": 2.227536093754523e-06, "loss": 0.84142125, "num_input_tokens_seen": 171540980, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.39526367, "step": 7980, "time_per_iteration": 2.6809744834899902 }, { "auxiliary_loss_clip": 0.01557616, "auxiliary_loss_mlp": 0.00455292, "balance_loss_clip": 1.2675581, "balance_loss_mlp": 0.41082698, "epoch": 0.4798436795430633, "flos": 35042996793600.0, "grad_norm": 3.168400618760364, "language_loss": 0.79593766, "learning_rate": 2.227149156404295e-06, "loss": 0.81606674, "num_input_tokens_seen": 171563600, "router_z_loss_clip": 2.90039062, "router_z_loss_mlp": 0.44482422, "step": 7981, "time_per_iteration": 2.7928946018218994 }, { "auxiliary_loss_clip": 0.01561788, "auxiliary_loss_mlp": 0.00413882, "balance_loss_clip": 1.28160191, "balance_loss_mlp": 0.37625918, "epoch": 0.47990380279573125, "flos": 20590209025920.0, "grad_norm": 2.178419688022749, "language_loss": 0.76847363, "learning_rate": 2.2267622104405473e-06, "loss": 0.7882303, "num_input_tokens_seen": 171580700, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.3762207, "step": 7982, "time_per_iteration": 2.693531036376953 }, { "auxiliary_loss_clip": 0.01537705, "auxiliary_loss_mlp": 0.00365349, "balance_loss_clip": 1.26782084, "balance_loss_mlp": 0.3324706, "epoch": 0.4799639260483992, "flos": 26359402008960.0, "grad_norm": 6.055681142721246, "language_loss": 0.77079988, "learning_rate": 2.2263752558779544e-06, "loss": 0.78983039, "num_input_tokens_seen": 171602035, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.32885742, "step": 7983, "time_per_iteration": 2.7311758995056152 }, { "auxiliary_loss_clip": 0.01428551, "auxiliary_loss_mlp": 0.00144512, "balance_loss_clip": 1.26992857, "balance_loss_mlp": 0.13344947, "epoch": 0.4800240493010672, "flos": 70979021521920.0, "grad_norm": 0.7627743079400525, "language_loss": 0.58813667, "learning_rate": 2.2259882927311883e-06, "loss": 0.60386729, "num_input_tokens_seen": 171659215, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.11083984, "step": 7984, "time_per_iteration": 3.12073016166687 }, { "auxiliary_loss_clip": 0.0153526, "auxiliary_loss_mlp": 0.00378639, "balance_loss_clip": 1.26532125, "balance_loss_mlp": 0.34506959, "epoch": 0.48008417255373514, "flos": 17086656349440.0, "grad_norm": 8.49216969045243, "language_loss": 0.73404092, "learning_rate": 2.2256013210149247e-06, "loss": 0.75317991, "num_input_tokens_seen": 171675710, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.33569336, "step": 7985, "time_per_iteration": 2.6925485134124756 }, { "auxiliary_loss_clip": 0.01558594, "auxiliary_loss_mlp": 0.00400358, "balance_loss_clip": 1.27424431, "balance_loss_mlp": 0.36006564, "epoch": 0.4801442958064031, "flos": 15413435055360.0, "grad_norm": 2.8617051399475084, "language_loss": 0.76634693, "learning_rate": 2.225214340743835e-06, "loss": 0.78593647, "num_input_tokens_seen": 171692510, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.40307617, "step": 7986, "time_per_iteration": 2.663752794265747 }, { "auxiliary_loss_clip": 0.01547565, "auxiliary_loss_mlp": 0.00388749, "balance_loss_clip": 1.26455307, "balance_loss_mlp": 0.35148454, "epoch": 0.4802044190590711, "flos": 11473747441920.0, "grad_norm": 39.57537522313339, "language_loss": 0.85005522, "learning_rate": 2.2248273519325956e-06, "loss": 0.86941838, "num_input_tokens_seen": 171710235, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.37280273, "step": 7987, "time_per_iteration": 2.616159677505493 }, { "auxiliary_loss_clip": 0.01552561, "auxiliary_loss_mlp": 0.00437076, "balance_loss_clip": 1.2731775, "balance_loss_mlp": 0.39582986, "epoch": 0.48026454231173904, "flos": 20951003185920.0, "grad_norm": 235.40859018688536, "language_loss": 0.81073004, "learning_rate": 2.2244403545958812e-06, "loss": 0.83062643, "num_input_tokens_seen": 171726715, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.41259766, "step": 7988, "time_per_iteration": 2.7625763416290283 }, { "auxiliary_loss_clip": 0.01573955, "auxiliary_loss_mlp": 0.0040388, "balance_loss_clip": 1.29030848, "balance_loss_mlp": 0.36589983, "epoch": 0.48032466556440706, "flos": 20448110822400.0, "grad_norm": 4.173847814566595, "language_loss": 0.86640596, "learning_rate": 2.224053348748365e-06, "loss": 0.88618433, "num_input_tokens_seen": 171743605, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.37939453, "step": 7989, "time_per_iteration": 2.7317442893981934 }, { "auxiliary_loss_clip": 0.0155817, "auxiliary_loss_mlp": 0.00398125, "balance_loss_clip": 1.27528143, "balance_loss_mlp": 0.36088407, "epoch": 0.480384788817075, "flos": 37120823861760.0, "grad_norm": 36.55565632149364, "language_loss": 0.79115129, "learning_rate": 2.223666334404724e-06, "loss": 0.81071424, "num_input_tokens_seen": 171765445, "router_z_loss_clip": 2.82617188, "router_z_loss_mlp": 0.37255859, "step": 7990, "time_per_iteration": 2.9646973609924316 }, { "auxiliary_loss_clip": 0.01447753, "auxiliary_loss_mlp": 0.00170574, "balance_loss_clip": 1.29104555, "balance_loss_mlp": 0.16051297, "epoch": 0.480444912069743, "flos": 69552577641600.0, "grad_norm": 0.7541637374572476, "language_loss": 0.58710504, "learning_rate": 2.223279311579633e-06, "loss": 0.60328835, "num_input_tokens_seen": 171830115, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.10058594, "step": 7991, "time_per_iteration": 3.2527034282684326 }, { "auxiliary_loss_clip": 0.01566413, "auxiliary_loss_mlp": 0.0044548, "balance_loss_clip": 1.28557003, "balance_loss_mlp": 0.40478203, "epoch": 0.48050503532241096, "flos": 29822231640960.0, "grad_norm": 14.651102734714405, "language_loss": 0.74065506, "learning_rate": 2.222892280287768e-06, "loss": 0.76077396, "num_input_tokens_seen": 171849135, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.40673828, "step": 7992, "time_per_iteration": 2.760547637939453 }, { "auxiliary_loss_clip": 0.01564905, "auxiliary_loss_mlp": 0.00408045, "balance_loss_clip": 1.28490806, "balance_loss_mlp": 0.3699699, "epoch": 0.4805651585750789, "flos": 23948539015680.0, "grad_norm": 253.93372394330098, "language_loss": 0.80410773, "learning_rate": 2.2225052405438056e-06, "loss": 0.82383728, "num_input_tokens_seen": 171868880, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.38085938, "step": 7993, "time_per_iteration": 2.714796304702759 }, { "auxiliary_loss_clip": 0.01565615, "auxiliary_loss_mlp": 0.00382092, "balance_loss_clip": 1.2914418, "balance_loss_mlp": 0.34663954, "epoch": 0.4806252818277469, "flos": 25665428269440.0, "grad_norm": 42.97359733364314, "language_loss": 0.85433102, "learning_rate": 2.222118192362422e-06, "loss": 0.87380815, "num_input_tokens_seen": 171889455, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.35473633, "step": 7994, "time_per_iteration": 2.7213151454925537 }, { "auxiliary_loss_clip": 0.0158089, "auxiliary_loss_mlp": 0.00401809, "balance_loss_clip": 1.29721785, "balance_loss_mlp": 0.36416304, "epoch": 0.48068540508041485, "flos": 13151996640000.0, "grad_norm": 10.02373248049652, "language_loss": 0.87023056, "learning_rate": 2.2217311357582946e-06, "loss": 0.89005756, "num_input_tokens_seen": 171906070, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.37670898, "step": 7995, "time_per_iteration": 2.7214276790618896 }, { "auxiliary_loss_clip": 0.0157707, "auxiliary_loss_mlp": 0.00395424, "balance_loss_clip": 1.30019903, "balance_loss_mlp": 0.35861251, "epoch": 0.4807455283330828, "flos": 21176738208000.0, "grad_norm": 2.5031342867998236, "language_loss": 0.87605703, "learning_rate": 2.2213440707461e-06, "loss": 0.89578193, "num_input_tokens_seen": 171926515, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.36791992, "step": 7996, "time_per_iteration": 4.208987474441528 }, { "auxiliary_loss_clip": 0.01586747, "auxiliary_loss_mlp": 0.00433732, "balance_loss_clip": 1.30886412, "balance_loss_mlp": 0.39684826, "epoch": 0.4808056515857508, "flos": 12275991751680.0, "grad_norm": 9.282874260692273, "language_loss": 0.86432165, "learning_rate": 2.220956997340516e-06, "loss": 0.88452649, "num_input_tokens_seen": 171943845, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.36889648, "step": 7997, "time_per_iteration": 2.71414852142334 }, { "auxiliary_loss_clip": 0.01587366, "auxiliary_loss_mlp": 0.0044111, "balance_loss_clip": 1.30391622, "balance_loss_mlp": 0.40205681, "epoch": 0.48086577483841875, "flos": 24826052275200.0, "grad_norm": 7.450401424387744, "language_loss": 0.76997709, "learning_rate": 2.220569915556221e-06, "loss": 0.79026186, "num_input_tokens_seen": 171964970, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.39086914, "step": 7998, "time_per_iteration": 4.256881475448608 }, { "auxiliary_loss_clip": 0.01579055, "auxiliary_loss_mlp": 0.00396022, "balance_loss_clip": 1.29975438, "balance_loss_mlp": 0.35987824, "epoch": 0.4809258980910867, "flos": 24465365856000.0, "grad_norm": 21.407954382931447, "language_loss": 0.76053536, "learning_rate": 2.220182825407892e-06, "loss": 0.78028613, "num_input_tokens_seen": 171986340, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.36157227, "step": 7999, "time_per_iteration": 2.702115535736084 }, { "auxiliary_loss_clip": 0.01588058, "auxiliary_loss_mlp": 0.00466635, "balance_loss_clip": 1.30554748, "balance_loss_mlp": 0.42712909, "epoch": 0.4809860213437547, "flos": 21215952881280.0, "grad_norm": 38.153336091706954, "language_loss": 0.76447111, "learning_rate": 2.2197957269102083e-06, "loss": 0.78501809, "num_input_tokens_seen": 172007300, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.39526367, "step": 8000, "time_per_iteration": 2.7140302658081055 }, { "auxiliary_loss_clip": 0.01587548, "auxiliary_loss_mlp": 0.00427944, "balance_loss_clip": 1.30657268, "balance_loss_mlp": 0.38862836, "epoch": 0.48104614459642264, "flos": 37632084094080.0, "grad_norm": 7.673158857136519, "language_loss": 0.78541839, "learning_rate": 2.2194086200778485e-06, "loss": 0.80557334, "num_input_tokens_seen": 172029585, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.39306641, "step": 8001, "time_per_iteration": 4.112316131591797 }, { "auxiliary_loss_clip": 0.01599108, "auxiliary_loss_mlp": 0.00433878, "balance_loss_clip": 1.31554818, "balance_loss_mlp": 0.39377564, "epoch": 0.48110626784909066, "flos": 18406122549120.0, "grad_norm": 2.774281698056675, "language_loss": 0.85347486, "learning_rate": 2.219021504925493e-06, "loss": 0.87380469, "num_input_tokens_seen": 172047495, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.40087891, "step": 8002, "time_per_iteration": 2.68402099609375 }, { "auxiliary_loss_clip": 0.01593414, "auxiliary_loss_mlp": 0.00443816, "balance_loss_clip": 1.30670583, "balance_loss_mlp": 0.40285566, "epoch": 0.48116639110175863, "flos": 28439814856320.0, "grad_norm": 4.5040155007402625, "language_loss": 0.77137631, "learning_rate": 2.218634381467819e-06, "loss": 0.79174864, "num_input_tokens_seen": 172067625, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.40991211, "step": 8003, "time_per_iteration": 2.7268877029418945 }, { "auxiliary_loss_clip": 0.01589248, "auxiliary_loss_mlp": 0.00392897, "balance_loss_clip": 1.31498337, "balance_loss_mlp": 0.35644257, "epoch": 0.4812265143544266, "flos": 21725237865600.0, "grad_norm": 174.42427255693875, "language_loss": 0.87687123, "learning_rate": 2.218247249719507e-06, "loss": 0.89669269, "num_input_tokens_seen": 172087885, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.36474609, "step": 8004, "time_per_iteration": 2.6799049377441406 }, { "auxiliary_loss_clip": 0.01610195, "auxiliary_loss_mlp": 0.00448389, "balance_loss_clip": 1.3119936, "balance_loss_mlp": 0.40544939, "epoch": 0.48128663760709456, "flos": 13224679810560.0, "grad_norm": 5.087716273443497, "language_loss": 0.85910463, "learning_rate": 2.217860109695239e-06, "loss": 0.87969041, "num_input_tokens_seen": 172105815, "router_z_loss_clip": 2.98242188, "router_z_loss_mlp": 0.42944336, "step": 8005, "time_per_iteration": 2.623615264892578 }, { "auxiliary_loss_clip": 0.01616592, "auxiliary_loss_mlp": 0.00441215, "balance_loss_clip": 1.32701766, "balance_loss_mlp": 0.40187606, "epoch": 0.4813467608597625, "flos": 24243437675520.0, "grad_norm": 2.3213638543367603, "language_loss": 0.77460217, "learning_rate": 2.217472961409692e-06, "loss": 0.79518026, "num_input_tokens_seen": 172126125, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.39331055, "step": 8006, "time_per_iteration": 4.161262035369873 }, { "auxiliary_loss_clip": 0.01624744, "auxiliary_loss_mlp": 0.00457896, "balance_loss_clip": 1.33766651, "balance_loss_mlp": 0.41686457, "epoch": 0.4814068841124305, "flos": 27480424544640.0, "grad_norm": 23.99154501449194, "language_loss": 0.76320207, "learning_rate": 2.2170858048775495e-06, "loss": 0.78402853, "num_input_tokens_seen": 172141945, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 0.40991211, "step": 8007, "time_per_iteration": 2.663813829421997 }, { "auxiliary_loss_clip": 0.01619738, "auxiliary_loss_mlp": 0.00436388, "balance_loss_clip": 1.33410561, "balance_loss_mlp": 0.3962864, "epoch": 0.48146700736509845, "flos": 19572896033280.0, "grad_norm": 3.1065804109554196, "language_loss": 0.7616896, "learning_rate": 2.2166986401134914e-06, "loss": 0.78225088, "num_input_tokens_seen": 172161095, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.40087891, "step": 8008, "time_per_iteration": 2.837277412414551 }, { "auxiliary_loss_clip": 0.01617405, "auxiliary_loss_mlp": 0.00458063, "balance_loss_clip": 1.32860184, "balance_loss_mlp": 0.41486144, "epoch": 0.4815271306177664, "flos": 20627771673600.0, "grad_norm": 67.89108454909244, "language_loss": 0.68515629, "learning_rate": 2.216311467132199e-06, "loss": 0.70591092, "num_input_tokens_seen": 172178750, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.43164062, "step": 8009, "time_per_iteration": 2.7527575492858887 }, { "auxiliary_loss_clip": 0.01388841, "auxiliary_loss_mlp": 0.00102666, "balance_loss_clip": 1.22904408, "balance_loss_mlp": 0.09393951, "epoch": 0.4815872538704344, "flos": 67691076232320.0, "grad_norm": 0.8753545349380699, "language_loss": 0.60408676, "learning_rate": 2.2159242859483547e-06, "loss": 0.61900187, "num_input_tokens_seen": 172240235, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.08740234, "step": 8010, "time_per_iteration": 3.2240664958953857 }, { "auxiliary_loss_clip": 0.01616766, "auxiliary_loss_mlp": 0.00413438, "balance_loss_clip": 1.33793294, "balance_loss_mlp": 0.37431353, "epoch": 0.48164737712310235, "flos": 22820764723200.0, "grad_norm": 7.104737492643966, "language_loss": 0.79388815, "learning_rate": 2.215537096576639e-06, "loss": 0.81419015, "num_input_tokens_seen": 172259875, "router_z_loss_clip": 2.78710938, "router_z_loss_mlp": 0.39135742, "step": 8011, "time_per_iteration": 2.810405731201172 }, { "auxiliary_loss_clip": 0.0161573, "auxiliary_loss_mlp": 0.00404289, "balance_loss_clip": 1.34166348, "balance_loss_mlp": 0.36995703, "epoch": 0.4817075003757703, "flos": 23733865382400.0, "grad_norm": 91.61494647967987, "language_loss": 0.85455751, "learning_rate": 2.2151498990317354e-06, "loss": 0.87475765, "num_input_tokens_seen": 172280150, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.34326172, "step": 8012, "time_per_iteration": 2.6857852935791016 }, { "auxiliary_loss_clip": 0.01616984, "auxiliary_loss_mlp": 0.0043134, "balance_loss_clip": 1.33584166, "balance_loss_mlp": 0.39228749, "epoch": 0.4817676236284383, "flos": 28182909807360.0, "grad_norm": 39.086304055731524, "language_loss": 0.79932106, "learning_rate": 2.214762693328326e-06, "loss": 0.81980431, "num_input_tokens_seen": 172300810, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.390625, "step": 8013, "time_per_iteration": 2.7622461318969727 }, { "auxiliary_loss_clip": 0.01632288, "auxiliary_loss_mlp": 0.00385552, "balance_loss_clip": 1.35179472, "balance_loss_mlp": 0.34809595, "epoch": 0.48182774688110624, "flos": 17091756080640.0, "grad_norm": 15.108661024935254, "language_loss": 0.97349489, "learning_rate": 2.214375479481094e-06, "loss": 0.99367326, "num_input_tokens_seen": 172317930, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.37451172, "step": 8014, "time_per_iteration": 2.631533622741699 }, { "auxiliary_loss_clip": 0.01612791, "auxiliary_loss_mlp": 0.00465184, "balance_loss_clip": 1.32771325, "balance_loss_mlp": 0.42312673, "epoch": 0.4818878701337742, "flos": 12567873669120.0, "grad_norm": 8.494339955660225, "language_loss": 0.81653327, "learning_rate": 2.213988257504722e-06, "loss": 0.837313, "num_input_tokens_seen": 172336340, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.4206543, "step": 8015, "time_per_iteration": 2.6446306705474854 }, { "auxiliary_loss_clip": 0.01608787, "auxiliary_loss_mlp": 0.00417281, "balance_loss_clip": 1.32838702, "balance_loss_mlp": 0.37822801, "epoch": 0.48194799338644223, "flos": 24608505553920.0, "grad_norm": 326.9077435249571, "language_loss": 0.86441863, "learning_rate": 2.213601027413894e-06, "loss": 0.88467932, "num_input_tokens_seen": 172354315, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.39038086, "step": 8016, "time_per_iteration": 2.732821226119995 }, { "auxiliary_loss_clip": 0.01600647, "auxiliary_loss_mlp": 0.00393532, "balance_loss_clip": 1.33106303, "balance_loss_mlp": 0.35624337, "epoch": 0.4820081166391102, "flos": 21105204272640.0, "grad_norm": 27.7324377245132, "language_loss": 0.84027976, "learning_rate": 2.2132137892232933e-06, "loss": 0.86022162, "num_input_tokens_seen": 172372695, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.37280273, "step": 8017, "time_per_iteration": 2.690098524093628 }, { "auxiliary_loss_clip": 0.01607905, "auxiliary_loss_mlp": 0.00407813, "balance_loss_clip": 1.33715463, "balance_loss_mlp": 0.37138277, "epoch": 0.48206823989177816, "flos": 25264593423360.0, "grad_norm": 13.783640466689613, "language_loss": 0.85042894, "learning_rate": 2.2128265429476043e-06, "loss": 0.8705861, "num_input_tokens_seen": 172390905, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.36450195, "step": 8018, "time_per_iteration": 2.7563774585723877 }, { "auxiliary_loss_clip": 0.01641498, "auxiliary_loss_mlp": 0.00434357, "balance_loss_clip": 1.35003936, "balance_loss_mlp": 0.39511353, "epoch": 0.4821283631444461, "flos": 24645062620800.0, "grad_norm": 12.416365851088491, "language_loss": 0.82492602, "learning_rate": 2.2124392886015124e-06, "loss": 0.84568459, "num_input_tokens_seen": 172412295, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.39257812, "step": 8019, "time_per_iteration": 2.7875239849090576 }, { "auxiliary_loss_clip": 0.01624269, "auxiliary_loss_mlp": 0.00412181, "balance_loss_clip": 1.34347045, "balance_loss_mlp": 0.37439138, "epoch": 0.4821884863971141, "flos": 23952094462080.0, "grad_norm": 19.201091865066687, "language_loss": 0.85554081, "learning_rate": 2.212052026199701e-06, "loss": 0.87590528, "num_input_tokens_seen": 172432625, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.37792969, "step": 8020, "time_per_iteration": 2.6882903575897217 }, { "auxiliary_loss_clip": 0.01613569, "auxiliary_loss_mlp": 0.0038361, "balance_loss_clip": 1.33572769, "balance_loss_mlp": 0.34713209, "epoch": 0.48224860964978206, "flos": 17160668323200.0, "grad_norm": 141.3634961139285, "language_loss": 0.76649827, "learning_rate": 2.211664755756855e-06, "loss": 0.78647006, "num_input_tokens_seen": 172450010, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.36450195, "step": 8021, "time_per_iteration": 2.6767866611480713 }, { "auxiliary_loss_clip": 0.01625495, "auxiliary_loss_mlp": 0.00404082, "balance_loss_clip": 1.33691049, "balance_loss_mlp": 0.36519587, "epoch": 0.48230873290245, "flos": 23075838178560.0, "grad_norm": 33.75528918231387, "language_loss": 0.70192868, "learning_rate": 2.2112774772876603e-06, "loss": 0.72222447, "num_input_tokens_seen": 172469080, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.38867188, "step": 8022, "time_per_iteration": 2.7344892024993896 }, { "auxiliary_loss_clip": 0.01613008, "auxiliary_loss_mlp": 0.0038706, "balance_loss_clip": 1.34094679, "balance_loss_mlp": 0.3517974, "epoch": 0.482368856155118, "flos": 19353517718400.0, "grad_norm": 9.312061379591139, "language_loss": 0.72722602, "learning_rate": 2.2108901908068028e-06, "loss": 0.7472266, "num_input_tokens_seen": 172484850, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.35253906, "step": 8023, "time_per_iteration": 2.658505916595459 }, { "auxiliary_loss_clip": 0.01632512, "auxiliary_loss_mlp": 0.00410752, "balance_loss_clip": 1.35387254, "balance_loss_mlp": 0.37215248, "epoch": 0.48242897940778595, "flos": 20078984707200.0, "grad_norm": 310.43031380331155, "language_loss": 0.82675183, "learning_rate": 2.2105028963289683e-06, "loss": 0.84718442, "num_input_tokens_seen": 172503525, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.38549805, "step": 8024, "time_per_iteration": 2.666679620742798 }, { "auxiliary_loss_clip": 0.01630212, "auxiliary_loss_mlp": 0.00389464, "balance_loss_clip": 1.35131013, "balance_loss_mlp": 0.3494336, "epoch": 0.4824891026604539, "flos": 23403989854080.0, "grad_norm": 38.39967750524729, "language_loss": 0.812545, "learning_rate": 2.2101155938688423e-06, "loss": 0.83274174, "num_input_tokens_seen": 172524360, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.40014648, "step": 8025, "time_per_iteration": 2.716071605682373 }, { "auxiliary_loss_clip": 0.01631468, "auxiliary_loss_mlp": 0.003878, "balance_loss_clip": 1.34959435, "balance_loss_mlp": 0.35136938, "epoch": 0.4825492259131219, "flos": 20368675895040.0, "grad_norm": 3.112945319240416, "language_loss": 0.80075961, "learning_rate": 2.209728283441112e-06, "loss": 0.8209523, "num_input_tokens_seen": 172541480, "router_z_loss_clip": 2.81835938, "router_z_loss_mlp": 0.36450195, "step": 8026, "time_per_iteration": 2.699601411819458 }, { "auxiliary_loss_clip": 0.01632629, "auxiliary_loss_mlp": 0.00461312, "balance_loss_clip": 1.35070395, "balance_loss_mlp": 0.42206839, "epoch": 0.48260934916578985, "flos": 14319021519360.0, "grad_norm": 58.13443584783437, "language_loss": 0.83391035, "learning_rate": 2.209340965060465e-06, "loss": 0.8548497, "num_input_tokens_seen": 172559005, "router_z_loss_clip": 2.81835938, "router_z_loss_mlp": 0.39306641, "step": 8027, "time_per_iteration": 2.6185193061828613 }, { "auxiliary_loss_clip": 0.01638633, "auxiliary_loss_mlp": 0.00389618, "balance_loss_clip": 1.35380578, "balance_loss_mlp": 0.35249627, "epoch": 0.4826694724184578, "flos": 22121152548480.0, "grad_norm": 11.213807844376648, "language_loss": 0.74286121, "learning_rate": 2.2089536387415868e-06, "loss": 0.76314378, "num_input_tokens_seen": 172578435, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.37109375, "step": 8028, "time_per_iteration": 2.6815125942230225 }, { "auxiliary_loss_clip": 0.01636361, "auxiliary_loss_mlp": 0.00441485, "balance_loss_clip": 1.35239255, "balance_loss_mlp": 0.40095422, "epoch": 0.48272959567112583, "flos": 16181169373440.0, "grad_norm": 50.8814512929007, "language_loss": 0.7855882, "learning_rate": 2.2085663044991655e-06, "loss": 0.80636668, "num_input_tokens_seen": 172596095, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.40551758, "step": 8029, "time_per_iteration": 2.6577014923095703 }, { "auxiliary_loss_clip": 0.01658368, "auxiliary_loss_mlp": 0.00450845, "balance_loss_clip": 1.36826146, "balance_loss_mlp": 0.41195869, "epoch": 0.4827897189237938, "flos": 23180445561600.0, "grad_norm": 143.225308416906, "language_loss": 0.89874113, "learning_rate": 2.2081789623478896e-06, "loss": 0.9198333, "num_input_tokens_seen": 172615255, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.38916016, "step": 8030, "time_per_iteration": 2.697082042694092 }, { "auxiliary_loss_clip": 0.01650957, "auxiliary_loss_mlp": 0.00453811, "balance_loss_clip": 1.37329948, "balance_loss_mlp": 0.4167372, "epoch": 0.48284984217646176, "flos": 21652626522240.0, "grad_norm": 8.13886854475101, "language_loss": 0.78239828, "learning_rate": 2.2077916123024466e-06, "loss": 0.80344594, "num_input_tokens_seen": 172633185, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.37060547, "step": 8031, "time_per_iteration": 2.77010440826416 }, { "auxiliary_loss_clip": 0.01666218, "auxiliary_loss_mlp": 0.00444017, "balance_loss_clip": 1.36912966, "balance_loss_mlp": 0.40238935, "epoch": 0.48290996542912973, "flos": 31467443304960.0, "grad_norm": 20.88136524397335, "language_loss": 0.77754676, "learning_rate": 2.2074042543775245e-06, "loss": 0.79864907, "num_input_tokens_seen": 172654280, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.41625977, "step": 8032, "time_per_iteration": 2.840233564376831 }, { "auxiliary_loss_clip": 0.01644156, "auxiliary_loss_mlp": 0.00438216, "balance_loss_clip": 1.36176658, "balance_loss_mlp": 0.39873433, "epoch": 0.4829700886817977, "flos": 24461954064000.0, "grad_norm": 5.820128943944328, "language_loss": 0.80736208, "learning_rate": 2.2070168885878126e-06, "loss": 0.8281858, "num_input_tokens_seen": 172675545, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.39453125, "step": 8033, "time_per_iteration": 2.693984031677246 }, { "auxiliary_loss_clip": 0.01680535, "auxiliary_loss_mlp": 0.00464778, "balance_loss_clip": 1.38621223, "balance_loss_mlp": 0.42443722, "epoch": 0.48303021193446566, "flos": 25702164904320.0, "grad_norm": 4.077800504976528, "language_loss": 0.88431275, "learning_rate": 2.2066295149479996e-06, "loss": 0.90576583, "num_input_tokens_seen": 172696455, "router_z_loss_clip": 2.94335938, "router_z_loss_mlp": 0.40332031, "step": 8034, "time_per_iteration": 2.7396440505981445 }, { "auxiliary_loss_clip": 0.01642876, "auxiliary_loss_mlp": 0.00419231, "balance_loss_clip": 1.36426604, "balance_loss_mlp": 0.38308704, "epoch": 0.4830903351871336, "flos": 20085233673600.0, "grad_norm": 3.34942255179597, "language_loss": 0.84718019, "learning_rate": 2.2062421334727744e-06, "loss": 0.86780125, "num_input_tokens_seen": 172716720, "router_z_loss_clip": 2.78710938, "router_z_loss_mlp": 0.36157227, "step": 8035, "time_per_iteration": 2.919151544570923 }, { "auxiliary_loss_clip": 0.01646712, "auxiliary_loss_mlp": 0.00397359, "balance_loss_clip": 1.36297011, "balance_loss_mlp": 0.36085743, "epoch": 0.4831504584398016, "flos": 39452216014080.0, "grad_norm": 227.05581359270641, "language_loss": 0.74366939, "learning_rate": 2.2058547441768267e-06, "loss": 0.76411009, "num_input_tokens_seen": 172737435, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.36499023, "step": 8036, "time_per_iteration": 2.8102447986602783 }, { "auxiliary_loss_clip": 0.01648639, "auxiliary_loss_mlp": 0.00423274, "balance_loss_clip": 1.35919523, "balance_loss_mlp": 0.38505584, "epoch": 0.48321058169246955, "flos": 20006588845440.0, "grad_norm": 16.204581947993134, "language_loss": 0.77828407, "learning_rate": 2.205467347074847e-06, "loss": 0.79900324, "num_input_tokens_seen": 172755700, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.38183594, "step": 8037, "time_per_iteration": 2.6765575408935547 }, { "auxiliary_loss_clip": 0.01647232, "auxiliary_loss_mlp": 0.00436109, "balance_loss_clip": 1.35406184, "balance_loss_mlp": 0.39581633, "epoch": 0.4832707049451375, "flos": 20741465197440.0, "grad_norm": 149.96363074041284, "language_loss": 0.78878796, "learning_rate": 2.205079942181525e-06, "loss": 0.80962133, "num_input_tokens_seen": 172775185, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.40258789, "step": 8038, "time_per_iteration": 4.124361991882324 }, { "auxiliary_loss_clip": 0.01650092, "auxiliary_loss_mlp": 0.0040824, "balance_loss_clip": 1.36160779, "balance_loss_mlp": 0.36861485, "epoch": 0.4833308281978055, "flos": 33145584762240.0, "grad_norm": 1.529705727347306, "language_loss": 0.83179653, "learning_rate": 2.20469252951155e-06, "loss": 0.85237992, "num_input_tokens_seen": 172796990, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.39624023, "step": 8039, "time_per_iteration": 2.767123222351074 }, { "auxiliary_loss_clip": 0.01635349, "auxiliary_loss_mlp": 0.00406576, "balance_loss_clip": 1.34865642, "balance_loss_mlp": 0.36687949, "epoch": 0.48339095145047345, "flos": 19099234362240.0, "grad_norm": 7.090389362487774, "language_loss": 0.83325565, "learning_rate": 2.2043051090796143e-06, "loss": 0.85367489, "num_input_tokens_seen": 172814915, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.39697266, "step": 8040, "time_per_iteration": 2.7224395275115967 }, { "auxiliary_loss_clip": 0.01655685, "auxiliary_loss_mlp": 0.00373618, "balance_loss_clip": 1.36605072, "balance_loss_mlp": 0.33790338, "epoch": 0.4834510747031414, "flos": 34459448440320.0, "grad_norm": 34.02868983011996, "language_loss": 0.82117456, "learning_rate": 2.203917680900409e-06, "loss": 0.84146762, "num_input_tokens_seen": 172837060, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.35717773, "step": 8041, "time_per_iteration": 4.295841932296753 }, { "auxiliary_loss_clip": 0.0163166, "auxiliary_loss_mlp": 0.00394641, "balance_loss_clip": 1.35186911, "balance_loss_mlp": 0.35899752, "epoch": 0.48351119795580944, "flos": 27380845065600.0, "grad_norm": 39.03373033073294, "language_loss": 0.73471773, "learning_rate": 2.203530244988624e-06, "loss": 0.7549808, "num_input_tokens_seen": 172856545, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.35644531, "step": 8042, "time_per_iteration": 2.714569568634033 }, { "auxiliary_loss_clip": 0.01349196, "auxiliary_loss_mlp": 0.00158116, "balance_loss_clip": 1.18357313, "balance_loss_mlp": 0.1505339, "epoch": 0.4835713212084774, "flos": 67143941291520.0, "grad_norm": 0.6942954345837632, "language_loss": 0.57501698, "learning_rate": 2.2031428013589517e-06, "loss": 0.5900901, "num_input_tokens_seen": 172923055, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.07568359, "step": 8043, "time_per_iteration": 4.5979697704315186 }, { "auxiliary_loss_clip": 0.01640005, "auxiliary_loss_mlp": 0.00404388, "balance_loss_clip": 1.351511, "balance_loss_mlp": 0.36588296, "epoch": 0.48363144446114537, "flos": 17967473660160.0, "grad_norm": 7.130207629157485, "language_loss": 0.79224575, "learning_rate": 2.2027553500260847e-06, "loss": 0.8126896, "num_input_tokens_seen": 172940700, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.38525391, "step": 8044, "time_per_iteration": 2.6346702575683594 }, { "auxiliary_loss_clip": 0.01637964, "auxiliary_loss_mlp": 0.00395432, "balance_loss_clip": 1.35311937, "balance_loss_mlp": 0.35811931, "epoch": 0.48369156771381333, "flos": 20593513077120.0, "grad_norm": 3.6156248274444582, "language_loss": 0.80452126, "learning_rate": 2.202367891004714e-06, "loss": 0.82485521, "num_input_tokens_seen": 172961125, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.37329102, "step": 8045, "time_per_iteration": 2.6600143909454346 }, { "auxiliary_loss_clip": 0.01620676, "auxiliary_loss_mlp": 0.00420378, "balance_loss_clip": 1.33708811, "balance_loss_mlp": 0.38211203, "epoch": 0.4837516909664813, "flos": 22675075159680.0, "grad_norm": 51.21102904414122, "language_loss": 0.74493039, "learning_rate": 2.201980424309533e-06, "loss": 0.76534092, "num_input_tokens_seen": 172980405, "router_z_loss_clip": 2.83203125, "router_z_loss_mlp": 0.3828125, "step": 8046, "time_per_iteration": 2.745100736618042 }, { "auxiliary_loss_clip": 0.01600388, "auxiliary_loss_mlp": 0.0036796, "balance_loss_clip": 1.32340872, "balance_loss_mlp": 0.33408031, "epoch": 0.48381181421914926, "flos": 25518625384320.0, "grad_norm": 5.044059022474448, "language_loss": 0.88495994, "learning_rate": 2.2015929499552337e-06, "loss": 0.90464342, "num_input_tokens_seen": 172999105, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.33886719, "step": 8047, "time_per_iteration": 2.786492109298706 }, { "auxiliary_loss_clip": 0.0161086, "auxiliary_loss_mlp": 0.00406962, "balance_loss_clip": 1.32917762, "balance_loss_mlp": 0.36826676, "epoch": 0.4838719374718172, "flos": 24207491139840.0, "grad_norm": 19.414888700485335, "language_loss": 0.85925323, "learning_rate": 2.2012054679565092e-06, "loss": 0.87943149, "num_input_tokens_seen": 173019935, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.38720703, "step": 8048, "time_per_iteration": 4.070250511169434 }, { "auxiliary_loss_clip": 0.01614677, "auxiliary_loss_mlp": 0.00406342, "balance_loss_clip": 1.32585335, "balance_loss_mlp": 0.36666894, "epoch": 0.4839320607244852, "flos": 26724577628160.0, "grad_norm": 182.55883552506276, "language_loss": 0.86986542, "learning_rate": 2.200817978328054e-06, "loss": 0.89007556, "num_input_tokens_seen": 173039700, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.39672852, "step": 8049, "time_per_iteration": 2.694589614868164 }, { "auxiliary_loss_clip": 0.0162737, "auxiliary_loss_mlp": 0.0036476, "balance_loss_clip": 1.34776723, "balance_loss_mlp": 0.32818624, "epoch": 0.48399218397715316, "flos": 20448900921600.0, "grad_norm": 7.4359126218212275, "language_loss": 0.79277396, "learning_rate": 2.2004304810845602e-06, "loss": 0.81269526, "num_input_tokens_seen": 173059170, "router_z_loss_clip": 2.79492188, "router_z_loss_mlp": 0.36572266, "step": 8050, "time_per_iteration": 2.6913018226623535 }, { "auxiliary_loss_clip": 0.01335702, "auxiliary_loss_mlp": 0.00079931, "balance_loss_clip": 1.17098522, "balance_loss_mlp": 0.07316026, "epoch": 0.4840523072298211, "flos": 67180570185600.0, "grad_norm": 0.6968691840828601, "language_loss": 0.55754405, "learning_rate": 2.200042976240723e-06, "loss": 0.57170039, "num_input_tokens_seen": 173119000, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.06787109, "step": 8051, "time_per_iteration": 3.239795207977295 }, { "auxiliary_loss_clip": 0.01599706, "auxiliary_loss_mlp": 0.00366621, "balance_loss_clip": 1.31843209, "balance_loss_mlp": 0.32887915, "epoch": 0.4841124304824891, "flos": 22411490181120.0, "grad_norm": 54.83090039164296, "language_loss": 0.81973219, "learning_rate": 2.199655463811236e-06, "loss": 0.83939552, "num_input_tokens_seen": 173137570, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.37744141, "step": 8052, "time_per_iteration": 2.709867000579834 }, { "auxiliary_loss_clip": 0.01576744, "auxiliary_loss_mlp": 0.00341687, "balance_loss_clip": 1.30719268, "balance_loss_mlp": 0.30571008, "epoch": 0.48417255373515705, "flos": 13843959217920.0, "grad_norm": 23.171816902290583, "language_loss": 0.74825776, "learning_rate": 2.1992679438107936e-06, "loss": 0.76744199, "num_input_tokens_seen": 173154355, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.35986328, "step": 8053, "time_per_iteration": 2.6731953620910645 }, { "auxiliary_loss_clip": 0.0161088, "auxiliary_loss_mlp": 0.00359388, "balance_loss_clip": 1.33498406, "balance_loss_mlp": 0.32302901, "epoch": 0.484232676987825, "flos": 31649689935360.0, "grad_norm": 307.3499439822623, "language_loss": 0.77673233, "learning_rate": 2.198880416254091e-06, "loss": 0.796435, "num_input_tokens_seen": 173174845, "router_z_loss_clip": 2.76367188, "router_z_loss_mlp": 0.36376953, "step": 8054, "time_per_iteration": 2.7893402576446533 }, { "auxiliary_loss_clip": 0.01574215, "auxiliary_loss_mlp": 0.00352435, "balance_loss_clip": 1.30341315, "balance_loss_mlp": 0.31557518, "epoch": 0.48429280024049304, "flos": 24095377814400.0, "grad_norm": 295.6348804874185, "language_loss": 0.74983203, "learning_rate": 2.1984928811558233e-06, "loss": 0.76909852, "num_input_tokens_seen": 173195025, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.3684082, "step": 8055, "time_per_iteration": 2.7101423740386963 }, { "auxiliary_loss_clip": 0.01602127, "auxiliary_loss_mlp": 0.00354623, "balance_loss_clip": 1.32135785, "balance_loss_mlp": 0.31819266, "epoch": 0.484352923493161, "flos": 17530081747200.0, "grad_norm": 3.8150180837274457, "language_loss": 0.70393991, "learning_rate": 2.198105338530685e-06, "loss": 0.7235074, "num_input_tokens_seen": 173213065, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.36425781, "step": 8056, "time_per_iteration": 2.7163209915161133 }, { "auxiliary_loss_clip": 0.01602118, "auxiliary_loss_mlp": 0.00366491, "balance_loss_clip": 1.32293713, "balance_loss_mlp": 0.32729453, "epoch": 0.48441304674582897, "flos": 29166862043520.0, "grad_norm": 297.44639207340396, "language_loss": 0.73550767, "learning_rate": 2.1977177883933726e-06, "loss": 0.75519371, "num_input_tokens_seen": 173234545, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.39208984, "step": 8057, "time_per_iteration": 2.7987265586853027 }, { "auxiliary_loss_clip": 0.01580383, "auxiliary_loss_mlp": 0.00331636, "balance_loss_clip": 1.31070065, "balance_loss_mlp": 0.29708868, "epoch": 0.48447316999849693, "flos": 15886701676800.0, "grad_norm": 24.644515223384282, "language_loss": 0.86836064, "learning_rate": 2.1973302307585827e-06, "loss": 0.88748085, "num_input_tokens_seen": 173252175, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.34521484, "step": 8058, "time_per_iteration": 2.6905975341796875 }, { "auxiliary_loss_clip": 0.01607959, "auxiliary_loss_mlp": 0.00390217, "balance_loss_clip": 1.31840968, "balance_loss_mlp": 0.35042477, "epoch": 0.4845332932511649, "flos": 24381405815040.0, "grad_norm": 7.0646778270042745, "language_loss": 0.84857774, "learning_rate": 2.1969426656410097e-06, "loss": 0.86855948, "num_input_tokens_seen": 173268790, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.39794922, "step": 8059, "time_per_iteration": 2.767595052719116 }, { "auxiliary_loss_clip": 0.01607639, "auxiliary_loss_mlp": 0.00389169, "balance_loss_clip": 1.32087135, "balance_loss_mlp": 0.34751743, "epoch": 0.48459341650383286, "flos": 37116478316160.0, "grad_norm": 10.429116664510918, "language_loss": 0.71500939, "learning_rate": 2.196555093055352e-06, "loss": 0.73497742, "num_input_tokens_seen": 173288030, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 0.41674805, "step": 8060, "time_per_iteration": 2.8406293392181396 }, { "auxiliary_loss_clip": 0.01597456, "auxiliary_loss_mlp": 0.00350816, "balance_loss_clip": 1.32010961, "balance_loss_mlp": 0.31061852, "epoch": 0.48465353975650083, "flos": 22966777509120.0, "grad_norm": 49.725159672504496, "language_loss": 0.73783135, "learning_rate": 2.1961675130163046e-06, "loss": 0.75731409, "num_input_tokens_seen": 173305965, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.40161133, "step": 8061, "time_per_iteration": 2.656487464904785 }, { "auxiliary_loss_clip": 0.01617791, "auxiliary_loss_mlp": 0.00377933, "balance_loss_clip": 1.33384597, "balance_loss_mlp": 0.3368293, "epoch": 0.4847136630091688, "flos": 17707695523200.0, "grad_norm": 13.117662710212088, "language_loss": 0.87907183, "learning_rate": 2.1957799255385653e-06, "loss": 0.89902902, "num_input_tokens_seen": 173321985, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.41113281, "step": 8062, "time_per_iteration": 2.659801721572876 }, { "auxiliary_loss_clip": 0.01607049, "auxiliary_loss_mlp": 0.00351316, "balance_loss_clip": 1.33014536, "balance_loss_mlp": 0.31438535, "epoch": 0.48477378626183676, "flos": 22018269018240.0, "grad_norm": 4.587622713066759, "language_loss": 0.79621637, "learning_rate": 2.1953923306368325e-06, "loss": 0.81580007, "num_input_tokens_seen": 173341315, "router_z_loss_clip": 2.76757812, "router_z_loss_mlp": 0.36987305, "step": 8063, "time_per_iteration": 2.654578447341919 }, { "auxiliary_loss_clip": 0.01612519, "auxiliary_loss_mlp": 0.00352217, "balance_loss_clip": 1.33252811, "balance_loss_mlp": 0.31685942, "epoch": 0.4848339095145047, "flos": 27962956874880.0, "grad_norm": 7.5182956605555376, "language_loss": 0.84559798, "learning_rate": 2.1950047283258023e-06, "loss": 0.86524534, "num_input_tokens_seen": 173361055, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.35302734, "step": 8064, "time_per_iteration": 2.7007720470428467 }, { "auxiliary_loss_clip": 0.01626389, "auxiliary_loss_mlp": 0.00337159, "balance_loss_clip": 1.34738362, "balance_loss_mlp": 0.30025184, "epoch": 0.4848940327671727, "flos": 21688752625920.0, "grad_norm": 3.05760826158405, "language_loss": 0.87221503, "learning_rate": 2.194617118620173e-06, "loss": 0.89185053, "num_input_tokens_seen": 173379255, "router_z_loss_clip": 2.78710938, "router_z_loss_mlp": 0.36914062, "step": 8065, "time_per_iteration": 2.697171211242676 }, { "auxiliary_loss_clip": 0.01618269, "auxiliary_loss_mlp": 0.00342818, "balance_loss_clip": 1.34535658, "balance_loss_mlp": 0.30576786, "epoch": 0.48495415601984065, "flos": 20631578515200.0, "grad_norm": 2.6418060572772935, "language_loss": 0.81311536, "learning_rate": 2.194229501534644e-06, "loss": 0.83272624, "num_input_tokens_seen": 173398370, "router_z_loss_clip": 2.72851562, "router_z_loss_mlp": 0.37060547, "step": 8066, "time_per_iteration": 2.6704559326171875 }, { "auxiliary_loss_clip": 0.01620957, "auxiliary_loss_mlp": 0.00344594, "balance_loss_clip": 1.34708309, "balance_loss_mlp": 0.30864066, "epoch": 0.4850142792725086, "flos": 25628152930560.0, "grad_norm": 5.665192075704937, "language_loss": 0.77253228, "learning_rate": 2.193841877083912e-06, "loss": 0.79218781, "num_input_tokens_seen": 173419595, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.36010742, "step": 8067, "time_per_iteration": 2.723296880722046 }, { "auxiliary_loss_clip": 0.0162654, "auxiliary_loss_mlp": 0.00404091, "balance_loss_clip": 1.34421885, "balance_loss_mlp": 0.36234367, "epoch": 0.4850744025251766, "flos": 13771958405760.0, "grad_norm": 9.895086953769486, "language_loss": 0.86184782, "learning_rate": 2.1934542452826767e-06, "loss": 0.88215411, "num_input_tokens_seen": 173435390, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.41772461, "step": 8068, "time_per_iteration": 2.6966614723205566 }, { "auxiliary_loss_clip": 0.01604589, "auxiliary_loss_mlp": 0.00372478, "balance_loss_clip": 1.32911968, "balance_loss_mlp": 0.33228064, "epoch": 0.4851345257778446, "flos": 20261339078400.0, "grad_norm": 242.2158797683225, "language_loss": 0.89042604, "learning_rate": 2.193066606145638e-06, "loss": 0.91019666, "num_input_tokens_seen": 173454095, "router_z_loss_clip": 2.75585938, "router_z_loss_mlp": 0.40185547, "step": 8069, "time_per_iteration": 2.674145460128784 }, { "auxiliary_loss_clip": 0.01620711, "auxiliary_loss_mlp": 0.0037659, "balance_loss_clip": 1.34298587, "balance_loss_mlp": 0.33603477, "epoch": 0.48519464903051257, "flos": 27089681420160.0, "grad_norm": 18.95450700553503, "language_loss": 0.83854616, "learning_rate": 2.192678959687493e-06, "loss": 0.85851914, "num_input_tokens_seen": 173475300, "router_z_loss_clip": 2.77539062, "router_z_loss_mlp": 0.40527344, "step": 8070, "time_per_iteration": 2.7325594425201416 }, { "auxiliary_loss_clip": 0.01610824, "auxiliary_loss_mlp": 0.00397798, "balance_loss_clip": 1.33033609, "balance_loss_mlp": 0.35693276, "epoch": 0.48525477228318054, "flos": 17127235739520.0, "grad_norm": 4.090360297454009, "language_loss": 0.86672121, "learning_rate": 2.192291305922943e-06, "loss": 0.88680744, "num_input_tokens_seen": 173492005, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.40869141, "step": 8071, "time_per_iteration": 2.6274349689483643 }, { "auxiliary_loss_clip": 0.01623659, "auxiliary_loss_mlp": 0.00361828, "balance_loss_clip": 1.34348571, "balance_loss_mlp": 0.32174999, "epoch": 0.4853148955358485, "flos": 28180324028160.0, "grad_norm": 21.982631484410952, "language_loss": 0.77302265, "learning_rate": 2.1919036448666873e-06, "loss": 0.79287744, "num_input_tokens_seen": 173511995, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.40063477, "step": 8072, "time_per_iteration": 2.7313485145568848 }, { "auxiliary_loss_clip": 0.01631757, "auxiliary_loss_mlp": 0.0036942, "balance_loss_clip": 1.34405851, "balance_loss_mlp": 0.32812619, "epoch": 0.48537501878851647, "flos": 17493309198720.0, "grad_norm": 14.785133891502923, "language_loss": 0.94227272, "learning_rate": 2.1915159765334262e-06, "loss": 0.96228456, "num_input_tokens_seen": 173530215, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.41333008, "step": 8073, "time_per_iteration": 2.7510604858398438 }, { "auxiliary_loss_clip": 0.01620784, "auxiliary_loss_mlp": 0.00352981, "balance_loss_clip": 1.34868431, "balance_loss_mlp": 0.31585971, "epoch": 0.48543514204118443, "flos": 28584857975040.0, "grad_norm": 13.14988526651096, "language_loss": 0.67439985, "learning_rate": 2.19112830093786e-06, "loss": 0.69413757, "num_input_tokens_seen": 173550920, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.37060547, "step": 8074, "time_per_iteration": 2.699949264526367 }, { "auxiliary_loss_clip": 0.01621872, "auxiliary_loss_mlp": 0.00343696, "balance_loss_clip": 1.33963692, "balance_loss_mlp": 0.30485764, "epoch": 0.4854952652938524, "flos": 20959981585920.0, "grad_norm": 3.5980100315598516, "language_loss": 0.78353971, "learning_rate": 2.19074061809469e-06, "loss": 0.80319536, "num_input_tokens_seen": 173569065, "router_z_loss_clip": 2.81835938, "router_z_loss_mlp": 0.38818359, "step": 8075, "time_per_iteration": 2.6677908897399902 }, { "auxiliary_loss_clip": 0.01632995, "auxiliary_loss_mlp": 0.00341954, "balance_loss_clip": 1.35546899, "balance_loss_mlp": 0.30523768, "epoch": 0.48555538854652036, "flos": 66529543155840.0, "grad_norm": 90.06876939117188, "language_loss": 0.86069101, "learning_rate": 2.1903529280186163e-06, "loss": 0.88044047, "num_input_tokens_seen": 173596085, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.3671875, "step": 8076, "time_per_iteration": 3.081855297088623 }, { "auxiliary_loss_clip": 0.01634727, "auxiliary_loss_mlp": 0.00373842, "balance_loss_clip": 1.34814346, "balance_loss_mlp": 0.33083174, "epoch": 0.4856155117991883, "flos": 15924982596480.0, "grad_norm": 30.17381864254862, "language_loss": 0.91764152, "learning_rate": 2.1899652307243407e-06, "loss": 0.93772721, "num_input_tokens_seen": 173613900, "router_z_loss_clip": 2.86328125, "router_z_loss_mlp": 0.4296875, "step": 8077, "time_per_iteration": 2.654985189437866 }, { "auxiliary_loss_clip": 0.01476443, "auxiliary_loss_mlp": 0.00098375, "balance_loss_clip": 1.30143511, "balance_loss_mlp": 0.08721658, "epoch": 0.4856756350518563, "flos": 71047395060480.0, "grad_norm": 0.9116318905275735, "language_loss": 0.57815093, "learning_rate": 2.189577526226564e-06, "loss": 0.59389913, "num_input_tokens_seen": 173671305, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.11181641, "step": 8078, "time_per_iteration": 3.146068811416626 }, { "auxiliary_loss_clip": 0.01639496, "auxiliary_loss_mlp": 0.00357316, "balance_loss_clip": 1.3459444, "balance_loss_mlp": 0.31661838, "epoch": 0.48573575830452426, "flos": 29825679346560.0, "grad_norm": 77.67071489469244, "language_loss": 0.78686309, "learning_rate": 2.1891898145399884e-06, "loss": 0.80683124, "num_input_tokens_seen": 173692070, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.40698242, "step": 8079, "time_per_iteration": 2.833815336227417 }, { "auxiliary_loss_clip": 0.01638341, "auxiliary_loss_mlp": 0.00333707, "balance_loss_clip": 1.35219979, "balance_loss_mlp": 0.29532209, "epoch": 0.4857958815571922, "flos": 17639501552640.0, "grad_norm": 2.7981869531608923, "language_loss": 0.89322937, "learning_rate": 2.1888020956793172e-06, "loss": 0.91294986, "num_input_tokens_seen": 173709785, "router_z_loss_clip": 2.86132812, "router_z_loss_mlp": 0.3840332, "step": 8080, "time_per_iteration": 4.195082902908325 }, { "auxiliary_loss_clip": 0.01625293, "auxiliary_loss_mlp": 0.00354863, "balance_loss_clip": 1.34392667, "balance_loss_mlp": 0.3150472, "epoch": 0.4858560048098602, "flos": 21105491581440.0, "grad_norm": 3.203433580186773, "language_loss": 0.8918196, "learning_rate": 2.188414369659251e-06, "loss": 0.91162115, "num_input_tokens_seen": 173728770, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.3984375, "step": 8081, "time_per_iteration": 2.6881327629089355 }, { "auxiliary_loss_clip": 0.01640838, "auxiliary_loss_mlp": 0.00371929, "balance_loss_clip": 1.35425079, "balance_loss_mlp": 0.32751143, "epoch": 0.4859161280625282, "flos": 22090844448000.0, "grad_norm": 13.294278913391782, "language_loss": 0.87963402, "learning_rate": 2.1880266364944924e-06, "loss": 0.89976168, "num_input_tokens_seen": 173747355, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.44458008, "step": 8082, "time_per_iteration": 2.680919647216797 }, { "auxiliary_loss_clip": 0.01644175, "auxiliary_loss_mlp": 0.00333789, "balance_loss_clip": 1.3628726, "balance_loss_mlp": 0.29490328, "epoch": 0.4859762513151962, "flos": 17493452853120.0, "grad_norm": 2.4747759416207105, "language_loss": 0.93192559, "learning_rate": 2.187638896199746e-06, "loss": 0.95170522, "num_input_tokens_seen": 173764825, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.38891602, "step": 8083, "time_per_iteration": 4.231025218963623 }, { "auxiliary_loss_clip": 0.01622208, "auxiliary_loss_mlp": 0.00335786, "balance_loss_clip": 1.35078812, "balance_loss_mlp": 0.29737711, "epoch": 0.48603637456786414, "flos": 18004246208640.0, "grad_norm": 17.037036747524226, "language_loss": 0.87416995, "learning_rate": 2.1872511487897126e-06, "loss": 0.89374995, "num_input_tokens_seen": 173783215, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.3840332, "step": 8084, "time_per_iteration": 2.6831517219543457 }, { "auxiliary_loss_clip": 0.01647, "auxiliary_loss_mlp": 0.00360718, "balance_loss_clip": 1.3602469, "balance_loss_mlp": 0.32142621, "epoch": 0.4860964978205321, "flos": 22492038430080.0, "grad_norm": 14.054080385794629, "language_loss": 0.75225341, "learning_rate": 2.186863394279098e-06, "loss": 0.77233058, "num_input_tokens_seen": 173801905, "router_z_loss_clip": 2.87109375, "router_z_loss_mlp": 0.39282227, "step": 8085, "time_per_iteration": 4.055827856063843 }, { "auxiliary_loss_clip": 0.01638242, "auxiliary_loss_mlp": 0.00337969, "balance_loss_clip": 1.35516536, "balance_loss_mlp": 0.29824811, "epoch": 0.48615662107320007, "flos": 23372532518400.0, "grad_norm": 18.829276320622952, "language_loss": 0.82582414, "learning_rate": 2.1864756326826046e-06, "loss": 0.84558624, "num_input_tokens_seen": 173824690, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.3972168, "step": 8086, "time_per_iteration": 2.7111735343933105 }, { "auxiliary_loss_clip": 0.01645409, "auxiliary_loss_mlp": 0.00336221, "balance_loss_clip": 1.36005187, "balance_loss_mlp": 0.29716766, "epoch": 0.48621674432586803, "flos": 34418833136640.0, "grad_norm": 8.752229687969285, "language_loss": 0.76303113, "learning_rate": 2.1860878640149355e-06, "loss": 0.7828474, "num_input_tokens_seen": 173844450, "router_z_loss_clip": 2.85546875, "router_z_loss_mlp": 0.39038086, "step": 8087, "time_per_iteration": 2.7602598667144775 }, { "auxiliary_loss_clip": 0.01638934, "auxiliary_loss_mlp": 0.00362596, "balance_loss_clip": 1.34748006, "balance_loss_mlp": 0.31996632, "epoch": 0.486276867578536, "flos": 33107555237760.0, "grad_norm": 73.27228151794307, "language_loss": 0.79956758, "learning_rate": 2.1857000882907974e-06, "loss": 0.81958294, "num_input_tokens_seen": 173864975, "router_z_loss_clip": 2.91601562, "router_z_loss_mlp": 0.42602539, "step": 8088, "time_per_iteration": 2.7506985664367676 }, { "auxiliary_loss_clip": 0.01649789, "auxiliary_loss_mlp": 0.00339568, "balance_loss_clip": 1.36271501, "balance_loss_mlp": 0.29882246, "epoch": 0.48633699083120396, "flos": 21470703114240.0, "grad_norm": 180.24922632484748, "language_loss": 0.81512845, "learning_rate": 2.185312305524892e-06, "loss": 0.83502197, "num_input_tokens_seen": 173883805, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.4074707, "step": 8089, "time_per_iteration": 2.657520055770874 }, { "auxiliary_loss_clip": 0.0165425, "auxiliary_loss_mlp": 0.00361852, "balance_loss_clip": 1.36297202, "balance_loss_mlp": 0.32041508, "epoch": 0.48639711408387193, "flos": 20084335833600.0, "grad_norm": 24.30865346380841, "language_loss": 0.89217901, "learning_rate": 2.184924515731926e-06, "loss": 0.91234004, "num_input_tokens_seen": 173903520, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.41455078, "step": 8090, "time_per_iteration": 4.0763022899627686 }, { "auxiliary_loss_clip": 0.01664976, "auxiliary_loss_mlp": 0.0032987, "balance_loss_clip": 1.37680507, "balance_loss_mlp": 0.28898153, "epoch": 0.4864572373365399, "flos": 20778884190720.0, "grad_norm": 3.3744775035250645, "language_loss": 0.81860423, "learning_rate": 2.1845367189266045e-06, "loss": 0.83855265, "num_input_tokens_seen": 173924255, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.40869141, "step": 8091, "time_per_iteration": 2.6842238903045654 }, { "auxiliary_loss_clip": 0.01672138, "auxiliary_loss_mlp": 0.00349057, "balance_loss_clip": 1.37603235, "balance_loss_mlp": 0.30633262, "epoch": 0.48651736058920786, "flos": 26025360503040.0, "grad_norm": 6.199191207063659, "language_loss": 0.84985626, "learning_rate": 2.184148915123631e-06, "loss": 0.87006819, "num_input_tokens_seen": 173943285, "router_z_loss_clip": 2.9609375, "router_z_loss_mlp": 0.42700195, "step": 8092, "time_per_iteration": 2.7277233600616455 }, { "auxiliary_loss_clip": 0.01690217, "auxiliary_loss_mlp": 0.00331393, "balance_loss_clip": 1.39094532, "balance_loss_mlp": 0.28974128, "epoch": 0.4865774838418758, "flos": 20485601642880.0, "grad_norm": 35.23364157118287, "language_loss": 0.7852459, "learning_rate": 2.1837611043377126e-06, "loss": 0.805462, "num_input_tokens_seen": 173962205, "router_z_loss_clip": 2.98828125, "router_z_loss_mlp": 0.41601562, "step": 8093, "time_per_iteration": 2.7288691997528076 }, { "auxiliary_loss_clip": 0.01685893, "auxiliary_loss_mlp": 0.00341587, "balance_loss_clip": 1.3894428, "balance_loss_mlp": 0.30048323, "epoch": 0.4866376070945438, "flos": 23547704169600.0, "grad_norm": 7.65772948616519, "language_loss": 0.7518189, "learning_rate": 2.1833732865835545e-06, "loss": 0.77209365, "num_input_tokens_seen": 173980945, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.41064453, "step": 8094, "time_per_iteration": 2.788161277770996 }, { "auxiliary_loss_clip": 0.01718649, "auxiliary_loss_mlp": 0.00356281, "balance_loss_clip": 1.40147436, "balance_loss_mlp": 0.3149865, "epoch": 0.4866977303472118, "flos": 16690598012160.0, "grad_norm": 4.372325643533697, "language_loss": 0.76133132, "learning_rate": 2.1829854618758636e-06, "loss": 0.78208059, "num_input_tokens_seen": 173998860, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.41308594, "step": 8095, "time_per_iteration": 2.6834423542022705 }, { "auxiliary_loss_clip": 0.0168445, "auxiliary_loss_mlp": 0.00319904, "balance_loss_clip": 1.38492668, "balance_loss_mlp": 0.27772814, "epoch": 0.4867578535998798, "flos": 17896011552000.0, "grad_norm": 25.656953361858804, "language_loss": 0.85298669, "learning_rate": 2.182597630229345e-06, "loss": 0.87303019, "num_input_tokens_seen": 174016665, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.42163086, "step": 8096, "time_per_iteration": 2.646864414215088 }, { "auxiliary_loss_clip": 0.01693263, "auxiliary_loss_mlp": 0.00346209, "balance_loss_clip": 1.39482415, "balance_loss_mlp": 0.30422375, "epoch": 0.48681797685254774, "flos": 22637799820800.0, "grad_norm": 178.73950704180254, "language_loss": 0.74456024, "learning_rate": 2.1822097916587067e-06, "loss": 0.76495498, "num_input_tokens_seen": 174034800, "router_z_loss_clip": 2.98828125, "router_z_loss_mlp": 0.41967773, "step": 8097, "time_per_iteration": 2.6971967220306396 }, { "auxiliary_loss_clip": 0.01715413, "auxiliary_loss_mlp": 0.00313035, "balance_loss_clip": 1.40263915, "balance_loss_mlp": 0.27295658, "epoch": 0.4868781001052157, "flos": 20886077352960.0, "grad_norm": 5.370627185553025, "language_loss": 0.77215022, "learning_rate": 2.1818219461786543e-06, "loss": 0.79243469, "num_input_tokens_seen": 174054445, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.40087891, "step": 8098, "time_per_iteration": 2.7149953842163086 }, { "auxiliary_loss_clip": 0.01728095, "auxiliary_loss_mlp": 0.00345171, "balance_loss_clip": 1.40674758, "balance_loss_mlp": 0.29970431, "epoch": 0.48693822335788367, "flos": 41974940937600.0, "grad_norm": 87.68975541122627, "language_loss": 0.72760153, "learning_rate": 2.1814340938038956e-06, "loss": 0.74833429, "num_input_tokens_seen": 174077890, "router_z_loss_clip": 3.21484375, "router_z_loss_mlp": 0.45483398, "step": 8099, "time_per_iteration": 2.8706655502319336 }, { "auxiliary_loss_clip": 0.01720509, "auxiliary_loss_mlp": 0.00332039, "balance_loss_clip": 1.40633368, "balance_loss_mlp": 0.28948149, "epoch": 0.48699834661055164, "flos": 24243294021120.0, "grad_norm": 2.479773421331886, "language_loss": 0.76795328, "learning_rate": 2.181046234549138e-06, "loss": 0.78847873, "num_input_tokens_seen": 174097460, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.42553711, "step": 8100, "time_per_iteration": 2.711744546890259 }, { "auxiliary_loss_clip": 0.01721035, "auxiliary_loss_mlp": 0.00319459, "balance_loss_clip": 1.41171217, "balance_loss_mlp": 0.27237177, "epoch": 0.4870584698632196, "flos": 25923877603200.0, "grad_norm": 1.544977812262047, "language_loss": 0.81092978, "learning_rate": 2.180658368429088e-06, "loss": 0.83133471, "num_input_tokens_seen": 174120775, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.47070312, "step": 8101, "time_per_iteration": 2.7197091579437256 }, { "auxiliary_loss_clip": 0.01477493, "auxiliary_loss_mlp": 0.00097685, "balance_loss_clip": 1.30724525, "balance_loss_mlp": 0.08433346, "epoch": 0.48711859311588757, "flos": 70211933648640.0, "grad_norm": 0.6873299800821779, "language_loss": 0.51776296, "learning_rate": 2.1802704954584565e-06, "loss": 0.53351474, "num_input_tokens_seen": 174189135, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.13378906, "step": 8102, "time_per_iteration": 3.329775333404541 }, { "auxiliary_loss_clip": 0.0170646, "auxiliary_loss_mlp": 0.0032804, "balance_loss_clip": 1.39552414, "balance_loss_mlp": 0.2844573, "epoch": 0.48717871636855553, "flos": 12342964659840.0, "grad_norm": 10.59475819163748, "language_loss": 0.78078359, "learning_rate": 2.1798826156519484e-06, "loss": 0.80112857, "num_input_tokens_seen": 174203250, "router_z_loss_clip": 3.11132812, "router_z_loss_mlp": 0.4362793, "step": 8103, "time_per_iteration": 2.668189764022827 }, { "auxiliary_loss_clip": 0.01713769, "auxiliary_loss_mlp": 0.00321357, "balance_loss_clip": 1.39921236, "balance_loss_mlp": 0.27734452, "epoch": 0.4872388396212235, "flos": 23477139901440.0, "grad_norm": 24.23414201245346, "language_loss": 0.68545073, "learning_rate": 2.1794947290242737e-06, "loss": 0.70580202, "num_input_tokens_seen": 174224145, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.44042969, "step": 8104, "time_per_iteration": 2.671679735183716 }, { "auxiliary_loss_clip": 0.01754374, "auxiliary_loss_mlp": 0.0033009, "balance_loss_clip": 1.43229437, "balance_loss_mlp": 0.28414658, "epoch": 0.48729896287389146, "flos": 31427582186880.0, "grad_norm": 2.898134847514157, "language_loss": 0.74281311, "learning_rate": 2.1791068355901413e-06, "loss": 0.76365769, "num_input_tokens_seen": 174244435, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.45996094, "step": 8105, "time_per_iteration": 2.7569847106933594 }, { "auxiliary_loss_clip": 0.01742212, "auxiliary_loss_mlp": 0.0035034, "balance_loss_clip": 1.42522383, "balance_loss_mlp": 0.30463549, "epoch": 0.4873590861265594, "flos": 19057936700160.0, "grad_norm": 5.973654210809919, "language_loss": 0.79907596, "learning_rate": 2.178718935364259e-06, "loss": 0.82000154, "num_input_tokens_seen": 174262710, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.45751953, "step": 8106, "time_per_iteration": 2.7501235008239746 }, { "auxiliary_loss_clip": 0.01755013, "auxiliary_loss_mlp": 0.00374576, "balance_loss_clip": 1.42530644, "balance_loss_mlp": 0.33011097, "epoch": 0.4874192093792274, "flos": 24348296453760.0, "grad_norm": 18.970209485590637, "language_loss": 0.82396734, "learning_rate": 2.1783310283613373e-06, "loss": 0.84526324, "num_input_tokens_seen": 174281545, "router_z_loss_clip": 3.29492188, "router_z_loss_mlp": 0.44482422, "step": 8107, "time_per_iteration": 2.713927745819092 }, { "auxiliary_loss_clip": 0.0175081, "auxiliary_loss_mlp": 0.0033117, "balance_loss_clip": 1.43584454, "balance_loss_mlp": 0.28653759, "epoch": 0.4874793326318954, "flos": 23112610727040.0, "grad_norm": 7.099705318149349, "language_loss": 0.81994307, "learning_rate": 2.1779431145960853e-06, "loss": 0.84076285, "num_input_tokens_seen": 174300290, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.4465332, "step": 8108, "time_per_iteration": 2.663421392440796 }, { "auxiliary_loss_clip": 0.01724449, "auxiliary_loss_mlp": 0.00286848, "balance_loss_clip": 1.41692328, "balance_loss_mlp": 0.24469538, "epoch": 0.4875394558845634, "flos": 19026156142080.0, "grad_norm": 12.2217517781924, "language_loss": 0.80125707, "learning_rate": 2.177555194083212e-06, "loss": 0.82137007, "num_input_tokens_seen": 174318490, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.42163086, "step": 8109, "time_per_iteration": 2.667231559753418 }, { "auxiliary_loss_clip": 0.01740998, "auxiliary_loss_mlp": 0.00302214, "balance_loss_clip": 1.42984343, "balance_loss_mlp": 0.25808299, "epoch": 0.48759957913723134, "flos": 21433607343360.0, "grad_norm": 191.27250697614045, "language_loss": 0.83102477, "learning_rate": 2.177167266837428e-06, "loss": 0.85145688, "num_input_tokens_seen": 174335505, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.44116211, "step": 8110, "time_per_iteration": 2.673060417175293 }, { "auxiliary_loss_clip": 0.01754045, "auxiliary_loss_mlp": 0.00325551, "balance_loss_clip": 1.43109584, "balance_loss_mlp": 0.28156233, "epoch": 0.4876597023898993, "flos": 17748669962880.0, "grad_norm": 10.36811642690939, "language_loss": 0.80667114, "learning_rate": 2.176779332873444e-06, "loss": 0.82746702, "num_input_tokens_seen": 174353990, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.44042969, "step": 8111, "time_per_iteration": 2.6539814472198486 }, { "auxiliary_loss_clip": 0.01742384, "auxiliary_loss_mlp": 0.00348997, "balance_loss_clip": 1.42965686, "balance_loss_mlp": 0.30658215, "epoch": 0.4877198256425673, "flos": 17019647527680.0, "grad_norm": 22.979846604815688, "language_loss": 0.8066116, "learning_rate": 2.17639139220597e-06, "loss": 0.82752538, "num_input_tokens_seen": 174373425, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.42431641, "step": 8112, "time_per_iteration": 2.665844202041626 }, { "auxiliary_loss_clip": 0.01738379, "auxiliary_loss_mlp": 0.00351009, "balance_loss_clip": 1.41281617, "balance_loss_mlp": 0.30399281, "epoch": 0.48777994889523524, "flos": 22384091082240.0, "grad_norm": 2.6774532638290296, "language_loss": 0.80368114, "learning_rate": 2.1760034448497166e-06, "loss": 0.82457507, "num_input_tokens_seen": 174393070, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.47021484, "step": 8113, "time_per_iteration": 2.678218126296997 }, { "auxiliary_loss_clip": 0.01515102, "auxiliary_loss_mlp": 0.0008526, "balance_loss_clip": 1.33900964, "balance_loss_mlp": 0.0684756, "epoch": 0.4878400721479032, "flos": 61241772159360.0, "grad_norm": 0.7851369840249823, "language_loss": 0.48556584, "learning_rate": 2.1756154908193943e-06, "loss": 0.50156951, "num_input_tokens_seen": 174446880, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.16796875, "step": 8114, "time_per_iteration": 3.057821750640869 }, { "auxiliary_loss_clip": 0.01745019, "auxiliary_loss_mlp": 0.00299962, "balance_loss_clip": 1.42544198, "balance_loss_mlp": 0.25428143, "epoch": 0.48790019540057117, "flos": 24536612482560.0, "grad_norm": 41.7703855707457, "language_loss": 0.83909106, "learning_rate": 2.1752275301297155e-06, "loss": 0.85954088, "num_input_tokens_seen": 174468485, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.45678711, "step": 8115, "time_per_iteration": 2.71944522857666 }, { "auxiliary_loss_clip": 0.01756057, "auxiliary_loss_mlp": 0.00334245, "balance_loss_clip": 1.42878377, "balance_loss_mlp": 0.28789636, "epoch": 0.48796031865323913, "flos": 21833939399040.0, "grad_norm": 24.482790539784304, "language_loss": 0.81607407, "learning_rate": 2.1748395627953915e-06, "loss": 0.83697712, "num_input_tokens_seen": 174486360, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.46362305, "step": 8116, "time_per_iteration": 2.7781991958618164 }, { "auxiliary_loss_clip": 0.01747575, "auxiliary_loss_mlp": 0.00334422, "balance_loss_clip": 1.43288529, "balance_loss_mlp": 0.29148284, "epoch": 0.4880204419059071, "flos": 18588907883520.0, "grad_norm": 70.15984799823327, "language_loss": 0.71645069, "learning_rate": 2.1744515888311335e-06, "loss": 0.73727059, "num_input_tokens_seen": 174505075, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.4296875, "step": 8117, "time_per_iteration": 2.704153299331665 }, { "auxiliary_loss_clip": 0.01736201, "auxiliary_loss_mlp": 0.003165, "balance_loss_clip": 1.42395377, "balance_loss_mlp": 0.27303615, "epoch": 0.48808056515857506, "flos": 19172168928000.0, "grad_norm": 35.6602153884737, "language_loss": 0.84631431, "learning_rate": 2.1740636082516533e-06, "loss": 0.86684132, "num_input_tokens_seen": 174523385, "router_z_loss_clip": 3.12304688, "router_z_loss_mlp": 0.43505859, "step": 8118, "time_per_iteration": 2.684366226196289 }, { "auxiliary_loss_clip": 0.01740233, "auxiliary_loss_mlp": 0.00356133, "balance_loss_clip": 1.42337108, "balance_loss_mlp": 0.3117398, "epoch": 0.48814068841124303, "flos": 20120497850880.0, "grad_norm": 3.1274364090770232, "language_loss": 0.7169987, "learning_rate": 2.1736756210716645e-06, "loss": 0.73796231, "num_input_tokens_seen": 174542200, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.4440918, "step": 8119, "time_per_iteration": 2.650541305541992 }, { "auxiliary_loss_clip": 0.01760609, "auxiliary_loss_mlp": 0.00338372, "balance_loss_clip": 1.42999709, "balance_loss_mlp": 0.29757825, "epoch": 0.488200811663911, "flos": 22965592360320.0, "grad_norm": 6.854083480534947, "language_loss": 0.79876482, "learning_rate": 2.173287627305878e-06, "loss": 0.81975472, "num_input_tokens_seen": 174563620, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.40795898, "step": 8120, "time_per_iteration": 2.7444674968719482 }, { "auxiliary_loss_clip": 0.01772364, "auxiliary_loss_mlp": 0.00324439, "balance_loss_clip": 1.44346762, "balance_loss_mlp": 0.27828091, "epoch": 0.48826093491657896, "flos": 33910697387520.0, "grad_norm": 490.53534758602365, "language_loss": 0.68733376, "learning_rate": 2.1728996269690075e-06, "loss": 0.70830178, "num_input_tokens_seen": 174586465, "router_z_loss_clip": 3.28515625, "router_z_loss_mlp": 0.46142578, "step": 8121, "time_per_iteration": 2.7783873081207275 }, { "auxiliary_loss_clip": 0.01756699, "auxiliary_loss_mlp": 0.00348409, "balance_loss_clip": 1.42487788, "balance_loss_mlp": 0.30117831, "epoch": 0.488321058169247, "flos": 23070307484160.0, "grad_norm": 20.38815350774075, "language_loss": 0.91054296, "learning_rate": 2.1725116200757664e-06, "loss": 0.93159401, "num_input_tokens_seen": 174604035, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.47192383, "step": 8122, "time_per_iteration": 4.124528169631958 }, { "auxiliary_loss_clip": 0.01782212, "auxiliary_loss_mlp": 0.0032142, "balance_loss_clip": 1.44350564, "balance_loss_mlp": 0.27683526, "epoch": 0.48838118142191494, "flos": 19317714837120.0, "grad_norm": 22.882908528142977, "language_loss": 0.90971816, "learning_rate": 2.172123606640866e-06, "loss": 0.93075454, "num_input_tokens_seen": 174621715, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.44555664, "step": 8123, "time_per_iteration": 2.68357515335083 }, { "auxiliary_loss_clip": 0.0179649, "auxiliary_loss_mlp": 0.00346757, "balance_loss_clip": 1.44986558, "balance_loss_mlp": 0.30105257, "epoch": 0.4884413046745829, "flos": 25410678036480.0, "grad_norm": 106.72505042696287, "language_loss": 0.90742302, "learning_rate": 2.1717355866790227e-06, "loss": 0.92885542, "num_input_tokens_seen": 174643835, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.45654297, "step": 8124, "time_per_iteration": 2.740358829498291 }, { "auxiliary_loss_clip": 0.01784726, "auxiliary_loss_mlp": 0.00338305, "balance_loss_clip": 1.45191467, "balance_loss_mlp": 0.29610503, "epoch": 0.4885014279272509, "flos": 20991546662400.0, "grad_norm": 7.468246560942172, "language_loss": 0.85525346, "learning_rate": 2.171347560204948e-06, "loss": 0.8764838, "num_input_tokens_seen": 174660955, "router_z_loss_clip": 3.328125, "router_z_loss_mlp": 0.421875, "step": 8125, "time_per_iteration": 2.6699728965759277 }, { "auxiliary_loss_clip": 0.0178087, "auxiliary_loss_mlp": 0.00346268, "balance_loss_clip": 1.44539785, "balance_loss_mlp": 0.30402029, "epoch": 0.48856155117991884, "flos": 13771599269760.0, "grad_norm": 4.209018266711976, "language_loss": 0.79007399, "learning_rate": 2.170959527233356e-06, "loss": 0.8113454, "num_input_tokens_seen": 174678270, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.42236328, "step": 8126, "time_per_iteration": 4.234640836715698 }, { "auxiliary_loss_clip": 0.01828998, "auxiliary_loss_mlp": 0.00356005, "balance_loss_clip": 1.47230649, "balance_loss_mlp": 0.3108007, "epoch": 0.4886216744325868, "flos": 32087764206720.0, "grad_norm": 7.246604081212289, "language_loss": 0.7408585, "learning_rate": 2.1705714877789633e-06, "loss": 0.76270854, "num_input_tokens_seen": 174698360, "router_z_loss_clip": 3.56835938, "router_z_loss_mlp": 0.45263672, "step": 8127, "time_per_iteration": 4.258450269699097 }, { "auxiliary_loss_clip": 0.01821392, "auxiliary_loss_mlp": 0.00368406, "balance_loss_clip": 1.46480703, "balance_loss_mlp": 0.32346442, "epoch": 0.48868179768525477, "flos": 19610063631360.0, "grad_norm": 4.9987762601759815, "language_loss": 0.82200122, "learning_rate": 2.170183441856481e-06, "loss": 0.84389913, "num_input_tokens_seen": 174716755, "router_z_loss_clip": 3.56835938, "router_z_loss_mlp": 0.44921875, "step": 8128, "time_per_iteration": 2.6581361293792725 }, { "auxiliary_loss_clip": 0.0181949, "auxiliary_loss_mlp": 0.00376843, "balance_loss_clip": 1.46620631, "balance_loss_mlp": 0.33051825, "epoch": 0.48874192093792274, "flos": 21286912199040.0, "grad_norm": 6.194027512099844, "language_loss": 0.82224995, "learning_rate": 2.1697953894806265e-06, "loss": 0.84421325, "num_input_tokens_seen": 174735560, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.46264648, "step": 8129, "time_per_iteration": 2.7145674228668213 }, { "auxiliary_loss_clip": 0.01838582, "auxiliary_loss_mlp": 0.00356999, "balance_loss_clip": 1.47723258, "balance_loss_mlp": 0.31205705, "epoch": 0.4888020441905907, "flos": 14173439696640.0, "grad_norm": 9.559321449739732, "language_loss": 0.73243392, "learning_rate": 2.169407330666114e-06, "loss": 0.7543897, "num_input_tokens_seen": 174752730, "router_z_loss_clip": 3.60742188, "router_z_loss_mlp": 0.44921875, "step": 8130, "time_per_iteration": 2.6408941745758057 }, { "auxiliary_loss_clip": 0.01816071, "auxiliary_loss_mlp": 0.00319283, "balance_loss_clip": 1.47172213, "balance_loss_mlp": 0.27453142, "epoch": 0.48886216744325867, "flos": 24097891766400.0, "grad_norm": 337.5244275886292, "language_loss": 0.78217417, "learning_rate": 2.169019265427658e-06, "loss": 0.80352765, "num_input_tokens_seen": 174772520, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.44775391, "step": 8131, "time_per_iteration": 2.682281255722046 }, { "auxiliary_loss_clip": 0.01844503, "auxiliary_loss_mlp": 0.00347151, "balance_loss_clip": 1.48202336, "balance_loss_mlp": 0.29996771, "epoch": 0.48892229069592663, "flos": 38431419402240.0, "grad_norm": 29.41030508571792, "language_loss": 0.73999125, "learning_rate": 2.1686311937799745e-06, "loss": 0.76190782, "num_input_tokens_seen": 174796540, "router_z_loss_clip": 3.62890625, "router_z_loss_mlp": 0.47192383, "step": 8132, "time_per_iteration": 4.2306506633758545 }, { "auxiliary_loss_clip": 0.01836816, "auxiliary_loss_mlp": 0.00352915, "balance_loss_clip": 1.48221517, "balance_loss_mlp": 0.30709052, "epoch": 0.4889824139485946, "flos": 23843321101440.0, "grad_norm": 3.165970023733327, "language_loss": 0.74855155, "learning_rate": 2.1682431157377797e-06, "loss": 0.77044886, "num_input_tokens_seen": 174817840, "router_z_loss_clip": 3.54882812, "router_z_loss_mlp": 0.45776367, "step": 8133, "time_per_iteration": 2.7084128856658936 }, { "auxiliary_loss_clip": 0.01816814, "auxiliary_loss_mlp": 0.00341144, "balance_loss_clip": 1.46959209, "balance_loss_mlp": 0.29946834, "epoch": 0.48904253720126256, "flos": 24425827960320.0, "grad_norm": 10.00437214852962, "language_loss": 0.76449388, "learning_rate": 2.1678550313157883e-06, "loss": 0.78607345, "num_input_tokens_seen": 174837885, "router_z_loss_clip": 3.4765625, "router_z_loss_mlp": 0.41674805, "step": 8134, "time_per_iteration": 2.707709789276123 }, { "auxiliary_loss_clip": 0.01825642, "auxiliary_loss_mlp": 0.00382489, "balance_loss_clip": 1.46902454, "balance_loss_mlp": 0.33580691, "epoch": 0.4891026604539306, "flos": 24170682677760.0, "grad_norm": 26.62676173601318, "language_loss": 0.85413384, "learning_rate": 2.167466940528718e-06, "loss": 0.87621522, "num_input_tokens_seen": 174855240, "router_z_loss_clip": 3.56640625, "router_z_loss_mlp": 0.46679688, "step": 8135, "time_per_iteration": 2.688605785369873 }, { "auxiliary_loss_clip": 0.01826881, "auxiliary_loss_mlp": 0.00351677, "balance_loss_clip": 1.47327435, "balance_loss_mlp": 0.306997, "epoch": 0.48916278370659855, "flos": 21470954509440.0, "grad_norm": 32.226858775408196, "language_loss": 0.79981577, "learning_rate": 2.1670788433912843e-06, "loss": 0.82160139, "num_input_tokens_seen": 174875145, "router_z_loss_clip": 3.53710938, "router_z_loss_mlp": 0.44677734, "step": 8136, "time_per_iteration": 2.7471845149993896 }, { "auxiliary_loss_clip": 0.01823395, "auxiliary_loss_mlp": 0.0035396, "balance_loss_clip": 1.47310972, "balance_loss_mlp": 0.31121191, "epoch": 0.4892229069592665, "flos": 22309755886080.0, "grad_norm": 44.479420098249456, "language_loss": 0.78006268, "learning_rate": 2.166690739918204e-06, "loss": 0.80183619, "num_input_tokens_seen": 174894770, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.42749023, "step": 8137, "time_per_iteration": 2.6840076446533203 }, { "auxiliary_loss_clip": 0.01823047, "auxiliary_loss_mlp": 0.00356698, "balance_loss_clip": 1.46664429, "balance_loss_mlp": 0.30932379, "epoch": 0.4892830302119345, "flos": 12786856934400.0, "grad_norm": 14.258433454983797, "language_loss": 0.82517081, "learning_rate": 2.1663026301241944e-06, "loss": 0.84696829, "num_input_tokens_seen": 174912780, "router_z_loss_clip": 3.5625, "router_z_loss_mlp": 0.47387695, "step": 8138, "time_per_iteration": 2.63661527633667 }, { "auxiliary_loss_clip": 0.01820711, "auxiliary_loss_mlp": 0.00369334, "balance_loss_clip": 1.47374606, "balance_loss_mlp": 0.32348657, "epoch": 0.48934315346460244, "flos": 20813896972800.0, "grad_norm": 30.43833279119974, "language_loss": 0.7846272, "learning_rate": 2.165914514023972e-06, "loss": 0.80652761, "num_input_tokens_seen": 174931250, "router_z_loss_clip": 3.47070312, "router_z_loss_mlp": 0.45849609, "step": 8139, "time_per_iteration": 2.6750714778900146 }, { "auxiliary_loss_clip": 0.01820627, "auxiliary_loss_mlp": 0.00347842, "balance_loss_clip": 1.47371793, "balance_loss_mlp": 0.30273294, "epoch": 0.4894032767172704, "flos": 19755537713280.0, "grad_norm": 4.07099940628131, "language_loss": 0.69976431, "learning_rate": 2.165526391632255e-06, "loss": 0.72144902, "num_input_tokens_seen": 174951105, "router_z_loss_clip": 3.46679688, "router_z_loss_mlp": 0.45117188, "step": 8140, "time_per_iteration": 2.7008414268493652 }, { "auxiliary_loss_clip": 0.01834496, "auxiliary_loss_mlp": 0.00372195, "balance_loss_clip": 1.47345865, "balance_loss_mlp": 0.32308114, "epoch": 0.4894633999699384, "flos": 17818982835840.0, "grad_norm": 14.662687485566627, "language_loss": 0.87047398, "learning_rate": 2.1651382629637608e-06, "loss": 0.89254081, "num_input_tokens_seen": 174969120, "router_z_loss_clip": 3.61132812, "router_z_loss_mlp": 0.49121094, "step": 8141, "time_per_iteration": 2.6833789348602295 }, { "auxiliary_loss_clip": 0.01838028, "auxiliary_loss_mlp": 0.0037719, "balance_loss_clip": 1.48054862, "balance_loss_mlp": 0.32931536, "epoch": 0.48952352322260634, "flos": 25523222325120.0, "grad_norm": 3.7815679225138275, "language_loss": 0.77932465, "learning_rate": 2.1647501280332066e-06, "loss": 0.8014769, "num_input_tokens_seen": 174991295, "router_z_loss_clip": 3.57617188, "router_z_loss_mlp": 0.47875977, "step": 8142, "time_per_iteration": 2.714078903198242 }, { "auxiliary_loss_clip": 0.01803445, "auxiliary_loss_mlp": 0.00354664, "balance_loss_clip": 1.45869267, "balance_loss_mlp": 0.31010309, "epoch": 0.4895836464752743, "flos": 29055502903680.0, "grad_norm": 2.274984913803755, "language_loss": 0.74524987, "learning_rate": 2.1643619868553105e-06, "loss": 0.76683092, "num_input_tokens_seen": 175012830, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.44580078, "step": 8143, "time_per_iteration": 2.7421395778656006 }, { "auxiliary_loss_clip": 0.01807244, "auxiliary_loss_mlp": 0.00373665, "balance_loss_clip": 1.46080947, "balance_loss_mlp": 0.33217973, "epoch": 0.48964376972794227, "flos": 33546958312320.0, "grad_norm": 2.640696213241605, "language_loss": 0.79824388, "learning_rate": 2.163973839444793e-06, "loss": 0.82005298, "num_input_tokens_seen": 175035695, "router_z_loss_clip": 3.46289062, "router_z_loss_mlp": 0.41503906, "step": 8144, "time_per_iteration": 2.7693493366241455 }, { "auxiliary_loss_clip": 0.01806064, "auxiliary_loss_mlp": 0.00372186, "balance_loss_clip": 1.4537276, "balance_loss_mlp": 0.3245973, "epoch": 0.48970389298061023, "flos": 22054035985920.0, "grad_norm": 11.120419923606454, "language_loss": 0.81201357, "learning_rate": 2.1635856858163695e-06, "loss": 0.83379602, "num_input_tokens_seen": 175056425, "router_z_loss_clip": 3.5234375, "router_z_loss_mlp": 0.47583008, "step": 8145, "time_per_iteration": 2.698435068130493 }, { "auxiliary_loss_clip": 0.01792755, "auxiliary_loss_mlp": 0.00337764, "balance_loss_clip": 1.44782519, "balance_loss_mlp": 0.29379976, "epoch": 0.4897640162332782, "flos": 20084299920000.0, "grad_norm": 18.545418152033122, "language_loss": 0.880979, "learning_rate": 2.163197525984761e-06, "loss": 0.90228415, "num_input_tokens_seen": 175074800, "router_z_loss_clip": 3.45117188, "router_z_loss_mlp": 0.43969727, "step": 8146, "time_per_iteration": 2.6204919815063477 }, { "auxiliary_loss_clip": 0.01790052, "auxiliary_loss_mlp": 0.00312159, "balance_loss_clip": 1.45241106, "balance_loss_mlp": 0.26912487, "epoch": 0.48982413948594616, "flos": 23806225330560.0, "grad_norm": 4.554239901391087, "language_loss": 0.80376518, "learning_rate": 2.162809359964687e-06, "loss": 0.82478726, "num_input_tokens_seen": 175094500, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.43041992, "step": 8147, "time_per_iteration": 2.666036605834961 }, { "auxiliary_loss_clip": 0.01802895, "auxiliary_loss_mlp": 0.00312537, "balance_loss_clip": 1.45683289, "balance_loss_mlp": 0.26893073, "epoch": 0.4898842627386142, "flos": 17639645207040.0, "grad_norm": 103.51392996462536, "language_loss": 0.91283518, "learning_rate": 2.162421187770864e-06, "loss": 0.93398952, "num_input_tokens_seen": 175112920, "router_z_loss_clip": 3.4609375, "router_z_loss_mlp": 0.43603516, "step": 8148, "time_per_iteration": 2.7094063758850098 }, { "auxiliary_loss_clip": 0.01801896, "auxiliary_loss_mlp": 0.00328303, "balance_loss_clip": 1.45730305, "balance_loss_mlp": 0.28715178, "epoch": 0.48994438599128215, "flos": 16617914841600.0, "grad_norm": 21.17914714619926, "language_loss": 0.81310713, "learning_rate": 2.162033009418015e-06, "loss": 0.83440912, "num_input_tokens_seen": 175129910, "router_z_loss_clip": 3.44921875, "router_z_loss_mlp": 0.41137695, "step": 8149, "time_per_iteration": 2.675619602203369 }, { "auxiliary_loss_clip": 0.01792182, "auxiliary_loss_mlp": 0.00326407, "balance_loss_clip": 1.44177747, "balance_loss_mlp": 0.2822758, "epoch": 0.4900045092439501, "flos": 26614834600320.0, "grad_norm": 8.32278009465709, "language_loss": 0.84195799, "learning_rate": 2.1616448249208567e-06, "loss": 0.8631438, "num_input_tokens_seen": 175148705, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.44140625, "step": 8150, "time_per_iteration": 2.685760259628296 }, { "auxiliary_loss_clip": 0.01795533, "auxiliary_loss_mlp": 0.00316677, "balance_loss_clip": 1.44265437, "balance_loss_mlp": 0.27285615, "epoch": 0.4900646324966181, "flos": 19902125116800.0, "grad_norm": 2.3937584808329664, "language_loss": 0.78717589, "learning_rate": 2.1612566342941106e-06, "loss": 0.80829799, "num_input_tokens_seen": 175167425, "router_z_loss_clip": 3.53125, "router_z_loss_mlp": 0.43798828, "step": 8151, "time_per_iteration": 2.6519052982330322 }, { "auxiliary_loss_clip": 0.0150187, "auxiliary_loss_mlp": 0.00131471, "balance_loss_clip": 1.32989585, "balance_loss_mlp": 0.11239771, "epoch": 0.49012475574928605, "flos": 59189620337280.0, "grad_norm": 0.8252572683060831, "language_loss": 0.53923625, "learning_rate": 2.1608684375524977e-06, "loss": 0.55556965, "num_input_tokens_seen": 175227985, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.19042969, "step": 8152, "time_per_iteration": 3.1370296478271484 }, { "auxiliary_loss_clip": 0.01797565, "auxiliary_loss_mlp": 0.00302208, "balance_loss_clip": 1.44581282, "balance_loss_mlp": 0.25733811, "epoch": 0.490184879001954, "flos": 45259797657600.0, "grad_norm": 3.306486271761101, "language_loss": 0.6749025, "learning_rate": 2.1604802347107364e-06, "loss": 0.6959002, "num_input_tokens_seen": 175251895, "router_z_loss_clip": 3.51757812, "router_z_loss_mlp": 0.44897461, "step": 8153, "time_per_iteration": 2.843877077102661 }, { "auxiliary_loss_clip": 0.01779818, "auxiliary_loss_mlp": 0.00298138, "balance_loss_clip": 1.44050574, "balance_loss_mlp": 0.25136054, "epoch": 0.490245002254622, "flos": 28002135634560.0, "grad_norm": 4.542728576773728, "language_loss": 0.81258655, "learning_rate": 2.160092025783549e-06, "loss": 0.83336616, "num_input_tokens_seen": 175272770, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.46801758, "step": 8154, "time_per_iteration": 2.729923963546753 }, { "auxiliary_loss_clip": 0.01491834, "auxiliary_loss_mlp": 0.00093607, "balance_loss_clip": 1.31643558, "balance_loss_mlp": 0.07491536, "epoch": 0.49030512550728994, "flos": 58951318533120.0, "grad_norm": 0.9807515061144112, "language_loss": 0.66727412, "learning_rate": 2.1597038107856564e-06, "loss": 0.6831286, "num_input_tokens_seen": 175336320, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.18652344, "step": 8155, "time_per_iteration": 3.196727991104126 }, { "auxiliary_loss_clip": 0.01797993, "auxiliary_loss_mlp": 0.00297101, "balance_loss_clip": 1.44252038, "balance_loss_mlp": 0.25313699, "epoch": 0.4903652487599579, "flos": 19791843384960.0, "grad_norm": 1.862063119993553, "language_loss": 0.82331288, "learning_rate": 2.1593155897317784e-06, "loss": 0.84426379, "num_input_tokens_seen": 175353540, "router_z_loss_clip": 3.55859375, "router_z_loss_mlp": 0.43969727, "step": 8156, "time_per_iteration": 2.6672098636627197 }, { "auxiliary_loss_clip": 0.01796849, "auxiliary_loss_mlp": 0.00303976, "balance_loss_clip": 1.44358599, "balance_loss_mlp": 0.26146597, "epoch": 0.49042537201262587, "flos": 21762082241280.0, "grad_norm": 96.91405446506568, "language_loss": 0.90024054, "learning_rate": 2.1589273626366377e-06, "loss": 0.92124873, "num_input_tokens_seen": 175370445, "router_z_loss_clip": 3.53515625, "router_z_loss_mlp": 0.42504883, "step": 8157, "time_per_iteration": 2.665046215057373 }, { "auxiliary_loss_clip": 0.01780244, "auxiliary_loss_mlp": 0.00278277, "balance_loss_clip": 1.43080735, "balance_loss_mlp": 0.2325964, "epoch": 0.49048549526529384, "flos": 18953042008320.0, "grad_norm": 1.6335323801635881, "language_loss": 0.84587795, "learning_rate": 2.158539129514956e-06, "loss": 0.86646318, "num_input_tokens_seen": 175389020, "router_z_loss_clip": 3.49804688, "router_z_loss_mlp": 0.45703125, "step": 8158, "time_per_iteration": 2.6755099296569824 }, { "auxiliary_loss_clip": 0.0181113, "auxiliary_loss_mlp": 0.00306689, "balance_loss_clip": 1.4483856, "balance_loss_mlp": 0.26248628, "epoch": 0.4905456185179618, "flos": 26906393295360.0, "grad_norm": 14.299401229640958, "language_loss": 0.75823075, "learning_rate": 2.158150890381454e-06, "loss": 0.77940893, "num_input_tokens_seen": 175409545, "router_z_loss_clip": 3.63085938, "router_z_loss_mlp": 0.44165039, "step": 8159, "time_per_iteration": 2.733875274658203 }, { "auxiliary_loss_clip": 0.01804027, "auxiliary_loss_mlp": 0.00298534, "balance_loss_clip": 1.45202875, "balance_loss_mlp": 0.250779, "epoch": 0.49060574177062977, "flos": 20412343854720.0, "grad_norm": 28.372779300680552, "language_loss": 0.78867817, "learning_rate": 2.157762645250854e-06, "loss": 0.80970377, "num_input_tokens_seen": 175429335, "router_z_loss_clip": 3.51953125, "router_z_loss_mlp": 0.47680664, "step": 8160, "time_per_iteration": 2.6749675273895264 }, { "auxiliary_loss_clip": 0.01775731, "auxiliary_loss_mlp": 0.00310706, "balance_loss_clip": 1.42918396, "balance_loss_mlp": 0.26657474, "epoch": 0.4906658650232978, "flos": 17493704248320.0, "grad_norm": 5.931650878823519, "language_loss": 0.77497935, "learning_rate": 2.1573743941378796e-06, "loss": 0.79584372, "num_input_tokens_seen": 175446955, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.44116211, "step": 8161, "time_per_iteration": 2.6128015518188477 }, { "auxiliary_loss_clip": 0.0177266, "auxiliary_loss_mlp": 0.00296635, "balance_loss_clip": 1.4298389, "balance_loss_mlp": 0.25255114, "epoch": 0.49072598827596575, "flos": 26614439550720.0, "grad_norm": 3.5045578682384644, "language_loss": 0.74322021, "learning_rate": 2.1569861370572517e-06, "loss": 0.76391321, "num_input_tokens_seen": 175468195, "router_z_loss_clip": 3.42578125, "router_z_loss_mlp": 0.44091797, "step": 8162, "time_per_iteration": 2.6854124069213867 }, { "auxiliary_loss_clip": 0.01759937, "auxiliary_loss_mlp": 0.00291162, "balance_loss_clip": 1.41588926, "balance_loss_mlp": 0.24614835, "epoch": 0.4907861115286337, "flos": 20412595249920.0, "grad_norm": 10.900156755087226, "language_loss": 0.70838761, "learning_rate": 2.1565978740236944e-06, "loss": 0.72889864, "num_input_tokens_seen": 175487455, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.44970703, "step": 8163, "time_per_iteration": 2.6331493854522705 }, { "auxiliary_loss_clip": 0.01735809, "auxiliary_loss_mlp": 0.00296996, "balance_loss_clip": 1.40829515, "balance_loss_mlp": 0.2552017, "epoch": 0.4908462347813017, "flos": 14064271286400.0, "grad_norm": 4.038048219205349, "language_loss": 0.84071869, "learning_rate": 2.1562096050519293e-06, "loss": 0.86104667, "num_input_tokens_seen": 175504450, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.41796875, "step": 8164, "time_per_iteration": 4.019381284713745 }, { "auxiliary_loss_clip": 0.01764472, "auxiliary_loss_mlp": 0.00313805, "balance_loss_clip": 1.4179213, "balance_loss_mlp": 0.26783812, "epoch": 0.49090635803396965, "flos": 18735100237440.0, "grad_norm": 481.3562346610187, "language_loss": 0.82938194, "learning_rate": 2.1558213301566806e-06, "loss": 0.85016471, "num_input_tokens_seen": 175523600, "router_z_loss_clip": 3.46679688, "router_z_loss_mlp": 0.45996094, "step": 8165, "time_per_iteration": 2.6758644580841064 }, { "auxiliary_loss_clip": 0.01763932, "auxiliary_loss_mlp": 0.00303384, "balance_loss_clip": 1.42490911, "balance_loss_mlp": 0.26073083, "epoch": 0.4909664812866376, "flos": 20558500295040.0, "grad_norm": 7.162009279599586, "language_loss": 0.8536309, "learning_rate": 2.1554330493526716e-06, "loss": 0.87430406, "num_input_tokens_seen": 175542720, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.42651367, "step": 8166, "time_per_iteration": 2.6887905597686768 }, { "auxiliary_loss_clip": 0.01400278, "auxiliary_loss_mlp": 0.00142925, "balance_loss_clip": 1.2457087, "balance_loss_mlp": 0.12909694, "epoch": 0.4910266045393056, "flos": 54684017948160.0, "grad_norm": 0.7783668828624731, "language_loss": 0.54161751, "learning_rate": 2.1550447626546253e-06, "loss": 0.55704951, "num_input_tokens_seen": 175598640, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.13867188, "step": 8167, "time_per_iteration": 3.1710660457611084 }, { "auxiliary_loss_clip": 0.0176046, "auxiliary_loss_mlp": 0.00300534, "balance_loss_clip": 1.42819786, "balance_loss_mlp": 0.25556871, "epoch": 0.49108672779197354, "flos": 16246454342400.0, "grad_norm": 2.5347312513498648, "language_loss": 0.92788619, "learning_rate": 2.1546564700772665e-06, "loss": 0.9484961, "num_input_tokens_seen": 175615675, "router_z_loss_clip": 3.32617188, "router_z_loss_mlp": 0.44995117, "step": 8168, "time_per_iteration": 4.135586738586426 }, { "auxiliary_loss_clip": 0.01740493, "auxiliary_loss_mlp": 0.00286844, "balance_loss_clip": 1.4103092, "balance_loss_mlp": 0.24147293, "epoch": 0.4911468510446415, "flos": 19825419623040.0, "grad_norm": 2.214623309945843, "language_loss": 0.78168356, "learning_rate": 2.1542681716353193e-06, "loss": 0.80195689, "num_input_tokens_seen": 175632255, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.45361328, "step": 8169, "time_per_iteration": 4.1668150424957275 }, { "auxiliary_loss_clip": 0.01721501, "auxiliary_loss_mlp": 0.00288166, "balance_loss_clip": 1.39812899, "balance_loss_mlp": 0.24527447, "epoch": 0.4912069742973095, "flos": 21212684743680.0, "grad_norm": 7.619494214548272, "language_loss": 0.82617104, "learning_rate": 2.1538798673435068e-06, "loss": 0.84626776, "num_input_tokens_seen": 175651625, "router_z_loss_clip": 3.234375, "router_z_loss_mlp": 0.42871094, "step": 8170, "time_per_iteration": 2.676424026489258 }, { "auxiliary_loss_clip": 0.01762796, "auxiliary_loss_mlp": 0.00323198, "balance_loss_clip": 1.42203331, "balance_loss_mlp": 0.27961531, "epoch": 0.49126709754997744, "flos": 19537129065600.0, "grad_norm": 12.916693576931934, "language_loss": 0.85481226, "learning_rate": 2.1534915572165545e-06, "loss": 0.87567228, "num_input_tokens_seen": 175669265, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.43603516, "step": 8171, "time_per_iteration": 2.7030532360076904 }, { "auxiliary_loss_clip": 0.01752072, "auxiliary_loss_mlp": 0.00289538, "balance_loss_clip": 1.41719723, "balance_loss_mlp": 0.24194978, "epoch": 0.4913272208026454, "flos": 12239686080000.0, "grad_norm": 15.501023343054678, "language_loss": 0.90044481, "learning_rate": 2.1531032412691875e-06, "loss": 0.92086095, "num_input_tokens_seen": 175686065, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.47583008, "step": 8172, "time_per_iteration": 2.615058183670044 }, { "auxiliary_loss_clip": 0.01379284, "auxiliary_loss_mlp": 0.00090546, "balance_loss_clip": 1.22725129, "balance_loss_mlp": 0.07767108, "epoch": 0.49138734405531337, "flos": 65465871661440.0, "grad_norm": 0.683938290014935, "language_loss": 0.52811992, "learning_rate": 2.1527149195161295e-06, "loss": 0.54281819, "num_input_tokens_seen": 175748595, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.12890625, "step": 8173, "time_per_iteration": 3.1528351306915283 }, { "auxiliary_loss_clip": 0.01748055, "auxiliary_loss_mlp": 0.00298344, "balance_loss_clip": 1.41328645, "balance_loss_mlp": 0.253712, "epoch": 0.4914474673079814, "flos": 18439052342400.0, "grad_norm": 67.05011429277769, "language_loss": 0.68686283, "learning_rate": 2.152326591972107e-06, "loss": 0.70732677, "num_input_tokens_seen": 175766770, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.44628906, "step": 8174, "time_per_iteration": 4.0173656940460205 }, { "auxiliary_loss_clip": 0.01720251, "auxiliary_loss_mlp": 0.00296833, "balance_loss_clip": 1.39622748, "balance_loss_mlp": 0.25243953, "epoch": 0.49150759056064935, "flos": 21685053525120.0, "grad_norm": 12.214327716364425, "language_loss": 0.75345576, "learning_rate": 2.1519382586518445e-06, "loss": 0.77362657, "num_input_tokens_seen": 175783605, "router_z_loss_clip": 3.2421875, "router_z_loss_mlp": 0.4440918, "step": 8175, "time_per_iteration": 2.6508235931396484 }, { "auxiliary_loss_clip": 0.01743162, "auxiliary_loss_mlp": 0.00299663, "balance_loss_clip": 1.41465545, "balance_loss_mlp": 0.25715277, "epoch": 0.4915677138133173, "flos": 22382439056640.0, "grad_norm": 37.21413366187826, "language_loss": 0.80160451, "learning_rate": 2.151549919570068e-06, "loss": 0.82203281, "num_input_tokens_seen": 175801390, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.42480469, "step": 8176, "time_per_iteration": 2.6696791648864746 }, { "auxiliary_loss_clip": 0.01744392, "auxiliary_loss_mlp": 0.0030426, "balance_loss_clip": 1.41299617, "balance_loss_mlp": 0.26153505, "epoch": 0.4916278370659853, "flos": 18402890325120.0, "grad_norm": 10.173400178640419, "language_loss": 0.7545839, "learning_rate": 2.1511615747415036e-06, "loss": 0.77507043, "num_input_tokens_seen": 175819830, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.42749023, "step": 8177, "time_per_iteration": 2.652636766433716 }, { "auxiliary_loss_clip": 0.01366078, "auxiliary_loss_mlp": 0.00151244, "balance_loss_clip": 1.21711445, "balance_loss_mlp": 0.13932317, "epoch": 0.49168796031865325, "flos": 66609124715520.0, "grad_norm": 0.6974561784120124, "language_loss": 0.45750731, "learning_rate": 2.150773224180877e-06, "loss": 0.47268054, "num_input_tokens_seen": 175881765, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.11914062, "step": 8178, "time_per_iteration": 3.2028563022613525 }, { "auxiliary_loss_clip": 0.01715839, "auxiliary_loss_mlp": 0.00322637, "balance_loss_clip": 1.39172745, "balance_loss_mlp": 0.28112847, "epoch": 0.4917480835713212, "flos": 20959335141120.0, "grad_norm": 103.33180577840932, "language_loss": 0.71693373, "learning_rate": 2.1503848679029147e-06, "loss": 0.73731846, "num_input_tokens_seen": 175901795, "router_z_loss_clip": 3.2421875, "router_z_loss_mlp": 0.41503906, "step": 8179, "time_per_iteration": 2.816530466079712 }, { "auxiliary_loss_clip": 0.01719989, "auxiliary_loss_mlp": 0.00322897, "balance_loss_clip": 1.3880868, "balance_loss_mlp": 0.2801249, "epoch": 0.4918082068239892, "flos": 15772900412160.0, "grad_norm": 5.672802862473398, "language_loss": 0.76275253, "learning_rate": 2.149996505922343e-06, "loss": 0.78318143, "num_input_tokens_seen": 175917770, "router_z_loss_clip": 3.31640625, "router_z_loss_mlp": 0.42749023, "step": 8180, "time_per_iteration": 2.634249448776245 }, { "auxiliary_loss_clip": 0.01706334, "auxiliary_loss_mlp": 0.0030882, "balance_loss_clip": 1.38633871, "balance_loss_mlp": 0.26552346, "epoch": 0.49186833007665715, "flos": 24604806453120.0, "grad_norm": 20.559002547392563, "language_loss": 0.8928895, "learning_rate": 2.1496081382538895e-06, "loss": 0.913041, "num_input_tokens_seen": 175937000, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.43310547, "step": 8181, "time_per_iteration": 2.735861301422119 }, { "auxiliary_loss_clip": 0.01702233, "auxiliary_loss_mlp": 0.00294537, "balance_loss_clip": 1.3908124, "balance_loss_mlp": 0.25379106, "epoch": 0.4919284533293251, "flos": 22090557139200.0, "grad_norm": 16.799404553229635, "language_loss": 0.79494345, "learning_rate": 2.1492197649122793e-06, "loss": 0.81491125, "num_input_tokens_seen": 175955170, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.40722656, "step": 8182, "time_per_iteration": 2.6644675731658936 }, { "auxiliary_loss_clip": 0.01703456, "auxiliary_loss_mlp": 0.00297725, "balance_loss_clip": 1.38493991, "balance_loss_mlp": 0.25476187, "epoch": 0.4919885765819931, "flos": 23368043318400.0, "grad_norm": 869.6302064633481, "language_loss": 0.81541336, "learning_rate": 2.1488313859122412e-06, "loss": 0.83542514, "num_input_tokens_seen": 175973725, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.42993164, "step": 8183, "time_per_iteration": 2.750823497772217 }, { "auxiliary_loss_clip": 0.01736452, "auxiliary_loss_mlp": 0.00318675, "balance_loss_clip": 1.40260816, "balance_loss_mlp": 0.27587891, "epoch": 0.49204869983466104, "flos": 21360493209600.0, "grad_norm": 5.082062150863461, "language_loss": 0.83428538, "learning_rate": 2.1484430012685015e-06, "loss": 0.85483664, "num_input_tokens_seen": 175993885, "router_z_loss_clip": 3.33789062, "router_z_loss_mlp": 0.42797852, "step": 8184, "time_per_iteration": 2.7768585681915283 }, { "auxiliary_loss_clip": 0.0170272, "auxiliary_loss_mlp": 0.00291694, "balance_loss_clip": 1.38689232, "balance_loss_mlp": 0.25171113, "epoch": 0.492108823087329, "flos": 21142695093120.0, "grad_norm": 3.893195202774834, "language_loss": 0.78033519, "learning_rate": 2.148054610995789e-06, "loss": 0.80027938, "num_input_tokens_seen": 176014210, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.39990234, "step": 8185, "time_per_iteration": 2.66658616065979 }, { "auxiliary_loss_clip": 0.01733393, "auxiliary_loss_mlp": 0.00317197, "balance_loss_clip": 1.39878154, "balance_loss_mlp": 0.27089632, "epoch": 0.49216894633999697, "flos": 25116605389440.0, "grad_norm": 32.273739781766565, "language_loss": 0.81096387, "learning_rate": 2.147666215108831e-06, "loss": 0.83146977, "num_input_tokens_seen": 176033890, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.46289062, "step": 8186, "time_per_iteration": 2.707064151763916 }, { "auxiliary_loss_clip": 0.01718392, "auxiliary_loss_mlp": 0.00324998, "balance_loss_clip": 1.39345467, "balance_loss_mlp": 0.28131998, "epoch": 0.49222906959266494, "flos": 22637943475200.0, "grad_norm": 4.029286656943426, "language_loss": 0.75090158, "learning_rate": 2.1472778136223545e-06, "loss": 0.77133554, "num_input_tokens_seen": 176052720, "router_z_loss_clip": 3.24804688, "router_z_loss_mlp": 0.43676758, "step": 8187, "time_per_iteration": 2.6857657432556152 }, { "auxiliary_loss_clip": 0.01708421, "auxiliary_loss_mlp": 0.00323628, "balance_loss_clip": 1.38692117, "balance_loss_mlp": 0.27868617, "epoch": 0.49228919284533296, "flos": 20410548174720.0, "grad_norm": 222.58073225764946, "language_loss": 0.71682167, "learning_rate": 2.1468894065510894e-06, "loss": 0.73714209, "num_input_tokens_seen": 176072545, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.44897461, "step": 8188, "time_per_iteration": 2.6485345363616943 }, { "auxiliary_loss_clip": 0.01721291, "auxiliary_loss_mlp": 0.00320874, "balance_loss_clip": 1.39890695, "balance_loss_mlp": 0.28234541, "epoch": 0.4923493160980009, "flos": 27122359818240.0, "grad_norm": 2.0743721855641573, "language_loss": 0.80908346, "learning_rate": 2.1465009939097623e-06, "loss": 0.82950509, "num_input_tokens_seen": 176091490, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.38549805, "step": 8189, "time_per_iteration": 2.6965601444244385 }, { "auxiliary_loss_clip": 0.01709927, "auxiliary_loss_mlp": 0.00298326, "balance_loss_clip": 1.38990164, "balance_loss_mlp": 0.25824815, "epoch": 0.4924094393506689, "flos": 35736683224320.0, "grad_norm": 45.89313695780101, "language_loss": 0.69794786, "learning_rate": 2.146112575713104e-06, "loss": 0.71803039, "num_input_tokens_seen": 176113200, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.40087891, "step": 8190, "time_per_iteration": 2.755208730697632 }, { "auxiliary_loss_clip": 0.01718429, "auxiliary_loss_mlp": 0.00294966, "balance_loss_clip": 1.39326262, "balance_loss_mlp": 0.25565043, "epoch": 0.49246956260333685, "flos": 20412487509120.0, "grad_norm": 4.237079667955729, "language_loss": 0.78953838, "learning_rate": 2.1457241519758413e-06, "loss": 0.80967236, "num_input_tokens_seen": 176132485, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.39331055, "step": 8191, "time_per_iteration": 2.6675689220428467 }, { "auxiliary_loss_clip": 0.01728027, "auxiliary_loss_mlp": 0.00321048, "balance_loss_clip": 1.40061617, "balance_loss_mlp": 0.27584359, "epoch": 0.4925296858560048, "flos": 38976938231040.0, "grad_norm": 4.4301849371560635, "language_loss": 0.77700567, "learning_rate": 2.1453357227127043e-06, "loss": 0.79749644, "num_input_tokens_seen": 176155755, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.45214844, "step": 8192, "time_per_iteration": 2.854296922683716 }, { "auxiliary_loss_clip": 0.01343245, "auxiliary_loss_mlp": 0.00082113, "balance_loss_clip": 1.19968104, "balance_loss_mlp": 0.06971523, "epoch": 0.4925898091086728, "flos": 64278917712000.0, "grad_norm": 0.6986153400912771, "language_loss": 0.51809096, "learning_rate": 2.1449472879384224e-06, "loss": 0.53234446, "num_input_tokens_seen": 176216295, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.12353516, "step": 8193, "time_per_iteration": 3.201075792312622 }, { "auxiliary_loss_clip": 0.01731824, "auxiliary_loss_mlp": 0.002703, "balance_loss_clip": 1.40882087, "balance_loss_mlp": 0.22998382, "epoch": 0.49264993236134075, "flos": 23036372110080.0, "grad_norm": 1.8194520858667815, "language_loss": 0.82394844, "learning_rate": 2.1445588476677246e-06, "loss": 0.8439697, "num_input_tokens_seen": 176235925, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.40332031, "step": 8194, "time_per_iteration": 2.7732198238372803 }, { "auxiliary_loss_clip": 0.01736216, "auxiliary_loss_mlp": 0.00331789, "balance_loss_clip": 1.40528488, "balance_loss_mlp": 0.29101938, "epoch": 0.4927100556140087, "flos": 24718212668160.0, "grad_norm": 2.67366488377231, "language_loss": 0.7800861, "learning_rate": 2.144170401915341e-06, "loss": 0.80076617, "num_input_tokens_seen": 176253865, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.40795898, "step": 8195, "time_per_iteration": 2.6731414794921875 }, { "auxiliary_loss_clip": 0.01714471, "auxiliary_loss_mlp": 0.00299084, "balance_loss_clip": 1.39204812, "balance_loss_mlp": 0.25719365, "epoch": 0.4927701788666767, "flos": 23505544581120.0, "grad_norm": 216.90566588284858, "language_loss": 0.86029243, "learning_rate": 2.143781950696001e-06, "loss": 0.88042796, "num_input_tokens_seen": 176271525, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.41845703, "step": 8196, "time_per_iteration": 2.724663496017456 }, { "auxiliary_loss_clip": 0.01731008, "auxiliary_loss_mlp": 0.00327399, "balance_loss_clip": 1.39890623, "balance_loss_mlp": 0.28422105, "epoch": 0.49283030211934464, "flos": 22928891639040.0, "grad_norm": 4.769995356371952, "language_loss": 0.7704916, "learning_rate": 2.1433934940244356e-06, "loss": 0.79107571, "num_input_tokens_seen": 176290810, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.43188477, "step": 8197, "time_per_iteration": 2.6962990760803223 }, { "auxiliary_loss_clip": 0.01741869, "auxiliary_loss_mlp": 0.00296657, "balance_loss_clip": 1.41439784, "balance_loss_mlp": 0.25576773, "epoch": 0.4928904253720126, "flos": 16873024210560.0, "grad_norm": 3.001666305641691, "language_loss": 0.91787815, "learning_rate": 2.143005031915374e-06, "loss": 0.93826342, "num_input_tokens_seen": 176309165, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.40869141, "step": 8198, "time_per_iteration": 2.6900618076324463 }, { "auxiliary_loss_clip": 0.01750788, "auxiliary_loss_mlp": 0.00327178, "balance_loss_clip": 1.4128567, "balance_loss_mlp": 0.28576499, "epoch": 0.4929505486246806, "flos": 14866551509760.0, "grad_norm": 3.442085956896212, "language_loss": 0.81621176, "learning_rate": 2.1426165643835467e-06, "loss": 0.83699143, "num_input_tokens_seen": 176324960, "router_z_loss_clip": 3.3828125, "router_z_loss_mlp": 0.41430664, "step": 8199, "time_per_iteration": 2.6400697231292725 }, { "auxiliary_loss_clip": 0.0174806, "auxiliary_loss_mlp": 0.00343703, "balance_loss_clip": 1.40986443, "balance_loss_mlp": 0.29811722, "epoch": 0.49301067187734854, "flos": 23842351434240.0, "grad_norm": 5.485531773138667, "language_loss": 0.66909266, "learning_rate": 2.1422280914436864e-06, "loss": 0.69001031, "num_input_tokens_seen": 176346195, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.45581055, "step": 8200, "time_per_iteration": 2.6963675022125244 }, { "auxiliary_loss_clip": 0.01735313, "auxiliary_loss_mlp": 0.00304981, "balance_loss_clip": 1.41396654, "balance_loss_mlp": 0.26409268, "epoch": 0.49307079513001656, "flos": 22491284244480.0, "grad_norm": 1412.3271083727207, "language_loss": 0.84372336, "learning_rate": 2.1418396131105213e-06, "loss": 0.86412632, "num_input_tokens_seen": 176366735, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.40869141, "step": 8201, "time_per_iteration": 2.6772193908691406 }, { "auxiliary_loss_clip": 0.01742981, "auxiliary_loss_mlp": 0.00323126, "balance_loss_clip": 1.4039067, "balance_loss_mlp": 0.27858937, "epoch": 0.4931309183826845, "flos": 15924587546880.0, "grad_norm": 62.72833031794366, "language_loss": 0.77613997, "learning_rate": 2.141451129398785e-06, "loss": 0.79680109, "num_input_tokens_seen": 176384475, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.4453125, "step": 8202, "time_per_iteration": 2.6978633403778076 }, { "auxiliary_loss_clip": 0.01726384, "auxiliary_loss_mlp": 0.00323914, "balance_loss_clip": 1.40167093, "balance_loss_mlp": 0.28383541, "epoch": 0.4931910416353525, "flos": 27309059735040.0, "grad_norm": 2.2090514157759054, "language_loss": 0.83435726, "learning_rate": 2.1410626403232076e-06, "loss": 0.85486019, "num_input_tokens_seen": 176402645, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.40087891, "step": 8203, "time_per_iteration": 2.7838833332061768 }, { "auxiliary_loss_clip": 0.01748404, "auxiliary_loss_mlp": 0.00293719, "balance_loss_clip": 1.4131062, "balance_loss_mlp": 0.25008804, "epoch": 0.49325116488802045, "flos": 20806139635200.0, "grad_norm": 28.983820006045754, "language_loss": 0.8754065, "learning_rate": 2.1406741458985197e-06, "loss": 0.89582771, "num_input_tokens_seen": 176416715, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.43676758, "step": 8204, "time_per_iteration": 2.6240766048431396 }, { "auxiliary_loss_clip": 0.0172325, "auxiliary_loss_mlp": 0.00300754, "balance_loss_clip": 1.40278983, "balance_loss_mlp": 0.25791031, "epoch": 0.4933112881406884, "flos": 19865963099520.0, "grad_norm": 34.54654881717216, "language_loss": 0.74782383, "learning_rate": 2.140285646139455e-06, "loss": 0.7680639, "num_input_tokens_seen": 176435755, "router_z_loss_clip": 3.20507812, "router_z_loss_mlp": 0.4284668, "step": 8205, "time_per_iteration": 2.668473958969116 }, { "auxiliary_loss_clip": 0.01720458, "auxiliary_loss_mlp": 0.00354706, "balance_loss_clip": 1.39216888, "balance_loss_mlp": 0.30876297, "epoch": 0.4933714113933564, "flos": 21827977741440.0, "grad_norm": 12.503163075918478, "language_loss": 0.73369324, "learning_rate": 2.139897141060744e-06, "loss": 0.75444484, "num_input_tokens_seen": 176453915, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.45947266, "step": 8206, "time_per_iteration": 2.692049026489258 }, { "auxiliary_loss_clip": 0.01694619, "auxiliary_loss_mlp": 0.00307524, "balance_loss_clip": 1.38005948, "balance_loss_mlp": 0.26723105, "epoch": 0.49343153464602435, "flos": 27890130049920.0, "grad_norm": 10.725608039623944, "language_loss": 0.84321767, "learning_rate": 2.1395086306771196e-06, "loss": 0.86323905, "num_input_tokens_seen": 176475175, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.40307617, "step": 8207, "time_per_iteration": 4.155500650405884 }, { "auxiliary_loss_clip": 0.01710508, "auxiliary_loss_mlp": 0.00312152, "balance_loss_clip": 1.39428222, "balance_loss_mlp": 0.26926064, "epoch": 0.4934916578986923, "flos": 24681080983680.0, "grad_norm": 10.218398974185554, "language_loss": 0.69287086, "learning_rate": 2.1391201150033147e-06, "loss": 0.71309745, "num_input_tokens_seen": 176494250, "router_z_loss_clip": 3.1640625, "router_z_loss_mlp": 0.42919922, "step": 8208, "time_per_iteration": 2.703428268432617 }, { "auxiliary_loss_clip": 0.01702251, "auxiliary_loss_mlp": 0.00295089, "balance_loss_clip": 1.3843205, "balance_loss_mlp": 0.2549392, "epoch": 0.4935517811513603, "flos": 23405139089280.0, "grad_norm": 9.130205847769034, "language_loss": 0.87116003, "learning_rate": 2.1387315940540598e-06, "loss": 0.89113343, "num_input_tokens_seen": 176513325, "router_z_loss_clip": 3.17773438, "router_z_loss_mlp": 0.40136719, "step": 8209, "time_per_iteration": 2.6835927963256836 }, { "auxiliary_loss_clip": 0.01701118, "auxiliary_loss_mlp": 0.00310647, "balance_loss_clip": 1.39418161, "balance_loss_mlp": 0.26701623, "epoch": 0.49361190440402825, "flos": 21944508439680.0, "grad_norm": 3.771504733832411, "language_loss": 0.84856468, "learning_rate": 2.138343067844089e-06, "loss": 0.86868227, "num_input_tokens_seen": 176532915, "router_z_loss_clip": 3.0703125, "router_z_loss_mlp": 0.43652344, "step": 8210, "time_per_iteration": 4.106863737106323 }, { "auxiliary_loss_clip": 0.01694364, "auxiliary_loss_mlp": 0.00323107, "balance_loss_clip": 1.37488151, "balance_loss_mlp": 0.28188413, "epoch": 0.4936720276566962, "flos": 25115671635840.0, "grad_norm": 2.7947614379302324, "language_loss": 0.86350667, "learning_rate": 2.1379545363881363e-06, "loss": 0.88368142, "num_input_tokens_seen": 176552775, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.41235352, "step": 8211, "time_per_iteration": 4.233346223831177 }, { "auxiliary_loss_clip": 0.01692386, "auxiliary_loss_mlp": 0.0030642, "balance_loss_clip": 1.37640631, "balance_loss_mlp": 0.26324272, "epoch": 0.4937321509093642, "flos": 26358935132160.0, "grad_norm": 10.704721391912084, "language_loss": 1.00709331, "learning_rate": 2.137565999700933e-06, "loss": 1.02708149, "num_input_tokens_seen": 176572185, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.43188477, "step": 8212, "time_per_iteration": 2.769343376159668 }, { "auxiliary_loss_clip": 0.01681811, "auxiliary_loss_mlp": 0.00293095, "balance_loss_clip": 1.37641108, "balance_loss_mlp": 0.24960756, "epoch": 0.49379227416203214, "flos": 22961390469120.0, "grad_norm": 17.8886542469904, "language_loss": 0.71861029, "learning_rate": 2.1371774577972138e-06, "loss": 0.73835933, "num_input_tokens_seen": 176591490, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.43481445, "step": 8213, "time_per_iteration": 2.7249794006347656 }, { "auxiliary_loss_clip": 0.01684213, "auxiliary_loss_mlp": 0.00304953, "balance_loss_clip": 1.37417912, "balance_loss_mlp": 0.26077357, "epoch": 0.49385239741470016, "flos": 32489101843200.0, "grad_norm": 27.537595744211142, "language_loss": 0.83471429, "learning_rate": 2.136788910691711e-06, "loss": 0.85460591, "num_input_tokens_seen": 176612715, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.44140625, "step": 8214, "time_per_iteration": 2.7611501216888428 }, { "auxiliary_loss_clip": 0.01674957, "auxiliary_loss_mlp": 0.00300537, "balance_loss_clip": 1.3692838, "balance_loss_mlp": 0.25960097, "epoch": 0.4939125206673681, "flos": 22492864442880.0, "grad_norm": 63.30402690573189, "language_loss": 0.91447043, "learning_rate": 2.1364003583991594e-06, "loss": 0.93422538, "num_input_tokens_seen": 176631950, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.40966797, "step": 8215, "time_per_iteration": 2.6575124263763428 }, { "auxiliary_loss_clip": 0.01653341, "auxiliary_loss_mlp": 0.00259794, "balance_loss_clip": 1.36345863, "balance_loss_mlp": 0.2200259, "epoch": 0.4939726439200361, "flos": 31176351486720.0, "grad_norm": 86.92918198262056, "language_loss": 0.88760793, "learning_rate": 2.136011800934292e-06, "loss": 0.90673923, "num_input_tokens_seen": 176653060, "router_z_loss_clip": 2.90039062, "router_z_loss_mlp": 0.39770508, "step": 8216, "time_per_iteration": 2.760728597640991 }, { "auxiliary_loss_clip": 0.01655854, "auxiliary_loss_mlp": 0.00321845, "balance_loss_clip": 1.36002457, "balance_loss_mlp": 0.27914461, "epoch": 0.49403276717270406, "flos": 22674213233280.0, "grad_norm": 9.170620042209707, "language_loss": 0.80342436, "learning_rate": 2.1356232383118442e-06, "loss": 0.8232013, "num_input_tokens_seen": 176673895, "router_z_loss_clip": 2.95507812, "router_z_loss_mlp": 0.42700195, "step": 8217, "time_per_iteration": 4.190083026885986 }, { "auxiliary_loss_clip": 0.01675724, "auxiliary_loss_mlp": 0.0028438, "balance_loss_clip": 1.37427711, "balance_loss_mlp": 0.24625713, "epoch": 0.494092890425372, "flos": 20741070147840.0, "grad_norm": 33.60781995111995, "language_loss": 0.84249783, "learning_rate": 2.1352346705465494e-06, "loss": 0.86209881, "num_input_tokens_seen": 176692550, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.3815918, "step": 8218, "time_per_iteration": 2.727193593978882 }, { "auxiliary_loss_clip": 0.01642737, "auxiliary_loss_mlp": 0.00270602, "balance_loss_clip": 1.35023916, "balance_loss_mlp": 0.22737651, "epoch": 0.49415301367804, "flos": 18369026778240.0, "grad_norm": 47.6293177950911, "language_loss": 0.84344417, "learning_rate": 2.134846097653142e-06, "loss": 0.86257762, "num_input_tokens_seen": 176709335, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.43237305, "step": 8219, "time_per_iteration": 2.673443078994751 }, { "auxiliary_loss_clip": 0.01635892, "auxiliary_loss_mlp": 0.00283628, "balance_loss_clip": 1.34649956, "balance_loss_mlp": 0.24591023, "epoch": 0.49421313693070795, "flos": 17530620451200.0, "grad_norm": 112.71575367911119, "language_loss": 0.68434584, "learning_rate": 2.134457519646357e-06, "loss": 0.70354104, "num_input_tokens_seen": 176727715, "router_z_loss_clip": 2.89453125, "router_z_loss_mlp": 0.37719727, "step": 8220, "time_per_iteration": 2.680807113647461 }, { "auxiliary_loss_clip": 0.01634946, "auxiliary_loss_mlp": 0.00302336, "balance_loss_clip": 1.3455137, "balance_loss_mlp": 0.26094687, "epoch": 0.4942732601833759, "flos": 20812173120000.0, "grad_norm": 14.420172130642793, "language_loss": 0.7950995, "learning_rate": 2.1340689365409296e-06, "loss": 0.81447232, "num_input_tokens_seen": 176747530, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.4140625, "step": 8221, "time_per_iteration": 2.673842430114746 }, { "auxiliary_loss_clip": 0.01647687, "auxiliary_loss_mlp": 0.0027187, "balance_loss_clip": 1.36194539, "balance_loss_mlp": 0.23169665, "epoch": 0.4943333834360439, "flos": 15048941794560.0, "grad_norm": 114.22466884595354, "language_loss": 0.85039586, "learning_rate": 2.133680348351595e-06, "loss": 0.86959147, "num_input_tokens_seen": 176765260, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.40185547, "step": 8222, "time_per_iteration": 2.754786968231201 }, { "auxiliary_loss_clip": 0.01650414, "auxiliary_loss_mlp": 0.00323958, "balance_loss_clip": 1.3557229, "balance_loss_mlp": 0.28211555, "epoch": 0.49439350668871185, "flos": 16070420764800.0, "grad_norm": 2.8880024963737236, "language_loss": 0.79967988, "learning_rate": 2.133291755093088e-06, "loss": 0.81942362, "num_input_tokens_seen": 176781770, "router_z_loss_clip": 2.94726562, "router_z_loss_mlp": 0.41845703, "step": 8223, "time_per_iteration": 2.7002005577087402 }, { "auxiliary_loss_clip": 0.01644655, "auxiliary_loss_mlp": 0.00345866, "balance_loss_clip": 1.34670568, "balance_loss_mlp": 0.30206847, "epoch": 0.4944536299413798, "flos": 20880079781760.0, "grad_norm": 38.03861986953734, "language_loss": 0.80473047, "learning_rate": 2.132903156780144e-06, "loss": 0.82463568, "num_input_tokens_seen": 176800655, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.43798828, "step": 8224, "time_per_iteration": 2.664337158203125 }, { "auxiliary_loss_clip": 0.01639089, "auxiliary_loss_mlp": 0.00307359, "balance_loss_clip": 1.34521019, "balance_loss_mlp": 0.26670903, "epoch": 0.4945137531940478, "flos": 26608908856320.0, "grad_norm": 13.152464286330419, "language_loss": 0.73920339, "learning_rate": 2.1325145534274997e-06, "loss": 0.75866789, "num_input_tokens_seen": 176820610, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.40649414, "step": 8225, "time_per_iteration": 2.7064077854156494 }, { "auxiliary_loss_clip": 0.01631537, "auxiliary_loss_mlp": 0.00291758, "balance_loss_clip": 1.34058213, "balance_loss_mlp": 0.25346753, "epoch": 0.49457387644671574, "flos": 23988148738560.0, "grad_norm": 4.425474108537735, "language_loss": 0.83511364, "learning_rate": 2.1321259450498893e-06, "loss": 0.85434663, "num_input_tokens_seen": 176840520, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.38305664, "step": 8226, "time_per_iteration": 2.6815624237060547 }, { "auxiliary_loss_clip": 0.01607043, "auxiliary_loss_mlp": 0.00297816, "balance_loss_clip": 1.32022548, "balance_loss_mlp": 0.25811923, "epoch": 0.49463399969938376, "flos": 26976598427520.0, "grad_norm": 5.978997941982717, "language_loss": 0.77570087, "learning_rate": 2.131737331662051e-06, "loss": 0.7947495, "num_input_tokens_seen": 176860265, "router_z_loss_clip": 2.86328125, "router_z_loss_mlp": 0.39697266, "step": 8227, "time_per_iteration": 2.7171592712402344 }, { "auxiliary_loss_clip": 0.01632844, "auxiliary_loss_mlp": 0.00300946, "balance_loss_clip": 1.33649635, "balance_loss_mlp": 0.26027179, "epoch": 0.49469412295205173, "flos": 29681534067840.0, "grad_norm": 6.62010401268089, "language_loss": 0.78349948, "learning_rate": 2.131348713278718e-06, "loss": 0.80283737, "num_input_tokens_seen": 176882910, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.40673828, "step": 8228, "time_per_iteration": 2.7360432147979736 }, { "auxiliary_loss_clip": 0.0161306, "auxiliary_loss_mlp": 0.00285063, "balance_loss_clip": 1.33199322, "balance_loss_mlp": 0.24801265, "epoch": 0.4947542462047197, "flos": 24131791226880.0, "grad_norm": 2.459432574822201, "language_loss": 0.89888883, "learning_rate": 2.1309600899146304e-06, "loss": 0.9178701, "num_input_tokens_seen": 176903030, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.37036133, "step": 8229, "time_per_iteration": 2.76126766204834 }, { "auxiliary_loss_clip": 0.01623344, "auxiliary_loss_mlp": 0.00301062, "balance_loss_clip": 1.33360159, "balance_loss_mlp": 0.25924361, "epoch": 0.49481436945738766, "flos": 20045049333120.0, "grad_norm": 1154.1990905621963, "language_loss": 0.82664502, "learning_rate": 2.1305714615845227e-06, "loss": 0.84588909, "num_input_tokens_seen": 176919025, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.41796875, "step": 8230, "time_per_iteration": 2.624035120010376 }, { "auxiliary_loss_clip": 0.01631506, "auxiliary_loss_mlp": 0.00316929, "balance_loss_clip": 1.33812284, "balance_loss_mlp": 0.27804321, "epoch": 0.4948744927100556, "flos": 15669550005120.0, "grad_norm": 8.369919294029394, "language_loss": 0.88631213, "learning_rate": 2.1301828283031314e-06, "loss": 0.90579647, "num_input_tokens_seen": 176937945, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.38891602, "step": 8231, "time_per_iteration": 2.650423049926758 }, { "auxiliary_loss_clip": 0.01236613, "auxiliary_loss_mlp": 0.00045651, "balance_loss_clip": 1.09761882, "balance_loss_mlp": 0.03516044, "epoch": 0.4949346159627236, "flos": 68872071502080.0, "grad_norm": 0.7507950587995613, "language_loss": 0.59796774, "learning_rate": 2.1297941900851944e-06, "loss": 0.61079037, "num_input_tokens_seen": 177004575, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.10498047, "step": 8232, "time_per_iteration": 3.2845494747161865 }, { "auxiliary_loss_clip": 0.01625808, "auxiliary_loss_mlp": 0.00324059, "balance_loss_clip": 1.33074069, "balance_loss_mlp": 0.28567374, "epoch": 0.49499473921539155, "flos": 24790285307520.0, "grad_norm": 6.61884199573343, "language_loss": 0.74884057, "learning_rate": 2.1294055469454496e-06, "loss": 0.76833928, "num_input_tokens_seen": 177024155, "router_z_loss_clip": 2.94921875, "router_z_loss_mlp": 0.38354492, "step": 8233, "time_per_iteration": 2.707324981689453 }, { "auxiliary_loss_clip": 0.0160182, "auxiliary_loss_mlp": 0.00320246, "balance_loss_clip": 1.31872559, "balance_loss_mlp": 0.28143167, "epoch": 0.4950548624680595, "flos": 32707905540480.0, "grad_norm": 21.083551596626002, "language_loss": 0.73265839, "learning_rate": 2.129016898898633e-06, "loss": 0.75187898, "num_input_tokens_seen": 177046185, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.38793945, "step": 8234, "time_per_iteration": 2.7646031379699707 }, { "auxiliary_loss_clip": 0.01235725, "auxiliary_loss_mlp": 0.00048521, "balance_loss_clip": 1.09670496, "balance_loss_mlp": 0.03669561, "epoch": 0.4951149857207275, "flos": 50082173066880.0, "grad_norm": 0.7964200110412154, "language_loss": 0.57729524, "learning_rate": 2.128628245959482e-06, "loss": 0.59013778, "num_input_tokens_seen": 177099025, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.11816406, "step": 8235, "time_per_iteration": 3.0552632808685303 }, { "auxiliary_loss_clip": 0.01618145, "auxiliary_loss_mlp": 0.00357937, "balance_loss_clip": 1.32885289, "balance_loss_mlp": 0.32024354, "epoch": 0.49517510897339545, "flos": 22236785406720.0, "grad_norm": 36.923597885855756, "language_loss": 0.84518141, "learning_rate": 2.1282395881427355e-06, "loss": 0.86494219, "num_input_tokens_seen": 177118365, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.37695312, "step": 8236, "time_per_iteration": 2.6941006183624268 }, { "auxiliary_loss_clip": 0.01615529, "auxiliary_loss_mlp": 0.00303546, "balance_loss_clip": 1.33098555, "balance_loss_mlp": 0.26537526, "epoch": 0.4952352322260634, "flos": 25374120969600.0, "grad_norm": 25.133047652426903, "language_loss": 0.79185176, "learning_rate": 2.1278509254631315e-06, "loss": 0.81104249, "num_input_tokens_seen": 177136415, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.38183594, "step": 8237, "time_per_iteration": 2.6642818450927734 }, { "auxiliary_loss_clip": 0.01611377, "auxiliary_loss_mlp": 0.00288757, "balance_loss_clip": 1.32657838, "balance_loss_mlp": 0.25099111, "epoch": 0.4952953554787314, "flos": 24608721035520.0, "grad_norm": 470.89911875110727, "language_loss": 0.82382274, "learning_rate": 2.127462257935406e-06, "loss": 0.8428241, "num_input_tokens_seen": 177155690, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.37744141, "step": 8238, "time_per_iteration": 2.701849937438965 }, { "auxiliary_loss_clip": 0.01606699, "auxiliary_loss_mlp": 0.00322256, "balance_loss_clip": 1.31770205, "balance_loss_mlp": 0.28096172, "epoch": 0.49535547873139935, "flos": 17311278049920.0, "grad_norm": 16.034726078011776, "language_loss": 0.838202, "learning_rate": 2.1270735855743008e-06, "loss": 0.85749149, "num_input_tokens_seen": 177173350, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.41308594, "step": 8239, "time_per_iteration": 2.6110477447509766 }, { "auxiliary_loss_clip": 0.01583828, "auxiliary_loss_mlp": 0.00342398, "balance_loss_clip": 1.2948842, "balance_loss_mlp": 0.29795641, "epoch": 0.4954156019840673, "flos": 20740315962240.0, "grad_norm": 6.268421261733063, "language_loss": 0.86294502, "learning_rate": 2.126684908394552e-06, "loss": 0.88220727, "num_input_tokens_seen": 177191115, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.44433594, "step": 8240, "time_per_iteration": 2.813711166381836 }, { "auxiliary_loss_clip": 0.01583987, "auxiliary_loss_mlp": 0.00298149, "balance_loss_clip": 1.30709445, "balance_loss_mlp": 0.25850028, "epoch": 0.49547572523673533, "flos": 12820684567680.0, "grad_norm": 5.047517542609759, "language_loss": 0.90827709, "learning_rate": 2.126296226410898e-06, "loss": 0.92709851, "num_input_tokens_seen": 177206155, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.39648438, "step": 8241, "time_per_iteration": 2.613617420196533 }, { "auxiliary_loss_clip": 0.01584103, "auxiliary_loss_mlp": 0.00290475, "balance_loss_clip": 1.30734169, "balance_loss_mlp": 0.252042, "epoch": 0.4955358484894033, "flos": 15597046402560.0, "grad_norm": 18.168196487099916, "language_loss": 0.83997911, "learning_rate": 2.1259075396380794e-06, "loss": 0.85872483, "num_input_tokens_seen": 177224815, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.38427734, "step": 8242, "time_per_iteration": 2.6464316844940186 }, { "auxiliary_loss_clip": 0.0160097, "auxiliary_loss_mlp": 0.00287191, "balance_loss_clip": 1.3122412, "balance_loss_mlp": 0.24701738, "epoch": 0.49559597174207126, "flos": 26464368528000.0, "grad_norm": 8.165754230408591, "language_loss": 0.72537696, "learning_rate": 2.125518848090833e-06, "loss": 0.74425852, "num_input_tokens_seen": 177244490, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.40185547, "step": 8243, "time_per_iteration": 2.702315330505371 }, { "auxiliary_loss_clip": 0.01598741, "auxiliary_loss_mlp": 0.00305265, "balance_loss_clip": 1.30776143, "balance_loss_mlp": 0.26752332, "epoch": 0.4956560949947392, "flos": 23148234040320.0, "grad_norm": 15.557176977922582, "language_loss": 0.75748777, "learning_rate": 2.125130151783901e-06, "loss": 0.77652788, "num_input_tokens_seen": 177264340, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.37719727, "step": 8244, "time_per_iteration": 2.7242512702941895 }, { "auxiliary_loss_clip": 0.01591109, "auxiliary_loss_mlp": 0.00303897, "balance_loss_clip": 1.30346847, "balance_loss_mlp": 0.26117247, "epoch": 0.4957162182474072, "flos": 20773461237120.0, "grad_norm": 14.086405795281935, "language_loss": 0.83542538, "learning_rate": 2.12474145073202e-06, "loss": 0.85437536, "num_input_tokens_seen": 177283055, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.42724609, "step": 8245, "time_per_iteration": 2.710317611694336 }, { "auxiliary_loss_clip": 0.0158791, "auxiliary_loss_mlp": 0.00279428, "balance_loss_clip": 1.30999732, "balance_loss_mlp": 0.23918253, "epoch": 0.49577634150007516, "flos": 18734202397440.0, "grad_norm": 2.6401202526439542, "language_loss": 0.90159428, "learning_rate": 2.1243527449499306e-06, "loss": 0.9202677, "num_input_tokens_seen": 177301140, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.40209961, "step": 8246, "time_per_iteration": 2.627795457839966 }, { "auxiliary_loss_clip": 0.0159659, "auxiliary_loss_mlp": 0.00290436, "balance_loss_clip": 1.3044399, "balance_loss_mlp": 0.24861708, "epoch": 0.4958364647527431, "flos": 25554176870400.0, "grad_norm": 4.026796675948535, "language_loss": 0.92804229, "learning_rate": 2.1239640344523733e-06, "loss": 0.94691253, "num_input_tokens_seen": 177323095, "router_z_loss_clip": 2.92382812, "router_z_loss_mlp": 0.41845703, "step": 8247, "time_per_iteration": 2.7092082500457764 }, { "auxiliary_loss_clip": 0.015716, "auxiliary_loss_mlp": 0.00308567, "balance_loss_clip": 1.289011, "balance_loss_mlp": 0.26891786, "epoch": 0.4958965880054111, "flos": 24425325169920.0, "grad_norm": 20.54126091507977, "language_loss": 0.90341002, "learning_rate": 2.123575319254087e-06, "loss": 0.92221165, "num_input_tokens_seen": 177339845, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.39624023, "step": 8248, "time_per_iteration": 2.660733461380005 }, { "auxiliary_loss_clip": 0.01586116, "auxiliary_loss_mlp": 0.0032391, "balance_loss_clip": 1.29224014, "balance_loss_mlp": 0.28137574, "epoch": 0.49595671125807905, "flos": 25083460114560.0, "grad_norm": 149.45489290667024, "language_loss": 0.80741489, "learning_rate": 2.123186599369812e-06, "loss": 0.82651508, "num_input_tokens_seen": 177359980, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.42553711, "step": 8249, "time_per_iteration": 4.068791151046753 }, { "auxiliary_loss_clip": 0.01581823, "auxiliary_loss_mlp": 0.00301439, "balance_loss_clip": 1.29800105, "balance_loss_mlp": 0.26217115, "epoch": 0.496016834510747, "flos": 16435883692800.0, "grad_norm": 1109.4912470114955, "language_loss": 0.81902313, "learning_rate": 2.122797874814289e-06, "loss": 0.83785582, "num_input_tokens_seen": 177378580, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.39282227, "step": 8250, "time_per_iteration": 2.6741857528686523 }, { "auxiliary_loss_clip": 0.01593108, "auxiliary_loss_mlp": 0.00304985, "balance_loss_clip": 1.30845785, "balance_loss_mlp": 0.26278502, "epoch": 0.496076957763415, "flos": 23437925228160.0, "grad_norm": 2.326440952845298, "language_loss": 0.75564086, "learning_rate": 2.1224091456022585e-06, "loss": 0.77462184, "num_input_tokens_seen": 177398790, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.421875, "step": 8251, "time_per_iteration": 2.6751270294189453 }, { "auxiliary_loss_clip": 0.01580615, "auxiliary_loss_mlp": 0.0030981, "balance_loss_clip": 1.29588938, "balance_loss_mlp": 0.26844427, "epoch": 0.49613708101608295, "flos": 16909509450240.0, "grad_norm": 2.6088325624834745, "language_loss": 0.87028372, "learning_rate": 2.122020411748461e-06, "loss": 0.88918799, "num_input_tokens_seen": 177416515, "router_z_loss_clip": 2.84570312, "router_z_loss_mlp": 0.4140625, "step": 8252, "time_per_iteration": 4.0679144859313965 }, { "auxiliary_loss_clip": 0.01616903, "auxiliary_loss_mlp": 0.00321022, "balance_loss_clip": 1.32708478, "balance_loss_mlp": 0.27839291, "epoch": 0.4961972042687509, "flos": 16618094409600.0, "grad_norm": 2.2628162397748297, "language_loss": 0.88472468, "learning_rate": 2.1216316732676363e-06, "loss": 0.90410393, "num_input_tokens_seen": 177434425, "router_z_loss_clip": 2.90039062, "router_z_loss_mlp": 0.42651367, "step": 8253, "time_per_iteration": 2.6116294860839844 }, { "auxiliary_loss_clip": 0.01598904, "auxiliary_loss_mlp": 0.00265647, "balance_loss_clip": 1.3103472, "balance_loss_mlp": 0.22721443, "epoch": 0.49625732752141893, "flos": 28956749437440.0, "grad_norm": 30.680308996179328, "language_loss": 0.71807361, "learning_rate": 2.1212429301745275e-06, "loss": 0.73671913, "num_input_tokens_seen": 177459675, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.38427734, "step": 8254, "time_per_iteration": 4.210940599441528 }, { "auxiliary_loss_clip": 0.01591509, "auxiliary_loss_mlp": 0.00306566, "balance_loss_clip": 1.29993951, "balance_loss_mlp": 0.26438969, "epoch": 0.4963174507740869, "flos": 23112359331840.0, "grad_norm": 9.039631308669888, "language_loss": 0.8204686, "learning_rate": 2.1208541824838743e-06, "loss": 0.83944929, "num_input_tokens_seen": 177478895, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.42163086, "step": 8255, "time_per_iteration": 2.707580804824829 }, { "auxiliary_loss_clip": 0.01592714, "auxiliary_loss_mlp": 0.00289005, "balance_loss_clip": 1.3032763, "balance_loss_mlp": 0.25019011, "epoch": 0.49637757402675486, "flos": 13917863450880.0, "grad_norm": 307.4588068398472, "language_loss": 0.88399476, "learning_rate": 2.1204654302104183e-06, "loss": 0.90281194, "num_input_tokens_seen": 177494920, "router_z_loss_clip": 2.89453125, "router_z_loss_mlp": 0.38818359, "step": 8256, "time_per_iteration": 2.6517257690429688 }, { "auxiliary_loss_clip": 0.01597921, "auxiliary_loss_mlp": 0.0028365, "balance_loss_clip": 1.30930746, "balance_loss_mlp": 0.24116403, "epoch": 0.49643769727942283, "flos": 22309001700480.0, "grad_norm": 17.577293207412183, "language_loss": 0.86210573, "learning_rate": 2.120076673368901e-06, "loss": 0.88092142, "num_input_tokens_seen": 177515455, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.42480469, "step": 8257, "time_per_iteration": 2.6988518238067627 }, { "auxiliary_loss_clip": 0.01600637, "auxiliary_loss_mlp": 0.00283711, "balance_loss_clip": 1.30967891, "balance_loss_mlp": 0.24263108, "epoch": 0.4964978205320908, "flos": 19500248776320.0, "grad_norm": 11.617048483962511, "language_loss": 0.74720919, "learning_rate": 2.1196879119740647e-06, "loss": 0.76605266, "num_input_tokens_seen": 177534040, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.41064453, "step": 8258, "time_per_iteration": 2.61411452293396 }, { "auxiliary_loss_clip": 0.01582167, "auxiliary_loss_mlp": 0.00264896, "balance_loss_clip": 1.30546117, "balance_loss_mlp": 0.22779775, "epoch": 0.49655794378475876, "flos": 23436524597760.0, "grad_norm": 5.912249874854427, "language_loss": 0.8142606, "learning_rate": 2.1192991460406502e-06, "loss": 0.83273125, "num_input_tokens_seen": 177554510, "router_z_loss_clip": 2.76367188, "router_z_loss_mlp": 0.37084961, "step": 8259, "time_per_iteration": 4.059530973434448 }, { "auxiliary_loss_clip": 0.01619957, "auxiliary_loss_mlp": 0.00283752, "balance_loss_clip": 1.3294996, "balance_loss_mlp": 0.2417188, "epoch": 0.4966180670374267, "flos": 26831124345600.0, "grad_norm": 47.9843026953094, "language_loss": 0.84113562, "learning_rate": 2.1189103755834e-06, "loss": 0.86017269, "num_input_tokens_seen": 177575780, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.42016602, "step": 8260, "time_per_iteration": 2.726888656616211 }, { "auxiliary_loss_clip": 0.01603288, "auxiliary_loss_mlp": 0.00301415, "balance_loss_clip": 1.30553341, "balance_loss_mlp": 0.25985849, "epoch": 0.4966781902900947, "flos": 22009326531840.0, "grad_norm": 11.679280702841124, "language_loss": 0.84046453, "learning_rate": 2.1185216006170573e-06, "loss": 0.85951149, "num_input_tokens_seen": 177588965, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.4152832, "step": 8261, "time_per_iteration": 2.61301851272583 }, { "auxiliary_loss_clip": 0.01563533, "auxiliary_loss_mlp": 0.00276296, "balance_loss_clip": 1.28611612, "balance_loss_mlp": 0.24029475, "epoch": 0.49673831354276266, "flos": 26213353309440.0, "grad_norm": 2.493756276218485, "language_loss": 0.95630819, "learning_rate": 2.1181328211563627e-06, "loss": 0.97470653, "num_input_tokens_seen": 177608425, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.35986328, "step": 8262, "time_per_iteration": 2.7837069034576416 }, { "auxiliary_loss_clip": 0.01593129, "auxiliary_loss_mlp": 0.00288732, "balance_loss_clip": 1.3093679, "balance_loss_mlp": 0.24874961, "epoch": 0.4967984367954306, "flos": 23182277155200.0, "grad_norm": 14.199479709157625, "language_loss": 0.77843237, "learning_rate": 2.11774403721606e-06, "loss": 0.79725099, "num_input_tokens_seen": 177628240, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.39990234, "step": 8263, "time_per_iteration": 2.6469202041625977 }, { "auxiliary_loss_clip": 0.01599217, "auxiliary_loss_mlp": 0.00316982, "balance_loss_clip": 1.30892777, "balance_loss_mlp": 0.27502003, "epoch": 0.4968585600480986, "flos": 19281445079040.0, "grad_norm": 10.321053238858216, "language_loss": 0.77354181, "learning_rate": 2.1173552488108923e-06, "loss": 0.79270375, "num_input_tokens_seen": 177645920, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.41992188, "step": 8264, "time_per_iteration": 2.6227495670318604 }, { "auxiliary_loss_clip": 0.01593713, "auxiliary_loss_mlp": 0.00267589, "balance_loss_clip": 1.29922044, "balance_loss_mlp": 0.22722441, "epoch": 0.49691868330076655, "flos": 22528703237760.0, "grad_norm": 41.574723361894776, "language_loss": 0.72181505, "learning_rate": 2.1169664559556007e-06, "loss": 0.74042809, "num_input_tokens_seen": 177667185, "router_z_loss_clip": 2.94335938, "router_z_loss_mlp": 0.40356445, "step": 8265, "time_per_iteration": 2.670355796813965 }, { "auxiliary_loss_clip": 0.01159415, "auxiliary_loss_mlp": 0.00084217, "balance_loss_clip": 1.02558517, "balance_loss_mlp": 0.07286798, "epoch": 0.4969788065534345, "flos": 66577128675840.0, "grad_norm": 0.9004455656262217, "language_loss": 0.53084362, "learning_rate": 2.1165776586649304e-06, "loss": 0.54328001, "num_input_tokens_seen": 177733020, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.11328125, "step": 8266, "time_per_iteration": 3.18708872795105 }, { "auxiliary_loss_clip": 0.01563941, "auxiliary_loss_mlp": 0.00260601, "balance_loss_clip": 1.29279387, "balance_loss_mlp": 0.22419478, "epoch": 0.49703892980610254, "flos": 24059503105920.0, "grad_norm": 27.653337532524365, "language_loss": 0.83629662, "learning_rate": 2.1161888569536223e-06, "loss": 0.85454202, "num_input_tokens_seen": 177753370, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.36401367, "step": 8267, "time_per_iteration": 2.7422688007354736 }, { "auxiliary_loss_clip": 0.01589773, "auxiliary_loss_mlp": 0.0026855, "balance_loss_clip": 1.30355525, "balance_loss_mlp": 0.22713624, "epoch": 0.4970990530587705, "flos": 29126174912640.0, "grad_norm": 2.8132490407701063, "language_loss": 0.82731068, "learning_rate": 2.1158000508364223e-06, "loss": 0.84589392, "num_input_tokens_seen": 177771530, "router_z_loss_clip": 2.86523438, "router_z_loss_mlp": 0.41430664, "step": 8268, "time_per_iteration": 2.7157413959503174 }, { "auxiliary_loss_clip": 0.0159878, "auxiliary_loss_mlp": 0.00303783, "balance_loss_clip": 1.30790925, "balance_loss_mlp": 0.2611776, "epoch": 0.49715917631143847, "flos": 46026167258880.0, "grad_norm": 35.11957646949294, "language_loss": 0.7335977, "learning_rate": 2.115411240328073e-06, "loss": 0.75262332, "num_input_tokens_seen": 177796355, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.42602539, "step": 8269, "time_per_iteration": 2.9109463691711426 }, { "auxiliary_loss_clip": 0.01596746, "auxiliary_loss_mlp": 0.00284607, "balance_loss_clip": 1.31310987, "balance_loss_mlp": 0.24405198, "epoch": 0.49721929956410643, "flos": 20191277600640.0, "grad_norm": 48.95411155886271, "language_loss": 0.89997101, "learning_rate": 2.1150224254433167e-06, "loss": 0.91878456, "num_input_tokens_seen": 177814300, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.40551758, "step": 8270, "time_per_iteration": 2.6716599464416504 }, { "auxiliary_loss_clip": 0.01594805, "auxiliary_loss_mlp": 0.00284019, "balance_loss_clip": 1.3040036, "balance_loss_mlp": 0.24436952, "epoch": 0.4972794228167744, "flos": 21653560275840.0, "grad_norm": 13.567166219690304, "language_loss": 0.75924313, "learning_rate": 2.114633606196899e-06, "loss": 0.77803135, "num_input_tokens_seen": 177833615, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.39599609, "step": 8271, "time_per_iteration": 2.6862680912017822 }, { "auxiliary_loss_clip": 0.01604976, "auxiliary_loss_mlp": 0.00276021, "balance_loss_clip": 1.31473589, "balance_loss_mlp": 0.23606217, "epoch": 0.49733954606944236, "flos": 24279743347200.0, "grad_norm": 1.8153646682645064, "language_loss": 0.83700097, "learning_rate": 2.1142447826035635e-06, "loss": 0.855811, "num_input_tokens_seen": 177855315, "router_z_loss_clip": 2.90234375, "router_z_loss_mlp": 0.39941406, "step": 8272, "time_per_iteration": 2.671494722366333 }, { "auxiliary_loss_clip": 0.01575636, "auxiliary_loss_mlp": 0.00287487, "balance_loss_clip": 1.29508054, "balance_loss_mlp": 0.25108021, "epoch": 0.4973996693221103, "flos": 37852575730560.0, "grad_norm": 8.016527427350354, "language_loss": 0.73006046, "learning_rate": 2.1138559546780544e-06, "loss": 0.74869162, "num_input_tokens_seen": 177875590, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.36401367, "step": 8273, "time_per_iteration": 2.7836966514587402 }, { "auxiliary_loss_clip": 0.0158045, "auxiliary_loss_mlp": 0.00283634, "balance_loss_clip": 1.29868126, "balance_loss_mlp": 0.24286492, "epoch": 0.4974597925747783, "flos": 21361426963200.0, "grad_norm": 2.1532204163402446, "language_loss": 0.83492839, "learning_rate": 2.1134671224351163e-06, "loss": 0.85356915, "num_input_tokens_seen": 177894175, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.40771484, "step": 8274, "time_per_iteration": 2.6122419834136963 }, { "auxiliary_loss_clip": 0.01584741, "auxiliary_loss_mlp": 0.00278876, "balance_loss_clip": 1.29280639, "balance_loss_mlp": 0.24056223, "epoch": 0.49751991582744626, "flos": 30738133560960.0, "grad_norm": 16.182635020954972, "language_loss": 0.82240188, "learning_rate": 2.113078285889493e-06, "loss": 0.84103805, "num_input_tokens_seen": 177913920, "router_z_loss_clip": 2.92382812, "router_z_loss_mlp": 0.38330078, "step": 8275, "time_per_iteration": 2.7147271633148193 }, { "auxiliary_loss_clip": 0.01575819, "auxiliary_loss_mlp": 0.00302362, "balance_loss_clip": 1.28650033, "balance_loss_mlp": 0.26044762, "epoch": 0.4975800390801142, "flos": 14100541044480.0, "grad_norm": 57.78569877295585, "language_loss": 0.92537934, "learning_rate": 2.1126894450559303e-06, "loss": 0.94416118, "num_input_tokens_seen": 177930425, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.41894531, "step": 8276, "time_per_iteration": 2.641982316970825 }, { "auxiliary_loss_clip": 0.01559508, "auxiliary_loss_mlp": 0.00278191, "balance_loss_clip": 1.28659129, "balance_loss_mlp": 0.24235669, "epoch": 0.4976401623327822, "flos": 24207275658240.0, "grad_norm": 6.674944196356156, "language_loss": 0.75626236, "learning_rate": 2.112300599949172e-06, "loss": 0.77463937, "num_input_tokens_seen": 177949885, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.35864258, "step": 8277, "time_per_iteration": 2.6854665279388428 }, { "auxiliary_loss_clip": 0.01563931, "auxiliary_loss_mlp": 0.0025419, "balance_loss_clip": 1.28657484, "balance_loss_mlp": 0.21437439, "epoch": 0.49770028558545015, "flos": 21136769349120.0, "grad_norm": 3.8601375213513713, "language_loss": 0.88034403, "learning_rate": 2.111911750583964e-06, "loss": 0.89852524, "num_input_tokens_seen": 177965720, "router_z_loss_clip": 2.77148438, "router_z_loss_mlp": 0.39819336, "step": 8278, "time_per_iteration": 2.6437125205993652 }, { "auxiliary_loss_clip": 0.0157207, "auxiliary_loss_mlp": 0.00286192, "balance_loss_clip": 1.28927994, "balance_loss_mlp": 0.24485055, "epoch": 0.4977604088381181, "flos": 16763927627520.0, "grad_norm": 6.128711646906723, "language_loss": 0.74527138, "learning_rate": 2.111522896975052e-06, "loss": 0.76385403, "num_input_tokens_seen": 177983190, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.41333008, "step": 8279, "time_per_iteration": 2.60491943359375 }, { "auxiliary_loss_clip": 0.01560682, "auxiliary_loss_mlp": 0.00316142, "balance_loss_clip": 1.28091908, "balance_loss_mlp": 0.2755636, "epoch": 0.49782053209078614, "flos": 15703521292800.0, "grad_norm": 15.8415173669579, "language_loss": 0.77671325, "learning_rate": 2.1111340391371794e-06, "loss": 0.79548144, "num_input_tokens_seen": 178000155, "router_z_loss_clip": 2.79492188, "router_z_loss_mlp": 0.40551758, "step": 8280, "time_per_iteration": 2.6097826957702637 }, { "auxiliary_loss_clip": 0.01562574, "auxiliary_loss_mlp": 0.00259332, "balance_loss_clip": 1.28501129, "balance_loss_mlp": 0.22314033, "epoch": 0.4978806553434541, "flos": 24753692327040.0, "grad_norm": 8.93595460356632, "language_loss": 0.70333397, "learning_rate": 2.1107451770850936e-06, "loss": 0.72155303, "num_input_tokens_seen": 178021060, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.36206055, "step": 8281, "time_per_iteration": 2.6819510459899902 }, { "auxiliary_loss_clip": 0.01558587, "auxiliary_loss_mlp": 0.00280696, "balance_loss_clip": 1.27950072, "balance_loss_mlp": 0.24233469, "epoch": 0.49794077859612207, "flos": 13115726881920.0, "grad_norm": 9.366564028822461, "language_loss": 0.81392652, "learning_rate": 2.1103563108335387e-06, "loss": 0.83231932, "num_input_tokens_seen": 178038180, "router_z_loss_clip": 2.79296875, "router_z_loss_mlp": 0.38330078, "step": 8282, "time_per_iteration": 2.656797170639038 }, { "auxiliary_loss_clip": 0.01567042, "auxiliary_loss_mlp": 0.00263183, "balance_loss_clip": 1.28648138, "balance_loss_mlp": 0.2251793, "epoch": 0.49800090184879003, "flos": 27525133998720.0, "grad_norm": 8.751547627361585, "language_loss": 0.78827572, "learning_rate": 2.109967440397263e-06, "loss": 0.80657804, "num_input_tokens_seen": 178057565, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.38012695, "step": 8283, "time_per_iteration": 2.7004942893981934 }, { "auxiliary_loss_clip": 0.01556029, "auxiliary_loss_mlp": 0.00269434, "balance_loss_clip": 1.28172779, "balance_loss_mlp": 0.231502, "epoch": 0.498061025101458, "flos": 19792489829760.0, "grad_norm": 1.7736128719756854, "language_loss": 0.86350262, "learning_rate": 2.1095785657910095e-06, "loss": 0.88175726, "num_input_tokens_seen": 178076965, "router_z_loss_clip": 2.74609375, "router_z_loss_mlp": 0.37939453, "step": 8284, "time_per_iteration": 2.6614372730255127 }, { "auxiliary_loss_clip": 0.01565432, "auxiliary_loss_mlp": 0.00315812, "balance_loss_clip": 1.28147805, "balance_loss_mlp": 0.27325472, "epoch": 0.49812114835412596, "flos": 29893909230720.0, "grad_norm": 28.10545964587103, "language_loss": 0.79484862, "learning_rate": 2.109189687029526e-06, "loss": 0.81366104, "num_input_tokens_seen": 178095105, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.42578125, "step": 8285, "time_per_iteration": 2.72127628326416 }, { "auxiliary_loss_clip": 0.01558471, "auxiliary_loss_mlp": 0.00275153, "balance_loss_clip": 1.28181982, "balance_loss_mlp": 0.23548052, "epoch": 0.49818127160679393, "flos": 23147048891520.0, "grad_norm": 558.1317371763907, "language_loss": 0.81014919, "learning_rate": 2.1088008041275598e-06, "loss": 0.82848537, "num_input_tokens_seen": 178114505, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 0.39672852, "step": 8286, "time_per_iteration": 2.8055546283721924 }, { "auxiliary_loss_clip": 0.01576504, "auxiliary_loss_mlp": 0.00281975, "balance_loss_clip": 1.29355907, "balance_loss_mlp": 0.24277888, "epoch": 0.4982413948594619, "flos": 21652806090240.0, "grad_norm": 7.0380952892820465, "language_loss": 0.91973245, "learning_rate": 2.1084119170998545e-06, "loss": 0.93831718, "num_input_tokens_seen": 178131595, "router_z_loss_clip": 2.83203125, "router_z_loss_mlp": 0.39208984, "step": 8287, "time_per_iteration": 2.6749281883239746 }, { "auxiliary_loss_clip": 0.01573651, "auxiliary_loss_mlp": 0.0028776, "balance_loss_clip": 1.28710878, "balance_loss_mlp": 0.2466327, "epoch": 0.49830151811212986, "flos": 32486982940800.0, "grad_norm": 23.68307006351826, "language_loss": 0.79543626, "learning_rate": 2.108023025961159e-06, "loss": 0.81405032, "num_input_tokens_seen": 178152055, "router_z_loss_clip": 2.86523438, "router_z_loss_mlp": 0.41137695, "step": 8288, "time_per_iteration": 2.7617757320404053 }, { "auxiliary_loss_clip": 0.01570451, "auxiliary_loss_mlp": 0.00290425, "balance_loss_clip": 1.28697121, "balance_loss_mlp": 0.24755731, "epoch": 0.4983616413647978, "flos": 18142358002560.0, "grad_norm": 20.653876379602544, "language_loss": 0.91759348, "learning_rate": 2.10763413072622e-06, "loss": 0.93620217, "num_input_tokens_seen": 178168150, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.4284668, "step": 8289, "time_per_iteration": 2.618659496307373 }, { "auxiliary_loss_clip": 0.01563481, "auxiliary_loss_mlp": 0.00251868, "balance_loss_clip": 1.28634548, "balance_loss_mlp": 0.21224239, "epoch": 0.4984217646174658, "flos": 19718836992000.0, "grad_norm": 39.82132615552353, "language_loss": 0.81791532, "learning_rate": 2.107245231409784e-06, "loss": 0.83606875, "num_input_tokens_seen": 178186150, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.39624023, "step": 8290, "time_per_iteration": 2.62422251701355 }, { "auxiliary_loss_clip": 0.01580719, "auxiliary_loss_mlp": 0.00277108, "balance_loss_clip": 1.29723394, "balance_loss_mlp": 0.23462164, "epoch": 0.49848188787013376, "flos": 24936549488640.0, "grad_norm": 20.995234191672953, "language_loss": 0.89798307, "learning_rate": 2.106856328026598e-06, "loss": 0.91656131, "num_input_tokens_seen": 178207665, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.42480469, "step": 8291, "time_per_iteration": 4.117675304412842 }, { "auxiliary_loss_clip": 0.01550114, "auxiliary_loss_mlp": 0.00284851, "balance_loss_clip": 1.26992404, "balance_loss_mlp": 0.24634685, "epoch": 0.4985420111228017, "flos": 22382439056640.0, "grad_norm": 3.644418875251962, "language_loss": 0.7504428, "learning_rate": 2.106467420591409e-06, "loss": 0.76879251, "num_input_tokens_seen": 178226325, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.38500977, "step": 8292, "time_per_iteration": 2.6323609352111816 }, { "auxiliary_loss_clip": 0.01558631, "auxiliary_loss_mlp": 0.00296159, "balance_loss_clip": 1.28238583, "balance_loss_mlp": 0.25515068, "epoch": 0.4986021343754697, "flos": 16216469464320.0, "grad_norm": 6.896034325643645, "language_loss": 0.74606562, "learning_rate": 2.106078509118965e-06, "loss": 0.76461345, "num_input_tokens_seen": 178244960, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.41015625, "step": 8293, "time_per_iteration": 2.6407389640808105 }, { "auxiliary_loss_clip": 0.01544311, "auxiliary_loss_mlp": 0.00290387, "balance_loss_clip": 1.26656079, "balance_loss_mlp": 0.24835409, "epoch": 0.4986622576281377, "flos": 23403594804480.0, "grad_norm": 1133.9241831978209, "language_loss": 0.90293145, "learning_rate": 2.1056895936240133e-06, "loss": 0.92127848, "num_input_tokens_seen": 178265400, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.42016602, "step": 8294, "time_per_iteration": 4.121809005737305 }, { "auxiliary_loss_clip": 0.01549266, "auxiliary_loss_mlp": 0.00260062, "balance_loss_clip": 1.27572775, "balance_loss_mlp": 0.22050798, "epoch": 0.49872238088080567, "flos": 19974556892160.0, "grad_norm": 11.067786763372983, "language_loss": 0.80127347, "learning_rate": 2.1053006741213016e-06, "loss": 0.81936669, "num_input_tokens_seen": 178284535, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.39526367, "step": 8295, "time_per_iteration": 2.628880262374878 }, { "auxiliary_loss_clip": 0.01535646, "auxiliary_loss_mlp": 0.00255477, "balance_loss_clip": 1.26713192, "balance_loss_mlp": 0.21794973, "epoch": 0.49878250413347364, "flos": 22893016930560.0, "grad_norm": 5.324015311635045, "language_loss": 0.75591099, "learning_rate": 2.1049117506255775e-06, "loss": 0.77382219, "num_input_tokens_seen": 178302425, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.37548828, "step": 8296, "time_per_iteration": 3.9921317100524902 }, { "auxiliary_loss_clip": 0.01550573, "auxiliary_loss_mlp": 0.00265894, "balance_loss_clip": 1.26718688, "balance_loss_mlp": 0.22574462, "epoch": 0.4988426273861416, "flos": 32598449821440.0, "grad_norm": 3.143093059184644, "language_loss": 0.72897172, "learning_rate": 2.1045228231515895e-06, "loss": 0.74713641, "num_input_tokens_seen": 178323065, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.40136719, "step": 8297, "time_per_iteration": 2.7317750453948975 }, { "auxiliary_loss_clip": 0.0154182, "auxiliary_loss_mlp": 0.00260841, "balance_loss_clip": 1.27842045, "balance_loss_mlp": 0.22412413, "epoch": 0.49890275063880957, "flos": 20923604087040.0, "grad_norm": 5.111270963256113, "language_loss": 0.76128674, "learning_rate": 2.1041338917140857e-06, "loss": 0.77931333, "num_input_tokens_seen": 178343985, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.36743164, "step": 8298, "time_per_iteration": 2.6905357837677 }, { "auxiliary_loss_clip": 0.01551956, "auxiliary_loss_mlp": 0.00277347, "balance_loss_clip": 1.28239906, "balance_loss_mlp": 0.23869902, "epoch": 0.49896287389147753, "flos": 18624459369600.0, "grad_norm": 12.85411277683915, "language_loss": 0.9264462, "learning_rate": 2.103744956327814e-06, "loss": 0.94473922, "num_input_tokens_seen": 178362345, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.38647461, "step": 8299, "time_per_iteration": 2.720627784729004 }, { "auxiliary_loss_clip": 0.0156199, "auxiliary_loss_mlp": 0.00261309, "balance_loss_clip": 1.28095436, "balance_loss_mlp": 0.22321013, "epoch": 0.4990229971441455, "flos": 24826555065600.0, "grad_norm": 3.318187111837297, "language_loss": 0.76271665, "learning_rate": 2.1033560170075234e-06, "loss": 0.78094971, "num_input_tokens_seen": 178383190, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.38110352, "step": 8300, "time_per_iteration": 2.672309637069702 }, { "auxiliary_loss_clip": 0.013135, "auxiliary_loss_mlp": 0.00050078, "balance_loss_clip": 1.16644883, "balance_loss_mlp": 0.03839538, "epoch": 0.49908312039681346, "flos": 71384525136000.0, "grad_norm": 0.7534611225038539, "language_loss": 0.50898254, "learning_rate": 2.1029670737679623e-06, "loss": 0.52261829, "num_input_tokens_seen": 178444250, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.11669922, "step": 8301, "time_per_iteration": 4.652975559234619 }, { "auxiliary_loss_clip": 0.01544301, "auxiliary_loss_mlp": 0.00253207, "balance_loss_clip": 1.27823305, "balance_loss_mlp": 0.21663322, "epoch": 0.4991432436494814, "flos": 19828651847040.0, "grad_norm": 13.661283215860301, "language_loss": 0.91914904, "learning_rate": 2.102578126623879e-06, "loss": 0.93712413, "num_input_tokens_seen": 178463250, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.36572266, "step": 8302, "time_per_iteration": 2.633612632751465 }, { "auxiliary_loss_clip": 0.01554698, "auxiliary_loss_mlp": 0.00255894, "balance_loss_clip": 1.28522396, "balance_loss_mlp": 0.22022676, "epoch": 0.4992033669021494, "flos": 15121912273920.0, "grad_norm": 9.310815278704329, "language_loss": 0.7700659, "learning_rate": 2.102189175590024e-06, "loss": 0.78817177, "num_input_tokens_seen": 178481340, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.35668945, "step": 8303, "time_per_iteration": 2.6400959491729736 }, { "auxiliary_loss_clip": 0.01554025, "auxiliary_loss_mlp": 0.00304335, "balance_loss_clip": 1.27870989, "balance_loss_mlp": 0.26635444, "epoch": 0.49926349015481736, "flos": 31207952476800.0, "grad_norm": 49.48104176862393, "language_loss": 0.77327991, "learning_rate": 2.101800220681144e-06, "loss": 0.79186356, "num_input_tokens_seen": 178501545, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.38012695, "step": 8304, "time_per_iteration": 2.755831003189087 }, { "auxiliary_loss_clip": 0.01552649, "auxiliary_loss_mlp": 0.00273684, "balance_loss_clip": 1.28285098, "balance_loss_mlp": 0.23572783, "epoch": 0.4993236134074853, "flos": 24900207903360.0, "grad_norm": 5.701512692202346, "language_loss": 0.89880586, "learning_rate": 2.10141126191199e-06, "loss": 0.9170692, "num_input_tokens_seen": 178519700, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.37963867, "step": 8305, "time_per_iteration": 2.669471502304077 }, { "auxiliary_loss_clip": 0.0131389, "auxiliary_loss_mlp": 0.00031649, "balance_loss_clip": 1.16659403, "balance_loss_mlp": 0.02297036, "epoch": 0.4993837366601533, "flos": 70420573797120.0, "grad_norm": 0.7132946082690106, "language_loss": 0.56840104, "learning_rate": 2.1010222992973107e-06, "loss": 0.58185643, "num_input_tokens_seen": 178576740, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.08691406, "step": 8306, "time_per_iteration": 3.2620975971221924 }, { "auxiliary_loss_clip": 0.015464, "auxiliary_loss_mlp": 0.00271105, "balance_loss_clip": 1.27593565, "balance_loss_mlp": 0.23262404, "epoch": 0.4994438599128213, "flos": 15961216440960.0, "grad_norm": 2.9861251596144074, "language_loss": 0.88417953, "learning_rate": 2.1006333328518556e-06, "loss": 0.90235454, "num_input_tokens_seen": 178594745, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.38476562, "step": 8307, "time_per_iteration": 2.653660297393799 }, { "auxiliary_loss_clip": 0.01536658, "auxiliary_loss_mlp": 0.00258016, "balance_loss_clip": 1.27240562, "balance_loss_mlp": 0.22237217, "epoch": 0.4995039831654893, "flos": 27928303228800.0, "grad_norm": 83.26886751674232, "language_loss": 0.68718207, "learning_rate": 2.1002443625903748e-06, "loss": 0.70512879, "num_input_tokens_seen": 178614110, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.35620117, "step": 8308, "time_per_iteration": 2.7902040481567383 }, { "auxiliary_loss_clip": 0.01522353, "auxiliary_loss_mlp": 0.00270639, "balance_loss_clip": 1.26183522, "balance_loss_mlp": 0.23568663, "epoch": 0.49956410641815724, "flos": 24204797619840.0, "grad_norm": 5.286821443666597, "language_loss": 0.80296683, "learning_rate": 2.0998553885276168e-06, "loss": 0.82089674, "num_input_tokens_seen": 178634170, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.34960938, "step": 8309, "time_per_iteration": 2.738373041152954 }, { "auxiliary_loss_clip": 0.01536473, "auxiliary_loss_mlp": 0.00271014, "balance_loss_clip": 1.26427412, "balance_loss_mlp": 0.23517923, "epoch": 0.4996242296708252, "flos": 16180127879040.0, "grad_norm": 636.0163792387274, "language_loss": 0.86780024, "learning_rate": 2.0994664106783335e-06, "loss": 0.88587511, "num_input_tokens_seen": 178651775, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.3581543, "step": 8310, "time_per_iteration": 2.6469452381134033 }, { "auxiliary_loss_clip": 0.01537471, "auxiliary_loss_mlp": 0.00297814, "balance_loss_clip": 1.26424956, "balance_loss_mlp": 0.26100242, "epoch": 0.49968435292349317, "flos": 16873527000960.0, "grad_norm": 376.02021595260595, "language_loss": 0.77926946, "learning_rate": 2.0990774290572735e-06, "loss": 0.79762232, "num_input_tokens_seen": 178669720, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.36816406, "step": 8311, "time_per_iteration": 2.6617627143859863 }, { "auxiliary_loss_clip": 0.01525653, "auxiliary_loss_mlp": 0.00274138, "balance_loss_clip": 1.26270652, "balance_loss_mlp": 0.23856601, "epoch": 0.49974447617616113, "flos": 14939521989120.0, "grad_norm": 476.76245085127596, "language_loss": 0.83990484, "learning_rate": 2.098688443679187e-06, "loss": 0.85790277, "num_input_tokens_seen": 178686765, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.35546875, "step": 8312, "time_per_iteration": 2.6910252571105957 }, { "auxiliary_loss_clip": 0.01536785, "auxiliary_loss_mlp": 0.00307042, "balance_loss_clip": 1.26656592, "balance_loss_mlp": 0.2713981, "epoch": 0.4998045994288291, "flos": 26651535321600.0, "grad_norm": 2.6480471378595847, "language_loss": 0.91475868, "learning_rate": 2.0982994545588256e-06, "loss": 0.93319696, "num_input_tokens_seen": 178705845, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.35644531, "step": 8313, "time_per_iteration": 2.7459142208099365 }, { "auxiliary_loss_clip": 0.01522987, "auxiliary_loss_mlp": 0.00298754, "balance_loss_clip": 1.25658691, "balance_loss_mlp": 0.2627764, "epoch": 0.49986472268149706, "flos": 20953768533120.0, "grad_norm": 4.712286776072296, "language_loss": 0.8662343, "learning_rate": 2.097910461710939e-06, "loss": 0.88445169, "num_input_tokens_seen": 178723410, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.35961914, "step": 8314, "time_per_iteration": 2.654977321624756 }, { "auxiliary_loss_clip": 0.01505959, "auxiliary_loss_mlp": 0.00275816, "balance_loss_clip": 1.23907399, "balance_loss_mlp": 0.23940918, "epoch": 0.49992484593416503, "flos": 22783884433920.0, "grad_norm": 87.0736697185339, "language_loss": 0.8599745, "learning_rate": 2.0975214651502773e-06, "loss": 0.87779224, "num_input_tokens_seen": 178743560, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.36425781, "step": 8315, "time_per_iteration": 2.6770336627960205 }, { "auxiliary_loss_clip": 0.01509395, "auxiliary_loss_mlp": 0.00268156, "balance_loss_clip": 1.25151098, "balance_loss_mlp": 0.23442024, "epoch": 0.499984969186833, "flos": 46786970252160.0, "grad_norm": 44.045049164919554, "language_loss": 0.79938495, "learning_rate": 2.0971324648915926e-06, "loss": 0.81716049, "num_input_tokens_seen": 178767225, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.33740234, "step": 8316, "time_per_iteration": 2.881390333175659 }, { "auxiliary_loss_clip": 0.01515469, "auxiliary_loss_mlp": 0.0028299, "balance_loss_clip": 1.25614893, "balance_loss_mlp": 0.25051743, "epoch": 0.500045092439501, "flos": 25556978131200.0, "grad_norm": 12.585700615187667, "language_loss": 0.8680687, "learning_rate": 2.0967434609496343e-06, "loss": 0.8860532, "num_input_tokens_seen": 178786810, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.32470703, "step": 8317, "time_per_iteration": 2.666767120361328 }, { "auxiliary_loss_clip": 0.01495067, "auxiliary_loss_mlp": 0.00277528, "balance_loss_clip": 1.23169398, "balance_loss_mlp": 0.24331445, "epoch": 0.5001052156921689, "flos": 20704764476160.0, "grad_norm": 20.74793044744049, "language_loss": 0.89145911, "learning_rate": 2.0963544533391548e-06, "loss": 0.90918505, "num_input_tokens_seen": 178805660, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.34204102, "step": 8318, "time_per_iteration": 2.653527021408081 }, { "auxiliary_loss_clip": 0.01501112, "auxiliary_loss_mlp": 0.00258561, "balance_loss_clip": 1.23977971, "balance_loss_mlp": 0.22441971, "epoch": 0.500165338944837, "flos": 21251109317760.0, "grad_norm": 181.2066553924834, "language_loss": 0.88603467, "learning_rate": 2.0959654420749045e-06, "loss": 0.90363145, "num_input_tokens_seen": 178824780, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.34106445, "step": 8319, "time_per_iteration": 2.6545042991638184 }, { "auxiliary_loss_clip": 0.01475184, "auxiliary_loss_mlp": 0.00291432, "balance_loss_clip": 1.22724199, "balance_loss_mlp": 0.25874481, "epoch": 0.5002254621975049, "flos": 27854398995840.0, "grad_norm": 7.173321456236692, "language_loss": 0.78335166, "learning_rate": 2.095576427171635e-06, "loss": 0.80101782, "num_input_tokens_seen": 178845640, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.3269043, "step": 8320, "time_per_iteration": 2.7205400466918945 }, { "auxiliary_loss_clip": 0.01506738, "auxiliary_loss_mlp": 0.00316103, "balance_loss_clip": 1.23859668, "balance_loss_mlp": 0.2780987, "epoch": 0.5002855854501729, "flos": 15551941898880.0, "grad_norm": 25.38293760669924, "language_loss": 0.85190326, "learning_rate": 2.0951874086440978e-06, "loss": 0.87013173, "num_input_tokens_seen": 178862290, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.38037109, "step": 8321, "time_per_iteration": 2.5990335941314697 }, { "auxiliary_loss_clip": 0.01484319, "auxiliary_loss_mlp": 0.00271935, "balance_loss_clip": 1.22425401, "balance_loss_mlp": 0.23841313, "epoch": 0.5003457087028408, "flos": 16107408794880.0, "grad_norm": 7.896609248323216, "language_loss": 0.89831048, "learning_rate": 2.0947983865070455e-06, "loss": 0.91587305, "num_input_tokens_seen": 178879805, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.33544922, "step": 8322, "time_per_iteration": 2.623309850692749 }, { "auxiliary_loss_clip": 0.01499309, "auxiliary_loss_mlp": 0.00265344, "balance_loss_clip": 1.24115407, "balance_loss_mlp": 0.23318155, "epoch": 0.5004058319555088, "flos": 22710518904960.0, "grad_norm": 23.73440893705101, "language_loss": 0.83153778, "learning_rate": 2.094409360775228e-06, "loss": 0.84918433, "num_input_tokens_seen": 178896985, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.3215332, "step": 8323, "time_per_iteration": 2.7042691707611084 }, { "auxiliary_loss_clip": 0.01476556, "auxiliary_loss_mlp": 0.00284722, "balance_loss_clip": 1.22084987, "balance_loss_mlp": 0.25165376, "epoch": 0.5004659552081767, "flos": 30117956313600.0, "grad_norm": 8.830156648644511, "language_loss": 0.7493493, "learning_rate": 2.0940203314633977e-06, "loss": 0.76696205, "num_input_tokens_seen": 178920605, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.33081055, "step": 8324, "time_per_iteration": 2.748060703277588 }, { "auxiliary_loss_clip": 0.01481999, "auxiliary_loss_mlp": 0.00266509, "balance_loss_clip": 1.22466075, "balance_loss_mlp": 0.23396532, "epoch": 0.5005260784608447, "flos": 18624710764800.0, "grad_norm": 58.33306761231406, "language_loss": 0.79470181, "learning_rate": 2.0936312985863077e-06, "loss": 0.81218684, "num_input_tokens_seen": 178937760, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.32568359, "step": 8325, "time_per_iteration": 2.6344377994537354 }, { "auxiliary_loss_clip": 0.01482858, "auxiliary_loss_mlp": 0.00286532, "balance_loss_clip": 1.22329187, "balance_loss_mlp": 0.2491717, "epoch": 0.5005862017135126, "flos": 24859987649280.0, "grad_norm": 12.739242541349977, "language_loss": 0.79311162, "learning_rate": 2.093242262158709e-06, "loss": 0.8108055, "num_input_tokens_seen": 178957985, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.37353516, "step": 8326, "time_per_iteration": 2.694550037384033 }, { "auxiliary_loss_clip": 0.01454003, "auxiliary_loss_mlp": 0.00259559, "balance_loss_clip": 1.20804393, "balance_loss_mlp": 0.22622773, "epoch": 0.5006463249661807, "flos": 18734381965440.0, "grad_norm": 42.668106256404506, "language_loss": 0.83476615, "learning_rate": 2.0928532221953544e-06, "loss": 0.85190177, "num_input_tokens_seen": 178977070, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.33325195, "step": 8327, "time_per_iteration": 2.63102126121521 }, { "auxiliary_loss_clip": 0.01479456, "auxiliary_loss_mlp": 0.00261916, "balance_loss_clip": 1.22080278, "balance_loss_mlp": 0.22853786, "epoch": 0.5007064482188487, "flos": 13042145871360.0, "grad_norm": 20.449983269550177, "language_loss": 0.94838834, "learning_rate": 2.092464178710997e-06, "loss": 0.96580207, "num_input_tokens_seen": 178994175, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.33398438, "step": 8328, "time_per_iteration": 2.647000551223755 }, { "auxiliary_loss_clip": 0.01479471, "auxiliary_loss_mlp": 0.00286864, "balance_loss_clip": 1.21741557, "balance_loss_mlp": 0.25348586, "epoch": 0.5007665714715166, "flos": 21288671965440.0, "grad_norm": 12.713745288024715, "language_loss": 0.81151652, "learning_rate": 2.092075131720388e-06, "loss": 0.82917988, "num_input_tokens_seen": 179013710, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.33374023, "step": 8329, "time_per_iteration": 2.6821954250335693 }, { "auxiliary_loss_clip": 0.0147274, "auxiliary_loss_mlp": 0.00263914, "balance_loss_clip": 1.2178129, "balance_loss_mlp": 0.23046367, "epoch": 0.5008266947241846, "flos": 29754576374400.0, "grad_norm": 13.934878913026669, "language_loss": 0.85048699, "learning_rate": 2.091686081238281e-06, "loss": 0.86785352, "num_input_tokens_seen": 179035255, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.33447266, "step": 8330, "time_per_iteration": 2.772576332092285 }, { "auxiliary_loss_clip": 0.01226107, "auxiliary_loss_mlp": 0.00055488, "balance_loss_clip": 1.08714938, "balance_loss_mlp": 0.04862158, "epoch": 0.5008868179768525, "flos": 63557829204480.0, "grad_norm": 0.712579330952562, "language_loss": 0.55603564, "learning_rate": 2.0912970272794282e-06, "loss": 0.56885159, "num_input_tokens_seen": 179090915, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.06884766, "step": 8331, "time_per_iteration": 2.9457015991210938 }, { "auxiliary_loss_clip": 0.0145444, "auxiliary_loss_mlp": 0.0027823, "balance_loss_clip": 1.20095646, "balance_loss_mlp": 0.24508986, "epoch": 0.5009469412295205, "flos": 27375637593600.0, "grad_norm": 155.7590845590233, "language_loss": 0.70471287, "learning_rate": 2.0909079698585833e-06, "loss": 0.72203952, "num_input_tokens_seen": 179109160, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.33154297, "step": 8332, "time_per_iteration": 2.7539167404174805 }, { "auxiliary_loss_clip": 0.01457157, "auxiliary_loss_mlp": 0.0027981, "balance_loss_clip": 1.20517111, "balance_loss_mlp": 0.24889892, "epoch": 0.5010070644821885, "flos": 27378833904000.0, "grad_norm": 10.080759065047047, "language_loss": 0.80818623, "learning_rate": 2.0905189089904993e-06, "loss": 0.82555592, "num_input_tokens_seen": 179130610, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.30895996, "step": 8333, "time_per_iteration": 4.14194917678833 }, { "auxiliary_loss_clip": 0.01471103, "auxiliary_loss_mlp": 0.00303911, "balance_loss_clip": 1.2133584, "balance_loss_mlp": 0.27224934, "epoch": 0.5010671877348565, "flos": 20662748542080.0, "grad_norm": 11.10512071954758, "language_loss": 0.85264301, "learning_rate": 2.090129844689929e-06, "loss": 0.87039316, "num_input_tokens_seen": 179147860, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.31665039, "step": 8334, "time_per_iteration": 2.6392858028411865 }, { "auxiliary_loss_clip": 0.01206012, "auxiliary_loss_mlp": 0.00049304, "balance_loss_clip": 1.06585586, "balance_loss_mlp": 0.04162645, "epoch": 0.5011273109875244, "flos": 59128645000320.0, "grad_norm": 0.8870809437483181, "language_loss": 0.6232866, "learning_rate": 2.089740776971626e-06, "loss": 0.63583976, "num_input_tokens_seen": 179210490, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.07666016, "step": 8335, "time_per_iteration": 3.0867512226104736 }, { "auxiliary_loss_clip": 0.01443238, "auxiliary_loss_mlp": 0.00298873, "balance_loss_clip": 1.19701862, "balance_loss_mlp": 0.26685357, "epoch": 0.5011874342401924, "flos": 25336342840320.0, "grad_norm": 5.14581752502151, "language_loss": 0.83946002, "learning_rate": 2.0893517058503435e-06, "loss": 0.85688114, "num_input_tokens_seen": 179231360, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.32006836, "step": 8336, "time_per_iteration": 4.1414759159088135 }, { "auxiliary_loss_clip": 0.01444426, "auxiliary_loss_mlp": 0.00269448, "balance_loss_clip": 1.19411969, "balance_loss_mlp": 0.23821557, "epoch": 0.5012475574928603, "flos": 20229953569920.0, "grad_norm": 11.530746077821638, "language_loss": 0.87506306, "learning_rate": 2.088962631340836e-06, "loss": 0.89220178, "num_input_tokens_seen": 179250625, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.3125, "step": 8337, "time_per_iteration": 2.6344826221466064 }, { "auxiliary_loss_clip": 0.01460715, "auxiliary_loss_mlp": 0.00277988, "balance_loss_clip": 1.19999552, "balance_loss_mlp": 0.24608777, "epoch": 0.5013076807455283, "flos": 22710123855360.0, "grad_norm": 4.393125940812073, "language_loss": 0.89364803, "learning_rate": 2.0885735534578555e-06, "loss": 0.91103506, "num_input_tokens_seen": 179267360, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.3190918, "step": 8338, "time_per_iteration": 4.031171560287476 }, { "auxiliary_loss_clip": 0.01439608, "auxiliary_loss_mlp": 0.00290389, "balance_loss_clip": 1.18724751, "balance_loss_mlp": 0.25627089, "epoch": 0.5013678039981962, "flos": 24245161528320.0, "grad_norm": 4.067965246588167, "language_loss": 0.8942591, "learning_rate": 2.0881844722161583e-06, "loss": 0.9115591, "num_input_tokens_seen": 179289810, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.34130859, "step": 8339, "time_per_iteration": 2.6974287033081055 }, { "auxiliary_loss_clip": 0.01454003, "auxiliary_loss_mlp": 0.00276301, "balance_loss_clip": 1.20240784, "balance_loss_mlp": 0.24287513, "epoch": 0.5014279272508643, "flos": 26176688501760.0, "grad_norm": 35.51987330234699, "language_loss": 0.7763015, "learning_rate": 2.0877953876304962e-06, "loss": 0.79360455, "num_input_tokens_seen": 179310620, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.33422852, "step": 8340, "time_per_iteration": 2.6866447925567627 }, { "auxiliary_loss_clip": 0.01457896, "auxiliary_loss_mlp": 0.00297357, "balance_loss_clip": 1.20183468, "balance_loss_mlp": 0.26340666, "epoch": 0.5014880505035323, "flos": 21430446946560.0, "grad_norm": 6.4527188049597015, "language_loss": 0.85543227, "learning_rate": 2.0874062997156245e-06, "loss": 0.87298477, "num_input_tokens_seen": 179329005, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.33959961, "step": 8341, "time_per_iteration": 2.6983139514923096 }, { "auxiliary_loss_clip": 0.0145816, "auxiliary_loss_mlp": 0.00297727, "balance_loss_clip": 1.19849324, "balance_loss_mlp": 0.26492095, "epoch": 0.5015481737562002, "flos": 15770745596160.0, "grad_norm": 7.01104753151254, "language_loss": 0.97599179, "learning_rate": 2.0870172084862975e-06, "loss": 0.99355072, "num_input_tokens_seen": 179343785, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.32763672, "step": 8342, "time_per_iteration": 2.5808353424072266 }, { "auxiliary_loss_clip": 0.0144374, "auxiliary_loss_mlp": 0.00283185, "balance_loss_clip": 1.1908114, "balance_loss_mlp": 0.24994969, "epoch": 0.5016082970088682, "flos": 26830801123200.0, "grad_norm": 11.606428044103685, "language_loss": 0.82482362, "learning_rate": 2.0866281139572682e-06, "loss": 0.84209287, "num_input_tokens_seen": 179364070, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.33190918, "step": 8343, "time_per_iteration": 4.110390901565552 }, { "auxiliary_loss_clip": 0.01444448, "auxiliary_loss_mlp": 0.0027097, "balance_loss_clip": 1.19871497, "balance_loss_mlp": 0.23954621, "epoch": 0.5016684202615361, "flos": 21470595373440.0, "grad_norm": 3.092284262478727, "language_loss": 0.74420977, "learning_rate": 2.086239016143293e-06, "loss": 0.76136398, "num_input_tokens_seen": 179384225, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.31445312, "step": 8344, "time_per_iteration": 2.6959409713745117 }, { "auxiliary_loss_clip": 0.01455787, "auxiliary_loss_mlp": 0.00275591, "balance_loss_clip": 1.20144057, "balance_loss_mlp": 0.24268943, "epoch": 0.5017285435142042, "flos": 26246821806720.0, "grad_norm": 11.107452437986751, "language_loss": 0.80935889, "learning_rate": 2.0858499150591258e-06, "loss": 0.82667267, "num_input_tokens_seen": 179402595, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.32885742, "step": 8345, "time_per_iteration": 2.694019079208374 }, { "auxiliary_loss_clip": 0.01441796, "auxiliary_loss_mlp": 0.00300582, "balance_loss_clip": 1.19274426, "balance_loss_mlp": 0.26751363, "epoch": 0.5017886667668721, "flos": 20777555387520.0, "grad_norm": 431.0842443627243, "language_loss": 0.84663343, "learning_rate": 2.0854608107195203e-06, "loss": 0.86405724, "num_input_tokens_seen": 179419635, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.33105469, "step": 8346, "time_per_iteration": 2.681772470474243 }, { "auxiliary_loss_clip": 0.01439133, "auxiliary_loss_mlp": 0.00266835, "balance_loss_clip": 1.18647361, "balance_loss_mlp": 0.23479134, "epoch": 0.5018487900195401, "flos": 20156408472960.0, "grad_norm": 5.065189931772412, "language_loss": 0.75553739, "learning_rate": 2.0850717031392333e-06, "loss": 0.77259707, "num_input_tokens_seen": 179438770, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.32055664, "step": 8347, "time_per_iteration": 2.641700506210327 }, { "auxiliary_loss_clip": 0.01452719, "auxiliary_loss_mlp": 0.0028615, "balance_loss_clip": 1.1977216, "balance_loss_mlp": 0.25306955, "epoch": 0.501908913272208, "flos": 18150689957760.0, "grad_norm": 22.84304935680298, "language_loss": 0.79019868, "learning_rate": 2.0846825923330174e-06, "loss": 0.80758739, "num_input_tokens_seen": 179457475, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.33093262, "step": 8348, "time_per_iteration": 2.711578845977783 }, { "auxiliary_loss_clip": 0.01438472, "auxiliary_loss_mlp": 0.00258697, "balance_loss_clip": 1.19182217, "balance_loss_mlp": 0.2267492, "epoch": 0.501969036524876, "flos": 23112287504640.0, "grad_norm": 819.0956907181119, "language_loss": 0.79558796, "learning_rate": 2.0842934783156303e-06, "loss": 0.8125596, "num_input_tokens_seen": 179478140, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.3190918, "step": 8349, "time_per_iteration": 2.667661190032959 }, { "auxiliary_loss_clip": 0.01458528, "auxiliary_loss_mlp": 0.00289963, "balance_loss_clip": 1.19803953, "balance_loss_mlp": 0.25627461, "epoch": 0.5020291597775439, "flos": 11363214314880.0, "grad_norm": 8.94591119113137, "language_loss": 0.73775196, "learning_rate": 2.0839043611018266e-06, "loss": 0.75523686, "num_input_tokens_seen": 179494325, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.3371582, "step": 8350, "time_per_iteration": 2.6572978496551514 }, { "auxiliary_loss_clip": 0.01162616, "auxiliary_loss_mlp": 0.00111571, "balance_loss_clip": 1.02634883, "balance_loss_mlp": 0.10484753, "epoch": 0.5020892830302119, "flos": 64011094928640.0, "grad_norm": 0.81392388749973, "language_loss": 0.59552813, "learning_rate": 2.0835152407063597e-06, "loss": 0.60826993, "num_input_tokens_seen": 179553545, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06738281, "step": 8351, "time_per_iteration": 3.2867138385772705 }, { "auxiliary_loss_clip": 0.01424509, "auxiliary_loss_mlp": 0.0028604, "balance_loss_clip": 1.1771431, "balance_loss_mlp": 0.2535198, "epoch": 0.5021494062828799, "flos": 23732859801600.0, "grad_norm": 9.02671095886992, "language_loss": 0.8137697, "learning_rate": 2.0831261171439873e-06, "loss": 0.83087516, "num_input_tokens_seen": 179573645, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.32507324, "step": 8352, "time_per_iteration": 2.71761417388916 }, { "auxiliary_loss_clip": 0.0144143, "auxiliary_loss_mlp": 0.00264548, "balance_loss_clip": 1.19001138, "balance_loss_mlp": 0.23047844, "epoch": 0.5022095295355479, "flos": 21576747041280.0, "grad_norm": 3.323835666915715, "language_loss": 0.78832614, "learning_rate": 2.082736990429464e-06, "loss": 0.80538595, "num_input_tokens_seen": 179591435, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.34057617, "step": 8353, "time_per_iteration": 2.8355326652526855 }, { "auxiliary_loss_clip": 0.0144268, "auxiliary_loss_mlp": 0.00287691, "balance_loss_clip": 1.19221389, "balance_loss_mlp": 0.25321591, "epoch": 0.5022696527882159, "flos": 21397229844480.0, "grad_norm": 16.139732130577777, "language_loss": 0.83307296, "learning_rate": 2.0823478605775455e-06, "loss": 0.85037667, "num_input_tokens_seen": 179609955, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.34472656, "step": 8354, "time_per_iteration": 2.7903733253479004 }, { "auxiliary_loss_clip": 0.01424443, "auxiliary_loss_mlp": 0.00259481, "balance_loss_clip": 1.1764878, "balance_loss_mlp": 0.2266508, "epoch": 0.5023297760408838, "flos": 27160712565120.0, "grad_norm": 3.63691636857119, "language_loss": 0.7869693, "learning_rate": 2.0819587276029884e-06, "loss": 0.80380851, "num_input_tokens_seen": 179630875, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.328125, "step": 8355, "time_per_iteration": 2.8440442085266113 }, { "auxiliary_loss_clip": 0.01437408, "auxiliary_loss_mlp": 0.00287946, "balance_loss_clip": 1.18131661, "balance_loss_mlp": 0.2518968, "epoch": 0.5023898992935518, "flos": 26213820186240.0, "grad_norm": 4.155823791507584, "language_loss": 0.87978709, "learning_rate": 2.081569591520548e-06, "loss": 0.89704061, "num_input_tokens_seen": 179649835, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.3605957, "step": 8356, "time_per_iteration": 2.775402307510376 }, { "auxiliary_loss_clip": 0.01434452, "auxiliary_loss_mlp": 0.00316065, "balance_loss_clip": 1.17546904, "balance_loss_mlp": 0.28077903, "epoch": 0.5024500225462197, "flos": 13440323111040.0, "grad_norm": 6.997154238222545, "language_loss": 0.86071908, "learning_rate": 2.0811804523449803e-06, "loss": 0.87822425, "num_input_tokens_seen": 179667605, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.3527832, "step": 8357, "time_per_iteration": 2.7920563220977783 }, { "auxiliary_loss_clip": 0.01405952, "auxiliary_loss_mlp": 0.00252699, "balance_loss_clip": 1.16314209, "balance_loss_mlp": 0.22108433, "epoch": 0.5025101457988878, "flos": 21579584215680.0, "grad_norm": 2.4225416295754396, "language_loss": 0.83975923, "learning_rate": 2.0807913100910417e-06, "loss": 0.85634571, "num_input_tokens_seen": 179686910, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.31567383, "step": 8358, "time_per_iteration": 2.656294584274292 }, { "auxiliary_loss_clip": 0.01407948, "auxiliary_loss_mlp": 0.00255068, "balance_loss_clip": 1.1632781, "balance_loss_mlp": 0.22269084, "epoch": 0.5025702690515557, "flos": 24645134448000.0, "grad_norm": 22.596751682514483, "language_loss": 0.81915838, "learning_rate": 2.0804021647734887e-06, "loss": 0.83578849, "num_input_tokens_seen": 179706395, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.32348633, "step": 8359, "time_per_iteration": 2.6656970977783203 }, { "auxiliary_loss_clip": 0.01382212, "auxiliary_loss_mlp": 0.00268878, "balance_loss_clip": 1.14819217, "balance_loss_mlp": 0.2355466, "epoch": 0.5026303923042237, "flos": 22090162089600.0, "grad_norm": 12.798572337318557, "language_loss": 0.84104347, "learning_rate": 2.080013016407077e-06, "loss": 0.85755438, "num_input_tokens_seen": 179725735, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.33349609, "step": 8360, "time_per_iteration": 2.651395797729492 }, { "auxiliary_loss_clip": 0.01382706, "auxiliary_loss_mlp": 0.00258184, "balance_loss_clip": 1.14870882, "balance_loss_mlp": 0.22540191, "epoch": 0.5026905155568916, "flos": 23697200574720.0, "grad_norm": 3.484702836986478, "language_loss": 0.8360765, "learning_rate": 2.0796238650065645e-06, "loss": 0.85248542, "num_input_tokens_seen": 179746150, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.328125, "step": 8361, "time_per_iteration": 2.6885247230529785 }, { "auxiliary_loss_clip": 0.01406641, "auxiliary_loss_mlp": 0.002884, "balance_loss_clip": 1.15981483, "balance_loss_mlp": 0.25585574, "epoch": 0.5027506388095596, "flos": 25812410722560.0, "grad_norm": 5.628477656703242, "language_loss": 0.92275918, "learning_rate": 2.0792347105867065e-06, "loss": 0.93970954, "num_input_tokens_seen": 179767550, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.32543945, "step": 8362, "time_per_iteration": 2.7093560695648193 }, { "auxiliary_loss_clip": 0.01407159, "auxiliary_loss_mlp": 0.00285556, "balance_loss_clip": 1.1584506, "balance_loss_mlp": 0.25122365, "epoch": 0.5028107620622275, "flos": 27526606456320.0, "grad_norm": 3.4500016642329285, "language_loss": 0.84347016, "learning_rate": 2.0788455531622605e-06, "loss": 0.86039728, "num_input_tokens_seen": 179790075, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.34326172, "step": 8363, "time_per_iteration": 2.85449481010437 }, { "auxiliary_loss_clip": 0.01392124, "auxiliary_loss_mlp": 0.00279204, "balance_loss_clip": 1.15855956, "balance_loss_mlp": 0.24487191, "epoch": 0.5028708853148955, "flos": 24534278098560.0, "grad_norm": 5.23196617032345, "language_loss": 0.82314289, "learning_rate": 2.0784563927479838e-06, "loss": 0.83985615, "num_input_tokens_seen": 179806515, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.34350586, "step": 8364, "time_per_iteration": 2.6654088497161865 }, { "auxiliary_loss_clip": 0.01376716, "auxiliary_loss_mlp": 0.0024937, "balance_loss_clip": 1.14213514, "balance_loss_mlp": 0.21931693, "epoch": 0.5029310085675635, "flos": 20813609664000.0, "grad_norm": 115.82632450621476, "language_loss": 0.75418043, "learning_rate": 2.0780672293586317e-06, "loss": 0.77044129, "num_input_tokens_seen": 179826450, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.30078125, "step": 8365, "time_per_iteration": 2.640221357345581 }, { "auxiliary_loss_clip": 0.01390775, "auxiliary_loss_mlp": 0.00305159, "balance_loss_clip": 1.14539349, "balance_loss_mlp": 0.27180415, "epoch": 0.5029911318202315, "flos": 22342470197760.0, "grad_norm": 12.794447933538077, "language_loss": 0.79883558, "learning_rate": 2.0776780630089635e-06, "loss": 0.81579489, "num_input_tokens_seen": 179846770, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.33374023, "step": 8366, "time_per_iteration": 2.681597948074341 }, { "auxiliary_loss_clip": 0.01379691, "auxiliary_loss_mlp": 0.0030785, "balance_loss_clip": 1.13918209, "balance_loss_mlp": 0.2751869, "epoch": 0.5030512550728995, "flos": 24352713826560.0, "grad_norm": 112.45764053623076, "language_loss": 0.84721607, "learning_rate": 2.077288893713735e-06, "loss": 0.86409152, "num_input_tokens_seen": 179866585, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.32653809, "step": 8367, "time_per_iteration": 2.70650315284729 }, { "auxiliary_loss_clip": 0.01364695, "auxiliary_loss_mlp": 0.00242232, "balance_loss_clip": 1.13126302, "balance_loss_mlp": 0.21182188, "epoch": 0.5031113783255674, "flos": 18259930195200.0, "grad_norm": 14.693830472751841, "language_loss": 0.77362263, "learning_rate": 2.0768997214877035e-06, "loss": 0.78969187, "num_input_tokens_seen": 179885575, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.30407715, "step": 8368, "time_per_iteration": 2.6716814041137695 }, { "auxiliary_loss_clip": 0.01119178, "auxiliary_loss_mlp": 0.00074172, "balance_loss_clip": 0.99295616, "balance_loss_mlp": 0.06697126, "epoch": 0.5031715015782354, "flos": 57253173200640.0, "grad_norm": 0.8480609048502558, "language_loss": 0.6292659, "learning_rate": 2.0765105463456274e-06, "loss": 0.64119947, "num_input_tokens_seen": 179939650, "router_z_loss_clip": 1.265625, "router_z_loss_mlp": 0.07177734, "step": 8369, "time_per_iteration": 3.086191415786743 }, { "auxiliary_loss_clip": 0.01370715, "auxiliary_loss_mlp": 0.00288309, "balance_loss_clip": 1.13054323, "balance_loss_mlp": 0.25652814, "epoch": 0.5032316248309033, "flos": 27527360641920.0, "grad_norm": 14.984126726126169, "language_loss": 0.68240643, "learning_rate": 2.076121368302263e-06, "loss": 0.69899666, "num_input_tokens_seen": 179961765, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.31787109, "step": 8370, "time_per_iteration": 2.748309373855591 }, { "auxiliary_loss_clip": 0.01364336, "auxiliary_loss_mlp": 0.00276103, "balance_loss_clip": 1.12714505, "balance_loss_mlp": 0.2452274, "epoch": 0.5032917480835714, "flos": 34495825939200.0, "grad_norm": 72.97394101645965, "language_loss": 0.74241924, "learning_rate": 2.0757321873723695e-06, "loss": 0.75882357, "num_input_tokens_seen": 179983015, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.30908203, "step": 8371, "time_per_iteration": 2.835634708404541 }, { "auxiliary_loss_clip": 0.01377714, "auxiliary_loss_mlp": 0.00306002, "balance_loss_clip": 1.13796294, "balance_loss_mlp": 0.27135941, "epoch": 0.5033518713362393, "flos": 33656773167360.0, "grad_norm": 6.120533729314986, "language_loss": 0.74239898, "learning_rate": 2.0753430035707042e-06, "loss": 0.75923622, "num_input_tokens_seen": 180003210, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.34667969, "step": 8372, "time_per_iteration": 2.7883994579315186 }, { "auxiliary_loss_clip": 0.01364903, "auxiliary_loss_mlp": 0.00272482, "balance_loss_clip": 1.12534308, "balance_loss_mlp": 0.23855546, "epoch": 0.5034119945889073, "flos": 28185495586560.0, "grad_norm": 3.5459312923437203, "language_loss": 0.73071885, "learning_rate": 2.0749538169120235e-06, "loss": 0.74709266, "num_input_tokens_seen": 180025530, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.33911133, "step": 8373, "time_per_iteration": 2.7610809803009033 }, { "auxiliary_loss_clip": 0.01357153, "auxiliary_loss_mlp": 0.00280124, "balance_loss_clip": 1.12199903, "balance_loss_mlp": 0.24705526, "epoch": 0.5034721178415752, "flos": 21358697529600.0, "grad_norm": 3.636721584583758, "language_loss": 0.80362999, "learning_rate": 2.0745646274110872e-06, "loss": 0.82000268, "num_input_tokens_seen": 180043180, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.33093262, "step": 8374, "time_per_iteration": 2.767807960510254 }, { "auxiliary_loss_clip": 0.0136588, "auxiliary_loss_mlp": 0.00290248, "balance_loss_clip": 1.12537336, "balance_loss_mlp": 0.2564404, "epoch": 0.5035322410942432, "flos": 22674823764480.0, "grad_norm": 12.29258553993945, "language_loss": 0.74644792, "learning_rate": 2.0741754350826525e-06, "loss": 0.76300919, "num_input_tokens_seen": 180062905, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.33789062, "step": 8375, "time_per_iteration": 4.127256631851196 }, { "auxiliary_loss_clip": 0.01374613, "auxiliary_loss_mlp": 0.00275439, "balance_loss_clip": 1.13195527, "balance_loss_mlp": 0.24158344, "epoch": 0.5035923643469111, "flos": 19828723674240.0, "grad_norm": 15.891379848684567, "language_loss": 0.87726533, "learning_rate": 2.0737862399414777e-06, "loss": 0.89376581, "num_input_tokens_seen": 180082000, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.33837891, "step": 8376, "time_per_iteration": 2.7629427909851074 }, { "auxiliary_loss_clip": 0.01367027, "auxiliary_loss_mlp": 0.00311993, "balance_loss_clip": 1.12624776, "balance_loss_mlp": 0.27928203, "epoch": 0.5036524875995791, "flos": 30514625182080.0, "grad_norm": 3.1061646355548596, "language_loss": 0.68027282, "learning_rate": 2.0733970420023213e-06, "loss": 0.69706303, "num_input_tokens_seen": 180101340, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.32739258, "step": 8377, "time_per_iteration": 2.7158868312835693 }, { "auxiliary_loss_clip": 0.01367758, "auxiliary_loss_mlp": 0.0026476, "balance_loss_clip": 1.13031924, "balance_loss_mlp": 0.23314574, "epoch": 0.5037126108522471, "flos": 14720574637440.0, "grad_norm": 11.679831293764575, "language_loss": 0.86198485, "learning_rate": 2.0730078412799425e-06, "loss": 0.87831008, "num_input_tokens_seen": 180119160, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.31616211, "step": 8378, "time_per_iteration": 4.136762857437134 }, { "auxiliary_loss_clip": 0.01361199, "auxiliary_loss_mlp": 0.00279756, "balance_loss_clip": 1.12648749, "balance_loss_mlp": 0.24754606, "epoch": 0.5037727341049151, "flos": 25297702784640.0, "grad_norm": 28.28919708900393, "language_loss": 0.81032073, "learning_rate": 2.0726186377890985e-06, "loss": 0.82673037, "num_input_tokens_seen": 180138730, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.32226562, "step": 8379, "time_per_iteration": 2.7212252616882324 }, { "auxiliary_loss_clip": 0.01351006, "auxiliary_loss_mlp": 0.00272672, "balance_loss_clip": 1.11803222, "balance_loss_mlp": 0.24122481, "epoch": 0.5038328573575831, "flos": 28541764632960.0, "grad_norm": 2.6844512212374085, "language_loss": 0.75016129, "learning_rate": 2.072229431544548e-06, "loss": 0.76639807, "num_input_tokens_seen": 180158810, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.31469727, "step": 8380, "time_per_iteration": 4.211390972137451 }, { "auxiliary_loss_clip": 0.01344291, "auxiliary_loss_mlp": 0.00259961, "balance_loss_clip": 1.11606276, "balance_loss_mlp": 0.22901402, "epoch": 0.503892980610251, "flos": 31649869503360.0, "grad_norm": 1918.9575955827543, "language_loss": 0.69356614, "learning_rate": 2.071840222561051e-06, "loss": 0.70960867, "num_input_tokens_seen": 180179700, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.30944824, "step": 8381, "time_per_iteration": 2.7733652591705322 }, { "auxiliary_loss_clip": 0.01347557, "auxiliary_loss_mlp": 0.00248709, "balance_loss_clip": 1.11762142, "balance_loss_mlp": 0.21733285, "epoch": 0.503953103862919, "flos": 27089358197760.0, "grad_norm": 3.9252661618166607, "language_loss": 0.73879194, "learning_rate": 2.071451010853365e-06, "loss": 0.75475466, "num_input_tokens_seen": 180199890, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.31396484, "step": 8382, "time_per_iteration": 2.7511401176452637 }, { "auxiliary_loss_clip": 0.01361728, "auxiliary_loss_mlp": 0.00304748, "balance_loss_clip": 1.11908364, "balance_loss_mlp": 0.26924735, "epoch": 0.5040132271155869, "flos": 15632957024640.0, "grad_norm": 14.514916264859897, "language_loss": 0.70208502, "learning_rate": 2.0710617964362506e-06, "loss": 0.71874982, "num_input_tokens_seen": 180217840, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.35498047, "step": 8383, "time_per_iteration": 2.6751596927642822 }, { "auxiliary_loss_clip": 0.01340413, "auxiliary_loss_mlp": 0.00246392, "balance_loss_clip": 1.11077595, "balance_loss_mlp": 0.21639854, "epoch": 0.504073350368255, "flos": 13590106824960.0, "grad_norm": 11.228611075296522, "language_loss": 0.75186872, "learning_rate": 2.070672579324465e-06, "loss": 0.76773679, "num_input_tokens_seen": 180236465, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.30029297, "step": 8384, "time_per_iteration": 2.674572229385376 }, { "auxiliary_loss_clip": 0.01329956, "auxiliary_loss_mlp": 0.00250181, "balance_loss_clip": 1.1006273, "balance_loss_mlp": 0.21824472, "epoch": 0.5041334736209229, "flos": 29058160510080.0, "grad_norm": 1152.4828387266602, "language_loss": 0.77345878, "learning_rate": 2.0702833595327674e-06, "loss": 0.78926015, "num_input_tokens_seen": 180258025, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.31970215, "step": 8385, "time_per_iteration": 4.138592481613159 }, { "auxiliary_loss_clip": 0.01339258, "auxiliary_loss_mlp": 0.00256685, "balance_loss_clip": 1.1112566, "balance_loss_mlp": 0.22653654, "epoch": 0.5041935968735909, "flos": 24608361899520.0, "grad_norm": 2.701822788894995, "language_loss": 0.88682055, "learning_rate": 2.069894137075919e-06, "loss": 0.90277994, "num_input_tokens_seen": 180277825, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.3013916, "step": 8386, "time_per_iteration": 2.6785714626312256 }, { "auxiliary_loss_clip": 0.01324556, "auxiliary_loss_mlp": 0.00266999, "balance_loss_clip": 1.09620452, "balance_loss_mlp": 0.23443097, "epoch": 0.5042537201262588, "flos": 26286934320000.0, "grad_norm": 14.559947453353955, "language_loss": 0.72368765, "learning_rate": 2.0695049119686766e-06, "loss": 0.73960316, "num_input_tokens_seen": 180300465, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.32568359, "step": 8387, "time_per_iteration": 2.713618040084839 }, { "auxiliary_loss_clip": 0.01320616, "auxiliary_loss_mlp": 0.00248018, "balance_loss_clip": 1.09652853, "balance_loss_mlp": 0.2180251, "epoch": 0.5043138433789268, "flos": 22017371178240.0, "grad_norm": 146.56702360454182, "language_loss": 0.86119992, "learning_rate": 2.0691156842258016e-06, "loss": 0.87688625, "num_input_tokens_seen": 180321050, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.29980469, "step": 8388, "time_per_iteration": 2.6889700889587402 }, { "auxiliary_loss_clip": 0.01319442, "auxiliary_loss_mlp": 0.00264279, "balance_loss_clip": 1.09268904, "balance_loss_mlp": 0.23154445, "epoch": 0.5043739666315947, "flos": 28767104605440.0, "grad_norm": 11.892249356228541, "language_loss": 0.77263486, "learning_rate": 2.0687264538620537e-06, "loss": 0.78847206, "num_input_tokens_seen": 180338870, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.32739258, "step": 8389, "time_per_iteration": 2.7304751873016357 }, { "auxiliary_loss_clip": 0.01327682, "auxiliary_loss_mlp": 0.00229477, "balance_loss_clip": 1.0976367, "balance_loss_mlp": 0.19671869, "epoch": 0.5044340898842627, "flos": 27599253713280.0, "grad_norm": 10.343852951834906, "language_loss": 0.75240803, "learning_rate": 2.068337220892191e-06, "loss": 0.76797962, "num_input_tokens_seen": 180361285, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.32763672, "step": 8390, "time_per_iteration": 2.715522289276123 }, { "auxiliary_loss_clip": 0.01237025, "auxiliary_loss_mlp": 0.00033314, "balance_loss_clip": 1.08259892, "balance_loss_mlp": 0.02640026, "epoch": 0.5044942131369307, "flos": 67458050749440.0, "grad_norm": 0.8390600312595843, "language_loss": 0.52454656, "learning_rate": 2.067947985330974e-06, "loss": 0.53724998, "num_input_tokens_seen": 180415170, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.06933594, "step": 8391, "time_per_iteration": 2.9326982498168945 }, { "auxiliary_loss_clip": 0.01241243, "auxiliary_loss_mlp": 0.00101871, "balance_loss_clip": 1.08761764, "balance_loss_mlp": 0.09262076, "epoch": 0.5045543363895987, "flos": 58630849390080.0, "grad_norm": 0.8064278320968578, "language_loss": 0.60452712, "learning_rate": 2.0675587471931628e-06, "loss": 0.61795831, "num_input_tokens_seen": 180468060, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.09228516, "step": 8392, "time_per_iteration": 2.9434127807617188 }, { "auxiliary_loss_clip": 0.01298103, "auxiliary_loss_mlp": 0.00234382, "balance_loss_clip": 1.07763362, "balance_loss_mlp": 0.20245816, "epoch": 0.5046144596422667, "flos": 22526620248960.0, "grad_norm": 34.64253621057198, "language_loss": 0.91606176, "learning_rate": 2.067169506493517e-06, "loss": 0.93138665, "num_input_tokens_seen": 180486610, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.31860352, "step": 8393, "time_per_iteration": 2.6683945655822754 }, { "auxiliary_loss_clip": 0.01296211, "auxiliary_loss_mlp": 0.00246869, "balance_loss_clip": 1.07896781, "balance_loss_mlp": 0.21577933, "epoch": 0.5046745828949346, "flos": 27454246508160.0, "grad_norm": 57.950754431525986, "language_loss": 0.60318476, "learning_rate": 2.0667802632467974e-06, "loss": 0.61861551, "num_input_tokens_seen": 180508135, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.31103516, "step": 8394, "time_per_iteration": 2.8524320125579834 }, { "auxiliary_loss_clip": 0.01308386, "auxiliary_loss_mlp": 0.00232549, "balance_loss_clip": 1.08704376, "balance_loss_mlp": 0.19950414, "epoch": 0.5047347061476026, "flos": 17274541415040.0, "grad_norm": 3.6634201161507702, "language_loss": 0.82114863, "learning_rate": 2.0663910174677627e-06, "loss": 0.83655798, "num_input_tokens_seen": 180527000, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.33032227, "step": 8395, "time_per_iteration": 2.774489164352417 }, { "auxiliary_loss_clip": 0.01287607, "auxiliary_loss_mlp": 0.00238085, "balance_loss_clip": 1.06937075, "balance_loss_mlp": 0.20637567, "epoch": 0.5047948294002705, "flos": 16649515831680.0, "grad_norm": 174.44317691999652, "language_loss": 0.76069146, "learning_rate": 2.0660017691711737e-06, "loss": 0.77594841, "num_input_tokens_seen": 180544715, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.31713867, "step": 8396, "time_per_iteration": 2.666508436203003 }, { "auxiliary_loss_clip": 0.01295268, "auxiliary_loss_mlp": 0.00224792, "balance_loss_clip": 1.07954264, "balance_loss_mlp": 0.19453719, "epoch": 0.5048549526529386, "flos": 26865706164480.0, "grad_norm": 8.88449262559083, "language_loss": 0.85148978, "learning_rate": 2.065612518371792e-06, "loss": 0.8666904, "num_input_tokens_seen": 180565365, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.30273438, "step": 8397, "time_per_iteration": 2.691591739654541 }, { "auxiliary_loss_clip": 0.0127433, "auxiliary_loss_mlp": 0.00226321, "balance_loss_clip": 1.06346059, "balance_loss_mlp": 0.19721037, "epoch": 0.5049150759056065, "flos": 21833939399040.0, "grad_norm": 15.824481504938332, "language_loss": 0.71582836, "learning_rate": 2.065223265084376e-06, "loss": 0.73083484, "num_input_tokens_seen": 180586670, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.29101562, "step": 8398, "time_per_iteration": 2.64906644821167 }, { "auxiliary_loss_clip": 0.0128325, "auxiliary_loss_mlp": 0.0025105, "balance_loss_clip": 1.06384206, "balance_loss_mlp": 0.21908997, "epoch": 0.5049751991582745, "flos": 21685807710720.0, "grad_norm": 9.215987286745595, "language_loss": 0.78676176, "learning_rate": 2.064834009323688e-06, "loss": 0.80210483, "num_input_tokens_seen": 180605085, "router_z_loss_clip": 2.19238281, "router_z_loss_mlp": 0.31982422, "step": 8399, "time_per_iteration": 2.6782491207122803 }, { "auxiliary_loss_clip": 0.01292257, "auxiliary_loss_mlp": 0.00277289, "balance_loss_clip": 1.06679845, "balance_loss_mlp": 0.24481633, "epoch": 0.5050353224109424, "flos": 21359379888000.0, "grad_norm": 680.5594893330084, "language_loss": 0.88688958, "learning_rate": 2.0644447511044878e-06, "loss": 0.90258509, "num_input_tokens_seen": 180624370, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.32470703, "step": 8400, "time_per_iteration": 2.7447712421417236 }, { "auxiliary_loss_clip": 0.01281752, "auxiliary_loss_mlp": 0.00221532, "balance_loss_clip": 1.069098, "balance_loss_mlp": 0.189798, "epoch": 0.5050954456636104, "flos": 22820082364800.0, "grad_norm": 6.898512323047323, "language_loss": 0.8476572, "learning_rate": 2.0640554904415362e-06, "loss": 0.86268997, "num_input_tokens_seen": 180642450, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.31713867, "step": 8401, "time_per_iteration": 2.6519651412963867 }, { "auxiliary_loss_clip": 0.01291593, "auxiliary_loss_mlp": 0.00233191, "balance_loss_clip": 1.06993628, "balance_loss_mlp": 0.20212492, "epoch": 0.5051555689162783, "flos": 30448226891520.0, "grad_norm": 25.324071544169996, "language_loss": 0.78536773, "learning_rate": 2.063666227349593e-06, "loss": 0.80061555, "num_input_tokens_seen": 180665250, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.31103516, "step": 8402, "time_per_iteration": 2.7691829204559326 }, { "auxiliary_loss_clip": 0.0128928, "auxiliary_loss_mlp": 0.00271506, "balance_loss_clip": 1.07080913, "balance_loss_mlp": 0.2388902, "epoch": 0.5052156921689464, "flos": 21287953693440.0, "grad_norm": 84.75088042004353, "language_loss": 0.76057684, "learning_rate": 2.063276961843422e-06, "loss": 0.77618468, "num_input_tokens_seen": 180687425, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.32617188, "step": 8403, "time_per_iteration": 2.7064619064331055 }, { "auxiliary_loss_clip": 0.01277558, "auxiliary_loss_mlp": 0.0025, "balance_loss_clip": 1.063375, "balance_loss_mlp": 0.21801579, "epoch": 0.5052758154216143, "flos": 25081305298560.0, "grad_norm": 13.505889439336944, "language_loss": 0.91634607, "learning_rate": 2.062887693937781e-06, "loss": 0.93162161, "num_input_tokens_seen": 180708725, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.31945801, "step": 8404, "time_per_iteration": 2.74446702003479 }, { "auxiliary_loss_clip": 0.01291649, "auxiliary_loss_mlp": 0.00217843, "balance_loss_clip": 1.07705975, "balance_loss_mlp": 0.18728921, "epoch": 0.5053359386742823, "flos": 20885502735360.0, "grad_norm": 13.674764815726505, "language_loss": 0.81169021, "learning_rate": 2.0624984236474322e-06, "loss": 0.82678515, "num_input_tokens_seen": 180727990, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.30566406, "step": 8405, "time_per_iteration": 2.685103178024292 }, { "auxiliary_loss_clip": 0.01289964, "auxiliary_loss_mlp": 0.00263733, "balance_loss_clip": 1.06908095, "balance_loss_mlp": 0.23059301, "epoch": 0.5053960619269503, "flos": 37743335493120.0, "grad_norm": 7.734724563451844, "language_loss": 0.81798649, "learning_rate": 2.0621091509871378e-06, "loss": 0.83352345, "num_input_tokens_seen": 180749765, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.33154297, "step": 8406, "time_per_iteration": 2.8328564167022705 }, { "auxiliary_loss_clip": 0.01271907, "auxiliary_loss_mlp": 0.002196, "balance_loss_clip": 1.05990112, "balance_loss_mlp": 0.18831968, "epoch": 0.5054561851796182, "flos": 23513840622720.0, "grad_norm": 6.343219989777322, "language_loss": 0.8423081, "learning_rate": 2.0617198759716568e-06, "loss": 0.85722321, "num_input_tokens_seen": 180769580, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.3125, "step": 8407, "time_per_iteration": 2.6477203369140625 }, { "auxiliary_loss_clip": 0.01278034, "auxiliary_loss_mlp": 0.00228032, "balance_loss_clip": 1.06292152, "balance_loss_mlp": 0.1969423, "epoch": 0.5055163084322862, "flos": 30410233280640.0, "grad_norm": 3.134876633636941, "language_loss": 0.73308778, "learning_rate": 2.0613305986157535e-06, "loss": 0.74814844, "num_input_tokens_seen": 180790295, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.31103516, "step": 8408, "time_per_iteration": 2.716315984725952 }, { "auxiliary_loss_clip": 0.01292725, "auxiliary_loss_mlp": 0.00284335, "balance_loss_clip": 1.07443738, "balance_loss_mlp": 0.2509563, "epoch": 0.5055764316849541, "flos": 20259651139200.0, "grad_norm": 13.949250419931529, "language_loss": 0.7124629, "learning_rate": 2.0609413189341865e-06, "loss": 0.72823352, "num_input_tokens_seen": 180807875, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.33374023, "step": 8409, "time_per_iteration": 2.653784990310669 }, { "auxiliary_loss_clip": 0.01265055, "auxiliary_loss_mlp": 0.00225527, "balance_loss_clip": 1.05632222, "balance_loss_mlp": 0.19372171, "epoch": 0.5056365549376222, "flos": 26070895969920.0, "grad_norm": 9.970262880830685, "language_loss": 0.7578913, "learning_rate": 2.0605520369417193e-06, "loss": 0.77279711, "num_input_tokens_seen": 180831300, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.31811523, "step": 8410, "time_per_iteration": 2.7240092754364014 }, { "auxiliary_loss_clip": 0.01276815, "auxiliary_loss_mlp": 0.00232273, "balance_loss_clip": 1.06319821, "balance_loss_mlp": 0.20274523, "epoch": 0.5056966781902901, "flos": 19279074781440.0, "grad_norm": 35.85111967493781, "language_loss": 0.84981918, "learning_rate": 2.060162752653113e-06, "loss": 0.86491007, "num_input_tokens_seen": 180849055, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.29553223, "step": 8411, "time_per_iteration": 2.6339781284332275 }, { "auxiliary_loss_clip": 0.01282708, "auxiliary_loss_mlp": 0.00258387, "balance_loss_clip": 1.06480885, "balance_loss_mlp": 0.22527042, "epoch": 0.5057568014429581, "flos": 21323325611520.0, "grad_norm": 18.935798075865094, "language_loss": 0.89281088, "learning_rate": 2.0597734660831285e-06, "loss": 0.90822184, "num_input_tokens_seen": 180867395, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.33105469, "step": 8412, "time_per_iteration": 2.654064178466797 }, { "auxiliary_loss_clip": 0.01283012, "auxiliary_loss_mlp": 0.0021283, "balance_loss_clip": 1.06787515, "balance_loss_mlp": 0.18200272, "epoch": 0.505816924695626, "flos": 17493596507520.0, "grad_norm": 24.9287197487472, "language_loss": 0.89707398, "learning_rate": 2.0593841772465283e-06, "loss": 0.91203249, "num_input_tokens_seen": 180886670, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.30834961, "step": 8413, "time_per_iteration": 2.674069881439209 }, { "auxiliary_loss_clip": 0.01286004, "auxiliary_loss_mlp": 0.00245485, "balance_loss_clip": 1.0653584, "balance_loss_mlp": 0.21344152, "epoch": 0.505877047948294, "flos": 21142084561920.0, "grad_norm": 73.25322111356229, "language_loss": 0.89396906, "learning_rate": 2.0589948861580737e-06, "loss": 0.909284, "num_input_tokens_seen": 180904645, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.32055664, "step": 8414, "time_per_iteration": 2.6650896072387695 }, { "auxiliary_loss_clip": 0.01269854, "auxiliary_loss_mlp": 0.00230914, "balance_loss_clip": 1.05567789, "balance_loss_mlp": 0.2001102, "epoch": 0.5059371712009619, "flos": 36350036887680.0, "grad_norm": 153.13119586632106, "language_loss": 0.71105802, "learning_rate": 2.058605592832528e-06, "loss": 0.7260657, "num_input_tokens_seen": 180922340, "router_z_loss_clip": 2.13964844, "router_z_loss_mlp": 0.30822754, "step": 8415, "time_per_iteration": 2.8225138187408447 }, { "auxiliary_loss_clip": 0.01282345, "auxiliary_loss_mlp": 0.00230823, "balance_loss_clip": 1.06353426, "balance_loss_mlp": 0.19820763, "epoch": 0.50599729445363, "flos": 22673387220480.0, "grad_norm": 176.68633009268746, "language_loss": 0.8874706, "learning_rate": 2.0582162972846515e-06, "loss": 0.90260226, "num_input_tokens_seen": 180941350, "router_z_loss_clip": 2.18652344, "router_z_loss_mlp": 0.32641602, "step": 8416, "time_per_iteration": 2.662322998046875 }, { "auxiliary_loss_clip": 0.0127175, "auxiliary_loss_mlp": 0.00208749, "balance_loss_clip": 1.06474364, "balance_loss_mlp": 0.18026978, "epoch": 0.5060574177062979, "flos": 22747866071040.0, "grad_norm": 6.46068441718129, "language_loss": 0.870646, "learning_rate": 2.0578269995292078e-06, "loss": 0.88545102, "num_input_tokens_seen": 180960720, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.28479004, "step": 8417, "time_per_iteration": 2.749983310699463 }, { "auxiliary_loss_clip": 0.0126459, "auxiliary_loss_mlp": 0.00220104, "balance_loss_clip": 1.05543399, "balance_loss_mlp": 0.18813223, "epoch": 0.5061175409589659, "flos": 21653201139840.0, "grad_norm": 76.05522947182168, "language_loss": 0.70037484, "learning_rate": 2.0574376995809588e-06, "loss": 0.71522182, "num_input_tokens_seen": 180979725, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.31982422, "step": 8418, "time_per_iteration": 4.069144248962402 }, { "auxiliary_loss_clip": 0.01284723, "auxiliary_loss_mlp": 0.00223606, "balance_loss_clip": 1.06500876, "balance_loss_mlp": 0.19263516, "epoch": 0.5061776642116339, "flos": 21616249023360.0, "grad_norm": 17.424369930228274, "language_loss": 0.86612695, "learning_rate": 2.0570483974546653e-06, "loss": 0.88121027, "num_input_tokens_seen": 180998980, "router_z_loss_clip": 2.20019531, "router_z_loss_mlp": 0.30981445, "step": 8419, "time_per_iteration": 2.6608774662017822 }, { "auxiliary_loss_clip": 0.01290748, "auxiliary_loss_mlp": 0.00251939, "balance_loss_clip": 1.06811833, "balance_loss_mlp": 0.21844092, "epoch": 0.5062377874643018, "flos": 24426294837120.0, "grad_norm": 16.520824183916023, "language_loss": 0.86132622, "learning_rate": 2.0566590931650917e-06, "loss": 0.87675309, "num_input_tokens_seen": 181019165, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.33520508, "step": 8420, "time_per_iteration": 4.131247282028198 }, { "auxiliary_loss_clip": 0.01284361, "auxiliary_loss_mlp": 0.00237252, "balance_loss_clip": 1.0646143, "balance_loss_mlp": 0.20206141, "epoch": 0.5062979107169698, "flos": 22524429519360.0, "grad_norm": 6.45836985693877, "language_loss": 0.86403877, "learning_rate": 2.056269786726999e-06, "loss": 0.87925488, "num_input_tokens_seen": 181037110, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.35180664, "step": 8421, "time_per_iteration": 2.6373038291931152 }, { "auxiliary_loss_clip": 0.01271737, "auxiliary_loss_mlp": 0.0021815, "balance_loss_clip": 1.06056535, "balance_loss_mlp": 0.18570112, "epoch": 0.5063580339696377, "flos": 24571984400640.0, "grad_norm": 6.847833101204108, "language_loss": 0.72525066, "learning_rate": 2.0558804781551512e-06, "loss": 0.74014956, "num_input_tokens_seen": 181057775, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.32446289, "step": 8422, "time_per_iteration": 4.045825004577637 }, { "auxiliary_loss_clip": 0.01275099, "auxiliary_loss_mlp": 0.00220256, "balance_loss_clip": 1.06347668, "balance_loss_mlp": 0.18799761, "epoch": 0.5064181572223058, "flos": 22596143022720.0, "grad_norm": 9.515918847386917, "language_loss": 0.89206892, "learning_rate": 2.05549116746431e-06, "loss": 0.90702248, "num_input_tokens_seen": 181078260, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.32250977, "step": 8423, "time_per_iteration": 2.7575416564941406 }, { "auxiliary_loss_clip": 0.01276031, "auxiliary_loss_mlp": 0.00211053, "balance_loss_clip": 1.06067944, "balance_loss_mlp": 0.17567217, "epoch": 0.5064782804749737, "flos": 25994944661760.0, "grad_norm": 19.400621431652336, "language_loss": 0.851318, "learning_rate": 2.055101854669237e-06, "loss": 0.86618888, "num_input_tokens_seen": 181098755, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.35375977, "step": 8424, "time_per_iteration": 2.6821236610412598 }, { "auxiliary_loss_clip": 0.0126353, "auxiliary_loss_mlp": 0.00200404, "balance_loss_clip": 1.05608869, "balance_loss_mlp": 0.16611984, "epoch": 0.5065384037276417, "flos": 28553041503360.0, "grad_norm": 67.379192090613, "language_loss": 0.75590849, "learning_rate": 2.0547125397846975e-06, "loss": 0.77054781, "num_input_tokens_seen": 181121570, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.34301758, "step": 8425, "time_per_iteration": 2.7815101146698 }, { "auxiliary_loss_clip": 0.01271758, "auxiliary_loss_mlp": 0.00197143, "balance_loss_clip": 1.05687904, "balance_loss_mlp": 0.16433649, "epoch": 0.5065985269803096, "flos": 22966023323520.0, "grad_norm": 5.34010783961367, "language_loss": 0.85942918, "learning_rate": 2.0543232228254524e-06, "loss": 0.87411821, "num_input_tokens_seen": 181140240, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.328125, "step": 8426, "time_per_iteration": 2.6267430782318115 }, { "auxiliary_loss_clip": 0.01267603, "auxiliary_loss_mlp": 0.00201071, "balance_loss_clip": 1.0580219, "balance_loss_mlp": 0.16790724, "epoch": 0.5066586502329776, "flos": 21608563512960.0, "grad_norm": 51.651297328867166, "language_loss": 0.86609602, "learning_rate": 2.053933903806265e-06, "loss": 0.88078272, "num_input_tokens_seen": 181158630, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.33129883, "step": 8427, "time_per_iteration": 4.091872692108154 }, { "auxiliary_loss_clip": 0.01277237, "auxiliary_loss_mlp": 0.00202079, "balance_loss_clip": 1.05956757, "balance_loss_mlp": 0.16445683, "epoch": 0.5067187734856455, "flos": 20339912079360.0, "grad_norm": 8.322531758877448, "language_loss": 0.79537261, "learning_rate": 2.0535445827418997e-06, "loss": 0.81016582, "num_input_tokens_seen": 181176405, "router_z_loss_clip": 2.17675781, "router_z_loss_mlp": 0.3762207, "step": 8428, "time_per_iteration": 2.661098003387451 }, { "auxiliary_loss_clip": 0.01264784, "auxiliary_loss_mlp": 0.00210792, "balance_loss_clip": 1.05344355, "balance_loss_mlp": 0.17791384, "epoch": 0.5067788967383136, "flos": 28841080665600.0, "grad_norm": 4.657257563646165, "language_loss": 0.8974666, "learning_rate": 2.0531552596471168e-06, "loss": 0.91222239, "num_input_tokens_seen": 181197595, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.32885742, "step": 8429, "time_per_iteration": 2.6989338397979736 }, { "auxiliary_loss_clip": 0.01286037, "auxiliary_loss_mlp": 0.00228777, "balance_loss_clip": 1.0639317, "balance_loss_mlp": 0.18946135, "epoch": 0.5068390199909815, "flos": 32450174478720.0, "grad_norm": 4.090888256715769, "language_loss": 0.81045282, "learning_rate": 2.052765934536682e-06, "loss": 0.82560098, "num_input_tokens_seen": 181218560, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.39331055, "step": 8430, "time_per_iteration": 2.7299647331237793 }, { "auxiliary_loss_clip": 0.01263995, "auxiliary_loss_mlp": 0.00201315, "balance_loss_clip": 1.05597901, "balance_loss_mlp": 0.16850869, "epoch": 0.5068991432436495, "flos": 23146582014720.0, "grad_norm": 4.9730636051809665, "language_loss": 0.8396371, "learning_rate": 2.0523766074253575e-06, "loss": 0.85429025, "num_input_tokens_seen": 181237095, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.328125, "step": 8431, "time_per_iteration": 2.642548084259033 }, { "auxiliary_loss_clip": 0.01271276, "auxiliary_loss_mlp": 0.00215306, "balance_loss_clip": 1.06029201, "balance_loss_mlp": 0.17889968, "epoch": 0.5069592664963174, "flos": 19936096404480.0, "grad_norm": 9.841118033886003, "language_loss": 0.78152716, "learning_rate": 2.0519872783279074e-06, "loss": 0.79639304, "num_input_tokens_seen": 181255940, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.36376953, "step": 8432, "time_per_iteration": 2.680083751678467 }, { "auxiliary_loss_clip": 0.01219623, "auxiliary_loss_mlp": 0.00040336, "balance_loss_clip": 1.06920004, "balance_loss_mlp": 0.02374209, "epoch": 0.5070193897489854, "flos": 65793771941760.0, "grad_norm": 0.7437461861017748, "language_loss": 0.63180155, "learning_rate": 2.0515979472590945e-06, "loss": 0.64440113, "num_input_tokens_seen": 181316945, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.16601562, "step": 8433, "time_per_iteration": 3.1762917041778564 }, { "auxiliary_loss_clip": 0.01279399, "auxiliary_loss_mlp": 0.00186975, "balance_loss_clip": 1.0647099, "balance_loss_mlp": 0.15066382, "epoch": 0.5070795130016534, "flos": 17275331514240.0, "grad_norm": 31.318065524628665, "language_loss": 0.83150655, "learning_rate": 2.051208614233681e-06, "loss": 0.84617031, "num_input_tokens_seen": 181335555, "router_z_loss_clip": 2.14550781, "router_z_loss_mlp": 0.36254883, "step": 8434, "time_per_iteration": 2.6155951023101807 }, { "auxiliary_loss_clip": 0.01279225, "auxiliary_loss_mlp": 0.00189792, "balance_loss_clip": 1.06420648, "balance_loss_mlp": 0.15147769, "epoch": 0.5071396362543213, "flos": 21069940095360.0, "grad_norm": 7.170245874910778, "language_loss": 0.79364133, "learning_rate": 2.0508192792664326e-06, "loss": 0.80833155, "num_input_tokens_seen": 181354580, "router_z_loss_clip": 2.14941406, "router_z_loss_mlp": 0.38330078, "step": 8435, "time_per_iteration": 2.648918867111206 }, { "auxiliary_loss_clip": 0.0128824, "auxiliary_loss_mlp": 0.00221905, "balance_loss_clip": 1.07195628, "balance_loss_mlp": 0.18545055, "epoch": 0.5071997595069894, "flos": 23144822248320.0, "grad_norm": 86.22890022610137, "language_loss": 0.81292284, "learning_rate": 2.050429942372112e-06, "loss": 0.82802433, "num_input_tokens_seen": 181374320, "router_z_loss_clip": 2.16308594, "router_z_loss_mlp": 0.36450195, "step": 8436, "time_per_iteration": 2.659109115600586 }, { "auxiliary_loss_clip": 0.01286966, "auxiliary_loss_mlp": 0.00219746, "balance_loss_clip": 1.07051361, "balance_loss_mlp": 0.18281505, "epoch": 0.5072598827596573, "flos": 22747183712640.0, "grad_norm": 9.887858236092486, "language_loss": 0.91403919, "learning_rate": 2.050040603565483e-06, "loss": 0.9291063, "num_input_tokens_seen": 181392190, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.36889648, "step": 8437, "time_per_iteration": 2.6986544132232666 }, { "auxiliary_loss_clip": 0.01260591, "auxiliary_loss_mlp": 0.00196829, "balance_loss_clip": 1.05568671, "balance_loss_mlp": 0.1586577, "epoch": 0.5073200060123253, "flos": 22566301799040.0, "grad_norm": 34.47839546704348, "language_loss": 0.87715417, "learning_rate": 2.049651262861309e-06, "loss": 0.89172828, "num_input_tokens_seen": 181413890, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.38183594, "step": 8438, "time_per_iteration": 2.742281436920166 }, { "auxiliary_loss_clip": 0.01277759, "auxiliary_loss_mlp": 0.00221491, "balance_loss_clip": 1.06389713, "balance_loss_mlp": 0.17988648, "epoch": 0.5073801292649932, "flos": 25806341324160.0, "grad_norm": 10.871772058131743, "language_loss": 0.86905766, "learning_rate": 2.0492619202743543e-06, "loss": 0.88405013, "num_input_tokens_seen": 181433240, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.41625977, "step": 8439, "time_per_iteration": 2.737874746322632 }, { "auxiliary_loss_clip": 0.01266019, "auxiliary_loss_mlp": 0.00186567, "balance_loss_clip": 1.06048584, "balance_loss_mlp": 0.1519721, "epoch": 0.5074402525176612, "flos": 25373941401600.0, "grad_norm": 12.587574602759359, "language_loss": 0.77313733, "learning_rate": 2.048872575819383e-06, "loss": 0.78766316, "num_input_tokens_seen": 181453535, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.34594727, "step": 8440, "time_per_iteration": 2.6900556087493896 }, { "auxiliary_loss_clip": 0.01273244, "auxiliary_loss_mlp": 0.00201638, "balance_loss_clip": 1.06467962, "balance_loss_mlp": 0.16856973, "epoch": 0.5075003757703291, "flos": 26064431521920.0, "grad_norm": 4.655087423318338, "language_loss": 0.77361882, "learning_rate": 2.048483229511158e-06, "loss": 0.78836769, "num_input_tokens_seen": 181474195, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.33056641, "step": 8441, "time_per_iteration": 2.680727481842041 }, { "auxiliary_loss_clip": 0.01274356, "auxiliary_loss_mlp": 0.00215778, "balance_loss_clip": 1.06111324, "balance_loss_mlp": 0.17975271, "epoch": 0.5075604990229972, "flos": 21835447770240.0, "grad_norm": 5.548521568205392, "language_loss": 0.73036867, "learning_rate": 2.0480938813644445e-06, "loss": 0.74527001, "num_input_tokens_seen": 181494000, "router_z_loss_clip": 2.13183594, "router_z_loss_mlp": 0.36010742, "step": 8442, "time_per_iteration": 2.613765239715576 }, { "auxiliary_loss_clip": 0.01259492, "auxiliary_loss_mlp": 0.00202313, "balance_loss_clip": 1.0605036, "balance_loss_mlp": 0.17049587, "epoch": 0.5076206222756651, "flos": 31978703537280.0, "grad_norm": 129.5741433363358, "language_loss": 0.76958895, "learning_rate": 2.047704531394006e-06, "loss": 0.78420705, "num_input_tokens_seen": 181515955, "router_z_loss_clip": 1.98925781, "router_z_loss_mlp": 0.31799316, "step": 8443, "time_per_iteration": 2.7187209129333496 }, { "auxiliary_loss_clip": 0.01283182, "auxiliary_loss_mlp": 0.0025913, "balance_loss_clip": 1.07144618, "balance_loss_mlp": 0.22465512, "epoch": 0.5076807455283331, "flos": 36904031326080.0, "grad_norm": 681.0511373393103, "language_loss": 0.68542278, "learning_rate": 2.047315179614607e-06, "loss": 0.7008459, "num_input_tokens_seen": 181540225, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.3449707, "step": 8444, "time_per_iteration": 2.802870988845825 }, { "auxiliary_loss_clip": 0.01272235, "auxiliary_loss_mlp": 0.00235053, "balance_loss_clip": 1.06488276, "balance_loss_mlp": 0.20162673, "epoch": 0.507740868781001, "flos": 29862415981440.0, "grad_norm": 16.16549625514251, "language_loss": 0.71433908, "learning_rate": 2.046925826041012e-06, "loss": 0.72941196, "num_input_tokens_seen": 181560125, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.33422852, "step": 8445, "time_per_iteration": 2.755812406539917 }, { "auxiliary_loss_clip": 0.01208517, "auxiliary_loss_mlp": 0.00109759, "balance_loss_clip": 1.05973899, "balance_loss_mlp": 0.0964072, "epoch": 0.507800992033669, "flos": 61918974247680.0, "grad_norm": 1.2574834824888388, "language_loss": 0.61323088, "learning_rate": 2.0465364706879845e-06, "loss": 0.62641358, "num_input_tokens_seen": 181618830, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.13378906, "step": 8446, "time_per_iteration": 3.1855201721191406 }, { "auxiliary_loss_clip": 0.01271336, "auxiliary_loss_mlp": 0.00238263, "balance_loss_clip": 1.06618476, "balance_loss_mlp": 0.2048849, "epoch": 0.507861115286337, "flos": 20700490757760.0, "grad_norm": 1953.7159115245338, "language_loss": 0.87477577, "learning_rate": 2.04614711357029e-06, "loss": 0.88987184, "num_input_tokens_seen": 181637120, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.33398438, "step": 8447, "time_per_iteration": 2.6632330417633057 }, { "auxiliary_loss_clip": 0.01265664, "auxiliary_loss_mlp": 0.00246971, "balance_loss_clip": 1.05977273, "balance_loss_mlp": 0.21418804, "epoch": 0.507921238539005, "flos": 30847050576000.0, "grad_norm": 4.367107921108185, "language_loss": 0.75394309, "learning_rate": 2.0457577547026916e-06, "loss": 0.76906949, "num_input_tokens_seen": 181659965, "router_z_loss_clip": 2.05957031, "router_z_loss_mlp": 0.32788086, "step": 8448, "time_per_iteration": 2.7363381385803223 }, { "auxiliary_loss_clip": 0.01273504, "auxiliary_loss_mlp": 0.00243422, "balance_loss_clip": 1.06824517, "balance_loss_mlp": 0.21216494, "epoch": 0.507981361791673, "flos": 35700197984640.0, "grad_norm": 44.019645796323005, "language_loss": 0.7721526, "learning_rate": 2.045368394099955e-06, "loss": 0.78732193, "num_input_tokens_seen": 181685290, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.31225586, "step": 8449, "time_per_iteration": 2.8312604427337646 }, { "auxiliary_loss_clip": 0.01267952, "auxiliary_loss_mlp": 0.00271546, "balance_loss_clip": 1.06526363, "balance_loss_mlp": 0.24031331, "epoch": 0.5080414850443409, "flos": 27161466750720.0, "grad_norm": 10.527701214994753, "language_loss": 0.81042624, "learning_rate": 2.044979031776844e-06, "loss": 0.82582122, "num_input_tokens_seen": 181706080, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.31225586, "step": 8450, "time_per_iteration": 2.706831693649292 }, { "auxiliary_loss_clip": 0.01284104, "auxiliary_loss_mlp": 0.00267566, "balance_loss_clip": 1.07314146, "balance_loss_mlp": 0.23559368, "epoch": 0.5081016082970089, "flos": 27085192220160.0, "grad_norm": 28.594115355912066, "language_loss": 0.83704042, "learning_rate": 2.0445896677481234e-06, "loss": 0.85255718, "num_input_tokens_seen": 181724805, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.31958008, "step": 8451, "time_per_iteration": 2.6697113513946533 }, { "auxiliary_loss_clip": 0.01279698, "auxiliary_loss_mlp": 0.00253869, "balance_loss_clip": 1.07108796, "balance_loss_mlp": 0.22034709, "epoch": 0.5081617315496768, "flos": 22856531690880.0, "grad_norm": 32.766752098141566, "language_loss": 0.92501819, "learning_rate": 2.044200302028559e-06, "loss": 0.94035387, "num_input_tokens_seen": 181743725, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.33496094, "step": 8452, "time_per_iteration": 2.6629557609558105 }, { "auxiliary_loss_clip": 0.01289489, "auxiliary_loss_mlp": 0.00255639, "balance_loss_clip": 1.07426941, "balance_loss_mlp": 0.22354811, "epoch": 0.5082218548023448, "flos": 16281898087680.0, "grad_norm": 110.68424976105771, "language_loss": 0.8922087, "learning_rate": 2.0438109346329143e-06, "loss": 0.90766001, "num_input_tokens_seen": 181757720, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.32080078, "step": 8453, "time_per_iteration": 2.575678586959839 }, { "auxiliary_loss_clip": 0.01276254, "auxiliary_loss_mlp": 0.00283194, "balance_loss_clip": 1.0726645, "balance_loss_mlp": 0.25390404, "epoch": 0.5082819780550127, "flos": 24460768915200.0, "grad_norm": 3.651066331755099, "language_loss": 0.8629269, "learning_rate": 2.0434215655759544e-06, "loss": 0.87852138, "num_input_tokens_seen": 181778545, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.29260254, "step": 8454, "time_per_iteration": 2.692570447921753 }, { "auxiliary_loss_clip": 0.01277075, "auxiliary_loss_mlp": 0.00271438, "balance_loss_clip": 1.0701499, "balance_loss_mlp": 0.2381551, "epoch": 0.5083421013076808, "flos": 23403271582080.0, "grad_norm": 417.4921037647146, "language_loss": 0.95648921, "learning_rate": 2.0430321948724446e-06, "loss": 0.97197431, "num_input_tokens_seen": 181799495, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.33300781, "step": 8455, "time_per_iteration": 2.680104970932007 }, { "auxiliary_loss_clip": 0.01279936, "auxiliary_loss_mlp": 0.00255226, "balance_loss_clip": 1.06291652, "balance_loss_mlp": 0.22129858, "epoch": 0.5084022245603487, "flos": 23872695448320.0, "grad_norm": 29.39719836644077, "language_loss": 0.69289601, "learning_rate": 2.042642822537149e-06, "loss": 0.70824766, "num_input_tokens_seen": 181818400, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.33886719, "step": 8456, "time_per_iteration": 2.652148962020874 }, { "auxiliary_loss_clip": 0.01280299, "auxiliary_loss_mlp": 0.00076297, "balance_loss_clip": 1.09052968, "balance_loss_mlp": 0.05684246, "epoch": 0.5084623478130167, "flos": 62873336655360.0, "grad_norm": 0.7923429600422457, "language_loss": 0.61850691, "learning_rate": 2.0422534485848343e-06, "loss": 0.63207293, "num_input_tokens_seen": 181875975, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.19433594, "step": 8457, "time_per_iteration": 2.9996519088745117 }, { "auxiliary_loss_clip": 0.01294995, "auxiliary_loss_mlp": 0.00275396, "balance_loss_clip": 1.07985592, "balance_loss_mlp": 0.23989575, "epoch": 0.5085224710656846, "flos": 22346133384960.0, "grad_norm": 3.6900447442513897, "language_loss": 0.75521559, "learning_rate": 2.0418640730302644e-06, "loss": 0.77091956, "num_input_tokens_seen": 181896450, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.35498047, "step": 8458, "time_per_iteration": 2.7170698642730713 }, { "auxiliary_loss_clip": 0.01286353, "auxiliary_loss_mlp": 0.00253474, "balance_loss_clip": 1.07175815, "balance_loss_mlp": 0.21949948, "epoch": 0.5085825943183526, "flos": 26066263115520.0, "grad_norm": 3.5831845483761664, "language_loss": 0.83821005, "learning_rate": 2.0414746958882043e-06, "loss": 0.85360837, "num_input_tokens_seen": 181916770, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.33959961, "step": 8459, "time_per_iteration": 2.7478222846984863 }, { "auxiliary_loss_clip": 0.01302362, "auxiliary_loss_mlp": 0.00272226, "balance_loss_clip": 1.08589482, "balance_loss_mlp": 0.23715451, "epoch": 0.5086427175710206, "flos": 17420733768960.0, "grad_norm": 12.85302653753024, "language_loss": 0.88289273, "learning_rate": 2.0410853171734196e-06, "loss": 0.89863861, "num_input_tokens_seen": 181932710, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.35083008, "step": 8460, "time_per_iteration": 4.184171915054321 }, { "auxiliary_loss_clip": 0.01301122, "auxiliary_loss_mlp": 0.00276113, "balance_loss_clip": 1.08526433, "balance_loss_mlp": 0.24068442, "epoch": 0.5087028408236886, "flos": 20631758083200.0, "grad_norm": 52.2493006133263, "language_loss": 0.78074396, "learning_rate": 2.0406959369006754e-06, "loss": 0.7965163, "num_input_tokens_seen": 181950665, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.35449219, "step": 8461, "time_per_iteration": 2.6320743560791016 }, { "auxiliary_loss_clip": 0.01278118, "auxiliary_loss_mlp": 0.00240568, "balance_loss_clip": 1.07236862, "balance_loss_mlp": 0.21000226, "epoch": 0.5087629640763566, "flos": 25593822506880.0, "grad_norm": 32.93595865379383, "language_loss": 0.82978296, "learning_rate": 2.0403065550847375e-06, "loss": 0.84496987, "num_input_tokens_seen": 181971270, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.30541992, "step": 8462, "time_per_iteration": 2.6529345512390137 }, { "auxiliary_loss_clip": 0.01301865, "auxiliary_loss_mlp": 0.00287131, "balance_loss_clip": 1.08808255, "balance_loss_mlp": 0.25465804, "epoch": 0.5088230873290245, "flos": 13261631927040.0, "grad_norm": 13.149036555126095, "language_loss": 0.89704221, "learning_rate": 2.0399171717403706e-06, "loss": 0.91293216, "num_input_tokens_seen": 181988410, "router_z_loss_clip": 2.13574219, "router_z_loss_mlp": 0.32446289, "step": 8463, "time_per_iteration": 4.028928756713867 }, { "auxiliary_loss_clip": 0.01292216, "auxiliary_loss_mlp": 0.00277408, "balance_loss_clip": 1.0821594, "balance_loss_mlp": 0.24302843, "epoch": 0.5088832105816925, "flos": 20043469134720.0, "grad_norm": 446.6297308732057, "language_loss": 0.82334453, "learning_rate": 2.039527786882341e-06, "loss": 0.83904076, "num_input_tokens_seen": 182006530, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.34375, "step": 8464, "time_per_iteration": 4.010192632675171 }, { "auxiliary_loss_clip": 0.01284945, "auxiliary_loss_mlp": 0.00040715, "balance_loss_clip": 1.12230432, "balance_loss_mlp": 0.02593261, "epoch": 0.5089433338343604, "flos": 67422179018880.0, "grad_norm": 0.6792131237838422, "language_loss": 0.5871287, "learning_rate": 2.0391384005254133e-06, "loss": 0.60038531, "num_input_tokens_seen": 182074240, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.14746094, "step": 8465, "time_per_iteration": 3.2345855236053467 }, { "auxiliary_loss_clip": 0.01286908, "auxiliary_loss_mlp": 0.00249595, "balance_loss_clip": 1.07851255, "balance_loss_mlp": 0.21640718, "epoch": 0.5090034570870284, "flos": 22710339336960.0, "grad_norm": 12.533113653804534, "language_loss": 0.88579273, "learning_rate": 2.038749012684354e-06, "loss": 0.90115786, "num_input_tokens_seen": 182093360, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.33178711, "step": 8466, "time_per_iteration": 2.639962673187256 }, { "auxiliary_loss_clip": 0.01292944, "auxiliary_loss_mlp": 0.00256303, "balance_loss_clip": 1.08256459, "balance_loss_mlp": 0.2242592, "epoch": 0.5090635803396963, "flos": 20445812352000.0, "grad_norm": 11.630830447566353, "language_loss": 0.84543556, "learning_rate": 2.0383596233739286e-06, "loss": 0.860928, "num_input_tokens_seen": 182110170, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.32006836, "step": 8467, "time_per_iteration": 2.6207423210144043 }, { "auxiliary_loss_clip": 0.01294088, "auxiliary_loss_mlp": 0.00279172, "balance_loss_clip": 1.0875771, "balance_loss_mlp": 0.24731922, "epoch": 0.5091237035923644, "flos": 23768878164480.0, "grad_norm": 4.746928782429192, "language_loss": 0.80885452, "learning_rate": 2.0379702326089013e-06, "loss": 0.82458711, "num_input_tokens_seen": 182129570, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.31835938, "step": 8468, "time_per_iteration": 2.6897740364074707 }, { "auxiliary_loss_clip": 0.01305068, "auxiliary_loss_mlp": 0.00243002, "balance_loss_clip": 1.09075534, "balance_loss_mlp": 0.20914689, "epoch": 0.5091838268450323, "flos": 18327908684160.0, "grad_norm": 32.499069481428066, "language_loss": 0.84246469, "learning_rate": 2.03758084040404e-06, "loss": 0.85794532, "num_input_tokens_seen": 182147565, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.33837891, "step": 8469, "time_per_iteration": 2.69844913482666 }, { "auxiliary_loss_clip": 0.01319003, "auxiliary_loss_mlp": 0.00267409, "balance_loss_clip": 1.10307264, "balance_loss_mlp": 0.23550904, "epoch": 0.5092439500977003, "flos": 29057621806080.0, "grad_norm": 10.626290995701492, "language_loss": 0.74347407, "learning_rate": 2.037191446774109e-06, "loss": 0.7593382, "num_input_tokens_seen": 182169695, "router_z_loss_clip": 2.15722656, "router_z_loss_mlp": 0.31884766, "step": 8470, "time_per_iteration": 4.25928807258606 }, { "auxiliary_loss_clip": 0.01306146, "auxiliary_loss_mlp": 0.00262862, "balance_loss_clip": 1.09083986, "balance_loss_mlp": 0.22988835, "epoch": 0.5093040733503682, "flos": 13553908894080.0, "grad_norm": 32.93261253484889, "language_loss": 0.81251764, "learning_rate": 2.0368020517338745e-06, "loss": 0.82820773, "num_input_tokens_seen": 182186385, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.32958984, "step": 8471, "time_per_iteration": 2.7232437133789062 }, { "auxiliary_loss_clip": 0.01301485, "auxiliary_loss_mlp": 0.00045935, "balance_loss_clip": 1.15952682, "balance_loss_mlp": 0.03615949, "epoch": 0.5093641966030362, "flos": 68906617407360.0, "grad_norm": 0.7480140293361, "language_loss": 0.5787183, "learning_rate": 2.036412655298103e-06, "loss": 0.59219253, "num_input_tokens_seen": 182247095, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09765625, "step": 8472, "time_per_iteration": 3.1409759521484375 }, { "auxiliary_loss_clip": 0.01305438, "auxiliary_loss_mlp": 0.00248349, "balance_loss_clip": 1.09115779, "balance_loss_mlp": 0.21780801, "epoch": 0.5094243198557042, "flos": 21580948932480.0, "grad_norm": 152.4589054260386, "language_loss": 0.75193262, "learning_rate": 2.03602325748156e-06, "loss": 0.76747054, "num_input_tokens_seen": 182266380, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.30517578, "step": 8473, "time_per_iteration": 2.6787497997283936 }, { "auxiliary_loss_clip": 0.01310457, "auxiliary_loss_mlp": 0.00241222, "balance_loss_clip": 1.09423709, "balance_loss_mlp": 0.20772411, "epoch": 0.5094844431083722, "flos": 28840721529600.0, "grad_norm": 11.837596903857097, "language_loss": 0.92461675, "learning_rate": 2.0356338582990105e-06, "loss": 0.94013357, "num_input_tokens_seen": 182284685, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.33520508, "step": 8474, "time_per_iteration": 2.7184979915618896 }, { "auxiliary_loss_clip": 0.01317132, "auxiliary_loss_mlp": 0.00248654, "balance_loss_clip": 1.10054588, "balance_loss_mlp": 0.21544194, "epoch": 0.5095445663610402, "flos": 14976114969600.0, "grad_norm": 17.386978704838047, "language_loss": 0.70518517, "learning_rate": 2.035244457765222e-06, "loss": 0.72084302, "num_input_tokens_seen": 182301810, "router_z_loss_clip": 2.16308594, "router_z_loss_mlp": 0.33203125, "step": 8475, "time_per_iteration": 2.6275157928466797 }, { "auxiliary_loss_clip": 0.01327171, "auxiliary_loss_mlp": 0.00267896, "balance_loss_clip": 1.10769415, "balance_loss_mlp": 0.2356734, "epoch": 0.5096046896137081, "flos": 20777088510720.0, "grad_norm": 7.641110800281134, "language_loss": 0.89504814, "learning_rate": 2.0348550558949605e-06, "loss": 0.91099882, "num_input_tokens_seen": 182320285, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.3223877, "step": 8476, "time_per_iteration": 2.667182207107544 }, { "auxiliary_loss_clip": 0.01320261, "auxiliary_loss_mlp": 0.00294748, "balance_loss_clip": 1.10065746, "balance_loss_mlp": 0.25981927, "epoch": 0.5096648128663761, "flos": 23185078416000.0, "grad_norm": 40.08769832645642, "language_loss": 0.91213125, "learning_rate": 2.0344656527029917e-06, "loss": 0.92828131, "num_input_tokens_seen": 182339465, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.34936523, "step": 8477, "time_per_iteration": 2.6616976261138916 }, { "auxiliary_loss_clip": 0.01316302, "auxiliary_loss_mlp": 0.00250758, "balance_loss_clip": 1.10213482, "balance_loss_mlp": 0.2181896, "epoch": 0.509724936119044, "flos": 22309432663680.0, "grad_norm": 72.4588935820032, "language_loss": 0.70001853, "learning_rate": 2.034076248204082e-06, "loss": 0.71568906, "num_input_tokens_seen": 182358375, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.32592773, "step": 8478, "time_per_iteration": 2.665623903274536 }, { "auxiliary_loss_clip": 0.01305338, "auxiliary_loss_mlp": 0.0025625, "balance_loss_clip": 1.09520936, "balance_loss_mlp": 0.22339615, "epoch": 0.509785059371712, "flos": 26287077974400.0, "grad_norm": 19.684098157002953, "language_loss": 0.74988538, "learning_rate": 2.0336868424129968e-06, "loss": 0.76550126, "num_input_tokens_seen": 182377935, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.32836914, "step": 8479, "time_per_iteration": 2.691483736038208 }, { "auxiliary_loss_clip": 0.01309311, "auxiliary_loss_mlp": 0.0024118, "balance_loss_clip": 1.09731936, "balance_loss_mlp": 0.20947018, "epoch": 0.50984518262438, "flos": 22964586779520.0, "grad_norm": 20.152500716694146, "language_loss": 0.77425849, "learning_rate": 2.0332974353445037e-06, "loss": 0.78976333, "num_input_tokens_seen": 182396440, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.31713867, "step": 8480, "time_per_iteration": 2.771641254425049 }, { "auxiliary_loss_clip": 0.01315363, "auxiliary_loss_mlp": 0.00237625, "balance_loss_clip": 1.09620786, "balance_loss_mlp": 0.2057727, "epoch": 0.509905305877048, "flos": 26213389223040.0, "grad_norm": 13.064770799458795, "language_loss": 0.84728092, "learning_rate": 2.0329080270133688e-06, "loss": 0.86281085, "num_input_tokens_seen": 182415890, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.31860352, "step": 8481, "time_per_iteration": 2.701718330383301 }, { "auxiliary_loss_clip": 0.01307089, "auxiliary_loss_mlp": 0.0023332, "balance_loss_clip": 1.0942353, "balance_loss_mlp": 0.20199221, "epoch": 0.5099654291297159, "flos": 20340055733760.0, "grad_norm": 118.11644890954022, "language_loss": 0.88855183, "learning_rate": 2.0325186174343578e-06, "loss": 0.90395594, "num_input_tokens_seen": 182434235, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.31347656, "step": 8482, "time_per_iteration": 2.61818265914917 }, { "auxiliary_loss_clip": 0.01323387, "auxiliary_loss_mlp": 0.00253911, "balance_loss_clip": 1.10021901, "balance_loss_mlp": 0.22115257, "epoch": 0.5100255523823839, "flos": 29054820545280.0, "grad_norm": 47.631173307637034, "language_loss": 0.9205929, "learning_rate": 2.032129206622238e-06, "loss": 0.93636584, "num_input_tokens_seen": 182454360, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.32714844, "step": 8483, "time_per_iteration": 2.706202983856201 }, { "auxiliary_loss_clip": 0.01312228, "auxiliary_loss_mlp": 0.00240649, "balance_loss_clip": 1.0969888, "balance_loss_mlp": 0.21046518, "epoch": 0.5100856756350518, "flos": 22455912326400.0, "grad_norm": 138.47691000063756, "language_loss": 0.90027261, "learning_rate": 2.031739794591775e-06, "loss": 0.91580135, "num_input_tokens_seen": 182471940, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.30175781, "step": 8484, "time_per_iteration": 2.665294885635376 }, { "auxiliary_loss_clip": 0.01298221, "auxiliary_loss_mlp": 0.00264153, "balance_loss_clip": 1.08722949, "balance_loss_mlp": 0.23239595, "epoch": 0.5101457988877198, "flos": 19171055606400.0, "grad_norm": 13.656938368720379, "language_loss": 0.88581443, "learning_rate": 2.031350381357736e-06, "loss": 0.90143824, "num_input_tokens_seen": 182490685, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.31762695, "step": 8485, "time_per_iteration": 2.6119463443756104 }, { "auxiliary_loss_clip": 0.01298503, "auxiliary_loss_mlp": 0.0024403, "balance_loss_clip": 1.09034216, "balance_loss_mlp": 0.21375114, "epoch": 0.5102059221403878, "flos": 14866371941760.0, "grad_norm": 126.14869748625088, "language_loss": 0.79607844, "learning_rate": 2.0309609669348874e-06, "loss": 0.81150377, "num_input_tokens_seen": 182508325, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.30297852, "step": 8486, "time_per_iteration": 2.62530255317688 }, { "auxiliary_loss_clip": 0.01326552, "auxiliary_loss_mlp": 0.00255083, "balance_loss_clip": 1.10584903, "balance_loss_mlp": 0.22330174, "epoch": 0.5102660453930558, "flos": 22961103160320.0, "grad_norm": 22.562402633572344, "language_loss": 0.77014875, "learning_rate": 2.0305715513379953e-06, "loss": 0.78596509, "num_input_tokens_seen": 182527020, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.31787109, "step": 8487, "time_per_iteration": 2.628732919692993 }, { "auxiliary_loss_clip": 0.01319589, "auxiliary_loss_mlp": 0.00245602, "balance_loss_clip": 1.10466707, "balance_loss_mlp": 0.21489391, "epoch": 0.5103261686457238, "flos": 23149311448320.0, "grad_norm": 95.41460318245436, "language_loss": 0.7885403, "learning_rate": 2.030182134581827e-06, "loss": 0.8041923, "num_input_tokens_seen": 182543505, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.30712891, "step": 8488, "time_per_iteration": 2.671250820159912 }, { "auxiliary_loss_clip": 0.01323837, "auxiliary_loss_mlp": 0.00239227, "balance_loss_clip": 1.10266006, "balance_loss_mlp": 0.21084291, "epoch": 0.5103862918983917, "flos": 14319237000960.0, "grad_norm": 62.33357847033048, "language_loss": 0.79148042, "learning_rate": 2.0297927166811503e-06, "loss": 0.80711102, "num_input_tokens_seen": 182562250, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.28381348, "step": 8489, "time_per_iteration": 2.6053555011749268 }, { "auxiliary_loss_clip": 0.01301137, "auxiliary_loss_mlp": 0.00248596, "balance_loss_clip": 1.08838439, "balance_loss_mlp": 0.21791139, "epoch": 0.5104464151510597, "flos": 25848536826240.0, "grad_norm": 10.146232131850132, "language_loss": 0.79599714, "learning_rate": 2.0294032976507297e-06, "loss": 0.81149447, "num_input_tokens_seen": 182581910, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.30664062, "step": 8490, "time_per_iteration": 2.681851387023926 }, { "auxiliary_loss_clip": 0.01288683, "auxiliary_loss_mlp": 0.00243332, "balance_loss_clip": 1.08039582, "balance_loss_mlp": 0.21472159, "epoch": 0.5105065384037276, "flos": 21652913831040.0, "grad_norm": 8.05218828763606, "language_loss": 0.87145436, "learning_rate": 2.0290138775053337e-06, "loss": 0.88677454, "num_input_tokens_seen": 182601350, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.28588867, "step": 8491, "time_per_iteration": 2.665956735610962 }, { "auxiliary_loss_clip": 0.01284165, "auxiliary_loss_mlp": 0.00216662, "balance_loss_clip": 1.08198142, "balance_loss_mlp": 0.18963754, "epoch": 0.5105666616563956, "flos": 22491571553280.0, "grad_norm": 10.549271272895274, "language_loss": 0.86861056, "learning_rate": 2.028624456259728e-06, "loss": 0.88361883, "num_input_tokens_seen": 182619660, "router_z_loss_clip": 2.01855469, "router_z_loss_mlp": 0.27001953, "step": 8492, "time_per_iteration": 2.742173194885254 }, { "auxiliary_loss_clip": 0.01326494, "auxiliary_loss_mlp": 0.00242409, "balance_loss_clip": 1.10094237, "balance_loss_mlp": 0.21049632, "epoch": 0.5106267849090635, "flos": 22455768672000.0, "grad_norm": 14.036177486165249, "language_loss": 0.85189164, "learning_rate": 2.0282350339286804e-06, "loss": 0.86758065, "num_input_tokens_seen": 182639815, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.31896973, "step": 8493, "time_per_iteration": 2.732360601425171 }, { "auxiliary_loss_clip": 0.01315362, "auxiliary_loss_mlp": 0.00262983, "balance_loss_clip": 1.09861326, "balance_loss_mlp": 0.23260829, "epoch": 0.5106869081617316, "flos": 23547093638400.0, "grad_norm": 3.4696409386257905, "language_loss": 0.89904654, "learning_rate": 2.0278456105269574e-06, "loss": 0.91482997, "num_input_tokens_seen": 182659655, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.3034668, "step": 8494, "time_per_iteration": 2.652782678604126 }, { "auxiliary_loss_clip": 0.01307463, "auxiliary_loss_mlp": 0.00261805, "balance_loss_clip": 1.09132278, "balance_loss_mlp": 0.23092949, "epoch": 0.5107470314143995, "flos": 26792987080320.0, "grad_norm": 2.998870048337578, "language_loss": 0.84829956, "learning_rate": 2.027456186069326e-06, "loss": 0.86399221, "num_input_tokens_seen": 182677075, "router_z_loss_clip": 2.16113281, "router_z_loss_mlp": 0.30908203, "step": 8495, "time_per_iteration": 2.6659493446350098 }, { "auxiliary_loss_clip": 0.01311973, "auxiliary_loss_mlp": 0.00270541, "balance_loss_clip": 1.09502435, "balance_loss_mlp": 0.23854569, "epoch": 0.5108071546670675, "flos": 25739691638400.0, "grad_norm": 22.486743027263202, "language_loss": 0.8512634, "learning_rate": 2.0270667605705535e-06, "loss": 0.86708856, "num_input_tokens_seen": 182699625, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.31982422, "step": 8496, "time_per_iteration": 2.6910994052886963 }, { "auxiliary_loss_clip": 0.01299275, "auxiliary_loss_mlp": 0.00256343, "balance_loss_clip": 1.08994067, "balance_loss_mlp": 0.22642121, "epoch": 0.5108672779197354, "flos": 18697537589760.0, "grad_norm": 27.734630306463366, "language_loss": 0.85657007, "learning_rate": 2.0266773340454066e-06, "loss": 0.87212622, "num_input_tokens_seen": 182717020, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.29919434, "step": 8497, "time_per_iteration": 2.6414034366607666 }, { "auxiliary_loss_clip": 0.01316332, "auxiliary_loss_mlp": 0.00236334, "balance_loss_clip": 1.09933352, "balance_loss_mlp": 0.20668659, "epoch": 0.5109274011724034, "flos": 26688164215680.0, "grad_norm": 10.323675546996526, "language_loss": 0.87303859, "learning_rate": 2.0262879065086525e-06, "loss": 0.88856524, "num_input_tokens_seen": 182736955, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.29675293, "step": 8498, "time_per_iteration": 2.7126123905181885 }, { "auxiliary_loss_clip": 0.01306805, "auxiliary_loss_mlp": 0.00246393, "balance_loss_clip": 1.0968653, "balance_loss_mlp": 0.21663833, "epoch": 0.5109875244250714, "flos": 22784028088320.0, "grad_norm": 6.115657491781187, "language_loss": 0.79758054, "learning_rate": 2.0258984779750584e-06, "loss": 0.8131125, "num_input_tokens_seen": 182757620, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.29760742, "step": 8499, "time_per_iteration": 2.6821248531341553 }, { "auxiliary_loss_clip": 0.01311965, "auxiliary_loss_mlp": 0.00242558, "balance_loss_clip": 1.09523642, "balance_loss_mlp": 0.21337564, "epoch": 0.5110476476777394, "flos": 35588515622400.0, "grad_norm": 25.352956061327028, "language_loss": 0.78399205, "learning_rate": 2.0255090484593914e-06, "loss": 0.7995373, "num_input_tokens_seen": 182780195, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.29150391, "step": 8500, "time_per_iteration": 2.7731637954711914 }, { "auxiliary_loss_clip": 0.01321705, "auxiliary_loss_mlp": 0.00281464, "balance_loss_clip": 1.0948329, "balance_loss_mlp": 0.24770394, "epoch": 0.5111077709304074, "flos": 19280798634240.0, "grad_norm": 5.411947949582414, "language_loss": 0.74715716, "learning_rate": 2.0251196179764183e-06, "loss": 0.76318884, "num_input_tokens_seen": 182795765, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.33740234, "step": 8501, "time_per_iteration": 2.6389389038085938 }, { "auxiliary_loss_clip": 0.01309816, "auxiliary_loss_mlp": 0.00254911, "balance_loss_clip": 1.0900631, "balance_loss_mlp": 0.22384493, "epoch": 0.5111678941830753, "flos": 20668207409280.0, "grad_norm": 28.801695260416466, "language_loss": 0.95031214, "learning_rate": 2.024730186540907e-06, "loss": 0.96595937, "num_input_tokens_seen": 182813120, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.31079102, "step": 8502, "time_per_iteration": 4.112308979034424 }, { "auxiliary_loss_clip": 0.0129652, "auxiliary_loss_mlp": 0.00265425, "balance_loss_clip": 1.08558214, "balance_loss_mlp": 0.23493141, "epoch": 0.5112280174357433, "flos": 26287903987200.0, "grad_norm": 18.874767344309955, "language_loss": 0.88513637, "learning_rate": 2.0243407541676253e-06, "loss": 0.90075582, "num_input_tokens_seen": 182835745, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.3046875, "step": 8503, "time_per_iteration": 2.768415927886963 }, { "auxiliary_loss_clip": 0.01282991, "auxiliary_loss_mlp": 0.00132425, "balance_loss_clip": 1.14478922, "balance_loss_mlp": 0.12551077, "epoch": 0.5112881406884112, "flos": 59474247707520.0, "grad_norm": 3.3641392976996745, "language_loss": 0.63695085, "learning_rate": 2.023951320871339e-06, "loss": 0.65110493, "num_input_tokens_seen": 182892540, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.06933594, "step": 8504, "time_per_iteration": 3.143454074859619 }, { "auxiliary_loss_clip": 0.01312248, "auxiliary_loss_mlp": 0.00249552, "balance_loss_clip": 1.09652495, "balance_loss_mlp": 0.21932048, "epoch": 0.5113482639410792, "flos": 26468857728000.0, "grad_norm": 360.2809455277972, "language_loss": 0.90587717, "learning_rate": 2.023561886666816e-06, "loss": 0.9214952, "num_input_tokens_seen": 182911515, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.30212402, "step": 8505, "time_per_iteration": 4.072691202163696 }, { "auxiliary_loss_clip": 0.01311972, "auxiliary_loss_mlp": 0.00248951, "balance_loss_clip": 1.09922338, "balance_loss_mlp": 0.21979198, "epoch": 0.5114083871937471, "flos": 29895848565120.0, "grad_norm": 8.739401977587278, "language_loss": 0.81953675, "learning_rate": 2.0231724515688246e-06, "loss": 0.83514595, "num_input_tokens_seen": 182930860, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.29150391, "step": 8506, "time_per_iteration": 2.6735408306121826 }, { "auxiliary_loss_clip": 0.01300492, "auxiliary_loss_mlp": 0.00271401, "balance_loss_clip": 1.08506823, "balance_loss_mlp": 0.23777199, "epoch": 0.5114685104464152, "flos": 24314576561280.0, "grad_norm": 5.08256383253446, "language_loss": 0.68369132, "learning_rate": 2.022783015592131e-06, "loss": 0.69941026, "num_input_tokens_seen": 182949960, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.33630371, "step": 8507, "time_per_iteration": 4.1303184032440186 }, { "auxiliary_loss_clip": 0.0130656, "auxiliary_loss_mlp": 0.00228027, "balance_loss_clip": 1.09470713, "balance_loss_mlp": 0.1979503, "epoch": 0.5115286336990831, "flos": 17019288391680.0, "grad_norm": 16.254968202248065, "language_loss": 0.9122653, "learning_rate": 2.022393578751503e-06, "loss": 0.92761111, "num_input_tokens_seen": 182968085, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.30078125, "step": 8508, "time_per_iteration": 2.6117300987243652 }, { "auxiliary_loss_clip": 0.01317888, "auxiliary_loss_mlp": 0.00245029, "balance_loss_clip": 1.09940171, "balance_loss_mlp": 0.21520317, "epoch": 0.5115887569517511, "flos": 23659386531840.0, "grad_norm": 6.065328738655889, "language_loss": 0.78097463, "learning_rate": 2.022004141061709e-06, "loss": 0.7966038, "num_input_tokens_seen": 182987275, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.29833984, "step": 8509, "time_per_iteration": 2.688170909881592 }, { "auxiliary_loss_clip": 0.01284605, "auxiliary_loss_mlp": 0.00242021, "balance_loss_clip": 1.0816716, "balance_loss_mlp": 0.21521118, "epoch": 0.511648880204419, "flos": 16107193313280.0, "grad_norm": 13.316060235560677, "language_loss": 0.81346655, "learning_rate": 2.0216147025375153e-06, "loss": 0.82873279, "num_input_tokens_seen": 183004700, "router_z_loss_clip": 2.02929688, "router_z_loss_mlp": 0.26818848, "step": 8510, "time_per_iteration": 2.609095811843872 }, { "auxiliary_loss_clip": 0.0130141, "auxiliary_loss_mlp": 0.00219095, "balance_loss_clip": 1.09338987, "balance_loss_mlp": 0.19089019, "epoch": 0.511709003457087, "flos": 32634970974720.0, "grad_norm": 4.633752670743582, "language_loss": 0.78483927, "learning_rate": 2.0212252631936907e-06, "loss": 0.8000443, "num_input_tokens_seen": 183025830, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.2824707, "step": 8511, "time_per_iteration": 2.7290170192718506 }, { "auxiliary_loss_clip": 0.01308644, "auxiliary_loss_mlp": 0.00245448, "balance_loss_clip": 1.09849, "balance_loss_mlp": 0.21650442, "epoch": 0.511769126709755, "flos": 21762082241280.0, "grad_norm": 11.145708259270094, "language_loss": 0.75608617, "learning_rate": 2.020835823045001e-06, "loss": 0.77162707, "num_input_tokens_seen": 183045140, "router_z_loss_clip": 2.10253906, "router_z_loss_mlp": 0.28955078, "step": 8512, "time_per_iteration": 4.115967035293579 }, { "auxiliary_loss_clip": 0.01306349, "auxiliary_loss_mlp": 0.00247244, "balance_loss_clip": 1.0918256, "balance_loss_mlp": 0.21436571, "epoch": 0.511829249962423, "flos": 23915357827200.0, "grad_norm": 30.27477105736768, "language_loss": 0.74868524, "learning_rate": 2.0204463821062146e-06, "loss": 0.76422113, "num_input_tokens_seen": 183063935, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.32861328, "step": 8513, "time_per_iteration": 2.710601568222046 }, { "auxiliary_loss_clip": 0.01321709, "auxiliary_loss_mlp": 0.00234453, "balance_loss_clip": 1.10501623, "balance_loss_mlp": 0.20534185, "epoch": 0.511889373215091, "flos": 23727005884800.0, "grad_norm": 4.668750131548081, "language_loss": 0.75076085, "learning_rate": 2.0200569403921e-06, "loss": 0.76632249, "num_input_tokens_seen": 183084135, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.29125977, "step": 8514, "time_per_iteration": 2.7405760288238525 }, { "auxiliary_loss_clip": 0.01303115, "auxiliary_loss_mlp": 0.00241477, "balance_loss_clip": 1.09142852, "balance_loss_mlp": 0.21417816, "epoch": 0.5119494964677589, "flos": 28111519526400.0, "grad_norm": 263.6517526992869, "language_loss": 0.7140367, "learning_rate": 2.019667497917424e-06, "loss": 0.72948265, "num_input_tokens_seen": 183104570, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.27282715, "step": 8515, "time_per_iteration": 2.7509236335754395 }, { "auxiliary_loss_clip": 0.01302007, "auxiliary_loss_mlp": 0.00227464, "balance_loss_clip": 1.09122503, "balance_loss_mlp": 0.20033219, "epoch": 0.5120096197204269, "flos": 24973214296320.0, "grad_norm": 29.375448356283805, "language_loss": 0.82164121, "learning_rate": 2.019278054696955e-06, "loss": 0.83693588, "num_input_tokens_seen": 183123850, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.27111816, "step": 8516, "time_per_iteration": 2.682950735092163 }, { "auxiliary_loss_clip": 0.01331721, "auxiliary_loss_mlp": 0.0026322, "balance_loss_clip": 1.11561954, "balance_loss_mlp": 0.23359665, "epoch": 0.5120697429730948, "flos": 17968012364160.0, "grad_norm": 8.25869494932315, "language_loss": 0.84183049, "learning_rate": 2.0188886107454595e-06, "loss": 0.85777986, "num_input_tokens_seen": 183141725, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.29602051, "step": 8517, "time_per_iteration": 2.630096673965454 }, { "auxiliary_loss_clip": 0.01330392, "auxiliary_loss_mlp": 0.0025842, "balance_loss_clip": 1.10625434, "balance_loss_mlp": 0.22661456, "epoch": 0.5121298662257628, "flos": 23292343405440.0, "grad_norm": 9.27130142498079, "language_loss": 0.8107295, "learning_rate": 2.0184991660777063e-06, "loss": 0.8266176, "num_input_tokens_seen": 183161300, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.31811523, "step": 8518, "time_per_iteration": 2.7049458026885986 }, { "auxiliary_loss_clip": 0.01318213, "auxiliary_loss_mlp": 0.00224619, "balance_loss_clip": 1.1026535, "balance_loss_mlp": 0.19410104, "epoch": 0.5121899894784308, "flos": 17311062568320.0, "grad_norm": 45.75327703544161, "language_loss": 0.88307118, "learning_rate": 2.0181097207084625e-06, "loss": 0.89849949, "num_input_tokens_seen": 183180495, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.30541992, "step": 8519, "time_per_iteration": 2.652827262878418 }, { "auxiliary_loss_clip": 0.01314247, "auxiliary_loss_mlp": 0.00254392, "balance_loss_clip": 1.10107255, "balance_loss_mlp": 0.22536427, "epoch": 0.5122501127310988, "flos": 24930085040640.0, "grad_norm": 571.1712046530563, "language_loss": 0.87009263, "learning_rate": 2.017720274652497e-06, "loss": 0.88577902, "num_input_tokens_seen": 183200330, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.28991699, "step": 8520, "time_per_iteration": 2.699948310852051 }, { "auxiliary_loss_clip": 0.01327085, "auxiliary_loss_mlp": 0.00276616, "balance_loss_clip": 1.10744333, "balance_loss_mlp": 0.24597907, "epoch": 0.5123102359837667, "flos": 18442859184000.0, "grad_norm": 105.98418948983861, "language_loss": 0.89361119, "learning_rate": 2.0173308279245765e-06, "loss": 0.90964818, "num_input_tokens_seen": 183218230, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.3067627, "step": 8521, "time_per_iteration": 2.682382345199585 }, { "auxiliary_loss_clip": 0.01315862, "auxiliary_loss_mlp": 0.00241709, "balance_loss_clip": 1.09973168, "balance_loss_mlp": 0.21071422, "epoch": 0.5123703592364347, "flos": 26684860164480.0, "grad_norm": 5.398366088232258, "language_loss": 0.72024548, "learning_rate": 2.0169413805394692e-06, "loss": 0.73582125, "num_input_tokens_seen": 183236735, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.31005859, "step": 8522, "time_per_iteration": 2.7388296127319336 }, { "auxiliary_loss_clip": 0.01331191, "auxiliary_loss_mlp": 0.00260834, "balance_loss_clip": 1.10778666, "balance_loss_mlp": 0.22759876, "epoch": 0.5124304824891026, "flos": 28803948981120.0, "grad_norm": 23.29252831163486, "language_loss": 0.70978272, "learning_rate": 2.0165519325119433e-06, "loss": 0.72570294, "num_input_tokens_seen": 183257550, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.33203125, "step": 8523, "time_per_iteration": 2.696420669555664 }, { "auxiliary_loss_clip": 0.0132494, "auxiliary_loss_mlp": 0.00239373, "balance_loss_clip": 1.11053395, "balance_loss_mlp": 0.21164495, "epoch": 0.5124906057417706, "flos": 21761830846080.0, "grad_norm": 6.345559632676771, "language_loss": 0.83082169, "learning_rate": 2.0161624838567656e-06, "loss": 0.84646481, "num_input_tokens_seen": 183275515, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.27709961, "step": 8524, "time_per_iteration": 2.6373908519744873 }, { "auxiliary_loss_clip": 0.01301252, "auxiliary_loss_mlp": 0.00232045, "balance_loss_clip": 1.09406567, "balance_loss_mlp": 0.20360178, "epoch": 0.5125507289944387, "flos": 18880538405760.0, "grad_norm": 4.8536476067603225, "language_loss": 0.81792569, "learning_rate": 2.015773034588706e-06, "loss": 0.83325875, "num_input_tokens_seen": 183293880, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.28442383, "step": 8525, "time_per_iteration": 2.664383888244629 }, { "auxiliary_loss_clip": 0.01329718, "auxiliary_loss_mlp": 0.00267424, "balance_loss_clip": 1.11244845, "balance_loss_mlp": 0.23592857, "epoch": 0.5126108522471066, "flos": 35627838036480.0, "grad_norm": 10.008540859571776, "language_loss": 0.80576456, "learning_rate": 2.015383584722531e-06, "loss": 0.82173598, "num_input_tokens_seen": 183315860, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.31518555, "step": 8526, "time_per_iteration": 2.754840850830078 }, { "auxiliary_loss_clip": 0.01321151, "auxiliary_loss_mlp": 0.00258239, "balance_loss_clip": 1.10550988, "balance_loss_mlp": 0.22860309, "epoch": 0.5126709754997746, "flos": 20190918464640.0, "grad_norm": 8.954339155284618, "language_loss": 0.71682155, "learning_rate": 2.0149941342730088e-06, "loss": 0.73261547, "num_input_tokens_seen": 183335480, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.29650879, "step": 8527, "time_per_iteration": 2.6380720138549805 }, { "auxiliary_loss_clip": 0.01308387, "auxiliary_loss_mlp": 0.00211215, "balance_loss_clip": 1.10456371, "balance_loss_mlp": 0.18178183, "epoch": 0.5127310987524425, "flos": 18588548747520.0, "grad_norm": 62.52091897798757, "language_loss": 0.80170536, "learning_rate": 2.014604683254908e-06, "loss": 0.81690139, "num_input_tokens_seen": 183354395, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.29431152, "step": 8528, "time_per_iteration": 2.6126840114593506 }, { "auxiliary_loss_clip": 0.01313058, "auxiliary_loss_mlp": 0.00250377, "balance_loss_clip": 1.10071445, "balance_loss_mlp": 0.2194784, "epoch": 0.5127912220051105, "flos": 22454691264000.0, "grad_norm": 16.446021112361503, "language_loss": 0.88676512, "learning_rate": 2.014215231682995e-06, "loss": 0.90239948, "num_input_tokens_seen": 183372980, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.30908203, "step": 8529, "time_per_iteration": 2.6172664165496826 }, { "auxiliary_loss_clip": 0.01310428, "auxiliary_loss_mlp": 0.00246081, "balance_loss_clip": 1.09994316, "balance_loss_mlp": 0.2161116, "epoch": 0.5128513452577784, "flos": 19093703667840.0, "grad_norm": 3.8738542864535743, "language_loss": 0.79803091, "learning_rate": 2.01382577957204e-06, "loss": 0.81359595, "num_input_tokens_seen": 183390160, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.29980469, "step": 8530, "time_per_iteration": 2.6011102199554443 }, { "auxiliary_loss_clip": 0.01395475, "auxiliary_loss_mlp": 0.00090571, "balance_loss_clip": 1.25420904, "balance_loss_mlp": 0.08217871, "epoch": 0.5129114685104464, "flos": 67892285243520.0, "grad_norm": 0.7561724740301653, "language_loss": 0.60380822, "learning_rate": 2.0134363269368095e-06, "loss": 0.61866868, "num_input_tokens_seen": 183455280, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.08398438, "step": 8531, "time_per_iteration": 3.3527121543884277 }, { "auxiliary_loss_clip": 0.01340876, "auxiliary_loss_mlp": 0.00249544, "balance_loss_clip": 1.11725748, "balance_loss_mlp": 0.21843049, "epoch": 0.5129715917631144, "flos": 20449152316800.0, "grad_norm": 72.64106910377896, "language_loss": 0.85529923, "learning_rate": 2.0130468737920725e-06, "loss": 0.87120336, "num_input_tokens_seen": 183473955, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.31103516, "step": 8532, "time_per_iteration": 2.831019163131714 }, { "auxiliary_loss_clip": 0.01330257, "auxiliary_loss_mlp": 0.00233015, "balance_loss_clip": 1.11154175, "balance_loss_mlp": 0.20392773, "epoch": 0.5130317150157824, "flos": 35116146840960.0, "grad_norm": 107.32788476234585, "language_loss": 0.73930424, "learning_rate": 2.012657420152597e-06, "loss": 0.75493705, "num_input_tokens_seen": 183497195, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.29101562, "step": 8533, "time_per_iteration": 2.7663958072662354 }, { "auxiliary_loss_clip": 0.01305959, "auxiliary_loss_mlp": 0.00261388, "balance_loss_clip": 1.0943476, "balance_loss_mlp": 0.22979754, "epoch": 0.5130918382684503, "flos": 19791627903360.0, "grad_norm": 10.615137030939177, "language_loss": 0.87302768, "learning_rate": 2.01226796603315e-06, "loss": 0.8887012, "num_input_tokens_seen": 183513675, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.31591797, "step": 8534, "time_per_iteration": 2.6639246940612793 }, { "auxiliary_loss_clip": 0.01323886, "auxiliary_loss_mlp": 0.00244601, "balance_loss_clip": 1.10710144, "balance_loss_mlp": 0.21448895, "epoch": 0.5131519615211183, "flos": 26323096337280.0, "grad_norm": 1.8798263428697572, "language_loss": 0.70081782, "learning_rate": 2.0118785114485017e-06, "loss": 0.71650267, "num_input_tokens_seen": 183535165, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.30102539, "step": 8535, "time_per_iteration": 2.7494118213653564 }, { "auxiliary_loss_clip": 0.01325972, "auxiliary_loss_mlp": 0.00251163, "balance_loss_clip": 1.11210322, "balance_loss_mlp": 0.22361347, "epoch": 0.5132120847737862, "flos": 19171917532800.0, "grad_norm": 56.31981144928579, "language_loss": 0.75820208, "learning_rate": 2.011489056413418e-06, "loss": 0.77397335, "num_input_tokens_seen": 183553780, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.27526855, "step": 8536, "time_per_iteration": 2.694150686264038 }, { "auxiliary_loss_clip": 0.01331074, "auxiliary_loss_mlp": 0.00258636, "balance_loss_clip": 1.10406625, "balance_loss_mlp": 0.22518633, "epoch": 0.5132722080264542, "flos": 20230420446720.0, "grad_norm": 906.8943342572752, "language_loss": 0.80616939, "learning_rate": 2.011099600942669e-06, "loss": 0.82206649, "num_input_tokens_seen": 183572285, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.33447266, "step": 8537, "time_per_iteration": 2.633655548095703 }, { "auxiliary_loss_clip": 0.01315918, "auxiliary_loss_mlp": 0.00251759, "balance_loss_clip": 1.09922528, "balance_loss_mlp": 0.22157523, "epoch": 0.5133323312791223, "flos": 16469459930880.0, "grad_norm": 107.45650582590834, "language_loss": 0.87648046, "learning_rate": 2.0107101450510214e-06, "loss": 0.8921572, "num_input_tokens_seen": 183589330, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.30151367, "step": 8538, "time_per_iteration": 2.616544723510742 }, { "auxiliary_loss_clip": 0.01301845, "auxiliary_loss_mlp": 0.0025063, "balance_loss_clip": 1.08797443, "balance_loss_mlp": 0.22240105, "epoch": 0.5133924545317902, "flos": 26068094709120.0, "grad_norm": 188.41370126511399, "language_loss": 0.85025084, "learning_rate": 2.0103206887532437e-06, "loss": 0.86577559, "num_input_tokens_seen": 183609205, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.28222656, "step": 8539, "time_per_iteration": 2.752021551132202 }, { "auxiliary_loss_clip": 0.0132142, "auxiliary_loss_mlp": 0.00261287, "balance_loss_clip": 1.10646749, "balance_loss_mlp": 0.22993451, "epoch": 0.5134525777844582, "flos": 29131023248640.0, "grad_norm": 46.226696416787576, "language_loss": 0.81445551, "learning_rate": 2.009931232064105e-06, "loss": 0.83028269, "num_input_tokens_seen": 183629985, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.31347656, "step": 8540, "time_per_iteration": 2.7432947158813477 }, { "auxiliary_loss_clip": 0.01326475, "auxiliary_loss_mlp": 0.00290998, "balance_loss_clip": 1.10340285, "balance_loss_mlp": 0.25961018, "epoch": 0.5135127010371261, "flos": 17454776883840.0, "grad_norm": 13.229944814442021, "language_loss": 0.83198172, "learning_rate": 2.0095417749983724e-06, "loss": 0.84815639, "num_input_tokens_seen": 183648220, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.31384277, "step": 8541, "time_per_iteration": 2.704939842224121 }, { "auxiliary_loss_clip": 0.01295545, "auxiliary_loss_mlp": 0.00250004, "balance_loss_clip": 1.07975185, "balance_loss_mlp": 0.21981996, "epoch": 0.5135728242897941, "flos": 21944975316480.0, "grad_norm": 7.221556596690133, "language_loss": 0.76648796, "learning_rate": 2.0091523175708162e-06, "loss": 0.78194344, "num_input_tokens_seen": 183668230, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.30175781, "step": 8542, "time_per_iteration": 2.6243157386779785 }, { "auxiliary_loss_clip": 0.01301907, "auxiliary_loss_mlp": 0.00260607, "balance_loss_clip": 1.08591413, "balance_loss_mlp": 0.23092359, "epoch": 0.513632947542462, "flos": 22674859678080.0, "grad_norm": 40.626844672477986, "language_loss": 0.87501252, "learning_rate": 2.0087628597962023e-06, "loss": 0.89063764, "num_input_tokens_seen": 183687800, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.296875, "step": 8543, "time_per_iteration": 2.6885883808135986 }, { "auxiliary_loss_clip": 0.01296754, "auxiliary_loss_mlp": 0.00230516, "balance_loss_clip": 1.08696008, "balance_loss_mlp": 0.20146433, "epoch": 0.51369307079513, "flos": 29457163762560.0, "grad_norm": 14.696719186808535, "language_loss": 0.75738263, "learning_rate": 2.008373401689299e-06, "loss": 0.77265537, "num_input_tokens_seen": 183709025, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.29052734, "step": 8544, "time_per_iteration": 4.090541362762451 }, { "auxiliary_loss_clip": 0.0130241, "auxiliary_loss_mlp": 0.00239473, "balance_loss_clip": 1.08481574, "balance_loss_mlp": 0.21216168, "epoch": 0.513753194047798, "flos": 18989347680000.0, "grad_norm": 17.727606494495316, "language_loss": 0.78630614, "learning_rate": 2.0079839432648765e-06, "loss": 0.80172491, "num_input_tokens_seen": 183725740, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.27331543, "step": 8545, "time_per_iteration": 2.631136655807495 }, { "auxiliary_loss_clip": 0.01293643, "auxiliary_loss_mlp": 0.00269006, "balance_loss_clip": 1.07575846, "balance_loss_mlp": 0.23861967, "epoch": 0.513813317300466, "flos": 17821855923840.0, "grad_norm": 3.8445616251395447, "language_loss": 0.91920364, "learning_rate": 2.0075944845377016e-06, "loss": 0.93483013, "num_input_tokens_seen": 183743995, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.30383301, "step": 8546, "time_per_iteration": 2.672301769256592 }, { "auxiliary_loss_clip": 0.01297021, "auxiliary_loss_mlp": 0.00261467, "balance_loss_clip": 1.07361281, "balance_loss_mlp": 0.23196244, "epoch": 0.5138734405531339, "flos": 24061191045120.0, "grad_norm": 875.2027744885571, "language_loss": 0.81051064, "learning_rate": 2.007205025522544e-06, "loss": 0.82609558, "num_input_tokens_seen": 183764150, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.29528809, "step": 8547, "time_per_iteration": 4.110399007797241 }, { "auxiliary_loss_clip": 0.01297232, "auxiliary_loss_mlp": 0.00261492, "balance_loss_clip": 1.0787214, "balance_loss_mlp": 0.232131, "epoch": 0.5139335638058019, "flos": 26097253574400.0, "grad_norm": 36.50938696856157, "language_loss": 0.80683899, "learning_rate": 2.0068155662341702e-06, "loss": 0.82242632, "num_input_tokens_seen": 183783280, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.29370117, "step": 8548, "time_per_iteration": 2.720400810241699 }, { "auxiliary_loss_clip": 0.01289582, "auxiliary_loss_mlp": 0.0025819, "balance_loss_clip": 1.07923126, "balance_loss_mlp": 0.22895969, "epoch": 0.5139936870584698, "flos": 18917095472640.0, "grad_norm": 9.808784018770366, "language_loss": 0.87828523, "learning_rate": 2.0064261066873495e-06, "loss": 0.89376295, "num_input_tokens_seen": 183800725, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.29223633, "step": 8549, "time_per_iteration": 4.043980598449707 }, { "auxiliary_loss_clip": 0.01286517, "auxiliary_loss_mlp": 0.0024706, "balance_loss_clip": 1.07233524, "balance_loss_mlp": 0.21999986, "epoch": 0.5140538103111378, "flos": 16144001775360.0, "grad_norm": 55.970292748203875, "language_loss": 0.78338587, "learning_rate": 2.0060366468968504e-06, "loss": 0.79872167, "num_input_tokens_seen": 183818735, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.27087402, "step": 8550, "time_per_iteration": 2.587674140930176 }, { "auxiliary_loss_clip": 0.01297789, "auxiliary_loss_mlp": 0.00267732, "balance_loss_clip": 1.07515335, "balance_loss_mlp": 0.23535466, "epoch": 0.5141139335638057, "flos": 22420145358720.0, "grad_norm": 11.111596399133129, "language_loss": 0.8185634, "learning_rate": 2.0056471868774408e-06, "loss": 0.83421862, "num_input_tokens_seen": 183840015, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.32373047, "step": 8551, "time_per_iteration": 2.635011672973633 }, { "auxiliary_loss_clip": 0.01297583, "auxiliary_loss_mlp": 0.00235881, "balance_loss_clip": 1.085289, "balance_loss_mlp": 0.20759231, "epoch": 0.5141740568164738, "flos": 27089645506560.0, "grad_norm": 11.984774902095037, "language_loss": 0.77305079, "learning_rate": 2.0052577266438897e-06, "loss": 0.78838545, "num_input_tokens_seen": 183860145, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.28295898, "step": 8552, "time_per_iteration": 2.6556344032287598 }, { "auxiliary_loss_clip": 0.01290147, "auxiliary_loss_mlp": 0.00264361, "balance_loss_clip": 1.07222223, "balance_loss_mlp": 0.23496385, "epoch": 0.5142341800691418, "flos": 24973250209920.0, "grad_norm": 6.5990965493817635, "language_loss": 0.82633114, "learning_rate": 2.004868266210965e-06, "loss": 0.84187615, "num_input_tokens_seen": 183880540, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.29370117, "step": 8553, "time_per_iteration": 2.6707186698913574 }, { "auxiliary_loss_clip": 0.01291496, "auxiliary_loss_mlp": 0.0024763, "balance_loss_clip": 1.07806587, "balance_loss_mlp": 0.21947324, "epoch": 0.5142943033218097, "flos": 20704513080960.0, "grad_norm": 378.63544239676037, "language_loss": 0.75858963, "learning_rate": 2.004478805593435e-06, "loss": 0.77398086, "num_input_tokens_seen": 183900895, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.28161621, "step": 8554, "time_per_iteration": 4.071545600891113 }, { "auxiliary_loss_clip": 0.01307757, "auxiliary_loss_mlp": 0.00263298, "balance_loss_clip": 1.08481121, "balance_loss_mlp": 0.23226783, "epoch": 0.5143544265744777, "flos": 22925479847040.0, "grad_norm": 11.547098206660268, "language_loss": 0.80634993, "learning_rate": 2.004089344806068e-06, "loss": 0.82206047, "num_input_tokens_seen": 183920335, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.31030273, "step": 8555, "time_per_iteration": 2.666668653488159 }, { "auxiliary_loss_clip": 0.01286898, "auxiliary_loss_mlp": 0.00235379, "balance_loss_clip": 1.0700618, "balance_loss_mlp": 0.20650639, "epoch": 0.5144145498271456, "flos": 15921391236480.0, "grad_norm": 5.352696268231434, "language_loss": 0.89968944, "learning_rate": 2.003699883863633e-06, "loss": 0.91491222, "num_input_tokens_seen": 183936220, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.28857422, "step": 8556, "time_per_iteration": 2.647824764251709 }, { "auxiliary_loss_clip": 0.01280145, "auxiliary_loss_mlp": 0.00228077, "balance_loss_clip": 1.0678122, "balance_loss_mlp": 0.20249483, "epoch": 0.5144746730798136, "flos": 19681238430720.0, "grad_norm": 452.46213459842505, "language_loss": 0.92241091, "learning_rate": 2.003310422780898e-06, "loss": 0.93749315, "num_input_tokens_seen": 183953250, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.25622559, "step": 8557, "time_per_iteration": 2.638580799102783 }, { "auxiliary_loss_clip": 0.0127578, "auxiliary_loss_mlp": 0.00218879, "balance_loss_clip": 1.0653764, "balance_loss_mlp": 0.19358285, "epoch": 0.5145347963324816, "flos": 23914711382400.0, "grad_norm": 51.013453990213875, "language_loss": 0.94977373, "learning_rate": 2.0029209615726307e-06, "loss": 0.96472025, "num_input_tokens_seen": 183973865, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.25317383, "step": 8558, "time_per_iteration": 2.6791610717773438 }, { "auxiliary_loss_clip": 0.0127514, "auxiliary_loss_mlp": 0.00258982, "balance_loss_clip": 1.06236219, "balance_loss_mlp": 0.23156349, "epoch": 0.5145949195851496, "flos": 18260002022400.0, "grad_norm": 231.87787367061324, "language_loss": 0.74010587, "learning_rate": 2.002531500253602e-06, "loss": 0.75544709, "num_input_tokens_seen": 183992555, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.27453613, "step": 8559, "time_per_iteration": 2.6561360359191895 }, { "auxiliary_loss_clip": 0.01288188, "auxiliary_loss_mlp": 0.00256042, "balance_loss_clip": 1.07073343, "balance_loss_mlp": 0.22743168, "epoch": 0.5146550428378175, "flos": 26213425136640.0, "grad_norm": 5791.281651634087, "language_loss": 0.70384026, "learning_rate": 2.002142038838577e-06, "loss": 0.71928251, "num_input_tokens_seen": 184010825, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.28625488, "step": 8560, "time_per_iteration": 2.71158504486084 }, { "auxiliary_loss_clip": 0.01287331, "auxiliary_loss_mlp": 0.00247512, "balance_loss_clip": 1.07280529, "balance_loss_mlp": 0.21949756, "epoch": 0.5147151660904855, "flos": 22674177319680.0, "grad_norm": 3.8628767851488894, "language_loss": 0.7752738, "learning_rate": 2.0017525773423265e-06, "loss": 0.79062223, "num_input_tokens_seen": 184030155, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.28027344, "step": 8561, "time_per_iteration": 2.7872166633605957 }, { "auxiliary_loss_clip": 0.01307695, "auxiliary_loss_mlp": 0.00243943, "balance_loss_clip": 1.08933139, "balance_loss_mlp": 0.21793202, "epoch": 0.5147752893431534, "flos": 24972388283520.0, "grad_norm": 32.393471333260095, "language_loss": 0.72911817, "learning_rate": 2.0013631157796177e-06, "loss": 0.74463451, "num_input_tokens_seen": 184051440, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.26025391, "step": 8562, "time_per_iteration": 2.696347713470459 }, { "auxiliary_loss_clip": 0.01308141, "auxiliary_loss_mlp": 0.00220042, "balance_loss_clip": 1.09145832, "balance_loss_mlp": 0.19457905, "epoch": 0.5148354125958214, "flos": 22744669760640.0, "grad_norm": 20.63804222226166, "language_loss": 0.83104783, "learning_rate": 2.0009736541652188e-06, "loss": 0.84632969, "num_input_tokens_seen": 184070205, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.2545166, "step": 8563, "time_per_iteration": 2.6580114364624023 }, { "auxiliary_loss_clip": 0.0131986, "auxiliary_loss_mlp": 0.00265817, "balance_loss_clip": 1.09543014, "balance_loss_mlp": 0.23581225, "epoch": 0.5148955358484893, "flos": 23068763199360.0, "grad_norm": 2.289097278488612, "language_loss": 0.91857684, "learning_rate": 2.0005841925139e-06, "loss": 0.9344337, "num_input_tokens_seen": 184087345, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.29992676, "step": 8564, "time_per_iteration": 2.6317920684814453 }, { "auxiliary_loss_clip": 0.01337094, "auxiliary_loss_mlp": 0.00261522, "balance_loss_clip": 1.10785747, "balance_loss_mlp": 0.230039, "epoch": 0.5149556591011574, "flos": 20340127560960.0, "grad_norm": 55.083333760847076, "language_loss": 0.80285871, "learning_rate": 2.0001947308404283e-06, "loss": 0.81884491, "num_input_tokens_seen": 184107110, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.31481934, "step": 8565, "time_per_iteration": 2.728675127029419 }, { "auxiliary_loss_clip": 0.01321818, "auxiliary_loss_mlp": 0.00247125, "balance_loss_clip": 1.09542704, "balance_loss_mlp": 0.217418, "epoch": 0.5150157823538254, "flos": 22638230784000.0, "grad_norm": 6.080678467400373, "language_loss": 0.77547866, "learning_rate": 1.9998052691595715e-06, "loss": 0.79116809, "num_input_tokens_seen": 184127105, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.29736328, "step": 8566, "time_per_iteration": 2.6549859046936035 }, { "auxiliary_loss_clip": 0.01335141, "auxiliary_loss_mlp": 0.0024912, "balance_loss_clip": 1.10126126, "balance_loss_mlp": 0.21888858, "epoch": 0.5150759056064933, "flos": 26067627832320.0, "grad_norm": 4.521346618051375, "language_loss": 0.85315359, "learning_rate": 1.9994158074861005e-06, "loss": 0.86899614, "num_input_tokens_seen": 184148060, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.30224609, "step": 8567, "time_per_iteration": 2.7024617195129395 }, { "auxiliary_loss_clip": 0.01337695, "auxiliary_loss_mlp": 0.00242906, "balance_loss_clip": 1.11323881, "balance_loss_mlp": 0.21517825, "epoch": 0.5151360288591613, "flos": 25952641418880.0, "grad_norm": 16.484585625290137, "language_loss": 0.86282456, "learning_rate": 1.9990263458347806e-06, "loss": 0.87863064, "num_input_tokens_seen": 184166175, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.27734375, "step": 8568, "time_per_iteration": 2.642699718475342 }, { "auxiliary_loss_clip": 0.01315119, "auxiliary_loss_mlp": 0.00224464, "balance_loss_clip": 1.09843147, "balance_loss_mlp": 0.19737956, "epoch": 0.5151961521118292, "flos": 18507246312960.0, "grad_norm": 11.16696037472944, "language_loss": 0.97975516, "learning_rate": 1.9986368842203825e-06, "loss": 0.99515104, "num_input_tokens_seen": 184182600, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.27087402, "step": 8569, "time_per_iteration": 2.651681423187256 }, { "auxiliary_loss_clip": 0.01328343, "auxiliary_loss_mlp": 0.0023875, "balance_loss_clip": 1.09964669, "balance_loss_mlp": 0.20835175, "epoch": 0.5152562753644973, "flos": 22233696837120.0, "grad_norm": 71.67685700624712, "language_loss": 0.84806287, "learning_rate": 1.998247422657674e-06, "loss": 0.86373377, "num_input_tokens_seen": 184202020, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.30395508, "step": 8570, "time_per_iteration": 2.609334707260132 }, { "auxiliary_loss_clip": 0.01331193, "auxiliary_loss_mlp": 0.0022857, "balance_loss_clip": 1.10132003, "balance_loss_mlp": 0.20013812, "epoch": 0.5153163986171652, "flos": 38436555047040.0, "grad_norm": 199.2491366618405, "language_loss": 0.80903387, "learning_rate": 1.9978579611614227e-06, "loss": 0.82463145, "num_input_tokens_seen": 184224850, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.28430176, "step": 8571, "time_per_iteration": 2.765758752822876 }, { "auxiliary_loss_clip": 0.01384939, "auxiliary_loss_mlp": 0.0005828, "balance_loss_clip": 1.23951638, "balance_loss_mlp": 0.04669261, "epoch": 0.5153765218698332, "flos": 66384503015040.0, "grad_norm": 0.7674170452075944, "language_loss": 0.52797455, "learning_rate": 1.9974684997463984e-06, "loss": 0.54240674, "num_input_tokens_seen": 184288520, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.11572266, "step": 8572, "time_per_iteration": 3.1619808673858643 }, { "auxiliary_loss_clip": 0.01333933, "auxiliary_loss_mlp": 0.00221298, "balance_loss_clip": 1.11731601, "balance_loss_mlp": 0.19583479, "epoch": 0.5154366451225011, "flos": 24024669891840.0, "grad_norm": 85.9455454690201, "language_loss": 0.85021031, "learning_rate": 1.9970790384273687e-06, "loss": 0.86576271, "num_input_tokens_seen": 184308565, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.2545166, "step": 8573, "time_per_iteration": 2.6907858848571777 }, { "auxiliary_loss_clip": 0.01332647, "auxiliary_loss_mlp": 0.00252731, "balance_loss_clip": 1.11287367, "balance_loss_mlp": 0.22410847, "epoch": 0.5154967683751691, "flos": 23468843859840.0, "grad_norm": 5.74131350529177, "language_loss": 0.85802698, "learning_rate": 1.996689577219102e-06, "loss": 0.8738808, "num_input_tokens_seen": 184326795, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.28625488, "step": 8574, "time_per_iteration": 2.6767327785491943 }, { "auxiliary_loss_clip": 0.01319174, "auxiliary_loss_mlp": 0.00202717, "balance_loss_clip": 1.10463369, "balance_loss_mlp": 0.17750394, "epoch": 0.515556891627837, "flos": 23805650712960.0, "grad_norm": 3.1313864504382325, "language_loss": 0.92614877, "learning_rate": 1.996300116136367e-06, "loss": 0.94136769, "num_input_tokens_seen": 184345990, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.25183105, "step": 8575, "time_per_iteration": 2.6547799110412598 }, { "auxiliary_loss_clip": 0.0134158, "auxiliary_loss_mlp": 0.00226592, "balance_loss_clip": 1.11591101, "balance_loss_mlp": 0.19868523, "epoch": 0.515617014880505, "flos": 19828544106240.0, "grad_norm": 2.1295550859535206, "language_loss": 0.83520442, "learning_rate": 1.995910655193932e-06, "loss": 0.85088617, "num_input_tokens_seen": 184366300, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.27893066, "step": 8576, "time_per_iteration": 2.690713405609131 }, { "auxiliary_loss_clip": 0.0134383, "auxiliary_loss_mlp": 0.00246587, "balance_loss_clip": 1.10865152, "balance_loss_mlp": 0.21667689, "epoch": 0.515677138133173, "flos": 14245907385600.0, "grad_norm": 21.459571615651747, "language_loss": 0.83687729, "learning_rate": 1.9955211944065654e-06, "loss": 0.85278147, "num_input_tokens_seen": 184383030, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.29882812, "step": 8577, "time_per_iteration": 2.6270253658294678 }, { "auxiliary_loss_clip": 0.0134378, "auxiliary_loss_mlp": 0.00252899, "balance_loss_clip": 1.11752808, "balance_loss_mlp": 0.22368127, "epoch": 0.515737261385841, "flos": 28289707920000.0, "grad_norm": 154.19297646956448, "language_loss": 0.87831336, "learning_rate": 1.9951317337890353e-06, "loss": 0.89428014, "num_input_tokens_seen": 184403410, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.29223633, "step": 8578, "time_per_iteration": 2.689704418182373 }, { "auxiliary_loss_clip": 0.01324013, "auxiliary_loss_mlp": 0.00226677, "balance_loss_clip": 1.10439241, "balance_loss_mlp": 0.19960438, "epoch": 0.515797384638509, "flos": 27891925729920.0, "grad_norm": 264.2871326908366, "language_loss": 0.84674436, "learning_rate": 1.9947422733561105e-06, "loss": 0.86225128, "num_input_tokens_seen": 184423830, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.27075195, "step": 8579, "time_per_iteration": 2.706963062286377 }, { "auxiliary_loss_clip": 0.01318984, "auxiliary_loss_mlp": 0.00220936, "balance_loss_clip": 1.09877777, "balance_loss_mlp": 0.19389959, "epoch": 0.5158575078911769, "flos": 23040071210880.0, "grad_norm": 1077.631278717671, "language_loss": 0.87862098, "learning_rate": 1.994352813122559e-06, "loss": 0.8940202, "num_input_tokens_seen": 184445050, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.27038574, "step": 8580, "time_per_iteration": 2.6637191772460938 }, { "auxiliary_loss_clip": 0.01340958, "auxiliary_loss_mlp": 0.00247792, "balance_loss_clip": 1.11320984, "balance_loss_mlp": 0.22086224, "epoch": 0.5159176311438449, "flos": 12641346938880.0, "grad_norm": 14.58522689442267, "language_loss": 0.80746472, "learning_rate": 1.99396335310315e-06, "loss": 0.82335222, "num_input_tokens_seen": 184460775, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.26904297, "step": 8581, "time_per_iteration": 2.578061103820801 }, { "auxiliary_loss_clip": 0.01319238, "auxiliary_loss_mlp": 0.00212755, "balance_loss_clip": 1.10381675, "balance_loss_mlp": 0.18837646, "epoch": 0.5159777543965128, "flos": 15558154951680.0, "grad_norm": 15.651098357926333, "language_loss": 0.82455951, "learning_rate": 1.9935738933126508e-06, "loss": 0.83987945, "num_input_tokens_seen": 184477365, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24389648, "step": 8582, "time_per_iteration": 2.6051039695739746 }, { "auxiliary_loss_clip": 0.01324711, "auxiliary_loss_mlp": 0.00201613, "balance_loss_clip": 1.1060257, "balance_loss_mlp": 0.17747357, "epoch": 0.5160378776491809, "flos": 23221671396480.0, "grad_norm": 7.939311349582776, "language_loss": 0.73655993, "learning_rate": 1.99318443376583e-06, "loss": 0.75182319, "num_input_tokens_seen": 184497045, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24145508, "step": 8583, "time_per_iteration": 2.61879301071167 }, { "auxiliary_loss_clip": 0.01339389, "auxiliary_loss_mlp": 0.00220566, "balance_loss_clip": 1.11274397, "balance_loss_mlp": 0.19383934, "epoch": 0.5160980009018488, "flos": 21944616180480.0, "grad_norm": 13.460536451188158, "language_loss": 0.81705421, "learning_rate": 1.9927949744774568e-06, "loss": 0.83265376, "num_input_tokens_seen": 184517675, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26733398, "step": 8584, "time_per_iteration": 2.6456823348999023 }, { "auxiliary_loss_clip": 0.01320012, "auxiliary_loss_mlp": 0.00237592, "balance_loss_clip": 1.09805095, "balance_loss_mlp": 0.21043663, "epoch": 0.5161581241545168, "flos": 22784064001920.0, "grad_norm": 11.749811223651335, "language_loss": 0.86240935, "learning_rate": 1.9924055154622983e-06, "loss": 0.87798536, "num_input_tokens_seen": 184537745, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.27172852, "step": 8585, "time_per_iteration": 2.6957192420959473 }, { "auxiliary_loss_clip": 0.01313895, "auxiliary_loss_mlp": 0.00202031, "balance_loss_clip": 1.09514451, "balance_loss_mlp": 0.17699692, "epoch": 0.5162182474071847, "flos": 19675384513920.0, "grad_norm": 25.309562775214182, "language_loss": 0.88441128, "learning_rate": 1.9920160567351238e-06, "loss": 0.89957052, "num_input_tokens_seen": 184553630, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.25012207, "step": 8586, "time_per_iteration": 4.149962663650513 }, { "auxiliary_loss_clip": 0.0132521, "auxiliary_loss_mlp": 0.00214316, "balance_loss_clip": 1.10531294, "balance_loss_mlp": 0.18923484, "epoch": 0.5162783706598527, "flos": 20046198568320.0, "grad_norm": 15.206442345567124, "language_loss": 0.78895462, "learning_rate": 1.991626598310701e-06, "loss": 0.80434984, "num_input_tokens_seen": 184573530, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25085449, "step": 8587, "time_per_iteration": 2.6840264797210693 }, { "auxiliary_loss_clip": 0.01319361, "auxiliary_loss_mlp": 0.00047534, "balance_loss_clip": 1.1775583, "balance_loss_mlp": 0.0399043, "epoch": 0.5163384939125206, "flos": 69959553713280.0, "grad_norm": 2.3997974905760837, "language_loss": 0.57302713, "learning_rate": 1.9912371402037984e-06, "loss": 0.58669615, "num_input_tokens_seen": 184637875, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.07617188, "step": 8588, "time_per_iteration": 3.092831611633301 }, { "auxiliary_loss_clip": 0.01335321, "auxiliary_loss_mlp": 0.00249439, "balance_loss_clip": 1.11079848, "balance_loss_mlp": 0.21901673, "epoch": 0.5163986171651886, "flos": 17417034668160.0, "grad_norm": 19.08288734515957, "language_loss": 0.8247695, "learning_rate": 1.990847682429185e-06, "loss": 0.84061712, "num_input_tokens_seen": 184656125, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.30407715, "step": 8589, "time_per_iteration": 4.031512498855591 }, { "auxiliary_loss_clip": 0.01318473, "auxiliary_loss_mlp": 0.00231487, "balance_loss_clip": 1.0975312, "balance_loss_mlp": 0.20516543, "epoch": 0.5164587404178566, "flos": 21322679166720.0, "grad_norm": 1.701975919176331, "language_loss": 0.74344289, "learning_rate": 1.990458225001627e-06, "loss": 0.75894248, "num_input_tokens_seen": 184675920, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.26318359, "step": 8590, "time_per_iteration": 2.6488611698150635 }, { "auxiliary_loss_clip": 0.01338771, "auxiliary_loss_mlp": 0.00017809, "balance_loss_clip": 1.19438815, "balance_loss_mlp": 0.01013229, "epoch": 0.5165188636705246, "flos": 68057149691520.0, "grad_norm": 0.813317025299719, "language_loss": 0.55511916, "learning_rate": 1.990068767935895e-06, "loss": 0.56868494, "num_input_tokens_seen": 184730520, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.07666016, "step": 8591, "time_per_iteration": 4.382440090179443 }, { "auxiliary_loss_clip": 0.01306323, "auxiliary_loss_mlp": 0.00189013, "balance_loss_clip": 1.09462929, "balance_loss_mlp": 0.16414635, "epoch": 0.5165789869231926, "flos": 19385657412480.0, "grad_norm": 5.366405874738355, "language_loss": 0.88311017, "learning_rate": 1.9896793112467566e-06, "loss": 0.89806354, "num_input_tokens_seen": 184748340, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.2487793, "step": 8592, "time_per_iteration": 2.613340377807617 }, { "auxiliary_loss_clip": 0.01317316, "auxiliary_loss_mlp": 0.00197892, "balance_loss_clip": 1.09665811, "balance_loss_mlp": 0.17306118, "epoch": 0.5166391101758605, "flos": 20960197067520.0, "grad_norm": 5.440980167407868, "language_loss": 0.89525592, "learning_rate": 1.989289854948979e-06, "loss": 0.91040808, "num_input_tokens_seen": 184766615, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24841309, "step": 8593, "time_per_iteration": 2.632797956466675 }, { "auxiliary_loss_clip": 0.01312381, "auxiliary_loss_mlp": 0.00218855, "balance_loss_clip": 1.09418201, "balance_loss_mlp": 0.19224782, "epoch": 0.5166992334285285, "flos": 29462407148160.0, "grad_norm": 6.549722289059256, "language_loss": 0.7743811, "learning_rate": 1.9889003990573314e-06, "loss": 0.78969347, "num_input_tokens_seen": 184788075, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.26635742, "step": 8594, "time_per_iteration": 2.6878552436828613 }, { "auxiliary_loss_clip": 0.01315177, "auxiliary_loss_mlp": 0.00223983, "balance_loss_clip": 1.09368503, "balance_loss_mlp": 0.19672005, "epoch": 0.5167593566811964, "flos": 20304360593280.0, "grad_norm": 2.2589584522222106, "language_loss": 0.84031677, "learning_rate": 1.988510943586582e-06, "loss": 0.85570836, "num_input_tokens_seen": 184808710, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.27246094, "step": 8595, "time_per_iteration": 2.668076276779175 }, { "auxiliary_loss_clip": 0.01311853, "auxiliary_loss_mlp": 0.00201612, "balance_loss_clip": 1.09468818, "balance_loss_mlp": 0.17581493, "epoch": 0.5168194799338645, "flos": 14611370313600.0, "grad_norm": 3.2040018096237026, "language_loss": 0.72905242, "learning_rate": 1.9881214885514986e-06, "loss": 0.74418712, "num_input_tokens_seen": 184826475, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25817871, "step": 8596, "time_per_iteration": 4.061712980270386 }, { "auxiliary_loss_clip": 0.01326554, "auxiliary_loss_mlp": 0.00223976, "balance_loss_clip": 1.10260963, "balance_loss_mlp": 0.19602151, "epoch": 0.5168796031865324, "flos": 25007257411200.0, "grad_norm": 6.533144725638429, "language_loss": 0.82277769, "learning_rate": 1.9877320339668492e-06, "loss": 0.83828306, "num_input_tokens_seen": 184845245, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.27929688, "step": 8597, "time_per_iteration": 2.6552045345306396 }, { "auxiliary_loss_clip": 0.01314653, "auxiliary_loss_mlp": 0.00238049, "balance_loss_clip": 1.0939362, "balance_loss_mlp": 0.21086955, "epoch": 0.5169397264392004, "flos": 26939969533440.0, "grad_norm": 2.9797213594697194, "language_loss": 0.87655175, "learning_rate": 1.987342579847403e-06, "loss": 0.89207876, "num_input_tokens_seen": 184866605, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.27185059, "step": 8598, "time_per_iteration": 2.6864242553710938 }, { "auxiliary_loss_clip": 0.01312766, "auxiliary_loss_mlp": 0.00196374, "balance_loss_clip": 1.09082985, "balance_loss_mlp": 0.17066008, "epoch": 0.5169998496918683, "flos": 25407804948480.0, "grad_norm": 2.0114478106457696, "language_loss": 0.81687874, "learning_rate": 1.9869531262079273e-06, "loss": 0.83197016, "num_input_tokens_seen": 184886945, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25720215, "step": 8599, "time_per_iteration": 2.6735033988952637 }, { "auxiliary_loss_clip": 0.0128592, "auxiliary_loss_mlp": 0.00189435, "balance_loss_clip": 1.07020307, "balance_loss_mlp": 0.16452008, "epoch": 0.5170599729445363, "flos": 24680793674880.0, "grad_norm": 14.10689567765216, "language_loss": 0.80009121, "learning_rate": 1.9865636730631904e-06, "loss": 0.81484473, "num_input_tokens_seen": 184905590, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24926758, "step": 8600, "time_per_iteration": 2.672043561935425 }, { "auxiliary_loss_clip": 0.01296991, "auxiliary_loss_mlp": 0.00198478, "balance_loss_clip": 1.07603955, "balance_loss_mlp": 0.17230015, "epoch": 0.5171200961972042, "flos": 20994455664000.0, "grad_norm": 14.414504337102118, "language_loss": 0.8132264, "learning_rate": 1.9861742204279602e-06, "loss": 0.82818109, "num_input_tokens_seen": 184925555, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26196289, "step": 8601, "time_per_iteration": 2.628859043121338 }, { "auxiliary_loss_clip": 0.0130074, "auxiliary_loss_mlp": 0.00223649, "balance_loss_clip": 1.07575691, "balance_loss_mlp": 0.19660079, "epoch": 0.5171802194498722, "flos": 22745639427840.0, "grad_norm": 13.632469094008558, "language_loss": 0.9478339, "learning_rate": 1.9857847683170045e-06, "loss": 0.9630779, "num_input_tokens_seen": 184944490, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.27075195, "step": 8602, "time_per_iteration": 2.634979486465454 }, { "auxiliary_loss_clip": 0.01293552, "auxiliary_loss_mlp": 0.00232861, "balance_loss_clip": 1.07399321, "balance_loss_mlp": 0.2063604, "epoch": 0.5172403427025402, "flos": 28176732668160.0, "grad_norm": 54222.05134740459, "language_loss": 0.82945317, "learning_rate": 1.9853953167450926e-06, "loss": 0.84471732, "num_input_tokens_seen": 184963190, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26501465, "step": 8603, "time_per_iteration": 2.704261541366577 }, { "auxiliary_loss_clip": 0.01283399, "auxiliary_loss_mlp": 0.00230765, "balance_loss_clip": 1.06218183, "balance_loss_mlp": 0.20468217, "epoch": 0.5173004659552082, "flos": 20337829090560.0, "grad_norm": 167.44533959382747, "language_loss": 0.82652575, "learning_rate": 1.9850058657269915e-06, "loss": 0.84166741, "num_input_tokens_seen": 184981220, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.2611084, "step": 8604, "time_per_iteration": 2.679879903793335 }, { "auxiliary_loss_clip": 0.01284267, "auxiliary_loss_mlp": 0.00239042, "balance_loss_clip": 1.05885112, "balance_loss_mlp": 0.21062236, "epoch": 0.5173605892078762, "flos": 19063323740160.0, "grad_norm": 80.16575033886134, "language_loss": 0.92928863, "learning_rate": 1.984616415277469e-06, "loss": 0.94452173, "num_input_tokens_seen": 184998810, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.28417969, "step": 8605, "time_per_iteration": 2.6371939182281494 }, { "auxiliary_loss_clip": 0.01267505, "auxiliary_loss_mlp": 0.00213505, "balance_loss_clip": 1.04913759, "balance_loss_mlp": 0.18689741, "epoch": 0.5174207124605441, "flos": 27995168396160.0, "grad_norm": 159.85669288214112, "language_loss": 0.71462601, "learning_rate": 1.984226965411294e-06, "loss": 0.7294361, "num_input_tokens_seen": 185021185, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.26611328, "step": 8606, "time_per_iteration": 2.7054202556610107 }, { "auxiliary_loss_clip": 0.01284186, "auxiliary_loss_mlp": 0.00235089, "balance_loss_clip": 1.06681526, "balance_loss_mlp": 0.20846979, "epoch": 0.5174808357132121, "flos": 19496657416320.0, "grad_norm": 930.4462231236995, "language_loss": 0.85359865, "learning_rate": 1.983837516143234e-06, "loss": 0.8687914, "num_input_tokens_seen": 185038465, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.26611328, "step": 8607, "time_per_iteration": 2.6510581970214844 }, { "auxiliary_loss_clip": 0.01275007, "auxiliary_loss_mlp": 0.00225475, "balance_loss_clip": 1.05536973, "balance_loss_mlp": 0.19974984, "epoch": 0.51754095896588, "flos": 22784171742720.0, "grad_norm": 9.903491754278866, "language_loss": 0.79113817, "learning_rate": 1.983448067488057e-06, "loss": 0.80614299, "num_input_tokens_seen": 185057340, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25708008, "step": 8608, "time_per_iteration": 2.721646308898926 }, { "auxiliary_loss_clip": 0.01276597, "auxiliary_loss_mlp": 0.00248454, "balance_loss_clip": 1.05385506, "balance_loss_mlp": 0.22109517, "epoch": 0.5176010822185481, "flos": 22669257156480.0, "grad_norm": 141.52814354118655, "language_loss": 0.94053972, "learning_rate": 1.983058619460531e-06, "loss": 0.95579022, "num_input_tokens_seen": 185074935, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.27331543, "step": 8609, "time_per_iteration": 2.64128041267395 }, { "auxiliary_loss_clip": 0.01283014, "auxiliary_loss_mlp": 0.00233277, "balance_loss_clip": 1.06425142, "balance_loss_mlp": 0.2071943, "epoch": 0.517661205471216, "flos": 23951196622080.0, "grad_norm": 9.900649696420677, "language_loss": 0.80362856, "learning_rate": 1.9826691720754237e-06, "loss": 0.81879145, "num_input_tokens_seen": 185095050, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26074219, "step": 8610, "time_per_iteration": 2.6624159812927246 }, { "auxiliary_loss_clip": 0.01300046, "auxiliary_loss_mlp": 0.00253027, "balance_loss_clip": 1.07097661, "balance_loss_mlp": 0.22477484, "epoch": 0.517721328723884, "flos": 15596076735360.0, "grad_norm": 2.256936561840729, "language_loss": 0.77563447, "learning_rate": 1.9822797253475034e-06, "loss": 0.79116517, "num_input_tokens_seen": 185112275, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.2824707, "step": 8611, "time_per_iteration": 2.681755304336548 }, { "auxiliary_loss_clip": 0.01293194, "auxiliary_loss_mlp": 0.00229981, "balance_loss_clip": 1.07303178, "balance_loss_mlp": 0.20427951, "epoch": 0.5177814519765519, "flos": 20960197067520.0, "grad_norm": 6.116958344400622, "language_loss": 0.86210734, "learning_rate": 1.9818902792915373e-06, "loss": 0.87733912, "num_input_tokens_seen": 185132165, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25708008, "step": 8612, "time_per_iteration": 2.6607611179351807 }, { "auxiliary_loss_clip": 0.01277901, "auxiliary_loss_mlp": 0.00248759, "balance_loss_clip": 1.05905247, "balance_loss_mlp": 0.22136468, "epoch": 0.5178415752292199, "flos": 17967832796160.0, "grad_norm": 18.110813058834033, "language_loss": 0.90601897, "learning_rate": 1.981500833922294e-06, "loss": 0.92128551, "num_input_tokens_seen": 185151025, "router_z_loss_clip": 2.18652344, "router_z_loss_mlp": 0.27392578, "step": 8613, "time_per_iteration": 2.6280791759490967 }, { "auxiliary_loss_clip": 0.01305065, "auxiliary_loss_mlp": 0.002828, "balance_loss_clip": 1.07700193, "balance_loss_mlp": 0.25373715, "epoch": 0.5179016984818878, "flos": 17821496787840.0, "grad_norm": 6.0259859239729545, "language_loss": 0.76304018, "learning_rate": 1.981111389254541e-06, "loss": 0.77891886, "num_input_tokens_seen": 185168455, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.29077148, "step": 8614, "time_per_iteration": 2.599669933319092 }, { "auxiliary_loss_clip": 0.01306168, "auxiliary_loss_mlp": 0.002399, "balance_loss_clip": 1.08273339, "balance_loss_mlp": 0.21274462, "epoch": 0.5179618217345558, "flos": 17820455293440.0, "grad_norm": 10.358248357639125, "language_loss": 0.93970639, "learning_rate": 1.9807219453030453e-06, "loss": 0.95516706, "num_input_tokens_seen": 185184415, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.27124023, "step": 8615, "time_per_iteration": 2.6214077472686768 }, { "auxiliary_loss_clip": 0.0129652, "auxiliary_loss_mlp": 0.002445, "balance_loss_clip": 1.07841539, "balance_loss_mlp": 0.21785641, "epoch": 0.5180219449872238, "flos": 22522131048960.0, "grad_norm": 20.269198571240334, "language_loss": 0.87421489, "learning_rate": 1.9803325020825763e-06, "loss": 0.88962513, "num_input_tokens_seen": 185202910, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.26635742, "step": 8616, "time_per_iteration": 2.629202365875244 }, { "auxiliary_loss_clip": 0.01325585, "auxiliary_loss_mlp": 0.00248219, "balance_loss_clip": 1.09192061, "balance_loss_mlp": 0.22070578, "epoch": 0.5180820682398918, "flos": 23915465568000.0, "grad_norm": 8.150878452278226, "language_loss": 0.82821566, "learning_rate": 1.9799430596079e-06, "loss": 0.84395373, "num_input_tokens_seen": 185223085, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.2755127, "step": 8617, "time_per_iteration": 2.676535129547119 }, { "auxiliary_loss_clip": 0.01301004, "auxiliary_loss_mlp": 0.00246704, "balance_loss_clip": 1.07802773, "balance_loss_mlp": 0.22095503, "epoch": 0.5181421914925598, "flos": 16979930064000.0, "grad_norm": 13.244953608333724, "language_loss": 0.76943672, "learning_rate": 1.979553617893785e-06, "loss": 0.78491378, "num_input_tokens_seen": 185241295, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25732422, "step": 8618, "time_per_iteration": 2.6128787994384766 }, { "auxiliary_loss_clip": 0.01295324, "auxiliary_loss_mlp": 0.00161475, "balance_loss_clip": 1.15211248, "balance_loss_mlp": 0.15417896, "epoch": 0.5182023147452277, "flos": 66059870872320.0, "grad_norm": 0.9545370620787204, "language_loss": 0.67244083, "learning_rate": 1.979164176954999e-06, "loss": 0.6870088, "num_input_tokens_seen": 185298295, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.07275391, "step": 8619, "time_per_iteration": 3.0390384197235107 }, { "auxiliary_loss_clip": 0.01274166, "auxiliary_loss_mlp": 0.00219631, "balance_loss_clip": 1.05982089, "balance_loss_mlp": 0.19531232, "epoch": 0.5182624379978957, "flos": 18187749815040.0, "grad_norm": 12.262761695703075, "language_loss": 0.90321028, "learning_rate": 1.97877473680631e-06, "loss": 0.91814828, "num_input_tokens_seen": 185317000, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.24304199, "step": 8620, "time_per_iteration": 2.611311197280884 }, { "auxiliary_loss_clip": 0.01286982, "auxiliary_loss_mlp": 0.00234249, "balance_loss_clip": 1.06893444, "balance_loss_mlp": 0.20931, "epoch": 0.5183225612505636, "flos": 14026708638720.0, "grad_norm": 4.926298081453719, "language_loss": 0.88607043, "learning_rate": 1.9783852974624846e-06, "loss": 0.90128267, "num_input_tokens_seen": 185331185, "router_z_loss_clip": 2.18066406, "router_z_loss_mlp": 0.24951172, "step": 8621, "time_per_iteration": 2.59924578666687 }, { "auxiliary_loss_clip": 0.01294204, "auxiliary_loss_mlp": 0.00261849, "balance_loss_clip": 1.07130635, "balance_loss_mlp": 0.23533666, "epoch": 0.5183826845032317, "flos": 23659781581440.0, "grad_norm": 6.7251986023099795, "language_loss": 0.68911147, "learning_rate": 1.9779958589382905e-06, "loss": 0.70467198, "num_input_tokens_seen": 185348955, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.26501465, "step": 8622, "time_per_iteration": 2.718533992767334 }, { "auxiliary_loss_clip": 0.01292011, "auxiliary_loss_mlp": 0.00221011, "balance_loss_clip": 1.06258428, "balance_loss_mlp": 0.1945222, "epoch": 0.5184428077558996, "flos": 15888605097600.0, "grad_norm": 2073.3729197571492, "language_loss": 0.73585641, "learning_rate": 1.977606421248497e-06, "loss": 0.75098658, "num_input_tokens_seen": 185367330, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.26464844, "step": 8623, "time_per_iteration": 2.665191411972046 }, { "auxiliary_loss_clip": 0.01272937, "auxiliary_loss_mlp": 0.00214632, "balance_loss_clip": 1.05455029, "balance_loss_mlp": 0.18831092, "epoch": 0.5185029310085676, "flos": 21030833162880.0, "grad_norm": 19.3671805212736, "language_loss": 0.83541644, "learning_rate": 1.9772169844078685e-06, "loss": 0.85029221, "num_input_tokens_seen": 185385060, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.2635498, "step": 8624, "time_per_iteration": 2.6391501426696777 }, { "auxiliary_loss_clip": 0.01279013, "auxiliary_loss_mlp": 0.00238896, "balance_loss_clip": 1.05955136, "balance_loss_mlp": 0.21338472, "epoch": 0.5185630542612355, "flos": 26542690133760.0, "grad_norm": 29.4960340020081, "language_loss": 0.77757549, "learning_rate": 1.9768275484311756e-06, "loss": 0.79275453, "num_input_tokens_seen": 185403745, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25524902, "step": 8625, "time_per_iteration": 2.7565486431121826 }, { "auxiliary_loss_clip": 0.01284874, "auxiliary_loss_mlp": 0.00250887, "balance_loss_clip": 1.0653913, "balance_loss_mlp": 0.22632989, "epoch": 0.5186231775139035, "flos": 20668422890880.0, "grad_norm": 53.95443963680843, "language_loss": 0.77861977, "learning_rate": 1.976438113333184e-06, "loss": 0.79397738, "num_input_tokens_seen": 185422620, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24560547, "step": 8626, "time_per_iteration": 2.726149797439575 }, { "auxiliary_loss_clip": 0.01297756, "auxiliary_loss_mlp": 0.00267864, "balance_loss_clip": 1.07151377, "balance_loss_mlp": 0.24128026, "epoch": 0.5186833007665714, "flos": 20885502735360.0, "grad_norm": 14.332441578143106, "language_loss": 0.78466249, "learning_rate": 1.9760486791286612e-06, "loss": 0.80031866, "num_input_tokens_seen": 185439380, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26599121, "step": 8627, "time_per_iteration": 2.663118600845337 }, { "auxiliary_loss_clip": 0.01292981, "auxiliary_loss_mlp": 0.00254233, "balance_loss_clip": 1.06732202, "balance_loss_mlp": 0.22775689, "epoch": 0.5187434240192395, "flos": 20886903365760.0, "grad_norm": 22.60923316639077, "language_loss": 0.80407178, "learning_rate": 1.9756592458323753e-06, "loss": 0.8195439, "num_input_tokens_seen": 185458830, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26464844, "step": 8628, "time_per_iteration": 2.7386903762817383 }, { "auxiliary_loss_clip": 0.01296671, "auxiliary_loss_mlp": 0.00274133, "balance_loss_clip": 1.07525206, "balance_loss_mlp": 0.24629736, "epoch": 0.5188035472719074, "flos": 19859929614720.0, "grad_norm": 38.11802610076561, "language_loss": 0.83277678, "learning_rate": 1.9752698134590927e-06, "loss": 0.84848475, "num_input_tokens_seen": 185477270, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.27819824, "step": 8629, "time_per_iteration": 4.103916168212891 }, { "auxiliary_loss_clip": 0.01313243, "auxiliary_loss_mlp": 0.00260473, "balance_loss_clip": 1.08540797, "balance_loss_mlp": 0.23363894, "epoch": 0.5188636705245754, "flos": 21138313633920.0, "grad_norm": 53.51414726226495, "language_loss": 0.81479722, "learning_rate": 1.9748803820235815e-06, "loss": 0.83053434, "num_input_tokens_seen": 185495795, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.26818848, "step": 8630, "time_per_iteration": 2.7035200595855713 }, { "auxiliary_loss_clip": 0.01310709, "auxiliary_loss_mlp": 0.00290844, "balance_loss_clip": 1.0806241, "balance_loss_mlp": 0.26259172, "epoch": 0.5189237937772434, "flos": 22419786222720.0, "grad_norm": 11.436073132060754, "language_loss": 0.86933845, "learning_rate": 1.9744909515406093e-06, "loss": 0.88535398, "num_input_tokens_seen": 185514885, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.28271484, "step": 8631, "time_per_iteration": 4.179270029067993 }, { "auxiliary_loss_clip": 0.01324533, "auxiliary_loss_mlp": 0.00281049, "balance_loss_clip": 1.09200728, "balance_loss_mlp": 0.25274852, "epoch": 0.5189839170299113, "flos": 25446696399360.0, "grad_norm": 30.650740462938018, "language_loss": 0.81649858, "learning_rate": 1.974101522024942e-06, "loss": 0.8325544, "num_input_tokens_seen": 185537155, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.28283691, "step": 8632, "time_per_iteration": 2.686420202255249 }, { "auxiliary_loss_clip": 0.01311036, "auxiliary_loss_mlp": 0.0026564, "balance_loss_clip": 1.08331764, "balance_loss_mlp": 0.23857985, "epoch": 0.5190440402825793, "flos": 18587722734720.0, "grad_norm": 7.145416870206982, "language_loss": 0.88277382, "learning_rate": 1.9737120934913477e-06, "loss": 0.89854056, "num_input_tokens_seen": 185555520, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.27075195, "step": 8633, "time_per_iteration": 4.1167073249816895 }, { "auxiliary_loss_clip": 0.01310849, "auxiliary_loss_mlp": 0.0027519, "balance_loss_clip": 1.08211291, "balance_loss_mlp": 0.24860635, "epoch": 0.5191041635352472, "flos": 21908633731200.0, "grad_norm": 5.930323528373013, "language_loss": 0.87913632, "learning_rate": 1.9733226659545936e-06, "loss": 0.89499676, "num_input_tokens_seen": 185573855, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.26538086, "step": 8634, "time_per_iteration": 2.66119122505188 }, { "auxiliary_loss_clip": 0.01309518, "auxiliary_loss_mlp": 0.00280362, "balance_loss_clip": 1.08328152, "balance_loss_mlp": 0.25320628, "epoch": 0.5191642867879153, "flos": 27527971173120.0, "grad_norm": 4.570284894965837, "language_loss": 0.75948286, "learning_rate": 1.9729332394294467e-06, "loss": 0.77538168, "num_input_tokens_seen": 185595145, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.27148438, "step": 8635, "time_per_iteration": 2.731816291809082 }, { "auxiliary_loss_clip": 0.01324332, "auxiliary_loss_mlp": 0.00273576, "balance_loss_clip": 1.09277868, "balance_loss_mlp": 0.24609847, "epoch": 0.5192244100405832, "flos": 15705999331200.0, "grad_norm": 41.09422686173065, "language_loss": 0.84277689, "learning_rate": 1.9725438139306742e-06, "loss": 0.85875607, "num_input_tokens_seen": 185613320, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.27490234, "step": 8636, "time_per_iteration": 2.6545984745025635 }, { "auxiliary_loss_clip": 0.01341936, "auxiliary_loss_mlp": 0.0031264, "balance_loss_clip": 1.10430789, "balance_loss_mlp": 0.28247967, "epoch": 0.5192845332932512, "flos": 12057080313600.0, "grad_norm": 1014.9762881167925, "language_loss": 0.83429968, "learning_rate": 1.9721543894730425e-06, "loss": 0.85084546, "num_input_tokens_seen": 185630730, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.30200195, "step": 8637, "time_per_iteration": 2.67226243019104 }, { "auxiliary_loss_clip": 0.01319676, "auxiliary_loss_mlp": 0.00292478, "balance_loss_clip": 1.09043777, "balance_loss_mlp": 0.26429653, "epoch": 0.5193446565459191, "flos": 18953185662720.0, "grad_norm": 10.689998886399612, "language_loss": 0.81110716, "learning_rate": 1.9717649660713194e-06, "loss": 0.82722867, "num_input_tokens_seen": 185648515, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.28186035, "step": 8638, "time_per_iteration": 4.045186996459961 }, { "auxiliary_loss_clip": 0.01322262, "auxiliary_loss_mlp": 0.00300838, "balance_loss_clip": 1.0926125, "balance_loss_mlp": 0.27417123, "epoch": 0.5194047797985871, "flos": 20374960775040.0, "grad_norm": 10.383115079261245, "language_loss": 0.82125068, "learning_rate": 1.971375543740272e-06, "loss": 0.83748162, "num_input_tokens_seen": 185665220, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.2668457, "step": 8639, "time_per_iteration": 2.6569623947143555 }, { "auxiliary_loss_clip": 0.01326287, "auxiliary_loss_mlp": 0.00275685, "balance_loss_clip": 1.09884977, "balance_loss_mlp": 0.25123566, "epoch": 0.519464903051255, "flos": 24353001135360.0, "grad_norm": 2.1845887131679733, "language_loss": 0.85249388, "learning_rate": 1.9709861224946665e-06, "loss": 0.86851358, "num_input_tokens_seen": 185683750, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.24462891, "step": 8640, "time_per_iteration": 2.6652920246124268 }, { "auxiliary_loss_clip": 0.01329222, "auxiliary_loss_mlp": 0.00284524, "balance_loss_clip": 1.10180306, "balance_loss_mlp": 0.25788069, "epoch": 0.519525026303923, "flos": 14061829161600.0, "grad_norm": 3.695166871587914, "language_loss": 0.74391454, "learning_rate": 1.97059670234927e-06, "loss": 0.76005203, "num_input_tokens_seen": 185700625, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.26647949, "step": 8641, "time_per_iteration": 2.6432716846466064 }, { "auxiliary_loss_clip": 0.0132721, "auxiliary_loss_mlp": 0.00269582, "balance_loss_clip": 1.0995307, "balance_loss_mlp": 0.24540612, "epoch": 0.519585149556591, "flos": 28835873193600.0, "grad_norm": 2.008280157714009, "language_loss": 0.81664902, "learning_rate": 1.97020728331885e-06, "loss": 0.83261693, "num_input_tokens_seen": 185721155, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.24157715, "step": 8642, "time_per_iteration": 2.703280448913574 }, { "auxiliary_loss_clip": 0.0133184, "auxiliary_loss_mlp": 0.00256099, "balance_loss_clip": 1.09958887, "balance_loss_mlp": 0.22872832, "epoch": 0.519645272809259, "flos": 25373007648000.0, "grad_norm": 8.01486399021342, "language_loss": 0.88801515, "learning_rate": 1.9698178654181726e-06, "loss": 0.90389454, "num_input_tokens_seen": 185740990, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.27355957, "step": 8643, "time_per_iteration": 2.7444188594818115 }, { "auxiliary_loss_clip": 0.01320405, "auxiliary_loss_mlp": 0.0028674, "balance_loss_clip": 1.08792222, "balance_loss_mlp": 0.25896436, "epoch": 0.519705396061927, "flos": 25372863993600.0, "grad_norm": 7.906198530643072, "language_loss": 0.77309787, "learning_rate": 1.969428448662004e-06, "loss": 0.78916931, "num_input_tokens_seen": 185762235, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.27783203, "step": 8644, "time_per_iteration": 2.6926310062408447 }, { "auxiliary_loss_clip": 0.0131511, "auxiliary_loss_mlp": 0.00275494, "balance_loss_clip": 1.08808994, "balance_loss_mlp": 0.24968541, "epoch": 0.5197655193145949, "flos": 28476228268800.0, "grad_norm": 14.759422163629875, "language_loss": 0.87389207, "learning_rate": 1.9690390330651133e-06, "loss": 0.88979816, "num_input_tokens_seen": 185783415, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.2578125, "step": 8645, "time_per_iteration": 2.7297756671905518 }, { "auxiliary_loss_clip": 0.01330962, "auxiliary_loss_mlp": 0.00269505, "balance_loss_clip": 1.09778416, "balance_loss_mlp": 0.24263524, "epoch": 0.5198256425672629, "flos": 20009138711040.0, "grad_norm": 124.36435583925469, "language_loss": 0.85033131, "learning_rate": 1.968649618642264e-06, "loss": 0.86633599, "num_input_tokens_seen": 185801345, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.2689209, "step": 8646, "time_per_iteration": 2.598688840866089 }, { "auxiliary_loss_clip": 0.01320291, "auxiliary_loss_mlp": 0.0028153, "balance_loss_clip": 1.0899117, "balance_loss_mlp": 0.25550711, "epoch": 0.5198857658199308, "flos": 19828867328640.0, "grad_norm": 75.23612504362386, "language_loss": 0.75689185, "learning_rate": 1.9682602054082252e-06, "loss": 0.77291006, "num_input_tokens_seen": 185820815, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.26037598, "step": 8647, "time_per_iteration": 2.6993558406829834 }, { "auxiliary_loss_clip": 0.01333319, "auxiliary_loss_mlp": 0.00289904, "balance_loss_clip": 1.09821343, "balance_loss_mlp": 0.26072192, "epoch": 0.5199458890725989, "flos": 24461918150400.0, "grad_norm": 6.868184580827574, "language_loss": 0.80654681, "learning_rate": 1.967870793377763e-06, "loss": 0.82277906, "num_input_tokens_seen": 185841450, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.29199219, "step": 8648, "time_per_iteration": 2.750157117843628 }, { "auxiliary_loss_clip": 0.0133195, "auxiliary_loss_mlp": 0.00270392, "balance_loss_clip": 1.09645188, "balance_loss_mlp": 0.24306932, "epoch": 0.5200060123252668, "flos": 23404779953280.0, "grad_norm": 9.227708734424787, "language_loss": 0.72854418, "learning_rate": 1.967481382565642e-06, "loss": 0.74456751, "num_input_tokens_seen": 185859935, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.27331543, "step": 8649, "time_per_iteration": 2.6890242099761963 }, { "auxiliary_loss_clip": 0.01331341, "auxiliary_loss_mlp": 0.00295073, "balance_loss_clip": 1.0915643, "balance_loss_mlp": 0.26750013, "epoch": 0.5200661355779348, "flos": 17201355454080.0, "grad_norm": 28.20984074304388, "language_loss": 0.77995646, "learning_rate": 1.9670919729866315e-06, "loss": 0.79622054, "num_input_tokens_seen": 185876795, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.27563477, "step": 8650, "time_per_iteration": 2.652677536010742 }, { "auxiliary_loss_clip": 0.01307727, "auxiliary_loss_mlp": 0.00237762, "balance_loss_clip": 1.08314157, "balance_loss_mlp": 0.21214405, "epoch": 0.5201262588306027, "flos": 18515075477760.0, "grad_norm": 19.9176673921206, "language_loss": 0.841824, "learning_rate": 1.966702564655496e-06, "loss": 0.85727882, "num_input_tokens_seen": 185895570, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.25622559, "step": 8651, "time_per_iteration": 2.6604502201080322 }, { "auxiliary_loss_clip": 0.01341847, "auxiliary_loss_mlp": 0.00267777, "balance_loss_clip": 1.10352433, "balance_loss_mlp": 0.23826081, "epoch": 0.5201863820832707, "flos": 18619395552000.0, "grad_norm": 18.031691800563078, "language_loss": 0.87126696, "learning_rate": 1.966313157587003e-06, "loss": 0.8873632, "num_input_tokens_seen": 185913700, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.29528809, "step": 8652, "time_per_iteration": 2.6407690048217773 }, { "auxiliary_loss_clip": 0.01343781, "auxiliary_loss_mlp": 0.00280148, "balance_loss_clip": 1.10976744, "balance_loss_mlp": 0.25340983, "epoch": 0.5202465053359386, "flos": 22857142222080.0, "grad_norm": 6.7838242760156735, "language_loss": 0.76326478, "learning_rate": 1.9659237517959187e-06, "loss": 0.77950406, "num_input_tokens_seen": 185932460, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.26745605, "step": 8653, "time_per_iteration": 2.62227463722229 }, { "auxiliary_loss_clip": 0.01346852, "auxiliary_loss_mlp": 0.00267944, "balance_loss_clip": 1.10838294, "balance_loss_mlp": 0.24010868, "epoch": 0.5203066285886067, "flos": 21981532383360.0, "grad_norm": 29.01737080117579, "language_loss": 0.8490212, "learning_rate": 1.965534347297008e-06, "loss": 0.86516911, "num_input_tokens_seen": 185952030, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.27807617, "step": 8654, "time_per_iteration": 2.7160770893096924 }, { "auxiliary_loss_clip": 0.01349459, "auxiliary_loss_mlp": 0.00278441, "balance_loss_clip": 1.11193109, "balance_loss_mlp": 0.2502484, "epoch": 0.5203667518412746, "flos": 20233329448320.0, "grad_norm": 41.97198194775774, "language_loss": 0.88920355, "learning_rate": 1.9651449441050393e-06, "loss": 0.90548253, "num_input_tokens_seen": 185973130, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.28210449, "step": 8655, "time_per_iteration": 2.7061166763305664 }, { "auxiliary_loss_clip": 0.01343719, "auxiliary_loss_mlp": 0.002599, "balance_loss_clip": 1.11316657, "balance_loss_mlp": 0.23596242, "epoch": 0.5204268750939426, "flos": 15705460627200.0, "grad_norm": 2.904460162457369, "language_loss": 0.73942006, "learning_rate": 1.9647555422347777e-06, "loss": 0.75545621, "num_input_tokens_seen": 185990200, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.23962402, "step": 8656, "time_per_iteration": 2.593651056289673 }, { "auxiliary_loss_clip": 0.01334249, "auxiliary_loss_mlp": 0.00263116, "balance_loss_clip": 1.10126746, "balance_loss_mlp": 0.23622289, "epoch": 0.5204869983466105, "flos": 27449469999360.0, "grad_norm": 3.3974440960562724, "language_loss": 0.79630548, "learning_rate": 1.9643661417009893e-06, "loss": 0.81227916, "num_input_tokens_seen": 186009880, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.26879883, "step": 8657, "time_per_iteration": 2.6855216026306152 }, { "auxiliary_loss_clip": 0.01328123, "auxiliary_loss_mlp": 0.00248779, "balance_loss_clip": 1.0977577, "balance_loss_mlp": 0.21857163, "epoch": 0.5205471215992785, "flos": 20595452411520.0, "grad_norm": 6.441722959114923, "language_loss": 0.78776133, "learning_rate": 1.9639767425184408e-06, "loss": 0.8035304, "num_input_tokens_seen": 186026680, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.30200195, "step": 8658, "time_per_iteration": 2.6385529041290283 }, { "auxiliary_loss_clip": 0.01345539, "auxiliary_loss_mlp": 0.00268381, "balance_loss_clip": 1.11105633, "balance_loss_mlp": 0.24102271, "epoch": 0.5206072448519465, "flos": 22127904305280.0, "grad_norm": 722.5326566862364, "language_loss": 0.91238868, "learning_rate": 1.963587344701897e-06, "loss": 0.92852795, "num_input_tokens_seen": 186046920, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.27380371, "step": 8659, "time_per_iteration": 2.630228042602539 }, { "auxiliary_loss_clip": 0.01360091, "auxiliary_loss_mlp": 0.00247118, "balance_loss_clip": 1.11424279, "balance_loss_mlp": 0.21689895, "epoch": 0.5206673681046144, "flos": 18330422636160.0, "grad_norm": 388.6066750737373, "language_loss": 0.84363616, "learning_rate": 1.9631979482661253e-06, "loss": 0.85970831, "num_input_tokens_seen": 186062090, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.30212402, "step": 8660, "time_per_iteration": 2.616302013397217 }, { "auxiliary_loss_clip": 0.01356633, "auxiliary_loss_mlp": 0.00248476, "balance_loss_clip": 1.11729383, "balance_loss_mlp": 0.22297713, "epoch": 0.5207274913572825, "flos": 20230240878720.0, "grad_norm": 2.6026656518952236, "language_loss": 0.868016, "learning_rate": 1.9628085532258906e-06, "loss": 0.88406712, "num_input_tokens_seen": 186081135, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.25488281, "step": 8661, "time_per_iteration": 2.6394431591033936 }, { "auxiliary_loss_clip": 0.0135162, "auxiliary_loss_mlp": 0.00241473, "balance_loss_clip": 1.11457705, "balance_loss_mlp": 0.21610576, "epoch": 0.5207876146099504, "flos": 22127042378880.0, "grad_norm": 3.305803864250956, "language_loss": 0.7846961, "learning_rate": 1.9624191595959603e-06, "loss": 0.80062705, "num_input_tokens_seen": 186099700, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.25402832, "step": 8662, "time_per_iteration": 2.699007749557495 }, { "auxiliary_loss_clip": 0.01339224, "auxiliary_loss_mlp": 0.00237204, "balance_loss_clip": 1.1102953, "balance_loss_mlp": 0.21090662, "epoch": 0.5208477378626184, "flos": 23878908501120.0, "grad_norm": 142.7526016661227, "language_loss": 0.74581826, "learning_rate": 1.962029767391098e-06, "loss": 0.76158261, "num_input_tokens_seen": 186119740, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.26318359, "step": 8663, "time_per_iteration": 2.723644971847534 }, { "auxiliary_loss_clip": 0.01344352, "auxiliary_loss_mlp": 0.00235474, "balance_loss_clip": 1.11390424, "balance_loss_mlp": 0.21051157, "epoch": 0.5209078611152863, "flos": 20961525870720.0, "grad_norm": 150.70238817600153, "language_loss": 0.83769596, "learning_rate": 1.961640376626072e-06, "loss": 0.85349417, "num_input_tokens_seen": 186140645, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.24963379, "step": 8664, "time_per_iteration": 2.6786653995513916 }, { "auxiliary_loss_clip": 0.01345337, "auxiliary_loss_mlp": 0.00230667, "balance_loss_clip": 1.11205852, "balance_loss_mlp": 0.20243812, "epoch": 0.5209679843679543, "flos": 20667740532480.0, "grad_norm": 8.020126493434178, "language_loss": 0.86995244, "learning_rate": 1.961250987315646e-06, "loss": 0.8857125, "num_input_tokens_seen": 186160130, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.2824707, "step": 8665, "time_per_iteration": 2.6777002811431885 }, { "auxiliary_loss_clip": 0.0133407, "auxiliary_loss_mlp": 0.00223089, "balance_loss_clip": 1.10759115, "balance_loss_mlp": 0.19610025, "epoch": 0.5210281076206222, "flos": 20227295963520.0, "grad_norm": 3.7044953202039075, "language_loss": 0.79898208, "learning_rate": 1.960861599474586e-06, "loss": 0.81455374, "num_input_tokens_seen": 186179485, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.27001953, "step": 8666, "time_per_iteration": 2.696317434310913 }, { "auxiliary_loss_clip": 0.01373494, "auxiliary_loss_mlp": 0.00257765, "balance_loss_clip": 1.12242496, "balance_loss_mlp": 0.22746195, "epoch": 0.5210882308732903, "flos": 16069989801600.0, "grad_norm": 43.99100846671773, "language_loss": 0.82994658, "learning_rate": 1.9604722131176592e-06, "loss": 0.84625918, "num_input_tokens_seen": 186197140, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.30273438, "step": 8667, "time_per_iteration": 2.609222412109375 }, { "auxiliary_loss_clip": 0.01337438, "auxiliary_loss_mlp": 0.00209903, "balance_loss_clip": 1.11275315, "balance_loss_mlp": 0.18461896, "epoch": 0.5211483541259582, "flos": 24825298089600.0, "grad_norm": 23.646137416574632, "language_loss": 0.86215454, "learning_rate": 1.960082828259629e-06, "loss": 0.87762797, "num_input_tokens_seen": 186216800, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.25317383, "step": 8668, "time_per_iteration": 2.6683850288391113 }, { "auxiliary_loss_clip": 0.01332732, "auxiliary_loss_mlp": 0.002163, "balance_loss_clip": 1.10451841, "balance_loss_mlp": 0.18925159, "epoch": 0.5212084773786262, "flos": 20370651143040.0, "grad_norm": 203.97292417911447, "language_loss": 0.69784021, "learning_rate": 1.9596934449152623e-06, "loss": 0.71333051, "num_input_tokens_seen": 186235320, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27050781, "step": 8669, "time_per_iteration": 2.6489346027374268 }, { "auxiliary_loss_clip": 0.01327352, "auxiliary_loss_mlp": 0.00198369, "balance_loss_clip": 1.09965086, "balance_loss_mlp": 0.17146379, "epoch": 0.5212686006312941, "flos": 23145468693120.0, "grad_norm": 75.71801169339555, "language_loss": 0.74306011, "learning_rate": 1.959304063099325e-06, "loss": 0.75831735, "num_input_tokens_seen": 186254460, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.26904297, "step": 8670, "time_per_iteration": 2.6310510635375977 }, { "auxiliary_loss_clip": 0.01334693, "auxiliary_loss_mlp": 0.0019236, "balance_loss_clip": 1.10298765, "balance_loss_mlp": 0.16560954, "epoch": 0.5213287238839621, "flos": 27774030314880.0, "grad_norm": 40.328554288468744, "language_loss": 0.84172249, "learning_rate": 1.9589146828265806e-06, "loss": 0.85699302, "num_input_tokens_seen": 186269465, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.26757812, "step": 8671, "time_per_iteration": 4.065420389175415 }, { "auxiliary_loss_clip": 0.01348031, "auxiliary_loss_mlp": 0.00211138, "balance_loss_clip": 1.11468267, "balance_loss_mlp": 0.18360074, "epoch": 0.5213888471366301, "flos": 19937676602880.0, "grad_norm": 3.119627668983307, "language_loss": 0.86073267, "learning_rate": 1.958525304111796e-06, "loss": 0.87632442, "num_input_tokens_seen": 186288660, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.27563477, "step": 8672, "time_per_iteration": 2.610459327697754 }, { "auxiliary_loss_clip": 0.01328159, "auxiliary_loss_mlp": 0.00188954, "balance_loss_clip": 1.09999084, "balance_loss_mlp": 0.16377747, "epoch": 0.521448970389298, "flos": 16982731324800.0, "grad_norm": 11.159418348510965, "language_loss": 0.79635811, "learning_rate": 1.958135926969736e-06, "loss": 0.81152928, "num_input_tokens_seen": 186305760, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.25183105, "step": 8673, "time_per_iteration": 2.6128013134002686 }, { "auxiliary_loss_clip": 0.01315958, "auxiliary_loss_mlp": 0.0019576, "balance_loss_clip": 1.09327936, "balance_loss_mlp": 0.16973692, "epoch": 0.5215090936419661, "flos": 18989706816000.0, "grad_norm": 48.453941433515695, "language_loss": 0.83879066, "learning_rate": 1.957746551415166e-06, "loss": 0.85390788, "num_input_tokens_seen": 186324135, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26037598, "step": 8674, "time_per_iteration": 4.13818621635437 }, { "auxiliary_loss_clip": 0.01325245, "auxiliary_loss_mlp": 0.00209546, "balance_loss_clip": 1.09556389, "balance_loss_mlp": 0.18104318, "epoch": 0.521569216894634, "flos": 16143427157760.0, "grad_norm": 43.44135468723662, "language_loss": 0.95735836, "learning_rate": 1.9573571774628506e-06, "loss": 0.9727062, "num_input_tokens_seen": 186340205, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.28503418, "step": 8675, "time_per_iteration": 4.026090860366821 }, { "auxiliary_loss_clip": 0.01415097, "auxiliary_loss_mlp": 0.00088299, "balance_loss_clip": 1.26795864, "balance_loss_mlp": 0.08028814, "epoch": 0.521629340147302, "flos": 57579493282560.0, "grad_norm": 0.8375249121178142, "language_loss": 0.62444764, "learning_rate": 1.9569678051275556e-06, "loss": 0.63948154, "num_input_tokens_seen": 186396940, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.08007812, "step": 8676, "time_per_iteration": 3.1050665378570557 }, { "auxiliary_loss_clip": 0.0131493, "auxiliary_loss_mlp": 0.00193287, "balance_loss_clip": 1.09168291, "balance_loss_mlp": 0.16688192, "epoch": 0.5216894633999699, "flos": 26796901662720.0, "grad_norm": 75.79667921473555, "language_loss": 0.74274182, "learning_rate": 1.956578434424046e-06, "loss": 0.75782394, "num_input_tokens_seen": 186418680, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26428223, "step": 8677, "time_per_iteration": 2.8021066188812256 }, { "auxiliary_loss_clip": 0.01325053, "auxiliary_loss_mlp": 0.00220469, "balance_loss_clip": 1.09944046, "balance_loss_mlp": 0.19288382, "epoch": 0.5217495866526379, "flos": 26358719650560.0, "grad_norm": 46.736778028873715, "language_loss": 0.74167812, "learning_rate": 1.956189065367086e-06, "loss": 0.75713331, "num_input_tokens_seen": 186438265, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.27587891, "step": 8678, "time_per_iteration": 2.7071902751922607 }, { "auxiliary_loss_clip": 0.01333382, "auxiliary_loss_mlp": 0.00209895, "balance_loss_clip": 1.09776151, "balance_loss_mlp": 0.18259603, "epoch": 0.5218097099053058, "flos": 23584009841280.0, "grad_norm": 4.296233952765604, "language_loss": 0.79171014, "learning_rate": 1.9557996979714414e-06, "loss": 0.80714285, "num_input_tokens_seen": 186456870, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.27282715, "step": 8679, "time_per_iteration": 2.683425188064575 }, { "auxiliary_loss_clip": 0.01315988, "auxiliary_loss_mlp": 0.00197296, "balance_loss_clip": 1.09322333, "balance_loss_mlp": 0.17178479, "epoch": 0.5218698331579739, "flos": 18077396256000.0, "grad_norm": 2.477452910342753, "language_loss": 0.73225158, "learning_rate": 1.9554103322518764e-06, "loss": 0.74738443, "num_input_tokens_seen": 186476425, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25524902, "step": 8680, "time_per_iteration": 4.14244532585144 }, { "auxiliary_loss_clip": 0.01325991, "auxiliary_loss_mlp": 0.0020823, "balance_loss_clip": 1.09845281, "balance_loss_mlp": 0.18256426, "epoch": 0.5219299564106418, "flos": 19281121856640.0, "grad_norm": 29.43441058207596, "language_loss": 0.89221746, "learning_rate": 1.955020968223156e-06, "loss": 0.90755963, "num_input_tokens_seen": 186492555, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.25671387, "step": 8681, "time_per_iteration": 2.6387033462524414 }, { "auxiliary_loss_clip": 0.0131727, "auxiliary_loss_mlp": 0.00216905, "balance_loss_clip": 1.09284484, "balance_loss_mlp": 0.19151306, "epoch": 0.5219900796633098, "flos": 26651355753600.0, "grad_norm": 2.8139530623940776, "language_loss": 0.85018206, "learning_rate": 1.9546316059000454e-06, "loss": 0.86552387, "num_input_tokens_seen": 186513190, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.25402832, "step": 8682, "time_per_iteration": 2.7209036350250244 }, { "auxiliary_loss_clip": 0.01312689, "auxiliary_loss_mlp": 0.00199382, "balance_loss_clip": 1.09159899, "balance_loss_mlp": 0.1727742, "epoch": 0.5220502029159777, "flos": 34312717382400.0, "grad_norm": 92.37076792486904, "language_loss": 0.76979762, "learning_rate": 1.9542422452973082e-06, "loss": 0.78491831, "num_input_tokens_seen": 186534830, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26599121, "step": 8683, "time_per_iteration": 2.7684600353240967 }, { "auxiliary_loss_clip": 0.01327299, "auxiliary_loss_mlp": 0.00232672, "balance_loss_clip": 1.0982933, "balance_loss_mlp": 0.20606408, "epoch": 0.5221103261686457, "flos": 22156488552960.0, "grad_norm": 48.751284751979725, "language_loss": 0.83300543, "learning_rate": 1.9538528864297104e-06, "loss": 0.84860516, "num_input_tokens_seen": 186554390, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.26647949, "step": 8684, "time_per_iteration": 2.708794593811035 }, { "auxiliary_loss_clip": 0.0132296, "auxiliary_loss_mlp": 0.00187055, "balance_loss_clip": 1.10009742, "balance_loss_mlp": 0.16049474, "epoch": 0.5221704494213137, "flos": 19208402772480.0, "grad_norm": 32.05160695449131, "language_loss": 0.84027565, "learning_rate": 1.9534635293120153e-06, "loss": 0.85537583, "num_input_tokens_seen": 186572360, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.26550293, "step": 8685, "time_per_iteration": 2.601736307144165 }, { "auxiliary_loss_clip": 0.01325874, "auxiliary_loss_mlp": 0.00195384, "balance_loss_clip": 1.1003648, "balance_loss_mlp": 0.16838267, "epoch": 0.5222305726739817, "flos": 19354056422400.0, "grad_norm": 112.86295572581314, "language_loss": 0.87520564, "learning_rate": 1.9530741739589876e-06, "loss": 0.89041823, "num_input_tokens_seen": 186590655, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.27001953, "step": 8686, "time_per_iteration": 2.687774658203125 }, { "auxiliary_loss_clip": 0.01305218, "auxiliary_loss_mlp": 0.00185695, "balance_loss_clip": 1.08853447, "balance_loss_mlp": 0.16115028, "epoch": 0.5222906959266497, "flos": 27814789272960.0, "grad_norm": 2.4373293314642743, "language_loss": 0.76165116, "learning_rate": 1.9526848203853927e-06, "loss": 0.77656031, "num_input_tokens_seen": 186610345, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24523926, "step": 8687, "time_per_iteration": 2.779982566833496 }, { "auxiliary_loss_clip": 0.01311463, "auxiliary_loss_mlp": 0.00183228, "balance_loss_clip": 1.0891757, "balance_loss_mlp": 0.15981495, "epoch": 0.5223508191793176, "flos": 12712988615040.0, "grad_norm": 19.46395448881557, "language_loss": 0.88042718, "learning_rate": 1.9522954686059936e-06, "loss": 0.89537406, "num_input_tokens_seen": 186624360, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.23388672, "step": 8688, "time_per_iteration": 2.6443660259246826 }, { "auxiliary_loss_clip": 0.01319173, "auxiliary_loss_mlp": 0.00207103, "balance_loss_clip": 1.09562826, "balance_loss_mlp": 0.18200937, "epoch": 0.5224109424319856, "flos": 15632238752640.0, "grad_norm": 73.111432870144, "language_loss": 0.83745539, "learning_rate": 1.9519061186355558e-06, "loss": 0.85271823, "num_input_tokens_seen": 186638680, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.25109863, "step": 8689, "time_per_iteration": 2.6356804370880127 }, { "auxiliary_loss_clip": 0.0131317, "auxiliary_loss_mlp": 0.00192671, "balance_loss_clip": 1.09162784, "balance_loss_mlp": 0.16829288, "epoch": 0.5224710656846535, "flos": 15742233175680.0, "grad_norm": 4.623411373628884, "language_loss": 0.90186602, "learning_rate": 1.9515167704888417e-06, "loss": 0.91692442, "num_input_tokens_seen": 186655840, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.24365234, "step": 8690, "time_per_iteration": 2.6110692024230957 }, { "auxiliary_loss_clip": 0.01326122, "auxiliary_loss_mlp": 0.00189947, "balance_loss_clip": 1.09833944, "balance_loss_mlp": 0.16338761, "epoch": 0.5225311889373215, "flos": 26030998938240.0, "grad_norm": 35.93993873469718, "language_loss": 0.86371195, "learning_rate": 1.9511274241806173e-06, "loss": 0.87887263, "num_input_tokens_seen": 186674150, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.26574707, "step": 8691, "time_per_iteration": 2.6973085403442383 }, { "auxiliary_loss_clip": 0.01325739, "auxiliary_loss_mlp": 0.0021363, "balance_loss_clip": 1.09592378, "balance_loss_mlp": 0.18667667, "epoch": 0.5225913121899894, "flos": 18369278173440.0, "grad_norm": 9.528622182217601, "language_loss": 0.86540395, "learning_rate": 1.950738079725646e-06, "loss": 0.88079762, "num_input_tokens_seen": 186690675, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.26953125, "step": 8692, "time_per_iteration": 2.6510980129241943 }, { "auxiliary_loss_clip": 0.01317751, "auxiliary_loss_mlp": 0.00210222, "balance_loss_clip": 1.09712553, "balance_loss_mlp": 0.18651089, "epoch": 0.5226514354426575, "flos": 29273516501760.0, "grad_norm": 12.96252574507916, "language_loss": 0.78472757, "learning_rate": 1.950348737138691e-06, "loss": 0.80000722, "num_input_tokens_seen": 186710380, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.23730469, "step": 8693, "time_per_iteration": 2.7111778259277344 }, { "auxiliary_loss_clip": 0.01327536, "auxiliary_loss_mlp": 0.00218939, "balance_loss_clip": 1.09343052, "balance_loss_mlp": 0.19121101, "epoch": 0.5227115586953254, "flos": 22853299466880.0, "grad_norm": 21.800559627578078, "language_loss": 0.92531723, "learning_rate": 1.949959396434517e-06, "loss": 0.94078195, "num_input_tokens_seen": 186729135, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.27697754, "step": 8694, "time_per_iteration": 2.7333521842956543 }, { "auxiliary_loss_clip": 0.01392555, "auxiliary_loss_mlp": 0.00083908, "balance_loss_clip": 1.25312936, "balance_loss_mlp": 0.07699391, "epoch": 0.5227716819479934, "flos": 57474419022720.0, "grad_norm": 0.780223714155609, "language_loss": 0.55480015, "learning_rate": 1.949570057627888e-06, "loss": 0.56956482, "num_input_tokens_seen": 186791115, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.06933594, "step": 8695, "time_per_iteration": 3.1823086738586426 }, { "auxiliary_loss_clip": 0.01327947, "auxiliary_loss_mlp": 0.00197594, "balance_loss_clip": 1.09720671, "balance_loss_mlp": 0.17108151, "epoch": 0.5228318052006613, "flos": 13808264077440.0, "grad_norm": 9.16178479085916, "language_loss": 0.83771962, "learning_rate": 1.9491807207335672e-06, "loss": 0.85297501, "num_input_tokens_seen": 186808660, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.26501465, "step": 8696, "time_per_iteration": 2.6099448204040527 }, { "auxiliary_loss_clip": 0.01320964, "auxiliary_loss_mlp": 0.00227356, "balance_loss_clip": 1.09249473, "balance_loss_mlp": 0.20122558, "epoch": 0.5228919284533293, "flos": 15596184476160.0, "grad_norm": 13.31309519057925, "language_loss": 0.78740549, "learning_rate": 1.948791385766319e-06, "loss": 0.80288863, "num_input_tokens_seen": 186825900, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.26159668, "step": 8697, "time_per_iteration": 2.5982067584991455 }, { "auxiliary_loss_clip": 0.01313294, "auxiliary_loss_mlp": 0.00186173, "balance_loss_clip": 1.09048426, "balance_loss_mlp": 0.16195004, "epoch": 0.5229520517059973, "flos": 22491499726080.0, "grad_norm": 560.6025877591582, "language_loss": 0.88417006, "learning_rate": 1.948402052740906e-06, "loss": 0.89916468, "num_input_tokens_seen": 186843735, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.24230957, "step": 8698, "time_per_iteration": 2.7050907611846924 }, { "auxiliary_loss_clip": 0.01327765, "auxiliary_loss_mlp": 0.00206542, "balance_loss_clip": 1.10221386, "balance_loss_mlp": 0.18161505, "epoch": 0.5230121749586653, "flos": 22090880361600.0, "grad_norm": 355.3555848806379, "language_loss": 0.80202156, "learning_rate": 1.948012721672093e-06, "loss": 0.81736469, "num_input_tokens_seen": 186862440, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.24938965, "step": 8699, "time_per_iteration": 2.612898588180542 }, { "auxiliary_loss_clip": 0.01324357, "auxiliary_loss_mlp": 0.00213278, "balance_loss_clip": 1.09351933, "balance_loss_mlp": 0.1872662, "epoch": 0.5230722982113333, "flos": 22127150119680.0, "grad_norm": 4.323121778295936, "language_loss": 0.82875663, "learning_rate": 1.947623392574642e-06, "loss": 0.84413296, "num_input_tokens_seen": 186880940, "router_z_loss_clip": 2.30761719, "router_z_loss_mlp": 0.26037598, "step": 8700, "time_per_iteration": 2.6153993606567383 }, { "auxiliary_loss_clip": 0.01335241, "auxiliary_loss_mlp": 0.00249071, "balance_loss_clip": 1.10485768, "balance_loss_mlp": 0.21992481, "epoch": 0.5231324214640012, "flos": 25009268572800.0, "grad_norm": 25.98822799556201, "language_loss": 0.77944636, "learning_rate": 1.947234065463318e-06, "loss": 0.79528952, "num_input_tokens_seen": 186900785, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.29174805, "step": 8701, "time_per_iteration": 2.7400989532470703 }, { "auxiliary_loss_clip": 0.01323692, "auxiliary_loss_mlp": 0.00211021, "balance_loss_clip": 1.10195017, "balance_loss_mlp": 0.18552178, "epoch": 0.5231925447166692, "flos": 25740517651200.0, "grad_norm": 19.39618331621996, "language_loss": 0.72969341, "learning_rate": 1.9468447403528826e-06, "loss": 0.74504054, "num_input_tokens_seen": 186920895, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25488281, "step": 8702, "time_per_iteration": 2.7058589458465576 }, { "auxiliary_loss_clip": 0.01333397, "auxiliary_loss_mlp": 0.00225247, "balance_loss_clip": 1.10530186, "balance_loss_mlp": 0.19903252, "epoch": 0.5232526679693371, "flos": 21433930565760.0, "grad_norm": 117.28917546995004, "language_loss": 0.83249772, "learning_rate": 1.946455417258101e-06, "loss": 0.84808421, "num_input_tokens_seen": 186940605, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.26257324, "step": 8703, "time_per_iteration": 2.6143510341644287 }, { "auxiliary_loss_clip": 0.01342704, "auxiliary_loss_mlp": 0.00236348, "balance_loss_clip": 1.10890865, "balance_loss_mlp": 0.2067845, "epoch": 0.5233127912220051, "flos": 35298393471360.0, "grad_norm": 4.143081550041936, "language_loss": 0.85688293, "learning_rate": 1.9460660961937348e-06, "loss": 0.87267345, "num_input_tokens_seen": 186960820, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.2956543, "step": 8704, "time_per_iteration": 2.732206344604492 }, { "auxiliary_loss_clip": 0.01326963, "auxiliary_loss_mlp": 0.00214684, "balance_loss_clip": 1.10362458, "balance_loss_mlp": 0.19007912, "epoch": 0.523372914474673, "flos": 17051320344960.0, "grad_norm": 59.2422699942105, "language_loss": 0.84164059, "learning_rate": 1.9456767771745474e-06, "loss": 0.85705703, "num_input_tokens_seen": 186976240, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.24597168, "step": 8705, "time_per_iteration": 2.574928045272827 }, { "auxiliary_loss_clip": 0.0134797, "auxiliary_loss_mlp": 0.00246633, "balance_loss_clip": 1.10979605, "balance_loss_mlp": 0.21774882, "epoch": 0.5234330377273411, "flos": 18406302117120.0, "grad_norm": 25.688637777263935, "language_loss": 0.77503526, "learning_rate": 1.9452874602153027e-06, "loss": 0.79098129, "num_input_tokens_seen": 186992855, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.28857422, "step": 8706, "time_per_iteration": 2.6038146018981934 }, { "auxiliary_loss_clip": 0.01340569, "auxiliary_loss_mlp": 0.00079417, "balance_loss_clip": 1.20297647, "balance_loss_mlp": 0.07317084, "epoch": 0.523493160980009, "flos": 65850296970240.0, "grad_norm": 0.6706772288122443, "language_loss": 0.51796836, "learning_rate": 1.9448981453307623e-06, "loss": 0.53216821, "num_input_tokens_seen": 187051205, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06225586, "step": 8707, "time_per_iteration": 3.1681737899780273 }, { "auxiliary_loss_clip": 0.01333349, "auxiliary_loss_mlp": 0.00223269, "balance_loss_clip": 1.10696495, "balance_loss_mlp": 0.19750786, "epoch": 0.523553284232677, "flos": 21872076664320.0, "grad_norm": 12.923242881491706, "language_loss": 0.81784856, "learning_rate": 1.9445088325356904e-06, "loss": 0.83341473, "num_input_tokens_seen": 187070540, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.25805664, "step": 8708, "time_per_iteration": 2.6391568183898926 }, { "auxiliary_loss_clip": 0.01356509, "auxiliary_loss_mlp": 0.00210467, "balance_loss_clip": 1.12159896, "balance_loss_mlp": 0.18340613, "epoch": 0.5236134074853449, "flos": 20848191482880.0, "grad_norm": 45.1590500030247, "language_loss": 0.85032684, "learning_rate": 1.944119521844849e-06, "loss": 0.8659966, "num_input_tokens_seen": 187089975, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.27075195, "step": 8709, "time_per_iteration": 2.6328024864196777 }, { "auxiliary_loss_clip": 0.01368692, "auxiliary_loss_mlp": 0.00241814, "balance_loss_clip": 1.12428069, "balance_loss_mlp": 0.2153497, "epoch": 0.5236735307380129, "flos": 25520421064320.0, "grad_norm": 32.4974513325414, "language_loss": 0.92962658, "learning_rate": 1.9437302132730003e-06, "loss": 0.94573164, "num_input_tokens_seen": 187108775, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.26489258, "step": 8710, "time_per_iteration": 2.6800832748413086 }, { "auxiliary_loss_clip": 0.01359814, "auxiliary_loss_mlp": 0.00233004, "balance_loss_clip": 1.12522388, "balance_loss_mlp": 0.20733847, "epoch": 0.523733653990681, "flos": 23583112001280.0, "grad_norm": 134.67168824228617, "language_loss": 0.78061938, "learning_rate": 1.943340906834908e-06, "loss": 0.79654759, "num_input_tokens_seen": 187128830, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.2565918, "step": 8711, "time_per_iteration": 2.663094997406006 }, { "auxiliary_loss_clip": 0.01338979, "auxiliary_loss_mlp": 0.00214004, "balance_loss_clip": 1.1119076, "balance_loss_mlp": 0.18787345, "epoch": 0.5237937772433489, "flos": 21106245767040.0, "grad_norm": 60.66558033643959, "language_loss": 0.89702618, "learning_rate": 1.9429516025453345e-06, "loss": 0.91255599, "num_input_tokens_seen": 187149570, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.26171875, "step": 8712, "time_per_iteration": 2.6427013874053955 }, { "auxiliary_loss_clip": 0.01351896, "auxiliary_loss_mlp": 0.00231871, "balance_loss_clip": 1.11604965, "balance_loss_mlp": 0.20341563, "epoch": 0.5238539004960169, "flos": 19172887200000.0, "grad_norm": 9.895452196084216, "language_loss": 0.76579171, "learning_rate": 1.9425623004190415e-06, "loss": 0.78162938, "num_input_tokens_seen": 187170575, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.2845459, "step": 8713, "time_per_iteration": 2.6725943088531494 }, { "auxiliary_loss_clip": 0.01373944, "auxiliary_loss_mlp": 0.00229668, "balance_loss_clip": 1.12626588, "balance_loss_mlp": 0.20029525, "epoch": 0.5239140237486848, "flos": 17888218300800.0, "grad_norm": 3.9677858357195626, "language_loss": 0.88528204, "learning_rate": 1.9421730004707925e-06, "loss": 0.90131819, "num_input_tokens_seen": 187187190, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.29394531, "step": 8714, "time_per_iteration": 4.006772518157959 }, { "auxiliary_loss_clip": 0.01368418, "auxiliary_loss_mlp": 0.00238311, "balance_loss_clip": 1.12667847, "balance_loss_mlp": 0.21032029, "epoch": 0.5239741470013528, "flos": 17930413802880.0, "grad_norm": 15.081194458382718, "language_loss": 0.85069621, "learning_rate": 1.9417837027153483e-06, "loss": 0.86676347, "num_input_tokens_seen": 187204350, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.27990723, "step": 8715, "time_per_iteration": 2.658881664276123 }, { "auxiliary_loss_clip": 0.01352989, "auxiliary_loss_mlp": 0.00250069, "balance_loss_clip": 1.12219787, "balance_loss_mlp": 0.22246048, "epoch": 0.5240342702540207, "flos": 30993386584320.0, "grad_norm": 2.8988929167895066, "language_loss": 0.7812162, "learning_rate": 1.9413944071674723e-06, "loss": 0.79724681, "num_input_tokens_seen": 187225605, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.27600098, "step": 8716, "time_per_iteration": 4.17400336265564 }, { "auxiliary_loss_clip": 0.01348233, "auxiliary_loss_mlp": 0.00233037, "balance_loss_clip": 1.11989963, "balance_loss_mlp": 0.20621453, "epoch": 0.5240943935066887, "flos": 25005066681600.0, "grad_norm": 92.438617378468, "language_loss": 0.90824556, "learning_rate": 1.941005113841926e-06, "loss": 0.92405826, "num_input_tokens_seen": 187241335, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.26818848, "step": 8717, "time_per_iteration": 2.6524624824523926 }, { "auxiliary_loss_clip": 0.01346661, "auxiliary_loss_mlp": 0.00225074, "balance_loss_clip": 1.11614966, "balance_loss_mlp": 0.19973008, "epoch": 0.5241545167593566, "flos": 23659099223040.0, "grad_norm": 21.273435720812692, "language_loss": 0.72295237, "learning_rate": 1.9406158227534723e-06, "loss": 0.73866963, "num_input_tokens_seen": 187259925, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.25390625, "step": 8718, "time_per_iteration": 4.080482006072998 }, { "auxiliary_loss_clip": 0.01357678, "auxiliary_loss_mlp": 0.00237658, "balance_loss_clip": 1.12095129, "balance_loss_mlp": 0.21213478, "epoch": 0.5242146400120247, "flos": 23400398494080.0, "grad_norm": 3.672402949561507, "language_loss": 0.78463554, "learning_rate": 1.940226533916872e-06, "loss": 0.80058897, "num_input_tokens_seen": 187279035, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.25537109, "step": 8719, "time_per_iteration": 2.6590681076049805 }, { "auxiliary_loss_clip": 0.01355005, "auxiliary_loss_mlp": 0.00217973, "balance_loss_clip": 1.12461805, "balance_loss_mlp": 0.18963677, "epoch": 0.5242747632646926, "flos": 17749065012480.0, "grad_norm": 9.324763420720776, "language_loss": 0.81881428, "learning_rate": 1.9398372473468877e-06, "loss": 0.83454406, "num_input_tokens_seen": 187297555, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.28356934, "step": 8720, "time_per_iteration": 2.6398894786834717 }, { "auxiliary_loss_clip": 0.01345593, "auxiliary_loss_mlp": 0.00222084, "balance_loss_clip": 1.11266398, "balance_loss_mlp": 0.19653791, "epoch": 0.5243348865173606, "flos": 32597731549440.0, "grad_norm": 3.817981156370407, "language_loss": 0.77106357, "learning_rate": 1.939447963058281e-06, "loss": 0.7867403, "num_input_tokens_seen": 187320265, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.25549316, "step": 8721, "time_per_iteration": 2.7294511795043945 }, { "auxiliary_loss_clip": 0.01347002, "auxiliary_loss_mlp": 0.00221, "balance_loss_clip": 1.11702609, "balance_loss_mlp": 0.19535807, "epoch": 0.5243950097700285, "flos": 25484115392640.0, "grad_norm": 402.7813240863664, "language_loss": 0.92875111, "learning_rate": 1.939058681065813e-06, "loss": 0.94443119, "num_input_tokens_seen": 187338045, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.25683594, "step": 8722, "time_per_iteration": 4.183077335357666 }, { "auxiliary_loss_clip": 0.0133803, "auxiliary_loss_mlp": 0.00224152, "balance_loss_clip": 1.11241031, "balance_loss_mlp": 0.20079887, "epoch": 0.5244551330226965, "flos": 15268391936640.0, "grad_norm": 25.626829760871253, "language_loss": 0.86227071, "learning_rate": 1.938669401384247e-06, "loss": 0.87789255, "num_input_tokens_seen": 187356040, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.23364258, "step": 8723, "time_per_iteration": 2.8015623092651367 }, { "auxiliary_loss_clip": 0.01369921, "auxiliary_loss_mlp": 0.00244082, "balance_loss_clip": 1.13357532, "balance_loss_mlp": 0.21468449, "epoch": 0.5245152562753645, "flos": 22237108629120.0, "grad_norm": 7.717778653255853, "language_loss": 0.8219986, "learning_rate": 1.9382801240283426e-06, "loss": 0.8381387, "num_input_tokens_seen": 187374185, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.29394531, "step": 8724, "time_per_iteration": 2.6780712604522705 }, { "auxiliary_loss_clip": 0.01346667, "auxiliary_loss_mlp": 0.00226525, "balance_loss_clip": 1.11286128, "balance_loss_mlp": 0.19908254, "epoch": 0.5245753795280325, "flos": 29426460612480.0, "grad_norm": 6.696353230952606, "language_loss": 0.78242493, "learning_rate": 1.9378908490128625e-06, "loss": 0.79815686, "num_input_tokens_seen": 187396640, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.27441406, "step": 8725, "time_per_iteration": 2.7072596549987793 }, { "auxiliary_loss_clip": 0.0131993, "auxiliary_loss_mlp": 0.00065955, "balance_loss_clip": 1.16058195, "balance_loss_mlp": 0.05808684, "epoch": 0.5246355027807005, "flos": 58834392785280.0, "grad_norm": 0.756203945213419, "language_loss": 0.55376494, "learning_rate": 1.937501576352568e-06, "loss": 0.56762385, "num_input_tokens_seen": 187455945, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.07861328, "step": 8726, "time_per_iteration": 3.139814615249634 }, { "auxiliary_loss_clip": 0.01300644, "auxiliary_loss_mlp": 0.00059355, "balance_loss_clip": 1.15308619, "balance_loss_mlp": 0.05267959, "epoch": 0.5246956260333684, "flos": 64526592965760.0, "grad_norm": 0.7848901735414096, "language_loss": 0.5810051, "learning_rate": 1.937112306062219e-06, "loss": 0.59460509, "num_input_tokens_seen": 187519975, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.06689453, "step": 8727, "time_per_iteration": 3.102858543395996 }, { "auxiliary_loss_clip": 0.01336385, "auxiliary_loss_mlp": 0.00216214, "balance_loss_clip": 1.10584021, "balance_loss_mlp": 0.18860546, "epoch": 0.5247557492860364, "flos": 24533631653760.0, "grad_norm": 1.5709914728329955, "language_loss": 0.76392782, "learning_rate": 1.9367230381565786e-06, "loss": 0.77945381, "num_input_tokens_seen": 187541775, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.27612305, "step": 8728, "time_per_iteration": 2.6917593479156494 }, { "auxiliary_loss_clip": 0.01334322, "auxiliary_loss_mlp": 0.00243609, "balance_loss_clip": 1.10919285, "balance_loss_mlp": 0.21727526, "epoch": 0.5248158725387043, "flos": 18806131382400.0, "grad_norm": 8.960818321664098, "language_loss": 0.74466473, "learning_rate": 1.9363337726504062e-06, "loss": 0.76044405, "num_input_tokens_seen": 187560425, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.26379395, "step": 8729, "time_per_iteration": 2.6244232654571533 }, { "auxiliary_loss_clip": 0.01333588, "auxiliary_loss_mlp": 0.00219694, "balance_loss_clip": 1.10598993, "balance_loss_mlp": 0.19396845, "epoch": 0.5248759957913723, "flos": 20955851521920.0, "grad_norm": 14.95197429593985, "language_loss": 0.91056979, "learning_rate": 1.935944509558464e-06, "loss": 0.92610258, "num_input_tokens_seen": 187579930, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.25708008, "step": 8730, "time_per_iteration": 2.70786714553833 }, { "auxiliary_loss_clip": 0.01327262, "auxiliary_loss_mlp": 0.00203052, "balance_loss_clip": 1.10518098, "balance_loss_mlp": 0.17653942, "epoch": 0.5249361190440403, "flos": 18660980522880.0, "grad_norm": 28.233331842286848, "language_loss": 0.86788523, "learning_rate": 1.9355552488955125e-06, "loss": 0.88318837, "num_input_tokens_seen": 187595365, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26525879, "step": 8731, "time_per_iteration": 2.6540987491607666 }, { "auxiliary_loss_clip": 0.01309958, "auxiliary_loss_mlp": 0.00210626, "balance_loss_clip": 1.09281468, "balance_loss_mlp": 0.18246858, "epoch": 0.5249962422967083, "flos": 24863327614080.0, "grad_norm": 2.7186488987372743, "language_loss": 0.90421367, "learning_rate": 1.935165990676312e-06, "loss": 0.91941953, "num_input_tokens_seen": 187614715, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.28161621, "step": 8732, "time_per_iteration": 2.696241617202759 }, { "auxiliary_loss_clip": 0.01317036, "auxiliary_loss_mlp": 0.00215415, "balance_loss_clip": 1.09330678, "balance_loss_mlp": 0.18909329, "epoch": 0.5250563655493762, "flos": 15262681674240.0, "grad_norm": 5.037251344658125, "language_loss": 0.85128438, "learning_rate": 1.9347767349156237e-06, "loss": 0.86660892, "num_input_tokens_seen": 187630745, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26342773, "step": 8733, "time_per_iteration": 2.6373541355133057 }, { "auxiliary_loss_clip": 0.01337072, "auxiliary_loss_mlp": 0.00213757, "balance_loss_clip": 1.11336803, "balance_loss_mlp": 0.18765047, "epoch": 0.5251164888020442, "flos": 18625177641600.0, "grad_norm": 9.983878801712631, "language_loss": 0.8680768, "learning_rate": 1.934387481628208e-06, "loss": 0.8835851, "num_input_tokens_seen": 187648200, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.2611084, "step": 8734, "time_per_iteration": 2.6244609355926514 }, { "auxiliary_loss_clip": 0.01319768, "auxiliary_loss_mlp": 0.00210871, "balance_loss_clip": 1.09789991, "balance_loss_mlp": 0.18656461, "epoch": 0.5251766120547121, "flos": 29710764760320.0, "grad_norm": 5.141296155669662, "language_loss": 0.82166135, "learning_rate": 1.933998230828826e-06, "loss": 0.83696771, "num_input_tokens_seen": 187669205, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.2434082, "step": 8735, "time_per_iteration": 2.702881097793579 }, { "auxiliary_loss_clip": 0.01308405, "auxiliary_loss_mlp": 0.00201426, "balance_loss_clip": 1.08949709, "balance_loss_mlp": 0.17541437, "epoch": 0.5252367353073801, "flos": 23440295525760.0, "grad_norm": 4.274709842780115, "language_loss": 0.87163401, "learning_rate": 1.9336089825322376e-06, "loss": 0.88673222, "num_input_tokens_seen": 187690890, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26013184, "step": 8736, "time_per_iteration": 2.7621912956237793 }, { "auxiliary_loss_clip": 0.01326712, "auxiliary_loss_mlp": 0.00214621, "balance_loss_clip": 1.10067904, "balance_loss_mlp": 0.18921718, "epoch": 0.5252968585600482, "flos": 30810708990720.0, "grad_norm": 22.883176476513633, "language_loss": 0.78813553, "learning_rate": 1.9332197367532033e-06, "loss": 0.80354881, "num_input_tokens_seen": 187713045, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.25415039, "step": 8737, "time_per_iteration": 2.741071939468384 }, { "auxiliary_loss_clip": 0.01300036, "auxiliary_loss_mlp": 0.00219492, "balance_loss_clip": 1.07875156, "balance_loss_mlp": 0.19238427, "epoch": 0.5253569818127161, "flos": 20628274464000.0, "grad_norm": 28.46474572215583, "language_loss": 0.85611677, "learning_rate": 1.9328304935064833e-06, "loss": 0.87131208, "num_input_tokens_seen": 187733640, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.27087402, "step": 8738, "time_per_iteration": 2.6930294036865234 }, { "auxiliary_loss_clip": 0.01339395, "auxiliary_loss_mlp": 0.00069745, "balance_loss_clip": 1.19836426, "balance_loss_mlp": 0.06254484, "epoch": 0.5254171050653841, "flos": 63428695810560.0, "grad_norm": 0.7327871542160603, "language_loss": 0.54072988, "learning_rate": 1.932441252806837e-06, "loss": 0.55482125, "num_input_tokens_seen": 187792930, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.07177734, "step": 8739, "time_per_iteration": 3.0960376262664795 }, { "auxiliary_loss_clip": 0.01313929, "auxiliary_loss_mlp": 0.00205333, "balance_loss_clip": 1.0923388, "balance_loss_mlp": 0.18035881, "epoch": 0.525477228318052, "flos": 34670782108800.0, "grad_norm": 21.342601743047616, "language_loss": 0.90241957, "learning_rate": 1.9320520146690263e-06, "loss": 0.9176122, "num_input_tokens_seen": 187812495, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.24987793, "step": 8740, "time_per_iteration": 2.720768928527832 }, { "auxiliary_loss_clip": 0.01306988, "auxiliary_loss_mlp": 0.00208599, "balance_loss_clip": 1.08940303, "balance_loss_mlp": 0.18381497, "epoch": 0.52553735157072, "flos": 17930844766080.0, "grad_norm": 6.525669603670679, "language_loss": 0.77784121, "learning_rate": 1.9316627791078093e-06, "loss": 0.79299706, "num_input_tokens_seen": 187829685, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.2479248, "step": 8741, "time_per_iteration": 2.6303632259368896 }, { "auxiliary_loss_clip": 0.01314661, "auxiliary_loss_mlp": 0.00211449, "balance_loss_clip": 1.09276688, "balance_loss_mlp": 0.18717843, "epoch": 0.5255974748233879, "flos": 9940864584960.0, "grad_norm": 3.4285897445473528, "language_loss": 0.7699582, "learning_rate": 1.931273546137947e-06, "loss": 0.78521931, "num_input_tokens_seen": 187846495, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.24279785, "step": 8742, "time_per_iteration": 2.5922300815582275 }, { "auxiliary_loss_clip": 0.01325774, "auxiliary_loss_mlp": 0.00229138, "balance_loss_clip": 1.09527564, "balance_loss_mlp": 0.20136176, "epoch": 0.5256575980760559, "flos": 16868427269760.0, "grad_norm": 59.676126771846896, "language_loss": 0.70970088, "learning_rate": 1.9308843157741983e-06, "loss": 0.72525001, "num_input_tokens_seen": 187862010, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.27758789, "step": 8743, "time_per_iteration": 2.5665974617004395 }, { "auxiliary_loss_clip": 0.01289776, "auxiliary_loss_mlp": 0.00047819, "balance_loss_clip": 1.14588594, "balance_loss_mlp": 0.04188281, "epoch": 0.5257177213287239, "flos": 62386210362240.0, "grad_norm": 0.7840215671527526, "language_loss": 0.53719765, "learning_rate": 1.930495088031323e-06, "loss": 0.55057359, "num_input_tokens_seen": 187922730, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.05932617, "step": 8744, "time_per_iteration": 3.2221949100494385 }, { "auxiliary_loss_clip": 0.01324954, "auxiliary_loss_mlp": 0.00228265, "balance_loss_clip": 1.09803104, "balance_loss_mlp": 0.20159821, "epoch": 0.5257778445813919, "flos": 20776908942720.0, "grad_norm": 6.386966417305812, "language_loss": 0.85954452, "learning_rate": 1.9301058629240814e-06, "loss": 0.87507677, "num_input_tokens_seen": 187940160, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.26672363, "step": 8745, "time_per_iteration": 2.635817766189575 }, { "auxiliary_loss_clip": 0.0130458, "auxiliary_loss_mlp": 0.00198299, "balance_loss_clip": 1.08985686, "balance_loss_mlp": 0.17389637, "epoch": 0.5258379678340598, "flos": 17018606033280.0, "grad_norm": 2.0103218463379555, "language_loss": 0.89547968, "learning_rate": 1.9297166404672324e-06, "loss": 0.91050851, "num_input_tokens_seen": 187958625, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24401855, "step": 8746, "time_per_iteration": 2.622610330581665 }, { "auxiliary_loss_clip": 0.01308708, "auxiliary_loss_mlp": 0.002232, "balance_loss_clip": 1.09442997, "balance_loss_mlp": 0.19735497, "epoch": 0.5258980910867278, "flos": 21068754946560.0, "grad_norm": 19.628025074882057, "language_loss": 0.82617676, "learning_rate": 1.9293274206755353e-06, "loss": 0.84149575, "num_input_tokens_seen": 187977575, "router_z_loss_clip": 2.14355469, "router_z_loss_mlp": 0.25854492, "step": 8747, "time_per_iteration": 2.884514093399048 }, { "auxiliary_loss_clip": 0.01293754, "auxiliary_loss_mlp": 0.0020459, "balance_loss_clip": 1.08394909, "balance_loss_mlp": 0.17880502, "epoch": 0.5259582143393957, "flos": 18004461690240.0, "grad_norm": 6.308715782003099, "language_loss": 0.89930069, "learning_rate": 1.9289382035637505e-06, "loss": 0.91428411, "num_input_tokens_seen": 187996650, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.25805664, "step": 8748, "time_per_iteration": 2.874084234237671 }, { "auxiliary_loss_clip": 0.01319613, "auxiliary_loss_mlp": 0.00220982, "balance_loss_clip": 1.09725881, "balance_loss_mlp": 0.19375482, "epoch": 0.5260183375920637, "flos": 22783848520320.0, "grad_norm": 9622.41238292116, "language_loss": 0.90782845, "learning_rate": 1.9285489891466345e-06, "loss": 0.92323452, "num_input_tokens_seen": 188013510, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.27258301, "step": 8749, "time_per_iteration": 2.6463942527770996 }, { "auxiliary_loss_clip": 0.01307374, "auxiliary_loss_mlp": 0.00199309, "balance_loss_clip": 1.09408116, "balance_loss_mlp": 0.17342886, "epoch": 0.5260784608447318, "flos": 27052406081280.0, "grad_norm": 13.152606729273494, "language_loss": 0.81330574, "learning_rate": 1.9281597774389487e-06, "loss": 0.8283726, "num_input_tokens_seen": 188032085, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.25866699, "step": 8750, "time_per_iteration": 2.7072129249572754 }, { "auxiliary_loss_clip": 0.0131179, "auxiliary_loss_mlp": 0.00203344, "balance_loss_clip": 1.09236836, "balance_loss_mlp": 0.17748731, "epoch": 0.5261385840973997, "flos": 20662820369280.0, "grad_norm": 11.01912012816676, "language_loss": 0.82895911, "learning_rate": 1.9277705684554517e-06, "loss": 0.84411049, "num_input_tokens_seen": 188050590, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25866699, "step": 8751, "time_per_iteration": 2.6649913787841797 }, { "auxiliary_loss_clip": 0.01314069, "auxiliary_loss_mlp": 0.00203278, "balance_loss_clip": 1.09868705, "balance_loss_mlp": 0.17811325, "epoch": 0.5261987073500677, "flos": 23622649896960.0, "grad_norm": 107.52906716512537, "language_loss": 0.81325549, "learning_rate": 1.927381362210902e-06, "loss": 0.82842898, "num_input_tokens_seen": 188071620, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.25170898, "step": 8752, "time_per_iteration": 2.7370588779449463 }, { "auxiliary_loss_clip": 0.01321078, "auxiliary_loss_mlp": 0.00194425, "balance_loss_clip": 1.09640145, "balance_loss_mlp": 0.16753156, "epoch": 0.5262588306027356, "flos": 27636241743360.0, "grad_norm": 18.031455719705097, "language_loss": 0.75468552, "learning_rate": 1.926992158720058e-06, "loss": 0.76984054, "num_input_tokens_seen": 188091740, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.26904297, "step": 8753, "time_per_iteration": 2.7903401851654053 }, { "auxiliary_loss_clip": 0.01314726, "auxiliary_loss_mlp": 0.00196859, "balance_loss_clip": 1.09936452, "balance_loss_mlp": 0.17388692, "epoch": 0.5263189538554036, "flos": 21759711943680.0, "grad_norm": 6.302425941081012, "language_loss": 0.90206659, "learning_rate": 1.9266029579976785e-06, "loss": 0.91718245, "num_input_tokens_seen": 188111165, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.22961426, "step": 8754, "time_per_iteration": 2.6750130653381348 }, { "auxiliary_loss_clip": 0.01316509, "auxiliary_loss_mlp": 0.00229513, "balance_loss_clip": 1.09999478, "balance_loss_mlp": 0.2027622, "epoch": 0.5263790771080715, "flos": 14276359140480.0, "grad_norm": 5.739316509615888, "language_loss": 0.95324349, "learning_rate": 1.926213760058522e-06, "loss": 0.96870363, "num_input_tokens_seen": 188127825, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.26745605, "step": 8755, "time_per_iteration": 2.6240272521972656 }, { "auxiliary_loss_clip": 0.0127263, "auxiliary_loss_mlp": 0.00063407, "balance_loss_clip": 1.12342298, "balance_loss_mlp": 0.05751814, "epoch": 0.5264392003607395, "flos": 65806413528960.0, "grad_norm": 0.7463553545142124, "language_loss": 0.58253968, "learning_rate": 1.9258245649173477e-06, "loss": 0.59590006, "num_input_tokens_seen": 188194050, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.05883789, "step": 8756, "time_per_iteration": 4.598665714263916 }, { "auxiliary_loss_clip": 0.01311866, "auxiliary_loss_mlp": 0.00217664, "balance_loss_clip": 1.0901134, "balance_loss_mlp": 0.18888664, "epoch": 0.5264993236134075, "flos": 21032413361280.0, "grad_norm": 9.475397611786136, "language_loss": 0.78685123, "learning_rate": 1.925435372588913e-06, "loss": 0.80214655, "num_input_tokens_seen": 188212565, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.2878418, "step": 8757, "time_per_iteration": 2.6966755390167236 }, { "auxiliary_loss_clip": 0.01298627, "auxiliary_loss_mlp": 0.00202254, "balance_loss_clip": 1.0864588, "balance_loss_mlp": 0.17872185, "epoch": 0.5265594468660755, "flos": 16618202150400.0, "grad_norm": 358.94980271108506, "language_loss": 0.95036775, "learning_rate": 1.9250461830879768e-06, "loss": 0.9653765, "num_input_tokens_seen": 188229505, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.23535156, "step": 8758, "time_per_iteration": 4.054173469543457 }, { "auxiliary_loss_clip": 0.01288395, "auxiliary_loss_mlp": 0.00199501, "balance_loss_clip": 1.07616758, "balance_loss_mlp": 0.1743713, "epoch": 0.5266195701187434, "flos": 24134125610880.0, "grad_norm": 6.842721422307652, "language_loss": 0.83007544, "learning_rate": 1.9246569964292965e-06, "loss": 0.84495437, "num_input_tokens_seen": 188250395, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.25097656, "step": 8759, "time_per_iteration": 2.6442711353302 }, { "auxiliary_loss_clip": 0.0129048, "auxiliary_loss_mlp": 0.00193912, "balance_loss_clip": 1.08124232, "balance_loss_mlp": 0.16805521, "epoch": 0.5266796933714114, "flos": 15844111125120.0, "grad_norm": 18.954552335763864, "language_loss": 0.81401372, "learning_rate": 1.9242678126276307e-06, "loss": 0.82885766, "num_input_tokens_seen": 188266785, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.25854492, "step": 8760, "time_per_iteration": 4.010049104690552 }, { "auxiliary_loss_clip": 0.01313524, "auxiliary_loss_mlp": 0.00199229, "balance_loss_clip": 1.09368014, "balance_loss_mlp": 0.17225154, "epoch": 0.5267398166240793, "flos": 20951434149120.0, "grad_norm": 9.829907473617705, "language_loss": 0.87094736, "learning_rate": 1.923878631697736e-06, "loss": 0.8860749, "num_input_tokens_seen": 188282525, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26953125, "step": 8761, "time_per_iteration": 2.6444709300994873 }, { "auxiliary_loss_clip": 0.01308094, "auxiliary_loss_mlp": 0.00187871, "balance_loss_clip": 1.08999968, "balance_loss_mlp": 0.16184777, "epoch": 0.5267999398767473, "flos": 20996394998400.0, "grad_norm": 4.059219752938652, "language_loss": 0.81924474, "learning_rate": 1.923489453654373e-06, "loss": 0.83420438, "num_input_tokens_seen": 188301395, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.26049805, "step": 8762, "time_per_iteration": 2.7862846851348877 }, { "auxiliary_loss_clip": 0.01301423, "auxiliary_loss_mlp": 0.00057536, "balance_loss_clip": 1.15686917, "balance_loss_mlp": 0.05186136, "epoch": 0.5268600631294152, "flos": 66849401767680.0, "grad_norm": 0.9511442441184109, "language_loss": 0.65046495, "learning_rate": 1.9231002785122963e-06, "loss": 0.66405457, "num_input_tokens_seen": 188357665, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.05664062, "step": 8763, "time_per_iteration": 2.995384931564331 }, { "auxiliary_loss_clip": 0.01335237, "auxiliary_loss_mlp": 0.00207632, "balance_loss_clip": 1.1154232, "balance_loss_mlp": 0.18073803, "epoch": 0.5269201863820833, "flos": 17165552572800.0, "grad_norm": 8.032090947629124, "language_loss": 0.80323005, "learning_rate": 1.922711106286265e-06, "loss": 0.81865871, "num_input_tokens_seen": 188376935, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26904297, "step": 8764, "time_per_iteration": 4.100106716156006 }, { "auxiliary_loss_clip": 0.01328008, "auxiliary_loss_mlp": 0.0021971, "balance_loss_clip": 1.10968733, "balance_loss_mlp": 0.19340067, "epoch": 0.5269803096347513, "flos": 20522589672960.0, "grad_norm": 11.386695265955117, "language_loss": 0.82060581, "learning_rate": 1.9223219369910368e-06, "loss": 0.83608299, "num_input_tokens_seen": 188394995, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.26318359, "step": 8765, "time_per_iteration": 2.684817314147949 }, { "auxiliary_loss_clip": 0.01328873, "auxiliary_loss_mlp": 0.00210501, "balance_loss_clip": 1.1082685, "balance_loss_mlp": 0.18190277, "epoch": 0.5270404328874192, "flos": 27230989524480.0, "grad_norm": 225.0763880521249, "language_loss": 0.92324448, "learning_rate": 1.9219327706413677e-06, "loss": 0.93863815, "num_input_tokens_seen": 188415475, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.2857666, "step": 8766, "time_per_iteration": 2.7008094787597656 }, { "auxiliary_loss_clip": 0.01340614, "auxiliary_loss_mlp": 0.00203778, "balance_loss_clip": 1.11510229, "balance_loss_mlp": 0.1767533, "epoch": 0.5271005561400872, "flos": 23110491824640.0, "grad_norm": 4941.280965514039, "language_loss": 0.86625671, "learning_rate": 1.921543607252017e-06, "loss": 0.88170069, "num_input_tokens_seen": 188435665, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.27038574, "step": 8767, "time_per_iteration": 2.63136887550354 }, { "auxiliary_loss_clip": 0.01340772, "auxiliary_loss_mlp": 0.00231817, "balance_loss_clip": 1.11580622, "balance_loss_mlp": 0.20377934, "epoch": 0.5271606793927551, "flos": 22564793427840.0, "grad_norm": 43.77457821984019, "language_loss": 0.80716127, "learning_rate": 1.9211544468377394e-06, "loss": 0.82288718, "num_input_tokens_seen": 188455405, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.28027344, "step": 8768, "time_per_iteration": 2.658099412918091 }, { "auxiliary_loss_clip": 0.01336717, "auxiliary_loss_mlp": 0.0020913, "balance_loss_clip": 1.1182797, "balance_loss_mlp": 0.18282032, "epoch": 0.5272208026454231, "flos": 18764259102720.0, "grad_norm": 2.9185506531911267, "language_loss": 0.82755673, "learning_rate": 1.9207652894132933e-06, "loss": 0.84301519, "num_input_tokens_seen": 188472940, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.26293945, "step": 8769, "time_per_iteration": 2.7952685356140137 }, { "auxiliary_loss_clip": 0.01340699, "auxiliary_loss_mlp": 0.00200271, "balance_loss_clip": 1.12163734, "balance_loss_mlp": 0.17305601, "epoch": 0.5272809258980911, "flos": 20412164286720.0, "grad_norm": 20.976225596258786, "language_loss": 0.82601523, "learning_rate": 1.920376134993436e-06, "loss": 0.84142494, "num_input_tokens_seen": 188493035, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.27209473, "step": 8770, "time_per_iteration": 2.657714605331421 }, { "auxiliary_loss_clip": 0.01361958, "auxiliary_loss_mlp": 0.00223774, "balance_loss_clip": 1.13910651, "balance_loss_mlp": 0.19597465, "epoch": 0.5273410491507591, "flos": 28256742213120.0, "grad_norm": 8.657165811505251, "language_loss": 0.77478528, "learning_rate": 1.9199869835929224e-06, "loss": 0.79064256, "num_input_tokens_seen": 188513860, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.2779541, "step": 8771, "time_per_iteration": 2.729231357574463 }, { "auxiliary_loss_clip": 0.01339124, "auxiliary_loss_mlp": 0.00222606, "balance_loss_clip": 1.1236012, "balance_loss_mlp": 0.19530748, "epoch": 0.527401172403427, "flos": 22455158140800.0, "grad_norm": 5.096451887355266, "language_loss": 0.84126627, "learning_rate": 1.9195978352265115e-06, "loss": 0.85688353, "num_input_tokens_seen": 188533345, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.27294922, "step": 8772, "time_per_iteration": 2.643411874771118 }, { "auxiliary_loss_clip": 0.01347123, "auxiliary_loss_mlp": 0.00209327, "balance_loss_clip": 1.125494, "balance_loss_mlp": 0.18213573, "epoch": 0.527461295656095, "flos": 21031084558080.0, "grad_norm": 8.652763085333394, "language_loss": 0.76740527, "learning_rate": 1.9192086899089585e-06, "loss": 0.78296977, "num_input_tokens_seen": 188551550, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.2722168, "step": 8773, "time_per_iteration": 2.754176139831543 }, { "auxiliary_loss_clip": 0.01337681, "auxiliary_loss_mlp": 0.00211368, "balance_loss_clip": 1.12289023, "balance_loss_mlp": 0.18493938, "epoch": 0.5275214189087629, "flos": 26322018929280.0, "grad_norm": 3.2819183339953164, "language_loss": 0.91477644, "learning_rate": 1.91881954765502e-06, "loss": 0.93026692, "num_input_tokens_seen": 188571615, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.26403809, "step": 8774, "time_per_iteration": 2.6675946712493896 }, { "auxiliary_loss_clip": 0.01337865, "auxiliary_loss_mlp": 0.002257, "balance_loss_clip": 1.12287617, "balance_loss_mlp": 0.19881785, "epoch": 0.5275815421614309, "flos": 20047024581120.0, "grad_norm": 12.287881996266727, "language_loss": 0.86215061, "learning_rate": 1.9184304084794523e-06, "loss": 0.87778628, "num_input_tokens_seen": 188591965, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.2689209, "step": 8775, "time_per_iteration": 2.647134780883789 }, { "auxiliary_loss_clip": 0.01332577, "auxiliary_loss_mlp": 0.00210388, "balance_loss_clip": 1.12214351, "balance_loss_mlp": 0.18403043, "epoch": 0.5276416654140988, "flos": 21432206712960.0, "grad_norm": 173.75319241468904, "language_loss": 0.90181148, "learning_rate": 1.918041272397012e-06, "loss": 0.91724116, "num_input_tokens_seen": 188610675, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.26379395, "step": 8776, "time_per_iteration": 2.645421266555786 }, { "auxiliary_loss_clip": 0.01353783, "auxiliary_loss_mlp": 0.00208903, "balance_loss_clip": 1.13038683, "balance_loss_mlp": 0.18398872, "epoch": 0.5277017886667669, "flos": 17165085696000.0, "grad_norm": 4.856251409565469, "language_loss": 0.74933195, "learning_rate": 1.9176521394224547e-06, "loss": 0.7649588, "num_input_tokens_seen": 188628235, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.24926758, "step": 8777, "time_per_iteration": 2.6404359340667725 }, { "auxiliary_loss_clip": 0.01331475, "auxiliary_loss_mlp": 0.00213702, "balance_loss_clip": 1.11819363, "balance_loss_mlp": 0.18701127, "epoch": 0.5277619119194349, "flos": 20448146736000.0, "grad_norm": 3051.1527252057713, "language_loss": 0.88289505, "learning_rate": 1.9172630095705358e-06, "loss": 0.89834684, "num_input_tokens_seen": 188648925, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.26660156, "step": 8778, "time_per_iteration": 2.7115731239318848 }, { "auxiliary_loss_clip": 0.0136076, "auxiliary_loss_mlp": 0.00201499, "balance_loss_clip": 1.13607597, "balance_loss_mlp": 0.17395003, "epoch": 0.5278220351721028, "flos": 24061083304320.0, "grad_norm": 14.243541795252291, "language_loss": 0.88129747, "learning_rate": 1.916873882856013e-06, "loss": 0.89692008, "num_input_tokens_seen": 188668125, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.2755127, "step": 8779, "time_per_iteration": 2.7158584594726562 }, { "auxiliary_loss_clip": 0.01348471, "auxiliary_loss_mlp": 0.00191241, "balance_loss_clip": 1.13143277, "balance_loss_mlp": 0.16449061, "epoch": 0.5278821584247708, "flos": 24642907804800.0, "grad_norm": 141.1988562768924, "language_loss": 0.85694206, "learning_rate": 1.9164847592936406e-06, "loss": 0.87233913, "num_input_tokens_seen": 188684410, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.26745605, "step": 8780, "time_per_iteration": 2.632591724395752 }, { "auxiliary_loss_clip": 0.01373394, "auxiliary_loss_mlp": 0.00220424, "balance_loss_clip": 1.14797533, "balance_loss_mlp": 0.19319689, "epoch": 0.5279422816774387, "flos": 35408244240000.0, "grad_norm": 2.7024173427867897, "language_loss": 0.76463783, "learning_rate": 1.916095638898174e-06, "loss": 0.78057599, "num_input_tokens_seen": 188706130, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.2722168, "step": 8781, "time_per_iteration": 2.786389112472534 }, { "auxiliary_loss_clip": 0.0132607, "auxiliary_loss_mlp": 0.00195243, "balance_loss_clip": 1.11104429, "balance_loss_mlp": 0.17020926, "epoch": 0.5280024049301068, "flos": 22967028904320.0, "grad_norm": 143.1367412560772, "language_loss": 0.77952826, "learning_rate": 1.9157065216843696e-06, "loss": 0.79474139, "num_input_tokens_seen": 188725030, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.25024414, "step": 8782, "time_per_iteration": 2.6549019813537598 }, { "auxiliary_loss_clip": 0.01338006, "auxiliary_loss_mlp": 0.00213208, "balance_loss_clip": 1.12203932, "balance_loss_mlp": 0.18745838, "epoch": 0.5280625281827747, "flos": 21507619317120.0, "grad_norm": 10.193152351942766, "language_loss": 0.76988947, "learning_rate": 1.915317407666982e-06, "loss": 0.78540158, "num_input_tokens_seen": 188744325, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.2578125, "step": 8783, "time_per_iteration": 2.623230218887329 }, { "auxiliary_loss_clip": 0.0138413, "auxiliary_loss_mlp": 0.0024425, "balance_loss_clip": 1.14946318, "balance_loss_mlp": 0.21462689, "epoch": 0.5281226514354427, "flos": 31208167958400.0, "grad_norm": 9.563992231684418, "language_loss": 0.77443409, "learning_rate": 1.9149282968607674e-06, "loss": 0.79071796, "num_input_tokens_seen": 188765100, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.29626465, "step": 8784, "time_per_iteration": 2.7164928913116455 }, { "auxiliary_loss_clip": 0.01360876, "auxiliary_loss_mlp": 0.0023211, "balance_loss_clip": 1.13200045, "balance_loss_mlp": 0.20344047, "epoch": 0.5281827746881106, "flos": 25077821679360.0, "grad_norm": 31.214546731829504, "language_loss": 0.84746575, "learning_rate": 1.91453918928048e-06, "loss": 0.86339557, "num_input_tokens_seen": 188783995, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.28637695, "step": 8785, "time_per_iteration": 2.6566293239593506 }, { "auxiliary_loss_clip": 0.01357906, "auxiliary_loss_mlp": 0.00207266, "balance_loss_clip": 1.13590491, "balance_loss_mlp": 0.18005055, "epoch": 0.5282428979407786, "flos": 20631255292800.0, "grad_norm": 4.100255038035106, "language_loss": 0.90262705, "learning_rate": 1.9141500849408745e-06, "loss": 0.91827869, "num_input_tokens_seen": 188803120, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.2722168, "step": 8786, "time_per_iteration": 2.6585400104522705 }, { "auxiliary_loss_clip": 0.01353329, "auxiliary_loss_mlp": 0.00191131, "balance_loss_clip": 1.13639343, "balance_loss_mlp": 0.16509528, "epoch": 0.5283030211934465, "flos": 22419391173120.0, "grad_norm": 10.738251558330598, "language_loss": 0.88979506, "learning_rate": 1.9137609838567076e-06, "loss": 0.9052397, "num_input_tokens_seen": 188820960, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.26037598, "step": 8787, "time_per_iteration": 2.6691057682037354 }, { "auxiliary_loss_clip": 0.01347654, "auxiliary_loss_mlp": 0.00210959, "balance_loss_clip": 1.12931085, "balance_loss_mlp": 0.1848882, "epoch": 0.5283631444461145, "flos": 23615467176960.0, "grad_norm": 78.77848929551989, "language_loss": 0.89184928, "learning_rate": 1.9133718860427316e-06, "loss": 0.90743542, "num_input_tokens_seen": 188837165, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.26049805, "step": 8788, "time_per_iteration": 2.624455213546753 }, { "auxiliary_loss_clip": 0.01356695, "auxiliary_loss_mlp": 0.00219927, "balance_loss_clip": 1.13883209, "balance_loss_mlp": 0.19287859, "epoch": 0.5284232676987825, "flos": 32671994918400.0, "grad_norm": 2.876132248976745, "language_loss": 0.83894062, "learning_rate": 1.9129827915137027e-06, "loss": 0.85470688, "num_input_tokens_seen": 188858555, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.27050781, "step": 8789, "time_per_iteration": 2.720179319381714 }, { "auxiliary_loss_clip": 0.01351338, "auxiliary_loss_mlp": 0.00192773, "balance_loss_clip": 1.13479006, "balance_loss_mlp": 0.16680947, "epoch": 0.5284833909514505, "flos": 26760919213440.0, "grad_norm": 373.7935282398046, "language_loss": 0.79383242, "learning_rate": 1.9125937002843754e-06, "loss": 0.80927354, "num_input_tokens_seen": 188879050, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25964355, "step": 8790, "time_per_iteration": 2.6904942989349365 }, { "auxiliary_loss_clip": 0.01369828, "auxiliary_loss_mlp": 0.00199224, "balance_loss_clip": 1.14658117, "balance_loss_mlp": 0.17360634, "epoch": 0.5285435142041185, "flos": 22090700793600.0, "grad_norm": 2.494653406557066, "language_loss": 0.85364234, "learning_rate": 1.9122046123695036e-06, "loss": 0.86933285, "num_input_tokens_seen": 188898885, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25634766, "step": 8791, "time_per_iteration": 2.6830339431762695 }, { "auxiliary_loss_clip": 0.01364315, "auxiliary_loss_mlp": 0.00213659, "balance_loss_clip": 1.14345133, "balance_loss_mlp": 0.18681312, "epoch": 0.5286036374567864, "flos": 20375463565440.0, "grad_norm": 4.846754304173797, "language_loss": 0.75943935, "learning_rate": 1.9118155277838423e-06, "loss": 0.77521908, "num_input_tokens_seen": 188917225, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.26818848, "step": 8792, "time_per_iteration": 2.7056219577789307 }, { "auxiliary_loss_clip": 0.01357751, "auxiliary_loss_mlp": 0.00213799, "balance_loss_clip": 1.13940072, "balance_loss_mlp": 0.18869352, "epoch": 0.5286637607094544, "flos": 24352175122560.0, "grad_norm": 122.4877236744321, "language_loss": 0.90120554, "learning_rate": 1.9114264465421443e-06, "loss": 0.91692114, "num_input_tokens_seen": 188936120, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25097656, "step": 8793, "time_per_iteration": 2.758869171142578 }, { "auxiliary_loss_clip": 0.01371763, "auxiliary_loss_mlp": 0.00225933, "balance_loss_clip": 1.15052605, "balance_loss_mlp": 0.20046987, "epoch": 0.5287238839621223, "flos": 17271165536640.0, "grad_norm": 32.31575803405323, "language_loss": 0.92098975, "learning_rate": 1.9110373686591645e-06, "loss": 0.93696678, "num_input_tokens_seen": 188953405, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.2545166, "step": 8794, "time_per_iteration": 2.661921977996826 }, { "auxiliary_loss_clip": 0.01379944, "auxiliary_loss_mlp": 0.00238388, "balance_loss_clip": 1.15008497, "balance_loss_mlp": 0.20939678, "epoch": 0.5287840072147904, "flos": 17566890209280.0, "grad_norm": 6.116919524505697, "language_loss": 0.77946317, "learning_rate": 1.9106482941496564e-06, "loss": 0.79564643, "num_input_tokens_seen": 188971150, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.28979492, "step": 8795, "time_per_iteration": 2.70296573638916 }, { "auxiliary_loss_clip": 0.01386478, "auxiliary_loss_mlp": 0.00235055, "balance_loss_clip": 1.15720057, "balance_loss_mlp": 0.20826825, "epoch": 0.5288441304674583, "flos": 18552099421440.0, "grad_norm": 7.860429239029773, "language_loss": 0.90936494, "learning_rate": 1.910259223028374e-06, "loss": 0.9255802, "num_input_tokens_seen": 188989550, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.26757812, "step": 8796, "time_per_iteration": 2.6790072917938232 }, { "auxiliary_loss_clip": 0.01386295, "auxiliary_loss_mlp": 0.00244973, "balance_loss_clip": 1.1593821, "balance_loss_mlp": 0.2190212, "epoch": 0.5289042537201263, "flos": 20814507504000.0, "grad_norm": 392.73718118810285, "language_loss": 0.77594495, "learning_rate": 1.909870155310071e-06, "loss": 0.79225761, "num_input_tokens_seen": 189008795, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.2598877, "step": 8797, "time_per_iteration": 2.6476998329162598 }, { "auxiliary_loss_clip": 0.01392428, "auxiliary_loss_mlp": 0.0021146, "balance_loss_clip": 1.1667335, "balance_loss_mlp": 0.1854604, "epoch": 0.5289643769727942, "flos": 15735265937280.0, "grad_norm": 6.944182003126047, "language_loss": 0.89601398, "learning_rate": 1.9094810910095005e-06, "loss": 0.91205287, "num_input_tokens_seen": 189025540, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.2598877, "step": 8798, "time_per_iteration": 2.635138511657715 }, { "auxiliary_loss_clip": 0.01400088, "auxiliary_loss_mlp": 0.00235595, "balance_loss_clip": 1.16378915, "balance_loss_mlp": 0.20632918, "epoch": 0.5290245002254622, "flos": 19537308633600.0, "grad_norm": 4.4708604464919635, "language_loss": 0.7878812, "learning_rate": 1.9090920301414166e-06, "loss": 0.80423802, "num_input_tokens_seen": 189044885, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.29284668, "step": 8799, "time_per_iteration": 4.047868490219116 }, { "auxiliary_loss_clip": 0.01408166, "auxiliary_loss_mlp": 0.00183932, "balance_loss_clip": 1.17940342, "balance_loss_mlp": 0.15850492, "epoch": 0.5290846234781301, "flos": 15815131827840.0, "grad_norm": 140.50146424453175, "language_loss": 0.7761932, "learning_rate": 1.9087029727205716e-06, "loss": 0.79211414, "num_input_tokens_seen": 189061280, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.25402832, "step": 8800, "time_per_iteration": 4.087331295013428 }, { "auxiliary_loss_clip": 0.01401952, "auxiliary_loss_mlp": 0.00077983, "balance_loss_clip": 1.25786543, "balance_loss_mlp": 0.07044925, "epoch": 0.5291447467307981, "flos": 70057624821120.0, "grad_norm": 1.0005875669802278, "language_loss": 0.57119167, "learning_rate": 1.9083139187617193e-06, "loss": 0.58599102, "num_input_tokens_seen": 189114775, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.07519531, "step": 8801, "time_per_iteration": 2.9887704849243164 }, { "auxiliary_loss_clip": 0.01397064, "auxiliary_loss_mlp": 0.00241164, "balance_loss_clip": 1.16534042, "balance_loss_mlp": 0.2129713, "epoch": 0.529204869983466, "flos": 28364186770560.0, "grad_norm": 39.04311803007078, "language_loss": 0.71494085, "learning_rate": 1.9079248682796123e-06, "loss": 0.73132312, "num_input_tokens_seen": 189134700, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.28198242, "step": 8802, "time_per_iteration": 4.111076831817627 }, { "auxiliary_loss_clip": 0.01404078, "auxiliary_loss_mlp": 0.00218076, "balance_loss_clip": 1.17256665, "balance_loss_mlp": 0.19275641, "epoch": 0.5292649932361341, "flos": 33758830684800.0, "grad_norm": 7.9317238929451985, "language_loss": 0.76087439, "learning_rate": 1.907535821289003e-06, "loss": 0.77709591, "num_input_tokens_seen": 189155365, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.2532959, "step": 8803, "time_per_iteration": 2.7654898166656494 }, { "auxiliary_loss_clip": 0.01394003, "auxiliary_loss_mlp": 0.00219254, "balance_loss_clip": 1.16258609, "balance_loss_mlp": 0.1918118, "epoch": 0.5293251164888021, "flos": 20447679859200.0, "grad_norm": 66.07972433164336, "language_loss": 0.84798658, "learning_rate": 1.9071467778046458e-06, "loss": 0.86411917, "num_input_tokens_seen": 189173885, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.27490234, "step": 8804, "time_per_iteration": 2.6279683113098145 }, { "auxiliary_loss_clip": 0.01441931, "auxiliary_loss_mlp": 0.00112033, "balance_loss_clip": 1.2982235, "balance_loss_mlp": 0.10340185, "epoch": 0.52938523974147, "flos": 66545312204160.0, "grad_norm": 0.755845984043334, "language_loss": 0.5256331, "learning_rate": 1.906757737841291e-06, "loss": 0.54117274, "num_input_tokens_seen": 189236515, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.08642578, "step": 8805, "time_per_iteration": 3.187822103500366 }, { "auxiliary_loss_clip": 0.01445483, "auxiliary_loss_mlp": 0.00080676, "balance_loss_clip": 1.29988706, "balance_loss_mlp": 0.07276089, "epoch": 0.529445362994138, "flos": 67151734542720.0, "grad_norm": 0.7412859649412711, "language_loss": 0.63745016, "learning_rate": 1.906368701413693e-06, "loss": 0.65271169, "num_input_tokens_seen": 189300500, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.07910156, "step": 8806, "time_per_iteration": 3.090163230895996 }, { "auxiliary_loss_clip": 0.01414836, "auxiliary_loss_mlp": 0.00233369, "balance_loss_clip": 1.17406106, "balance_loss_mlp": 0.20422257, "epoch": 0.5295054862468059, "flos": 17749316407680.0, "grad_norm": 48.09907611455745, "language_loss": 0.78694928, "learning_rate": 1.9059796685366026e-06, "loss": 0.80343133, "num_input_tokens_seen": 189319745, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.29125977, "step": 8807, "time_per_iteration": 4.012441635131836 }, { "auxiliary_loss_clip": 0.01405278, "auxiliary_loss_mlp": 0.00202846, "balance_loss_clip": 1.173491, "balance_loss_mlp": 0.17722759, "epoch": 0.529565609499474, "flos": 11397401084160.0, "grad_norm": 16.150898058132146, "language_loss": 0.78422147, "learning_rate": 1.9055906392247723e-06, "loss": 0.80030274, "num_input_tokens_seen": 189334550, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.25610352, "step": 8808, "time_per_iteration": 2.5804152488708496 }, { "auxiliary_loss_clip": 0.01400344, "auxiliary_loss_mlp": 0.00194452, "balance_loss_clip": 1.17147648, "balance_loss_mlp": 0.16894153, "epoch": 0.5296257327521419, "flos": 17196363463680.0, "grad_norm": 3.325717687548196, "language_loss": 0.9411239, "learning_rate": 1.9052016134929554e-06, "loss": 0.95707184, "num_input_tokens_seen": 189351735, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.25537109, "step": 8809, "time_per_iteration": 2.5766797065734863 }, { "auxiliary_loss_clip": 0.01432476, "auxiliary_loss_mlp": 0.00245414, "balance_loss_clip": 1.18837905, "balance_loss_mlp": 0.2160172, "epoch": 0.5296858560048099, "flos": 39964086777600.0, "grad_norm": 86.90373895790039, "language_loss": 0.70980811, "learning_rate": 1.9048125913559016e-06, "loss": 0.72658706, "num_input_tokens_seen": 189373105, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.29394531, "step": 8810, "time_per_iteration": 2.8256092071533203 }, { "auxiliary_loss_clip": 0.0140656, "auxiliary_loss_mlp": 0.00216457, "balance_loss_clip": 1.17503834, "balance_loss_mlp": 0.18779886, "epoch": 0.5297459792574778, "flos": 20961418129920.0, "grad_norm": 23.38185010130542, "language_loss": 0.74032247, "learning_rate": 1.9044235728283646e-06, "loss": 0.75655264, "num_input_tokens_seen": 189394615, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.28649902, "step": 8811, "time_per_iteration": 2.6961348056793213 }, { "auxiliary_loss_clip": 0.0138955, "auxiliary_loss_mlp": 0.00075435, "balance_loss_clip": 1.25316262, "balance_loss_mlp": 0.06813949, "epoch": 0.5298061025101458, "flos": 66523620389760.0, "grad_norm": 0.6553655750857896, "language_loss": 0.52993536, "learning_rate": 1.9040345579250953e-06, "loss": 0.54458523, "num_input_tokens_seen": 189459750, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.07275391, "step": 8812, "time_per_iteration": 3.25697922706604 }, { "auxiliary_loss_clip": 0.01385266, "auxiliary_loss_mlp": 0.00078771, "balance_loss_clip": 1.24743748, "balance_loss_mlp": 0.07128425, "epoch": 0.5298662257628137, "flos": 67662994775040.0, "grad_norm": 0.7192499334016521, "language_loss": 0.56301618, "learning_rate": 1.9036455466608453e-06, "loss": 0.57765651, "num_input_tokens_seen": 189527540, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07470703, "step": 8813, "time_per_iteration": 3.225308656692505 }, { "auxiliary_loss_clip": 0.01404014, "auxiliary_loss_mlp": 0.00190685, "balance_loss_clip": 1.17762566, "balance_loss_mlp": 0.16592479, "epoch": 0.5299263490154817, "flos": 19646405216640.0, "grad_norm": 15.440749390224251, "language_loss": 0.86950344, "learning_rate": 1.9032565390503657e-06, "loss": 0.88545042, "num_input_tokens_seen": 189546900, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.24768066, "step": 8814, "time_per_iteration": 2.633758306503296 }, { "auxiliary_loss_clip": 0.0142403, "auxiliary_loss_mlp": 0.00233561, "balance_loss_clip": 1.18154716, "balance_loss_mlp": 0.20515332, "epoch": 0.5299864722681497, "flos": 22055005653120.0, "grad_norm": 11.035321479293287, "language_loss": 0.90120775, "learning_rate": 1.9028675351084076e-06, "loss": 0.91778362, "num_input_tokens_seen": 189566490, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.28381348, "step": 8815, "time_per_iteration": 2.8344321250915527 }, { "auxiliary_loss_clip": 0.01407328, "auxiliary_loss_mlp": 0.00212024, "balance_loss_clip": 1.17683887, "balance_loss_mlp": 0.18368816, "epoch": 0.5300465955208177, "flos": 21763698353280.0, "grad_norm": 16.78085962703768, "language_loss": 0.72733676, "learning_rate": 1.9024785348497225e-06, "loss": 0.74353027, "num_input_tokens_seen": 189585580, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.2833252, "step": 8816, "time_per_iteration": 2.6888022422790527 }, { "auxiliary_loss_clip": 0.01389706, "auxiliary_loss_mlp": 0.00220221, "balance_loss_clip": 1.1602366, "balance_loss_mlp": 0.19319603, "epoch": 0.5301067187734857, "flos": 42996491735040.0, "grad_norm": 10.422712312636461, "language_loss": 0.79774773, "learning_rate": 1.9020895382890611e-06, "loss": 0.81384695, "num_input_tokens_seen": 189608485, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.26989746, "step": 8817, "time_per_iteration": 2.8365633487701416 }, { "auxiliary_loss_clip": 0.01380404, "auxiliary_loss_mlp": 0.00253294, "balance_loss_clip": 1.1487385, "balance_loss_mlp": 0.22554184, "epoch": 0.5301668420261536, "flos": 20554298403840.0, "grad_norm": 8.730615305490105, "language_loss": 0.71572036, "learning_rate": 1.9017005454411743e-06, "loss": 0.73205733, "num_input_tokens_seen": 189627815, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.27770996, "step": 8818, "time_per_iteration": 2.679097890853882 }, { "auxiliary_loss_clip": 0.01386768, "auxiliary_loss_mlp": 0.0022931, "balance_loss_clip": 1.15269065, "balance_loss_mlp": 0.19938877, "epoch": 0.5302269652788216, "flos": 17486665182720.0, "grad_norm": 16.987772065000524, "language_loss": 0.82641101, "learning_rate": 1.9013115563208126e-06, "loss": 0.84257174, "num_input_tokens_seen": 189644850, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.29931641, "step": 8819, "time_per_iteration": 2.6713345050811768 }, { "auxiliary_loss_clip": 0.01388361, "auxiliary_loss_mlp": 0.00246988, "balance_loss_clip": 1.15173507, "balance_loss_mlp": 0.22029723, "epoch": 0.5302870885314895, "flos": 14574202715520.0, "grad_norm": 66.23858829634625, "language_loss": 0.91628933, "learning_rate": 1.9009225709427267e-06, "loss": 0.93264288, "num_input_tokens_seen": 189660945, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.2668457, "step": 8820, "time_per_iteration": 2.5928876399993896 }, { "auxiliary_loss_clip": 0.01368091, "auxiliary_loss_mlp": 0.00229519, "balance_loss_clip": 1.14102948, "balance_loss_mlp": 0.2031979, "epoch": 0.5303472117841576, "flos": 23438032968960.0, "grad_norm": 575.2210811618712, "language_loss": 0.7850045, "learning_rate": 1.9005335893216667e-06, "loss": 0.80098057, "num_input_tokens_seen": 189680425, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.26306152, "step": 8821, "time_per_iteration": 2.6302483081817627 }, { "auxiliary_loss_clip": 0.01363655, "auxiliary_loss_mlp": 0.00216808, "balance_loss_clip": 1.13729918, "balance_loss_mlp": 0.18925855, "epoch": 0.5304073350368255, "flos": 22709010533760.0, "grad_norm": 14.468224962031535, "language_loss": 0.81105047, "learning_rate": 1.9001446114723824e-06, "loss": 0.82685512, "num_input_tokens_seen": 189700375, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.27563477, "step": 8822, "time_per_iteration": 2.645082712173462 }, { "auxiliary_loss_clip": 0.01377021, "auxiliary_loss_mlp": 0.00213953, "balance_loss_clip": 1.14396131, "balance_loss_mlp": 0.18659422, "epoch": 0.5304674582894935, "flos": 27928554624000.0, "grad_norm": 33.49209955528062, "language_loss": 0.73441648, "learning_rate": 1.8997556374096257e-06, "loss": 0.75032622, "num_input_tokens_seen": 189721225, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.27331543, "step": 8823, "time_per_iteration": 2.6888954639434814 }, { "auxiliary_loss_clip": 0.01359644, "auxiliary_loss_mlp": 0.00234203, "balance_loss_clip": 1.13199496, "balance_loss_mlp": 0.2068803, "epoch": 0.5305275815421614, "flos": 21250642440960.0, "grad_norm": 9.09299471811976, "language_loss": 0.7666384, "learning_rate": 1.8993666671481444e-06, "loss": 0.78257686, "num_input_tokens_seen": 189740170, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.27355957, "step": 8824, "time_per_iteration": 2.694246768951416 }, { "auxiliary_loss_clip": 0.01357296, "auxiliary_loss_mlp": 0.0021522, "balance_loss_clip": 1.12958944, "balance_loss_mlp": 0.18943472, "epoch": 0.5305877047948294, "flos": 17603088140160.0, "grad_norm": 5.312070021836843, "language_loss": 0.84695888, "learning_rate": 1.898977700702689e-06, "loss": 0.86268401, "num_input_tokens_seen": 189757890, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.25769043, "step": 8825, "time_per_iteration": 2.7001466751098633 }, { "auxiliary_loss_clip": 0.01347124, "auxiliary_loss_mlp": 0.00205691, "balance_loss_clip": 1.12313437, "balance_loss_mlp": 0.17890519, "epoch": 0.5306478280474973, "flos": 15195493284480.0, "grad_norm": 49.48392362559418, "language_loss": 0.92142254, "learning_rate": 1.8985887380880103e-06, "loss": 0.93695068, "num_input_tokens_seen": 189775390, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26806641, "step": 8826, "time_per_iteration": 2.620328903198242 }, { "auxiliary_loss_clip": 0.01356589, "auxiliary_loss_mlp": 0.00198842, "balance_loss_clip": 1.12761998, "balance_loss_mlp": 0.1734032, "epoch": 0.5307079513001653, "flos": 15341218761600.0, "grad_norm": 2.7983889975298184, "language_loss": 0.71236753, "learning_rate": 1.8981997793188558e-06, "loss": 0.72792184, "num_input_tokens_seen": 189793975, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.2545166, "step": 8827, "time_per_iteration": 2.6464192867279053 }, { "auxiliary_loss_clip": 0.01352994, "auxiliary_loss_mlp": 0.00207036, "balance_loss_clip": 1.12634885, "balance_loss_mlp": 0.17977326, "epoch": 0.5307680745528333, "flos": 43544452688640.0, "grad_norm": 4.411015113791605, "language_loss": 0.68317652, "learning_rate": 1.8978108244099762e-06, "loss": 0.69877684, "num_input_tokens_seen": 189817870, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.27233887, "step": 8828, "time_per_iteration": 2.8367538452148438 }, { "auxiliary_loss_clip": 0.01377189, "auxiliary_loss_mlp": 0.00214935, "balance_loss_clip": 1.13864231, "balance_loss_mlp": 0.18638425, "epoch": 0.5308281978055013, "flos": 20048928001920.0, "grad_norm": 232.51399822569337, "language_loss": 0.88935149, "learning_rate": 1.8974218733761208e-06, "loss": 0.90527272, "num_input_tokens_seen": 189837905, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.28503418, "step": 8829, "time_per_iteration": 2.647507429122925 }, { "auxiliary_loss_clip": 0.01349945, "auxiliary_loss_mlp": 0.00205237, "balance_loss_clip": 1.12064171, "balance_loss_mlp": 0.17902312, "epoch": 0.5308883210581693, "flos": 20703938463360.0, "grad_norm": 60.53715645860559, "language_loss": 0.84490311, "learning_rate": 1.8970329262320375e-06, "loss": 0.86045492, "num_input_tokens_seen": 189856970, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.26196289, "step": 8830, "time_per_iteration": 2.631129741668701 }, { "auxiliary_loss_clip": 0.01350024, "auxiliary_loss_mlp": 0.00209028, "balance_loss_clip": 1.1177609, "balance_loss_mlp": 0.1803223, "epoch": 0.5309484443108372, "flos": 14355506759040.0, "grad_norm": 9.51350635935875, "language_loss": 0.88299304, "learning_rate": 1.8966439829924768e-06, "loss": 0.89858353, "num_input_tokens_seen": 189872830, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.2869873, "step": 8831, "time_per_iteration": 2.640223741531372 }, { "auxiliary_loss_clip": 0.01328885, "auxiliary_loss_mlp": 0.00224479, "balance_loss_clip": 1.1027714, "balance_loss_mlp": 0.19828892, "epoch": 0.5310085675635052, "flos": 20010503427840.0, "grad_norm": 10.255647969393165, "language_loss": 0.80420673, "learning_rate": 1.896255043672186e-06, "loss": 0.81974041, "num_input_tokens_seen": 189891635, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.26220703, "step": 8832, "time_per_iteration": 2.6848440170288086 }, { "auxiliary_loss_clip": 0.01355997, "auxiliary_loss_mlp": 0.00243234, "balance_loss_clip": 1.1207794, "balance_loss_mlp": 0.21378979, "epoch": 0.5310686908161731, "flos": 22127293774080.0, "grad_norm": 8.787643586964142, "language_loss": 0.84486139, "learning_rate": 1.8958661082859143e-06, "loss": 0.86085373, "num_input_tokens_seen": 189909050, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.29455566, "step": 8833, "time_per_iteration": 2.6634650230407715 }, { "auxiliary_loss_clip": 0.01333057, "auxiliary_loss_mlp": 0.00237352, "balance_loss_clip": 1.10725641, "balance_loss_mlp": 0.20797896, "epoch": 0.5311288140688412, "flos": 24717889445760.0, "grad_norm": 13.790071571962171, "language_loss": 0.79554892, "learning_rate": 1.8954771768484103e-06, "loss": 0.81125301, "num_input_tokens_seen": 189927405, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.29406738, "step": 8834, "time_per_iteration": 2.6782963275909424 }, { "auxiliary_loss_clip": 0.0134365, "auxiliary_loss_mlp": 0.00228294, "balance_loss_clip": 1.10550046, "balance_loss_mlp": 0.19837235, "epoch": 0.5311889373215091, "flos": 24097712198400.0, "grad_norm": 22.188036947049916, "language_loss": 0.86247337, "learning_rate": 1.8950882493744226e-06, "loss": 0.87819278, "num_input_tokens_seen": 189947740, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.29919434, "step": 8835, "time_per_iteration": 2.693085193634033 }, { "auxiliary_loss_clip": 0.01316476, "auxiliary_loss_mlp": 0.00222164, "balance_loss_clip": 1.09480906, "balance_loss_mlp": 0.19534159, "epoch": 0.5312490605741771, "flos": 22017012042240.0, "grad_norm": 27.597922389265907, "language_loss": 0.8042531, "learning_rate": 1.8946993258786985e-06, "loss": 0.81963944, "num_input_tokens_seen": 189966495, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26818848, "step": 8836, "time_per_iteration": 2.641768455505371 }, { "auxiliary_loss_clip": 0.01330738, "auxiliary_loss_mlp": 0.00229, "balance_loss_clip": 1.10438693, "balance_loss_mlp": 0.19934025, "epoch": 0.531309183826845, "flos": 19390541662080.0, "grad_norm": 4.574279460768588, "language_loss": 0.88490582, "learning_rate": 1.894310406375987e-06, "loss": 0.90050316, "num_input_tokens_seen": 189985325, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.296875, "step": 8837, "time_per_iteration": 2.6276605129241943 }, { "auxiliary_loss_clip": 0.01334654, "auxiliary_loss_mlp": 0.00228742, "balance_loss_clip": 1.10744703, "balance_loss_mlp": 0.20133591, "epoch": 0.531369307079513, "flos": 20190056538240.0, "grad_norm": 4.416263697111819, "language_loss": 0.92115259, "learning_rate": 1.893921490881035e-06, "loss": 0.93678653, "num_input_tokens_seen": 190003290, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.27404785, "step": 8838, "time_per_iteration": 2.656812906265259 }, { "auxiliary_loss_clip": 0.01321757, "auxiliary_loss_mlp": 0.00226262, "balance_loss_clip": 1.10148644, "balance_loss_mlp": 0.20081039, "epoch": 0.5314294303321809, "flos": 18880143356160.0, "grad_norm": 36.19616666801055, "language_loss": 0.78582323, "learning_rate": 1.8935325794085906e-06, "loss": 0.80130339, "num_input_tokens_seen": 190023260, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25427246, "step": 8839, "time_per_iteration": 2.83188533782959 }, { "auxiliary_loss_clip": 0.01331403, "auxiliary_loss_mlp": 0.00208644, "balance_loss_clip": 1.10593116, "balance_loss_mlp": 0.18232268, "epoch": 0.531489553584849, "flos": 23040035297280.0, "grad_norm": 2.4897470004534044, "language_loss": 0.82628262, "learning_rate": 1.8931436719734023e-06, "loss": 0.84168309, "num_input_tokens_seen": 190042035, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.26367188, "step": 8840, "time_per_iteration": 2.7143337726593018 }, { "auxiliary_loss_clip": 0.01324165, "auxiliary_loss_mlp": 0.00230721, "balance_loss_clip": 1.10074162, "balance_loss_mlp": 0.20425692, "epoch": 0.5315496768375169, "flos": 19790478668160.0, "grad_norm": 15.087597368523925, "language_loss": 0.83620954, "learning_rate": 1.892754768590216e-06, "loss": 0.85175848, "num_input_tokens_seen": 190057545, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26452637, "step": 8841, "time_per_iteration": 4.158872127532959 }, { "auxiliary_loss_clip": 0.01295858, "auxiliary_loss_mlp": 0.0009674, "balance_loss_clip": 1.16282797, "balance_loss_mlp": 0.08920594, "epoch": 0.5316098000901849, "flos": 71023228185600.0, "grad_norm": 0.6773362758827933, "language_loss": 0.56506139, "learning_rate": 1.8923658692737793e-06, "loss": 0.57898736, "num_input_tokens_seen": 190123800, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.07519531, "step": 8842, "time_per_iteration": 3.2526378631591797 }, { "auxiliary_loss_clip": 0.01337018, "auxiliary_loss_mlp": 0.00235055, "balance_loss_clip": 1.1032331, "balance_loss_mlp": 0.20588492, "epoch": 0.5316699233428529, "flos": 16435560470400.0, "grad_norm": 10.904316354062432, "language_loss": 0.82617456, "learning_rate": 1.8919769740388407e-06, "loss": 0.84189528, "num_input_tokens_seen": 190141625, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.29187012, "step": 8843, "time_per_iteration": 3.9723427295684814 }, { "auxiliary_loss_clip": 0.01302161, "auxiliary_loss_mlp": 0.00069313, "balance_loss_clip": 1.16841578, "balance_loss_mlp": 0.06225591, "epoch": 0.5317300465955208, "flos": 67420814302080.0, "grad_norm": 1.059797079044665, "language_loss": 0.60708344, "learning_rate": 1.891588082900145e-06, "loss": 0.62079811, "num_input_tokens_seen": 190198110, "router_z_loss_clip": 1.3359375, "router_z_loss_mlp": 0.07080078, "step": 8844, "time_per_iteration": 4.463963747024536 }, { "auxiliary_loss_clip": 0.01314539, "auxiliary_loss_mlp": 0.00090941, "balance_loss_clip": 1.17590451, "balance_loss_mlp": 0.08331159, "epoch": 0.5317901698481888, "flos": 59508075340800.0, "grad_norm": 0.8223014431592927, "language_loss": 0.61542094, "learning_rate": 1.8911991958724411e-06, "loss": 0.62947571, "num_input_tokens_seen": 190259950, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.07617188, "step": 8845, "time_per_iteration": 3.100733518600464 }, { "auxiliary_loss_clip": 0.01331617, "auxiliary_loss_mlp": 0.00237584, "balance_loss_clip": 1.10711813, "balance_loss_mlp": 0.21070217, "epoch": 0.5318502931008567, "flos": 19129219240320.0, "grad_norm": 294.50330362886973, "language_loss": 0.85988998, "learning_rate": 1.890810312970474e-06, "loss": 0.87558198, "num_input_tokens_seen": 190278265, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.26879883, "step": 8846, "time_per_iteration": 2.608633041381836 }, { "auxiliary_loss_clip": 0.01318153, "auxiliary_loss_mlp": 0.00220045, "balance_loss_clip": 1.09625697, "balance_loss_mlp": 0.19342583, "epoch": 0.5319104163535248, "flos": 24681045070080.0, "grad_norm": 4.891145386346162, "language_loss": 0.82126832, "learning_rate": 1.8904214342089903e-06, "loss": 0.83665025, "num_input_tokens_seen": 190298400, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.26635742, "step": 8847, "time_per_iteration": 2.6302475929260254 }, { "auxiliary_loss_clip": 0.01305915, "auxiliary_loss_mlp": 0.0022219, "balance_loss_clip": 1.08791065, "balance_loss_mlp": 0.19753733, "epoch": 0.5319705396061927, "flos": 19385513758080.0, "grad_norm": 88.7614540242931, "language_loss": 0.94171488, "learning_rate": 1.8900325596027378e-06, "loss": 0.95699596, "num_input_tokens_seen": 190316235, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24645996, "step": 8848, "time_per_iteration": 2.6336886882781982 }, { "auxiliary_loss_clip": 0.01321424, "auxiliary_loss_mlp": 0.00252999, "balance_loss_clip": 1.09794021, "balance_loss_mlp": 0.22401963, "epoch": 0.5320306628588607, "flos": 18259319664000.0, "grad_norm": 17.597573527670246, "language_loss": 0.84148508, "learning_rate": 1.8896436891664609e-06, "loss": 0.85722935, "num_input_tokens_seen": 190335060, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.28967285, "step": 8849, "time_per_iteration": 3.9967682361602783 }, { "auxiliary_loss_clip": 0.01315605, "auxiliary_loss_mlp": 0.00221927, "balance_loss_clip": 1.09026122, "balance_loss_mlp": 0.19410405, "epoch": 0.5320907861115286, "flos": 23732321097600.0, "grad_norm": 84.88460102244512, "language_loss": 0.86652172, "learning_rate": 1.8892548229149066e-06, "loss": 0.88189703, "num_input_tokens_seen": 190353265, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.27819824, "step": 8850, "time_per_iteration": 2.6548116207122803 }, { "auxiliary_loss_clip": 0.01284585, "auxiliary_loss_mlp": 0.00238835, "balance_loss_clip": 1.07454014, "balance_loss_mlp": 0.21284755, "epoch": 0.5321509093641966, "flos": 34495251321600.0, "grad_norm": 3.304838102593183, "language_loss": 0.60671806, "learning_rate": 1.888865960862821e-06, "loss": 0.62195224, "num_input_tokens_seen": 190376575, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.25976562, "step": 8851, "time_per_iteration": 2.764244318008423 }, { "auxiliary_loss_clip": 0.01311494, "auxiliary_loss_mlp": 0.0023552, "balance_loss_clip": 1.08968818, "balance_loss_mlp": 0.2073632, "epoch": 0.5322110326168645, "flos": 20010934391040.0, "grad_norm": 22.905813403211262, "language_loss": 0.77382159, "learning_rate": 1.8884771030249484e-06, "loss": 0.78929174, "num_input_tokens_seen": 190395185, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.28198242, "step": 8852, "time_per_iteration": 2.6741743087768555 }, { "auxiliary_loss_clip": 0.01256105, "auxiliary_loss_mlp": 0.00263991, "balance_loss_clip": 1.13541722, "balance_loss_mlp": 0.24844609, "epoch": 0.5322711558695326, "flos": 64631164435200.0, "grad_norm": 0.7729125415477177, "language_loss": 0.62148398, "learning_rate": 1.8880882494160357e-06, "loss": 0.63668489, "num_input_tokens_seen": 190452595, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 0.15527344, "step": 8853, "time_per_iteration": 3.1103200912475586 }, { "auxiliary_loss_clip": 0.01301216, "auxiliary_loss_mlp": 0.0022769, "balance_loss_clip": 1.07999444, "balance_loss_mlp": 0.199223, "epoch": 0.5323312791222005, "flos": 14939342421120.0, "grad_norm": 8.832547720753915, "language_loss": 0.88873434, "learning_rate": 1.8876994000508278e-06, "loss": 0.90402341, "num_input_tokens_seen": 190469140, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.28466797, "step": 8854, "time_per_iteration": 2.638186454772949 }, { "auxiliary_loss_clip": 0.01300325, "auxiliary_loss_mlp": 0.00194379, "balance_loss_clip": 1.08509636, "balance_loss_mlp": 0.16893977, "epoch": 0.5323914023748685, "flos": 23440834229760.0, "grad_norm": 1229.6016526915168, "language_loss": 0.82048434, "learning_rate": 1.8873105549440698e-06, "loss": 0.8354314, "num_input_tokens_seen": 190489015, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.2545166, "step": 8855, "time_per_iteration": 2.736544370651245 }, { "auxiliary_loss_clip": 0.0129957, "auxiliary_loss_mlp": 0.00231281, "balance_loss_clip": 1.0837909, "balance_loss_mlp": 0.20658064, "epoch": 0.5324515256275365, "flos": 26286180134400.0, "grad_norm": 3.9389162121399353, "language_loss": 0.71859181, "learning_rate": 1.886921714110507e-06, "loss": 0.73390031, "num_input_tokens_seen": 190508065, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24694824, "step": 8856, "time_per_iteration": 2.638658046722412 }, { "auxiliary_loss_clip": 0.01282524, "auxiliary_loss_mlp": 0.00232595, "balance_loss_clip": 1.06458652, "balance_loss_mlp": 0.20514119, "epoch": 0.5325116488802044, "flos": 26870913636480.0, "grad_norm": 29.41792904167496, "language_loss": 0.84985232, "learning_rate": 1.8865328775648842e-06, "loss": 0.86500359, "num_input_tokens_seen": 190527045, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.27478027, "step": 8857, "time_per_iteration": 2.688770055770874 }, { "auxiliary_loss_clip": 0.01280748, "auxiliary_loss_mlp": 0.00203013, "balance_loss_clip": 1.06728601, "balance_loss_mlp": 0.17741901, "epoch": 0.5325717721328724, "flos": 25884734757120.0, "grad_norm": 3.6609156397968152, "language_loss": 0.76774359, "learning_rate": 1.8861440453219456e-06, "loss": 0.78258115, "num_input_tokens_seen": 190544075, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.2557373, "step": 8858, "time_per_iteration": 2.6629624366760254 }, { "auxiliary_loss_clip": 0.01288437, "auxiliary_loss_mlp": 0.00243677, "balance_loss_clip": 1.07244444, "balance_loss_mlp": 0.21568625, "epoch": 0.5326318953855403, "flos": 21799321666560.0, "grad_norm": 41.6166877389548, "language_loss": 0.75965673, "learning_rate": 1.8857552173964367e-06, "loss": 0.7749778, "num_input_tokens_seen": 190566030, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.2800293, "step": 8859, "time_per_iteration": 2.741753339767456 }, { "auxiliary_loss_clip": 0.01287099, "auxiliary_loss_mlp": 0.00217038, "balance_loss_clip": 1.07691705, "balance_loss_mlp": 0.19296984, "epoch": 0.5326920186382084, "flos": 20922921728640.0, "grad_norm": 11.947770417350432, "language_loss": 0.74576199, "learning_rate": 1.8853663938031013e-06, "loss": 0.76080346, "num_input_tokens_seen": 190585605, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.24060059, "step": 8860, "time_per_iteration": 2.6310105323791504 }, { "auxiliary_loss_clip": 0.01255771, "auxiliary_loss_mlp": 0.00225497, "balance_loss_clip": 1.04780495, "balance_loss_mlp": 0.20016477, "epoch": 0.5327521418908763, "flos": 21433427775360.0, "grad_norm": 6.653158920884805, "language_loss": 0.84974873, "learning_rate": 1.884977574556683e-06, "loss": 0.86456144, "num_input_tokens_seen": 190604625, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.25341797, "step": 8861, "time_per_iteration": 2.797293186187744 }, { "auxiliary_loss_clip": 0.01277945, "auxiliary_loss_mlp": 0.00212474, "balance_loss_clip": 1.06450868, "balance_loss_mlp": 0.1877141, "epoch": 0.5328122651435443, "flos": 21760250647680.0, "grad_norm": 48.17642276655718, "language_loss": 0.93506718, "learning_rate": 1.8845887596719279e-06, "loss": 0.94997132, "num_input_tokens_seen": 190625060, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.2479248, "step": 8862, "time_per_iteration": 2.66556978225708 }, { "auxiliary_loss_clip": 0.01267526, "auxiliary_loss_mlp": 0.0024941, "balance_loss_clip": 1.04937983, "balance_loss_mlp": 0.22069217, "epoch": 0.5328723883962122, "flos": 18296487262080.0, "grad_norm": 3.8529983561861556, "language_loss": 0.74106026, "learning_rate": 1.8841999491635778e-06, "loss": 0.75622964, "num_input_tokens_seen": 190643150, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.28710938, "step": 8863, "time_per_iteration": 2.602647542953491 }, { "auxiliary_loss_clip": 0.01268808, "auxiliary_loss_mlp": 0.00195996, "balance_loss_clip": 1.06204152, "balance_loss_mlp": 0.1736915, "epoch": 0.5329325116488802, "flos": 25374911068800.0, "grad_norm": 2.4531460637536093, "language_loss": 0.81563449, "learning_rate": 1.883811143046377e-06, "loss": 0.83028245, "num_input_tokens_seen": 190662725, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.22302246, "step": 8864, "time_per_iteration": 2.6671297550201416 }, { "auxiliary_loss_clip": 0.01251954, "auxiliary_loss_mlp": 0.00223238, "balance_loss_clip": 1.04547524, "balance_loss_mlp": 0.19739342, "epoch": 0.5329926349015481, "flos": 25592098654080.0, "grad_norm": 5.620023521291987, "language_loss": 0.72259665, "learning_rate": 1.8834223413350702e-06, "loss": 0.7373485, "num_input_tokens_seen": 190683680, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.25805664, "step": 8865, "time_per_iteration": 2.6593377590179443 }, { "auxiliary_loss_clip": 0.01266497, "auxiliary_loss_mlp": 0.00195008, "balance_loss_clip": 1.05623996, "balance_loss_mlp": 0.17099962, "epoch": 0.5330527581542162, "flos": 22889605138560.0, "grad_norm": 6.664707170839729, "language_loss": 0.86979735, "learning_rate": 1.8830335440443989e-06, "loss": 0.88441241, "num_input_tokens_seen": 190703350, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.23986816, "step": 8866, "time_per_iteration": 2.6713709831237793 }, { "auxiliary_loss_clip": 0.01271762, "auxiliary_loss_mlp": 0.0022683, "balance_loss_clip": 1.05704784, "balance_loss_mlp": 0.1990664, "epoch": 0.5331128814068841, "flos": 16026752805120.0, "grad_norm": 719.5317030516484, "language_loss": 0.82582045, "learning_rate": 1.882644751189108e-06, "loss": 0.84080631, "num_input_tokens_seen": 190721170, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.27758789, "step": 8867, "time_per_iteration": 2.622035026550293 }, { "auxiliary_loss_clip": 0.01257287, "auxiliary_loss_mlp": 0.00211565, "balance_loss_clip": 1.04778695, "balance_loss_mlp": 0.18588698, "epoch": 0.5331730046595521, "flos": 39344699629440.0, "grad_norm": 13.518775138942594, "language_loss": 0.79014653, "learning_rate": 1.88225596278394e-06, "loss": 0.80483508, "num_input_tokens_seen": 190743795, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.25683594, "step": 8868, "time_per_iteration": 2.8673477172851562 }, { "auxiliary_loss_clip": 0.01245028, "auxiliary_loss_mlp": 0.00215958, "balance_loss_clip": 1.04058933, "balance_loss_mlp": 0.19086421, "epoch": 0.5332331279122201, "flos": 24024382583040.0, "grad_norm": 1858.1293599996404, "language_loss": 0.85191309, "learning_rate": 1.881867178843637e-06, "loss": 0.86652291, "num_input_tokens_seen": 190761560, "router_z_loss_clip": 2.04394531, "router_z_loss_mlp": 0.25061035, "step": 8869, "time_per_iteration": 2.6527585983276367 }, { "auxiliary_loss_clip": 0.01257082, "auxiliary_loss_mlp": 0.00207704, "balance_loss_clip": 1.05016744, "balance_loss_mlp": 0.18245585, "epoch": 0.533293251164888, "flos": 17129318728320.0, "grad_norm": 4.47995647040371, "language_loss": 0.85181195, "learning_rate": 1.8814783993829434e-06, "loss": 0.86645985, "num_input_tokens_seen": 190778875, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.25256348, "step": 8870, "time_per_iteration": 2.6158089637756348 }, { "auxiliary_loss_clip": 0.01258937, "auxiliary_loss_mlp": 0.00235417, "balance_loss_clip": 1.04905641, "balance_loss_mlp": 0.20728317, "epoch": 0.533353374417556, "flos": 22126360020480.0, "grad_norm": 5.8664594407530615, "language_loss": 0.82018769, "learning_rate": 1.8810896244165997e-06, "loss": 0.83513123, "num_input_tokens_seen": 190799830, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.28125, "step": 8871, "time_per_iteration": 2.6351559162139893 }, { "auxiliary_loss_clip": 0.01242485, "auxiliary_loss_mlp": 0.00197175, "balance_loss_clip": 1.03955758, "balance_loss_mlp": 0.17381039, "epoch": 0.533413497670224, "flos": 15011091838080.0, "grad_norm": 98.37284758353567, "language_loss": 0.79346818, "learning_rate": 1.8807008539593498e-06, "loss": 0.80786479, "num_input_tokens_seen": 190817155, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.23364258, "step": 8872, "time_per_iteration": 2.6107332706451416 }, { "auxiliary_loss_clip": 0.01253214, "auxiliary_loss_mlp": 0.00238158, "balance_loss_clip": 1.04712987, "balance_loss_mlp": 0.21212304, "epoch": 0.533473620922892, "flos": 19609955890560.0, "grad_norm": 13.950931229906196, "language_loss": 0.72189522, "learning_rate": 1.880312088025936e-06, "loss": 0.7368089, "num_input_tokens_seen": 190835240, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.26037598, "step": 8873, "time_per_iteration": 2.606431007385254 }, { "auxiliary_loss_clip": 0.01239214, "auxiliary_loss_mlp": 0.00211811, "balance_loss_clip": 1.04063225, "balance_loss_mlp": 0.18774214, "epoch": 0.5335337441755599, "flos": 14282644020480.0, "grad_norm": 33.404682582222485, "language_loss": 0.89046156, "learning_rate": 1.879923326631099e-06, "loss": 0.90497178, "num_input_tokens_seen": 190851620, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.24060059, "step": 8874, "time_per_iteration": 2.606678009033203 }, { "auxiliary_loss_clip": 0.01240042, "auxiliary_loss_mlp": 0.00206832, "balance_loss_clip": 1.03489637, "balance_loss_mlp": 0.18142813, "epoch": 0.5335938674282279, "flos": 20814830726400.0, "grad_norm": 40.10226619420842, "language_loss": 0.7814163, "learning_rate": 1.879534569789582e-06, "loss": 0.79588503, "num_input_tokens_seen": 190870545, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.25390625, "step": 8875, "time_per_iteration": 2.6564202308654785 }, { "auxiliary_loss_clip": 0.01253457, "auxiliary_loss_mlp": 0.00093748, "balance_loss_clip": 1.13382363, "balance_loss_mlp": 0.0854513, "epoch": 0.5336539906808958, "flos": 71396448451200.0, "grad_norm": 0.7074284326733348, "language_loss": 0.58820498, "learning_rate": 1.879145817516126e-06, "loss": 0.601677, "num_input_tokens_seen": 190931995, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 0.08300781, "step": 8876, "time_per_iteration": 3.2505505084991455 }, { "auxiliary_loss_clip": 0.01230365, "auxiliary_loss_mlp": 0.00197605, "balance_loss_clip": 1.02616906, "balance_loss_mlp": 0.17446598, "epoch": 0.5337141139335638, "flos": 20152996680960.0, "grad_norm": 5.6233165282297755, "language_loss": 0.83433867, "learning_rate": 1.8787570698254727e-06, "loss": 0.84861839, "num_input_tokens_seen": 190949890, "router_z_loss_clip": 2.04101562, "router_z_loss_mlp": 0.23144531, "step": 8877, "time_per_iteration": 2.6191632747650146 }, { "auxiliary_loss_clip": 0.01252927, "auxiliary_loss_mlp": 0.00258554, "balance_loss_clip": 1.13111925, "balance_loss_mlp": 0.24720517, "epoch": 0.5337742371862317, "flos": 67728387484800.0, "grad_norm": 0.763243412565211, "language_loss": 0.56707144, "learning_rate": 1.8783683267323629e-06, "loss": 0.58218622, "num_input_tokens_seen": 191008480, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 0.11328125, "step": 8878, "time_per_iteration": 3.02239990234375 }, { "auxiliary_loss_clip": 0.01250563, "auxiliary_loss_mlp": 0.00202621, "balance_loss_clip": 1.04265177, "balance_loss_mlp": 0.1766576, "epoch": 0.5338343604388998, "flos": 25008909436800.0, "grad_norm": 5.1547410402627065, "language_loss": 0.81597376, "learning_rate": 1.8779795882515395e-06, "loss": 0.83050561, "num_input_tokens_seen": 191028995, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.25952148, "step": 8879, "time_per_iteration": 2.710376024246216 }, { "auxiliary_loss_clip": 0.01224254, "auxiliary_loss_mlp": 0.00218781, "balance_loss_clip": 1.02053916, "balance_loss_mlp": 0.19284084, "epoch": 0.5338944836915677, "flos": 17601256546560.0, "grad_norm": 26.567737356174575, "language_loss": 0.93517911, "learning_rate": 1.8775908543977416e-06, "loss": 0.9496094, "num_input_tokens_seen": 191045285, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.25964355, "step": 8880, "time_per_iteration": 2.607015609741211 }, { "auxiliary_loss_clip": 0.01221399, "auxiliary_loss_mlp": 0.00203307, "balance_loss_clip": 1.02357352, "balance_loss_mlp": 0.17877354, "epoch": 0.5339546069442357, "flos": 21724124544000.0, "grad_norm": 24.149540695525886, "language_loss": 0.86234915, "learning_rate": 1.8772021251857107e-06, "loss": 0.87659621, "num_input_tokens_seen": 191066105, "router_z_loss_clip": 1.98046875, "router_z_loss_mlp": 0.24511719, "step": 8881, "time_per_iteration": 2.701364278793335 }, { "auxiliary_loss_clip": 0.01239862, "auxiliary_loss_mlp": 0.0014224, "balance_loss_clip": 1.11768198, "balance_loss_mlp": 0.13246524, "epoch": 0.5340147301969036, "flos": 69723583315200.0, "grad_norm": 0.7757066038150747, "language_loss": 0.59247291, "learning_rate": 1.8768134006301882e-06, "loss": 0.60629392, "num_input_tokens_seen": 191126315, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 0.09765625, "step": 8882, "time_per_iteration": 3.074885845184326 }, { "auxiliary_loss_clip": 0.01238744, "auxiliary_loss_mlp": 0.00125012, "balance_loss_clip": 1.1196034, "balance_loss_mlp": 0.11471229, "epoch": 0.5340748534495716, "flos": 63880701580800.0, "grad_norm": 0.8577185168346692, "language_loss": 0.63671088, "learning_rate": 1.876424680745913e-06, "loss": 0.65034842, "num_input_tokens_seen": 191174240, "router_z_loss_clip": 1.1953125, "router_z_loss_mlp": 0.10302734, "step": 8883, "time_per_iteration": 2.9871091842651367 }, { "auxiliary_loss_clip": 0.01220294, "auxiliary_loss_mlp": 0.00210553, "balance_loss_clip": 1.01695299, "balance_loss_mlp": 0.18536375, "epoch": 0.5341349767022396, "flos": 28694313694080.0, "grad_norm": 5.075538719651704, "language_loss": 0.9302088, "learning_rate": 1.8760359655476272e-06, "loss": 0.94451725, "num_input_tokens_seen": 191193335, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.25170898, "step": 8884, "time_per_iteration": 4.256259441375732 }, { "auxiliary_loss_clip": 0.01225618, "auxiliary_loss_mlp": 0.00231487, "balance_loss_clip": 1.02553511, "balance_loss_mlp": 0.20863496, "epoch": 0.5341950999549075, "flos": 16289691338880.0, "grad_norm": 6.7302531585989325, "language_loss": 0.78390789, "learning_rate": 1.8756472550500695e-06, "loss": 0.79847896, "num_input_tokens_seen": 191210900, "router_z_loss_clip": 1.99902344, "router_z_loss_mlp": 0.22875977, "step": 8885, "time_per_iteration": 4.0613853931427 }, { "auxiliary_loss_clip": 0.01214102, "auxiliary_loss_mlp": 0.00249637, "balance_loss_clip": 1.01325727, "balance_loss_mlp": 0.22257617, "epoch": 0.5342552232075756, "flos": 14355650413440.0, "grad_norm": 599.5630470781546, "language_loss": 0.88424766, "learning_rate": 1.87525854926798e-06, "loss": 0.89888513, "num_input_tokens_seen": 191226730, "router_z_loss_clip": 2.00585938, "router_z_loss_mlp": 0.27038574, "step": 8886, "time_per_iteration": 2.6026904582977295 }, { "auxiliary_loss_clip": 0.01220439, "auxiliary_loss_mlp": 0.00240784, "balance_loss_clip": 1.0200814, "balance_loss_mlp": 0.21478495, "epoch": 0.5343153464602435, "flos": 30297976300800.0, "grad_norm": 18.795702230806196, "language_loss": 0.81734073, "learning_rate": 1.8748698482160996e-06, "loss": 0.83195293, "num_input_tokens_seen": 191250435, "router_z_loss_clip": 2.00390625, "router_z_loss_mlp": 0.25976562, "step": 8887, "time_per_iteration": 4.127835988998413 }, { "auxiliary_loss_clip": 0.01222977, "auxiliary_loss_mlp": 0.00239355, "balance_loss_clip": 1.02327371, "balance_loss_mlp": 0.21283078, "epoch": 0.5343754697129115, "flos": 15596292216960.0, "grad_norm": 88.07753281698386, "language_loss": 0.81665587, "learning_rate": 1.8744811519091663e-06, "loss": 0.83127928, "num_input_tokens_seen": 191268315, "router_z_loss_clip": 1.99609375, "router_z_loss_mlp": 0.26538086, "step": 8888, "time_per_iteration": 2.66058349609375 }, { "auxiliary_loss_clip": 0.01229671, "auxiliary_loss_mlp": 0.00253793, "balance_loss_clip": 1.0219456, "balance_loss_mlp": 0.22582659, "epoch": 0.5344355929655794, "flos": 16909617191040.0, "grad_norm": 6.7860216335414965, "language_loss": 0.88776743, "learning_rate": 1.8740924603619208e-06, "loss": 0.90260208, "num_input_tokens_seen": 191287000, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.27978516, "step": 8889, "time_per_iteration": 2.6240501403808594 }, { "auxiliary_loss_clip": 0.01212328, "auxiliary_loss_mlp": 0.00240405, "balance_loss_clip": 1.0123632, "balance_loss_mlp": 0.21539497, "epoch": 0.5344957162182474, "flos": 16798186224000.0, "grad_norm": 57.348491882443774, "language_loss": 0.76784641, "learning_rate": 1.873703773589102e-06, "loss": 0.78237373, "num_input_tokens_seen": 191304565, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.25036621, "step": 8890, "time_per_iteration": 2.613736391067505 }, { "auxiliary_loss_clip": 0.01217368, "auxiliary_loss_mlp": 0.0026321, "balance_loss_clip": 1.01316738, "balance_loss_mlp": 0.23752028, "epoch": 0.5345558394709153, "flos": 12705590413440.0, "grad_norm": 9.202328631142448, "language_loss": 0.89075148, "learning_rate": 1.8733150916054483e-06, "loss": 0.90555722, "num_input_tokens_seen": 191318300, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.25671387, "step": 8891, "time_per_iteration": 4.136599540710449 }, { "auxiliary_loss_clip": 0.0118407, "auxiliary_loss_mlp": 0.00230223, "balance_loss_clip": 0.99147081, "balance_loss_mlp": 0.20566589, "epoch": 0.5346159627235834, "flos": 22455050400000.0, "grad_norm": 39.65254688451946, "language_loss": 0.81689513, "learning_rate": 1.872926414425699e-06, "loss": 0.831038, "num_input_tokens_seen": 191337925, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.24572754, "step": 8892, "time_per_iteration": 2.6474435329437256 }, { "auxiliary_loss_clip": 0.01198094, "auxiliary_loss_mlp": 0.0021384, "balance_loss_clip": 0.9999972, "balance_loss_mlp": 0.18856725, "epoch": 0.5346760859762513, "flos": 22415763899520.0, "grad_norm": 825.8254394766508, "language_loss": 0.94102156, "learning_rate": 1.8725377420645932e-06, "loss": 0.95514089, "num_input_tokens_seen": 191357120, "router_z_loss_clip": 1.98144531, "router_z_loss_mlp": 0.25292969, "step": 8893, "time_per_iteration": 2.6200153827667236 }, { "auxiliary_loss_clip": 0.01193964, "auxiliary_loss_mlp": 0.00215571, "balance_loss_clip": 0.99707752, "balance_loss_mlp": 0.19157444, "epoch": 0.5347362092289193, "flos": 22816131868800.0, "grad_norm": 7.578053181565926, "language_loss": 0.81227577, "learning_rate": 1.872149074536869e-06, "loss": 0.82637107, "num_input_tokens_seen": 191375395, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.2401123, "step": 8894, "time_per_iteration": 2.6408772468566895 }, { "auxiliary_loss_clip": 0.01186166, "auxiliary_loss_mlp": 0.0026147, "balance_loss_clip": 0.99413121, "balance_loss_mlp": 0.23780695, "epoch": 0.5347963324815872, "flos": 23219480666880.0, "grad_norm": 6.812642533570317, "language_loss": 0.82060778, "learning_rate": 1.8717604118572648e-06, "loss": 0.8350842, "num_input_tokens_seen": 191395595, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.23681641, "step": 8895, "time_per_iteration": 2.6268327236175537 }, { "auxiliary_loss_clip": 0.01191724, "auxiliary_loss_mlp": 0.00236078, "balance_loss_clip": 0.99539161, "balance_loss_mlp": 0.21245024, "epoch": 0.5348564557342552, "flos": 22601350494720.0, "grad_norm": 9.024884742447265, "language_loss": 0.8315171, "learning_rate": 1.8713717540405178e-06, "loss": 0.84579515, "num_input_tokens_seen": 191413730, "router_z_loss_clip": 1.96289062, "router_z_loss_mlp": 0.23608398, "step": 8896, "time_per_iteration": 2.6638476848602295 }, { "auxiliary_loss_clip": 0.01192183, "auxiliary_loss_mlp": 0.00262191, "balance_loss_clip": 0.99468231, "balance_loss_mlp": 0.23673971, "epoch": 0.5349165789869232, "flos": 18002378701440.0, "grad_norm": 37.20040389691521, "language_loss": 0.86120582, "learning_rate": 1.8709831011013676e-06, "loss": 0.87574953, "num_input_tokens_seen": 191432400, "router_z_loss_clip": 1.97460938, "router_z_loss_mlp": 0.25463867, "step": 8897, "time_per_iteration": 2.607800006866455 }, { "auxiliary_loss_clip": 0.01193568, "auxiliary_loss_mlp": 0.00221776, "balance_loss_clip": 0.99636316, "balance_loss_mlp": 0.19770774, "epoch": 0.5349767022395912, "flos": 17159770483200.0, "grad_norm": 273.68908097704957, "language_loss": 0.85429442, "learning_rate": 1.8705944530545509e-06, "loss": 0.86844778, "num_input_tokens_seen": 191448855, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.2409668, "step": 8898, "time_per_iteration": 2.5986950397491455 }, { "auxiliary_loss_clip": 0.01211113, "auxiliary_loss_mlp": 0.0032346, "balance_loss_clip": 1.0871532, "balance_loss_mlp": 0.31187332, "epoch": 0.5350368254922592, "flos": 70992058158720.0, "grad_norm": 0.9128233983857472, "language_loss": 0.57680213, "learning_rate": 1.8702058099148052e-06, "loss": 0.59214789, "num_input_tokens_seen": 191519690, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 0.11572266, "step": 8899, "time_per_iteration": 3.3019955158233643 }, { "auxiliary_loss_clip": 0.01185984, "auxiliary_loss_mlp": 0.00251196, "balance_loss_clip": 0.98920846, "balance_loss_mlp": 0.22794999, "epoch": 0.5350969487449271, "flos": 27417833095680.0, "grad_norm": 137.30723193951238, "language_loss": 0.76555437, "learning_rate": 1.869817171696868e-06, "loss": 0.77992618, "num_input_tokens_seen": 191539380, "router_z_loss_clip": 1.96972656, "router_z_loss_mlp": 0.23266602, "step": 8900, "time_per_iteration": 2.681436538696289 }, { "auxiliary_loss_clip": 0.01196969, "auxiliary_loss_mlp": 0.00237427, "balance_loss_clip": 0.99639654, "balance_loss_mlp": 0.21412188, "epoch": 0.5351570719975951, "flos": 19316134638720.0, "grad_norm": 3.495578175722658, "language_loss": 0.78697693, "learning_rate": 1.8694285384154777e-06, "loss": 0.80132091, "num_input_tokens_seen": 191557400, "router_z_loss_clip": 2.0078125, "router_z_loss_mlp": 0.23291016, "step": 8901, "time_per_iteration": 2.640850305557251 }, { "auxiliary_loss_clip": 0.0118046, "auxiliary_loss_mlp": 0.00245539, "balance_loss_clip": 0.98650849, "balance_loss_mlp": 0.22049281, "epoch": 0.535217195250263, "flos": 19828580019840.0, "grad_norm": 15.13914920355804, "language_loss": 0.86219859, "learning_rate": 1.8690399100853699e-06, "loss": 0.87645859, "num_input_tokens_seen": 191575860, "router_z_loss_clip": 1.94238281, "router_z_loss_mlp": 0.25048828, "step": 8902, "time_per_iteration": 2.667057514190674 }, { "auxiliary_loss_clip": 0.01174292, "auxiliary_loss_mlp": 0.00224045, "balance_loss_clip": 0.98318851, "balance_loss_mlp": 0.20138311, "epoch": 0.535277318502931, "flos": 22127868391680.0, "grad_norm": 11.598828619997196, "language_loss": 0.77425408, "learning_rate": 1.868651286721281e-06, "loss": 0.78823745, "num_input_tokens_seen": 191595775, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.2265625, "step": 8903, "time_per_iteration": 2.7000534534454346 }, { "auxiliary_loss_clip": 0.01199881, "auxiliary_loss_mlp": 0.00242123, "balance_loss_clip": 0.99947679, "balance_loss_mlp": 0.21871036, "epoch": 0.5353374417555989, "flos": 25045897466880.0, "grad_norm": 408.8967133801997, "language_loss": 0.80880791, "learning_rate": 1.86826266833795e-06, "loss": 0.82322794, "num_input_tokens_seen": 191617785, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.234375, "step": 8904, "time_per_iteration": 2.703160524368286 }, { "auxiliary_loss_clip": 0.01191273, "auxiliary_loss_mlp": 0.002274, "balance_loss_clip": 0.99483049, "balance_loss_mlp": 0.20461896, "epoch": 0.535397565008267, "flos": 19388710068480.0, "grad_norm": 275.9712018598858, "language_loss": 0.81878948, "learning_rate": 1.8678740549501103e-06, "loss": 0.83297616, "num_input_tokens_seen": 191636900, "router_z_loss_clip": 1.96386719, "router_z_loss_mlp": 0.2277832, "step": 8905, "time_per_iteration": 2.6156468391418457 }, { "auxiliary_loss_clip": 0.01180352, "auxiliary_loss_mlp": 0.00221643, "balance_loss_clip": 0.99007833, "balance_loss_mlp": 0.19930293, "epoch": 0.5354576882609349, "flos": 21471205904640.0, "grad_norm": 21.299333172410023, "language_loss": 0.90211987, "learning_rate": 1.8674854465725005e-06, "loss": 0.91613984, "num_input_tokens_seen": 191656720, "router_z_loss_clip": 1.90527344, "router_z_loss_mlp": 0.22338867, "step": 8906, "time_per_iteration": 2.63397216796875 }, { "auxiliary_loss_clip": 0.01195373, "auxiliary_loss_mlp": 0.0022534, "balance_loss_clip": 0.99510968, "balance_loss_mlp": 0.20078273, "epoch": 0.5355178115136029, "flos": 20777519473920.0, "grad_norm": 10.872820808968797, "language_loss": 0.8258599, "learning_rate": 1.8670968432198563e-06, "loss": 0.84006703, "num_input_tokens_seen": 191674445, "router_z_loss_clip": 2.00097656, "router_z_loss_mlp": 0.24511719, "step": 8907, "time_per_iteration": 2.7108986377716064 }, { "auxiliary_loss_clip": 0.01171219, "auxiliary_loss_mlp": 0.00221682, "balance_loss_clip": 0.97822964, "balance_loss_mlp": 0.19671918, "epoch": 0.5355779347662708, "flos": 23514020190720.0, "grad_norm": 7.473800042738709, "language_loss": 0.85166693, "learning_rate": 1.866708244906912e-06, "loss": 0.86559594, "num_input_tokens_seen": 191695000, "router_z_loss_clip": 1.92871094, "router_z_loss_mlp": 0.24987793, "step": 8908, "time_per_iteration": 2.658963441848755 }, { "auxiliary_loss_clip": 0.01195852, "auxiliary_loss_mlp": 0.00220461, "balance_loss_clip": 0.99249029, "balance_loss_mlp": 0.19645238, "epoch": 0.5356380580189388, "flos": 20303211358080.0, "grad_norm": 374.51788972854257, "language_loss": 0.83756006, "learning_rate": 1.8663196516484055e-06, "loss": 0.85172319, "num_input_tokens_seen": 191713295, "router_z_loss_clip": 2.03515625, "router_z_loss_mlp": 0.24035645, "step": 8909, "time_per_iteration": 2.60960054397583 }, { "auxiliary_loss_clip": 0.01187868, "auxiliary_loss_mlp": 0.00203384, "balance_loss_clip": 0.99236953, "balance_loss_mlp": 0.1801382, "epoch": 0.5356981812716068, "flos": 21361642444800.0, "grad_norm": 32.11247089544186, "language_loss": 0.91226268, "learning_rate": 1.8659310634590702e-06, "loss": 0.92617512, "num_input_tokens_seen": 191732725, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.23242188, "step": 8910, "time_per_iteration": 2.6668524742126465 }, { "auxiliary_loss_clip": 0.01183116, "auxiliary_loss_mlp": 0.00221741, "balance_loss_clip": 0.98795873, "balance_loss_mlp": 0.19739801, "epoch": 0.5357583045242748, "flos": 23111246010240.0, "grad_norm": 14.646340422444975, "language_loss": 0.8842656, "learning_rate": 1.8655424803536427e-06, "loss": 0.89831412, "num_input_tokens_seen": 191753765, "router_z_loss_clip": 1.95214844, "router_z_loss_mlp": 0.24365234, "step": 8911, "time_per_iteration": 2.680492401123047 }, { "auxiliary_loss_clip": 0.01178749, "auxiliary_loss_mlp": 0.00199486, "balance_loss_clip": 0.98503828, "balance_loss_mlp": 0.17806403, "epoch": 0.5358184277769428, "flos": 21141761339520.0, "grad_norm": 3.8425172820041356, "language_loss": 0.75189209, "learning_rate": 1.8651539023468585e-06, "loss": 0.76567441, "num_input_tokens_seen": 191773560, "router_z_loss_clip": 1.93652344, "router_z_loss_mlp": 0.2142334, "step": 8912, "time_per_iteration": 2.6505861282348633 }, { "auxiliary_loss_clip": 0.0120212, "auxiliary_loss_mlp": 0.00253879, "balance_loss_clip": 1.0049597, "balance_loss_mlp": 0.22779605, "epoch": 0.5358785510296107, "flos": 16282400878080.0, "grad_norm": 12.266506025594108, "language_loss": 0.80418479, "learning_rate": 1.8647653294534509e-06, "loss": 0.81874478, "num_input_tokens_seen": 191791255, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.26049805, "step": 8913, "time_per_iteration": 2.6107358932495117 }, { "auxiliary_loss_clip": 0.01201352, "auxiliary_loss_mlp": 0.00227495, "balance_loss_clip": 0.99704349, "balance_loss_mlp": 0.20228189, "epoch": 0.5359386742822787, "flos": 16976877408000.0, "grad_norm": 2760.2208625135813, "language_loss": 0.79832685, "learning_rate": 1.864376761688156e-06, "loss": 0.81261533, "num_input_tokens_seen": 191809325, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.25219727, "step": 8914, "time_per_iteration": 2.653411388397217 }, { "auxiliary_loss_clip": 0.01195055, "auxiliary_loss_mlp": 0.00247355, "balance_loss_clip": 0.99407655, "balance_loss_mlp": 0.22165391, "epoch": 0.5359987975349466, "flos": 20812927305600.0, "grad_norm": 3.744776655752441, "language_loss": 0.78784996, "learning_rate": 1.8639881990657079e-06, "loss": 0.80227405, "num_input_tokens_seen": 191829795, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.25695801, "step": 8915, "time_per_iteration": 2.6386425495147705 }, { "auxiliary_loss_clip": 0.01198956, "auxiliary_loss_mlp": 0.00221824, "balance_loss_clip": 0.99892378, "balance_loss_mlp": 0.1978271, "epoch": 0.5360589207876146, "flos": 22199941031040.0, "grad_norm": 10.682722117966357, "language_loss": 0.85290104, "learning_rate": 1.8635996416008408e-06, "loss": 0.86710888, "num_input_tokens_seen": 191850840, "router_z_loss_clip": 1.99902344, "router_z_loss_mlp": 0.23986816, "step": 8916, "time_per_iteration": 2.690218687057495 }, { "auxiliary_loss_clip": 0.01191271, "auxiliary_loss_mlp": 0.0022084, "balance_loss_clip": 0.99612844, "balance_loss_mlp": 0.19641447, "epoch": 0.5361190440402825, "flos": 31394365084800.0, "grad_norm": 9.937623516628049, "language_loss": 0.79203188, "learning_rate": 1.863211089308289e-06, "loss": 0.806153, "num_input_tokens_seen": 191869520, "router_z_loss_clip": 1.95214844, "router_z_loss_mlp": 0.24438477, "step": 8917, "time_per_iteration": 2.718505382537842 }, { "auxiliary_loss_clip": 0.01224372, "auxiliary_loss_mlp": 0.00231795, "balance_loss_clip": 1.01782691, "balance_loss_mlp": 0.20543739, "epoch": 0.5361791672929506, "flos": 16069882060800.0, "grad_norm": 62.12950401603561, "language_loss": 0.82558256, "learning_rate": 1.8628225422027865e-06, "loss": 0.84014428, "num_input_tokens_seen": 191887240, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.2635498, "step": 8918, "time_per_iteration": 2.669370412826538 }, { "auxiliary_loss_clip": 0.01185931, "auxiliary_loss_mlp": 0.00227894, "balance_loss_clip": 0.98926622, "balance_loss_mlp": 0.20296766, "epoch": 0.5362392905456185, "flos": 20740926493440.0, "grad_norm": 2.2977508142702483, "language_loss": 0.8279835, "learning_rate": 1.862434000299067e-06, "loss": 0.84212172, "num_input_tokens_seen": 191905690, "router_z_loss_clip": 1.96777344, "router_z_loss_mlp": 0.24926758, "step": 8919, "time_per_iteration": 2.630331516265869 }, { "auxiliary_loss_clip": 0.01203582, "auxiliary_loss_mlp": 0.00219488, "balance_loss_clip": 1.00059664, "balance_loss_mlp": 0.19266562, "epoch": 0.5362994137982865, "flos": 17340077779200.0, "grad_norm": 183.00461399416508, "language_loss": 0.80713278, "learning_rate": 1.862045463611864e-06, "loss": 0.82136351, "num_input_tokens_seen": 191920725, "router_z_loss_clip": 2.03027344, "router_z_loss_mlp": 0.26831055, "step": 8920, "time_per_iteration": 2.6078219413757324 }, { "auxiliary_loss_clip": 0.01176673, "auxiliary_loss_mlp": 0.00223882, "balance_loss_clip": 0.9834879, "balance_loss_mlp": 0.19804926, "epoch": 0.5363595370509544, "flos": 42813957795840.0, "grad_norm": 7.187150334972805, "language_loss": 0.7441411, "learning_rate": 1.8616569321559105e-06, "loss": 0.75814664, "num_input_tokens_seen": 191944645, "router_z_loss_clip": 1.93261719, "router_z_loss_mlp": 0.25842285, "step": 8921, "time_per_iteration": 2.885011672973633 }, { "auxiliary_loss_clip": 0.01205663, "auxiliary_loss_mlp": 0.00263504, "balance_loss_clip": 1.00505352, "balance_loss_mlp": 0.23740953, "epoch": 0.5364196603036224, "flos": 19171953446400.0, "grad_norm": 115.53113359382232, "language_loss": 0.88319898, "learning_rate": 1.86126840594594e-06, "loss": 0.89789069, "num_input_tokens_seen": 191962265, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.26074219, "step": 8922, "time_per_iteration": 2.626796245574951 }, { "auxiliary_loss_clip": 0.01210915, "auxiliary_loss_mlp": 0.00231551, "balance_loss_clip": 1.00669956, "balance_loss_mlp": 0.20502727, "epoch": 0.5364797835562904, "flos": 17931060247680.0, "grad_norm": 4.839510568598882, "language_loss": 0.85520422, "learning_rate": 1.860879884996686e-06, "loss": 0.86962891, "num_input_tokens_seen": 191978850, "router_z_loss_clip": 2.04101562, "router_z_loss_mlp": 0.26550293, "step": 8923, "time_per_iteration": 2.5880393981933594 }, { "auxiliary_loss_clip": 0.01192279, "auxiliary_loss_mlp": 0.00220016, "balance_loss_clip": 0.99594784, "balance_loss_mlp": 0.19488618, "epoch": 0.5365399068089584, "flos": 30228058477440.0, "grad_norm": 16.72169007390184, "language_loss": 0.77336776, "learning_rate": 1.8604913693228804e-06, "loss": 0.78749067, "num_input_tokens_seen": 192002000, "router_z_loss_clip": 1.96386719, "router_z_loss_mlp": 0.2512207, "step": 8924, "time_per_iteration": 2.717101812362671 }, { "auxiliary_loss_clip": 0.01223945, "auxiliary_loss_mlp": 0.00222727, "balance_loss_clip": 1.01330924, "balance_loss_mlp": 0.19471329, "epoch": 0.5366000300616264, "flos": 24891696380160.0, "grad_norm": 17.849524471044784, "language_loss": 0.95844519, "learning_rate": 1.8601028589392558e-06, "loss": 0.97291189, "num_input_tokens_seen": 192019100, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.2800293, "step": 8925, "time_per_iteration": 2.8330416679382324 }, { "auxiliary_loss_clip": 0.01194215, "auxiliary_loss_mlp": 0.00217931, "balance_loss_clip": 0.99502182, "balance_loss_mlp": 0.19076297, "epoch": 0.5366601533142943, "flos": 29826649013760.0, "grad_norm": 130.2814890620954, "language_loss": 0.8538509, "learning_rate": 1.8597143538605455e-06, "loss": 0.86797237, "num_input_tokens_seen": 192041660, "router_z_loss_clip": 1.99316406, "router_z_loss_mlp": 0.27160645, "step": 8926, "time_per_iteration": 4.139146327972412 }, { "auxiliary_loss_clip": 0.01199698, "auxiliary_loss_mlp": 0.00205025, "balance_loss_clip": 1.00165927, "balance_loss_mlp": 0.17963353, "epoch": 0.5367202765669623, "flos": 27199352620800.0, "grad_norm": 135.80505769073244, "language_loss": 0.74492002, "learning_rate": 1.85932585410148e-06, "loss": 0.75896722, "num_input_tokens_seen": 192063540, "router_z_loss_clip": 1.98046875, "router_z_loss_mlp": 0.25415039, "step": 8927, "time_per_iteration": 4.115447044372559 }, { "auxiliary_loss_clip": 0.01224536, "auxiliary_loss_mlp": 0.00238096, "balance_loss_clip": 1.01870275, "balance_loss_mlp": 0.20900846, "epoch": 0.5367803998196302, "flos": 20229953569920.0, "grad_norm": 4.292643080023921, "language_loss": 0.84430486, "learning_rate": 1.8589373596767929e-06, "loss": 0.85893118, "num_input_tokens_seen": 192081760, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.29077148, "step": 8928, "time_per_iteration": 2.6568150520324707 }, { "auxiliary_loss_clip": 0.01175258, "auxiliary_loss_mlp": 0.00228437, "balance_loss_clip": 0.98387539, "balance_loss_mlp": 0.20100668, "epoch": 0.5368405230722982, "flos": 32154629374080.0, "grad_norm": 14.452749556604573, "language_loss": 0.71008158, "learning_rate": 1.8585488706012154e-06, "loss": 0.72411847, "num_input_tokens_seen": 192101620, "router_z_loss_clip": 1.91308594, "router_z_loss_mlp": 0.27429199, "step": 8929, "time_per_iteration": 4.1480748653411865 }, { "auxiliary_loss_clip": 0.01210117, "auxiliary_loss_mlp": 0.00217077, "balance_loss_clip": 1.01029491, "balance_loss_mlp": 0.19089907, "epoch": 0.5369006463249661, "flos": 26247935128320.0, "grad_norm": 5.381299114714289, "language_loss": 0.75191104, "learning_rate": 1.8581603868894781e-06, "loss": 0.76618296, "num_input_tokens_seen": 192121805, "router_z_loss_clip": 1.99804688, "router_z_loss_mlp": 0.26196289, "step": 8930, "time_per_iteration": 2.779059886932373 }, { "auxiliary_loss_clip": 0.01176792, "auxiliary_loss_mlp": 0.00249788, "balance_loss_clip": 0.98512155, "balance_loss_mlp": 0.22362138, "epoch": 0.5369607695776342, "flos": 26211306234240.0, "grad_norm": 41.16752111691508, "language_loss": 0.72854555, "learning_rate": 1.8577719085563136e-06, "loss": 0.74281132, "num_input_tokens_seen": 192141765, "router_z_loss_clip": 1.91699219, "router_z_loss_mlp": 0.26147461, "step": 8931, "time_per_iteration": 2.6939103603363037 }, { "auxiliary_loss_clip": 0.01196354, "auxiliary_loss_mlp": 0.00223676, "balance_loss_clip": 1.00077856, "balance_loss_mlp": 0.19653234, "epoch": 0.5370208928303021, "flos": 25009017177600.0, "grad_norm": 17.418433857568488, "language_loss": 0.82908958, "learning_rate": 1.8573834356164525e-06, "loss": 0.84328985, "num_input_tokens_seen": 192161560, "router_z_loss_clip": 1.95507812, "router_z_loss_mlp": 0.27124023, "step": 8932, "time_per_iteration": 2.655109167098999 }, { "auxiliary_loss_clip": 0.01182928, "auxiliary_loss_mlp": 0.00230375, "balance_loss_clip": 0.98873544, "balance_loss_mlp": 0.20399427, "epoch": 0.5370810160829701, "flos": 31792147274880.0, "grad_norm": 7.881113772765568, "language_loss": 0.72060931, "learning_rate": 1.8569949680846261e-06, "loss": 0.7347424, "num_input_tokens_seen": 192180190, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.26391602, "step": 8933, "time_per_iteration": 4.107812166213989 }, { "auxiliary_loss_clip": 0.01184555, "auxiliary_loss_mlp": 0.00224154, "balance_loss_clip": 0.98861057, "balance_loss_mlp": 0.19687892, "epoch": 0.537141139335638, "flos": 23842602829440.0, "grad_norm": 122.20216784959115, "language_loss": 0.90167695, "learning_rate": 1.856606505975565e-06, "loss": 0.91576409, "num_input_tokens_seen": 192198855, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.27258301, "step": 8934, "time_per_iteration": 2.7282941341400146 }, { "auxiliary_loss_clip": 0.0119993, "auxiliary_loss_mlp": 0.00229372, "balance_loss_clip": 1.00106311, "balance_loss_mlp": 0.20319328, "epoch": 0.537201262588306, "flos": 18508826511360.0, "grad_norm": 3.8902069161396917, "language_loss": 0.87145948, "learning_rate": 1.856218049303999e-06, "loss": 0.88575244, "num_input_tokens_seen": 192216555, "router_z_loss_clip": 1.98925781, "router_z_loss_mlp": 0.26184082, "step": 8935, "time_per_iteration": 2.621873378753662 }, { "auxiliary_loss_clip": 0.01193744, "auxiliary_loss_mlp": 0.00247602, "balance_loss_clip": 0.99159908, "balance_loss_mlp": 0.21825245, "epoch": 0.537261385840974, "flos": 25662950231040.0, "grad_norm": 14.320277872611955, "language_loss": 0.91148579, "learning_rate": 1.855829598084659e-06, "loss": 0.92589927, "num_input_tokens_seen": 192236910, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.29321289, "step": 8936, "time_per_iteration": 2.6810572147369385 }, { "auxiliary_loss_clip": 0.0121825, "auxiliary_loss_mlp": 0.0021435, "balance_loss_clip": 1.01408958, "balance_loss_mlp": 0.18829037, "epoch": 0.537321509093642, "flos": 40735017406080.0, "grad_norm": 2413.7596904559746, "language_loss": 0.7741487, "learning_rate": 1.8554411523322754e-06, "loss": 0.78847468, "num_input_tokens_seen": 192260790, "router_z_loss_clip": 2.04394531, "router_z_loss_mlp": 0.26037598, "step": 8937, "time_per_iteration": 2.829597234725952 }, { "auxiliary_loss_clip": 0.01205257, "auxiliary_loss_mlp": 0.00214279, "balance_loss_clip": 0.99956375, "balance_loss_mlp": 0.18681301, "epoch": 0.53738163234631, "flos": 17238487138560.0, "grad_norm": 89.34425630173071, "language_loss": 0.90141141, "learning_rate": 1.8550527120615778e-06, "loss": 0.91560674, "num_input_tokens_seen": 192277230, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.2746582, "step": 8938, "time_per_iteration": 2.6579015254974365 }, { "auxiliary_loss_clip": 0.0122382, "auxiliary_loss_mlp": 0.0022493, "balance_loss_clip": 1.01391089, "balance_loss_mlp": 0.19680873, "epoch": 0.5374417555989779, "flos": 12821977457280.0, "grad_norm": 25.58896612128774, "language_loss": 0.92104143, "learning_rate": 1.8546642772872957e-06, "loss": 0.93552887, "num_input_tokens_seen": 192292840, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.28137207, "step": 8939, "time_per_iteration": 2.6239569187164307 }, { "auxiliary_loss_clip": 0.01252387, "auxiliary_loss_mlp": 0.00047253, "balance_loss_clip": 1.09459543, "balance_loss_mlp": 0.04157844, "epoch": 0.5375018788516459, "flos": 67256018703360.0, "grad_norm": 0.6944223447241654, "language_loss": 0.52199817, "learning_rate": 1.8542758480241589e-06, "loss": 0.5349946, "num_input_tokens_seen": 192358240, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.05664062, "step": 8940, "time_per_iteration": 3.175300121307373 }, { "auxiliary_loss_clip": 0.01193505, "auxiliary_loss_mlp": 0.00225414, "balance_loss_clip": 0.99622184, "balance_loss_mlp": 0.20047534, "epoch": 0.5375620021043138, "flos": 18114168804480.0, "grad_norm": 5.084700371918976, "language_loss": 0.81436563, "learning_rate": 1.8538874242868965e-06, "loss": 0.82855475, "num_input_tokens_seen": 192377370, "router_z_loss_clip": 1.97363281, "router_z_loss_mlp": 0.24963379, "step": 8941, "time_per_iteration": 2.7099053859710693 }, { "auxiliary_loss_clip": 0.01189739, "auxiliary_loss_mlp": 0.00217621, "balance_loss_clip": 0.99325848, "balance_loss_mlp": 0.1943993, "epoch": 0.5376221253569818, "flos": 23149383275520.0, "grad_norm": 3.6599172911285827, "language_loss": 0.87076116, "learning_rate": 1.853499006090237e-06, "loss": 0.88483477, "num_input_tokens_seen": 192396450, "router_z_loss_clip": 1.96679688, "router_z_loss_mlp": 0.2322998, "step": 8942, "time_per_iteration": 2.6869091987609863 }, { "auxiliary_loss_clip": 0.0122698, "auxiliary_loss_mlp": 0.00272381, "balance_loss_clip": 1.02018845, "balance_loss_mlp": 0.24323446, "epoch": 0.5376822486096497, "flos": 29972302663680.0, "grad_norm": 4.014320295563004, "language_loss": 0.79672992, "learning_rate": 1.853110593448911e-06, "loss": 0.81172347, "num_input_tokens_seen": 192417390, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.29174805, "step": 8943, "time_per_iteration": 2.7247962951660156 }, { "auxiliary_loss_clip": 0.01240118, "auxiliary_loss_mlp": 0.00055888, "balance_loss_clip": 1.08870959, "balance_loss_mlp": 0.05002324, "epoch": 0.5377423718623178, "flos": 54168950874240.0, "grad_norm": 0.8300011927430635, "language_loss": 0.5949437, "learning_rate": 1.852722186377645e-06, "loss": 0.60790372, "num_input_tokens_seen": 192478060, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.05859375, "step": 8944, "time_per_iteration": 3.1343846321105957 }, { "auxiliary_loss_clip": 0.01213487, "auxiliary_loss_mlp": 0.00261368, "balance_loss_clip": 1.00241756, "balance_loss_mlp": 0.23296037, "epoch": 0.5378024951149857, "flos": 23257079228160.0, "grad_norm": 16.16021064130627, "language_loss": 0.8635335, "learning_rate": 1.852333784891169e-06, "loss": 0.87828207, "num_input_tokens_seen": 192495985, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.28430176, "step": 8945, "time_per_iteration": 2.6860923767089844 }, { "auxiliary_loss_clip": 0.01211232, "auxiliary_loss_mlp": 0.00230684, "balance_loss_clip": 1.00424218, "balance_loss_mlp": 0.20420802, "epoch": 0.5378626183676537, "flos": 24024095274240.0, "grad_norm": 15.609742518299052, "language_loss": 0.76200897, "learning_rate": 1.8519453890042112e-06, "loss": 0.7764281, "num_input_tokens_seen": 192515445, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.26489258, "step": 8946, "time_per_iteration": 2.657294988632202 }, { "auxiliary_loss_clip": 0.01216591, "auxiliary_loss_mlp": 0.00213203, "balance_loss_clip": 1.0124867, "balance_loss_mlp": 0.18758501, "epoch": 0.5379227416203216, "flos": 27161789973120.0, "grad_norm": 2.3521200134724194, "language_loss": 0.82958841, "learning_rate": 1.851556998731498e-06, "loss": 0.84388638, "num_input_tokens_seen": 192536530, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.25646973, "step": 8947, "time_per_iteration": 2.6773128509521484 }, { "auxiliary_loss_clip": 0.01188073, "auxiliary_loss_mlp": 0.00251079, "balance_loss_clip": 0.99292094, "balance_loss_mlp": 0.22454301, "epoch": 0.5379828648729896, "flos": 24681619687680.0, "grad_norm": 13.233527172978588, "language_loss": 0.65650874, "learning_rate": 1.8511686140877592e-06, "loss": 0.67090034, "num_input_tokens_seen": 192556075, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.26550293, "step": 8948, "time_per_iteration": 2.6582489013671875 }, { "auxiliary_loss_clip": 0.01245277, "auxiliary_loss_mlp": 0.00230717, "balance_loss_clip": 1.02749681, "balance_loss_mlp": 0.20409763, "epoch": 0.5380429881256577, "flos": 22523280284160.0, "grad_norm": 149.10677546755377, "language_loss": 0.8632406, "learning_rate": 1.8507802350877205e-06, "loss": 0.87800056, "num_input_tokens_seen": 192575535, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.26623535, "step": 8949, "time_per_iteration": 2.6427125930786133 }, { "auxiliary_loss_clip": 0.01206465, "auxiliary_loss_mlp": 0.00225836, "balance_loss_clip": 1.0039258, "balance_loss_mlp": 0.19989586, "epoch": 0.5381031113783256, "flos": 26979543342720.0, "grad_norm": 305.8847264578508, "language_loss": 0.84049946, "learning_rate": 1.850391861746111e-06, "loss": 0.85482246, "num_input_tokens_seen": 192594490, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.25964355, "step": 8950, "time_per_iteration": 2.6786553859710693 }, { "auxiliary_loss_clip": 0.01216969, "auxiliary_loss_mlp": 0.00214358, "balance_loss_clip": 1.01124823, "balance_loss_mlp": 0.19019458, "epoch": 0.5381632346309936, "flos": 24754087376640.0, "grad_norm": 32.08594321549141, "language_loss": 0.78595364, "learning_rate": 1.8500034940776573e-06, "loss": 0.80026686, "num_input_tokens_seen": 192615650, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.24157715, "step": 8951, "time_per_iteration": 2.6709060668945312 }, { "auxiliary_loss_clip": 0.01215343, "auxiliary_loss_mlp": 0.00240791, "balance_loss_clip": 1.0060699, "balance_loss_mlp": 0.2147679, "epoch": 0.5382233578836615, "flos": 15560058372480.0, "grad_norm": 6.468566041076626, "language_loss": 0.85067916, "learning_rate": 1.849615132097085e-06, "loss": 0.86524051, "num_input_tokens_seen": 192633840, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.26000977, "step": 8952, "time_per_iteration": 2.6522443294525146 }, { "auxiliary_loss_clip": 0.01216361, "auxiliary_loss_mlp": 0.00257221, "balance_loss_clip": 1.00816643, "balance_loss_mlp": 0.22864619, "epoch": 0.5382834811363295, "flos": 25084501608960.0, "grad_norm": 5.54996104663942, "language_loss": 0.84209502, "learning_rate": 1.8492267758191228e-06, "loss": 0.85683084, "num_input_tokens_seen": 192655890, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.28588867, "step": 8953, "time_per_iteration": 2.750161647796631 }, { "auxiliary_loss_clip": 0.01202315, "auxiliary_loss_mlp": 0.00217422, "balance_loss_clip": 1.00051212, "balance_loss_mlp": 0.19245917, "epoch": 0.5383436043889974, "flos": 13297901685120.0, "grad_norm": 31.29334018668495, "language_loss": 0.89061964, "learning_rate": 1.8488384252584964e-06, "loss": 0.90481699, "num_input_tokens_seen": 192673025, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.24975586, "step": 8954, "time_per_iteration": 2.727889060974121 }, { "auxiliary_loss_clip": 0.01200634, "auxiliary_loss_mlp": 0.00213808, "balance_loss_clip": 1.00034058, "balance_loss_mlp": 0.18894145, "epoch": 0.5384037276416654, "flos": 23039388852480.0, "grad_norm": 4.443381574678608, "language_loss": 0.83969897, "learning_rate": 1.8484500804299318e-06, "loss": 0.85384345, "num_input_tokens_seen": 192692190, "router_z_loss_clip": 2.00390625, "router_z_loss_mlp": 0.24902344, "step": 8955, "time_per_iteration": 2.7250399589538574 }, { "auxiliary_loss_clip": 0.01226074, "auxiliary_loss_mlp": 0.00233547, "balance_loss_clip": 1.01368213, "balance_loss_mlp": 0.20716557, "epoch": 0.5384638508943334, "flos": 20631147552000.0, "grad_norm": 8.01026972339475, "language_loss": 0.84121448, "learning_rate": 1.8480617413481557e-06, "loss": 0.8558107, "num_input_tokens_seen": 192710380, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.26391602, "step": 8956, "time_per_iteration": 2.6953909397125244 }, { "auxiliary_loss_clip": 0.01244699, "auxiliary_loss_mlp": 0.00131651, "balance_loss_clip": 1.09070408, "balance_loss_mlp": 0.12449831, "epoch": 0.5385239741470014, "flos": 66737683491840.0, "grad_norm": 1.1386778661633585, "language_loss": 0.62664497, "learning_rate": 1.8476734080278932e-06, "loss": 0.6404084, "num_input_tokens_seen": 192768995, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.07128906, "step": 8957, "time_per_iteration": 3.0596301555633545 }, { "auxiliary_loss_clip": 0.01246622, "auxiliary_loss_mlp": 0.00227916, "balance_loss_clip": 1.09404325, "balance_loss_mlp": 0.21990524, "epoch": 0.5385840973996693, "flos": 64716058229760.0, "grad_norm": 0.727813091780219, "language_loss": 0.50955099, "learning_rate": 1.8472850804838705e-06, "loss": 0.5242964, "num_input_tokens_seen": 192825585, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.08007812, "step": 8958, "time_per_iteration": 3.1520891189575195 }, { "auxiliary_loss_clip": 0.01250421, "auxiliary_loss_mlp": 0.00236649, "balance_loss_clip": 1.03584087, "balance_loss_mlp": 0.20927849, "epoch": 0.5386442206523373, "flos": 26141783460480.0, "grad_norm": 202.34952494165069, "language_loss": 0.84216475, "learning_rate": 1.8468967587308128e-06, "loss": 0.85703552, "num_input_tokens_seen": 192847335, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.27404785, "step": 8959, "time_per_iteration": 2.6918883323669434 }, { "auxiliary_loss_clip": 0.01215166, "auxiliary_loss_mlp": 0.00226159, "balance_loss_clip": 1.00503969, "balance_loss_mlp": 0.19899088, "epoch": 0.5387043439050052, "flos": 18251849635200.0, "grad_norm": 421.7498377919202, "language_loss": 0.94589627, "learning_rate": 1.8465084427834455e-06, "loss": 0.96030951, "num_input_tokens_seen": 192862205, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.27172852, "step": 8960, "time_per_iteration": 2.622714042663574 }, { "auxiliary_loss_clip": 0.01199934, "auxiliary_loss_mlp": 0.00278242, "balance_loss_clip": 0.99692518, "balance_loss_mlp": 0.25130099, "epoch": 0.5387644671576732, "flos": 29788296266880.0, "grad_norm": 86.28706065324721, "language_loss": 0.83693743, "learning_rate": 1.8461201326564933e-06, "loss": 0.85171926, "num_input_tokens_seen": 192883695, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.26965332, "step": 8961, "time_per_iteration": 2.6952714920043945 }, { "auxiliary_loss_clip": 0.01212221, "auxiliary_loss_mlp": 0.00220878, "balance_loss_clip": 1.00253975, "balance_loss_mlp": 0.19411595, "epoch": 0.5388245904103413, "flos": 22374466237440.0, "grad_norm": 47.49131696331509, "language_loss": 0.89356685, "learning_rate": 1.845731828364681e-06, "loss": 0.90789783, "num_input_tokens_seen": 192900190, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.26757812, "step": 8962, "time_per_iteration": 2.6492762565612793 }, { "auxiliary_loss_clip": 0.0125378, "auxiliary_loss_mlp": 0.00046421, "balance_loss_clip": 1.10203481, "balance_loss_mlp": 0.03993564, "epoch": 0.5388847136630092, "flos": 69807794751360.0, "grad_norm": 0.7316416282296551, "language_loss": 0.5380435, "learning_rate": 1.8453435299227333e-06, "loss": 0.55104548, "num_input_tokens_seen": 192958675, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.06494141, "step": 8963, "time_per_iteration": 3.0788145065307617 }, { "auxiliary_loss_clip": 0.01271954, "auxiliary_loss_mlp": 0.00046564, "balance_loss_clip": 1.1140548, "balance_loss_mlp": 0.04067529, "epoch": 0.5389448369156772, "flos": 69822303845760.0, "grad_norm": 0.8045951480971929, "language_loss": 0.62408161, "learning_rate": 1.8449552373453744e-06, "loss": 0.63726676, "num_input_tokens_seen": 193033135, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.05883789, "step": 8964, "time_per_iteration": 3.180586099624634 }, { "auxiliary_loss_clip": 0.0122452, "auxiliary_loss_mlp": 0.00228917, "balance_loss_clip": 1.00806546, "balance_loss_mlp": 0.20308387, "epoch": 0.5390049601683451, "flos": 31722444933120.0, "grad_norm": 11.721539513392262, "language_loss": 0.76617134, "learning_rate": 1.8445669506473287e-06, "loss": 0.78070569, "num_input_tokens_seen": 193055570, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.25866699, "step": 8965, "time_per_iteration": 2.7270731925964355 }, { "auxiliary_loss_clip": 0.01239574, "auxiliary_loss_mlp": 0.00235531, "balance_loss_clip": 1.02508283, "balance_loss_mlp": 0.20875695, "epoch": 0.5390650834210131, "flos": 18113486446080.0, "grad_norm": 31.3684897476987, "language_loss": 0.91358459, "learning_rate": 1.8441786698433192e-06, "loss": 0.92833561, "num_input_tokens_seen": 193073120, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.26782227, "step": 8966, "time_per_iteration": 2.583099126815796 }, { "auxiliary_loss_clip": 0.01247871, "auxiliary_loss_mlp": 0.00226039, "balance_loss_clip": 1.03156281, "balance_loss_mlp": 0.20167288, "epoch": 0.539125206673681, "flos": 17416711445760.0, "grad_norm": 3.2526487362252707, "language_loss": 0.77796435, "learning_rate": 1.8437903949480706e-06, "loss": 0.79270345, "num_input_tokens_seen": 193090105, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.24377441, "step": 8967, "time_per_iteration": 2.627350330352783 }, { "auxiliary_loss_clip": 0.01188256, "auxiliary_loss_mlp": 0.0023666, "balance_loss_clip": 0.98999608, "balance_loss_mlp": 0.21056473, "epoch": 0.539185329926349, "flos": 22198935450240.0, "grad_norm": 13.00897374795693, "language_loss": 0.88278127, "learning_rate": 1.8434021259763065e-06, "loss": 0.89703047, "num_input_tokens_seen": 193109325, "router_z_loss_clip": 1.98144531, "router_z_loss_mlp": 0.2611084, "step": 8968, "time_per_iteration": 4.198465585708618 }, { "auxiliary_loss_clip": 0.01233922, "auxiliary_loss_mlp": 0.00230618, "balance_loss_clip": 1.02012753, "balance_loss_mlp": 0.20517848, "epoch": 0.539245453179017, "flos": 21434397442560.0, "grad_norm": 4.383862923332199, "language_loss": 0.8067494, "learning_rate": 1.8430138629427484e-06, "loss": 0.8213948, "num_input_tokens_seen": 193130595, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.25439453, "step": 8969, "time_per_iteration": 4.048234224319458 }, { "auxiliary_loss_clip": 0.01203627, "auxiliary_loss_mlp": 0.00244539, "balance_loss_clip": 0.99529111, "balance_loss_mlp": 0.21768141, "epoch": 0.539305576431685, "flos": 20735000749440.0, "grad_norm": 6.269445136715458, "language_loss": 0.89194739, "learning_rate": 1.8426256058621205e-06, "loss": 0.90642899, "num_input_tokens_seen": 193148930, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.26867676, "step": 8970, "time_per_iteration": 2.6081318855285645 }, { "auxiliary_loss_clip": 0.01207875, "auxiliary_loss_mlp": 0.00229153, "balance_loss_clip": 1.00595117, "balance_loss_mlp": 0.20402348, "epoch": 0.5393656996843529, "flos": 30920452018560.0, "grad_norm": 33.96749633928989, "language_loss": 0.80105305, "learning_rate": 1.842237354749146e-06, "loss": 0.81542337, "num_input_tokens_seen": 193170140, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.2512207, "step": 8971, "time_per_iteration": 4.1103246212005615 }, { "auxiliary_loss_clip": 0.01301117, "auxiliary_loss_mlp": 0.00088149, "balance_loss_clip": 1.11992061, "balance_loss_mlp": 0.08133061, "epoch": 0.5394258229370209, "flos": 50317781351040.0, "grad_norm": 0.8611977426343315, "language_loss": 0.59874415, "learning_rate": 1.8418491096185465e-06, "loss": 0.6126368, "num_input_tokens_seen": 193227235, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.06835938, "step": 8972, "time_per_iteration": 3.0876595973968506 }, { "auxiliary_loss_clip": 0.01207165, "auxiliary_loss_mlp": 0.00230122, "balance_loss_clip": 1.00246906, "balance_loss_mlp": 0.20474212, "epoch": 0.5394859461896888, "flos": 25411935012480.0, "grad_norm": 5.326626720177186, "language_loss": 0.84514683, "learning_rate": 1.841460870485045e-06, "loss": 0.85951966, "num_input_tokens_seen": 193248435, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.25378418, "step": 8973, "time_per_iteration": 2.6404311656951904 }, { "auxiliary_loss_clip": 0.01260085, "auxiliary_loss_mlp": 0.0022301, "balance_loss_clip": 1.03625858, "balance_loss_mlp": 0.19703446, "epoch": 0.5395460694423568, "flos": 25478476957440.0, "grad_norm": 73.98809471002781, "language_loss": 0.82381499, "learning_rate": 1.8410726373633623e-06, "loss": 0.83864594, "num_input_tokens_seen": 193267490, "router_z_loss_clip": 2.23535156, "router_z_loss_mlp": 0.25964355, "step": 8974, "time_per_iteration": 2.6521666049957275 }, { "auxiliary_loss_clip": 0.01288421, "auxiliary_loss_mlp": 0.00076404, "balance_loss_clip": 1.11077809, "balance_loss_mlp": 0.06953774, "epoch": 0.5396061926950249, "flos": 53249493507840.0, "grad_norm": 0.7198825078424088, "language_loss": 0.50820446, "learning_rate": 1.8406844102682215e-06, "loss": 0.52185261, "num_input_tokens_seen": 193326050, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.06884766, "step": 8975, "time_per_iteration": 3.12680983543396 }, { "auxiliary_loss_clip": 0.01217462, "auxiliary_loss_mlp": 0.00222735, "balance_loss_clip": 1.01334238, "balance_loss_mlp": 0.19685432, "epoch": 0.5396663159476928, "flos": 26725080418560.0, "grad_norm": 226.1361678915132, "language_loss": 0.81354594, "learning_rate": 1.840296189214344e-06, "loss": 0.82794785, "num_input_tokens_seen": 193348785, "router_z_loss_clip": 2.04101562, "router_z_loss_mlp": 0.25866699, "step": 8976, "time_per_iteration": 4.121136903762817 }, { "auxiliary_loss_clip": 0.01230065, "auxiliary_loss_mlp": 0.0021347, "balance_loss_clip": 1.02494884, "balance_loss_mlp": 0.18640909, "epoch": 0.5397264392003608, "flos": 23253380127360.0, "grad_norm": 14.876756283441377, "language_loss": 0.77686143, "learning_rate": 1.8399079742164509e-06, "loss": 0.79129678, "num_input_tokens_seen": 193367080, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.27038574, "step": 8977, "time_per_iteration": 2.687260150909424 }, { "auxiliary_loss_clip": 0.01241214, "auxiliary_loss_mlp": 0.00238381, "balance_loss_clip": 1.0312115, "balance_loss_mlp": 0.21295336, "epoch": 0.5397865624530287, "flos": 18294188791680.0, "grad_norm": 6.280867508486488, "language_loss": 0.8027879, "learning_rate": 1.8395197652892636e-06, "loss": 0.81758392, "num_input_tokens_seen": 193383715, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.25439453, "step": 8978, "time_per_iteration": 2.6679701805114746 }, { "auxiliary_loss_clip": 0.01220205, "auxiliary_loss_mlp": 0.0022926, "balance_loss_clip": 1.01462615, "balance_loss_mlp": 0.20181775, "epoch": 0.5398466857056967, "flos": 15297514888320.0, "grad_norm": 162.0887075250353, "language_loss": 0.83667004, "learning_rate": 1.8391315624475028e-06, "loss": 0.8511647, "num_input_tokens_seen": 193400560, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.27453613, "step": 8979, "time_per_iteration": 2.6376953125 }, { "auxiliary_loss_clip": 0.01238986, "auxiliary_loss_mlp": 0.00225461, "balance_loss_clip": 1.02731502, "balance_loss_mlp": 0.19869855, "epoch": 0.5399068089583646, "flos": 17821748183040.0, "grad_norm": 43.345281675758336, "language_loss": 0.86029768, "learning_rate": 1.8387433657058892e-06, "loss": 0.87494218, "num_input_tokens_seen": 193418680, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.26794434, "step": 8980, "time_per_iteration": 2.61337947845459 }, { "auxiliary_loss_clip": 0.01227589, "auxiliary_loss_mlp": 0.00234223, "balance_loss_clip": 1.02186108, "balance_loss_mlp": 0.20716225, "epoch": 0.5399669322110326, "flos": 27381635164800.0, "grad_norm": 2.0369457106945688, "language_loss": 0.88805908, "learning_rate": 1.8383551750791431e-06, "loss": 0.90267718, "num_input_tokens_seen": 193439310, "router_z_loss_clip": 2.05957031, "router_z_loss_mlp": 0.27038574, "step": 8981, "time_per_iteration": 2.710923194885254 }, { "auxiliary_loss_clip": 0.01239719, "auxiliary_loss_mlp": 0.00226294, "balance_loss_clip": 1.02744555, "balance_loss_mlp": 0.19857767, "epoch": 0.5400270554637006, "flos": 20449116403200.0, "grad_norm": 3.347719629867188, "language_loss": 0.7639007, "learning_rate": 1.8379669905819857e-06, "loss": 0.77856082, "num_input_tokens_seen": 193458115, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.27709961, "step": 8982, "time_per_iteration": 2.6420998573303223 }, { "auxiliary_loss_clip": 0.01242707, "auxiliary_loss_mlp": 0.00216411, "balance_loss_clip": 1.0342257, "balance_loss_mlp": 0.19061424, "epoch": 0.5400871787163686, "flos": 21689578638720.0, "grad_norm": 6.222465541662933, "language_loss": 0.88900572, "learning_rate": 1.8375788122291358e-06, "loss": 0.90359688, "num_input_tokens_seen": 193477365, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.2578125, "step": 8983, "time_per_iteration": 2.6744887828826904 }, { "auxiliary_loss_clip": 0.012581, "auxiliary_loss_mlp": 0.00214543, "balance_loss_clip": 1.04109883, "balance_loss_mlp": 0.18595614, "epoch": 0.5401473019690365, "flos": 19204739585280.0, "grad_norm": 14.4417826303989, "language_loss": 0.79165024, "learning_rate": 1.8371906400353138e-06, "loss": 0.8063767, "num_input_tokens_seen": 193495595, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.28588867, "step": 8984, "time_per_iteration": 2.7580180168151855 }, { "auxiliary_loss_clip": 0.01275586, "auxiliary_loss_mlp": 0.00207815, "balance_loss_clip": 1.05009162, "balance_loss_mlp": 0.18050429, "epoch": 0.5402074252217045, "flos": 20627376624000.0, "grad_norm": 14.538621904216667, "language_loss": 0.88869166, "learning_rate": 1.8368024740152386e-06, "loss": 0.90352559, "num_input_tokens_seen": 193514035, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.27282715, "step": 8985, "time_per_iteration": 2.6615865230560303 }, { "auxiliary_loss_clip": 0.01238311, "auxiliary_loss_mlp": 0.00195582, "balance_loss_clip": 1.03676152, "balance_loss_mlp": 0.16912965, "epoch": 0.5402675484743724, "flos": 24973465691520.0, "grad_norm": 797.9031512247736, "language_loss": 0.85259891, "learning_rate": 1.83641431418363e-06, "loss": 0.86693782, "num_input_tokens_seen": 193535445, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.26428223, "step": 8986, "time_per_iteration": 2.7091715335845947 }, { "auxiliary_loss_clip": 0.01237748, "auxiliary_loss_mlp": 0.00190206, "balance_loss_clip": 1.03186035, "balance_loss_mlp": 0.16355082, "epoch": 0.5403276717270404, "flos": 19459022941440.0, "grad_norm": 4.716611335926856, "language_loss": 0.84845573, "learning_rate": 1.8360261605552075e-06, "loss": 0.86273533, "num_input_tokens_seen": 193554780, "router_z_loss_clip": 2.06152344, "router_z_loss_mlp": 0.26635742, "step": 8987, "time_per_iteration": 2.6641154289245605 }, { "auxiliary_loss_clip": 0.01248257, "auxiliary_loss_mlp": 0.00166271, "balance_loss_clip": 1.04149842, "balance_loss_mlp": 0.13916257, "epoch": 0.5403877949797083, "flos": 18442140912000.0, "grad_norm": 1216.1360466241233, "language_loss": 0.78963828, "learning_rate": 1.8356380131446887e-06, "loss": 0.8037836, "num_input_tokens_seen": 193573580, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.27124023, "step": 8988, "time_per_iteration": 2.6734046936035156 }, { "auxiliary_loss_clip": 0.01248613, "auxiliary_loss_mlp": 0.0018308, "balance_loss_clip": 1.04084647, "balance_loss_mlp": 0.15547085, "epoch": 0.5404479182323764, "flos": 28292868316800.0, "grad_norm": 319.20831707369564, "language_loss": 0.75727332, "learning_rate": 1.8352498719667934e-06, "loss": 0.77159023, "num_input_tokens_seen": 193590490, "router_z_loss_clip": 2.07714844, "router_z_loss_mlp": 0.27624512, "step": 8989, "time_per_iteration": 2.6887199878692627 }, { "auxiliary_loss_clip": 0.01236486, "auxiliary_loss_mlp": 0.00189455, "balance_loss_clip": 1.03306997, "balance_loss_mlp": 0.1629429, "epoch": 0.5405080414850444, "flos": 23367325046400.0, "grad_norm": 4.65728460474569, "language_loss": 0.8411479, "learning_rate": 1.8348617370362399e-06, "loss": 0.8554073, "num_input_tokens_seen": 193609900, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.26538086, "step": 8990, "time_per_iteration": 2.6552820205688477 }, { "auxiliary_loss_clip": 0.01244166, "auxiliary_loss_mlp": 0.00174247, "balance_loss_clip": 1.03580451, "balance_loss_mlp": 0.14672175, "epoch": 0.5405681647377123, "flos": 21106425335040.0, "grad_norm": 45.17739646932531, "language_loss": 0.77779794, "learning_rate": 1.834473608367745e-06, "loss": 0.79198211, "num_input_tokens_seen": 193629775, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.2755127, "step": 8991, "time_per_iteration": 2.6324219703674316 }, { "auxiliary_loss_clip": 0.01264424, "auxiliary_loss_mlp": 0.00171981, "balance_loss_clip": 1.05803967, "balance_loss_mlp": 0.14229769, "epoch": 0.5406282879903803, "flos": 20449188230400.0, "grad_norm": 18.278924219017046, "language_loss": 0.84487396, "learning_rate": 1.8340854859760277e-06, "loss": 0.85923797, "num_input_tokens_seen": 193648070, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.29675293, "step": 8992, "time_per_iteration": 2.6469972133636475 }, { "auxiliary_loss_clip": 0.01255791, "auxiliary_loss_mlp": 0.00167257, "balance_loss_clip": 1.04573464, "balance_loss_mlp": 0.13744229, "epoch": 0.5406884112430482, "flos": 14209493973120.0, "grad_norm": 6.760223657495277, "language_loss": 0.87062281, "learning_rate": 1.8336973698758056e-06, "loss": 0.88485324, "num_input_tokens_seen": 193665060, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.2980957, "step": 8993, "time_per_iteration": 2.6489744186401367 }, { "auxiliary_loss_clip": 0.01236343, "auxiliary_loss_mlp": 0.00154589, "balance_loss_clip": 1.03367996, "balance_loss_mlp": 0.12488233, "epoch": 0.5407485344957162, "flos": 23875568536320.0, "grad_norm": 13.534352149841487, "language_loss": 0.79190195, "learning_rate": 1.8333092600817959e-06, "loss": 0.80581129, "num_input_tokens_seen": 193683620, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.296875, "step": 8994, "time_per_iteration": 2.6587440967559814 }, { "auxiliary_loss_clip": 0.01253371, "auxiliary_loss_mlp": 0.00170902, "balance_loss_clip": 1.04251885, "balance_loss_mlp": 0.14266142, "epoch": 0.5408086577483842, "flos": 23148485435520.0, "grad_norm": 74.03991637036381, "language_loss": 0.83156538, "learning_rate": 1.8329211566087157e-06, "loss": 0.84580815, "num_input_tokens_seen": 193702990, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.28234863, "step": 8995, "time_per_iteration": 2.625929832458496 }, { "auxiliary_loss_clip": 0.01247278, "auxiliary_loss_mlp": 0.00149134, "balance_loss_clip": 1.04635203, "balance_loss_mlp": 0.12053599, "epoch": 0.5408687810010522, "flos": 18771046773120.0, "grad_norm": 2.4746435767653603, "language_loss": 0.80305141, "learning_rate": 1.832533059471282e-06, "loss": 0.81701553, "num_input_tokens_seen": 193721785, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.28601074, "step": 8996, "time_per_iteration": 2.625659465789795 }, { "auxiliary_loss_clip": 0.01246704, "auxiliary_loss_mlp": 0.00153166, "balance_loss_clip": 1.04544497, "balance_loss_mlp": 0.12601049, "epoch": 0.5409289042537201, "flos": 13881557779200.0, "grad_norm": 222.31253902571675, "language_loss": 0.80831873, "learning_rate": 1.8321449686842115e-06, "loss": 0.82231748, "num_input_tokens_seen": 193740315, "router_z_loss_clip": 2.01367188, "router_z_loss_mlp": 0.27160645, "step": 8997, "time_per_iteration": 2.609140634536743 }, { "auxiliary_loss_clip": 0.01259567, "auxiliary_loss_mlp": 0.00143594, "balance_loss_clip": 1.05451405, "balance_loss_mlp": 0.11381592, "epoch": 0.5409890275063881, "flos": 14465357527680.0, "grad_norm": 6.3117262745232185, "language_loss": 0.84567332, "learning_rate": 1.8317568842622207e-06, "loss": 0.85970497, "num_input_tokens_seen": 193757580, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.29785156, "step": 8998, "time_per_iteration": 2.6332356929779053 }, { "auxiliary_loss_clip": 0.01253637, "auxiliary_loss_mlp": 0.00172464, "balance_loss_clip": 1.05081391, "balance_loss_mlp": 0.14058726, "epoch": 0.541049150759056, "flos": 48977449349760.0, "grad_norm": 1.9243630579732653, "language_loss": 0.77131289, "learning_rate": 1.8313688062200256e-06, "loss": 0.78557396, "num_input_tokens_seen": 193780965, "router_z_loss_clip": 2.02832031, "router_z_loss_mlp": 0.31860352, "step": 8999, "time_per_iteration": 3.020439863204956 }, { "auxiliary_loss_clip": 0.01270101, "auxiliary_loss_mlp": 0.00157242, "balance_loss_clip": 1.06262779, "balance_loss_mlp": 0.12643833, "epoch": 0.541109274011724, "flos": 18147601388160.0, "grad_norm": 261.53651024427944, "language_loss": 0.90157455, "learning_rate": 1.8309807345723422e-06, "loss": 0.9158479, "num_input_tokens_seen": 193797855, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.30810547, "step": 9000, "time_per_iteration": 2.7224552631378174 }, { "auxiliary_loss_clip": 0.01248429, "auxiliary_loss_mlp": 0.00139075, "balance_loss_clip": 1.04622161, "balance_loss_mlp": 0.10841419, "epoch": 0.541169397264392, "flos": 20522553759360.0, "grad_norm": 10.843095606730348, "language_loss": 0.81718069, "learning_rate": 1.8305926693338863e-06, "loss": 0.83105576, "num_input_tokens_seen": 193817375, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.30664062, "step": 9001, "time_per_iteration": 2.7406649589538574 }, { "auxiliary_loss_clip": 0.01265409, "auxiliary_loss_mlp": 0.00176939, "balance_loss_clip": 1.05585933, "balance_loss_mlp": 0.14713626, "epoch": 0.54122952051706, "flos": 20044043752320.0, "grad_norm": 9.103646038588721, "language_loss": 0.94999206, "learning_rate": 1.8302046105193734e-06, "loss": 0.96441555, "num_input_tokens_seen": 193832205, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.29833984, "step": 9002, "time_per_iteration": 2.633077383041382 }, { "auxiliary_loss_clip": 0.01270509, "auxiliary_loss_mlp": 0.00171847, "balance_loss_clip": 1.0662384, "balance_loss_mlp": 0.14396413, "epoch": 0.541289643769728, "flos": 19062246332160.0, "grad_norm": 3.244165588681721, "language_loss": 0.87702447, "learning_rate": 1.8298165581435183e-06, "loss": 0.89144802, "num_input_tokens_seen": 193849830, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.27868652, "step": 9003, "time_per_iteration": 2.647686004638672 }, { "auxiliary_loss_clip": 0.01247577, "auxiliary_loss_mlp": 0.00178719, "balance_loss_clip": 1.0478673, "balance_loss_mlp": 0.15102656, "epoch": 0.5413497670223959, "flos": 22382295402240.0, "grad_norm": 2.1230985281995767, "language_loss": 0.76987201, "learning_rate": 1.8294285122210372e-06, "loss": 0.78413498, "num_input_tokens_seen": 193869945, "router_z_loss_clip": 1.99707031, "router_z_loss_mlp": 0.27709961, "step": 9004, "time_per_iteration": 2.643705368041992 }, { "auxiliary_loss_clip": 0.01248169, "auxiliary_loss_mlp": 0.00030175, "balance_loss_clip": 1.10144043, "balance_loss_mlp": 0.01567901, "epoch": 0.5414098902750639, "flos": 70031734093440.0, "grad_norm": 0.9385847089737126, "language_loss": 0.58290136, "learning_rate": 1.8290404727666434e-06, "loss": 0.59568489, "num_input_tokens_seen": 193930860, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.14453125, "step": 9005, "time_per_iteration": 3.2872657775878906 }, { "auxiliary_loss_clip": 0.01250137, "auxiliary_loss_mlp": 0.00162096, "balance_loss_clip": 1.04883647, "balance_loss_mlp": 0.13236535, "epoch": 0.5414700135277318, "flos": 21798962530560.0, "grad_norm": 87.02574684233967, "language_loss": 0.87230146, "learning_rate": 1.8286524397950517e-06, "loss": 0.88642377, "num_input_tokens_seen": 193949075, "router_z_loss_clip": 2.01269531, "router_z_loss_mlp": 0.29724121, "step": 9006, "time_per_iteration": 2.6529476642608643 }, { "auxiliary_loss_clip": 0.01268521, "auxiliary_loss_mlp": 0.00160161, "balance_loss_clip": 1.06705987, "balance_loss_mlp": 0.13312417, "epoch": 0.5415301367803999, "flos": 16907929251840.0, "grad_norm": 32.662353835547655, "language_loss": 0.89096612, "learning_rate": 1.8282644133209777e-06, "loss": 0.90525293, "num_input_tokens_seen": 193967630, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.27038574, "step": 9007, "time_per_iteration": 2.655665159225464 }, { "auxiliary_loss_clip": 0.0124956, "auxiliary_loss_mlp": 0.00176343, "balance_loss_clip": 1.04733765, "balance_loss_mlp": 0.14582568, "epoch": 0.5415902600330678, "flos": 25704176065920.0, "grad_norm": 106.15232248598551, "language_loss": 0.7556349, "learning_rate": 1.8278763933591334e-06, "loss": 0.76989388, "num_input_tokens_seen": 193988730, "router_z_loss_clip": 2.02246094, "router_z_loss_mlp": 0.30541992, "step": 9008, "time_per_iteration": 2.6788227558135986 }, { "auxiliary_loss_clip": 0.01270959, "auxiliary_loss_mlp": 0.00170856, "balance_loss_clip": 1.06251049, "balance_loss_mlp": 0.13950378, "epoch": 0.5416503832857358, "flos": 19208151377280.0, "grad_norm": 105.09534636160784, "language_loss": 0.86295283, "learning_rate": 1.827488379924234e-06, "loss": 0.87737101, "num_input_tokens_seen": 194005160, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.31323242, "step": 9009, "time_per_iteration": 2.6834778785705566 }, { "auxiliary_loss_clip": 0.01269228, "auxiliary_loss_mlp": 0.00189823, "balance_loss_clip": 1.06067634, "balance_loss_mlp": 0.1583516, "epoch": 0.5417105065384037, "flos": 12713706887040.0, "grad_norm": 4.41865246831462, "language_loss": 1.01268959, "learning_rate": 1.8271003730309923e-06, "loss": 1.02728009, "num_input_tokens_seen": 194021700, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.31469727, "step": 9010, "time_per_iteration": 4.0207359790802 }, { "auxiliary_loss_clip": 0.01268282, "auxiliary_loss_mlp": 0.00167012, "balance_loss_clip": 1.06326008, "balance_loss_mlp": 0.13639927, "epoch": 0.5417706297910717, "flos": 30335933998080.0, "grad_norm": 45.43046830904062, "language_loss": 0.75387979, "learning_rate": 1.826712372694122e-06, "loss": 0.7682327, "num_input_tokens_seen": 194042620, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.3059082, "step": 9011, "time_per_iteration": 4.229523658752441 }, { "auxiliary_loss_clip": 0.01274593, "auxiliary_loss_mlp": 0.00181219, "balance_loss_clip": 1.06545019, "balance_loss_mlp": 0.15067783, "epoch": 0.5418307530437396, "flos": 29020992912000.0, "grad_norm": 113.87803051359643, "language_loss": 0.90695584, "learning_rate": 1.8263243789283362e-06, "loss": 0.92151403, "num_input_tokens_seen": 194061800, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.30541992, "step": 9012, "time_per_iteration": 2.697009325027466 }, { "auxiliary_loss_clip": 0.01256233, "auxiliary_loss_mlp": 0.00195043, "balance_loss_clip": 1.05306053, "balance_loss_mlp": 0.16695765, "epoch": 0.5418908762964076, "flos": 16873455173760.0, "grad_norm": 5.16925484863136, "language_loss": 0.84704912, "learning_rate": 1.8259363917483466e-06, "loss": 0.86156178, "num_input_tokens_seen": 194079890, "router_z_loss_clip": 2.02929688, "router_z_loss_mlp": 0.28063965, "step": 9013, "time_per_iteration": 2.7071123123168945 }, { "auxiliary_loss_clip": 0.01273533, "auxiliary_loss_mlp": 0.00166398, "balance_loss_clip": 1.06498337, "balance_loss_mlp": 0.13635761, "epoch": 0.5419509995490756, "flos": 18949702043520.0, "grad_norm": 3.9590128315691193, "language_loss": 0.79940039, "learning_rate": 1.8255484111688667e-06, "loss": 0.81379974, "num_input_tokens_seen": 194097625, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.30029297, "step": 9014, "time_per_iteration": 4.028576374053955 }, { "auxiliary_loss_clip": 0.01262946, "auxiliary_loss_mlp": 0.00154942, "balance_loss_clip": 1.06240487, "balance_loss_mlp": 0.12368502, "epoch": 0.5420111228017436, "flos": 18077719478400.0, "grad_norm": 3.3715547253379485, "language_loss": 0.87847948, "learning_rate": 1.8251604372046085e-06, "loss": 0.89265835, "num_input_tokens_seen": 194116055, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.31298828, "step": 9015, "time_per_iteration": 2.643716812133789 }, { "auxiliary_loss_clip": 0.01275654, "auxiliary_loss_mlp": 0.00184162, "balance_loss_clip": 1.06314683, "balance_loss_mlp": 0.1555874, "epoch": 0.5420712460544116, "flos": 19061779455360.0, "grad_norm": 145.2380231935182, "language_loss": 0.91765839, "learning_rate": 1.8247724698702843e-06, "loss": 0.93225658, "num_input_tokens_seen": 194130365, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.28601074, "step": 9016, "time_per_iteration": 2.6276724338531494 }, { "auxiliary_loss_clip": 0.01259856, "auxiliary_loss_mlp": 0.00179803, "balance_loss_clip": 1.05562842, "balance_loss_mlp": 0.15423232, "epoch": 0.5421313693070795, "flos": 18187103370240.0, "grad_norm": 27.41925540251094, "language_loss": 0.89513105, "learning_rate": 1.8243845091806053e-06, "loss": 0.90952766, "num_input_tokens_seen": 194148975, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.2557373, "step": 9017, "time_per_iteration": 2.6806094646453857 }, { "auxiliary_loss_clip": 0.01262619, "auxiliary_loss_mlp": 0.00171372, "balance_loss_clip": 1.06340432, "balance_loss_mlp": 0.1439901, "epoch": 0.5421914925597475, "flos": 13005947940480.0, "grad_norm": 17.06004218757237, "language_loss": 0.86837554, "learning_rate": 1.8239965551502837e-06, "loss": 0.88271546, "num_input_tokens_seen": 194167185, "router_z_loss_clip": 1.9921875, "router_z_loss_mlp": 0.27392578, "step": 9018, "time_per_iteration": 4.122587203979492 }, { "auxiliary_loss_clip": 0.01275071, "auxiliary_loss_mlp": 0.00169553, "balance_loss_clip": 1.06379104, "balance_loss_mlp": 0.14125308, "epoch": 0.5422516158124154, "flos": 46758457831680.0, "grad_norm": 15.618300764552208, "language_loss": 0.74637234, "learning_rate": 1.8236086077940303e-06, "loss": 0.76081854, "num_input_tokens_seen": 194192840, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.28295898, "step": 9019, "time_per_iteration": 2.8973214626312256 }, { "auxiliary_loss_clip": 0.01264064, "auxiliary_loss_mlp": 0.00176532, "balance_loss_clip": 1.06430268, "balance_loss_mlp": 0.15041275, "epoch": 0.5423117390650835, "flos": 31758642864000.0, "grad_norm": 5.3829844684334045, "language_loss": 0.78684419, "learning_rate": 1.8232206671265555e-06, "loss": 0.8012501, "num_input_tokens_seen": 194213150, "router_z_loss_clip": 1.99902344, "router_z_loss_mlp": 0.26123047, "step": 9020, "time_per_iteration": 2.7548422813415527 }, { "auxiliary_loss_clip": 0.01249582, "auxiliary_loss_mlp": 0.00164403, "balance_loss_clip": 1.05485201, "balance_loss_mlp": 0.13592362, "epoch": 0.5423718623177514, "flos": 27201974313600.0, "grad_norm": 35.83955312933443, "language_loss": 0.85618454, "learning_rate": 1.8228327331625717e-06, "loss": 0.87032437, "num_input_tokens_seen": 194234665, "router_z_loss_clip": 1.94628906, "router_z_loss_mlp": 0.28442383, "step": 9021, "time_per_iteration": 2.864750385284424 }, { "auxiliary_loss_clip": 0.01288657, "auxiliary_loss_mlp": 0.00183933, "balance_loss_clip": 1.07847071, "balance_loss_mlp": 0.15436953, "epoch": 0.5424319855704194, "flos": 23546447193600.0, "grad_norm": 31.173271935817187, "language_loss": 0.84660351, "learning_rate": 1.822444805916788e-06, "loss": 0.86132944, "num_input_tokens_seen": 194253790, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.29577637, "step": 9022, "time_per_iteration": 2.6545016765594482 }, { "auxiliary_loss_clip": 0.01264074, "auxiliary_loss_mlp": 0.00180581, "balance_loss_clip": 1.06050241, "balance_loss_mlp": 0.15273325, "epoch": 0.5424921088230873, "flos": 26615624699520.0, "grad_norm": 231.30009404226678, "language_loss": 0.91150999, "learning_rate": 1.822056885403915e-06, "loss": 0.92595655, "num_input_tokens_seen": 194274950, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.27880859, "step": 9023, "time_per_iteration": 2.65956449508667 }, { "auxiliary_loss_clip": 0.0130142, "auxiliary_loss_mlp": 0.00179601, "balance_loss_clip": 1.0900743, "balance_loss_mlp": 0.15010856, "epoch": 0.5425522320757553, "flos": 23586811102080.0, "grad_norm": 8.23486380295083, "language_loss": 0.76659811, "learning_rate": 1.8216689716386627e-06, "loss": 0.78140831, "num_input_tokens_seen": 194296155, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.29516602, "step": 9024, "time_per_iteration": 2.71765398979187 }, { "auxiliary_loss_clip": 0.01273984, "auxiliary_loss_mlp": 0.00180889, "balance_loss_clip": 1.06763101, "balance_loss_mlp": 0.15276755, "epoch": 0.5426123553284232, "flos": 30592264429440.0, "grad_norm": 6.767997791198535, "language_loss": 0.72697926, "learning_rate": 1.8212810646357405e-06, "loss": 0.74152803, "num_input_tokens_seen": 194318025, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.28125, "step": 9025, "time_per_iteration": 2.7597060203552246 }, { "auxiliary_loss_clip": 0.01288051, "auxiliary_loss_mlp": 0.00182331, "balance_loss_clip": 1.07860148, "balance_loss_mlp": 0.15317282, "epoch": 0.5426724785810912, "flos": 12495118671360.0, "grad_norm": 33.99769094383093, "language_loss": 0.82808971, "learning_rate": 1.8208931644098591e-06, "loss": 0.84279352, "num_input_tokens_seen": 194336150, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.29199219, "step": 9026, "time_per_iteration": 2.6400275230407715 }, { "auxiliary_loss_clip": 0.01280422, "auxiliary_loss_mlp": 0.00205241, "balance_loss_clip": 1.07155502, "balance_loss_mlp": 0.17454445, "epoch": 0.5427326018337592, "flos": 26064611089920.0, "grad_norm": 23.166239240420627, "language_loss": 0.85047024, "learning_rate": 1.8205052709757265e-06, "loss": 0.86532688, "num_input_tokens_seen": 194355980, "router_z_loss_clip": 2.08886719, "router_z_loss_mlp": 0.30688477, "step": 9027, "time_per_iteration": 2.703296661376953 }, { "auxiliary_loss_clip": 0.01289999, "auxiliary_loss_mlp": 0.00088267, "balance_loss_clip": 1.15437186, "balance_loss_mlp": 0.07877778, "epoch": 0.5427927250864272, "flos": 65984745576960.0, "grad_norm": 5.825467870730686, "language_loss": 0.56399918, "learning_rate": 1.8201173843480515e-06, "loss": 0.5777818, "num_input_tokens_seen": 194422660, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09472656, "step": 9028, "time_per_iteration": 3.180621862411499 }, { "auxiliary_loss_clip": 0.01264925, "auxiliary_loss_mlp": 0.00184943, "balance_loss_clip": 1.05937552, "balance_loss_mlp": 0.15683302, "epoch": 0.5428528483390952, "flos": 19975382904960.0, "grad_norm": 217.14638944844683, "language_loss": 0.87943679, "learning_rate": 1.8197295045415442e-06, "loss": 0.8939355, "num_input_tokens_seen": 194438545, "router_z_loss_clip": 2.05761719, "router_z_loss_mlp": 0.28125, "step": 9029, "time_per_iteration": 2.7416439056396484 }, { "auxiliary_loss_clip": 0.0128159, "auxiliary_loss_mlp": 0.00187203, "balance_loss_clip": 1.07223892, "balance_loss_mlp": 0.15871215, "epoch": 0.5429129715917631, "flos": 21832323287040.0, "grad_norm": 3.0507593233738612, "language_loss": 0.88488394, "learning_rate": 1.8193416315709112e-06, "loss": 0.8995719, "num_input_tokens_seen": 194458060, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.28466797, "step": 9030, "time_per_iteration": 2.6822896003723145 }, { "auxiliary_loss_clip": 0.01284279, "auxiliary_loss_mlp": 0.0016608, "balance_loss_clip": 1.07221282, "balance_loss_mlp": 0.13589609, "epoch": 0.5429730948444311, "flos": 27782685492480.0, "grad_norm": 12862.268036236532, "language_loss": 0.84922028, "learning_rate": 1.8189537654508623e-06, "loss": 0.86372387, "num_input_tokens_seen": 194477405, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.30200195, "step": 9031, "time_per_iteration": 2.756277322769165 }, { "auxiliary_loss_clip": 0.01285397, "auxiliary_loss_mlp": 0.00159967, "balance_loss_clip": 1.08050942, "balance_loss_mlp": 0.13412286, "epoch": 0.543033218097099, "flos": 26760452336640.0, "grad_norm": 4.006438114586593, "language_loss": 0.90928411, "learning_rate": 1.8185659061961045e-06, "loss": 0.92373776, "num_input_tokens_seen": 194497085, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.25866699, "step": 9032, "time_per_iteration": 2.656460762023926 }, { "auxiliary_loss_clip": 0.01288385, "auxiliary_loss_mlp": 0.00198938, "balance_loss_clip": 1.07298839, "balance_loss_mlp": 0.1692307, "epoch": 0.5430933413497671, "flos": 22675254727680.0, "grad_norm": 2.05909256620772, "language_loss": 0.81752974, "learning_rate": 1.8181780538213457e-06, "loss": 0.83240294, "num_input_tokens_seen": 194516785, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.296875, "step": 9033, "time_per_iteration": 2.6282646656036377 }, { "auxiliary_loss_clip": 0.01271413, "auxiliary_loss_mlp": 0.00177777, "balance_loss_clip": 1.06282473, "balance_loss_mlp": 0.14989373, "epoch": 0.543153464602435, "flos": 24607499973120.0, "grad_norm": 4626.358512746314, "language_loss": 0.84910429, "learning_rate": 1.8177902083412935e-06, "loss": 0.8635962, "num_input_tokens_seen": 194536475, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.27893066, "step": 9034, "time_per_iteration": 2.639439582824707 }, { "auxiliary_loss_clip": 0.01273173, "auxiliary_loss_mlp": 0.00176072, "balance_loss_clip": 1.06384468, "balance_loss_mlp": 0.14764038, "epoch": 0.543213587855103, "flos": 19025725178880.0, "grad_norm": 44.05716110781452, "language_loss": 0.91806906, "learning_rate": 1.817402369770655e-06, "loss": 0.93256152, "num_input_tokens_seen": 194554495, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.28405762, "step": 9035, "time_per_iteration": 2.6830666065216064 }, { "auxiliary_loss_clip": 0.01279961, "auxiliary_loss_mlp": 0.00102417, "balance_loss_clip": 1.1374501, "balance_loss_mlp": 0.09478774, "epoch": 0.5432737111077709, "flos": 65686435125120.0, "grad_norm": 0.6720765487579117, "language_loss": 0.55222464, "learning_rate": 1.8170145381241364e-06, "loss": 0.56604844, "num_input_tokens_seen": 194617620, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.07617188, "step": 9036, "time_per_iteration": 3.1237282752990723 }, { "auxiliary_loss_clip": 0.01291334, "auxiliary_loss_mlp": 0.00185923, "balance_loss_clip": 1.07650435, "balance_loss_mlp": 0.15478534, "epoch": 0.5433338343604389, "flos": 22091670460800.0, "grad_norm": 3.9552969955978994, "language_loss": 0.83755803, "learning_rate": 1.8166267134164451e-06, "loss": 0.85233057, "num_input_tokens_seen": 194637690, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.31152344, "step": 9037, "time_per_iteration": 2.6717493534088135 }, { "auxiliary_loss_clip": 0.01257238, "auxiliary_loss_mlp": 0.00175719, "balance_loss_clip": 1.05418348, "balance_loss_mlp": 0.14869455, "epoch": 0.5433939576131068, "flos": 34672649616000.0, "grad_norm": 78.22172608513462, "language_loss": 0.73713326, "learning_rate": 1.8162388956622875e-06, "loss": 0.75146282, "num_input_tokens_seen": 194659520, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.2701416, "step": 9038, "time_per_iteration": 2.761455535888672 }, { "auxiliary_loss_clip": 0.01277535, "auxiliary_loss_mlp": 0.00196933, "balance_loss_clip": 1.06524467, "balance_loss_mlp": 0.16765499, "epoch": 0.5434540808657748, "flos": 20303355012480.0, "grad_norm": 36.649960670252305, "language_loss": 0.86968088, "learning_rate": 1.8158510848763692e-06, "loss": 0.88442552, "num_input_tokens_seen": 194677645, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.29260254, "step": 9039, "time_per_iteration": 2.651200771331787 }, { "auxiliary_loss_clip": 0.01278232, "auxiliary_loss_mlp": 0.00177392, "balance_loss_clip": 1.06709266, "balance_loss_mlp": 0.14558692, "epoch": 0.5435142041184428, "flos": 23112790295040.0, "grad_norm": 2.793763133610511, "language_loss": 0.86116099, "learning_rate": 1.8154632810733962e-06, "loss": 0.87571728, "num_input_tokens_seen": 194697400, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.31835938, "step": 9040, "time_per_iteration": 2.7033214569091797 }, { "auxiliary_loss_clip": 0.01251842, "auxiliary_loss_mlp": 0.00088783, "balance_loss_clip": 1.11327267, "balance_loss_mlp": 0.08124937, "epoch": 0.5435743273711108, "flos": 64012746954240.0, "grad_norm": 0.6633407967088617, "language_loss": 0.5187695, "learning_rate": 1.815075484268074e-06, "loss": 0.53217578, "num_input_tokens_seen": 194761205, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.07519531, "step": 9041, "time_per_iteration": 3.1589760780334473 }, { "auxiliary_loss_clip": 0.01287713, "auxiliary_loss_mlp": 0.0017889, "balance_loss_clip": 1.07656121, "balance_loss_mlp": 0.14708519, "epoch": 0.5436344506237788, "flos": 25118903859840.0, "grad_norm": 7.9164375089894055, "language_loss": 0.84481168, "learning_rate": 1.8146876944751078e-06, "loss": 0.85947776, "num_input_tokens_seen": 194782445, "router_z_loss_clip": 2.11035156, "router_z_loss_mlp": 0.31835938, "step": 9042, "time_per_iteration": 2.69921612739563 }, { "auxiliary_loss_clip": 0.01260643, "auxiliary_loss_mlp": 0.00181694, "balance_loss_clip": 1.06048131, "balance_loss_mlp": 0.15240443, "epoch": 0.5436945738764467, "flos": 19572967860480.0, "grad_norm": 20.37869249342085, "language_loss": 0.76114285, "learning_rate": 1.8142999117092033e-06, "loss": 0.77556622, "num_input_tokens_seen": 194800325, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.29309082, "step": 9043, "time_per_iteration": 2.67382550239563 }, { "auxiliary_loss_clip": 0.01278984, "auxiliary_loss_mlp": 0.0019084, "balance_loss_clip": 1.07186866, "balance_loss_mlp": 0.16034669, "epoch": 0.5437546971291147, "flos": 21142515525120.0, "grad_norm": 9.42423836439278, "language_loss": 0.91094398, "learning_rate": 1.8139121359850644e-06, "loss": 0.92564231, "num_input_tokens_seen": 194818675, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.30493164, "step": 9044, "time_per_iteration": 2.631122350692749 }, { "auxiliary_loss_clip": 0.0130452, "auxiliary_loss_mlp": 0.00193767, "balance_loss_clip": 1.08511114, "balance_loss_mlp": 0.16246219, "epoch": 0.5438148203817826, "flos": 25118688378240.0, "grad_norm": 8.81745390794574, "language_loss": 0.70303917, "learning_rate": 1.8135243673173956e-06, "loss": 0.71802205, "num_input_tokens_seen": 194836595, "router_z_loss_clip": 2.19628906, "router_z_loss_mlp": 0.31323242, "step": 9045, "time_per_iteration": 2.7987048625946045 }, { "auxiliary_loss_clip": 0.0127954, "auxiliary_loss_mlp": 0.00168201, "balance_loss_clip": 1.07211208, "balance_loss_mlp": 0.14149809, "epoch": 0.5438749436344507, "flos": 23002939526400.0, "grad_norm": 4.6475986929391695, "language_loss": 0.77586454, "learning_rate": 1.8131366057209023e-06, "loss": 0.79034197, "num_input_tokens_seen": 194857520, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.26708984, "step": 9046, "time_per_iteration": 2.654956340789795 }, { "auxiliary_loss_clip": 0.01268093, "auxiliary_loss_mlp": 0.00188232, "balance_loss_clip": 1.06343818, "balance_loss_mlp": 0.16125542, "epoch": 0.5439350668871186, "flos": 15487016065920.0, "grad_norm": 2.63470561576437, "language_loss": 0.8409493, "learning_rate": 1.8127488512102868e-06, "loss": 0.8555125, "num_input_tokens_seen": 194876020, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.26953125, "step": 9047, "time_per_iteration": 2.6353092193603516 }, { "auxiliary_loss_clip": 0.01292521, "auxiliary_loss_mlp": 0.00206911, "balance_loss_clip": 1.08287728, "balance_loss_mlp": 0.17856339, "epoch": 0.5439951901397866, "flos": 17238415311360.0, "grad_norm": 15.355358258704085, "language_loss": 0.8071965, "learning_rate": 1.8123611038002547e-06, "loss": 0.82219082, "num_input_tokens_seen": 194894650, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.28369141, "step": 9048, "time_per_iteration": 2.6165213584899902 }, { "auxiliary_loss_clip": 0.01276073, "auxiliary_loss_mlp": 0.00202263, "balance_loss_clip": 1.06579494, "balance_loss_mlp": 0.17243698, "epoch": 0.5440553133924545, "flos": 18661016436480.0, "grad_norm": 90.2177537044943, "language_loss": 1.01950085, "learning_rate": 1.8119733635055076e-06, "loss": 1.03428411, "num_input_tokens_seen": 194911935, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.2980957, "step": 9049, "time_per_iteration": 2.661010503768921 }, { "auxiliary_loss_clip": 0.012868, "auxiliary_loss_mlp": 0.00184396, "balance_loss_clip": 1.07835126, "balance_loss_mlp": 0.15733582, "epoch": 0.5441154366451225, "flos": 27122934435840.0, "grad_norm": 4.5590711749294925, "language_loss": 0.82501036, "learning_rate": 1.8115856303407492e-06, "loss": 0.83972228, "num_input_tokens_seen": 194931620, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.27050781, "step": 9050, "time_per_iteration": 2.6919608116149902 }, { "auxiliary_loss_clip": 0.01298547, "auxiliary_loss_mlp": 0.00193608, "balance_loss_clip": 1.08319366, "balance_loss_mlp": 0.1667864, "epoch": 0.5441755598977904, "flos": 25993867253760.0, "grad_norm": 5.876946802242328, "language_loss": 0.77688241, "learning_rate": 1.8111979043206832e-06, "loss": 0.79180396, "num_input_tokens_seen": 194952560, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.26818848, "step": 9051, "time_per_iteration": 2.715994119644165 }, { "auxiliary_loss_clip": 0.01283959, "auxiliary_loss_mlp": 0.00191851, "balance_loss_clip": 1.07320487, "balance_loss_mlp": 0.16527955, "epoch": 0.5442356831504584, "flos": 32380041173760.0, "grad_norm": 24.59558638236739, "language_loss": 0.75093585, "learning_rate": 1.810810185460011e-06, "loss": 0.76569402, "num_input_tokens_seen": 194973915, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.26574707, "step": 9052, "time_per_iteration": 4.146685361862183 }, { "auxiliary_loss_clip": 0.01298563, "auxiliary_loss_mlp": 0.00191114, "balance_loss_clip": 1.08263421, "balance_loss_mlp": 0.16405359, "epoch": 0.5442958064031264, "flos": 24164290056960.0, "grad_norm": 5.232871640428255, "language_loss": 0.98468173, "learning_rate": 1.810422473773436e-06, "loss": 0.99957848, "num_input_tokens_seen": 194990170, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.2701416, "step": 9053, "time_per_iteration": 4.1191489696502686 }, { "auxiliary_loss_clip": 0.0131776, "auxiliary_loss_mlp": 0.00204723, "balance_loss_clip": 1.09787393, "balance_loss_mlp": 0.17791273, "epoch": 0.5443559296557944, "flos": 18764690065920.0, "grad_norm": 3.7696694274516584, "language_loss": 0.91288519, "learning_rate": 1.8100347692756595e-06, "loss": 0.92811, "num_input_tokens_seen": 195006395, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.26806641, "step": 9054, "time_per_iteration": 2.670396327972412 }, { "auxiliary_loss_clip": 0.01313459, "auxiliary_loss_mlp": 0.0021474, "balance_loss_clip": 1.09553719, "balance_loss_mlp": 0.1875128, "epoch": 0.5444160529084624, "flos": 22632556435200.0, "grad_norm": 22.3977864800882, "language_loss": 0.78102905, "learning_rate": 1.8096470719813836e-06, "loss": 0.79631108, "num_input_tokens_seen": 195025080, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.2722168, "step": 9055, "time_per_iteration": 2.6875159740448 }, { "auxiliary_loss_clip": 0.01315415, "auxiliary_loss_mlp": 0.0005314, "balance_loss_clip": 1.16788077, "balance_loss_mlp": 0.04622554, "epoch": 0.5444761761611303, "flos": 69671909600640.0, "grad_norm": 0.730601141915738, "language_loss": 0.57123089, "learning_rate": 1.80925938190531e-06, "loss": 0.58491635, "num_input_tokens_seen": 195085725, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.06933594, "step": 9056, "time_per_iteration": 4.559916973114014 }, { "auxiliary_loss_clip": 0.01314091, "auxiliary_loss_mlp": 0.00221484, "balance_loss_clip": 1.09096193, "balance_loss_mlp": 0.19290994, "epoch": 0.5445362994137983, "flos": 14278442129280.0, "grad_norm": 6.200288403787547, "language_loss": 0.77514243, "learning_rate": 1.8088716990621395e-06, "loss": 0.79049814, "num_input_tokens_seen": 195102585, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.28601074, "step": 9057, "time_per_iteration": 2.613802433013916 }, { "auxiliary_loss_clip": 0.0132712, "auxiliary_loss_mlp": 0.00208709, "balance_loss_clip": 1.10565007, "balance_loss_mlp": 0.18071899, "epoch": 0.5445964226664662, "flos": 28986195611520.0, "grad_norm": 12.90885426177813, "language_loss": 0.81695211, "learning_rate": 1.8084840234665738e-06, "loss": 0.83231038, "num_input_tokens_seen": 195120055, "router_z_loss_clip": 2.21582031, "router_z_loss_mlp": 0.27978516, "step": 9058, "time_per_iteration": 2.670454978942871 }, { "auxiliary_loss_clip": 0.01314141, "auxiliary_loss_mlp": 0.00074251, "balance_loss_clip": 1.16535449, "balance_loss_mlp": 0.06695568, "epoch": 0.5446565459191343, "flos": 68620230270720.0, "grad_norm": 0.7670727536768611, "language_loss": 0.61792272, "learning_rate": 1.808096355133312e-06, "loss": 0.63180661, "num_input_tokens_seen": 195181045, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.07275391, "step": 9059, "time_per_iteration": 3.2225944995880127 }, { "auxiliary_loss_clip": 0.01292852, "auxiliary_loss_mlp": 0.00208866, "balance_loss_clip": 1.07920527, "balance_loss_mlp": 0.18311691, "epoch": 0.5447166691718022, "flos": 16216469464320.0, "grad_norm": 36.363134355467416, "language_loss": 0.88402176, "learning_rate": 1.8077086940770572e-06, "loss": 0.89903903, "num_input_tokens_seen": 195198840, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.25744629, "step": 9060, "time_per_iteration": 4.023879766464233 }, { "auxiliary_loss_clip": 0.01310548, "auxiliary_loss_mlp": 0.00215019, "balance_loss_clip": 1.09217107, "balance_loss_mlp": 0.18892412, "epoch": 0.5447767924244702, "flos": 25849039616640.0, "grad_norm": 6.523750944046212, "language_loss": 0.87471896, "learning_rate": 1.8073210403125072e-06, "loss": 0.88997459, "num_input_tokens_seen": 195218720, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.26123047, "step": 9061, "time_per_iteration": 2.692802906036377 }, { "auxiliary_loss_clip": 0.01306662, "auxiliary_loss_mlp": 0.00229567, "balance_loss_clip": 1.09076011, "balance_loss_mlp": 0.20309061, "epoch": 0.5448369156771381, "flos": 19677718897920.0, "grad_norm": 19.12608170176957, "language_loss": 0.93093812, "learning_rate": 1.8069333938543627e-06, "loss": 0.94630039, "num_input_tokens_seen": 195235770, "router_z_loss_clip": 2.16113281, "router_z_loss_mlp": 0.26477051, "step": 9062, "time_per_iteration": 2.635654926300049 }, { "auxiliary_loss_clip": 0.01325987, "auxiliary_loss_mlp": 0.00254909, "balance_loss_clip": 1.09783363, "balance_loss_mlp": 0.2264418, "epoch": 0.5448970389298061, "flos": 19281804215040.0, "grad_norm": 4.574975899997816, "language_loss": 0.90727305, "learning_rate": 1.8065457547173233e-06, "loss": 0.92308199, "num_input_tokens_seen": 195254870, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.28417969, "step": 9063, "time_per_iteration": 2.761207103729248 }, { "auxiliary_loss_clip": 0.01327887, "auxiliary_loss_mlp": 0.00230969, "balance_loss_clip": 1.10363102, "balance_loss_mlp": 0.20477876, "epoch": 0.544957162182474, "flos": 20991690316800.0, "grad_norm": 8.432295001302016, "language_loss": 0.72192192, "learning_rate": 1.8061581229160878e-06, "loss": 0.7375105, "num_input_tokens_seen": 195273390, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26196289, "step": 9064, "time_per_iteration": 2.6254770755767822 }, { "auxiliary_loss_clip": 0.01322708, "auxiliary_loss_mlp": 0.00218209, "balance_loss_clip": 1.09474087, "balance_loss_mlp": 0.18826318, "epoch": 0.545017285435142, "flos": 25374587846400.0, "grad_norm": 4.142378529147941, "language_loss": 0.88017738, "learning_rate": 1.8057704984653566e-06, "loss": 0.89558655, "num_input_tokens_seen": 195295635, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.29980469, "step": 9065, "time_per_iteration": 2.7270543575286865 }, { "auxiliary_loss_clip": 0.0130855, "auxiliary_loss_mlp": 0.00192815, "balance_loss_clip": 1.09197819, "balance_loss_mlp": 0.16754216, "epoch": 0.54507740868781, "flos": 19134749934720.0, "grad_norm": 7.6284124744790365, "language_loss": 0.86644495, "learning_rate": 1.805382881379827e-06, "loss": 0.88145864, "num_input_tokens_seen": 195312545, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.25268555, "step": 9066, "time_per_iteration": 2.634230136871338 }, { "auxiliary_loss_clip": 0.01306542, "auxiliary_loss_mlp": 0.00238424, "balance_loss_clip": 1.08320236, "balance_loss_mlp": 0.21031405, "epoch": 0.545137531940478, "flos": 26249802635520.0, "grad_norm": 5.71839927281471, "language_loss": 0.85875106, "learning_rate": 1.8049952716741975e-06, "loss": 0.8742007, "num_input_tokens_seen": 195332955, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.28112793, "step": 9067, "time_per_iteration": 2.8013784885406494 }, { "auxiliary_loss_clip": 0.0132419, "auxiliary_loss_mlp": 0.00250204, "balance_loss_clip": 1.09514105, "balance_loss_mlp": 0.2202352, "epoch": 0.545197655193146, "flos": 37555629995520.0, "grad_norm": 19.745220947653916, "language_loss": 0.71678621, "learning_rate": 1.8046076693631682e-06, "loss": 0.73253012, "num_input_tokens_seen": 195355930, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.29980469, "step": 9068, "time_per_iteration": 2.811072587966919 }, { "auxiliary_loss_clip": 0.01299864, "auxiliary_loss_mlp": 0.00238549, "balance_loss_clip": 1.08400679, "balance_loss_mlp": 0.21361083, "epoch": 0.5452577784458139, "flos": 26031250333440.0, "grad_norm": 105.98226092410016, "language_loss": 0.77613515, "learning_rate": 1.8042200744614343e-06, "loss": 0.79151928, "num_input_tokens_seen": 195376445, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.24963379, "step": 9069, "time_per_iteration": 2.8020803928375244 }, { "auxiliary_loss_clip": 0.01308317, "auxiliary_loss_mlp": 0.00217809, "balance_loss_clip": 1.09062099, "balance_loss_mlp": 0.19331148, "epoch": 0.5453179016984819, "flos": 17639034675840.0, "grad_norm": 7.099018468138661, "language_loss": 0.81976545, "learning_rate": 1.8038324869836957e-06, "loss": 0.83502674, "num_input_tokens_seen": 195393725, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24511719, "step": 9070, "time_per_iteration": 2.7316269874572754 }, { "auxiliary_loss_clip": 0.01303877, "auxiliary_loss_mlp": 0.00237645, "balance_loss_clip": 1.08430028, "balance_loss_mlp": 0.21128826, "epoch": 0.5453780249511498, "flos": 23216679406080.0, "grad_norm": 3.776360502421378, "language_loss": 0.68263209, "learning_rate": 1.8034449069446489e-06, "loss": 0.69804728, "num_input_tokens_seen": 195411380, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.2635498, "step": 9071, "time_per_iteration": 2.7307615280151367 }, { "auxiliary_loss_clip": 0.01336921, "auxiliary_loss_mlp": 0.00117637, "balance_loss_clip": 1.18194687, "balance_loss_mlp": 0.11024585, "epoch": 0.5454381482038179, "flos": 68696504801280.0, "grad_norm": 0.6936939107651525, "language_loss": 0.56725907, "learning_rate": 1.80305733435899e-06, "loss": 0.58180463, "num_input_tokens_seen": 195482015, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.07373047, "step": 9072, "time_per_iteration": 3.2647197246551514 }, { "auxiliary_loss_clip": 0.01302247, "auxiliary_loss_mlp": 0.00244259, "balance_loss_clip": 1.08459961, "balance_loss_mlp": 0.21794993, "epoch": 0.5454982714564858, "flos": 13260626346240.0, "grad_norm": 1099.535143130576, "language_loss": 0.77871084, "learning_rate": 1.8026697692414174e-06, "loss": 0.79417586, "num_input_tokens_seen": 195500440, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.26318359, "step": 9073, "time_per_iteration": 2.649066686630249 }, { "auxiliary_loss_clip": 0.01307792, "auxiliary_loss_mlp": 0.00227844, "balance_loss_clip": 1.09167051, "balance_loss_mlp": 0.2035608, "epoch": 0.5455583947091538, "flos": 21835878733440.0, "grad_norm": 8.81796971990186, "language_loss": 0.78568876, "learning_rate": 1.802282211606627e-06, "loss": 0.80104512, "num_input_tokens_seen": 195520860, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.24291992, "step": 9074, "time_per_iteration": 2.697845458984375 }, { "auxiliary_loss_clip": 0.01312874, "auxiliary_loss_mlp": 0.00244015, "balance_loss_clip": 1.08944535, "balance_loss_mlp": 0.21786046, "epoch": 0.5456185179618217, "flos": 17817438551040.0, "grad_norm": 13.47702700067254, "language_loss": 0.76197559, "learning_rate": 1.8018946614693148e-06, "loss": 0.7775445, "num_input_tokens_seen": 195538615, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26135254, "step": 9075, "time_per_iteration": 2.58799409866333 }, { "auxiliary_loss_clip": 0.01320931, "auxiliary_loss_mlp": 0.00240368, "balance_loss_clip": 1.09520721, "balance_loss_mlp": 0.21546504, "epoch": 0.5456786412144897, "flos": 21069401391360.0, "grad_norm": 4.821176882022189, "language_loss": 0.88341331, "learning_rate": 1.8015071188441768e-06, "loss": 0.89902627, "num_input_tokens_seen": 195557460, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.2487793, "step": 9076, "time_per_iteration": 2.619471788406372 }, { "auxiliary_loss_clip": 0.01312267, "auxiliary_loss_mlp": 0.00232697, "balance_loss_clip": 1.08960438, "balance_loss_mlp": 0.20598176, "epoch": 0.5457387644671576, "flos": 23294965098240.0, "grad_norm": 13.884819627097615, "language_loss": 0.85993081, "learning_rate": 1.8011195837459089e-06, "loss": 0.87538046, "num_input_tokens_seen": 195577985, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.26721191, "step": 9077, "time_per_iteration": 2.6702609062194824 }, { "auxiliary_loss_clip": 0.01301122, "auxiliary_loss_mlp": 0.0022541, "balance_loss_clip": 1.07551289, "balance_loss_mlp": 0.19936307, "epoch": 0.5457988877198257, "flos": 21617039122560.0, "grad_norm": 7.480703242172585, "language_loss": 0.78232682, "learning_rate": 1.8007320561892064e-06, "loss": 0.79759216, "num_input_tokens_seen": 195597620, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26025391, "step": 9078, "time_per_iteration": 2.6924681663513184 }, { "auxiliary_loss_clip": 0.01295041, "auxiliary_loss_mlp": 0.00247463, "balance_loss_clip": 1.07174468, "balance_loss_mlp": 0.22158276, "epoch": 0.5458590109724936, "flos": 23762485543680.0, "grad_norm": 30.888869745938337, "language_loss": 0.87974137, "learning_rate": 1.800344536188764e-06, "loss": 0.8951664, "num_input_tokens_seen": 195615910, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25878906, "step": 9079, "time_per_iteration": 2.642232894897461 }, { "auxiliary_loss_clip": 0.01316457, "auxiliary_loss_mlp": 0.00263179, "balance_loss_clip": 1.081967, "balance_loss_mlp": 0.23594013, "epoch": 0.5459191342251616, "flos": 24424283675520.0, "grad_norm": 84.50921370584351, "language_loss": 0.84449714, "learning_rate": 1.799957023759277e-06, "loss": 0.86029351, "num_input_tokens_seen": 195635620, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.27233887, "step": 9080, "time_per_iteration": 2.711336612701416 }, { "auxiliary_loss_clip": 0.01307712, "auxiliary_loss_mlp": 0.00275014, "balance_loss_clip": 1.08099246, "balance_loss_mlp": 0.24851349, "epoch": 0.5459792574778296, "flos": 23623009032960.0, "grad_norm": 250.68283453298082, "language_loss": 0.95534909, "learning_rate": 1.7995695189154392e-06, "loss": 0.97117639, "num_input_tokens_seen": 195652495, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.26501465, "step": 9081, "time_per_iteration": 2.6576597690582275 }, { "auxiliary_loss_clip": 0.01301234, "auxiliary_loss_mlp": 0.00269657, "balance_loss_clip": 1.07463312, "balance_loss_mlp": 0.2425009, "epoch": 0.5460393807304975, "flos": 19135540033920.0, "grad_norm": 37.90435050385178, "language_loss": 0.78042698, "learning_rate": 1.7991820216719461e-06, "loss": 0.79613584, "num_input_tokens_seen": 195671965, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.27148438, "step": 9082, "time_per_iteration": 2.6460983753204346 }, { "auxiliary_loss_clip": 0.01296951, "auxiliary_loss_mlp": 0.00271407, "balance_loss_clip": 1.07502198, "balance_loss_mlp": 0.24603924, "epoch": 0.5460995039831655, "flos": 35918534805120.0, "grad_norm": 906.5465964162578, "language_loss": 0.73053241, "learning_rate": 1.7987945320434906e-06, "loss": 0.74621606, "num_input_tokens_seen": 195694725, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.25378418, "step": 9083, "time_per_iteration": 2.7811779975891113 }, { "auxiliary_loss_clip": 0.01285837, "auxiliary_loss_mlp": 0.00231023, "balance_loss_clip": 1.06647718, "balance_loss_mlp": 0.20641845, "epoch": 0.5461596272358334, "flos": 26759231274240.0, "grad_norm": 3.59059725653078, "language_loss": 0.87030494, "learning_rate": 1.798407050044766e-06, "loss": 0.88547355, "num_input_tokens_seen": 195714090, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24572754, "step": 9084, "time_per_iteration": 2.6733345985412598 }, { "auxiliary_loss_clip": 0.01313775, "auxiliary_loss_mlp": 0.00254221, "balance_loss_clip": 1.08668506, "balance_loss_mlp": 0.22860281, "epoch": 0.5462197504885015, "flos": 20886580143360.0, "grad_norm": 2.386000036205379, "language_loss": 0.83181459, "learning_rate": 1.7980195756904675e-06, "loss": 0.8474946, "num_input_tokens_seen": 195733585, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.25634766, "step": 9085, "time_per_iteration": 2.618704319000244 }, { "auxiliary_loss_clip": 0.01301893, "auxiliary_loss_mlp": 0.002707, "balance_loss_clip": 1.07157063, "balance_loss_mlp": 0.24477226, "epoch": 0.5462798737411694, "flos": 25804976607360.0, "grad_norm": 12.53032969717092, "language_loss": 0.82306778, "learning_rate": 1.7976321089952857e-06, "loss": 0.83879364, "num_input_tokens_seen": 195752820, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.25939941, "step": 9086, "time_per_iteration": 2.71980881690979 }, { "auxiliary_loss_clip": 0.01303958, "auxiliary_loss_mlp": 0.00278612, "balance_loss_clip": 1.08186209, "balance_loss_mlp": 0.25376922, "epoch": 0.5463399969938374, "flos": 25775027642880.0, "grad_norm": 7.037331332714847, "language_loss": 0.82174385, "learning_rate": 1.7972446499739155e-06, "loss": 0.83756953, "num_input_tokens_seen": 195773740, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.24841309, "step": 9087, "time_per_iteration": 2.6893765926361084 }, { "auxiliary_loss_clip": 0.0133085, "auxiliary_loss_mlp": 0.00253788, "balance_loss_clip": 1.09667087, "balance_loss_mlp": 0.22706087, "epoch": 0.5464001202465053, "flos": 18843298980480.0, "grad_norm": 8.19151636195717, "language_loss": 0.87658107, "learning_rate": 1.7968571986410484e-06, "loss": 0.89242744, "num_input_tokens_seen": 195792125, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.2668457, "step": 9088, "time_per_iteration": 2.6144771575927734 }, { "auxiliary_loss_clip": 0.01277347, "auxiliary_loss_mlp": 0.0010224, "balance_loss_clip": 1.11962652, "balance_loss_mlp": 0.0955638, "epoch": 0.5464602434991733, "flos": 69049541623680.0, "grad_norm": 0.713821781264253, "language_loss": 0.57324678, "learning_rate": 1.7964697550113758e-06, "loss": 0.58704263, "num_input_tokens_seen": 195854935, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.06689453, "step": 9089, "time_per_iteration": 3.1439671516418457 }, { "auxiliary_loss_clip": 0.01305046, "auxiliary_loss_mlp": 0.00270683, "balance_loss_clip": 1.07761967, "balance_loss_mlp": 0.24382551, "epoch": 0.5465203667518412, "flos": 27560039040000.0, "grad_norm": 25.3806806326181, "language_loss": 0.83877158, "learning_rate": 1.7960823190995918e-06, "loss": 0.8545289, "num_input_tokens_seen": 195874715, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.26843262, "step": 9090, "time_per_iteration": 2.6574392318725586 }, { "auxiliary_loss_clip": 0.0129391, "auxiliary_loss_mlp": 0.00265304, "balance_loss_clip": 1.06592917, "balance_loss_mlp": 0.2368722, "epoch": 0.5465804900045093, "flos": 21210206705280.0, "grad_norm": 9.124672554874346, "language_loss": 0.82420367, "learning_rate": 1.7956948909203855e-06, "loss": 0.83979583, "num_input_tokens_seen": 195892610, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.28479004, "step": 9091, "time_per_iteration": 2.7761759757995605 }, { "auxiliary_loss_clip": 0.01289366, "auxiliary_loss_mlp": 0.0026185, "balance_loss_clip": 1.066872, "balance_loss_mlp": 0.23629168, "epoch": 0.5466406132571772, "flos": 22488949860480.0, "grad_norm": 19.559387878133347, "language_loss": 0.83430362, "learning_rate": 1.7953074704884498e-06, "loss": 0.84981579, "num_input_tokens_seen": 195911085, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25549316, "step": 9092, "time_per_iteration": 2.675853967666626 }, { "auxiliary_loss_clip": 0.01293569, "auxiliary_loss_mlp": 0.00246749, "balance_loss_clip": 1.0639379, "balance_loss_mlp": 0.21953395, "epoch": 0.5467007365098452, "flos": 17675843137920.0, "grad_norm": 41.29888553496963, "language_loss": 0.87241769, "learning_rate": 1.794920057818476e-06, "loss": 0.88782084, "num_input_tokens_seen": 195929845, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.27209473, "step": 9093, "time_per_iteration": 2.6139354705810547 }, { "auxiliary_loss_clip": 0.0128294, "auxiliary_loss_mlp": 0.00282104, "balance_loss_clip": 1.0578196, "balance_loss_mlp": 0.25292131, "epoch": 0.5467608597625132, "flos": 15698852524800.0, "grad_norm": 4.3922282216809245, "language_loss": 0.78755361, "learning_rate": 1.7945326529251533e-06, "loss": 0.80320406, "num_input_tokens_seen": 195946350, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.29174805, "step": 9094, "time_per_iteration": 4.00624942779541 }, { "auxiliary_loss_clip": 0.01307011, "auxiliary_loss_mlp": 0.00242989, "balance_loss_clip": 1.07775223, "balance_loss_mlp": 0.21905203, "epoch": 0.5468209830151811, "flos": 24312816794880.0, "grad_norm": 5.367033609888581, "language_loss": 0.77119845, "learning_rate": 1.7941452558231731e-06, "loss": 0.78669846, "num_input_tokens_seen": 195959840, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.23950195, "step": 9095, "time_per_iteration": 2.629438877105713 }, { "auxiliary_loss_clip": 0.01305322, "auxiliary_loss_mlp": 0.00262174, "balance_loss_clip": 1.07865906, "balance_loss_mlp": 0.2357931, "epoch": 0.5468811062678491, "flos": 29166323339520.0, "grad_norm": 1.9792367738159138, "language_loss": 0.73694766, "learning_rate": 1.7937578665272256e-06, "loss": 0.7526226, "num_input_tokens_seen": 195981125, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.2635498, "step": 9096, "time_per_iteration": 4.18281888961792 }, { "auxiliary_loss_clip": 0.0127147, "auxiliary_loss_mlp": 0.00078302, "balance_loss_clip": 1.11457753, "balance_loss_mlp": 0.07200817, "epoch": 0.546941229520517, "flos": 67867037982720.0, "grad_norm": 0.7298959601909331, "language_loss": 0.56957185, "learning_rate": 1.7933704850520007e-06, "loss": 0.58306956, "num_input_tokens_seen": 196038880, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.06298828, "step": 9097, "time_per_iteration": 3.2156898975372314 }, { "auxiliary_loss_clip": 0.0127126, "auxiliary_loss_mlp": 0.00075776, "balance_loss_clip": 1.11461449, "balance_loss_mlp": 0.06895762, "epoch": 0.5470013527731851, "flos": 58270306625280.0, "grad_norm": 0.9096712615060342, "language_loss": 0.64433837, "learning_rate": 1.7929831114121868e-06, "loss": 0.65780872, "num_input_tokens_seen": 196099215, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.06835938, "step": 9098, "time_per_iteration": 4.489742279052734 }, { "auxiliary_loss_clip": 0.01296888, "auxiliary_loss_mlp": 0.00253883, "balance_loss_clip": 1.0700438, "balance_loss_mlp": 0.22806212, "epoch": 0.547061476025853, "flos": 22965915582720.0, "grad_norm": 22.84276107343891, "language_loss": 0.82112807, "learning_rate": 1.7925957456224753e-06, "loss": 0.83663583, "num_input_tokens_seen": 196120370, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.25830078, "step": 9099, "time_per_iteration": 2.6682024002075195 }, { "auxiliary_loss_clip": 0.01277602, "auxiliary_loss_mlp": 0.0025921, "balance_loss_clip": 1.0558821, "balance_loss_mlp": 0.2347725, "epoch": 0.547121599278521, "flos": 29968244426880.0, "grad_norm": 190.53814388648922, "language_loss": 0.7909081, "learning_rate": 1.7922083876975537e-06, "loss": 0.80627626, "num_input_tokens_seen": 196139075, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24438477, "step": 9100, "time_per_iteration": 2.6874172687530518 }, { "auxiliary_loss_clip": 0.01279749, "auxiliary_loss_mlp": 0.00293991, "balance_loss_clip": 1.05787325, "balance_loss_mlp": 0.26758665, "epoch": 0.5471817225311889, "flos": 36535443914880.0, "grad_norm": 20.596391685621132, "language_loss": 0.73135102, "learning_rate": 1.7918210376521102e-06, "loss": 0.74708843, "num_input_tokens_seen": 196159990, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26416016, "step": 9101, "time_per_iteration": 2.746894121170044 }, { "auxiliary_loss_clip": 0.01297349, "auxiliary_loss_mlp": 0.00273991, "balance_loss_clip": 1.07048082, "balance_loss_mlp": 0.24715725, "epoch": 0.5472418457838569, "flos": 25775243124480.0, "grad_norm": 7.633484708692332, "language_loss": 0.8409723, "learning_rate": 1.7914336955008343e-06, "loss": 0.85668564, "num_input_tokens_seen": 196180570, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26818848, "step": 9102, "time_per_iteration": 2.6717400550842285 }, { "auxiliary_loss_clip": 0.01291293, "auxiliary_loss_mlp": 0.00266823, "balance_loss_clip": 1.06862056, "balance_loss_mlp": 0.24168126, "epoch": 0.5473019690365248, "flos": 27887687925120.0, "grad_norm": 30.224196486006765, "language_loss": 0.78212315, "learning_rate": 1.791046361258413e-06, "loss": 0.79770422, "num_input_tokens_seen": 196200300, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25109863, "step": 9103, "time_per_iteration": 4.140236854553223 }, { "auxiliary_loss_clip": 0.01287108, "auxiliary_loss_mlp": 0.00284494, "balance_loss_clip": 1.0664587, "balance_loss_mlp": 0.25900719, "epoch": 0.5473620922891929, "flos": 57631490219520.0, "grad_norm": 51.51173705020782, "language_loss": 0.69597775, "learning_rate": 1.7906590349395356e-06, "loss": 0.71169376, "num_input_tokens_seen": 196228525, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25500488, "step": 9104, "time_per_iteration": 3.0095863342285156 }, { "auxiliary_loss_clip": 0.01286746, "auxiliary_loss_mlp": 0.00304946, "balance_loss_clip": 1.06239808, "balance_loss_mlp": 0.27762344, "epoch": 0.5474222155418608, "flos": 19354056422400.0, "grad_norm": 50.90361726412666, "language_loss": 0.90990937, "learning_rate": 1.790271716558888e-06, "loss": 0.92582631, "num_input_tokens_seen": 196247690, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.27294922, "step": 9105, "time_per_iteration": 2.6556460857391357 }, { "auxiliary_loss_clip": 0.01283507, "auxiliary_loss_mlp": 0.00251079, "balance_loss_clip": 1.05739117, "balance_loss_mlp": 0.22689131, "epoch": 0.5474823387945288, "flos": 25120448144640.0, "grad_norm": 35.392858621436474, "language_loss": 0.86444449, "learning_rate": 1.7898844061311575e-06, "loss": 0.87979031, "num_input_tokens_seen": 196268555, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.24182129, "step": 9106, "time_per_iteration": 2.687185764312744 }, { "auxiliary_loss_clip": 0.01304496, "auxiliary_loss_mlp": 0.00283624, "balance_loss_clip": 1.0750159, "balance_loss_mlp": 0.25917432, "epoch": 0.5475424620471967, "flos": 18004174381440.0, "grad_norm": 7.104137335639339, "language_loss": 0.77810621, "learning_rate": 1.7894971036710322e-06, "loss": 0.79398739, "num_input_tokens_seen": 196285585, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.24450684, "step": 9107, "time_per_iteration": 2.646055221557617 }, { "auxiliary_loss_clip": 0.01277716, "auxiliary_loss_mlp": 0.00298603, "balance_loss_clip": 1.05283415, "balance_loss_mlp": 0.27184021, "epoch": 0.5476025852998647, "flos": 22309324922880.0, "grad_norm": 5.432447753472254, "language_loss": 0.74298769, "learning_rate": 1.789109809193197e-06, "loss": 0.75875086, "num_input_tokens_seen": 196305085, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.26757812, "step": 9108, "time_per_iteration": 2.62410044670105 }, { "auxiliary_loss_clip": 0.01274328, "auxiliary_loss_mlp": 0.00287446, "balance_loss_clip": 1.05183685, "balance_loss_mlp": 0.2622692, "epoch": 0.5476627085525327, "flos": 20120497850880.0, "grad_norm": 9.714372886837825, "language_loss": 0.81373638, "learning_rate": 1.7887225227123396e-06, "loss": 0.82935405, "num_input_tokens_seen": 196323945, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25170898, "step": 9109, "time_per_iteration": 2.634201765060425 }, { "auxiliary_loss_clip": 0.01283459, "auxiliary_loss_mlp": 0.00294748, "balance_loss_clip": 1.05967534, "balance_loss_mlp": 0.27066767, "epoch": 0.5477228318052006, "flos": 17712579772800.0, "grad_norm": 5.5680323667355704, "language_loss": 0.85468125, "learning_rate": 1.7883352442431457e-06, "loss": 0.87046325, "num_input_tokens_seen": 196342200, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.2409668, "step": 9110, "time_per_iteration": 2.7130260467529297 }, { "auxiliary_loss_clip": 0.01270492, "auxiliary_loss_mlp": 0.00271219, "balance_loss_clip": 1.05132496, "balance_loss_mlp": 0.24768718, "epoch": 0.5477829550578687, "flos": 25848895962240.0, "grad_norm": 52.80439660224323, "language_loss": 0.77784771, "learning_rate": 1.7879479738002993e-06, "loss": 0.79326481, "num_input_tokens_seen": 196362940, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.23547363, "step": 9111, "time_per_iteration": 2.686607837677002 }, { "auxiliary_loss_clip": 0.01285088, "auxiliary_loss_mlp": 0.00266405, "balance_loss_clip": 1.05926514, "balance_loss_mlp": 0.24170466, "epoch": 0.5478430783105366, "flos": 23039676161280.0, "grad_norm": 7.91922279566762, "language_loss": 0.78749841, "learning_rate": 1.7875607113984876e-06, "loss": 0.80301338, "num_input_tokens_seen": 196383070, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.24719238, "step": 9112, "time_per_iteration": 2.6667351722717285 }, { "auxiliary_loss_clip": 0.01290002, "auxiliary_loss_mlp": 0.00287425, "balance_loss_clip": 1.06130409, "balance_loss_mlp": 0.26202166, "epoch": 0.5479032015632046, "flos": 16071210864000.0, "grad_norm": 23.936475040256123, "language_loss": 0.972332, "learning_rate": 1.7871734570523953e-06, "loss": 0.98810625, "num_input_tokens_seen": 196398485, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.25415039, "step": 9113, "time_per_iteration": 2.725208282470703 }, { "auxiliary_loss_clip": 0.01280551, "auxiliary_loss_mlp": 0.0028692, "balance_loss_clip": 1.05597591, "balance_loss_mlp": 0.26162419, "epoch": 0.5479633248158725, "flos": 24278701852800.0, "grad_norm": 2.557880780959388, "language_loss": 0.78159702, "learning_rate": 1.7867862107767067e-06, "loss": 0.79727179, "num_input_tokens_seen": 196417725, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.25317383, "step": 9114, "time_per_iteration": 2.6955180168151855 }, { "auxiliary_loss_clip": 0.01284236, "auxiliary_loss_mlp": 0.00299719, "balance_loss_clip": 1.05626631, "balance_loss_mlp": 0.27622244, "epoch": 0.5480234480685405, "flos": 26358216860160.0, "grad_norm": 4.821109478195481, "language_loss": 0.77660894, "learning_rate": 1.7863989725861066e-06, "loss": 0.79244852, "num_input_tokens_seen": 196437840, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.23474121, "step": 9115, "time_per_iteration": 2.669281005859375 }, { "auxiliary_loss_clip": 0.01280736, "auxiliary_loss_mlp": 0.00296184, "balance_loss_clip": 1.05230951, "balance_loss_mlp": 0.26876578, "epoch": 0.5480835713212084, "flos": 22055077480320.0, "grad_norm": 13.988057091942085, "language_loss": 0.80750829, "learning_rate": 1.7860117424952781e-06, "loss": 0.82327747, "num_input_tokens_seen": 196457300, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.27416992, "step": 9116, "time_per_iteration": 2.716392755508423 }, { "auxiliary_loss_clip": 0.01282077, "auxiliary_loss_mlp": 0.00297805, "balance_loss_clip": 1.05808234, "balance_loss_mlp": 0.27385592, "epoch": 0.5481436945738765, "flos": 25301042749440.0, "grad_norm": 3.087079955460182, "language_loss": 0.8344872, "learning_rate": 1.7856245205189063e-06, "loss": 0.85028601, "num_input_tokens_seen": 196476720, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.23974609, "step": 9117, "time_per_iteration": 2.6997416019439697 }, { "auxiliary_loss_clip": 0.01266458, "auxiliary_loss_mlp": 0.00288421, "balance_loss_clip": 1.04847336, "balance_loss_mlp": 0.2648648, "epoch": 0.5482038178265444, "flos": 33580857772800.0, "grad_norm": 10.22660061945206, "language_loss": 0.69425011, "learning_rate": 1.785237306671674e-06, "loss": 0.70979893, "num_input_tokens_seen": 196496765, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.2355957, "step": 9118, "time_per_iteration": 2.7307722568511963 }, { "auxiliary_loss_clip": 0.013143, "auxiliary_loss_mlp": 0.0029331, "balance_loss_clip": 1.07650435, "balance_loss_mlp": 0.26737007, "epoch": 0.5482639410792124, "flos": 19026192055680.0, "grad_norm": 4.1170681863119585, "language_loss": 0.85673237, "learning_rate": 1.7848501009682646e-06, "loss": 0.87280846, "num_input_tokens_seen": 196516220, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.25939941, "step": 9119, "time_per_iteration": 2.6092536449432373 }, { "auxiliary_loss_clip": 0.01262711, "auxiliary_loss_mlp": 0.00303306, "balance_loss_clip": 1.04587007, "balance_loss_mlp": 0.27909419, "epoch": 0.5483240643318803, "flos": 25410318900480.0, "grad_norm": 3.93295767342818, "language_loss": 0.88129234, "learning_rate": 1.7844629034233604e-06, "loss": 0.89695251, "num_input_tokens_seen": 196533860, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24206543, "step": 9120, "time_per_iteration": 2.6448616981506348 }, { "auxiliary_loss_clip": 0.01272282, "auxiliary_loss_mlp": 0.0029982, "balance_loss_clip": 1.05171394, "balance_loss_mlp": 0.27501267, "epoch": 0.5483841875845483, "flos": 21466896272640.0, "grad_norm": 144.3474259681268, "language_loss": 0.88362586, "learning_rate": 1.7840757140516455e-06, "loss": 0.89934677, "num_input_tokens_seen": 196551305, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.24816895, "step": 9121, "time_per_iteration": 2.6565299034118652 }, { "auxiliary_loss_clip": 0.01273035, "auxiliary_loss_mlp": 0.00328627, "balance_loss_clip": 1.04824734, "balance_loss_mlp": 0.30368811, "epoch": 0.5484443108372163, "flos": 24747263792640.0, "grad_norm": 29.258307856595916, "language_loss": 0.68601632, "learning_rate": 1.7836885328678008e-06, "loss": 0.70203292, "num_input_tokens_seen": 196569420, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.24963379, "step": 9122, "time_per_iteration": 2.6595962047576904 }, { "auxiliary_loss_clip": 0.01292348, "auxiliary_loss_mlp": 0.00310593, "balance_loss_clip": 1.06834745, "balance_loss_mlp": 0.28623879, "epoch": 0.5485044340898843, "flos": 25375377945600.0, "grad_norm": 205.1266470812938, "language_loss": 0.78076065, "learning_rate": 1.7833013598865084e-06, "loss": 0.79679012, "num_input_tokens_seen": 196590610, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.24353027, "step": 9123, "time_per_iteration": 2.6486737728118896 }, { "auxiliary_loss_clip": 0.01314406, "auxiliary_loss_mlp": 0.00306953, "balance_loss_clip": 1.08778679, "balance_loss_mlp": 0.2821219, "epoch": 0.5485645573425523, "flos": 12641167370880.0, "grad_norm": 8.621156912917927, "language_loss": 0.89834213, "learning_rate": 1.7829141951224505e-06, "loss": 0.91455579, "num_input_tokens_seen": 196606495, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.24841309, "step": 9124, "time_per_iteration": 2.602525234222412 }, { "auxiliary_loss_clip": 0.01301095, "auxiliary_loss_mlp": 0.00291905, "balance_loss_clip": 1.07207584, "balance_loss_mlp": 0.2687546, "epoch": 0.5486246805952202, "flos": 28329425383680.0, "grad_norm": 4.669355929212493, "language_loss": 0.86606896, "learning_rate": 1.7825270385903075e-06, "loss": 0.88199902, "num_input_tokens_seen": 196626365, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.23168945, "step": 9125, "time_per_iteration": 2.669339418411255 }, { "auxiliary_loss_clip": 0.01301777, "auxiliary_loss_mlp": 0.0030043, "balance_loss_clip": 1.07182312, "balance_loss_mlp": 0.27414441, "epoch": 0.5486848038478882, "flos": 16800017817600.0, "grad_norm": 6.516661330397307, "language_loss": 0.84544933, "learning_rate": 1.7821398903047617e-06, "loss": 0.86147135, "num_input_tokens_seen": 196644465, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.26281738, "step": 9126, "time_per_iteration": 2.6191372871398926 }, { "auxiliary_loss_clip": 0.01290388, "auxiliary_loss_mlp": 0.00304524, "balance_loss_clip": 1.06338656, "balance_loss_mlp": 0.27832198, "epoch": 0.5487449271005561, "flos": 17236224581760.0, "grad_norm": 29.94944045710538, "language_loss": 0.78144944, "learning_rate": 1.7817527502804928e-06, "loss": 0.79739851, "num_input_tokens_seen": 196659160, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.26196289, "step": 9127, "time_per_iteration": 2.593217372894287 }, { "auxiliary_loss_clip": 0.01306705, "auxiliary_loss_mlp": 0.00335745, "balance_loss_clip": 1.07472849, "balance_loss_mlp": 0.30789781, "epoch": 0.5488050503532241, "flos": 17340867878400.0, "grad_norm": 2.672175437555318, "language_loss": 0.89926326, "learning_rate": 1.781365618532181e-06, "loss": 0.9156878, "num_input_tokens_seen": 196677410, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.27844238, "step": 9128, "time_per_iteration": 2.651442289352417 }, { "auxiliary_loss_clip": 0.01313337, "auxiliary_loss_mlp": 0.00322787, "balance_loss_clip": 1.08180988, "balance_loss_mlp": 0.29597634, "epoch": 0.548865173605892, "flos": 17239169496960.0, "grad_norm": 62.027072442041074, "language_loss": 0.80828667, "learning_rate": 1.7809784950745078e-06, "loss": 0.82464796, "num_input_tokens_seen": 196696765, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.26831055, "step": 9129, "time_per_iteration": 2.6827189922332764 }, { "auxiliary_loss_clip": 0.01329655, "auxiliary_loss_mlp": 0.00314936, "balance_loss_clip": 1.09533834, "balance_loss_mlp": 0.2869457, "epoch": 0.5489252968585601, "flos": 17456716218240.0, "grad_norm": 3.9277953150102674, "language_loss": 0.76564896, "learning_rate": 1.7805913799221511e-06, "loss": 0.78209484, "num_input_tokens_seen": 196714895, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.27990723, "step": 9130, "time_per_iteration": 2.656524658203125 }, { "auxiliary_loss_clip": 0.01305543, "auxiliary_loss_mlp": 0.00295275, "balance_loss_clip": 1.07727206, "balance_loss_mlp": 0.26914448, "epoch": 0.548985420111228, "flos": 26323383646080.0, "grad_norm": 3.495801944024294, "language_loss": 0.71826047, "learning_rate": 1.7802042730897915e-06, "loss": 0.73426861, "num_input_tokens_seen": 196735510, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.26135254, "step": 9131, "time_per_iteration": 2.7065329551696777 }, { "auxiliary_loss_clip": 0.01314302, "auxiliary_loss_mlp": 0.00303627, "balance_loss_clip": 1.0880512, "balance_loss_mlp": 0.27521926, "epoch": 0.549045543363896, "flos": 18693730748160.0, "grad_norm": 21.692299471016742, "language_loss": 0.83122849, "learning_rate": 1.7798171745921084e-06, "loss": 0.84740782, "num_input_tokens_seen": 196752855, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.28393555, "step": 9132, "time_per_iteration": 2.600705146789551 }, { "auxiliary_loss_clip": 0.0130572, "auxiliary_loss_mlp": 0.0030376, "balance_loss_clip": 1.07445431, "balance_loss_mlp": 0.27839184, "epoch": 0.5491056666165639, "flos": 24717386655360.0, "grad_norm": 5.489185774581199, "language_loss": 0.89452994, "learning_rate": 1.7794300844437795e-06, "loss": 0.91062468, "num_input_tokens_seen": 196772230, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.25354004, "step": 9133, "time_per_iteration": 2.699733257293701 }, { "auxiliary_loss_clip": 0.01299117, "auxiliary_loss_mlp": 0.00306885, "balance_loss_clip": 1.07030737, "balance_loss_mlp": 0.28080219, "epoch": 0.5491657898692319, "flos": 21576926609280.0, "grad_norm": 4.231228282388851, "language_loss": 0.78270668, "learning_rate": 1.7790430026594841e-06, "loss": 0.79876667, "num_input_tokens_seen": 196790405, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.26074219, "step": 9134, "time_per_iteration": 2.6311042308807373 }, { "auxiliary_loss_clip": 0.01335258, "auxiliary_loss_mlp": 0.00321752, "balance_loss_clip": 1.10035706, "balance_loss_mlp": 0.29444104, "epoch": 0.5492259131219, "flos": 50476432746240.0, "grad_norm": 4.349115837916499, "language_loss": 0.66882795, "learning_rate": 1.7786559292539004e-06, "loss": 0.68539804, "num_input_tokens_seen": 196813785, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.27319336, "step": 9135, "time_per_iteration": 2.890806198120117 }, { "auxiliary_loss_clip": 0.01312901, "auxiliary_loss_mlp": 0.00300633, "balance_loss_clip": 1.07737947, "balance_loss_mlp": 0.27265453, "epoch": 0.5492860363745679, "flos": 25119262995840.0, "grad_norm": 4.737822579003588, "language_loss": 0.83067787, "learning_rate": 1.7782688642417058e-06, "loss": 0.8468132, "num_input_tokens_seen": 196834390, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.27978516, "step": 9136, "time_per_iteration": 4.102610111236572 }, { "auxiliary_loss_clip": 0.01329257, "auxiliary_loss_mlp": 0.00313336, "balance_loss_clip": 1.0939492, "balance_loss_mlp": 0.28510767, "epoch": 0.5493461596272359, "flos": 22633777497600.0, "grad_norm": 14.639623698518033, "language_loss": 0.76795167, "learning_rate": 1.7778818076375781e-06, "loss": 0.78437763, "num_input_tokens_seen": 196853290, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.28271484, "step": 9137, "time_per_iteration": 2.7470905780792236 }, { "auxiliary_loss_clip": 0.01276138, "auxiliary_loss_mlp": 0.0013099, "balance_loss_clip": 1.12016904, "balance_loss_mlp": 0.12302645, "epoch": 0.5494062828799038, "flos": 66151800754560.0, "grad_norm": 0.7306641986291927, "language_loss": 0.64501464, "learning_rate": 1.7774947594561947e-06, "loss": 0.65908593, "num_input_tokens_seen": 196913120, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.07958984, "step": 9138, "time_per_iteration": 4.722402334213257 }, { "auxiliary_loss_clip": 0.01339702, "auxiliary_loss_mlp": 0.00301864, "balance_loss_clip": 1.1005367, "balance_loss_mlp": 0.27314654, "epoch": 0.5494664061325718, "flos": 21105958458240.0, "grad_norm": 8.780098686792693, "language_loss": 0.81385732, "learning_rate": 1.7771077197122321e-06, "loss": 0.83027291, "num_input_tokens_seen": 196931530, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.28735352, "step": 9139, "time_per_iteration": 2.6796655654907227 }, { "auxiliary_loss_clip": 0.01313972, "auxiliary_loss_mlp": 0.00267864, "balance_loss_clip": 1.08418334, "balance_loss_mlp": 0.24423666, "epoch": 0.5495265293852397, "flos": 14392566616320.0, "grad_norm": 8.27195513624354, "language_loss": 0.80774897, "learning_rate": 1.7767206884203672e-06, "loss": 0.82356733, "num_input_tokens_seen": 196949430, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.23657227, "step": 9140, "time_per_iteration": 3.9989945888519287 }, { "auxiliary_loss_clip": 0.01308288, "auxiliary_loss_mlp": 0.00274622, "balance_loss_clip": 1.07937431, "balance_loss_mlp": 0.24918295, "epoch": 0.5495866526379077, "flos": 25549148966400.0, "grad_norm": 12.440150855391586, "language_loss": 0.85307562, "learning_rate": 1.7763336655952762e-06, "loss": 0.86890471, "num_input_tokens_seen": 196968265, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.2545166, "step": 9141, "time_per_iteration": 2.658358573913574 }, { "auxiliary_loss_clip": 0.01306237, "auxiliary_loss_mlp": 0.0030661, "balance_loss_clip": 1.08368325, "balance_loss_mlp": 0.27978802, "epoch": 0.5496467758905756, "flos": 21317256213120.0, "grad_norm": 154.09137623612423, "language_loss": 0.81037712, "learning_rate": 1.7759466512516346e-06, "loss": 0.82650554, "num_input_tokens_seen": 196984930, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26794434, "step": 9142, "time_per_iteration": 2.6280899047851562 }, { "auxiliary_loss_clip": 0.01334419, "auxiliary_loss_mlp": 0.00272616, "balance_loss_clip": 1.09971356, "balance_loss_mlp": 0.24493527, "epoch": 0.5497068991432437, "flos": 22233086305920.0, "grad_norm": 15.312549782806851, "language_loss": 0.84078789, "learning_rate": 1.7755596454041192e-06, "loss": 0.85685825, "num_input_tokens_seen": 197002320, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.27709961, "step": 9143, "time_per_iteration": 2.6464383602142334 }, { "auxiliary_loss_clip": 0.01319451, "auxiliary_loss_mlp": 0.00301205, "balance_loss_clip": 1.09562302, "balance_loss_mlp": 0.2732389, "epoch": 0.5497670223959116, "flos": 18479093028480.0, "grad_norm": 5.816449106336075, "language_loss": 0.89898193, "learning_rate": 1.7751726480674044e-06, "loss": 0.91518843, "num_input_tokens_seen": 197020825, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.27966309, "step": 9144, "time_per_iteration": 2.606168270111084 }, { "auxiliary_loss_clip": 0.01326159, "auxiliary_loss_mlp": 0.00304636, "balance_loss_clip": 1.0966785, "balance_loss_mlp": 0.27651471, "epoch": 0.5498271456485796, "flos": 29205107049600.0, "grad_norm": 2.8367914741648463, "language_loss": 0.78123039, "learning_rate": 1.7747856592561645e-06, "loss": 0.7975384, "num_input_tokens_seen": 197040450, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.28137207, "step": 9145, "time_per_iteration": 4.055703639984131 }, { "auxiliary_loss_clip": 0.01315917, "auxiliary_loss_mlp": 0.00292988, "balance_loss_clip": 1.09321308, "balance_loss_mlp": 0.26761991, "epoch": 0.5498872689012475, "flos": 34824372664320.0, "grad_norm": 2.94192085734235, "language_loss": 0.77478302, "learning_rate": 1.774398678985076e-06, "loss": 0.79087204, "num_input_tokens_seen": 197063930, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25390625, "step": 9146, "time_per_iteration": 2.746192693710327 }, { "auxiliary_loss_clip": 0.01308516, "auxiliary_loss_mlp": 0.00264499, "balance_loss_clip": 1.08429635, "balance_loss_mlp": 0.23942924, "epoch": 0.5499473921539155, "flos": 25921938268800.0, "grad_norm": 20.23979727648267, "language_loss": 0.70443743, "learning_rate": 1.7740117072688113e-06, "loss": 0.72016764, "num_input_tokens_seen": 197082660, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.25048828, "step": 9147, "time_per_iteration": 2.6413230895996094 }, { "auxiliary_loss_clip": 0.0133642, "auxiliary_loss_mlp": 0.00279675, "balance_loss_clip": 1.10477388, "balance_loss_mlp": 0.25167239, "epoch": 0.5500075154065835, "flos": 22273701609600.0, "grad_norm": 66.76773150809488, "language_loss": 0.88238966, "learning_rate": 1.7736247441220458e-06, "loss": 0.89855063, "num_input_tokens_seen": 197100675, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.28015137, "step": 9148, "time_per_iteration": 2.6270999908447266 }, { "auxiliary_loss_clip": 0.01339512, "auxiliary_loss_mlp": 0.00289915, "balance_loss_clip": 1.10793793, "balance_loss_mlp": 0.26079232, "epoch": 0.5500676386592515, "flos": 28037507552640.0, "grad_norm": 4.927443629920093, "language_loss": 0.86861253, "learning_rate": 1.773237789559453e-06, "loss": 0.88490689, "num_input_tokens_seen": 197121320, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.2911377, "step": 9149, "time_per_iteration": 2.71547532081604 }, { "auxiliary_loss_clip": 0.0131026, "auxiliary_loss_mlp": 0.0028088, "balance_loss_clip": 1.08838749, "balance_loss_mlp": 0.25511879, "epoch": 0.5501277619119195, "flos": 23914819123200.0, "grad_norm": 12.093015967114729, "language_loss": 0.80577302, "learning_rate": 1.7728508435957052e-06, "loss": 0.82168442, "num_input_tokens_seen": 197138965, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.25720215, "step": 9150, "time_per_iteration": 2.649186134338379 }, { "auxiliary_loss_clip": 0.01324079, "auxiliary_loss_mlp": 0.00285789, "balance_loss_clip": 1.09371734, "balance_loss_mlp": 0.25708386, "epoch": 0.5501878851645874, "flos": 20923783655040.0, "grad_norm": 4.2247317099975445, "language_loss": 0.82995677, "learning_rate": 1.772463906245477e-06, "loss": 0.84605545, "num_input_tokens_seen": 197156460, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.28723145, "step": 9151, "time_per_iteration": 2.6268341541290283 }, { "auxiliary_loss_clip": 0.01310582, "auxiliary_loss_mlp": 0.00312786, "balance_loss_clip": 1.08797705, "balance_loss_mlp": 0.28524864, "epoch": 0.5502480084172554, "flos": 20665298407680.0, "grad_norm": 11.390696422408856, "language_loss": 0.81993502, "learning_rate": 1.7720769775234394e-06, "loss": 0.83616877, "num_input_tokens_seen": 197175140, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.27563477, "step": 9152, "time_per_iteration": 2.6459035873413086 }, { "auxiliary_loss_clip": 0.01302966, "auxiliary_loss_mlp": 0.00257982, "balance_loss_clip": 1.0830009, "balance_loss_mlp": 0.23334172, "epoch": 0.5503081316699233, "flos": 26432552056320.0, "grad_norm": 11.16006621609473, "language_loss": 0.89670479, "learning_rate": 1.7716900574442662e-06, "loss": 0.9123143, "num_input_tokens_seen": 197194345, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.24633789, "step": 9153, "time_per_iteration": 2.7046430110931396 }, { "auxiliary_loss_clip": 0.01306239, "auxiliary_loss_mlp": 0.00240547, "balance_loss_clip": 1.08520174, "balance_loss_mlp": 0.21457113, "epoch": 0.5503682549225913, "flos": 30629144718720.0, "grad_norm": 2.518057211344147, "language_loss": 0.8193754, "learning_rate": 1.7713031460226294e-06, "loss": 0.83484328, "num_input_tokens_seen": 197215535, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25964355, "step": 9154, "time_per_iteration": 2.695378303527832 }, { "auxiliary_loss_clip": 0.01312724, "auxiliary_loss_mlp": 0.00295995, "balance_loss_clip": 1.08536291, "balance_loss_mlp": 0.26720622, "epoch": 0.5504283781752592, "flos": 22565439872640.0, "grad_norm": 10.63711889857913, "language_loss": 0.80629647, "learning_rate": 1.770916243273199e-06, "loss": 0.82238364, "num_input_tokens_seen": 197234945, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.28771973, "step": 9155, "time_per_iteration": 2.7402281761169434 }, { "auxiliary_loss_clip": 0.01291636, "auxiliary_loss_mlp": 0.00102056, "balance_loss_clip": 1.14076102, "balance_loss_mlp": 0.09523746, "epoch": 0.5504885014279273, "flos": 67901009270400.0, "grad_norm": 0.7231750848706211, "language_loss": 0.55107206, "learning_rate": 1.7705293492106483e-06, "loss": 0.56500894, "num_input_tokens_seen": 197302285, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.06835938, "step": 9156, "time_per_iteration": 3.3186023235321045 }, { "auxiliary_loss_clip": 0.01303237, "auxiliary_loss_mlp": 0.00253273, "balance_loss_clip": 1.08293915, "balance_loss_mlp": 0.2276428, "epoch": 0.5505486246805952, "flos": 22450058409600.0, "grad_norm": 12.310861297827715, "language_loss": 0.88834929, "learning_rate": 1.7701424638496475e-06, "loss": 0.90391445, "num_input_tokens_seen": 197321575, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25646973, "step": 9157, "time_per_iteration": 2.7821414470672607 }, { "auxiliary_loss_clip": 0.01322059, "auxiliary_loss_mlp": 0.00285425, "balance_loss_clip": 1.08813512, "balance_loss_mlp": 0.25779179, "epoch": 0.5506087479332632, "flos": 26906896085760.0, "grad_norm": 78.16251524781127, "language_loss": 0.86535513, "learning_rate": 1.7697555872048677e-06, "loss": 0.88142997, "num_input_tokens_seen": 197340255, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.27661133, "step": 9158, "time_per_iteration": 2.6599061489105225 }, { "auxiliary_loss_clip": 0.01323879, "auxiliary_loss_mlp": 0.00235358, "balance_loss_clip": 1.09881949, "balance_loss_mlp": 0.21025217, "epoch": 0.5506688711859311, "flos": 22930256355840.0, "grad_norm": 3.2497958930916906, "language_loss": 0.77387947, "learning_rate": 1.769368719290979e-06, "loss": 0.78947186, "num_input_tokens_seen": 197360360, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.25097656, "step": 9159, "time_per_iteration": 2.713042974472046 }, { "auxiliary_loss_clip": 0.01327442, "auxiliary_loss_mlp": 0.00277876, "balance_loss_clip": 1.09161806, "balance_loss_mlp": 0.24970675, "epoch": 0.5507289944385991, "flos": 29606408772480.0, "grad_norm": 67.29417061694888, "language_loss": 0.78271234, "learning_rate": 1.7689818601226516e-06, "loss": 0.79876554, "num_input_tokens_seen": 197381905, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.28210449, "step": 9160, "time_per_iteration": 2.7726943492889404 }, { "auxiliary_loss_clip": 0.01308601, "auxiliary_loss_mlp": 0.00259991, "balance_loss_clip": 1.0829308, "balance_loss_mlp": 0.23365748, "epoch": 0.5507891176912671, "flos": 15334431091200.0, "grad_norm": 79.45411140024238, "language_loss": 0.79152691, "learning_rate": 1.7685950097145552e-06, "loss": 0.80721283, "num_input_tokens_seen": 197398555, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.2635498, "step": 9161, "time_per_iteration": 2.640010356903076 }, { "auxiliary_loss_clip": 0.01319031, "auxiliary_loss_mlp": 0.0025429, "balance_loss_clip": 1.09149086, "balance_loss_mlp": 0.22709808, "epoch": 0.5508492409439351, "flos": 26578313447040.0, "grad_norm": 4.695628299605547, "language_loss": 0.76584613, "learning_rate": 1.768208168081359e-06, "loss": 0.78157938, "num_input_tokens_seen": 197419630, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.27172852, "step": 9162, "time_per_iteration": 2.714024066925049 }, { "auxiliary_loss_clip": 0.01306231, "auxiliary_loss_mlp": 0.00275691, "balance_loss_clip": 1.08569026, "balance_loss_mlp": 0.24936962, "epoch": 0.5509093641966031, "flos": 25443428261760.0, "grad_norm": 9.085908287400482, "language_loss": 0.89989579, "learning_rate": 1.767821335237733e-06, "loss": 0.91571504, "num_input_tokens_seen": 197438480, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.26293945, "step": 9163, "time_per_iteration": 2.640822649002075 }, { "auxiliary_loss_clip": 0.01315036, "auxiliary_loss_mlp": 0.00243383, "balance_loss_clip": 1.09154272, "balance_loss_mlp": 0.21900505, "epoch": 0.550969487449271, "flos": 18698543170560.0, "grad_norm": 6.295426154411486, "language_loss": 0.86220992, "learning_rate": 1.7674345111983441e-06, "loss": 0.87779415, "num_input_tokens_seen": 197456755, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.24401855, "step": 9164, "time_per_iteration": 2.6175618171691895 }, { "auxiliary_loss_clip": 0.01310204, "auxiliary_loss_mlp": 0.00261869, "balance_loss_clip": 1.08100617, "balance_loss_mlp": 0.23383066, "epoch": 0.551029610701939, "flos": 22708723224960.0, "grad_norm": 8.870982725408156, "language_loss": 0.81305289, "learning_rate": 1.767047695977863e-06, "loss": 0.82877362, "num_input_tokens_seen": 197475530, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.28015137, "step": 9165, "time_per_iteration": 2.6366217136383057 }, { "auxiliary_loss_clip": 0.01301587, "auxiliary_loss_mlp": 0.00256727, "balance_loss_clip": 1.07859814, "balance_loss_mlp": 0.23342201, "epoch": 0.5510897339546069, "flos": 12420496166400.0, "grad_norm": 3.2006937568715688, "language_loss": 0.86190248, "learning_rate": 1.7666608895909563e-06, "loss": 0.87748563, "num_input_tokens_seen": 197490835, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.2331543, "step": 9166, "time_per_iteration": 2.622159719467163 }, { "auxiliary_loss_clip": 0.01308649, "auxiliary_loss_mlp": 0.00270908, "balance_loss_clip": 1.0822196, "balance_loss_mlp": 0.24315622, "epoch": 0.5511498572072749, "flos": 18770579896320.0, "grad_norm": 6.583040284110605, "language_loss": 0.83517063, "learning_rate": 1.7662740920522913e-06, "loss": 0.85096622, "num_input_tokens_seen": 197508770, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.27734375, "step": 9167, "time_per_iteration": 2.6075942516326904 }, { "auxiliary_loss_clip": 0.01297139, "auxiliary_loss_mlp": 0.00273689, "balance_loss_clip": 1.07480359, "balance_loss_mlp": 0.24741569, "epoch": 0.5512099804599428, "flos": 19573326996480.0, "grad_norm": 6.550400884742833, "language_loss": 0.89343631, "learning_rate": 1.7658873033765374e-06, "loss": 0.90914464, "num_input_tokens_seen": 197527340, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26318359, "step": 9168, "time_per_iteration": 2.6348280906677246 }, { "auxiliary_loss_clip": 0.01316508, "auxiliary_loss_mlp": 0.00295906, "balance_loss_clip": 1.08917332, "balance_loss_mlp": 0.26996624, "epoch": 0.5512701037126109, "flos": 26245600744320.0, "grad_norm": 8.583166433478382, "language_loss": 0.76868284, "learning_rate": 1.7655005235783591e-06, "loss": 0.78480697, "num_input_tokens_seen": 197547280, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.25939941, "step": 9169, "time_per_iteration": 2.672537326812744 }, { "auxiliary_loss_clip": 0.01286041, "auxiliary_loss_mlp": 0.00275363, "balance_loss_clip": 1.06722951, "balance_loss_mlp": 0.25156915, "epoch": 0.5513302269652788, "flos": 21945406279680.0, "grad_norm": 42.40284400924623, "language_loss": 0.92545187, "learning_rate": 1.7651137526724251e-06, "loss": 0.94106597, "num_input_tokens_seen": 197565045, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.23791504, "step": 9170, "time_per_iteration": 2.6986801624298096 }, { "auxiliary_loss_clip": 0.01288774, "auxiliary_loss_mlp": 0.00044896, "balance_loss_clip": 1.13132024, "balance_loss_mlp": 0.03960316, "epoch": 0.5513903502179468, "flos": 68235948616320.0, "grad_norm": 0.7856616829754699, "language_loss": 0.59614694, "learning_rate": 1.7647269906734017e-06, "loss": 0.6094836, "num_input_tokens_seen": 197625005, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.05297852, "step": 9171, "time_per_iteration": 3.1536049842834473 }, { "auxiliary_loss_clip": 0.01303453, "auxiliary_loss_mlp": 0.00332118, "balance_loss_clip": 1.07807517, "balance_loss_mlp": 0.30481866, "epoch": 0.5514504734706147, "flos": 18734238311040.0, "grad_norm": 311.2940947127057, "language_loss": 0.78113705, "learning_rate": 1.7643402375959533e-06, "loss": 0.79749274, "num_input_tokens_seen": 197645050, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.27270508, "step": 9172, "time_per_iteration": 2.7170350551605225 }, { "auxiliary_loss_clip": 0.01298283, "auxiliary_loss_mlp": 0.0028134, "balance_loss_clip": 1.07160568, "balance_loss_mlp": 0.25526857, "epoch": 0.5515105967232827, "flos": 22270972176000.0, "grad_norm": 6.594829126546399, "language_loss": 0.83727586, "learning_rate": 1.7639534934547474e-06, "loss": 0.85307205, "num_input_tokens_seen": 197663910, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.26086426, "step": 9173, "time_per_iteration": 2.680867910385132 }, { "auxiliary_loss_clip": 0.01302401, "auxiliary_loss_mlp": 0.0026227, "balance_loss_clip": 1.07288718, "balance_loss_mlp": 0.23619844, "epoch": 0.5515707199759508, "flos": 22557682535040.0, "grad_norm": 5.626932495858452, "language_loss": 0.82390982, "learning_rate": 1.7635667582644484e-06, "loss": 0.83955652, "num_input_tokens_seen": 197681580, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.26098633, "step": 9174, "time_per_iteration": 2.666444778442383 }, { "auxiliary_loss_clip": 0.0129562, "auxiliary_loss_mlp": 0.00274139, "balance_loss_clip": 1.06927323, "balance_loss_mlp": 0.24921221, "epoch": 0.5516308432286187, "flos": 28291072636800.0, "grad_norm": 9.372416410796454, "language_loss": 0.80398566, "learning_rate": 1.7631800320397217e-06, "loss": 0.81968331, "num_input_tokens_seen": 197702095, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.24938965, "step": 9175, "time_per_iteration": 2.726780414581299 }, { "auxiliary_loss_clip": 0.01288879, "auxiliary_loss_mlp": 0.00333016, "balance_loss_clip": 1.06489944, "balance_loss_mlp": 0.3047514, "epoch": 0.5516909664812867, "flos": 18764474584320.0, "grad_norm": 5.497002374256045, "language_loss": 0.77134836, "learning_rate": 1.7627933147952318e-06, "loss": 0.78756732, "num_input_tokens_seen": 197720720, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.28259277, "step": 9176, "time_per_iteration": 2.615628480911255 }, { "auxiliary_loss_clip": 0.01285112, "auxiliary_loss_mlp": 0.00290729, "balance_loss_clip": 1.06417811, "balance_loss_mlp": 0.2651701, "epoch": 0.5517510897339546, "flos": 27740346336000.0, "grad_norm": 7076.325200969735, "language_loss": 0.78793836, "learning_rate": 1.7624066065456435e-06, "loss": 0.80369675, "num_input_tokens_seen": 197741820, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.2557373, "step": 9177, "time_per_iteration": 2.7300925254821777 }, { "auxiliary_loss_clip": 0.01307531, "auxiliary_loss_mlp": 0.00317453, "balance_loss_clip": 1.08051252, "balance_loss_mlp": 0.29058355, "epoch": 0.5518112129866226, "flos": 18404470523520.0, "grad_norm": 14.344623183176186, "language_loss": 0.8654713, "learning_rate": 1.7620199073056204e-06, "loss": 0.88172114, "num_input_tokens_seen": 197759160, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26879883, "step": 9178, "time_per_iteration": 4.1321656703948975 }, { "auxiliary_loss_clip": 0.0130928, "auxiliary_loss_mlp": 0.00314252, "balance_loss_clip": 1.0788511, "balance_loss_mlp": 0.28731048, "epoch": 0.5518713362392905, "flos": 25082670015360.0, "grad_norm": 17.80660709662402, "language_loss": 0.81235468, "learning_rate": 1.761633217089826e-06, "loss": 0.82859004, "num_input_tokens_seen": 197779760, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.26940918, "step": 9179, "time_per_iteration": 2.663072347640991 }, { "auxiliary_loss_clip": 0.01298028, "auxiliary_loss_mlp": 0.00286091, "balance_loss_clip": 1.07008374, "balance_loss_mlp": 0.26102108, "epoch": 0.5519314594919585, "flos": 36538999361280.0, "grad_norm": 4.8476316075375925, "language_loss": 0.75881416, "learning_rate": 1.761246535912924e-06, "loss": 0.77465534, "num_input_tokens_seen": 197801545, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.25085449, "step": 9180, "time_per_iteration": 4.183887958526611 }, { "auxiliary_loss_clip": 0.01292639, "auxiliary_loss_mlp": 0.0029655, "balance_loss_clip": 1.06576657, "balance_loss_mlp": 0.27235037, "epoch": 0.5519915827446265, "flos": 20448613612800.0, "grad_norm": 7.6142765212769135, "language_loss": 0.76859635, "learning_rate": 1.7608598637895776e-06, "loss": 0.7844882, "num_input_tokens_seen": 197820760, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.24194336, "step": 9181, "time_per_iteration": 2.641159772872925 }, { "auxiliary_loss_clip": 0.01295173, "auxiliary_loss_mlp": 0.0031506, "balance_loss_clip": 1.06692433, "balance_loss_mlp": 0.28540075, "epoch": 0.5520517059972945, "flos": 23768052151680.0, "grad_norm": 4.13147972191409, "language_loss": 0.85825861, "learning_rate": 1.7604732007344486e-06, "loss": 0.87436098, "num_input_tokens_seen": 197840195, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.29663086, "step": 9182, "time_per_iteration": 4.2955238819122314 }, { "auxiliary_loss_clip": 0.01298275, "auxiliary_loss_mlp": 0.00289883, "balance_loss_clip": 1.06666315, "balance_loss_mlp": 0.26427692, "epoch": 0.5521118292499624, "flos": 22196457411840.0, "grad_norm": 7.602459589189655, "language_loss": 0.8880716, "learning_rate": 1.7600865467622003e-06, "loss": 0.90395319, "num_input_tokens_seen": 197859475, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.25634766, "step": 9183, "time_per_iteration": 2.6833670139312744 }, { "auxiliary_loss_clip": 0.01274168, "auxiliary_loss_mlp": 0.00302811, "balance_loss_clip": 1.0519805, "balance_loss_mlp": 0.27683565, "epoch": 0.5521719525026304, "flos": 23583291569280.0, "grad_norm": 6.858422757536044, "language_loss": 0.7295841, "learning_rate": 1.7596999018874936e-06, "loss": 0.74535388, "num_input_tokens_seen": 197879395, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25952148, "step": 9184, "time_per_iteration": 2.6711316108703613 }, { "auxiliary_loss_clip": 0.01279558, "auxiliary_loss_mlp": 0.00319993, "balance_loss_clip": 1.05669653, "balance_loss_mlp": 0.29326668, "epoch": 0.5522320757552983, "flos": 26137617482880.0, "grad_norm": 3.0799408093749046, "language_loss": 0.8134203, "learning_rate": 1.7593132661249917e-06, "loss": 0.8294158, "num_input_tokens_seen": 197900815, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.2668457, "step": 9185, "time_per_iteration": 2.652327537536621 }, { "auxiliary_loss_clip": 0.01299563, "auxiliary_loss_mlp": 0.00297628, "balance_loss_clip": 1.072999, "balance_loss_mlp": 0.27073437, "epoch": 0.5522921990079663, "flos": 24676160820480.0, "grad_norm": 10.776513842133722, "language_loss": 0.7986095, "learning_rate": 1.7589266394893536e-06, "loss": 0.81458145, "num_input_tokens_seen": 197918985, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.26879883, "step": 9186, "time_per_iteration": 2.6703619956970215 }, { "auxiliary_loss_clip": 0.01310638, "auxiliary_loss_mlp": 0.00286202, "balance_loss_clip": 1.07704663, "balance_loss_mlp": 0.2595346, "epoch": 0.5523523222606344, "flos": 22748153379840.0, "grad_norm": 15.938584429873675, "language_loss": 0.73417497, "learning_rate": 1.7585400219952421e-06, "loss": 0.75014341, "num_input_tokens_seen": 197937725, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.2668457, "step": 9187, "time_per_iteration": 2.6636083126068115 }, { "auxiliary_loss_clip": 0.0130582, "auxiliary_loss_mlp": 0.00277931, "balance_loss_clip": 1.07700872, "balance_loss_mlp": 0.2523244, "epoch": 0.5524124455133023, "flos": 19755825022080.0, "grad_norm": 11.569137702379281, "language_loss": 0.84710598, "learning_rate": 1.758153413657318e-06, "loss": 0.86294353, "num_input_tokens_seen": 197955635, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.25598145, "step": 9188, "time_per_iteration": 4.030259370803833 }, { "auxiliary_loss_clip": 0.01317512, "auxiliary_loss_mlp": 0.00315332, "balance_loss_clip": 1.08652353, "balance_loss_mlp": 0.2871156, "epoch": 0.5524725687659703, "flos": 23294821443840.0, "grad_norm": 17.7918669835789, "language_loss": 0.89864779, "learning_rate": 1.7577668144902394e-06, "loss": 0.91497624, "num_input_tokens_seen": 197974490, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.28222656, "step": 9189, "time_per_iteration": 2.658393144607544 }, { "auxiliary_loss_clip": 0.01314498, "auxiliary_loss_mlp": 0.00297572, "balance_loss_clip": 1.08721638, "balance_loss_mlp": 0.27139345, "epoch": 0.5525326920186382, "flos": 24862178378880.0, "grad_norm": 5.13103634148809, "language_loss": 0.82691884, "learning_rate": 1.7573802245086684e-06, "loss": 0.84303945, "num_input_tokens_seen": 197995735, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.26184082, "step": 9190, "time_per_iteration": 2.703795909881592 }, { "auxiliary_loss_clip": 0.01308368, "auxiliary_loss_mlp": 0.00283732, "balance_loss_clip": 1.07535601, "balance_loss_mlp": 0.2555275, "epoch": 0.5525928152713062, "flos": 13735580906880.0, "grad_norm": 20.133438063822055, "language_loss": 0.89702153, "learning_rate": 1.7569936437272627e-06, "loss": 0.91294253, "num_input_tokens_seen": 198009685, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.28198242, "step": 9191, "time_per_iteration": 2.586456298828125 }, { "auxiliary_loss_clip": 0.01299897, "auxiliary_loss_mlp": 0.00277749, "balance_loss_clip": 1.07175922, "balance_loss_mlp": 0.25183266, "epoch": 0.5526529385239741, "flos": 13071592045440.0, "grad_norm": 1696.9812322580615, "language_loss": 0.7799781, "learning_rate": 1.7566070721606829e-06, "loss": 0.79575455, "num_input_tokens_seen": 198026845, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.25927734, "step": 9192, "time_per_iteration": 2.595831871032715 }, { "auxiliary_loss_clip": 0.01295959, "auxiliary_loss_mlp": 0.00263781, "balance_loss_clip": 1.07334089, "balance_loss_mlp": 0.23931964, "epoch": 0.5527130617766421, "flos": 23148377694720.0, "grad_norm": 1.875432346121547, "language_loss": 0.81507015, "learning_rate": 1.756220509823588e-06, "loss": 0.8306675, "num_input_tokens_seen": 198045275, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.24438477, "step": 9193, "time_per_iteration": 2.65450119972229 }, { "auxiliary_loss_clip": 0.01282837, "auxiliary_loss_mlp": 0.00275001, "balance_loss_clip": 1.06110895, "balance_loss_mlp": 0.25076622, "epoch": 0.55277318502931, "flos": 21285547482240.0, "grad_norm": 39.78071158914756, "language_loss": 0.85314852, "learning_rate": 1.7558339567306344e-06, "loss": 0.86872691, "num_input_tokens_seen": 198065760, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.24243164, "step": 9194, "time_per_iteration": 2.68658447265625 }, { "auxiliary_loss_clip": 0.01302903, "auxiliary_loss_mlp": 0.00251511, "balance_loss_clip": 1.06771278, "balance_loss_mlp": 0.22573763, "epoch": 0.5528333082819781, "flos": 38324549462400.0, "grad_norm": 5.887565768140376, "language_loss": 0.74963689, "learning_rate": 1.7554474128964825e-06, "loss": 0.76518101, "num_input_tokens_seen": 198087595, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.25769043, "step": 9195, "time_per_iteration": 2.7840492725372314 }, { "auxiliary_loss_clip": 0.01299972, "auxiliary_loss_mlp": 0.00293323, "balance_loss_clip": 1.06963313, "balance_loss_mlp": 0.26484391, "epoch": 0.552893431534646, "flos": 13553621585280.0, "grad_norm": 24.154448103604256, "language_loss": 0.8197732, "learning_rate": 1.7550608783357887e-06, "loss": 0.83570617, "num_input_tokens_seen": 198104620, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.28491211, "step": 9196, "time_per_iteration": 2.6103553771972656 }, { "auxiliary_loss_clip": 0.01281928, "auxiliary_loss_mlp": 0.00265965, "balance_loss_clip": 1.06391847, "balance_loss_mlp": 0.2416102, "epoch": 0.552953554787314, "flos": 21939408708480.0, "grad_norm": 3.754804846064917, "language_loss": 0.82757616, "learning_rate": 1.7546743530632115e-06, "loss": 0.84305507, "num_input_tokens_seen": 198123565, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24328613, "step": 9197, "time_per_iteration": 2.6507315635681152 }, { "auxiliary_loss_clip": 0.01284126, "auxiliary_loss_mlp": 0.00266478, "balance_loss_clip": 1.06308651, "balance_loss_mlp": 0.24076498, "epoch": 0.5530136780399819, "flos": 43658002558080.0, "grad_norm": 2.3452844179867545, "language_loss": 0.81950808, "learning_rate": 1.754287837093407e-06, "loss": 0.8350141, "num_input_tokens_seen": 198148270, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25720215, "step": 9198, "time_per_iteration": 2.851803779602051 }, { "auxiliary_loss_clip": 0.01284614, "auxiliary_loss_mlp": 0.00271029, "balance_loss_clip": 1.06321645, "balance_loss_mlp": 0.24584031, "epoch": 0.5530738012926499, "flos": 25045502417280.0, "grad_norm": 7.357158403293689, "language_loss": 0.8351658, "learning_rate": 1.7539013304410327e-06, "loss": 0.85072219, "num_input_tokens_seen": 198168810, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.25183105, "step": 9199, "time_per_iteration": 2.6823856830596924 }, { "auxiliary_loss_clip": 0.01280425, "auxiliary_loss_mlp": 0.00263957, "balance_loss_clip": 1.05979538, "balance_loss_mlp": 0.23982856, "epoch": 0.553133924545318, "flos": 16472081623680.0, "grad_norm": 2.4453530526082377, "language_loss": 0.69657421, "learning_rate": 1.7535148331207443e-06, "loss": 0.71201801, "num_input_tokens_seen": 198186200, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24145508, "step": 9200, "time_per_iteration": 2.6146469116210938 }, { "auxiliary_loss_clip": 0.01319395, "auxiliary_loss_mlp": 0.00247713, "balance_loss_clip": 1.07931364, "balance_loss_mlp": 0.22103378, "epoch": 0.5531940477979859, "flos": 24606207083520.0, "grad_norm": 409.89672010096854, "language_loss": 0.72058856, "learning_rate": 1.7531283451471978e-06, "loss": 0.73625964, "num_input_tokens_seen": 198207050, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.26733398, "step": 9201, "time_per_iteration": 2.688994884490967 }, { "auxiliary_loss_clip": 0.01287423, "auxiliary_loss_mlp": 0.00257291, "balance_loss_clip": 1.06141686, "balance_loss_mlp": 0.23298393, "epoch": 0.5532541710506539, "flos": 22159577122560.0, "grad_norm": 5.700559431946877, "language_loss": 0.69297636, "learning_rate": 1.7527418665350502e-06, "loss": 0.7084235, "num_input_tokens_seen": 198224565, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.2434082, "step": 9202, "time_per_iteration": 2.63075852394104 }, { "auxiliary_loss_clip": 0.01293318, "auxiliary_loss_mlp": 0.00266114, "balance_loss_clip": 1.06588149, "balance_loss_mlp": 0.24227206, "epoch": 0.5533142943033218, "flos": 21397265758080.0, "grad_norm": 5.497165486518426, "language_loss": 0.74258363, "learning_rate": 1.7523553972989548e-06, "loss": 0.75817788, "num_input_tokens_seen": 198244790, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.23876953, "step": 9203, "time_per_iteration": 2.6561641693115234 }, { "auxiliary_loss_clip": 0.01312614, "auxiliary_loss_mlp": 0.00296984, "balance_loss_clip": 1.08192325, "balance_loss_mlp": 0.26770645, "epoch": 0.5533744175559898, "flos": 23550541344000.0, "grad_norm": 41.55447942499581, "language_loss": 0.70866883, "learning_rate": 1.7519689374535683e-06, "loss": 0.72476482, "num_input_tokens_seen": 198264375, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.29272461, "step": 9204, "time_per_iteration": 2.779344081878662 }, { "auxiliary_loss_clip": 0.01260999, "auxiliary_loss_mlp": 0.00253924, "balance_loss_clip": 1.04454088, "balance_loss_mlp": 0.23017722, "epoch": 0.5534345408086577, "flos": 24061514267520.0, "grad_norm": 2.9993236311591884, "language_loss": 0.83937734, "learning_rate": 1.7515824870135445e-06, "loss": 0.85452658, "num_input_tokens_seen": 198283895, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.23742676, "step": 9205, "time_per_iteration": 2.6571333408355713 }, { "auxiliary_loss_clip": 0.01283398, "auxiliary_loss_mlp": 0.00266463, "balance_loss_clip": 1.06239486, "balance_loss_mlp": 0.24166755, "epoch": 0.5534946640613257, "flos": 33771831408000.0, "grad_norm": 2.0481990325381805, "language_loss": 0.77680594, "learning_rate": 1.751196045993537e-06, "loss": 0.79230452, "num_input_tokens_seen": 198310035, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.2479248, "step": 9206, "time_per_iteration": 2.858829975128174 }, { "auxiliary_loss_clip": 0.01293321, "auxiliary_loss_mlp": 0.00242714, "balance_loss_clip": 1.06310844, "balance_loss_mlp": 0.21857423, "epoch": 0.5535547873139937, "flos": 15159223526400.0, "grad_norm": 135.5862808096717, "language_loss": 0.83285856, "learning_rate": 1.7508096144082012e-06, "loss": 0.84821892, "num_input_tokens_seen": 198327810, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.24157715, "step": 9207, "time_per_iteration": 2.6402580738067627 }, { "auxiliary_loss_clip": 0.01301652, "auxiliary_loss_mlp": 0.00288865, "balance_loss_clip": 1.06539965, "balance_loss_mlp": 0.26151827, "epoch": 0.5536149105666617, "flos": 16980863817600.0, "grad_norm": 24.950233014735783, "language_loss": 0.76854366, "learning_rate": 1.750423192272189e-06, "loss": 0.78444886, "num_input_tokens_seen": 198343150, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.27331543, "step": 9208, "time_per_iteration": 2.6422386169433594 }, { "auxiliary_loss_clip": 0.01291024, "auxiliary_loss_mlp": 0.00280868, "balance_loss_clip": 1.05791354, "balance_loss_mlp": 0.25371251, "epoch": 0.5536750338193296, "flos": 18149935772160.0, "grad_norm": 283.9500217049027, "language_loss": 0.76866937, "learning_rate": 1.7500367796001547e-06, "loss": 0.7843883, "num_input_tokens_seen": 198360925, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.27172852, "step": 9209, "time_per_iteration": 2.6315619945526123 }, { "auxiliary_loss_clip": 0.01279825, "auxiliary_loss_mlp": 0.00247521, "balance_loss_clip": 1.05261326, "balance_loss_mlp": 0.2236197, "epoch": 0.5537351570719976, "flos": 22747794243840.0, "grad_norm": 9.506883509727857, "language_loss": 0.90178561, "learning_rate": 1.7496503764067513e-06, "loss": 0.917059, "num_input_tokens_seen": 198379265, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.2388916, "step": 9210, "time_per_iteration": 2.6867787837982178 }, { "auxiliary_loss_clip": 0.01277639, "auxiliary_loss_mlp": 0.0023562, "balance_loss_clip": 1.05325425, "balance_loss_mlp": 0.21251678, "epoch": 0.5537952803246655, "flos": 26356026130560.0, "grad_norm": 4.210211972755019, "language_loss": 0.80585515, "learning_rate": 1.74926398270663e-06, "loss": 0.8209877, "num_input_tokens_seen": 198399490, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.23120117, "step": 9211, "time_per_iteration": 2.6875083446502686 }, { "auxiliary_loss_clip": 0.01310045, "auxiliary_loss_mlp": 0.00282415, "balance_loss_clip": 1.07164836, "balance_loss_mlp": 0.25491321, "epoch": 0.5538554035773335, "flos": 18037427397120.0, "grad_norm": 21.04233208146956, "language_loss": 0.76526725, "learning_rate": 1.7488775985144437e-06, "loss": 0.78119189, "num_input_tokens_seen": 198419110, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.27514648, "step": 9212, "time_per_iteration": 2.68575119972229 }, { "auxiliary_loss_clip": 0.01285078, "auxiliary_loss_mlp": 0.00268471, "balance_loss_clip": 1.04949069, "balance_loss_mlp": 0.24293678, "epoch": 0.5539155268300014, "flos": 31686247002240.0, "grad_norm": 72.8970512271501, "language_loss": 0.60455662, "learning_rate": 1.7484912238448443e-06, "loss": 0.62009209, "num_input_tokens_seen": 198441360, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.25561523, "step": 9213, "time_per_iteration": 2.7617528438568115 }, { "auxiliary_loss_clip": 0.01294729, "auxiliary_loss_mlp": 0.00261664, "balance_loss_clip": 1.06296802, "balance_loss_mlp": 0.23560455, "epoch": 0.5539756500826695, "flos": 15193769431680.0, "grad_norm": 35.49012003967723, "language_loss": 0.93366063, "learning_rate": 1.7481048587124827e-06, "loss": 0.94922459, "num_input_tokens_seen": 198459835, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.26049805, "step": 9214, "time_per_iteration": 2.6475625038146973 }, { "auxiliary_loss_clip": 0.0126982, "auxiliary_loss_mlp": 0.00256269, "balance_loss_clip": 1.04361415, "balance_loss_mlp": 0.23140207, "epoch": 0.5540357733353375, "flos": 26353117128960.0, "grad_norm": 18.14883041958605, "language_loss": 0.76768005, "learning_rate": 1.7477185031320108e-06, "loss": 0.78294098, "num_input_tokens_seen": 198478955, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.24902344, "step": 9215, "time_per_iteration": 2.7160451412200928 }, { "auxiliary_loss_clip": 0.01274178, "auxiliary_loss_mlp": 0.00256019, "balance_loss_clip": 1.04567266, "balance_loss_mlp": 0.2314261, "epoch": 0.5540958965880054, "flos": 21323684747520.0, "grad_norm": 3.558281739048892, "language_loss": 0.80919373, "learning_rate": 1.7473321571180773e-06, "loss": 0.82449573, "num_input_tokens_seen": 198499030, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.24609375, "step": 9216, "time_per_iteration": 2.7252049446105957 }, { "auxiliary_loss_clip": 0.01271538, "auxiliary_loss_mlp": 0.00231977, "balance_loss_clip": 1.04536867, "balance_loss_mlp": 0.20803991, "epoch": 0.5541560198406734, "flos": 25666828899840.0, "grad_norm": 5.7950178087104876, "language_loss": 0.78619456, "learning_rate": 1.7469458206853345e-06, "loss": 0.80122966, "num_input_tokens_seen": 198520265, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.23950195, "step": 9217, "time_per_iteration": 2.7029402256011963 }, { "auxiliary_loss_clip": 0.01265325, "auxiliary_loss_mlp": 0.00252823, "balance_loss_clip": 1.04013419, "balance_loss_mlp": 0.22941038, "epoch": 0.5542161430933413, "flos": 21939624190080.0, "grad_norm": 66.22637148297137, "language_loss": 0.83239973, "learning_rate": 1.7465594938484315e-06, "loss": 0.84758127, "num_input_tokens_seen": 198539645, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.23425293, "step": 9218, "time_per_iteration": 2.626368999481201 }, { "auxiliary_loss_clip": 0.01279799, "auxiliary_loss_mlp": 0.00232576, "balance_loss_clip": 1.04962206, "balance_loss_mlp": 0.20733932, "epoch": 0.5542762663460093, "flos": 19571459489280.0, "grad_norm": 2.9384406746854985, "language_loss": 0.79250127, "learning_rate": 1.7461731766220176e-06, "loss": 0.80762506, "num_input_tokens_seen": 198558710, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.2520752, "step": 9219, "time_per_iteration": 2.749870777130127 }, { "auxiliary_loss_clip": 0.01283135, "auxiliary_loss_mlp": 0.00256769, "balance_loss_clip": 1.05510235, "balance_loss_mlp": 0.23225985, "epoch": 0.5543363895986773, "flos": 19499063627520.0, "grad_norm": 5.708720702743817, "language_loss": 0.78888738, "learning_rate": 1.7457868690207426e-06, "loss": 0.80428636, "num_input_tokens_seen": 198577050, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.24499512, "step": 9220, "time_per_iteration": 2.6990456581115723 }, { "auxiliary_loss_clip": 0.01263499, "auxiliary_loss_mlp": 0.00228581, "balance_loss_clip": 1.03953779, "balance_loss_mlp": 0.20640856, "epoch": 0.5543965128513453, "flos": 22635609091200.0, "grad_norm": 9.41310251287739, "language_loss": 0.84294522, "learning_rate": 1.7454005710592547e-06, "loss": 0.85786605, "num_input_tokens_seen": 198595290, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.22180176, "step": 9221, "time_per_iteration": 4.02078652381897 }, { "auxiliary_loss_clip": 0.01280433, "auxiliary_loss_mlp": 0.00244505, "balance_loss_clip": 1.05488694, "balance_loss_mlp": 0.22143817, "epoch": 0.5544566361040132, "flos": 25989952671360.0, "grad_norm": 2.545494935459845, "language_loss": 0.90620995, "learning_rate": 1.7450142827522027e-06, "loss": 0.92145932, "num_input_tokens_seen": 198614110, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.23071289, "step": 9222, "time_per_iteration": 4.19714879989624 }, { "auxiliary_loss_clip": 0.01285416, "auxiliary_loss_mlp": 0.0023476, "balance_loss_clip": 1.05181324, "balance_loss_mlp": 0.20867673, "epoch": 0.5545167593566812, "flos": 28257568225920.0, "grad_norm": 5.459283806475052, "language_loss": 0.85375285, "learning_rate": 1.7446280041142344e-06, "loss": 0.8689546, "num_input_tokens_seen": 198633880, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.2611084, "step": 9223, "time_per_iteration": 2.722905397415161 }, { "auxiliary_loss_clip": 0.01280568, "auxiliary_loss_mlp": 0.00232676, "balance_loss_clip": 1.04729605, "balance_loss_mlp": 0.20765391, "epoch": 0.5545768826093491, "flos": 28476551491200.0, "grad_norm": 4.550986790121821, "language_loss": 0.91364259, "learning_rate": 1.7442417351599986e-06, "loss": 0.92877495, "num_input_tokens_seen": 198653505, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.25036621, "step": 9224, "time_per_iteration": 4.0948100090026855 }, { "auxiliary_loss_clip": 0.0127812, "auxiliary_loss_mlp": 0.00260058, "balance_loss_clip": 1.04994762, "balance_loss_mlp": 0.23500015, "epoch": 0.5546370058620171, "flos": 18478051534080.0, "grad_norm": 76.6802932553396, "language_loss": 0.65972596, "learning_rate": 1.743855475904141e-06, "loss": 0.67510772, "num_input_tokens_seen": 198671890, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.25061035, "step": 9225, "time_per_iteration": 2.650106906890869 }, { "auxiliary_loss_clip": 0.0127469, "auxiliary_loss_mlp": 0.0024432, "balance_loss_clip": 1.04224229, "balance_loss_mlp": 0.21907103, "epoch": 0.554697129114685, "flos": 22930507751040.0, "grad_norm": 23.174754402037742, "language_loss": 0.74313515, "learning_rate": 1.7434692263613098e-06, "loss": 0.75832522, "num_input_tokens_seen": 198691995, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.25244141, "step": 9226, "time_per_iteration": 2.72800874710083 }, { "auxiliary_loss_clip": 0.01248446, "auxiliary_loss_mlp": 0.00240307, "balance_loss_clip": 1.0251472, "balance_loss_mlp": 0.21826571, "epoch": 0.5547572523673531, "flos": 21797166850560.0, "grad_norm": 36.653339989624826, "language_loss": 0.80477083, "learning_rate": 1.7430829865461518e-06, "loss": 0.8196584, "num_input_tokens_seen": 198712440, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.22045898, "step": 9227, "time_per_iteration": 2.645087242126465 }, { "auxiliary_loss_clip": 0.01279353, "auxiliary_loss_mlp": 0.00221815, "balance_loss_clip": 1.04694676, "balance_loss_mlp": 0.19850978, "epoch": 0.5548173756200211, "flos": 22342829333760.0, "grad_norm": 87.03254570716952, "language_loss": 0.79789698, "learning_rate": 1.7426967564733118e-06, "loss": 0.81290871, "num_input_tokens_seen": 198731515, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.23278809, "step": 9228, "time_per_iteration": 2.694439172744751 }, { "auxiliary_loss_clip": 0.01258375, "auxiliary_loss_mlp": 0.00221158, "balance_loss_clip": 1.03548026, "balance_loss_mlp": 0.19871122, "epoch": 0.554877498872689, "flos": 17858736213120.0, "grad_norm": 2.4031143898241343, "language_loss": 0.8329643, "learning_rate": 1.7423105361574373e-06, "loss": 0.8477596, "num_input_tokens_seen": 198749750, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.22460938, "step": 9229, "time_per_iteration": 2.768777847290039 }, { "auxiliary_loss_clip": 0.01256081, "auxiliary_loss_mlp": 0.00238558, "balance_loss_clip": 1.0274744, "balance_loss_mlp": 0.2137509, "epoch": 0.554937622125357, "flos": 17238343484160.0, "grad_norm": 23.592011255432762, "language_loss": 0.77526873, "learning_rate": 1.741924325613172e-06, "loss": 0.79021513, "num_input_tokens_seen": 198768320, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.2479248, "step": 9230, "time_per_iteration": 4.0585081577301025 }, { "auxiliary_loss_clip": 0.01275528, "auxiliary_loss_mlp": 0.00238989, "balance_loss_clip": 1.04269195, "balance_loss_mlp": 0.21543312, "epoch": 0.5549977453780249, "flos": 25368087484800.0, "grad_norm": 10.285513914585666, "language_loss": 0.77531689, "learning_rate": 1.741538124855163e-06, "loss": 0.79046202, "num_input_tokens_seen": 198787230, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.2355957, "step": 9231, "time_per_iteration": 2.670149803161621 }, { "auxiliary_loss_clip": 0.0130962, "auxiliary_loss_mlp": 0.00207972, "balance_loss_clip": 1.07006764, "balance_loss_mlp": 0.18297368, "epoch": 0.555057868630693, "flos": 25079114568960.0, "grad_norm": 110.65626190972614, "language_loss": 0.83714795, "learning_rate": 1.7411519338980548e-06, "loss": 0.85232389, "num_input_tokens_seen": 198806720, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.24975586, "step": 9232, "time_per_iteration": 2.6871633529663086 }, { "auxiliary_loss_clip": 0.01251742, "auxiliary_loss_mlp": 0.00230581, "balance_loss_clip": 1.03297877, "balance_loss_mlp": 0.20771676, "epoch": 0.5551179918833609, "flos": 26104220812800.0, "grad_norm": 3.770693044429687, "language_loss": 0.88847852, "learning_rate": 1.7407657527564898e-06, "loss": 0.90330172, "num_input_tokens_seen": 198826235, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.2286377, "step": 9233, "time_per_iteration": 2.681368827819824 }, { "auxiliary_loss_clip": 0.01290574, "auxiliary_loss_mlp": 0.00223516, "balance_loss_clip": 1.05400813, "balance_loss_mlp": 0.19937646, "epoch": 0.5551781151360289, "flos": 19384759572480.0, "grad_norm": 4.130277217992442, "language_loss": 0.86433923, "learning_rate": 1.7403795814451142e-06, "loss": 0.87948012, "num_input_tokens_seen": 198842655, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.24157715, "step": 9234, "time_per_iteration": 2.61818790435791 }, { "auxiliary_loss_clip": 0.01252772, "auxiliary_loss_mlp": 0.00225525, "balance_loss_clip": 1.02772999, "balance_loss_mlp": 0.20292285, "epoch": 0.5552382383886968, "flos": 21725956137600.0, "grad_norm": 11.356326398298755, "language_loss": 0.73942524, "learning_rate": 1.7399934199785706e-06, "loss": 0.75420821, "num_input_tokens_seen": 198861210, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.22607422, "step": 9235, "time_per_iteration": 2.6087398529052734 }, { "auxiliary_loss_clip": 0.01268927, "auxiliary_loss_mlp": 0.00226014, "balance_loss_clip": 1.04211235, "balance_loss_mlp": 0.20230351, "epoch": 0.5552983616413648, "flos": 14356189117440.0, "grad_norm": 17.627753276999723, "language_loss": 0.76959288, "learning_rate": 1.7396072683715029e-06, "loss": 0.78454232, "num_input_tokens_seen": 198880045, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.23742676, "step": 9236, "time_per_iteration": 2.6975584030151367 }, { "auxiliary_loss_clip": 0.01258838, "auxiliary_loss_mlp": 0.00216938, "balance_loss_clip": 1.03569329, "balance_loss_mlp": 0.19452706, "epoch": 0.5553584848940327, "flos": 25478548784640.0, "grad_norm": 53.435653817117704, "language_loss": 0.92936563, "learning_rate": 1.7392211266385536e-06, "loss": 0.94412345, "num_input_tokens_seen": 198900210, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.22412109, "step": 9237, "time_per_iteration": 2.68105149269104 }, { "auxiliary_loss_clip": 0.01249159, "auxiliary_loss_mlp": 0.00207138, "balance_loss_clip": 1.02541769, "balance_loss_mlp": 0.18445221, "epoch": 0.5554186081467007, "flos": 22163850840960.0, "grad_norm": 3.101721231493904, "language_loss": 0.83523494, "learning_rate": 1.7388349947943652e-06, "loss": 0.8497979, "num_input_tokens_seen": 198919055, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.22680664, "step": 9238, "time_per_iteration": 2.6770336627960205 }, { "auxiliary_loss_clip": 0.01258571, "auxiliary_loss_mlp": 0.00253765, "balance_loss_clip": 1.02894068, "balance_loss_mlp": 0.22890967, "epoch": 0.5554787313993687, "flos": 49746656125440.0, "grad_norm": 36.89862941991988, "language_loss": 0.8660605, "learning_rate": 1.73844887285358e-06, "loss": 0.88118386, "num_input_tokens_seen": 198943505, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.24829102, "step": 9239, "time_per_iteration": 2.878704309463501 }, { "auxiliary_loss_clip": 0.01257896, "auxiliary_loss_mlp": 0.00222221, "balance_loss_clip": 1.0308814, "balance_loss_mlp": 0.20027408, "epoch": 0.5555388546520367, "flos": 22127365601280.0, "grad_norm": 185.7099563300428, "language_loss": 0.86227393, "learning_rate": 1.7380627608308393e-06, "loss": 0.87707508, "num_input_tokens_seen": 198963590, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.21960449, "step": 9240, "time_per_iteration": 2.65285587310791 }, { "auxiliary_loss_clip": 0.01232441, "auxiliary_loss_mlp": 0.00217185, "balance_loss_clip": 1.01421762, "balance_loss_mlp": 0.19474961, "epoch": 0.5555989779047047, "flos": 24682122478080.0, "grad_norm": 6.2581925734171095, "language_loss": 0.71637797, "learning_rate": 1.737676658740786e-06, "loss": 0.73087424, "num_input_tokens_seen": 198982680, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.2244873, "step": 9241, "time_per_iteration": 2.641261577606201 }, { "auxiliary_loss_clip": 0.01271737, "auxiliary_loss_mlp": 0.00199632, "balance_loss_clip": 1.04074371, "balance_loss_mlp": 0.17531367, "epoch": 0.5556591011573726, "flos": 16106510954880.0, "grad_norm": 2.884599129599908, "language_loss": 0.83853716, "learning_rate": 1.7372905665980594e-06, "loss": 0.85325086, "num_input_tokens_seen": 199000185, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.24316406, "step": 9242, "time_per_iteration": 2.628225803375244 }, { "auxiliary_loss_clip": 0.01270095, "auxiliary_loss_mlp": 0.00211615, "balance_loss_clip": 1.03783607, "balance_loss_mlp": 0.18714099, "epoch": 0.5557192244100406, "flos": 12933695733120.0, "grad_norm": 6.823280952117062, "language_loss": 0.75189835, "learning_rate": 1.7369044844173012e-06, "loss": 0.76671541, "num_input_tokens_seen": 199018380, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.24499512, "step": 9243, "time_per_iteration": 2.6669983863830566 }, { "auxiliary_loss_clip": 0.01262229, "auxiliary_loss_mlp": 0.00222876, "balance_loss_clip": 1.03568554, "balance_loss_mlp": 0.19909342, "epoch": 0.5557793476627085, "flos": 23111712887040.0, "grad_norm": 261.25664828508303, "language_loss": 0.83282578, "learning_rate": 1.7365184122131509e-06, "loss": 0.84767687, "num_input_tokens_seen": 199037115, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.23779297, "step": 9244, "time_per_iteration": 2.7297770977020264 }, { "auxiliary_loss_clip": 0.01244171, "auxiliary_loss_mlp": 0.00205387, "balance_loss_clip": 1.02623641, "balance_loss_mlp": 0.1847042, "epoch": 0.5558394709153766, "flos": 21428040735360.0, "grad_norm": 2.4186555317238705, "language_loss": 0.8102212, "learning_rate": 1.7361323500002486e-06, "loss": 0.82471681, "num_input_tokens_seen": 199053375, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.20690918, "step": 9245, "time_per_iteration": 2.6762495040893555 }, { "auxiliary_loss_clip": 0.01268605, "auxiliary_loss_mlp": 0.00232875, "balance_loss_clip": 1.03745246, "balance_loss_mlp": 0.2079601, "epoch": 0.5558995941680445, "flos": 25078324469760.0, "grad_norm": 20.751160905158958, "language_loss": 0.87857181, "learning_rate": 1.7357462977932348e-06, "loss": 0.89358658, "num_input_tokens_seen": 199070930, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.24951172, "step": 9246, "time_per_iteration": 2.6793863773345947 }, { "auxiliary_loss_clip": 0.01261091, "auxiliary_loss_mlp": 0.00217827, "balance_loss_clip": 1.03484726, "balance_loss_mlp": 0.19353242, "epoch": 0.5559597174207125, "flos": 20011149872640.0, "grad_norm": 44.92362551642789, "language_loss": 0.79932368, "learning_rate": 1.7353602556067471e-06, "loss": 0.81411284, "num_input_tokens_seen": 199088675, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.24291992, "step": 9247, "time_per_iteration": 2.652797222137451 }, { "auxiliary_loss_clip": 0.01257361, "auxiliary_loss_mlp": 0.00220543, "balance_loss_clip": 1.02980351, "balance_loss_mlp": 0.19624856, "epoch": 0.5560198406733804, "flos": 16835677044480.0, "grad_norm": 14.328898483488326, "language_loss": 0.8612833, "learning_rate": 1.7349742234554254e-06, "loss": 0.87606239, "num_input_tokens_seen": 199103075, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.24304199, "step": 9248, "time_per_iteration": 2.6463027000427246 }, { "auxiliary_loss_clip": 0.01277085, "auxiliary_loss_mlp": 0.00036032, "balance_loss_clip": 1.12606239, "balance_loss_mlp": 0.02959512, "epoch": 0.5560799639260484, "flos": 70697051758080.0, "grad_norm": 0.8621616495606996, "language_loss": 0.58607435, "learning_rate": 1.7345882013539081e-06, "loss": 0.59920549, "num_input_tokens_seen": 199160325, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.06445312, "step": 9249, "time_per_iteration": 3.223930597305298 }, { "auxiliary_loss_clip": 0.01267321, "auxiliary_loss_mlp": 0.00223189, "balance_loss_clip": 1.03536725, "balance_loss_mlp": 0.19950244, "epoch": 0.5561400871787163, "flos": 23148593176320.0, "grad_norm": 15.24530202008838, "language_loss": 0.87639898, "learning_rate": 1.734202189316832e-06, "loss": 0.89130408, "num_input_tokens_seen": 199179760, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.23706055, "step": 9250, "time_per_iteration": 2.649759531021118 }, { "auxiliary_loss_clip": 0.01259425, "auxiliary_loss_mlp": 0.00236, "balance_loss_clip": 1.03097677, "balance_loss_mlp": 0.21036997, "epoch": 0.5562002104313843, "flos": 17566423332480.0, "grad_norm": 23.063512773945064, "language_loss": 0.82126272, "learning_rate": 1.733816187358836e-06, "loss": 0.83621705, "num_input_tokens_seen": 199196695, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.25622559, "step": 9251, "time_per_iteration": 2.6057286262512207 }, { "auxiliary_loss_clip": 0.01248783, "auxiliary_loss_mlp": 0.00228256, "balance_loss_clip": 1.02521122, "balance_loss_mlp": 0.20609525, "epoch": 0.5562603336840523, "flos": 25045430590080.0, "grad_norm": 23.892960766477756, "language_loss": 0.81937504, "learning_rate": 1.7334301954945569e-06, "loss": 0.83414543, "num_input_tokens_seen": 199217845, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.22167969, "step": 9252, "time_per_iteration": 2.757880449295044 }, { "auxiliary_loss_clip": 0.01259032, "auxiliary_loss_mlp": 0.00226814, "balance_loss_clip": 1.03071404, "balance_loss_mlp": 0.20122002, "epoch": 0.5563204569367203, "flos": 29059022436480.0, "grad_norm": 8.4662475965508, "language_loss": 0.80450588, "learning_rate": 1.7330442137386313e-06, "loss": 0.81936443, "num_input_tokens_seen": 199239250, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.25598145, "step": 9253, "time_per_iteration": 2.7066712379455566 }, { "auxiliary_loss_clip": 0.01302324, "auxiliary_loss_mlp": 0.00217178, "balance_loss_clip": 1.06609488, "balance_loss_mlp": 0.19420633, "epoch": 0.5563805801893883, "flos": 22090449398400.0, "grad_norm": 34.865224286180435, "language_loss": 0.88474876, "learning_rate": 1.7326582421056965e-06, "loss": 0.89994377, "num_input_tokens_seen": 199258320, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.22949219, "step": 9254, "time_per_iteration": 2.6208384037017822 }, { "auxiliary_loss_clip": 0.01266097, "auxiliary_loss_mlp": 0.00028141, "balance_loss_clip": 1.11656106, "balance_loss_mlp": 0.02237139, "epoch": 0.5564407034420562, "flos": 58636128689280.0, "grad_norm": 0.8790005084751179, "language_loss": 0.64771628, "learning_rate": 1.732272280610387e-06, "loss": 0.6606586, "num_input_tokens_seen": 199314840, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.05761719, "step": 9255, "time_per_iteration": 3.0035715103149414 }, { "auxiliary_loss_clip": 0.01265433, "auxiliary_loss_mlp": 0.00205926, "balance_loss_clip": 1.03685904, "balance_loss_mlp": 0.1844327, "epoch": 0.5565008266947242, "flos": 23112323418240.0, "grad_norm": 2.522843596717607, "language_loss": 0.75690848, "learning_rate": 1.7318863292673399e-06, "loss": 0.77162206, "num_input_tokens_seen": 199335405, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.21484375, "step": 9256, "time_per_iteration": 2.6445653438568115 }, { "auxiliary_loss_clip": 0.01261002, "auxiliary_loss_mlp": 0.0020185, "balance_loss_clip": 1.03569567, "balance_loss_mlp": 0.17979592, "epoch": 0.5565609499473921, "flos": 21578399066880.0, "grad_norm": 4.804848674420017, "language_loss": 0.8246454, "learning_rate": 1.73150038809119e-06, "loss": 0.83927393, "num_input_tokens_seen": 199354345, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.22045898, "step": 9257, "time_per_iteration": 2.6306402683258057 }, { "auxiliary_loss_clip": 0.01282586, "auxiliary_loss_mlp": 0.00207846, "balance_loss_clip": 1.05087173, "balance_loss_mlp": 0.18476734, "epoch": 0.5566210732000602, "flos": 18369637309440.0, "grad_norm": 649.8681584865938, "language_loss": 0.71600378, "learning_rate": 1.7311144570965724e-06, "loss": 0.73090816, "num_input_tokens_seen": 199372250, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.23095703, "step": 9258, "time_per_iteration": 2.664853811264038 }, { "auxiliary_loss_clip": 0.01277534, "auxiliary_loss_mlp": 0.00212301, "balance_loss_clip": 1.0458678, "balance_loss_mlp": 0.18736291, "epoch": 0.5566811964527281, "flos": 25703350053120.0, "grad_norm": 853.9569332166142, "language_loss": 0.87404168, "learning_rate": 1.7307285362981215e-06, "loss": 0.88893998, "num_input_tokens_seen": 199392815, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.24926758, "step": 9259, "time_per_iteration": 2.7335550785064697 }, { "auxiliary_loss_clip": 0.01250608, "auxiliary_loss_mlp": 0.00192615, "balance_loss_clip": 1.02320695, "balance_loss_mlp": 0.17139557, "epoch": 0.5567413197053961, "flos": 26943991856640.0, "grad_norm": 15.323798765256306, "language_loss": 0.88668275, "learning_rate": 1.7303426257104712e-06, "loss": 0.901115, "num_input_tokens_seen": 199412375, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.21240234, "step": 9260, "time_per_iteration": 2.700984001159668 }, { "auxiliary_loss_clip": 0.01277498, "auxiliary_loss_mlp": 0.00222255, "balance_loss_clip": 1.04427087, "balance_loss_mlp": 0.1966608, "epoch": 0.556801442958064, "flos": 20850597694080.0, "grad_norm": 18.509390554612427, "language_loss": 0.75380576, "learning_rate": 1.729956725348256e-06, "loss": 0.7688033, "num_input_tokens_seen": 199431490, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.25610352, "step": 9261, "time_per_iteration": 2.6386945247650146 }, { "auxiliary_loss_clip": 0.0125936, "auxiliary_loss_mlp": 0.00060531, "balance_loss_clip": 1.10856676, "balance_loss_mlp": 0.05399799, "epoch": 0.556861566210732, "flos": 70498213044480.0, "grad_norm": 0.7255276699105632, "language_loss": 0.60833156, "learning_rate": 1.729570835226108e-06, "loss": 0.62153047, "num_input_tokens_seen": 199495855, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.06542969, "step": 9262, "time_per_iteration": 3.1873230934143066 }, { "auxiliary_loss_clip": 0.01281486, "auxiliary_loss_mlp": 0.00202575, "balance_loss_clip": 1.05215669, "balance_loss_mlp": 0.17863756, "epoch": 0.5569216894633999, "flos": 25337276593920.0, "grad_norm": 3.4872910725068476, "language_loss": 0.7142123, "learning_rate": 1.7291849553586622e-06, "loss": 0.7290529, "num_input_tokens_seen": 199515870, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.23950195, "step": 9263, "time_per_iteration": 4.069705486297607 }, { "auxiliary_loss_clip": 0.0128613, "auxiliary_loss_mlp": 0.00204453, "balance_loss_clip": 1.0531292, "balance_loss_mlp": 0.18225628, "epoch": 0.556981812716068, "flos": 22638733574400.0, "grad_norm": 73.03107491825486, "language_loss": 0.80733931, "learning_rate": 1.7287990857605497e-06, "loss": 0.82224512, "num_input_tokens_seen": 199535745, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.22180176, "step": 9264, "time_per_iteration": 4.066655874252319 }, { "auxiliary_loss_clip": 0.0128513, "auxiliary_loss_mlp": 0.00203364, "balance_loss_clip": 1.04877484, "balance_loss_mlp": 0.1801417, "epoch": 0.5570419359687359, "flos": 11035852738560.0, "grad_norm": 19.888206374686803, "language_loss": 0.85894465, "learning_rate": 1.7284132264464022e-06, "loss": 0.87382954, "num_input_tokens_seen": 199554035, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.2322998, "step": 9265, "time_per_iteration": 2.6315948963165283 }, { "auxiliary_loss_clip": 0.01288663, "auxiliary_loss_mlp": 0.00201384, "balance_loss_clip": 1.05562937, "balance_loss_mlp": 0.18092801, "epoch": 0.5571020592214039, "flos": 22823135020800.0, "grad_norm": 1057.7939186906867, "language_loss": 0.7717706, "learning_rate": 1.7280273774308536e-06, "loss": 0.7866711, "num_input_tokens_seen": 199576120, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.20446777, "step": 9266, "time_per_iteration": 2.6999247074127197 }, { "auxiliary_loss_clip": 0.01278162, "auxiliary_loss_mlp": 0.00190251, "balance_loss_clip": 1.04636025, "balance_loss_mlp": 0.16779205, "epoch": 0.5571621824740719, "flos": 22927778317440.0, "grad_norm": 3.686496863347203, "language_loss": 0.7633971, "learning_rate": 1.727641538728533e-06, "loss": 0.77808124, "num_input_tokens_seen": 199593780, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.22485352, "step": 9267, "time_per_iteration": 4.009453296661377 }, { "auxiliary_loss_clip": 0.01280988, "auxiliary_loss_mlp": 0.0021126, "balance_loss_clip": 1.04740047, "balance_loss_mlp": 0.18880142, "epoch": 0.5572223057267398, "flos": 22966705681920.0, "grad_norm": 100.82011354978276, "language_loss": 0.81194687, "learning_rate": 1.7272557103540736e-06, "loss": 0.82686937, "num_input_tokens_seen": 199613220, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.22485352, "step": 9268, "time_per_iteration": 2.6210644245147705 }, { "auxiliary_loss_clip": 0.01265906, "auxiliary_loss_mlp": 0.00202222, "balance_loss_clip": 1.03511846, "balance_loss_mlp": 0.18021587, "epoch": 0.5572824289794078, "flos": 20960053413120.0, "grad_norm": 4.258674789730526, "language_loss": 0.83484668, "learning_rate": 1.726869892322104e-06, "loss": 0.84952796, "num_input_tokens_seen": 199632085, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.22033691, "step": 9269, "time_per_iteration": 2.641766309738159 }, { "auxiliary_loss_clip": 0.01261086, "auxiliary_loss_mlp": 0.0019486, "balance_loss_clip": 1.0298295, "balance_loss_mlp": 0.1722697, "epoch": 0.5573425522320757, "flos": 25042413847680.0, "grad_norm": 2.9206177314306547, "language_loss": 0.89177555, "learning_rate": 1.726484084647256e-06, "loss": 0.906335, "num_input_tokens_seen": 199649295, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.22607422, "step": 9270, "time_per_iteration": 2.7265725135803223 }, { "auxiliary_loss_clip": 0.01280286, "auxiliary_loss_mlp": 0.00215647, "balance_loss_clip": 1.0436976, "balance_loss_mlp": 0.19128023, "epoch": 0.5574026754847438, "flos": 23659637927040.0, "grad_norm": 3.492643912695584, "language_loss": 0.90202647, "learning_rate": 1.7260982873441591e-06, "loss": 0.91698581, "num_input_tokens_seen": 199668870, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.24365234, "step": 9271, "time_per_iteration": 2.679202079772949 }, { "auxiliary_loss_clip": 0.01282611, "auxiliary_loss_mlp": 0.0020716, "balance_loss_clip": 1.0489887, "balance_loss_mlp": 0.18225744, "epoch": 0.5574627987374117, "flos": 24782240661120.0, "grad_norm": 4.905246154396478, "language_loss": 0.96309304, "learning_rate": 1.725712500427442e-06, "loss": 0.97799081, "num_input_tokens_seen": 199684870, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.24914551, "step": 9272, "time_per_iteration": 2.64282488822937 }, { "auxiliary_loss_clip": 0.01256937, "auxiliary_loss_mlp": 0.00196455, "balance_loss_clip": 1.02924538, "balance_loss_mlp": 0.17442463, "epoch": 0.5575229219900797, "flos": 21834944979840.0, "grad_norm": 23.43012647973238, "language_loss": 0.92763978, "learning_rate": 1.7253267239117347e-06, "loss": 0.94217372, "num_input_tokens_seen": 199701975, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.22033691, "step": 9273, "time_per_iteration": 4.052488327026367 }, { "auxiliary_loss_clip": 0.0124894, "auxiliary_loss_mlp": 0.00204104, "balance_loss_clip": 1.01860654, "balance_loss_mlp": 0.18095329, "epoch": 0.5575830452427476, "flos": 27815148408960.0, "grad_norm": 11.523609270964146, "language_loss": 0.81845689, "learning_rate": 1.7249409578116655e-06, "loss": 0.83298731, "num_input_tokens_seen": 199721865, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.23156738, "step": 9274, "time_per_iteration": 2.7314229011535645 }, { "auxiliary_loss_clip": 0.0128202, "auxiliary_loss_mlp": 0.0021091, "balance_loss_clip": 1.04064393, "balance_loss_mlp": 0.18655592, "epoch": 0.5576431684954156, "flos": 17812805696640.0, "grad_norm": 7.679687511348679, "language_loss": 0.9245699, "learning_rate": 1.7245552021418629e-06, "loss": 0.9394992, "num_input_tokens_seen": 199736455, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.24389648, "step": 9275, "time_per_iteration": 2.615837812423706 }, { "auxiliary_loss_clip": 0.01279128, "auxiliary_loss_mlp": 0.00211596, "balance_loss_clip": 1.04539216, "balance_loss_mlp": 0.18899432, "epoch": 0.5577032917480835, "flos": 15486872411520.0, "grad_norm": 10.393678330077204, "language_loss": 0.81848973, "learning_rate": 1.7241694569169546e-06, "loss": 0.83339697, "num_input_tokens_seen": 199753125, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.22619629, "step": 9276, "time_per_iteration": 2.59586238861084 }, { "auxiliary_loss_clip": 0.01264178, "auxiliary_loss_mlp": 0.00202907, "balance_loss_clip": 1.03083158, "balance_loss_mlp": 0.17826691, "epoch": 0.5577634150007516, "flos": 21579763783680.0, "grad_norm": 4.487328419075129, "language_loss": 0.82597148, "learning_rate": 1.7237837221515678e-06, "loss": 0.84064233, "num_input_tokens_seen": 199771365, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.24645996, "step": 9277, "time_per_iteration": 2.6275746822357178 }, { "auxiliary_loss_clip": 0.01269332, "auxiliary_loss_mlp": 0.00182453, "balance_loss_clip": 1.0365597, "balance_loss_mlp": 0.15753824, "epoch": 0.5578235382534195, "flos": 21139750177920.0, "grad_norm": 3.698636774088309, "language_loss": 0.77320242, "learning_rate": 1.7233979978603304e-06, "loss": 0.7877202, "num_input_tokens_seen": 199790035, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.24926758, "step": 9278, "time_per_iteration": 2.6340715885162354 }, { "auxiliary_loss_clip": 0.01278136, "auxiliary_loss_mlp": 0.00217559, "balance_loss_clip": 1.03936267, "balance_loss_mlp": 0.19173793, "epoch": 0.5578836615060875, "flos": 26505199313280.0, "grad_norm": 36.36057072365084, "language_loss": 0.81771463, "learning_rate": 1.723012284057868e-06, "loss": 0.83267152, "num_input_tokens_seen": 199811125, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.25817871, "step": 9279, "time_per_iteration": 2.684398889541626 }, { "auxiliary_loss_clip": 0.01292389, "auxiliary_loss_mlp": 0.00192425, "balance_loss_clip": 1.04707217, "balance_loss_mlp": 0.16649678, "epoch": 0.5579437847587555, "flos": 20153786780160.0, "grad_norm": 45.377753465418635, "language_loss": 0.78709865, "learning_rate": 1.7226265807588082e-06, "loss": 0.80194676, "num_input_tokens_seen": 199829915, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.25939941, "step": 9280, "time_per_iteration": 2.732231855392456 }, { "auxiliary_loss_clip": 0.01269001, "auxiliary_loss_mlp": 0.00186163, "balance_loss_clip": 1.03134346, "balance_loss_mlp": 0.16253608, "epoch": 0.5580039080114234, "flos": 26102281478400.0, "grad_norm": 2.517288837810889, "language_loss": 0.83635259, "learning_rate": 1.7222408879777763e-06, "loss": 0.85090417, "num_input_tokens_seen": 199850670, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.23632812, "step": 9281, "time_per_iteration": 2.660083293914795 }, { "auxiliary_loss_clip": 0.01276505, "auxiliary_loss_mlp": 0.00189033, "balance_loss_clip": 1.04283321, "balance_loss_mlp": 0.16650245, "epoch": 0.5580640312640914, "flos": 13771671096960.0, "grad_norm": 15.70827732843974, "language_loss": 0.81235135, "learning_rate": 1.7218552057293974e-06, "loss": 0.8270067, "num_input_tokens_seen": 199867645, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.22546387, "step": 9282, "time_per_iteration": 2.6553995609283447 }, { "auxiliary_loss_clip": 0.01255045, "auxiliary_loss_mlp": 0.00205498, "balance_loss_clip": 1.02639925, "balance_loss_mlp": 0.18326569, "epoch": 0.5581241545167593, "flos": 17675986792320.0, "grad_norm": 45.921852808907936, "language_loss": 0.72577417, "learning_rate": 1.721469534028297e-06, "loss": 0.74037957, "num_input_tokens_seen": 199886320, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.22229004, "step": 9283, "time_per_iteration": 2.610285997390747 }, { "auxiliary_loss_clip": 0.01272123, "auxiliary_loss_mlp": 0.00176208, "balance_loss_clip": 1.03645563, "balance_loss_mlp": 0.15250903, "epoch": 0.5581842777694274, "flos": 19569161018880.0, "grad_norm": 66.95675550808059, "language_loss": 0.91165459, "learning_rate": 1.7210838728890994e-06, "loss": 0.92613792, "num_input_tokens_seen": 199904895, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.23681641, "step": 9284, "time_per_iteration": 2.6571788787841797 }, { "auxiliary_loss_clip": 0.01267619, "auxiliary_loss_mlp": 0.00197113, "balance_loss_clip": 1.03461349, "balance_loss_mlp": 0.17285359, "epoch": 0.5582444010220953, "flos": 20595165102720.0, "grad_norm": 493.74854416125146, "language_loss": 0.93112504, "learning_rate": 1.7206982223264304e-06, "loss": 0.94577241, "num_input_tokens_seen": 199921090, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.24255371, "step": 9285, "time_per_iteration": 2.624689817428589 }, { "auxiliary_loss_clip": 0.01263905, "auxiliary_loss_mlp": 0.0019741, "balance_loss_clip": 1.02948177, "balance_loss_mlp": 0.17305604, "epoch": 0.5583045242747633, "flos": 19135504120320.0, "grad_norm": 24.679197555520915, "language_loss": 0.85119659, "learning_rate": 1.720312582354912e-06, "loss": 0.86580974, "num_input_tokens_seen": 199939925, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.2434082, "step": 9286, "time_per_iteration": 2.635106086730957 }, { "auxiliary_loss_clip": 0.01274701, "auxiliary_loss_mlp": 0.00184997, "balance_loss_clip": 1.03610981, "balance_loss_mlp": 0.16175072, "epoch": 0.5583646475274312, "flos": 27454569730560.0, "grad_norm": 2.419638640739014, "language_loss": 0.81543607, "learning_rate": 1.7199269529891684e-06, "loss": 0.83003306, "num_input_tokens_seen": 199960015, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.23266602, "step": 9287, "time_per_iteration": 2.679624319076538 }, { "auxiliary_loss_clip": 0.01291478, "auxiliary_loss_mlp": 0.0019603, "balance_loss_clip": 1.048244, "balance_loss_mlp": 0.1738447, "epoch": 0.5584247707800992, "flos": 23653784010240.0, "grad_norm": 12.869653823729392, "language_loss": 0.81570399, "learning_rate": 1.7195413342438233e-06, "loss": 0.83057904, "num_input_tokens_seen": 199980505, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.22192383, "step": 9288, "time_per_iteration": 2.731786012649536 }, { "auxiliary_loss_clip": 0.01292629, "auxiliary_loss_mlp": 0.00194382, "balance_loss_clip": 1.05067873, "balance_loss_mlp": 0.16956294, "epoch": 0.5584848940327671, "flos": 13698880185600.0, "grad_norm": 4.202439797170646, "language_loss": 0.90935266, "learning_rate": 1.7191557261334984e-06, "loss": 0.92422271, "num_input_tokens_seen": 199999020, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.24816895, "step": 9289, "time_per_iteration": 2.616331100463867 }, { "auxiliary_loss_clip": 0.01301847, "auxiliary_loss_mlp": 0.00193599, "balance_loss_clip": 1.05387998, "balance_loss_mlp": 0.1682675, "epoch": 0.5585450172854352, "flos": 27016208150400.0, "grad_norm": 10.138671960023803, "language_loss": 0.72823375, "learning_rate": 1.718770128672817e-06, "loss": 0.7431882, "num_input_tokens_seen": 200019020, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.25341797, "step": 9290, "time_per_iteration": 2.697014331817627 }, { "auxiliary_loss_clip": 0.01272167, "auxiliary_loss_mlp": 0.00208404, "balance_loss_clip": 1.03155291, "balance_loss_mlp": 0.18328613, "epoch": 0.5586051405381031, "flos": 23185653033600.0, "grad_norm": 30.350657241513083, "language_loss": 0.8034569, "learning_rate": 1.7183845418764e-06, "loss": 0.81826264, "num_input_tokens_seen": 200038110, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.2512207, "step": 9291, "time_per_iteration": 2.674354076385498 }, { "auxiliary_loss_clip": 0.01247743, "auxiliary_loss_mlp": 0.00191553, "balance_loss_clip": 1.01669216, "balance_loss_mlp": 0.16761565, "epoch": 0.5586652637907711, "flos": 20775544225920.0, "grad_norm": 3.5871040708926625, "language_loss": 0.91129112, "learning_rate": 1.7179989657588698e-06, "loss": 0.92568409, "num_input_tokens_seen": 200056210, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.23937988, "step": 9292, "time_per_iteration": 2.6839680671691895 }, { "auxiliary_loss_clip": 0.01267133, "auxiliary_loss_mlp": 0.00191752, "balance_loss_clip": 1.0321008, "balance_loss_mlp": 0.16763589, "epoch": 0.5587253870434391, "flos": 28219897837440.0, "grad_norm": 3.584155499490326, "language_loss": 0.82460046, "learning_rate": 1.7176134003348476e-06, "loss": 0.83918929, "num_input_tokens_seen": 200075620, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.24145508, "step": 9293, "time_per_iteration": 2.6983530521392822 }, { "auxiliary_loss_clip": 0.01273469, "auxiliary_loss_mlp": 0.00173253, "balance_loss_clip": 1.03953648, "balance_loss_mlp": 0.15134238, "epoch": 0.558785510296107, "flos": 26615732440320.0, "grad_norm": 4.79388037820887, "language_loss": 0.7962364, "learning_rate": 1.7172278456189523e-06, "loss": 0.81070364, "num_input_tokens_seen": 200095945, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.21923828, "step": 9294, "time_per_iteration": 2.6944425106048584 }, { "auxiliary_loss_clip": 0.01260297, "auxiliary_loss_mlp": 0.00194247, "balance_loss_clip": 1.02639198, "balance_loss_mlp": 0.17160943, "epoch": 0.558845633548775, "flos": 20156767608960.0, "grad_norm": 7.501233021570425, "language_loss": 0.78326362, "learning_rate": 1.716842301625806e-06, "loss": 0.797809, "num_input_tokens_seen": 200114185, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.22631836, "step": 9295, "time_per_iteration": 2.6134698390960693 }, { "auxiliary_loss_clip": 0.01268399, "auxiliary_loss_mlp": 0.00180906, "balance_loss_clip": 1.03306341, "balance_loss_mlp": 0.15544298, "epoch": 0.5589057568014429, "flos": 24350774492160.0, "grad_norm": 2.979354636627372, "language_loss": 0.87664104, "learning_rate": 1.7164567683700281e-06, "loss": 0.89113408, "num_input_tokens_seen": 200135030, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.25500488, "step": 9296, "time_per_iteration": 2.6588778495788574 }, { "auxiliary_loss_clip": 0.012735, "auxiliary_loss_mlp": 0.00197393, "balance_loss_clip": 1.03571403, "balance_loss_mlp": 0.17401607, "epoch": 0.558965880054111, "flos": 21105168359040.0, "grad_norm": 3.1871653740530532, "language_loss": 0.70877635, "learning_rate": 1.7160712458662379e-06, "loss": 0.72348529, "num_input_tokens_seen": 200154290, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.23376465, "step": 9297, "time_per_iteration": 2.6920580863952637 }, { "auxiliary_loss_clip": 0.01294125, "auxiliary_loss_mlp": 0.00204355, "balance_loss_clip": 1.0470705, "balance_loss_mlp": 0.178665, "epoch": 0.5590260033067789, "flos": 18436071513600.0, "grad_norm": 10.419848436147566, "language_loss": 0.83309805, "learning_rate": 1.7156857341290544e-06, "loss": 0.84808278, "num_input_tokens_seen": 200171555, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.25708008, "step": 9298, "time_per_iteration": 2.5964279174804688 }, { "auxiliary_loss_clip": 0.01220499, "auxiliary_loss_mlp": 0.00066031, "balance_loss_clip": 1.04942846, "balance_loss_mlp": 0.0607381, "epoch": 0.5590861265594469, "flos": 70577432490240.0, "grad_norm": 0.67124782287472, "language_loss": 0.51922697, "learning_rate": 1.7153002331730967e-06, "loss": 0.53209227, "num_input_tokens_seen": 200237010, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.05297852, "step": 9299, "time_per_iteration": 3.219676971435547 }, { "auxiliary_loss_clip": 0.01244737, "auxiliary_loss_mlp": 0.00173686, "balance_loss_clip": 1.01729333, "balance_loss_mlp": 0.15009476, "epoch": 0.5591462498121148, "flos": 30664408896000.0, "grad_norm": 17.37567210140604, "language_loss": 0.75878358, "learning_rate": 1.7149147430129824e-06, "loss": 0.77296782, "num_input_tokens_seen": 200260820, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.23596191, "step": 9300, "time_per_iteration": 2.7128403186798096 }, { "auxiliary_loss_clip": 0.01277608, "auxiliary_loss_mlp": 0.00224277, "balance_loss_clip": 1.03993583, "balance_loss_mlp": 0.19867086, "epoch": 0.5592063730647828, "flos": 18150438562560.0, "grad_norm": 7.133540852533901, "language_loss": 0.88805753, "learning_rate": 1.7145292636633293e-06, "loss": 0.90307641, "num_input_tokens_seen": 200278035, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.2557373, "step": 9301, "time_per_iteration": 2.6187121868133545 }, { "auxiliary_loss_clip": 0.0127021, "auxiliary_loss_mlp": 0.00210079, "balance_loss_clip": 1.03489029, "balance_loss_mlp": 0.18521203, "epoch": 0.5592664963174507, "flos": 24060400945920.0, "grad_norm": 22.25944968283858, "language_loss": 0.75667787, "learning_rate": 1.714143795138756e-06, "loss": 0.77148074, "num_input_tokens_seen": 200297255, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.24841309, "step": 9302, "time_per_iteration": 2.643488645553589 }, { "auxiliary_loss_clip": 0.01284483, "auxiliary_loss_mlp": 0.00204606, "balance_loss_clip": 1.04283178, "balance_loss_mlp": 0.17954864, "epoch": 0.5593266195701188, "flos": 19827897661440.0, "grad_norm": 16.209082775214185, "language_loss": 0.79700518, "learning_rate": 1.713758337453878e-06, "loss": 0.81189609, "num_input_tokens_seen": 200317505, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.25036621, "step": 9303, "time_per_iteration": 2.690110206604004 }, { "auxiliary_loss_clip": 0.01277109, "auxiliary_loss_mlp": 0.00185709, "balance_loss_clip": 1.0415889, "balance_loss_mlp": 0.16214162, "epoch": 0.5593867428227867, "flos": 25300755440640.0, "grad_norm": 2.329413591344766, "language_loss": 0.7976408, "learning_rate": 1.7133728906233124e-06, "loss": 0.81226897, "num_input_tokens_seen": 200338350, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.2355957, "step": 9304, "time_per_iteration": 2.6809065341949463 }, { "auxiliary_loss_clip": 0.01277629, "auxiliary_loss_mlp": 0.00185228, "balance_loss_clip": 1.03985238, "balance_loss_mlp": 0.1606704, "epoch": 0.5594468660754547, "flos": 12933013374720.0, "grad_norm": 270.1594774903958, "language_loss": 0.85658264, "learning_rate": 1.7129874546616763e-06, "loss": 0.87121117, "num_input_tokens_seen": 200353965, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.24560547, "step": 9305, "time_per_iteration": 4.026232481002808 }, { "auxiliary_loss_clip": 0.01254919, "auxiliary_loss_mlp": 0.00198188, "balance_loss_clip": 1.0225091, "balance_loss_mlp": 0.17499012, "epoch": 0.5595069893281227, "flos": 19062713208960.0, "grad_norm": 1.8432494692002628, "language_loss": 0.76334774, "learning_rate": 1.7126020295835836e-06, "loss": 0.77787876, "num_input_tokens_seen": 200373595, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.23205566, "step": 9306, "time_per_iteration": 4.069831609725952 }, { "auxiliary_loss_clip": 0.01215427, "auxiliary_loss_mlp": 0.00040551, "balance_loss_clip": 1.03950655, "balance_loss_mlp": 0.03599764, "epoch": 0.5595671125807906, "flos": 70273375862400.0, "grad_norm": 0.9102188026392314, "language_loss": 0.60108191, "learning_rate": 1.7122166154036518e-06, "loss": 0.61364174, "num_input_tokens_seen": 200429155, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.0456543, "step": 9307, "time_per_iteration": 3.21934175491333 }, { "auxiliary_loss_clip": 0.01261678, "auxiliary_loss_mlp": 0.00195213, "balance_loss_clip": 1.03002501, "balance_loss_mlp": 0.17366013, "epoch": 0.5596272358334586, "flos": 20665513889280.0, "grad_norm": 201.46050085814318, "language_loss": 0.79755789, "learning_rate": 1.7118312121364943e-06, "loss": 0.81212682, "num_input_tokens_seen": 200448290, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.21533203, "step": 9308, "time_per_iteration": 2.633913040161133 }, { "auxiliary_loss_clip": 0.01268041, "auxiliary_loss_mlp": 0.00231266, "balance_loss_clip": 1.03203321, "balance_loss_mlp": 0.2065776, "epoch": 0.5596873590861265, "flos": 25041013217280.0, "grad_norm": 3.9134489749035506, "language_loss": 0.76166248, "learning_rate": 1.7114458197967257e-06, "loss": 0.77665555, "num_input_tokens_seen": 200466555, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.24707031, "step": 9309, "time_per_iteration": 4.06767201423645 }, { "auxiliary_loss_clip": 0.01272063, "auxiliary_loss_mlp": 0.00214575, "balance_loss_clip": 1.03764892, "balance_loss_mlp": 0.18880221, "epoch": 0.5597474823387946, "flos": 25958387594880.0, "grad_norm": 2.8519531364791812, "language_loss": 0.83253509, "learning_rate": 1.7110604383989613e-06, "loss": 0.8474015, "num_input_tokens_seen": 200485980, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.25769043, "step": 9310, "time_per_iteration": 2.7046737670898438 }, { "auxiliary_loss_clip": 0.01319181, "auxiliary_loss_mlp": 0.00214007, "balance_loss_clip": 1.06909478, "balance_loss_mlp": 0.1872566, "epoch": 0.5598076055914625, "flos": 26177442687360.0, "grad_norm": 5.881101541505523, "language_loss": 0.80758482, "learning_rate": 1.7106750679578133e-06, "loss": 0.82291675, "num_input_tokens_seen": 200504555, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.2677002, "step": 9311, "time_per_iteration": 2.66375470161438 }, { "auxiliary_loss_clip": 0.0126829, "auxiliary_loss_mlp": 0.00199082, "balance_loss_clip": 1.03278661, "balance_loss_mlp": 0.1753947, "epoch": 0.5598677288441305, "flos": 11655778590720.0, "grad_norm": 2.503521481248088, "language_loss": 0.82134271, "learning_rate": 1.7102897084878962e-06, "loss": 0.83601642, "num_input_tokens_seen": 200522700, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.23657227, "step": 9312, "time_per_iteration": 2.6776680946350098 }, { "auxiliary_loss_clip": 0.01265913, "auxiliary_loss_mlp": 0.00210768, "balance_loss_clip": 1.03273273, "balance_loss_mlp": 0.18747422, "epoch": 0.5599278520967984, "flos": 22966597941120.0, "grad_norm": 6.982296906362298, "language_loss": 0.96833366, "learning_rate": 1.709904360003822e-06, "loss": 0.98310041, "num_input_tokens_seen": 200541910, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.23291016, "step": 9313, "time_per_iteration": 2.6578779220581055 }, { "auxiliary_loss_clip": 0.01264001, "auxiliary_loss_mlp": 0.00203423, "balance_loss_clip": 1.03435254, "balance_loss_mlp": 0.18145278, "epoch": 0.5599879753494664, "flos": 21215557831680.0, "grad_norm": 9.313189072048134, "language_loss": 0.82011962, "learning_rate": 1.709519022520204e-06, "loss": 0.83479381, "num_input_tokens_seen": 200562600, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.21972656, "step": 9314, "time_per_iteration": 2.658977508544922 }, { "auxiliary_loss_clip": 0.01266906, "auxiliary_loss_mlp": 0.00217214, "balance_loss_clip": 1.03185463, "balance_loss_mlp": 0.19370575, "epoch": 0.5600480986021343, "flos": 31903219105920.0, "grad_norm": 48.25356062747558, "language_loss": 0.78820252, "learning_rate": 1.7091336960516537e-06, "loss": 0.80304372, "num_input_tokens_seen": 200584795, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.23498535, "step": 9315, "time_per_iteration": 4.190639019012451 }, { "auxiliary_loss_clip": 0.01288717, "auxiliary_loss_mlp": 0.00223362, "balance_loss_clip": 1.04656935, "balance_loss_mlp": 0.19916278, "epoch": 0.5601082218548024, "flos": 28476048700800.0, "grad_norm": 5.486012016827079, "language_loss": 0.7540524, "learning_rate": 1.7087483806127824e-06, "loss": 0.76917315, "num_input_tokens_seen": 200606945, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.24194336, "step": 9316, "time_per_iteration": 2.699634313583374 }, { "auxiliary_loss_clip": 0.01279543, "auxiliary_loss_mlp": 0.00217723, "balance_loss_clip": 1.0424732, "balance_loss_mlp": 0.19475186, "epoch": 0.5601683451074703, "flos": 24097173494400.0, "grad_norm": 13.452816834505304, "language_loss": 0.93555605, "learning_rate": 1.7083630762182022e-06, "loss": 0.95052874, "num_input_tokens_seen": 200626340, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.22961426, "step": 9317, "time_per_iteration": 2.6768345832824707 }, { "auxiliary_loss_clip": 0.01291832, "auxiliary_loss_mlp": 0.00221203, "balance_loss_clip": 1.04712868, "balance_loss_mlp": 0.19575158, "epoch": 0.5602284683601383, "flos": 26356205698560.0, "grad_norm": 9.816898147825352, "language_loss": 0.85460949, "learning_rate": 1.7079777828825233e-06, "loss": 0.86973977, "num_input_tokens_seen": 200644520, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.25463867, "step": 9318, "time_per_iteration": 2.672327995300293 }, { "auxiliary_loss_clip": 0.01264285, "auxiliary_loss_mlp": 0.00206751, "balance_loss_clip": 1.02960336, "balance_loss_mlp": 0.18371981, "epoch": 0.5602885916128063, "flos": 24496392228480.0, "grad_norm": 19.462825331456806, "language_loss": 0.81303, "learning_rate": 1.7075925006203558e-06, "loss": 0.82774031, "num_input_tokens_seen": 200664845, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.23022461, "step": 9319, "time_per_iteration": 2.709087610244751 }, { "auxiliary_loss_clip": 0.01261015, "auxiliary_loss_mlp": 0.00200761, "balance_loss_clip": 1.02854502, "balance_loss_mlp": 0.17836119, "epoch": 0.5603487148654742, "flos": 27345006270720.0, "grad_norm": 440.26351998287424, "language_loss": 0.90335131, "learning_rate": 1.7072072294463101e-06, "loss": 0.91796905, "num_input_tokens_seen": 200686535, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.22412109, "step": 9320, "time_per_iteration": 2.6748485565185547 }, { "auxiliary_loss_clip": 0.01205083, "auxiliary_loss_mlp": 0.00032582, "balance_loss_clip": 1.03414774, "balance_loss_mlp": 0.02755091, "epoch": 0.5604088381181422, "flos": 54087756180480.0, "grad_norm": 0.7820002610924723, "language_loss": 0.51954865, "learning_rate": 1.706821969374996e-06, "loss": 0.53192526, "num_input_tokens_seen": 200736965, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.05029297, "step": 9321, "time_per_iteration": 3.0568647384643555 }, { "auxiliary_loss_clip": 0.01271307, "auxiliary_loss_mlp": 0.00195853, "balance_loss_clip": 1.0377686, "balance_loss_mlp": 0.17382348, "epoch": 0.5604689613708101, "flos": 22236390357120.0, "grad_norm": 6.880433868526368, "language_loss": 0.80632502, "learning_rate": 1.7064367204210216e-06, "loss": 0.82099664, "num_input_tokens_seen": 200757420, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.22021484, "step": 9322, "time_per_iteration": 2.6643238067626953 }, { "auxiliary_loss_clip": 0.01278004, "auxiliary_loss_mlp": 0.00223428, "balance_loss_clip": 1.03952479, "balance_loss_mlp": 0.20107639, "epoch": 0.5605290846234782, "flos": 35297782940160.0, "grad_norm": 3.2647704048115878, "language_loss": 0.78822899, "learning_rate": 1.7060514825989963e-06, "loss": 0.80324328, "num_input_tokens_seen": 200779520, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.22351074, "step": 9323, "time_per_iteration": 2.7793431282043457 }, { "auxiliary_loss_clip": 0.01266785, "auxiliary_loss_mlp": 0.00222846, "balance_loss_clip": 1.03145266, "balance_loss_mlp": 0.20006503, "epoch": 0.5605892078761461, "flos": 20263314326400.0, "grad_norm": 7.073652383978384, "language_loss": 0.69976985, "learning_rate": 1.7056662559235286e-06, "loss": 0.71466613, "num_input_tokens_seen": 200799485, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.22802734, "step": 9324, "time_per_iteration": 2.654230833053589 }, { "auxiliary_loss_clip": 0.01260269, "auxiliary_loss_mlp": 0.00204846, "balance_loss_clip": 1.02373409, "balance_loss_mlp": 0.18117082, "epoch": 0.5606493311288141, "flos": 17308333134720.0, "grad_norm": 503.2058489099429, "language_loss": 0.94122678, "learning_rate": 1.705281040409226e-06, "loss": 0.9558779, "num_input_tokens_seen": 200817540, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.23681641, "step": 9325, "time_per_iteration": 2.6195929050445557 }, { "auxiliary_loss_clip": 0.01297684, "auxiliary_loss_mlp": 0.00219587, "balance_loss_clip": 1.05107045, "balance_loss_mlp": 0.19558984, "epoch": 0.560709454381482, "flos": 21652985658240.0, "grad_norm": 3.9234738527864508, "language_loss": 0.8246423, "learning_rate": 1.7048958360706952e-06, "loss": 0.83981502, "num_input_tokens_seen": 200838380, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.23986816, "step": 9326, "time_per_iteration": 2.6424591541290283 }, { "auxiliary_loss_clip": 0.01284822, "auxiliary_loss_mlp": 0.00232509, "balance_loss_clip": 1.04180455, "balance_loss_mlp": 0.20765345, "epoch": 0.56076957763415, "flos": 20303355012480.0, "grad_norm": 23.325912823644604, "language_loss": 0.88409024, "learning_rate": 1.7045106429225447e-06, "loss": 0.89926356, "num_input_tokens_seen": 200855640, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.24853516, "step": 9327, "time_per_iteration": 2.7489984035491943 }, { "auxiliary_loss_clip": 0.01289789, "auxiliary_loss_mlp": 0.00225921, "balance_loss_clip": 1.04537809, "balance_loss_mlp": 0.2019237, "epoch": 0.5608297008868179, "flos": 25045897466880.0, "grad_norm": 5.231217301690262, "language_loss": 0.86273003, "learning_rate": 1.7041254609793795e-06, "loss": 0.87788707, "num_input_tokens_seen": 200876585, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.2401123, "step": 9328, "time_per_iteration": 2.6975080966949463 }, { "auxiliary_loss_clip": 0.01269587, "auxiliary_loss_mlp": 0.00209141, "balance_loss_clip": 1.03650331, "balance_loss_mlp": 0.18544181, "epoch": 0.560889824139486, "flos": 19866825025920.0, "grad_norm": 1.7184326544478867, "language_loss": 0.79553539, "learning_rate": 1.7037402902558066e-06, "loss": 0.81032264, "num_input_tokens_seen": 200898175, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.23706055, "step": 9329, "time_per_iteration": 2.6440136432647705 }, { "auxiliary_loss_clip": 0.01289144, "auxiliary_loss_mlp": 0.00215756, "balance_loss_clip": 1.04697144, "balance_loss_mlp": 0.19025758, "epoch": 0.5609499473921539, "flos": 22929394429440.0, "grad_norm": 3.032018440609573, "language_loss": 0.90330321, "learning_rate": 1.7033551307664324e-06, "loss": 0.91835219, "num_input_tokens_seen": 200917515, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.25500488, "step": 9330, "time_per_iteration": 2.6716148853302 }, { "auxiliary_loss_clip": 0.01227176, "auxiliary_loss_mlp": 0.0005248, "balance_loss_clip": 1.05092335, "balance_loss_mlp": 0.04632924, "epoch": 0.5610100706448219, "flos": 53035825455360.0, "grad_norm": 1.8803728036057583, "language_loss": 0.57362807, "learning_rate": 1.7029699825258603e-06, "loss": 0.58642465, "num_input_tokens_seen": 200978615, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.06152344, "step": 9331, "time_per_iteration": 3.2018020153045654 }, { "auxiliary_loss_clip": 0.01267178, "auxiliary_loss_mlp": 0.00218777, "balance_loss_clip": 1.02774763, "balance_loss_mlp": 0.19542412, "epoch": 0.5610701938974898, "flos": 21834944979840.0, "grad_norm": 5.198470925641551, "language_loss": 0.90453613, "learning_rate": 1.7025848455486971e-06, "loss": 0.91939574, "num_input_tokens_seen": 200997745, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.23352051, "step": 9332, "time_per_iteration": 2.6830239295959473 }, { "auxiliary_loss_clip": 0.01306468, "auxiliary_loss_mlp": 0.00280748, "balance_loss_clip": 1.06327724, "balance_loss_mlp": 0.25374702, "epoch": 0.5611303171501578, "flos": 17457183095040.0, "grad_norm": 15.98296771846845, "language_loss": 0.91693199, "learning_rate": 1.7021997198495454e-06, "loss": 0.93280411, "num_input_tokens_seen": 201016370, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.26977539, "step": 9333, "time_per_iteration": 2.6334753036499023 }, { "auxiliary_loss_clip": 0.01263645, "auxiliary_loss_mlp": 0.00244149, "balance_loss_clip": 1.03057349, "balance_loss_mlp": 0.22002107, "epoch": 0.5611904404028258, "flos": 22637799820800.0, "grad_norm": 31.265213402654283, "language_loss": 0.79284692, "learning_rate": 1.7018146054430108e-06, "loss": 0.80792487, "num_input_tokens_seen": 201034310, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.24133301, "step": 9334, "time_per_iteration": 2.724316358566284 }, { "auxiliary_loss_clip": 0.0127474, "auxiliary_loss_mlp": 0.00228961, "balance_loss_clip": 1.03971314, "balance_loss_mlp": 0.20610824, "epoch": 0.5612505636554938, "flos": 14316327999360.0, "grad_norm": 31.893523464867314, "language_loss": 0.79720324, "learning_rate": 1.7014295023436961e-06, "loss": 0.81224024, "num_input_tokens_seen": 201052030, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.2286377, "step": 9335, "time_per_iteration": 2.6022539138793945 }, { "auxiliary_loss_clip": 0.01283711, "auxiliary_loss_mlp": 0.00268299, "balance_loss_clip": 1.0451324, "balance_loss_mlp": 0.24266897, "epoch": 0.5613106869081618, "flos": 16508279554560.0, "grad_norm": 64.76510055398181, "language_loss": 0.84250659, "learning_rate": 1.701044410566205e-06, "loss": 0.85802668, "num_input_tokens_seen": 201068445, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.25634766, "step": 9336, "time_per_iteration": 2.6027638912200928 }, { "auxiliary_loss_clip": 0.0126969, "auxiliary_loss_mlp": 0.0022622, "balance_loss_clip": 1.0347271, "balance_loss_mlp": 0.20297411, "epoch": 0.5613708101608297, "flos": 24058569352320.0, "grad_norm": 33.84863993286327, "language_loss": 0.7566973, "learning_rate": 1.7006593301251393e-06, "loss": 0.77165639, "num_input_tokens_seen": 201082140, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.23242188, "step": 9337, "time_per_iteration": 2.6524341106414795 }, { "auxiliary_loss_clip": 0.01234624, "auxiliary_loss_mlp": 0.00064033, "balance_loss_clip": 1.05670238, "balance_loss_mlp": 0.05864482, "epoch": 0.5614309334134977, "flos": 64905735997440.0, "grad_norm": 0.9293553935520884, "language_loss": 0.62208086, "learning_rate": 1.700274261035102e-06, "loss": 0.6350674, "num_input_tokens_seen": 201137245, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.05395508, "step": 9338, "time_per_iteration": 3.1099443435668945 }, { "auxiliary_loss_clip": 0.01274615, "auxiliary_loss_mlp": 0.00226914, "balance_loss_clip": 1.04057956, "balance_loss_mlp": 0.20313212, "epoch": 0.5614910566661656, "flos": 32919849740160.0, "grad_norm": 20.572297629865286, "language_loss": 0.74142063, "learning_rate": 1.6998892033106946e-06, "loss": 0.75643593, "num_input_tokens_seen": 201157270, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.23779297, "step": 9339, "time_per_iteration": 2.7918639183044434 }, { "auxiliary_loss_clip": 0.01279791, "auxiliary_loss_mlp": 0.0022539, "balance_loss_clip": 1.03949523, "balance_loss_mlp": 0.20251366, "epoch": 0.5615511799188336, "flos": 18588871969920.0, "grad_norm": 2.8049764955642766, "language_loss": 0.76047218, "learning_rate": 1.6995041569665184e-06, "loss": 0.77552396, "num_input_tokens_seen": 201174530, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.2286377, "step": 9340, "time_per_iteration": 2.5946993827819824 }, { "auxiliary_loss_clip": 0.012791, "auxiliary_loss_mlp": 0.00229185, "balance_loss_clip": 1.04582727, "balance_loss_mlp": 0.20728651, "epoch": 0.5616113031715015, "flos": 22820010537600.0, "grad_norm": 8.912140954401659, "language_loss": 0.85399431, "learning_rate": 1.6991191220171756e-06, "loss": 0.86907721, "num_input_tokens_seen": 201194905, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.21887207, "step": 9341, "time_per_iteration": 2.6831789016723633 }, { "auxiliary_loss_clip": 0.01298499, "auxiliary_loss_mlp": 0.00215514, "balance_loss_clip": 1.05056286, "balance_loss_mlp": 0.19219626, "epoch": 0.5616714264241696, "flos": 22345702421760.0, "grad_norm": 9.227247417284797, "language_loss": 0.88283855, "learning_rate": 1.6987340984772653e-06, "loss": 0.89797872, "num_input_tokens_seen": 201213715, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.2331543, "step": 9342, "time_per_iteration": 2.6636784076690674 }, { "auxiliary_loss_clip": 0.01288857, "auxiliary_loss_mlp": 0.00237781, "balance_loss_clip": 1.04450011, "balance_loss_mlp": 0.21390328, "epoch": 0.5617315496768375, "flos": 18807783408000.0, "grad_norm": 116.03168307505989, "language_loss": 0.8338185, "learning_rate": 1.6983490863613882e-06, "loss": 0.84908485, "num_input_tokens_seen": 201231415, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.2388916, "step": 9343, "time_per_iteration": 2.6479525566101074 }, { "auxiliary_loss_clip": 0.01263127, "auxiliary_loss_mlp": 0.00228576, "balance_loss_clip": 1.03089952, "balance_loss_mlp": 0.20429258, "epoch": 0.5617916729295055, "flos": 18369314087040.0, "grad_norm": 5.875772546650142, "language_loss": 0.79840708, "learning_rate": 1.6979640856841442e-06, "loss": 0.81332409, "num_input_tokens_seen": 201249625, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.24291992, "step": 9344, "time_per_iteration": 2.612426280975342 }, { "auxiliary_loss_clip": 0.01292307, "auxiliary_loss_mlp": 0.00232162, "balance_loss_clip": 1.04518127, "balance_loss_mlp": 0.20601973, "epoch": 0.5618517961821734, "flos": 28179964892160.0, "grad_norm": 13.00954602315971, "language_loss": 0.75532204, "learning_rate": 1.6975790964601318e-06, "loss": 0.7705667, "num_input_tokens_seen": 201271205, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.2611084, "step": 9345, "time_per_iteration": 2.7814781665802 }, { "auxiliary_loss_clip": 0.01284344, "auxiliary_loss_mlp": 0.0023239, "balance_loss_clip": 1.04780817, "balance_loss_mlp": 0.20982412, "epoch": 0.5619119194348414, "flos": 15486872411520.0, "grad_norm": 5.42174462750624, "language_loss": 0.95697564, "learning_rate": 1.6971941187039512e-06, "loss": 0.97214305, "num_input_tokens_seen": 201287700, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.22546387, "step": 9346, "time_per_iteration": 2.636953353881836 }, { "auxiliary_loss_clip": 0.01250567, "auxiliary_loss_mlp": 0.00233772, "balance_loss_clip": 1.01846075, "balance_loss_mlp": 0.20944095, "epoch": 0.5619720426875094, "flos": 29128652951040.0, "grad_norm": 20.821699197393112, "language_loss": 0.68796104, "learning_rate": 1.6968091524301993e-06, "loss": 0.70280445, "num_input_tokens_seen": 201307530, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.2434082, "step": 9347, "time_per_iteration": 4.273086071014404 }, { "auxiliary_loss_clip": 0.01295837, "auxiliary_loss_mlp": 0.00233165, "balance_loss_clip": 1.04946828, "balance_loss_mlp": 0.20774999, "epoch": 0.5620321659401774, "flos": 18003743418240.0, "grad_norm": 8283.129222456137, "language_loss": 0.80440664, "learning_rate": 1.6964241976534745e-06, "loss": 0.81969666, "num_input_tokens_seen": 201326210, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.25415039, "step": 9348, "time_per_iteration": 2.6602141857147217 }, { "auxiliary_loss_clip": 0.01279075, "auxiliary_loss_mlp": 0.00242797, "balance_loss_clip": 1.04148865, "balance_loss_mlp": 0.21832326, "epoch": 0.5620922891928454, "flos": 20594518657920.0, "grad_norm": 16.274033205383855, "language_loss": 0.88331479, "learning_rate": 1.6960392543883754e-06, "loss": 0.89853352, "num_input_tokens_seen": 201346120, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.24450684, "step": 9349, "time_per_iteration": 4.119148015975952 }, { "auxiliary_loss_clip": 0.01283785, "auxiliary_loss_mlp": 0.00242178, "balance_loss_clip": 1.04271579, "balance_loss_mlp": 0.21826501, "epoch": 0.5621524124455133, "flos": 26287006147200.0, "grad_norm": 15.103841265176285, "language_loss": 0.75146943, "learning_rate": 1.6956543226494975e-06, "loss": 0.76672906, "num_input_tokens_seen": 201365700, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.23913574, "step": 9350, "time_per_iteration": 2.6831510066986084 }, { "auxiliary_loss_clip": 0.01256968, "auxiliary_loss_mlp": 0.00233797, "balance_loss_clip": 1.01731777, "balance_loss_mlp": 0.20825069, "epoch": 0.5622125356981813, "flos": 12750299867520.0, "grad_norm": 6.812655920355498, "language_loss": 0.89105225, "learning_rate": 1.6952694024514381e-06, "loss": 0.90595996, "num_input_tokens_seen": 201382795, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.25549316, "step": 9351, "time_per_iteration": 4.058567523956299 }, { "auxiliary_loss_clip": 0.01286617, "auxiliary_loss_mlp": 0.00227508, "balance_loss_clip": 1.04391766, "balance_loss_mlp": 0.20242637, "epoch": 0.5622726589508492, "flos": 23805327490560.0, "grad_norm": 4.544561005761319, "language_loss": 0.64916372, "learning_rate": 1.6948844938087945e-06, "loss": 0.66430509, "num_input_tokens_seen": 201402780, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.25109863, "step": 9352, "time_per_iteration": 2.714797258377075 }, { "auxiliary_loss_clip": 0.01236025, "auxiliary_loss_mlp": 0.00188881, "balance_loss_clip": 1.01145864, "balance_loss_mlp": 0.16718528, "epoch": 0.5623327822035172, "flos": 24718212668160.0, "grad_norm": 9.197428449222343, "language_loss": 0.78704751, "learning_rate": 1.6944995967361604e-06, "loss": 0.80129659, "num_input_tokens_seen": 201424140, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.21679688, "step": 9353, "time_per_iteration": 2.706303596496582 }, { "auxiliary_loss_clip": 0.01268395, "auxiliary_loss_mlp": 0.00201363, "balance_loss_clip": 1.02866292, "balance_loss_mlp": 0.17885666, "epoch": 0.5623929054561851, "flos": 14019274523520.0, "grad_norm": 16.230433974024386, "language_loss": 0.88239563, "learning_rate": 1.6941147112481327e-06, "loss": 0.89709324, "num_input_tokens_seen": 201439645, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.22485352, "step": 9354, "time_per_iteration": 2.6452925205230713 }, { "auxiliary_loss_clip": 0.01278912, "auxiliary_loss_mlp": 0.00202119, "balance_loss_clip": 1.03440189, "balance_loss_mlp": 0.1785033, "epoch": 0.5624530287088532, "flos": 20704405340160.0, "grad_norm": 143.69321353992137, "language_loss": 0.82449448, "learning_rate": 1.6937298373593056e-06, "loss": 0.83930475, "num_input_tokens_seen": 201459970, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.23583984, "step": 9355, "time_per_iteration": 2.6713719367980957 }, { "auxiliary_loss_clip": 0.01242731, "auxiliary_loss_mlp": 0.00204069, "balance_loss_clip": 1.01189482, "balance_loss_mlp": 0.17974989, "epoch": 0.5625131519615211, "flos": 21470918595840.0, "grad_norm": 44.399075738094915, "language_loss": 0.79378754, "learning_rate": 1.693344975084274e-06, "loss": 0.80825555, "num_input_tokens_seen": 201480055, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.24291992, "step": 9356, "time_per_iteration": 2.6768174171447754 }, { "auxiliary_loss_clip": 0.01272571, "auxiliary_loss_mlp": 0.00186217, "balance_loss_clip": 1.03449416, "balance_loss_mlp": 0.16273269, "epoch": 0.5625732752141891, "flos": 18698004466560.0, "grad_norm": 18.03173965373499, "language_loss": 0.92254114, "learning_rate": 1.6929601244376318e-06, "loss": 0.93712914, "num_input_tokens_seen": 201497645, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.23474121, "step": 9357, "time_per_iteration": 2.620073080062866 }, { "auxiliary_loss_clip": 0.01263004, "auxiliary_loss_mlp": 0.00196716, "balance_loss_clip": 1.02906191, "balance_loss_mlp": 0.17575943, "epoch": 0.562633398466857, "flos": 16216900427520.0, "grad_norm": 3.6832055762126097, "language_loss": 0.81104612, "learning_rate": 1.6925752854339722e-06, "loss": 0.82564336, "num_input_tokens_seen": 201515455, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.2097168, "step": 9358, "time_per_iteration": 4.024397611618042 }, { "auxiliary_loss_clip": 0.01263075, "auxiliary_loss_mlp": 0.00190041, "balance_loss_clip": 1.0255518, "balance_loss_mlp": 0.16828549, "epoch": 0.562693521719525, "flos": 22491930689280.0, "grad_norm": 2.106289477644705, "language_loss": 0.85418761, "learning_rate": 1.6921904580878885e-06, "loss": 0.86871868, "num_input_tokens_seen": 201534500, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.2175293, "step": 9359, "time_per_iteration": 2.6565680503845215 }, { "auxiliary_loss_clip": 0.01256646, "auxiliary_loss_mlp": 0.00207524, "balance_loss_clip": 1.01678276, "balance_loss_mlp": 0.18397978, "epoch": 0.562753644972193, "flos": 25331171281920.0, "grad_norm": 37.37790572889734, "language_loss": 0.79110944, "learning_rate": 1.6918056424139736e-06, "loss": 0.80575109, "num_input_tokens_seen": 201553280, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.23535156, "step": 9360, "time_per_iteration": 2.68914794921875 }, { "auxiliary_loss_clip": 0.01301249, "auxiliary_loss_mlp": 0.00177423, "balance_loss_clip": 1.0920558, "balance_loss_mlp": 0.16860119, "epoch": 0.562813768224861, "flos": 67392622126080.0, "grad_norm": 0.758247490978468, "language_loss": 0.55086803, "learning_rate": 1.6914208384268197e-06, "loss": 0.56565475, "num_input_tokens_seen": 201610030, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.08837891, "step": 9361, "time_per_iteration": 3.0636749267578125 }, { "auxiliary_loss_clip": 0.01245527, "auxiliary_loss_mlp": 0.00191857, "balance_loss_clip": 1.01291656, "balance_loss_mlp": 0.170459, "epoch": 0.562873891477529, "flos": 23331163029120.0, "grad_norm": 6.593698565747468, "language_loss": 0.86419284, "learning_rate": 1.691036046141018e-06, "loss": 0.87856674, "num_input_tokens_seen": 201628370, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.21398926, "step": 9362, "time_per_iteration": 2.6486411094665527 }, { "auxiliary_loss_clip": 0.01275911, "auxiliary_loss_mlp": 0.00193253, "balance_loss_clip": 1.035429, "balance_loss_mlp": 0.16907749, "epoch": 0.5629340147301969, "flos": 38472824805120.0, "grad_norm": 15.997256174520711, "language_loss": 0.81826925, "learning_rate": 1.6906512655711614e-06, "loss": 0.8329609, "num_input_tokens_seen": 201649790, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.24157715, "step": 9363, "time_per_iteration": 2.802929639816284 }, { "auxiliary_loss_clip": 0.01244688, "auxiliary_loss_mlp": 0.00193527, "balance_loss_clip": 1.01035309, "balance_loss_mlp": 0.17125852, "epoch": 0.5629941379828649, "flos": 29242023252480.0, "grad_norm": 191.67297933992847, "language_loss": 0.90358019, "learning_rate": 1.690266496731839e-06, "loss": 0.91796231, "num_input_tokens_seen": 201669175, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.22253418, "step": 9364, "time_per_iteration": 2.7277536392211914 }, { "auxiliary_loss_clip": 0.01257558, "auxiliary_loss_mlp": 0.0018891, "balance_loss_clip": 1.02176785, "balance_loss_mlp": 0.16727391, "epoch": 0.5630542612355328, "flos": 19420885676160.0, "grad_norm": 16.55917899808996, "language_loss": 0.75235415, "learning_rate": 1.689881739637642e-06, "loss": 0.76681882, "num_input_tokens_seen": 201687000, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.21655273, "step": 9365, "time_per_iteration": 2.6439905166625977 }, { "auxiliary_loss_clip": 0.01267712, "auxiliary_loss_mlp": 0.00225257, "balance_loss_clip": 1.02364206, "balance_loss_mlp": 0.19881615, "epoch": 0.5631143844882008, "flos": 22266303408000.0, "grad_norm": 7.975158139653375, "language_loss": 0.91960788, "learning_rate": 1.6894969943031611e-06, "loss": 0.93453753, "num_input_tokens_seen": 201703335, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.2644043, "step": 9366, "time_per_iteration": 2.6442556381225586 }, { "auxiliary_loss_clip": 0.01260458, "auxiliary_loss_mlp": 0.00181895, "balance_loss_clip": 1.02638698, "balance_loss_mlp": 0.15987712, "epoch": 0.5631745077408687, "flos": 22965305051520.0, "grad_norm": 6.186785044332759, "language_loss": 0.81477416, "learning_rate": 1.6891122607429845e-06, "loss": 0.8291977, "num_input_tokens_seen": 201723495, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.2199707, "step": 9367, "time_per_iteration": 2.8191027641296387 }, { "auxiliary_loss_clip": 0.01328546, "auxiliary_loss_mlp": 0.00105313, "balance_loss_clip": 1.11730814, "balance_loss_mlp": 0.09839927, "epoch": 0.5632346309935368, "flos": 65080515576960.0, "grad_norm": 0.8682981111096842, "language_loss": 0.52903414, "learning_rate": 1.6887275389717028e-06, "loss": 0.54337275, "num_input_tokens_seen": 201792615, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.06933594, "step": 9368, "time_per_iteration": 3.299403190612793 }, { "auxiliary_loss_clip": 0.01241153, "auxiliary_loss_mlp": 0.00212293, "balance_loss_clip": 1.01105404, "balance_loss_mlp": 0.18969089, "epoch": 0.5632947542462047, "flos": 23002903612800.0, "grad_norm": 7.179947697408982, "language_loss": 0.762016, "learning_rate": 1.6883428290039046e-06, "loss": 0.77655041, "num_input_tokens_seen": 201812520, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.22619629, "step": 9369, "time_per_iteration": 2.6865692138671875 }, { "auxiliary_loss_clip": 0.01253688, "auxiliary_loss_mlp": 0.00191394, "balance_loss_clip": 1.01805139, "balance_loss_mlp": 0.16582322, "epoch": 0.5633548774988727, "flos": 30482593228800.0, "grad_norm": 40.18572690283628, "language_loss": 0.83168149, "learning_rate": 1.6879581308541763e-06, "loss": 0.84613228, "num_input_tokens_seen": 201834185, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.25585938, "step": 9370, "time_per_iteration": 2.8011395931243896 }, { "auxiliary_loss_clip": 0.01257939, "auxiliary_loss_mlp": 0.00215241, "balance_loss_clip": 1.02186251, "balance_loss_mlp": 0.18770361, "epoch": 0.5634150007515406, "flos": 18515039564160.0, "grad_norm": 11.56863364341377, "language_loss": 0.85781837, "learning_rate": 1.687573444537108e-06, "loss": 0.87255013, "num_input_tokens_seen": 201851305, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.27502441, "step": 9371, "time_per_iteration": 2.615680694580078 }, { "auxiliary_loss_clip": 0.01239026, "auxiliary_loss_mlp": 0.00166695, "balance_loss_clip": 1.00930715, "balance_loss_mlp": 0.14316307, "epoch": 0.5634751240042086, "flos": 19244672530560.0, "grad_norm": 12.809551048826341, "language_loss": 0.84611315, "learning_rate": 1.687188770067285e-06, "loss": 0.86017036, "num_input_tokens_seen": 201870350, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.23510742, "step": 9372, "time_per_iteration": 2.644272565841675 }, { "auxiliary_loss_clip": 0.01273331, "auxiliary_loss_mlp": 0.0020874, "balance_loss_clip": 1.03705382, "balance_loss_mlp": 0.1845044, "epoch": 0.5635352472568766, "flos": 12020630987520.0, "grad_norm": 7.180350914273434, "language_loss": 0.82371873, "learning_rate": 1.6868041074592956e-06, "loss": 0.83853948, "num_input_tokens_seen": 201886800, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.24267578, "step": 9373, "time_per_iteration": 2.647299289703369 }, { "auxiliary_loss_clip": 0.01281907, "auxiliary_loss_mlp": 0.00207512, "balance_loss_clip": 1.04092252, "balance_loss_mlp": 0.18047497, "epoch": 0.5635953705095446, "flos": 21871645701120.0, "grad_norm": 36.607510632854364, "language_loss": 0.92688566, "learning_rate": 1.6864194567277264e-06, "loss": 0.94177985, "num_input_tokens_seen": 201904730, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.27050781, "step": 9374, "time_per_iteration": 2.662726402282715 }, { "auxiliary_loss_clip": 0.01238636, "auxiliary_loss_mlp": 0.00175473, "balance_loss_clip": 1.01040864, "balance_loss_mlp": 0.15243012, "epoch": 0.5636554937622126, "flos": 27126166659840.0, "grad_norm": 142.74874946569474, "language_loss": 0.74423593, "learning_rate": 1.6860348178871618e-06, "loss": 0.75837702, "num_input_tokens_seen": 201924850, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.23010254, "step": 9375, "time_per_iteration": 2.692528009414673 }, { "auxiliary_loss_clip": 0.01281244, "auxiliary_loss_mlp": 0.00181366, "balance_loss_clip": 1.03812218, "balance_loss_mlp": 0.15680844, "epoch": 0.5637156170148805, "flos": 12926405272320.0, "grad_norm": 18.45867363681506, "language_loss": 0.9008683, "learning_rate": 1.6856501909521889e-06, "loss": 0.91549438, "num_input_tokens_seen": 201939500, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.2454834, "step": 9376, "time_per_iteration": 2.6400399208068848 }, { "auxiliary_loss_clip": 0.01267666, "auxiliary_loss_mlp": 0.00230465, "balance_loss_clip": 1.0280937, "balance_loss_mlp": 0.2057295, "epoch": 0.5637757402675485, "flos": 45551033130240.0, "grad_norm": 3.9705086160514043, "language_loss": 0.7535426, "learning_rate": 1.6852655759373925e-06, "loss": 0.76852393, "num_input_tokens_seen": 201963000, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.24743652, "step": 9377, "time_per_iteration": 2.882615804672241 }, { "auxiliary_loss_clip": 0.01288528, "auxiliary_loss_mlp": 0.00178049, "balance_loss_clip": 1.04627633, "balance_loss_mlp": 0.15412346, "epoch": 0.5638358635202164, "flos": 20886041439360.0, "grad_norm": 6.555032631286302, "language_loss": 0.80368447, "learning_rate": 1.6848809728573565e-06, "loss": 0.8183502, "num_input_tokens_seen": 201983145, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.23937988, "step": 9378, "time_per_iteration": 2.7459676265716553 }, { "auxiliary_loss_clip": 0.01281496, "auxiliary_loss_mlp": 0.00231335, "balance_loss_clip": 1.03587401, "balance_loss_mlp": 0.20489481, "epoch": 0.5638959867728844, "flos": 18806562345600.0, "grad_norm": 19.947050309716797, "language_loss": 0.92582643, "learning_rate": 1.6844963817266656e-06, "loss": 0.94095469, "num_input_tokens_seen": 202000335, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.26464844, "step": 9379, "time_per_iteration": 2.60707688331604 }, { "auxiliary_loss_clip": 0.01274982, "auxiliary_loss_mlp": 0.00199804, "balance_loss_clip": 1.03699112, "balance_loss_mlp": 0.17531815, "epoch": 0.5639561100255523, "flos": 27490336698240.0, "grad_norm": 7.993554774905916, "language_loss": 0.81209373, "learning_rate": 1.6841118025599042e-06, "loss": 0.82684159, "num_input_tokens_seen": 202018275, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.24487305, "step": 9380, "time_per_iteration": 2.6758649349212646 }, { "auxiliary_loss_clip": 0.01292239, "auxiliary_loss_mlp": 0.00204396, "balance_loss_clip": 1.04711771, "balance_loss_mlp": 0.17956477, "epoch": 0.5640162332782204, "flos": 18076570243200.0, "grad_norm": 24.970826691208035, "language_loss": 0.85261309, "learning_rate": 1.6837272353716542e-06, "loss": 0.86757946, "num_input_tokens_seen": 202034330, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.24841309, "step": 9381, "time_per_iteration": 2.613896131515503 }, { "auxiliary_loss_clip": 0.01294565, "auxiliary_loss_mlp": 0.00218236, "balance_loss_clip": 1.04807639, "balance_loss_mlp": 0.19323818, "epoch": 0.5640763565308883, "flos": 20884856290560.0, "grad_norm": 57.224791283181084, "language_loss": 0.81130731, "learning_rate": 1.683342680176499e-06, "loss": 0.82643533, "num_input_tokens_seen": 202053100, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.24975586, "step": 9382, "time_per_iteration": 2.679006814956665 }, { "auxiliary_loss_clip": 0.01357512, "auxiliary_loss_mlp": 0.00079857, "balance_loss_clip": 1.16015303, "balance_loss_mlp": 0.0703201, "epoch": 0.5641364797835563, "flos": 64447912224000.0, "grad_norm": 0.6919816516151311, "language_loss": 0.54023123, "learning_rate": 1.682958136989022e-06, "loss": 0.55460495, "num_input_tokens_seen": 202120125, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.09521484, "step": 9383, "time_per_iteration": 3.297159433364868 }, { "auxiliary_loss_clip": 0.01274791, "auxiliary_loss_mlp": 0.00238942, "balance_loss_clip": 1.03458238, "balance_loss_mlp": 0.21350275, "epoch": 0.5641966030362242, "flos": 18660944609280.0, "grad_norm": 4.200652438404635, "language_loss": 0.77298415, "learning_rate": 1.6825736058238033e-06, "loss": 0.78812146, "num_input_tokens_seen": 202138030, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.25439453, "step": 9384, "time_per_iteration": 2.71553897857666 }, { "auxiliary_loss_clip": 0.01288196, "auxiliary_loss_mlp": 0.00219989, "balance_loss_clip": 1.04399443, "balance_loss_mlp": 0.19460912, "epoch": 0.5642567262888922, "flos": 22492325738880.0, "grad_norm": 45.019264739489586, "language_loss": 0.81685507, "learning_rate": 1.6821890866954263e-06, "loss": 0.8319369, "num_input_tokens_seen": 202155580, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.25390625, "step": 9385, "time_per_iteration": 2.672147750854492 }, { "auxiliary_loss_clip": 0.01267535, "auxiliary_loss_mlp": 0.00214877, "balance_loss_clip": 1.03122473, "balance_loss_mlp": 0.19101122, "epoch": 0.5643168495415603, "flos": 13003972692480.0, "grad_norm": 44.0681305311584, "language_loss": 0.91938007, "learning_rate": 1.6818045796184703e-06, "loss": 0.93420422, "num_input_tokens_seen": 202170365, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.23876953, "step": 9386, "time_per_iteration": 2.6235334873199463 }, { "auxiliary_loss_clip": 0.01286144, "auxiliary_loss_mlp": 0.00238349, "balance_loss_clip": 1.04428577, "balance_loss_mlp": 0.21119297, "epoch": 0.5643769727942282, "flos": 18588297352320.0, "grad_norm": 14.796624400810662, "language_loss": 0.79035026, "learning_rate": 1.681420084607516e-06, "loss": 0.80559516, "num_input_tokens_seen": 202189095, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.27197266, "step": 9387, "time_per_iteration": 2.651008367538452 }, { "auxiliary_loss_clip": 0.01290708, "auxiliary_loss_mlp": 0.00227602, "balance_loss_clip": 1.04537916, "balance_loss_mlp": 0.20147142, "epoch": 0.5644370960468962, "flos": 33806269572480.0, "grad_norm": 8.845360013977434, "language_loss": 0.80469322, "learning_rate": 1.6810356016771452e-06, "loss": 0.81987637, "num_input_tokens_seen": 202213500, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.26147461, "step": 9388, "time_per_iteration": 2.8031108379364014 }, { "auxiliary_loss_clip": 0.01244439, "auxiliary_loss_mlp": 0.00239425, "balance_loss_clip": 1.01494455, "balance_loss_mlp": 0.21640596, "epoch": 0.5644972192995641, "flos": 21214911386880.0, "grad_norm": 7.604097762177784, "language_loss": 0.88825774, "learning_rate": 1.6806511308419353e-06, "loss": 0.90309638, "num_input_tokens_seen": 202231920, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.23034668, "step": 9389, "time_per_iteration": 4.205033540725708 }, { "auxiliary_loss_clip": 0.01299722, "auxiliary_loss_mlp": 0.00222747, "balance_loss_clip": 1.05090344, "balance_loss_mlp": 0.19785586, "epoch": 0.5645573425522321, "flos": 18587722734720.0, "grad_norm": 8.18842808408754, "language_loss": 0.78395927, "learning_rate": 1.680266672116467e-06, "loss": 0.79918396, "num_input_tokens_seen": 202247600, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.24914551, "step": 9390, "time_per_iteration": 2.687814474105835 }, { "auxiliary_loss_clip": 0.01274441, "auxiliary_loss_mlp": 0.00221857, "balance_loss_clip": 1.03667057, "balance_loss_mlp": 0.19920695, "epoch": 0.5646174658049, "flos": 18113809668480.0, "grad_norm": 16.086095162141465, "language_loss": 0.99273401, "learning_rate": 1.6798822255153192e-06, "loss": 1.00769699, "num_input_tokens_seen": 202265350, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.2265625, "step": 9391, "time_per_iteration": 4.058152437210083 }, { "auxiliary_loss_clip": 0.01297491, "auxiliary_loss_mlp": 0.00247263, "balance_loss_clip": 1.04971886, "balance_loss_mlp": 0.2211915, "epoch": 0.564677589057568, "flos": 28329964087680.0, "grad_norm": 30.008088815856816, "language_loss": 0.69050515, "learning_rate": 1.6794977910530684e-06, "loss": 0.7059527, "num_input_tokens_seen": 202284285, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.26049805, "step": 9392, "time_per_iteration": 2.708153486251831 }, { "auxiliary_loss_clip": 0.01296938, "auxiliary_loss_mlp": 0.0023433, "balance_loss_clip": 1.04756761, "balance_loss_mlp": 0.20900974, "epoch": 0.564737712310236, "flos": 22163743100160.0, "grad_norm": 49.41380278594029, "language_loss": 0.91470754, "learning_rate": 1.6791133687442937e-06, "loss": 0.93002021, "num_input_tokens_seen": 202303450, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.2532959, "step": 9393, "time_per_iteration": 4.1152026653289795 }, { "auxiliary_loss_clip": 0.01292157, "auxiliary_loss_mlp": 0.00230104, "balance_loss_clip": 1.05157053, "balance_loss_mlp": 0.20628645, "epoch": 0.564797835562904, "flos": 20959011918720.0, "grad_norm": 8.328776759181045, "language_loss": 0.94179976, "learning_rate": 1.6787289586035725e-06, "loss": 0.95702231, "num_input_tokens_seen": 202322315, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.23815918, "step": 9394, "time_per_iteration": 2.6988818645477295 }, { "auxiliary_loss_clip": 0.01272907, "auxiliary_loss_mlp": 0.00229427, "balance_loss_clip": 1.03767538, "balance_loss_mlp": 0.20546573, "epoch": 0.5648579588155719, "flos": 17420302805760.0, "grad_norm": 3.1516566631690113, "language_loss": 0.91135454, "learning_rate": 1.6783445606454814e-06, "loss": 0.92637789, "num_input_tokens_seen": 202339905, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.23986816, "step": 9395, "time_per_iteration": 2.607619524002075 }, { "auxiliary_loss_clip": 0.01395207, "auxiliary_loss_mlp": 0.00091371, "balance_loss_clip": 1.19881523, "balance_loss_mlp": 0.08183399, "epoch": 0.5649180820682399, "flos": 69929568835200.0, "grad_norm": 0.8716263030843595, "language_loss": 0.57606399, "learning_rate": 1.677960174884597e-06, "loss": 0.59092975, "num_input_tokens_seen": 202397320, "router_z_loss_clip": 1.96875, "router_z_loss_mlp": 0.09521484, "step": 9396, "time_per_iteration": 3.1884469985961914 }, { "auxiliary_loss_clip": 0.01279039, "auxiliary_loss_mlp": 0.00253007, "balance_loss_clip": 1.04056144, "balance_loss_mlp": 0.22592297, "epoch": 0.5649782053209078, "flos": 24973070641920.0, "grad_norm": 3.24943818158317, "language_loss": 0.78440988, "learning_rate": 1.6775758013354943e-06, "loss": 0.79973036, "num_input_tokens_seen": 202416865, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.27099609, "step": 9397, "time_per_iteration": 2.686253547668457 }, { "auxiliary_loss_clip": 0.01258214, "auxiliary_loss_mlp": 0.00263778, "balance_loss_clip": 1.02453685, "balance_loss_mlp": 0.23897065, "epoch": 0.5650383285735758, "flos": 21726602582400.0, "grad_norm": 9.16493468533091, "language_loss": 0.76811236, "learning_rate": 1.67719144001275e-06, "loss": 0.78333223, "num_input_tokens_seen": 202436210, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.24816895, "step": 9398, "time_per_iteration": 2.693168878555298 }, { "auxiliary_loss_clip": 0.0141854, "auxiliary_loss_mlp": 0.00134814, "balance_loss_clip": 1.21753216, "balance_loss_mlp": 0.12213035, "epoch": 0.5650984518262439, "flos": 65904484636800.0, "grad_norm": 0.7580233559961177, "language_loss": 0.57255769, "learning_rate": 1.6768070909309386e-06, "loss": 0.58809125, "num_input_tokens_seen": 202492925, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.12695312, "step": 9399, "time_per_iteration": 3.0713977813720703 }, { "auxiliary_loss_clip": 0.01251901, "auxiliary_loss_mlp": 0.00257104, "balance_loss_clip": 1.01900721, "balance_loss_mlp": 0.23060372, "epoch": 0.5651585750789118, "flos": 21032592929280.0, "grad_norm": 585.1365714280007, "language_loss": 0.81883758, "learning_rate": 1.6764227541046347e-06, "loss": 0.83392763, "num_input_tokens_seen": 202511905, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.26513672, "step": 9400, "time_per_iteration": 4.02965784072876 }, { "auxiliary_loss_clip": 0.01301005, "auxiliary_loss_mlp": 0.00251953, "balance_loss_clip": 1.05402112, "balance_loss_mlp": 0.22330683, "epoch": 0.5652186983315798, "flos": 18551919853440.0, "grad_norm": 8.30727261742925, "language_loss": 0.70364535, "learning_rate": 1.676038429548412e-06, "loss": 0.71917492, "num_input_tokens_seen": 202529815, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.28662109, "step": 9401, "time_per_iteration": 2.6375527381896973 }, { "auxiliary_loss_clip": 0.01284553, "auxiliary_loss_mlp": 0.00252391, "balance_loss_clip": 1.04805279, "balance_loss_mlp": 0.22705877, "epoch": 0.5652788215842477, "flos": 18478662065280.0, "grad_norm": 17.687245936443407, "language_loss": 0.88262653, "learning_rate": 1.6756541172768453e-06, "loss": 0.89799601, "num_input_tokens_seen": 202547710, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.2532959, "step": 9402, "time_per_iteration": 2.623420476913452 }, { "auxiliary_loss_clip": 0.012717, "auxiliary_loss_mlp": 0.00264391, "balance_loss_clip": 1.03586173, "balance_loss_mlp": 0.24034598, "epoch": 0.5653389448369157, "flos": 30044052080640.0, "grad_norm": 3.821402997174246, "language_loss": 0.83348989, "learning_rate": 1.6752698173045068e-06, "loss": 0.84885073, "num_input_tokens_seen": 202568835, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.24047852, "step": 9403, "time_per_iteration": 2.702528715133667 }, { "auxiliary_loss_clip": 0.01276494, "auxiliary_loss_mlp": 0.00266521, "balance_loss_clip": 1.03926992, "balance_loss_mlp": 0.24086678, "epoch": 0.5653990680895836, "flos": 16727550128640.0, "grad_norm": 148.855513159024, "language_loss": 0.77008677, "learning_rate": 1.6748855296459685e-06, "loss": 0.78551686, "num_input_tokens_seen": 202587385, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.25646973, "step": 9404, "time_per_iteration": 2.6391890048980713 }, { "auxiliary_loss_clip": 0.01274255, "auxiliary_loss_mlp": 0.00244104, "balance_loss_clip": 1.0437609, "balance_loss_mlp": 0.21984504, "epoch": 0.5654591913422516, "flos": 14538256179840.0, "grad_norm": 11.777987163881482, "language_loss": 0.74376756, "learning_rate": 1.6745012543158045e-06, "loss": 0.75895119, "num_input_tokens_seen": 202604815, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.24255371, "step": 9405, "time_per_iteration": 2.6124746799468994 }, { "auxiliary_loss_clip": 0.01265336, "auxiliary_loss_mlp": 0.00226565, "balance_loss_clip": 1.03707755, "balance_loss_mlp": 0.20461817, "epoch": 0.5655193145949196, "flos": 26209905603840.0, "grad_norm": 224.34321790555146, "language_loss": 0.80255657, "learning_rate": 1.6741169913285852e-06, "loss": 0.81747568, "num_input_tokens_seen": 202623775, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.21948242, "step": 9406, "time_per_iteration": 2.6732428073883057 }, { "auxiliary_loss_clip": 0.01268701, "auxiliary_loss_mlp": 0.00266206, "balance_loss_clip": 1.03466296, "balance_loss_mlp": 0.24082646, "epoch": 0.5655794378475876, "flos": 25046579825280.0, "grad_norm": 8.767253534421368, "language_loss": 0.87785304, "learning_rate": 1.673732740698882e-06, "loss": 0.89320213, "num_input_tokens_seen": 202643375, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.25366211, "step": 9407, "time_per_iteration": 2.6691091060638428 }, { "auxiliary_loss_clip": 0.01311196, "auxiliary_loss_mlp": 0.00227607, "balance_loss_clip": 1.06716645, "balance_loss_mlp": 0.20248917, "epoch": 0.5656395611002555, "flos": 31032852652800.0, "grad_norm": 76.9758127854083, "language_loss": 0.78170252, "learning_rate": 1.6733485024412666e-06, "loss": 0.79709053, "num_input_tokens_seen": 202668400, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.25109863, "step": 9408, "time_per_iteration": 2.7607624530792236 }, { "auxiliary_loss_clip": 0.01300895, "auxiliary_loss_mlp": 0.00238835, "balance_loss_clip": 1.0645833, "balance_loss_mlp": 0.2146951, "epoch": 0.5656996843529235, "flos": 20229522606720.0, "grad_norm": 26.117029671547346, "language_loss": 0.90007973, "learning_rate": 1.672964276570308e-06, "loss": 0.91547704, "num_input_tokens_seen": 202685125, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.24145508, "step": 9409, "time_per_iteration": 2.6883575916290283 }, { "auxiliary_loss_clip": 0.01277227, "auxiliary_loss_mlp": 0.00273686, "balance_loss_clip": 1.03670895, "balance_loss_mlp": 0.24824689, "epoch": 0.5657598076055914, "flos": 20996251344000.0, "grad_norm": 6.508517615239209, "language_loss": 0.86218327, "learning_rate": 1.6725800631005776e-06, "loss": 0.87769246, "num_input_tokens_seen": 202703830, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.2545166, "step": 9410, "time_per_iteration": 2.6588029861450195 }, { "auxiliary_loss_clip": 0.01293852, "auxiliary_loss_mlp": 0.00241982, "balance_loss_clip": 1.05189204, "balance_loss_mlp": 0.21780616, "epoch": 0.5658199308582594, "flos": 11545999649280.0, "grad_norm": 5.53411480481222, "language_loss": 0.91902059, "learning_rate": 1.6721958620466432e-06, "loss": 0.93437898, "num_input_tokens_seen": 202719835, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.24169922, "step": 9411, "time_per_iteration": 2.606736660003662 }, { "auxiliary_loss_clip": 0.01294567, "auxiliary_loss_mlp": 0.00269361, "balance_loss_clip": 1.05186081, "balance_loss_mlp": 0.2416213, "epoch": 0.5658800541109275, "flos": 14172146807040.0, "grad_norm": 6.937355624727228, "language_loss": 0.79223454, "learning_rate": 1.6718116734230749e-06, "loss": 0.80787379, "num_input_tokens_seen": 202736795, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.27722168, "step": 9412, "time_per_iteration": 2.695753812789917 }, { "auxiliary_loss_clip": 0.01288793, "auxiliary_loss_mlp": 0.00244771, "balance_loss_clip": 1.05150306, "balance_loss_mlp": 0.22059503, "epoch": 0.5659401773635954, "flos": 27305073325440.0, "grad_norm": 38.94297371143048, "language_loss": 0.6567654, "learning_rate": 1.6714274972444413e-06, "loss": 0.67210102, "num_input_tokens_seen": 202756900, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.24194336, "step": 9413, "time_per_iteration": 2.8388662338256836 }, { "auxiliary_loss_clip": 0.01287742, "auxiliary_loss_mlp": 0.00234466, "balance_loss_clip": 1.05135417, "balance_loss_mlp": 0.20987295, "epoch": 0.5660003006162634, "flos": 16728196573440.0, "grad_norm": 527.0083658371486, "language_loss": 0.7717663, "learning_rate": 1.6710433335253092e-06, "loss": 0.78698838, "num_input_tokens_seen": 202775145, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.24621582, "step": 9414, "time_per_iteration": 2.6344704627990723 }, { "auxiliary_loss_clip": 0.01261083, "auxiliary_loss_mlp": 0.00232838, "balance_loss_clip": 1.02849483, "balance_loss_mlp": 0.20943755, "epoch": 0.5660604238689313, "flos": 21653452535040.0, "grad_norm": 83.71922549429587, "language_loss": 0.84369254, "learning_rate": 1.670659182280247e-06, "loss": 0.85863173, "num_input_tokens_seen": 202794505, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.23413086, "step": 9415, "time_per_iteration": 2.698840856552124 }, { "auxiliary_loss_clip": 0.01420465, "auxiliary_loss_mlp": 0.00147786, "balance_loss_clip": 1.21895397, "balance_loss_mlp": 0.13424401, "epoch": 0.5661205471215993, "flos": 68824022083200.0, "grad_norm": 0.6871304226846888, "language_loss": 0.48975101, "learning_rate": 1.670275043523822e-06, "loss": 0.50543356, "num_input_tokens_seen": 202858580, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.13574219, "step": 9416, "time_per_iteration": 3.2879676818847656 }, { "auxiliary_loss_clip": 0.01284683, "auxiliary_loss_mlp": 0.00236507, "balance_loss_clip": 1.04705656, "balance_loss_mlp": 0.21206921, "epoch": 0.5661806703742672, "flos": 28621774177920.0, "grad_norm": 5.646686034680271, "language_loss": 0.72522998, "learning_rate": 1.6698909172706e-06, "loss": 0.74044192, "num_input_tokens_seen": 202878565, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.2442627, "step": 9417, "time_per_iteration": 2.7214865684509277 }, { "auxiliary_loss_clip": 0.01264087, "auxiliary_loss_mlp": 0.00255499, "balance_loss_clip": 1.02831364, "balance_loss_mlp": 0.22983363, "epoch": 0.5662407936269352, "flos": 21397948116480.0, "grad_norm": 5.0086251830067665, "language_loss": 0.77808738, "learning_rate": 1.6695068035351479e-06, "loss": 0.79328328, "num_input_tokens_seen": 202897350, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.25671387, "step": 9418, "time_per_iteration": 2.637676954269409 }, { "auxiliary_loss_clip": 0.01286101, "auxiliary_loss_mlp": 0.00233599, "balance_loss_clip": 1.04357851, "balance_loss_mlp": 0.20985293, "epoch": 0.5663009168796032, "flos": 25660005315840.0, "grad_norm": 3.6942856181998094, "language_loss": 0.73864353, "learning_rate": 1.6691227023320304e-06, "loss": 0.75384045, "num_input_tokens_seen": 202916745, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.23730469, "step": 9419, "time_per_iteration": 2.6898815631866455 }, { "auxiliary_loss_clip": 0.01434115, "auxiliary_loss_mlp": 0.00101471, "balance_loss_clip": 1.23852873, "balance_loss_mlp": 0.09121904, "epoch": 0.5663610401322712, "flos": 67930458422400.0, "grad_norm": 2.8179447138051716, "language_loss": 0.59306335, "learning_rate": 1.6687386136758135e-06, "loss": 0.60841918, "num_input_tokens_seen": 202982375, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.10253906, "step": 9420, "time_per_iteration": 3.199526071548462 }, { "auxiliary_loss_clip": 0.01290937, "auxiliary_loss_mlp": 0.00252874, "balance_loss_clip": 1.05112958, "balance_loss_mlp": 0.22923522, "epoch": 0.5664211633849391, "flos": 24609367480320.0, "grad_norm": 25.170189238975418, "language_loss": 0.81410265, "learning_rate": 1.6683545375810618e-06, "loss": 0.82954073, "num_input_tokens_seen": 203002430, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.23632812, "step": 9421, "time_per_iteration": 2.705155611038208 }, { "auxiliary_loss_clip": 0.01285963, "auxiliary_loss_mlp": 0.00275204, "balance_loss_clip": 1.04286265, "balance_loss_mlp": 0.25046849, "epoch": 0.5664812866376071, "flos": 11648811352320.0, "grad_norm": 6.5968132317897235, "language_loss": 0.83163774, "learning_rate": 1.6679704740623389e-06, "loss": 0.84724939, "num_input_tokens_seen": 203019425, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.24755859, "step": 9422, "time_per_iteration": 2.6179187297821045 }, { "auxiliary_loss_clip": 0.01259858, "auxiliary_loss_mlp": 0.00247173, "balance_loss_clip": 1.0271976, "balance_loss_mlp": 0.22339037, "epoch": 0.566541409890275, "flos": 24643985212800.0, "grad_norm": 10.02476435827786, "language_loss": 0.88846588, "learning_rate": 1.6675864231342085e-06, "loss": 0.9035362, "num_input_tokens_seen": 203039035, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.23803711, "step": 9423, "time_per_iteration": 2.689324140548706 }, { "auxiliary_loss_clip": 0.01267365, "auxiliary_loss_mlp": 0.00254561, "balance_loss_clip": 1.03078818, "balance_loss_mlp": 0.2276911, "epoch": 0.566601533142943, "flos": 22270577126400.0, "grad_norm": 5.747776705425229, "language_loss": 0.8671931, "learning_rate": 1.6672023848112353e-06, "loss": 0.88241237, "num_input_tokens_seen": 203059320, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.26843262, "step": 9424, "time_per_iteration": 2.6395626068115234 }, { "auxiliary_loss_clip": 0.01296548, "auxiliary_loss_mlp": 0.00270338, "balance_loss_clip": 1.04884839, "balance_loss_mlp": 0.24137016, "epoch": 0.5666616563956111, "flos": 29971656218880.0, "grad_norm": 473.0918542253134, "language_loss": 0.87241375, "learning_rate": 1.6668183591079805e-06, "loss": 0.88808262, "num_input_tokens_seen": 203078490, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.28955078, "step": 9425, "time_per_iteration": 2.7122762203216553 }, { "auxiliary_loss_clip": 0.01285437, "auxiliary_loss_mlp": 0.00255481, "balance_loss_clip": 1.04550242, "balance_loss_mlp": 0.23037598, "epoch": 0.566721779648279, "flos": 17781456101760.0, "grad_norm": 12.898323813194322, "language_loss": 0.69544578, "learning_rate": 1.6664343460390064e-06, "loss": 0.71085489, "num_input_tokens_seen": 203096065, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.25085449, "step": 9426, "time_per_iteration": 2.589808225631714 }, { "auxiliary_loss_clip": 0.01306466, "auxiliary_loss_mlp": 0.00295766, "balance_loss_clip": 1.05995107, "balance_loss_mlp": 0.26765656, "epoch": 0.566781902900947, "flos": 21033490769280.0, "grad_norm": 38.035519278838855, "language_loss": 0.87789345, "learning_rate": 1.6660503456188764e-06, "loss": 0.89391583, "num_input_tokens_seen": 203115270, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.28149414, "step": 9427, "time_per_iteration": 2.633382797241211 }, { "auxiliary_loss_clip": 0.01284462, "auxiliary_loss_mlp": 0.00257358, "balance_loss_clip": 1.04864836, "balance_loss_mlp": 0.23368318, "epoch": 0.5668420261536149, "flos": 23148593176320.0, "grad_norm": 92.36659950356955, "language_loss": 0.91213995, "learning_rate": 1.6656663578621498e-06, "loss": 0.92755818, "num_input_tokens_seen": 203134290, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.23681641, "step": 9428, "time_per_iteration": 2.644094705581665 }, { "auxiliary_loss_clip": 0.0131405, "auxiliary_loss_mlp": 0.00251509, "balance_loss_clip": 1.06182432, "balance_loss_mlp": 0.22497274, "epoch": 0.5669021494062829, "flos": 22601601889920.0, "grad_norm": 17.02089494688732, "language_loss": 0.80254054, "learning_rate": 1.6652823827833886e-06, "loss": 0.81819618, "num_input_tokens_seen": 203152935, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.265625, "step": 9429, "time_per_iteration": 2.6706180572509766 }, { "auxiliary_loss_clip": 0.01308786, "auxiliary_loss_mlp": 0.0025795, "balance_loss_clip": 1.05915451, "balance_loss_mlp": 0.23088998, "epoch": 0.5669622726589508, "flos": 17381231786880.0, "grad_norm": 23.278686229077284, "language_loss": 0.84170532, "learning_rate": 1.6648984203971538e-06, "loss": 0.85737276, "num_input_tokens_seen": 203170110, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.27099609, "step": 9430, "time_per_iteration": 2.625060558319092 }, { "auxiliary_loss_clip": 0.01279955, "auxiliary_loss_mlp": 0.00249671, "balance_loss_clip": 1.04131937, "balance_loss_mlp": 0.22317103, "epoch": 0.5670223959116188, "flos": 18763253521920.0, "grad_norm": 43.988727916137314, "language_loss": 0.81272662, "learning_rate": 1.6645144707180032e-06, "loss": 0.8280229, "num_input_tokens_seen": 203188825, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.26513672, "step": 9431, "time_per_iteration": 4.124488830566406 }, { "auxiliary_loss_clip": 0.01303208, "auxiliary_loss_mlp": 0.00225027, "balance_loss_clip": 1.06144714, "balance_loss_mlp": 0.20253263, "epoch": 0.5670825191642868, "flos": 13553334276480.0, "grad_norm": 5.167696060828471, "language_loss": 0.78704053, "learning_rate": 1.6641305337604984e-06, "loss": 0.80232286, "num_input_tokens_seen": 203206860, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.22485352, "step": 9432, "time_per_iteration": 2.6269383430480957 }, { "auxiliary_loss_clip": 0.01267331, "auxiliary_loss_mlp": 0.00264477, "balance_loss_clip": 1.0319314, "balance_loss_mlp": 0.23918059, "epoch": 0.5671426424169548, "flos": 22054035985920.0, "grad_norm": 4.3311491395458654, "language_loss": 0.84252489, "learning_rate": 1.663746609539197e-06, "loss": 0.85784298, "num_input_tokens_seen": 203225625, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.25317383, "step": 9433, "time_per_iteration": 4.125555515289307 }, { "auxiliary_loss_clip": 0.01296876, "auxiliary_loss_mlp": 0.00279507, "balance_loss_clip": 1.0467093, "balance_loss_mlp": 0.25186267, "epoch": 0.5672027656696227, "flos": 21323972056320.0, "grad_norm": 24.772381571624972, "language_loss": 0.72664142, "learning_rate": 1.6633626980686582e-06, "loss": 0.74240524, "num_input_tokens_seen": 203242920, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.27648926, "step": 9434, "time_per_iteration": 2.661813974380493 }, { "auxiliary_loss_clip": 0.01283502, "auxiliary_loss_mlp": 0.00242695, "balance_loss_clip": 1.04531229, "balance_loss_mlp": 0.21928251, "epoch": 0.5672628889222907, "flos": 23514056104320.0, "grad_norm": 2.8176499140753064, "language_loss": 0.72435927, "learning_rate": 1.6629787993634399e-06, "loss": 0.73962122, "num_input_tokens_seen": 203261995, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.23413086, "step": 9435, "time_per_iteration": 4.23422384262085 }, { "auxiliary_loss_clip": 0.01274549, "auxiliary_loss_mlp": 0.0025386, "balance_loss_clip": 1.03867245, "balance_loss_mlp": 0.23045906, "epoch": 0.5673230121749586, "flos": 27121928855040.0, "grad_norm": 11.439708909348584, "language_loss": 0.76661015, "learning_rate": 1.6625949134380984e-06, "loss": 0.78189433, "num_input_tokens_seen": 203280670, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.23425293, "step": 9436, "time_per_iteration": 2.7549822330474854 }, { "auxiliary_loss_clip": 0.01300404, "auxiliary_loss_mlp": 0.00274982, "balance_loss_clip": 1.05279613, "balance_loss_mlp": 0.24897079, "epoch": 0.5673831354276266, "flos": 31141985149440.0, "grad_norm": 3.050647713736746, "language_loss": 0.79652268, "learning_rate": 1.6622110403071921e-06, "loss": 0.81227654, "num_input_tokens_seen": 203304800, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.26013184, "step": 9437, "time_per_iteration": 2.7605960369110107 }, { "auxiliary_loss_clip": 0.01305971, "auxiliary_loss_mlp": 0.00275279, "balance_loss_clip": 1.06174529, "balance_loss_mlp": 0.24522606, "epoch": 0.5674432586802945, "flos": 27673193859840.0, "grad_norm": 12.74565207869998, "language_loss": 0.68932235, "learning_rate": 1.661827179985277e-06, "loss": 0.70513487, "num_input_tokens_seen": 203324060, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.30053711, "step": 9438, "time_per_iteration": 2.727487802505493 }, { "auxiliary_loss_clip": 0.01306537, "auxiliary_loss_mlp": 0.0026222, "balance_loss_clip": 1.06224298, "balance_loss_mlp": 0.23511185, "epoch": 0.5675033819329626, "flos": 26615157822720.0, "grad_norm": 22.197134547497253, "language_loss": 0.82553124, "learning_rate": 1.661443332486909e-06, "loss": 0.84121883, "num_input_tokens_seen": 203344360, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.27124023, "step": 9439, "time_per_iteration": 2.6804862022399902 }, { "auxiliary_loss_clip": 0.01323936, "auxiliary_loss_mlp": 0.00266322, "balance_loss_clip": 1.07330894, "balance_loss_mlp": 0.23862964, "epoch": 0.5675635051856306, "flos": 19098372435840.0, "grad_norm": 4.364665980405295, "language_loss": 0.91962183, "learning_rate": 1.6610594978266438e-06, "loss": 0.9355244, "num_input_tokens_seen": 203362115, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.27697754, "step": 9440, "time_per_iteration": 2.627060890197754 }, { "auxiliary_loss_clip": 0.01306864, "auxiliary_loss_mlp": 0.0028581, "balance_loss_clip": 1.05807018, "balance_loss_mlp": 0.25911903, "epoch": 0.5676236284382985, "flos": 17566315591680.0, "grad_norm": 9.556778443669904, "language_loss": 0.83990049, "learning_rate": 1.6606756760190365e-06, "loss": 0.85582721, "num_input_tokens_seen": 203380550, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.26672363, "step": 9441, "time_per_iteration": 2.6065282821655273 }, { "auxiliary_loss_clip": 0.01302796, "auxiliary_loss_mlp": 0.00302321, "balance_loss_clip": 1.05736589, "balance_loss_mlp": 0.27539188, "epoch": 0.5676837516909665, "flos": 15954069634560.0, "grad_norm": 12.722921779656293, "language_loss": 0.90904641, "learning_rate": 1.6602918670786413e-06, "loss": 0.92509753, "num_input_tokens_seen": 203396590, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.26928711, "step": 9442, "time_per_iteration": 4.01578426361084 }, { "auxiliary_loss_clip": 0.01325176, "auxiliary_loss_mlp": 0.00269651, "balance_loss_clip": 1.07980442, "balance_loss_mlp": 0.24459356, "epoch": 0.5677438749436344, "flos": 18295912644480.0, "grad_norm": 16.772410941432707, "language_loss": 0.83406079, "learning_rate": 1.6599080710200126e-06, "loss": 0.85000908, "num_input_tokens_seen": 203414280, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.25085449, "step": 9443, "time_per_iteration": 2.6322555541992188 }, { "auxiliary_loss_clip": 0.01306634, "auxiliary_loss_mlp": 0.00332318, "balance_loss_clip": 1.06155348, "balance_loss_mlp": 0.30590081, "epoch": 0.5678039981963025, "flos": 17931311642880.0, "grad_norm": 5.033232119572463, "language_loss": 0.86400276, "learning_rate": 1.6595242878577046e-06, "loss": 0.88039225, "num_input_tokens_seen": 203433280, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.26428223, "step": 9444, "time_per_iteration": 2.6096367835998535 }, { "auxiliary_loss_clip": 0.0132288, "auxiliary_loss_mlp": 0.00276496, "balance_loss_clip": 1.07061458, "balance_loss_mlp": 0.25150961, "epoch": 0.5678641214489704, "flos": 19316350120320.0, "grad_norm": 41.09167978527035, "language_loss": 0.88463151, "learning_rate": 1.6591405176062687e-06, "loss": 0.90062535, "num_input_tokens_seen": 203449935, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.25, "step": 9445, "time_per_iteration": 2.631793260574341 }, { "auxiliary_loss_clip": 0.01328467, "auxiliary_loss_mlp": 0.00282521, "balance_loss_clip": 1.07623982, "balance_loss_mlp": 0.25532985, "epoch": 0.5679242447016384, "flos": 27751084502400.0, "grad_norm": 6.740058325569447, "language_loss": 0.76941204, "learning_rate": 1.658756760280259e-06, "loss": 0.78552192, "num_input_tokens_seen": 203473025, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.27172852, "step": 9446, "time_per_iteration": 2.752338171005249 }, { "auxiliary_loss_clip": 0.0132621, "auxiliary_loss_mlp": 0.00288208, "balance_loss_clip": 1.06813729, "balance_loss_mlp": 0.26003844, "epoch": 0.5679843679543063, "flos": 23769093646080.0, "grad_norm": 56.80437868370588, "language_loss": 0.8375262, "learning_rate": 1.6583730158942276e-06, "loss": 0.85367036, "num_input_tokens_seen": 203492895, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.28161621, "step": 9447, "time_per_iteration": 2.7215120792388916 }, { "auxiliary_loss_clip": 0.01310046, "auxiliary_loss_mlp": 0.00263464, "balance_loss_clip": 1.05808377, "balance_loss_mlp": 0.23751193, "epoch": 0.5680444912069743, "flos": 25591883172480.0, "grad_norm": 5.335723002563481, "language_loss": 0.85134315, "learning_rate": 1.657989284462725e-06, "loss": 0.86707819, "num_input_tokens_seen": 203513710, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.25976562, "step": 9448, "time_per_iteration": 2.6723532676696777 }, { "auxiliary_loss_clip": 0.01347236, "auxiliary_loss_mlp": 0.0033767, "balance_loss_clip": 1.08516836, "balance_loss_mlp": 0.3059485, "epoch": 0.5681046144596422, "flos": 23695799944320.0, "grad_norm": 14.972770433948968, "language_loss": 0.84621376, "learning_rate": 1.6576055660003038e-06, "loss": 0.86306286, "num_input_tokens_seen": 203531630, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.31738281, "step": 9449, "time_per_iteration": 2.7167389392852783 }, { "auxiliary_loss_clip": 0.0131387, "auxiliary_loss_mlp": 0.00257162, "balance_loss_clip": 1.06452107, "balance_loss_mlp": 0.23140061, "epoch": 0.5681647377123102, "flos": 28000770917760.0, "grad_norm": 7.158695909943339, "language_loss": 0.82618487, "learning_rate": 1.6572218605215128e-06, "loss": 0.84189522, "num_input_tokens_seen": 203551885, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.25769043, "step": 9450, "time_per_iteration": 2.690284490585327 }, { "auxiliary_loss_clip": 0.01277357, "auxiliary_loss_mlp": 0.00286799, "balance_loss_clip": 1.03609169, "balance_loss_mlp": 0.26245645, "epoch": 0.5682248609649782, "flos": 22747758330240.0, "grad_norm": 13.814686707503448, "language_loss": 0.76046687, "learning_rate": 1.6568381680409038e-06, "loss": 0.77610844, "num_input_tokens_seen": 203572250, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.2434082, "step": 9451, "time_per_iteration": 2.6433491706848145 }, { "auxiliary_loss_clip": 0.0131585, "auxiliary_loss_mlp": 0.00315703, "balance_loss_clip": 1.05810654, "balance_loss_mlp": 0.28499466, "epoch": 0.5682849842176462, "flos": 21288600138240.0, "grad_norm": 584.2010506434016, "language_loss": 0.82592309, "learning_rate": 1.656454488573026e-06, "loss": 0.84223866, "num_input_tokens_seen": 203590605, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.30712891, "step": 9452, "time_per_iteration": 2.647092580795288 }, { "auxiliary_loss_clip": 0.01317699, "auxiliary_loss_mlp": 0.0026385, "balance_loss_clip": 1.06562924, "balance_loss_mlp": 0.2395428, "epoch": 0.5683451074703142, "flos": 21141689512320.0, "grad_norm": 5.9806021926551916, "language_loss": 0.77959436, "learning_rate": 1.656070822132428e-06, "loss": 0.7954098, "num_input_tokens_seen": 203610080, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.24316406, "step": 9453, "time_per_iteration": 2.682722806930542 }, { "auxiliary_loss_clip": 0.01316107, "auxiliary_loss_mlp": 0.00290299, "balance_loss_clip": 1.06574893, "balance_loss_mlp": 0.26502705, "epoch": 0.5684052307229821, "flos": 22344481359360.0, "grad_norm": 6.536207055751518, "language_loss": 0.76158798, "learning_rate": 1.6556871687336592e-06, "loss": 0.77765203, "num_input_tokens_seen": 203630060, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.25305176, "step": 9454, "time_per_iteration": 2.694864511489868 }, { "auxiliary_loss_clip": 0.01310795, "auxiliary_loss_mlp": 0.00265781, "balance_loss_clip": 1.06218648, "balance_loss_mlp": 0.24109288, "epoch": 0.5684653539756501, "flos": 21798639308160.0, "grad_norm": 25.433528840832444, "language_loss": 0.71283185, "learning_rate": 1.6553035283912671e-06, "loss": 0.72859764, "num_input_tokens_seen": 203649065, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.24694824, "step": 9455, "time_per_iteration": 2.659844398498535 }, { "auxiliary_loss_clip": 0.01302446, "auxiliary_loss_mlp": 0.00272691, "balance_loss_clip": 1.04994452, "balance_loss_mlp": 0.24683467, "epoch": 0.568525477228318, "flos": 22999635475200.0, "grad_norm": 19.94370432969129, "language_loss": 0.80843806, "learning_rate": 1.6549199011198e-06, "loss": 0.82418942, "num_input_tokens_seen": 203667545, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.25842285, "step": 9456, "time_per_iteration": 2.6706864833831787 }, { "auxiliary_loss_clip": 0.01325053, "auxiliary_loss_mlp": 0.00263925, "balance_loss_clip": 1.07152903, "balance_loss_mlp": 0.23722225, "epoch": 0.568585600480986, "flos": 21392489249280.0, "grad_norm": 15.111640205889765, "language_loss": 0.83967823, "learning_rate": 1.6545362869338048e-06, "loss": 0.85556805, "num_input_tokens_seen": 203686025, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.26696777, "step": 9457, "time_per_iteration": 2.6673789024353027 }, { "auxiliary_loss_clip": 0.01304369, "auxiliary_loss_mlp": 0.00281216, "balance_loss_clip": 1.05604959, "balance_loss_mlp": 0.2554903, "epoch": 0.568645723733654, "flos": 30007351359360.0, "grad_norm": 25.115316143147936, "language_loss": 0.73518568, "learning_rate": 1.6541526858478285e-06, "loss": 0.75104153, "num_input_tokens_seen": 203705540, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.25744629, "step": 9458, "time_per_iteration": 2.7337021827697754 }, { "auxiliary_loss_clip": 0.01328941, "auxiliary_loss_mlp": 0.00282073, "balance_loss_clip": 1.07254362, "balance_loss_mlp": 0.25430959, "epoch": 0.568705846986322, "flos": 20412667077120.0, "grad_norm": 7.326732004179747, "language_loss": 0.75705838, "learning_rate": 1.6537690978764167e-06, "loss": 0.7731685, "num_input_tokens_seen": 203723670, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.27770996, "step": 9459, "time_per_iteration": 2.72263765335083 }, { "auxiliary_loss_clip": 0.01299973, "auxiliary_loss_mlp": 0.00278369, "balance_loss_clip": 1.04824793, "balance_loss_mlp": 0.25027108, "epoch": 0.5687659702389899, "flos": 17456752131840.0, "grad_norm": 121.47686369401714, "language_loss": 0.8633337, "learning_rate": 1.6533855230341155e-06, "loss": 0.87911713, "num_input_tokens_seen": 203739705, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.28112793, "step": 9460, "time_per_iteration": 2.801523447036743 }, { "auxiliary_loss_clip": 0.01290257, "auxiliary_loss_mlp": 0.00310331, "balance_loss_clip": 1.03771722, "balance_loss_mlp": 0.28094602, "epoch": 0.5688260934916579, "flos": 25406081095680.0, "grad_norm": 2.7884734577236596, "language_loss": 0.8004809, "learning_rate": 1.65300196133547e-06, "loss": 0.81648684, "num_input_tokens_seen": 203759000, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.29394531, "step": 9461, "time_per_iteration": 2.7450075149536133 }, { "auxiliary_loss_clip": 0.01316907, "auxiliary_loss_mlp": 0.00288091, "balance_loss_clip": 1.06316674, "balance_loss_mlp": 0.26099476, "epoch": 0.5688862167443258, "flos": 21608024808960.0, "grad_norm": 19.625152140584518, "language_loss": 0.78734565, "learning_rate": 1.6526184127950249e-06, "loss": 0.80339563, "num_input_tokens_seen": 203774295, "router_z_loss_clip": 2.5390625, "router_z_loss_mlp": 0.27111816, "step": 9462, "time_per_iteration": 2.6267518997192383 }, { "auxiliary_loss_clip": 0.0130065, "auxiliary_loss_mlp": 0.00249225, "balance_loss_clip": 1.054003, "balance_loss_mlp": 0.22633702, "epoch": 0.5689463399969938, "flos": 22418996123520.0, "grad_norm": 94.1876034502321, "language_loss": 0.79880142, "learning_rate": 1.6522348774273246e-06, "loss": 0.81430018, "num_input_tokens_seen": 203792710, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.22888184, "step": 9463, "time_per_iteration": 2.65032958984375 }, { "auxiliary_loss_clip": 0.0129141, "auxiliary_loss_mlp": 0.00290357, "balance_loss_clip": 1.04111147, "balance_loss_mlp": 0.26230741, "epoch": 0.5690064632496618, "flos": 18296810484480.0, "grad_norm": 49.21828255166001, "language_loss": 0.83404845, "learning_rate": 1.6518513552469123e-06, "loss": 0.84986609, "num_input_tokens_seen": 203811645, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.28063965, "step": 9464, "time_per_iteration": 2.6440815925598145 }, { "auxiliary_loss_clip": 0.01296681, "auxiliary_loss_mlp": 0.00237909, "balance_loss_clip": 1.04321289, "balance_loss_mlp": 0.21459207, "epoch": 0.5690665865023298, "flos": 21579260993280.0, "grad_norm": 11.180444025351122, "language_loss": 0.91059643, "learning_rate": 1.6514678462683312e-06, "loss": 0.92594236, "num_input_tokens_seen": 203830040, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.23303223, "step": 9465, "time_per_iteration": 2.660050630569458 }, { "auxiliary_loss_clip": 0.0129427, "auxiliary_loss_mlp": 0.00256649, "balance_loss_clip": 1.04480171, "balance_loss_mlp": 0.2328552, "epoch": 0.5691267097549978, "flos": 24421446501120.0, "grad_norm": 57.90247350703802, "language_loss": 0.80147874, "learning_rate": 1.651084350506125e-06, "loss": 0.81698787, "num_input_tokens_seen": 203851245, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.23803711, "step": 9466, "time_per_iteration": 2.67702317237854 }, { "auxiliary_loss_clip": 0.01405969, "auxiliary_loss_mlp": 0.00109686, "balance_loss_clip": 1.19092417, "balance_loss_mlp": 0.10134158, "epoch": 0.5691868330076657, "flos": 61657906199040.0, "grad_norm": 0.7074040969490105, "language_loss": 0.55030942, "learning_rate": 1.6507008679748343e-06, "loss": 0.56546593, "num_input_tokens_seen": 203916400, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.08349609, "step": 9467, "time_per_iteration": 3.1986613273620605 }, { "auxiliary_loss_clip": 0.01289586, "auxiliary_loss_mlp": 0.00280702, "balance_loss_clip": 1.04080665, "balance_loss_mlp": 0.25373673, "epoch": 0.5692469562603337, "flos": 21325193118720.0, "grad_norm": 31.565755790432682, "language_loss": 0.71478802, "learning_rate": 1.6503173986890023e-06, "loss": 0.73049086, "num_input_tokens_seen": 203935870, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.26953125, "step": 9468, "time_per_iteration": 2.641456127166748 }, { "auxiliary_loss_clip": 0.01261873, "auxiliary_loss_mlp": 0.00305541, "balance_loss_clip": 1.02219021, "balance_loss_mlp": 0.28071022, "epoch": 0.5693070795130016, "flos": 23367899664000.0, "grad_norm": 28.13912212400121, "language_loss": 0.85583103, "learning_rate": 1.64993394266317e-06, "loss": 0.87150514, "num_input_tokens_seen": 203954950, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.24841309, "step": 9469, "time_per_iteration": 2.6305646896362305 }, { "auxiliary_loss_clip": 0.01310494, "auxiliary_loss_mlp": 0.00306173, "balance_loss_clip": 1.05125451, "balance_loss_mlp": 0.27668077, "epoch": 0.5693672027656697, "flos": 18697250280960.0, "grad_norm": 77.49361269948493, "language_loss": 0.79859614, "learning_rate": 1.6495504999118769e-06, "loss": 0.81476283, "num_input_tokens_seen": 203972715, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.2947998, "step": 9470, "time_per_iteration": 2.6173484325408936 }, { "auxiliary_loss_clip": 0.01305868, "auxiliary_loss_mlp": 0.00301514, "balance_loss_clip": 1.05031204, "balance_loss_mlp": 0.27593142, "epoch": 0.5694273260183376, "flos": 20449188230400.0, "grad_norm": 19.858114232308477, "language_loss": 0.81123686, "learning_rate": 1.6491670704496644e-06, "loss": 0.82731068, "num_input_tokens_seen": 203990775, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.25585938, "step": 9471, "time_per_iteration": 2.634957790374756 }, { "auxiliary_loss_clip": 0.01308141, "auxiliary_loss_mlp": 0.00285926, "balance_loss_clip": 1.0525775, "balance_loss_mlp": 0.26196447, "epoch": 0.5694874492710056, "flos": 17603195880960.0, "grad_norm": 110.14699795855836, "language_loss": 0.66684163, "learning_rate": 1.6487836542910716e-06, "loss": 0.68278229, "num_input_tokens_seen": 204008845, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.23974609, "step": 9472, "time_per_iteration": 2.6313059329986572 }, { "auxiliary_loss_clip": 0.01319459, "auxiliary_loss_mlp": 0.00300304, "balance_loss_clip": 1.06572008, "balance_loss_mlp": 0.27499565, "epoch": 0.5695475725236735, "flos": 13370836250880.0, "grad_norm": 15.70141945782699, "language_loss": 0.81522512, "learning_rate": 1.648400251450638e-06, "loss": 0.83142275, "num_input_tokens_seen": 204023755, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.25317383, "step": 9473, "time_per_iteration": 2.660992383956909 }, { "auxiliary_loss_clip": 0.01416715, "auxiliary_loss_mlp": 0.00091686, "balance_loss_clip": 1.21103048, "balance_loss_mlp": 0.08119577, "epoch": 0.5696076957763415, "flos": 68174398661760.0, "grad_norm": 0.6430597559514286, "language_loss": 0.56895971, "learning_rate": 1.6480168619429023e-06, "loss": 0.58404368, "num_input_tokens_seen": 204091255, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.10498047, "step": 9474, "time_per_iteration": 4.6294004917144775 }, { "auxiliary_loss_clip": 0.01294932, "auxiliary_loss_mlp": 0.00348154, "balance_loss_clip": 1.04890513, "balance_loss_mlp": 0.32166529, "epoch": 0.5696678190290094, "flos": 33838301525760.0, "grad_norm": 6.543325823096282, "language_loss": 0.59244287, "learning_rate": 1.6476334857824017e-06, "loss": 0.60887372, "num_input_tokens_seen": 204113285, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.26489258, "step": 9475, "time_per_iteration": 4.326059103012085 }, { "auxiliary_loss_clip": 0.01324683, "auxiliary_loss_mlp": 0.00352173, "balance_loss_clip": 1.07081735, "balance_loss_mlp": 0.326078, "epoch": 0.5697279422816774, "flos": 26356600748160.0, "grad_norm": 2255.4122817241014, "language_loss": 0.85806334, "learning_rate": 1.647250122983675e-06, "loss": 0.87483191, "num_input_tokens_seen": 204133045, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.26086426, "step": 9476, "time_per_iteration": 2.712709903717041 }, { "auxiliary_loss_clip": 0.0135273, "auxiliary_loss_mlp": 0.00342772, "balance_loss_clip": 1.08844507, "balance_loss_mlp": 0.3174516, "epoch": 0.5697880655343454, "flos": 22930507751040.0, "grad_norm": 78.07367177482436, "language_loss": 0.77452779, "learning_rate": 1.6468667735612592e-06, "loss": 0.79148287, "num_input_tokens_seen": 204152590, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.25341797, "step": 9477, "time_per_iteration": 4.123754262924194 }, { "auxiliary_loss_clip": 0.01341476, "auxiliary_loss_mlp": 0.00317424, "balance_loss_clip": 1.08028007, "balance_loss_mlp": 0.29156792, "epoch": 0.5698481887870134, "flos": 26761314263040.0, "grad_norm": 4.462141868883917, "language_loss": 0.77207267, "learning_rate": 1.6464834375296906e-06, "loss": 0.78866166, "num_input_tokens_seen": 204171815, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.25866699, "step": 9478, "time_per_iteration": 2.722459554672241 }, { "auxiliary_loss_clip": 0.01292244, "auxiliary_loss_mlp": 0.0038331, "balance_loss_clip": 1.05165339, "balance_loss_mlp": 0.35799026, "epoch": 0.5699083120396814, "flos": 15742269089280.0, "grad_norm": 15.120795227138883, "language_loss": 0.74234486, "learning_rate": 1.6461001149035055e-06, "loss": 0.75910044, "num_input_tokens_seen": 204188535, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.25341797, "step": 9479, "time_per_iteration": 2.6676619052886963 }, { "auxiliary_loss_clip": 0.01326607, "auxiliary_loss_mlp": 0.0039282, "balance_loss_clip": 1.07239223, "balance_loss_mlp": 0.36567605, "epoch": 0.5699684352923493, "flos": 19537272720000.0, "grad_norm": 225.63175567572338, "language_loss": 0.7866562, "learning_rate": 1.6457168056972392e-06, "loss": 0.80385041, "num_input_tokens_seen": 204208365, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.27160645, "step": 9480, "time_per_iteration": 2.6956722736358643 }, { "auxiliary_loss_clip": 0.0134784, "auxiliary_loss_mlp": 0.00355464, "balance_loss_clip": 1.09159875, "balance_loss_mlp": 0.32985812, "epoch": 0.5700285585450173, "flos": 16253349753600.0, "grad_norm": 76.8150939361463, "language_loss": 0.81486166, "learning_rate": 1.6453335099254276e-06, "loss": 0.8318947, "num_input_tokens_seen": 204226560, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.25622559, "step": 9481, "time_per_iteration": 2.625170946121216 }, { "auxiliary_loss_clip": 0.01354189, "auxiliary_loss_mlp": 0.00401452, "balance_loss_clip": 1.09387207, "balance_loss_mlp": 0.37247229, "epoch": 0.5700886817976852, "flos": 19864993432320.0, "grad_norm": 3.5284540583652695, "language_loss": 0.87036908, "learning_rate": 1.6449502276026041e-06, "loss": 0.88792551, "num_input_tokens_seen": 204245410, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.28979492, "step": 9482, "time_per_iteration": 2.746548652648926 }, { "auxiliary_loss_clip": 0.01329214, "auxiliary_loss_mlp": 0.0036711, "balance_loss_clip": 1.07598102, "balance_loss_mlp": 0.34072858, "epoch": 0.5701488050503533, "flos": 23841704989440.0, "grad_norm": 1.5597158092094803, "language_loss": 0.82578397, "learning_rate": 1.6445669587433043e-06, "loss": 0.84274721, "num_input_tokens_seen": 204264840, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.26379395, "step": 9483, "time_per_iteration": 2.654883861541748 }, { "auxiliary_loss_clip": 0.01330615, "auxiliary_loss_mlp": 0.00408364, "balance_loss_clip": 1.07795238, "balance_loss_mlp": 0.37796605, "epoch": 0.5702089283030212, "flos": 23659673840640.0, "grad_norm": 9.050104637697054, "language_loss": 0.87493742, "learning_rate": 1.6441837033620612e-06, "loss": 0.89232719, "num_input_tokens_seen": 204284335, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.30371094, "step": 9484, "time_per_iteration": 4.050048589706421 }, { "auxiliary_loss_clip": 0.01339198, "auxiliary_loss_mlp": 0.00398812, "balance_loss_clip": 1.08654952, "balance_loss_mlp": 0.36846125, "epoch": 0.5702690515556892, "flos": 27891171544320.0, "grad_norm": 2.990230646663289, "language_loss": 0.68229276, "learning_rate": 1.6438004614734073e-06, "loss": 0.69967282, "num_input_tokens_seen": 204302590, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.30334473, "step": 9485, "time_per_iteration": 2.701058864593506 }, { "auxiliary_loss_clip": 0.01329686, "auxiliary_loss_mlp": 0.00388487, "balance_loss_clip": 1.07511306, "balance_loss_mlp": 0.36040118, "epoch": 0.5703291748083571, "flos": 24023951619840.0, "grad_norm": 39.82727547660677, "language_loss": 0.71266782, "learning_rate": 1.6434172330918757e-06, "loss": 0.72984952, "num_input_tokens_seen": 204323055, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.28076172, "step": 9486, "time_per_iteration": 2.7277863025665283 }, { "auxiliary_loss_clip": 0.01433845, "auxiliary_loss_mlp": 0.00202547, "balance_loss_clip": 1.21974325, "balance_loss_mlp": 0.19081697, "epoch": 0.5703892980610251, "flos": 57023382919680.0, "grad_norm": 0.6583234978675099, "language_loss": 0.47621384, "learning_rate": 1.6430340182319978e-06, "loss": 0.49257779, "num_input_tokens_seen": 204386160, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.1171875, "step": 9487, "time_per_iteration": 3.2048280239105225 }, { "auxiliary_loss_clip": 0.01340464, "auxiliary_loss_mlp": 0.00382779, "balance_loss_clip": 1.08796573, "balance_loss_mlp": 0.35425258, "epoch": 0.570449421313693, "flos": 24351025887360.0, "grad_norm": 224.7900376330075, "language_loss": 0.93667763, "learning_rate": 1.6426508169083067e-06, "loss": 0.95391005, "num_input_tokens_seen": 204406315, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.28540039, "step": 9488, "time_per_iteration": 2.676804542541504 }, { "auxiliary_loss_clip": 0.01351966, "auxiliary_loss_mlp": 0.00383859, "balance_loss_clip": 1.09311223, "balance_loss_mlp": 0.35450926, "epoch": 0.570509544566361, "flos": 24828566227200.0, "grad_norm": 5.744955700333227, "language_loss": 0.85704881, "learning_rate": 1.6422676291353314e-06, "loss": 0.87440705, "num_input_tokens_seen": 204427645, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.29370117, "step": 9489, "time_per_iteration": 2.703979730606079 }, { "auxiliary_loss_clip": 0.01345636, "auxiliary_loss_mlp": 0.00307697, "balance_loss_clip": 1.09204745, "balance_loss_mlp": 0.2820431, "epoch": 0.570569667819029, "flos": 21397301671680.0, "grad_norm": 11.259031357636708, "language_loss": 0.7589041, "learning_rate": 1.641884454927604e-06, "loss": 0.77543741, "num_input_tokens_seen": 204445910, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.25622559, "step": 9490, "time_per_iteration": 2.62622332572937 }, { "auxiliary_loss_clip": 0.01353393, "auxiliary_loss_mlp": 0.00375354, "balance_loss_clip": 1.09863091, "balance_loss_mlp": 0.34648192, "epoch": 0.570629791071697, "flos": 23216751233280.0, "grad_norm": 5.287637481706508, "language_loss": 0.81493282, "learning_rate": 1.6415012942996548e-06, "loss": 0.83222032, "num_input_tokens_seen": 204464680, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.28881836, "step": 9491, "time_per_iteration": 2.675053119659424 }, { "auxiliary_loss_clip": 0.01439145, "auxiliary_loss_mlp": 0.00203093, "balance_loss_clip": 1.22674537, "balance_loss_mlp": 0.19098179, "epoch": 0.570689914324365, "flos": 65284666525440.0, "grad_norm": 0.7736547327408025, "language_loss": 0.57072335, "learning_rate": 1.641118147266011e-06, "loss": 0.58714575, "num_input_tokens_seen": 204525580, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.12109375, "step": 9492, "time_per_iteration": 3.159968376159668 }, { "auxiliary_loss_clip": 0.01344783, "auxiliary_loss_mlp": 0.00348161, "balance_loss_clip": 1.09419155, "balance_loss_mlp": 0.32081428, "epoch": 0.5707500375770329, "flos": 21141904993920.0, "grad_norm": 159.0795834536752, "language_loss": 0.80841631, "learning_rate": 1.6407350138412035e-06, "loss": 0.82534575, "num_input_tokens_seen": 204541320, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.27307129, "step": 9493, "time_per_iteration": 2.6449601650238037 }, { "auxiliary_loss_clip": 0.01359985, "auxiliary_loss_mlp": 0.0032896, "balance_loss_clip": 1.10295331, "balance_loss_mlp": 0.30057675, "epoch": 0.5708101608297009, "flos": 20812747737600.0, "grad_norm": 37.94457608512977, "language_loss": 0.85849661, "learning_rate": 1.6403518940397606e-06, "loss": 0.87538612, "num_input_tokens_seen": 204560275, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.28405762, "step": 9494, "time_per_iteration": 2.696981430053711 }, { "auxiliary_loss_clip": 0.01351188, "auxiliary_loss_mlp": 0.00335562, "balance_loss_clip": 1.09187388, "balance_loss_mlp": 0.30621314, "epoch": 0.5708702840823688, "flos": 25812338895360.0, "grad_norm": 60.419998439698126, "language_loss": 0.88183045, "learning_rate": 1.6399687878762096e-06, "loss": 0.89869791, "num_input_tokens_seen": 204579430, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.29345703, "step": 9495, "time_per_iteration": 2.745171546936035 }, { "auxiliary_loss_clip": 0.01355562, "auxiliary_loss_mlp": 0.00319951, "balance_loss_clip": 1.09664178, "balance_loss_mlp": 0.29100677, "epoch": 0.5709304073350369, "flos": 23651916503040.0, "grad_norm": 14.473053025810746, "language_loss": 0.74844027, "learning_rate": 1.6395856953650784e-06, "loss": 0.76519537, "num_input_tokens_seen": 204597710, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.28918457, "step": 9496, "time_per_iteration": 2.6850390434265137 }, { "auxiliary_loss_clip": 0.01341217, "auxiliary_loss_mlp": 0.00349954, "balance_loss_clip": 1.08458924, "balance_loss_mlp": 0.31979445, "epoch": 0.5709905305877048, "flos": 16107552449280.0, "grad_norm": 12326.858336638337, "language_loss": 0.76583546, "learning_rate": 1.6392026165208938e-06, "loss": 0.78274715, "num_input_tokens_seen": 204616140, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.30175781, "step": 9497, "time_per_iteration": 2.6623964309692383 }, { "auxiliary_loss_clip": 0.01361824, "auxiliary_loss_mlp": 0.00304502, "balance_loss_clip": 1.10146129, "balance_loss_mlp": 0.27560568, "epoch": 0.5710506538403728, "flos": 24750819239040.0, "grad_norm": 30.071309928808784, "language_loss": 0.88584507, "learning_rate": 1.638819551358182e-06, "loss": 0.90250826, "num_input_tokens_seen": 204636470, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.28857422, "step": 9498, "time_per_iteration": 2.7057313919067383 }, { "auxiliary_loss_clip": 0.01324074, "auxiliary_loss_mlp": 0.00342722, "balance_loss_clip": 1.07379317, "balance_loss_mlp": 0.31319362, "epoch": 0.5711107770930407, "flos": 21982250655360.0, "grad_norm": 102.81284430570484, "language_loss": 0.73993468, "learning_rate": 1.638436499891469e-06, "loss": 0.75660264, "num_input_tokens_seen": 204656640, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.29553223, "step": 9499, "time_per_iteration": 2.6671741008758545 }, { "auxiliary_loss_clip": 0.01363616, "auxiliary_loss_mlp": 0.00312856, "balance_loss_clip": 1.09932208, "balance_loss_mlp": 0.28527075, "epoch": 0.5711709003457087, "flos": 19574009354880.0, "grad_norm": 53.54428577052524, "language_loss": 0.78909451, "learning_rate": 1.6380534621352805e-06, "loss": 0.80585927, "num_input_tokens_seen": 204675475, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.27587891, "step": 9500, "time_per_iteration": 2.654447317123413 }, { "auxiliary_loss_clip": 0.01358698, "auxiliary_loss_mlp": 0.00364738, "balance_loss_clip": 1.09890401, "balance_loss_mlp": 0.33467335, "epoch": 0.5712310235983766, "flos": 24242683489920.0, "grad_norm": 8.367776075202885, "language_loss": 0.84180695, "learning_rate": 1.6376704381041407e-06, "loss": 0.85904133, "num_input_tokens_seen": 204695385, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.30078125, "step": 9501, "time_per_iteration": 2.682466983795166 }, { "auxiliary_loss_clip": 0.01358178, "auxiliary_loss_mlp": 0.00309607, "balance_loss_clip": 1.09803677, "balance_loss_mlp": 0.28508598, "epoch": 0.5712911468510447, "flos": 20996143603200.0, "grad_norm": 108.03066598388784, "language_loss": 0.8156538, "learning_rate": 1.6372874278125742e-06, "loss": 0.83233166, "num_input_tokens_seen": 204714730, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.24511719, "step": 9502, "time_per_iteration": 2.6480491161346436 }, { "auxiliary_loss_clip": 0.01335764, "auxiliary_loss_mlp": 0.00307576, "balance_loss_clip": 1.08705187, "balance_loss_mlp": 0.28055131, "epoch": 0.5713512701037126, "flos": 18916987731840.0, "grad_norm": 155.40648795298839, "language_loss": 0.88948667, "learning_rate": 1.636904431275105e-06, "loss": 0.90591997, "num_input_tokens_seen": 204735025, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.27062988, "step": 9503, "time_per_iteration": 2.642984628677368 }, { "auxiliary_loss_clip": 0.01343, "auxiliary_loss_mlp": 0.00293592, "balance_loss_clip": 1.08624268, "balance_loss_mlp": 0.26578006, "epoch": 0.5714113933563806, "flos": 17413443308160.0, "grad_norm": 7.2866724882229645, "language_loss": 0.96269667, "learning_rate": 1.6365214485062553e-06, "loss": 0.97906256, "num_input_tokens_seen": 204751365, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.27832031, "step": 9504, "time_per_iteration": 2.6293106079101562 }, { "auxiliary_loss_clip": 0.01343722, "auxiliary_loss_mlp": 0.00296836, "balance_loss_clip": 1.08924603, "balance_loss_mlp": 0.27035937, "epoch": 0.5714715166090486, "flos": 20193360589440.0, "grad_norm": 22.596502201681204, "language_loss": 0.82424343, "learning_rate": 1.6361384795205496e-06, "loss": 0.84064901, "num_input_tokens_seen": 204768980, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.26464844, "step": 9505, "time_per_iteration": 2.6737210750579834 }, { "auxiliary_loss_clip": 0.01369708, "auxiliary_loss_mlp": 0.00303882, "balance_loss_clip": 1.10944605, "balance_loss_mlp": 0.27770334, "epoch": 0.5715316398617165, "flos": 18551668458240.0, "grad_norm": 98.29166717122054, "language_loss": 0.88212526, "learning_rate": 1.635755524332509e-06, "loss": 0.89886117, "num_input_tokens_seen": 204788110, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.26171875, "step": 9506, "time_per_iteration": 2.6810691356658936 }, { "auxiliary_loss_clip": 0.01357727, "auxiliary_loss_mlp": 0.00307707, "balance_loss_clip": 1.09893024, "balance_loss_mlp": 0.27944213, "epoch": 0.5715917631143845, "flos": 18478195188480.0, "grad_norm": 14.532586995090767, "language_loss": 0.86623013, "learning_rate": 1.6353725829566552e-06, "loss": 0.8828845, "num_input_tokens_seen": 204807240, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.28271484, "step": 9507, "time_per_iteration": 2.6681888103485107 }, { "auxiliary_loss_clip": 0.01345845, "auxiliary_loss_mlp": 0.002943, "balance_loss_clip": 1.08564341, "balance_loss_mlp": 0.26642883, "epoch": 0.5716518863670524, "flos": 24020037037440.0, "grad_norm": 6.659474830415826, "language_loss": 0.76015747, "learning_rate": 1.63498965540751e-06, "loss": 0.77655894, "num_input_tokens_seen": 204826415, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.27893066, "step": 9508, "time_per_iteration": 2.6319730281829834 }, { "auxiliary_loss_clip": 0.01361769, "auxiliary_loss_mlp": 0.00276886, "balance_loss_clip": 1.09930491, "balance_loss_mlp": 0.24722669, "epoch": 0.5717120096197205, "flos": 17819485626240.0, "grad_norm": 11.951729990130966, "language_loss": 0.87263107, "learning_rate": 1.634606741699593e-06, "loss": 0.88901758, "num_input_tokens_seen": 204844305, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.29675293, "step": 9509, "time_per_iteration": 2.6040306091308594 }, { "auxiliary_loss_clip": 0.01352355, "auxiliary_loss_mlp": 0.00283183, "balance_loss_clip": 1.09530318, "balance_loss_mlp": 0.2560035, "epoch": 0.5717721328723884, "flos": 21866043179520.0, "grad_norm": 3.8173765649439244, "language_loss": 0.82629591, "learning_rate": 1.6342238418474255e-06, "loss": 0.84265131, "num_input_tokens_seen": 204861765, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.27185059, "step": 9510, "time_per_iteration": 2.6204874515533447 }, { "auxiliary_loss_clip": 0.01334117, "auxiliary_loss_mlp": 0.00273615, "balance_loss_clip": 1.0825187, "balance_loss_mlp": 0.24581516, "epoch": 0.5718322561250564, "flos": 28437624126720.0, "grad_norm": 3.3909713837631066, "language_loss": 0.75253379, "learning_rate": 1.6338409558655264e-06, "loss": 0.76861107, "num_input_tokens_seen": 204882505, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.2779541, "step": 9511, "time_per_iteration": 2.7327511310577393 }, { "auxiliary_loss_clip": 0.01333541, "auxiliary_loss_mlp": 0.00284244, "balance_loss_clip": 1.0781579, "balance_loss_mlp": 0.25872117, "epoch": 0.5718923793777243, "flos": 13551825905280.0, "grad_norm": 470.8500064530821, "language_loss": 0.70566094, "learning_rate": 1.6334580837684152e-06, "loss": 0.72183877, "num_input_tokens_seen": 204899830, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.25537109, "step": 9512, "time_per_iteration": 2.593287706375122 }, { "auxiliary_loss_clip": 0.01333836, "auxiliary_loss_mlp": 0.00280529, "balance_loss_clip": 1.07535648, "balance_loss_mlp": 0.25468427, "epoch": 0.5719525026303923, "flos": 17822035491840.0, "grad_norm": 11.296060750722116, "language_loss": 0.84042168, "learning_rate": 1.6330752255706104e-06, "loss": 0.85656536, "num_input_tokens_seen": 204918100, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.25878906, "step": 9513, "time_per_iteration": 2.6220977306365967 }, { "auxiliary_loss_clip": 0.01422899, "auxiliary_loss_mlp": 0.0016482, "balance_loss_clip": 1.19331074, "balance_loss_mlp": 0.15699957, "epoch": 0.5720126258830602, "flos": 61298042814720.0, "grad_norm": 0.8911890332407729, "language_loss": 0.66573429, "learning_rate": 1.6326923812866288e-06, "loss": 0.68161148, "num_input_tokens_seen": 204972925, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.078125, "step": 9514, "time_per_iteration": 3.1294803619384766 }, { "auxiliary_loss_clip": 0.01349155, "auxiliary_loss_mlp": 0.00299231, "balance_loss_clip": 1.08592618, "balance_loss_mlp": 0.26877257, "epoch": 0.5720727491357283, "flos": 23988040997760.0, "grad_norm": 2.211867238452027, "language_loss": 0.86997932, "learning_rate": 1.63230955093099e-06, "loss": 0.88646317, "num_input_tokens_seen": 204990910, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.30419922, "step": 9515, "time_per_iteration": 2.684434652328491 }, { "auxiliary_loss_clip": 0.01330651, "auxiliary_loss_mlp": 0.00253191, "balance_loss_clip": 1.07384479, "balance_loss_mlp": 0.22558217, "epoch": 0.5721328723883962, "flos": 23405426398080.0, "grad_norm": 203.59346565579256, "language_loss": 0.92466652, "learning_rate": 1.6319267345182092e-06, "loss": 0.94050497, "num_input_tokens_seen": 205010500, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.27587891, "step": 9516, "time_per_iteration": 4.076708555221558 }, { "auxiliary_loss_clip": 0.01361589, "auxiliary_loss_mlp": 0.00254155, "balance_loss_clip": 1.09504128, "balance_loss_mlp": 0.2270588, "epoch": 0.5721929956410642, "flos": 18804910320000.0, "grad_norm": 5.084058432857291, "language_loss": 0.94129419, "learning_rate": 1.6315439320628038e-06, "loss": 0.95745164, "num_input_tokens_seen": 205028560, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.27087402, "step": 9517, "time_per_iteration": 4.194195985794067 }, { "auxiliary_loss_clip": 0.0132638, "auxiliary_loss_mlp": 0.00278718, "balance_loss_clip": 1.07069898, "balance_loss_mlp": 0.25068003, "epoch": 0.5722531188937322, "flos": 27196659100800.0, "grad_norm": 5.284611898758786, "language_loss": 0.91773283, "learning_rate": 1.6311611435792893e-06, "loss": 0.93378377, "num_input_tokens_seen": 205048650, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.28027344, "step": 9518, "time_per_iteration": 2.7196240425109863 }, { "auxiliary_loss_clip": 0.01325761, "auxiliary_loss_mlp": 0.00282893, "balance_loss_clip": 1.07167315, "balance_loss_mlp": 0.25597551, "epoch": 0.5723132421464001, "flos": 15195672852480.0, "grad_norm": 9.13968363804584, "language_loss": 0.85878414, "learning_rate": 1.6307783690821812e-06, "loss": 0.87487066, "num_input_tokens_seen": 205066480, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.26940918, "step": 9519, "time_per_iteration": 2.652838706970215 }, { "auxiliary_loss_clip": 0.01334095, "auxiliary_loss_mlp": 0.00237098, "balance_loss_clip": 1.07602048, "balance_loss_mlp": 0.21087204, "epoch": 0.5723733653990681, "flos": 27599433281280.0, "grad_norm": 11.343094214285646, "language_loss": 0.87226212, "learning_rate": 1.6303956085859944e-06, "loss": 0.88797402, "num_input_tokens_seen": 205087475, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.26245117, "step": 9520, "time_per_iteration": 4.167596340179443 }, { "auxiliary_loss_clip": 0.01361056, "auxiliary_loss_mlp": 0.00236332, "balance_loss_clip": 1.09099627, "balance_loss_mlp": 0.20873488, "epoch": 0.572433488651736, "flos": 18222870337920.0, "grad_norm": 8.052457681472598, "language_loss": 0.82560736, "learning_rate": 1.630012862105243e-06, "loss": 0.84158123, "num_input_tokens_seen": 205106495, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.27600098, "step": 9521, "time_per_iteration": 2.7077300548553467 }, { "auxiliary_loss_clip": 0.01346006, "auxiliary_loss_mlp": 0.00252856, "balance_loss_clip": 1.08632255, "balance_loss_mlp": 0.22529477, "epoch": 0.5724936119044041, "flos": 31249106484480.0, "grad_norm": 4.095077696755133, "language_loss": 0.86081707, "learning_rate": 1.6296301296544415e-06, "loss": 0.87680572, "num_input_tokens_seen": 205128285, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.27563477, "step": 9522, "time_per_iteration": 2.8005332946777344 }, { "auxiliary_loss_clip": 0.01347392, "auxiliary_loss_mlp": 0.00215655, "balance_loss_clip": 1.08900321, "balance_loss_mlp": 0.19054967, "epoch": 0.572553735157072, "flos": 19202189719680.0, "grad_norm": 15.915097068686556, "language_loss": 0.76532429, "learning_rate": 1.629247411248102e-06, "loss": 0.78095484, "num_input_tokens_seen": 205146595, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.25109863, "step": 9523, "time_per_iteration": 2.6410491466522217 }, { "auxiliary_loss_clip": 0.01322936, "auxiliary_loss_mlp": 0.00225835, "balance_loss_clip": 1.06866109, "balance_loss_mlp": 0.20103897, "epoch": 0.57261385840974, "flos": 21214911386880.0, "grad_norm": 15.255690801638206, "language_loss": 0.77889961, "learning_rate": 1.628864706900738e-06, "loss": 0.79438734, "num_input_tokens_seen": 205164295, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.24804688, "step": 9524, "time_per_iteration": 2.6587541103363037 }, { "auxiliary_loss_clip": 0.01345013, "auxiliary_loss_mlp": 0.00254935, "balance_loss_clip": 1.08650279, "balance_loss_mlp": 0.22744536, "epoch": 0.5726739816624079, "flos": 33984529793280.0, "grad_norm": 4.518755316924043, "language_loss": 0.72946775, "learning_rate": 1.6284820166268615e-06, "loss": 0.74546725, "num_input_tokens_seen": 205185380, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.27490234, "step": 9525, "time_per_iteration": 2.7383334636688232 }, { "auxiliary_loss_clip": 0.01360557, "auxiliary_loss_mlp": 0.00233057, "balance_loss_clip": 1.09593034, "balance_loss_mlp": 0.2060678, "epoch": 0.5727341049150759, "flos": 24275972419200.0, "grad_norm": 3.4369239593359953, "language_loss": 0.81748772, "learning_rate": 1.628099340440984e-06, "loss": 0.83342385, "num_input_tokens_seen": 205204895, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.27001953, "step": 9526, "time_per_iteration": 4.064497470855713 }, { "auxiliary_loss_clip": 0.01357016, "auxiliary_loss_mlp": 0.00219269, "balance_loss_clip": 1.09712124, "balance_loss_mlp": 0.19173166, "epoch": 0.5727942281677438, "flos": 28400564269440.0, "grad_norm": 247.6914980324869, "language_loss": 0.89040506, "learning_rate": 1.6277166783576176e-06, "loss": 0.90616786, "num_input_tokens_seen": 205223440, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.2755127, "step": 9527, "time_per_iteration": 2.76456618309021 }, { "auxiliary_loss_clip": 0.01364645, "auxiliary_loss_mlp": 0.00241472, "balance_loss_clip": 1.1015451, "balance_loss_mlp": 0.21212234, "epoch": 0.5728543514204119, "flos": 19536769929600.0, "grad_norm": 11.699298049230817, "language_loss": 0.81152493, "learning_rate": 1.6273340303912713e-06, "loss": 0.82758611, "num_input_tokens_seen": 205242800, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.29345703, "step": 9528, "time_per_iteration": 2.6819679737091064 }, { "auxiliary_loss_clip": 0.0137655, "auxiliary_loss_mlp": 0.00249637, "balance_loss_clip": 1.11085021, "balance_loss_mlp": 0.22148009, "epoch": 0.5729144746730798, "flos": 21506757390720.0, "grad_norm": 12.709092455020905, "language_loss": 0.94039983, "learning_rate": 1.6269513965564557e-06, "loss": 0.9566617, "num_input_tokens_seen": 205259465, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.28137207, "step": 9529, "time_per_iteration": 2.653095006942749 }, { "auxiliary_loss_clip": 0.01459209, "auxiliary_loss_mlp": 0.00223592, "balance_loss_clip": 1.22640824, "balance_loss_mlp": 0.21424632, "epoch": 0.5729745979257478, "flos": 58681628242560.0, "grad_norm": 0.7562193965533952, "language_loss": 0.55679214, "learning_rate": 1.6265687768676813e-06, "loss": 0.5736202, "num_input_tokens_seen": 205314100, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.09326172, "step": 9530, "time_per_iteration": 2.988734483718872 }, { "auxiliary_loss_clip": 0.0138773, "auxiliary_loss_mlp": 0.00264759, "balance_loss_clip": 1.12250495, "balance_loss_mlp": 0.23539755, "epoch": 0.5730347211784158, "flos": 18552099421440.0, "grad_norm": 11.94689945847789, "language_loss": 0.75279456, "learning_rate": 1.6261861713394553e-06, "loss": 0.76931942, "num_input_tokens_seen": 205333420, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.29394531, "step": 9531, "time_per_iteration": 2.671395778656006 }, { "auxiliary_loss_clip": 0.01384701, "auxiliary_loss_mlp": 0.00279363, "balance_loss_clip": 1.116436, "balance_loss_mlp": 0.24916689, "epoch": 0.5730948444310837, "flos": 38031482396160.0, "grad_norm": 1258.5065325121684, "language_loss": 0.82803243, "learning_rate": 1.6258035799862876e-06, "loss": 0.84467304, "num_input_tokens_seen": 205350995, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.30187988, "step": 9532, "time_per_iteration": 2.750115394592285 }, { "auxiliary_loss_clip": 0.01363342, "auxiliary_loss_mlp": 0.00227822, "balance_loss_clip": 1.09825599, "balance_loss_mlp": 0.19929492, "epoch": 0.5731549676837517, "flos": 25227066689280.0, "grad_norm": 20.43126552580118, "language_loss": 0.84921956, "learning_rate": 1.625421002822686e-06, "loss": 0.86513114, "num_input_tokens_seen": 205372675, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.28503418, "step": 9533, "time_per_iteration": 2.712489128112793 }, { "auxiliary_loss_clip": 0.01358513, "auxiliary_loss_mlp": 0.00211721, "balance_loss_clip": 1.09895682, "balance_loss_mlp": 0.18163224, "epoch": 0.5732150909364196, "flos": 23368222886400.0, "grad_norm": 2.5541247125560202, "language_loss": 0.92493248, "learning_rate": 1.6250384398631574e-06, "loss": 0.94063479, "num_input_tokens_seen": 205392590, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.30102539, "step": 9534, "time_per_iteration": 2.670846462249756 }, { "auxiliary_loss_clip": 0.0138619, "auxiliary_loss_mlp": 0.00230703, "balance_loss_clip": 1.12091351, "balance_loss_mlp": 0.20252213, "epoch": 0.5732752141890877, "flos": 23079357711360.0, "grad_norm": 12.611559435704374, "language_loss": 0.82256722, "learning_rate": 1.6246558911222085e-06, "loss": 0.83873618, "num_input_tokens_seen": 205414885, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.28198242, "step": 9535, "time_per_iteration": 2.72548770904541 }, { "auxiliary_loss_clip": 0.01381818, "auxiliary_loss_mlp": 0.00236025, "balance_loss_clip": 1.11702013, "balance_loss_mlp": 0.2060795, "epoch": 0.5733353374417556, "flos": 24352282863360.0, "grad_norm": 55.32465278951884, "language_loss": 0.77335292, "learning_rate": 1.624273356614346e-06, "loss": 0.78953135, "num_input_tokens_seen": 205434440, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.29956055, "step": 9536, "time_per_iteration": 2.7011988162994385 }, { "auxiliary_loss_clip": 0.0137459, "auxiliary_loss_mlp": 0.00233286, "balance_loss_clip": 1.11618888, "balance_loss_mlp": 0.20515308, "epoch": 0.5733954606944236, "flos": 27198849830400.0, "grad_norm": 14.565433716013404, "language_loss": 0.77680612, "learning_rate": 1.6238908363540755e-06, "loss": 0.79288483, "num_input_tokens_seen": 205454225, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.28125, "step": 9537, "time_per_iteration": 2.6793501377105713 }, { "auxiliary_loss_clip": 0.01369539, "auxiliary_loss_mlp": 0.00230834, "balance_loss_clip": 1.10595846, "balance_loss_mlp": 0.2006862, "epoch": 0.5734555839470915, "flos": 28765129357440.0, "grad_norm": 1148.3882537543811, "language_loss": 0.71784836, "learning_rate": 1.623508330355902e-06, "loss": 0.73385209, "num_input_tokens_seen": 205474750, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.30187988, "step": 9538, "time_per_iteration": 2.732377529144287 }, { "auxiliary_loss_clip": 0.01384449, "auxiliary_loss_mlp": 0.00255321, "balance_loss_clip": 1.12226725, "balance_loss_mlp": 0.22594817, "epoch": 0.5735157071997595, "flos": 22966813422720.0, "grad_norm": 118.42874088718968, "language_loss": 0.89693344, "learning_rate": 1.6231258386343306e-06, "loss": 0.91333115, "num_input_tokens_seen": 205495495, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.29345703, "step": 9539, "time_per_iteration": 2.7041244506835938 }, { "auxiliary_loss_clip": 0.01383451, "auxiliary_loss_mlp": 0.00240151, "balance_loss_clip": 1.11889708, "balance_loss_mlp": 0.21096882, "epoch": 0.5735758304524274, "flos": 18989455420800.0, "grad_norm": 15.200319554989989, "language_loss": 0.8299011, "learning_rate": 1.6227433612038647e-06, "loss": 0.84613705, "num_input_tokens_seen": 205510070, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.29162598, "step": 9540, "time_per_iteration": 2.650118350982666 }, { "auxiliary_loss_clip": 0.01370703, "auxiliary_loss_mlp": 0.00250578, "balance_loss_clip": 1.11461413, "balance_loss_mlp": 0.22059718, "epoch": 0.5736359537050955, "flos": 28397942576640.0, "grad_norm": 10.681080444664913, "language_loss": 0.84450078, "learning_rate": 1.6223608980790089e-06, "loss": 0.8607136, "num_input_tokens_seen": 205530190, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.29992676, "step": 9541, "time_per_iteration": 2.7138330936431885 }, { "auxiliary_loss_clip": 0.01394878, "auxiliary_loss_mlp": 0.00238416, "balance_loss_clip": 1.13065362, "balance_loss_mlp": 0.2085655, "epoch": 0.5736960769577634, "flos": 15627210848640.0, "grad_norm": 31.663419585975785, "language_loss": 0.74856567, "learning_rate": 1.6219784492742654e-06, "loss": 0.76489854, "num_input_tokens_seen": 205547380, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.29833984, "step": 9542, "time_per_iteration": 2.68572735786438 }, { "auxiliary_loss_clip": 0.01385606, "auxiliary_loss_mlp": 0.00248353, "balance_loss_clip": 1.12664723, "balance_loss_mlp": 0.21899202, "epoch": 0.5737562002104314, "flos": 18003994813440.0, "grad_norm": 108.7950036544467, "language_loss": 0.92444474, "learning_rate": 1.6215960148041365e-06, "loss": 0.94078434, "num_input_tokens_seen": 205566540, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.2935791, "step": 9543, "time_per_iteration": 2.59015154838562 }, { "auxiliary_loss_clip": 0.01393686, "auxiliary_loss_mlp": 0.00240486, "balance_loss_clip": 1.12789643, "balance_loss_mlp": 0.20910968, "epoch": 0.5738163234630994, "flos": 20698192287360.0, "grad_norm": 67.42952589647957, "language_loss": 0.80983889, "learning_rate": 1.6212135946831257e-06, "loss": 0.82618058, "num_input_tokens_seen": 205584200, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.3137207, "step": 9544, "time_per_iteration": 2.6574549674987793 }, { "auxiliary_loss_clip": 0.01388467, "auxiliary_loss_mlp": 0.00233094, "balance_loss_clip": 1.1252346, "balance_loss_mlp": 0.2046392, "epoch": 0.5738764467157673, "flos": 23149311448320.0, "grad_norm": 1.6998482286827814, "language_loss": 0.8388555, "learning_rate": 1.620831188925733e-06, "loss": 0.85507113, "num_input_tokens_seen": 205604675, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.2845459, "step": 9545, "time_per_iteration": 2.6500871181488037 }, { "auxiliary_loss_clip": 0.01406756, "auxiliary_loss_mlp": 0.00202041, "balance_loss_clip": 1.13750863, "balance_loss_mlp": 0.17190519, "epoch": 0.5739365699684353, "flos": 29492930730240.0, "grad_norm": 135.88904052877007, "language_loss": 0.64654195, "learning_rate": 1.620448797546459e-06, "loss": 0.6626299, "num_input_tokens_seen": 205624680, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.30126953, "step": 9546, "time_per_iteration": 2.711576461791992 }, { "auxiliary_loss_clip": 0.01389377, "auxiliary_loss_mlp": 0.00221025, "balance_loss_clip": 1.12781167, "balance_loss_mlp": 0.19162774, "epoch": 0.5739966932211032, "flos": 14027247342720.0, "grad_norm": 62.84148774257563, "language_loss": 0.87907207, "learning_rate": 1.6200664205598055e-06, "loss": 0.89517611, "num_input_tokens_seen": 205641950, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.29394531, "step": 9547, "time_per_iteration": 2.620549440383911 }, { "auxiliary_loss_clip": 0.01396562, "auxiliary_loss_mlp": 0.00237875, "balance_loss_clip": 1.13270974, "balance_loss_mlp": 0.20635562, "epoch": 0.5740568164737713, "flos": 19062030850560.0, "grad_norm": 174.7704824548318, "language_loss": 0.84241855, "learning_rate": 1.6196840579802704e-06, "loss": 0.85876292, "num_input_tokens_seen": 205660130, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.31542969, "step": 9548, "time_per_iteration": 2.6797969341278076 }, { "auxiliary_loss_clip": 0.01405027, "auxiliary_loss_mlp": 0.0023569, "balance_loss_clip": 1.14042485, "balance_loss_mlp": 0.20460069, "epoch": 0.5741169397264392, "flos": 22127832478080.0, "grad_norm": 20.424306857333015, "language_loss": 0.78682894, "learning_rate": 1.619301709822355e-06, "loss": 0.80323613, "num_input_tokens_seen": 205678895, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.31079102, "step": 9549, "time_per_iteration": 2.6539008617401123 }, { "auxiliary_loss_clip": 0.01410693, "auxiliary_loss_mlp": 0.00225376, "balance_loss_clip": 1.14748526, "balance_loss_mlp": 0.19333249, "epoch": 0.5741770629791072, "flos": 24936836797440.0, "grad_norm": 40.080706764437174, "language_loss": 0.83825648, "learning_rate": 1.6189193761005564e-06, "loss": 0.85461712, "num_input_tokens_seen": 205698450, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.3203125, "step": 9550, "time_per_iteration": 2.7480530738830566 }, { "auxiliary_loss_clip": 0.01422961, "auxiliary_loss_mlp": 0.00240256, "balance_loss_clip": 1.15678501, "balance_loss_mlp": 0.20745005, "epoch": 0.5742371862317751, "flos": 18801462614400.0, "grad_norm": 48.580923978839984, "language_loss": 0.76033056, "learning_rate": 1.6185370568293727e-06, "loss": 0.77696276, "num_input_tokens_seen": 205714870, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.32763672, "step": 9551, "time_per_iteration": 2.622014045715332 }, { "auxiliary_loss_clip": 0.01414234, "auxiliary_loss_mlp": 0.00231727, "balance_loss_clip": 1.14498878, "balance_loss_mlp": 0.20073237, "epoch": 0.5742973094844431, "flos": 24460661174400.0, "grad_norm": 10.601000108139148, "language_loss": 0.8158533, "learning_rate": 1.6181547520233031e-06, "loss": 0.83231294, "num_input_tokens_seen": 205736045, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.30981445, "step": 9552, "time_per_iteration": 2.6785943508148193 }, { "auxiliary_loss_clip": 0.01423927, "auxiliary_loss_mlp": 0.00233932, "balance_loss_clip": 1.15410352, "balance_loss_mlp": 0.20114957, "epoch": 0.574357432737111, "flos": 21652770176640.0, "grad_norm": 31.764520214113844, "language_loss": 0.88463163, "learning_rate": 1.617772461696843e-06, "loss": 0.90121025, "num_input_tokens_seen": 205754445, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.32763672, "step": 9553, "time_per_iteration": 2.741403818130493 }, { "auxiliary_loss_clip": 0.01409202, "auxiliary_loss_mlp": 0.00243554, "balance_loss_clip": 1.1429528, "balance_loss_mlp": 0.21084332, "epoch": 0.5744175559897791, "flos": 16544728880640.0, "grad_norm": 33.862036657685024, "language_loss": 0.90517467, "learning_rate": 1.6173901858644895e-06, "loss": 0.92170227, "num_input_tokens_seen": 205770595, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.3269043, "step": 9554, "time_per_iteration": 2.595684289932251 }, { "auxiliary_loss_clip": 0.01424484, "auxiliary_loss_mlp": 0.00248473, "balance_loss_clip": 1.1518712, "balance_loss_mlp": 0.21623918, "epoch": 0.574477679242447, "flos": 24207598880640.0, "grad_norm": 49.925996427388775, "language_loss": 0.79985118, "learning_rate": 1.6170079245407385e-06, "loss": 0.81658077, "num_input_tokens_seen": 205791935, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.32202148, "step": 9555, "time_per_iteration": 2.7023661136627197 }, { "auxiliary_loss_clip": 0.01453071, "auxiliary_loss_mlp": 0.00234593, "balance_loss_clip": 1.17507172, "balance_loss_mlp": 0.19897294, "epoch": 0.574537802495115, "flos": 14903000835840.0, "grad_norm": 6.988620674912163, "language_loss": 0.8248316, "learning_rate": 1.6166256777400853e-06, "loss": 0.84170824, "num_input_tokens_seen": 205807260, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.35620117, "step": 9556, "time_per_iteration": 2.596848487854004 }, { "auxiliary_loss_clip": 0.01406997, "auxiliary_loss_mlp": 0.00220461, "balance_loss_clip": 1.13936031, "balance_loss_mlp": 0.18786904, "epoch": 0.5745979257477829, "flos": 24934969290240.0, "grad_norm": 15.752919735442012, "language_loss": 0.80447233, "learning_rate": 1.6162434454770248e-06, "loss": 0.8207469, "num_input_tokens_seen": 205826885, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.32592773, "step": 9557, "time_per_iteration": 2.7069904804229736 }, { "auxiliary_loss_clip": 0.01433943, "auxiliary_loss_mlp": 0.00212691, "balance_loss_clip": 1.16319907, "balance_loss_mlp": 0.17943129, "epoch": 0.5746580490004509, "flos": 17235757704960.0, "grad_norm": 3.421437187785662, "language_loss": 0.77745157, "learning_rate": 1.6158612277660514e-06, "loss": 0.79391789, "num_input_tokens_seen": 205844630, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.33251953, "step": 9558, "time_per_iteration": 2.6226611137390137 }, { "auxiliary_loss_clip": 0.01435607, "auxiliary_loss_mlp": 0.00254228, "balance_loss_clip": 1.1595459, "balance_loss_mlp": 0.22099212, "epoch": 0.5747181722531189, "flos": 13187871348480.0, "grad_norm": 23.07947277328422, "language_loss": 0.80732465, "learning_rate": 1.615479024621659e-06, "loss": 0.82422304, "num_input_tokens_seen": 205860960, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.33215332, "step": 9559, "time_per_iteration": 4.012395143508911 }, { "auxiliary_loss_clip": 0.01414703, "auxiliary_loss_mlp": 0.00220761, "balance_loss_clip": 1.15442944, "balance_loss_mlp": 0.19234137, "epoch": 0.5747782955057869, "flos": 22963006581120.0, "grad_norm": 6.630213964763968, "language_loss": 0.85280389, "learning_rate": 1.6150968360583398e-06, "loss": 0.86915851, "num_input_tokens_seen": 205880675, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.28466797, "step": 9560, "time_per_iteration": 4.370074033737183 }, { "auxiliary_loss_clip": 0.01392567, "auxiliary_loss_mlp": 0.00238581, "balance_loss_clip": 1.13303816, "balance_loss_mlp": 0.20582268, "epoch": 0.5748384187584549, "flos": 23403235668480.0, "grad_norm": 114.80671884901268, "language_loss": 0.71099973, "learning_rate": 1.614714662090588e-06, "loss": 0.72731113, "num_input_tokens_seen": 205900050, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.32763672, "step": 9561, "time_per_iteration": 2.7172443866729736 }, { "auxiliary_loss_clip": 0.01455117, "auxiliary_loss_mlp": 0.00244467, "balance_loss_clip": 1.17166066, "balance_loss_mlp": 0.20848936, "epoch": 0.5748985420111228, "flos": 17785514338560.0, "grad_norm": 74.94103740456544, "language_loss": 0.79831445, "learning_rate": 1.6143325027328945e-06, "loss": 0.81531036, "num_input_tokens_seen": 205918855, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.36010742, "step": 9562, "time_per_iteration": 4.111522912979126 }, { "auxiliary_loss_clip": 0.01412776, "auxiliary_loss_mlp": 0.00224844, "balance_loss_clip": 1.14873314, "balance_loss_mlp": 0.19523221, "epoch": 0.5749586652637908, "flos": 19866250408320.0, "grad_norm": 13.831812112049926, "language_loss": 0.9030658, "learning_rate": 1.613950357999751e-06, "loss": 0.919442, "num_input_tokens_seen": 205936970, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.29626465, "step": 9563, "time_per_iteration": 2.688826322555542 }, { "auxiliary_loss_clip": 0.01436337, "auxiliary_loss_mlp": 0.00240327, "balance_loss_clip": 1.16065145, "balance_loss_mlp": 0.20454001, "epoch": 0.5750187885164587, "flos": 21287235421440.0, "grad_norm": 19.342251816856134, "language_loss": 0.69610262, "learning_rate": 1.6135682279056488e-06, "loss": 0.71286929, "num_input_tokens_seen": 205954630, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.35766602, "step": 9564, "time_per_iteration": 2.6731879711151123 }, { "auxiliary_loss_clip": 0.0140626, "auxiliary_loss_mlp": 0.00230371, "balance_loss_clip": 1.14949346, "balance_loss_mlp": 0.19730261, "epoch": 0.5750789117691267, "flos": 18804658924800.0, "grad_norm": 19.977203224100155, "language_loss": 0.82786632, "learning_rate": 1.613186112465078e-06, "loss": 0.84423256, "num_input_tokens_seen": 205971510, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.33056641, "step": 9565, "time_per_iteration": 2.6063783168792725 }, { "auxiliary_loss_clip": 0.01540477, "auxiliary_loss_mlp": 0.00157221, "balance_loss_clip": 1.30951536, "balance_loss_mlp": 0.14491859, "epoch": 0.5751390350217946, "flos": 70663224124800.0, "grad_norm": 0.7273468117199458, "language_loss": 0.60254431, "learning_rate": 1.6128040116925287e-06, "loss": 0.61952126, "num_input_tokens_seen": 206035125, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.12255859, "step": 9566, "time_per_iteration": 3.233987808227539 }, { "auxiliary_loss_clip": 0.01424111, "auxiliary_loss_mlp": 0.00214949, "balance_loss_clip": 1.16094327, "balance_loss_mlp": 0.18264353, "epoch": 0.5751991582744627, "flos": 14246338348800.0, "grad_norm": 3.1427305739596894, "language_loss": 0.84891582, "learning_rate": 1.6124219256024901e-06, "loss": 0.86530638, "num_input_tokens_seen": 206052075, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.32275391, "step": 9567, "time_per_iteration": 2.620811939239502 }, { "auxiliary_loss_clip": 0.0140936, "auxiliary_loss_mlp": 0.00236691, "balance_loss_clip": 1.14966476, "balance_loss_mlp": 0.20579219, "epoch": 0.5752592815271306, "flos": 18328160079360.0, "grad_norm": 6.259362258969094, "language_loss": 0.8054316, "learning_rate": 1.6120398542094504e-06, "loss": 0.82189214, "num_input_tokens_seen": 206069970, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.30932617, "step": 9568, "time_per_iteration": 2.695604085922241 }, { "auxiliary_loss_clip": 0.01432998, "auxiliary_loss_mlp": 0.00237016, "balance_loss_clip": 1.16591275, "balance_loss_mlp": 0.2048291, "epoch": 0.5753194047797986, "flos": 20922742160640.0, "grad_norm": 38.929850555862814, "language_loss": 0.80474031, "learning_rate": 1.6116577975278994e-06, "loss": 0.82144046, "num_input_tokens_seen": 206088950, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.32189941, "step": 9569, "time_per_iteration": 4.054736137390137 }, { "auxiliary_loss_clip": 0.01428902, "auxiliary_loss_mlp": 0.00244211, "balance_loss_clip": 1.16068351, "balance_loss_mlp": 0.21252568, "epoch": 0.5753795280324665, "flos": 19281804215040.0, "grad_norm": 186.41210737320947, "language_loss": 0.68330693, "learning_rate": 1.6112757555723223e-06, "loss": 0.70003808, "num_input_tokens_seen": 206107780, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.31689453, "step": 9570, "time_per_iteration": 2.649371385574341 }, { "auxiliary_loss_clip": 0.01401218, "auxiliary_loss_mlp": 0.00241386, "balance_loss_clip": 1.14614666, "balance_loss_mlp": 0.21074894, "epoch": 0.5754396512851345, "flos": 21652877917440.0, "grad_norm": 3.1798323129431174, "language_loss": 0.71055371, "learning_rate": 1.6108937283572082e-06, "loss": 0.72697973, "num_input_tokens_seen": 206127445, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.30639648, "step": 9571, "time_per_iteration": 2.6918458938598633 }, { "auxiliary_loss_clip": 0.01399613, "auxiliary_loss_mlp": 0.00251451, "balance_loss_clip": 1.14155197, "balance_loss_mlp": 0.22040847, "epoch": 0.5754997745378025, "flos": 51021700179840.0, "grad_norm": 2.712111575918764, "language_loss": 0.75114214, "learning_rate": 1.6105117158970434e-06, "loss": 0.76765275, "num_input_tokens_seen": 206152005, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.31054688, "step": 9572, "time_per_iteration": 2.9071969985961914 }, { "auxiliary_loss_clip": 0.01429743, "auxiliary_loss_mlp": 0.0024894, "balance_loss_clip": 1.16693497, "balance_loss_mlp": 0.21837442, "epoch": 0.5755598977904705, "flos": 22856890826880.0, "grad_norm": 55.37631738543515, "language_loss": 0.80231082, "learning_rate": 1.6101297182063123e-06, "loss": 0.8190977, "num_input_tokens_seen": 206169875, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.30541992, "step": 9573, "time_per_iteration": 2.6757776737213135 }, { "auxiliary_loss_clip": 0.01436485, "auxiliary_loss_mlp": 0.00246154, "balance_loss_clip": 1.17804015, "balance_loss_mlp": 0.21625586, "epoch": 0.5756200210431385, "flos": 38472824805120.0, "grad_norm": 45.2369197864536, "language_loss": 0.82233268, "learning_rate": 1.6097477352995022e-06, "loss": 0.83915901, "num_input_tokens_seen": 206192635, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.29858398, "step": 9574, "time_per_iteration": 2.8460984230041504 }, { "auxiliary_loss_clip": 0.01443701, "auxiliary_loss_mlp": 0.00232039, "balance_loss_clip": 1.16847467, "balance_loss_mlp": 0.19944681, "epoch": 0.5756801442958064, "flos": 23910006700800.0, "grad_norm": 107.8943582664179, "language_loss": 0.76139343, "learning_rate": 1.6093657671910968e-06, "loss": 0.77815086, "num_input_tokens_seen": 206211485, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.32617188, "step": 9575, "time_per_iteration": 2.7229669094085693 }, { "auxiliary_loss_clip": 0.01404148, "auxiliary_loss_mlp": 0.00246228, "balance_loss_clip": 1.14998937, "balance_loss_mlp": 0.21361271, "epoch": 0.5757402675484744, "flos": 21105276099840.0, "grad_norm": 1.8620458395217068, "language_loss": 0.86637914, "learning_rate": 1.6089838138955804e-06, "loss": 0.88288289, "num_input_tokens_seen": 206231740, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.32592773, "step": 9576, "time_per_iteration": 2.6923670768737793 }, { "auxiliary_loss_clip": 0.01451978, "auxiliary_loss_mlp": 0.00229584, "balance_loss_clip": 1.18067646, "balance_loss_mlp": 0.19607422, "epoch": 0.5758003908011423, "flos": 20559110826240.0, "grad_norm": 3.365319229088547, "language_loss": 0.78269434, "learning_rate": 1.6086018754274372e-06, "loss": 0.79951, "num_input_tokens_seen": 206250975, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.33532715, "step": 9577, "time_per_iteration": 2.7431187629699707 }, { "auxiliary_loss_clip": 0.01422063, "auxiliary_loss_mlp": 0.00255583, "balance_loss_clip": 1.15660405, "balance_loss_mlp": 0.22196597, "epoch": 0.5758605140538103, "flos": 16473015377280.0, "grad_norm": 90.43723559072868, "language_loss": 0.76126683, "learning_rate": 1.6082199518011504e-06, "loss": 0.77804333, "num_input_tokens_seen": 206268800, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.3359375, "step": 9578, "time_per_iteration": 2.680736541748047 }, { "auxiliary_loss_clip": 0.01408497, "auxiliary_loss_mlp": 0.00227349, "balance_loss_clip": 1.15366459, "balance_loss_mlp": 0.19604465, "epoch": 0.5759206373064782, "flos": 21287558643840.0, "grad_norm": 29.890796588854794, "language_loss": 0.79565424, "learning_rate": 1.6078380430312016e-06, "loss": 0.81201267, "num_input_tokens_seen": 206287190, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.31274414, "step": 9579, "time_per_iteration": 2.6789474487304688 }, { "auxiliary_loss_clip": 0.01451555, "auxiliary_loss_mlp": 0.00235604, "balance_loss_clip": 1.17488396, "balance_loss_mlp": 0.19943577, "epoch": 0.5759807605591463, "flos": 26067879227520.0, "grad_norm": 41.67898755034949, "language_loss": 0.77047598, "learning_rate": 1.6074561491320742e-06, "loss": 0.78734756, "num_input_tokens_seen": 206307020, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.36181641, "step": 9580, "time_per_iteration": 2.6817433834075928 }, { "auxiliary_loss_clip": 0.01432464, "auxiliary_loss_mlp": 0.00234293, "balance_loss_clip": 1.1650629, "balance_loss_mlp": 0.20284571, "epoch": 0.5760408838118142, "flos": 18873068376960.0, "grad_norm": 33.743579721503735, "language_loss": 0.9321388, "learning_rate": 1.6070742701182486e-06, "loss": 0.94880641, "num_input_tokens_seen": 206324095, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.31469727, "step": 9581, "time_per_iteration": 2.6775074005126953 }, { "auxiliary_loss_clip": 0.01459201, "auxiliary_loss_mlp": 0.0023246, "balance_loss_clip": 1.19334459, "balance_loss_mlp": 0.19967702, "epoch": 0.5761010070644822, "flos": 15378134964480.0, "grad_norm": 40.29359656836719, "language_loss": 0.77205396, "learning_rate": 1.6066924060042057e-06, "loss": 0.78897059, "num_input_tokens_seen": 206343210, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.32788086, "step": 9582, "time_per_iteration": 2.6716599464416504 }, { "auxiliary_loss_clip": 0.01553694, "auxiliary_loss_mlp": 0.00174375, "balance_loss_clip": 1.32204795, "balance_loss_mlp": 0.16102336, "epoch": 0.5761611303171501, "flos": 71471932882560.0, "grad_norm": 0.6392778892633493, "language_loss": 0.56461108, "learning_rate": 1.6063105568044271e-06, "loss": 0.58189178, "num_input_tokens_seen": 206415935, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.13378906, "step": 9583, "time_per_iteration": 3.3317198753356934 }, { "auxiliary_loss_clip": 0.01434906, "auxiliary_loss_mlp": 0.00242452, "balance_loss_clip": 1.17229009, "balance_loss_mlp": 0.20587857, "epoch": 0.5762212535698181, "flos": 16246167033600.0, "grad_norm": 24.810533088909843, "language_loss": 0.87876713, "learning_rate": 1.6059287225333912e-06, "loss": 0.89554071, "num_input_tokens_seen": 206431900, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.36572266, "step": 9584, "time_per_iteration": 2.6679494380950928 }, { "auxiliary_loss_clip": 0.01550656, "auxiliary_loss_mlp": 0.00178182, "balance_loss_clip": 1.32072997, "balance_loss_mlp": 0.16368611, "epoch": 0.5762813768224861, "flos": 70185504216960.0, "grad_norm": 0.6169154316447756, "language_loss": 0.49292958, "learning_rate": 1.6055469032055773e-06, "loss": 0.51021796, "num_input_tokens_seen": 206501200, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.14453125, "step": 9585, "time_per_iteration": 3.1880929470062256 }, { "auxiliary_loss_clip": 0.0143127, "auxiliary_loss_mlp": 0.00229115, "balance_loss_clip": 1.17240834, "balance_loss_mlp": 0.19568847, "epoch": 0.5763415000751541, "flos": 20518028645760.0, "grad_norm": 15.08002149084422, "language_loss": 0.89063418, "learning_rate": 1.605165098835465e-06, "loss": 0.90723801, "num_input_tokens_seen": 206520575, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.33422852, "step": 9586, "time_per_iteration": 2.6386797428131104 }, { "auxiliary_loss_clip": 0.01437279, "auxiliary_loss_mlp": 0.00243688, "balance_loss_clip": 1.1707418, "balance_loss_mlp": 0.20918855, "epoch": 0.5764016233278221, "flos": 15815526877440.0, "grad_norm": 2067.3716947730345, "language_loss": 0.88098073, "learning_rate": 1.6047833094375308e-06, "loss": 0.89779037, "num_input_tokens_seen": 206538060, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.3449707, "step": 9587, "time_per_iteration": 2.629610538482666 }, { "auxiliary_loss_clip": 0.01456662, "auxiliary_loss_mlp": 0.00221424, "balance_loss_clip": 1.18955028, "balance_loss_mlp": 0.18687704, "epoch": 0.57646174658049, "flos": 20772312001920.0, "grad_norm": 6.153924131286978, "language_loss": 0.73299277, "learning_rate": 1.6044015350262542e-06, "loss": 0.74977362, "num_input_tokens_seen": 206557320, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.34545898, "step": 9588, "time_per_iteration": 2.6686694622039795 }, { "auxiliary_loss_clip": 0.01462595, "auxiliary_loss_mlp": 0.0024576, "balance_loss_clip": 1.18951237, "balance_loss_mlp": 0.20928216, "epoch": 0.576521869833158, "flos": 23549930812800.0, "grad_norm": 28.107266995740073, "language_loss": 0.89437449, "learning_rate": 1.6040197756161104e-06, "loss": 0.91145802, "num_input_tokens_seen": 206575780, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.36450195, "step": 9589, "time_per_iteration": 2.7115325927734375 }, { "auxiliary_loss_clip": 0.01428717, "auxiliary_loss_mlp": 0.0024391, "balance_loss_clip": 1.17050838, "balance_loss_mlp": 0.21288025, "epoch": 0.5765819930858259, "flos": 20266582464000.0, "grad_norm": 18.211866273515128, "language_loss": 0.89396584, "learning_rate": 1.6036380312215762e-06, "loss": 0.9106921, "num_input_tokens_seen": 206594100, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.31018066, "step": 9590, "time_per_iteration": 2.6506288051605225 }, { "auxiliary_loss_clip": 0.01444577, "auxiliary_loss_mlp": 0.00226646, "balance_loss_clip": 1.18222451, "balance_loss_mlp": 0.19200343, "epoch": 0.5766421163384939, "flos": 23148772744320.0, "grad_norm": 30.49967056249911, "language_loss": 0.70963371, "learning_rate": 1.6032563018571283e-06, "loss": 0.72634602, "num_input_tokens_seen": 206613325, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.34643555, "step": 9591, "time_per_iteration": 2.6733028888702393 }, { "auxiliary_loss_clip": 0.01457502, "auxiliary_loss_mlp": 0.00271349, "balance_loss_clip": 1.18921852, "balance_loss_mlp": 0.234036, "epoch": 0.5767022395911618, "flos": 25848895962240.0, "grad_norm": 55.28713149832437, "language_loss": 0.84892225, "learning_rate": 1.6028745875372406e-06, "loss": 0.86621082, "num_input_tokens_seen": 206634265, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.37280273, "step": 9592, "time_per_iteration": 2.6924428939819336 }, { "auxiliary_loss_clip": 0.01539356, "auxiliary_loss_mlp": 0.00153476, "balance_loss_clip": 1.31743383, "balance_loss_mlp": 0.13802604, "epoch": 0.5767623628438299, "flos": 68293299657600.0, "grad_norm": 0.7188958521414582, "language_loss": 0.58906102, "learning_rate": 1.6024928882763885e-06, "loss": 0.60598934, "num_input_tokens_seen": 206696990, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.15429688, "step": 9593, "time_per_iteration": 3.3047561645507812 }, { "auxiliary_loss_clip": 0.01465179, "auxiliary_loss_mlp": 0.00276373, "balance_loss_clip": 1.19472408, "balance_loss_mlp": 0.2368193, "epoch": 0.5768224860964978, "flos": 30188448754560.0, "grad_norm": 7.976482320726445, "language_loss": 0.77412504, "learning_rate": 1.6021112040890463e-06, "loss": 0.7915405, "num_input_tokens_seen": 206717815, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.39575195, "step": 9594, "time_per_iteration": 2.725802183151245 }, { "auxiliary_loss_clip": 0.01443911, "auxiliary_loss_mlp": 0.00254616, "balance_loss_clip": 1.18057871, "balance_loss_mlp": 0.2203549, "epoch": 0.5768826093491658, "flos": 17895041884800.0, "grad_norm": 1008.1362043662963, "language_loss": 0.77702415, "learning_rate": 1.6017295349896863e-06, "loss": 0.79400945, "num_input_tokens_seen": 206735985, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.34277344, "step": 9595, "time_per_iteration": 2.639941930770874 }, { "auxiliary_loss_clip": 0.0143403, "auxiliary_loss_mlp": 0.00246224, "balance_loss_clip": 1.17612493, "balance_loss_mlp": 0.21148629, "epoch": 0.5769427326018337, "flos": 17457183095040.0, "grad_norm": 5.398729872333989, "language_loss": 0.76901031, "learning_rate": 1.6013478809927828e-06, "loss": 0.78581285, "num_input_tokens_seen": 206753370, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.34716797, "step": 9596, "time_per_iteration": 2.641700267791748 }, { "auxiliary_loss_clip": 0.01453055, "auxiliary_loss_mlp": 0.00258641, "balance_loss_clip": 1.18671834, "balance_loss_mlp": 0.22209099, "epoch": 0.5770028558545017, "flos": 39421728345600.0, "grad_norm": 9.845850994083678, "language_loss": 0.74872923, "learning_rate": 1.6009662421128074e-06, "loss": 0.76584613, "num_input_tokens_seen": 206777645, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.36523438, "step": 9597, "time_per_iteration": 2.859938859939575 }, { "auxiliary_loss_clip": 0.0144697, "auxiliary_loss_mlp": 0.00233225, "balance_loss_clip": 1.18389964, "balance_loss_mlp": 0.1999895, "epoch": 0.5770629791071697, "flos": 21536383132800.0, "grad_norm": 9.455821831082929, "language_loss": 0.87039113, "learning_rate": 1.6005846183642323e-06, "loss": 0.88719308, "num_input_tokens_seen": 206794865, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.33227539, "step": 9598, "time_per_iteration": 2.739943265914917 }, { "auxiliary_loss_clip": 0.01447512, "auxiliary_loss_mlp": 0.00243622, "balance_loss_clip": 1.1825453, "balance_loss_mlp": 0.20955247, "epoch": 0.5771231023598377, "flos": 20886795624960.0, "grad_norm": 3.91352632019393, "language_loss": 0.78871602, "learning_rate": 1.6002030097615277e-06, "loss": 0.80562735, "num_input_tokens_seen": 206814095, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.34057617, "step": 9599, "time_per_iteration": 2.7219135761260986 }, { "auxiliary_loss_clip": 0.01439654, "auxiliary_loss_mlp": 0.00254765, "balance_loss_clip": 1.17770708, "balance_loss_mlp": 0.21893084, "epoch": 0.5771832256125057, "flos": 18077216688000.0, "grad_norm": 17.777584051302807, "language_loss": 0.86816299, "learning_rate": 1.5998214163191663e-06, "loss": 0.88510716, "num_input_tokens_seen": 206832245, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.35839844, "step": 9600, "time_per_iteration": 2.6362249851226807 }, { "auxiliary_loss_clip": 0.01450962, "auxiliary_loss_mlp": 0.00248679, "balance_loss_clip": 1.17875695, "balance_loss_mlp": 0.21422724, "epoch": 0.5772433488651736, "flos": 26359078786560.0, "grad_norm": 3.0720136824438753, "language_loss": 0.81757694, "learning_rate": 1.5994398380516163e-06, "loss": 0.83457339, "num_input_tokens_seen": 206851535, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.34448242, "step": 9601, "time_per_iteration": 4.134538173675537 }, { "auxiliary_loss_clip": 0.01474031, "auxiliary_loss_mlp": 0.00242128, "balance_loss_clip": 1.20284724, "balance_loss_mlp": 0.20803379, "epoch": 0.5773034721178416, "flos": 19680987035520.0, "grad_norm": 155.4209592535194, "language_loss": 0.7434082, "learning_rate": 1.599058274973348e-06, "loss": 0.76056975, "num_input_tokens_seen": 206870595, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.34106445, "step": 9602, "time_per_iteration": 4.171952486038208 }, { "auxiliary_loss_clip": 0.01436939, "auxiliary_loss_mlp": 0.00228128, "balance_loss_clip": 1.18033576, "balance_loss_mlp": 0.19353396, "epoch": 0.5773635953705095, "flos": 25082885496960.0, "grad_norm": 39.779562569371585, "language_loss": 0.78704023, "learning_rate": 1.5986767270988297e-06, "loss": 0.80369091, "num_input_tokens_seen": 206892320, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.34594727, "step": 9603, "time_per_iteration": 2.747554302215576 }, { "auxiliary_loss_clip": 0.01441512, "auxiliary_loss_mlp": 0.00265294, "balance_loss_clip": 1.17627943, "balance_loss_mlp": 0.23239261, "epoch": 0.5774237186231775, "flos": 21032987978880.0, "grad_norm": 9.592594022469617, "language_loss": 0.83336353, "learning_rate": 1.5982951944425298e-06, "loss": 0.85043156, "num_input_tokens_seen": 206912485, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.32910156, "step": 9604, "time_per_iteration": 2.7098844051361084 }, { "auxiliary_loss_clip": 0.01456609, "auxiliary_loss_mlp": 0.00233257, "balance_loss_clip": 1.1841414, "balance_loss_mlp": 0.19804239, "epoch": 0.5774838418758454, "flos": 15231727128960.0, "grad_norm": 41.24301976616702, "language_loss": 0.92115927, "learning_rate": 1.5979136770189174e-06, "loss": 0.9380579, "num_input_tokens_seen": 206929100, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.35205078, "step": 9605, "time_per_iteration": 4.069299697875977 }, { "auxiliary_loss_clip": 0.0148695, "auxiliary_loss_mlp": 0.00269639, "balance_loss_clip": 1.20255578, "balance_loss_mlp": 0.23251759, "epoch": 0.5775439651285135, "flos": 23582609210880.0, "grad_norm": 20.687228391807416, "language_loss": 0.86693847, "learning_rate": 1.5975321748424581e-06, "loss": 0.88450438, "num_input_tokens_seen": 206947020, "router_z_loss_clip": 2.84570312, "router_z_loss_mlp": 0.37109375, "step": 9606, "time_per_iteration": 2.673295497894287 }, { "auxiliary_loss_clip": 0.01459694, "auxiliary_loss_mlp": 0.00235052, "balance_loss_clip": 1.19053721, "balance_loss_mlp": 0.19869326, "epoch": 0.5776040883811814, "flos": 18040515966720.0, "grad_norm": 2.857741229929285, "language_loss": 0.79754966, "learning_rate": 1.597150687927619e-06, "loss": 0.81449711, "num_input_tokens_seen": 206964065, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.36352539, "step": 9607, "time_per_iteration": 2.6456656455993652 }, { "auxiliary_loss_clip": 0.01456953, "auxiliary_loss_mlp": 0.00247827, "balance_loss_clip": 1.18458724, "balance_loss_mlp": 0.21504405, "epoch": 0.5776642116338494, "flos": 18624638937600.0, "grad_norm": 3.667725777142241, "language_loss": 0.7806868, "learning_rate": 1.5967692162888664e-06, "loss": 0.79773462, "num_input_tokens_seen": 206981940, "router_z_loss_clip": 2.72265625, "router_z_loss_mlp": 0.32824707, "step": 9608, "time_per_iteration": 2.639314651489258 }, { "auxiliary_loss_clip": 0.01460741, "auxiliary_loss_mlp": 0.00268914, "balance_loss_clip": 1.18921745, "balance_loss_mlp": 0.23310345, "epoch": 0.5777243348865173, "flos": 28402539517440.0, "grad_norm": 107.30129625730933, "language_loss": 0.84291703, "learning_rate": 1.596387759940665e-06, "loss": 0.86021358, "num_input_tokens_seen": 207002365, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.3581543, "step": 9609, "time_per_iteration": 2.711634397506714 }, { "auxiliary_loss_clip": 0.01433535, "auxiliary_loss_mlp": 0.00262757, "balance_loss_clip": 1.16726971, "balance_loss_mlp": 0.23006965, "epoch": 0.5777844581391853, "flos": 24024705805440.0, "grad_norm": 6.4008321242465005, "language_loss": 0.82737333, "learning_rate": 1.5960063188974808e-06, "loss": 0.84433627, "num_input_tokens_seen": 207021195, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.32714844, "step": 9610, "time_per_iteration": 2.673670768737793 }, { "auxiliary_loss_clip": 0.01449479, "auxiliary_loss_mlp": 0.00253392, "balance_loss_clip": 1.18254983, "balance_loss_mlp": 0.21660417, "epoch": 0.5778445813918534, "flos": 17777361951360.0, "grad_norm": 84.96135206811525, "language_loss": 0.79110777, "learning_rate": 1.5956248931737777e-06, "loss": 0.80813646, "num_input_tokens_seen": 207037465, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.36816406, "step": 9611, "time_per_iteration": 4.007632732391357 }, { "auxiliary_loss_clip": 0.01436472, "auxiliary_loss_mlp": 0.00238774, "balance_loss_clip": 1.17428815, "balance_loss_mlp": 0.20451364, "epoch": 0.5779047046445213, "flos": 22233194046720.0, "grad_norm": 9.141056160280792, "language_loss": 0.90793288, "learning_rate": 1.5952434827840185e-06, "loss": 0.92468536, "num_input_tokens_seen": 207054230, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.34277344, "step": 9612, "time_per_iteration": 2.636336326599121 }, { "auxiliary_loss_clip": 0.01440449, "auxiliary_loss_mlp": 0.00251429, "balance_loss_clip": 1.17669797, "balance_loss_mlp": 0.21700147, "epoch": 0.5779648278971893, "flos": 21434361528960.0, "grad_norm": 263.70635302336103, "language_loss": 0.85398054, "learning_rate": 1.594862087742667e-06, "loss": 0.87089926, "num_input_tokens_seen": 207073150, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.34423828, "step": 9613, "time_per_iteration": 2.6690611839294434 }, { "auxiliary_loss_clip": 0.01409065, "auxiliary_loss_mlp": 0.00238871, "balance_loss_clip": 1.14917886, "balance_loss_mlp": 0.20480123, "epoch": 0.5780249511498572, "flos": 19026120228480.0, "grad_norm": 10.105656089672872, "language_loss": 0.82239854, "learning_rate": 1.5944807080641863e-06, "loss": 0.83887792, "num_input_tokens_seen": 207090375, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.34057617, "step": 9614, "time_per_iteration": 2.631988763809204 }, { "auxiliary_loss_clip": 0.01448502, "auxiliary_loss_mlp": 0.00225746, "balance_loss_clip": 1.17867661, "balance_loss_mlp": 0.19041273, "epoch": 0.5780850744025252, "flos": 12124663752960.0, "grad_norm": 36.519197099310404, "language_loss": 0.92087698, "learning_rate": 1.5940993437630375e-06, "loss": 0.93761945, "num_input_tokens_seen": 207106030, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.35375977, "step": 9615, "time_per_iteration": 2.6071360111236572 }, { "auxiliary_loss_clip": 0.01448205, "auxiliary_loss_mlp": 0.00240299, "balance_loss_clip": 1.17806101, "balance_loss_mlp": 0.20131785, "epoch": 0.5781451976551931, "flos": 25044425009280.0, "grad_norm": 36.605750651577985, "language_loss": 0.75464028, "learning_rate": 1.5937179948536825e-06, "loss": 0.77152538, "num_input_tokens_seen": 207125435, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.38989258, "step": 9616, "time_per_iteration": 2.6779191493988037 }, { "auxiliary_loss_clip": 0.01426729, "auxiliary_loss_mlp": 0.00231506, "balance_loss_clip": 1.16624582, "balance_loss_mlp": 0.19843777, "epoch": 0.5782053209078611, "flos": 19245606284160.0, "grad_norm": 28.708519043813727, "language_loss": 0.85609877, "learning_rate": 1.5933366613505812e-06, "loss": 0.87268108, "num_input_tokens_seen": 207145095, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.33081055, "step": 9617, "time_per_iteration": 2.6694273948669434 }, { "auxiliary_loss_clip": 0.014651, "auxiliary_loss_mlp": 0.00234517, "balance_loss_clip": 1.19449723, "balance_loss_mlp": 0.19675133, "epoch": 0.578265444160529, "flos": 25993831340160.0, "grad_norm": 369.0253298540447, "language_loss": 0.8324995, "learning_rate": 1.5929553432681947e-06, "loss": 0.84949565, "num_input_tokens_seen": 207166045, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.37744141, "step": 9618, "time_per_iteration": 2.684602737426758 }, { "auxiliary_loss_clip": 0.01444859, "auxiliary_loss_mlp": 0.00227155, "balance_loss_clip": 1.17833626, "balance_loss_mlp": 0.19260812, "epoch": 0.5783255674131971, "flos": 21798603394560.0, "grad_norm": 3.2015767854196677, "language_loss": 0.90188551, "learning_rate": 1.5925740406209826e-06, "loss": 0.91860563, "num_input_tokens_seen": 207185290, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.34545898, "step": 9619, "time_per_iteration": 2.6560845375061035 }, { "auxiliary_loss_clip": 0.01436437, "auxiliary_loss_mlp": 0.00225683, "balance_loss_clip": 1.17079604, "balance_loss_mlp": 0.19256672, "epoch": 0.578385690665865, "flos": 24789746603520.0, "grad_norm": 4.5009149294394595, "language_loss": 0.78860444, "learning_rate": 1.5921927534234039e-06, "loss": 0.80522567, "num_input_tokens_seen": 207205505, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.33117676, "step": 9620, "time_per_iteration": 2.7850563526153564 }, { "auxiliary_loss_clip": 0.01429592, "auxiliary_loss_mlp": 0.00217328, "balance_loss_clip": 1.16856503, "balance_loss_mlp": 0.18299559, "epoch": 0.578445813918533, "flos": 21212864311680.0, "grad_norm": 42.14217996001199, "language_loss": 0.84026837, "learning_rate": 1.591811481689916e-06, "loss": 0.85673761, "num_input_tokens_seen": 207225315, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.34301758, "step": 9621, "time_per_iteration": 2.719852924346924 }, { "auxiliary_loss_clip": 0.01442048, "auxiliary_loss_mlp": 0.00251096, "balance_loss_clip": 1.17373133, "balance_loss_mlp": 0.21690726, "epoch": 0.5785059371712009, "flos": 25046795306880.0, "grad_norm": 45.831243626173986, "language_loss": 0.7719934, "learning_rate": 1.5914302254349787e-06, "loss": 0.78892487, "num_input_tokens_seen": 207247690, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.34179688, "step": 9622, "time_per_iteration": 2.745157480239868 }, { "auxiliary_loss_clip": 0.01466501, "auxiliary_loss_mlp": 0.00103415, "balance_loss_clip": 1.24388826, "balance_loss_mlp": 0.09173276, "epoch": 0.5785660604238689, "flos": 70843172284800.0, "grad_norm": 0.8285340245317143, "language_loss": 0.5535413, "learning_rate": 1.5910489846730476e-06, "loss": 0.56924045, "num_input_tokens_seen": 207301735, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.11669922, "step": 9623, "time_per_iteration": 3.2049875259399414 }, { "auxiliary_loss_clip": 0.01438836, "auxiliary_loss_mlp": 0.0025123, "balance_loss_clip": 1.16507673, "balance_loss_mlp": 0.21515723, "epoch": 0.578626183676537, "flos": 31649977244160.0, "grad_norm": 15.207848413639272, "language_loss": 0.79888129, "learning_rate": 1.5906677594185799e-06, "loss": 0.81578195, "num_input_tokens_seen": 207321240, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.36035156, "step": 9624, "time_per_iteration": 2.7583019733428955 }, { "auxiliary_loss_clip": 0.01450562, "auxiliary_loss_mlp": 0.00224622, "balance_loss_clip": 1.17653394, "balance_loss_mlp": 0.18904966, "epoch": 0.5786863069292049, "flos": 21865181253120.0, "grad_norm": 97.9894167060353, "language_loss": 0.89796585, "learning_rate": 1.5902865496860322e-06, "loss": 0.91471767, "num_input_tokens_seen": 207339540, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.35571289, "step": 9625, "time_per_iteration": 2.7461719512939453 }, { "auxiliary_loss_clip": 0.01459889, "auxiliary_loss_mlp": 0.00222062, "balance_loss_clip": 1.18082142, "balance_loss_mlp": 0.18706244, "epoch": 0.5787464301818729, "flos": 23364954748800.0, "grad_norm": 3.186677901220657, "language_loss": 0.76962787, "learning_rate": 1.5899053554898591e-06, "loss": 0.78644741, "num_input_tokens_seen": 207360470, "router_z_loss_clip": 2.79101562, "router_z_loss_mlp": 0.34985352, "step": 9626, "time_per_iteration": 2.705853223800659 }, { "auxiliary_loss_clip": 0.01451477, "auxiliary_loss_mlp": 0.00216831, "balance_loss_clip": 1.18004096, "balance_loss_mlp": 0.18090123, "epoch": 0.5788065534345408, "flos": 30004011394560.0, "grad_norm": 17.90766355068469, "language_loss": 0.77449012, "learning_rate": 1.5895241768445166e-06, "loss": 0.79117322, "num_input_tokens_seen": 207383080, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.359375, "step": 9627, "time_per_iteration": 2.709117889404297 }, { "auxiliary_loss_clip": 0.01437723, "auxiliary_loss_mlp": 0.00230496, "balance_loss_clip": 1.17052197, "balance_loss_mlp": 0.19773751, "epoch": 0.5788666766872088, "flos": 24527849564160.0, "grad_norm": 104.06457856400391, "language_loss": 0.91642541, "learning_rate": 1.589143013764458e-06, "loss": 0.93310755, "num_input_tokens_seen": 207401000, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.32714844, "step": 9628, "time_per_iteration": 2.6767141819000244 }, { "auxiliary_loss_clip": 0.01423403, "auxiliary_loss_mlp": 0.0021437, "balance_loss_clip": 1.160164, "balance_loss_mlp": 0.181564, "epoch": 0.5789267999398767, "flos": 23732823888000.0, "grad_norm": 6.082001918768536, "language_loss": 0.79570192, "learning_rate": 1.5887618662641376e-06, "loss": 0.81207967, "num_input_tokens_seen": 207419230, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.32788086, "step": 9629, "time_per_iteration": 2.6512436866760254 }, { "auxiliary_loss_clip": 0.014435, "auxiliary_loss_mlp": 0.00231508, "balance_loss_clip": 1.1719892, "balance_loss_mlp": 0.19719976, "epoch": 0.5789869231925447, "flos": 21135045496320.0, "grad_norm": 38.994443655499715, "language_loss": 0.83530331, "learning_rate": 1.5883807343580087e-06, "loss": 0.85205334, "num_input_tokens_seen": 207437615, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.34326172, "step": 9630, "time_per_iteration": 2.6597306728363037 }, { "auxiliary_loss_clip": 0.01432972, "auxiliary_loss_mlp": 0.00253083, "balance_loss_clip": 1.16859496, "balance_loss_mlp": 0.21579471, "epoch": 0.5790470464452127, "flos": 21209632087680.0, "grad_norm": 2.147854018999418, "language_loss": 0.84217983, "learning_rate": 1.587999618060523e-06, "loss": 0.85904038, "num_input_tokens_seen": 207457270, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.37329102, "step": 9631, "time_per_iteration": 2.6188278198242188 }, { "auxiliary_loss_clip": 0.01410517, "auxiliary_loss_mlp": 0.0022129, "balance_loss_clip": 1.15132642, "balance_loss_mlp": 0.18667126, "epoch": 0.5791071696978807, "flos": 23404384903680.0, "grad_norm": 91.31092809183572, "language_loss": 0.83621573, "learning_rate": 1.5876185173861333e-06, "loss": 0.85253382, "num_input_tokens_seen": 207477890, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.34594727, "step": 9632, "time_per_iteration": 2.681260347366333 }, { "auxiliary_loss_clip": 0.01429609, "auxiliary_loss_mlp": 0.00240364, "balance_loss_clip": 1.16332185, "balance_loss_mlp": 0.20693819, "epoch": 0.5791672929505486, "flos": 24206521472640.0, "grad_norm": 5.672213010223808, "language_loss": 0.86800122, "learning_rate": 1.5872374323492915e-06, "loss": 0.88470089, "num_input_tokens_seen": 207497670, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.33447266, "step": 9633, "time_per_iteration": 2.6497247219085693 }, { "auxiliary_loss_clip": 0.01439259, "auxiliary_loss_mlp": 0.00234516, "balance_loss_clip": 1.16459334, "balance_loss_mlp": 0.19870511, "epoch": 0.5792274162032166, "flos": 24348871071360.0, "grad_norm": 17.97165733572477, "language_loss": 0.87429094, "learning_rate": 1.5868563629644464e-06, "loss": 0.89102864, "num_input_tokens_seen": 207516105, "router_z_loss_clip": 2.74609375, "router_z_loss_mlp": 0.3581543, "step": 9634, "time_per_iteration": 2.664384126663208 }, { "auxiliary_loss_clip": 0.01415912, "auxiliary_loss_mlp": 0.00232572, "balance_loss_clip": 1.14962828, "balance_loss_mlp": 0.19821578, "epoch": 0.5792875394558845, "flos": 20449403712000.0, "grad_norm": 7.066425009413879, "language_loss": 0.74153781, "learning_rate": 1.5864753092460502e-06, "loss": 0.75802267, "num_input_tokens_seen": 207533685, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.34338379, "step": 9635, "time_per_iteration": 2.666520833969116 }, { "auxiliary_loss_clip": 0.01420193, "auxiliary_loss_mlp": 0.00241885, "balance_loss_clip": 1.16108763, "balance_loss_mlp": 0.20831558, "epoch": 0.5793476627085525, "flos": 24060329118720.0, "grad_norm": 2.2174441867012886, "language_loss": 0.83170176, "learning_rate": 1.5860942712085516e-06, "loss": 0.84832251, "num_input_tokens_seen": 207552840, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.33569336, "step": 9636, "time_per_iteration": 2.755195140838623 }, { "auxiliary_loss_clip": 0.01436141, "auxiliary_loss_mlp": 0.00196351, "balance_loss_clip": 1.17231035, "balance_loss_mlp": 0.16237631, "epoch": 0.5794077859612206, "flos": 22054287381120.0, "grad_norm": 40.70171318751506, "language_loss": 0.76266015, "learning_rate": 1.5857132488663998e-06, "loss": 0.77898502, "num_input_tokens_seen": 207572095, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.34008789, "step": 9637, "time_per_iteration": 2.6696088314056396 }, { "auxiliary_loss_clip": 0.01464596, "auxiliary_loss_mlp": 0.00230395, "balance_loss_clip": 1.184062, "balance_loss_mlp": 0.19327298, "epoch": 0.5794679092138885, "flos": 11434855991040.0, "grad_norm": 8.67695167067145, "language_loss": 0.8367188, "learning_rate": 1.585332242234043e-06, "loss": 0.85366875, "num_input_tokens_seen": 207587495, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.37109375, "step": 9638, "time_per_iteration": 2.624295234680176 }, { "auxiliary_loss_clip": 0.01448612, "auxiliary_loss_mlp": 0.00205052, "balance_loss_clip": 1.17957354, "balance_loss_mlp": 0.17083859, "epoch": 0.5795280324665565, "flos": 18880215183360.0, "grad_norm": 7.408440215478484, "language_loss": 0.79832411, "learning_rate": 1.5849512513259291e-06, "loss": 0.8148607, "num_input_tokens_seen": 207606795, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.34179688, "step": 9639, "time_per_iteration": 2.647068977355957 }, { "auxiliary_loss_clip": 0.01425154, "auxiliary_loss_mlp": 0.00229599, "balance_loss_clip": 1.16206157, "balance_loss_mlp": 0.19519544, "epoch": 0.5795881557192244, "flos": 13005947940480.0, "grad_norm": 39.91116198685101, "language_loss": 0.77830231, "learning_rate": 1.5845702761565054e-06, "loss": 0.79484987, "num_input_tokens_seen": 207623620, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.34399414, "step": 9640, "time_per_iteration": 2.614427328109741 }, { "auxiliary_loss_clip": 0.01455469, "auxiliary_loss_mlp": 0.0023499, "balance_loss_clip": 1.17377687, "balance_loss_mlp": 0.19772509, "epoch": 0.5796482789718924, "flos": 19932397303680.0, "grad_norm": 4.695058627405145, "language_loss": 0.86997551, "learning_rate": 1.5841893167402183e-06, "loss": 0.8868801, "num_input_tokens_seen": 207639380, "router_z_loss_clip": 2.8203125, "router_z_loss_mlp": 0.37255859, "step": 9641, "time_per_iteration": 2.6354899406433105 }, { "auxiliary_loss_clip": 0.0144221, "auxiliary_loss_mlp": 0.00219037, "balance_loss_clip": 1.17330313, "balance_loss_mlp": 0.18348902, "epoch": 0.5797084022245603, "flos": 21650794928640.0, "grad_norm": 188.15818271380263, "language_loss": 0.82008839, "learning_rate": 1.5838083730915143e-06, "loss": 0.83670092, "num_input_tokens_seen": 207657915, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.35546875, "step": 9642, "time_per_iteration": 2.636300563812256 }, { "auxiliary_loss_clip": 0.01440127, "auxiliary_loss_mlp": 0.00219295, "balance_loss_clip": 1.171875, "balance_loss_mlp": 0.18474844, "epoch": 0.5797685254772283, "flos": 26031573555840.0, "grad_norm": 8.167228162589902, "language_loss": 0.81705546, "learning_rate": 1.5834274452248378e-06, "loss": 0.83364969, "num_input_tokens_seen": 207678620, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.34545898, "step": 9643, "time_per_iteration": 4.1012115478515625 }, { "auxiliary_loss_clip": 0.01450736, "auxiliary_loss_mlp": 0.00232338, "balance_loss_clip": 1.17920184, "balance_loss_mlp": 0.19550283, "epoch": 0.5798286487298963, "flos": 22705167778560.0, "grad_norm": 74.69520285561644, "language_loss": 0.77412635, "learning_rate": 1.5830465331546352e-06, "loss": 0.79095709, "num_input_tokens_seen": 207696980, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.3684082, "step": 9644, "time_per_iteration": 4.187214612960815 }, { "auxiliary_loss_clip": 0.01449266, "auxiliary_loss_mlp": 0.00243394, "balance_loss_clip": 1.1762985, "balance_loss_mlp": 0.20798859, "epoch": 0.5798887719825643, "flos": 23148988225920.0, "grad_norm": 13.44609471094669, "language_loss": 0.93683976, "learning_rate": 1.5826656368953496e-06, "loss": 0.95376647, "num_input_tokens_seen": 207714065, "router_z_loss_clip": 2.72851562, "router_z_loss_mlp": 0.35424805, "step": 9645, "time_per_iteration": 2.632127523422241 }, { "auxiliary_loss_clip": 0.01465977, "auxiliary_loss_mlp": 0.00226154, "balance_loss_clip": 1.19033492, "balance_loss_mlp": 0.18970001, "epoch": 0.5799488952352322, "flos": 24426043441920.0, "grad_norm": 53.14552063980344, "language_loss": 0.84528553, "learning_rate": 1.5822847564614244e-06, "loss": 0.86220682, "num_input_tokens_seen": 207734720, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.36450195, "step": 9646, "time_per_iteration": 2.6819324493408203 }, { "auxiliary_loss_clip": 0.01456734, "auxiliary_loss_mlp": 0.00268034, "balance_loss_clip": 1.18353319, "balance_loss_mlp": 0.23322457, "epoch": 0.5800090184879002, "flos": 38395903829760.0, "grad_norm": 29.273130389197092, "language_loss": 0.6758033, "learning_rate": 1.5819038918673038e-06, "loss": 0.69305098, "num_input_tokens_seen": 207755435, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.34790039, "step": 9647, "time_per_iteration": 4.271580219268799 }, { "auxiliary_loss_clip": 0.0143285, "auxiliary_loss_mlp": 0.00238698, "balance_loss_clip": 1.16384375, "balance_loss_mlp": 0.20303103, "epoch": 0.5800691417405681, "flos": 19784840232960.0, "grad_norm": 127.88481271754934, "language_loss": 0.92493314, "learning_rate": 1.5815230431274288e-06, "loss": 0.9416486, "num_input_tokens_seen": 207773570, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.35668945, "step": 9648, "time_per_iteration": 2.630004405975342 }, { "auxiliary_loss_clip": 0.01504839, "auxiliary_loss_mlp": 0.00159448, "balance_loss_clip": 1.26795375, "balance_loss_mlp": 0.14981619, "epoch": 0.5801292649932361, "flos": 70314565783680.0, "grad_norm": 0.8418800483132198, "language_loss": 0.62708032, "learning_rate": 1.581142210256242e-06, "loss": 0.64372325, "num_input_tokens_seen": 207830095, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.09619141, "step": 9649, "time_per_iteration": 3.220691680908203 }, { "auxiliary_loss_clip": 0.0141049, "auxiliary_loss_mlp": 0.002321, "balance_loss_clip": 1.15390563, "balance_loss_mlp": 0.19841176, "epoch": 0.5801893882459042, "flos": 18734812928640.0, "grad_norm": 33.106368416261816, "language_loss": 0.87515783, "learning_rate": 1.5807613932681857e-06, "loss": 0.8915838, "num_input_tokens_seen": 207848555, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.33691406, "step": 9650, "time_per_iteration": 2.62776255607605 }, { "auxiliary_loss_clip": 0.01434666, "auxiliary_loss_mlp": 0.00244664, "balance_loss_clip": 1.16283655, "balance_loss_mlp": 0.21021225, "epoch": 0.5802495114985721, "flos": 15596507698560.0, "grad_norm": 41.67988874696643, "language_loss": 0.87757224, "learning_rate": 1.580380592177698e-06, "loss": 0.89436555, "num_input_tokens_seen": 207867060, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.34472656, "step": 9651, "time_per_iteration": 2.6643552780151367 }, { "auxiliary_loss_clip": 0.01465181, "auxiliary_loss_mlp": 0.00262973, "balance_loss_clip": 1.18738627, "balance_loss_mlp": 0.22585082, "epoch": 0.5803096347512401, "flos": 18255405081600.0, "grad_norm": 28.957677933807393, "language_loss": 0.83056247, "learning_rate": 1.5799998069992213e-06, "loss": 0.847844, "num_input_tokens_seen": 207884520, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.37109375, "step": 9652, "time_per_iteration": 2.619349956512451 }, { "auxiliary_loss_clip": 0.0143957, "auxiliary_loss_mlp": 0.00263755, "balance_loss_clip": 1.16705358, "balance_loss_mlp": 0.22799179, "epoch": 0.580369758003908, "flos": 22893160584960.0, "grad_norm": 4.331862951973959, "language_loss": 0.90412879, "learning_rate": 1.579619037747193e-06, "loss": 0.92116201, "num_input_tokens_seen": 207905370, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.35766602, "step": 9653, "time_per_iteration": 4.051369667053223 }, { "auxiliary_loss_clip": 0.01435394, "auxiliary_loss_mlp": 0.00289862, "balance_loss_clip": 1.16929138, "balance_loss_mlp": 0.25361055, "epoch": 0.580429881256576, "flos": 18697681244160.0, "grad_norm": 31.483214406602944, "language_loss": 0.84447879, "learning_rate": 1.5792382844360534e-06, "loss": 0.86173129, "num_input_tokens_seen": 207923790, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.36218262, "step": 9654, "time_per_iteration": 2.6401987075805664 }, { "auxiliary_loss_clip": 0.01425825, "auxiliary_loss_mlp": 0.00224884, "balance_loss_clip": 1.16430533, "balance_loss_mlp": 0.19021837, "epoch": 0.5804900045092439, "flos": 24681978823680.0, "grad_norm": 8.277652227205861, "language_loss": 0.76789933, "learning_rate": 1.5788575470802408e-06, "loss": 0.78440642, "num_input_tokens_seen": 207942335, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.34667969, "step": 9655, "time_per_iteration": 2.6327457427978516 }, { "auxiliary_loss_clip": 0.01452082, "auxiliary_loss_mlp": 0.00258275, "balance_loss_clip": 1.1701746, "balance_loss_mlp": 0.22210717, "epoch": 0.580550127761912, "flos": 23112790295040.0, "grad_norm": 23.701028042623665, "language_loss": 0.79574883, "learning_rate": 1.5784768256941915e-06, "loss": 0.81285238, "num_input_tokens_seen": 207961975, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.36181641, "step": 9656, "time_per_iteration": 2.6536576747894287 }, { "auxiliary_loss_clip": 0.01411864, "auxiliary_loss_mlp": 0.00245404, "balance_loss_clip": 1.15582013, "balance_loss_mlp": 0.21202585, "epoch": 0.5806102510145799, "flos": 18475681236480.0, "grad_norm": 9.877090035031314, "language_loss": 0.78507423, "learning_rate": 1.5780961202923433e-06, "loss": 0.80164695, "num_input_tokens_seen": 207979520, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.33374023, "step": 9657, "time_per_iteration": 2.628791570663452 }, { "auxiliary_loss_clip": 0.01461918, "auxiliary_loss_mlp": 0.00260116, "balance_loss_clip": 1.18400073, "balance_loss_mlp": 0.22423354, "epoch": 0.5806703742672479, "flos": 23915645136000.0, "grad_norm": 9.795823399552962, "language_loss": 0.8014065, "learning_rate": 1.5777154308891328e-06, "loss": 0.81862682, "num_input_tokens_seen": 207998375, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.35913086, "step": 9658, "time_per_iteration": 2.675096035003662 }, { "auxiliary_loss_clip": 0.01475242, "auxiliary_loss_mlp": 0.00101098, "balance_loss_clip": 1.24778867, "balance_loss_mlp": 0.09022592, "epoch": 0.5807304975199158, "flos": 66311999412480.0, "grad_norm": 0.6399457300481245, "language_loss": 0.52739513, "learning_rate": 1.5773347574989953e-06, "loss": 0.54315853, "num_input_tokens_seen": 208060605, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.10888672, "step": 9659, "time_per_iteration": 3.1999716758728027 }, { "auxiliary_loss_clip": 0.01435074, "auxiliary_loss_mlp": 0.00250914, "balance_loss_clip": 1.16591012, "balance_loss_mlp": 0.2148886, "epoch": 0.5807906207725838, "flos": 31722444933120.0, "grad_norm": 8.596679560694218, "language_loss": 0.68746245, "learning_rate": 1.576954100136366e-06, "loss": 0.70432234, "num_input_tokens_seen": 208080320, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.36035156, "step": 9660, "time_per_iteration": 2.7469418048858643 }, { "auxiliary_loss_clip": 0.01434586, "auxiliary_loss_mlp": 0.00288171, "balance_loss_clip": 1.16705132, "balance_loss_mlp": 0.2513116, "epoch": 0.5808507440252517, "flos": 23801161512960.0, "grad_norm": 17.84934835151538, "language_loss": 0.73175955, "learning_rate": 1.5765734588156797e-06, "loss": 0.74898708, "num_input_tokens_seen": 208099305, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.36889648, "step": 9661, "time_per_iteration": 2.6879231929779053 }, { "auxiliary_loss_clip": 0.01422389, "auxiliary_loss_mlp": 0.00236338, "balance_loss_clip": 1.16557574, "balance_loss_mlp": 0.2033644, "epoch": 0.5809108672779197, "flos": 13698449222400.0, "grad_norm": 26.514551410090075, "language_loss": 0.80029958, "learning_rate": 1.5761928335513704e-06, "loss": 0.8168869, "num_input_tokens_seen": 208116960, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.3293457, "step": 9662, "time_per_iteration": 2.686223268508911 }, { "auxiliary_loss_clip": 0.01455156, "auxiliary_loss_mlp": 0.00078035, "balance_loss_clip": 1.23167884, "balance_loss_mlp": 0.06635216, "epoch": 0.5809709905305876, "flos": 69134866381440.0, "grad_norm": 0.8442384189804264, "language_loss": 0.58126581, "learning_rate": 1.5758122243578709e-06, "loss": 0.59659779, "num_input_tokens_seen": 208182190, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.11669922, "step": 9663, "time_per_iteration": 3.2520999908447266 }, { "auxiliary_loss_clip": 0.01418864, "auxiliary_loss_mlp": 0.00251739, "balance_loss_clip": 1.15786648, "balance_loss_mlp": 0.21931401, "epoch": 0.5810311137832557, "flos": 19827538525440.0, "grad_norm": 3.4640391172418803, "language_loss": 0.88910526, "learning_rate": 1.5754316312496152e-06, "loss": 0.90581131, "num_input_tokens_seen": 208197015, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.32446289, "step": 9664, "time_per_iteration": 2.6721017360687256 }, { "auxiliary_loss_clip": 0.0142109, "auxiliary_loss_mlp": 0.0024137, "balance_loss_clip": 1.15750408, "balance_loss_mlp": 0.2073475, "epoch": 0.5810912370359237, "flos": 29238503719680.0, "grad_norm": 21.582497458036602, "language_loss": 0.86945748, "learning_rate": 1.5750510542410337e-06, "loss": 0.88608211, "num_input_tokens_seen": 208215795, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.34033203, "step": 9665, "time_per_iteration": 2.7920408248901367 }, { "auxiliary_loss_clip": 0.01447703, "auxiliary_loss_mlp": 0.00255664, "balance_loss_clip": 1.17740965, "balance_loss_mlp": 0.22354892, "epoch": 0.5811513602885916, "flos": 22785572373120.0, "grad_norm": 3.8207792152145474, "language_loss": 0.8697772, "learning_rate": 1.5746704933465599e-06, "loss": 0.8868109, "num_input_tokens_seen": 208234655, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.32104492, "step": 9666, "time_per_iteration": 2.8320844173431396 }, { "auxiliary_loss_clip": 0.01442235, "auxiliary_loss_mlp": 0.00248341, "balance_loss_clip": 1.18016648, "balance_loss_mlp": 0.21348381, "epoch": 0.5812114835412596, "flos": 18734346051840.0, "grad_norm": 2.581206960146033, "language_loss": 0.86910093, "learning_rate": 1.5742899485806227e-06, "loss": 0.88600671, "num_input_tokens_seen": 208251300, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.34838867, "step": 9667, "time_per_iteration": 2.737617254257202 }, { "auxiliary_loss_clip": 0.01460307, "auxiliary_loss_mlp": 0.00253014, "balance_loss_clip": 1.1856457, "balance_loss_mlp": 0.21861057, "epoch": 0.5812716067939275, "flos": 26431295080320.0, "grad_norm": 5.685153963038283, "language_loss": 0.83597124, "learning_rate": 1.573909419957653e-06, "loss": 0.85310441, "num_input_tokens_seen": 208272685, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.34399414, "step": 9668, "time_per_iteration": 2.8012733459472656 }, { "auxiliary_loss_clip": 0.0143394, "auxiliary_loss_mlp": 0.00238091, "balance_loss_clip": 1.16953897, "balance_loss_mlp": 0.20542765, "epoch": 0.5813317300465956, "flos": 43397865285120.0, "grad_norm": 5.199380763942596, "language_loss": 0.7471211, "learning_rate": 1.5735289074920819e-06, "loss": 0.76384139, "num_input_tokens_seen": 208294315, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.32666016, "step": 9669, "time_per_iteration": 2.855381965637207 }, { "auxiliary_loss_clip": 0.01460403, "auxiliary_loss_mlp": 0.00285664, "balance_loss_clip": 1.18558264, "balance_loss_mlp": 0.24987715, "epoch": 0.5813918532992635, "flos": 24785472885120.0, "grad_norm": 25.591232336079564, "language_loss": 0.80061251, "learning_rate": 1.5731484111983363e-06, "loss": 0.81807315, "num_input_tokens_seen": 208315610, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.35791016, "step": 9670, "time_per_iteration": 2.7088444232940674 }, { "auxiliary_loss_clip": 0.01434564, "auxiliary_loss_mlp": 0.00281517, "balance_loss_clip": 1.1692338, "balance_loss_mlp": 0.24463402, "epoch": 0.5814519765519315, "flos": 22857357703680.0, "grad_norm": 4.479030260854064, "language_loss": 0.88092422, "learning_rate": 1.5727679310908464e-06, "loss": 0.898085, "num_input_tokens_seen": 208334725, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.36914062, "step": 9671, "time_per_iteration": 2.7010111808776855 }, { "auxiliary_loss_clip": 0.0146583, "auxiliary_loss_mlp": 0.00280869, "balance_loss_clip": 1.18792522, "balance_loss_mlp": 0.24312693, "epoch": 0.5815120998045994, "flos": 24060831909120.0, "grad_norm": 39.601144432398165, "language_loss": 0.72438329, "learning_rate": 1.5723874671840399e-06, "loss": 0.74185026, "num_input_tokens_seen": 208353825, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.37744141, "step": 9672, "time_per_iteration": 2.702176570892334 }, { "auxiliary_loss_clip": 0.01424635, "auxiliary_loss_mlp": 0.00263192, "balance_loss_clip": 1.16630745, "balance_loss_mlp": 0.22988454, "epoch": 0.5815722230572674, "flos": 24279491952000.0, "grad_norm": 22.673731271433503, "language_loss": 0.88389784, "learning_rate": 1.572007019492342e-06, "loss": 0.90077609, "num_input_tokens_seen": 208374160, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.33276367, "step": 9673, "time_per_iteration": 2.687452554702759 }, { "auxiliary_loss_clip": 0.01442274, "auxiliary_loss_mlp": 0.00274211, "balance_loss_clip": 1.17376709, "balance_loss_mlp": 0.23744704, "epoch": 0.5816323463099353, "flos": 22200371994240.0, "grad_norm": 5.80618551580568, "language_loss": 0.96549642, "learning_rate": 1.5716265880301817e-06, "loss": 0.98266131, "num_input_tokens_seen": 208392105, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.36767578, "step": 9674, "time_per_iteration": 2.651585102081299 }, { "auxiliary_loss_clip": 0.01484701, "auxiliary_loss_mlp": 0.00249638, "balance_loss_clip": 1.20471072, "balance_loss_mlp": 0.21435234, "epoch": 0.5816924695626033, "flos": 24134448833280.0, "grad_norm": 287.80074106744325, "language_loss": 0.85440999, "learning_rate": 1.571246172811984e-06, "loss": 0.87175345, "num_input_tokens_seen": 208411755, "router_z_loss_clip": 2.79882812, "router_z_loss_mlp": 0.3527832, "step": 9675, "time_per_iteration": 2.6822450160980225 }, { "auxiliary_loss_clip": 0.0144211, "auxiliary_loss_mlp": 0.00228608, "balance_loss_clip": 1.1769191, "balance_loss_mlp": 0.19515836, "epoch": 0.5817525928152713, "flos": 21324223451520.0, "grad_norm": 3.1830659538130393, "language_loss": 0.78252339, "learning_rate": 1.5708657738521748e-06, "loss": 0.79923058, "num_input_tokens_seen": 208429995, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.33447266, "step": 9676, "time_per_iteration": 2.668804168701172 }, { "auxiliary_loss_clip": 0.01439924, "auxiliary_loss_mlp": 0.00279569, "balance_loss_clip": 1.1753583, "balance_loss_mlp": 0.24533188, "epoch": 0.5818127160679393, "flos": 26934510666240.0, "grad_norm": 82.96040220695258, "language_loss": 0.7218883, "learning_rate": 1.5704853911651779e-06, "loss": 0.73908317, "num_input_tokens_seen": 208443655, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.34277344, "step": 9677, "time_per_iteration": 2.698634147644043 }, { "auxiliary_loss_clip": 0.01415625, "auxiliary_loss_mlp": 0.00108149, "balance_loss_clip": 1.19720244, "balance_loss_mlp": 0.0985164, "epoch": 0.5818728393206073, "flos": 63918626342400.0, "grad_norm": 0.8124567798299924, "language_loss": 0.54034424, "learning_rate": 1.5701050247654182e-06, "loss": 0.55558205, "num_input_tokens_seen": 208498405, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.09619141, "step": 9678, "time_per_iteration": 3.234797239303589 }, { "auxiliary_loss_clip": 0.01403267, "auxiliary_loss_mlp": 0.00068916, "balance_loss_clip": 1.1902585, "balance_loss_mlp": 0.05871172, "epoch": 0.5819329625732752, "flos": 64954108638720.0, "grad_norm": 0.7310492744720045, "language_loss": 0.55581659, "learning_rate": 1.569724674667319e-06, "loss": 0.57053846, "num_input_tokens_seen": 208559075, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.10205078, "step": 9679, "time_per_iteration": 3.0527970790863037 }, { "auxiliary_loss_clip": 0.0140956, "auxiliary_loss_mlp": 0.00241682, "balance_loss_clip": 1.15623283, "balance_loss_mlp": 0.20713517, "epoch": 0.5819930858259432, "flos": 21215270522880.0, "grad_norm": 3.996650612440571, "language_loss": 0.72403675, "learning_rate": 1.5693443408853032e-06, "loss": 0.74054921, "num_input_tokens_seen": 208577770, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.34545898, "step": 9680, "time_per_iteration": 2.6927926540374756 }, { "auxiliary_loss_clip": 0.01424467, "auxiliary_loss_mlp": 0.00263878, "balance_loss_clip": 1.16489613, "balance_loss_mlp": 0.2307613, "epoch": 0.5820532090786111, "flos": 19458520151040.0, "grad_norm": 53.78562312511489, "language_loss": 0.89734864, "learning_rate": 1.5689640234337933e-06, "loss": 0.91423213, "num_input_tokens_seen": 208595110, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.33129883, "step": 9681, "time_per_iteration": 2.6897482872009277 }, { "auxiliary_loss_clip": 0.01397214, "auxiliary_loss_mlp": 0.00252289, "balance_loss_clip": 1.14650238, "balance_loss_mlp": 0.22131892, "epoch": 0.5821133323312792, "flos": 17712615686400.0, "grad_norm": 7.422136703743692, "language_loss": 0.8108592, "learning_rate": 1.5685837223272109e-06, "loss": 0.82735419, "num_input_tokens_seen": 208612080, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.30981445, "step": 9682, "time_per_iteration": 2.662064552307129 }, { "auxiliary_loss_clip": 0.01426837, "auxiliary_loss_mlp": 0.0029175, "balance_loss_clip": 1.16312003, "balance_loss_mlp": 0.25746524, "epoch": 0.5821734555839471, "flos": 24571804832640.0, "grad_norm": 48.096744698587784, "language_loss": 0.8439886, "learning_rate": 1.568203437579977e-06, "loss": 0.86117446, "num_input_tokens_seen": 208630235, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.34301758, "step": 9683, "time_per_iteration": 2.6958487033843994 }, { "auxiliary_loss_clip": 0.01435007, "auxiliary_loss_mlp": 0.00267785, "balance_loss_clip": 1.16681135, "balance_loss_mlp": 0.2355748, "epoch": 0.5822335788366151, "flos": 22382259488640.0, "grad_norm": 5.191046252542036, "language_loss": 0.80389458, "learning_rate": 1.5678231692065116e-06, "loss": 0.82092249, "num_input_tokens_seen": 208647925, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.32177734, "step": 9684, "time_per_iteration": 2.7245402336120605 }, { "auxiliary_loss_clip": 0.0141136, "auxiliary_loss_mlp": 0.00299464, "balance_loss_clip": 1.15285933, "balance_loss_mlp": 0.26742011, "epoch": 0.582293702089283, "flos": 26722494639360.0, "grad_norm": 1883.6574769105978, "language_loss": 0.84360737, "learning_rate": 1.5674429172212348e-06, "loss": 0.86071563, "num_input_tokens_seen": 208666180, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.32043457, "step": 9685, "time_per_iteration": 4.263071537017822 }, { "auxiliary_loss_clip": 0.01408416, "auxiliary_loss_mlp": 0.00261036, "balance_loss_clip": 1.15548635, "balance_loss_mlp": 0.22784799, "epoch": 0.582353825341951, "flos": 17348661129600.0, "grad_norm": 34.98425650947022, "language_loss": 0.8448981, "learning_rate": 1.5670626816385667e-06, "loss": 0.86159253, "num_input_tokens_seen": 208684240, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.33227539, "step": 9686, "time_per_iteration": 2.695342779159546 }, { "auxiliary_loss_clip": 0.01406217, "auxiliary_loss_mlp": 0.00107428, "balance_loss_clip": 1.19555593, "balance_loss_mlp": 0.09669939, "epoch": 0.5824139485946189, "flos": 55473261534720.0, "grad_norm": 0.7966234519230274, "language_loss": 0.56433403, "learning_rate": 1.5666824624729244e-06, "loss": 0.57947052, "num_input_tokens_seen": 208736090, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.10742188, "step": 9687, "time_per_iteration": 4.429518461227417 }, { "auxiliary_loss_clip": 0.01426518, "auxiliary_loss_mlp": 0.00253128, "balance_loss_clip": 1.16527152, "balance_loss_mlp": 0.22156179, "epoch": 0.582474071847287, "flos": 20303031790080.0, "grad_norm": 5.025003757303838, "language_loss": 0.78661847, "learning_rate": 1.566302259738727e-06, "loss": 0.80341488, "num_input_tokens_seen": 208754600, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.31616211, "step": 9688, "time_per_iteration": 2.732639789581299 }, { "auxiliary_loss_clip": 0.0143789, "auxiliary_loss_mlp": 0.00230918, "balance_loss_clip": 1.16866505, "balance_loss_mlp": 0.20216522, "epoch": 0.5825341950999549, "flos": 23878010661120.0, "grad_norm": 211.4461487080634, "language_loss": 0.73504448, "learning_rate": 1.5659220734503918e-06, "loss": 0.75173247, "num_input_tokens_seen": 208773140, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.2878418, "step": 9689, "time_per_iteration": 2.8384385108947754 }, { "auxiliary_loss_clip": 0.0143089, "auxiliary_loss_mlp": 0.00269078, "balance_loss_clip": 1.17263961, "balance_loss_mlp": 0.2345787, "epoch": 0.5825943183526229, "flos": 23113041690240.0, "grad_norm": 5.6604905675176775, "language_loss": 0.81156147, "learning_rate": 1.5655419036223341e-06, "loss": 0.82856119, "num_input_tokens_seen": 208793410, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.34484863, "step": 9690, "time_per_iteration": 4.209598064422607 }, { "auxiliary_loss_clip": 0.01413419, "auxiliary_loss_mlp": 0.00269502, "balance_loss_clip": 1.15490603, "balance_loss_mlp": 0.2347403, "epoch": 0.5826544416052909, "flos": 22857429530880.0, "grad_norm": 2.8693964380939154, "language_loss": 0.83441389, "learning_rate": 1.5651617502689717e-06, "loss": 0.85124314, "num_input_tokens_seen": 208811920, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.34765625, "step": 9691, "time_per_iteration": 2.659247398376465 }, { "auxiliary_loss_clip": 0.01406241, "auxiliary_loss_mlp": 0.00259712, "balance_loss_clip": 1.14478111, "balance_loss_mlp": 0.22695279, "epoch": 0.5827145648579588, "flos": 31501845555840.0, "grad_norm": 23.677123555344057, "language_loss": 0.87723905, "learning_rate": 1.5647816134047184e-06, "loss": 0.89389861, "num_input_tokens_seen": 208834720, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.32739258, "step": 9692, "time_per_iteration": 2.7709574699401855 }, { "auxiliary_loss_clip": 0.01397954, "auxiliary_loss_mlp": 0.00148867, "balance_loss_clip": 1.19543648, "balance_loss_mlp": 0.13823333, "epoch": 0.5827746881106268, "flos": 69811817074560.0, "grad_norm": 0.7704784840901414, "language_loss": 0.56665868, "learning_rate": 1.5644014930439907e-06, "loss": 0.58212692, "num_input_tokens_seen": 208898415, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.10644531, "step": 9693, "time_per_iteration": 3.1583547592163086 }, { "auxiliary_loss_clip": 0.01406627, "auxiliary_loss_mlp": 0.00247595, "balance_loss_clip": 1.15042996, "balance_loss_mlp": 0.21724401, "epoch": 0.5828348113632947, "flos": 23112395245440.0, "grad_norm": 5.6781641455398635, "language_loss": 0.86403227, "learning_rate": 1.5640213892012025e-06, "loss": 0.88057452, "num_input_tokens_seen": 208919045, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.30322266, "step": 9694, "time_per_iteration": 2.7098002433776855 }, { "auxiliary_loss_clip": 0.01394388, "auxiliary_loss_mlp": 0.00273282, "balance_loss_clip": 1.14591384, "balance_loss_mlp": 0.24302708, "epoch": 0.5828949346159628, "flos": 21873082245120.0, "grad_norm": 149.06618324276025, "language_loss": 0.81838644, "learning_rate": 1.5636413018907656e-06, "loss": 0.8350631, "num_input_tokens_seen": 208939375, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.30249023, "step": 9695, "time_per_iteration": 4.156244516372681 }, { "auxiliary_loss_clip": 0.01342892, "auxiliary_loss_mlp": 0.00104365, "balance_loss_clip": 1.14802337, "balance_loss_mlp": 0.09463781, "epoch": 0.5829550578686307, "flos": 65962553950080.0, "grad_norm": 0.7413795438955938, "language_loss": 0.54374528, "learning_rate": 1.563261231127095e-06, "loss": 0.55821782, "num_input_tokens_seen": 209004760, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.09716797, "step": 9696, "time_per_iteration": 3.247711658477783 }, { "auxiliary_loss_clip": 0.0141333, "auxiliary_loss_mlp": 0.00267658, "balance_loss_clip": 1.15726376, "balance_loss_mlp": 0.23566231, "epoch": 0.5830151811212987, "flos": 16289799079680.0, "grad_norm": 160.11330911516362, "language_loss": 0.85378724, "learning_rate": 1.5628811769246021e-06, "loss": 0.87059712, "num_input_tokens_seen": 209022930, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.32006836, "step": 9697, "time_per_iteration": 2.6413979530334473 }, { "auxiliary_loss_clip": 0.01411525, "auxiliary_loss_mlp": 0.0028151, "balance_loss_clip": 1.15295231, "balance_loss_mlp": 0.24665324, "epoch": 0.5830753043739666, "flos": 24168851084160.0, "grad_norm": 180.37702597331017, "language_loss": 0.83706081, "learning_rate": 1.5625011392976991e-06, "loss": 0.85399115, "num_input_tokens_seen": 209043740, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.34863281, "step": 9698, "time_per_iteration": 2.6929476261138916 }, { "auxiliary_loss_clip": 0.01425841, "auxiliary_loss_mlp": 0.00269421, "balance_loss_clip": 1.16470349, "balance_loss_mlp": 0.23642346, "epoch": 0.5831354276266346, "flos": 27059050097280.0, "grad_norm": 9.290964499614601, "language_loss": 0.89576495, "learning_rate": 1.5621211182607966e-06, "loss": 0.91271746, "num_input_tokens_seen": 209068885, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.33007812, "step": 9699, "time_per_iteration": 2.864872694015503 }, { "auxiliary_loss_clip": 0.01437463, "auxiliary_loss_mlp": 0.00251357, "balance_loss_clip": 1.16956961, "balance_loss_mlp": 0.21862237, "epoch": 0.5831955508793025, "flos": 23623475909760.0, "grad_norm": 142.33507769240035, "language_loss": 0.74456525, "learning_rate": 1.561741113828305e-06, "loss": 0.76145351, "num_input_tokens_seen": 209087340, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.32714844, "step": 9700, "time_per_iteration": 2.716184377670288 }, { "auxiliary_loss_clip": 0.01422462, "auxiliary_loss_mlp": 0.00299775, "balance_loss_clip": 1.15922928, "balance_loss_mlp": 0.26770726, "epoch": 0.5832556741319705, "flos": 24973250209920.0, "grad_norm": 20.773391688333017, "language_loss": 0.77631998, "learning_rate": 1.5613611260146344e-06, "loss": 0.79354239, "num_input_tokens_seen": 209108840, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.32080078, "step": 9701, "time_per_iteration": 2.756169557571411 }, { "auxiliary_loss_clip": 0.01404861, "auxiliary_loss_mlp": 0.00265855, "balance_loss_clip": 1.14934719, "balance_loss_mlp": 0.23447911, "epoch": 0.5833157973846385, "flos": 23221563655680.0, "grad_norm": 10.72596626390883, "language_loss": 0.93243372, "learning_rate": 1.5609811548341936e-06, "loss": 0.94914079, "num_input_tokens_seen": 209127985, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.31347656, "step": 9702, "time_per_iteration": 2.7264559268951416 }, { "auxiliary_loss_clip": 0.01424218, "auxiliary_loss_mlp": 0.00242462, "balance_loss_clip": 1.16523778, "balance_loss_mlp": 0.21051362, "epoch": 0.5833759206373065, "flos": 21977941023360.0, "grad_norm": 6.662741201290832, "language_loss": 0.8293494, "learning_rate": 1.560601200301392e-06, "loss": 0.84601623, "num_input_tokens_seen": 209146885, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.31958008, "step": 9703, "time_per_iteration": 2.7970352172851562 }, { "auxiliary_loss_clip": 0.0143473, "auxiliary_loss_mlp": 0.00262563, "balance_loss_clip": 1.16516519, "balance_loss_mlp": 0.22832608, "epoch": 0.5834360438899745, "flos": 21762405463680.0, "grad_norm": 23.9423745139248, "language_loss": 0.78799343, "learning_rate": 1.5602212624306366e-06, "loss": 0.80496633, "num_input_tokens_seen": 209166130, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.34204102, "step": 9704, "time_per_iteration": 2.695429563522339 }, { "auxiliary_loss_clip": 0.01395067, "auxiliary_loss_mlp": 0.00277758, "balance_loss_clip": 1.13982272, "balance_loss_mlp": 0.24504668, "epoch": 0.5834961671426424, "flos": 15992566035840.0, "grad_norm": 16.682774811757273, "language_loss": 0.87770712, "learning_rate": 1.559841341236335e-06, "loss": 0.89443535, "num_input_tokens_seen": 209183350, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.32739258, "step": 9705, "time_per_iteration": 2.636247396469116 }, { "auxiliary_loss_clip": 0.01427153, "auxiliary_loss_mlp": 0.00285746, "balance_loss_clip": 1.16194916, "balance_loss_mlp": 0.25015044, "epoch": 0.5835562903953104, "flos": 22818322598400.0, "grad_norm": 6.933015062768426, "language_loss": 0.85994565, "learning_rate": 1.5594614367328937e-06, "loss": 0.8770746, "num_input_tokens_seen": 209203945, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.35620117, "step": 9706, "time_per_iteration": 2.7172117233276367 }, { "auxiliary_loss_clip": 0.01432049, "auxiliary_loss_mlp": 0.00261408, "balance_loss_clip": 1.17264009, "balance_loss_mlp": 0.22781461, "epoch": 0.5836164136479783, "flos": 48468056624640.0, "grad_norm": 8.693902449237452, "language_loss": 0.83756614, "learning_rate": 1.5590815489347187e-06, "loss": 0.85450071, "num_input_tokens_seen": 209227080, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.3359375, "step": 9707, "time_per_iteration": 3.0046069622039795 }, { "auxiliary_loss_clip": 0.01405243, "auxiliary_loss_mlp": 0.00277565, "balance_loss_clip": 1.15778995, "balance_loss_mlp": 0.24568847, "epoch": 0.5836765369006464, "flos": 26905998245760.0, "grad_norm": 5.9673446611829934, "language_loss": 0.86950868, "learning_rate": 1.5587016778562163e-06, "loss": 0.8863368, "num_input_tokens_seen": 209248170, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.31884766, "step": 9708, "time_per_iteration": 2.8034708499908447 }, { "auxiliary_loss_clip": 0.01425922, "auxiliary_loss_mlp": 0.00272782, "balance_loss_clip": 1.16616702, "balance_loss_mlp": 0.2422287, "epoch": 0.5837366601533143, "flos": 20084048524800.0, "grad_norm": 4.533411672329313, "language_loss": 0.84687173, "learning_rate": 1.5583218235117896e-06, "loss": 0.86385876, "num_input_tokens_seen": 209267730, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.30529785, "step": 9709, "time_per_iteration": 2.6899569034576416 }, { "auxiliary_loss_clip": 0.01385813, "auxiliary_loss_mlp": 0.00151723, "balance_loss_clip": 1.1891315, "balance_loss_mlp": 0.14147066, "epoch": 0.5837967834059823, "flos": 65363885971200.0, "grad_norm": 0.7627342075640489, "language_loss": 0.56327617, "learning_rate": 1.557941985915844e-06, "loss": 0.57865155, "num_input_tokens_seen": 209332510, "router_z_loss_clip": 1.96875, "router_z_loss_mlp": 0.10253906, "step": 9710, "time_per_iteration": 3.2131545543670654 }, { "auxiliary_loss_clip": 0.01419632, "auxiliary_loss_mlp": 0.00246962, "balance_loss_clip": 1.16520071, "balance_loss_mlp": 0.2147751, "epoch": 0.5838569066586502, "flos": 25338641310720.0, "grad_norm": 34.38429744977601, "language_loss": 0.73220217, "learning_rate": 1.5575621650827833e-06, "loss": 0.74886811, "num_input_tokens_seen": 209353355, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.32202148, "step": 9711, "time_per_iteration": 2.756408929824829 }, { "auxiliary_loss_clip": 0.01443891, "auxiliary_loss_mlp": 0.00302402, "balance_loss_clip": 1.17221189, "balance_loss_mlp": 0.26566219, "epoch": 0.5839170299113182, "flos": 22229243550720.0, "grad_norm": 9.294058052943358, "language_loss": 0.86745489, "learning_rate": 1.5571823610270085e-06, "loss": 0.88491786, "num_input_tokens_seen": 209370960, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.36694336, "step": 9712, "time_per_iteration": 2.632793426513672 }, { "auxiliary_loss_clip": 0.01397062, "auxiliary_loss_mlp": 0.00280924, "balance_loss_clip": 1.14462507, "balance_loss_mlp": 0.24783117, "epoch": 0.5839771531639861, "flos": 22200012858240.0, "grad_norm": 48.2739825398545, "language_loss": 0.79508102, "learning_rate": 1.5568025737629234e-06, "loss": 0.81186086, "num_input_tokens_seen": 209390955, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.33105469, "step": 9713, "time_per_iteration": 2.655731439590454 }, { "auxiliary_loss_clip": 0.01450419, "auxiliary_loss_mlp": 0.00286868, "balance_loss_clip": 1.17641354, "balance_loss_mlp": 0.25008041, "epoch": 0.5840372764166541, "flos": 22419355259520.0, "grad_norm": 20.496828783630725, "language_loss": 0.80820906, "learning_rate": 1.5564228033049292e-06, "loss": 0.82558197, "num_input_tokens_seen": 209410260, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.36791992, "step": 9714, "time_per_iteration": 2.661358594894409 }, { "auxiliary_loss_clip": 0.01431267, "auxiliary_loss_mlp": 0.00279843, "balance_loss_clip": 1.16513681, "balance_loss_mlp": 0.24541551, "epoch": 0.5840973996693221, "flos": 19828256797440.0, "grad_norm": 34.24472509443729, "language_loss": 0.85561264, "learning_rate": 1.5560430496674268e-06, "loss": 0.87272376, "num_input_tokens_seen": 209429920, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.34399414, "step": 9715, "time_per_iteration": 2.6563961505889893 }, { "auxiliary_loss_clip": 0.01406895, "auxiliary_loss_mlp": 0.00260477, "balance_loss_clip": 1.15223992, "balance_loss_mlp": 0.2282431, "epoch": 0.5841575229219901, "flos": 21142982401920.0, "grad_norm": 39.68752471541606, "language_loss": 0.806113, "learning_rate": 1.5556633128648167e-06, "loss": 0.82278669, "num_input_tokens_seen": 209449470, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.32226562, "step": 9716, "time_per_iteration": 2.651970863342285 }, { "auxiliary_loss_clip": 0.01406309, "auxiliary_loss_mlp": 0.00251414, "balance_loss_clip": 1.15348685, "balance_loss_mlp": 0.21782109, "epoch": 0.5842176461746581, "flos": 24640322025600.0, "grad_norm": 30.753105783244543, "language_loss": 0.80958885, "learning_rate": 1.5552835929114976e-06, "loss": 0.82616609, "num_input_tokens_seen": 209467695, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.33618164, "step": 9717, "time_per_iteration": 2.704937696456909 }, { "auxiliary_loss_clip": 0.01406436, "auxiliary_loss_mlp": 0.00265279, "balance_loss_clip": 1.15433168, "balance_loss_mlp": 0.23035091, "epoch": 0.584277769427326, "flos": 19131158574720.0, "grad_norm": 484.19668146905826, "language_loss": 0.87799537, "learning_rate": 1.5549038898218697e-06, "loss": 0.89471251, "num_input_tokens_seen": 209484250, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.34936523, "step": 9718, "time_per_iteration": 2.6332077980041504 }, { "auxiliary_loss_clip": 0.01420151, "auxiliary_loss_mlp": 0.00272484, "balance_loss_clip": 1.1637001, "balance_loss_mlp": 0.23650675, "epoch": 0.584337892679994, "flos": 22675111073280.0, "grad_norm": 63.99890798149002, "language_loss": 0.75018799, "learning_rate": 1.5545242036103306e-06, "loss": 0.76711428, "num_input_tokens_seen": 209502830, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.36010742, "step": 9719, "time_per_iteration": 2.7371108531951904 }, { "auxiliary_loss_clip": 0.01421813, "auxiliary_loss_mlp": 0.00275942, "balance_loss_clip": 1.16012275, "balance_loss_mlp": 0.24058405, "epoch": 0.5843980159326619, "flos": 31284083352960.0, "grad_norm": 7.664558167722996, "language_loss": 0.84203333, "learning_rate": 1.5541445342912786e-06, "loss": 0.85901088, "num_input_tokens_seen": 209525995, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.35351562, "step": 9720, "time_per_iteration": 2.7677364349365234 }, { "auxiliary_loss_clip": 0.0143034, "auxiliary_loss_mlp": 0.00274596, "balance_loss_clip": 1.16668427, "balance_loss_mlp": 0.23759331, "epoch": 0.58445813918533, "flos": 22748117466240.0, "grad_norm": 60.915656355718326, "language_loss": 0.89671397, "learning_rate": 1.5537648818791105e-06, "loss": 0.91376334, "num_input_tokens_seen": 209545895, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.36987305, "step": 9721, "time_per_iteration": 2.727766513824463 }, { "auxiliary_loss_clip": 0.0139477, "auxiliary_loss_mlp": 0.00149993, "balance_loss_clip": 1.1996814, "balance_loss_mlp": 0.1381194, "epoch": 0.5845182624379979, "flos": 60686556658560.0, "grad_norm": 1.2501869279179294, "language_loss": 0.70787096, "learning_rate": 1.5533852463882226e-06, "loss": 0.72331864, "num_input_tokens_seen": 209602315, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.11865234, "step": 9722, "time_per_iteration": 3.284536361694336 }, { "auxiliary_loss_clip": 0.01408284, "auxiliary_loss_mlp": 0.00246688, "balance_loss_clip": 1.15242648, "balance_loss_mlp": 0.21366751, "epoch": 0.5845783856906659, "flos": 16362446336640.0, "grad_norm": 2.013606833062, "language_loss": 0.96209502, "learning_rate": 1.5530056278330113e-06, "loss": 0.97864467, "num_input_tokens_seen": 209617615, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.33056641, "step": 9723, "time_per_iteration": 2.662440061569214 }, { "auxiliary_loss_clip": 0.01418168, "auxiliary_loss_mlp": 0.00246584, "balance_loss_clip": 1.1625495, "balance_loss_mlp": 0.21206099, "epoch": 0.5846385089433338, "flos": 20083402080000.0, "grad_norm": 12.948145381204586, "language_loss": 0.74832976, "learning_rate": 1.5526260262278709e-06, "loss": 0.76497728, "num_input_tokens_seen": 209637005, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.34521484, "step": 9724, "time_per_iteration": 2.688657522201538 }, { "auxiliary_loss_clip": 0.01416864, "auxiliary_loss_mlp": 0.00253066, "balance_loss_clip": 1.15138268, "balance_loss_mlp": 0.21954402, "epoch": 0.5846986321960018, "flos": 17311062568320.0, "grad_norm": 238.4142574654594, "language_loss": 0.94747716, "learning_rate": 1.552246441587197e-06, "loss": 0.96417642, "num_input_tokens_seen": 209653170, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.33544922, "step": 9725, "time_per_iteration": 2.6472129821777344 }, { "auxiliary_loss_clip": 0.01428606, "auxiliary_loss_mlp": 0.00274967, "balance_loss_clip": 1.16552675, "balance_loss_mlp": 0.24165946, "epoch": 0.5847587554486697, "flos": 17197907748480.0, "grad_norm": 64.02242267999365, "language_loss": 0.87494206, "learning_rate": 1.5518668739253821e-06, "loss": 0.89197779, "num_input_tokens_seen": 209671275, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.33276367, "step": 9726, "time_per_iteration": 2.6538071632385254 }, { "auxiliary_loss_clip": 0.01412624, "auxiliary_loss_mlp": 0.00287173, "balance_loss_clip": 1.15958953, "balance_loss_mlp": 0.25422311, "epoch": 0.5848188787013378, "flos": 24529106540160.0, "grad_norm": 48.30326865668973, "language_loss": 0.75050312, "learning_rate": 1.5514873232568206e-06, "loss": 0.76750112, "num_input_tokens_seen": 209690380, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.3293457, "step": 9727, "time_per_iteration": 4.101094484329224 }, { "auxiliary_loss_clip": 0.01430454, "auxiliary_loss_mlp": 0.00257126, "balance_loss_clip": 1.17099535, "balance_loss_mlp": 0.22133912, "epoch": 0.5848790019540057, "flos": 20628382204800.0, "grad_norm": 17.410848074698922, "language_loss": 0.87476617, "learning_rate": 1.5511077895959055e-06, "loss": 0.89164197, "num_input_tokens_seen": 209708845, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.3581543, "step": 9728, "time_per_iteration": 2.6479456424713135 }, { "auxiliary_loss_clip": 0.01403506, "auxiliary_loss_mlp": 0.00235891, "balance_loss_clip": 1.15106702, "balance_loss_mlp": 0.20229813, "epoch": 0.5849391252066737, "flos": 22418852469120.0, "grad_norm": 6.631638027114804, "language_loss": 0.8400023, "learning_rate": 1.550728272957027e-06, "loss": 0.85639632, "num_input_tokens_seen": 209729000, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.3359375, "step": 9729, "time_per_iteration": 4.19434666633606 }, { "auxiliary_loss_clip": 0.01417711, "auxiliary_loss_mlp": 0.00232889, "balance_loss_clip": 1.15601623, "balance_loss_mlp": 0.19920063, "epoch": 0.5849992484593417, "flos": 25410929431680.0, "grad_norm": 3.437930934970691, "language_loss": 0.80134487, "learning_rate": 1.5503487733545782e-06, "loss": 0.81785089, "num_input_tokens_seen": 209747435, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.33642578, "step": 9730, "time_per_iteration": 2.743098497390747 }, { "auxiliary_loss_clip": 0.01440211, "auxiliary_loss_mlp": 0.00250902, "balance_loss_clip": 1.16888499, "balance_loss_mlp": 0.21611637, "epoch": 0.5850593717120096, "flos": 21065163586560.0, "grad_norm": 4.684179589701503, "language_loss": 0.85115719, "learning_rate": 1.5499692908029482e-06, "loss": 0.86806834, "num_input_tokens_seen": 209764910, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.34765625, "step": 9731, "time_per_iteration": 2.6558971405029297 }, { "auxiliary_loss_clip": 0.01404522, "auxiliary_loss_mlp": 0.00231289, "balance_loss_clip": 1.14792347, "balance_loss_mlp": 0.1991975, "epoch": 0.5851194949646776, "flos": 25301545539840.0, "grad_norm": 11.506297204459385, "language_loss": 0.78603309, "learning_rate": 1.549589825316528e-06, "loss": 0.80239117, "num_input_tokens_seen": 209786115, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.32104492, "step": 9732, "time_per_iteration": 4.127508163452148 }, { "auxiliary_loss_clip": 0.01456654, "auxiliary_loss_mlp": 0.00271281, "balance_loss_clip": 1.17988169, "balance_loss_mlp": 0.23642465, "epoch": 0.5851796182173455, "flos": 23587242065280.0, "grad_norm": 16.10661296579124, "language_loss": 0.60794353, "learning_rate": 1.5492103769097075e-06, "loss": 0.6252228, "num_input_tokens_seen": 209806095, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.34863281, "step": 9733, "time_per_iteration": 2.7366416454315186 }, { "auxiliary_loss_clip": 0.01438223, "auxiliary_loss_mlp": 0.00246059, "balance_loss_clip": 1.17289209, "balance_loss_mlp": 0.21170317, "epoch": 0.5852397414700136, "flos": 24822712310400.0, "grad_norm": 4.256688825068405, "language_loss": 0.97526205, "learning_rate": 1.5488309455968739e-06, "loss": 0.99210483, "num_input_tokens_seen": 209823650, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.34326172, "step": 9734, "time_per_iteration": 2.7970311641693115 }, { "auxiliary_loss_clip": 0.01398152, "auxiliary_loss_mlp": 0.00228214, "balance_loss_clip": 1.14515698, "balance_loss_mlp": 0.19607508, "epoch": 0.5852998647226815, "flos": 19937784343680.0, "grad_norm": 20.21326748910892, "language_loss": 0.7736944, "learning_rate": 1.5484515313924163e-06, "loss": 0.78995812, "num_input_tokens_seen": 209843220, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.32177734, "step": 9735, "time_per_iteration": 2.701411008834839 }, { "auxiliary_loss_clip": 0.01403656, "auxiliary_loss_mlp": 0.0024916, "balance_loss_clip": 1.13879168, "balance_loss_mlp": 0.21511409, "epoch": 0.5853599879753495, "flos": 16720367408640.0, "grad_norm": 263.04645876018594, "language_loss": 0.84873033, "learning_rate": 1.5480721343107217e-06, "loss": 0.86525857, "num_input_tokens_seen": 209854880, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.34057617, "step": 9736, "time_per_iteration": 2.6219470500946045 }, { "auxiliary_loss_clip": 0.01396198, "auxiliary_loss_mlp": 0.00272096, "balance_loss_clip": 1.14207149, "balance_loss_mlp": 0.23833615, "epoch": 0.5854201112280174, "flos": 44456583680640.0, "grad_norm": 8.658951866056226, "language_loss": 0.76728857, "learning_rate": 1.5476927543661772e-06, "loss": 0.78397155, "num_input_tokens_seen": 209877870, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.33764648, "step": 9737, "time_per_iteration": 4.343982458114624 }, { "auxiliary_loss_clip": 0.01403464, "auxiliary_loss_mlp": 0.00268338, "balance_loss_clip": 1.14862061, "balance_loss_mlp": 0.23851189, "epoch": 0.5854802344806854, "flos": 20339193807360.0, "grad_norm": 11.480425629687103, "language_loss": 0.88901794, "learning_rate": 1.547313391573169e-06, "loss": 0.90573591, "num_input_tokens_seen": 209896690, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.29833984, "step": 9738, "time_per_iteration": 2.667433738708496 }, { "auxiliary_loss_clip": 0.01409477, "auxiliary_loss_mlp": 0.00272712, "balance_loss_clip": 1.1469202, "balance_loss_mlp": 0.23892829, "epoch": 0.5855403577333533, "flos": 20921054221440.0, "grad_norm": 353.676639939439, "language_loss": 0.77037024, "learning_rate": 1.546934045946082e-06, "loss": 0.78719217, "num_input_tokens_seen": 209914640, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.33813477, "step": 9739, "time_per_iteration": 2.647698402404785 }, { "auxiliary_loss_clip": 0.01405644, "auxiliary_loss_mlp": 0.00272476, "balance_loss_clip": 1.14459908, "balance_loss_mlp": 0.23831083, "epoch": 0.5856004809860214, "flos": 20448649526400.0, "grad_norm": 7.0115182218571475, "language_loss": 0.69584978, "learning_rate": 1.5465547174993017e-06, "loss": 0.71263099, "num_input_tokens_seen": 209933375, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.34155273, "step": 9740, "time_per_iteration": 2.6579768657684326 }, { "auxiliary_loss_clip": 0.01404789, "auxiliary_loss_mlp": 0.00250649, "balance_loss_clip": 1.14383245, "balance_loss_mlp": 0.21693689, "epoch": 0.5856606042386893, "flos": 19640766781440.0, "grad_norm": 4.894074308931436, "language_loss": 0.83230978, "learning_rate": 1.5461754062472113e-06, "loss": 0.8488642, "num_input_tokens_seen": 209952055, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.3371582, "step": 9741, "time_per_iteration": 2.6509556770324707 }, { "auxiliary_loss_clip": 0.01389649, "auxiliary_loss_mlp": 0.00286042, "balance_loss_clip": 1.13807428, "balance_loss_mlp": 0.25280672, "epoch": 0.5857207274913573, "flos": 21686166846720.0, "grad_norm": 1.9740762054580359, "language_loss": 0.83016562, "learning_rate": 1.5457961122041959e-06, "loss": 0.84692252, "num_input_tokens_seen": 209971190, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.33251953, "step": 9742, "time_per_iteration": 2.741161346435547 }, { "auxiliary_loss_clip": 0.01374179, "auxiliary_loss_mlp": 0.00236661, "balance_loss_clip": 1.12408078, "balance_loss_mlp": 0.20461723, "epoch": 0.5857808507440253, "flos": 23182708118400.0, "grad_norm": 2.1491418736720416, "language_loss": 0.81471747, "learning_rate": 1.5454168353846369e-06, "loss": 0.83082592, "num_input_tokens_seen": 209990695, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.32019043, "step": 9743, "time_per_iteration": 2.6612117290496826 }, { "auxiliary_loss_clip": 0.01371068, "auxiliary_loss_mlp": 0.00246096, "balance_loss_clip": 1.12545824, "balance_loss_mlp": 0.2168417, "epoch": 0.5858409739966932, "flos": 27235299156480.0, "grad_norm": 6.927540881540055, "language_loss": 0.87426388, "learning_rate": 1.5450375758029172e-06, "loss": 0.89043558, "num_input_tokens_seen": 210010210, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.29272461, "step": 9744, "time_per_iteration": 2.694342851638794 }, { "auxiliary_loss_clip": 0.01402835, "auxiliary_loss_mlp": 0.00273193, "balance_loss_clip": 1.14288211, "balance_loss_mlp": 0.24272308, "epoch": 0.5859010972493612, "flos": 27855512317440.0, "grad_norm": 213.52783245296405, "language_loss": 0.7824595, "learning_rate": 1.5446583334734183e-06, "loss": 0.79921979, "num_input_tokens_seen": 210030030, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.30517578, "step": 9745, "time_per_iteration": 2.7748022079467773 }, { "auxiliary_loss_clip": 0.0138212, "auxiliary_loss_mlp": 0.00096948, "balance_loss_clip": 1.18206143, "balance_loss_mlp": 0.08664801, "epoch": 0.5859612205020291, "flos": 70007064428160.0, "grad_norm": 0.7090240446342765, "language_loss": 0.52542782, "learning_rate": 1.5442791084105204e-06, "loss": 0.54021847, "num_input_tokens_seen": 210094840, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.10302734, "step": 9746, "time_per_iteration": 3.291210651397705 }, { "auxiliary_loss_clip": 0.01402896, "auxiliary_loss_mlp": 0.0025908, "balance_loss_clip": 1.14089251, "balance_loss_mlp": 0.22677404, "epoch": 0.5860213437546972, "flos": 24056019486720.0, "grad_norm": 19.033436102104876, "language_loss": 0.81006104, "learning_rate": 1.5438999006286054e-06, "loss": 0.82668078, "num_input_tokens_seen": 210114660, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.32287598, "step": 9747, "time_per_iteration": 2.712878465652466 }, { "auxiliary_loss_clip": 0.01381367, "auxiliary_loss_mlp": 0.00271002, "balance_loss_clip": 1.12615943, "balance_loss_mlp": 0.24038896, "epoch": 0.5860814670073651, "flos": 18947583141120.0, "grad_norm": 6.110909032507372, "language_loss": 0.8869549, "learning_rate": 1.543520710142051e-06, "loss": 0.90347856, "num_input_tokens_seen": 210132770, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.30615234, "step": 9748, "time_per_iteration": 2.657909393310547 }, { "auxiliary_loss_clip": 0.01385287, "auxiliary_loss_mlp": 0.00270736, "balance_loss_clip": 1.12250757, "balance_loss_mlp": 0.23916942, "epoch": 0.5861415902600331, "flos": 22561848512640.0, "grad_norm": 167.6249827034542, "language_loss": 0.7958765, "learning_rate": 1.5431415369652375e-06, "loss": 0.8124367, "num_input_tokens_seen": 210151895, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.31542969, "step": 9749, "time_per_iteration": 2.7086052894592285 }, { "auxiliary_loss_clip": 0.01388977, "auxiliary_loss_mlp": 0.00255629, "balance_loss_clip": 1.13541222, "balance_loss_mlp": 0.22418121, "epoch": 0.586201713512701, "flos": 14392027912320.0, "grad_norm": 7.373261862963655, "language_loss": 0.83425915, "learning_rate": 1.5427623811125428e-06, "loss": 0.85070515, "num_input_tokens_seen": 210168040, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.31433105, "step": 9750, "time_per_iteration": 2.681375026702881 }, { "auxiliary_loss_clip": 0.01387019, "auxiliary_loss_mlp": 0.00258892, "balance_loss_clip": 1.13064337, "balance_loss_mlp": 0.22732499, "epoch": 0.586261836765369, "flos": 19498560837120.0, "grad_norm": 4.612484450684145, "language_loss": 0.78279686, "learning_rate": 1.542383242598344e-06, "loss": 0.79925591, "num_input_tokens_seen": 210187720, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.31567383, "step": 9751, "time_per_iteration": 2.6950974464416504 }, { "auxiliary_loss_clip": 0.01434978, "auxiliary_loss_mlp": 0.00263789, "balance_loss_clip": 1.15478396, "balance_loss_mlp": 0.23253272, "epoch": 0.5863219600180369, "flos": 20701819560960.0, "grad_norm": 39521.59381134093, "language_loss": 0.82842034, "learning_rate": 1.5420041214370184e-06, "loss": 0.84540802, "num_input_tokens_seen": 210206080, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.31298828, "step": 9752, "time_per_iteration": 2.64135479927063 }, { "auxiliary_loss_clip": 0.01397185, "auxiliary_loss_mlp": 0.00275451, "balance_loss_clip": 1.13960862, "balance_loss_mlp": 0.24393211, "epoch": 0.586382083270705, "flos": 19792130693760.0, "grad_norm": 3.0500934933679598, "language_loss": 0.8412025, "learning_rate": 1.541625017642943e-06, "loss": 0.85792887, "num_input_tokens_seen": 210225660, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.31518555, "step": 9753, "time_per_iteration": 2.675851583480835 }, { "auxiliary_loss_clip": 0.01377004, "auxiliary_loss_mlp": 0.00262406, "balance_loss_clip": 1.12910891, "balance_loss_mlp": 0.23182911, "epoch": 0.5864422065233729, "flos": 16500558130560.0, "grad_norm": 7.1193863385960565, "language_loss": 0.77977335, "learning_rate": 1.5412459312304927e-06, "loss": 0.79616749, "num_input_tokens_seen": 210242725, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.30578613, "step": 9754, "time_per_iteration": 2.70888352394104 }, { "auxiliary_loss_clip": 0.01423825, "auxiliary_loss_mlp": 0.00277787, "balance_loss_clip": 1.15616846, "balance_loss_mlp": 0.24126135, "epoch": 0.5865023297760409, "flos": 20413277608320.0, "grad_norm": 6.877722607870088, "language_loss": 0.81309968, "learning_rate": 1.540866862214043e-06, "loss": 0.8301158, "num_input_tokens_seen": 210263225, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.36572266, "step": 9755, "time_per_iteration": 2.6520674228668213 }, { "auxiliary_loss_clip": 0.01404003, "auxiliary_loss_mlp": 0.00115846, "balance_loss_clip": 1.20017838, "balance_loss_mlp": 0.10382976, "epoch": 0.5865624530287089, "flos": 63350769254400.0, "grad_norm": 0.7180718370044487, "language_loss": 0.56504208, "learning_rate": 1.540487810607967e-06, "loss": 0.58024061, "num_input_tokens_seen": 210322310, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.12011719, "step": 9756, "time_per_iteration": 3.180410146713257 }, { "auxiliary_loss_clip": 0.01382912, "auxiliary_loss_mlp": 0.00256846, "balance_loss_clip": 1.12987494, "balance_loss_mlp": 0.22598237, "epoch": 0.5866225762813768, "flos": 27016279977600.0, "grad_norm": 8.211426860153994, "language_loss": 0.81716919, "learning_rate": 1.5401087764266396e-06, "loss": 0.83356667, "num_input_tokens_seen": 210340845, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.30847168, "step": 9757, "time_per_iteration": 2.730315685272217 }, { "auxiliary_loss_clip": 0.0140941, "auxiliary_loss_mlp": 0.00100944, "balance_loss_clip": 1.20291817, "balance_loss_mlp": 0.08854628, "epoch": 0.5866826995340448, "flos": 72987038507520.0, "grad_norm": 0.8437965040166641, "language_loss": 0.59838903, "learning_rate": 1.5397297596844337e-06, "loss": 0.61349261, "num_input_tokens_seen": 210397815, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.12402344, "step": 9758, "time_per_iteration": 3.16658878326416 }, { "auxiliary_loss_clip": 0.01411879, "auxiliary_loss_mlp": 0.00288087, "balance_loss_clip": 1.14421976, "balance_loss_mlp": 0.25551942, "epoch": 0.5867428227867127, "flos": 21285727050240.0, "grad_norm": 8.396197363694402, "language_loss": 0.82209337, "learning_rate": 1.5393507603957212e-06, "loss": 0.83909297, "num_input_tokens_seen": 210413900, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.32568359, "step": 9759, "time_per_iteration": 2.6666951179504395 }, { "auxiliary_loss_clip": 0.01435044, "auxiliary_loss_mlp": 0.0026111, "balance_loss_clip": 1.16591239, "balance_loss_mlp": 0.23023435, "epoch": 0.5868029460393808, "flos": 33468852188160.0, "grad_norm": 21.177374520220287, "language_loss": 0.79840982, "learning_rate": 1.5389717785748742e-06, "loss": 0.81537139, "num_input_tokens_seen": 210434110, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.30883789, "step": 9760, "time_per_iteration": 2.792797327041626 }, { "auxiliary_loss_clip": 0.01409736, "auxiliary_loss_mlp": 0.0028046, "balance_loss_clip": 1.14359736, "balance_loss_mlp": 0.247797, "epoch": 0.5868630692920487, "flos": 17889475276800.0, "grad_norm": 528.1152137543207, "language_loss": 0.81315356, "learning_rate": 1.5385928142362637e-06, "loss": 0.83005553, "num_input_tokens_seen": 210451685, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.3269043, "step": 9761, "time_per_iteration": 2.639301300048828 }, { "auxiliary_loss_clip": 0.01412976, "auxiliary_loss_mlp": 0.00276266, "balance_loss_clip": 1.14202034, "balance_loss_mlp": 0.24097958, "epoch": 0.5869231925447167, "flos": 21035035054080.0, "grad_norm": 6.872173161679099, "language_loss": 0.83449221, "learning_rate": 1.5382138673942597e-06, "loss": 0.85138464, "num_input_tokens_seen": 210470825, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.35327148, "step": 9762, "time_per_iteration": 2.698421001434326 }, { "auxiliary_loss_clip": 0.01409093, "auxiliary_loss_mlp": 0.00289978, "balance_loss_clip": 1.15069401, "balance_loss_mlp": 0.25750518, "epoch": 0.5869833157973846, "flos": 74738219293440.0, "grad_norm": 12.864468539220121, "language_loss": 0.78442091, "learning_rate": 1.5378349380632317e-06, "loss": 0.80141157, "num_input_tokens_seen": 210500075, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.32446289, "step": 9763, "time_per_iteration": 3.0882415771484375 }, { "auxiliary_loss_clip": 0.01382075, "auxiliary_loss_mlp": 0.00270897, "balance_loss_clip": 1.12883854, "balance_loss_mlp": 0.24107063, "epoch": 0.5870434390500526, "flos": 17638998762240.0, "grad_norm": 6.7168529106438, "language_loss": 0.87265182, "learning_rate": 1.53745602625755e-06, "loss": 0.88918155, "num_input_tokens_seen": 210518150, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.2980957, "step": 9764, "time_per_iteration": 2.722729444503784 }, { "auxiliary_loss_clip": 0.01376821, "auxiliary_loss_mlp": 0.00277097, "balance_loss_clip": 1.12352586, "balance_loss_mlp": 0.24607839, "epoch": 0.5871035623027205, "flos": 21506146859520.0, "grad_norm": 4.4079564612935185, "language_loss": 0.84810591, "learning_rate": 1.5370771319915819e-06, "loss": 0.864645, "num_input_tokens_seen": 210537760, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.31018066, "step": 9765, "time_per_iteration": 2.7128353118896484 }, { "auxiliary_loss_clip": 0.01371248, "auxiliary_loss_mlp": 0.00257546, "balance_loss_clip": 1.12267208, "balance_loss_mlp": 0.22590792, "epoch": 0.5871636855553886, "flos": 13551861818880.0, "grad_norm": 44.09306991308684, "language_loss": 0.90336722, "learning_rate": 1.5366982552796947e-06, "loss": 0.9196552, "num_input_tokens_seen": 210555515, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.31640625, "step": 9766, "time_per_iteration": 2.6595351696014404 }, { "auxiliary_loss_clip": 0.01388311, "auxiliary_loss_mlp": 0.00291336, "balance_loss_clip": 1.12799716, "balance_loss_mlp": 0.25814807, "epoch": 0.5872238088080565, "flos": 26212922346240.0, "grad_norm": 16.6091255532613, "language_loss": 0.75443053, "learning_rate": 1.536319396136257e-06, "loss": 0.771227, "num_input_tokens_seen": 210575000, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.33178711, "step": 9767, "time_per_iteration": 2.739656686782837 }, { "auxiliary_loss_clip": 0.01391968, "auxiliary_loss_mlp": 0.00272534, "balance_loss_clip": 1.13189149, "balance_loss_mlp": 0.2410869, "epoch": 0.5872839320607245, "flos": 30665198995200.0, "grad_norm": 15.326720532632905, "language_loss": 0.72307867, "learning_rate": 1.5359405545756336e-06, "loss": 0.73972368, "num_input_tokens_seen": 210595185, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.31469727, "step": 9768, "time_per_iteration": 2.750864028930664 }, { "auxiliary_loss_clip": 0.01419331, "auxiliary_loss_mlp": 0.00140451, "balance_loss_clip": 1.21246338, "balance_loss_mlp": 0.13019854, "epoch": 0.5873440553133924, "flos": 60303570871680.0, "grad_norm": 0.6958852415824365, "language_loss": 0.53146732, "learning_rate": 1.5355617306121914e-06, "loss": 0.54706514, "num_input_tokens_seen": 210653210, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.10253906, "step": 9769, "time_per_iteration": 4.611809968948364 }, { "auxiliary_loss_clip": 0.01396917, "auxiliary_loss_mlp": 0.00260354, "balance_loss_clip": 1.13399017, "balance_loss_mlp": 0.22845402, "epoch": 0.5874041785660604, "flos": 21539292134400.0, "grad_norm": 3.4231113035819725, "language_loss": 0.76509166, "learning_rate": 1.5351829242602945e-06, "loss": 0.78166437, "num_input_tokens_seen": 210673750, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.3190918, "step": 9770, "time_per_iteration": 2.709575653076172 }, { "auxiliary_loss_clip": 0.01383174, "auxiliary_loss_mlp": 0.00288948, "balance_loss_clip": 1.12761188, "balance_loss_mlp": 0.25806069, "epoch": 0.5874643018187284, "flos": 24388947671040.0, "grad_norm": 60.34532631676938, "language_loss": 0.78054363, "learning_rate": 1.5348041355343077e-06, "loss": 0.79726481, "num_input_tokens_seen": 210692960, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.30871582, "step": 9771, "time_per_iteration": 4.273981809616089 }, { "auxiliary_loss_clip": 0.01421269, "auxiliary_loss_mlp": 0.00294272, "balance_loss_clip": 1.15204406, "balance_loss_mlp": 0.25748366, "epoch": 0.5875244250713964, "flos": 28147717457280.0, "grad_norm": 44.4117845814526, "language_loss": 0.74955183, "learning_rate": 1.5344253644485954e-06, "loss": 0.76670718, "num_input_tokens_seen": 210714040, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.36791992, "step": 9772, "time_per_iteration": 2.703845977783203 }, { "auxiliary_loss_clip": 0.01446436, "auxiliary_loss_mlp": 0.00284877, "balance_loss_clip": 1.17081714, "balance_loss_mlp": 0.24835157, "epoch": 0.5875845483240644, "flos": 25812410722560.0, "grad_norm": 9.655934174215663, "language_loss": 0.83491206, "learning_rate": 1.534046611017519e-06, "loss": 0.85222518, "num_input_tokens_seen": 210733710, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.36547852, "step": 9773, "time_per_iteration": 2.7180943489074707 }, { "auxiliary_loss_clip": 0.01440627, "auxiliary_loss_mlp": 0.00270122, "balance_loss_clip": 1.16497207, "balance_loss_mlp": 0.23671994, "epoch": 0.5876446715767323, "flos": 26906572863360.0, "grad_norm": 25.0679007741106, "language_loss": 0.64811862, "learning_rate": 1.5336678752554421e-06, "loss": 0.6652261, "num_input_tokens_seen": 210753580, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.33398438, "step": 9774, "time_per_iteration": 4.10964560508728 }, { "auxiliary_loss_clip": 0.01427589, "auxiliary_loss_mlp": 0.0028268, "balance_loss_clip": 1.15960109, "balance_loss_mlp": 0.25042203, "epoch": 0.5877047948294003, "flos": 36684832579200.0, "grad_norm": 2.8932602155855176, "language_loss": 0.75207663, "learning_rate": 1.5332891571767264e-06, "loss": 0.76917934, "num_input_tokens_seen": 210773495, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.32226562, "step": 9775, "time_per_iteration": 2.807006597518921 }, { "auxiliary_loss_clip": 0.01384423, "auxiliary_loss_mlp": 0.00280903, "balance_loss_clip": 1.12716687, "balance_loss_mlp": 0.25079089, "epoch": 0.5877649180820682, "flos": 26724721282560.0, "grad_norm": 6.831876699157589, "language_loss": 0.81679368, "learning_rate": 1.5329104567957326e-06, "loss": 0.83344692, "num_input_tokens_seen": 210793645, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.30102539, "step": 9776, "time_per_iteration": 2.714385986328125 }, { "auxiliary_loss_clip": 0.01433439, "auxiliary_loss_mlp": 0.0027281, "balance_loss_clip": 1.16240144, "balance_loss_mlp": 0.2406707, "epoch": 0.5878250413347362, "flos": 21032197879680.0, "grad_norm": 5.6359733475854865, "language_loss": 0.82073557, "learning_rate": 1.532531774126821e-06, "loss": 0.837798, "num_input_tokens_seen": 210813415, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.32128906, "step": 9777, "time_per_iteration": 2.6555569171905518 }, { "auxiliary_loss_clip": 0.01408469, "auxiliary_loss_mlp": 0.00251401, "balance_loss_clip": 1.14739919, "balance_loss_mlp": 0.22252849, "epoch": 0.5878851645874041, "flos": 25484259047040.0, "grad_norm": 7.736189327975747, "language_loss": 0.80513859, "learning_rate": 1.5321531091843512e-06, "loss": 0.82173723, "num_input_tokens_seen": 210833850, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.28857422, "step": 9778, "time_per_iteration": 2.695220947265625 }, { "auxiliary_loss_clip": 0.01385315, "auxiliary_loss_mlp": 0.00290675, "balance_loss_clip": 1.12798846, "balance_loss_mlp": 0.26113516, "epoch": 0.5879452878400722, "flos": 23769129559680.0, "grad_norm": 38.82922060148065, "language_loss": 0.76430774, "learning_rate": 1.5317744619826824e-06, "loss": 0.78106761, "num_input_tokens_seen": 210853115, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.2956543, "step": 9779, "time_per_iteration": 4.1602606773376465 }, { "auxiliary_loss_clip": 0.01417331, "auxiliary_loss_mlp": 0.00285424, "balance_loss_clip": 1.14727807, "balance_loss_mlp": 0.25261739, "epoch": 0.5880054110927401, "flos": 17824513530240.0, "grad_norm": 107.39709557361877, "language_loss": 0.73889643, "learning_rate": 1.5313958325361727e-06, "loss": 0.75592399, "num_input_tokens_seen": 210872090, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.32836914, "step": 9780, "time_per_iteration": 2.6782987117767334 }, { "auxiliary_loss_clip": 0.01402668, "auxiliary_loss_mlp": 0.00268353, "balance_loss_clip": 1.14326882, "balance_loss_mlp": 0.23535591, "epoch": 0.5880655343454081, "flos": 19463404400640.0, "grad_norm": 3.284712928551496, "language_loss": 0.81150973, "learning_rate": 1.5310172208591807e-06, "loss": 0.82821995, "num_input_tokens_seen": 210888490, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.32995605, "step": 9781, "time_per_iteration": 2.652977228164673 }, { "auxiliary_loss_clip": 0.01410003, "auxiliary_loss_mlp": 0.00295143, "balance_loss_clip": 1.15040028, "balance_loss_mlp": 0.26441085, "epoch": 0.588125657598076, "flos": 21397588980480.0, "grad_norm": 7.678013106685158, "language_loss": 0.74757463, "learning_rate": 1.5306386269660622e-06, "loss": 0.76462609, "num_input_tokens_seen": 210908220, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.30737305, "step": 9782, "time_per_iteration": 2.662749767303467 }, { "auxiliary_loss_clip": 0.01422666, "auxiliary_loss_mlp": 0.00269632, "balance_loss_clip": 1.15011299, "balance_loss_mlp": 0.23788667, "epoch": 0.588185780850744, "flos": 16034653797120.0, "grad_norm": 25.22327350441506, "language_loss": 0.79996306, "learning_rate": 1.5302600508711741e-06, "loss": 0.81688601, "num_input_tokens_seen": 210923945, "router_z_loss_clip": 2.72851562, "router_z_loss_mlp": 0.31774902, "step": 9783, "time_per_iteration": 2.655618906021118 }, { "auxiliary_loss_clip": 0.01424126, "auxiliary_loss_mlp": 0.00261337, "balance_loss_clip": 1.15142429, "balance_loss_mlp": 0.22831635, "epoch": 0.588245904103412, "flos": 23728226947200.0, "grad_norm": 14.169663181625024, "language_loss": 0.77080458, "learning_rate": 1.5298814925888719e-06, "loss": 0.78765917, "num_input_tokens_seen": 210941955, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.33056641, "step": 9784, "time_per_iteration": 2.666461229324341 }, { "auxiliary_loss_clip": 0.01422437, "auxiliary_loss_mlp": 0.00271735, "balance_loss_clip": 1.14948249, "balance_loss_mlp": 0.23945293, "epoch": 0.58830602735608, "flos": 33802534558080.0, "grad_norm": 53.04076713587355, "language_loss": 0.77390784, "learning_rate": 1.5295029521335102e-06, "loss": 0.79084957, "num_input_tokens_seen": 210963105, "router_z_loss_clip": 2.72851562, "router_z_loss_mlp": 0.32299805, "step": 9785, "time_per_iteration": 2.793402671813965 }, { "auxiliary_loss_clip": 0.01405772, "auxiliary_loss_mlp": 0.00284303, "balance_loss_clip": 1.1417191, "balance_loss_mlp": 0.25371432, "epoch": 0.588366150608748, "flos": 17090714586240.0, "grad_norm": 5.545147692579656, "language_loss": 0.85675335, "learning_rate": 1.5291244295194448e-06, "loss": 0.87365413, "num_input_tokens_seen": 210978720, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.30566406, "step": 9786, "time_per_iteration": 2.6206092834472656 }, { "auxiliary_loss_clip": 0.01406214, "auxiliary_loss_mlp": 0.00276835, "balance_loss_clip": 1.13977599, "balance_loss_mlp": 0.24464846, "epoch": 0.5884262738614159, "flos": 22127186033280.0, "grad_norm": 39.204909617521096, "language_loss": 0.87498975, "learning_rate": 1.5287459247610276e-06, "loss": 0.89182031, "num_input_tokens_seen": 210998750, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.32202148, "step": 9787, "time_per_iteration": 2.6749095916748047 }, { "auxiliary_loss_clip": 0.01416978, "auxiliary_loss_mlp": 0.00280346, "balance_loss_clip": 1.14495254, "balance_loss_mlp": 0.24935202, "epoch": 0.5884863971140839, "flos": 21031838743680.0, "grad_norm": 21.770787969223154, "language_loss": 0.73132122, "learning_rate": 1.5283674378726116e-06, "loss": 0.74829441, "num_input_tokens_seen": 211017550, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.31005859, "step": 9788, "time_per_iteration": 2.661489963531494 }, { "auxiliary_loss_clip": 0.01420435, "auxiliary_loss_mlp": 0.00273089, "balance_loss_clip": 1.15455675, "balance_loss_mlp": 0.24252416, "epoch": 0.5885465203667518, "flos": 23805112008960.0, "grad_norm": 2.750449351537852, "language_loss": 0.86540866, "learning_rate": 1.5279889688685506e-06, "loss": 0.88234389, "num_input_tokens_seen": 211034135, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.3059082, "step": 9789, "time_per_iteration": 2.713655710220337 }, { "auxiliary_loss_clip": 0.0140241, "auxiliary_loss_mlp": 0.00278492, "balance_loss_clip": 1.14532578, "balance_loss_mlp": 0.24823651, "epoch": 0.5886066436194198, "flos": 18880574319360.0, "grad_norm": 53.51497680028886, "language_loss": 0.76628613, "learning_rate": 1.5276105177631944e-06, "loss": 0.78309518, "num_input_tokens_seen": 211053850, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.30224609, "step": 9790, "time_per_iteration": 2.6472530364990234 }, { "auxiliary_loss_clip": 0.01417458, "auxiliary_loss_mlp": 0.00298147, "balance_loss_clip": 1.15363562, "balance_loss_mlp": 0.26686651, "epoch": 0.5886667668720877, "flos": 24790141653120.0, "grad_norm": 101.77230041622946, "language_loss": 0.89600122, "learning_rate": 1.527232084570895e-06, "loss": 0.91315734, "num_input_tokens_seen": 211072165, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.31298828, "step": 9791, "time_per_iteration": 2.7023463249206543 }, { "auxiliary_loss_clip": 0.01433553, "auxiliary_loss_mlp": 0.00295116, "balance_loss_clip": 1.16339087, "balance_loss_mlp": 0.26176125, "epoch": 0.5887268901247558, "flos": 21614381516160.0, "grad_norm": 4.837009662505611, "language_loss": 0.832021, "learning_rate": 1.5268536693060026e-06, "loss": 0.84930766, "num_input_tokens_seen": 211089630, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.33349609, "step": 9792, "time_per_iteration": 2.7167999744415283 }, { "auxiliary_loss_clip": 0.01420617, "auxiliary_loss_mlp": 0.00313429, "balance_loss_clip": 1.14833128, "balance_loss_mlp": 0.27752304, "epoch": 0.5887870133774237, "flos": 20481722974080.0, "grad_norm": 8.189055528430718, "language_loss": 0.76227617, "learning_rate": 1.5264752719828662e-06, "loss": 0.77961665, "num_input_tokens_seen": 211106120, "router_z_loss_clip": 2.72265625, "router_z_loss_mlp": 0.35888672, "step": 9793, "time_per_iteration": 2.664824962615967 }, { "auxiliary_loss_clip": 0.01425618, "auxiliary_loss_mlp": 0.00288728, "balance_loss_clip": 1.15852368, "balance_loss_mlp": 0.25511089, "epoch": 0.5888471366300917, "flos": 19206283870080.0, "grad_norm": 5.994595209458041, "language_loss": 0.66458869, "learning_rate": 1.5260968926158353e-06, "loss": 0.68173218, "num_input_tokens_seen": 211122450, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.33642578, "step": 9794, "time_per_iteration": 2.628608465194702 }, { "auxiliary_loss_clip": 0.01417619, "auxiliary_loss_mlp": 0.00274322, "balance_loss_clip": 1.15152466, "balance_loss_mlp": 0.24041876, "epoch": 0.5889072598827596, "flos": 19972904866560.0, "grad_norm": 35.84415310977272, "language_loss": 0.70662928, "learning_rate": 1.525718531219257e-06, "loss": 0.72354871, "num_input_tokens_seen": 211141765, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.33935547, "step": 9795, "time_per_iteration": 2.688871145248413 }, { "auxiliary_loss_clip": 0.01447281, "auxiliary_loss_mlp": 0.00266647, "balance_loss_clip": 1.18072653, "balance_loss_mlp": 0.23450767, "epoch": 0.5889673831354276, "flos": 20741249715840.0, "grad_norm": 5.758697575364143, "language_loss": 0.79649484, "learning_rate": 1.5253401878074801e-06, "loss": 0.8136341, "num_input_tokens_seen": 211160475, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.3215332, "step": 9796, "time_per_iteration": 2.7470226287841797 }, { "auxiliary_loss_clip": 0.01441847, "auxiliary_loss_mlp": 0.00271447, "balance_loss_clip": 1.17082441, "balance_loss_mlp": 0.23914167, "epoch": 0.5890275063880956, "flos": 25300935008640.0, "grad_norm": 5.4002259341556975, "language_loss": 0.88098389, "learning_rate": 1.5249618623948507e-06, "loss": 0.89811683, "num_input_tokens_seen": 211180480, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.32324219, "step": 9797, "time_per_iteration": 2.6847426891326904 }, { "auxiliary_loss_clip": 0.01447951, "auxiliary_loss_mlp": 0.00278393, "balance_loss_clip": 1.17349219, "balance_loss_mlp": 0.24446613, "epoch": 0.5890876296407636, "flos": 11765377964160.0, "grad_norm": 13.75189377783656, "language_loss": 0.8662802, "learning_rate": 1.5245835549957152e-06, "loss": 0.88354367, "num_input_tokens_seen": 211198000, "router_z_loss_clip": 2.74609375, "router_z_loss_mlp": 0.33935547, "step": 9798, "time_per_iteration": 2.629483461380005 }, { "auxiliary_loss_clip": 0.01435092, "auxiliary_loss_mlp": 0.00279247, "balance_loss_clip": 1.17433548, "balance_loss_mlp": 0.24727558, "epoch": 0.5891477528934316, "flos": 13589460380160.0, "grad_norm": 11.795779008892957, "language_loss": 0.83138669, "learning_rate": 1.5242052656244186e-06, "loss": 0.84853011, "num_input_tokens_seen": 211214765, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.31982422, "step": 9799, "time_per_iteration": 2.609060049057007 }, { "auxiliary_loss_clip": 0.0144122, "auxiliary_loss_mlp": 0.00297854, "balance_loss_clip": 1.17073905, "balance_loss_mlp": 0.26211524, "epoch": 0.5892078761460995, "flos": 15049193189760.0, "grad_norm": 19.96402258493636, "language_loss": 0.85847509, "learning_rate": 1.5238269942953064e-06, "loss": 0.87586582, "num_input_tokens_seen": 211232335, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.35742188, "step": 9800, "time_per_iteration": 2.618119716644287 }, { "auxiliary_loss_clip": 0.01447281, "auxiliary_loss_mlp": 0.00290441, "balance_loss_clip": 1.18082881, "balance_loss_mlp": 0.25618017, "epoch": 0.5892679993987675, "flos": 15778215624960.0, "grad_norm": 9.804624736491053, "language_loss": 0.87965328, "learning_rate": 1.523448741022722e-06, "loss": 0.89703047, "num_input_tokens_seen": 211249985, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.3425293, "step": 9801, "time_per_iteration": 2.607206106185913 }, { "auxiliary_loss_clip": 0.01443605, "auxiliary_loss_mlp": 0.00270126, "balance_loss_clip": 1.1759851, "balance_loss_mlp": 0.23729585, "epoch": 0.5893281226514354, "flos": 25265203954560.0, "grad_norm": 16.152813378129984, "language_loss": 0.73114479, "learning_rate": 1.5230705058210088e-06, "loss": 0.74828213, "num_input_tokens_seen": 211268425, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.32861328, "step": 9802, "time_per_iteration": 2.7237229347229004 }, { "auxiliary_loss_clip": 0.01452632, "auxiliary_loss_mlp": 0.00268273, "balance_loss_clip": 1.18295455, "balance_loss_mlp": 0.23451261, "epoch": 0.5893882459041034, "flos": 19458232842240.0, "grad_norm": 9.922739668701103, "language_loss": 0.82699907, "learning_rate": 1.5226922887045108e-06, "loss": 0.84420812, "num_input_tokens_seen": 211286680, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.33764648, "step": 9803, "time_per_iteration": 2.673612594604492 }, { "auxiliary_loss_clip": 0.01450036, "auxiliary_loss_mlp": 0.00281425, "balance_loss_clip": 1.1746794, "balance_loss_mlp": 0.24301575, "epoch": 0.5894483691567713, "flos": 20634056553600.0, "grad_norm": 2.3734516151172893, "language_loss": 0.77911937, "learning_rate": 1.5223140896875686e-06, "loss": 0.79643404, "num_input_tokens_seen": 211307700, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.3840332, "step": 9804, "time_per_iteration": 2.7216527462005615 }, { "auxiliary_loss_clip": 0.01459676, "auxiliary_loss_mlp": 0.0024905, "balance_loss_clip": 1.18780565, "balance_loss_mlp": 0.21188051, "epoch": 0.5895084924094394, "flos": 17778223877760.0, "grad_norm": 63.24793270227469, "language_loss": 0.82747436, "learning_rate": 1.5219359087845234e-06, "loss": 0.84456164, "num_input_tokens_seen": 211324835, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.37182617, "step": 9805, "time_per_iteration": 2.630599021911621 }, { "auxiliary_loss_clip": 0.01460324, "auxiliary_loss_mlp": 0.0027511, "balance_loss_clip": 1.18509316, "balance_loss_mlp": 0.23891842, "epoch": 0.5895686156621073, "flos": 20121072468480.0, "grad_norm": 4.662170052742493, "language_loss": 0.86941904, "learning_rate": 1.5215577460097174e-06, "loss": 0.88677335, "num_input_tokens_seen": 211344130, "router_z_loss_clip": 2.75585938, "router_z_loss_mlp": 0.36206055, "step": 9806, "time_per_iteration": 2.656167984008789 }, { "auxiliary_loss_clip": 0.01476659, "auxiliary_loss_mlp": 0.0028244, "balance_loss_clip": 1.19779742, "balance_loss_mlp": 0.24710615, "epoch": 0.5896287389147753, "flos": 20850058990080.0, "grad_norm": 19.910808251034002, "language_loss": 0.87735647, "learning_rate": 1.5211796013774887e-06, "loss": 0.89494741, "num_input_tokens_seen": 211362915, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.35302734, "step": 9807, "time_per_iteration": 2.6964542865753174 }, { "auxiliary_loss_clip": 0.01461865, "auxiliary_loss_mlp": 0.00264049, "balance_loss_clip": 1.19022298, "balance_loss_mlp": 0.22912024, "epoch": 0.5896888621674432, "flos": 14537897043840.0, "grad_norm": 6.199578155987513, "language_loss": 0.8352977, "learning_rate": 1.5208014749021786e-06, "loss": 0.85255688, "num_input_tokens_seen": 211380700, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.34936523, "step": 9808, "time_per_iteration": 2.7053072452545166 }, { "auxiliary_loss_clip": 0.01470446, "auxiliary_loss_mlp": 0.00285582, "balance_loss_clip": 1.19693053, "balance_loss_mlp": 0.25029653, "epoch": 0.5897489854201112, "flos": 20886759711360.0, "grad_norm": 24.593236949094756, "language_loss": 0.81472635, "learning_rate": 1.5204233665981236e-06, "loss": 0.83228672, "num_input_tokens_seen": 211400095, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.35302734, "step": 9809, "time_per_iteration": 2.699645519256592 }, { "auxiliary_loss_clip": 0.01460456, "auxiliary_loss_mlp": 0.00272201, "balance_loss_clip": 1.18939888, "balance_loss_mlp": 0.23586641, "epoch": 0.5898091086727792, "flos": 20011149872640.0, "grad_norm": 55.05087077654035, "language_loss": 0.90899014, "learning_rate": 1.5200452764796627e-06, "loss": 0.9263168, "num_input_tokens_seen": 211417810, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.36303711, "step": 9810, "time_per_iteration": 2.6804287433624268 }, { "auxiliary_loss_clip": 0.01455618, "auxiliary_loss_mlp": 0.00260101, "balance_loss_clip": 1.19197369, "balance_loss_mlp": 0.22626904, "epoch": 0.5898692319254472, "flos": 16253242012800.0, "grad_norm": 8.154298527227976, "language_loss": 0.8886925, "learning_rate": 1.5196672045611336e-06, "loss": 0.90584964, "num_input_tokens_seen": 211436020, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.33837891, "step": 9811, "time_per_iteration": 2.69559645652771 }, { "auxiliary_loss_clip": 0.01490878, "auxiliary_loss_mlp": 0.00264192, "balance_loss_clip": 1.2010963, "balance_loss_mlp": 0.22635543, "epoch": 0.5899293551781152, "flos": 20448541785600.0, "grad_norm": 4.893443018670184, "language_loss": 0.84826154, "learning_rate": 1.5192891508568715e-06, "loss": 0.86581224, "num_input_tokens_seen": 211454335, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.37841797, "step": 9812, "time_per_iteration": 4.055715799331665 }, { "auxiliary_loss_clip": 0.0146301, "auxiliary_loss_mlp": 0.00280365, "balance_loss_clip": 1.19781113, "balance_loss_mlp": 0.2451265, "epoch": 0.5899894784307831, "flos": 13881701433600.0, "grad_norm": 4.412660621467103, "language_loss": 0.76213706, "learning_rate": 1.5189111153812133e-06, "loss": 0.77957076, "num_input_tokens_seen": 211472775, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.35253906, "step": 9813, "time_per_iteration": 4.165541410446167 }, { "auxiliary_loss_clip": 0.01458701, "auxiliary_loss_mlp": 0.00279029, "balance_loss_clip": 1.18943381, "balance_loss_mlp": 0.24348038, "epoch": 0.5900496016834511, "flos": 20083797129600.0, "grad_norm": 6.664907559916376, "language_loss": 0.77416551, "learning_rate": 1.518533098148494e-06, "loss": 0.79154277, "num_input_tokens_seen": 211492195, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.35546875, "step": 9814, "time_per_iteration": 2.6638174057006836 }, { "auxiliary_loss_clip": 0.01477467, "auxiliary_loss_mlp": 0.00285508, "balance_loss_clip": 1.2047962, "balance_loss_mlp": 0.24869579, "epoch": 0.590109724936119, "flos": 20259148348800.0, "grad_norm": 12.954842924840529, "language_loss": 0.86188042, "learning_rate": 1.5181550991730476e-06, "loss": 0.87951016, "num_input_tokens_seen": 211510220, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.36816406, "step": 9815, "time_per_iteration": 2.6507041454315186 }, { "auxiliary_loss_clip": 0.01473259, "auxiliary_loss_mlp": 0.00314495, "balance_loss_clip": 1.19567323, "balance_loss_mlp": 0.27599066, "epoch": 0.590169848188787, "flos": 24235069806720.0, "grad_norm": 23.221791259601495, "language_loss": 0.84579408, "learning_rate": 1.5177771184692083e-06, "loss": 0.8636716, "num_input_tokens_seen": 211526260, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.38525391, "step": 9816, "time_per_iteration": 4.064629316329956 }, { "auxiliary_loss_clip": 0.01455592, "auxiliary_loss_mlp": 0.00287666, "balance_loss_clip": 1.18881845, "balance_loss_mlp": 0.25395346, "epoch": 0.590229971441455, "flos": 17784724239360.0, "grad_norm": 67.48930658067611, "language_loss": 0.88326997, "learning_rate": 1.517399156051309e-06, "loss": 0.90070248, "num_input_tokens_seen": 211542890, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.33740234, "step": 9817, "time_per_iteration": 2.632958173751831 }, { "auxiliary_loss_clip": 0.01469758, "auxiliary_loss_mlp": 0.00279928, "balance_loss_clip": 1.19950151, "balance_loss_mlp": 0.24025482, "epoch": 0.590290094694123, "flos": 22236893147520.0, "grad_norm": 125.06223203402995, "language_loss": 0.84421831, "learning_rate": 1.517021211933682e-06, "loss": 0.8617152, "num_input_tokens_seen": 211562685, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.39648438, "step": 9818, "time_per_iteration": 2.6349480152130127 }, { "auxiliary_loss_clip": 0.01466887, "auxiliary_loss_mlp": 0.00284745, "balance_loss_clip": 1.19916558, "balance_loss_mlp": 0.2499356, "epoch": 0.5903502179467909, "flos": 19098623831040.0, "grad_norm": 482.9918526664797, "language_loss": 0.76123679, "learning_rate": 1.5166432861306592e-06, "loss": 0.77875304, "num_input_tokens_seen": 211579960, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.34863281, "step": 9819, "time_per_iteration": 2.6330816745758057 }, { "auxiliary_loss_clip": 0.01472736, "auxiliary_loss_mlp": 0.00317005, "balance_loss_clip": 1.20528638, "balance_loss_mlp": 0.2811228, "epoch": 0.5904103411994589, "flos": 24235500769920.0, "grad_norm": 3.4526620377379706, "language_loss": 0.85529912, "learning_rate": 1.5162653786565714e-06, "loss": 0.87319648, "num_input_tokens_seen": 211599310, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.35864258, "step": 9820, "time_per_iteration": 2.6815593242645264 }, { "auxiliary_loss_clip": 0.01532907, "auxiliary_loss_mlp": 0.00128096, "balance_loss_clip": 1.30271101, "balance_loss_mlp": 0.11698586, "epoch": 0.5904704644521268, "flos": 64876613045760.0, "grad_norm": 1.1062929101518333, "language_loss": 0.64664656, "learning_rate": 1.5158874895257487e-06, "loss": 0.66325659, "num_input_tokens_seen": 211658790, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.11132812, "step": 9821, "time_per_iteration": 3.184199333190918 }, { "auxiliary_loss_clip": 0.01465714, "auxiliary_loss_mlp": 0.00299872, "balance_loss_clip": 1.19776952, "balance_loss_mlp": 0.26363283, "epoch": 0.5905305877047948, "flos": 19609991804160.0, "grad_norm": 7.923933232052816, "language_loss": 0.70191556, "learning_rate": 1.515509618752521e-06, "loss": 0.71957135, "num_input_tokens_seen": 211677240, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.36254883, "step": 9822, "time_per_iteration": 4.078229188919067 }, { "auxiliary_loss_clip": 0.01470973, "auxiliary_loss_mlp": 0.00306622, "balance_loss_clip": 1.20088136, "balance_loss_mlp": 0.27186024, "epoch": 0.5905907109574628, "flos": 18989634988800.0, "grad_norm": 49.427773294066775, "language_loss": 0.90897226, "learning_rate": 1.5151317663512173e-06, "loss": 0.92674822, "num_input_tokens_seen": 211695485, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.34765625, "step": 9823, "time_per_iteration": 2.6563172340393066 }, { "auxiliary_loss_clip": 0.01480132, "auxiliary_loss_mlp": 0.00301715, "balance_loss_clip": 1.20924211, "balance_loss_mlp": 0.26445055, "epoch": 0.5906508342101308, "flos": 22200407907840.0, "grad_norm": 66.35162497364094, "language_loss": 0.8126201, "learning_rate": 1.514753932336165e-06, "loss": 0.83043855, "num_input_tokens_seen": 211713090, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.37255859, "step": 9824, "time_per_iteration": 2.685814380645752 }, { "auxiliary_loss_clip": 0.01475089, "auxiliary_loss_mlp": 0.00351605, "balance_loss_clip": 1.1895076, "balance_loss_mlp": 0.31124067, "epoch": 0.5907109574627988, "flos": 20886687884160.0, "grad_norm": 4.23930490315166, "language_loss": 0.94340652, "learning_rate": 1.514376116721693e-06, "loss": 0.96167344, "num_input_tokens_seen": 211732510, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.40380859, "step": 9825, "time_per_iteration": 2.749094009399414 }, { "auxiliary_loss_clip": 0.01450885, "auxiliary_loss_mlp": 0.00315425, "balance_loss_clip": 1.18868399, "balance_loss_mlp": 0.28040123, "epoch": 0.5907710807154667, "flos": 21506649649920.0, "grad_norm": 20.14379221029191, "language_loss": 0.8200863, "learning_rate": 1.5139983195221272e-06, "loss": 0.83774942, "num_input_tokens_seen": 211748695, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.34985352, "step": 9826, "time_per_iteration": 2.7561163902282715 }, { "auxiliary_loss_clip": 0.01489691, "auxiliary_loss_mlp": 0.00299727, "balance_loss_clip": 1.21031296, "balance_loss_mlp": 0.2640354, "epoch": 0.5908312039681347, "flos": 22018376759040.0, "grad_norm": 5.787714201645495, "language_loss": 0.80117297, "learning_rate": 1.513620540751793e-06, "loss": 0.81906712, "num_input_tokens_seen": 211768545, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.35668945, "step": 9827, "time_per_iteration": 2.673064708709717 }, { "auxiliary_loss_clip": 0.01481745, "auxiliary_loss_mlp": 0.00325626, "balance_loss_clip": 1.2085743, "balance_loss_mlp": 0.29041147, "epoch": 0.5908913272208026, "flos": 18479523991680.0, "grad_norm": 1.9019059185243696, "language_loss": 0.85712254, "learning_rate": 1.5132427804250178e-06, "loss": 0.87519622, "num_input_tokens_seen": 211786665, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.35229492, "step": 9828, "time_per_iteration": 2.6774580478668213 }, { "auxiliary_loss_clip": 0.01494964, "auxiliary_loss_mlp": 0.00316736, "balance_loss_clip": 1.21175992, "balance_loss_mlp": 0.27737352, "epoch": 0.5909514504734706, "flos": 12312189682560.0, "grad_norm": 18.95803272588925, "language_loss": 0.96565235, "learning_rate": 1.5128650385561241e-06, "loss": 0.98376936, "num_input_tokens_seen": 211801215, "router_z_loss_clip": 2.83007812, "router_z_loss_mlp": 0.39379883, "step": 9829, "time_per_iteration": 2.645596742630005 }, { "auxiliary_loss_clip": 0.01505436, "auxiliary_loss_mlp": 0.00117243, "balance_loss_clip": 1.28202176, "balance_loss_mlp": 0.10484513, "epoch": 0.5910115737261386, "flos": 70213262451840.0, "grad_norm": 0.7464648153642631, "language_loss": 0.57608312, "learning_rate": 1.5124873151594376e-06, "loss": 0.59230995, "num_input_tokens_seen": 211857005, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.12402344, "step": 9830, "time_per_iteration": 3.1185085773468018 }, { "auxiliary_loss_clip": 0.01508483, "auxiliary_loss_mlp": 0.00297097, "balance_loss_clip": 1.21290922, "balance_loss_mlp": 0.25914064, "epoch": 0.5910716969788066, "flos": 22017766227840.0, "grad_norm": 52.86516958465032, "language_loss": 0.91713881, "learning_rate": 1.5121096102492812e-06, "loss": 0.93519461, "num_input_tokens_seen": 211876675, "router_z_loss_clip": 2.95507812, "router_z_loss_mlp": 0.37988281, "step": 9831, "time_per_iteration": 2.7070376873016357 }, { "auxiliary_loss_clip": 0.01451208, "auxiliary_loss_mlp": 0.00267996, "balance_loss_clip": 1.19184291, "balance_loss_mlp": 0.23230508, "epoch": 0.5911318202314745, "flos": 21251648021760.0, "grad_norm": 15.231786178291578, "language_loss": 0.84000003, "learning_rate": 1.5117319238399767e-06, "loss": 0.8571921, "num_input_tokens_seen": 211895725, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.35693359, "step": 9832, "time_per_iteration": 2.6910898685455322 }, { "auxiliary_loss_clip": 0.01430203, "auxiliary_loss_mlp": 0.00318812, "balance_loss_clip": 1.16868663, "balance_loss_mlp": 0.28471842, "epoch": 0.5911919434841425, "flos": 17821604528640.0, "grad_norm": 4.907792446618618, "language_loss": 0.9116441, "learning_rate": 1.511354255945847e-06, "loss": 0.92913425, "num_input_tokens_seen": 211913860, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.34106445, "step": 9833, "time_per_iteration": 2.6622090339660645 }, { "auxiliary_loss_clip": 0.01445877, "auxiliary_loss_mlp": 0.00301013, "balance_loss_clip": 1.18425393, "balance_loss_mlp": 0.26422507, "epoch": 0.5912520667368104, "flos": 20374781207040.0, "grad_norm": 40.19126541259085, "language_loss": 0.816365, "learning_rate": 1.5109766065812123e-06, "loss": 0.83383393, "num_input_tokens_seen": 211932880, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.36767578, "step": 9834, "time_per_iteration": 2.734718084335327 }, { "auxiliary_loss_clip": 0.01468623, "auxiliary_loss_mlp": 0.00262866, "balance_loss_clip": 1.19714546, "balance_loss_mlp": 0.23067942, "epoch": 0.5913121899894784, "flos": 17930557457280.0, "grad_norm": 10.74890940781382, "language_loss": 0.86840856, "learning_rate": 1.5105989757603942e-06, "loss": 0.88572341, "num_input_tokens_seen": 211948625, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.32177734, "step": 9835, "time_per_iteration": 2.639735460281372 }, { "auxiliary_loss_clip": 0.01465659, "auxiliary_loss_mlp": 0.00285, "balance_loss_clip": 1.1905489, "balance_loss_mlp": 0.24821189, "epoch": 0.5913723132421465, "flos": 22126934638080.0, "grad_norm": 7.008198970993985, "language_loss": 0.83002919, "learning_rate": 1.5102213634977117e-06, "loss": 0.84753585, "num_input_tokens_seen": 211965355, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.36791992, "step": 9836, "time_per_iteration": 2.675624370574951 }, { "auxiliary_loss_clip": 0.01472647, "auxiliary_loss_mlp": 0.00287301, "balance_loss_clip": 1.19862914, "balance_loss_mlp": 0.2512756, "epoch": 0.5914324364948144, "flos": 15697918771200.0, "grad_norm": 7.848924686274073, "language_loss": 0.93524665, "learning_rate": 1.5098437698074841e-06, "loss": 0.95284605, "num_input_tokens_seen": 211982245, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.36035156, "step": 9837, "time_per_iteration": 2.6648731231689453 }, { "auxiliary_loss_clip": 0.01466642, "auxiliary_loss_mlp": 0.00314027, "balance_loss_clip": 1.19505453, "balance_loss_mlp": 0.27394876, "epoch": 0.5914925597474824, "flos": 22747327367040.0, "grad_norm": 17.927501516814694, "language_loss": 0.86555082, "learning_rate": 1.5094661947040304e-06, "loss": 0.88335752, "num_input_tokens_seen": 212000250, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.40087891, "step": 9838, "time_per_iteration": 2.7165932655334473 }, { "auxiliary_loss_clip": 0.01460026, "auxiliary_loss_mlp": 0.00280965, "balance_loss_clip": 1.1871413, "balance_loss_mlp": 0.24603659, "epoch": 0.5915526830001503, "flos": 18292788161280.0, "grad_norm": 101.15500425465639, "language_loss": 0.77888095, "learning_rate": 1.5090886382016673e-06, "loss": 0.79629087, "num_input_tokens_seen": 212017505, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.34912109, "step": 9839, "time_per_iteration": 2.672856330871582 }, { "auxiliary_loss_clip": 0.01465333, "auxiliary_loss_mlp": 0.00322869, "balance_loss_clip": 1.19870758, "balance_loss_mlp": 0.28670064, "epoch": 0.5916128062528183, "flos": 17019072910080.0, "grad_norm": 20.55763808358542, "language_loss": 0.73081607, "learning_rate": 1.5087111003147124e-06, "loss": 0.74869812, "num_input_tokens_seen": 212034595, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.36181641, "step": 9840, "time_per_iteration": 2.6809816360473633 }, { "auxiliary_loss_clip": 0.01468998, "auxiliary_loss_mlp": 0.00301625, "balance_loss_clip": 1.19385648, "balance_loss_mlp": 0.26319221, "epoch": 0.5916729295054862, "flos": 24754231031040.0, "grad_norm": 5.0660359190492334, "language_loss": 0.88029432, "learning_rate": 1.5083335810574813e-06, "loss": 0.89800054, "num_input_tokens_seen": 212055775, "router_z_loss_clip": 2.75585938, "router_z_loss_mlp": 0.38427734, "step": 9841, "time_per_iteration": 2.7070467472076416 }, { "auxiliary_loss_clip": 0.01457733, "auxiliary_loss_mlp": 0.00274547, "balance_loss_clip": 1.19044077, "balance_loss_mlp": 0.24109688, "epoch": 0.5917330527581542, "flos": 15958199698560.0, "grad_norm": 2.993261078552376, "language_loss": 0.76162952, "learning_rate": 1.507956080444291e-06, "loss": 0.77895236, "num_input_tokens_seen": 212074000, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.33447266, "step": 9842, "time_per_iteration": 2.6800053119659424 }, { "auxiliary_loss_clip": 0.01442849, "auxiliary_loss_mlp": 0.00319291, "balance_loss_clip": 1.17226994, "balance_loss_mlp": 0.28226411, "epoch": 0.5917931760108222, "flos": 23800730549760.0, "grad_norm": 28.796897877350066, "language_loss": 0.8847307, "learning_rate": 1.5075785984894549e-06, "loss": 0.90235209, "num_input_tokens_seen": 212091415, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.37036133, "step": 9843, "time_per_iteration": 2.6853530406951904 }, { "auxiliary_loss_clip": 0.01464016, "auxiliary_loss_mlp": 0.00293084, "balance_loss_clip": 1.1908443, "balance_loss_mlp": 0.25794142, "epoch": 0.5918532992634902, "flos": 23249609199360.0, "grad_norm": 53.26492211869063, "language_loss": 0.91761816, "learning_rate": 1.5072011352072875e-06, "loss": 0.93518913, "num_input_tokens_seen": 212105255, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.35131836, "step": 9844, "time_per_iteration": 2.706874132156372 }, { "auxiliary_loss_clip": 0.01441935, "auxiliary_loss_mlp": 0.0027557, "balance_loss_clip": 1.17514193, "balance_loss_mlp": 0.24331211, "epoch": 0.5919134225161581, "flos": 19499853726720.0, "grad_norm": 226.14954246795605, "language_loss": 0.81536454, "learning_rate": 1.5068236906121032e-06, "loss": 0.83253962, "num_input_tokens_seen": 212122765, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.32226562, "step": 9845, "time_per_iteration": 2.632582902908325 }, { "auxiliary_loss_clip": 0.01449562, "auxiliary_loss_mlp": 0.00283158, "balance_loss_clip": 1.17169535, "balance_loss_mlp": 0.24763392, "epoch": 0.5919735457688261, "flos": 38800940567040.0, "grad_norm": 938.7277273818116, "language_loss": 0.74268115, "learning_rate": 1.506446264718213e-06, "loss": 0.76000834, "num_input_tokens_seen": 212143960, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.35498047, "step": 9846, "time_per_iteration": 2.8471009731292725 }, { "auxiliary_loss_clip": 0.01464888, "auxiliary_loss_mlp": 0.00298333, "balance_loss_clip": 1.19845295, "balance_loss_mlp": 0.26440641, "epoch": 0.592033669021494, "flos": 22163994495360.0, "grad_norm": 16.358482299053545, "language_loss": 0.82697076, "learning_rate": 1.506068857539931e-06, "loss": 0.84460294, "num_input_tokens_seen": 212162005, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.33911133, "step": 9847, "time_per_iteration": 2.6835548877716064 }, { "auxiliary_loss_clip": 0.01478768, "auxiliary_loss_mlp": 0.00275662, "balance_loss_clip": 1.19633973, "balance_loss_mlp": 0.24111506, "epoch": 0.592093792274162, "flos": 22710985781760.0, "grad_norm": 14.175486984303348, "language_loss": 0.71645164, "learning_rate": 1.5056914690915667e-06, "loss": 0.73399603, "num_input_tokens_seen": 212181635, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.34521484, "step": 9848, "time_per_iteration": 2.736417531967163 }, { "auxiliary_loss_clip": 0.01445219, "auxiliary_loss_mlp": 0.0034156, "balance_loss_clip": 1.17667902, "balance_loss_mlp": 0.30491549, "epoch": 0.59215391552683, "flos": 22528954632960.0, "grad_norm": 4.298839828126828, "language_loss": 0.8446424, "learning_rate": 1.5053140993874312e-06, "loss": 0.8625102, "num_input_tokens_seen": 212201615, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.36621094, "step": 9849, "time_per_iteration": 2.7862935066223145 }, { "auxiliary_loss_clip": 0.01450549, "auxiliary_loss_mlp": 0.00292668, "balance_loss_clip": 1.17817092, "balance_loss_mlp": 0.25602332, "epoch": 0.592214038779498, "flos": 24499013921280.0, "grad_norm": 2962.871254075516, "language_loss": 0.8283686, "learning_rate": 1.5049367484418353e-06, "loss": 0.84580076, "num_input_tokens_seen": 212219355, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.36669922, "step": 9850, "time_per_iteration": 2.7426486015319824 }, { "auxiliary_loss_clip": 0.01438762, "auxiliary_loss_mlp": 0.00312984, "balance_loss_clip": 1.17687035, "balance_loss_mlp": 0.2811076, "epoch": 0.592274162032166, "flos": 21831353619840.0, "grad_norm": 9.118937782162256, "language_loss": 0.8215934, "learning_rate": 1.5045594162690868e-06, "loss": 0.83911085, "num_input_tokens_seen": 212236710, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.3190918, "step": 9851, "time_per_iteration": 2.6561756134033203 }, { "auxiliary_loss_clip": 0.01443587, "auxiliary_loss_mlp": 0.00329978, "balance_loss_clip": 1.17967081, "balance_loss_mlp": 0.29497844, "epoch": 0.5923342852848339, "flos": 24608146417920.0, "grad_norm": 83.99252907700021, "language_loss": 0.78716826, "learning_rate": 1.5041821028834954e-06, "loss": 0.80490386, "num_input_tokens_seen": 212256195, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.35009766, "step": 9852, "time_per_iteration": 2.7424511909484863 }, { "auxiliary_loss_clip": 0.01437507, "auxiliary_loss_mlp": 0.00305297, "balance_loss_clip": 1.1671989, "balance_loss_mlp": 0.27039286, "epoch": 0.5923944085375019, "flos": 19938143479680.0, "grad_norm": 1.9598793737321079, "language_loss": 0.87193763, "learning_rate": 1.5038048082993685e-06, "loss": 0.88936567, "num_input_tokens_seen": 212274085, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.34899902, "step": 9853, "time_per_iteration": 2.682861089706421 }, { "auxiliary_loss_clip": 0.01470869, "auxiliary_loss_mlp": 0.00282615, "balance_loss_clip": 1.19921947, "balance_loss_mlp": 0.24675706, "epoch": 0.5924545317901698, "flos": 28658510812800.0, "grad_norm": 30.934778056911128, "language_loss": 0.74047124, "learning_rate": 1.5034275325310124e-06, "loss": 0.7580061, "num_input_tokens_seen": 212295530, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.35888672, "step": 9854, "time_per_iteration": 4.245787858963013 }, { "auxiliary_loss_clip": 0.01439293, "auxiliary_loss_mlp": 0.00294375, "balance_loss_clip": 1.17464995, "balance_loss_mlp": 0.26156843, "epoch": 0.5925146550428378, "flos": 19864885691520.0, "grad_norm": 23.62804026185717, "language_loss": 0.96210563, "learning_rate": 1.5030502755927344e-06, "loss": 0.97944224, "num_input_tokens_seen": 212313770, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.328125, "step": 9855, "time_per_iteration": 4.219951868057251 }, { "auxiliary_loss_clip": 0.01443998, "auxiliary_loss_mlp": 0.00278741, "balance_loss_clip": 1.17844605, "balance_loss_mlp": 0.24753246, "epoch": 0.5925747782955058, "flos": 15122989681920.0, "grad_norm": 30.161368309566647, "language_loss": 0.94176614, "learning_rate": 1.5026730374988397e-06, "loss": 0.95899349, "num_input_tokens_seen": 212331525, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.31225586, "step": 9856, "time_per_iteration": 2.64567494392395 }, { "auxiliary_loss_clip": 0.01443545, "auxiliary_loss_mlp": 0.00275445, "balance_loss_clip": 1.17439985, "balance_loss_mlp": 0.24182834, "epoch": 0.5926349015481738, "flos": 18405440190720.0, "grad_norm": 15.530529186061308, "language_loss": 0.84364498, "learning_rate": 1.5022958182636332e-06, "loss": 0.8608349, "num_input_tokens_seen": 212347295, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.33618164, "step": 9857, "time_per_iteration": 2.626005172729492 }, { "auxiliary_loss_clip": 0.01463858, "auxiliary_loss_mlp": 0.00301267, "balance_loss_clip": 1.1936816, "balance_loss_mlp": 0.26629087, "epoch": 0.5926950248008417, "flos": 23111138269440.0, "grad_norm": 39.66020012412898, "language_loss": 0.72082698, "learning_rate": 1.501918617901419e-06, "loss": 0.73847818, "num_input_tokens_seen": 212365750, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.34985352, "step": 9858, "time_per_iteration": 2.7187845706939697 }, { "auxiliary_loss_clip": 0.01438943, "auxiliary_loss_mlp": 0.00279842, "balance_loss_clip": 1.17380714, "balance_loss_mlp": 0.24541402, "epoch": 0.5927551480535097, "flos": 28033916192640.0, "grad_norm": 212.2868180546516, "language_loss": 0.83492434, "learning_rate": 1.501541436426501e-06, "loss": 0.85211217, "num_input_tokens_seen": 212385300, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.34423828, "step": 9859, "time_per_iteration": 4.128331422805786 }, { "auxiliary_loss_clip": 0.0146023, "auxiliary_loss_mlp": 0.00276058, "balance_loss_clip": 1.1844486, "balance_loss_mlp": 0.24084404, "epoch": 0.5928152713061776, "flos": 21798675221760.0, "grad_norm": 4.187935599303439, "language_loss": 0.84330487, "learning_rate": 1.5011642738531818e-06, "loss": 0.86066771, "num_input_tokens_seen": 212402140, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.35229492, "step": 9860, "time_per_iteration": 2.6558737754821777 }, { "auxiliary_loss_clip": 0.01456431, "auxiliary_loss_mlp": 0.00267485, "balance_loss_clip": 1.18576884, "balance_loss_mlp": 0.23436883, "epoch": 0.5928753945588456, "flos": 24316839118080.0, "grad_norm": 33.369115108060264, "language_loss": 0.83452761, "learning_rate": 1.500787130195763e-06, "loss": 0.85176677, "num_input_tokens_seen": 212421790, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.33081055, "step": 9861, "time_per_iteration": 2.6949849128723145 }, { "auxiliary_loss_clip": 0.01423269, "auxiliary_loss_mlp": 0.00305644, "balance_loss_clip": 1.16187406, "balance_loss_mlp": 0.27302876, "epoch": 0.5929355178115137, "flos": 26464619923200.0, "grad_norm": 77.33186359607825, "language_loss": 0.75122076, "learning_rate": 1.5004100054685465e-06, "loss": 0.76850992, "num_input_tokens_seen": 212442115, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.32617188, "step": 9862, "time_per_iteration": 2.694572687149048 }, { "auxiliary_loss_clip": 0.01423971, "auxiliary_loss_mlp": 0.00273993, "balance_loss_clip": 1.16096485, "balance_loss_mlp": 0.24267627, "epoch": 0.5929956410641816, "flos": 24965995662720.0, "grad_norm": 59.65783740673292, "language_loss": 0.84409404, "learning_rate": 1.500032899685832e-06, "loss": 0.86107373, "num_input_tokens_seen": 212459535, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.31311035, "step": 9863, "time_per_iteration": 2.702258586883545 }, { "auxiliary_loss_clip": 0.01472651, "auxiliary_loss_mlp": 0.00294875, "balance_loss_clip": 1.19935524, "balance_loss_mlp": 0.26028088, "epoch": 0.5930557643168496, "flos": 26208325405440.0, "grad_norm": 4.097955551040658, "language_loss": 0.77815211, "learning_rate": 1.499655812861921e-06, "loss": 0.79582739, "num_input_tokens_seen": 212479385, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.34619141, "step": 9864, "time_per_iteration": 4.1070473194122314 }, { "auxiliary_loss_clip": 0.01434026, "auxiliary_loss_mlp": 0.0028807, "balance_loss_clip": 1.17057538, "balance_loss_mlp": 0.25571692, "epoch": 0.5931158875695175, "flos": 27854937699840.0, "grad_norm": 61.56773092020042, "language_loss": 0.75440192, "learning_rate": 1.4992787450111112e-06, "loss": 0.7716229, "num_input_tokens_seen": 212500060, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.32348633, "step": 9865, "time_per_iteration": 2.7351908683776855 }, { "auxiliary_loss_clip": 0.01460661, "auxiliary_loss_mlp": 0.00282852, "balance_loss_clip": 1.18790269, "balance_loss_mlp": 0.24782883, "epoch": 0.5931760108221855, "flos": 15413650536960.0, "grad_norm": 3.88678547848397, "language_loss": 0.87669456, "learning_rate": 1.4989016961477015e-06, "loss": 0.89412963, "num_input_tokens_seen": 212518590, "router_z_loss_clip": 2.72851562, "router_z_loss_mlp": 0.3503418, "step": 9866, "time_per_iteration": 2.728977680206299 }, { "auxiliary_loss_clip": 0.01469724, "auxiliary_loss_mlp": 0.00285482, "balance_loss_clip": 1.20097351, "balance_loss_mlp": 0.25315273, "epoch": 0.5932361340748534, "flos": 30188520581760.0, "grad_norm": 31.61678718349712, "language_loss": 0.78580081, "learning_rate": 1.4985246662859903e-06, "loss": 0.80335295, "num_input_tokens_seen": 212538190, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.32324219, "step": 9867, "time_per_iteration": 2.7604734897613525 }, { "auxiliary_loss_clip": 0.01460088, "auxiliary_loss_mlp": 0.00290878, "balance_loss_clip": 1.18901873, "balance_loss_mlp": 0.25370854, "epoch": 0.5932962573275214, "flos": 20157557708160.0, "grad_norm": 85.84715009285594, "language_loss": 0.75173974, "learning_rate": 1.4981476554402732e-06, "loss": 0.76924932, "num_input_tokens_seen": 212557820, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.37158203, "step": 9868, "time_per_iteration": 2.7167835235595703 }, { "auxiliary_loss_clip": 0.01452228, "auxiliary_loss_mlp": 0.00315207, "balance_loss_clip": 1.17687941, "balance_loss_mlp": 0.27956307, "epoch": 0.5933563805801894, "flos": 25445906300160.0, "grad_norm": 9.950357815816668, "language_loss": 0.81664532, "learning_rate": 1.4977706636248478e-06, "loss": 0.83431965, "num_input_tokens_seen": 212577645, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.35668945, "step": 9869, "time_per_iteration": 2.7083303928375244 }, { "auxiliary_loss_clip": 0.01490561, "auxiliary_loss_mlp": 0.00288896, "balance_loss_clip": 1.20737004, "balance_loss_mlp": 0.24969959, "epoch": 0.5934165038328574, "flos": 59995740337920.0, "grad_norm": 5.090103037790406, "language_loss": 0.79936242, "learning_rate": 1.4973936908540091e-06, "loss": 0.81715703, "num_input_tokens_seen": 212603430, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.3918457, "step": 9870, "time_per_iteration": 3.0205936431884766 }, { "auxiliary_loss_clip": 0.01462296, "auxiliary_loss_mlp": 0.00315965, "balance_loss_clip": 1.18846273, "balance_loss_mlp": 0.27939144, "epoch": 0.5934766270855253, "flos": 24420548661120.0, "grad_norm": 16.045545491246536, "language_loss": 0.81537628, "learning_rate": 1.4970167371420517e-06, "loss": 0.83315885, "num_input_tokens_seen": 212620730, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.3659668, "step": 9871, "time_per_iteration": 2.8234212398529053 }, { "auxiliary_loss_clip": 0.01465198, "auxiliary_loss_mlp": 0.00287943, "balance_loss_clip": 1.1898669, "balance_loss_mlp": 0.25115544, "epoch": 0.5935367503381933, "flos": 23513158264320.0, "grad_norm": 82.78545572899289, "language_loss": 0.83107162, "learning_rate": 1.496639802503271e-06, "loss": 0.84860301, "num_input_tokens_seen": 212639745, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.36791992, "step": 9872, "time_per_iteration": 2.7907941341400146 }, { "auxiliary_loss_clip": 0.0146362, "auxiliary_loss_mlp": 0.00314442, "balance_loss_clip": 1.18371809, "balance_loss_mlp": 0.27803516, "epoch": 0.5935968735908612, "flos": 18948337326720.0, "grad_norm": 6691.857552820181, "language_loss": 0.88066834, "learning_rate": 1.4962628869519583e-06, "loss": 0.89844894, "num_input_tokens_seen": 212655915, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.36401367, "step": 9873, "time_per_iteration": 2.663792848587036 }, { "auxiliary_loss_clip": 0.01475834, "auxiliary_loss_mlp": 0.00292449, "balance_loss_clip": 1.19771707, "balance_loss_mlp": 0.25494593, "epoch": 0.5936569968435292, "flos": 25483433034240.0, "grad_norm": 19.84337259200671, "language_loss": 0.9024213, "learning_rate": 1.4958859905024078e-06, "loss": 0.92010415, "num_input_tokens_seen": 212676115, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.375, "step": 9874, "time_per_iteration": 2.7288613319396973 }, { "auxiliary_loss_clip": 0.014908, "auxiliary_loss_mlp": 0.00120416, "balance_loss_clip": 1.26804423, "balance_loss_mlp": 0.10854246, "epoch": 0.5937171200961973, "flos": 66378361789440.0, "grad_norm": 0.693204142766552, "language_loss": 0.59237176, "learning_rate": 1.4955091131689115e-06, "loss": 0.60848391, "num_input_tokens_seen": 212737560, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.11865234, "step": 9875, "time_per_iteration": 3.2925188541412354 }, { "auxiliary_loss_clip": 0.0146191, "auxiliary_loss_mlp": 0.00280768, "balance_loss_clip": 1.1799314, "balance_loss_mlp": 0.24345513, "epoch": 0.5937772433488652, "flos": 14903467712640.0, "grad_norm": 36.12365358482084, "language_loss": 0.85037589, "learning_rate": 1.4951322549657594e-06, "loss": 0.86780262, "num_input_tokens_seen": 212755365, "router_z_loss_clip": 2.8203125, "router_z_loss_mlp": 0.37329102, "step": 9876, "time_per_iteration": 2.6889657974243164 }, { "auxiliary_loss_clip": 0.01449746, "auxiliary_loss_mlp": 0.00301939, "balance_loss_clip": 1.19027758, "balance_loss_mlp": 0.26629579, "epoch": 0.5938373666015332, "flos": 22561489376640.0, "grad_norm": 22.38973231787652, "language_loss": 0.79803997, "learning_rate": 1.494755415907243e-06, "loss": 0.81555676, "num_input_tokens_seen": 212773875, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.35668945, "step": 9877, "time_per_iteration": 2.6529200077056885 }, { "auxiliary_loss_clip": 0.01478519, "auxiliary_loss_mlp": 0.00299851, "balance_loss_clip": 1.19903016, "balance_loss_mlp": 0.26239544, "epoch": 0.5938974898542011, "flos": 18440883936000.0, "grad_norm": 8.174596748437049, "language_loss": 0.89258218, "learning_rate": 1.4943785960076522e-06, "loss": 0.91036594, "num_input_tokens_seen": 212790590, "router_z_loss_clip": 2.79492188, "router_z_loss_mlp": 0.37475586, "step": 9878, "time_per_iteration": 2.6163697242736816 }, { "auxiliary_loss_clip": 0.01471302, "auxiliary_loss_mlp": 0.00331032, "balance_loss_clip": 1.19612551, "balance_loss_mlp": 0.29402909, "epoch": 0.5939576131068691, "flos": 45586728270720.0, "grad_norm": 4.556496793336722, "language_loss": 0.77193946, "learning_rate": 1.4940017952812754e-06, "loss": 0.78996277, "num_input_tokens_seen": 212812265, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.36987305, "step": 9879, "time_per_iteration": 2.9265189170837402 }, { "auxiliary_loss_clip": 0.0147089, "auxiliary_loss_mlp": 0.00289772, "balance_loss_clip": 1.2013489, "balance_loss_mlp": 0.25439042, "epoch": 0.594017736359537, "flos": 23587708942080.0, "grad_norm": 3.375069495827247, "language_loss": 0.65870386, "learning_rate": 1.493625013742401e-06, "loss": 0.67631048, "num_input_tokens_seen": 212831915, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.35375977, "step": 9880, "time_per_iteration": 2.722580671310425 }, { "auxiliary_loss_clip": 0.01458356, "auxiliary_loss_mlp": 0.00295207, "balance_loss_clip": 1.18528271, "balance_loss_mlp": 0.25989693, "epoch": 0.594077859612205, "flos": 29457235589760.0, "grad_norm": 3.7279646874262236, "language_loss": 0.86263251, "learning_rate": 1.4932482514053177e-06, "loss": 0.88016814, "num_input_tokens_seen": 212851350, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.35302734, "step": 9881, "time_per_iteration": 2.72393798828125 }, { "auxiliary_loss_clip": 0.01480868, "auxiliary_loss_mlp": 0.00306144, "balance_loss_clip": 1.20126653, "balance_loss_mlp": 0.2674244, "epoch": 0.594137982864873, "flos": 16800089644800.0, "grad_norm": 20.847391549236384, "language_loss": 0.92174178, "learning_rate": 1.4928715082843112e-06, "loss": 0.93961185, "num_input_tokens_seen": 212867995, "router_z_loss_clip": 2.79882812, "router_z_loss_mlp": 0.38745117, "step": 9882, "time_per_iteration": 2.716397762298584 }, { "auxiliary_loss_clip": 0.01484937, "auxiliary_loss_mlp": 0.0029434, "balance_loss_clip": 1.20329404, "balance_loss_mlp": 0.26048455, "epoch": 0.594198106117541, "flos": 12750263953920.0, "grad_norm": 6.381291255385358, "language_loss": 0.89115751, "learning_rate": 1.492494784393667e-06, "loss": 0.90895033, "num_input_tokens_seen": 212885220, "router_z_loss_clip": 2.8203125, "router_z_loss_mlp": 0.33862305, "step": 9883, "time_per_iteration": 2.6225528717041016 }, { "auxiliary_loss_clip": 0.01474819, "auxiliary_loss_mlp": 0.00273823, "balance_loss_clip": 1.19872451, "balance_loss_mlp": 0.23658216, "epoch": 0.5942582293702089, "flos": 20996538652800.0, "grad_norm": 6.705618974231539, "language_loss": 0.84062123, "learning_rate": 1.4921180797476725e-06, "loss": 0.85810763, "num_input_tokens_seen": 212903195, "router_z_loss_clip": 2.75976562, "router_z_loss_mlp": 0.37255859, "step": 9884, "time_per_iteration": 2.697502374649048 }, { "auxiliary_loss_clip": 0.01492972, "auxiliary_loss_mlp": 0.00318569, "balance_loss_clip": 1.20609856, "balance_loss_mlp": 0.28197166, "epoch": 0.5943183526228769, "flos": 28291431772800.0, "grad_norm": 10.982445436356532, "language_loss": 0.77227426, "learning_rate": 1.4917413943606106e-06, "loss": 0.79038966, "num_input_tokens_seen": 212923340, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 0.36547852, "step": 9885, "time_per_iteration": 2.7172701358795166 }, { "auxiliary_loss_clip": 0.01470808, "auxiliary_loss_mlp": 0.00312502, "balance_loss_clip": 1.19535732, "balance_loss_mlp": 0.27340147, "epoch": 0.5943784758755448, "flos": 26614619118720.0, "grad_norm": 31.730910303499858, "language_loss": 0.85656375, "learning_rate": 1.4913647282467667e-06, "loss": 0.87439686, "num_input_tokens_seen": 212942755, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.39135742, "step": 9886, "time_per_iteration": 2.7271623611450195 }, { "auxiliary_loss_clip": 0.01475223, "auxiliary_loss_mlp": 0.00080213, "balance_loss_clip": 1.25392604, "balance_loss_mlp": 0.06819718, "epoch": 0.5944385991282128, "flos": 64190935347840.0, "grad_norm": 0.8419632447241692, "language_loss": 0.63697332, "learning_rate": 1.490988081420423e-06, "loss": 0.65252769, "num_input_tokens_seen": 212999355, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.12011719, "step": 9887, "time_per_iteration": 3.0732152462005615 }, { "auxiliary_loss_clip": 0.01469217, "auxiliary_loss_mlp": 0.00320778, "balance_loss_clip": 1.19714367, "balance_loss_mlp": 0.28742331, "epoch": 0.5944987223808808, "flos": 19571998193280.0, "grad_norm": 5.170479588840294, "language_loss": 0.76923394, "learning_rate": 1.4906114538958615e-06, "loss": 0.78713393, "num_input_tokens_seen": 213018570, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.33325195, "step": 9888, "time_per_iteration": 2.6562225818634033 }, { "auxiliary_loss_clip": 0.01498973, "auxiliary_loss_mlp": 0.00318698, "balance_loss_clip": 1.21231246, "balance_loss_mlp": 0.28043211, "epoch": 0.5945588456335488, "flos": 26177586341760.0, "grad_norm": 26.056069759639005, "language_loss": 0.8575995, "learning_rate": 1.490234845687366e-06, "loss": 0.87577623, "num_input_tokens_seen": 213037735, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.38305664, "step": 9889, "time_per_iteration": 2.7081849575042725 }, { "auxiliary_loss_clip": 0.01452425, "auxiliary_loss_mlp": 0.00326827, "balance_loss_clip": 1.1824069, "balance_loss_mlp": 0.29330555, "epoch": 0.5946189688862168, "flos": 20446494710400.0, "grad_norm": 7.9870982348630415, "language_loss": 0.77629948, "learning_rate": 1.4898582568092154e-06, "loss": 0.79409206, "num_input_tokens_seen": 213057160, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.33520508, "step": 9890, "time_per_iteration": 2.6729488372802734 }, { "auxiliary_loss_clip": 0.0147472, "auxiliary_loss_mlp": 0.00323432, "balance_loss_clip": 1.19880402, "balance_loss_mlp": 0.28821808, "epoch": 0.5946790921388847, "flos": 13437521850240.0, "grad_norm": 19.579375794245728, "language_loss": 0.76960659, "learning_rate": 1.489481687275691e-06, "loss": 0.78758812, "num_input_tokens_seen": 213073630, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.35253906, "step": 9891, "time_per_iteration": 2.6554949283599854 }, { "auxiliary_loss_clip": 0.01481012, "auxiliary_loss_mlp": 0.00333118, "balance_loss_clip": 1.20727229, "balance_loss_mlp": 0.29911971, "epoch": 0.5947392153915527, "flos": 20412272027520.0, "grad_norm": 56.04809253947543, "language_loss": 0.62161183, "learning_rate": 1.4891051371010726e-06, "loss": 0.6397531, "num_input_tokens_seen": 213092450, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.33984375, "step": 9892, "time_per_iteration": 2.657518148422241 }, { "auxiliary_loss_clip": 0.01482562, "auxiliary_loss_mlp": 0.00118518, "balance_loss_clip": 1.26126361, "balance_loss_mlp": 0.10902866, "epoch": 0.5947993386442206, "flos": 65619138994560.0, "grad_norm": 0.8265870626653891, "language_loss": 0.53819823, "learning_rate": 1.4887286062996375e-06, "loss": 0.55420905, "num_input_tokens_seen": 213155465, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.09472656, "step": 9893, "time_per_iteration": 3.250304937362671 }, { "auxiliary_loss_clip": 0.01447285, "auxiliary_loss_mlp": 0.00301192, "balance_loss_clip": 1.18700457, "balance_loss_mlp": 0.26790896, "epoch": 0.5948594618968887, "flos": 23183103168000.0, "grad_norm": 7.123869544728069, "language_loss": 0.79582244, "learning_rate": 1.4883520948856658e-06, "loss": 0.81330723, "num_input_tokens_seen": 213174875, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.33276367, "step": 9894, "time_per_iteration": 2.747856378555298 }, { "auxiliary_loss_clip": 0.01482984, "auxiliary_loss_mlp": 0.00296263, "balance_loss_clip": 1.20525575, "balance_loss_mlp": 0.26028556, "epoch": 0.5949195851495566, "flos": 13626771632640.0, "grad_norm": 42.942595792101834, "language_loss": 0.85340583, "learning_rate": 1.487975602873434e-06, "loss": 0.8711983, "num_input_tokens_seen": 213192695, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.35961914, "step": 9895, "time_per_iteration": 2.7213644981384277 }, { "auxiliary_loss_clip": 0.01478039, "auxiliary_loss_mlp": 0.00315565, "balance_loss_clip": 1.19904673, "balance_loss_mlp": 0.28247282, "epoch": 0.5949797084022246, "flos": 19751012599680.0, "grad_norm": 51.08169300506529, "language_loss": 0.84464556, "learning_rate": 1.4875991302772182e-06, "loss": 0.86258161, "num_input_tokens_seen": 213211195, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.33105469, "step": 9896, "time_per_iteration": 4.1640942096710205 }, { "auxiliary_loss_clip": 0.01486806, "auxiliary_loss_mlp": 0.00324128, "balance_loss_clip": 1.20925093, "balance_loss_mlp": 0.28757837, "epoch": 0.5950398316548925, "flos": 25773878407680.0, "grad_norm": 24.91795900956461, "language_loss": 0.91103399, "learning_rate": 1.4872226771112954e-06, "loss": 0.92914331, "num_input_tokens_seen": 213231975, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.36523438, "step": 9897, "time_per_iteration": 4.181141138076782 }, { "auxiliary_loss_clip": 0.01483905, "auxiliary_loss_mlp": 0.00311989, "balance_loss_clip": 1.20670164, "balance_loss_mlp": 0.27815759, "epoch": 0.5950999549075605, "flos": 23039029716480.0, "grad_norm": 15.449107622286826, "language_loss": 0.79886782, "learning_rate": 1.486846243389939e-06, "loss": 0.8168267, "num_input_tokens_seen": 213249760, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.33837891, "step": 9898, "time_per_iteration": 2.6774189472198486 }, { "auxiliary_loss_clip": 0.01470975, "auxiliary_loss_mlp": 0.00348705, "balance_loss_clip": 1.1919241, "balance_loss_mlp": 0.31115448, "epoch": 0.5951600781602284, "flos": 32446367637120.0, "grad_norm": 307.6613662600995, "language_loss": 0.72438848, "learning_rate": 1.4864698291274251e-06, "loss": 0.7425853, "num_input_tokens_seen": 213269890, "router_z_loss_clip": 2.79296875, "router_z_loss_mlp": 0.37573242, "step": 9899, "time_per_iteration": 2.756307363510132 }, { "auxiliary_loss_clip": 0.01495174, "auxiliary_loss_mlp": 0.00354505, "balance_loss_clip": 1.21591401, "balance_loss_mlp": 0.31895661, "epoch": 0.5952202014128964, "flos": 23800874204160.0, "grad_norm": 6.670110021337595, "language_loss": 0.78000456, "learning_rate": 1.4860934343380267e-06, "loss": 0.79850131, "num_input_tokens_seen": 213289400, "router_z_loss_clip": 2.79492188, "router_z_loss_mlp": 0.35571289, "step": 9900, "time_per_iteration": 2.695301055908203 }, { "auxiliary_loss_clip": 0.01461183, "auxiliary_loss_mlp": 0.00308865, "balance_loss_clip": 1.19379473, "balance_loss_mlp": 0.2756291, "epoch": 0.5952803246655644, "flos": 22492182084480.0, "grad_norm": 56.51674674973559, "language_loss": 0.90799189, "learning_rate": 1.4857170590360169e-06, "loss": 0.92569232, "num_input_tokens_seen": 213308040, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.33227539, "step": 9901, "time_per_iteration": 4.123290777206421 }, { "auxiliary_loss_clip": 0.01488879, "auxiliary_loss_mlp": 0.00124454, "balance_loss_clip": 1.27152586, "balance_loss_mlp": 0.11458335, "epoch": 0.5953404479182324, "flos": 51234688851840.0, "grad_norm": 0.8002927281896086, "language_loss": 0.57274765, "learning_rate": 1.4853407032356674e-06, "loss": 0.58888096, "num_input_tokens_seen": 213358585, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.09863281, "step": 9902, "time_per_iteration": 3.052156686782837 }, { "auxiliary_loss_clip": 0.01471812, "auxiliary_loss_mlp": 0.00290761, "balance_loss_clip": 1.19754338, "balance_loss_mlp": 0.25936151, "epoch": 0.5954005711709004, "flos": 23112682554240.0, "grad_norm": 14.619705455726475, "language_loss": 0.85779274, "learning_rate": 1.4849643669512503e-06, "loss": 0.87541848, "num_input_tokens_seen": 213379585, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.31420898, "step": 9903, "time_per_iteration": 2.6832470893859863 }, { "auxiliary_loss_clip": 0.01453279, "auxiliary_loss_mlp": 0.00315017, "balance_loss_clip": 1.18439054, "balance_loss_mlp": 0.28290188, "epoch": 0.5954606944235683, "flos": 35954732736000.0, "grad_norm": 35.9819914518803, "language_loss": 0.83732986, "learning_rate": 1.4845880501970362e-06, "loss": 0.85501277, "num_input_tokens_seen": 213401465, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.32104492, "step": 9904, "time_per_iteration": 2.787508249282837 }, { "auxiliary_loss_clip": 0.01443926, "auxiliary_loss_mlp": 0.00310453, "balance_loss_clip": 1.17707038, "balance_loss_mlp": 0.27688342, "epoch": 0.5955208176762363, "flos": 30443665864320.0, "grad_norm": 2.13467934450799, "language_loss": 0.79422855, "learning_rate": 1.4842117529872942e-06, "loss": 0.81177229, "num_input_tokens_seen": 213422720, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.33569336, "step": 9905, "time_per_iteration": 2.7255477905273438 }, { "auxiliary_loss_clip": 0.01476949, "auxiliary_loss_mlp": 0.00311149, "balance_loss_clip": 1.19626307, "balance_loss_mlp": 0.27736464, "epoch": 0.5955809409289042, "flos": 17640112083840.0, "grad_norm": 327.88387189350925, "language_loss": 0.77459538, "learning_rate": 1.483835475336295e-06, "loss": 0.79247636, "num_input_tokens_seen": 213439480, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.33789062, "step": 9906, "time_per_iteration": 4.038356065750122 }, { "auxiliary_loss_clip": 0.01429177, "auxiliary_loss_mlp": 0.0032698, "balance_loss_clip": 1.16824639, "balance_loss_mlp": 0.29565141, "epoch": 0.5956410641815723, "flos": 24279887001600.0, "grad_norm": 25.02617395169087, "language_loss": 0.82176322, "learning_rate": 1.4834592172583057e-06, "loss": 0.83932477, "num_input_tokens_seen": 213458895, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.31323242, "step": 9907, "time_per_iteration": 2.7398808002471924 }, { "auxiliary_loss_clip": 0.01456778, "auxiliary_loss_mlp": 0.00341845, "balance_loss_clip": 1.18769634, "balance_loss_mlp": 0.30753624, "epoch": 0.5957011874342402, "flos": 35734277013120.0, "grad_norm": 13.094683824411073, "language_loss": 0.73516726, "learning_rate": 1.483082978767595e-06, "loss": 0.7531535, "num_input_tokens_seen": 213481730, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.34277344, "step": 9908, "time_per_iteration": 2.8069729804992676 }, { "auxiliary_loss_clip": 0.01440939, "auxiliary_loss_mlp": 0.00312897, "balance_loss_clip": 1.18013096, "balance_loss_mlp": 0.28092489, "epoch": 0.5957613106869082, "flos": 21245004005760.0, "grad_norm": 8.70924690554465, "language_loss": 0.8347438, "learning_rate": 1.4827067598784298e-06, "loss": 0.85228217, "num_input_tokens_seen": 213497225, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.31933594, "step": 9909, "time_per_iteration": 2.6808347702026367 }, { "auxiliary_loss_clip": 0.01474586, "auxiliary_loss_mlp": 0.001227, "balance_loss_clip": 1.26097393, "balance_loss_mlp": 0.11344907, "epoch": 0.5958214339395761, "flos": 65940969876480.0, "grad_norm": 0.8983636155675322, "language_loss": 0.72977996, "learning_rate": 1.4823305606050753e-06, "loss": 0.74575281, "num_input_tokens_seen": 213556890, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.09228516, "step": 9910, "time_per_iteration": 3.2588815689086914 }, { "auxiliary_loss_clip": 0.01446701, "auxiliary_loss_mlp": 0.00298026, "balance_loss_clip": 1.17794657, "balance_loss_mlp": 0.26371709, "epoch": 0.5958815571922441, "flos": 23218690567680.0, "grad_norm": 233.59273087777117, "language_loss": 0.76708531, "learning_rate": 1.481954380961799e-06, "loss": 0.78453255, "num_input_tokens_seen": 213575800, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.34326172, "step": 9911, "time_per_iteration": 2.763695240020752 }, { "auxiliary_loss_clip": 0.01474253, "auxiliary_loss_mlp": 0.0031906, "balance_loss_clip": 1.19577897, "balance_loss_mlp": 0.28153306, "epoch": 0.595941680444912, "flos": 16538623568640.0, "grad_norm": 95.13354104799065, "language_loss": 0.76984066, "learning_rate": 1.4815782209628631e-06, "loss": 0.78777379, "num_input_tokens_seen": 213592740, "router_z_loss_clip": 2.78710938, "router_z_loss_mlp": 0.37548828, "step": 9912, "time_per_iteration": 2.6175808906555176 }, { "auxiliary_loss_clip": 0.01469708, "auxiliary_loss_mlp": 0.00325577, "balance_loss_clip": 1.19350398, "balance_loss_mlp": 0.29176968, "epoch": 0.59600180369758, "flos": 27818883423360.0, "grad_norm": 21.222269647692652, "language_loss": 0.80280912, "learning_rate": 1.4812020806225337e-06, "loss": 0.82076192, "num_input_tokens_seen": 213611970, "router_z_loss_clip": 2.76367188, "router_z_loss_mlp": 0.33813477, "step": 9913, "time_per_iteration": 2.7510123252868652 }, { "auxiliary_loss_clip": 0.01457527, "auxiliary_loss_mlp": 0.00306454, "balance_loss_clip": 1.18296623, "balance_loss_mlp": 0.26935607, "epoch": 0.596061926950248, "flos": 29491566013440.0, "grad_norm": 595.4558384819093, "language_loss": 0.87530154, "learning_rate": 1.4808259599550738e-06, "loss": 0.89294136, "num_input_tokens_seen": 213632230, "router_z_loss_clip": 2.74804688, "router_z_loss_mlp": 0.37084961, "step": 9914, "time_per_iteration": 2.716421604156494 }, { "auxiliary_loss_clip": 0.0145665, "auxiliary_loss_mlp": 0.00283378, "balance_loss_clip": 1.18516803, "balance_loss_mlp": 0.24961753, "epoch": 0.596122050202916, "flos": 16836790366080.0, "grad_norm": 14.853017466440692, "language_loss": 0.75270629, "learning_rate": 1.4804498589747448e-06, "loss": 0.77010655, "num_input_tokens_seen": 213649645, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.33740234, "step": 9915, "time_per_iteration": 2.631695508956909 }, { "auxiliary_loss_clip": 0.01460329, "auxiliary_loss_mlp": 0.00302083, "balance_loss_clip": 1.18636656, "balance_loss_mlp": 0.26934832, "epoch": 0.596182173455584, "flos": 20996646393600.0, "grad_norm": 13.77119772684494, "language_loss": 0.86788517, "learning_rate": 1.4800737776958095e-06, "loss": 0.88550931, "num_input_tokens_seen": 213668850, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.32739258, "step": 9916, "time_per_iteration": 2.70361328125 }, { "auxiliary_loss_clip": 0.0144133, "auxiliary_loss_mlp": 0.00313288, "balance_loss_clip": 1.17290998, "balance_loss_mlp": 0.28071958, "epoch": 0.5962422967082519, "flos": 16065680169600.0, "grad_norm": 39.081681970522034, "language_loss": 0.91093624, "learning_rate": 1.4796977161325286e-06, "loss": 0.92848241, "num_input_tokens_seen": 213685695, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.32568359, "step": 9917, "time_per_iteration": 2.824878692626953 }, { "auxiliary_loss_clip": 0.01423408, "auxiliary_loss_mlp": 0.00324617, "balance_loss_clip": 1.16405261, "balance_loss_mlp": 0.29393274, "epoch": 0.5963024199609199, "flos": 12166966995840.0, "grad_norm": 23.479340317551916, "language_loss": 0.85183465, "learning_rate": 1.4793216742991625e-06, "loss": 0.86931491, "num_input_tokens_seen": 213703515, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.30688477, "step": 9918, "time_per_iteration": 2.694160223007202 }, { "auxiliary_loss_clip": 0.01459415, "auxiliary_loss_mlp": 0.00293097, "balance_loss_clip": 1.18749583, "balance_loss_mlp": 0.26100564, "epoch": 0.5963625432135878, "flos": 28074280101120.0, "grad_norm": 3.660428487562262, "language_loss": 0.83240718, "learning_rate": 1.4789456522099707e-06, "loss": 0.84993231, "num_input_tokens_seen": 213724170, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.32104492, "step": 9919, "time_per_iteration": 2.71893310546875 }, { "auxiliary_loss_clip": 0.01463309, "auxiliary_loss_mlp": 0.00280284, "balance_loss_clip": 1.1943146, "balance_loss_mlp": 0.24564192, "epoch": 0.5964226664662559, "flos": 19860324664320.0, "grad_norm": 9.206648454916273, "language_loss": 0.85766345, "learning_rate": 1.4785696498792122e-06, "loss": 0.87509936, "num_input_tokens_seen": 213740620, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.34619141, "step": 9920, "time_per_iteration": 2.702322006225586 }, { "auxiliary_loss_clip": 0.01455997, "auxiliary_loss_mlp": 0.00296578, "balance_loss_clip": 1.18783855, "balance_loss_mlp": 0.26052916, "epoch": 0.5964827897189238, "flos": 12932618325120.0, "grad_norm": 4.9602620581988, "language_loss": 0.89488453, "learning_rate": 1.4781936673211446e-06, "loss": 0.91241026, "num_input_tokens_seen": 213755390, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.3605957, "step": 9921, "time_per_iteration": 2.638015031814575 }, { "auxiliary_loss_clip": 0.01452564, "auxiliary_loss_mlp": 0.00296444, "balance_loss_clip": 1.18512297, "balance_loss_mlp": 0.26437643, "epoch": 0.5965429129715918, "flos": 18150797698560.0, "grad_norm": 40.96286559492331, "language_loss": 0.90230548, "learning_rate": 1.4778177045500252e-06, "loss": 0.91979557, "num_input_tokens_seen": 213773225, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.32080078, "step": 9922, "time_per_iteration": 2.6307971477508545 }, { "auxiliary_loss_clip": 0.01461636, "auxiliary_loss_mlp": 0.00287726, "balance_loss_clip": 1.19154572, "balance_loss_mlp": 0.25325069, "epoch": 0.5966030362242597, "flos": 21763231476480.0, "grad_norm": 3.0017816044522228, "language_loss": 0.84551954, "learning_rate": 1.477441761580111e-06, "loss": 0.86301315, "num_input_tokens_seen": 213791860, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.34472656, "step": 9923, "time_per_iteration": 2.6804938316345215 }, { "auxiliary_loss_clip": 0.01469778, "auxiliary_loss_mlp": 0.00296416, "balance_loss_clip": 1.18920064, "balance_loss_mlp": 0.26027143, "epoch": 0.5966631594769277, "flos": 18807208790400.0, "grad_norm": 2.484071779984118, "language_loss": 0.8344422, "learning_rate": 1.4770658384256573e-06, "loss": 0.85210413, "num_input_tokens_seen": 213809455, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.36132812, "step": 9924, "time_per_iteration": 2.677790403366089 }, { "auxiliary_loss_clip": 0.01462063, "auxiliary_loss_mlp": 0.00300946, "balance_loss_clip": 1.19376004, "balance_loss_mlp": 0.26747221, "epoch": 0.5967232827295956, "flos": 14064163545600.0, "grad_norm": 31.27193902091006, "language_loss": 0.72972357, "learning_rate": 1.4766899351009204e-06, "loss": 0.74735373, "num_input_tokens_seen": 213826615, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.3347168, "step": 9925, "time_per_iteration": 2.660032033920288 }, { "auxiliary_loss_clip": 0.01448111, "auxiliary_loss_mlp": 0.00264419, "balance_loss_clip": 1.1813519, "balance_loss_mlp": 0.23378187, "epoch": 0.5967834059822636, "flos": 17238235743360.0, "grad_norm": 2.702343106927284, "language_loss": 0.79081291, "learning_rate": 1.4763140516201528e-06, "loss": 0.80793822, "num_input_tokens_seen": 213844495, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.30639648, "step": 9926, "time_per_iteration": 2.684760808944702 }, { "auxiliary_loss_clip": 0.0147669, "auxiliary_loss_mlp": 0.00312011, "balance_loss_clip": 1.19738019, "balance_loss_mlp": 0.27708304, "epoch": 0.5968435292349316, "flos": 42520244284800.0, "grad_norm": 39.28841890999236, "language_loss": 0.78596556, "learning_rate": 1.4759381879976088e-06, "loss": 0.80385256, "num_input_tokens_seen": 213869125, "router_z_loss_clip": 2.79296875, "router_z_loss_mlp": 0.34912109, "step": 9927, "time_per_iteration": 2.8469736576080322 }, { "auxiliary_loss_clip": 0.01466686, "auxiliary_loss_mlp": 0.00311002, "balance_loss_clip": 1.18580115, "balance_loss_mlp": 0.2729502, "epoch": 0.5969036524875996, "flos": 37630898945280.0, "grad_norm": 18.239045443682112, "language_loss": 0.72351992, "learning_rate": 1.4755623442475415e-06, "loss": 0.74129683, "num_input_tokens_seen": 213891115, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.38012695, "step": 9928, "time_per_iteration": 2.787754535675049 }, { "auxiliary_loss_clip": 0.01458785, "auxiliary_loss_mlp": 0.00288509, "balance_loss_clip": 1.1890806, "balance_loss_mlp": 0.25591731, "epoch": 0.5969637757402676, "flos": 23148377694720.0, "grad_norm": 2.221003982786883, "language_loss": 0.75289595, "learning_rate": 1.4751865203842022e-06, "loss": 0.77036893, "num_input_tokens_seen": 213911925, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.32617188, "step": 9929, "time_per_iteration": 2.742732048034668 }, { "auxiliary_loss_clip": 0.01459918, "auxiliary_loss_mlp": 0.00280982, "balance_loss_clip": 1.1908114, "balance_loss_mlp": 0.24901041, "epoch": 0.5970238989929355, "flos": 24020934877440.0, "grad_norm": 9.137605278035835, "language_loss": 0.85337186, "learning_rate": 1.4748107164218431e-06, "loss": 0.87078089, "num_input_tokens_seen": 213930715, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.31982422, "step": 9930, "time_per_iteration": 2.6651439666748047 }, { "auxiliary_loss_clip": 0.01484478, "auxiliary_loss_mlp": 0.00287561, "balance_loss_clip": 1.20217323, "balance_loss_mlp": 0.25098789, "epoch": 0.5970840222456035, "flos": 19426883247360.0, "grad_norm": 10.600629686384623, "language_loss": 0.78108352, "learning_rate": 1.4744349323747146e-06, "loss": 0.79880393, "num_input_tokens_seen": 213950015, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.36572266, "step": 9931, "time_per_iteration": 2.676302909851074 }, { "auxiliary_loss_clip": 0.0149596, "auxiliary_loss_mlp": 0.00119435, "balance_loss_clip": 1.27994561, "balance_loss_mlp": 0.10942179, "epoch": 0.5971441454982714, "flos": 62976615235200.0, "grad_norm": 0.8466451225774433, "language_loss": 0.64000475, "learning_rate": 1.474059168257065e-06, "loss": 0.65615869, "num_input_tokens_seen": 214003330, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.10009766, "step": 9932, "time_per_iteration": 3.1133086681365967 }, { "auxiliary_loss_clip": 0.01441198, "auxiliary_loss_mlp": 0.00319535, "balance_loss_clip": 1.17533517, "balance_loss_mlp": 0.2870152, "epoch": 0.5972042687509395, "flos": 20266223328000.0, "grad_norm": 12.997371451895141, "language_loss": 0.80515039, "learning_rate": 1.4736834240831454e-06, "loss": 0.82275772, "num_input_tokens_seen": 214021680, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.32519531, "step": 9933, "time_per_iteration": 2.686845302581787 }, { "auxiliary_loss_clip": 0.01494077, "auxiliary_loss_mlp": 0.00152225, "balance_loss_clip": 1.27738965, "balance_loss_mlp": 0.14197296, "epoch": 0.5972643920036074, "flos": 71652383832960.0, "grad_norm": 0.6462812723430682, "language_loss": 0.51303875, "learning_rate": 1.473307699867203e-06, "loss": 0.5295018, "num_input_tokens_seen": 214090265, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.10253906, "step": 9934, "time_per_iteration": 3.2511589527130127 }, { "auxiliary_loss_clip": 0.01492282, "auxiliary_loss_mlp": 0.00194215, "balance_loss_clip": 1.27973986, "balance_loss_mlp": 0.18215099, "epoch": 0.5973245152562754, "flos": 56892702263040.0, "grad_norm": 0.8183256918868284, "language_loss": 0.54035115, "learning_rate": 1.4729319956234849e-06, "loss": 0.55721611, "num_input_tokens_seen": 214146375, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.12060547, "step": 9935, "time_per_iteration": 3.091986894607544 }, { "auxiliary_loss_clip": 0.01436304, "auxiliary_loss_mlp": 0.00306821, "balance_loss_clip": 1.16892934, "balance_loss_mlp": 0.27296591, "epoch": 0.5973846385089433, "flos": 24164361884160.0, "grad_norm": 31.703535566255717, "language_loss": 0.7262153, "learning_rate": 1.4725563113662394e-06, "loss": 0.74364656, "num_input_tokens_seen": 214165340, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.33862305, "step": 9936, "time_per_iteration": 2.7064056396484375 }, { "auxiliary_loss_clip": 0.01456928, "auxiliary_loss_mlp": 0.00306356, "balance_loss_clip": 1.18344986, "balance_loss_mlp": 0.27230954, "epoch": 0.5974447617616113, "flos": 17670599752320.0, "grad_norm": 12.159980165547088, "language_loss": 0.74810386, "learning_rate": 1.4721806471097103e-06, "loss": 0.7657367, "num_input_tokens_seen": 214181360, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.34033203, "step": 9937, "time_per_iteration": 2.6277570724487305 }, { "auxiliary_loss_clip": 0.01474976, "auxiliary_loss_mlp": 0.00298282, "balance_loss_clip": 1.19389868, "balance_loss_mlp": 0.26421177, "epoch": 0.5975048850142792, "flos": 22892514140160.0, "grad_norm": 6.048507761854739, "language_loss": 0.87313974, "learning_rate": 1.4718050028681442e-06, "loss": 0.8908723, "num_input_tokens_seen": 214198525, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.34057617, "step": 9938, "time_per_iteration": 4.109168291091919 }, { "auxiliary_loss_clip": 0.01471874, "auxiliary_loss_mlp": 0.00317522, "balance_loss_clip": 1.20053542, "balance_loss_mlp": 0.28233147, "epoch": 0.5975650082669473, "flos": 24353108876160.0, "grad_norm": 23.20886952459121, "language_loss": 0.82296008, "learning_rate": 1.4714293786557855e-06, "loss": 0.84085405, "num_input_tokens_seen": 214218710, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.35205078, "step": 9939, "time_per_iteration": 2.8293802738189697 }, { "auxiliary_loss_clip": 0.01459299, "auxiliary_loss_mlp": 0.00286161, "balance_loss_clip": 1.17672634, "balance_loss_mlp": 0.25085166, "epoch": 0.5976251315196152, "flos": 20923352691840.0, "grad_norm": 24.050588997409434, "language_loss": 0.80364656, "learning_rate": 1.471053774486878e-06, "loss": 0.82110119, "num_input_tokens_seen": 214237800, "router_z_loss_clip": 2.82617188, "router_z_loss_mlp": 0.35302734, "step": 9940, "time_per_iteration": 4.1443095207214355 }, { "auxiliary_loss_clip": 0.01441628, "auxiliary_loss_mlp": 0.00312594, "balance_loss_clip": 1.18044686, "balance_loss_mlp": 0.28019306, "epoch": 0.5976852547722832, "flos": 35844594658560.0, "grad_norm": 22.760262060806372, "language_loss": 0.75501961, "learning_rate": 1.470678190375664e-06, "loss": 0.77256185, "num_input_tokens_seen": 214260355, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.32397461, "step": 9941, "time_per_iteration": 2.7739100456237793 }, { "auxiliary_loss_clip": 0.01430977, "auxiliary_loss_mlp": 0.00278607, "balance_loss_clip": 1.16526663, "balance_loss_mlp": 0.24737376, "epoch": 0.5977453780249512, "flos": 12855948744960.0, "grad_norm": 17.71245679925724, "language_loss": 0.850465, "learning_rate": 1.470302626336386e-06, "loss": 0.86756086, "num_input_tokens_seen": 214277120, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.31201172, "step": 9942, "time_per_iteration": 2.6258857250213623 }, { "auxiliary_loss_clip": 0.01451314, "auxiliary_loss_mlp": 0.00303559, "balance_loss_clip": 1.17525077, "balance_loss_mlp": 0.27096742, "epoch": 0.5978055012776191, "flos": 20959155573120.0, "grad_norm": 7.992167556418348, "language_loss": 0.80958295, "learning_rate": 1.4699270823832857e-06, "loss": 0.82713175, "num_input_tokens_seen": 214295300, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.32592773, "step": 9943, "time_per_iteration": 4.0693888664245605 }, { "auxiliary_loss_clip": 0.01440144, "auxiliary_loss_mlp": 0.0029277, "balance_loss_clip": 1.17322516, "balance_loss_mlp": 0.25996354, "epoch": 0.5978656245302871, "flos": 34058003063040.0, "grad_norm": 7.422623337657951, "language_loss": 0.69532597, "learning_rate": 1.4695515585306032e-06, "loss": 0.71265513, "num_input_tokens_seen": 214317050, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.328125, "step": 9944, "time_per_iteration": 2.7847647666931152 }, { "auxiliary_loss_clip": 0.01451754, "auxiliary_loss_mlp": 0.00261823, "balance_loss_clip": 1.18276691, "balance_loss_mlp": 0.2292784, "epoch": 0.597925747782955, "flos": 37373275624320.0, "grad_norm": 22.0019708187025, "language_loss": 0.79236156, "learning_rate": 1.4691760547925795e-06, "loss": 0.8094973, "num_input_tokens_seen": 214337470, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.32519531, "step": 9945, "time_per_iteration": 2.856948137283325 }, { "auxiliary_loss_clip": 0.01439302, "auxiliary_loss_mlp": 0.00283254, "balance_loss_clip": 1.17165434, "balance_loss_mlp": 0.24887398, "epoch": 0.5979858710356231, "flos": 25374803328000.0, "grad_norm": 5.517424661009387, "language_loss": 0.76749527, "learning_rate": 1.4688005711834522e-06, "loss": 0.78472078, "num_input_tokens_seen": 214357975, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.34375, "step": 9946, "time_per_iteration": 2.675006628036499 }, { "auxiliary_loss_clip": 0.01462467, "auxiliary_loss_mlp": 0.00271614, "balance_loss_clip": 1.18352222, "balance_loss_mlp": 0.23651902, "epoch": 0.598045994288291, "flos": 13698413308800.0, "grad_norm": 8.167560815739536, "language_loss": 0.96883655, "learning_rate": 1.468425107717461e-06, "loss": 0.98617738, "num_input_tokens_seen": 214374125, "router_z_loss_clip": 2.79101562, "router_z_loss_mlp": 0.35107422, "step": 9947, "time_per_iteration": 2.607351064682007 }, { "auxiliary_loss_clip": 0.01428167, "auxiliary_loss_mlp": 0.00285363, "balance_loss_clip": 1.17003906, "balance_loss_mlp": 0.25507158, "epoch": 0.598106117540959, "flos": 21981352815360.0, "grad_norm": 43.451970692821355, "language_loss": 0.80415773, "learning_rate": 1.4680496644088432e-06, "loss": 0.82129306, "num_input_tokens_seen": 214393395, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.30310059, "step": 9948, "time_per_iteration": 4.051477670669556 }, { "auxiliary_loss_clip": 0.01442361, "auxiliary_loss_mlp": 0.00329912, "balance_loss_clip": 1.16869009, "balance_loss_mlp": 0.29526973, "epoch": 0.5981662407936269, "flos": 20559362221440.0, "grad_norm": 156.0013169781479, "language_loss": 0.96911025, "learning_rate": 1.4676742412718347e-06, "loss": 0.98683298, "num_input_tokens_seen": 214411550, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.34667969, "step": 9949, "time_per_iteration": 2.680999994277954 }, { "auxiliary_loss_clip": 0.01399549, "auxiliary_loss_mlp": 0.00266415, "balance_loss_clip": 1.14603794, "balance_loss_mlp": 0.23782897, "epoch": 0.5982263640462949, "flos": 14063840323200.0, "grad_norm": 31.737759764207784, "language_loss": 0.76224053, "learning_rate": 1.467298838320673e-06, "loss": 0.77890027, "num_input_tokens_seen": 214429780, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.28588867, "step": 9950, "time_per_iteration": 2.650062084197998 }, { "auxiliary_loss_clip": 0.01411719, "auxiliary_loss_mlp": 0.00287219, "balance_loss_clip": 1.15177262, "balance_loss_mlp": 0.25367391, "epoch": 0.5982864872989628, "flos": 17707228646400.0, "grad_norm": 7.342001567037778, "language_loss": 0.84373963, "learning_rate": 1.4669234555695921e-06, "loss": 0.86072898, "num_input_tokens_seen": 214447775, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.33544922, "step": 9951, "time_per_iteration": 2.673225164413452 }, { "auxiliary_loss_clip": 0.01452765, "auxiliary_loss_mlp": 0.00291251, "balance_loss_clip": 1.17769158, "balance_loss_mlp": 0.25408131, "epoch": 0.5983466105516309, "flos": 16764789553920.0, "grad_norm": 5.413803811957908, "language_loss": 0.80164111, "learning_rate": 1.4665480930328275e-06, "loss": 0.81908131, "num_input_tokens_seen": 214467245, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.37207031, "step": 9952, "time_per_iteration": 2.6738555431365967 }, { "auxiliary_loss_clip": 0.01409831, "auxiliary_loss_mlp": 0.00325798, "balance_loss_clip": 1.14393866, "balance_loss_mlp": 0.29032165, "epoch": 0.5984067338042988, "flos": 20042714949120.0, "grad_norm": 24.20598760213838, "language_loss": 0.88844258, "learning_rate": 1.466172750724613e-06, "loss": 0.90579885, "num_input_tokens_seen": 214484385, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.35449219, "step": 9953, "time_per_iteration": 2.6685738563537598 }, { "auxiliary_loss_clip": 0.0141559, "auxiliary_loss_mlp": 0.00310054, "balance_loss_clip": 1.15498137, "balance_loss_mlp": 0.27600789, "epoch": 0.5984668570569668, "flos": 26319900026880.0, "grad_norm": 34.59490127669727, "language_loss": 0.75923145, "learning_rate": 1.4657974286591807e-06, "loss": 0.77648795, "num_input_tokens_seen": 214503465, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.34033203, "step": 9954, "time_per_iteration": 2.7091217041015625 }, { "auxiliary_loss_clip": 0.01421775, "auxiliary_loss_mlp": 0.00294897, "balance_loss_clip": 1.15674758, "balance_loss_mlp": 0.2621623, "epoch": 0.5985269803096348, "flos": 20593728558720.0, "grad_norm": 15.342614143934, "language_loss": 0.78620476, "learning_rate": 1.4654221268507637e-06, "loss": 0.80337143, "num_input_tokens_seen": 214520725, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.32739258, "step": 9955, "time_per_iteration": 2.656789541244507 }, { "auxiliary_loss_clip": 0.01425061, "auxiliary_loss_mlp": 0.00271302, "balance_loss_clip": 1.16198909, "balance_loss_mlp": 0.23830506, "epoch": 0.5985871035623027, "flos": 26865382942080.0, "grad_norm": 8.58997471027159, "language_loss": 0.72642851, "learning_rate": 1.4650468453135934e-06, "loss": 0.74339211, "num_input_tokens_seen": 214540675, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.32983398, "step": 9956, "time_per_iteration": 2.7821648120880127 }, { "auxiliary_loss_clip": 0.01426443, "auxiliary_loss_mlp": 0.002887, "balance_loss_clip": 1.15773392, "balance_loss_mlp": 0.25555944, "epoch": 0.5986472268149707, "flos": 19609704495360.0, "grad_norm": 4.9674951190536225, "language_loss": 0.8141045, "learning_rate": 1.4646715840618999e-06, "loss": 0.83125591, "num_input_tokens_seen": 214559910, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.33154297, "step": 9957, "time_per_iteration": 2.6411690711975098 }, { "auxiliary_loss_clip": 0.01420599, "auxiliary_loss_mlp": 0.00289234, "balance_loss_clip": 1.1601181, "balance_loss_mlp": 0.25747705, "epoch": 0.5987073500676386, "flos": 21794616984960.0, "grad_norm": 127.06982704033746, "language_loss": 0.9068898, "learning_rate": 1.4642963431099138e-06, "loss": 0.9239881, "num_input_tokens_seen": 214575960, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.31726074, "step": 9958, "time_per_iteration": 2.6876919269561768 }, { "auxiliary_loss_clip": 0.01412053, "auxiliary_loss_mlp": 0.00314302, "balance_loss_clip": 1.14579391, "balance_loss_mlp": 0.2793501, "epoch": 0.5987674733203067, "flos": 24314361079680.0, "grad_norm": 4.6597231288866725, "language_loss": 0.7479493, "learning_rate": 1.463921122471864e-06, "loss": 0.76521283, "num_input_tokens_seen": 214594230, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.34985352, "step": 9959, "time_per_iteration": 2.703627109527588 }, { "auxiliary_loss_clip": 0.01412324, "auxiliary_loss_mlp": 0.00287018, "balance_loss_clip": 1.14995635, "balance_loss_mlp": 0.25657228, "epoch": 0.5988275965729746, "flos": 21320201128320.0, "grad_norm": 23.130679042259803, "language_loss": 0.89380878, "learning_rate": 1.4635459221619796e-06, "loss": 0.91080225, "num_input_tokens_seen": 214613130, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.30444336, "step": 9960, "time_per_iteration": 2.7357676029205322 }, { "auxiliary_loss_clip": 0.01419354, "auxiliary_loss_mlp": 0.0030443, "balance_loss_clip": 1.1552521, "balance_loss_mlp": 0.27338773, "epoch": 0.5988877198256426, "flos": 25118041933440.0, "grad_norm": 47.95128779185925, "language_loss": 0.85555208, "learning_rate": 1.4631707421944868e-06, "loss": 0.87278986, "num_input_tokens_seen": 214634470, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.31030273, "step": 9961, "time_per_iteration": 2.697101593017578 }, { "auxiliary_loss_clip": 0.0143416, "auxiliary_loss_mlp": 0.00320286, "balance_loss_clip": 1.16486359, "balance_loss_mlp": 0.2882427, "epoch": 0.5989478430783105, "flos": 26429104350720.0, "grad_norm": 117.63022810737114, "language_loss": 0.72965074, "learning_rate": 1.4627955825836136e-06, "loss": 0.74719524, "num_input_tokens_seen": 214654030, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.32055664, "step": 9962, "time_per_iteration": 2.6937830448150635 }, { "auxiliary_loss_clip": 0.01426123, "auxiliary_loss_mlp": 0.00304516, "balance_loss_clip": 1.16073608, "balance_loss_mlp": 0.27163777, "epoch": 0.5990079663309785, "flos": 25778439434880.0, "grad_norm": 30.52698421455057, "language_loss": 0.79537928, "learning_rate": 1.4624204433435857e-06, "loss": 0.81268567, "num_input_tokens_seen": 214676985, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.32861328, "step": 9963, "time_per_iteration": 2.8732857704162598 }, { "auxiliary_loss_clip": 0.01425132, "auxiliary_loss_mlp": 0.00298939, "balance_loss_clip": 1.15832818, "balance_loss_mlp": 0.26534623, "epoch": 0.5990680895836464, "flos": 36831779118720.0, "grad_norm": 4.817555347968191, "language_loss": 0.72408265, "learning_rate": 1.4620453244886281e-06, "loss": 0.74132335, "num_input_tokens_seen": 214700105, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.3359375, "step": 9964, "time_per_iteration": 2.797788381576538 }, { "auxiliary_loss_clip": 0.01417687, "auxiliary_loss_mlp": 0.00292045, "balance_loss_clip": 1.15923047, "balance_loss_mlp": 0.26097909, "epoch": 0.5991282128363145, "flos": 24133550993280.0, "grad_norm": 24.64264429824892, "language_loss": 0.83266008, "learning_rate": 1.4616702260329662e-06, "loss": 0.84975743, "num_input_tokens_seen": 214717885, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.31054688, "step": 9965, "time_per_iteration": 2.7253520488739014 }, { "auxiliary_loss_clip": 0.01425355, "auxiliary_loss_mlp": 0.00324647, "balance_loss_clip": 1.15585756, "balance_loss_mlp": 0.2913157, "epoch": 0.5991883360889824, "flos": 10304064956160.0, "grad_norm": 33.46685471446255, "language_loss": 0.84343249, "learning_rate": 1.4612951479908229e-06, "loss": 0.86093247, "num_input_tokens_seen": 214733680, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.33325195, "step": 9966, "time_per_iteration": 2.613053321838379 }, { "auxiliary_loss_clip": 0.01394039, "auxiliary_loss_mlp": 0.00298493, "balance_loss_clip": 1.13706374, "balance_loss_mlp": 0.26841682, "epoch": 0.5992484593416504, "flos": 23951196622080.0, "grad_norm": 37.94913086317696, "language_loss": 0.80649388, "learning_rate": 1.460920090376422e-06, "loss": 0.82341921, "num_input_tokens_seen": 214753285, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.30065918, "step": 9967, "time_per_iteration": 2.7463812828063965 }, { "auxiliary_loss_clip": 0.01432821, "auxiliary_loss_mlp": 0.003284, "balance_loss_clip": 1.15505719, "balance_loss_mlp": 0.29523596, "epoch": 0.5993085825943184, "flos": 11944105061760.0, "grad_norm": 4.399801749845032, "language_loss": 0.768875, "learning_rate": 1.4605450532039847e-06, "loss": 0.78648728, "num_input_tokens_seen": 214767810, "router_z_loss_clip": 2.77539062, "router_z_loss_mlp": 0.33154297, "step": 9968, "time_per_iteration": 2.6479532718658447 }, { "auxiliary_loss_clip": 0.01417193, "auxiliary_loss_mlp": 0.00369468, "balance_loss_clip": 1.1518054, "balance_loss_mlp": 0.33361012, "epoch": 0.5993687058469863, "flos": 19026838500480.0, "grad_norm": 4.410219532635535, "language_loss": 0.85895246, "learning_rate": 1.4601700364877334e-06, "loss": 0.87681901, "num_input_tokens_seen": 214786040, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.35864258, "step": 9969, "time_per_iteration": 2.7295374870300293 }, { "auxiliary_loss_clip": 0.01420789, "auxiliary_loss_mlp": 0.0031866, "balance_loss_clip": 1.15566957, "balance_loss_mlp": 0.28394622, "epoch": 0.5994288290996543, "flos": 14282967242880.0, "grad_norm": 8.359661781838778, "language_loss": 0.86651123, "learning_rate": 1.4597950402418889e-06, "loss": 0.88390577, "num_input_tokens_seen": 214803110, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.34716797, "step": 9970, "time_per_iteration": 2.6438724994659424 }, { "auxiliary_loss_clip": 0.01474053, "auxiliary_loss_mlp": 0.00348452, "balance_loss_clip": 1.19013047, "balance_loss_mlp": 0.31090146, "epoch": 0.5994889523523222, "flos": 19206643006080.0, "grad_norm": 9.018295407229688, "language_loss": 0.71078378, "learning_rate": 1.4594200644806697e-06, "loss": 0.72900885, "num_input_tokens_seen": 214819945, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.37548828, "step": 9971, "time_per_iteration": 2.685835123062134 }, { "auxiliary_loss_clip": 0.01414932, "auxiliary_loss_mlp": 0.0032318, "balance_loss_clip": 1.15447164, "balance_loss_mlp": 0.29223365, "epoch": 0.5995490756049903, "flos": 28037040675840.0, "grad_norm": 6759.724836545048, "language_loss": 0.84792924, "learning_rate": 1.4590451092182962e-06, "loss": 0.86531031, "num_input_tokens_seen": 214838810, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.30957031, "step": 9972, "time_per_iteration": 2.721536159515381 }, { "auxiliary_loss_clip": 0.01434767, "auxiliary_loss_mlp": 0.00337197, "balance_loss_clip": 1.15508723, "balance_loss_mlp": 0.30069494, "epoch": 0.5996091988576582, "flos": 29052953038080.0, "grad_norm": 20.14085245547083, "language_loss": 0.85426974, "learning_rate": 1.4586701744689864e-06, "loss": 0.87198937, "num_input_tokens_seen": 214857040, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.36499023, "step": 9973, "time_per_iteration": 2.746941566467285 }, { "auxiliary_loss_clip": 0.0141588, "auxiliary_loss_mlp": 0.00338812, "balance_loss_clip": 1.15154767, "balance_loss_mlp": 0.3056246, "epoch": 0.5996693221103262, "flos": 20813968800000.0, "grad_norm": 12.42926830487094, "language_loss": 0.74373686, "learning_rate": 1.4582952602469578e-06, "loss": 0.76128376, "num_input_tokens_seen": 214873375, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.33203125, "step": 9974, "time_per_iteration": 2.6495375633239746 }, { "auxiliary_loss_clip": 0.01413327, "auxiliary_loss_mlp": 0.00342228, "balance_loss_clip": 1.14885831, "balance_loss_mlp": 0.30954036, "epoch": 0.5997294453629941, "flos": 23768914078080.0, "grad_norm": 1.892862428779597, "language_loss": 0.8177439, "learning_rate": 1.457920366566428e-06, "loss": 0.83529943, "num_input_tokens_seen": 214893900, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.32702637, "step": 9975, "time_per_iteration": 2.732412815093994 }, { "auxiliary_loss_clip": 0.01436103, "auxiliary_loss_mlp": 0.00330124, "balance_loss_clip": 1.16395473, "balance_loss_mlp": 0.29715109, "epoch": 0.5997895686156621, "flos": 20960017499520.0, "grad_norm": 6.598833590115363, "language_loss": 0.86684233, "learning_rate": 1.457545493441611e-06, "loss": 0.88450456, "num_input_tokens_seen": 214912110, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.32983398, "step": 9976, "time_per_iteration": 2.6756153106689453 }, { "auxiliary_loss_clip": 0.01422332, "auxiliary_loss_mlp": 0.00320024, "balance_loss_clip": 1.15816283, "balance_loss_mlp": 0.28743201, "epoch": 0.59984969186833, "flos": 28365443746560.0, "grad_norm": 26.868308199820838, "language_loss": 0.84783989, "learning_rate": 1.4571706408867237e-06, "loss": 0.86526346, "num_input_tokens_seen": 214930140, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.32568359, "step": 9977, "time_per_iteration": 2.7380430698394775 }, { "auxiliary_loss_clip": 0.01415575, "auxiliary_loss_mlp": 0.00332755, "balance_loss_clip": 1.15214062, "balance_loss_mlp": 0.30049741, "epoch": 0.5999098151209981, "flos": 22565906749440.0, "grad_norm": 2.7700301138224472, "language_loss": 0.75697339, "learning_rate": 1.4567958089159802e-06, "loss": 0.77445674, "num_input_tokens_seen": 214949200, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.32275391, "step": 9978, "time_per_iteration": 2.693101644515991 }, { "auxiliary_loss_clip": 0.01421437, "auxiliary_loss_mlp": 0.00340515, "balance_loss_clip": 1.15556455, "balance_loss_mlp": 0.30735072, "epoch": 0.599969938373666, "flos": 18768712389120.0, "grad_norm": 94.87843972793368, "language_loss": 0.90451181, "learning_rate": 1.456420997543594e-06, "loss": 0.92213136, "num_input_tokens_seen": 214965775, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.33154297, "step": 9979, "time_per_iteration": 2.6999776363372803 }, { "auxiliary_loss_clip": 0.01407356, "auxiliary_loss_mlp": 0.00316183, "balance_loss_clip": 1.15129149, "balance_loss_mlp": 0.28572461, "epoch": 0.600030061626334, "flos": 11327231865600.0, "grad_norm": 33.330531075699845, "language_loss": 0.78634769, "learning_rate": 1.4560462067837782e-06, "loss": 0.80358303, "num_input_tokens_seen": 214982480, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.30480957, "step": 9980, "time_per_iteration": 2.6430375576019287 }, { "auxiliary_loss_clip": 0.01456948, "auxiliary_loss_mlp": 0.00367139, "balance_loss_clip": 1.17284119, "balance_loss_mlp": 0.32982603, "epoch": 0.600090184879002, "flos": 16578664254720.0, "grad_norm": 74.7111821112081, "language_loss": 0.81710738, "learning_rate": 1.4556714366507445e-06, "loss": 0.83534825, "num_input_tokens_seen": 214998110, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.37329102, "step": 9981, "time_per_iteration": 4.097184896469116 }, { "auxiliary_loss_clip": 0.01415323, "auxiliary_loss_mlp": 0.00331601, "balance_loss_clip": 1.15387344, "balance_loss_mlp": 0.29931867, "epoch": 0.6001503081316699, "flos": 23618627573760.0, "grad_norm": 21.386118672943045, "language_loss": 0.84527588, "learning_rate": 1.4552966871587048e-06, "loss": 0.86274511, "num_input_tokens_seen": 215017995, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.32275391, "step": 9982, "time_per_iteration": 4.168829441070557 }, { "auxiliary_loss_clip": 0.01442417, "auxiliary_loss_mlp": 0.00318097, "balance_loss_clip": 1.16906667, "balance_loss_mlp": 0.28421801, "epoch": 0.6002104313843379, "flos": 20667668705280.0, "grad_norm": 6.584850856373448, "language_loss": 0.79686552, "learning_rate": 1.4549219583218686e-06, "loss": 0.81447065, "num_input_tokens_seen": 215038285, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.33886719, "step": 9983, "time_per_iteration": 2.7342581748962402 }, { "auxiliary_loss_clip": 0.01428253, "auxiliary_loss_mlp": 0.00341015, "balance_loss_clip": 1.15854931, "balance_loss_mlp": 0.30727839, "epoch": 0.6002705546370058, "flos": 22455229968000.0, "grad_norm": 8.7836698087294, "language_loss": 0.87948149, "learning_rate": 1.454547250154447e-06, "loss": 0.89717418, "num_input_tokens_seen": 215057825, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.3371582, "step": 9984, "time_per_iteration": 2.6695799827575684 }, { "auxiliary_loss_clip": 0.0141668, "auxiliary_loss_mlp": 0.00334178, "balance_loss_clip": 1.15147495, "balance_loss_mlp": 0.30187231, "epoch": 0.6003306778896739, "flos": 25191982080000.0, "grad_norm": 3.042314356800506, "language_loss": 0.90574348, "learning_rate": 1.4541725626706485e-06, "loss": 0.92325205, "num_input_tokens_seen": 215077790, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.32275391, "step": 9985, "time_per_iteration": 4.211479902267456 }, { "auxiliary_loss_clip": 0.01418196, "auxiliary_loss_mlp": 0.00328035, "balance_loss_clip": 1.15351558, "balance_loss_mlp": 0.29664764, "epoch": 0.6003908011423418, "flos": 26687733252480.0, "grad_norm": 16.329012150277222, "language_loss": 0.77744532, "learning_rate": 1.4537978958846809e-06, "loss": 0.79490769, "num_input_tokens_seen": 215097650, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.31396484, "step": 9986, "time_per_iteration": 2.726940870285034 }, { "auxiliary_loss_clip": 0.01406799, "auxiliary_loss_mlp": 0.00329612, "balance_loss_clip": 1.14479637, "balance_loss_mlp": 0.2962814, "epoch": 0.6004509243950098, "flos": 22565080736640.0, "grad_norm": 15.808759722515509, "language_loss": 0.78561515, "learning_rate": 1.4534232498107514e-06, "loss": 0.80297923, "num_input_tokens_seen": 215118235, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.33374023, "step": 9987, "time_per_iteration": 2.6985161304473877 }, { "auxiliary_loss_clip": 0.01398374, "auxiliary_loss_mlp": 0.00314395, "balance_loss_clip": 1.14199054, "balance_loss_mlp": 0.28349549, "epoch": 0.6005110476476777, "flos": 19719303868800.0, "grad_norm": 21.174343423968683, "language_loss": 0.91119695, "learning_rate": 1.4530486244630673e-06, "loss": 0.92832458, "num_input_tokens_seen": 215136755, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.30883789, "step": 9988, "time_per_iteration": 2.689006805419922 }, { "auxiliary_loss_clip": 0.01408264, "auxiliary_loss_mlp": 0.00366229, "balance_loss_clip": 1.14733624, "balance_loss_mlp": 0.33471, "epoch": 0.6005711709003457, "flos": 17712543859200.0, "grad_norm": 55.3597455517253, "language_loss": 0.7397275, "learning_rate": 1.4526740198558346e-06, "loss": 0.75747252, "num_input_tokens_seen": 215155225, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.31494141, "step": 9989, "time_per_iteration": 4.088709831237793 }, { "auxiliary_loss_clip": 0.01396081, "auxiliary_loss_mlp": 0.00354184, "balance_loss_clip": 1.1405251, "balance_loss_mlp": 0.32214028, "epoch": 0.6006312941530136, "flos": 18514464946560.0, "grad_norm": 3.061140142917169, "language_loss": 0.86524117, "learning_rate": 1.452299436003257e-06, "loss": 0.88274384, "num_input_tokens_seen": 215174815, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.3203125, "step": 9990, "time_per_iteration": 2.7315266132354736 }, { "auxiliary_loss_clip": 0.01421567, "auxiliary_loss_mlp": 0.00334098, "balance_loss_clip": 1.15310717, "balance_loss_mlp": 0.29995665, "epoch": 0.6006914174056817, "flos": 21390837223680.0, "grad_norm": 15.166533304580021, "language_loss": 0.88687658, "learning_rate": 1.4519248729195403e-06, "loss": 0.90443325, "num_input_tokens_seen": 215192045, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.34130859, "step": 9991, "time_per_iteration": 2.728395700454712 }, { "auxiliary_loss_clip": 0.01408061, "auxiliary_loss_mlp": 0.00323974, "balance_loss_clip": 1.14694834, "balance_loss_mlp": 0.29338509, "epoch": 0.6007515406583496, "flos": 12750515349120.0, "grad_norm": 10.353042928028572, "language_loss": 0.89440346, "learning_rate": 1.4515503306188878e-06, "loss": 0.91172385, "num_input_tokens_seen": 215209885, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.30566406, "step": 9992, "time_per_iteration": 2.6152234077453613 }, { "auxiliary_loss_clip": 0.01385108, "auxiliary_loss_mlp": 0.0036004, "balance_loss_clip": 1.13179946, "balance_loss_mlp": 0.32773402, "epoch": 0.6008116639110176, "flos": 19206894401280.0, "grad_norm": 11.692409729580566, "language_loss": 0.74481988, "learning_rate": 1.4511758091155008e-06, "loss": 0.7622714, "num_input_tokens_seen": 215228150, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.32299805, "step": 9993, "time_per_iteration": 2.6675519943237305 }, { "auxiliary_loss_clip": 0.01399492, "auxiliary_loss_mlp": 0.00365354, "balance_loss_clip": 1.13650084, "balance_loss_mlp": 0.33395433, "epoch": 0.6008717871636855, "flos": 17055342668160.0, "grad_norm": 165.05488672115393, "language_loss": 0.89324188, "learning_rate": 1.4508013084235826e-06, "loss": 0.9108904, "num_input_tokens_seen": 215243755, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.31347656, "step": 9994, "time_per_iteration": 2.6259684562683105 }, { "auxiliary_loss_clip": 0.01383571, "auxiliary_loss_mlp": 0.00305486, "balance_loss_clip": 1.13291883, "balance_loss_mlp": 0.27716172, "epoch": 0.6009319104163535, "flos": 20298686244480.0, "grad_norm": 61.14265997248346, "language_loss": 0.76577556, "learning_rate": 1.4504268285573337e-06, "loss": 0.78266621, "num_input_tokens_seen": 215262130, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.28344727, "step": 9995, "time_per_iteration": 2.6715784072875977 }, { "auxiliary_loss_clip": 0.01379922, "auxiliary_loss_mlp": 0.0038359, "balance_loss_clip": 1.12658429, "balance_loss_mlp": 0.35130814, "epoch": 0.6009920336690215, "flos": 21836776573440.0, "grad_norm": 116.335673296125, "language_loss": 0.86408412, "learning_rate": 1.4500523695309546e-06, "loss": 0.88171923, "num_input_tokens_seen": 215281785, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.32275391, "step": 9996, "time_per_iteration": 2.6766254901885986 }, { "auxiliary_loss_clip": 0.01394947, "auxiliary_loss_mlp": 0.00346584, "balance_loss_clip": 1.13672185, "balance_loss_mlp": 0.31688845, "epoch": 0.6010521569216895, "flos": 22596107109120.0, "grad_norm": 89.90947501264846, "language_loss": 0.86918008, "learning_rate": 1.4496779313586447e-06, "loss": 0.88659537, "num_input_tokens_seen": 215297550, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.29675293, "step": 9997, "time_per_iteration": 2.783836603164673 }, { "auxiliary_loss_clip": 0.01397064, "auxiliary_loss_mlp": 0.00352174, "balance_loss_clip": 1.13661647, "balance_loss_mlp": 0.31815159, "epoch": 0.6011122801743575, "flos": 19171702051200.0, "grad_norm": 127.2899225913262, "language_loss": 0.810799, "learning_rate": 1.4493035140546028e-06, "loss": 0.82829148, "num_input_tokens_seen": 215316360, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.33984375, "step": 9998, "time_per_iteration": 2.6948649883270264 }, { "auxiliary_loss_clip": 0.01392374, "auxiliary_loss_mlp": 0.00355598, "balance_loss_clip": 1.13603759, "balance_loss_mlp": 0.32133746, "epoch": 0.6011724034270254, "flos": 25010022758400.0, "grad_norm": 7.8213173865757835, "language_loss": 0.78690922, "learning_rate": 1.448929117633027e-06, "loss": 0.80438888, "num_input_tokens_seen": 215336405, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.34228516, "step": 9999, "time_per_iteration": 2.7241053581237793 }, { "auxiliary_loss_clip": 0.0139637, "auxiliary_loss_mlp": 0.0036153, "balance_loss_clip": 1.13139212, "balance_loss_mlp": 0.32886651, "epoch": 0.6012325266796934, "flos": 21797669640960.0, "grad_norm": 212.4734671827191, "language_loss": 0.84363669, "learning_rate": 1.4485547421081142e-06, "loss": 0.86121571, "num_input_tokens_seen": 215356590, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.32666016, "step": 10000, "time_per_iteration": 2.752117156982422 }, { "auxiliary_loss_clip": 0.01399424, "auxiliary_loss_mlp": 0.00371839, "balance_loss_clip": 1.13352656, "balance_loss_mlp": 0.33722112, "epoch": 0.6012926499323613, "flos": 19573003774080.0, "grad_norm": 33.91547378385103, "language_loss": 0.86740804, "learning_rate": 1.4481803874940608e-06, "loss": 0.88512075, "num_input_tokens_seen": 215374295, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.34594727, "step": 10001, "time_per_iteration": 2.630244493484497 }, { "auxiliary_loss_clip": 0.01386873, "auxiliary_loss_mlp": 0.00352205, "balance_loss_clip": 1.12464726, "balance_loss_mlp": 0.31858772, "epoch": 0.6013527731850293, "flos": 34860786076800.0, "grad_norm": 120.95259470600065, "language_loss": 0.65830135, "learning_rate": 1.4478060538050624e-06, "loss": 0.6756922, "num_input_tokens_seen": 215394535, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.33642578, "step": 10002, "time_per_iteration": 2.8082430362701416 }, { "auxiliary_loss_clip": 0.01401788, "auxiliary_loss_mlp": 0.00368575, "balance_loss_clip": 1.13562226, "balance_loss_mlp": 0.33398014, "epoch": 0.6014128964376972, "flos": 23291948355840.0, "grad_norm": 18.84599855044039, "language_loss": 0.84107, "learning_rate": 1.447431741055314e-06, "loss": 0.85877365, "num_input_tokens_seen": 215414355, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.34594727, "step": 10003, "time_per_iteration": 2.731093645095825 }, { "auxiliary_loss_clip": 0.01363977, "auxiliary_loss_mlp": 0.00348852, "balance_loss_clip": 1.10672331, "balance_loss_mlp": 0.3176426, "epoch": 0.6014730196903653, "flos": 24820916630400.0, "grad_norm": 237.16173582452151, "language_loss": 0.84011924, "learning_rate": 1.4470574492590091e-06, "loss": 0.85724747, "num_input_tokens_seen": 215428280, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.31201172, "step": 10004, "time_per_iteration": 2.691577196121216 }, { "auxiliary_loss_clip": 0.0139457, "auxiliary_loss_mlp": 0.003245, "balance_loss_clip": 1.13333392, "balance_loss_mlp": 0.29329115, "epoch": 0.6015331429430332, "flos": 23112359331840.0, "grad_norm": 18.204634744951473, "language_loss": 0.80178177, "learning_rate": 1.4466831784303408e-06, "loss": 0.81897247, "num_input_tokens_seen": 215448970, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.31201172, "step": 10005, "time_per_iteration": 2.662774085998535 }, { "auxiliary_loss_clip": 0.0135618, "auxiliary_loss_mlp": 0.00362015, "balance_loss_clip": 1.10944676, "balance_loss_mlp": 0.3304956, "epoch": 0.6015932661957012, "flos": 19201363706880.0, "grad_norm": 10.446694389590224, "language_loss": 0.81328583, "learning_rate": 1.4463089285835026e-06, "loss": 0.83046782, "num_input_tokens_seen": 215465260, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.31567383, "step": 10006, "time_per_iteration": 2.656230926513672 }, { "auxiliary_loss_clip": 0.01371245, "auxiliary_loss_mlp": 0.0036986, "balance_loss_clip": 1.11591864, "balance_loss_mlp": 0.33524194, "epoch": 0.6016533894483691, "flos": 18113630100480.0, "grad_norm": 8.52947219937548, "language_loss": 0.82551503, "learning_rate": 1.445934699732685e-06, "loss": 0.84292614, "num_input_tokens_seen": 215482725, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.34619141, "step": 10007, "time_per_iteration": 2.638892650604248 }, { "auxiliary_loss_clip": 0.01365793, "auxiliary_loss_mlp": 0.00363211, "balance_loss_clip": 1.1106261, "balance_loss_mlp": 0.33264586, "epoch": 0.6017135127010371, "flos": 16216900427520.0, "grad_norm": 171.01913804272465, "language_loss": 0.7793895, "learning_rate": 1.4455604918920785e-06, "loss": 0.79667956, "num_input_tokens_seen": 215500420, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.30541992, "step": 10008, "time_per_iteration": 2.6441187858581543 }, { "auxiliary_loss_clip": 0.01383902, "auxiliary_loss_mlp": 0.00371711, "balance_loss_clip": 1.1241219, "balance_loss_mlp": 0.3383683, "epoch": 0.6017736359537051, "flos": 23444246021760.0, "grad_norm": 5.3810085446696405, "language_loss": 0.82220966, "learning_rate": 1.4451863050758748e-06, "loss": 0.83976573, "num_input_tokens_seen": 215522260, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.33325195, "step": 10009, "time_per_iteration": 2.7488009929656982 }, { "auxiliary_loss_clip": 0.01373333, "auxiliary_loss_mlp": 0.0036897, "balance_loss_clip": 1.11871123, "balance_loss_mlp": 0.33401746, "epoch": 0.601833759206373, "flos": 23514056104320.0, "grad_norm": 165.24135270335407, "language_loss": 0.82287663, "learning_rate": 1.4448121392982608e-06, "loss": 0.84029967, "num_input_tokens_seen": 215541715, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.34985352, "step": 10010, "time_per_iteration": 2.690584182739258 }, { "auxiliary_loss_clip": 0.01419469, "auxiliary_loss_mlp": 0.00175042, "balance_loss_clip": 1.22174513, "balance_loss_mlp": 0.1637409, "epoch": 0.6018938824590411, "flos": 63991668648960.0, "grad_norm": 0.796262007626621, "language_loss": 0.54903471, "learning_rate": 1.4444379945734268e-06, "loss": 0.56497979, "num_input_tokens_seen": 215603020, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.11279297, "step": 10011, "time_per_iteration": 3.352522850036621 }, { "auxiliary_loss_clip": 0.01371451, "auxiliary_loss_mlp": 0.00342503, "balance_loss_clip": 1.11778474, "balance_loss_mlp": 0.3104839, "epoch": 0.601954005711709, "flos": 34640007131520.0, "grad_norm": 9.087404836282108, "language_loss": 0.67406696, "learning_rate": 1.44406387091556e-06, "loss": 0.69120657, "num_input_tokens_seen": 215625115, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.32055664, "step": 10012, "time_per_iteration": 2.784060001373291 }, { "auxiliary_loss_clip": 0.01385787, "auxiliary_loss_mlp": 0.00342689, "balance_loss_clip": 1.13004017, "balance_loss_mlp": 0.31206399, "epoch": 0.602014128964377, "flos": 19427062815360.0, "grad_norm": 16.96608319141926, "language_loss": 0.8038578, "learning_rate": 1.4436897683388462e-06, "loss": 0.82114255, "num_input_tokens_seen": 215643730, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.30603027, "step": 10013, "time_per_iteration": 2.693093776702881 }, { "auxiliary_loss_clip": 0.01378521, "auxiliary_loss_mlp": 0.0033451, "balance_loss_clip": 1.12764573, "balance_loss_mlp": 0.30313414, "epoch": 0.6020742522170449, "flos": 28329389470080.0, "grad_norm": 17.62575474386003, "language_loss": 0.86706638, "learning_rate": 1.4433156868574732e-06, "loss": 0.8841967, "num_input_tokens_seen": 215664425, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.3137207, "step": 10014, "time_per_iteration": 2.7528038024902344 }, { "auxiliary_loss_clip": 0.01382223, "auxiliary_loss_mlp": 0.00330473, "balance_loss_clip": 1.12719321, "balance_loss_mlp": 0.30033669, "epoch": 0.6021343754697129, "flos": 22747040058240.0, "grad_norm": 126.87806393996347, "language_loss": 0.78545964, "learning_rate": 1.442941626485624e-06, "loss": 0.80258662, "num_input_tokens_seen": 215684280, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.30151367, "step": 10015, "time_per_iteration": 2.725839853286743 }, { "auxiliary_loss_clip": 0.01447186, "auxiliary_loss_mlp": 0.00193064, "balance_loss_clip": 1.23957062, "balance_loss_mlp": 0.18128571, "epoch": 0.6021944987223808, "flos": 65752007402880.0, "grad_norm": 0.7952304105629564, "language_loss": 0.54120386, "learning_rate": 1.4425675872374848e-06, "loss": 0.55760646, "num_input_tokens_seen": 215739780, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.11767578, "step": 10016, "time_per_iteration": 3.111849784851074 }, { "auxiliary_loss_clip": 0.01385223, "auxiliary_loss_mlp": 0.00381718, "balance_loss_clip": 1.12660623, "balance_loss_mlp": 0.34760031, "epoch": 0.6022546219750489, "flos": 16105182151680.0, "grad_norm": 182.24485996368898, "language_loss": 0.88485789, "learning_rate": 1.4421935691272381e-06, "loss": 0.90252733, "num_input_tokens_seen": 215757885, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.34155273, "step": 10017, "time_per_iteration": 2.790952205657959 }, { "auxiliary_loss_clip": 0.01384378, "auxiliary_loss_mlp": 0.00357076, "balance_loss_clip": 1.1296382, "balance_loss_mlp": 0.32605767, "epoch": 0.6023147452277168, "flos": 25512555985920.0, "grad_norm": 14.720220608036652, "language_loss": 0.89246893, "learning_rate": 1.4418195721690677e-06, "loss": 0.9098835, "num_input_tokens_seen": 215776415, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.31005859, "step": 10018, "time_per_iteration": 2.6985058784484863 }, { "auxiliary_loss_clip": 0.01395153, "auxiliary_loss_mlp": 0.00390667, "balance_loss_clip": 1.12641501, "balance_loss_mlp": 0.35604897, "epoch": 0.6023748684803848, "flos": 22636075968000.0, "grad_norm": 6.619338265337981, "language_loss": 0.85383737, "learning_rate": 1.4414455963771549e-06, "loss": 0.87169552, "num_input_tokens_seen": 215794865, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.34594727, "step": 10019, "time_per_iteration": 2.6992669105529785 }, { "auxiliary_loss_clip": 0.0141248, "auxiliary_loss_mlp": 0.00350844, "balance_loss_clip": 1.14784706, "balance_loss_mlp": 0.31777489, "epoch": 0.6024349917330527, "flos": 26210444307840.0, "grad_norm": 1093.0814034223424, "language_loss": 0.78414643, "learning_rate": 1.441071641765681e-06, "loss": 0.80177963, "num_input_tokens_seen": 215816840, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.33081055, "step": 10020, "time_per_iteration": 2.7343335151672363 }, { "auxiliary_loss_clip": 0.01407783, "auxiliary_loss_mlp": 0.00365303, "balance_loss_clip": 1.13937521, "balance_loss_mlp": 0.33230591, "epoch": 0.6024951149857207, "flos": 21251755762560.0, "grad_norm": 11.012329164623267, "language_loss": 0.71682167, "learning_rate": 1.4406977083488264e-06, "loss": 0.7345525, "num_input_tokens_seen": 215836100, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.32983398, "step": 10021, "time_per_iteration": 2.6939637660980225 }, { "auxiliary_loss_clip": 0.01389736, "auxiliary_loss_mlp": 0.00369549, "balance_loss_clip": 1.13008201, "balance_loss_mlp": 0.33612305, "epoch": 0.6025552382383887, "flos": 26943453152640.0, "grad_norm": 28.28146704550219, "language_loss": 0.86402833, "learning_rate": 1.4403237961407704e-06, "loss": 0.88162118, "num_input_tokens_seen": 215858480, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.33422852, "step": 10022, "time_per_iteration": 4.189125061035156 }, { "auxiliary_loss_clip": 0.01416716, "auxiliary_loss_mlp": 0.00393812, "balance_loss_clip": 1.14870226, "balance_loss_mlp": 0.35917026, "epoch": 0.6026153614910567, "flos": 31684379495040.0, "grad_norm": 199.89469858768322, "language_loss": 0.72753763, "learning_rate": 1.439949905155693e-06, "loss": 0.7456429, "num_input_tokens_seen": 215879950, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.34643555, "step": 10023, "time_per_iteration": 2.7947092056274414 }, { "auxiliary_loss_clip": 0.01384301, "auxiliary_loss_mlp": 0.00379743, "balance_loss_clip": 1.12416601, "balance_loss_mlp": 0.34846276, "epoch": 0.6026754847437247, "flos": 29312731175040.0, "grad_norm": 4.446007709718987, "language_loss": 0.82879972, "learning_rate": 1.4395760354077707e-06, "loss": 0.84644008, "num_input_tokens_seen": 215899830, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.31298828, "step": 10024, "time_per_iteration": 4.198841094970703 }, { "auxiliary_loss_clip": 0.01413043, "auxiliary_loss_mlp": 0.00355383, "balance_loss_clip": 1.14978266, "balance_loss_mlp": 0.32090729, "epoch": 0.6027356079963926, "flos": 23586775188480.0, "grad_norm": 32.73060047105878, "language_loss": 0.80887246, "learning_rate": 1.4392021869111815e-06, "loss": 0.82655674, "num_input_tokens_seen": 215920440, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.3449707, "step": 10025, "time_per_iteration": 2.698819637298584 }, { "auxiliary_loss_clip": 0.01400091, "auxiliary_loss_mlp": 0.00409323, "balance_loss_clip": 1.13130748, "balance_loss_mlp": 0.37091392, "epoch": 0.6027957312490606, "flos": 20813753318400.0, "grad_norm": 87.38505111028007, "language_loss": 0.78043985, "learning_rate": 1.4388283596801016e-06, "loss": 0.79853398, "num_input_tokens_seen": 215940535, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.38427734, "step": 10026, "time_per_iteration": 2.697399139404297 }, { "auxiliary_loss_clip": 0.01376961, "auxiliary_loss_mlp": 0.0038991, "balance_loss_clip": 1.1268183, "balance_loss_mlp": 0.35836738, "epoch": 0.6028558545017285, "flos": 19935773182080.0, "grad_norm": 8.582245925781649, "language_loss": 0.85602224, "learning_rate": 1.4384545537287061e-06, "loss": 0.87369096, "num_input_tokens_seen": 215958045, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.31542969, "step": 10027, "time_per_iteration": 4.059483051300049 }, { "auxiliary_loss_clip": 0.01404831, "auxiliary_loss_mlp": 0.00370006, "balance_loss_clip": 1.13367069, "balance_loss_mlp": 0.33636466, "epoch": 0.6029159777543965, "flos": 22820836550400.0, "grad_norm": 32.01795149673977, "language_loss": 0.80158675, "learning_rate": 1.438080769071171e-06, "loss": 0.8193351, "num_input_tokens_seen": 215977330, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.33642578, "step": 10028, "time_per_iteration": 2.69870662689209 }, { "auxiliary_loss_clip": 0.01408731, "auxiliary_loss_mlp": 0.00357698, "balance_loss_clip": 1.14081025, "balance_loss_mlp": 0.32400972, "epoch": 0.6029761010070644, "flos": 23587242065280.0, "grad_norm": 5.091545404845053, "language_loss": 0.89652336, "learning_rate": 1.437707005721669e-06, "loss": 0.91418767, "num_input_tokens_seen": 215997865, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.33691406, "step": 10029, "time_per_iteration": 2.734306812286377 }, { "auxiliary_loss_clip": 0.01386816, "auxiliary_loss_mlp": 0.00397401, "balance_loss_clip": 1.13176537, "balance_loss_mlp": 0.36354566, "epoch": 0.6030362242597325, "flos": 13662430859520.0, "grad_norm": 22.679296217885597, "language_loss": 0.87580943, "learning_rate": 1.437333263694373e-06, "loss": 0.8936516, "num_input_tokens_seen": 216016230, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.33862305, "step": 10030, "time_per_iteration": 2.6607537269592285 }, { "auxiliary_loss_clip": 0.01392784, "auxiliary_loss_mlp": 0.00375209, "balance_loss_clip": 1.13007843, "balance_loss_mlp": 0.34278435, "epoch": 0.6030963475124004, "flos": 24422883045120.0, "grad_norm": 7.676899543469771, "language_loss": 0.76504254, "learning_rate": 1.4369595430034572e-06, "loss": 0.78272247, "num_input_tokens_seen": 216035785, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.32397461, "step": 10031, "time_per_iteration": 4.124647855758667 }, { "auxiliary_loss_clip": 0.01419182, "auxiliary_loss_mlp": 0.00370145, "balance_loss_clip": 1.14310479, "balance_loss_mlp": 0.33421564, "epoch": 0.6031564707650684, "flos": 29644043247360.0, "grad_norm": 20.653430256619757, "language_loss": 0.80190986, "learning_rate": 1.4365858436630912e-06, "loss": 0.81980312, "num_input_tokens_seen": 216059555, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.35961914, "step": 10032, "time_per_iteration": 2.739248037338257 }, { "auxiliary_loss_clip": 0.0144057, "auxiliary_loss_mlp": 0.00373092, "balance_loss_clip": 1.16110349, "balance_loss_mlp": 0.3397606, "epoch": 0.6032165940177363, "flos": 16618776768000.0, "grad_norm": 13.752817789924428, "language_loss": 0.76260555, "learning_rate": 1.4362121656874465e-06, "loss": 0.78074217, "num_input_tokens_seen": 216077235, "router_z_loss_clip": 2.79296875, "router_z_loss_mlp": 0.33349609, "step": 10033, "time_per_iteration": 2.764418363571167 }, { "auxiliary_loss_clip": 0.01404987, "auxiliary_loss_mlp": 0.00358523, "balance_loss_clip": 1.14420271, "balance_loss_mlp": 0.32521638, "epoch": 0.6032767172704043, "flos": 17488173553920.0, "grad_norm": 31.322335385330483, "language_loss": 0.82789582, "learning_rate": 1.4358385090906934e-06, "loss": 0.84553087, "num_input_tokens_seen": 216094985, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.33300781, "step": 10034, "time_per_iteration": 2.7168924808502197 }, { "auxiliary_loss_clip": 0.01412685, "auxiliary_loss_mlp": 0.00352157, "balance_loss_clip": 1.14363217, "balance_loss_mlp": 0.31989875, "epoch": 0.6033368405230723, "flos": 26832955939200.0, "grad_norm": 12.225238831012959, "language_loss": 0.80988961, "learning_rate": 1.4354648738870004e-06, "loss": 0.82753801, "num_input_tokens_seen": 216115905, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.32250977, "step": 10035, "time_per_iteration": 2.7675068378448486 }, { "auxiliary_loss_clip": 0.01424459, "auxiliary_loss_mlp": 0.00308614, "balance_loss_clip": 1.15920365, "balance_loss_mlp": 0.27826291, "epoch": 0.6033969637757403, "flos": 16909904499840.0, "grad_norm": 65.93353515652042, "language_loss": 0.92079592, "learning_rate": 1.435091260090536e-06, "loss": 0.93812662, "num_input_tokens_seen": 216132420, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.30371094, "step": 10036, "time_per_iteration": 2.701313018798828 }, { "auxiliary_loss_clip": 0.01403286, "auxiliary_loss_mlp": 0.00386095, "balance_loss_clip": 1.137393, "balance_loss_mlp": 0.35157198, "epoch": 0.6034570870284083, "flos": 22930076787840.0, "grad_norm": 3.687153998503546, "language_loss": 0.80040634, "learning_rate": 1.4347176677154676e-06, "loss": 0.81830013, "num_input_tokens_seen": 216149800, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.34521484, "step": 10037, "time_per_iteration": 2.7304418087005615 }, { "auxiliary_loss_clip": 0.01394249, "auxiliary_loss_mlp": 0.0034517, "balance_loss_clip": 1.13791943, "balance_loss_mlp": 0.31527254, "epoch": 0.6035172102810762, "flos": 23366319465600.0, "grad_norm": 6.424099168543208, "language_loss": 0.90217859, "learning_rate": 1.4343440967759616e-06, "loss": 0.91957271, "num_input_tokens_seen": 216168200, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.29907227, "step": 10038, "time_per_iteration": 2.7651429176330566 }, { "auxiliary_loss_clip": 0.01418226, "auxiliary_loss_mlp": 0.00349251, "balance_loss_clip": 1.1490773, "balance_loss_mlp": 0.31605172, "epoch": 0.6035773335337442, "flos": 20887082933760.0, "grad_norm": 6.908621980332366, "language_loss": 0.84893328, "learning_rate": 1.4339705472861846e-06, "loss": 0.86660802, "num_input_tokens_seen": 216187105, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.33215332, "step": 10039, "time_per_iteration": 2.6870992183685303 }, { "auxiliary_loss_clip": 0.01393702, "auxiliary_loss_mlp": 0.00362023, "balance_loss_clip": 1.13654971, "balance_loss_mlp": 0.32962161, "epoch": 0.6036374567864121, "flos": 24936298093440.0, "grad_norm": 21.346097857028997, "language_loss": 0.78317034, "learning_rate": 1.433597019260301e-06, "loss": 0.80072761, "num_input_tokens_seen": 216205440, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.32385254, "step": 10040, "time_per_iteration": 2.776392936706543 }, { "auxiliary_loss_clip": 0.01429959, "auxiliary_loss_mlp": 0.00360893, "balance_loss_clip": 1.15673327, "balance_loss_mlp": 0.32563055, "epoch": 0.6036975800390801, "flos": 23148269953920.0, "grad_norm": 10.923943473666517, "language_loss": 0.87104869, "learning_rate": 1.433223512712475e-06, "loss": 0.88895726, "num_input_tokens_seen": 216223130, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.35229492, "step": 10041, "time_per_iteration": 2.689458131790161 }, { "auxiliary_loss_clip": 0.01398139, "auxiliary_loss_mlp": 0.00368595, "balance_loss_clip": 1.13630593, "balance_loss_mlp": 0.33493006, "epoch": 0.603757703291748, "flos": 18660729127680.0, "grad_norm": 31.76278218852651, "language_loss": 0.83529419, "learning_rate": 1.4328500276568704e-06, "loss": 0.85296154, "num_input_tokens_seen": 216240260, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.33642578, "step": 10042, "time_per_iteration": 2.746649980545044 }, { "auxiliary_loss_clip": 0.0141027, "auxiliary_loss_mlp": 0.003485, "balance_loss_clip": 1.14237142, "balance_loss_mlp": 0.315741, "epoch": 0.6038178265444161, "flos": 19682603147520.0, "grad_norm": 3.116310999879363, "language_loss": 0.91292864, "learning_rate": 1.4324765641076498e-06, "loss": 0.9305163, "num_input_tokens_seen": 216258510, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.32763672, "step": 10043, "time_per_iteration": 2.6706154346466064 }, { "auxiliary_loss_clip": 0.01400642, "auxiliary_loss_mlp": 0.00364479, "balance_loss_clip": 1.13600218, "balance_loss_mlp": 0.32983637, "epoch": 0.603877949797084, "flos": 22638230784000.0, "grad_norm": 66.82341253836053, "language_loss": 0.75687504, "learning_rate": 1.432103122078974e-06, "loss": 0.77452624, "num_input_tokens_seen": 216277550, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.34643555, "step": 10044, "time_per_iteration": 2.7031021118164062 }, { "auxiliary_loss_clip": 0.01412751, "auxiliary_loss_mlp": 0.00377578, "balance_loss_clip": 1.14456022, "balance_loss_mlp": 0.34121925, "epoch": 0.603938073049752, "flos": 25447881548160.0, "grad_norm": 22061.505549041354, "language_loss": 0.84637535, "learning_rate": 1.4317297015850057e-06, "loss": 0.86427855, "num_input_tokens_seen": 216296690, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.36352539, "step": 10045, "time_per_iteration": 2.678995370864868 }, { "auxiliary_loss_clip": 0.01391119, "auxiliary_loss_mlp": 0.00362724, "balance_loss_clip": 1.13315773, "balance_loss_mlp": 0.33017978, "epoch": 0.6039981963024199, "flos": 22340135813760.0, "grad_norm": 46.295235088850376, "language_loss": 0.82562768, "learning_rate": 1.4313563026399036e-06, "loss": 0.84316611, "num_input_tokens_seen": 216316110, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.32543945, "step": 10046, "time_per_iteration": 2.7199976444244385 }, { "auxiliary_loss_clip": 0.01382638, "auxiliary_loss_mlp": 0.0036478, "balance_loss_clip": 1.12455332, "balance_loss_mlp": 0.33354694, "epoch": 0.6040583195550879, "flos": 20703148364160.0, "grad_norm": 121.10851888363715, "language_loss": 0.91447353, "learning_rate": 1.430982925257827e-06, "loss": 0.93194771, "num_input_tokens_seen": 216333855, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.31201172, "step": 10047, "time_per_iteration": 2.6805508136749268 }, { "auxiliary_loss_clip": 0.01389062, "auxiliary_loss_mlp": 0.00352415, "balance_loss_clip": 1.13196588, "balance_loss_mlp": 0.32117009, "epoch": 0.604118442807756, "flos": 27163118776320.0, "grad_norm": 22.94984451865178, "language_loss": 0.81849307, "learning_rate": 1.4306095694529358e-06, "loss": 0.83590794, "num_input_tokens_seen": 216354890, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.31237793, "step": 10048, "time_per_iteration": 2.7736077308654785 }, { "auxiliary_loss_clip": 0.01421584, "auxiliary_loss_mlp": 0.00386862, "balance_loss_clip": 1.14375663, "balance_loss_mlp": 0.34990752, "epoch": 0.6041785660604239, "flos": 30881524654080.0, "grad_norm": 8.650273594110924, "language_loss": 0.76201725, "learning_rate": 1.430236235239386e-06, "loss": 0.78010166, "num_input_tokens_seen": 216376055, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.36962891, "step": 10049, "time_per_iteration": 2.753739356994629 }, { "auxiliary_loss_clip": 0.01378498, "auxiliary_loss_mlp": 0.00366462, "balance_loss_clip": 1.12516141, "balance_loss_mlp": 0.3373273, "epoch": 0.6042386893130919, "flos": 19938215306880.0, "grad_norm": 303.3669984298203, "language_loss": 0.72586989, "learning_rate": 1.429862922631336e-06, "loss": 0.74331951, "num_input_tokens_seen": 216396295, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.29125977, "step": 10050, "time_per_iteration": 2.7078661918640137 }, { "auxiliary_loss_clip": 0.01392814, "auxiliary_loss_mlp": 0.00364624, "balance_loss_clip": 1.13310981, "balance_loss_mlp": 0.33291477, "epoch": 0.6042988125657598, "flos": 32415915882240.0, "grad_norm": 48.51616758596713, "language_loss": 0.76481068, "learning_rate": 1.4294896316429408e-06, "loss": 0.78238499, "num_input_tokens_seen": 216416605, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.31713867, "step": 10051, "time_per_iteration": 2.793001413345337 }, { "auxiliary_loss_clip": 0.01378964, "auxiliary_loss_mlp": 0.00382931, "balance_loss_clip": 1.12012076, "balance_loss_mlp": 0.34812158, "epoch": 0.6043589358184278, "flos": 17420805596160.0, "grad_norm": 167.54155774548067, "language_loss": 0.71879667, "learning_rate": 1.4291163622883553e-06, "loss": 0.73641562, "num_input_tokens_seen": 216435130, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.34790039, "step": 10052, "time_per_iteration": 2.666473388671875 }, { "auxiliary_loss_clip": 0.0141374, "auxiliary_loss_mlp": 0.00374922, "balance_loss_clip": 1.14234591, "balance_loss_mlp": 0.3405419, "epoch": 0.6044190590710957, "flos": 27672834723840.0, "grad_norm": 7.665452784558671, "language_loss": 0.76911253, "learning_rate": 1.4287431145817358e-06, "loss": 0.78699911, "num_input_tokens_seen": 216455640, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.34375, "step": 10053, "time_per_iteration": 2.8005988597869873 }, { "auxiliary_loss_clip": 0.01451805, "auxiliary_loss_mlp": 0.00145913, "balance_loss_clip": 1.24142885, "balance_loss_mlp": 0.13632822, "epoch": 0.6044791823237637, "flos": 65316267515520.0, "grad_norm": 0.7101076357213952, "language_loss": 0.59531099, "learning_rate": 1.4283698885372336e-06, "loss": 0.61128813, "num_input_tokens_seen": 216518130, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.09570312, "step": 10054, "time_per_iteration": 3.2851803302764893 }, { "auxiliary_loss_clip": 0.01372389, "auxiliary_loss_mlp": 0.00359623, "balance_loss_clip": 1.11830497, "balance_loss_mlp": 0.32879591, "epoch": 0.6045393055764317, "flos": 24492369905280.0, "grad_norm": 21.441109663922614, "language_loss": 0.90379274, "learning_rate": 1.4279966841690027e-06, "loss": 0.9211129, "num_input_tokens_seen": 216536845, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.30773926, "step": 10055, "time_per_iteration": 2.72823429107666 }, { "auxiliary_loss_clip": 0.01367196, "auxiliary_loss_mlp": 0.00366263, "balance_loss_clip": 1.11345339, "balance_loss_mlp": 0.33257419, "epoch": 0.6045994288290997, "flos": 19054345340160.0, "grad_norm": 16.164333565827146, "language_loss": 0.79717952, "learning_rate": 1.4276235014911952e-06, "loss": 0.81451416, "num_input_tokens_seen": 216551860, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.3371582, "step": 10056, "time_per_iteration": 2.6509318351745605 }, { "auxiliary_loss_clip": 0.01363168, "auxiliary_loss_mlp": 0.00327146, "balance_loss_clip": 1.11467159, "balance_loss_mlp": 0.29783228, "epoch": 0.6046595520817676, "flos": 26576697335040.0, "grad_norm": 21.329281014458694, "language_loss": 0.85696441, "learning_rate": 1.4272503405179616e-06, "loss": 0.87386757, "num_input_tokens_seen": 216574775, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.29309082, "step": 10057, "time_per_iteration": 2.7698428630828857 }, { "auxiliary_loss_clip": 0.01375607, "auxiliary_loss_mlp": 0.00369317, "balance_loss_clip": 1.12095106, "balance_loss_mlp": 0.33665347, "epoch": 0.6047196753344356, "flos": 13582277660160.0, "grad_norm": 4.366083848443535, "language_loss": 0.83402324, "learning_rate": 1.4268772012634527e-06, "loss": 0.8514725, "num_input_tokens_seen": 216590100, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.3269043, "step": 10058, "time_per_iteration": 2.667850971221924 }, { "auxiliary_loss_clip": 0.01362487, "auxiliary_loss_mlp": 0.00363871, "balance_loss_clip": 1.11136568, "balance_loss_mlp": 0.33480775, "epoch": 0.6047797985871035, "flos": 25520456977920.0, "grad_norm": 35.484951616276206, "language_loss": 0.7847538, "learning_rate": 1.4265040837418176e-06, "loss": 0.80201739, "num_input_tokens_seen": 216610145, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.29077148, "step": 10059, "time_per_iteration": 2.7339673042297363 }, { "auxiliary_loss_clip": 0.01385335, "auxiliary_loss_mlp": 0.00351014, "balance_loss_clip": 1.12584615, "balance_loss_mlp": 0.31889877, "epoch": 0.6048399218397715, "flos": 20520147548160.0, "grad_norm": 14.880429145083259, "language_loss": 0.81422424, "learning_rate": 1.4261309879672054e-06, "loss": 0.83158767, "num_input_tokens_seen": 216630625, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.32104492, "step": 10060, "time_per_iteration": 2.733057737350464 }, { "auxiliary_loss_clip": 0.01368341, "auxiliary_loss_mlp": 0.00374249, "balance_loss_clip": 1.11605501, "balance_loss_mlp": 0.34203842, "epoch": 0.6049000450924396, "flos": 20408788408320.0, "grad_norm": 81.25122532662198, "language_loss": 0.80465806, "learning_rate": 1.4257579139537628e-06, "loss": 0.82208401, "num_input_tokens_seen": 216649255, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.32202148, "step": 10061, "time_per_iteration": 2.642153739929199 }, { "auxiliary_loss_clip": 0.01380953, "auxiliary_loss_mlp": 0.0035148, "balance_loss_clip": 1.12329459, "balance_loss_mlp": 0.31948438, "epoch": 0.6049601683451075, "flos": 20741357456640.0, "grad_norm": 396.9208360417663, "language_loss": 0.74054599, "learning_rate": 1.425384861715639e-06, "loss": 0.75787032, "num_input_tokens_seen": 216668100, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.32006836, "step": 10062, "time_per_iteration": 2.7986693382263184 }, { "auxiliary_loss_clip": 0.01365828, "auxiliary_loss_mlp": 0.0036562, "balance_loss_clip": 1.11554813, "balance_loss_mlp": 0.33264619, "epoch": 0.6050202915977755, "flos": 20083114771200.0, "grad_norm": 20.05489731746714, "language_loss": 0.79688358, "learning_rate": 1.425011831266978e-06, "loss": 0.81419814, "num_input_tokens_seen": 216686125, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.32958984, "step": 10063, "time_per_iteration": 2.70166277885437 }, { "auxiliary_loss_clip": 0.01373403, "auxiliary_loss_mlp": 0.00340815, "balance_loss_clip": 1.11994696, "balance_loss_mlp": 0.31063104, "epoch": 0.6050804148504434, "flos": 15960821391360.0, "grad_norm": 20.15265765746853, "language_loss": 0.90201575, "learning_rate": 1.424638822621926e-06, "loss": 0.91915792, "num_input_tokens_seen": 216704265, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.30175781, "step": 10064, "time_per_iteration": 4.0784430503845215 }, { "auxiliary_loss_clip": 0.01377701, "auxiliary_loss_mlp": 0.00343466, "balance_loss_clip": 1.12264001, "balance_loss_mlp": 0.31352013, "epoch": 0.6051405381031114, "flos": 17456644391040.0, "grad_norm": 809.0444523403607, "language_loss": 0.89554393, "learning_rate": 1.4242658357946278e-06, "loss": 0.91275561, "num_input_tokens_seen": 216721765, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.29931641, "step": 10065, "time_per_iteration": 2.643646240234375 }, { "auxiliary_loss_clip": 0.01396454, "auxiliary_loss_mlp": 0.00375019, "balance_loss_clip": 1.13279963, "balance_loss_mlp": 0.34113991, "epoch": 0.6052006613557793, "flos": 11400130517760.0, "grad_norm": 22.31329378901884, "language_loss": 0.87311888, "learning_rate": 1.423892870799226e-06, "loss": 0.89083362, "num_input_tokens_seen": 216738295, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.33862305, "step": 10066, "time_per_iteration": 4.059038400650024 }, { "auxiliary_loss_clip": 0.01360542, "auxiliary_loss_mlp": 0.0032523, "balance_loss_clip": 1.11197805, "balance_loss_mlp": 0.29528505, "epoch": 0.6052607846084473, "flos": 24750998807040.0, "grad_norm": 64.1785660122441, "language_loss": 0.79090273, "learning_rate": 1.4235199276498655e-06, "loss": 0.80776048, "num_input_tokens_seen": 216759875, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.29956055, "step": 10067, "time_per_iteration": 2.690668821334839 }, { "auxiliary_loss_clip": 0.01408838, "auxiliary_loss_mlp": 0.00362781, "balance_loss_clip": 1.14468336, "balance_loss_mlp": 0.33212072, "epoch": 0.6053209078611153, "flos": 20741141975040.0, "grad_norm": 59.78947552336681, "language_loss": 0.73199022, "learning_rate": 1.4231470063606863e-06, "loss": 0.74970639, "num_input_tokens_seen": 216780705, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.30664062, "step": 10068, "time_per_iteration": 2.6877613067626953 }, { "auxiliary_loss_clip": 0.01381368, "auxiliary_loss_mlp": 0.00352349, "balance_loss_clip": 1.12344146, "balance_loss_mlp": 0.31978148, "epoch": 0.6053810311137833, "flos": 18953149749120.0, "grad_norm": 7.396543040406006, "language_loss": 0.9466393, "learning_rate": 1.4227741069458303e-06, "loss": 0.9639765, "num_input_tokens_seen": 216797625, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.32568359, "step": 10069, "time_per_iteration": 4.17581582069397 }, { "auxiliary_loss_clip": 0.01385969, "auxiliary_loss_mlp": 0.0033679, "balance_loss_clip": 1.13251066, "balance_loss_mlp": 0.30448383, "epoch": 0.6054411543664512, "flos": 23951124794880.0, "grad_norm": 21.21559379305466, "language_loss": 0.90082568, "learning_rate": 1.4224012294194387e-06, "loss": 0.91805327, "num_input_tokens_seen": 216817610, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.32299805, "step": 10070, "time_per_iteration": 2.7493646144866943 }, { "auxiliary_loss_clip": 0.01383593, "auxiliary_loss_mlp": 0.00370144, "balance_loss_clip": 1.12719846, "balance_loss_mlp": 0.33749259, "epoch": 0.6055012776191192, "flos": 20593979953920.0, "grad_norm": 32.79556114047225, "language_loss": 0.9284023, "learning_rate": 1.4220283737956496e-06, "loss": 0.94593966, "num_input_tokens_seen": 216836835, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.32641602, "step": 10071, "time_per_iteration": 2.72204327583313 }, { "auxiliary_loss_clip": 0.01381046, "auxiliary_loss_mlp": 0.00355487, "balance_loss_clip": 1.12445712, "balance_loss_mlp": 0.32439759, "epoch": 0.6055614008717871, "flos": 30298191782400.0, "grad_norm": 6.80160558168375, "language_loss": 0.84189111, "learning_rate": 1.421655540088603e-06, "loss": 0.85925645, "num_input_tokens_seen": 216856760, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.31079102, "step": 10072, "time_per_iteration": 2.813603401184082 }, { "auxiliary_loss_clip": 0.01388594, "auxiliary_loss_mlp": 0.00350192, "balance_loss_clip": 1.1248982, "balance_loss_mlp": 0.31574088, "epoch": 0.6056215241244551, "flos": 27125017424640.0, "grad_norm": 9.237194381581729, "language_loss": 0.81141913, "learning_rate": 1.4212827283124367e-06, "loss": 0.828807, "num_input_tokens_seen": 216878795, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.34448242, "step": 10073, "time_per_iteration": 2.7730884552001953 }, { "auxiliary_loss_clip": 0.01442877, "auxiliary_loss_mlp": 0.00181589, "balance_loss_clip": 1.23707008, "balance_loss_mlp": 0.17100285, "epoch": 0.6056816473771232, "flos": 56007323925120.0, "grad_norm": 0.7482639451386884, "language_loss": 0.54875761, "learning_rate": 1.4209099384812863e-06, "loss": 0.56500232, "num_input_tokens_seen": 216937800, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.10595703, "step": 10074, "time_per_iteration": 4.644582271575928 }, { "auxiliary_loss_clip": 0.01362079, "auxiliary_loss_mlp": 0.00370399, "balance_loss_clip": 1.11302197, "balance_loss_mlp": 0.33973873, "epoch": 0.6057417706297911, "flos": 23549499849600.0, "grad_norm": 30.720600990048236, "language_loss": 0.87769461, "learning_rate": 1.4205371706092894e-06, "loss": 0.89501941, "num_input_tokens_seen": 216955280, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.30639648, "step": 10075, "time_per_iteration": 2.7310824394226074 }, { "auxiliary_loss_clip": 0.01369091, "auxiliary_loss_mlp": 0.00345965, "balance_loss_clip": 1.11677766, "balance_loss_mlp": 0.31382599, "epoch": 0.6058018938824591, "flos": 27744296832000.0, "grad_norm": 6.961684023990378, "language_loss": 0.84749061, "learning_rate": 1.4201644247105813e-06, "loss": 0.86464119, "num_input_tokens_seen": 216976950, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.32128906, "step": 10076, "time_per_iteration": 2.767946720123291 }, { "auxiliary_loss_clip": 0.01410538, "auxiliary_loss_mlp": 0.00348037, "balance_loss_clip": 1.14050126, "balance_loss_mlp": 0.31236923, "epoch": 0.605862017135127, "flos": 22783381643520.0, "grad_norm": 2.9841658670083167, "language_loss": 0.81456, "learning_rate": 1.4197917007992964e-06, "loss": 0.83214575, "num_input_tokens_seen": 216996945, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.35668945, "step": 10077, "time_per_iteration": 2.724966287612915 }, { "auxiliary_loss_clip": 0.01424774, "auxiliary_loss_mlp": 0.00354337, "balance_loss_clip": 1.15732479, "balance_loss_mlp": 0.32014826, "epoch": 0.605922140387795, "flos": 21215019127680.0, "grad_norm": 38.03664165584043, "language_loss": 0.64428616, "learning_rate": 1.4194189988895682e-06, "loss": 0.66207731, "num_input_tokens_seen": 217016580, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.34179688, "step": 10078, "time_per_iteration": 2.7018587589263916 }, { "auxiliary_loss_clip": 0.0137951, "auxiliary_loss_mlp": 0.00361864, "balance_loss_clip": 1.12142754, "balance_loss_mlp": 0.32920098, "epoch": 0.6059822636404629, "flos": 27268372604160.0, "grad_norm": 11.442252058735331, "language_loss": 0.774984, "learning_rate": 1.4190463189955297e-06, "loss": 0.79239774, "num_input_tokens_seen": 217037300, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.32666016, "step": 10079, "time_per_iteration": 2.785423994064331 }, { "auxiliary_loss_clip": 0.01366346, "auxiliary_loss_mlp": 0.00347094, "balance_loss_clip": 1.11416888, "balance_loss_mlp": 0.31648073, "epoch": 0.606042386893131, "flos": 20631327120000.0, "grad_norm": 430.215436466252, "language_loss": 0.7100482, "learning_rate": 1.4186736611313131e-06, "loss": 0.72718263, "num_input_tokens_seen": 217055805, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.30639648, "step": 10080, "time_per_iteration": 2.7140510082244873 }, { "auxiliary_loss_clip": 0.01372893, "auxiliary_loss_mlp": 0.00350743, "balance_loss_clip": 1.11933529, "balance_loss_mlp": 0.31834161, "epoch": 0.6061025101457989, "flos": 23002293081600.0, "grad_norm": 6.371870723174296, "language_loss": 0.79043579, "learning_rate": 1.4183010253110492e-06, "loss": 0.80767214, "num_input_tokens_seen": 217074175, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.32397461, "step": 10081, "time_per_iteration": 2.681917667388916 }, { "auxiliary_loss_clip": 0.01382206, "auxiliary_loss_mlp": 0.0034257, "balance_loss_clip": 1.12598431, "balance_loss_mlp": 0.3091442, "epoch": 0.6061626333984669, "flos": 29898937134720.0, "grad_norm": 5.5520270052032386, "language_loss": 0.75048065, "learning_rate": 1.4179284115488691e-06, "loss": 0.76772845, "num_input_tokens_seen": 217095695, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.33422852, "step": 10082, "time_per_iteration": 2.83628249168396 }, { "auxiliary_loss_clip": 0.01372684, "auxiliary_loss_mlp": 0.00351697, "balance_loss_clip": 1.12413275, "balance_loss_mlp": 0.32036883, "epoch": 0.6062227566511348, "flos": 25009196745600.0, "grad_norm": 23.434233036760993, "language_loss": 0.71492517, "learning_rate": 1.4175558198589015e-06, "loss": 0.73216897, "num_input_tokens_seen": 217116260, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.31323242, "step": 10083, "time_per_iteration": 2.73077392578125 }, { "auxiliary_loss_clip": 0.01382275, "auxiliary_loss_mlp": 0.00353852, "balance_loss_clip": 1.12460113, "balance_loss_mlp": 0.31951964, "epoch": 0.6062828799038028, "flos": 19463943104640.0, "grad_norm": 92.63399668096643, "language_loss": 0.81832999, "learning_rate": 1.4171832502552764e-06, "loss": 0.83569121, "num_input_tokens_seen": 217134465, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.34326172, "step": 10084, "time_per_iteration": 2.7303669452667236 }, { "auxiliary_loss_clip": 0.01422759, "auxiliary_loss_mlp": 0.0032559, "balance_loss_clip": 1.14788365, "balance_loss_mlp": 0.29199672, "epoch": 0.6063430031564707, "flos": 13589568120960.0, "grad_norm": 18.75990410324582, "language_loss": 0.81391239, "learning_rate": 1.4168107027521204e-06, "loss": 0.83139586, "num_input_tokens_seen": 217149920, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.33618164, "step": 10085, "time_per_iteration": 2.7210023403167725 }, { "auxiliary_loss_clip": 0.01398075, "auxiliary_loss_mlp": 0.00343789, "balance_loss_clip": 1.14021707, "balance_loss_mlp": 0.31024376, "epoch": 0.6064031264091387, "flos": 23255499029760.0, "grad_norm": 20.343862263650355, "language_loss": 0.83850443, "learning_rate": 1.4164381773635605e-06, "loss": 0.85592306, "num_input_tokens_seen": 217168165, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.33544922, "step": 10086, "time_per_iteration": 2.697746992111206 }, { "auxiliary_loss_clip": 0.01381928, "auxiliary_loss_mlp": 0.00358033, "balance_loss_clip": 1.12934804, "balance_loss_mlp": 0.32424951, "epoch": 0.6064632496618068, "flos": 22458462192000.0, "grad_norm": 4.743494513085421, "language_loss": 0.78794312, "learning_rate": 1.4160656741037246e-06, "loss": 0.80534279, "num_input_tokens_seen": 217190070, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.33813477, "step": 10087, "time_per_iteration": 2.7177181243896484 }, { "auxiliary_loss_clip": 0.01371487, "auxiliary_loss_mlp": 0.00326705, "balance_loss_clip": 1.12008905, "balance_loss_mlp": 0.29737949, "epoch": 0.6065233729144747, "flos": 25118652464640.0, "grad_norm": 13.745864745597467, "language_loss": 0.88883924, "learning_rate": 1.4156931929867355e-06, "loss": 0.90582114, "num_input_tokens_seen": 217209370, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.29370117, "step": 10088, "time_per_iteration": 2.6820290088653564 }, { "auxiliary_loss_clip": 0.01377298, "auxiliary_loss_mlp": 0.00339116, "balance_loss_clip": 1.12404668, "balance_loss_mlp": 0.30912304, "epoch": 0.6065834961671427, "flos": 23477355383040.0, "grad_norm": 15.632830248645542, "language_loss": 0.79043835, "learning_rate": 1.4153207340267201e-06, "loss": 0.80760252, "num_input_tokens_seen": 217226990, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.30004883, "step": 10089, "time_per_iteration": 2.7188403606414795 }, { "auxiliary_loss_clip": 0.01382409, "auxiliary_loss_mlp": 0.00370826, "balance_loss_clip": 1.12645495, "balance_loss_mlp": 0.33956891, "epoch": 0.6066436194198106, "flos": 17019396132480.0, "grad_norm": 94.10094466305266, "language_loss": 0.88282692, "learning_rate": 1.4149482972378009e-06, "loss": 0.90035927, "num_input_tokens_seen": 217244585, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.31274414, "step": 10090, "time_per_iteration": 2.62555193901062 }, { "auxiliary_loss_clip": 0.01403751, "auxiliary_loss_mlp": 0.00353951, "balance_loss_clip": 1.12948108, "balance_loss_mlp": 0.31849816, "epoch": 0.6067037426724786, "flos": 18514752255360.0, "grad_norm": 79.32742255965753, "language_loss": 0.85637534, "learning_rate": 1.4145758826341e-06, "loss": 0.87395233, "num_input_tokens_seen": 217263435, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.35449219, "step": 10091, "time_per_iteration": 2.691298246383667 }, { "auxiliary_loss_clip": 0.0140267, "auxiliary_loss_mlp": 0.00359507, "balance_loss_clip": 1.14079869, "balance_loss_mlp": 0.32598579, "epoch": 0.6067638659251465, "flos": 22345989730560.0, "grad_norm": 80.3433076029316, "language_loss": 0.85912657, "learning_rate": 1.4142034902297415e-06, "loss": 0.87674832, "num_input_tokens_seen": 217283725, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.33520508, "step": 10092, "time_per_iteration": 2.6874682903289795 }, { "auxiliary_loss_clip": 0.01382456, "auxiliary_loss_mlp": 0.00346335, "balance_loss_clip": 1.12161732, "balance_loss_mlp": 0.31424397, "epoch": 0.6068239891778145, "flos": 12451019748480.0, "grad_norm": 23.284199083678068, "language_loss": 0.82325512, "learning_rate": 1.4138311200388444e-06, "loss": 0.84054303, "num_input_tokens_seen": 217301120, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.32092285, "step": 10093, "time_per_iteration": 2.6994102001190186 }, { "auxiliary_loss_clip": 0.01388, "auxiliary_loss_mlp": 0.00327426, "balance_loss_clip": 1.13207436, "balance_loss_mlp": 0.29616922, "epoch": 0.6068841124304825, "flos": 23185868515200.0, "grad_norm": 6.753543263943484, "language_loss": 0.92117089, "learning_rate": 1.4134587720755304e-06, "loss": 0.93832517, "num_input_tokens_seen": 217319585, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.31225586, "step": 10094, "time_per_iteration": 2.700490713119507 }, { "auxiliary_loss_clip": 0.01371258, "auxiliary_loss_mlp": 0.0036393, "balance_loss_clip": 1.11805618, "balance_loss_mlp": 0.32875109, "epoch": 0.6069442356831505, "flos": 18587902302720.0, "grad_norm": 21.742915418759353, "language_loss": 0.81178498, "learning_rate": 1.413086446353919e-06, "loss": 0.82913685, "num_input_tokens_seen": 217338880, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.35168457, "step": 10095, "time_per_iteration": 2.7087674140930176 }, { "auxiliary_loss_clip": 0.01360843, "auxiliary_loss_mlp": 0.00403723, "balance_loss_clip": 1.10967743, "balance_loss_mlp": 0.36970109, "epoch": 0.6070043589358184, "flos": 20960340721920.0, "grad_norm": 9.660705163410816, "language_loss": 0.82410479, "learning_rate": 1.4127141428881273e-06, "loss": 0.84175038, "num_input_tokens_seen": 217357480, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.34008789, "step": 10096, "time_per_iteration": 2.707672357559204 }, { "auxiliary_loss_clip": 0.01376796, "auxiliary_loss_mlp": 0.00353906, "balance_loss_clip": 1.1217066, "balance_loss_mlp": 0.3211236, "epoch": 0.6070644821884864, "flos": 11692443398400.0, "grad_norm": 239.2376926420582, "language_loss": 0.86988461, "learning_rate": 1.4123418616922749e-06, "loss": 0.88719171, "num_input_tokens_seen": 217374575, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.32788086, "step": 10097, "time_per_iteration": 2.7126564979553223 }, { "auxiliary_loss_clip": 0.01382553, "auxiliary_loss_mlp": 0.00339575, "balance_loss_clip": 1.12442851, "balance_loss_mlp": 0.30893871, "epoch": 0.6071246054411543, "flos": 19310568030720.0, "grad_norm": 91.98899817815985, "language_loss": 0.74287921, "learning_rate": 1.411969602780478e-06, "loss": 0.76010048, "num_input_tokens_seen": 217392950, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.30664062, "step": 10098, "time_per_iteration": 2.674354076385498 }, { "auxiliary_loss_clip": 0.01393277, "auxiliary_loss_mlp": 0.00336529, "balance_loss_clip": 1.13659894, "balance_loss_mlp": 0.30381769, "epoch": 0.6071847286938223, "flos": 17749029098880.0, "grad_norm": 444.29980506589413, "language_loss": 0.86657536, "learning_rate": 1.4115973661668523e-06, "loss": 0.88387334, "num_input_tokens_seen": 217412145, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.32714844, "step": 10099, "time_per_iteration": 2.6824686527252197 }, { "auxiliary_loss_clip": 0.01396294, "auxiliary_loss_mlp": 0.00363111, "balance_loss_clip": 1.13027906, "balance_loss_mlp": 0.32505971, "epoch": 0.6072448519464904, "flos": 22637512512000.0, "grad_norm": 30.71544842921217, "language_loss": 0.7864114, "learning_rate": 1.4112251518655133e-06, "loss": 0.8040055, "num_input_tokens_seen": 217432080, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.38037109, "step": 10100, "time_per_iteration": 2.714069366455078 }, { "auxiliary_loss_clip": 0.01395473, "auxiliary_loss_mlp": 0.00357124, "balance_loss_clip": 1.13699305, "balance_loss_mlp": 0.32090837, "epoch": 0.6073049751991583, "flos": 19537308633600.0, "grad_norm": 19.337434748723, "language_loss": 0.76148027, "learning_rate": 1.4108529598905764e-06, "loss": 0.77900624, "num_input_tokens_seen": 217450945, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.36206055, "step": 10101, "time_per_iteration": 2.749934434890747 }, { "auxiliary_loss_clip": 0.01377563, "auxiliary_loss_mlp": 0.00341808, "balance_loss_clip": 1.12305474, "balance_loss_mlp": 0.31033698, "epoch": 0.6073650984518263, "flos": 28294233033600.0, "grad_norm": 76.19143520797958, "language_loss": 0.74929255, "learning_rate": 1.410480790256154e-06, "loss": 0.76648629, "num_input_tokens_seen": 217473105, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.31445312, "step": 10102, "time_per_iteration": 2.756674289703369 }, { "auxiliary_loss_clip": 0.01390762, "auxiliary_loss_mlp": 0.00355898, "balance_loss_clip": 1.13431644, "balance_loss_mlp": 0.32313913, "epoch": 0.6074252217044942, "flos": 25664422688640.0, "grad_norm": 21.456476036803846, "language_loss": 0.81733859, "learning_rate": 1.4101086429763589e-06, "loss": 0.83480513, "num_input_tokens_seen": 217491780, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.32751465, "step": 10103, "time_per_iteration": 2.735938787460327 }, { "auxiliary_loss_clip": 0.01417834, "auxiliary_loss_mlp": 0.00363579, "balance_loss_clip": 1.14367366, "balance_loss_mlp": 0.32781613, "epoch": 0.6074853449571622, "flos": 22857106308480.0, "grad_norm": 17.796964218554898, "language_loss": 0.83364522, "learning_rate": 1.4097365180653032e-06, "loss": 0.85145932, "num_input_tokens_seen": 217510605, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.35742188, "step": 10104, "time_per_iteration": 2.7056121826171875 }, { "auxiliary_loss_clip": 0.01448338, "auxiliary_loss_mlp": 0.00255981, "balance_loss_clip": 1.24455547, "balance_loss_mlp": 0.24439387, "epoch": 0.6075454682098301, "flos": 67111406547840.0, "grad_norm": 0.7314854318948669, "language_loss": 0.54972637, "learning_rate": 1.4093644155370977e-06, "loss": 0.56676954, "num_input_tokens_seen": 217574815, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.11572266, "step": 10105, "time_per_iteration": 3.188776969909668 }, { "auxiliary_loss_clip": 0.0146054, "auxiliary_loss_mlp": 0.00171155, "balance_loss_clip": 1.25075245, "balance_loss_mlp": 0.15971135, "epoch": 0.6076055914624982, "flos": 70712024751360.0, "grad_norm": 0.7695728644510808, "language_loss": 0.56780499, "learning_rate": 1.4089923354058533e-06, "loss": 0.58412194, "num_input_tokens_seen": 217632375, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.11425781, "step": 10106, "time_per_iteration": 4.461594343185425 }, { "auxiliary_loss_clip": 0.01405454, "auxiliary_loss_mlp": 0.00338685, "balance_loss_clip": 1.14830875, "balance_loss_mlp": 0.30715367, "epoch": 0.6076657147151661, "flos": 28364545906560.0, "grad_norm": 3.5130612308512, "language_loss": 0.73797512, "learning_rate": 1.4086202776856784e-06, "loss": 0.75541651, "num_input_tokens_seen": 217653055, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.31530762, "step": 10107, "time_per_iteration": 2.775022268295288 }, { "auxiliary_loss_clip": 0.01407729, "auxiliary_loss_mlp": 0.00360463, "balance_loss_clip": 1.14367723, "balance_loss_mlp": 0.32641721, "epoch": 0.6077258379678341, "flos": 15049767807360.0, "grad_norm": 25.723340664811968, "language_loss": 0.87567061, "learning_rate": 1.4082482423906815e-06, "loss": 0.89335257, "num_input_tokens_seen": 217671520, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.34008789, "step": 10108, "time_per_iteration": 4.161105394363403 }, { "auxiliary_loss_clip": 0.01417428, "auxiliary_loss_mlp": 0.00360369, "balance_loss_clip": 1.14803755, "balance_loss_mlp": 0.32424834, "epoch": 0.607785961220502, "flos": 36167251553280.0, "grad_norm": 75.46112504239606, "language_loss": 0.78915799, "learning_rate": 1.4078762295349714e-06, "loss": 0.80693591, "num_input_tokens_seen": 217691880, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.36132812, "step": 10109, "time_per_iteration": 2.8070921897888184 }, { "auxiliary_loss_clip": 0.01413395, "auxiliary_loss_mlp": 0.00355095, "balance_loss_clip": 1.15668607, "balance_loss_mlp": 0.32243139, "epoch": 0.60784608447317, "flos": 22524249951360.0, "grad_norm": 129.30306139544265, "language_loss": 0.86111408, "learning_rate": 1.407504239132653e-06, "loss": 0.87879896, "num_input_tokens_seen": 217710530, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.3269043, "step": 10110, "time_per_iteration": 2.7106804847717285 }, { "auxiliary_loss_clip": 0.01420245, "auxiliary_loss_mlp": 0.00339877, "balance_loss_clip": 1.15241671, "balance_loss_mlp": 0.30664146, "epoch": 0.6079062077258379, "flos": 23841166285440.0, "grad_norm": 13.480916004276034, "language_loss": 0.80485928, "learning_rate": 1.4071322711978338e-06, "loss": 0.82246053, "num_input_tokens_seen": 217728650, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.33227539, "step": 10111, "time_per_iteration": 4.2622339725494385 }, { "auxiliary_loss_clip": 0.01429174, "auxiliary_loss_mlp": 0.00368184, "balance_loss_clip": 1.15861237, "balance_loss_mlp": 0.3317779, "epoch": 0.6079663309785059, "flos": 23367037737600.0, "grad_norm": 9.607394578279044, "language_loss": 0.75023311, "learning_rate": 1.4067603257446186e-06, "loss": 0.76820672, "num_input_tokens_seen": 217747135, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.36401367, "step": 10112, "time_per_iteration": 2.691591501235962 }, { "auxiliary_loss_clip": 0.01439266, "auxiliary_loss_mlp": 0.00155696, "balance_loss_clip": 1.23403096, "balance_loss_mlp": 0.14396535, "epoch": 0.6080264542311739, "flos": 71382873110400.0, "grad_norm": 0.6248874244874802, "language_loss": 0.48978099, "learning_rate": 1.4063884027871105e-06, "loss": 0.50573063, "num_input_tokens_seen": 217811860, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.1171875, "step": 10113, "time_per_iteration": 3.2471537590026855 }, { "auxiliary_loss_clip": 0.01438742, "auxiliary_loss_mlp": 0.00199673, "balance_loss_clip": 1.23373866, "balance_loss_mlp": 0.18822894, "epoch": 0.6080865774838419, "flos": 66529833442560.0, "grad_norm": 0.8232318243870913, "language_loss": 0.564821, "learning_rate": 1.4060165023394147e-06, "loss": 0.58120513, "num_input_tokens_seen": 217866510, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.11425781, "step": 10114, "time_per_iteration": 3.0797135829925537 }, { "auxiliary_loss_clip": 0.01427169, "auxiliary_loss_mlp": 0.00352926, "balance_loss_clip": 1.15827012, "balance_loss_mlp": 0.31845117, "epoch": 0.6081467007365099, "flos": 19207935895680.0, "grad_norm": 3.4101776899876257, "language_loss": 0.80236083, "learning_rate": 1.4056446244156317e-06, "loss": 0.82016182, "num_input_tokens_seen": 217885650, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.34472656, "step": 10115, "time_per_iteration": 2.686617136001587 }, { "auxiliary_loss_clip": 0.01406714, "auxiliary_loss_mlp": 0.00319946, "balance_loss_clip": 1.14684415, "balance_loss_mlp": 0.28651965, "epoch": 0.6082068239891778, "flos": 24167737762560.0, "grad_norm": 11.753707122779716, "language_loss": 0.78753686, "learning_rate": 1.4052727690298642e-06, "loss": 0.80480349, "num_input_tokens_seen": 217905300, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.33422852, "step": 10116, "time_per_iteration": 4.135500431060791 }, { "auxiliary_loss_clip": 0.01423777, "auxiliary_loss_mlp": 0.00348436, "balance_loss_clip": 1.1504817, "balance_loss_mlp": 0.31303057, "epoch": 0.6082669472418458, "flos": 37413316310400.0, "grad_norm": 31.402995429809234, "language_loss": 0.62902546, "learning_rate": 1.4049009361962138e-06, "loss": 0.64674753, "num_input_tokens_seen": 217927845, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.35400391, "step": 10117, "time_per_iteration": 2.796548366546631 }, { "auxiliary_loss_clip": 0.01419041, "auxiliary_loss_mlp": 0.00352323, "balance_loss_clip": 1.15244973, "balance_loss_mlp": 0.31856337, "epoch": 0.6083270704945137, "flos": 15085534775040.0, "grad_norm": 29.22168270138686, "language_loss": 0.78485942, "learning_rate": 1.4045291259287786e-06, "loss": 0.80257308, "num_input_tokens_seen": 217946145, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.33740234, "step": 10118, "time_per_iteration": 2.756643295288086 }, { "auxiliary_loss_clip": 0.0141003, "auxiliary_loss_mlp": 0.00347184, "balance_loss_clip": 1.1464839, "balance_loss_mlp": 0.31418732, "epoch": 0.6083871937471818, "flos": 20668458804480.0, "grad_norm": 9.819555949579659, "language_loss": 0.8108902, "learning_rate": 1.4041573382416588e-06, "loss": 0.82846236, "num_input_tokens_seen": 217965190, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.33007812, "step": 10119, "time_per_iteration": 2.6960272789001465 }, { "auxiliary_loss_clip": 0.01393577, "auxiliary_loss_mlp": 0.00371094, "balance_loss_clip": 1.1386863, "balance_loss_mlp": 0.33490214, "epoch": 0.6084473169998497, "flos": 21506901045120.0, "grad_norm": 207.52861740714823, "language_loss": 0.7528249, "learning_rate": 1.4037855731489525e-06, "loss": 0.77047157, "num_input_tokens_seen": 217983625, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.36181641, "step": 10120, "time_per_iteration": 2.7375195026397705 }, { "auxiliary_loss_clip": 0.0143036, "auxiliary_loss_mlp": 0.00370424, "balance_loss_clip": 1.16187143, "balance_loss_mlp": 0.33633021, "epoch": 0.6085074402525177, "flos": 26870051710080.0, "grad_norm": 16.91133655401329, "language_loss": 0.81980246, "learning_rate": 1.4034138306647571e-06, "loss": 0.83781034, "num_input_tokens_seen": 218006005, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.34082031, "step": 10121, "time_per_iteration": 2.7258880138397217 }, { "auxiliary_loss_clip": 0.01388383, "auxiliary_loss_mlp": 0.00343341, "balance_loss_clip": 1.13453937, "balance_loss_mlp": 0.31091568, "epoch": 0.6085675635051856, "flos": 10889839952640.0, "grad_norm": 18.95768367400698, "language_loss": 0.87254941, "learning_rate": 1.4030421108031685e-06, "loss": 0.88986671, "num_input_tokens_seen": 218024195, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.32421875, "step": 10122, "time_per_iteration": 2.6450514793395996 }, { "auxiliary_loss_clip": 0.01422955, "auxiliary_loss_mlp": 0.00338767, "balance_loss_clip": 1.15930903, "balance_loss_mlp": 0.30655706, "epoch": 0.6086276867578536, "flos": 34862186707200.0, "grad_norm": 154.21622201150421, "language_loss": 0.62281632, "learning_rate": 1.402670413578284e-06, "loss": 0.64043361, "num_input_tokens_seen": 218047190, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.32275391, "step": 10123, "time_per_iteration": 2.781679153442383 }, { "auxiliary_loss_clip": 0.01404998, "auxiliary_loss_mlp": 0.00355869, "balance_loss_clip": 1.14705181, "balance_loss_mlp": 0.3226577, "epoch": 0.6086878100105215, "flos": 20047706939520.0, "grad_norm": 12.298946292190369, "language_loss": 0.8216821, "learning_rate": 1.4022987390041965e-06, "loss": 0.83929074, "num_input_tokens_seen": 218065945, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.33251953, "step": 10124, "time_per_iteration": 2.6461029052734375 }, { "auxiliary_loss_clip": 0.01406631, "auxiliary_loss_mlp": 0.0033992, "balance_loss_clip": 1.14227962, "balance_loss_mlp": 0.30689925, "epoch": 0.6087479332631895, "flos": 18332469711360.0, "grad_norm": 7.29010548935568, "language_loss": 0.74558353, "learning_rate": 1.4019270870950006e-06, "loss": 0.76304913, "num_input_tokens_seen": 218085285, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.32983398, "step": 10125, "time_per_iteration": 2.653268337249756 }, { "auxiliary_loss_clip": 0.01406044, "auxiliary_loss_mlp": 0.0032416, "balance_loss_clip": 1.14585078, "balance_loss_mlp": 0.29204458, "epoch": 0.6088080565158575, "flos": 24493411399680.0, "grad_norm": 48.60187426945705, "language_loss": 0.83360851, "learning_rate": 1.40155545786479e-06, "loss": 0.8509106, "num_input_tokens_seen": 218104735, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.32104492, "step": 10126, "time_per_iteration": 2.6528241634368896 }, { "auxiliary_loss_clip": 0.0142124, "auxiliary_loss_mlp": 0.00339419, "balance_loss_clip": 1.15060377, "balance_loss_mlp": 0.30441886, "epoch": 0.6088681797685255, "flos": 10269016260480.0, "grad_norm": 43.82410713811943, "language_loss": 0.84347671, "learning_rate": 1.4011838513276558e-06, "loss": 0.86108333, "num_input_tokens_seen": 218121855, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.35009766, "step": 10127, "time_per_iteration": 2.7650816440582275 }, { "auxiliary_loss_clip": 0.01448582, "auxiliary_loss_mlp": 0.00371814, "balance_loss_clip": 1.17093515, "balance_loss_mlp": 0.33767253, "epoch": 0.6089283030211935, "flos": 21973703218560.0, "grad_norm": 16.262045112949433, "language_loss": 0.8162694, "learning_rate": 1.400812267497691e-06, "loss": 0.83447337, "num_input_tokens_seen": 218137325, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.34155273, "step": 10128, "time_per_iteration": 2.6754305362701416 }, { "auxiliary_loss_clip": 0.01409805, "auxiliary_loss_mlp": 0.00348877, "balance_loss_clip": 1.14811122, "balance_loss_mlp": 0.31583184, "epoch": 0.6089884262738614, "flos": 17785191116160.0, "grad_norm": 123.35631374989457, "language_loss": 0.81727278, "learning_rate": 1.4004407063889842e-06, "loss": 0.83485961, "num_input_tokens_seen": 218155530, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.33056641, "step": 10129, "time_per_iteration": 2.6481165885925293 }, { "auxiliary_loss_clip": 0.014071, "auxiliary_loss_mlp": 0.0033319, "balance_loss_clip": 1.14911056, "balance_loss_mlp": 0.29928726, "epoch": 0.6090485495265294, "flos": 36910423946880.0, "grad_norm": 13.882699882564525, "language_loss": 0.71640515, "learning_rate": 1.400069168015626e-06, "loss": 0.7338081, "num_input_tokens_seen": 218182535, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.33935547, "step": 10130, "time_per_iteration": 2.8195807933807373 }, { "auxiliary_loss_clip": 0.01403635, "auxiliary_loss_mlp": 0.0035653, "balance_loss_clip": 1.1485424, "balance_loss_mlp": 0.32558292, "epoch": 0.6091086727791973, "flos": 19899036547200.0, "grad_norm": 157.49363812218576, "language_loss": 0.82820857, "learning_rate": 1.3996976523917054e-06, "loss": 0.84581029, "num_input_tokens_seen": 218201740, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.30932617, "step": 10131, "time_per_iteration": 2.6640162467956543 }, { "auxiliary_loss_clip": 0.01405156, "auxiliary_loss_mlp": 0.00333853, "balance_loss_clip": 1.14548469, "balance_loss_mlp": 0.30096269, "epoch": 0.6091687960318654, "flos": 22163635359360.0, "grad_norm": 8.881889416896676, "language_loss": 0.82807595, "learning_rate": 1.3993261595313093e-06, "loss": 0.84546602, "num_input_tokens_seen": 218219800, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.32897949, "step": 10132, "time_per_iteration": 2.663092613220215 }, { "auxiliary_loss_clip": 0.01391381, "auxiliary_loss_mlp": 0.00343853, "balance_loss_clip": 1.14110363, "balance_loss_mlp": 0.31560084, "epoch": 0.6092289192845333, "flos": 21465280160640.0, "grad_norm": 24.509764475942717, "language_loss": 0.79691362, "learning_rate": 1.3989546894485261e-06, "loss": 0.81426597, "num_input_tokens_seen": 218237585, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.2824707, "step": 10133, "time_per_iteration": 2.7175862789154053 }, { "auxiliary_loss_clip": 0.0143269, "auxiliary_loss_mlp": 0.00341103, "balance_loss_clip": 1.16466022, "balance_loss_mlp": 0.30519736, "epoch": 0.6092890425372013, "flos": 28694924225280.0, "grad_norm": 5.8201236429387935, "language_loss": 0.72110081, "learning_rate": 1.3985832421574414e-06, "loss": 0.73883879, "num_input_tokens_seen": 218258700, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.35913086, "step": 10134, "time_per_iteration": 2.7426187992095947 }, { "auxiliary_loss_clip": 0.01393979, "auxiliary_loss_mlp": 0.00336596, "balance_loss_clip": 1.13930273, "balance_loss_mlp": 0.3051959, "epoch": 0.6093491657898692, "flos": 20813178700800.0, "grad_norm": 13.825067920335895, "language_loss": 0.86201477, "learning_rate": 1.3982118176721397e-06, "loss": 0.8793205, "num_input_tokens_seen": 218275655, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.3137207, "step": 10135, "time_per_iteration": 2.6537997722625732 }, { "auxiliary_loss_clip": 0.0142838, "auxiliary_loss_mlp": 0.00352401, "balance_loss_clip": 1.16305256, "balance_loss_mlp": 0.31849772, "epoch": 0.6094092890425372, "flos": 25446983708160.0, "grad_norm": 8.911015124169474, "language_loss": 0.78631675, "learning_rate": 1.3978404160067069e-06, "loss": 0.80412447, "num_input_tokens_seen": 218295720, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.33862305, "step": 10136, "time_per_iteration": 2.715176582336426 }, { "auxiliary_loss_clip": 0.01413937, "auxiliary_loss_mlp": 0.00336894, "balance_loss_clip": 1.15614426, "balance_loss_mlp": 0.30568486, "epoch": 0.6094694122952051, "flos": 35621265847680.0, "grad_norm": 5.7657809988054884, "language_loss": 0.80138963, "learning_rate": 1.3974690371752253e-06, "loss": 0.81889796, "num_input_tokens_seen": 218316745, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.31201172, "step": 10137, "time_per_iteration": 2.778026580810547 }, { "auxiliary_loss_clip": 0.01395503, "auxiliary_loss_mlp": 0.00354944, "balance_loss_clip": 1.13356817, "balance_loss_mlp": 0.32242334, "epoch": 0.6095295355478731, "flos": 24456962073600.0, "grad_norm": 8.061528840206416, "language_loss": 0.85268211, "learning_rate": 1.3970976811917785e-06, "loss": 0.87018663, "num_input_tokens_seen": 218335385, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.32519531, "step": 10138, "time_per_iteration": 2.717181921005249 }, { "auxiliary_loss_clip": 0.01416003, "auxiliary_loss_mlp": 0.0033503, "balance_loss_clip": 1.15742946, "balance_loss_mlp": 0.30355883, "epoch": 0.6095896588005411, "flos": 15633208419840.0, "grad_norm": 825.5703708869182, "language_loss": 0.86358529, "learning_rate": 1.3967263480704481e-06, "loss": 0.88109559, "num_input_tokens_seen": 218353320, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.31420898, "step": 10139, "time_per_iteration": 2.6477596759796143 }, { "auxiliary_loss_clip": 0.01407606, "auxiliary_loss_mlp": 0.00325972, "balance_loss_clip": 1.14520741, "balance_loss_mlp": 0.29163969, "epoch": 0.6096497820532091, "flos": 15550577182080.0, "grad_norm": 1150.5951866058797, "language_loss": 0.90187693, "learning_rate": 1.396355037825315e-06, "loss": 0.9192127, "num_input_tokens_seen": 218365620, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.34326172, "step": 10140, "time_per_iteration": 2.6439151763916016 }, { "auxiliary_loss_clip": 0.01418463, "auxiliary_loss_mlp": 0.00331638, "balance_loss_clip": 1.15179873, "balance_loss_mlp": 0.29952329, "epoch": 0.6097099053058771, "flos": 24204474397440.0, "grad_norm": 181.3751431473662, "language_loss": 0.82364297, "learning_rate": 1.3959837504704592e-06, "loss": 0.84114397, "num_input_tokens_seen": 218383785, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.32080078, "step": 10141, "time_per_iteration": 2.702707529067993 }, { "auxiliary_loss_clip": 0.01388784, "auxiliary_loss_mlp": 0.00328151, "balance_loss_clip": 1.13326621, "balance_loss_mlp": 0.29667932, "epoch": 0.609770028558545, "flos": 19570238426880.0, "grad_norm": 12.04571873041154, "language_loss": 0.83394068, "learning_rate": 1.3956124860199603e-06, "loss": 0.85111004, "num_input_tokens_seen": 218399055, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.31445312, "step": 10142, "time_per_iteration": 2.7253119945526123 }, { "auxiliary_loss_clip": 0.01403864, "auxiliary_loss_mlp": 0.00321829, "balance_loss_clip": 1.1455009, "balance_loss_mlp": 0.29062033, "epoch": 0.609830151811213, "flos": 23949185460480.0, "grad_norm": 992.1151287864252, "language_loss": 0.84799099, "learning_rate": 1.3952412444878964e-06, "loss": 0.86524796, "num_input_tokens_seen": 218419120, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.31213379, "step": 10143, "time_per_iteration": 2.6774985790252686 }, { "auxiliary_loss_clip": 0.0143022, "auxiliary_loss_mlp": 0.00341272, "balance_loss_clip": 1.16142213, "balance_loss_mlp": 0.30772692, "epoch": 0.6098902750638809, "flos": 16179732829440.0, "grad_norm": 28.03567529922613, "language_loss": 0.82640839, "learning_rate": 1.3948700258883448e-06, "loss": 0.84412336, "num_input_tokens_seen": 218435290, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.33532715, "step": 10144, "time_per_iteration": 2.655001640319824 }, { "auxiliary_loss_clip": 0.01406624, "auxiliary_loss_mlp": 0.00340013, "balance_loss_clip": 1.14639914, "balance_loss_mlp": 0.30789793, "epoch": 0.609950398316549, "flos": 44526393763200.0, "grad_norm": 23.672989946781517, "language_loss": 0.80638891, "learning_rate": 1.394498830235383e-06, "loss": 0.82385528, "num_input_tokens_seen": 218457880, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.32104492, "step": 10145, "time_per_iteration": 2.8796489238739014 }, { "auxiliary_loss_clip": 0.01442488, "auxiliary_loss_mlp": 0.00300655, "balance_loss_clip": 1.17207432, "balance_loss_mlp": 0.26885018, "epoch": 0.6100105215692169, "flos": 23221743223680.0, "grad_norm": 7.7390733760303485, "language_loss": 0.7657882, "learning_rate": 1.3941276575430862e-06, "loss": 0.78321964, "num_input_tokens_seen": 218475930, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.31787109, "step": 10146, "time_per_iteration": 2.6904456615448 }, { "auxiliary_loss_clip": 0.0138927, "auxiliary_loss_mlp": 0.00341038, "balance_loss_clip": 1.13676715, "balance_loss_mlp": 0.31063959, "epoch": 0.6100706448218849, "flos": 15012564295680.0, "grad_norm": 18.798041537233658, "language_loss": 0.84101844, "learning_rate": 1.3937565078255289e-06, "loss": 0.85832155, "num_input_tokens_seen": 218493675, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.30419922, "step": 10147, "time_per_iteration": 2.6950643062591553 }, { "auxiliary_loss_clip": 0.01385472, "auxiliary_loss_mlp": 0.00326429, "balance_loss_clip": 1.13217282, "balance_loss_mlp": 0.29533923, "epoch": 0.6101307680745528, "flos": 19639976682240.0, "grad_norm": 7.573165399977149, "language_loss": 0.85740203, "learning_rate": 1.393385381096786e-06, "loss": 0.87452102, "num_input_tokens_seen": 218511780, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.31091309, "step": 10148, "time_per_iteration": 2.652137517929077 }, { "auxiliary_loss_clip": 0.01415261, "auxiliary_loss_mlp": 0.00350341, "balance_loss_clip": 1.14990628, "balance_loss_mlp": 0.3173435, "epoch": 0.6101908913272208, "flos": 29935566028800.0, "grad_norm": 7.343686786973513, "language_loss": 0.63910842, "learning_rate": 1.39301427737093e-06, "loss": 0.65676445, "num_input_tokens_seen": 218531850, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.33007812, "step": 10149, "time_per_iteration": 4.169723987579346 }, { "auxiliary_loss_clip": 0.01437167, "auxiliary_loss_mlp": 0.00317269, "balance_loss_clip": 1.1712451, "balance_loss_mlp": 0.28517812, "epoch": 0.6102510145798887, "flos": 21798639308160.0, "grad_norm": 8.897404324538678, "language_loss": 0.86660653, "learning_rate": 1.3926431966620333e-06, "loss": 0.88415092, "num_input_tokens_seen": 218551245, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.32104492, "step": 10150, "time_per_iteration": 4.109920263290405 }, { "auxiliary_loss_clip": 0.01413246, "auxiliary_loss_mlp": 0.00297736, "balance_loss_clip": 1.15008068, "balance_loss_mlp": 0.26872051, "epoch": 0.6103111378325567, "flos": 20706129192960.0, "grad_norm": 6.092779973094965, "language_loss": 0.75045216, "learning_rate": 1.3922721389841684e-06, "loss": 0.76756203, "num_input_tokens_seen": 218571365, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.2902832, "step": 10151, "time_per_iteration": 2.648402690887451 }, { "auxiliary_loss_clip": 0.01421744, "auxiliary_loss_mlp": 0.0033584, "balance_loss_clip": 1.15826344, "balance_loss_mlp": 0.30415377, "epoch": 0.6103712610852247, "flos": 29381643417600.0, "grad_norm": 13.128318821377084, "language_loss": 0.76986068, "learning_rate": 1.3919011043514036e-06, "loss": 0.78743649, "num_input_tokens_seen": 218588315, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.31689453, "step": 10152, "time_per_iteration": 2.698082447052002 }, { "auxiliary_loss_clip": 0.01382514, "auxiliary_loss_mlp": 0.00346353, "balance_loss_clip": 1.13014603, "balance_loss_mlp": 0.31557363, "epoch": 0.6104313843378927, "flos": 20813035046400.0, "grad_norm": 394.00079057746973, "language_loss": 0.83327156, "learning_rate": 1.391530092777811e-06, "loss": 0.85056025, "num_input_tokens_seen": 218605940, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.30749512, "step": 10153, "time_per_iteration": 2.776413679122925 }, { "auxiliary_loss_clip": 0.01394079, "auxiliary_loss_mlp": 0.00342017, "balance_loss_clip": 1.1397922, "balance_loss_mlp": 0.30956799, "epoch": 0.6104915075905607, "flos": 26578457101440.0, "grad_norm": 16.08697004663068, "language_loss": 0.86527348, "learning_rate": 1.3911591042774573e-06, "loss": 0.8826344, "num_input_tokens_seen": 218626100, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.32421875, "step": 10154, "time_per_iteration": 4.103141784667969 }, { "auxiliary_loss_clip": 0.01383251, "auxiliary_loss_mlp": 0.00318986, "balance_loss_clip": 1.13071609, "balance_loss_mlp": 0.28888577, "epoch": 0.6105516308432286, "flos": 23915788790400.0, "grad_norm": 20.60544685304977, "language_loss": 0.76634461, "learning_rate": 1.3907881388644116e-06, "loss": 0.78336698, "num_input_tokens_seen": 218645060, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.3013916, "step": 10155, "time_per_iteration": 2.7075273990631104 }, { "auxiliary_loss_clip": 0.01401665, "auxiliary_loss_mlp": 0.00322315, "balance_loss_clip": 1.14329481, "balance_loss_mlp": 0.29050988, "epoch": 0.6106117540958966, "flos": 31577365900800.0, "grad_norm": 14.23682051616567, "language_loss": 0.77256221, "learning_rate": 1.3904171965527413e-06, "loss": 0.78980201, "num_input_tokens_seen": 218667690, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.31811523, "step": 10156, "time_per_iteration": 2.7359888553619385 }, { "auxiliary_loss_clip": 0.01388633, "auxiliary_loss_mlp": 0.00325296, "balance_loss_clip": 1.13702869, "balance_loss_mlp": 0.29570806, "epoch": 0.6106718773485645, "flos": 19608160210560.0, "grad_norm": 2057.5861620480755, "language_loss": 0.73719037, "learning_rate": 1.3900462773565114e-06, "loss": 0.75432968, "num_input_tokens_seen": 218687505, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.29589844, "step": 10157, "time_per_iteration": 2.698759078979492 }, { "auxiliary_loss_clip": 0.01406119, "auxiliary_loss_mlp": 0.00334313, "balance_loss_clip": 1.14585304, "balance_loss_mlp": 0.30291301, "epoch": 0.6107320006012326, "flos": 17123895774720.0, "grad_norm": 21.622986220576117, "language_loss": 0.81859446, "learning_rate": 1.3896753812897877e-06, "loss": 0.83599877, "num_input_tokens_seen": 218705315, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.31420898, "step": 10158, "time_per_iteration": 4.125421047210693 }, { "auxiliary_loss_clip": 0.01387707, "auxiliary_loss_mlp": 0.00311785, "balance_loss_clip": 1.13374531, "balance_loss_mlp": 0.28198272, "epoch": 0.6107921238539005, "flos": 30148228500480.0, "grad_norm": 15.22769154299476, "language_loss": 0.77387702, "learning_rate": 1.389304508366635e-06, "loss": 0.79087198, "num_input_tokens_seen": 218725735, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.29785156, "step": 10159, "time_per_iteration": 2.7974021434783936 }, { "auxiliary_loss_clip": 0.01411477, "auxiliary_loss_mlp": 0.00332029, "balance_loss_clip": 1.14834106, "balance_loss_mlp": 0.29996198, "epoch": 0.6108522471065685, "flos": 18440273404800.0, "grad_norm": 2.5990064601297758, "language_loss": 0.85611284, "learning_rate": 1.3889336586011167e-06, "loss": 0.87354791, "num_input_tokens_seen": 218743215, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.32080078, "step": 10160, "time_per_iteration": 2.6853291988372803 }, { "auxiliary_loss_clip": 0.01504664, "auxiliary_loss_mlp": 0.00146015, "balance_loss_clip": 1.2857604, "balance_loss_mlp": 0.13695467, "epoch": 0.6109123703592364, "flos": 64135454791680.0, "grad_norm": 0.8058367495320924, "language_loss": 0.61166143, "learning_rate": 1.388562832007295e-06, "loss": 0.62816823, "num_input_tokens_seen": 218806440, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.09082031, "step": 10161, "time_per_iteration": 3.3114585876464844 }, { "auxiliary_loss_clip": 0.01392835, "auxiliary_loss_mlp": 0.00345508, "balance_loss_clip": 1.13549519, "balance_loss_mlp": 0.31141371, "epoch": 0.6109724936119044, "flos": 20667848273280.0, "grad_norm": 21.983055209717882, "language_loss": 0.8387239, "learning_rate": 1.3881920285992324e-06, "loss": 0.85610729, "num_input_tokens_seen": 218825720, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.34082031, "step": 10162, "time_per_iteration": 2.720425605773926 }, { "auxiliary_loss_clip": 0.01402221, "auxiliary_loss_mlp": 0.00337803, "balance_loss_clip": 1.14472413, "balance_loss_mlp": 0.30576, "epoch": 0.6110326168645723, "flos": 31351882273920.0, "grad_norm": 16.10582637719134, "language_loss": 0.77684319, "learning_rate": 1.3878212483909888e-06, "loss": 0.79424345, "num_input_tokens_seen": 218847735, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.32080078, "step": 10163, "time_per_iteration": 2.728424310684204 }, { "auxiliary_loss_clip": 0.01392289, "auxiliary_loss_mlp": 0.00319579, "balance_loss_clip": 1.13942516, "balance_loss_mlp": 0.28797624, "epoch": 0.6110927401172404, "flos": 25003378742400.0, "grad_norm": 14.08357147298663, "language_loss": 0.66350484, "learning_rate": 1.387450491396625e-06, "loss": 0.68062347, "num_input_tokens_seen": 218866585, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.3157959, "step": 10164, "time_per_iteration": 2.673356533050537 }, { "auxiliary_loss_clip": 0.01387773, "auxiliary_loss_mlp": 0.00316662, "balance_loss_clip": 1.13360381, "balance_loss_mlp": 0.28712207, "epoch": 0.6111528633699083, "flos": 26248078782720.0, "grad_norm": 6.007281713504845, "language_loss": 0.80657876, "learning_rate": 1.3870797576302003e-06, "loss": 0.82362306, "num_input_tokens_seen": 218885560, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.29541016, "step": 10165, "time_per_iteration": 2.6646978855133057 }, { "auxiliary_loss_clip": 0.01439314, "auxiliary_loss_mlp": 0.00290513, "balance_loss_clip": 1.17762589, "balance_loss_mlp": 0.26130676, "epoch": 0.6112129866225763, "flos": 22382474970240.0, "grad_norm": 5159.176398691883, "language_loss": 0.85352325, "learning_rate": 1.3867090471057719e-06, "loss": 0.87082154, "num_input_tokens_seen": 218905055, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.29187012, "step": 10166, "time_per_iteration": 2.6702864170074463 }, { "auxiliary_loss_clip": 0.01385, "auxiliary_loss_mlp": 0.00332517, "balance_loss_clip": 1.13142872, "balance_loss_mlp": 0.30121264, "epoch": 0.6112731098752443, "flos": 25227892702080.0, "grad_norm": 8.233390608617913, "language_loss": 0.75700086, "learning_rate": 1.3863383598373987e-06, "loss": 0.774176, "num_input_tokens_seen": 218924030, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.31274414, "step": 10167, "time_per_iteration": 2.694436550140381 }, { "auxiliary_loss_clip": 0.01400558, "auxiliary_loss_mlp": 0.00308573, "balance_loss_clip": 1.14807463, "balance_loss_mlp": 0.27984339, "epoch": 0.6113332331279122, "flos": 22893160584960.0, "grad_norm": 95.12774586983802, "language_loss": 0.84838176, "learning_rate": 1.3859676958391364e-06, "loss": 0.86547309, "num_input_tokens_seen": 218943750, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.28759766, "step": 10168, "time_per_iteration": 2.6640512943267822 }, { "auxiliary_loss_clip": 0.01433216, "auxiliary_loss_mlp": 0.00342832, "balance_loss_clip": 1.15721452, "balance_loss_mlp": 0.3085711, "epoch": 0.6113933563805802, "flos": 18620329305600.0, "grad_norm": 10.04455279760714, "language_loss": 0.94177383, "learning_rate": 1.3855970551250398e-06, "loss": 0.95953429, "num_input_tokens_seen": 218957585, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.34228516, "step": 10169, "time_per_iteration": 2.6380956172943115 }, { "auxiliary_loss_clip": 0.01371984, "auxiliary_loss_mlp": 0.00305695, "balance_loss_clip": 1.12437296, "balance_loss_mlp": 0.2768226, "epoch": 0.6114534796332481, "flos": 41866275317760.0, "grad_norm": 21.76900538953039, "language_loss": 0.84999937, "learning_rate": 1.3852264377091652e-06, "loss": 0.86677611, "num_input_tokens_seen": 218980025, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.28857422, "step": 10170, "time_per_iteration": 2.851030111312866 }, { "auxiliary_loss_clip": 0.01395845, "auxiliary_loss_mlp": 0.00336631, "balance_loss_clip": 1.13515723, "balance_loss_mlp": 0.30277592, "epoch": 0.6115136028859162, "flos": 21908454163200.0, "grad_norm": 178.02249484449067, "language_loss": 0.75650293, "learning_rate": 1.3848558436055651e-06, "loss": 0.77382767, "num_input_tokens_seen": 218998200, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.33862305, "step": 10171, "time_per_iteration": 2.710383415222168 }, { "auxiliary_loss_clip": 0.01417686, "auxiliary_loss_mlp": 0.00327215, "balance_loss_clip": 1.15005875, "balance_loss_mlp": 0.29498059, "epoch": 0.6115737261385841, "flos": 28804846821120.0, "grad_norm": 3.1090078868166375, "language_loss": 0.85728896, "learning_rate": 1.3844852728282934e-06, "loss": 0.87473798, "num_input_tokens_seen": 219017910, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.32226562, "step": 10172, "time_per_iteration": 2.752166748046875 }, { "auxiliary_loss_clip": 0.01402422, "auxiliary_loss_mlp": 0.00342478, "balance_loss_clip": 1.13884115, "balance_loss_mlp": 0.3114596, "epoch": 0.6116338493912521, "flos": 21251468453760.0, "grad_norm": 9.099355395337989, "language_loss": 0.73096067, "learning_rate": 1.3841147253914022e-06, "loss": 0.74840969, "num_input_tokens_seen": 219037730, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.30981445, "step": 10173, "time_per_iteration": 2.793596029281616 }, { "auxiliary_loss_clip": 0.01394196, "auxiliary_loss_mlp": 0.00339462, "balance_loss_clip": 1.13551712, "balance_loss_mlp": 0.30753809, "epoch": 0.61169397264392, "flos": 17530189488000.0, "grad_norm": 61.17207488840093, "language_loss": 0.64096582, "learning_rate": 1.3837442013089416e-06, "loss": 0.65830243, "num_input_tokens_seen": 219056755, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.31958008, "step": 10174, "time_per_iteration": 2.7037415504455566 }, { "auxiliary_loss_clip": 0.0141137, "auxiliary_loss_mlp": 0.00326254, "balance_loss_clip": 1.15074873, "balance_loss_mlp": 0.29450858, "epoch": 0.611754095896588, "flos": 23951555758080.0, "grad_norm": 305.5275609759678, "language_loss": 0.71752936, "learning_rate": 1.3833737005949628e-06, "loss": 0.7349056, "num_input_tokens_seen": 219076985, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.31750488, "step": 10175, "time_per_iteration": 2.6987216472625732 }, { "auxiliary_loss_clip": 0.01372154, "auxiliary_loss_mlp": 0.00318909, "balance_loss_clip": 1.12543678, "balance_loss_mlp": 0.28910619, "epoch": 0.6118142191492559, "flos": 25994872834560.0, "grad_norm": 13.208036683213807, "language_loss": 0.89972198, "learning_rate": 1.3830032232635154e-06, "loss": 0.91663259, "num_input_tokens_seen": 219096050, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.29785156, "step": 10176, "time_per_iteration": 2.695727825164795 }, { "auxiliary_loss_clip": 0.01399556, "auxiliary_loss_mlp": 0.00319446, "balance_loss_clip": 1.14340019, "balance_loss_mlp": 0.28776032, "epoch": 0.611874342401924, "flos": 24603190341120.0, "grad_norm": 120.91008113790944, "language_loss": 0.83466119, "learning_rate": 1.3826327693286474e-06, "loss": 0.85185122, "num_input_tokens_seen": 219112665, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.31689453, "step": 10177, "time_per_iteration": 2.6574885845184326 }, { "auxiliary_loss_clip": 0.01412148, "auxiliary_loss_mlp": 0.00322645, "balance_loss_clip": 1.15259647, "balance_loss_mlp": 0.29103082, "epoch": 0.6119344656545919, "flos": 15887132640000.0, "grad_norm": 11.108774757286444, "language_loss": 0.83411562, "learning_rate": 1.3822623388044065e-06, "loss": 0.85146356, "num_input_tokens_seen": 219129120, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.31640625, "step": 10178, "time_per_iteration": 2.728905439376831 }, { "auxiliary_loss_clip": 0.01402324, "auxiliary_loss_mlp": 0.00326735, "balance_loss_clip": 1.14457607, "balance_loss_mlp": 0.29619318, "epoch": 0.6119945889072599, "flos": 21652877917440.0, "grad_norm": 43.962402011027905, "language_loss": 0.75148714, "learning_rate": 1.3818919317048402e-06, "loss": 0.76877779, "num_input_tokens_seen": 219148950, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.30541992, "step": 10179, "time_per_iteration": 2.66497802734375 }, { "auxiliary_loss_clip": 0.01374543, "auxiliary_loss_mlp": 0.00328511, "balance_loss_clip": 1.12268043, "balance_loss_mlp": 0.29718244, "epoch": 0.6120547121599279, "flos": 13772533023360.0, "grad_norm": 9.76193333277153, "language_loss": 0.91107917, "learning_rate": 1.3815215480439933e-06, "loss": 0.92810971, "num_input_tokens_seen": 219165585, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.31323242, "step": 10180, "time_per_iteration": 2.636752128601074 }, { "auxiliary_loss_clip": 0.01403594, "auxiliary_loss_mlp": 0.00332982, "balance_loss_clip": 1.1463356, "balance_loss_mlp": 0.29943642, "epoch": 0.6121148354125958, "flos": 20079164275200.0, "grad_norm": 100.37943296545427, "language_loss": 0.83140588, "learning_rate": 1.3811511878359113e-06, "loss": 0.84877169, "num_input_tokens_seen": 219183280, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.33544922, "step": 10181, "time_per_iteration": 2.666895627975464 }, { "auxiliary_loss_clip": 0.01406338, "auxiliary_loss_mlp": 0.0031145, "balance_loss_clip": 1.14437795, "balance_loss_mlp": 0.28012139, "epoch": 0.6121749586652638, "flos": 13471313569920.0, "grad_norm": 17.025039314181704, "language_loss": 0.88203812, "learning_rate": 1.3807808510946384e-06, "loss": 0.899216, "num_input_tokens_seen": 219197200, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.31323242, "step": 10182, "time_per_iteration": 2.6499643325805664 }, { "auxiliary_loss_clip": 0.01376639, "auxiliary_loss_mlp": 0.00287353, "balance_loss_clip": 1.12909746, "balance_loss_mlp": 0.25889724, "epoch": 0.6122350819179317, "flos": 20120533764480.0, "grad_norm": 4.3045141785300105, "language_loss": 0.87879872, "learning_rate": 1.3804105378342177e-06, "loss": 0.89543861, "num_input_tokens_seen": 219216825, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.28430176, "step": 10183, "time_per_iteration": 2.6414413452148438 }, { "auxiliary_loss_clip": 0.01472288, "auxiliary_loss_mlp": 0.00068482, "balance_loss_clip": 1.26163709, "balance_loss_mlp": 0.05856404, "epoch": 0.6122952051705998, "flos": 65429242767360.0, "grad_norm": 0.7060268937869889, "language_loss": 0.61983848, "learning_rate": 1.3800402480686914e-06, "loss": 0.63524616, "num_input_tokens_seen": 219283795, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.09912109, "step": 10184, "time_per_iteration": 3.3234522342681885 }, { "auxiliary_loss_clip": 0.01391733, "auxiliary_loss_mlp": 0.00311602, "balance_loss_clip": 1.13920689, "balance_loss_mlp": 0.28214517, "epoch": 0.6123553284232677, "flos": 20376253664640.0, "grad_norm": 36.10985644557028, "language_loss": 0.90229249, "learning_rate": 1.379669981812101e-06, "loss": 0.91932583, "num_input_tokens_seen": 219302385, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.29455566, "step": 10185, "time_per_iteration": 2.6838324069976807 }, { "auxiliary_loss_clip": 0.01420831, "auxiliary_loss_mlp": 0.00313528, "balance_loss_clip": 1.15670311, "balance_loss_mlp": 0.28134179, "epoch": 0.6124154516759357, "flos": 23987645948160.0, "grad_norm": 7.497138232067033, "language_loss": 0.82583523, "learning_rate": 1.3792997390784868e-06, "loss": 0.84317881, "num_input_tokens_seen": 219319765, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.32202148, "step": 10186, "time_per_iteration": 2.6775832176208496 }, { "auxiliary_loss_clip": 0.01366989, "auxiliary_loss_mlp": 0.0031782, "balance_loss_clip": 1.12242603, "balance_loss_mlp": 0.28880483, "epoch": 0.6124755749286036, "flos": 21468799693440.0, "grad_norm": 40.40421837782595, "language_loss": 0.83104181, "learning_rate": 1.3789295198818895e-06, "loss": 0.8478899, "num_input_tokens_seen": 219337440, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.29016113, "step": 10187, "time_per_iteration": 2.6952874660491943 }, { "auxiliary_loss_clip": 0.01407499, "auxiliary_loss_mlp": 0.00317177, "balance_loss_clip": 1.14778745, "balance_loss_mlp": 0.28696954, "epoch": 0.6125356981812716, "flos": 23879195809920.0, "grad_norm": 47.82696632771355, "language_loss": 0.87675321, "learning_rate": 1.3785593242363462e-06, "loss": 0.89399993, "num_input_tokens_seen": 219357525, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.30200195, "step": 10188, "time_per_iteration": 2.70984148979187 }, { "auxiliary_loss_clip": 0.01389049, "auxiliary_loss_mlp": 0.00325026, "balance_loss_clip": 1.1384604, "balance_loss_mlp": 0.29334044, "epoch": 0.6125958214339395, "flos": 14425604150400.0, "grad_norm": 19.979920938196226, "language_loss": 0.83503169, "learning_rate": 1.378189152155896e-06, "loss": 0.85217243, "num_input_tokens_seen": 219374855, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.31689453, "step": 10189, "time_per_iteration": 2.68572998046875 }, { "auxiliary_loss_clip": 0.01402527, "auxiliary_loss_mlp": 0.00324729, "balance_loss_clip": 1.14764977, "balance_loss_mlp": 0.29439047, "epoch": 0.6126559446866076, "flos": 23259090389760.0, "grad_norm": 76.68403760737405, "language_loss": 0.78213727, "learning_rate": 1.3778190036545758e-06, "loss": 0.79940987, "num_input_tokens_seen": 219394740, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.30334473, "step": 10190, "time_per_iteration": 2.6652724742889404 }, { "auxiliary_loss_clip": 0.01404074, "auxiliary_loss_mlp": 0.00307764, "balance_loss_clip": 1.14471745, "balance_loss_mlp": 0.27700794, "epoch": 0.6127160679392755, "flos": 26864808324480.0, "grad_norm": 20.248031896861217, "language_loss": 0.74499094, "learning_rate": 1.3774488787464207e-06, "loss": 0.76210928, "num_input_tokens_seen": 219413755, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.30761719, "step": 10191, "time_per_iteration": 4.118507385253906 }, { "auxiliary_loss_clip": 0.01415043, "auxiliary_loss_mlp": 0.00304729, "balance_loss_clip": 1.14872169, "balance_loss_mlp": 0.27220884, "epoch": 0.6127761911919435, "flos": 26396425952640.0, "grad_norm": 215.91755173146439, "language_loss": 0.82921231, "learning_rate": 1.377078777445467e-06, "loss": 0.84641004, "num_input_tokens_seen": 219433560, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.32519531, "step": 10192, "time_per_iteration": 4.1279003620147705 }, { "auxiliary_loss_clip": 0.01414912, "auxiliary_loss_mlp": 0.00316177, "balance_loss_clip": 1.1577605, "balance_loss_mlp": 0.28556389, "epoch": 0.6128363144446115, "flos": 22634747164800.0, "grad_norm": 10.987532618968952, "language_loss": 0.91648179, "learning_rate": 1.3767086997657478e-06, "loss": 0.93379271, "num_input_tokens_seen": 219452640, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.3059082, "step": 10193, "time_per_iteration": 2.651946544647217 }, { "auxiliary_loss_clip": 0.01408802, "auxiliary_loss_mlp": 0.00309728, "balance_loss_clip": 1.15031171, "balance_loss_mlp": 0.2798067, "epoch": 0.6128964376972794, "flos": 26759051706240.0, "grad_norm": 5.121694639487218, "language_loss": 0.78058589, "learning_rate": 1.3763386457212979e-06, "loss": 0.79777122, "num_input_tokens_seen": 219468585, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.29931641, "step": 10194, "time_per_iteration": 2.689164400100708 }, { "auxiliary_loss_clip": 0.01482105, "auxiliary_loss_mlp": 0.00109229, "balance_loss_clip": 1.27324152, "balance_loss_mlp": 0.10002613, "epoch": 0.6129565609499474, "flos": 65567929178880.0, "grad_norm": 0.8153659682324801, "language_loss": 0.58600307, "learning_rate": 1.375968615326149e-06, "loss": 0.60191637, "num_input_tokens_seen": 219523015, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.09179688, "step": 10195, "time_per_iteration": 2.9926364421844482 }, { "auxiliary_loss_clip": 0.01407985, "auxiliary_loss_mlp": 0.00305516, "balance_loss_clip": 1.15236664, "balance_loss_mlp": 0.27435449, "epoch": 0.6130166842026153, "flos": 16362087200640.0, "grad_norm": 30.182693182359465, "language_loss": 0.76531321, "learning_rate": 1.3755986085943324e-06, "loss": 0.78244817, "num_input_tokens_seen": 219539980, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.31152344, "step": 10196, "time_per_iteration": 4.09409236907959 }, { "auxiliary_loss_clip": 0.01413547, "auxiliary_loss_mlp": 0.0032627, "balance_loss_clip": 1.15582013, "balance_loss_mlp": 0.29581243, "epoch": 0.6130768074552834, "flos": 23652455207040.0, "grad_norm": 39.7341049056273, "language_loss": 0.77832603, "learning_rate": 1.3752286255398788e-06, "loss": 0.79572415, "num_input_tokens_seen": 219556980, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.30456543, "step": 10197, "time_per_iteration": 2.70631742477417 }, { "auxiliary_loss_clip": 0.01413531, "auxiliary_loss_mlp": 0.00313255, "balance_loss_clip": 1.15420318, "balance_loss_mlp": 0.2823084, "epoch": 0.6131369307079513, "flos": 20047455544320.0, "grad_norm": 32.023290801156, "language_loss": 0.85849857, "learning_rate": 1.3748586661768191e-06, "loss": 0.8757664, "num_input_tokens_seen": 219576410, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.30957031, "step": 10198, "time_per_iteration": 2.7635929584503174 }, { "auxiliary_loss_clip": 0.01401386, "auxiliary_loss_mlp": 0.0033593, "balance_loss_clip": 1.1452744, "balance_loss_mlp": 0.30309987, "epoch": 0.6131970539606193, "flos": 22672166158080.0, "grad_norm": 22.603215981633642, "language_loss": 0.77934039, "learning_rate": 1.374488730519181e-06, "loss": 0.79671353, "num_input_tokens_seen": 219597180, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.32836914, "step": 10199, "time_per_iteration": 2.741129159927368 }, { "auxiliary_loss_clip": 0.01420889, "auxiliary_loss_mlp": 0.00362574, "balance_loss_clip": 1.15520036, "balance_loss_mlp": 0.32857585, "epoch": 0.6132571772132872, "flos": 26870913636480.0, "grad_norm": 5.279650747972043, "language_loss": 0.69432974, "learning_rate": 1.374118818580993e-06, "loss": 0.7121644, "num_input_tokens_seen": 219617630, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.34008789, "step": 10200, "time_per_iteration": 4.195077657699585 }, { "auxiliary_loss_clip": 0.01402688, "auxiliary_loss_mlp": 0.00311545, "balance_loss_clip": 1.14921415, "balance_loss_mlp": 0.28126615, "epoch": 0.6133173004659552, "flos": 22892657794560.0, "grad_norm": 31.65163013511999, "language_loss": 0.7593872, "learning_rate": 1.3737489303762822e-06, "loss": 0.77652955, "num_input_tokens_seen": 219637025, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.30273438, "step": 10201, "time_per_iteration": 2.72731614112854 }, { "auxiliary_loss_clip": 0.01406002, "auxiliary_loss_mlp": 0.0032966, "balance_loss_clip": 1.14888859, "balance_loss_mlp": 0.29694897, "epoch": 0.6133774237186231, "flos": 20485098852480.0, "grad_norm": 358.3052255003385, "language_loss": 0.90342724, "learning_rate": 1.3733790659190746e-06, "loss": 0.92078388, "num_input_tokens_seen": 219656625, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.32714844, "step": 10202, "time_per_iteration": 2.7551000118255615 }, { "auxiliary_loss_clip": 0.01488896, "auxiliary_loss_mlp": 0.00085086, "balance_loss_clip": 1.27357364, "balance_loss_mlp": 0.07612149, "epoch": 0.6134375469712912, "flos": 69413065217280.0, "grad_norm": 0.8813776463294625, "language_loss": 0.66673565, "learning_rate": 1.3730092252233953e-06, "loss": 0.68247545, "num_input_tokens_seen": 219718090, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.08984375, "step": 10203, "time_per_iteration": 3.2225589752197266 }, { "auxiliary_loss_clip": 0.013856, "auxiliary_loss_mlp": 0.00302976, "balance_loss_clip": 1.13506126, "balance_loss_mlp": 0.27400821, "epoch": 0.6134976702239591, "flos": 41281541815680.0, "grad_norm": 3.5918123458233477, "language_loss": 0.67286181, "learning_rate": 1.37263940830327e-06, "loss": 0.68974757, "num_input_tokens_seen": 219740100, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.28955078, "step": 10204, "time_per_iteration": 2.8573427200317383 }, { "auxiliary_loss_clip": 0.01400168, "auxiliary_loss_mlp": 0.00277333, "balance_loss_clip": 1.14650261, "balance_loss_mlp": 0.24908018, "epoch": 0.6135577934766271, "flos": 22346600261760.0, "grad_norm": 6.97235840907787, "language_loss": 0.78757811, "learning_rate": 1.3722696151727204e-06, "loss": 0.80435312, "num_input_tokens_seen": 219761225, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.28283691, "step": 10205, "time_per_iteration": 2.7259349822998047 }, { "auxiliary_loss_clip": 0.01394998, "auxiliary_loss_mlp": 0.00311779, "balance_loss_clip": 1.1420083, "balance_loss_mlp": 0.2827996, "epoch": 0.6136179167292951, "flos": 23728155120000.0, "grad_norm": 32.98834682429746, "language_loss": 0.82571387, "learning_rate": 1.3718998458457701e-06, "loss": 0.84278166, "num_input_tokens_seen": 219780085, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.28979492, "step": 10206, "time_per_iteration": 2.680375814437866 }, { "auxiliary_loss_clip": 0.0138903, "auxiliary_loss_mlp": 0.00307091, "balance_loss_clip": 1.13589489, "balance_loss_mlp": 0.27578691, "epoch": 0.613678039981963, "flos": 26024678144640.0, "grad_norm": 50.89342108462829, "language_loss": 0.82410944, "learning_rate": 1.3715301003364407e-06, "loss": 0.84107059, "num_input_tokens_seen": 219797895, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.31298828, "step": 10207, "time_per_iteration": 2.7162060737609863 }, { "auxiliary_loss_clip": 0.01397691, "auxiliary_loss_mlp": 0.00333475, "balance_loss_clip": 1.14297175, "balance_loss_mlp": 0.30187249, "epoch": 0.613738163234631, "flos": 9859957200000.0, "grad_norm": 8.690659942166308, "language_loss": 0.89909399, "learning_rate": 1.3711603786587525e-06, "loss": 0.91640562, "num_input_tokens_seen": 219811295, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.3157959, "step": 10208, "time_per_iteration": 2.6198158264160156 }, { "auxiliary_loss_clip": 0.01412777, "auxiliary_loss_mlp": 0.00322341, "balance_loss_clip": 1.14839792, "balance_loss_mlp": 0.28979695, "epoch": 0.613798286487299, "flos": 33182070001920.0, "grad_norm": 89.01791566024771, "language_loss": 0.79643828, "learning_rate": 1.3707906808267265e-06, "loss": 0.81378949, "num_input_tokens_seen": 219832735, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.32543945, "step": 10209, "time_per_iteration": 2.827317476272583 }, { "auxiliary_loss_clip": 0.01374173, "auxiliary_loss_mlp": 0.00284529, "balance_loss_clip": 1.12344694, "balance_loss_mlp": 0.25614551, "epoch": 0.613858409739967, "flos": 25627901535360.0, "grad_norm": 11.17945399522078, "language_loss": 0.80704725, "learning_rate": 1.37042100685438e-06, "loss": 0.82363427, "num_input_tokens_seen": 219852755, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.28381348, "step": 10210, "time_per_iteration": 2.7028067111968994 }, { "auxiliary_loss_clip": 0.01474995, "auxiliary_loss_mlp": 0.0005603, "balance_loss_clip": 1.26476407, "balance_loss_mlp": 0.04854394, "epoch": 0.6139185329926349, "flos": 67192313932800.0, "grad_norm": 0.8525612326084457, "language_loss": 0.64536607, "learning_rate": 1.3700513567557325e-06, "loss": 0.66067624, "num_input_tokens_seen": 219922785, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.07470703, "step": 10211, "time_per_iteration": 3.334641933441162 }, { "auxiliary_loss_clip": 0.01395349, "auxiliary_loss_mlp": 0.00306556, "balance_loss_clip": 1.14203405, "balance_loss_mlp": 0.27727863, "epoch": 0.6139786562453029, "flos": 21543637680000.0, "grad_norm": 2.9019121054602626, "language_loss": 0.81712282, "learning_rate": 1.369681730544801e-06, "loss": 0.83414185, "num_input_tokens_seen": 219942215, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.29321289, "step": 10212, "time_per_iteration": 2.6846156120300293 }, { "auxiliary_loss_clip": 0.01393795, "auxiliary_loss_mlp": 0.00286489, "balance_loss_clip": 1.14456344, "balance_loss_mlp": 0.25733086, "epoch": 0.6140387794979708, "flos": 26068489758720.0, "grad_norm": 11.928191321990656, "language_loss": 0.7992453, "learning_rate": 1.3693121282356009e-06, "loss": 0.81604815, "num_input_tokens_seen": 219963830, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.29174805, "step": 10213, "time_per_iteration": 2.731318235397339 }, { "auxiliary_loss_clip": 0.01424394, "auxiliary_loss_mlp": 0.0032757, "balance_loss_clip": 1.15732121, "balance_loss_mlp": 0.29526412, "epoch": 0.6140989027506388, "flos": 23694614795520.0, "grad_norm": 5.183432008459838, "language_loss": 0.81250525, "learning_rate": 1.3689425498421483e-06, "loss": 0.83002484, "num_input_tokens_seen": 219983815, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.32348633, "step": 10214, "time_per_iteration": 2.7029788494110107 }, { "auxiliary_loss_clip": 0.01403749, "auxiliary_loss_mlp": 0.00294793, "balance_loss_clip": 1.14701986, "balance_loss_mlp": 0.2654677, "epoch": 0.6141590260033067, "flos": 22231721589120.0, "grad_norm": 2.0699245643812794, "language_loss": 0.80723977, "learning_rate": 1.3685729953784572e-06, "loss": 0.82422513, "num_input_tokens_seen": 220003165, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.29309082, "step": 10215, "time_per_iteration": 2.7386810779571533 }, { "auxiliary_loss_clip": 0.01395768, "auxiliary_loss_mlp": 0.00312357, "balance_loss_clip": 1.14141893, "balance_loss_mlp": 0.28172037, "epoch": 0.6142191492559748, "flos": 23871653953920.0, "grad_norm": 13.015180909257753, "language_loss": 0.86385763, "learning_rate": 1.368203464858542e-06, "loss": 0.88093889, "num_input_tokens_seen": 220021015, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.30639648, "step": 10216, "time_per_iteration": 2.7087652683258057 }, { "auxiliary_loss_clip": 0.01402498, "auxiliary_loss_mlp": 0.00331425, "balance_loss_clip": 1.14583206, "balance_loss_mlp": 0.29730716, "epoch": 0.6142792725086427, "flos": 15042513260160.0, "grad_norm": 14.836003221330513, "language_loss": 0.8555876, "learning_rate": 1.3678339582964147e-06, "loss": 0.87292683, "num_input_tokens_seen": 220035780, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.34106445, "step": 10217, "time_per_iteration": 2.633204460144043 }, { "auxiliary_loss_clip": 0.01406709, "auxiliary_loss_mlp": 0.00333101, "balance_loss_clip": 1.14774239, "balance_loss_mlp": 0.29915029, "epoch": 0.6143393957613107, "flos": 23330947547520.0, "grad_norm": 7.451417629783373, "language_loss": 0.87003624, "learning_rate": 1.3674644757060865e-06, "loss": 0.88743436, "num_input_tokens_seen": 220054280, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.33984375, "step": 10218, "time_per_iteration": 2.667884588241577 }, { "auxiliary_loss_clip": 0.01402811, "auxiliary_loss_mlp": 0.00312001, "balance_loss_clip": 1.14686739, "balance_loss_mlp": 0.28255653, "epoch": 0.6143995190139786, "flos": 20117086058880.0, "grad_norm": 21.823206040943585, "language_loss": 0.87114942, "learning_rate": 1.367095017101569e-06, "loss": 0.88829756, "num_input_tokens_seen": 220074120, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.29467773, "step": 10219, "time_per_iteration": 2.7967169284820557 }, { "auxiliary_loss_clip": 0.01389444, "auxiliary_loss_mlp": 0.00288302, "balance_loss_clip": 1.13724744, "balance_loss_mlp": 0.26028833, "epoch": 0.6144596422666466, "flos": 42303559489920.0, "grad_norm": 1693.8833671428456, "language_loss": 0.75078511, "learning_rate": 1.3667255824968717e-06, "loss": 0.76756251, "num_input_tokens_seen": 220096320, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.2800293, "step": 10220, "time_per_iteration": 2.8495051860809326 }, { "auxiliary_loss_clip": 0.01397879, "auxiliary_loss_mlp": 0.00302743, "balance_loss_clip": 1.14342809, "balance_loss_mlp": 0.27165362, "epoch": 0.6145197655193146, "flos": 21573622558080.0, "grad_norm": 196.16123100509924, "language_loss": 0.79424471, "learning_rate": 1.3663561719060041e-06, "loss": 0.81125093, "num_input_tokens_seen": 220114850, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.31103516, "step": 10221, "time_per_iteration": 2.702946186065674 }, { "auxiliary_loss_clip": 0.01371008, "auxiliary_loss_mlp": 0.00280897, "balance_loss_clip": 1.12811971, "balance_loss_mlp": 0.25269243, "epoch": 0.6145798887719826, "flos": 21471098163840.0, "grad_norm": 28.36384682868476, "language_loss": 0.85059416, "learning_rate": 1.3659867853429735e-06, "loss": 0.86711323, "num_input_tokens_seen": 220133395, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.28222656, "step": 10222, "time_per_iteration": 2.669923782348633 }, { "auxiliary_loss_clip": 0.0138599, "auxiliary_loss_mlp": 0.00292334, "balance_loss_clip": 1.12894678, "balance_loss_mlp": 0.26195917, "epoch": 0.6146400120246506, "flos": 20777016683520.0, "grad_norm": 7.297915762188477, "language_loss": 0.85951918, "learning_rate": 1.365617422821788e-06, "loss": 0.87630248, "num_input_tokens_seen": 220152790, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.30395508, "step": 10223, "time_per_iteration": 2.6733436584472656 }, { "auxiliary_loss_clip": 0.01394828, "auxiliary_loss_mlp": 0.00281297, "balance_loss_clip": 1.14525664, "balance_loss_mlp": 0.25181693, "epoch": 0.6147001352773185, "flos": 13881306384000.0, "grad_norm": 51.79091811223888, "language_loss": 0.87256837, "learning_rate": 1.3652480843564535e-06, "loss": 0.88932955, "num_input_tokens_seen": 220169535, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.29467773, "step": 10224, "time_per_iteration": 2.6552085876464844 }, { "auxiliary_loss_clip": 0.01387349, "auxiliary_loss_mlp": 0.00327948, "balance_loss_clip": 1.1399436, "balance_loss_mlp": 0.29766864, "epoch": 0.6147602585299865, "flos": 56641791807360.0, "grad_norm": 4.521746423717136, "language_loss": 0.70374733, "learning_rate": 1.3648787699609746e-06, "loss": 0.7209003, "num_input_tokens_seen": 220195305, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.3026123, "step": 10225, "time_per_iteration": 3.0338363647460938 }, { "auxiliary_loss_clip": 0.01407127, "auxiliary_loss_mlp": 0.00318185, "balance_loss_clip": 1.1461817, "balance_loss_mlp": 0.28608176, "epoch": 0.6148203817826544, "flos": 32817217605120.0, "grad_norm": 16.333622333045568, "language_loss": 0.71716499, "learning_rate": 1.364509479649357e-06, "loss": 0.73441815, "num_input_tokens_seen": 220215040, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.32104492, "step": 10226, "time_per_iteration": 2.7693545818328857 }, { "auxiliary_loss_clip": 0.01376049, "auxiliary_loss_mlp": 0.00314174, "balance_loss_clip": 1.12788332, "balance_loss_mlp": 0.28260696, "epoch": 0.6148805050353224, "flos": 18332038748160.0, "grad_norm": 5.296876212244825, "language_loss": 0.8237431, "learning_rate": 1.3641402134356037e-06, "loss": 0.84064531, "num_input_tokens_seen": 220234205, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.31542969, "step": 10227, "time_per_iteration": 2.6796963214874268 }, { "auxiliary_loss_clip": 0.01403947, "auxiliary_loss_mlp": 0.00330376, "balance_loss_clip": 1.14189124, "balance_loss_mlp": 0.29671121, "epoch": 0.6149406282879903, "flos": 14063983977600.0, "grad_norm": 117.01235407722645, "language_loss": 0.75855792, "learning_rate": 1.3637709713337164e-06, "loss": 0.77590108, "num_input_tokens_seen": 220252730, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.33666992, "step": 10228, "time_per_iteration": 2.62235689163208 }, { "auxiliary_loss_clip": 0.01362434, "auxiliary_loss_mlp": 0.00308052, "balance_loss_clip": 1.12151074, "balance_loss_mlp": 0.27817863, "epoch": 0.6150007515406584, "flos": 25190186400000.0, "grad_norm": 8.321035014720488, "language_loss": 0.79614115, "learning_rate": 1.3634017533576985e-06, "loss": 0.812846, "num_input_tokens_seen": 220273345, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.29882812, "step": 10229, "time_per_iteration": 2.731825113296509 }, { "auxiliary_loss_clip": 0.0140363, "auxiliary_loss_mlp": 0.00299313, "balance_loss_clip": 1.1514101, "balance_loss_mlp": 0.26839042, "epoch": 0.6150608747933263, "flos": 21945262625280.0, "grad_norm": 70.30550121399833, "language_loss": 0.84539586, "learning_rate": 1.3630325595215493e-06, "loss": 0.86242533, "num_input_tokens_seen": 220293845, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.30932617, "step": 10230, "time_per_iteration": 2.7254226207733154 }, { "auxiliary_loss_clip": 0.01387247, "auxiliary_loss_mlp": 0.00318839, "balance_loss_clip": 1.13332534, "balance_loss_mlp": 0.28814244, "epoch": 0.6151209980459943, "flos": 30117453523200.0, "grad_norm": 109.68028173697545, "language_loss": 0.80185747, "learning_rate": 1.36266338983927e-06, "loss": 0.81891835, "num_input_tokens_seen": 220316070, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.30700684, "step": 10231, "time_per_iteration": 2.742785930633545 }, { "auxiliary_loss_clip": 0.01379086, "auxiliary_loss_mlp": 0.00282895, "balance_loss_clip": 1.12908816, "balance_loss_mlp": 0.25364131, "epoch": 0.6151811212986622, "flos": 30008356940160.0, "grad_norm": 5.540551093458924, "language_loss": 0.78180552, "learning_rate": 1.362294244324858e-06, "loss": 0.79842532, "num_input_tokens_seen": 220335695, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.29211426, "step": 10232, "time_per_iteration": 2.735069513320923 }, { "auxiliary_loss_clip": 0.01372102, "auxiliary_loss_mlp": 0.00302616, "balance_loss_clip": 1.13018966, "balance_loss_mlp": 0.27439919, "epoch": 0.6152412445513302, "flos": 18872888808960.0, "grad_norm": 14.28632563718053, "language_loss": 0.97926742, "learning_rate": 1.3619251229923126e-06, "loss": 0.9960146, "num_input_tokens_seen": 220353720, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.28259277, "step": 10233, "time_per_iteration": 4.036175012588501 }, { "auxiliary_loss_clip": 0.01399026, "auxiliary_loss_mlp": 0.00294072, "balance_loss_clip": 1.14431763, "balance_loss_mlp": 0.26479372, "epoch": 0.6153013678039982, "flos": 25703601448320.0, "grad_norm": 148.91879731053226, "language_loss": 0.78029811, "learning_rate": 1.3615560258556306e-06, "loss": 0.79722905, "num_input_tokens_seen": 220372515, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.29272461, "step": 10234, "time_per_iteration": 2.7076327800750732 }, { "auxiliary_loss_clip": 0.01381902, "auxiliary_loss_mlp": 0.00323823, "balance_loss_clip": 1.12982035, "balance_loss_mlp": 0.29304266, "epoch": 0.6153614910566662, "flos": 28510271383680.0, "grad_norm": 26.137144006190223, "language_loss": 0.7423653, "learning_rate": 1.3611869529288077e-06, "loss": 0.7594226, "num_input_tokens_seen": 220393490, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.30786133, "step": 10235, "time_per_iteration": 4.193690776824951 }, { "auxiliary_loss_clip": 0.01377185, "auxiliary_loss_mlp": 0.00318389, "balance_loss_clip": 1.12294006, "balance_loss_mlp": 0.28751379, "epoch": 0.6154216143093342, "flos": 23549787158400.0, "grad_norm": 13.37427996547943, "language_loss": 0.89442253, "learning_rate": 1.3608179042258398e-06, "loss": 0.91137826, "num_input_tokens_seen": 220412855, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.30859375, "step": 10236, "time_per_iteration": 2.6831564903259277 }, { "auxiliary_loss_clip": 0.01392038, "auxiliary_loss_mlp": 0.00328095, "balance_loss_clip": 1.13540697, "balance_loss_mlp": 0.29748181, "epoch": 0.6154817375620021, "flos": 22748081552640.0, "grad_norm": 128.91756436094624, "language_loss": 0.85189259, "learning_rate": 1.360448879760721e-06, "loss": 0.86909389, "num_input_tokens_seen": 220433440, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.30639648, "step": 10237, "time_per_iteration": 2.675614595413208 }, { "auxiliary_loss_clip": 0.01370948, "auxiliary_loss_mlp": 0.00314848, "balance_loss_clip": 1.12729025, "balance_loss_mlp": 0.2867859, "epoch": 0.6155418608146701, "flos": 27162975121920.0, "grad_norm": 6.347301806843105, "language_loss": 0.84089464, "learning_rate": 1.3600798795474449e-06, "loss": 0.85775256, "num_input_tokens_seen": 220453445, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.28088379, "step": 10238, "time_per_iteration": 4.1746087074279785 }, { "auxiliary_loss_clip": 0.01427398, "auxiliary_loss_mlp": 0.00087487, "balance_loss_clip": 1.22660685, "balance_loss_mlp": 0.07799748, "epoch": 0.615601984067338, "flos": 68811165014400.0, "grad_norm": 0.7521241109959493, "language_loss": 0.56857097, "learning_rate": 1.3597109036000036e-06, "loss": 0.58371985, "num_input_tokens_seen": 220509730, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.09472656, "step": 10239, "time_per_iteration": 3.1684136390686035 }, { "auxiliary_loss_clip": 0.01370573, "auxiliary_loss_mlp": 0.00325768, "balance_loss_clip": 1.1222899, "balance_loss_mlp": 0.29365337, "epoch": 0.615662107320006, "flos": 15517144598400.0, "grad_norm": 10.617246017245686, "language_loss": 0.86749786, "learning_rate": 1.3593419519323892e-06, "loss": 0.88446122, "num_input_tokens_seen": 220527295, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.32104492, "step": 10240, "time_per_iteration": 2.6389002799987793 }, { "auxiliary_loss_clip": 0.01355866, "auxiliary_loss_mlp": 0.00303922, "balance_loss_clip": 1.10708141, "balance_loss_mlp": 0.2725457, "epoch": 0.615722230572674, "flos": 21063691128960.0, "grad_norm": 22.261158350816274, "language_loss": 0.80748713, "learning_rate": 1.3589730245585922e-06, "loss": 0.824085, "num_input_tokens_seen": 220542730, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.3137207, "step": 10241, "time_per_iteration": 2.6556334495544434 }, { "auxiliary_loss_clip": 0.01370013, "auxiliary_loss_mlp": 0.00307948, "balance_loss_clip": 1.12658489, "balance_loss_mlp": 0.27890909, "epoch": 0.615782353825342, "flos": 23256791919360.0, "grad_norm": 17.529558241851316, "language_loss": 0.77576458, "learning_rate": 1.3586041214926018e-06, "loss": 0.79254425, "num_input_tokens_seen": 220562995, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.2902832, "step": 10242, "time_per_iteration": 4.096301555633545 }, { "auxiliary_loss_clip": 0.01355495, "auxiliary_loss_mlp": 0.00300452, "balance_loss_clip": 1.11260772, "balance_loss_mlp": 0.27283159, "epoch": 0.6158424770780099, "flos": 21103911383040.0, "grad_norm": 32.14584363943654, "language_loss": 0.791291, "learning_rate": 1.3582352427484086e-06, "loss": 0.80785048, "num_input_tokens_seen": 220581775, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.27600098, "step": 10243, "time_per_iteration": 2.9034903049468994 }, { "auxiliary_loss_clip": 0.01406832, "auxiliary_loss_mlp": 0.00110753, "balance_loss_clip": 1.21354723, "balance_loss_mlp": 0.10302867, "epoch": 0.6159026003306779, "flos": 70333276769280.0, "grad_norm": 0.7426087223286683, "language_loss": 0.56621015, "learning_rate": 1.3578663883399984e-06, "loss": 0.58138597, "num_input_tokens_seen": 220646395, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.07714844, "step": 10244, "time_per_iteration": 3.2529876232147217 }, { "auxiliary_loss_clip": 0.01400548, "auxiliary_loss_mlp": 0.00332658, "balance_loss_clip": 1.14897919, "balance_loss_mlp": 0.29987502, "epoch": 0.6159627235833458, "flos": 33874355802240.0, "grad_norm": 5.6137483231180605, "language_loss": 0.70712399, "learning_rate": 1.3574975582813593e-06, "loss": 0.72445607, "num_input_tokens_seen": 220668335, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.32751465, "step": 10245, "time_per_iteration": 2.800830364227295 }, { "auxiliary_loss_clip": 0.01384134, "auxiliary_loss_mlp": 0.0029949, "balance_loss_clip": 1.13193321, "balance_loss_mlp": 0.27030793, "epoch": 0.6160228468360138, "flos": 26575440359040.0, "grad_norm": 637.8307481429061, "language_loss": 0.85531867, "learning_rate": 1.3571287525864771e-06, "loss": 0.87215489, "num_input_tokens_seen": 220688915, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.29187012, "step": 10246, "time_per_iteration": 2.7459628582000732 }, { "auxiliary_loss_clip": 0.01377553, "auxiliary_loss_mlp": 0.00345757, "balance_loss_clip": 1.12837029, "balance_loss_mlp": 0.31324863, "epoch": 0.6160829700886818, "flos": 17193274894080.0, "grad_norm": 28.902117184959202, "language_loss": 0.952402, "learning_rate": 1.3567599712693368e-06, "loss": 0.96963513, "num_input_tokens_seen": 220703465, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.32519531, "step": 10247, "time_per_iteration": 2.624130964279175 }, { "auxiliary_loss_clip": 0.01393499, "auxiliary_loss_mlp": 0.00327093, "balance_loss_clip": 1.14122975, "balance_loss_mlp": 0.29690921, "epoch": 0.6161430933413498, "flos": 23623547736960.0, "grad_norm": 16.656727882253012, "language_loss": 0.86366528, "learning_rate": 1.3563912143439235e-06, "loss": 0.88087124, "num_input_tokens_seen": 220722090, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.30200195, "step": 10248, "time_per_iteration": 2.658085823059082 }, { "auxiliary_loss_clip": 0.01363348, "auxiliary_loss_mlp": 0.00286359, "balance_loss_clip": 1.12203252, "balance_loss_mlp": 0.25880995, "epoch": 0.6162032165940178, "flos": 23002436736000.0, "grad_norm": 11.384735941322692, "language_loss": 0.93775725, "learning_rate": 1.3560224818242191e-06, "loss": 0.95425439, "num_input_tokens_seen": 220741075, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.27526855, "step": 10249, "time_per_iteration": 2.6998984813690186 }, { "auxiliary_loss_clip": 0.01368045, "auxiliary_loss_mlp": 0.0032382, "balance_loss_clip": 1.11650324, "balance_loss_mlp": 0.29203904, "epoch": 0.6162633398466857, "flos": 39421979740800.0, "grad_norm": 19.43668053007184, "language_loss": 0.79307961, "learning_rate": 1.3556537737242072e-06, "loss": 0.80999827, "num_input_tokens_seen": 220763395, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.31774902, "step": 10250, "time_per_iteration": 2.829848051071167 }, { "auxiliary_loss_clip": 0.01378828, "auxiliary_loss_mlp": 0.00293526, "balance_loss_clip": 1.1374284, "balance_loss_mlp": 0.26529759, "epoch": 0.6163234630993537, "flos": 19244672530560.0, "grad_norm": 13.395631514954118, "language_loss": 0.79563594, "learning_rate": 1.3552850900578692e-06, "loss": 0.81235945, "num_input_tokens_seen": 220780640, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.2824707, "step": 10251, "time_per_iteration": 2.6603810787200928 }, { "auxiliary_loss_clip": 0.01354105, "auxiliary_loss_mlp": 0.00311331, "balance_loss_clip": 1.10996377, "balance_loss_mlp": 0.28218395, "epoch": 0.6163835863520216, "flos": 15961791058560.0, "grad_norm": 50.91367438595711, "language_loss": 0.75851047, "learning_rate": 1.3549164308391844e-06, "loss": 0.77516484, "num_input_tokens_seen": 220797960, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.29162598, "step": 10252, "time_per_iteration": 2.629621744155884 }, { "auxiliary_loss_clip": 0.01366431, "auxiliary_loss_mlp": 0.00149606, "balance_loss_clip": 1.17238593, "balance_loss_mlp": 0.14064187, "epoch": 0.6164437096046896, "flos": 68103834393600.0, "grad_norm": 0.8580420957238833, "language_loss": 0.5709179, "learning_rate": 1.3545477960821333e-06, "loss": 0.58607829, "num_input_tokens_seen": 220856930, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.08984375, "step": 10253, "time_per_iteration": 3.2444961071014404 }, { "auxiliary_loss_clip": 0.01371107, "auxiliary_loss_mlp": 0.00336553, "balance_loss_clip": 1.1207937, "balance_loss_mlp": 0.3051528, "epoch": 0.6165038328573575, "flos": 21361211481600.0, "grad_norm": 53.57994765528177, "language_loss": 0.84480655, "learning_rate": 1.3541791858006946e-06, "loss": 0.86188316, "num_input_tokens_seen": 220877595, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.3137207, "step": 10254, "time_per_iteration": 2.6631457805633545 }, { "auxiliary_loss_clip": 0.01364188, "auxiliary_loss_mlp": 0.00313698, "balance_loss_clip": 1.11770916, "balance_loss_mlp": 0.28371692, "epoch": 0.6165639561100256, "flos": 21101972048640.0, "grad_norm": 4.202670685329964, "language_loss": 0.87594062, "learning_rate": 1.353810600008846e-06, "loss": 0.89271951, "num_input_tokens_seen": 220896880, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.29968262, "step": 10255, "time_per_iteration": 2.700911521911621 }, { "auxiliary_loss_clip": 0.01363407, "auxiliary_loss_mlp": 0.00332078, "balance_loss_clip": 1.11639452, "balance_loss_mlp": 0.30091661, "epoch": 0.6166240793626935, "flos": 25338533569920.0, "grad_norm": 90.9569689757983, "language_loss": 0.73819876, "learning_rate": 1.3534420387205646e-06, "loss": 0.75515366, "num_input_tokens_seen": 220916425, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.3112793, "step": 10256, "time_per_iteration": 2.724639654159546 }, { "auxiliary_loss_clip": 0.01358737, "auxiliary_loss_mlp": 0.00326751, "balance_loss_clip": 1.12193894, "balance_loss_mlp": 0.29846269, "epoch": 0.6166842026153615, "flos": 19682639061120.0, "grad_norm": 267.3401052259844, "language_loss": 0.80877471, "learning_rate": 1.353073501949825e-06, "loss": 0.82562959, "num_input_tokens_seen": 220935050, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.28283691, "step": 10257, "time_per_iteration": 2.721689224243164 }, { "auxiliary_loss_clip": 0.01375659, "auxiliary_loss_mlp": 0.00291524, "balance_loss_clip": 1.12794709, "balance_loss_mlp": 0.26210332, "epoch": 0.6167443258680294, "flos": 19318361281920.0, "grad_norm": 5.955286700811938, "language_loss": 0.79839259, "learning_rate": 1.3527049897106034e-06, "loss": 0.81506443, "num_input_tokens_seen": 220953085, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.29394531, "step": 10258, "time_per_iteration": 2.6874730587005615 }, { "auxiliary_loss_clip": 0.01393111, "auxiliary_loss_mlp": 0.00289577, "balance_loss_clip": 1.13808441, "balance_loss_mlp": 0.25789139, "epoch": 0.6168044491206974, "flos": 25265239868160.0, "grad_norm": 86.53541082372406, "language_loss": 0.77083635, "learning_rate": 1.3523365020168735e-06, "loss": 0.78766322, "num_input_tokens_seen": 220969050, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.31713867, "step": 10259, "time_per_iteration": 2.7840352058410645 }, { "auxiliary_loss_clip": 0.01353392, "auxiliary_loss_mlp": 0.00291249, "balance_loss_clip": 1.11261761, "balance_loss_mlp": 0.26318687, "epoch": 0.6168645723733654, "flos": 13219903301760.0, "grad_norm": 10627.191722239806, "language_loss": 0.81016296, "learning_rate": 1.3519680388826084e-06, "loss": 0.82660931, "num_input_tokens_seen": 220985825, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.28051758, "step": 10260, "time_per_iteration": 2.621734857559204 }, { "auxiliary_loss_clip": 0.01376981, "auxiliary_loss_mlp": 0.0031863, "balance_loss_clip": 1.12733889, "balance_loss_mlp": 0.28529871, "epoch": 0.6169246956260334, "flos": 26652038112000.0, "grad_norm": 43.10229572201512, "language_loss": 0.76286292, "learning_rate": 1.3515996003217803e-06, "loss": 0.77981907, "num_input_tokens_seen": 221004465, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.33312988, "step": 10261, "time_per_iteration": 2.7354576587677 }, { "auxiliary_loss_clip": 0.01345106, "auxiliary_loss_mlp": 0.00296538, "balance_loss_clip": 1.10794187, "balance_loss_mlp": 0.26797497, "epoch": 0.6169848188787014, "flos": 23148413608320.0, "grad_norm": 57.453005224503876, "language_loss": 0.7844969, "learning_rate": 1.3512311863483602e-06, "loss": 0.80091339, "num_input_tokens_seen": 221023260, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.28552246, "step": 10262, "time_per_iteration": 2.660094976425171 }, { "auxiliary_loss_clip": 0.0137033, "auxiliary_loss_mlp": 0.00303822, "balance_loss_clip": 1.12635183, "balance_loss_mlp": 0.27556944, "epoch": 0.6170449421313693, "flos": 23331917214720.0, "grad_norm": 36.535067065516515, "language_loss": 0.77669787, "learning_rate": 1.3508627969763188e-06, "loss": 0.79343939, "num_input_tokens_seen": 221043090, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.28234863, "step": 10263, "time_per_iteration": 2.7001962661743164 }, { "auxiliary_loss_clip": 0.0137559, "auxiliary_loss_mlp": 0.00301764, "balance_loss_clip": 1.12455773, "balance_loss_mlp": 0.27324921, "epoch": 0.6171050653840373, "flos": 15851617067520.0, "grad_norm": 3.4755049248303154, "language_loss": 0.85906315, "learning_rate": 1.3504944322196244e-06, "loss": 0.87583667, "num_input_tokens_seen": 221061435, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.28515625, "step": 10264, "time_per_iteration": 2.704101085662842 }, { "auxiliary_loss_clip": 0.01362878, "auxiliary_loss_mlp": 0.00306715, "balance_loss_clip": 1.12152898, "balance_loss_mlp": 0.27655464, "epoch": 0.6171651886367052, "flos": 20045516209920.0, "grad_norm": 74.30899437798362, "language_loss": 0.91193867, "learning_rate": 1.350126092092247e-06, "loss": 0.92863464, "num_input_tokens_seen": 221078705, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.30175781, "step": 10265, "time_per_iteration": 2.779444694519043 }, { "auxiliary_loss_clip": 0.01339441, "auxiliary_loss_mlp": 0.00302565, "balance_loss_clip": 1.10518861, "balance_loss_mlp": 0.27364454, "epoch": 0.6172253118893732, "flos": 26432695710720.0, "grad_norm": 46.39462733124493, "language_loss": 0.74780476, "learning_rate": 1.349757776608153e-06, "loss": 0.76422483, "num_input_tokens_seen": 221099245, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.28918457, "step": 10266, "time_per_iteration": 2.7015230655670166 }, { "auxiliary_loss_clip": 0.01337234, "auxiliary_loss_mlp": 0.00304033, "balance_loss_clip": 1.10221577, "balance_loss_mlp": 0.2752918, "epoch": 0.6172854351420412, "flos": 22632879657600.0, "grad_norm": 26.10713879144964, "language_loss": 0.81359679, "learning_rate": 1.3493894857813094e-06, "loss": 0.83000946, "num_input_tokens_seen": 221116930, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.28747559, "step": 10267, "time_per_iteration": 2.6755475997924805 }, { "auxiliary_loss_clip": 0.0138718, "auxiliary_loss_mlp": 0.00314406, "balance_loss_clip": 1.13086081, "balance_loss_mlp": 0.28341141, "epoch": 0.6173455583947092, "flos": 21212936138880.0, "grad_norm": 614.5453537193556, "language_loss": 0.81657004, "learning_rate": 1.3490212196256818e-06, "loss": 0.83358592, "num_input_tokens_seen": 221137660, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.30957031, "step": 10268, "time_per_iteration": 2.878925323486328 }, { "auxiliary_loss_clip": 0.01346474, "auxiliary_loss_mlp": 0.00343059, "balance_loss_clip": 1.10368586, "balance_loss_mlp": 0.31339973, "epoch": 0.6174056816473771, "flos": 19500284689920.0, "grad_norm": 11.688741318605787, "language_loss": 0.83770859, "learning_rate": 1.3486529781552342e-06, "loss": 0.85460389, "num_input_tokens_seen": 221156225, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.296875, "step": 10269, "time_per_iteration": 2.705042839050293 }, { "auxiliary_loss_clip": 0.01354819, "auxiliary_loss_mlp": 0.00321531, "balance_loss_clip": 1.11026239, "balance_loss_mlp": 0.28971392, "epoch": 0.6174658049000451, "flos": 15997342544640.0, "grad_norm": 20.412722592967107, "language_loss": 0.8388263, "learning_rate": 1.3482847613839318e-06, "loss": 0.85558981, "num_input_tokens_seen": 221173820, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.31848145, "step": 10270, "time_per_iteration": 2.6347737312316895 }, { "auxiliary_loss_clip": 0.01353062, "auxiliary_loss_mlp": 0.00302779, "balance_loss_clip": 1.11645126, "balance_loss_mlp": 0.27395457, "epoch": 0.617525928152713, "flos": 21903893136000.0, "grad_norm": 1582.3365300415737, "language_loss": 0.899279, "learning_rate": 1.347916569325736e-06, "loss": 0.91583741, "num_input_tokens_seen": 221191815, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.28833008, "step": 10271, "time_per_iteration": 2.6874799728393555 }, { "auxiliary_loss_clip": 0.01363473, "auxiliary_loss_mlp": 0.00291686, "balance_loss_clip": 1.11978984, "balance_loss_mlp": 0.26121596, "epoch": 0.617586051405381, "flos": 21105958458240.0, "grad_norm": 9.114701664126727, "language_loss": 0.8549006, "learning_rate": 1.3475484019946093e-06, "loss": 0.87145215, "num_input_tokens_seen": 221211205, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.3046875, "step": 10272, "time_per_iteration": 2.752615213394165 }, { "auxiliary_loss_clip": 0.01346488, "auxiliary_loss_mlp": 0.00138376, "balance_loss_clip": 1.16200197, "balance_loss_mlp": 0.1286485, "epoch": 0.617646174658049, "flos": 58610776665600.0, "grad_norm": 0.8164147237824814, "language_loss": 0.58093882, "learning_rate": 1.347180259404513e-06, "loss": 0.59578741, "num_input_tokens_seen": 221268430, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.09716797, "step": 10273, "time_per_iteration": 3.052072286605835 }, { "auxiliary_loss_clip": 0.01338217, "auxiliary_loss_mlp": 0.00314393, "balance_loss_clip": 1.10537589, "balance_loss_mlp": 0.28605664, "epoch": 0.617706297910717, "flos": 13878684691200.0, "grad_norm": 25.09440450125433, "language_loss": 0.82122755, "learning_rate": 1.3468121415694059e-06, "loss": 0.83775365, "num_input_tokens_seen": 221281930, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.28344727, "step": 10274, "time_per_iteration": 2.664402484893799 }, { "auxiliary_loss_clip": 0.01337515, "auxiliary_loss_mlp": 0.0030349, "balance_loss_clip": 1.09908044, "balance_loss_mlp": 0.2729128, "epoch": 0.617766421163385, "flos": 19208438686080.0, "grad_norm": 9.752213902476496, "language_loss": 0.86300278, "learning_rate": 1.3464440485032484e-06, "loss": 0.87941283, "num_input_tokens_seen": 221301605, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.3059082, "step": 10275, "time_per_iteration": 4.081395864486694 }, { "auxiliary_loss_clip": 0.01341631, "auxiliary_loss_mlp": 0.00292762, "balance_loss_clip": 1.10428214, "balance_loss_mlp": 0.26474744, "epoch": 0.6178265444160529, "flos": 22565978576640.0, "grad_norm": 27.92713540644189, "language_loss": 0.84484267, "learning_rate": 1.346075980219998e-06, "loss": 0.86118662, "num_input_tokens_seen": 221320105, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.28051758, "step": 10276, "time_per_iteration": 2.665997266769409 }, { "auxiliary_loss_clip": 0.01354482, "auxiliary_loss_mlp": 0.00304517, "balance_loss_clip": 1.11154985, "balance_loss_mlp": 0.27423814, "epoch": 0.6178866676687209, "flos": 11984289402240.0, "grad_norm": 73.91974072106456, "language_loss": 0.88795459, "learning_rate": 1.345707936733612e-06, "loss": 0.90454453, "num_input_tokens_seen": 221335915, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.30273438, "step": 10277, "time_per_iteration": 4.104903697967529 }, { "auxiliary_loss_clip": 0.01354277, "auxiliary_loss_mlp": 0.00294336, "balance_loss_clip": 1.10784173, "balance_loss_mlp": 0.26541618, "epoch": 0.6179467909213888, "flos": 20991510748800.0, "grad_norm": 4.013649701955217, "language_loss": 0.89176893, "learning_rate": 1.3453399180580466e-06, "loss": 0.9082551, "num_input_tokens_seen": 221353965, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.28955078, "step": 10278, "time_per_iteration": 2.6586687564849854 }, { "auxiliary_loss_clip": 0.01327633, "auxiliary_loss_mlp": 0.00308268, "balance_loss_clip": 1.09221995, "balance_loss_mlp": 0.28080255, "epoch": 0.6180069141740568, "flos": 25338102606720.0, "grad_norm": 2.1528391895138554, "language_loss": 0.79312682, "learning_rate": 1.3449719242072567e-06, "loss": 0.80948585, "num_input_tokens_seen": 221374080, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.27453613, "step": 10279, "time_per_iteration": 2.7586822509765625 }, { "auxiliary_loss_clip": 0.01331671, "auxiliary_loss_mlp": 0.00289967, "balance_loss_clip": 1.09926867, "balance_loss_mlp": 0.26331162, "epoch": 0.6180670374267248, "flos": 19645722858240.0, "grad_norm": 43.62626522796563, "language_loss": 0.76009506, "learning_rate": 1.3446039551951975e-06, "loss": 0.7763114, "num_input_tokens_seen": 221392910, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.26647949, "step": 10280, "time_per_iteration": 4.100741624832153 }, { "auxiliary_loss_clip": 0.01357017, "auxiliary_loss_mlp": 0.00305611, "balance_loss_clip": 1.10967875, "balance_loss_mlp": 0.27695328, "epoch": 0.6181271606793928, "flos": 19464876858240.0, "grad_norm": 5.417593170297812, "language_loss": 0.78298187, "learning_rate": 1.3442360110358215e-06, "loss": 0.79960823, "num_input_tokens_seen": 221410990, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.28662109, "step": 10281, "time_per_iteration": 2.6784181594848633 }, { "auxiliary_loss_clip": 0.01348529, "auxiliary_loss_mlp": 0.0029273, "balance_loss_clip": 1.11138749, "balance_loss_mlp": 0.26512086, "epoch": 0.6181872839320607, "flos": 25594289383680.0, "grad_norm": 39.01796415989131, "language_loss": 0.82523167, "learning_rate": 1.3438680917430827e-06, "loss": 0.84164423, "num_input_tokens_seen": 221431020, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.27624512, "step": 10282, "time_per_iteration": 2.7049720287323 }, { "auxiliary_loss_clip": 0.01345026, "auxiliary_loss_mlp": 0.0030228, "balance_loss_clip": 1.09855986, "balance_loss_mlp": 0.27116591, "epoch": 0.6182474071847287, "flos": 25551806572800.0, "grad_norm": 91.10626086387813, "language_loss": 0.75733966, "learning_rate": 1.343500197330931e-06, "loss": 0.77381271, "num_input_tokens_seen": 221453235, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.31066895, "step": 10283, "time_per_iteration": 2.710658073425293 }, { "auxiliary_loss_clip": 0.01360879, "auxiliary_loss_mlp": 0.00307302, "balance_loss_clip": 1.10927725, "balance_loss_mlp": 0.27621174, "epoch": 0.6183075304373966, "flos": 22123738327680.0, "grad_norm": 12.416485689060497, "language_loss": 0.8198539, "learning_rate": 1.3431323278133176e-06, "loss": 0.83653575, "num_input_tokens_seen": 221472560, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.31103516, "step": 10284, "time_per_iteration": 4.113955497741699 }, { "auxiliary_loss_clip": 0.0133929, "auxiliary_loss_mlp": 0.00283167, "balance_loss_clip": 1.10690284, "balance_loss_mlp": 0.25549853, "epoch": 0.6183676536900646, "flos": 22455589104000.0, "grad_norm": 1.6985633687752506, "language_loss": 0.81200504, "learning_rate": 1.3427644832041922e-06, "loss": 0.82822961, "num_input_tokens_seen": 221492835, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.2767334, "step": 10285, "time_per_iteration": 2.703740119934082 }, { "auxiliary_loss_clip": 0.01327715, "auxiliary_loss_mlp": 0.0034032, "balance_loss_clip": 1.09178376, "balance_loss_mlp": 0.31070834, "epoch": 0.6184277769427327, "flos": 23364128736000.0, "grad_norm": 5.80315752464746, "language_loss": 0.78312767, "learning_rate": 1.342396663517503e-06, "loss": 0.79980803, "num_input_tokens_seen": 221511870, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.29638672, "step": 10286, "time_per_iteration": 2.6791162490844727 }, { "auxiliary_loss_clip": 0.01322338, "auxiliary_loss_mlp": 0.00299232, "balance_loss_clip": 1.09062994, "balance_loss_mlp": 0.27093178, "epoch": 0.6184879001954006, "flos": 22711057608960.0, "grad_norm": 116.71674161772613, "language_loss": 0.81978011, "learning_rate": 1.342028868767199e-06, "loss": 0.83599579, "num_input_tokens_seen": 221529915, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.2833252, "step": 10287, "time_per_iteration": 2.7467212677001953 }, { "auxiliary_loss_clip": 0.01333417, "auxiliary_loss_mlp": 0.00298757, "balance_loss_clip": 1.09627295, "balance_loss_mlp": 0.26897866, "epoch": 0.6185480234480686, "flos": 23841920471040.0, "grad_norm": 16.8847798171533, "language_loss": 0.79017127, "learning_rate": 1.3416610989672262e-06, "loss": 0.80649304, "num_input_tokens_seen": 221549745, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.29785156, "step": 10288, "time_per_iteration": 2.7699875831604004 }, { "auxiliary_loss_clip": 0.01312715, "auxiliary_loss_mlp": 0.00291869, "balance_loss_clip": 1.08390772, "balance_loss_mlp": 0.2650229, "epoch": 0.6186081467007365, "flos": 45477595774080.0, "grad_norm": 29.730126467011132, "language_loss": 0.79036582, "learning_rate": 1.3412933541315296e-06, "loss": 0.80641162, "num_input_tokens_seen": 221572455, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.26843262, "step": 10289, "time_per_iteration": 2.94004225730896 }, { "auxiliary_loss_clip": 0.01320971, "auxiliary_loss_mlp": 0.00336465, "balance_loss_clip": 1.08413577, "balance_loss_mlp": 0.30477884, "epoch": 0.6186682699534045, "flos": 23550864566400.0, "grad_norm": 44.450132934390595, "language_loss": 0.84731477, "learning_rate": 1.340925634274056e-06, "loss": 0.8638891, "num_input_tokens_seen": 221591325, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.31689453, "step": 10290, "time_per_iteration": 2.6856114864349365 }, { "auxiliary_loss_clip": 0.01337517, "auxiliary_loss_mlp": 0.00287203, "balance_loss_clip": 1.0964371, "balance_loss_mlp": 0.25999987, "epoch": 0.6187283932060724, "flos": 25774201630080.0, "grad_norm": 28.367403396745818, "language_loss": 0.88374537, "learning_rate": 1.3405579394087475e-06, "loss": 0.89999259, "num_input_tokens_seen": 221611640, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.27197266, "step": 10291, "time_per_iteration": 2.6968815326690674 }, { "auxiliary_loss_clip": 0.01319707, "auxiliary_loss_mlp": 0.00290195, "balance_loss_clip": 1.08679938, "balance_loss_mlp": 0.26134658, "epoch": 0.6187885164587404, "flos": 25265203954560.0, "grad_norm": 52.73498050822463, "language_loss": 0.86184186, "learning_rate": 1.3401902695495487e-06, "loss": 0.87794089, "num_input_tokens_seen": 221631225, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.28869629, "step": 10292, "time_per_iteration": 2.6798181533813477 }, { "auxiliary_loss_clip": 0.01348709, "auxiliary_loss_mlp": 0.0026399, "balance_loss_clip": 1.09937668, "balance_loss_mlp": 0.23351979, "epoch": 0.6188486397114084, "flos": 26250772302720.0, "grad_norm": 69.69838648168762, "language_loss": 0.82910991, "learning_rate": 1.339822624710401e-06, "loss": 0.8452369, "num_input_tokens_seen": 221651035, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.30493164, "step": 10293, "time_per_iteration": 2.692646026611328 }, { "auxiliary_loss_clip": 0.01315389, "auxiliary_loss_mlp": 0.0027418, "balance_loss_clip": 1.08135569, "balance_loss_mlp": 0.24770372, "epoch": 0.6189087629640764, "flos": 20923388605440.0, "grad_norm": 6.3308887274663626, "language_loss": 0.89162821, "learning_rate": 1.3394550049052454e-06, "loss": 0.90752393, "num_input_tokens_seen": 221671300, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.26477051, "step": 10294, "time_per_iteration": 2.683659076690674 }, { "auxiliary_loss_clip": 0.01329106, "auxiliary_loss_mlp": 0.00260574, "balance_loss_clip": 1.09229636, "balance_loss_mlp": 0.23385873, "epoch": 0.6189688862167443, "flos": 14829814874880.0, "grad_norm": 444.39393047994537, "language_loss": 0.80359685, "learning_rate": 1.3390874101480225e-06, "loss": 0.81949365, "num_input_tokens_seen": 221687320, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.26721191, "step": 10295, "time_per_iteration": 2.613093376159668 }, { "auxiliary_loss_clip": 0.0133357, "auxiliary_loss_mlp": 0.00251673, "balance_loss_clip": 1.09719515, "balance_loss_mlp": 0.2243022, "epoch": 0.6190290094694123, "flos": 24285058560000.0, "grad_norm": 19.21635585239069, "language_loss": 0.76998067, "learning_rate": 1.3387198404526705e-06, "loss": 0.78583312, "num_input_tokens_seen": 221710175, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.27368164, "step": 10296, "time_per_iteration": 2.896052360534668 }, { "auxiliary_loss_clip": 0.01314847, "auxiliary_loss_mlp": 0.00260187, "balance_loss_clip": 1.07885885, "balance_loss_mlp": 0.23242362, "epoch": 0.6190891327220802, "flos": 22529457423360.0, "grad_norm": 12.404155965891597, "language_loss": 0.79470301, "learning_rate": 1.3383522958331287e-06, "loss": 0.81045341, "num_input_tokens_seen": 221728145, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.27770996, "step": 10297, "time_per_iteration": 2.6926326751708984 }, { "auxiliary_loss_clip": 0.01344733, "auxiliary_loss_mlp": 0.0008149, "balance_loss_clip": 1.16297114, "balance_loss_mlp": 0.07128606, "epoch": 0.6191492559747482, "flos": 67729357152000.0, "grad_norm": 0.8894633889288233, "language_loss": 0.63918138, "learning_rate": 1.3379847763033345e-06, "loss": 0.65344363, "num_input_tokens_seen": 221786100, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.10205078, "step": 10298, "time_per_iteration": 3.0680267810821533 }, { "auxiliary_loss_clip": 0.01306697, "auxiliary_loss_mlp": 0.00286818, "balance_loss_clip": 1.07410157, "balance_loss_mlp": 0.2592687, "epoch": 0.6192093792274163, "flos": 22346672088960.0, "grad_norm": 8.480071554814659, "language_loss": 0.80737364, "learning_rate": 1.3376172818772236e-06, "loss": 0.82330877, "num_input_tokens_seen": 221806450, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.2755127, "step": 10299, "time_per_iteration": 2.7057738304138184 }, { "auxiliary_loss_clip": 0.01333334, "auxiliary_loss_mlp": 0.00299846, "balance_loss_clip": 1.09058869, "balance_loss_mlp": 0.27090204, "epoch": 0.6192695024800842, "flos": 13553944807680.0, "grad_norm": 61.07979480623225, "language_loss": 0.75831079, "learning_rate": 1.337249812568732e-06, "loss": 0.77464259, "num_input_tokens_seen": 221823330, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.28955078, "step": 10300, "time_per_iteration": 2.6401803493499756 }, { "auxiliary_loss_clip": 0.01319797, "auxiliary_loss_mlp": 0.0028605, "balance_loss_clip": 1.08303118, "balance_loss_mlp": 0.25695091, "epoch": 0.6193296257327522, "flos": 17415310815360.0, "grad_norm": 12.97222966674437, "language_loss": 0.74148345, "learning_rate": 1.3368823683917939e-06, "loss": 0.75754189, "num_input_tokens_seen": 221839360, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.29125977, "step": 10301, "time_per_iteration": 2.702589750289917 }, { "auxiliary_loss_clip": 0.01309587, "auxiliary_loss_mlp": 0.0025704, "balance_loss_clip": 1.07835281, "balance_loss_mlp": 0.23051588, "epoch": 0.6193897489854201, "flos": 31101118450560.0, "grad_norm": 16.90812788536881, "language_loss": 0.79458094, "learning_rate": 1.3365149493603424e-06, "loss": 0.8102473, "num_input_tokens_seen": 221859465, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.26550293, "step": 10302, "time_per_iteration": 2.825822591781616 }, { "auxiliary_loss_clip": 0.01326335, "auxiliary_loss_mlp": 0.00298065, "balance_loss_clip": 1.08827925, "balance_loss_mlp": 0.26883516, "epoch": 0.6194498722380881, "flos": 19134031662720.0, "grad_norm": 21.74750643775397, "language_loss": 0.89180356, "learning_rate": 1.3361475554883107e-06, "loss": 0.90804756, "num_input_tokens_seen": 221878555, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.29211426, "step": 10303, "time_per_iteration": 2.684495449066162 }, { "auxiliary_loss_clip": 0.0132975, "auxiliary_loss_mlp": 0.00276661, "balance_loss_clip": 1.08833849, "balance_loss_mlp": 0.2477643, "epoch": 0.619509995490756, "flos": 21835088634240.0, "grad_norm": 7.122673019165301, "language_loss": 0.84131449, "learning_rate": 1.3357801867896307e-06, "loss": 0.8573786, "num_input_tokens_seen": 221898790, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.28930664, "step": 10304, "time_per_iteration": 2.6788198947906494 }, { "auxiliary_loss_clip": 0.01337511, "auxiliary_loss_mlp": 0.00302038, "balance_loss_clip": 1.09131718, "balance_loss_mlp": 0.27030435, "epoch": 0.619570118743424, "flos": 23806548552960.0, "grad_norm": 21.218733675918063, "language_loss": 0.8615973, "learning_rate": 1.3354128432782324e-06, "loss": 0.87799287, "num_input_tokens_seen": 221918875, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.31726074, "step": 10305, "time_per_iteration": 2.734121561050415 }, { "auxiliary_loss_clip": 0.01350067, "auxiliary_loss_mlp": 0.00271062, "balance_loss_clip": 1.10116887, "balance_loss_mlp": 0.24080698, "epoch": 0.619630241996092, "flos": 21101612912640.0, "grad_norm": 5.404759790101958, "language_loss": 0.87966186, "learning_rate": 1.335045524968045e-06, "loss": 0.89587313, "num_input_tokens_seen": 221937895, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.30273438, "step": 10306, "time_per_iteration": 2.6942226886749268 }, { "auxiliary_loss_clip": 0.01328511, "auxiliary_loss_mlp": 0.0025202, "balance_loss_clip": 1.09310675, "balance_loss_mlp": 0.22549549, "epoch": 0.61969036524876, "flos": 27308269635840.0, "grad_norm": 14.380783020694226, "language_loss": 0.87048328, "learning_rate": 1.3346782318729988e-06, "loss": 0.88628858, "num_input_tokens_seen": 221955920, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.265625, "step": 10307, "time_per_iteration": 2.7160332202911377 }, { "auxiliary_loss_clip": 0.01333065, "auxiliary_loss_mlp": 0.00082543, "balance_loss_clip": 1.1554296, "balance_loss_mlp": 0.07558092, "epoch": 0.6197504885014279, "flos": 51648955384320.0, "grad_norm": 0.7912963125069601, "language_loss": 0.59144467, "learning_rate": 1.3343109640070203e-06, "loss": 0.60560071, "num_input_tokens_seen": 222011405, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.06982422, "step": 10308, "time_per_iteration": 3.195725202560425 }, { "auxiliary_loss_clip": 0.01326088, "auxiliary_loss_mlp": 0.0026348, "balance_loss_clip": 1.09439206, "balance_loss_mlp": 0.23910134, "epoch": 0.6198106117540959, "flos": 30557107992960.0, "grad_norm": 5.5014056179783655, "language_loss": 0.74738109, "learning_rate": 1.333943721384037e-06, "loss": 0.76327676, "num_input_tokens_seen": 222034545, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.24365234, "step": 10309, "time_per_iteration": 2.779743194580078 }, { "auxiliary_loss_clip": 0.01324105, "auxiliary_loss_mlp": 0.00280597, "balance_loss_clip": 1.08863282, "balance_loss_mlp": 0.25159386, "epoch": 0.6198707350067638, "flos": 18909733184640.0, "grad_norm": 20.253684711443317, "language_loss": 0.78604639, "learning_rate": 1.3335765040179746e-06, "loss": 0.80209339, "num_input_tokens_seen": 222052690, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.28991699, "step": 10310, "time_per_iteration": 2.930680513381958 }, { "auxiliary_loss_clip": 0.01310858, "auxiliary_loss_mlp": 0.00261402, "balance_loss_clip": 1.0755564, "balance_loss_mlp": 0.23150463, "epoch": 0.6199308582594318, "flos": 21433858738560.0, "grad_norm": 16.308277927828247, "language_loss": 0.86123443, "learning_rate": 1.3332093119227573e-06, "loss": 0.87695694, "num_input_tokens_seen": 222069095, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.29882812, "step": 10311, "time_per_iteration": 2.669586658477783 }, { "auxiliary_loss_clip": 0.01313671, "auxiliary_loss_mlp": 0.00280749, "balance_loss_clip": 1.07409644, "balance_loss_mlp": 0.25020745, "epoch": 0.6199909815120999, "flos": 18407379525120.0, "grad_norm": 5.526139196498862, "language_loss": 0.80482996, "learning_rate": 1.3328421451123105e-06, "loss": 0.8207742, "num_input_tokens_seen": 222087360, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.30517578, "step": 10312, "time_per_iteration": 2.638352632522583 }, { "auxiliary_loss_clip": 0.01363256, "auxiliary_loss_mlp": 0.00277246, "balance_loss_clip": 1.11205673, "balance_loss_mlp": 0.24720523, "epoch": 0.6200511047647678, "flos": 21466860359040.0, "grad_norm": 6.74330042052667, "language_loss": 0.81297863, "learning_rate": 1.3324750036005557e-06, "loss": 0.82938361, "num_input_tokens_seen": 222106130, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.30065918, "step": 10313, "time_per_iteration": 2.6630566120147705 }, { "auxiliary_loss_clip": 0.01328985, "auxiliary_loss_mlp": 0.00304442, "balance_loss_clip": 1.08747315, "balance_loss_mlp": 0.2743775, "epoch": 0.6201112280174358, "flos": 18215903099520.0, "grad_norm": 11.966818274433072, "language_loss": 0.87325078, "learning_rate": 1.332107887401416e-06, "loss": 0.88958502, "num_input_tokens_seen": 222123125, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.30041504, "step": 10314, "time_per_iteration": 2.6283528804779053 }, { "auxiliary_loss_clip": 0.01292074, "auxiliary_loss_mlp": 0.00285189, "balance_loss_clip": 1.0626018, "balance_loss_mlp": 0.25910625, "epoch": 0.6201713512701037, "flos": 20011185786240.0, "grad_norm": 34.86822390583382, "language_loss": 0.86603028, "learning_rate": 1.331740796528812e-06, "loss": 0.88180292, "num_input_tokens_seen": 222140655, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.26086426, "step": 10315, "time_per_iteration": 2.6606106758117676 }, { "auxiliary_loss_clip": 0.01315731, "auxiliary_loss_mlp": 0.00309613, "balance_loss_clip": 1.07904232, "balance_loss_mlp": 0.28255227, "epoch": 0.6202314745227717, "flos": 22487692884480.0, "grad_norm": 15.054500392658369, "language_loss": 0.82163858, "learning_rate": 1.3313737309966641e-06, "loss": 0.83789206, "num_input_tokens_seen": 222160450, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.27038574, "step": 10316, "time_per_iteration": 2.679185390472412 }, { "auxiliary_loss_clip": 0.01301301, "auxiliary_loss_mlp": 0.00289528, "balance_loss_clip": 1.06601143, "balance_loss_mlp": 0.26364762, "epoch": 0.6202915977754396, "flos": 26828682220800.0, "grad_norm": 268.56319940345185, "language_loss": 0.85409701, "learning_rate": 1.3310066908188915e-06, "loss": 0.87000525, "num_input_tokens_seen": 222179170, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.25891113, "step": 10317, "time_per_iteration": 4.110605955123901 }, { "auxiliary_loss_clip": 0.01327176, "auxiliary_loss_mlp": 0.00036235, "balance_loss_clip": 1.1505487, "balance_loss_mlp": 0.02622138, "epoch": 0.6203517210281076, "flos": 62742694890240.0, "grad_norm": 0.6854700409506946, "language_loss": 0.58545113, "learning_rate": 1.3306396760094122e-06, "loss": 0.59908521, "num_input_tokens_seen": 222242660, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.10009766, "step": 10318, "time_per_iteration": 3.218869924545288 }, { "auxiliary_loss_clip": 0.01301446, "auxiliary_loss_mlp": 0.00272652, "balance_loss_clip": 1.06839919, "balance_loss_mlp": 0.24487662, "epoch": 0.6204118442807756, "flos": 23404277162880.0, "grad_norm": 86.90050356144388, "language_loss": 0.86543071, "learning_rate": 1.330272686582143e-06, "loss": 0.8811717, "num_input_tokens_seen": 222262170, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.27770996, "step": 10319, "time_per_iteration": 4.089164972305298 }, { "auxiliary_loss_clip": 0.01298236, "auxiliary_loss_mlp": 0.00243126, "balance_loss_clip": 1.06812084, "balance_loss_mlp": 0.21847314, "epoch": 0.6204719675334436, "flos": 20193647898240.0, "grad_norm": 22.361338837415108, "language_loss": 0.74299365, "learning_rate": 1.3299057225510013e-06, "loss": 0.75840735, "num_input_tokens_seen": 222280375, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.24682617, "step": 10320, "time_per_iteration": 2.6544108390808105 }, { "auxiliary_loss_clip": 0.01267685, "auxiliary_loss_mlp": 0.00261629, "balance_loss_clip": 1.04825246, "balance_loss_mlp": 0.23704842, "epoch": 0.6205320907861115, "flos": 13188050916480.0, "grad_norm": 42.712745146082, "language_loss": 0.82159984, "learning_rate": 1.3295387839299013e-06, "loss": 0.83689296, "num_input_tokens_seen": 222297325, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24609375, "step": 10321, "time_per_iteration": 2.6668434143066406 }, { "auxiliary_loss_clip": 0.0128626, "auxiliary_loss_mlp": 0.00256819, "balance_loss_clip": 1.05706275, "balance_loss_mlp": 0.23112971, "epoch": 0.6205922140387795, "flos": 20668386977280.0, "grad_norm": 1027.398906896536, "language_loss": 0.82300013, "learning_rate": 1.329171870732758e-06, "loss": 0.83843088, "num_input_tokens_seen": 222317095, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.25683594, "step": 10322, "time_per_iteration": 4.142731428146362 }, { "auxiliary_loss_clip": 0.01302827, "auxiliary_loss_mlp": 0.00253767, "balance_loss_clip": 1.0723716, "balance_loss_mlp": 0.22870967, "epoch": 0.6206523372914474, "flos": 23877831093120.0, "grad_norm": 13.503279999236085, "language_loss": 0.80342805, "learning_rate": 1.3288049829734845e-06, "loss": 0.81899399, "num_input_tokens_seen": 222337055, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.25048828, "step": 10323, "time_per_iteration": 2.682368516921997 }, { "auxiliary_loss_clip": 0.0131784, "auxiliary_loss_mlp": 0.00290007, "balance_loss_clip": 1.07716918, "balance_loss_mlp": 0.26153958, "epoch": 0.6207124605441154, "flos": 13406603218560.0, "grad_norm": 16.36340821888915, "language_loss": 0.68777829, "learning_rate": 1.3284381206659933e-06, "loss": 0.70385677, "num_input_tokens_seen": 222354515, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.28479004, "step": 10324, "time_per_iteration": 2.6142630577087402 }, { "auxiliary_loss_clip": 0.0130858, "auxiliary_loss_mlp": 0.00265909, "balance_loss_clip": 1.0737046, "balance_loss_mlp": 0.23866998, "epoch": 0.6207725837967835, "flos": 18916341287040.0, "grad_norm": 117.90523726719724, "language_loss": 0.85766852, "learning_rate": 1.3280712838241956e-06, "loss": 0.87341344, "num_input_tokens_seen": 222372755, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.27209473, "step": 10325, "time_per_iteration": 2.653644561767578 }, { "auxiliary_loss_clip": 0.01301715, "auxiliary_loss_mlp": 0.00281493, "balance_loss_clip": 1.0667268, "balance_loss_mlp": 0.25365782, "epoch": 0.6208327070494514, "flos": 23980211832960.0, "grad_norm": 7.341069893402933, "language_loss": 0.8015582, "learning_rate": 1.327704472462003e-06, "loss": 0.81739026, "num_input_tokens_seen": 222391380, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.27832031, "step": 10326, "time_per_iteration": 2.6519041061401367 }, { "auxiliary_loss_clip": 0.01299021, "auxiliary_loss_mlp": 0.00270881, "balance_loss_clip": 1.0672996, "balance_loss_mlp": 0.2429141, "epoch": 0.6208928303021194, "flos": 22820405587200.0, "grad_norm": 7.406109409997229, "language_loss": 0.85666013, "learning_rate": 1.3273376865933234e-06, "loss": 0.87235916, "num_input_tokens_seen": 222411165, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.27954102, "step": 10327, "time_per_iteration": 4.056640386581421 }, { "auxiliary_loss_clip": 0.01324192, "auxiliary_loss_mlp": 0.00289167, "balance_loss_clip": 1.08589053, "balance_loss_mlp": 0.25998494, "epoch": 0.6209529535547873, "flos": 17564519911680.0, "grad_norm": 47.33054883231845, "language_loss": 0.89487028, "learning_rate": 1.326970926232066e-06, "loss": 0.91100395, "num_input_tokens_seen": 222428110, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.29174805, "step": 10328, "time_per_iteration": 2.629875898361206 }, { "auxiliary_loss_clip": 0.01307258, "auxiliary_loss_mlp": 0.00251887, "balance_loss_clip": 1.07707441, "balance_loss_mlp": 0.22469571, "epoch": 0.6210130768074553, "flos": 22011912311040.0, "grad_norm": 39.36442437649523, "language_loss": 0.86923897, "learning_rate": 1.3266041913921396e-06, "loss": 0.88483042, "num_input_tokens_seen": 222446385, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.27172852, "step": 10329, "time_per_iteration": 2.666555404663086 }, { "auxiliary_loss_clip": 0.01300047, "auxiliary_loss_mlp": 0.00114146, "balance_loss_clip": 1.12392437, "balance_loss_mlp": 0.10303535, "epoch": 0.6210732000601232, "flos": 63676873854720.0, "grad_norm": 0.7646719881809241, "language_loss": 0.61239672, "learning_rate": 1.3262374820874484e-06, "loss": 0.62653869, "num_input_tokens_seen": 222502150, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.11132812, "step": 10330, "time_per_iteration": 3.164947032928467 }, { "auxiliary_loss_clip": 0.0132177, "auxiliary_loss_mlp": 0.00257894, "balance_loss_clip": 1.08602214, "balance_loss_mlp": 0.23073792, "epoch": 0.6211333233127913, "flos": 24243365848320.0, "grad_norm": 4.225844054072377, "language_loss": 0.86939335, "learning_rate": 1.3258707983319002e-06, "loss": 0.88519001, "num_input_tokens_seen": 222519880, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.27160645, "step": 10331, "time_per_iteration": 2.6863324642181396 }, { "auxiliary_loss_clip": 0.01297963, "auxiliary_loss_mlp": 0.0024593, "balance_loss_clip": 1.06531143, "balance_loss_mlp": 0.21975169, "epoch": 0.6211934465654592, "flos": 16943803960320.0, "grad_norm": 121.54085396143287, "language_loss": 0.74299192, "learning_rate": 1.3255041401393992e-06, "loss": 0.75843084, "num_input_tokens_seen": 222538545, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.26184082, "step": 10332, "time_per_iteration": 2.6324405670166016 }, { "auxiliary_loss_clip": 0.01303904, "auxiliary_loss_mlp": 0.00262297, "balance_loss_clip": 1.073192, "balance_loss_mlp": 0.23710853, "epoch": 0.6212535698181272, "flos": 15267386355840.0, "grad_norm": 771.8662423434789, "language_loss": 0.83086157, "learning_rate": 1.3251375075238476e-06, "loss": 0.84652358, "num_input_tokens_seen": 222556935, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.25195312, "step": 10333, "time_per_iteration": 2.6692850589752197 }, { "auxiliary_loss_clip": 0.01299102, "auxiliary_loss_mlp": 0.00252802, "balance_loss_clip": 1.07220256, "balance_loss_mlp": 0.22782733, "epoch": 0.6213136930707951, "flos": 13443950384640.0, "grad_norm": 227.68621501900185, "language_loss": 0.79658079, "learning_rate": 1.3247709004991507e-06, "loss": 0.81209975, "num_input_tokens_seen": 222574035, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.25012207, "step": 10334, "time_per_iteration": 2.7203922271728516 }, { "auxiliary_loss_clip": 0.01288179, "auxiliary_loss_mlp": 0.00250015, "balance_loss_clip": 1.06226158, "balance_loss_mlp": 0.22444472, "epoch": 0.6213738163234631, "flos": 18111223889280.0, "grad_norm": 10.484090048423955, "language_loss": 0.787974, "learning_rate": 1.3244043190792078e-06, "loss": 0.80335587, "num_input_tokens_seen": 222592290, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.25561523, "step": 10335, "time_per_iteration": 2.7480292320251465 }, { "auxiliary_loss_clip": 0.01299076, "auxiliary_loss_mlp": 0.00237957, "balance_loss_clip": 1.07427835, "balance_loss_mlp": 0.21394783, "epoch": 0.621433939576131, "flos": 25337348421120.0, "grad_norm": 2.960374593638807, "language_loss": 0.8607294, "learning_rate": 1.3240377632779213e-06, "loss": 0.87609971, "num_input_tokens_seen": 222612805, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.24023438, "step": 10336, "time_per_iteration": 2.7224369049072266 }, { "auxiliary_loss_clip": 0.01281781, "auxiliary_loss_mlp": 0.00277109, "balance_loss_clip": 1.05884147, "balance_loss_mlp": 0.25085929, "epoch": 0.621494062828799, "flos": 22565619440640.0, "grad_norm": 47.51365612811129, "language_loss": 0.82197535, "learning_rate": 1.3236712331091907e-06, "loss": 0.83756429, "num_input_tokens_seen": 222632260, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26220703, "step": 10337, "time_per_iteration": 2.6502163410186768 }, { "auxiliary_loss_clip": 0.01323989, "auxiliary_loss_mlp": 0.00280627, "balance_loss_clip": 1.08490527, "balance_loss_mlp": 0.25308961, "epoch": 0.621554186081467, "flos": 27417976750080.0, "grad_norm": 45.973465797231476, "language_loss": 0.72699428, "learning_rate": 1.3233047285869145e-06, "loss": 0.74304044, "num_input_tokens_seen": 222653570, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.27539062, "step": 10338, "time_per_iteration": 2.6785459518432617 }, { "auxiliary_loss_clip": 0.01284727, "auxiliary_loss_mlp": 0.00256705, "balance_loss_clip": 1.05719924, "balance_loss_mlp": 0.23045543, "epoch": 0.621614309334135, "flos": 22346815743360.0, "grad_norm": 138.92332659416812, "language_loss": 0.78443766, "learning_rate": 1.322938249724991e-06, "loss": 0.79985201, "num_input_tokens_seen": 222672480, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.26269531, "step": 10339, "time_per_iteration": 2.658094644546509 }, { "auxiliary_loss_clip": 0.01287575, "auxiliary_loss_mlp": 0.00223372, "balance_loss_clip": 1.06112695, "balance_loss_mlp": 0.19962594, "epoch": 0.621674432586803, "flos": 19281229597440.0, "grad_norm": 12.414805152957209, "language_loss": 0.78956461, "learning_rate": 1.3225717965373166e-06, "loss": 0.80467409, "num_input_tokens_seen": 222691200, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.23742676, "step": 10340, "time_per_iteration": 2.6325876712799072 }, { "auxiliary_loss_clip": 0.01302086, "auxiliary_loss_mlp": 0.00232947, "balance_loss_clip": 1.070508, "balance_loss_mlp": 0.20732927, "epoch": 0.6217345558394709, "flos": 21609533180160.0, "grad_norm": 185.39481677242802, "language_loss": 0.78590894, "learning_rate": 1.322205369037788e-06, "loss": 0.80125928, "num_input_tokens_seen": 222709975, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.25634766, "step": 10341, "time_per_iteration": 2.667786121368408 }, { "auxiliary_loss_clip": 0.01317361, "auxiliary_loss_mlp": 0.00242558, "balance_loss_clip": 1.08301306, "balance_loss_mlp": 0.21615291, "epoch": 0.6217946790921389, "flos": 18004102554240.0, "grad_norm": 12.449379530199982, "language_loss": 0.88974571, "learning_rate": 1.321838967240299e-06, "loss": 0.9053449, "num_input_tokens_seen": 222729005, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.26416016, "step": 10342, "time_per_iteration": 2.6338534355163574 }, { "auxiliary_loss_clip": 0.01317224, "auxiliary_loss_mlp": 0.00099972, "balance_loss_clip": 1.14166021, "balance_loss_mlp": 0.09215173, "epoch": 0.6218548023448068, "flos": 61973631768960.0, "grad_norm": 0.7788421358116109, "language_loss": 0.56792647, "learning_rate": 1.3214725911587452e-06, "loss": 0.58209842, "num_input_tokens_seen": 222786090, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.078125, "step": 10343, "time_per_iteration": 3.074636697769165 }, { "auxiliary_loss_clip": 0.01287572, "auxiliary_loss_mlp": 0.00245501, "balance_loss_clip": 1.06415153, "balance_loss_mlp": 0.2197998, "epoch": 0.6219149255974749, "flos": 25739152934400.0, "grad_norm": 4.825099779455435, "language_loss": 0.80245674, "learning_rate": 1.3211062408070184e-06, "loss": 0.81778741, "num_input_tokens_seen": 222806100, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25708008, "step": 10344, "time_per_iteration": 2.698265552520752 }, { "auxiliary_loss_clip": 0.013225, "auxiliary_loss_mlp": 0.00260659, "balance_loss_clip": 1.08795476, "balance_loss_mlp": 0.23382463, "epoch": 0.6219750488501428, "flos": 25411073086080.0, "grad_norm": 8.06868354690829, "language_loss": 0.69254136, "learning_rate": 1.3207399161990105e-06, "loss": 0.70837295, "num_input_tokens_seen": 222826575, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.26867676, "step": 10345, "time_per_iteration": 2.711132526397705 }, { "auxiliary_loss_clip": 0.01311377, "auxiliary_loss_mlp": 0.00262418, "balance_loss_clip": 1.07804346, "balance_loss_mlp": 0.23399878, "epoch": 0.6220351721028108, "flos": 20047383717120.0, "grad_norm": 11.544681077962773, "language_loss": 0.84592593, "learning_rate": 1.320373617348614e-06, "loss": 0.86166394, "num_input_tokens_seen": 222845285, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.28417969, "step": 10346, "time_per_iteration": 2.6951956748962402 }, { "auxiliary_loss_clip": 0.01338247, "auxiliary_loss_mlp": 0.00250807, "balance_loss_clip": 1.09729278, "balance_loss_mlp": 0.22251855, "epoch": 0.6220952953554787, "flos": 27488397363840.0, "grad_norm": 3.5352178875141003, "language_loss": 0.78411019, "learning_rate": 1.3200073442697171e-06, "loss": 0.80000073, "num_input_tokens_seen": 222864575, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.28283691, "step": 10347, "time_per_iteration": 2.740128755569458 }, { "auxiliary_loss_clip": 0.01319571, "auxiliary_loss_mlp": 0.00235082, "balance_loss_clip": 1.09024906, "balance_loss_mlp": 0.20988104, "epoch": 0.6221554186081467, "flos": 19207612673280.0, "grad_norm": 94.38055846182816, "language_loss": 0.79825634, "learning_rate": 1.3196410969762108e-06, "loss": 0.81380284, "num_input_tokens_seen": 222884420, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.25219727, "step": 10348, "time_per_iteration": 2.6733741760253906 }, { "auxiliary_loss_clip": 0.01330569, "auxiliary_loss_mlp": 0.00109322, "balance_loss_clip": 1.15283823, "balance_loss_mlp": 0.09983259, "epoch": 0.6222155418608146, "flos": 62950939989120.0, "grad_norm": 0.7858844834115778, "language_loss": 0.53497344, "learning_rate": 1.3192748754819815e-06, "loss": 0.54937232, "num_input_tokens_seen": 222944690, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.09472656, "step": 10349, "time_per_iteration": 3.195889472961426 }, { "auxiliary_loss_clip": 0.0130958, "auxiliary_loss_mlp": 0.00249782, "balance_loss_clip": 1.07720351, "balance_loss_mlp": 0.22386573, "epoch": 0.6222756651134826, "flos": 22601099099520.0, "grad_norm": 169.74657484505445, "language_loss": 0.78270519, "learning_rate": 1.3189086798009173e-06, "loss": 0.79829884, "num_input_tokens_seen": 222962990, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.25927734, "step": 10350, "time_per_iteration": 2.816863775253296 }, { "auxiliary_loss_clip": 0.01321268, "auxiliary_loss_mlp": 0.00251098, "balance_loss_clip": 1.08810639, "balance_loss_mlp": 0.22404924, "epoch": 0.6223357883661506, "flos": 21142228216320.0, "grad_norm": 3.74417644680404, "language_loss": 0.67113638, "learning_rate": 1.3185425099469046e-06, "loss": 0.68686002, "num_input_tokens_seen": 222980715, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.27062988, "step": 10351, "time_per_iteration": 2.658879041671753 }, { "auxiliary_loss_clip": 0.0132832, "auxiliary_loss_mlp": 0.00093404, "balance_loss_clip": 1.15198517, "balance_loss_mlp": 0.0849162, "epoch": 0.6223959116188186, "flos": 63765071700480.0, "grad_norm": 0.7963870113234681, "language_loss": 0.60843891, "learning_rate": 1.3181763659338276e-06, "loss": 0.62265617, "num_input_tokens_seen": 223040685, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.08496094, "step": 10352, "time_per_iteration": 3.1323366165161133 }, { "auxiliary_loss_clip": 0.01321444, "auxiliary_loss_mlp": 0.00236402, "balance_loss_clip": 1.09064555, "balance_loss_mlp": 0.20899597, "epoch": 0.6224560348714866, "flos": 22565727181440.0, "grad_norm": 35.56450302318675, "language_loss": 0.91001862, "learning_rate": 1.3178102477755714e-06, "loss": 0.92559707, "num_input_tokens_seen": 223059000, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.27404785, "step": 10353, "time_per_iteration": 2.673476457595825 }, { "auxiliary_loss_clip": 0.01308074, "auxiliary_loss_mlp": 0.00247064, "balance_loss_clip": 1.08094811, "balance_loss_mlp": 0.2217558, "epoch": 0.6225161581241545, "flos": 24097748112000.0, "grad_norm": 7.670524804699495, "language_loss": 0.83554864, "learning_rate": 1.3174441554860195e-06, "loss": 0.85110003, "num_input_tokens_seen": 223079345, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.25305176, "step": 10354, "time_per_iteration": 2.6669516563415527 }, { "auxiliary_loss_clip": 0.01303437, "auxiliary_loss_mlp": 0.00251317, "balance_loss_clip": 1.07422304, "balance_loss_mlp": 0.22569901, "epoch": 0.6225762813768225, "flos": 20443513881600.0, "grad_norm": 22.053093716543074, "language_loss": 0.8461594, "learning_rate": 1.3170780890790528e-06, "loss": 0.86170697, "num_input_tokens_seen": 223097880, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.25610352, "step": 10355, "time_per_iteration": 2.6259427070617676 }, { "auxiliary_loss_clip": 0.01309345, "auxiliary_loss_mlp": 0.00242098, "balance_loss_clip": 1.08026695, "balance_loss_mlp": 0.21612188, "epoch": 0.6226364046294904, "flos": 27198131558400.0, "grad_norm": 5.851559024277878, "language_loss": 0.85372877, "learning_rate": 1.3167120485685538e-06, "loss": 0.86924326, "num_input_tokens_seen": 223118185, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.2598877, "step": 10356, "time_per_iteration": 2.9869544506073 }, { "auxiliary_loss_clip": 0.0132444, "auxiliary_loss_mlp": 0.00274733, "balance_loss_clip": 1.09021139, "balance_loss_mlp": 0.24760097, "epoch": 0.6226965278821585, "flos": 20445776438400.0, "grad_norm": 20.281764857799615, "language_loss": 0.78111285, "learning_rate": 1.3163460339684024e-06, "loss": 0.7971046, "num_input_tokens_seen": 223137600, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.27148438, "step": 10357, "time_per_iteration": 2.6503679752349854 }, { "auxiliary_loss_clip": 0.01337755, "auxiliary_loss_mlp": 0.00279745, "balance_loss_clip": 1.09654653, "balance_loss_mlp": 0.2502763, "epoch": 0.6227566511348264, "flos": 22162737519360.0, "grad_norm": 12.79856802309104, "language_loss": 0.86712182, "learning_rate": 1.3159800452924778e-06, "loss": 0.88329685, "num_input_tokens_seen": 223154360, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.29467773, "step": 10358, "time_per_iteration": 2.6606297492980957 }, { "auxiliary_loss_clip": 0.01310164, "auxiliary_loss_mlp": 0.00238386, "balance_loss_clip": 1.07904267, "balance_loss_mlp": 0.21186206, "epoch": 0.6228167743874944, "flos": 18040875102720.0, "grad_norm": 759.1143070867084, "language_loss": 0.91620111, "learning_rate": 1.3156140825546588e-06, "loss": 0.93168664, "num_input_tokens_seen": 223172255, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.265625, "step": 10359, "time_per_iteration": 4.064505338668823 }, { "auxiliary_loss_clip": 0.01326422, "auxiliary_loss_mlp": 0.00231584, "balance_loss_clip": 1.09085572, "balance_loss_mlp": 0.20461929, "epoch": 0.6228768976401623, "flos": 17742851959680.0, "grad_norm": 4.723569207089729, "language_loss": 0.82445198, "learning_rate": 1.315248145768822e-06, "loss": 0.8400321, "num_input_tokens_seen": 223186965, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.26977539, "step": 10360, "time_per_iteration": 2.6189181804656982 }, { "auxiliary_loss_clip": 0.01338534, "auxiliary_loss_mlp": 0.00270905, "balance_loss_clip": 1.09944868, "balance_loss_mlp": 0.24253309, "epoch": 0.6229370208928303, "flos": 17894934144000.0, "grad_norm": 14.827282265720882, "language_loss": 0.86011112, "learning_rate": 1.3148822349488442e-06, "loss": 0.87620544, "num_input_tokens_seen": 223206045, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.28381348, "step": 10361, "time_per_iteration": 4.092345952987671 }, { "auxiliary_loss_clip": 0.01316102, "auxiliary_loss_mlp": 0.00230217, "balance_loss_clip": 1.08622253, "balance_loss_mlp": 0.20598215, "epoch": 0.6229971441454982, "flos": 17347763289600.0, "grad_norm": 3.1165834374882126, "language_loss": 0.75942373, "learning_rate": 1.3145163501086005e-06, "loss": 0.77488697, "num_input_tokens_seen": 223224820, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.24243164, "step": 10362, "time_per_iteration": 2.631775379180908 }, { "auxiliary_loss_clip": 0.01315844, "auxiliary_loss_mlp": 0.00276331, "balance_loss_clip": 1.08375299, "balance_loss_mlp": 0.24899636, "epoch": 0.6230572673981662, "flos": 29241376807680.0, "grad_norm": 7.406725308922693, "language_loss": 0.76214391, "learning_rate": 1.3141504912619658e-06, "loss": 0.77806568, "num_input_tokens_seen": 223243205, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.27355957, "step": 10363, "time_per_iteration": 2.7273290157318115 }, { "auxiliary_loss_clip": 0.01328076, "auxiliary_loss_mlp": 0.00262576, "balance_loss_clip": 1.08901477, "balance_loss_mlp": 0.23630184, "epoch": 0.6231173906508342, "flos": 16325961096960.0, "grad_norm": 2.294169353109777, "language_loss": 0.94826496, "learning_rate": 1.3137846584228127e-06, "loss": 0.96417147, "num_input_tokens_seen": 223261370, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.26269531, "step": 10364, "time_per_iteration": 4.01880407333374 }, { "auxiliary_loss_clip": 0.01343185, "auxiliary_loss_mlp": 0.00080751, "balance_loss_clip": 1.16561604, "balance_loss_mlp": 0.07331251, "epoch": 0.6231775139035022, "flos": 68702032517760.0, "grad_norm": 0.9034384558497288, "language_loss": 0.60344177, "learning_rate": 1.313418851605015e-06, "loss": 0.61768115, "num_input_tokens_seen": 223315050, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.07421875, "step": 10365, "time_per_iteration": 3.1797680854797363 }, { "auxiliary_loss_clip": 0.01332513, "auxiliary_loss_mlp": 0.00254394, "balance_loss_clip": 1.09341562, "balance_loss_mlp": 0.22534317, "epoch": 0.6232376371561702, "flos": 19821038163840.0, "grad_norm": 4.925275668886382, "language_loss": 0.85040683, "learning_rate": 1.3130530708224427e-06, "loss": 0.86627591, "num_input_tokens_seen": 223332130, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.29064941, "step": 10366, "time_per_iteration": 2.6355018615722656 }, { "auxiliary_loss_clip": 0.0133767, "auxiliary_loss_mlp": 0.00238238, "balance_loss_clip": 1.09752154, "balance_loss_mlp": 0.21055728, "epoch": 0.6232977604088381, "flos": 23258264376960.0, "grad_norm": 5.521972441108516, "language_loss": 0.84866011, "learning_rate": 1.3126873160889665e-06, "loss": 0.86441916, "num_input_tokens_seen": 223351605, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.27697754, "step": 10367, "time_per_iteration": 2.7085306644439697 }, { "auxiliary_loss_clip": 0.01329546, "auxiliary_loss_mlp": 0.00240021, "balance_loss_clip": 1.09611905, "balance_loss_mlp": 0.21424839, "epoch": 0.6233578836615061, "flos": 21106425335040.0, "grad_norm": 16.468926353292385, "language_loss": 0.84709287, "learning_rate": 1.312321587418457e-06, "loss": 0.86278856, "num_input_tokens_seen": 223372090, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.2578125, "step": 10368, "time_per_iteration": 2.661933660507202 }, { "auxiliary_loss_clip": 0.01342355, "auxiliary_loss_mlp": 0.00269108, "balance_loss_clip": 1.10135508, "balance_loss_mlp": 0.24035501, "epoch": 0.623418006914174, "flos": 23769416868480.0, "grad_norm": 8.308856144875948, "language_loss": 0.8006134, "learning_rate": 1.3119558848247811e-06, "loss": 0.816728, "num_input_tokens_seen": 223390110, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.28759766, "step": 10369, "time_per_iteration": 4.071518898010254 }, { "auxiliary_loss_clip": 0.01316041, "auxiliary_loss_mlp": 0.00259388, "balance_loss_clip": 1.08648896, "balance_loss_mlp": 0.23169559, "epoch": 0.6234781301668421, "flos": 17890480857600.0, "grad_norm": 10.402359575816, "language_loss": 0.94451338, "learning_rate": 1.3115902083218072e-06, "loss": 0.96026772, "num_input_tokens_seen": 223404205, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.27661133, "step": 10370, "time_per_iteration": 2.647996664047241 }, { "auxiliary_loss_clip": 0.01341479, "auxiliary_loss_mlp": 0.00286832, "balance_loss_clip": 1.10475254, "balance_loss_mlp": 0.25716048, "epoch": 0.62353825341951, "flos": 26175503352960.0, "grad_norm": 5.669240266890785, "language_loss": 0.71796858, "learning_rate": 1.311224557923402e-06, "loss": 0.73425168, "num_input_tokens_seen": 223424855, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.296875, "step": 10371, "time_per_iteration": 2.7879505157470703 }, { "auxiliary_loss_clip": 0.01335062, "auxiliary_loss_mlp": 0.00267255, "balance_loss_clip": 1.10400271, "balance_loss_mlp": 0.2411481, "epoch": 0.623598376672178, "flos": 31139902160640.0, "grad_norm": 70.31724079767974, "language_loss": 0.8212204, "learning_rate": 1.3108589336434298e-06, "loss": 0.83724362, "num_input_tokens_seen": 223447225, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.26086426, "step": 10372, "time_per_iteration": 2.7664785385131836 }, { "auxiliary_loss_clip": 0.013717, "auxiliary_loss_mlp": 0.00275482, "balance_loss_clip": 1.1210475, "balance_loss_mlp": 0.24444015, "epoch": 0.6236584999248459, "flos": 23730202195200.0, "grad_norm": 4.255487798977755, "language_loss": 0.83825195, "learning_rate": 1.3104933354957568e-06, "loss": 0.85472381, "num_input_tokens_seen": 223467520, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.31030273, "step": 10373, "time_per_iteration": 2.754181146621704 }, { "auxiliary_loss_clip": 0.01337034, "auxiliary_loss_mlp": 0.00253566, "balance_loss_clip": 1.10494709, "balance_loss_mlp": 0.22755468, "epoch": 0.6237186231775139, "flos": 21762764599680.0, "grad_norm": 13.294638636650623, "language_loss": 0.76040423, "learning_rate": 1.3101277634942448e-06, "loss": 0.77631027, "num_input_tokens_seen": 223488130, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.26000977, "step": 10374, "time_per_iteration": 2.6562414169311523 }, { "auxiliary_loss_clip": 0.01352857, "auxiliary_loss_mlp": 0.00240788, "balance_loss_clip": 1.11076021, "balance_loss_mlp": 0.21437109, "epoch": 0.6237787464301818, "flos": 14939486075520.0, "grad_norm": 2.674659180123151, "language_loss": 0.84392256, "learning_rate": 1.3097622176527577e-06, "loss": 0.85985905, "num_input_tokens_seen": 223505105, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.2644043, "step": 10375, "time_per_iteration": 2.650298833847046 }, { "auxiliary_loss_clip": 0.01331773, "auxiliary_loss_mlp": 0.00253114, "balance_loss_clip": 1.09864628, "balance_loss_mlp": 0.22730517, "epoch": 0.6238388696828499, "flos": 35590311302400.0, "grad_norm": 19.584333926395477, "language_loss": 0.76569808, "learning_rate": 1.3093966979851566e-06, "loss": 0.78154695, "num_input_tokens_seen": 223528065, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.25817871, "step": 10376, "time_per_iteration": 2.766901969909668 }, { "auxiliary_loss_clip": 0.01354674, "auxiliary_loss_mlp": 0.00279682, "balance_loss_clip": 1.11204231, "balance_loss_mlp": 0.24895018, "epoch": 0.6238989929355178, "flos": 23623511823360.0, "grad_norm": 12.874054872415622, "language_loss": 0.86961001, "learning_rate": 1.309031204505301e-06, "loss": 0.88595361, "num_input_tokens_seen": 223547305, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.30712891, "step": 10377, "time_per_iteration": 2.6772079467773438 }, { "auxiliary_loss_clip": 0.01344586, "auxiliary_loss_mlp": 0.00255796, "balance_loss_clip": 1.10530043, "balance_loss_mlp": 0.22885454, "epoch": 0.6239591161881858, "flos": 22087468569600.0, "grad_norm": 4.053190268110635, "language_loss": 0.77822709, "learning_rate": 1.308665737227052e-06, "loss": 0.79423094, "num_input_tokens_seen": 223567205, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.26928711, "step": 10378, "time_per_iteration": 2.6243035793304443 }, { "auxiliary_loss_clip": 0.01334251, "auxiliary_loss_mlp": 0.00250529, "balance_loss_clip": 1.10041142, "balance_loss_mlp": 0.2213106, "epoch": 0.6240192394408538, "flos": 24535930124160.0, "grad_norm": 12.40190590246399, "language_loss": 0.83606064, "learning_rate": 1.3083002961642675e-06, "loss": 0.85190845, "num_input_tokens_seen": 223586560, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.29223633, "step": 10379, "time_per_iteration": 2.7167038917541504 }, { "auxiliary_loss_clip": 0.01346753, "auxiliary_loss_mlp": 0.00262924, "balance_loss_clip": 1.10820162, "balance_loss_mlp": 0.23446855, "epoch": 0.6240793626935217, "flos": 27931930502400.0, "grad_norm": 3.3374631801015546, "language_loss": 0.84772301, "learning_rate": 1.3079348813308051e-06, "loss": 0.86381978, "num_input_tokens_seen": 223610595, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.28442383, "step": 10380, "time_per_iteration": 2.744037389755249 }, { "auxiliary_loss_clip": 0.01327406, "auxiliary_loss_mlp": 0.00241794, "balance_loss_clip": 1.09490943, "balance_loss_mlp": 0.21527007, "epoch": 0.6241394859461897, "flos": 22892514140160.0, "grad_norm": 7.008684197903679, "language_loss": 0.87047923, "learning_rate": 1.3075694927405207e-06, "loss": 0.88617122, "num_input_tokens_seen": 223630230, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.26550293, "step": 10381, "time_per_iteration": 2.7291626930236816 }, { "auxiliary_loss_clip": 0.0134302, "auxiliary_loss_mlp": 0.0028329, "balance_loss_clip": 1.10799968, "balance_loss_mlp": 0.25607419, "epoch": 0.6241996091988576, "flos": 12750766744320.0, "grad_norm": 2.4094153241824112, "language_loss": 0.83631754, "learning_rate": 1.3072041304072718e-06, "loss": 0.85258061, "num_input_tokens_seen": 223648360, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.27209473, "step": 10382, "time_per_iteration": 2.608396530151367 }, { "auxiliary_loss_clip": 0.01344036, "auxiliary_loss_mlp": 0.00269763, "balance_loss_clip": 1.10806036, "balance_loss_mlp": 0.24309596, "epoch": 0.6242597324515257, "flos": 25851302173440.0, "grad_norm": 361.17519112919894, "language_loss": 0.83507735, "learning_rate": 1.306838794344911e-06, "loss": 0.85121536, "num_input_tokens_seen": 223671255, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.26672363, "step": 10383, "time_per_iteration": 2.7286007404327393 }, { "auxiliary_loss_clip": 0.01334922, "auxiliary_loss_mlp": 0.00244739, "balance_loss_clip": 1.09984112, "balance_loss_mlp": 0.21893036, "epoch": 0.6243198557041936, "flos": 19937712516480.0, "grad_norm": 9.016681350425893, "language_loss": 0.82794553, "learning_rate": 1.3064734845672925e-06, "loss": 0.84374213, "num_input_tokens_seen": 223689860, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.25817871, "step": 10384, "time_per_iteration": 2.6625802516937256 }, { "auxiliary_loss_clip": 0.01343935, "auxiliary_loss_mlp": 0.00259852, "balance_loss_clip": 1.10291934, "balance_loss_mlp": 0.23300651, "epoch": 0.6243799789568616, "flos": 18406194376320.0, "grad_norm": 8.755352052072526, "language_loss": 0.75496769, "learning_rate": 1.3061082010882694e-06, "loss": 0.77100563, "num_input_tokens_seen": 223707835, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.26831055, "step": 10385, "time_per_iteration": 2.7052927017211914 }, { "auxiliary_loss_clip": 0.01341627, "auxiliary_loss_mlp": 0.0006174, "balance_loss_clip": 1.16149354, "balance_loss_mlp": 0.05115395, "epoch": 0.6244401022095295, "flos": 66027587523840.0, "grad_norm": 0.762078135743884, "language_loss": 0.61437762, "learning_rate": 1.305742943921692e-06, "loss": 0.62841129, "num_input_tokens_seen": 223771875, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.10595703, "step": 10386, "time_per_iteration": 3.165860652923584 }, { "auxiliary_loss_clip": 0.01350701, "auxiliary_loss_mlp": 0.00263467, "balance_loss_clip": 1.1084739, "balance_loss_mlp": 0.23595381, "epoch": 0.6245002254621975, "flos": 24571266128640.0, "grad_norm": 17.45672213259028, "language_loss": 0.79576224, "learning_rate": 1.3053777130814128e-06, "loss": 0.81190395, "num_input_tokens_seen": 223788895, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.27539062, "step": 10387, "time_per_iteration": 2.7145533561706543 }, { "auxiliary_loss_clip": 0.01348709, "auxiliary_loss_mlp": 0.00288051, "balance_loss_clip": 1.10569286, "balance_loss_mlp": 0.26037085, "epoch": 0.6245603487148654, "flos": 29168837291520.0, "grad_norm": 6.400978725763331, "language_loss": 0.71545857, "learning_rate": 1.3050125085812798e-06, "loss": 0.73182619, "num_input_tokens_seen": 223810385, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.27685547, "step": 10388, "time_per_iteration": 2.83872389793396 }, { "auxiliary_loss_clip": 0.01338606, "auxiliary_loss_mlp": 0.00277971, "balance_loss_clip": 1.1001035, "balance_loss_mlp": 0.25141111, "epoch": 0.6246204719675335, "flos": 14790097411200.0, "grad_norm": 10.044758002148804, "language_loss": 0.85740817, "learning_rate": 1.3046473304351417e-06, "loss": 0.8735739, "num_input_tokens_seen": 223826040, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.26574707, "step": 10389, "time_per_iteration": 2.682607889175415 }, { "auxiliary_loss_clip": 0.01320263, "auxiliary_loss_mlp": 0.0027873, "balance_loss_clip": 1.08948433, "balance_loss_mlp": 0.25287339, "epoch": 0.6246805952202014, "flos": 12493538472960.0, "grad_norm": 4.490717287663689, "language_loss": 0.70891106, "learning_rate": 1.3042821786568475e-06, "loss": 0.72490096, "num_input_tokens_seen": 223842300, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.25878906, "step": 10390, "time_per_iteration": 2.7706573009490967 }, { "auxiliary_loss_clip": 0.01329603, "auxiliary_loss_mlp": 0.00255591, "balance_loss_clip": 1.09194398, "balance_loss_mlp": 0.22748137, "epoch": 0.6247407184728694, "flos": 12786677366400.0, "grad_norm": 885.0060605204962, "language_loss": 0.86846793, "learning_rate": 1.3039170532602416e-06, "loss": 0.88431978, "num_input_tokens_seen": 223858320, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.28125, "step": 10391, "time_per_iteration": 2.7689011096954346 }, { "auxiliary_loss_clip": 0.0135915, "auxiliary_loss_mlp": 0.00268444, "balance_loss_clip": 1.11455822, "balance_loss_mlp": 0.24076359, "epoch": 0.6248008417255374, "flos": 40629188960640.0, "grad_norm": 4.687086035717943, "language_loss": 0.71962172, "learning_rate": 1.3035519542591718e-06, "loss": 0.73589766, "num_input_tokens_seen": 223883545, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.27636719, "step": 10392, "time_per_iteration": 2.8763935565948486 }, { "auxiliary_loss_clip": 0.01351295, "auxiliary_loss_mlp": 0.0024818, "balance_loss_clip": 1.10641575, "balance_loss_mlp": 0.21968938, "epoch": 0.6248609649782053, "flos": 19902017376000.0, "grad_norm": 31.60986690666227, "language_loss": 0.82824206, "learning_rate": 1.3031868816674819e-06, "loss": 0.84423685, "num_input_tokens_seen": 223901445, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.28515625, "step": 10393, "time_per_iteration": 2.652094602584839 }, { "auxiliary_loss_clip": 0.01337241, "auxiliary_loss_mlp": 0.00257256, "balance_loss_clip": 1.0950222, "balance_loss_mlp": 0.23077917, "epoch": 0.6249210882308733, "flos": 19682746801920.0, "grad_norm": 31.734883472654403, "language_loss": 0.90238667, "learning_rate": 1.3028218354990142e-06, "loss": 0.91833162, "num_input_tokens_seen": 223920170, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.26501465, "step": 10394, "time_per_iteration": 2.6590681076049805 }, { "auxiliary_loss_clip": 0.01349266, "auxiliary_loss_mlp": 0.00269364, "balance_loss_clip": 1.10449719, "balance_loss_mlp": 0.24235077, "epoch": 0.6249812114835412, "flos": 13990726189440.0, "grad_norm": 65.90563672470965, "language_loss": 0.84000355, "learning_rate": 1.3024568157676128e-06, "loss": 0.85618979, "num_input_tokens_seen": 223936495, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.2701416, "step": 10395, "time_per_iteration": 2.5890016555786133 }, { "auxiliary_loss_clip": 0.01344918, "auxiliary_loss_mlp": 0.00261116, "balance_loss_clip": 1.10419345, "balance_loss_mlp": 0.23295861, "epoch": 0.6250413347362093, "flos": 14530031965440.0, "grad_norm": 4.074906178134511, "language_loss": 0.82101476, "learning_rate": 1.302091822487119e-06, "loss": 0.83707505, "num_input_tokens_seen": 223950070, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.28137207, "step": 10396, "time_per_iteration": 2.6247849464416504 }, { "auxiliary_loss_clip": 0.013487, "auxiliary_loss_mlp": 0.00256699, "balance_loss_clip": 1.10811937, "balance_loss_mlp": 0.22978181, "epoch": 0.6251014579888772, "flos": 22963006581120.0, "grad_norm": 10.991154107588617, "language_loss": 0.82503033, "learning_rate": 1.3017268556713732e-06, "loss": 0.84108436, "num_input_tokens_seen": 223970065, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.26916504, "step": 10397, "time_per_iteration": 2.670114040374756 }, { "auxiliary_loss_clip": 0.01332651, "auxiliary_loss_mlp": 0.00267436, "balance_loss_clip": 1.09537876, "balance_loss_mlp": 0.24041152, "epoch": 0.6251615812415452, "flos": 28111232217600.0, "grad_norm": 15.574226082831267, "language_loss": 0.8395201, "learning_rate": 1.3013619153342154e-06, "loss": 0.85552096, "num_input_tokens_seen": 223990315, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.26989746, "step": 10398, "time_per_iteration": 2.713029384613037 }, { "auxiliary_loss_clip": 0.01341577, "auxiliary_loss_mlp": 0.00284747, "balance_loss_clip": 1.09926987, "balance_loss_mlp": 0.25663757, "epoch": 0.6252217044942131, "flos": 26724469887360.0, "grad_norm": 5.842797253686681, "language_loss": 0.82888776, "learning_rate": 1.300997001489483e-06, "loss": 0.84515095, "num_input_tokens_seen": 224009960, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.28125, "step": 10399, "time_per_iteration": 2.6707003116607666 }, { "auxiliary_loss_clip": 0.01355855, "auxiliary_loss_mlp": 0.00246344, "balance_loss_clip": 1.11021662, "balance_loss_mlp": 0.21862751, "epoch": 0.6252818277468811, "flos": 20006768413440.0, "grad_norm": 27.19580313097312, "language_loss": 0.82074273, "learning_rate": 1.3006321141510147e-06, "loss": 0.83676469, "num_input_tokens_seen": 224028870, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.27722168, "step": 10400, "time_per_iteration": 2.6289303302764893 }, { "auxiliary_loss_clip": 0.01311519, "auxiliary_loss_mlp": 0.00087559, "balance_loss_clip": 1.13840103, "balance_loss_mlp": 0.07973859, "epoch": 0.625341950999549, "flos": 59278285059840.0, "grad_norm": 0.8102866019197914, "language_loss": 0.55975646, "learning_rate": 1.3002672533326465e-06, "loss": 0.57374722, "num_input_tokens_seen": 224094140, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.078125, "step": 10401, "time_per_iteration": 3.22579026222229 }, { "auxiliary_loss_clip": 0.01326139, "auxiliary_loss_mlp": 0.00270131, "balance_loss_clip": 1.09041953, "balance_loss_mlp": 0.24328507, "epoch": 0.625402074252217, "flos": 20157090831360.0, "grad_norm": 28.958252519829728, "language_loss": 0.90508121, "learning_rate": 1.2999024190482146e-06, "loss": 0.92104393, "num_input_tokens_seen": 224113235, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.26818848, "step": 10402, "time_per_iteration": 4.123790502548218 }, { "auxiliary_loss_clip": 0.01326017, "auxiliary_loss_mlp": 0.00259324, "balance_loss_clip": 1.09209812, "balance_loss_mlp": 0.23382533, "epoch": 0.625462197504885, "flos": 29132531619840.0, "grad_norm": 14.127950612954809, "language_loss": 0.77789795, "learning_rate": 1.2995376113115527e-06, "loss": 0.79375142, "num_input_tokens_seen": 224134530, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.25500488, "step": 10403, "time_per_iteration": 4.132863283157349 }, { "auxiliary_loss_clip": 0.01334924, "auxiliary_loss_mlp": 0.00252713, "balance_loss_clip": 1.09993267, "balance_loss_mlp": 0.22603391, "epoch": 0.625522320757553, "flos": 26104436294400.0, "grad_norm": 86.43826748192993, "language_loss": 0.79481924, "learning_rate": 1.2991728301364954e-06, "loss": 0.81069559, "num_input_tokens_seen": 224154170, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.26660156, "step": 10404, "time_per_iteration": 2.704120397567749 }, { "auxiliary_loss_clip": 0.01329499, "auxiliary_loss_mlp": 0.0027953, "balance_loss_clip": 1.09390962, "balance_loss_mlp": 0.25209987, "epoch": 0.625582444010221, "flos": 20630967984000.0, "grad_norm": 2114.5465056534167, "language_loss": 0.76070237, "learning_rate": 1.2988080755368742e-06, "loss": 0.77679271, "num_input_tokens_seen": 224172730, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.27416992, "step": 10405, "time_per_iteration": 2.653061628341675 }, { "auxiliary_loss_clip": 0.01322626, "auxiliary_loss_mlp": 0.00250096, "balance_loss_clip": 1.09114003, "balance_loss_mlp": 0.22366706, "epoch": 0.6256425672628889, "flos": 20521512264960.0, "grad_norm": 28.391013653367466, "language_loss": 0.85545009, "learning_rate": 1.2984433475265207e-06, "loss": 0.87117732, "num_input_tokens_seen": 224192620, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.26403809, "step": 10406, "time_per_iteration": 2.674349546432495 }, { "auxiliary_loss_clip": 0.01320766, "auxiliary_loss_mlp": 0.00258269, "balance_loss_clip": 1.09020996, "balance_loss_mlp": 0.23179284, "epoch": 0.6257026905155569, "flos": 29529200488320.0, "grad_norm": 6.24189336385816, "language_loss": 0.76582867, "learning_rate": 1.2980786461192666e-06, "loss": 0.78161901, "num_input_tokens_seen": 224214660, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.26464844, "step": 10407, "time_per_iteration": 4.117915630340576 }, { "auxiliary_loss_clip": 0.0132234, "auxiliary_loss_mlp": 0.00281979, "balance_loss_clip": 1.09012055, "balance_loss_mlp": 0.25768426, "epoch": 0.6257628137682248, "flos": 24024885373440.0, "grad_norm": 2.2064013586626854, "language_loss": 0.90433538, "learning_rate": 1.2977139713289398e-06, "loss": 0.92037857, "num_input_tokens_seen": 224234170, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.24291992, "step": 10408, "time_per_iteration": 2.68227481842041 }, { "auxiliary_loss_clip": 0.01321803, "auxiliary_loss_mlp": 0.00275116, "balance_loss_clip": 1.08886981, "balance_loss_mlp": 0.25055876, "epoch": 0.6258229370208929, "flos": 20850956830080.0, "grad_norm": 12.63246138116142, "language_loss": 0.86920726, "learning_rate": 1.2973493231693699e-06, "loss": 0.88517642, "num_input_tokens_seen": 224253115, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.24560547, "step": 10409, "time_per_iteration": 2.6660523414611816 }, { "auxiliary_loss_clip": 0.01316532, "auxiliary_loss_mlp": 0.00281407, "balance_loss_clip": 1.08667564, "balance_loss_mlp": 0.2538341, "epoch": 0.6258830602735608, "flos": 22231542021120.0, "grad_norm": 28.428126305879452, "language_loss": 0.7715919, "learning_rate": 1.2969847016543845e-06, "loss": 0.78757131, "num_input_tokens_seen": 224271375, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.27587891, "step": 10410, "time_per_iteration": 2.6441843509674072 }, { "auxiliary_loss_clip": 0.01302935, "auxiliary_loss_mlp": 0.00264751, "balance_loss_clip": 1.08108997, "balance_loss_mlp": 0.23912065, "epoch": 0.6259431835262288, "flos": 25076887925760.0, "grad_norm": 3.850156390872138, "language_loss": 0.74762821, "learning_rate": 1.2966201067978086e-06, "loss": 0.76330507, "num_input_tokens_seen": 224290315, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25646973, "step": 10411, "time_per_iteration": 4.0202476978302 }, { "auxiliary_loss_clip": 0.0133406, "auxiliary_loss_mlp": 0.00256214, "balance_loss_clip": 1.09374714, "balance_loss_mlp": 0.22889078, "epoch": 0.6260033067788967, "flos": 28252288926720.0, "grad_norm": 40.89638216731196, "language_loss": 0.79599249, "learning_rate": 1.2962555386134702e-06, "loss": 0.81189525, "num_input_tokens_seen": 224310545, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.27331543, "step": 10412, "time_per_iteration": 2.7574076652526855 }, { "auxiliary_loss_clip": 0.01324835, "auxiliary_loss_mlp": 0.00269927, "balance_loss_clip": 1.09307802, "balance_loss_mlp": 0.24172217, "epoch": 0.6260634300315647, "flos": 23367432787200.0, "grad_norm": 82.31161830053041, "language_loss": 0.77125812, "learning_rate": 1.2958909971151908e-06, "loss": 0.78720576, "num_input_tokens_seen": 224331115, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.28222656, "step": 10413, "time_per_iteration": 2.6715426445007324 }, { "auxiliary_loss_clip": 0.01346565, "auxiliary_loss_mlp": 0.00272228, "balance_loss_clip": 1.1019907, "balance_loss_mlp": 0.24430926, "epoch": 0.6261235532842326, "flos": 18035308494720.0, "grad_norm": 40.66113303459306, "language_loss": 0.90316451, "learning_rate": 1.295526482316796e-06, "loss": 0.91935241, "num_input_tokens_seen": 224347525, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.27905273, "step": 10414, "time_per_iteration": 2.6534852981567383 }, { "auxiliary_loss_clip": 0.01316471, "auxiliary_loss_mlp": 0.0026896, "balance_loss_clip": 1.08513832, "balance_loss_mlp": 0.24213783, "epoch": 0.6261836765369007, "flos": 22011265866240.0, "grad_norm": 7.591153931373517, "language_loss": 0.81124806, "learning_rate": 1.2951619942321083e-06, "loss": 0.82710236, "num_input_tokens_seen": 224367045, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.26818848, "step": 10415, "time_per_iteration": 2.634305715560913 }, { "auxiliary_loss_clip": 0.01319453, "auxiliary_loss_mlp": 0.00265152, "balance_loss_clip": 1.0896765, "balance_loss_mlp": 0.23981974, "epoch": 0.6262437997895686, "flos": 24936010784640.0, "grad_norm": 63.39845988689822, "language_loss": 0.8183682, "learning_rate": 1.2947975328749472e-06, "loss": 0.83421421, "num_input_tokens_seen": 224388860, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.25305176, "step": 10416, "time_per_iteration": 2.725013017654419 }, { "auxiliary_loss_clip": 0.01304585, "auxiliary_loss_mlp": 0.0024013, "balance_loss_clip": 1.07669377, "balance_loss_mlp": 0.21472698, "epoch": 0.6263039230422366, "flos": 31608428186880.0, "grad_norm": 7.518597740985801, "language_loss": 0.90905929, "learning_rate": 1.2944330982591352e-06, "loss": 0.92450643, "num_input_tokens_seen": 224409645, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.25402832, "step": 10417, "time_per_iteration": 2.732072353363037 }, { "auxiliary_loss_clip": 0.01344354, "auxiliary_loss_mlp": 0.00244837, "balance_loss_clip": 1.10315061, "balance_loss_mlp": 0.21819374, "epoch": 0.6263640462949046, "flos": 17639465639040.0, "grad_norm": 80.69032500854921, "language_loss": 0.73287094, "learning_rate": 1.2940686903984904e-06, "loss": 0.74876285, "num_input_tokens_seen": 224428530, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.26672363, "step": 10418, "time_per_iteration": 2.6286497116088867 }, { "auxiliary_loss_clip": 0.01331137, "auxiliary_loss_mlp": 0.00278216, "balance_loss_clip": 1.09382498, "balance_loss_mlp": 0.25057095, "epoch": 0.6264241695475725, "flos": 19974951941760.0, "grad_norm": 32.90128397452683, "language_loss": 0.92345577, "learning_rate": 1.2937043093068316e-06, "loss": 0.93954927, "num_input_tokens_seen": 224447175, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.2767334, "step": 10419, "time_per_iteration": 2.646334648132324 }, { "auxiliary_loss_clip": 0.0134835, "auxiliary_loss_mlp": 0.00252965, "balance_loss_clip": 1.10705709, "balance_loss_mlp": 0.22591668, "epoch": 0.6264842928002405, "flos": 27344323912320.0, "grad_norm": 5.2950551501701035, "language_loss": 0.7134397, "learning_rate": 1.2933399549979762e-06, "loss": 0.72945279, "num_input_tokens_seen": 224469445, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.27026367, "step": 10420, "time_per_iteration": 2.7286884784698486 }, { "auxiliary_loss_clip": 0.01326697, "auxiliary_loss_mlp": 0.00262887, "balance_loss_clip": 1.09235549, "balance_loss_mlp": 0.23657787, "epoch": 0.6265444160529084, "flos": 22997265177600.0, "grad_norm": 110.27835241838423, "language_loss": 0.93244493, "learning_rate": 1.292975627485741e-06, "loss": 0.94834077, "num_input_tokens_seen": 224486590, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.26318359, "step": 10421, "time_per_iteration": 2.665055990219116 }, { "auxiliary_loss_clip": 0.01313569, "auxiliary_loss_mlp": 0.00247235, "balance_loss_clip": 1.08054805, "balance_loss_mlp": 0.22030574, "epoch": 0.6266045393055765, "flos": 19938323047680.0, "grad_norm": 155.73058555067522, "language_loss": 0.88227272, "learning_rate": 1.2926113267839403e-06, "loss": 0.89788079, "num_input_tokens_seen": 224502795, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.26928711, "step": 10422, "time_per_iteration": 2.652860403060913 }, { "auxiliary_loss_clip": 0.0131458, "auxiliary_loss_mlp": 0.00268075, "balance_loss_clip": 1.08182096, "balance_loss_mlp": 0.24087167, "epoch": 0.6266646625582444, "flos": 24389091325440.0, "grad_norm": 15.699479029572815, "language_loss": 0.82155341, "learning_rate": 1.292247052906389e-06, "loss": 0.83737993, "num_input_tokens_seen": 224522300, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.27233887, "step": 10423, "time_per_iteration": 2.6847589015960693 }, { "auxiliary_loss_clip": 0.01288345, "auxiliary_loss_mlp": 0.002365, "balance_loss_clip": 1.06130791, "balance_loss_mlp": 0.21278977, "epoch": 0.6267247858109124, "flos": 14683802088960.0, "grad_norm": 52.665510222894035, "language_loss": 0.86662817, "learning_rate": 1.2918828058669004e-06, "loss": 0.88187659, "num_input_tokens_seen": 224538260, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.23706055, "step": 10424, "time_per_iteration": 2.619312047958374 }, { "auxiliary_loss_clip": 0.01318309, "auxiliary_loss_mlp": 0.00262144, "balance_loss_clip": 1.08714747, "balance_loss_mlp": 0.23581108, "epoch": 0.6267849090635803, "flos": 24929977299840.0, "grad_norm": 107.59525905105433, "language_loss": 0.77174336, "learning_rate": 1.2915185856792868e-06, "loss": 0.78754789, "num_input_tokens_seen": 224559155, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.26330566, "step": 10425, "time_per_iteration": 2.691349983215332 }, { "auxiliary_loss_clip": 0.01293004, "auxiliary_loss_mlp": 0.00250553, "balance_loss_clip": 1.07052159, "balance_loss_mlp": 0.22538809, "epoch": 0.6268450323162483, "flos": 25337851211520.0, "grad_norm": 75.29472115060186, "language_loss": 0.82863641, "learning_rate": 1.2911543923573598e-06, "loss": 0.84407198, "num_input_tokens_seen": 224578660, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25183105, "step": 10426, "time_per_iteration": 2.689901828765869 }, { "auxiliary_loss_clip": 0.01313556, "auxiliary_loss_mlp": 0.00240309, "balance_loss_clip": 1.08151197, "balance_loss_mlp": 0.21590656, "epoch": 0.6269051555689162, "flos": 26177299032960.0, "grad_norm": 119.2563067294524, "language_loss": 0.85685945, "learning_rate": 1.290790225914929e-06, "loss": 0.87239808, "num_input_tokens_seen": 224599080, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.24401855, "step": 10427, "time_per_iteration": 2.81270432472229 }, { "auxiliary_loss_clip": 0.01319034, "auxiliary_loss_mlp": 0.00235834, "balance_loss_clip": 1.08539176, "balance_loss_mlp": 0.20987034, "epoch": 0.6269652788215843, "flos": 18256877539200.0, "grad_norm": 231.24490482829583, "language_loss": 0.75583982, "learning_rate": 1.2904260863658034e-06, "loss": 0.77138853, "num_input_tokens_seen": 224614225, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.25964355, "step": 10428, "time_per_iteration": 2.7009880542755127 }, { "auxiliary_loss_clip": 0.01312031, "auxiliary_loss_mlp": 0.00239316, "balance_loss_clip": 1.08198619, "balance_loss_mlp": 0.213805, "epoch": 0.6270254020742522, "flos": 11765413877760.0, "grad_norm": 3.7551174064192936, "language_loss": 0.80538917, "learning_rate": 1.2900619737237928e-06, "loss": 0.82090265, "num_input_tokens_seen": 224632365, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.25512695, "step": 10429, "time_per_iteration": 2.6409249305725098 }, { "auxiliary_loss_clip": 0.01324609, "auxiliary_loss_mlp": 0.00230519, "balance_loss_clip": 1.08811343, "balance_loss_mlp": 0.20255294, "epoch": 0.6270855253269202, "flos": 23475631530240.0, "grad_norm": 29.500078038513994, "language_loss": 0.86391199, "learning_rate": 1.2896978880027023e-06, "loss": 0.87946326, "num_input_tokens_seen": 224651125, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.27941895, "step": 10430, "time_per_iteration": 2.6862316131591797 }, { "auxiliary_loss_clip": 0.01251359, "auxiliary_loss_mlp": 0.00083428, "balance_loss_clip": 1.07324374, "balance_loss_mlp": 0.0744155, "epoch": 0.6271456485795882, "flos": 70064520232320.0, "grad_norm": 0.7522777165611687, "language_loss": 0.58938313, "learning_rate": 1.2893338292163393e-06, "loss": 0.60273099, "num_input_tokens_seen": 224716115, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.09033203, "step": 10431, "time_per_iteration": 3.273266553878784 }, { "auxiliary_loss_clip": 0.01251259, "auxiliary_loss_mlp": 0.00075898, "balance_loss_clip": 1.07331789, "balance_loss_mlp": 0.06750559, "epoch": 0.6272057718322561, "flos": 65156718280320.0, "grad_norm": 0.8576167730837732, "language_loss": 0.63294315, "learning_rate": 1.2889697973785095e-06, "loss": 0.64621472, "num_input_tokens_seen": 224782930, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.08398438, "step": 10432, "time_per_iteration": 3.179032802581787 }, { "auxiliary_loss_clip": 0.01293705, "auxiliary_loss_mlp": 0.00233941, "balance_loss_clip": 1.06968701, "balance_loss_mlp": 0.21026629, "epoch": 0.6272658950849241, "flos": 24389342720640.0, "grad_norm": 147.07556694217993, "language_loss": 0.73676288, "learning_rate": 1.2886057925030153e-06, "loss": 0.75203931, "num_input_tokens_seen": 224802010, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.2364502, "step": 10433, "time_per_iteration": 2.7192723751068115 }, { "auxiliary_loss_clip": 0.01334424, "auxiliary_loss_mlp": 0.00218409, "balance_loss_clip": 1.09119797, "balance_loss_mlp": 0.18956015, "epoch": 0.627326018337592, "flos": 17966001202560.0, "grad_norm": 31.259050625114433, "language_loss": 0.7667942, "learning_rate": 1.2882418146036612e-06, "loss": 0.78232253, "num_input_tokens_seen": 224818875, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.28845215, "step": 10434, "time_per_iteration": 2.6397247314453125 }, { "auxiliary_loss_clip": 0.01310251, "auxiliary_loss_mlp": 0.00229963, "balance_loss_clip": 1.07723844, "balance_loss_mlp": 0.20124534, "epoch": 0.6273861415902601, "flos": 20230097224320.0, "grad_norm": 13.162355238446764, "language_loss": 0.92797196, "learning_rate": 1.2878778636942484e-06, "loss": 0.94337416, "num_input_tokens_seen": 224837790, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.28723145, "step": 10435, "time_per_iteration": 2.7174577713012695 }, { "auxiliary_loss_clip": 0.01257471, "auxiliary_loss_mlp": 0.00071901, "balance_loss_clip": 1.08293295, "balance_loss_mlp": 0.06179186, "epoch": 0.627446264842928, "flos": 64953210798720.0, "grad_norm": 0.71809535520133, "language_loss": 0.6110658, "learning_rate": 1.2875139397885786e-06, "loss": 0.62435955, "num_input_tokens_seen": 224899685, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.10107422, "step": 10436, "time_per_iteration": 3.1275362968444824 }, { "auxiliary_loss_clip": 0.01306439, "auxiliary_loss_mlp": 0.00228892, "balance_loss_clip": 1.07917476, "balance_loss_mlp": 0.20338088, "epoch": 0.627506388095596, "flos": 23584261236480.0, "grad_norm": 13.847731655750236, "language_loss": 0.85308814, "learning_rate": 1.2871500429004523e-06, "loss": 0.86844146, "num_input_tokens_seen": 224918650, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.25524902, "step": 10437, "time_per_iteration": 2.658050298690796 }, { "auxiliary_loss_clip": 0.01268495, "auxiliary_loss_mlp": 0.0011132, "balance_loss_clip": 1.09378052, "balance_loss_mlp": 0.10135391, "epoch": 0.6275665113482639, "flos": 67583631674880.0, "grad_norm": 0.7368724395395051, "language_loss": 0.53695112, "learning_rate": 1.2867861730436667e-06, "loss": 0.55074924, "num_input_tokens_seen": 224981575, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.09960938, "step": 10438, "time_per_iteration": 3.0657083988189697 }, { "auxiliary_loss_clip": 0.01320932, "auxiliary_loss_mlp": 0.00234235, "balance_loss_clip": 1.08675218, "balance_loss_mlp": 0.20722215, "epoch": 0.6276266346009319, "flos": 27636924101760.0, "grad_norm": 32.86300113927001, "language_loss": 0.91692388, "learning_rate": 1.2864223302320214e-06, "loss": 0.93247557, "num_input_tokens_seen": 225000820, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.27038574, "step": 10439, "time_per_iteration": 2.7613890171051025 }, { "auxiliary_loss_clip": 0.01315649, "auxiliary_loss_mlp": 0.00237212, "balance_loss_clip": 1.08465505, "balance_loss_mlp": 0.20719466, "epoch": 0.6276867578535998, "flos": 22746142218240.0, "grad_norm": 18.81804963355678, "language_loss": 0.8892622, "learning_rate": 1.2860585144793128e-06, "loss": 0.90479082, "num_input_tokens_seen": 225017585, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.30029297, "step": 10440, "time_per_iteration": 2.635059356689453 }, { "auxiliary_loss_clip": 0.01306757, "auxiliary_loss_mlp": 0.00202852, "balance_loss_clip": 1.08481753, "balance_loss_mlp": 0.17692348, "epoch": 0.6277468811062679, "flos": 24644200694400.0, "grad_norm": 47.240359905268356, "language_loss": 0.80488908, "learning_rate": 1.285694725799337e-06, "loss": 0.81998515, "num_input_tokens_seen": 225039085, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.25915527, "step": 10441, "time_per_iteration": 2.6990294456481934 }, { "auxiliary_loss_clip": 0.01299143, "auxiliary_loss_mlp": 0.00221999, "balance_loss_clip": 1.07389534, "balance_loss_mlp": 0.19512957, "epoch": 0.6278070043589358, "flos": 19678975873920.0, "grad_norm": 4.585640952974947, "language_loss": 0.81251407, "learning_rate": 1.2853309642058884e-06, "loss": 0.82772547, "num_input_tokens_seen": 225058105, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.26867676, "step": 10442, "time_per_iteration": 2.6201555728912354 }, { "auxiliary_loss_clip": 0.01306357, "auxiliary_loss_mlp": 0.00229691, "balance_loss_clip": 1.07650208, "balance_loss_mlp": 0.20119961, "epoch": 0.6278671276116038, "flos": 22121834906880.0, "grad_norm": 124.1113859010021, "language_loss": 0.78122354, "learning_rate": 1.284967229712762e-06, "loss": 0.79658401, "num_input_tokens_seen": 225077605, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.28491211, "step": 10443, "time_per_iteration": 2.6732349395751953 }, { "auxiliary_loss_clip": 0.01302043, "auxiliary_loss_mlp": 0.00244872, "balance_loss_clip": 1.06963265, "balance_loss_mlp": 0.21733472, "epoch": 0.6279272508642717, "flos": 23038562839680.0, "grad_norm": 45.62759597102971, "language_loss": 0.80014038, "learning_rate": 1.2846035223337492e-06, "loss": 0.81560957, "num_input_tokens_seen": 225097775, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.2755127, "step": 10444, "time_per_iteration": 4.132807493209839 }, { "auxiliary_loss_clip": 0.01291482, "auxiliary_loss_mlp": 0.00203938, "balance_loss_clip": 1.06503439, "balance_loss_mlp": 0.17599498, "epoch": 0.6279873741169397, "flos": 19824090819840.0, "grad_norm": 323.9000556479844, "language_loss": 0.79359996, "learning_rate": 1.2842398420826423e-06, "loss": 0.80855417, "num_input_tokens_seen": 225115585, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.27941895, "step": 10445, "time_per_iteration": 4.087191104888916 }, { "auxiliary_loss_clip": 0.01300094, "auxiliary_loss_mlp": 0.00221388, "balance_loss_clip": 1.07146406, "balance_loss_mlp": 0.19605637, "epoch": 0.6280474973696077, "flos": 23915393740800.0, "grad_norm": 4.051088917182642, "language_loss": 0.76879764, "learning_rate": 1.2838761889732331e-06, "loss": 0.7840125, "num_input_tokens_seen": 225135575, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.25317383, "step": 10446, "time_per_iteration": 2.7174670696258545 }, { "auxiliary_loss_clip": 0.01309734, "auxiliary_loss_mlp": 0.00229767, "balance_loss_clip": 1.07861543, "balance_loss_mlp": 0.20326632, "epoch": 0.6281076206222757, "flos": 17967976450560.0, "grad_norm": 11.916150447348647, "language_loss": 0.83875918, "learning_rate": 1.2835125630193102e-06, "loss": 0.85415423, "num_input_tokens_seen": 225154230, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.26489258, "step": 10447, "time_per_iteration": 2.773871898651123 }, { "auxiliary_loss_clip": 0.01251262, "auxiliary_loss_mlp": 0.00057019, "balance_loss_clip": 1.08673668, "balance_loss_mlp": 0.04600453, "epoch": 0.6281677438749437, "flos": 66778370622720.0, "grad_norm": 0.6685129180595251, "language_loss": 0.51600134, "learning_rate": 1.2831489642346626e-06, "loss": 0.52908421, "num_input_tokens_seen": 225213650, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.11035156, "step": 10448, "time_per_iteration": 3.0455448627471924 }, { "auxiliary_loss_clip": 0.013235, "auxiliary_loss_mlp": 0.00223829, "balance_loss_clip": 1.09320772, "balance_loss_mlp": 0.19694722, "epoch": 0.6282278671276116, "flos": 11656173640320.0, "grad_norm": 399.3275957078817, "language_loss": 1.0084759, "learning_rate": 1.282785392633079e-06, "loss": 1.02394915, "num_input_tokens_seen": 225230135, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.26904297, "step": 10449, "time_per_iteration": 4.049811601638794 }, { "auxiliary_loss_clip": 0.01302753, "auxiliary_loss_mlp": 0.00232359, "balance_loss_clip": 1.07747746, "balance_loss_mlp": 0.20776649, "epoch": 0.6282879903802796, "flos": 42741597847680.0, "grad_norm": 9.937495098080724, "language_loss": 0.68571544, "learning_rate": 1.2824218482283438e-06, "loss": 0.70106661, "num_input_tokens_seen": 225253520, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.24572754, "step": 10450, "time_per_iteration": 2.834761142730713 }, { "auxiliary_loss_clip": 0.01301831, "auxiliary_loss_mlp": 0.00218139, "balance_loss_clip": 1.07948458, "balance_loss_mlp": 0.19378486, "epoch": 0.6283481136329475, "flos": 20009210538240.0, "grad_norm": 5.377746820468152, "language_loss": 0.83257997, "learning_rate": 1.2820583310342452e-06, "loss": 0.84777963, "num_input_tokens_seen": 225272460, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.24353027, "step": 10451, "time_per_iteration": 2.6560468673706055 }, { "auxiliary_loss_clip": 0.01317832, "auxiliary_loss_mlp": 0.0021809, "balance_loss_clip": 1.08811355, "balance_loss_mlp": 0.18889546, "epoch": 0.6284082368856155, "flos": 21904431840000.0, "grad_norm": 12.914296180069416, "language_loss": 0.85119534, "learning_rate": 1.281694841064566e-06, "loss": 0.86655462, "num_input_tokens_seen": 225291700, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.29187012, "step": 10452, "time_per_iteration": 2.666250467300415 }, { "auxiliary_loss_clip": 0.01291397, "auxiliary_loss_mlp": 0.00222872, "balance_loss_clip": 1.07043362, "balance_loss_mlp": 0.19602621, "epoch": 0.6284683601382834, "flos": 25484187219840.0, "grad_norm": 14.62094236623885, "language_loss": 0.80647463, "learning_rate": 1.2813313783330904e-06, "loss": 0.82161731, "num_input_tokens_seen": 225311470, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26831055, "step": 10453, "time_per_iteration": 4.0778467655181885 }, { "auxiliary_loss_clip": 0.01296106, "auxiliary_loss_mlp": 0.00208728, "balance_loss_clip": 1.07050312, "balance_loss_mlp": 0.18195391, "epoch": 0.6285284833909515, "flos": 16538695395840.0, "grad_norm": 3.1622425976700104, "language_loss": 0.88310778, "learning_rate": 1.2809679428536013e-06, "loss": 0.89815617, "num_input_tokens_seen": 225328385, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26794434, "step": 10454, "time_per_iteration": 2.6679470539093018 }, { "auxiliary_loss_clip": 0.01297718, "auxiliary_loss_mlp": 0.00233539, "balance_loss_clip": 1.07891154, "balance_loss_mlp": 0.20827839, "epoch": 0.6285886066436194, "flos": 22820692896000.0, "grad_norm": 42.943975526142346, "language_loss": 0.89965487, "learning_rate": 1.2806045346398792e-06, "loss": 0.91496742, "num_input_tokens_seen": 225348415, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25268555, "step": 10455, "time_per_iteration": 2.674139976501465 }, { "auxiliary_loss_clip": 0.01298343, "auxiliary_loss_mlp": 0.00215138, "balance_loss_clip": 1.07538998, "balance_loss_mlp": 0.1909145, "epoch": 0.6286487298962874, "flos": 24715734629760.0, "grad_norm": 9.125608513769647, "language_loss": 0.89761829, "learning_rate": 1.280241153705706e-06, "loss": 0.91275305, "num_input_tokens_seen": 225367740, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.24230957, "step": 10456, "time_per_iteration": 2.66049861907959 }, { "auxiliary_loss_clip": 0.01302072, "auxiliary_loss_mlp": 0.00230222, "balance_loss_clip": 1.07806313, "balance_loss_mlp": 0.20294675, "epoch": 0.6287088531489553, "flos": 20740818752640.0, "grad_norm": 15.115764829805041, "language_loss": 0.81673115, "learning_rate": 1.27987780006486e-06, "loss": 0.83205414, "num_input_tokens_seen": 225388405, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.27282715, "step": 10457, "time_per_iteration": 2.665334463119507 }, { "auxiliary_loss_clip": 0.01312895, "auxiliary_loss_mlp": 0.00227216, "balance_loss_clip": 1.08229327, "balance_loss_mlp": 0.20053747, "epoch": 0.6287689764016233, "flos": 23070630706560.0, "grad_norm": 17.897092378241208, "language_loss": 0.88829565, "learning_rate": 1.2795144737311202e-06, "loss": 0.90369678, "num_input_tokens_seen": 225408360, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.26660156, "step": 10458, "time_per_iteration": 2.6691384315490723 }, { "auxiliary_loss_clip": 0.01311068, "auxiliary_loss_mlp": 0.00226431, "balance_loss_clip": 1.08221674, "balance_loss_mlp": 0.19927472, "epoch": 0.6288290996542913, "flos": 32233669251840.0, "grad_norm": 5.753699275963532, "language_loss": 0.6914717, "learning_rate": 1.2791511747182635e-06, "loss": 0.70684671, "num_input_tokens_seen": 225431310, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.27111816, "step": 10459, "time_per_iteration": 2.7937045097351074 }, { "auxiliary_loss_clip": 0.01305186, "auxiliary_loss_mlp": 0.00202337, "balance_loss_clip": 1.07785642, "balance_loss_mlp": 0.17618228, "epoch": 0.6288892229069593, "flos": 24641327606400.0, "grad_norm": 102.04876601120655, "language_loss": 0.85218853, "learning_rate": 1.2787879030400666e-06, "loss": 0.86726373, "num_input_tokens_seen": 225450385, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.26159668, "step": 10460, "time_per_iteration": 2.6624557971954346 }, { "auxiliary_loss_clip": 0.01313559, "auxiliary_loss_mlp": 0.00224403, "balance_loss_clip": 1.08890986, "balance_loss_mlp": 0.19705683, "epoch": 0.6289493461596273, "flos": 17858341163520.0, "grad_norm": 49.89238880377945, "language_loss": 0.8155368, "learning_rate": 1.2784246587103047e-06, "loss": 0.8309164, "num_input_tokens_seen": 225467325, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.27355957, "step": 10461, "time_per_iteration": 2.6436572074890137 }, { "auxiliary_loss_clip": 0.01309974, "auxiliary_loss_mlp": 0.0022474, "balance_loss_clip": 1.08981895, "balance_loss_mlp": 0.20089795, "epoch": 0.6290094694122952, "flos": 22345379199360.0, "grad_norm": 5.600496338467464, "language_loss": 0.77838862, "learning_rate": 1.2780614417427523e-06, "loss": 0.79373586, "num_input_tokens_seen": 225487370, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.23852539, "step": 10462, "time_per_iteration": 2.645785093307495 }, { "auxiliary_loss_clip": 0.01290452, "auxiliary_loss_mlp": 0.00214325, "balance_loss_clip": 1.0699693, "balance_loss_mlp": 0.18900478, "epoch": 0.6290695926649632, "flos": 28402431776640.0, "grad_norm": 2.44829488318691, "language_loss": 0.80815423, "learning_rate": 1.2776982521511821e-06, "loss": 0.82320201, "num_input_tokens_seen": 225506915, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.25317383, "step": 10463, "time_per_iteration": 2.703110694885254 }, { "auxiliary_loss_clip": 0.01316995, "auxiliary_loss_mlp": 0.00227154, "balance_loss_clip": 1.09152842, "balance_loss_mlp": 0.20114288, "epoch": 0.6291297159176311, "flos": 21505464501120.0, "grad_norm": 49.188995264057866, "language_loss": 0.79169226, "learning_rate": 1.2773350899493665e-06, "loss": 0.80713379, "num_input_tokens_seen": 225525670, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.26025391, "step": 10464, "time_per_iteration": 2.612696886062622 }, { "auxiliary_loss_clip": 0.01295749, "auxiliary_loss_mlp": 0.00209993, "balance_loss_clip": 1.07721591, "balance_loss_mlp": 0.18467319, "epoch": 0.6291898391702991, "flos": 12203308581120.0, "grad_norm": 11.837603454522661, "language_loss": 0.76647252, "learning_rate": 1.2769719551510768e-06, "loss": 0.7815299, "num_input_tokens_seen": 225542235, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.2532959, "step": 10465, "time_per_iteration": 2.5982189178466797 }, { "auxiliary_loss_clip": 0.0122853, "auxiliary_loss_mlp": 0.00049187, "balance_loss_clip": 1.07035804, "balance_loss_mlp": 0.0406515, "epoch": 0.629249962422967, "flos": 69299479434240.0, "grad_norm": 0.6613717268604755, "language_loss": 0.59109247, "learning_rate": 1.2766088477700832e-06, "loss": 0.60386962, "num_input_tokens_seen": 225607185, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.08544922, "step": 10466, "time_per_iteration": 3.2266221046447754 }, { "auxiliary_loss_clip": 0.01299689, "auxiliary_loss_mlp": 0.00230864, "balance_loss_clip": 1.07644236, "balance_loss_mlp": 0.20453019, "epoch": 0.6293100856756351, "flos": 40077888042240.0, "grad_norm": 27.490037973019035, "language_loss": 0.7271623, "learning_rate": 1.276245767820154e-06, "loss": 0.74246788, "num_input_tokens_seen": 225628785, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.2635498, "step": 10467, "time_per_iteration": 2.795806646347046 }, { "auxiliary_loss_clip": 0.01227062, "auxiliary_loss_mlp": 0.00077115, "balance_loss_clip": 1.06868148, "balance_loss_mlp": 0.0687225, "epoch": 0.629370208928303, "flos": 67501108177920.0, "grad_norm": 0.7829882300932713, "language_loss": 0.55987191, "learning_rate": 1.2758827153150586e-06, "loss": 0.57291365, "num_input_tokens_seen": 225678980, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.08398438, "step": 10468, "time_per_iteration": 2.910860300064087 }, { "auxiliary_loss_clip": 0.0123457, "auxiliary_loss_mlp": 0.00048448, "balance_loss_clip": 1.07459927, "balance_loss_mlp": 0.04019896, "epoch": 0.629430332180971, "flos": 60660450449280.0, "grad_norm": 0.7474143396581256, "language_loss": 0.57185102, "learning_rate": 1.2755196902685626e-06, "loss": 0.58468115, "num_input_tokens_seen": 225740295, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.08251953, "step": 10469, "time_per_iteration": 3.0529420375823975 }, { "auxiliary_loss_clip": 0.01232786, "auxiliary_loss_mlp": 0.00044508, "balance_loss_clip": 1.07292223, "balance_loss_mlp": 0.03582941, "epoch": 0.6294904554336389, "flos": 66869764778880.0, "grad_norm": 0.6615524631400618, "language_loss": 0.51093459, "learning_rate": 1.2751566926944329e-06, "loss": 0.52370751, "num_input_tokens_seen": 225805615, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.08691406, "step": 10470, "time_per_iteration": 3.2080726623535156 }, { "auxiliary_loss_clip": 0.01286084, "auxiliary_loss_mlp": 0.00266937, "balance_loss_clip": 1.06659698, "balance_loss_mlp": 0.24170008, "epoch": 0.6295505786863069, "flos": 42522794150400.0, "grad_norm": 42.42711519263081, "language_loss": 0.82002109, "learning_rate": 1.2747937226064342e-06, "loss": 0.83555126, "num_input_tokens_seen": 225826585, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25219727, "step": 10471, "time_per_iteration": 2.8393959999084473 }, { "auxiliary_loss_clip": 0.01309797, "auxiliary_loss_mlp": 0.00238911, "balance_loss_clip": 1.0819006, "balance_loss_mlp": 0.21332842, "epoch": 0.629610701938975, "flos": 17384140788480.0, "grad_norm": 2.5338258236648254, "language_loss": 0.70775318, "learning_rate": 1.2744307800183297e-06, "loss": 0.72324032, "num_input_tokens_seen": 225844095, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.25622559, "step": 10472, "time_per_iteration": 2.813768148422241 }, { "auxiliary_loss_clip": 0.01299197, "auxiliary_loss_mlp": 0.00228856, "balance_loss_clip": 1.07713366, "balance_loss_mlp": 0.20274951, "epoch": 0.6296708251916429, "flos": 24242934885120.0, "grad_norm": 4.185411259076127, "language_loss": 0.78346848, "learning_rate": 1.2740678649438828e-06, "loss": 0.79874897, "num_input_tokens_seen": 225864310, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.26098633, "step": 10473, "time_per_iteration": 2.665252685546875 }, { "auxiliary_loss_clip": 0.01302129, "auxiliary_loss_mlp": 0.00232216, "balance_loss_clip": 1.07811022, "balance_loss_mlp": 0.20474976, "epoch": 0.6297309484443109, "flos": 19278536077440.0, "grad_norm": 30.00697463266715, "language_loss": 0.82215714, "learning_rate": 1.2737049773968554e-06, "loss": 0.83750057, "num_input_tokens_seen": 225883830, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.27453613, "step": 10474, "time_per_iteration": 2.6721673011779785 }, { "auxiliary_loss_clip": 0.01289783, "auxiliary_loss_mlp": 0.0025506, "balance_loss_clip": 1.07057619, "balance_loss_mlp": 0.22950159, "epoch": 0.6297910716969788, "flos": 30662685043200.0, "grad_norm": 7.722634786897449, "language_loss": 0.74187678, "learning_rate": 1.2733421173910081e-06, "loss": 0.75732517, "num_input_tokens_seen": 225905755, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25561523, "step": 10475, "time_per_iteration": 2.743040084838867 }, { "auxiliary_loss_clip": 0.01300802, "auxiliary_loss_mlp": 0.00241993, "balance_loss_clip": 1.07871127, "balance_loss_mlp": 0.21624345, "epoch": 0.6298511949496468, "flos": 14423018371200.0, "grad_norm": 556.8246388500883, "language_loss": 0.98293203, "learning_rate": 1.272979284940101e-06, "loss": 0.99835998, "num_input_tokens_seen": 225922155, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25732422, "step": 10476, "time_per_iteration": 2.657285451889038 }, { "auxiliary_loss_clip": 0.01291537, "auxiliary_loss_mlp": 0.00235121, "balance_loss_clip": 1.07328606, "balance_loss_mlp": 0.21102899, "epoch": 0.6299113182023147, "flos": 23514163845120.0, "grad_norm": 47.35024011585484, "language_loss": 0.83851445, "learning_rate": 1.2726164800578913e-06, "loss": 0.85378104, "num_input_tokens_seen": 225941060, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24084473, "step": 10477, "time_per_iteration": 2.6901097297668457 }, { "auxiliary_loss_clip": 0.01297243, "auxiliary_loss_mlp": 0.00245016, "balance_loss_clip": 1.07562208, "balance_loss_mlp": 0.21859935, "epoch": 0.6299714414549827, "flos": 22674500542080.0, "grad_norm": 3.2684281014951724, "language_loss": 0.76136935, "learning_rate": 1.272253702758138e-06, "loss": 0.77679193, "num_input_tokens_seen": 225960870, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.26416016, "step": 10478, "time_per_iteration": 2.6311099529266357 }, { "auxiliary_loss_clip": 0.01320348, "auxiliary_loss_mlp": 0.00248858, "balance_loss_clip": 1.08842671, "balance_loss_mlp": 0.22095115, "epoch": 0.6300315647076506, "flos": 14501735026560.0, "grad_norm": 34.81070648508225, "language_loss": 0.80639517, "learning_rate": 1.2718909530545974e-06, "loss": 0.82208723, "num_input_tokens_seen": 225977895, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.27941895, "step": 10479, "time_per_iteration": 2.6669647693634033 }, { "auxiliary_loss_clip": 0.01318322, "auxiliary_loss_mlp": 0.00237285, "balance_loss_clip": 1.09409463, "balance_loss_mlp": 0.21123759, "epoch": 0.6300916879603187, "flos": 21871681614720.0, "grad_norm": 3.1517775783913478, "language_loss": 0.81133556, "learning_rate": 1.2715282309610245e-06, "loss": 0.82689172, "num_input_tokens_seen": 225997835, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26037598, "step": 10480, "time_per_iteration": 2.6561458110809326 }, { "auxiliary_loss_clip": 0.01295229, "auxiliary_loss_mlp": 0.00253851, "balance_loss_clip": 1.07219505, "balance_loss_mlp": 0.22726689, "epoch": 0.6301518112129866, "flos": 21834047139840.0, "grad_norm": 27.69267149676469, "language_loss": 0.85056055, "learning_rate": 1.2711655364911744e-06, "loss": 0.86605144, "num_input_tokens_seen": 226017620, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26599121, "step": 10481, "time_per_iteration": 2.6612236499786377 }, { "auxiliary_loss_clip": 0.01219402, "auxiliary_loss_mlp": 0.00074645, "balance_loss_clip": 1.06257498, "balance_loss_mlp": 0.06715885, "epoch": 0.6302119344656546, "flos": 44334237957120.0, "grad_norm": 0.8898791841472752, "language_loss": 0.61671746, "learning_rate": 1.2708028696588e-06, "loss": 0.62965786, "num_input_tokens_seen": 226068755, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.07470703, "step": 10482, "time_per_iteration": 2.863495111465454 }, { "auxiliary_loss_clip": 0.01310829, "auxiliary_loss_mlp": 0.00270011, "balance_loss_clip": 1.08569515, "balance_loss_mlp": 0.24354699, "epoch": 0.6302720577183225, "flos": 11217919800960.0, "grad_norm": 18.607221788901246, "language_loss": 0.90974414, "learning_rate": 1.2704402304776541e-06, "loss": 0.92555255, "num_input_tokens_seen": 226084395, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.26477051, "step": 10483, "time_per_iteration": 2.6381165981292725 }, { "auxiliary_loss_clip": 0.01309493, "auxiliary_loss_mlp": 0.00275602, "balance_loss_clip": 1.09049034, "balance_loss_mlp": 0.24770705, "epoch": 0.6303321809709905, "flos": 27964932122880.0, "grad_norm": 187.98326172989837, "language_loss": 0.80532765, "learning_rate": 1.270077618961487e-06, "loss": 0.82117867, "num_input_tokens_seen": 226105890, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.27905273, "step": 10484, "time_per_iteration": 2.696347236633301 }, { "auxiliary_loss_clip": 0.013059, "auxiliary_loss_mlp": 0.0026539, "balance_loss_clip": 1.08035231, "balance_loss_mlp": 0.2388902, "epoch": 0.6303923042236586, "flos": 28220759763840.0, "grad_norm": 66.57353691499989, "language_loss": 0.81342, "learning_rate": 1.2697150351240506e-06, "loss": 0.82913291, "num_input_tokens_seen": 226126760, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.26489258, "step": 10485, "time_per_iteration": 2.7066617012023926 }, { "auxiliary_loss_clip": 0.01317279, "auxiliary_loss_mlp": 0.00288412, "balance_loss_clip": 1.08779418, "balance_loss_mlp": 0.26017177, "epoch": 0.6304524274763265, "flos": 27631034271360.0, "grad_norm": 91.20207700720978, "language_loss": 0.89396644, "learning_rate": 1.269352478979093e-06, "loss": 0.91002333, "num_input_tokens_seen": 226147315, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.28234863, "step": 10486, "time_per_iteration": 2.7110085487365723 }, { "auxiliary_loss_clip": 0.01287875, "auxiliary_loss_mlp": 0.00267984, "balance_loss_clip": 1.07089555, "balance_loss_mlp": 0.24247277, "epoch": 0.6305125507289945, "flos": 17311313963520.0, "grad_norm": 22.807515473657496, "language_loss": 0.7302919, "learning_rate": 1.2689899505403628e-06, "loss": 0.74585056, "num_input_tokens_seen": 226165935, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25512695, "step": 10487, "time_per_iteration": 5.454029321670532 }, { "auxiliary_loss_clip": 0.01303264, "auxiliary_loss_mlp": 0.0027983, "balance_loss_clip": 1.08051205, "balance_loss_mlp": 0.25279301, "epoch": 0.6305726739816624, "flos": 25808280658560.0, "grad_norm": 22.04759777228893, "language_loss": 0.73413277, "learning_rate": 1.2686274498216065e-06, "loss": 0.74996364, "num_input_tokens_seen": 226186890, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.27038574, "step": 10488, "time_per_iteration": 2.6791250705718994 }, { "auxiliary_loss_clip": 0.01307135, "auxiliary_loss_mlp": 0.0029332, "balance_loss_clip": 1.08334506, "balance_loss_mlp": 0.26530594, "epoch": 0.6306327972343304, "flos": 21797454159360.0, "grad_norm": 133.24050336275735, "language_loss": 0.73821825, "learning_rate": 1.2682649768365706e-06, "loss": 0.75422281, "num_input_tokens_seen": 226206710, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.28027344, "step": 10489, "time_per_iteration": 2.728381872177124 }, { "auxiliary_loss_clip": 0.01323996, "auxiliary_loss_mlp": 0.00297226, "balance_loss_clip": 1.0873512, "balance_loss_mlp": 0.26866388, "epoch": 0.6306929204869983, "flos": 20777375819520.0, "grad_norm": 368.1018256784661, "language_loss": 0.8172462, "learning_rate": 1.2679025315990007e-06, "loss": 0.83345842, "num_input_tokens_seen": 226225565, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.28527832, "step": 10490, "time_per_iteration": 2.6850428581237793 }, { "auxiliary_loss_clip": 0.01310483, "auxiliary_loss_mlp": 0.00277903, "balance_loss_clip": 1.08596933, "balance_loss_mlp": 0.25196293, "epoch": 0.6307530437396663, "flos": 23654214973440.0, "grad_norm": 8.719511135553441, "language_loss": 0.86087537, "learning_rate": 1.2675401141226393e-06, "loss": 0.87675923, "num_input_tokens_seen": 226243680, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.25952148, "step": 10491, "time_per_iteration": 4.124901294708252 }, { "auxiliary_loss_clip": 0.01309927, "auxiliary_loss_mlp": 0.00274925, "balance_loss_clip": 1.08708549, "balance_loss_mlp": 0.24799588, "epoch": 0.6308131669923343, "flos": 24719002767360.0, "grad_norm": 791.924397346603, "language_loss": 0.65929377, "learning_rate": 1.2671777244212308e-06, "loss": 0.67514223, "num_input_tokens_seen": 226264345, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26916504, "step": 10492, "time_per_iteration": 2.816011428833008 }, { "auxiliary_loss_clip": 0.01318105, "auxiliary_loss_mlp": 0.00274449, "balance_loss_clip": 1.08956194, "balance_loss_mlp": 0.24730493, "epoch": 0.6308732902450023, "flos": 22565403959040.0, "grad_norm": 14.305467315759804, "language_loss": 0.73560584, "learning_rate": 1.2668153625085168e-06, "loss": 0.75153136, "num_input_tokens_seen": 226283165, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27160645, "step": 10493, "time_per_iteration": 2.679671287536621 }, { "auxiliary_loss_clip": 0.01323307, "auxiliary_loss_mlp": 0.00275118, "balance_loss_clip": 1.09761822, "balance_loss_mlp": 0.24859405, "epoch": 0.6309334134976702, "flos": 24644200694400.0, "grad_norm": 4.65604658537384, "language_loss": 0.87346387, "learning_rate": 1.2664530283982367e-06, "loss": 0.88944817, "num_input_tokens_seen": 226304080, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.265625, "step": 10494, "time_per_iteration": 2.7208423614501953 }, { "auxiliary_loss_clip": 0.01314796, "auxiliary_loss_mlp": 0.00308316, "balance_loss_clip": 1.09154201, "balance_loss_mlp": 0.2798489, "epoch": 0.6309935367503382, "flos": 41427949651200.0, "grad_norm": 6.144772537645815, "language_loss": 0.87370008, "learning_rate": 1.2660907221041317e-06, "loss": 0.8899312, "num_input_tokens_seen": 226325925, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.28479004, "step": 10495, "time_per_iteration": 4.2698540687561035 }, { "auxiliary_loss_clip": 0.01326241, "auxiliary_loss_mlp": 0.00288082, "balance_loss_clip": 1.09592509, "balance_loss_mlp": 0.26067603, "epoch": 0.6310536600030061, "flos": 15118931445120.0, "grad_norm": 18.77949496924361, "language_loss": 0.79489958, "learning_rate": 1.2657284436399403e-06, "loss": 0.81104285, "num_input_tokens_seen": 226344190, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.27404785, "step": 10496, "time_per_iteration": 2.670269012451172 }, { "auxiliary_loss_clip": 0.01333405, "auxiliary_loss_mlp": 0.00269902, "balance_loss_clip": 1.10212088, "balance_loss_mlp": 0.24204287, "epoch": 0.6311137832556741, "flos": 15231619388160.0, "grad_norm": 10.022518054428645, "language_loss": 0.91682839, "learning_rate": 1.2653661930193997e-06, "loss": 0.93286151, "num_input_tokens_seen": 226361520, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.27856445, "step": 10497, "time_per_iteration": 2.7033205032348633 }, { "auxiliary_loss_clip": 0.01323455, "auxiliary_loss_mlp": 0.00298296, "balance_loss_clip": 1.09835744, "balance_loss_mlp": 0.27118734, "epoch": 0.6311739065083422, "flos": 22018664067840.0, "grad_norm": 19.208189953743506, "language_loss": 0.83645415, "learning_rate": 1.265003970256247e-06, "loss": 0.85267174, "num_input_tokens_seen": 226381920, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.27148438, "step": 10498, "time_per_iteration": 2.655501365661621 }, { "auxiliary_loss_clip": 0.01311073, "auxiliary_loss_mlp": 0.00288406, "balance_loss_clip": 1.08733416, "balance_loss_mlp": 0.26008224, "epoch": 0.6312340297610101, "flos": 22710770300160.0, "grad_norm": 215.63383821804354, "language_loss": 0.78874195, "learning_rate": 1.264641775364217e-06, "loss": 0.80473673, "num_input_tokens_seen": 226400035, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.28356934, "step": 10499, "time_per_iteration": 2.679158926010132 }, { "auxiliary_loss_clip": 0.01344267, "auxiliary_loss_mlp": 0.00258478, "balance_loss_clip": 1.11507225, "balance_loss_mlp": 0.23009461, "epoch": 0.6312941530136781, "flos": 24280102483200.0, "grad_norm": 45.014474383215095, "language_loss": 0.80946761, "learning_rate": 1.2642796083570448e-06, "loss": 0.825495, "num_input_tokens_seen": 226418280, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.28393555, "step": 10500, "time_per_iteration": 2.675501585006714 }, { "auxiliary_loss_clip": 0.01339468, "auxiliary_loss_mlp": 0.00277484, "balance_loss_clip": 1.11015546, "balance_loss_mlp": 0.25037605, "epoch": 0.631354276266346, "flos": 21725956137600.0, "grad_norm": 39.73185089281494, "language_loss": 0.82313228, "learning_rate": 1.2639174692484634e-06, "loss": 0.83930182, "num_input_tokens_seen": 226436650, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.27099609, "step": 10501, "time_per_iteration": 2.663654088973999 }, { "auxiliary_loss_clip": 0.0131708, "auxiliary_loss_mlp": 0.00296549, "balance_loss_clip": 1.09309042, "balance_loss_mlp": 0.26915431, "epoch": 0.631414399519014, "flos": 24025100855040.0, "grad_norm": 70.96265931519326, "language_loss": 0.82411981, "learning_rate": 1.2635553580522053e-06, "loss": 0.84025609, "num_input_tokens_seen": 226456275, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.27404785, "step": 10502, "time_per_iteration": 2.671628713607788 }, { "auxiliary_loss_clip": 0.01328655, "auxiliary_loss_mlp": 0.00289506, "balance_loss_clip": 1.09777331, "balance_loss_mlp": 0.26046693, "epoch": 0.6314745227716819, "flos": 24315797623680.0, "grad_norm": 79.15230845305945, "language_loss": 0.94462478, "learning_rate": 1.2631932747820022e-06, "loss": 0.96080637, "num_input_tokens_seen": 226473610, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.29052734, "step": 10503, "time_per_iteration": 2.686699390411377 }, { "auxiliary_loss_clip": 0.01330485, "auxiliary_loss_mlp": 0.0028073, "balance_loss_clip": 1.1029948, "balance_loss_mlp": 0.25439653, "epoch": 0.6315346460243499, "flos": 23366391292800.0, "grad_norm": 13.498137145671677, "language_loss": 0.9288348, "learning_rate": 1.2628312194515838e-06, "loss": 0.94494694, "num_input_tokens_seen": 226493665, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.26379395, "step": 10504, "time_per_iteration": 2.6633176803588867 }, { "auxiliary_loss_clip": 0.01331111, "auxiliary_loss_mlp": 0.00271241, "balance_loss_clip": 1.09903598, "balance_loss_mlp": 0.24403772, "epoch": 0.6315947692770179, "flos": 20260333497600.0, "grad_norm": 60.61797786712131, "language_loss": 0.86404645, "learning_rate": 1.2624691920746793e-06, "loss": 0.88006997, "num_input_tokens_seen": 226511625, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.27209473, "step": 10505, "time_per_iteration": 2.6564669609069824 }, { "auxiliary_loss_clip": 0.01348583, "auxiliary_loss_mlp": 0.00276843, "balance_loss_clip": 1.11541641, "balance_loss_mlp": 0.24744575, "epoch": 0.6316548925296859, "flos": 25265850399360.0, "grad_norm": 3.1095890039918777, "language_loss": 0.88854164, "learning_rate": 1.2621071926650166e-06, "loss": 0.90479594, "num_input_tokens_seen": 226530085, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.29406738, "step": 10506, "time_per_iteration": 2.6361513137817383 }, { "auxiliary_loss_clip": 0.01332652, "auxiliary_loss_mlp": 0.00283154, "balance_loss_clip": 1.10220051, "balance_loss_mlp": 0.25599849, "epoch": 0.6317150157823538, "flos": 22930579578240.0, "grad_norm": 8.644313832844142, "language_loss": 0.8164649, "learning_rate": 1.2617452212363238e-06, "loss": 0.83262289, "num_input_tokens_seen": 226548115, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.27160645, "step": 10507, "time_per_iteration": 2.6307029724121094 }, { "auxiliary_loss_clip": 0.01354197, "auxiliary_loss_mlp": 0.00287338, "balance_loss_clip": 1.11983871, "balance_loss_mlp": 0.26021743, "epoch": 0.6317751390350218, "flos": 22527051212160.0, "grad_norm": 129.47837017597664, "language_loss": 0.74742448, "learning_rate": 1.2613832778023258e-06, "loss": 0.76383978, "num_input_tokens_seen": 226567955, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.27124023, "step": 10508, "time_per_iteration": 2.6589043140411377 }, { "auxiliary_loss_clip": 0.01321448, "auxiliary_loss_mlp": 0.00310975, "balance_loss_clip": 1.09703457, "balance_loss_mlp": 0.28176862, "epoch": 0.6318352622876897, "flos": 23294749616640.0, "grad_norm": 62.596241956924565, "language_loss": 0.76933801, "learning_rate": 1.2610213623767478e-06, "loss": 0.78566223, "num_input_tokens_seen": 226588205, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.29174805, "step": 10509, "time_per_iteration": 2.6819825172424316 }, { "auxiliary_loss_clip": 0.01320525, "auxiliary_loss_mlp": 0.00278602, "balance_loss_clip": 1.09719253, "balance_loss_mlp": 0.25192267, "epoch": 0.6318953855403577, "flos": 20704082117760.0, "grad_norm": 49.88383846675862, "language_loss": 0.85031915, "learning_rate": 1.2606594749733143e-06, "loss": 0.86631042, "num_input_tokens_seen": 226606965, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26672363, "step": 10510, "time_per_iteration": 2.682595729827881 }, { "auxiliary_loss_clip": 0.01323395, "auxiliary_loss_mlp": 0.00300456, "balance_loss_clip": 1.09809446, "balance_loss_mlp": 0.27034363, "epoch": 0.6319555087930258, "flos": 22820046451200.0, "grad_norm": 11.02926850767085, "language_loss": 0.77811778, "learning_rate": 1.2602976156057469e-06, "loss": 0.79435623, "num_input_tokens_seen": 226627845, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.30114746, "step": 10511, "time_per_iteration": 2.7533230781555176 }, { "auxiliary_loss_clip": 0.01334263, "auxiliary_loss_mlp": 0.00294632, "balance_loss_clip": 1.11174393, "balance_loss_mlp": 0.26856068, "epoch": 0.6320156320456937, "flos": 19970929618560.0, "grad_norm": 2.3114343406731654, "language_loss": 0.87071538, "learning_rate": 1.2599357842877684e-06, "loss": 0.88700426, "num_input_tokens_seen": 226645855, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.26074219, "step": 10512, "time_per_iteration": 2.689199686050415 }, { "auxiliary_loss_clip": 0.01326969, "auxiliary_loss_mlp": 0.00290279, "balance_loss_clip": 1.10498095, "balance_loss_mlp": 0.26125163, "epoch": 0.6320757552983617, "flos": 27013406889600.0, "grad_norm": 54.8947147873753, "language_loss": 0.77372831, "learning_rate": 1.2595739810330994e-06, "loss": 0.78990078, "num_input_tokens_seen": 226665375, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.29040527, "step": 10513, "time_per_iteration": 2.663728713989258 }, { "auxiliary_loss_clip": 0.0133713, "auxiliary_loss_mlp": 0.00289462, "balance_loss_clip": 1.10562468, "balance_loss_mlp": 0.26187667, "epoch": 0.6321358785510296, "flos": 23695943598720.0, "grad_norm": 4.087661614772674, "language_loss": 0.74548328, "learning_rate": 1.259212205855459e-06, "loss": 0.76174915, "num_input_tokens_seen": 226685270, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.27587891, "step": 10514, "time_per_iteration": 2.6908814907073975 }, { "auxiliary_loss_clip": 0.01327737, "auxiliary_loss_mlp": 0.00256787, "balance_loss_clip": 1.0994184, "balance_loss_mlp": 0.22929721, "epoch": 0.6321960018036976, "flos": 25995231970560.0, "grad_norm": 75.57308177373021, "language_loss": 0.82892728, "learning_rate": 1.2588504587685663e-06, "loss": 0.84477258, "num_input_tokens_seen": 226705325, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27490234, "step": 10515, "time_per_iteration": 2.7107856273651123 }, { "auxiliary_loss_clip": 0.01343457, "auxiliary_loss_mlp": 0.00265093, "balance_loss_clip": 1.11480808, "balance_loss_mlp": 0.23746008, "epoch": 0.6322561250563655, "flos": 22821016118400.0, "grad_norm": 139.29874030006522, "language_loss": 0.94322914, "learning_rate": 1.2584887397861379e-06, "loss": 0.95931464, "num_input_tokens_seen": 226723815, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.27661133, "step": 10516, "time_per_iteration": 2.7027688026428223 }, { "auxiliary_loss_clip": 0.01336798, "auxiliary_loss_mlp": 0.00307335, "balance_loss_clip": 1.10597086, "balance_loss_mlp": 0.27741349, "epoch": 0.6323162483090335, "flos": 18988413926400.0, "grad_norm": 17.555480703658063, "language_loss": 0.88267249, "learning_rate": 1.2581270489218911e-06, "loss": 0.89911383, "num_input_tokens_seen": 226741550, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.29956055, "step": 10517, "time_per_iteration": 2.6591944694519043 }, { "auxiliary_loss_clip": 0.01342472, "auxiliary_loss_mlp": 0.00272236, "balance_loss_clip": 1.11362755, "balance_loss_mlp": 0.24456739, "epoch": 0.6323763715617015, "flos": 19865173000320.0, "grad_norm": 37.61367839318699, "language_loss": 0.8506034, "learning_rate": 1.257765386189541e-06, "loss": 0.86675048, "num_input_tokens_seen": 226761115, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.27661133, "step": 10518, "time_per_iteration": 2.791691541671753 }, { "auxiliary_loss_clip": 0.01343397, "auxiliary_loss_mlp": 0.00269699, "balance_loss_clip": 1.11945474, "balance_loss_mlp": 0.24285318, "epoch": 0.6324364948143695, "flos": 22782699285120.0, "grad_norm": 98.17468794949811, "language_loss": 0.89840209, "learning_rate": 1.2574037516028018e-06, "loss": 0.91453302, "num_input_tokens_seen": 226782225, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26831055, "step": 10519, "time_per_iteration": 2.691584348678589 }, { "auxiliary_loss_clip": 0.01343288, "auxiliary_loss_mlp": 0.00282326, "balance_loss_clip": 1.11886859, "balance_loss_mlp": 0.25549152, "epoch": 0.6324966180670374, "flos": 22235923480320.0, "grad_norm": 32.1289285037465, "language_loss": 0.79021704, "learning_rate": 1.2570421451753867e-06, "loss": 0.80647314, "num_input_tokens_seen": 226802375, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.26855469, "step": 10520, "time_per_iteration": 2.676027774810791 }, { "auxiliary_loss_clip": 0.01346722, "auxiliary_loss_mlp": 0.0027238, "balance_loss_clip": 1.11556292, "balance_loss_mlp": 0.24429393, "epoch": 0.6325567413197054, "flos": 21689183589120.0, "grad_norm": 41.48705552374485, "language_loss": 0.8169682, "learning_rate": 1.2566805669210081e-06, "loss": 0.83315915, "num_input_tokens_seen": 226822165, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.28076172, "step": 10521, "time_per_iteration": 2.7076125144958496 }, { "auxiliary_loss_clip": 0.01323583, "auxiliary_loss_mlp": 0.00272355, "balance_loss_clip": 1.10105503, "balance_loss_mlp": 0.24472275, "epoch": 0.6326168645723733, "flos": 19937137898880.0, "grad_norm": 24.29992747594278, "language_loss": 0.79727411, "learning_rate": 1.256319016853377e-06, "loss": 0.81323349, "num_input_tokens_seen": 226841645, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.27636719, "step": 10522, "time_per_iteration": 2.6867616176605225 }, { "auxiliary_loss_clip": 0.01331023, "auxiliary_loss_mlp": 0.00298386, "balance_loss_clip": 1.1086812, "balance_loss_mlp": 0.26984692, "epoch": 0.6326769878250413, "flos": 20230348619520.0, "grad_norm": 58.70127845209635, "language_loss": 0.89387542, "learning_rate": 1.2559574949862023e-06, "loss": 0.91016954, "num_input_tokens_seen": 226860355, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.28588867, "step": 10523, "time_per_iteration": 2.678992748260498 }, { "auxiliary_loss_clip": 0.01337561, "auxiliary_loss_mlp": 0.00297592, "balance_loss_clip": 1.11221588, "balance_loss_mlp": 0.26898199, "epoch": 0.6327371110777094, "flos": 20775759707520.0, "grad_norm": 161.22467828262245, "language_loss": 0.83487695, "learning_rate": 1.255596001333195e-06, "loss": 0.85122848, "num_input_tokens_seen": 226878390, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.28613281, "step": 10524, "time_per_iteration": 2.623450756072998 }, { "auxiliary_loss_clip": 0.01360633, "auxiliary_loss_mlp": 0.00301925, "balance_loss_clip": 1.12141442, "balance_loss_mlp": 0.27047753, "epoch": 0.6327972343303773, "flos": 30336544529280.0, "grad_norm": 15.775232020846635, "language_loss": 0.91151363, "learning_rate": 1.2552345359080615e-06, "loss": 0.92813921, "num_input_tokens_seen": 226898420, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.31445312, "step": 10525, "time_per_iteration": 2.7243878841400146 }, { "auxiliary_loss_clip": 0.01327665, "auxiliary_loss_mlp": 0.00276279, "balance_loss_clip": 1.10500479, "balance_loss_mlp": 0.24902809, "epoch": 0.6328573575830453, "flos": 17092258871040.0, "grad_norm": 52.37301500884562, "language_loss": 0.73967981, "learning_rate": 1.2548730987245093e-06, "loss": 0.75571924, "num_input_tokens_seen": 226916305, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.27258301, "step": 10526, "time_per_iteration": 2.6159050464630127 }, { "auxiliary_loss_clip": 0.01346119, "auxiliary_loss_mlp": 0.00258147, "balance_loss_clip": 1.11765838, "balance_loss_mlp": 0.23012093, "epoch": 0.6329174808357132, "flos": 25047154442880.0, "grad_norm": 7.646450863832346, "language_loss": 0.79812837, "learning_rate": 1.254511689796244e-06, "loss": 0.81417108, "num_input_tokens_seen": 226937705, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.28015137, "step": 10527, "time_per_iteration": 2.686269998550415 }, { "auxiliary_loss_clip": 0.01325942, "auxiliary_loss_mlp": 0.00272921, "balance_loss_clip": 1.10585308, "balance_loss_mlp": 0.24823233, "epoch": 0.6329776040883812, "flos": 16836826279680.0, "grad_norm": 11.92973678827094, "language_loss": 0.79051054, "learning_rate": 1.2541503091369693e-06, "loss": 0.80649918, "num_input_tokens_seen": 226954880, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.24694824, "step": 10528, "time_per_iteration": 2.644465446472168 }, { "auxiliary_loss_clip": 0.0135735, "auxiliary_loss_mlp": 0.00268851, "balance_loss_clip": 1.12125754, "balance_loss_mlp": 0.24045566, "epoch": 0.6330377273410491, "flos": 13516705382400.0, "grad_norm": 108.0096857467456, "language_loss": 0.78227496, "learning_rate": 1.2537889567603905e-06, "loss": 0.79853702, "num_input_tokens_seen": 226972595, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.28442383, "step": 10529, "time_per_iteration": 4.06558895111084 }, { "auxiliary_loss_clip": 0.01380851, "auxiliary_loss_mlp": 0.00314538, "balance_loss_clip": 1.13495815, "balance_loss_mlp": 0.28316164, "epoch": 0.6330978505937171, "flos": 21538825257600.0, "grad_norm": 1078.6756324067644, "language_loss": 0.84894574, "learning_rate": 1.2534276326802092e-06, "loss": 0.86589968, "num_input_tokens_seen": 226991910, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.3137207, "step": 10530, "time_per_iteration": 4.058784008026123 }, { "auxiliary_loss_clip": 0.01374139, "auxiliary_loss_mlp": 0.002916, "balance_loss_clip": 1.13512123, "balance_loss_mlp": 0.26347899, "epoch": 0.6331579738463851, "flos": 25009484054400.0, "grad_norm": 13.302855306741977, "language_loss": 0.798401, "learning_rate": 1.2530663369101259e-06, "loss": 0.81505841, "num_input_tokens_seen": 227010175, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.28112793, "step": 10531, "time_per_iteration": 2.6525802612304688 }, { "auxiliary_loss_clip": 0.01356099, "auxiliary_loss_mlp": 0.00293567, "balance_loss_clip": 1.12648046, "balance_loss_mlp": 0.26583844, "epoch": 0.6332180970990531, "flos": 14976007228800.0, "grad_norm": 25.847278370402414, "language_loss": 0.86606884, "learning_rate": 1.2527050694638432e-06, "loss": 0.8825655, "num_input_tokens_seen": 227025540, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.27709961, "step": 10532, "time_per_iteration": 2.645313262939453 }, { "auxiliary_loss_clip": 0.01322851, "auxiliary_loss_mlp": 0.00286897, "balance_loss_clip": 1.10144877, "balance_loss_mlp": 0.25978839, "epoch": 0.633278220351721, "flos": 22706963458560.0, "grad_norm": 37.8406908746276, "language_loss": 0.80476636, "learning_rate": 1.2523438303550582e-06, "loss": 0.82086384, "num_input_tokens_seen": 227045520, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.27099609, "step": 10533, "time_per_iteration": 4.174433469772339 }, { "auxiliary_loss_clip": 0.0135768, "auxiliary_loss_mlp": 0.00295776, "balance_loss_clip": 1.11843181, "balance_loss_mlp": 0.26597396, "epoch": 0.633338343604389, "flos": 12602922364800.0, "grad_norm": 24.190370037818564, "language_loss": 0.88274497, "learning_rate": 1.2519826195974706e-06, "loss": 0.89927953, "num_input_tokens_seen": 227059420, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.29797363, "step": 10534, "time_per_iteration": 2.6452763080596924 }, { "auxiliary_loss_clip": 0.01346558, "auxiliary_loss_mlp": 0.00256907, "balance_loss_clip": 1.11720872, "balance_loss_mlp": 0.23052615, "epoch": 0.6333984668570569, "flos": 25960111447680.0, "grad_norm": 482.0229138995103, "language_loss": 0.91808206, "learning_rate": 1.251621437204777e-06, "loss": 0.93411672, "num_input_tokens_seen": 227081310, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.26391602, "step": 10535, "time_per_iteration": 2.745957136154175 }, { "auxiliary_loss_clip": 0.01325087, "auxiliary_loss_mlp": 0.00286144, "balance_loss_clip": 1.09930897, "balance_loss_mlp": 0.2588093, "epoch": 0.6334585901097249, "flos": 23659242877440.0, "grad_norm": 5.38501606689798, "language_loss": 0.85000926, "learning_rate": 1.2512602831906733e-06, "loss": 0.86612153, "num_input_tokens_seen": 227100365, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.2734375, "step": 10536, "time_per_iteration": 2.6630067825317383 }, { "auxiliary_loss_clip": 0.01351178, "auxiliary_loss_mlp": 0.002882, "balance_loss_clip": 1.1219331, "balance_loss_mlp": 0.26039994, "epoch": 0.633518713362393, "flos": 28760496503040.0, "grad_norm": 2.1745772193660047, "language_loss": 0.6876117, "learning_rate": 1.250899157568855e-06, "loss": 0.70400554, "num_input_tokens_seen": 227119680, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.27832031, "step": 10537, "time_per_iteration": 4.087575912475586 }, { "auxiliary_loss_clip": 0.01285261, "auxiliary_loss_mlp": 0.00075888, "balance_loss_clip": 1.13508105, "balance_loss_mlp": 0.06825836, "epoch": 0.6335788366150609, "flos": 70420322401920.0, "grad_norm": 0.7758460645531864, "language_loss": 0.51978451, "learning_rate": 1.2505380603530155e-06, "loss": 0.53339601, "num_input_tokens_seen": 227184465, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.07617188, "step": 10538, "time_per_iteration": 3.3048202991485596 }, { "auxiliary_loss_clip": 0.01374052, "auxiliary_loss_mlp": 0.00278435, "balance_loss_clip": 1.12995338, "balance_loss_mlp": 0.24784636, "epoch": 0.6336389598677289, "flos": 23732069702400.0, "grad_norm": 6.928084117716129, "language_loss": 0.92803842, "learning_rate": 1.250176991556848e-06, "loss": 0.94456339, "num_input_tokens_seen": 227202185, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.30541992, "step": 10539, "time_per_iteration": 2.6838860511779785 }, { "auxiliary_loss_clip": 0.01363694, "auxiliary_loss_mlp": 0.00281315, "balance_loss_clip": 1.12340999, "balance_loss_mlp": 0.25027335, "epoch": 0.6336990831203968, "flos": 29276676898560.0, "grad_norm": 28.997992376609446, "language_loss": 0.92767864, "learning_rate": 1.2498159511940438e-06, "loss": 0.94412875, "num_input_tokens_seen": 227222020, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.3104248, "step": 10540, "time_per_iteration": 2.7283031940460205 }, { "auxiliary_loss_clip": 0.01325125, "auxiliary_loss_mlp": 0.00270078, "balance_loss_clip": 1.10351515, "balance_loss_mlp": 0.24460261, "epoch": 0.6337592063730648, "flos": 29096836479360.0, "grad_norm": 4.35250806662915, "language_loss": 0.79992342, "learning_rate": 1.2494549392782943e-06, "loss": 0.81587553, "num_input_tokens_seen": 227240885, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25463867, "step": 10541, "time_per_iteration": 2.7414968013763428 }, { "auxiliary_loss_clip": 0.01360766, "auxiliary_loss_mlp": 0.00288023, "balance_loss_clip": 1.12363267, "balance_loss_mlp": 0.25912666, "epoch": 0.6338193296257327, "flos": 34706477249280.0, "grad_norm": 31.403505779751132, "language_loss": 0.92241836, "learning_rate": 1.2490939558232887e-06, "loss": 0.93890625, "num_input_tokens_seen": 227257880, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.28881836, "step": 10542, "time_per_iteration": 2.766915798187256 }, { "auxiliary_loss_clip": 0.0132393, "auxiliary_loss_mlp": 0.00286572, "balance_loss_clip": 1.09685445, "balance_loss_mlp": 0.25723416, "epoch": 0.6338794528784008, "flos": 16687581269760.0, "grad_norm": 20.846128405104068, "language_loss": 0.83930314, "learning_rate": 1.2487330008427153e-06, "loss": 0.85540819, "num_input_tokens_seen": 227274840, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.29370117, "step": 10543, "time_per_iteration": 2.629380464553833 }, { "auxiliary_loss_clip": 0.01313919, "auxiliary_loss_mlp": 0.00269651, "balance_loss_clip": 1.09413671, "balance_loss_mlp": 0.24322221, "epoch": 0.6339395761310687, "flos": 22346600261760.0, "grad_norm": 3.3172628751848645, "language_loss": 0.8014912, "learning_rate": 1.2483720743502618e-06, "loss": 0.8173269, "num_input_tokens_seen": 227294835, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26428223, "step": 10544, "time_per_iteration": 2.63185453414917 }, { "auxiliary_loss_clip": 0.01344067, "auxiliary_loss_mlp": 0.00305171, "balance_loss_clip": 1.10837948, "balance_loss_mlp": 0.27412885, "epoch": 0.6339996993837367, "flos": 18551812112640.0, "grad_norm": 26.716536863925903, "language_loss": 0.76594895, "learning_rate": 1.2480111763596144e-06, "loss": 0.78244132, "num_input_tokens_seen": 227314935, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.31054688, "step": 10545, "time_per_iteration": 2.6522269248962402 }, { "auxiliary_loss_clip": 0.01341311, "auxiliary_loss_mlp": 0.00244556, "balance_loss_clip": 1.11327648, "balance_loss_mlp": 0.21582688, "epoch": 0.6340598226364046, "flos": 12969498614400.0, "grad_norm": 22.74412926869667, "language_loss": 0.81185508, "learning_rate": 1.2476503068844592e-06, "loss": 0.82771379, "num_input_tokens_seen": 227332905, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.2869873, "step": 10546, "time_per_iteration": 2.626230001449585 }, { "auxiliary_loss_clip": 0.0132056, "auxiliary_loss_mlp": 0.00276664, "balance_loss_clip": 1.10158587, "balance_loss_mlp": 0.24978226, "epoch": 0.6341199458890726, "flos": 26687984647680.0, "grad_norm": 190.21026855553092, "language_loss": 0.82361376, "learning_rate": 1.2472894659384792e-06, "loss": 0.83958602, "num_input_tokens_seen": 227354915, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26916504, "step": 10547, "time_per_iteration": 2.7480995655059814 }, { "auxiliary_loss_clip": 0.01358349, "auxiliary_loss_mlp": 0.00304082, "balance_loss_clip": 1.12230945, "balance_loss_mlp": 0.2731595, "epoch": 0.6341800691417405, "flos": 18734274224640.0, "grad_norm": 10.809650008122539, "language_loss": 0.74136889, "learning_rate": 1.2469286535353578e-06, "loss": 0.75799322, "num_input_tokens_seen": 227372990, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.30932617, "step": 10548, "time_per_iteration": 2.66064453125 }, { "auxiliary_loss_clip": 0.01341313, "auxiliary_loss_mlp": 0.0027612, "balance_loss_clip": 1.10986996, "balance_loss_mlp": 0.24668752, "epoch": 0.6342401923944085, "flos": 26249443499520.0, "grad_norm": 11.641476388040372, "language_loss": 0.71027946, "learning_rate": 1.2465678696887785e-06, "loss": 0.7264539, "num_input_tokens_seen": 227393270, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.29455566, "step": 10549, "time_per_iteration": 2.754582405090332 }, { "auxiliary_loss_clip": 0.01315401, "auxiliary_loss_mlp": 0.00277646, "balance_loss_clip": 1.09372258, "balance_loss_mlp": 0.25231433, "epoch": 0.6343003156470765, "flos": 24680937329280.0, "grad_norm": 37.18195478220987, "language_loss": 0.8052808, "learning_rate": 1.2462071144124197e-06, "loss": 0.82121134, "num_input_tokens_seen": 227413630, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.25317383, "step": 10550, "time_per_iteration": 2.6711015701293945 }, { "auxiliary_loss_clip": 0.01257966, "auxiliary_loss_mlp": 0.00083199, "balance_loss_clip": 1.11191511, "balance_loss_mlp": 0.07566463, "epoch": 0.6343604388997445, "flos": 69805352626560.0, "grad_norm": 0.6907094952451559, "language_loss": 0.57379198, "learning_rate": 1.2458463877199638e-06, "loss": 0.58720362, "num_input_tokens_seen": 227476630, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.07519531, "step": 10551, "time_per_iteration": 3.1610169410705566 }, { "auxiliary_loss_clip": 0.01311695, "auxiliary_loss_mlp": 0.00288281, "balance_loss_clip": 1.08892286, "balance_loss_mlp": 0.26010019, "epoch": 0.6344205621524125, "flos": 21982430223360.0, "grad_norm": 44.73949610362728, "language_loss": 0.73916811, "learning_rate": 1.2454856896250881e-06, "loss": 0.7551679, "num_input_tokens_seen": 227496060, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.28198242, "step": 10552, "time_per_iteration": 2.6520326137542725 }, { "auxiliary_loss_clip": 0.01325463, "auxiliary_loss_mlp": 0.00287496, "balance_loss_clip": 1.09622538, "balance_loss_mlp": 0.25927877, "epoch": 0.6344806854050804, "flos": 20448865008000.0, "grad_norm": 8.905538246087303, "language_loss": 0.89343464, "learning_rate": 1.24512502014147e-06, "loss": 0.90956426, "num_input_tokens_seen": 227513440, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.2824707, "step": 10553, "time_per_iteration": 2.650280237197876 }, { "auxiliary_loss_clip": 0.01351206, "auxiliary_loss_mlp": 0.00298466, "balance_loss_clip": 1.1169312, "balance_loss_mlp": 0.26954556, "epoch": 0.6345408086577484, "flos": 40510611187200.0, "grad_norm": 63.647205690877044, "language_loss": 0.64050281, "learning_rate": 1.2447643792827879e-06, "loss": 0.65699947, "num_input_tokens_seen": 227535395, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.28918457, "step": 10554, "time_per_iteration": 2.8377532958984375 }, { "auxiliary_loss_clip": 0.01318878, "auxiliary_loss_mlp": 0.00289761, "balance_loss_clip": 1.09408283, "balance_loss_mlp": 0.26265314, "epoch": 0.6346009319104163, "flos": 21361319222400.0, "grad_norm": 1206.206816046274, "language_loss": 0.79167342, "learning_rate": 1.2444037670627153e-06, "loss": 0.80775976, "num_input_tokens_seen": 227554545, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.27099609, "step": 10555, "time_per_iteration": 2.667522430419922 }, { "auxiliary_loss_clip": 0.01250465, "auxiliary_loss_mlp": 0.00111293, "balance_loss_clip": 1.10930061, "balance_loss_mlp": 0.10147049, "epoch": 0.6346610551630844, "flos": 71365419100800.0, "grad_norm": 0.7777050815664506, "language_loss": 0.54714429, "learning_rate": 1.2440431834949276e-06, "loss": 0.56076193, "num_input_tokens_seen": 227608575, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09814453, "step": 10556, "time_per_iteration": 3.0919930934906006 }, { "auxiliary_loss_clip": 0.01336657, "auxiliary_loss_mlp": 0.00292407, "balance_loss_clip": 1.10174751, "balance_loss_mlp": 0.26317739, "epoch": 0.6347211784157523, "flos": 25411504049280.0, "grad_norm": 504.5100521692488, "language_loss": 0.81261861, "learning_rate": 1.2436826285930985e-06, "loss": 0.82890922, "num_input_tokens_seen": 227628175, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.29223633, "step": 10557, "time_per_iteration": 2.7303178310394287 }, { "auxiliary_loss_clip": 0.01333366, "auxiliary_loss_mlp": 0.00280541, "balance_loss_clip": 1.10345936, "balance_loss_mlp": 0.25305146, "epoch": 0.6347813016684203, "flos": 15742735966080.0, "grad_norm": 127.9082046193094, "language_loss": 0.77378857, "learning_rate": 1.2433221023709002e-06, "loss": 0.7899276, "num_input_tokens_seen": 227645330, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.27490234, "step": 10558, "time_per_iteration": 2.681154251098633 }, { "auxiliary_loss_clip": 0.01314243, "auxiliary_loss_mlp": 0.00270705, "balance_loss_clip": 1.09096694, "balance_loss_mlp": 0.24202371, "epoch": 0.6348414249210882, "flos": 21464777370240.0, "grad_norm": 59.874803885321356, "language_loss": 0.8296538, "learning_rate": 1.2429616048420031e-06, "loss": 0.84550333, "num_input_tokens_seen": 227665250, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.28710938, "step": 10559, "time_per_iteration": 2.7034597396850586 }, { "auxiliary_loss_clip": 0.01332005, "auxiliary_loss_mlp": 0.00291362, "balance_loss_clip": 1.10459042, "balance_loss_mlp": 0.25942561, "epoch": 0.6349015481737562, "flos": 21653057485440.0, "grad_norm": 38.94226085146236, "language_loss": 0.78078067, "learning_rate": 1.242601136020078e-06, "loss": 0.79701436, "num_input_tokens_seen": 227685070, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.31896973, "step": 10560, "time_per_iteration": 2.81333065032959 }, { "auxiliary_loss_clip": 0.0131792, "auxiliary_loss_mlp": 0.00273001, "balance_loss_clip": 1.0946579, "balance_loss_mlp": 0.24748977, "epoch": 0.6349616714264241, "flos": 22194984954240.0, "grad_norm": 5.326271664519876, "language_loss": 0.84202111, "learning_rate": 1.2422406959187939e-06, "loss": 0.85793036, "num_input_tokens_seen": 227704430, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25549316, "step": 10561, "time_per_iteration": 2.714729070663452 }, { "auxiliary_loss_clip": 0.01337224, "auxiliary_loss_mlp": 0.00302365, "balance_loss_clip": 1.10393858, "balance_loss_mlp": 0.27363545, "epoch": 0.6350217946790921, "flos": 25410354814080.0, "grad_norm": 9.747282608484559, "language_loss": 0.79002392, "learning_rate": 1.2418802845518178e-06, "loss": 0.80641985, "num_input_tokens_seen": 227724920, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.2878418, "step": 10562, "time_per_iteration": 2.722564220428467 }, { "auxiliary_loss_clip": 0.01327346, "auxiliary_loss_mlp": 0.00267306, "balance_loss_clip": 1.09910798, "balance_loss_mlp": 0.23857649, "epoch": 0.63508191793176, "flos": 19718944732800.0, "grad_norm": 105.97214236043743, "language_loss": 0.88938457, "learning_rate": 1.2415199019328185e-06, "loss": 0.90533113, "num_input_tokens_seen": 227743400, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.28735352, "step": 10563, "time_per_iteration": 2.6756014823913574 }, { "auxiliary_loss_clip": 0.0135452, "auxiliary_loss_mlp": 0.00299684, "balance_loss_clip": 1.11891019, "balance_loss_mlp": 0.27056164, "epoch": 0.6351420411844281, "flos": 18186923802240.0, "grad_norm": 4.476276113717623, "language_loss": 0.88366556, "learning_rate": 1.2411595480754597e-06, "loss": 0.90020758, "num_input_tokens_seen": 227759990, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.29125977, "step": 10564, "time_per_iteration": 2.8209893703460693 }, { "auxiliary_loss_clip": 0.01332871, "auxiliary_loss_mlp": 0.00292022, "balance_loss_clip": 1.10435045, "balance_loss_mlp": 0.2624577, "epoch": 0.6352021644370961, "flos": 33726511422720.0, "grad_norm": 6.512906752465071, "language_loss": 0.79385221, "learning_rate": 1.240799222993407e-06, "loss": 0.81010115, "num_input_tokens_seen": 227780835, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.2956543, "step": 10565, "time_per_iteration": 2.8383054733276367 }, { "auxiliary_loss_clip": 0.0131599, "auxiliary_loss_mlp": 0.00274645, "balance_loss_clip": 1.09311402, "balance_loss_mlp": 0.24643975, "epoch": 0.635262287689764, "flos": 20374781207040.0, "grad_norm": 54.93111422183994, "language_loss": 0.77848387, "learning_rate": 1.240438926700324e-06, "loss": 0.7943902, "num_input_tokens_seen": 227798580, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.28173828, "step": 10566, "time_per_iteration": 2.6383228302001953 }, { "auxiliary_loss_clip": 0.01329952, "auxiliary_loss_mlp": 0.0027112, "balance_loss_clip": 1.10359693, "balance_loss_mlp": 0.24421448, "epoch": 0.635322410942432, "flos": 27525421307520.0, "grad_norm": 68.87142713039943, "language_loss": 0.7506966, "learning_rate": 1.2400786592098725e-06, "loss": 0.7667073, "num_input_tokens_seen": 227819210, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.26904297, "step": 10567, "time_per_iteration": 2.7130463123321533 }, { "auxiliary_loss_clip": 0.01349669, "auxiliary_loss_mlp": 0.00291157, "balance_loss_clip": 1.11893106, "balance_loss_mlp": 0.26090193, "epoch": 0.6353825341950999, "flos": 21543601766400.0, "grad_norm": 1541.3643730587303, "language_loss": 0.92180783, "learning_rate": 1.2397184205357154e-06, "loss": 0.93821603, "num_input_tokens_seen": 227838340, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.30236816, "step": 10568, "time_per_iteration": 2.6699957847595215 }, { "auxiliary_loss_clip": 0.01313571, "auxiliary_loss_mlp": 0.00280484, "balance_loss_clip": 1.09068072, "balance_loss_mlp": 0.25368541, "epoch": 0.635442657447768, "flos": 31759756185600.0, "grad_norm": 256.21715968186214, "language_loss": 0.91819584, "learning_rate": 1.2393582106915113e-06, "loss": 0.93413639, "num_input_tokens_seen": 227859170, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26794434, "step": 10569, "time_per_iteration": 2.7247705459594727 }, { "auxiliary_loss_clip": 0.01308586, "auxiliary_loss_mlp": 0.00271629, "balance_loss_clip": 1.08682728, "balance_loss_mlp": 0.24391288, "epoch": 0.6355027807004359, "flos": 19828831415040.0, "grad_norm": 5.283491377691104, "language_loss": 0.75400567, "learning_rate": 1.2389980296909198e-06, "loss": 0.76980776, "num_input_tokens_seen": 227878545, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.27685547, "step": 10570, "time_per_iteration": 2.6391632556915283 }, { "auxiliary_loss_clip": 0.01319254, "auxiliary_loss_mlp": 0.00312411, "balance_loss_clip": 1.09207618, "balance_loss_mlp": 0.28284726, "epoch": 0.6355629039531039, "flos": 30372383324160.0, "grad_norm": 20.663064496200164, "language_loss": 0.76403725, "learning_rate": 1.2386378775476e-06, "loss": 0.78035384, "num_input_tokens_seen": 227898875, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.29553223, "step": 10571, "time_per_iteration": 4.137019157409668 }, { "auxiliary_loss_clip": 0.01336463, "auxiliary_loss_mlp": 0.00272592, "balance_loss_clip": 1.10576677, "balance_loss_mlp": 0.24717677, "epoch": 0.6356230272057718, "flos": 17932065828480.0, "grad_norm": 62.99466995143967, "language_loss": 0.78540283, "learning_rate": 1.2382777542752074e-06, "loss": 0.80149341, "num_input_tokens_seen": 227917130, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.25415039, "step": 10572, "time_per_iteration": 4.145476818084717 }, { "auxiliary_loss_clip": 0.01313356, "auxiliary_loss_mlp": 0.00284814, "balance_loss_clip": 1.09317589, "balance_loss_mlp": 0.25769374, "epoch": 0.6356831504584398, "flos": 25375844822400.0, "grad_norm": 450.9350439627282, "language_loss": 0.86173201, "learning_rate": 1.2379176598873992e-06, "loss": 0.87771368, "num_input_tokens_seen": 227939550, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.27111816, "step": 10573, "time_per_iteration": 2.7504618167877197 }, { "auxiliary_loss_clip": 0.01325075, "auxiliary_loss_mlp": 0.00312687, "balance_loss_clip": 1.10082138, "balance_loss_mlp": 0.28411299, "epoch": 0.6357432737111077, "flos": 46500331720320.0, "grad_norm": 18.16833877964789, "language_loss": 0.75729454, "learning_rate": 1.2375575943978303e-06, "loss": 0.77367222, "num_input_tokens_seen": 227962200, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.28564453, "step": 10574, "time_per_iteration": 2.8483505249023438 }, { "auxiliary_loss_clip": 0.01328424, "auxiliary_loss_mlp": 0.00312892, "balance_loss_clip": 1.10196209, "balance_loss_mlp": 0.28528342, "epoch": 0.6358033969637757, "flos": 17274361847040.0, "grad_norm": 38.578103353933045, "language_loss": 0.96183187, "learning_rate": 1.2371975578201525e-06, "loss": 0.97824502, "num_input_tokens_seen": 227979270, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.27612305, "step": 10575, "time_per_iteration": 4.047002077102661 }, { "auxiliary_loss_clip": 0.01318665, "auxiliary_loss_mlp": 0.00287593, "balance_loss_clip": 1.09834802, "balance_loss_mlp": 0.26258239, "epoch": 0.6358635202164437, "flos": 27125520215040.0, "grad_norm": 30.785877605115054, "language_loss": 0.7782805, "learning_rate": 1.2368375501680204e-06, "loss": 0.79434311, "num_input_tokens_seen": 228000550, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25036621, "step": 10576, "time_per_iteration": 2.701847553253174 }, { "auxiliary_loss_clip": 0.01320756, "auxiliary_loss_mlp": 0.00282659, "balance_loss_clip": 1.0984118, "balance_loss_mlp": 0.25673103, "epoch": 0.6359236434691117, "flos": 27525205825920.0, "grad_norm": 6.9852240189034065, "language_loss": 0.74206769, "learning_rate": 1.236477571455085e-06, "loss": 0.75810182, "num_input_tokens_seen": 228022005, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25939941, "step": 10577, "time_per_iteration": 2.7426161766052246 }, { "auxiliary_loss_clip": 0.01324331, "auxiliary_loss_mlp": 0.00270133, "balance_loss_clip": 1.10084534, "balance_loss_mlp": 0.24430029, "epoch": 0.6359837667217797, "flos": 39348290989440.0, "grad_norm": 30.03863631893891, "language_loss": 0.80647767, "learning_rate": 1.2361176216949964e-06, "loss": 0.82242227, "num_input_tokens_seen": 228043770, "router_z_loss_clip": 2.23339844, "router_z_loss_mlp": 0.25830078, "step": 10578, "time_per_iteration": 2.835326671600342 }, { "auxiliary_loss_clip": 0.01229762, "auxiliary_loss_mlp": 0.00033582, "balance_loss_clip": 1.09323525, "balance_loss_mlp": 0.02704948, "epoch": 0.6360438899744476, "flos": 56413797206400.0, "grad_norm": 0.6842579444044682, "language_loss": 0.53678942, "learning_rate": 1.2357577009014044e-06, "loss": 0.54942286, "num_input_tokens_seen": 228104985, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06542969, "step": 10579, "time_per_iteration": 4.5555689334869385 }, { "auxiliary_loss_clip": 0.01315212, "auxiliary_loss_mlp": 0.00280001, "balance_loss_clip": 1.08977056, "balance_loss_mlp": 0.25115204, "epoch": 0.6361040132271156, "flos": 24973106555520.0, "grad_norm": 267.3025372176447, "language_loss": 0.856583, "learning_rate": 1.2353978090879568e-06, "loss": 0.87253517, "num_input_tokens_seen": 228125620, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.28833008, "step": 10580, "time_per_iteration": 2.781122922897339 }, { "auxiliary_loss_clip": 0.01311756, "auxiliary_loss_mlp": 0.00275417, "balance_loss_clip": 1.09254336, "balance_loss_mlp": 0.25006151, "epoch": 0.6361641364797835, "flos": 23259198130560.0, "grad_norm": 21.476537014699993, "language_loss": 0.73673242, "learning_rate": 1.235037946268301e-06, "loss": 0.75260419, "num_input_tokens_seen": 228143495, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.25354004, "step": 10581, "time_per_iteration": 2.684227705001831 }, { "auxiliary_loss_clip": 0.01302435, "auxiliary_loss_mlp": 0.00292298, "balance_loss_clip": 1.08379912, "balance_loss_mlp": 0.26664382, "epoch": 0.6362242597324516, "flos": 25994513698560.0, "grad_norm": 5.141423367266589, "language_loss": 0.76509702, "learning_rate": 1.2346781124560828e-06, "loss": 0.78104436, "num_input_tokens_seen": 228166500, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25683594, "step": 10582, "time_per_iteration": 2.7454686164855957 }, { "auxiliary_loss_clip": 0.01325981, "auxiliary_loss_mlp": 0.00295575, "balance_loss_clip": 1.1028285, "balance_loss_mlp": 0.26894403, "epoch": 0.6362843829851195, "flos": 25703242312320.0, "grad_norm": 884.9362521624552, "language_loss": 0.92255235, "learning_rate": 1.2343183076649473e-06, "loss": 0.93876791, "num_input_tokens_seen": 228185325, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26623535, "step": 10583, "time_per_iteration": 2.7152318954467773 }, { "auxiliary_loss_clip": 0.01332179, "auxiliary_loss_mlp": 0.00281124, "balance_loss_clip": 1.10757589, "balance_loss_mlp": 0.25463554, "epoch": 0.6363445062377875, "flos": 20522912895360.0, "grad_norm": 9.329491129970437, "language_loss": 0.82221699, "learning_rate": 1.233958531908538e-06, "loss": 0.83835, "num_input_tokens_seen": 228204050, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26525879, "step": 10584, "time_per_iteration": 2.793417453765869 }, { "auxiliary_loss_clip": 0.01309238, "auxiliary_loss_mlp": 0.00304942, "balance_loss_clip": 1.08440208, "balance_loss_mlp": 0.27665412, "epoch": 0.6364046294904554, "flos": 19463799450240.0, "grad_norm": 8.385514560613922, "language_loss": 0.80287683, "learning_rate": 1.2335987852004985e-06, "loss": 0.8190186, "num_input_tokens_seen": 228222430, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.28283691, "step": 10585, "time_per_iteration": 2.6669907569885254 }, { "auxiliary_loss_clip": 0.01317462, "auxiliary_loss_mlp": 0.00322815, "balance_loss_clip": 1.09626436, "balance_loss_mlp": 0.29511088, "epoch": 0.6364647527431234, "flos": 20995892208000.0, "grad_norm": 39.665895191565184, "language_loss": 0.89501727, "learning_rate": 1.2332390675544697e-06, "loss": 0.91142005, "num_input_tokens_seen": 228241925, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.27709961, "step": 10586, "time_per_iteration": 2.6465060710906982 }, { "auxiliary_loss_clip": 0.01299748, "auxiliary_loss_mlp": 0.00307631, "balance_loss_clip": 1.08604169, "balance_loss_mlp": 0.28209648, "epoch": 0.6365248759957913, "flos": 25770789838080.0, "grad_norm": 142.703178435658, "language_loss": 0.78477156, "learning_rate": 1.2328793789840918e-06, "loss": 0.80084538, "num_input_tokens_seen": 228262535, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.25561523, "step": 10587, "time_per_iteration": 2.691361904144287 }, { "auxiliary_loss_clip": 0.01321178, "auxiliary_loss_mlp": 0.00304628, "balance_loss_clip": 1.09844053, "balance_loss_mlp": 0.27681684, "epoch": 0.6365849992484593, "flos": 22455589104000.0, "grad_norm": 87.46129068108255, "language_loss": 0.83905983, "learning_rate": 1.2325197195030058e-06, "loss": 0.85531783, "num_input_tokens_seen": 228281340, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.27819824, "step": 10588, "time_per_iteration": 2.675084114074707 }, { "auxiliary_loss_clip": 0.01302388, "auxiliary_loss_mlp": 0.00302321, "balance_loss_clip": 1.08493948, "balance_loss_mlp": 0.27445, "epoch": 0.6366451225011273, "flos": 19025689265280.0, "grad_norm": 8.929159192450316, "language_loss": 0.84385502, "learning_rate": 1.2321600891248478e-06, "loss": 0.85990214, "num_input_tokens_seen": 228300865, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.27856445, "step": 10589, "time_per_iteration": 2.6993770599365234 }, { "auxiliary_loss_clip": 0.01325631, "auxiliary_loss_mlp": 0.00310442, "balance_loss_clip": 1.10555232, "balance_loss_mlp": 0.28302419, "epoch": 0.6367052457537953, "flos": 25228395492480.0, "grad_norm": 34.248760932962085, "language_loss": 0.77532065, "learning_rate": 1.231800487863257e-06, "loss": 0.79168141, "num_input_tokens_seen": 228320815, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.27404785, "step": 10590, "time_per_iteration": 2.7116940021514893 }, { "auxiliary_loss_clip": 0.01322416, "auxiliary_loss_mlp": 0.00303556, "balance_loss_clip": 1.09325576, "balance_loss_mlp": 0.27537453, "epoch": 0.6367653690064633, "flos": 19208438686080.0, "grad_norm": 9.310516476267823, "language_loss": 0.86528552, "learning_rate": 1.2314409157318685e-06, "loss": 0.88154525, "num_input_tokens_seen": 228339065, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.28173828, "step": 10591, "time_per_iteration": 2.64190411567688 }, { "auxiliary_loss_clip": 0.01312331, "auxiliary_loss_mlp": 0.00312402, "balance_loss_clip": 1.09363484, "balance_loss_mlp": 0.2859135, "epoch": 0.6368254922591312, "flos": 23546806329600.0, "grad_norm": 24.679587247814577, "language_loss": 0.95076942, "learning_rate": 1.231081372744317e-06, "loss": 0.9670167, "num_input_tokens_seen": 228359210, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.26489258, "step": 10592, "time_per_iteration": 2.7151992321014404 }, { "auxiliary_loss_clip": 0.01301811, "auxiliary_loss_mlp": 0.00322995, "balance_loss_clip": 1.08371139, "balance_loss_mlp": 0.294909, "epoch": 0.6368856155117992, "flos": 26467313443200.0, "grad_norm": 15.008840950403757, "language_loss": 0.73125017, "learning_rate": 1.2307218589142376e-06, "loss": 0.74749821, "num_input_tokens_seen": 228379630, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.28100586, "step": 10593, "time_per_iteration": 2.702620506286621 }, { "auxiliary_loss_clip": 0.0130733, "auxiliary_loss_mlp": 0.00297479, "balance_loss_clip": 1.08614588, "balance_loss_mlp": 0.27078795, "epoch": 0.6369457387644671, "flos": 33692432394240.0, "grad_norm": 48.75733659355035, "language_loss": 0.71147186, "learning_rate": 1.2303623742552618e-06, "loss": 0.72751993, "num_input_tokens_seen": 228401410, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26660156, "step": 10594, "time_per_iteration": 2.78024959564209 }, { "auxiliary_loss_clip": 0.01201549, "auxiliary_loss_mlp": 0.00097548, "balance_loss_clip": 1.06060171, "balance_loss_mlp": 0.09006175, "epoch": 0.6370058620171352, "flos": 70908600908160.0, "grad_norm": 0.7380371548155644, "language_loss": 0.54037547, "learning_rate": 1.230002918781022e-06, "loss": 0.55336642, "num_input_tokens_seen": 228470335, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.07470703, "step": 10595, "time_per_iteration": 3.2539753913879395 }, { "auxiliary_loss_clip": 0.01329838, "auxiliary_loss_mlp": 0.00295213, "balance_loss_clip": 1.10137868, "balance_loss_mlp": 0.26677004, "epoch": 0.6370659852698031, "flos": 21141940907520.0, "grad_norm": 4.495530354474944, "language_loss": 0.74232197, "learning_rate": 1.2296434925051493e-06, "loss": 0.75857258, "num_input_tokens_seen": 228490765, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.28442383, "step": 10596, "time_per_iteration": 2.6934781074523926 }, { "auxiliary_loss_clip": 0.01316101, "auxiliary_loss_mlp": 0.00294426, "balance_loss_clip": 1.09574032, "balance_loss_mlp": 0.26618522, "epoch": 0.6371261085224711, "flos": 20193288762240.0, "grad_norm": 121.13850350845412, "language_loss": 0.89264911, "learning_rate": 1.2292840954412718e-06, "loss": 0.90875441, "num_input_tokens_seen": 228509700, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.28210449, "step": 10597, "time_per_iteration": 2.7051053047180176 }, { "auxiliary_loss_clip": 0.01347598, "auxiliary_loss_mlp": 0.00295189, "balance_loss_clip": 1.11183405, "balance_loss_mlp": 0.26926091, "epoch": 0.637186231775139, "flos": 19683536901120.0, "grad_norm": 19.48611780179347, "language_loss": 0.79286706, "learning_rate": 1.2289247276030189e-06, "loss": 0.80929494, "num_input_tokens_seen": 228529050, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.25927734, "step": 10598, "time_per_iteration": 2.7356369495391846 }, { "auxiliary_loss_clip": 0.01344298, "auxiliary_loss_mlp": 0.00322295, "balance_loss_clip": 1.1100204, "balance_loss_mlp": 0.29289785, "epoch": 0.637246355027807, "flos": 13071196995840.0, "grad_norm": 14.526710003121748, "language_loss": 0.75372213, "learning_rate": 1.2285653890040176e-06, "loss": 0.77038813, "num_input_tokens_seen": 228544665, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.29394531, "step": 10599, "time_per_iteration": 2.6622183322906494 }, { "auxiliary_loss_clip": 0.01351309, "auxiliary_loss_mlp": 0.00341526, "balance_loss_clip": 1.11247814, "balance_loss_mlp": 0.31005466, "epoch": 0.6373064782804749, "flos": 18222654856320.0, "grad_norm": 51.44489662200496, "language_loss": 0.89542979, "learning_rate": 1.2282060796578942e-06, "loss": 0.91235811, "num_input_tokens_seen": 228562060, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.31494141, "step": 10600, "time_per_iteration": 2.6249659061431885 }, { "auxiliary_loss_clip": 0.01326175, "auxiliary_loss_mlp": 0.00320362, "balance_loss_clip": 1.10197735, "balance_loss_mlp": 0.29215717, "epoch": 0.637366601533143, "flos": 24498475217280.0, "grad_norm": 101.74078541289214, "language_loss": 0.84652841, "learning_rate": 1.2278467995782732e-06, "loss": 0.86299378, "num_input_tokens_seen": 228582550, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.28186035, "step": 10601, "time_per_iteration": 2.7083680629730225 }, { "auxiliary_loss_clip": 0.01337833, "auxiliary_loss_mlp": 0.003199, "balance_loss_clip": 1.10256243, "balance_loss_mlp": 0.2898595, "epoch": 0.6374267247858109, "flos": 26359042872960.0, "grad_norm": 23.299745438839214, "language_loss": 0.76056683, "learning_rate": 1.2274875487787797e-06, "loss": 0.77714419, "num_input_tokens_seen": 228604960, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.30029297, "step": 10602, "time_per_iteration": 2.7213990688323975 }, { "auxiliary_loss_clip": 0.01320383, "auxiliary_loss_mlp": 0.0032622, "balance_loss_clip": 1.09391093, "balance_loss_mlp": 0.29794407, "epoch": 0.6374868480384789, "flos": 20371728551040.0, "grad_norm": 5.622948204089622, "language_loss": 0.84514511, "learning_rate": 1.2271283272730354e-06, "loss": 0.86161113, "num_input_tokens_seen": 228622195, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.2833252, "step": 10603, "time_per_iteration": 2.7457683086395264 }, { "auxiliary_loss_clip": 0.01322071, "auxiliary_loss_mlp": 0.00309544, "balance_loss_clip": 1.09786975, "balance_loss_mlp": 0.28117183, "epoch": 0.6375469712911469, "flos": 20996251344000.0, "grad_norm": 14.071101774520777, "language_loss": 0.86968106, "learning_rate": 1.2267691350746621e-06, "loss": 0.88599718, "num_input_tokens_seen": 228639735, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.28344727, "step": 10604, "time_per_iteration": 2.6339969635009766 }, { "auxiliary_loss_clip": 0.01336357, "auxiliary_loss_mlp": 0.00320454, "balance_loss_clip": 1.10180521, "balance_loss_mlp": 0.28912613, "epoch": 0.6376070945438148, "flos": 19715748422400.0, "grad_norm": 181.5412169242318, "language_loss": 0.84276414, "learning_rate": 1.226409972197281e-06, "loss": 0.85933226, "num_input_tokens_seen": 228658195, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.31298828, "step": 10605, "time_per_iteration": 2.702362298965454 }, { "auxiliary_loss_clip": 0.01350643, "auxiliary_loss_mlp": 0.00321684, "balance_loss_clip": 1.1133796, "balance_loss_mlp": 0.29002243, "epoch": 0.6376672177964828, "flos": 21506757390720.0, "grad_norm": 12.502249458799074, "language_loss": 0.73801053, "learning_rate": 1.2260508386545106e-06, "loss": 0.7547338, "num_input_tokens_seen": 228677415, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.31665039, "step": 10606, "time_per_iteration": 2.7296223640441895 }, { "auxiliary_loss_clip": 0.01315504, "auxiliary_loss_mlp": 0.00319205, "balance_loss_clip": 1.0948875, "balance_loss_mlp": 0.29181054, "epoch": 0.6377273410491507, "flos": 18843873598080.0, "grad_norm": 6098.390928700006, "language_loss": 0.8364442, "learning_rate": 1.225691734459971e-06, "loss": 0.85279125, "num_input_tokens_seen": 228696450, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.27392578, "step": 10607, "time_per_iteration": 2.647829055786133 }, { "auxiliary_loss_clip": 0.01342008, "auxiliary_loss_mlp": 0.00322227, "balance_loss_clip": 1.11412847, "balance_loss_mlp": 0.29331884, "epoch": 0.6377874643018188, "flos": 53062970181120.0, "grad_norm": 3.4758798882768303, "language_loss": 0.72130346, "learning_rate": 1.225332659627278e-06, "loss": 0.7379458, "num_input_tokens_seen": 228721600, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.2890625, "step": 10608, "time_per_iteration": 2.9333865642547607 }, { "auxiliary_loss_clip": 0.01216183, "auxiliary_loss_mlp": 0.00062506, "balance_loss_clip": 1.07641888, "balance_loss_mlp": 0.0566887, "epoch": 0.6378475875544867, "flos": 65135026465920.0, "grad_norm": 0.7314460600515144, "language_loss": 0.51699847, "learning_rate": 1.2249736141700475e-06, "loss": 0.52978534, "num_input_tokens_seen": 228784535, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.05810547, "step": 10609, "time_per_iteration": 3.1176838874816895 }, { "auxiliary_loss_clip": 0.0132074, "auxiliary_loss_mlp": 0.00296537, "balance_loss_clip": 1.09488153, "balance_loss_mlp": 0.270549, "epoch": 0.6379077108071547, "flos": 23002759958400.0, "grad_norm": 65.1116446195222, "language_loss": 0.82907069, "learning_rate": 1.2246145981018965e-06, "loss": 0.84524345, "num_input_tokens_seen": 228804110, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.25964355, "step": 10610, "time_per_iteration": 2.832923412322998 }, { "auxiliary_loss_clip": 0.01216455, "auxiliary_loss_mlp": 0.00092342, "balance_loss_clip": 1.07738543, "balance_loss_mlp": 0.08561824, "epoch": 0.6379678340598226, "flos": 67601947610880.0, "grad_norm": 0.8208219875041748, "language_loss": 0.6206125, "learning_rate": 1.2242556114364364e-06, "loss": 0.63370049, "num_input_tokens_seen": 228867705, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.06738281, "step": 10611, "time_per_iteration": 3.158858299255371 }, { "auxiliary_loss_clip": 0.01333591, "auxiliary_loss_mlp": 0.00313696, "balance_loss_clip": 1.10324073, "balance_loss_mlp": 0.28522837, "epoch": 0.6380279573124906, "flos": 29680061610240.0, "grad_norm": 10.045676310935978, "language_loss": 0.79281938, "learning_rate": 1.223896654187282e-06, "loss": 0.8092922, "num_input_tokens_seen": 228889215, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.28491211, "step": 10612, "time_per_iteration": 2.7353837490081787 }, { "auxiliary_loss_clip": 0.01208946, "auxiliary_loss_mlp": 0.00065819, "balance_loss_clip": 1.06953239, "balance_loss_mlp": 0.05795102, "epoch": 0.6380880805651585, "flos": 66484046580480.0, "grad_norm": 0.7081144700903569, "language_loss": 0.56981945, "learning_rate": 1.2235377263680446e-06, "loss": 0.58256716, "num_input_tokens_seen": 228948465, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.07861328, "step": 10613, "time_per_iteration": 4.428763389587402 }, { "auxiliary_loss_clip": 0.01343138, "auxiliary_loss_mlp": 0.00294512, "balance_loss_clip": 1.11144853, "balance_loss_mlp": 0.26646221, "epoch": 0.6381482038178266, "flos": 23914998691200.0, "grad_norm": 5.061211440882698, "language_loss": 0.81358731, "learning_rate": 1.2231788279923334e-06, "loss": 0.8299638, "num_input_tokens_seen": 228967955, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.28051758, "step": 10614, "time_per_iteration": 4.1110756397247314 }, { "auxiliary_loss_clip": 0.01344319, "auxiliary_loss_mlp": 0.00316979, "balance_loss_clip": 1.10902727, "balance_loss_mlp": 0.2870571, "epoch": 0.6382083270704945, "flos": 24243042625920.0, "grad_norm": 7.349668164184247, "language_loss": 0.86890954, "learning_rate": 1.2228199590737599e-06, "loss": 0.88552248, "num_input_tokens_seen": 228985495, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.29907227, "step": 10615, "time_per_iteration": 2.653588056564331 }, { "auxiliary_loss_clip": 0.01219515, "auxiliary_loss_mlp": 0.00059782, "balance_loss_clip": 1.07779527, "balance_loss_mlp": 0.05305844, "epoch": 0.6382684503231625, "flos": 70775552931840.0, "grad_norm": 0.7543205484595451, "language_loss": 0.54915917, "learning_rate": 1.2224611196259305e-06, "loss": 0.56195211, "num_input_tokens_seen": 229052995, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.06738281, "step": 10616, "time_per_iteration": 3.2411060333251953 }, { "auxiliary_loss_clip": 0.01347075, "auxiliary_loss_mlp": 0.00345276, "balance_loss_clip": 1.11577249, "balance_loss_mlp": 0.31537852, "epoch": 0.6383285735758305, "flos": 16544836621440.0, "grad_norm": 11.710828436496751, "language_loss": 0.90828246, "learning_rate": 1.2221023096624538e-06, "loss": 0.92520595, "num_input_tokens_seen": 229071030, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.29833984, "step": 10617, "time_per_iteration": 4.006694793701172 }, { "auxiliary_loss_clip": 0.01341521, "auxiliary_loss_mlp": 0.00336123, "balance_loss_clip": 1.1041348, "balance_loss_mlp": 0.30535498, "epoch": 0.6383886968284984, "flos": 14427651225600.0, "grad_norm": 8.715667214244343, "language_loss": 0.94235021, "learning_rate": 1.221743529196936e-06, "loss": 0.95912671, "num_input_tokens_seen": 229088275, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.30761719, "step": 10618, "time_per_iteration": 2.648388147354126 }, { "auxiliary_loss_clip": 0.01324167, "auxiliary_loss_mlp": 0.0033205, "balance_loss_clip": 1.09509158, "balance_loss_mlp": 0.30227152, "epoch": 0.6384488200811664, "flos": 17929659617280.0, "grad_norm": 3.5579882786992916, "language_loss": 0.82357633, "learning_rate": 1.2213847782429806e-06, "loss": 0.84013855, "num_input_tokens_seen": 229105190, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.29772949, "step": 10619, "time_per_iteration": 2.6519646644592285 }, { "auxiliary_loss_clip": 0.01348496, "auxiliary_loss_mlp": 0.00337648, "balance_loss_clip": 1.10785949, "balance_loss_mlp": 0.30699962, "epoch": 0.6385089433338343, "flos": 18515578268160.0, "grad_norm": 10.870535672394851, "language_loss": 0.83973491, "learning_rate": 1.221026056814193e-06, "loss": 0.85659635, "num_input_tokens_seen": 229122290, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.30664062, "step": 10620, "time_per_iteration": 2.656034469604492 }, { "auxiliary_loss_clip": 0.01358901, "auxiliary_loss_mlp": 0.00320406, "balance_loss_clip": 1.1215564, "balance_loss_mlp": 0.29236776, "epoch": 0.6385690665865024, "flos": 24753620499840.0, "grad_norm": 45.499741641004306, "language_loss": 0.80624843, "learning_rate": 1.2206673649241752e-06, "loss": 0.8230415, "num_input_tokens_seen": 229141620, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.28027344, "step": 10621, "time_per_iteration": 4.113484621047974 }, { "auxiliary_loss_clip": 0.01336078, "auxiliary_loss_mlp": 0.00302662, "balance_loss_clip": 1.10585892, "balance_loss_mlp": 0.27414674, "epoch": 0.6386291898391703, "flos": 20120569678080.0, "grad_norm": 98.95786076539129, "language_loss": 0.83346635, "learning_rate": 1.220308702586529e-06, "loss": 0.84985375, "num_input_tokens_seen": 229161570, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.28540039, "step": 10622, "time_per_iteration": 2.648211717605591 }, { "auxiliary_loss_clip": 0.01330546, "auxiliary_loss_mlp": 0.00278665, "balance_loss_clip": 1.10214043, "balance_loss_mlp": 0.25122309, "epoch": 0.6386893130918383, "flos": 16867278034560.0, "grad_norm": 262.04378774587144, "language_loss": 0.81171858, "learning_rate": 1.2199500698148546e-06, "loss": 0.8278107, "num_input_tokens_seen": 229178465, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.2746582, "step": 10623, "time_per_iteration": 2.7018375396728516 }, { "auxiliary_loss_clip": 0.01317438, "auxiliary_loss_mlp": 0.00315886, "balance_loss_clip": 1.09532583, "balance_loss_mlp": 0.28890878, "epoch": 0.6387494363445062, "flos": 22966274718720.0, "grad_norm": 2.5337377322700347, "language_loss": 0.81829232, "learning_rate": 1.2195914666227527e-06, "loss": 0.8346256, "num_input_tokens_seen": 229198975, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.27001953, "step": 10624, "time_per_iteration": 2.7024853229522705 }, { "auxiliary_loss_clip": 0.01349868, "auxiliary_loss_mlp": 0.00308223, "balance_loss_clip": 1.11360657, "balance_loss_mlp": 0.27991036, "epoch": 0.6388095595971742, "flos": 22857716839680.0, "grad_norm": 9.832380484943643, "language_loss": 0.88159788, "learning_rate": 1.21923289302382e-06, "loss": 0.89817882, "num_input_tokens_seen": 229218825, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.28308105, "step": 10625, "time_per_iteration": 2.6761586666107178 }, { "auxiliary_loss_clip": 0.01349081, "auxiliary_loss_mlp": 0.00314837, "balance_loss_clip": 1.11456025, "balance_loss_mlp": 0.28309143, "epoch": 0.6388696828498421, "flos": 17311529445120.0, "grad_norm": 13.019868406840951, "language_loss": 0.80655426, "learning_rate": 1.218874349031654e-06, "loss": 0.82319343, "num_input_tokens_seen": 229236060, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.31750488, "step": 10626, "time_per_iteration": 2.681581735610962 }, { "auxiliary_loss_clip": 0.01350904, "auxiliary_loss_mlp": 0.00308505, "balance_loss_clip": 1.11415482, "balance_loss_mlp": 0.27913171, "epoch": 0.6389298061025102, "flos": 17128636369920.0, "grad_norm": 28.81007041173824, "language_loss": 0.79676735, "learning_rate": 1.2185158346598517e-06, "loss": 0.81336153, "num_input_tokens_seen": 229255160, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.29394531, "step": 10627, "time_per_iteration": 2.60990571975708 }, { "auxiliary_loss_clip": 0.01347028, "auxiliary_loss_mlp": 0.00310295, "balance_loss_clip": 1.1088109, "balance_loss_mlp": 0.27949154, "epoch": 0.6389899293551781, "flos": 27710971989120.0, "grad_norm": 16.262848661465867, "language_loss": 0.75712878, "learning_rate": 1.2181573499220064e-06, "loss": 0.77370203, "num_input_tokens_seen": 229278705, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.30786133, "step": 10628, "time_per_iteration": 2.6972618103027344 }, { "auxiliary_loss_clip": 0.01317486, "auxiliary_loss_mlp": 0.00319573, "balance_loss_clip": 1.09477806, "balance_loss_mlp": 0.29171357, "epoch": 0.6390500526078461, "flos": 21215701486080.0, "grad_norm": 22.977909727304432, "language_loss": 0.73906642, "learning_rate": 1.2177988948317135e-06, "loss": 0.75543702, "num_input_tokens_seen": 229299990, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.27832031, "step": 10629, "time_per_iteration": 2.73294997215271 }, { "auxiliary_loss_clip": 0.0136349, "auxiliary_loss_mlp": 0.00358543, "balance_loss_clip": 1.11489511, "balance_loss_mlp": 0.32452106, "epoch": 0.6391101758605141, "flos": 21581056673280.0, "grad_norm": 25.995776257919722, "language_loss": 0.84266829, "learning_rate": 1.2174404694025646e-06, "loss": 0.85988867, "num_input_tokens_seen": 229319230, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.34033203, "step": 10630, "time_per_iteration": 2.695523738861084 }, { "auxiliary_loss_clip": 0.01315178, "auxiliary_loss_mlp": 0.00336247, "balance_loss_clip": 1.09144425, "balance_loss_mlp": 0.30738604, "epoch": 0.639170299113182, "flos": 19900473091200.0, "grad_norm": 26.608633351860895, "language_loss": 0.75109178, "learning_rate": 1.2170820736481511e-06, "loss": 0.76760602, "num_input_tokens_seen": 229338600, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.28869629, "step": 10631, "time_per_iteration": 2.643413782119751 }, { "auxiliary_loss_clip": 0.01213964, "auxiliary_loss_mlp": 0.00116791, "balance_loss_clip": 1.06785309, "balance_loss_mlp": 0.10754063, "epoch": 0.63923042236585, "flos": 69877604833920.0, "grad_norm": 0.7548304968797463, "language_loss": 0.62241942, "learning_rate": 1.2167237075820646e-06, "loss": 0.63572693, "num_input_tokens_seen": 229402420, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.09228516, "step": 10632, "time_per_iteration": 3.222689151763916 }, { "auxiliary_loss_clip": 0.01328005, "auxiliary_loss_mlp": 0.00318873, "balance_loss_clip": 1.09855294, "balance_loss_mlp": 0.29051358, "epoch": 0.639290545618518, "flos": 22674823764480.0, "grad_norm": 11.163677810233947, "language_loss": 0.74276173, "learning_rate": 1.216365371217893e-06, "loss": 0.75923049, "num_input_tokens_seen": 229419185, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.28344727, "step": 10633, "time_per_iteration": 2.6719958782196045 }, { "auxiliary_loss_clip": 0.01321772, "auxiliary_loss_mlp": 0.00324685, "balance_loss_clip": 1.09514141, "balance_loss_mlp": 0.29698098, "epoch": 0.639350668871186, "flos": 19829190551040.0, "grad_norm": 5.594006695077692, "language_loss": 0.89060926, "learning_rate": 1.216007064569225e-06, "loss": 0.90707392, "num_input_tokens_seen": 229436735, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.27722168, "step": 10634, "time_per_iteration": 2.715884208679199 }, { "auxiliary_loss_clip": 0.01357959, "auxiliary_loss_mlp": 0.00328902, "balance_loss_clip": 1.12123692, "balance_loss_mlp": 0.2998988, "epoch": 0.6394107921238539, "flos": 20553328736640.0, "grad_norm": 76.91496261733667, "language_loss": 0.82268906, "learning_rate": 1.2156487876496483e-06, "loss": 0.83955771, "num_input_tokens_seen": 229455595, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.2902832, "step": 10635, "time_per_iteration": 2.680143117904663 }, { "auxiliary_loss_clip": 0.01322751, "auxiliary_loss_mlp": 0.00338067, "balance_loss_clip": 1.09458733, "balance_loss_mlp": 0.30919451, "epoch": 0.6394709153765219, "flos": 25774991729280.0, "grad_norm": 23.107799391006544, "language_loss": 0.77259934, "learning_rate": 1.2152905404727475e-06, "loss": 0.78920758, "num_input_tokens_seen": 229476230, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.28857422, "step": 10636, "time_per_iteration": 2.693263530731201 }, { "auxiliary_loss_clip": 0.0134059, "auxiliary_loss_mlp": 0.00344051, "balance_loss_clip": 1.10173059, "balance_loss_mlp": 0.31305683, "epoch": 0.6395310386291898, "flos": 17530153574400.0, "grad_norm": 16.717493452812853, "language_loss": 0.81575036, "learning_rate": 1.2149323230521085e-06, "loss": 0.83259672, "num_input_tokens_seen": 229494300, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.31030273, "step": 10637, "time_per_iteration": 2.639697551727295 }, { "auxiliary_loss_clip": 0.01328877, "auxiliary_loss_mlp": 0.00371586, "balance_loss_clip": 1.09828448, "balance_loss_mlp": 0.33928025, "epoch": 0.6395911618818578, "flos": 18588225525120.0, "grad_norm": 27.74804647802693, "language_loss": 0.85659611, "learning_rate": 1.2145741354013143e-06, "loss": 0.87360078, "num_input_tokens_seen": 229512985, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.32324219, "step": 10638, "time_per_iteration": 2.633134365081787 }, { "auxiliary_loss_clip": 0.01349738, "auxiliary_loss_mlp": 0.00307842, "balance_loss_clip": 1.11145258, "balance_loss_mlp": 0.27639496, "epoch": 0.6396512851345257, "flos": 28366557068160.0, "grad_norm": 19.49472039380725, "language_loss": 0.8831014, "learning_rate": 1.2142159775339478e-06, "loss": 0.89967722, "num_input_tokens_seen": 229534270, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.31445312, "step": 10639, "time_per_iteration": 2.724376678466797 }, { "auxiliary_loss_clip": 0.01224251, "auxiliary_loss_mlp": 0.00158425, "balance_loss_clip": 1.07620311, "balance_loss_mlp": 0.14712372, "epoch": 0.6397114083871938, "flos": 70724307202560.0, "grad_norm": 0.8234453156705953, "language_loss": 0.5838567, "learning_rate": 1.21385784946359e-06, "loss": 0.59768343, "num_input_tokens_seen": 229596455, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.11279297, "step": 10640, "time_per_iteration": 3.137103319168091 }, { "auxiliary_loss_clip": 0.01316726, "auxiliary_loss_mlp": 0.00334964, "balance_loss_clip": 1.09166861, "balance_loss_mlp": 0.30664015, "epoch": 0.6397715316398617, "flos": 18142537570560.0, "grad_norm": 15.26380223263372, "language_loss": 0.83760935, "learning_rate": 1.2134997512038215e-06, "loss": 0.85412621, "num_input_tokens_seen": 229612860, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.28320312, "step": 10641, "time_per_iteration": 2.6568174362182617 }, { "auxiliary_loss_clip": 0.01337911, "auxiliary_loss_mlp": 0.00321641, "balance_loss_clip": 1.09575891, "balance_loss_mlp": 0.29249439, "epoch": 0.6398316548925297, "flos": 25739512070400.0, "grad_norm": 114.2933621908355, "language_loss": 0.7285403, "learning_rate": 1.2131416827682209e-06, "loss": 0.74513578, "num_input_tokens_seen": 229633960, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.29162598, "step": 10642, "time_per_iteration": 2.7038278579711914 }, { "auxiliary_loss_clip": 0.01222902, "auxiliary_loss_mlp": 0.00116077, "balance_loss_clip": 1.07788885, "balance_loss_mlp": 0.10854261, "epoch": 0.6398917781451977, "flos": 71214234756480.0, "grad_norm": 1.1274009292353036, "language_loss": 0.55601174, "learning_rate": 1.2127836441703667e-06, "loss": 0.5694015, "num_input_tokens_seen": 229686730, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.07519531, "step": 10643, "time_per_iteration": 3.1127641201019287 }, { "auxiliary_loss_clip": 0.01351902, "auxiliary_loss_mlp": 0.00342855, "balance_loss_clip": 1.11119437, "balance_loss_mlp": 0.31016749, "epoch": 0.6399519013978656, "flos": 20521835487360.0, "grad_norm": 408255.3123770578, "language_loss": 0.8417291, "learning_rate": 1.2124256354238358e-06, "loss": 0.85867667, "num_input_tokens_seen": 229704800, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.32666016, "step": 10644, "time_per_iteration": 2.6148338317871094 }, { "auxiliary_loss_clip": 0.0135124, "auxiliary_loss_mlp": 0.00329187, "balance_loss_clip": 1.11523223, "balance_loss_mlp": 0.29850292, "epoch": 0.6400120246505336, "flos": 24460840742400.0, "grad_norm": 170.29372487351387, "language_loss": 0.87225366, "learning_rate": 1.212067656542203e-06, "loss": 0.88905799, "num_input_tokens_seen": 229725265, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.30688477, "step": 10645, "time_per_iteration": 2.7186522483825684 }, { "auxiliary_loss_clip": 0.01337389, "auxiliary_loss_mlp": 0.00333971, "balance_loss_clip": 1.09449184, "balance_loss_mlp": 0.30142692, "epoch": 0.6400721479032015, "flos": 28366090191360.0, "grad_norm": 100.64132652135164, "language_loss": 0.83293349, "learning_rate": 1.2117097075390447e-06, "loss": 0.8496471, "num_input_tokens_seen": 229744840, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.32543945, "step": 10646, "time_per_iteration": 2.72652530670166 }, { "auxiliary_loss_clip": 0.01332968, "auxiliary_loss_mlp": 0.00325344, "balance_loss_clip": 1.10186863, "balance_loss_mlp": 0.29618585, "epoch": 0.6401322711558696, "flos": 17816540711040.0, "grad_norm": 122.39719522474606, "language_loss": 0.89471602, "learning_rate": 1.2113517884279327e-06, "loss": 0.91129917, "num_input_tokens_seen": 229759095, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.29174805, "step": 10647, "time_per_iteration": 2.613165855407715 }, { "auxiliary_loss_clip": 0.01353473, "auxiliary_loss_mlp": 0.00309361, "balance_loss_clip": 1.12343788, "balance_loss_mlp": 0.28264642, "epoch": 0.6401923944085375, "flos": 26030855283840.0, "grad_norm": 7.063916518980274, "language_loss": 0.80300188, "learning_rate": 1.2109938992224399e-06, "loss": 0.81963015, "num_input_tokens_seen": 229777750, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.26757812, "step": 10648, "time_per_iteration": 2.704153537750244 }, { "auxiliary_loss_clip": 0.0129949, "auxiliary_loss_mlp": 0.00317216, "balance_loss_clip": 1.07668388, "balance_loss_mlp": 0.28880799, "epoch": 0.6402525176612055, "flos": 23586451966080.0, "grad_norm": 10.157343880643904, "language_loss": 0.84352458, "learning_rate": 1.210636039936138e-06, "loss": 0.85969162, "num_input_tokens_seen": 229796785, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.28430176, "step": 10649, "time_per_iteration": 2.6895945072174072 }, { "auxiliary_loss_clip": 0.01339647, "auxiliary_loss_mlp": 0.00291847, "balance_loss_clip": 1.10395288, "balance_loss_mlp": 0.26311785, "epoch": 0.6403126409138734, "flos": 18041413806720.0, "grad_norm": 22.804704480556165, "language_loss": 0.85011041, "learning_rate": 1.2102782105825956e-06, "loss": 0.86642528, "num_input_tokens_seen": 229815425, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.2869873, "step": 10650, "time_per_iteration": 2.665221929550171 }, { "auxiliary_loss_clip": 0.01309893, "auxiliary_loss_mlp": 0.00306538, "balance_loss_clip": 1.08697009, "balance_loss_mlp": 0.27855986, "epoch": 0.6403727641665414, "flos": 21979485308160.0, "grad_norm": 139.90529594696036, "language_loss": 0.76489186, "learning_rate": 1.2099204111753833e-06, "loss": 0.78105617, "num_input_tokens_seen": 229834545, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.27966309, "step": 10651, "time_per_iteration": 2.6852502822875977 }, { "auxiliary_loss_clip": 0.01327999, "auxiliary_loss_mlp": 0.00321454, "balance_loss_clip": 1.09663665, "balance_loss_mlp": 0.29056704, "epoch": 0.6404328874192093, "flos": 24895539135360.0, "grad_norm": 7.445664833555336, "language_loss": 0.74666989, "learning_rate": 1.2095626417280684e-06, "loss": 0.76316446, "num_input_tokens_seen": 229849175, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.30871582, "step": 10652, "time_per_iteration": 2.686128616333008 }, { "auxiliary_loss_clip": 0.01353393, "auxiliary_loss_mlp": 0.00279338, "balance_loss_clip": 1.11752033, "balance_loss_mlp": 0.25174123, "epoch": 0.6404930106718774, "flos": 17597198309760.0, "grad_norm": 24.19916343581083, "language_loss": 0.87889749, "learning_rate": 1.2092049022542168e-06, "loss": 0.89522475, "num_input_tokens_seen": 229865400, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.27587891, "step": 10653, "time_per_iteration": 2.592432975769043 }, { "auxiliary_loss_clip": 0.01373041, "auxiliary_loss_mlp": 0.00351488, "balance_loss_clip": 1.12196898, "balance_loss_mlp": 0.31946856, "epoch": 0.6405531339245453, "flos": 20157880930560.0, "grad_norm": 28.96365066113406, "language_loss": 0.83259159, "learning_rate": 1.2088471927673952e-06, "loss": 0.84983689, "num_input_tokens_seen": 229882945, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.3203125, "step": 10654, "time_per_iteration": 2.6467502117156982 }, { "auxiliary_loss_clip": 0.01348849, "auxiliary_loss_mlp": 0.00338311, "balance_loss_clip": 1.10617971, "balance_loss_mlp": 0.30780578, "epoch": 0.6406132571772133, "flos": 21942281796480.0, "grad_norm": 31.06370620608267, "language_loss": 0.80821943, "learning_rate": 1.2084895132811666e-06, "loss": 0.82509112, "num_input_tokens_seen": 229901590, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.30505371, "step": 10655, "time_per_iteration": 4.159826040267944 }, { "auxiliary_loss_clip": 0.01333661, "auxiliary_loss_mlp": 0.00319405, "balance_loss_clip": 1.10070205, "balance_loss_mlp": 0.28835148, "epoch": 0.6406733804298813, "flos": 28768002445440.0, "grad_norm": 23.62355876494868, "language_loss": 0.89710861, "learning_rate": 1.2081318638090952e-06, "loss": 0.91363931, "num_input_tokens_seen": 229922535, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.31066895, "step": 10656, "time_per_iteration": 4.142528295516968 }, { "auxiliary_loss_clip": 0.01333688, "auxiliary_loss_mlp": 0.00327145, "balance_loss_clip": 1.10241759, "balance_loss_mlp": 0.2988686, "epoch": 0.6407335036825492, "flos": 17457183095040.0, "grad_norm": 24.178742518927642, "language_loss": 0.82640803, "learning_rate": 1.2077742443647433e-06, "loss": 0.84301639, "num_input_tokens_seen": 229939575, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.28320312, "step": 10657, "time_per_iteration": 2.6341371536254883 }, { "auxiliary_loss_clip": 0.01320295, "auxiliary_loss_mlp": 0.00275876, "balance_loss_clip": 1.09565341, "balance_loss_mlp": 0.24893469, "epoch": 0.6407936269352172, "flos": 22125282612480.0, "grad_norm": 40.59238665603964, "language_loss": 0.85083246, "learning_rate": 1.2074166549616707e-06, "loss": 0.86679423, "num_input_tokens_seen": 229958840, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.26977539, "step": 10658, "time_per_iteration": 2.667628049850464 }, { "auxiliary_loss_clip": 0.01340838, "auxiliary_loss_mlp": 0.00310861, "balance_loss_clip": 1.10162902, "balance_loss_mlp": 0.28126171, "epoch": 0.6408537501878852, "flos": 23110635479040.0, "grad_norm": 18.353326279660113, "language_loss": 0.81593251, "learning_rate": 1.2070590956134386e-06, "loss": 0.83244956, "num_input_tokens_seen": 229979680, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.29614258, "step": 10659, "time_per_iteration": 4.090449094772339 }, { "auxiliary_loss_clip": 0.0134869, "auxiliary_loss_mlp": 0.00281506, "balance_loss_clip": 1.10716021, "balance_loss_mlp": 0.25225246, "epoch": 0.6409138734405532, "flos": 16472440759680.0, "grad_norm": 512.5291181837384, "language_loss": 0.85643709, "learning_rate": 1.2067015663336046e-06, "loss": 0.87273896, "num_input_tokens_seen": 229996830, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.2923584, "step": 10660, "time_per_iteration": 2.6665236949920654 }, { "auxiliary_loss_clip": 0.01376292, "auxiliary_loss_mlp": 0.00326408, "balance_loss_clip": 1.12399507, "balance_loss_mlp": 0.29200381, "epoch": 0.6409739966932211, "flos": 22777922776320.0, "grad_norm": 3.3015202651241116, "language_loss": 0.78246343, "learning_rate": 1.206344067135727e-06, "loss": 0.79949033, "num_input_tokens_seen": 230015115, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.34423828, "step": 10661, "time_per_iteration": 2.6736040115356445 }, { "auxiliary_loss_clip": 0.01328664, "auxiliary_loss_mlp": 0.00307794, "balance_loss_clip": 1.09757793, "balance_loss_mlp": 0.28106761, "epoch": 0.6410341199458891, "flos": 25152049134720.0, "grad_norm": 7929.995398006301, "language_loss": 0.82002431, "learning_rate": 1.205986598033362e-06, "loss": 0.83638889, "num_input_tokens_seen": 230035515, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.26733398, "step": 10662, "time_per_iteration": 2.6910316944122314 }, { "auxiliary_loss_clip": 0.01317689, "auxiliary_loss_mlp": 0.00303787, "balance_loss_clip": 1.09084582, "balance_loss_mlp": 0.27708453, "epoch": 0.641094243198557, "flos": 27046193028480.0, "grad_norm": 4.2612103985848, "language_loss": 0.76433611, "learning_rate": 1.2056291590400644e-06, "loss": 0.78055084, "num_input_tokens_seen": 230054355, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.26708984, "step": 10663, "time_per_iteration": 4.1028971672058105 }, { "auxiliary_loss_clip": 0.01356595, "auxiliary_loss_mlp": 0.00312872, "balance_loss_clip": 1.11673951, "balance_loss_mlp": 0.28149593, "epoch": 0.641154366451225, "flos": 25374551932800.0, "grad_norm": 107.64854643305142, "language_loss": 0.80389732, "learning_rate": 1.205271750169389e-06, "loss": 0.82059199, "num_input_tokens_seen": 230074605, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.3137207, "step": 10664, "time_per_iteration": 2.718365430831909 }, { "auxiliary_loss_clip": 0.0133628, "auxiliary_loss_mlp": 0.002924, "balance_loss_clip": 1.1003015, "balance_loss_mlp": 0.26251382, "epoch": 0.6412144897038929, "flos": 25153342024320.0, "grad_norm": 19.679797835954265, "language_loss": 0.72053957, "learning_rate": 1.2049143714348881e-06, "loss": 0.73682636, "num_input_tokens_seen": 230093820, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.29870605, "step": 10665, "time_per_iteration": 2.710953712463379 }, { "auxiliary_loss_clip": 0.01314872, "auxiliary_loss_mlp": 0.00316425, "balance_loss_clip": 1.08849788, "balance_loss_mlp": 0.28891134, "epoch": 0.641274612956561, "flos": 23440762402560.0, "grad_norm": 32.9218329751044, "language_loss": 0.7088744, "learning_rate": 1.2045570228501145e-06, "loss": 0.72518742, "num_input_tokens_seen": 230114285, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.27514648, "step": 10666, "time_per_iteration": 2.6887588500976562 }, { "auxiliary_loss_clip": 0.01356908, "auxiliary_loss_mlp": 0.00298794, "balance_loss_clip": 1.10984373, "balance_loss_mlp": 0.26923043, "epoch": 0.6413347362092289, "flos": 19427493778560.0, "grad_norm": 772.6745325066196, "language_loss": 0.790196, "learning_rate": 1.2041997044286176e-06, "loss": 0.80675298, "num_input_tokens_seen": 230132760, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.2956543, "step": 10667, "time_per_iteration": 2.683084487915039 }, { "auxiliary_loss_clip": 0.01379603, "auxiliary_loss_mlp": 0.00312841, "balance_loss_clip": 1.12442875, "balance_loss_mlp": 0.27996284, "epoch": 0.6413948594618969, "flos": 17196578945280.0, "grad_norm": 15.20963880344858, "language_loss": 0.90221524, "learning_rate": 1.2038424161839484e-06, "loss": 0.91913962, "num_input_tokens_seen": 230149690, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.32861328, "step": 10668, "time_per_iteration": 2.597987413406372 }, { "auxiliary_loss_clip": 0.01337505, "auxiliary_loss_mlp": 0.00327794, "balance_loss_clip": 1.10466564, "balance_loss_mlp": 0.29851598, "epoch": 0.6414549827145648, "flos": 22269787027200.0, "grad_norm": 23.97061313322171, "language_loss": 0.75147891, "learning_rate": 1.2034851581296544e-06, "loss": 0.76813185, "num_input_tokens_seen": 230166950, "router_z_loss_clip": 2.32714844, "router_z_loss_mlp": 0.29272461, "step": 10669, "time_per_iteration": 2.6343696117401123 }, { "auxiliary_loss_clip": 0.01359775, "auxiliary_loss_mlp": 0.00290798, "balance_loss_clip": 1.11276102, "balance_loss_mlp": 0.25945795, "epoch": 0.6415151059672328, "flos": 19640192163840.0, "grad_norm": 11.954597199552616, "language_loss": 0.86773574, "learning_rate": 1.2031279302792825e-06, "loss": 0.88424146, "num_input_tokens_seen": 230184785, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.31347656, "step": 10670, "time_per_iteration": 2.6281027793884277 }, { "auxiliary_loss_clip": 0.01361802, "auxiliary_loss_mlp": 0.00315907, "balance_loss_clip": 1.11736345, "balance_loss_mlp": 0.28491294, "epoch": 0.6415752292199008, "flos": 14865833237760.0, "grad_norm": 8.292064065908924, "language_loss": 0.98550797, "learning_rate": 1.20277073264638e-06, "loss": 1.002285, "num_input_tokens_seen": 230201385, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.30981445, "step": 10671, "time_per_iteration": 2.6256699562072754 }, { "auxiliary_loss_clip": 0.01341935, "auxiliary_loss_mlp": 0.00288512, "balance_loss_clip": 1.11278129, "balance_loss_mlp": 0.26176143, "epoch": 0.6416353524725688, "flos": 13735580906880.0, "grad_norm": 29.76189061889072, "language_loss": 0.76253045, "learning_rate": 1.2024135652444907e-06, "loss": 0.77883488, "num_input_tokens_seen": 230220380, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.26745605, "step": 10672, "time_per_iteration": 2.6550514698028564 }, { "auxiliary_loss_clip": 0.01385912, "auxiliary_loss_mlp": 0.00334315, "balance_loss_clip": 1.12602019, "balance_loss_mlp": 0.30253413, "epoch": 0.6416954757252368, "flos": 24534924543360.0, "grad_norm": 502.35399592939575, "language_loss": 0.84440792, "learning_rate": 1.2020564280871593e-06, "loss": 0.86161017, "num_input_tokens_seen": 230239845, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.31787109, "step": 10673, "time_per_iteration": 2.6766021251678467 }, { "auxiliary_loss_clip": 0.01328113, "auxiliary_loss_mlp": 0.0033996, "balance_loss_clip": 1.09643078, "balance_loss_mlp": 0.31011045, "epoch": 0.6417555989779047, "flos": 27710002321920.0, "grad_norm": 20.479062424451495, "language_loss": 0.76618505, "learning_rate": 1.2016993211879283e-06, "loss": 0.78286582, "num_input_tokens_seen": 230262420, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.29833984, "step": 10674, "time_per_iteration": 2.7230372428894043 }, { "auxiliary_loss_clip": 0.01358943, "auxiliary_loss_mlp": 0.00316197, "balance_loss_clip": 1.11307144, "balance_loss_mlp": 0.28620407, "epoch": 0.6418157222305727, "flos": 20556632787840.0, "grad_norm": 2344.9367301071816, "language_loss": 0.76126474, "learning_rate": 1.201342244560338e-06, "loss": 0.77801615, "num_input_tokens_seen": 230279950, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.29968262, "step": 10675, "time_per_iteration": 2.6205568313598633 }, { "auxiliary_loss_clip": 0.01333198, "auxiliary_loss_mlp": 0.00301878, "balance_loss_clip": 1.10226107, "balance_loss_mlp": 0.27338701, "epoch": 0.6418758454832406, "flos": 22601530062720.0, "grad_norm": 58.47165484355713, "language_loss": 0.75011736, "learning_rate": 1.2009851982179307e-06, "loss": 0.76646817, "num_input_tokens_seen": 230299705, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.28515625, "step": 10676, "time_per_iteration": 2.7537074089050293 }, { "auxiliary_loss_clip": 0.01373542, "auxiliary_loss_mlp": 0.00334536, "balance_loss_clip": 1.12329364, "balance_loss_mlp": 0.30366105, "epoch": 0.6419359687359086, "flos": 27375098889600.0, "grad_norm": 163.52950400605167, "language_loss": 0.86886257, "learning_rate": 1.2006281821742446e-06, "loss": 0.88594341, "num_input_tokens_seen": 230320030, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.30859375, "step": 10677, "time_per_iteration": 2.6823160648345947 }, { "auxiliary_loss_clip": 0.01310617, "auxiliary_loss_mlp": 0.00123141, "balance_loss_clip": 1.15532196, "balance_loss_mlp": 0.11484366, "epoch": 0.6419960919885765, "flos": 67251924552960.0, "grad_norm": 0.7494537753268423, "language_loss": 0.59592843, "learning_rate": 1.200271196442818e-06, "loss": 0.61026603, "num_input_tokens_seen": 230381495, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.08300781, "step": 10678, "time_per_iteration": 3.2689905166625977 }, { "auxiliary_loss_clip": 0.01345558, "auxiliary_loss_mlp": 0.00324342, "balance_loss_clip": 1.11296332, "balance_loss_mlp": 0.29581547, "epoch": 0.6420562152412446, "flos": 19901873721600.0, "grad_norm": 17.8297637875541, "language_loss": 0.75319016, "learning_rate": 1.1999142410371875e-06, "loss": 0.76988918, "num_input_tokens_seen": 230401385, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.28503418, "step": 10679, "time_per_iteration": 2.685029983520508 }, { "auxiliary_loss_clip": 0.01331891, "auxiliary_loss_mlp": 0.00327855, "balance_loss_clip": 1.09549677, "balance_loss_mlp": 0.29881543, "epoch": 0.6421163384939125, "flos": 24790177566720.0, "grad_norm": 11.902538981217267, "language_loss": 0.81772494, "learning_rate": 1.1995573159708897e-06, "loss": 0.83432245, "num_input_tokens_seen": 230421340, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.29077148, "step": 10680, "time_per_iteration": 2.6739790439605713 }, { "auxiliary_loss_clip": 0.01342985, "auxiliary_loss_mlp": 0.00325421, "balance_loss_clip": 1.10704935, "balance_loss_mlp": 0.29642966, "epoch": 0.6421764617465805, "flos": 25592816926080.0, "grad_norm": 19.722652260014982, "language_loss": 0.76807952, "learning_rate": 1.1992004212574582e-06, "loss": 0.78476357, "num_input_tokens_seen": 230441270, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.28955078, "step": 10681, "time_per_iteration": 2.6980271339416504 }, { "auxiliary_loss_clip": 0.01327806, "auxiliary_loss_mlp": 0.00298586, "balance_loss_clip": 1.0939095, "balance_loss_mlp": 0.27036902, "epoch": 0.6422365849992484, "flos": 14134727813760.0, "grad_norm": 17.78720700960706, "language_loss": 0.8271836, "learning_rate": 1.198843556910427e-06, "loss": 0.84344745, "num_input_tokens_seen": 230457455, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.28234863, "step": 10682, "time_per_iteration": 2.6231837272644043 }, { "auxiliary_loss_clip": 0.01342895, "auxiliary_loss_mlp": 0.00327691, "balance_loss_clip": 1.10734749, "balance_loss_mlp": 0.29934281, "epoch": 0.6422967082519164, "flos": 22383911514240.0, "grad_norm": 13.631304141263522, "language_loss": 0.8443259, "learning_rate": 1.1984867229433287e-06, "loss": 0.86103171, "num_input_tokens_seen": 230478955, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.28369141, "step": 10683, "time_per_iteration": 2.7008073329925537 }, { "auxiliary_loss_clip": 0.01336205, "auxiliary_loss_mlp": 0.0031586, "balance_loss_clip": 1.09681129, "balance_loss_mlp": 0.28612953, "epoch": 0.6423568315045844, "flos": 14647927380480.0, "grad_norm": 23.744220296617364, "language_loss": 0.77394795, "learning_rate": 1.1981299193696941e-06, "loss": 0.79046863, "num_input_tokens_seen": 230496425, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.29736328, "step": 10684, "time_per_iteration": 2.6368422508239746 }, { "auxiliary_loss_clip": 0.01346603, "auxiliary_loss_mlp": 0.00307142, "balance_loss_clip": 1.10610342, "balance_loss_mlp": 0.27804339, "epoch": 0.6424169547572524, "flos": 26833925606400.0, "grad_norm": 97337.47497438289, "language_loss": 0.80968869, "learning_rate": 1.1977731462030533e-06, "loss": 0.82622617, "num_input_tokens_seen": 230516245, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.29101562, "step": 10685, "time_per_iteration": 2.7682058811187744 }, { "auxiliary_loss_clip": 0.01325487, "auxiliary_loss_mlp": 0.0030697, "balance_loss_clip": 1.09484315, "balance_loss_mlp": 0.27790701, "epoch": 0.6424770780099204, "flos": 22707430335360.0, "grad_norm": 18.511401761047868, "language_loss": 0.81843221, "learning_rate": 1.197416403456935e-06, "loss": 0.83475679, "num_input_tokens_seen": 230534745, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.29052734, "step": 10686, "time_per_iteration": 2.661789655685425 }, { "auxiliary_loss_clip": 0.01353426, "auxiliary_loss_mlp": 0.00301887, "balance_loss_clip": 1.11218619, "balance_loss_mlp": 0.27079698, "epoch": 0.6425372012625883, "flos": 28469512425600.0, "grad_norm": 12.579080959748042, "language_loss": 0.79340529, "learning_rate": 1.197059691144867e-06, "loss": 0.8099584, "num_input_tokens_seen": 230555895, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.31054688, "step": 10687, "time_per_iteration": 2.746061325073242 }, { "auxiliary_loss_clip": 0.01316961, "auxiliary_loss_mlp": 0.00312147, "balance_loss_clip": 1.08382881, "balance_loss_mlp": 0.28327501, "epoch": 0.6425973245152563, "flos": 29351694453120.0, "grad_norm": 27.32791772907604, "language_loss": 0.74618113, "learning_rate": 1.1967030092803767e-06, "loss": 0.76247221, "num_input_tokens_seen": 230577460, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.28857422, "step": 10688, "time_per_iteration": 2.7221336364746094 }, { "auxiliary_loss_clip": 0.01321772, "auxiliary_loss_mlp": 0.00312687, "balance_loss_clip": 1.08904886, "balance_loss_mlp": 0.28485137, "epoch": 0.6426574477679242, "flos": 16430388912000.0, "grad_norm": 5.411239791948602, "language_loss": 0.81101358, "learning_rate": 1.1963463578769876e-06, "loss": 0.82735813, "num_input_tokens_seen": 230595030, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.27880859, "step": 10689, "time_per_iteration": 2.659327745437622 }, { "auxiliary_loss_clip": 0.01344541, "auxiliary_loss_mlp": 0.00330248, "balance_loss_clip": 1.10644925, "balance_loss_mlp": 0.30032659, "epoch": 0.6427175710205922, "flos": 21835914647040.0, "grad_norm": 7.017430681310821, "language_loss": 0.79712224, "learning_rate": 1.195989736948226e-06, "loss": 0.81387013, "num_input_tokens_seen": 230615135, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.29931641, "step": 10690, "time_per_iteration": 2.6815927028656006 }, { "auxiliary_loss_clip": 0.01324978, "auxiliary_loss_mlp": 0.00320252, "balance_loss_clip": 1.09327841, "balance_loss_mlp": 0.29173732, "epoch": 0.6427776942732601, "flos": 17786627660160.0, "grad_norm": 13.7320593810427, "language_loss": 0.82946026, "learning_rate": 1.1956331465076143e-06, "loss": 0.84591258, "num_input_tokens_seen": 230631965, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.28515625, "step": 10691, "time_per_iteration": 2.6586573123931885 }, { "auxiliary_loss_clip": 0.013463, "auxiliary_loss_mlp": 0.00311266, "balance_loss_clip": 1.1048671, "balance_loss_mlp": 0.28173751, "epoch": 0.6428378175259282, "flos": 15085893911040.0, "grad_norm": 5.307170341695963, "language_loss": 0.83715361, "learning_rate": 1.1952765865686738e-06, "loss": 0.85372925, "num_input_tokens_seen": 230649565, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.29504395, "step": 10692, "time_per_iteration": 2.624025821685791 }, { "auxiliary_loss_clip": 0.01331775, "auxiliary_loss_mlp": 0.00314227, "balance_loss_clip": 1.09623003, "balance_loss_mlp": 0.28618857, "epoch": 0.6428979407785961, "flos": 23841776816640.0, "grad_norm": 79.61309968870607, "language_loss": 0.71691859, "learning_rate": 1.1949200571449263e-06, "loss": 0.73337859, "num_input_tokens_seen": 230669265, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.27990723, "step": 10693, "time_per_iteration": 2.6710762977600098 }, { "auxiliary_loss_clip": 0.0132488, "auxiliary_loss_mlp": 0.00315287, "balance_loss_clip": 1.08703244, "balance_loss_mlp": 0.28538904, "epoch": 0.6429580640312641, "flos": 32926852892160.0, "grad_norm": 46.853845247639526, "language_loss": 0.71183372, "learning_rate": 1.1945635582498903e-06, "loss": 0.72823542, "num_input_tokens_seen": 230690575, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.29858398, "step": 10694, "time_per_iteration": 2.765357255935669 }, { "auxiliary_loss_clip": 0.0130019, "auxiliary_loss_mlp": 0.00297019, "balance_loss_clip": 1.07420921, "balance_loss_mlp": 0.2698276, "epoch": 0.643018187283932, "flos": 21068359896960.0, "grad_norm": 3.252992517307486, "language_loss": 0.85806751, "learning_rate": 1.1942070898970853e-06, "loss": 0.87403965, "num_input_tokens_seen": 230709420, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.27197266, "step": 10695, "time_per_iteration": 2.649407148361206 }, { "auxiliary_loss_clip": 0.0131022, "auxiliary_loss_mlp": 0.00357152, "balance_loss_clip": 1.07827497, "balance_loss_mlp": 0.3268255, "epoch": 0.6430783105366, "flos": 26724649455360.0, "grad_norm": 27.73559592309693, "language_loss": 0.79991156, "learning_rate": 1.1938506521000285e-06, "loss": 0.8165853, "num_input_tokens_seen": 230729350, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.30322266, "step": 10696, "time_per_iteration": 2.6948697566986084 }, { "auxiliary_loss_clip": 0.01323345, "auxiliary_loss_mlp": 0.00298418, "balance_loss_clip": 1.09383464, "balance_loss_mlp": 0.27144074, "epoch": 0.643138433789268, "flos": 23696841438720.0, "grad_norm": 3.0206054369893245, "language_loss": 0.81609833, "learning_rate": 1.1934942448722347e-06, "loss": 0.83231592, "num_input_tokens_seen": 230749220, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.26989746, "step": 10697, "time_per_iteration": 4.084696054458618 }, { "auxiliary_loss_clip": 0.01294932, "auxiliary_loss_mlp": 0.00317092, "balance_loss_clip": 1.07026029, "balance_loss_mlp": 0.28944701, "epoch": 0.643198557041936, "flos": 34202184255360.0, "grad_norm": 600.4021243228092, "language_loss": 0.73032963, "learning_rate": 1.1931378682272208e-06, "loss": 0.74644983, "num_input_tokens_seen": 230770245, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.27600098, "step": 10698, "time_per_iteration": 4.19050145149231 }, { "auxiliary_loss_clip": 0.01315527, "auxiliary_loss_mlp": 0.0010435, "balance_loss_clip": 1.16339445, "balance_loss_mlp": 0.09543266, "epoch": 0.643258680294604, "flos": 67626473621760.0, "grad_norm": 0.812920265420946, "language_loss": 0.63186079, "learning_rate": 1.1927815221784996e-06, "loss": 0.64605957, "num_input_tokens_seen": 230837030, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.08935547, "step": 10699, "time_per_iteration": 3.1632678508758545 }, { "auxiliary_loss_clip": 0.01302956, "auxiliary_loss_mlp": 0.00324988, "balance_loss_clip": 1.08118749, "balance_loss_mlp": 0.29585317, "epoch": 0.6433188035472719, "flos": 25185984508800.0, "grad_norm": 40.85415054257934, "language_loss": 0.74683642, "learning_rate": 1.1924252067395838e-06, "loss": 0.76311582, "num_input_tokens_seen": 230856845, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.29125977, "step": 10700, "time_per_iteration": 2.6777923107147217 }, { "auxiliary_loss_clip": 0.01329664, "auxiliary_loss_mlp": 0.00316409, "balance_loss_clip": 1.09267402, "balance_loss_mlp": 0.28810903, "epoch": 0.6433789267999399, "flos": 24973573432320.0, "grad_norm": 50.56559278397293, "language_loss": 0.80156291, "learning_rate": 1.1920689219239855e-06, "loss": 0.81802368, "num_input_tokens_seen": 230878785, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.28295898, "step": 10701, "time_per_iteration": 4.231788635253906 }, { "auxiliary_loss_clip": 0.01330371, "auxiliary_loss_mlp": 0.00323912, "balance_loss_clip": 1.09410453, "balance_loss_mlp": 0.29399049, "epoch": 0.6434390500526078, "flos": 17566028282880.0, "grad_norm": 4.97021459977323, "language_loss": 0.92045039, "learning_rate": 1.1917126677452144e-06, "loss": 0.93699324, "num_input_tokens_seen": 230895445, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.29907227, "step": 10702, "time_per_iteration": 2.6464922428131104 }, { "auxiliary_loss_clip": 0.01313446, "auxiliary_loss_mlp": 0.00302305, "balance_loss_clip": 1.0835762, "balance_loss_mlp": 0.27295554, "epoch": 0.6434991733052758, "flos": 20843594542080.0, "grad_norm": 32.39803082886074, "language_loss": 0.80850756, "learning_rate": 1.1913564442167798e-06, "loss": 0.82466507, "num_input_tokens_seen": 230911375, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.29370117, "step": 10703, "time_per_iteration": 2.6282846927642822 }, { "auxiliary_loss_clip": 0.01315776, "auxiliary_loss_mlp": 0.00115334, "balance_loss_clip": 1.16131842, "balance_loss_mlp": 0.10708441, "epoch": 0.6435592965579437, "flos": 66094596345600.0, "grad_norm": 0.680218486158399, "language_loss": 0.53896624, "learning_rate": 1.1910002513521898e-06, "loss": 0.55327737, "num_input_tokens_seen": 230975990, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.08251953, "step": 10704, "time_per_iteration": 3.201376438140869 }, { "auxiliary_loss_clip": 0.01310034, "auxiliary_loss_mlp": 0.00306732, "balance_loss_clip": 1.07935798, "balance_loss_mlp": 0.28006503, "epoch": 0.6436194198106118, "flos": 23768842250880.0, "grad_norm": 17.30830626801004, "language_loss": 0.84590888, "learning_rate": 1.1906440891649519e-06, "loss": 0.86207646, "num_input_tokens_seen": 230997110, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.26672363, "step": 10705, "time_per_iteration": 4.0692055225372314 }, { "auxiliary_loss_clip": 0.01311479, "auxiliary_loss_mlp": 0.0029741, "balance_loss_clip": 1.08359408, "balance_loss_mlp": 0.2708621, "epoch": 0.6436795430632797, "flos": 20230312705920.0, "grad_norm": 44.85573731157887, "language_loss": 0.85418254, "learning_rate": 1.1902879576685708e-06, "loss": 0.87027144, "num_input_tokens_seen": 231015590, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.26550293, "step": 10706, "time_per_iteration": 2.6756386756896973 }, { "auxiliary_loss_clip": 0.01318136, "auxiliary_loss_mlp": 0.0029887, "balance_loss_clip": 1.08474374, "balance_loss_mlp": 0.27076039, "epoch": 0.6437396663159477, "flos": 20301846641280.0, "grad_norm": 6.7825924808703695, "language_loss": 0.88451457, "learning_rate": 1.1899318568765518e-06, "loss": 0.90068471, "num_input_tokens_seen": 231033800, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.28088379, "step": 10707, "time_per_iteration": 2.671919107437134 }, { "auxiliary_loss_clip": 0.01312046, "auxiliary_loss_mlp": 0.00315569, "balance_loss_clip": 1.08333206, "balance_loss_mlp": 0.28548014, "epoch": 0.6437997895686156, "flos": 23878585278720.0, "grad_norm": 2.2188549014854835, "language_loss": 0.9100771, "learning_rate": 1.1895757868023978e-06, "loss": 0.92635322, "num_input_tokens_seen": 231053160, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.30078125, "step": 10708, "time_per_iteration": 2.7166755199432373 }, { "auxiliary_loss_clip": 0.01351728, "auxiliary_loss_mlp": 0.00286186, "balance_loss_clip": 1.10238707, "balance_loss_mlp": 0.25535846, "epoch": 0.6438599128212836, "flos": 18989275852800.0, "grad_norm": 20.387078870458232, "language_loss": 0.76430678, "learning_rate": 1.1892197474596106e-06, "loss": 0.7806859, "num_input_tokens_seen": 231069470, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.30834961, "step": 10709, "time_per_iteration": 2.720517873764038 }, { "auxiliary_loss_clip": 0.01309742, "auxiliary_loss_mlp": 0.00306567, "balance_loss_clip": 1.08307099, "balance_loss_mlp": 0.2794705, "epoch": 0.6439200360739517, "flos": 24096347481600.0, "grad_norm": 27.306571572081623, "language_loss": 0.8660053, "learning_rate": 1.1888637388616929e-06, "loss": 0.88216841, "num_input_tokens_seen": 231088205, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.27087402, "step": 10710, "time_per_iteration": 2.776304006576538 }, { "auxiliary_loss_clip": 0.01304962, "auxiliary_loss_mlp": 0.00293075, "balance_loss_clip": 1.07817686, "balance_loss_mlp": 0.26398745, "epoch": 0.6439801593266196, "flos": 31902141697920.0, "grad_norm": 1406.4648131354552, "language_loss": 0.73065841, "learning_rate": 1.1885077610221425e-06, "loss": 0.74663877, "num_input_tokens_seen": 231107850, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.29077148, "step": 10711, "time_per_iteration": 2.7382729053497314 }, { "auxiliary_loss_clip": 0.01318632, "auxiliary_loss_mlp": 0.00305511, "balance_loss_clip": 1.08457756, "balance_loss_mlp": 0.27704394, "epoch": 0.6440402825792876, "flos": 27125879351040.0, "grad_norm": 13.622845220683967, "language_loss": 0.85337508, "learning_rate": 1.1881518139544597e-06, "loss": 0.86961645, "num_input_tokens_seen": 231127200, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.28479004, "step": 10712, "time_per_iteration": 2.690814971923828 }, { "auxiliary_loss_clip": 0.01312156, "auxiliary_loss_mlp": 0.00336387, "balance_loss_clip": 1.0783546, "balance_loss_mlp": 0.30770504, "epoch": 0.6441004058319555, "flos": 20667704618880.0, "grad_norm": 32.200878243272435, "language_loss": 0.88778603, "learning_rate": 1.1877958976721417e-06, "loss": 0.90427148, "num_input_tokens_seen": 231146360, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.28686523, "step": 10713, "time_per_iteration": 2.6412696838378906 }, { "auxiliary_loss_clip": 0.01326011, "auxiliary_loss_mlp": 0.00276101, "balance_loss_clip": 1.09427571, "balance_loss_mlp": 0.24771738, "epoch": 0.6441605290846235, "flos": 26026006947840.0, "grad_norm": 8.234217640409533, "language_loss": 0.8437326, "learning_rate": 1.187440012188684e-06, "loss": 0.85975367, "num_input_tokens_seen": 231168350, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.28356934, "step": 10714, "time_per_iteration": 2.67676043510437 }, { "auxiliary_loss_clip": 0.01306295, "auxiliary_loss_mlp": 0.00296083, "balance_loss_clip": 1.07961869, "balance_loss_mlp": 0.26767525, "epoch": 0.6442206523372914, "flos": 24899489631360.0, "grad_norm": 22421.676018611077, "language_loss": 0.86433238, "learning_rate": 1.187084157517583e-06, "loss": 0.88035619, "num_input_tokens_seen": 231188385, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.28430176, "step": 10715, "time_per_iteration": 2.684894561767578 }, { "auxiliary_loss_clip": 0.01319914, "auxiliary_loss_mlp": 0.00275578, "balance_loss_clip": 1.08339787, "balance_loss_mlp": 0.24938796, "epoch": 0.6442807755899594, "flos": 25156322853120.0, "grad_norm": 166.13733262170632, "language_loss": 0.88165456, "learning_rate": 1.186728333672332e-06, "loss": 0.89760947, "num_input_tokens_seen": 231209880, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.26159668, "step": 10716, "time_per_iteration": 2.729490280151367 }, { "auxiliary_loss_clip": 0.01304743, "auxiliary_loss_mlp": 0.00306356, "balance_loss_clip": 1.07291889, "balance_loss_mlp": 0.27520663, "epoch": 0.6443408988426274, "flos": 27344503480320.0, "grad_norm": 8.449046098868852, "language_loss": 0.84509248, "learning_rate": 1.186372540666424e-06, "loss": 0.86120343, "num_input_tokens_seen": 231230765, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.31152344, "step": 10717, "time_per_iteration": 2.725562334060669 }, { "auxiliary_loss_clip": 0.01307052, "auxiliary_loss_mlp": 0.00285619, "balance_loss_clip": 1.08064222, "balance_loss_mlp": 0.25781959, "epoch": 0.6444010220952954, "flos": 27928339142400.0, "grad_norm": 21.734757054577855, "language_loss": 0.76128006, "learning_rate": 1.1860167785133513e-06, "loss": 0.77720678, "num_input_tokens_seen": 231252350, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.2779541, "step": 10718, "time_per_iteration": 2.7151403427124023 }, { "auxiliary_loss_clip": 0.01304842, "auxiliary_loss_mlp": 0.0016248, "balance_loss_clip": 1.15339351, "balance_loss_mlp": 0.15327679, "epoch": 0.6444611453479633, "flos": 71215024855680.0, "grad_norm": 0.7811786789954092, "language_loss": 0.49165267, "learning_rate": 1.185661047226603e-06, "loss": 0.5063259, "num_input_tokens_seen": 231313865, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.09179688, "step": 10719, "time_per_iteration": 3.282759189605713 }, { "auxiliary_loss_clip": 0.01313111, "auxiliary_loss_mlp": 0.00281955, "balance_loss_clip": 1.07995832, "balance_loss_mlp": 0.25466788, "epoch": 0.6445212686006313, "flos": 22705131864960.0, "grad_norm": 14.680650291142943, "language_loss": 0.85627401, "learning_rate": 1.18530534681967e-06, "loss": 0.87222469, "num_input_tokens_seen": 231331710, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.27294922, "step": 10720, "time_per_iteration": 2.6423840522766113 }, { "auxiliary_loss_clip": 0.01326735, "auxiliary_loss_mlp": 0.00318832, "balance_loss_clip": 1.09560108, "balance_loss_mlp": 0.28910115, "epoch": 0.6445813918532992, "flos": 21178821196800.0, "grad_norm": 84.02311705959364, "language_loss": 0.84558862, "learning_rate": 1.18494967730604e-06, "loss": 0.86204427, "num_input_tokens_seen": 231350705, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.29711914, "step": 10721, "time_per_iteration": 2.800558090209961 }, { "auxiliary_loss_clip": 0.01317548, "auxiliary_loss_mlp": 0.00286394, "balance_loss_clip": 1.08352542, "balance_loss_mlp": 0.25675827, "epoch": 0.6446415151059672, "flos": 25191910252800.0, "grad_norm": 40.4678420022837, "language_loss": 0.79766321, "learning_rate": 1.1845940386991995e-06, "loss": 0.81370258, "num_input_tokens_seen": 231369550, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.29614258, "step": 10722, "time_per_iteration": 2.7468130588531494 }, { "auxiliary_loss_clip": 0.01298919, "auxiliary_loss_mlp": 0.00294978, "balance_loss_clip": 1.07498002, "balance_loss_mlp": 0.26868093, "epoch": 0.6447016383586353, "flos": 25302227898240.0, "grad_norm": 39.612232102485095, "language_loss": 0.85095108, "learning_rate": 1.184238431012635e-06, "loss": 0.86689001, "num_input_tokens_seen": 231389285, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.26281738, "step": 10723, "time_per_iteration": 2.767415761947632 }, { "auxiliary_loss_clip": 0.01326847, "auxiliary_loss_mlp": 0.00297835, "balance_loss_clip": 1.09121251, "balance_loss_mlp": 0.2669833, "epoch": 0.6447617616113032, "flos": 27703142824320.0, "grad_norm": 12.66167836315832, "language_loss": 0.64569581, "learning_rate": 1.1838828542598312e-06, "loss": 0.6619426, "num_input_tokens_seen": 231408820, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.30834961, "step": 10724, "time_per_iteration": 2.731027603149414 }, { "auxiliary_loss_clip": 0.01291514, "auxiliary_loss_mlp": 0.00262359, "balance_loss_clip": 1.07273698, "balance_loss_mlp": 0.23750389, "epoch": 0.6448218848639712, "flos": 23039101543680.0, "grad_norm": 9.541597324522032, "language_loss": 0.9066236, "learning_rate": 1.183527308454271e-06, "loss": 0.92216229, "num_input_tokens_seen": 231428100, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.2487793, "step": 10725, "time_per_iteration": 2.7188286781311035 }, { "auxiliary_loss_clip": 0.01289461, "auxiliary_loss_mlp": 0.00295542, "balance_loss_clip": 1.06460261, "balance_loss_mlp": 0.26669306, "epoch": 0.6448820081166391, "flos": 24496104919680.0, "grad_norm": 5.967770022035746, "language_loss": 0.87847137, "learning_rate": 1.1831717936094368e-06, "loss": 0.89432138, "num_input_tokens_seen": 231445810, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.28857422, "step": 10726, "time_per_iteration": 2.675412178039551 }, { "auxiliary_loss_clip": 0.01307411, "auxiliary_loss_mlp": 0.00295177, "balance_loss_clip": 1.0765748, "balance_loss_mlp": 0.26697236, "epoch": 0.6449421313693071, "flos": 22419283432320.0, "grad_norm": 3.493504970105384, "language_loss": 0.90047497, "learning_rate": 1.1828163097388108e-06, "loss": 0.91650087, "num_input_tokens_seen": 231463570, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.28210449, "step": 10727, "time_per_iteration": 2.671908140182495 }, { "auxiliary_loss_clip": 0.01329109, "auxiliary_loss_mlp": 0.00307715, "balance_loss_clip": 1.09151781, "balance_loss_mlp": 0.27779344, "epoch": 0.645002254621975, "flos": 20225715765120.0, "grad_norm": 4.868185748158972, "language_loss": 0.87236303, "learning_rate": 1.1824608568558717e-06, "loss": 0.88873124, "num_input_tokens_seen": 231482155, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.29931641, "step": 10728, "time_per_iteration": 2.6317687034606934 }, { "auxiliary_loss_clip": 0.0132198, "auxiliary_loss_mlp": 0.00304138, "balance_loss_clip": 1.08487439, "balance_loss_mlp": 0.27488363, "epoch": 0.645062377874643, "flos": 27855440490240.0, "grad_norm": 57.866097702824455, "language_loss": 0.83071196, "learning_rate": 1.1821054349740988e-06, "loss": 0.84697318, "num_input_tokens_seen": 231502465, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.29296875, "step": 10729, "time_per_iteration": 2.726339101791382 }, { "auxiliary_loss_clip": 0.01306367, "auxiliary_loss_mlp": 0.00306995, "balance_loss_clip": 1.0784297, "balance_loss_mlp": 0.2785995, "epoch": 0.645122501127311, "flos": 25301509626240.0, "grad_norm": 12.052163546143952, "language_loss": 0.73962629, "learning_rate": 1.1817500441069706e-06, "loss": 0.75575995, "num_input_tokens_seen": 231522740, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.28393555, "step": 10730, "time_per_iteration": 2.667731761932373 }, { "auxiliary_loss_clip": 0.01299718, "auxiliary_loss_mlp": 0.00290916, "balance_loss_clip": 1.07361937, "balance_loss_mlp": 0.26271072, "epoch": 0.645182624379979, "flos": 18807352444800.0, "grad_norm": 75.88892762581303, "language_loss": 0.71826249, "learning_rate": 1.1813946842679614e-06, "loss": 0.73416883, "num_input_tokens_seen": 231542050, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.28173828, "step": 10731, "time_per_iteration": 2.669257164001465 }, { "auxiliary_loss_clip": 0.01303376, "auxiliary_loss_mlp": 0.0030065, "balance_loss_clip": 1.07697392, "balance_loss_mlp": 0.27387595, "epoch": 0.6452427476326469, "flos": 18332182402560.0, "grad_norm": 9.817721914389624, "language_loss": 0.74866617, "learning_rate": 1.1810393554705492e-06, "loss": 0.76470637, "num_input_tokens_seen": 231560380, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26733398, "step": 10732, "time_per_iteration": 2.7008848190307617 }, { "auxiliary_loss_clip": 0.01315408, "auxiliary_loss_mlp": 0.00302713, "balance_loss_clip": 1.08754039, "balance_loss_mlp": 0.27481776, "epoch": 0.6453028708853149, "flos": 22784746360320.0, "grad_norm": 23.12309009759477, "language_loss": 0.83241469, "learning_rate": 1.1806840577282055e-06, "loss": 0.84859586, "num_input_tokens_seen": 231580810, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.27905273, "step": 10733, "time_per_iteration": 2.6557767391204834 }, { "auxiliary_loss_clip": 0.01303108, "auxiliary_loss_mlp": 0.00320078, "balance_loss_clip": 1.07788813, "balance_loss_mlp": 0.29196835, "epoch": 0.6453629941379828, "flos": 23945989150080.0, "grad_norm": 17.085864774036413, "language_loss": 0.79946458, "learning_rate": 1.1803287910544048e-06, "loss": 0.81569648, "num_input_tokens_seen": 231600585, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.28088379, "step": 10734, "time_per_iteration": 2.658217430114746 }, { "auxiliary_loss_clip": 0.01305443, "auxiliary_loss_mlp": 0.00286994, "balance_loss_clip": 1.08627105, "balance_loss_mlp": 0.26093459, "epoch": 0.6454231173906508, "flos": 17676381841920.0, "grad_norm": 131.4420528248961, "language_loss": 0.80526853, "learning_rate": 1.1799735554626191e-06, "loss": 0.82119286, "num_input_tokens_seen": 231618765, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26062012, "step": 10735, "time_per_iteration": 2.6392030715942383 }, { "auxiliary_loss_clip": 0.01330969, "auxiliary_loss_mlp": 0.00281154, "balance_loss_clip": 1.09827232, "balance_loss_mlp": 0.25317585, "epoch": 0.6454832406433189, "flos": 23292774368640.0, "grad_norm": 20.825800911493026, "language_loss": 0.81851739, "learning_rate": 1.1796183509663176e-06, "loss": 0.8346386, "num_input_tokens_seen": 231638525, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.27966309, "step": 10736, "time_per_iteration": 2.7035329341888428 }, { "auxiliary_loss_clip": 0.01341724, "auxiliary_loss_mlp": 0.00340651, "balance_loss_clip": 1.10309303, "balance_loss_mlp": 0.31155241, "epoch": 0.6455433638959868, "flos": 20157198572160.0, "grad_norm": 9.252236520536625, "language_loss": 0.79894322, "learning_rate": 1.1792631775789708e-06, "loss": 0.81576693, "num_input_tokens_seen": 231656785, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.29077148, "step": 10737, "time_per_iteration": 2.662172794342041 }, { "auxiliary_loss_clip": 0.0127804, "auxiliary_loss_mlp": 0.00169682, "balance_loss_clip": 1.12661386, "balance_loss_mlp": 0.15971565, "epoch": 0.6456034871486548, "flos": 66532922012160.0, "grad_norm": 0.7816289395522237, "language_loss": 0.57614064, "learning_rate": 1.1789080353140464e-06, "loss": 0.59061778, "num_input_tokens_seen": 231719075, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.09960938, "step": 10738, "time_per_iteration": 3.222099781036377 }, { "auxiliary_loss_clip": 0.01326658, "auxiliary_loss_mlp": 0.00319654, "balance_loss_clip": 1.09726882, "balance_loss_mlp": 0.29217663, "epoch": 0.6456636104013227, "flos": 24206090509440.0, "grad_norm": 24.229133226154076, "language_loss": 0.81520712, "learning_rate": 1.1785529241850118e-06, "loss": 0.83167028, "num_input_tokens_seen": 231737810, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.27453613, "step": 10739, "time_per_iteration": 4.026391267776489 }, { "auxiliary_loss_clip": 0.01357341, "auxiliary_loss_mlp": 0.00279314, "balance_loss_clip": 1.11291504, "balance_loss_mlp": 0.25119245, "epoch": 0.6457237336539907, "flos": 23624086440960.0, "grad_norm": 24.529417613839975, "language_loss": 0.81073087, "learning_rate": 1.1781978442053324e-06, "loss": 0.82709742, "num_input_tokens_seen": 231756140, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.28112793, "step": 10740, "time_per_iteration": 4.098350524902344 }, { "auxiliary_loss_clip": 0.0124572, "auxiliary_loss_mlp": 0.00250186, "balance_loss_clip": 1.09107995, "balance_loss_mlp": 0.23616698, "epoch": 0.6457838569066586, "flos": 65846023251840.0, "grad_norm": 0.6904326731980418, "language_loss": 0.54570508, "learning_rate": 1.1778427953884733e-06, "loss": 0.56066406, "num_input_tokens_seen": 231823665, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.140625, "step": 10741, "time_per_iteration": 3.142732858657837 }, { "auxiliary_loss_clip": 0.01329505, "auxiliary_loss_mlp": 0.00329344, "balance_loss_clip": 1.10058069, "balance_loss_mlp": 0.30074573, "epoch": 0.6458439801593266, "flos": 22381972179840.0, "grad_norm": 2.403242501216037, "language_loss": 0.85374171, "learning_rate": 1.1774877777478977e-06, "loss": 0.87033015, "num_input_tokens_seen": 231844500, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.28625488, "step": 10742, "time_per_iteration": 2.6630959510803223 }, { "auxiliary_loss_clip": 0.01331913, "auxiliary_loss_mlp": 0.00302407, "balance_loss_clip": 1.10325646, "balance_loss_mlp": 0.27448821, "epoch": 0.6459041034119946, "flos": 24789243813120.0, "grad_norm": 3.6667474233441446, "language_loss": 0.870942, "learning_rate": 1.1771327912970678e-06, "loss": 0.88728517, "num_input_tokens_seen": 231864510, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.27941895, "step": 10743, "time_per_iteration": 4.064514398574829 }, { "auxiliary_loss_clip": 0.012994, "auxiliary_loss_mlp": 0.00315404, "balance_loss_clip": 1.07820904, "balance_loss_mlp": 0.2876049, "epoch": 0.6459642266646626, "flos": 18325358818560.0, "grad_norm": 60.936628852573364, "language_loss": 0.78146189, "learning_rate": 1.1767778360494453e-06, "loss": 0.79760993, "num_input_tokens_seen": 231881555, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.27770996, "step": 10744, "time_per_iteration": 2.698080539703369 }, { "auxiliary_loss_clip": 0.01307335, "auxiliary_loss_mlp": 0.00303401, "balance_loss_clip": 1.08324337, "balance_loss_mlp": 0.27487412, "epoch": 0.6460243499173305, "flos": 43581368891520.0, "grad_norm": 244.25563984337583, "language_loss": 0.74622309, "learning_rate": 1.1764229120184896e-06, "loss": 0.76233041, "num_input_tokens_seen": 231905945, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.28515625, "step": 10745, "time_per_iteration": 2.8522021770477295 }, { "auxiliary_loss_clip": 0.01328276, "auxiliary_loss_mlp": 0.00341423, "balance_loss_clip": 1.09755492, "balance_loss_mlp": 0.31033334, "epoch": 0.6460844731699985, "flos": 19244026085760.0, "grad_norm": 319.2883526141002, "language_loss": 0.83232874, "learning_rate": 1.1760680192176597e-06, "loss": 0.84902573, "num_input_tokens_seen": 231922535, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.31054688, "step": 10746, "time_per_iteration": 2.80941104888916 }, { "auxiliary_loss_clip": 0.01347739, "auxiliary_loss_mlp": 0.00308635, "balance_loss_clip": 1.11012089, "balance_loss_mlp": 0.28031093, "epoch": 0.6461445964226664, "flos": 27453348668160.0, "grad_norm": 6.191671461099192, "language_loss": 0.73541844, "learning_rate": 1.175713157660413e-06, "loss": 0.75198221, "num_input_tokens_seen": 231944800, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.2833252, "step": 10747, "time_per_iteration": 2.7193171977996826 }, { "auxiliary_loss_clip": 0.01341882, "auxiliary_loss_mlp": 0.00280693, "balance_loss_clip": 1.10703421, "balance_loss_mlp": 0.25440785, "epoch": 0.6462047196753344, "flos": 20295489934080.0, "grad_norm": 20.102376468234404, "language_loss": 0.76496315, "learning_rate": 1.1753583273602056e-06, "loss": 0.78118885, "num_input_tokens_seen": 231962970, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.26306152, "step": 10748, "time_per_iteration": 4.173394441604614 }, { "auxiliary_loss_clip": 0.0132423, "auxiliary_loss_mlp": 0.00321309, "balance_loss_clip": 1.09281838, "balance_loss_mlp": 0.29015988, "epoch": 0.6462648429280025, "flos": 22018340845440.0, "grad_norm": 7.855046563370563, "language_loss": 0.82895851, "learning_rate": 1.1750035283304937e-06, "loss": 0.84541392, "num_input_tokens_seen": 231981195, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.31164551, "step": 10749, "time_per_iteration": 2.6884281635284424 }, { "auxiliary_loss_clip": 0.01338155, "auxiliary_loss_mlp": 0.00308455, "balance_loss_clip": 1.10100746, "balance_loss_mlp": 0.2792486, "epoch": 0.6463249661806704, "flos": 27781141207680.0, "grad_norm": 15.7975072149449, "language_loss": 0.8390249, "learning_rate": 1.17464876058473e-06, "loss": 0.85549098, "num_input_tokens_seen": 232001735, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.29199219, "step": 10750, "time_per_iteration": 2.718506336212158 }, { "auxiliary_loss_clip": 0.01339825, "auxiliary_loss_mlp": 0.00338753, "balance_loss_clip": 1.1039772, "balance_loss_mlp": 0.30900997, "epoch": 0.6463850894333384, "flos": 22050588280320.0, "grad_norm": 8.429314550379159, "language_loss": 0.77104133, "learning_rate": 1.1742940241363683e-06, "loss": 0.78782713, "num_input_tokens_seen": 232019830, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.29748535, "step": 10751, "time_per_iteration": 2.7429609298706055 }, { "auxiliary_loss_clip": 0.01342395, "auxiliary_loss_mlp": 0.00325281, "balance_loss_clip": 1.10439014, "balance_loss_mlp": 0.2948468, "epoch": 0.6464452126860063, "flos": 21106245767040.0, "grad_norm": 17.47510365386952, "language_loss": 0.81949103, "learning_rate": 1.1739393189988604e-06, "loss": 0.83616781, "num_input_tokens_seen": 232039625, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.30432129, "step": 10752, "time_per_iteration": 2.738900661468506 }, { "auxiliary_loss_clip": 0.01354249, "auxiliary_loss_mlp": 0.00295482, "balance_loss_clip": 1.10984087, "balance_loss_mlp": 0.26789695, "epoch": 0.6465053359386743, "flos": 16028045694720.0, "grad_norm": 3.2119133803899858, "language_loss": 0.85521722, "learning_rate": 1.1735846451856554e-06, "loss": 0.87171447, "num_input_tokens_seen": 232055855, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.27587891, "step": 10753, "time_per_iteration": 2.5810434818267822 }, { "auxiliary_loss_clip": 0.01356084, "auxiliary_loss_mlp": 0.00275643, "balance_loss_clip": 1.11618018, "balance_loss_mlp": 0.24752162, "epoch": 0.6465654591913422, "flos": 23398674641280.0, "grad_norm": 14.851778734079934, "language_loss": 0.91777778, "learning_rate": 1.1732300027102041e-06, "loss": 0.93409508, "num_input_tokens_seen": 232073475, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.28137207, "step": 10754, "time_per_iteration": 2.6652534008026123 }, { "auxiliary_loss_clip": 0.01348586, "auxiliary_loss_mlp": 0.00306996, "balance_loss_clip": 1.11220014, "balance_loss_mlp": 0.27808744, "epoch": 0.6466255824440102, "flos": 15377273038080.0, "grad_norm": 784.7520789983366, "language_loss": 0.67307508, "learning_rate": 1.1728753915859541e-06, "loss": 0.68963087, "num_input_tokens_seen": 232091090, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.28894043, "step": 10755, "time_per_iteration": 2.6186342239379883 }, { "auxiliary_loss_clip": 0.01330873, "auxiliary_loss_mlp": 0.00304834, "balance_loss_clip": 1.09996843, "balance_loss_mlp": 0.27653384, "epoch": 0.6466857056966782, "flos": 16252846963200.0, "grad_norm": 91.06577232174001, "language_loss": 0.74805844, "learning_rate": 1.1725208118263518e-06, "loss": 0.7644155, "num_input_tokens_seen": 232107320, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.28295898, "step": 10756, "time_per_iteration": 2.6364352703094482 }, { "auxiliary_loss_clip": 0.01347428, "auxiliary_loss_mlp": 0.00291438, "balance_loss_clip": 1.10717773, "balance_loss_mlp": 0.26222014, "epoch": 0.6467458289493462, "flos": 21178246579200.0, "grad_norm": 325.8720967911381, "language_loss": 0.85229284, "learning_rate": 1.172166263444844e-06, "loss": 0.86868155, "num_input_tokens_seen": 232123930, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.29248047, "step": 10757, "time_per_iteration": 2.628664255142212 }, { "auxiliary_loss_clip": 0.01329231, "auxiliary_loss_mlp": 0.00282996, "balance_loss_clip": 1.10330987, "balance_loss_mlp": 0.25632867, "epoch": 0.6468059522020141, "flos": 17968299672960.0, "grad_norm": 121.10742290079726, "language_loss": 0.80957353, "learning_rate": 1.1718117464548734e-06, "loss": 0.82569587, "num_input_tokens_seen": 232142905, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26672363, "step": 10758, "time_per_iteration": 2.625394821166992 }, { "auxiliary_loss_clip": 0.01338379, "auxiliary_loss_mlp": 0.00288376, "balance_loss_clip": 1.10378814, "balance_loss_mlp": 0.26002765, "epoch": 0.6468660754546821, "flos": 17890157635200.0, "grad_norm": 8.687228669089507, "language_loss": 0.75754702, "learning_rate": 1.1714572608698845e-06, "loss": 0.77381456, "num_input_tokens_seen": 232162230, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.28356934, "step": 10759, "time_per_iteration": 2.6569833755493164 }, { "auxiliary_loss_clip": 0.01350751, "auxiliary_loss_mlp": 0.00315239, "balance_loss_clip": 1.11034489, "balance_loss_mlp": 0.28548473, "epoch": 0.64692619870735, "flos": 22600991358720.0, "grad_norm": 3.8941473003734868, "language_loss": 0.84276462, "learning_rate": 1.1711028067033197e-06, "loss": 0.85942453, "num_input_tokens_seen": 232182700, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.29772949, "step": 10760, "time_per_iteration": 2.687274932861328 }, { "auxiliary_loss_clip": 0.01312002, "auxiliary_loss_mlp": 0.0029163, "balance_loss_clip": 1.09101236, "balance_loss_mlp": 0.26514208, "epoch": 0.646986321960018, "flos": 49600786993920.0, "grad_norm": 2.2837612846846294, "language_loss": 0.71805936, "learning_rate": 1.1707483839686194e-06, "loss": 0.73409569, "num_input_tokens_seen": 232208235, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26489258, "step": 10761, "time_per_iteration": 2.905994415283203 }, { "auxiliary_loss_clip": 0.01324376, "auxiliary_loss_mlp": 0.00289977, "balance_loss_clip": 1.09917188, "balance_loss_mlp": 0.26309496, "epoch": 0.6470464452126861, "flos": 21908454163200.0, "grad_norm": 383.6544492556188, "language_loss": 0.7960043, "learning_rate": 1.1703939926792235e-06, "loss": 0.81214786, "num_input_tokens_seen": 232228720, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.2689209, "step": 10762, "time_per_iteration": 2.673093318939209 }, { "auxiliary_loss_clip": 0.01336074, "auxiliary_loss_mlp": 0.00269285, "balance_loss_clip": 1.09851921, "balance_loss_mlp": 0.24057984, "epoch": 0.647106568465354, "flos": 18106124158080.0, "grad_norm": 18.260874418798167, "language_loss": 0.89714921, "learning_rate": 1.1700396328485705e-06, "loss": 0.91320276, "num_input_tokens_seen": 232244655, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.28735352, "step": 10763, "time_per_iteration": 2.633495330810547 }, { "auxiliary_loss_clip": 0.01259553, "auxiliary_loss_mlp": 0.00142986, "balance_loss_clip": 1.10569191, "balance_loss_mlp": 0.13363951, "epoch": 0.647166691718022, "flos": 69480038125440.0, "grad_norm": 0.7135658666553623, "language_loss": 0.57263792, "learning_rate": 1.1696853044900978e-06, "loss": 0.58666331, "num_input_tokens_seen": 232308685, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.09326172, "step": 10764, "time_per_iteration": 3.382359504699707 }, { "auxiliary_loss_clip": 0.01344946, "auxiliary_loss_mlp": 0.00270749, "balance_loss_clip": 1.10814929, "balance_loss_mlp": 0.24401061, "epoch": 0.6472268149706899, "flos": 34095170661120.0, "grad_norm": 44.148347374971486, "language_loss": 0.68867326, "learning_rate": 1.1693310076172413e-06, "loss": 0.70483029, "num_input_tokens_seen": 232327520, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.26745605, "step": 10765, "time_per_iteration": 2.844805955886841 }, { "auxiliary_loss_clip": 0.01321875, "auxiliary_loss_mlp": 0.00254692, "balance_loss_clip": 1.09764957, "balance_loss_mlp": 0.23127964, "epoch": 0.6472869382233579, "flos": 28111232217600.0, "grad_norm": 32.363547897738556, "language_loss": 0.70425141, "learning_rate": 1.168976742243437e-06, "loss": 0.72001714, "num_input_tokens_seen": 232349025, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.23400879, "step": 10766, "time_per_iteration": 2.725289821624756 }, { "auxiliary_loss_clip": 0.01344679, "auxiliary_loss_mlp": 0.00286274, "balance_loss_clip": 1.11068082, "balance_loss_mlp": 0.25831988, "epoch": 0.6473470614760258, "flos": 22492146170880.0, "grad_norm": 15.20918143205395, "language_loss": 0.83187079, "learning_rate": 1.1686225083821174e-06, "loss": 0.84818035, "num_input_tokens_seen": 232367835, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.27941895, "step": 10767, "time_per_iteration": 2.6951723098754883 }, { "auxiliary_loss_clip": 0.01345116, "auxiliary_loss_mlp": 0.00291186, "balance_loss_clip": 1.10492575, "balance_loss_mlp": 0.26193228, "epoch": 0.6474071847286939, "flos": 14538938538240.0, "grad_norm": 19.975224066405758, "language_loss": 0.86490571, "learning_rate": 1.1682683060467153e-06, "loss": 0.88126868, "num_input_tokens_seen": 232385840, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.29211426, "step": 10768, "time_per_iteration": 2.611851453781128 }, { "auxiliary_loss_clip": 0.01334769, "auxiliary_loss_mlp": 0.00275231, "balance_loss_clip": 1.10100198, "balance_loss_mlp": 0.24831353, "epoch": 0.6474673079813618, "flos": 24098214988800.0, "grad_norm": 28.26879493342924, "language_loss": 0.77963531, "learning_rate": 1.167914135250663e-06, "loss": 0.7957353, "num_input_tokens_seen": 232406205, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.26904297, "step": 10769, "time_per_iteration": 2.733921766281128 }, { "auxiliary_loss_clip": 0.01320033, "auxiliary_loss_mlp": 0.00245359, "balance_loss_clip": 1.09681988, "balance_loss_mlp": 0.22185141, "epoch": 0.6475274312340298, "flos": 14976186796800.0, "grad_norm": 4.639272128220012, "language_loss": 0.81506139, "learning_rate": 1.1675599960073895e-06, "loss": 0.8307153, "num_input_tokens_seen": 232424995, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.23510742, "step": 10770, "time_per_iteration": 2.6413686275482178 }, { "auxiliary_loss_clip": 0.01345824, "auxiliary_loss_mlp": 0.0030139, "balance_loss_clip": 1.10437727, "balance_loss_mlp": 0.27020502, "epoch": 0.6475875544866977, "flos": 25045322849280.0, "grad_norm": 9.22970949790574, "language_loss": 0.80166233, "learning_rate": 1.167205888330325e-06, "loss": 0.81813443, "num_input_tokens_seen": 232445870, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.31176758, "step": 10771, "time_per_iteration": 2.6896181106567383 }, { "auxiliary_loss_clip": 0.01356822, "auxiliary_loss_mlp": 0.00254782, "balance_loss_clip": 1.11836958, "balance_loss_mlp": 0.22866371, "epoch": 0.6476476777393657, "flos": 16472153450880.0, "grad_norm": 17.109170471519175, "language_loss": 0.82782757, "learning_rate": 1.1668518122328958e-06, "loss": 0.8439436, "num_input_tokens_seen": 232464285, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.2611084, "step": 10772, "time_per_iteration": 2.636303424835205 }, { "auxiliary_loss_clip": 0.01313425, "auxiliary_loss_mlp": 0.00253298, "balance_loss_clip": 1.0933646, "balance_loss_mlp": 0.22970621, "epoch": 0.6477078009920336, "flos": 25812267068160.0, "grad_norm": 2.779353302228359, "language_loss": 0.86815, "learning_rate": 1.1664977677285305e-06, "loss": 0.8838172, "num_input_tokens_seen": 232485815, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.23608398, "step": 10773, "time_per_iteration": 2.738797187805176 }, { "auxiliary_loss_clip": 0.01317412, "auxiliary_loss_mlp": 0.00282746, "balance_loss_clip": 1.09484887, "balance_loss_mlp": 0.25622159, "epoch": 0.6477679242447016, "flos": 17676130446720.0, "grad_norm": 25.056881632149924, "language_loss": 0.85503864, "learning_rate": 1.1661437548306524e-06, "loss": 0.87104017, "num_input_tokens_seen": 232504875, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26525879, "step": 10774, "time_per_iteration": 2.7165367603302 }, { "auxiliary_loss_clip": 0.01337698, "auxiliary_loss_mlp": 0.00291456, "balance_loss_clip": 1.09878194, "balance_loss_mlp": 0.26315585, "epoch": 0.6478280474973696, "flos": 21032305620480.0, "grad_norm": 31.106835653613732, "language_loss": 0.82331723, "learning_rate": 1.1657897735526867e-06, "loss": 0.83960879, "num_input_tokens_seen": 232521945, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.28308105, "step": 10775, "time_per_iteration": 2.6899709701538086 }, { "auxiliary_loss_clip": 0.01355915, "auxiliary_loss_mlp": 0.00277137, "balance_loss_clip": 1.11401057, "balance_loss_mlp": 0.24784693, "epoch": 0.6478881707500376, "flos": 21616931381760.0, "grad_norm": 2499.9818019981512, "language_loss": 0.74004424, "learning_rate": 1.1654358239080574e-06, "loss": 0.75637472, "num_input_tokens_seen": 232541500, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.29284668, "step": 10776, "time_per_iteration": 2.670818328857422 }, { "auxiliary_loss_clip": 0.01360637, "auxiliary_loss_mlp": 0.00254698, "balance_loss_clip": 1.11568558, "balance_loss_mlp": 0.22608779, "epoch": 0.6479482940027056, "flos": 18442571875200.0, "grad_norm": 8.899358488006493, "language_loss": 0.93113399, "learning_rate": 1.1650819059101839e-06, "loss": 0.94728732, "num_input_tokens_seen": 232559720, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.28601074, "step": 10777, "time_per_iteration": 2.6123886108398438 }, { "auxiliary_loss_clip": 0.01336811, "auxiliary_loss_mlp": 0.00257837, "balance_loss_clip": 1.10277283, "balance_loss_mlp": 0.23065743, "epoch": 0.6480084172553735, "flos": 22164066322560.0, "grad_norm": 16.090908856207978, "language_loss": 0.81631726, "learning_rate": 1.1647280195724896e-06, "loss": 0.83226371, "num_input_tokens_seen": 232579370, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.27185059, "step": 10778, "time_per_iteration": 2.6568751335144043 }, { "auxiliary_loss_clip": 0.01308834, "auxiliary_loss_mlp": 0.0027704, "balance_loss_clip": 1.08525181, "balance_loss_mlp": 0.24902616, "epoch": 0.6480685405080415, "flos": 24316228586880.0, "grad_norm": 79160.87210018265, "language_loss": 0.83842218, "learning_rate": 1.1643741649083923e-06, "loss": 0.85428089, "num_input_tokens_seen": 232600495, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.28015137, "step": 10779, "time_per_iteration": 2.6914546489715576 }, { "auxiliary_loss_clip": 0.01289477, "auxiliary_loss_mlp": 0.00055286, "balance_loss_clip": 1.13717651, "balance_loss_mlp": 0.04608352, "epoch": 0.6481286637607094, "flos": 59891207760000.0, "grad_norm": 0.7173531710917146, "language_loss": 0.5862698, "learning_rate": 1.1640203419313095e-06, "loss": 0.59971744, "num_input_tokens_seen": 232663165, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.09179688, "step": 10780, "time_per_iteration": 3.208174467086792 }, { "auxiliary_loss_clip": 0.01343452, "auxiliary_loss_mlp": 0.00263436, "balance_loss_clip": 1.10809398, "balance_loss_mlp": 0.23525444, "epoch": 0.6481887870133775, "flos": 25484187219840.0, "grad_norm": 6.8958670901715395, "language_loss": 0.8721869, "learning_rate": 1.1636665506546599e-06, "loss": 0.88825583, "num_input_tokens_seen": 232683385, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.28186035, "step": 10781, "time_per_iteration": 4.106105089187622 }, { "auxiliary_loss_clip": 0.01353539, "auxiliary_loss_mlp": 0.0029613, "balance_loss_clip": 1.10981393, "balance_loss_mlp": 0.26477778, "epoch": 0.6482489102660454, "flos": 19930206574080.0, "grad_norm": 63.73623491790509, "language_loss": 0.88285816, "learning_rate": 1.1633127910918578e-06, "loss": 0.89935482, "num_input_tokens_seen": 232699095, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.31347656, "step": 10782, "time_per_iteration": 3.995553970336914 }, { "auxiliary_loss_clip": 0.01342911, "auxiliary_loss_mlp": 0.00269892, "balance_loss_clip": 1.10329831, "balance_loss_mlp": 0.24166296, "epoch": 0.6483090335187134, "flos": 26979471515520.0, "grad_norm": 78.61265279091884, "language_loss": 0.76340872, "learning_rate": 1.1629590632563187e-06, "loss": 0.77953672, "num_input_tokens_seen": 232717920, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.28259277, "step": 10783, "time_per_iteration": 2.66917085647583 }, { "auxiliary_loss_clip": 0.01362366, "auxiliary_loss_mlp": 0.00280101, "balance_loss_clip": 1.11683488, "balance_loss_mlp": 0.24734193, "epoch": 0.6483691567713813, "flos": 25077965333760.0, "grad_norm": 11.459332036609275, "language_loss": 0.96980155, "learning_rate": 1.1626053671614561e-06, "loss": 0.9862262, "num_input_tokens_seen": 232737605, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.32751465, "step": 10784, "time_per_iteration": 2.6881103515625 }, { "auxiliary_loss_clip": 0.01327628, "auxiliary_loss_mlp": 0.00264081, "balance_loss_clip": 1.0981276, "balance_loss_mlp": 0.23656771, "epoch": 0.6484292800240493, "flos": 16105972250880.0, "grad_norm": 239.09806897478433, "language_loss": 0.83756113, "learning_rate": 1.1622517028206815e-06, "loss": 0.85347819, "num_input_tokens_seen": 232755110, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.27526855, "step": 10785, "time_per_iteration": 4.0510499477386475 }, { "auxiliary_loss_clip": 0.01318504, "auxiliary_loss_mlp": 0.00267074, "balance_loss_clip": 1.09514427, "balance_loss_mlp": 0.24152738, "epoch": 0.6484894032767172, "flos": 28840398307200.0, "grad_norm": 4.2229506196830116, "language_loss": 0.74777281, "learning_rate": 1.1618980702474071e-06, "loss": 0.76362854, "num_input_tokens_seen": 232779040, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.2557373, "step": 10786, "time_per_iteration": 2.7529633045196533 }, { "auxiliary_loss_clip": 0.01323391, "auxiliary_loss_mlp": 0.00262492, "balance_loss_clip": 1.09358859, "balance_loss_mlp": 0.2353245, "epoch": 0.6485495265293852, "flos": 30227052896640.0, "grad_norm": 11.731744451924255, "language_loss": 0.79601479, "learning_rate": 1.161544469455041e-06, "loss": 0.81187367, "num_input_tokens_seen": 232800515, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.27185059, "step": 10787, "time_per_iteration": 2.7386820316314697 }, { "auxiliary_loss_clip": 0.01343393, "auxiliary_loss_mlp": 0.00247028, "balance_loss_clip": 1.10071754, "balance_loss_mlp": 0.21851283, "epoch": 0.6486096497820532, "flos": 20082181017600.0, "grad_norm": 4.5152024859208755, "language_loss": 0.93218565, "learning_rate": 1.1611909004569934e-06, "loss": 0.94808978, "num_input_tokens_seen": 232818450, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.28491211, "step": 10788, "time_per_iteration": 2.6683664321899414 }, { "auxiliary_loss_clip": 0.01350534, "auxiliary_loss_mlp": 0.00271396, "balance_loss_clip": 1.11250591, "balance_loss_mlp": 0.24333426, "epoch": 0.6486697730347212, "flos": 17129067333120.0, "grad_norm": 4.70036056042283, "language_loss": 0.86564225, "learning_rate": 1.1608373632666708e-06, "loss": 0.88186157, "num_input_tokens_seen": 232834785, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.28076172, "step": 10789, "time_per_iteration": 2.631643533706665 }, { "auxiliary_loss_clip": 0.01349984, "auxiliary_loss_mlp": 0.0025723, "balance_loss_clip": 1.11410141, "balance_loss_mlp": 0.22888175, "epoch": 0.6487298962873892, "flos": 38911940570880.0, "grad_norm": 1.867650976865241, "language_loss": 0.82616329, "learning_rate": 1.160483857897479e-06, "loss": 0.84223545, "num_input_tokens_seen": 232856050, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.2833252, "step": 10790, "time_per_iteration": 4.225125312805176 }, { "auxiliary_loss_clip": 0.01338725, "auxiliary_loss_mlp": 0.00264431, "balance_loss_clip": 1.10341883, "balance_loss_mlp": 0.23650065, "epoch": 0.6487900195400571, "flos": 11947840076160.0, "grad_norm": 9.56806208971264, "language_loss": 0.70118952, "learning_rate": 1.160130384362823e-06, "loss": 0.71722102, "num_input_tokens_seen": 232873945, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.27966309, "step": 10791, "time_per_iteration": 2.6226983070373535 }, { "auxiliary_loss_clip": 0.01315325, "auxiliary_loss_mlp": 0.00287412, "balance_loss_clip": 1.08614385, "balance_loss_mlp": 0.2608521, "epoch": 0.6488501427927251, "flos": 22344445445760.0, "grad_norm": 55.532448080571946, "language_loss": 0.93290299, "learning_rate": 1.1597769426761082e-06, "loss": 0.94893038, "num_input_tokens_seen": 232892160, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.265625, "step": 10792, "time_per_iteration": 2.669800043106079 }, { "auxiliary_loss_clip": 0.01334541, "auxiliary_loss_mlp": 0.00274743, "balance_loss_clip": 1.09680295, "balance_loss_mlp": 0.24702723, "epoch": 0.648910266045393, "flos": 22236282616320.0, "grad_norm": 39.689573973104004, "language_loss": 0.87642914, "learning_rate": 1.159423532850735e-06, "loss": 0.89252198, "num_input_tokens_seen": 232911725, "router_z_loss_clip": 2.37792969, "router_z_loss_mlp": 0.27734375, "step": 10793, "time_per_iteration": 2.7394931316375732 }, { "auxiliary_loss_clip": 0.0133411, "auxiliary_loss_mlp": 0.00255158, "balance_loss_clip": 1.09483576, "balance_loss_mlp": 0.22806132, "epoch": 0.6489703892980611, "flos": 25301258231040.0, "grad_norm": 189.64928913580118, "language_loss": 0.81923133, "learning_rate": 1.1590701549001055e-06, "loss": 0.83512396, "num_input_tokens_seen": 232929085, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.27099609, "step": 10794, "time_per_iteration": 2.659733533859253 }, { "auxiliary_loss_clip": 0.01325891, "auxiliary_loss_mlp": 0.00258847, "balance_loss_clip": 1.09341717, "balance_loss_mlp": 0.2319061, "epoch": 0.649030512550729, "flos": 24571912573440.0, "grad_norm": 3.7494719571295607, "language_loss": 0.79422694, "learning_rate": 1.158716808837621e-06, "loss": 0.81007439, "num_input_tokens_seen": 232949455, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.26928711, "step": 10795, "time_per_iteration": 2.6960673332214355 }, { "auxiliary_loss_clip": 0.0132579, "auxiliary_loss_mlp": 0.00260981, "balance_loss_clip": 1.09294939, "balance_loss_mlp": 0.23345591, "epoch": 0.649090635803397, "flos": 26244702904320.0, "grad_norm": 47.42106347021005, "language_loss": 0.63449788, "learning_rate": 1.158363494676679e-06, "loss": 0.65036559, "num_input_tokens_seen": 232969445, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.27539062, "step": 10796, "time_per_iteration": 2.7699289321899414 }, { "auxiliary_loss_clip": 0.01309481, "auxiliary_loss_mlp": 0.0025496, "balance_loss_clip": 1.08294535, "balance_loss_mlp": 0.22910401, "epoch": 0.6491507590560649, "flos": 24937375501440.0, "grad_norm": 68.54088277262782, "language_loss": 0.85224193, "learning_rate": 1.1580102124306775e-06, "loss": 0.8678863, "num_input_tokens_seen": 232988900, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.25830078, "step": 10797, "time_per_iteration": 2.6600160598754883 }, { "auxiliary_loss_clip": 0.01305197, "auxiliary_loss_mlp": 0.00238306, "balance_loss_clip": 1.08357942, "balance_loss_mlp": 0.2145004, "epoch": 0.6492108823087329, "flos": 19499781899520.0, "grad_norm": 40.57215913000819, "language_loss": 0.77702451, "learning_rate": 1.1576569621130134e-06, "loss": 0.79245955, "num_input_tokens_seen": 233005060, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.23803711, "step": 10798, "time_per_iteration": 2.6526808738708496 }, { "auxiliary_loss_clip": 0.01302329, "auxiliary_loss_mlp": 0.00254397, "balance_loss_clip": 1.07667589, "balance_loss_mlp": 0.2283856, "epoch": 0.6492710055614008, "flos": 19719303868800.0, "grad_norm": 3.2033624557506575, "language_loss": 0.82063174, "learning_rate": 1.1573037437370811e-06, "loss": 0.83619899, "num_input_tokens_seen": 233023375, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26013184, "step": 10799, "time_per_iteration": 2.6514625549316406 }, { "auxiliary_loss_clip": 0.01317734, "auxiliary_loss_mlp": 0.00261163, "balance_loss_clip": 1.08127308, "balance_loss_mlp": 0.23507985, "epoch": 0.6493311288140688, "flos": 24317018686080.0, "grad_norm": 3.674873016486915, "language_loss": 0.791924, "learning_rate": 1.1569505573162755e-06, "loss": 0.80771291, "num_input_tokens_seen": 233043130, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.26098633, "step": 10800, "time_per_iteration": 2.6840381622314453 }, { "auxiliary_loss_clip": 0.01246621, "auxiliary_loss_mlp": 0.00037137, "balance_loss_clip": 1.10311341, "balance_loss_mlp": 0.02984124, "epoch": 0.6493912520667368, "flos": 70934635290240.0, "grad_norm": 0.7604428421242306, "language_loss": 0.59695607, "learning_rate": 1.1565974028639897e-06, "loss": 0.60979366, "num_input_tokens_seen": 233110560, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.07275391, "step": 10801, "time_per_iteration": 3.238823175430298 }, { "auxiliary_loss_clip": 0.01340394, "auxiliary_loss_mlp": 0.00251214, "balance_loss_clip": 1.10578763, "balance_loss_mlp": 0.22368857, "epoch": 0.6494513753194048, "flos": 25337779384320.0, "grad_norm": 6.0003088884595535, "language_loss": 0.85357761, "learning_rate": 1.156244280393614e-06, "loss": 0.8694936, "num_input_tokens_seen": 233130080, "router_z_loss_clip": 2.34472656, "router_z_loss_mlp": 0.27526855, "step": 10802, "time_per_iteration": 2.642927885055542 }, { "auxiliary_loss_clip": 0.01299158, "auxiliary_loss_mlp": 0.00244491, "balance_loss_clip": 1.07487178, "balance_loss_mlp": 0.21876523, "epoch": 0.6495114985720728, "flos": 24681978823680.0, "grad_norm": 5.293251220230723, "language_loss": 0.84095216, "learning_rate": 1.155891189918541e-06, "loss": 0.85638869, "num_input_tokens_seen": 233150235, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.25744629, "step": 10803, "time_per_iteration": 2.637833595275879 }, { "auxiliary_loss_clip": 0.01319215, "auxiliary_loss_mlp": 0.00265643, "balance_loss_clip": 1.08804369, "balance_loss_mlp": 0.23878554, "epoch": 0.6495716218247407, "flos": 23651162317440.0, "grad_norm": 5.546422514154618, "language_loss": 0.81867421, "learning_rate": 1.1555381314521578e-06, "loss": 0.83452278, "num_input_tokens_seen": 233166710, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.26879883, "step": 10804, "time_per_iteration": 2.643524408340454 }, { "auxiliary_loss_clip": 0.01296793, "auxiliary_loss_mlp": 0.0024966, "balance_loss_clip": 1.07171845, "balance_loss_mlp": 0.22491232, "epoch": 0.6496317450774087, "flos": 22346169298560.0, "grad_norm": 16.335717509204155, "language_loss": 0.80198616, "learning_rate": 1.1551851050078537e-06, "loss": 0.81745064, "num_input_tokens_seen": 233185445, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.24719238, "step": 10805, "time_per_iteration": 2.661180019378662 }, { "auxiliary_loss_clip": 0.01282467, "auxiliary_loss_mlp": 0.00259115, "balance_loss_clip": 1.06082177, "balance_loss_mlp": 0.2336998, "epoch": 0.6496918683300766, "flos": 30518647505280.0, "grad_norm": 4.483512410245368, "language_loss": 0.76072735, "learning_rate": 1.1548321105990155e-06, "loss": 0.77614319, "num_input_tokens_seen": 233205805, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25415039, "step": 10806, "time_per_iteration": 2.720574378967285 }, { "auxiliary_loss_clip": 0.01302445, "auxiliary_loss_mlp": 0.00275668, "balance_loss_clip": 1.07413697, "balance_loss_mlp": 0.24894123, "epoch": 0.6497519915827447, "flos": 12458992567680.0, "grad_norm": 70.05050627970263, "language_loss": 0.90847087, "learning_rate": 1.1544791482390275e-06, "loss": 0.92425197, "num_input_tokens_seen": 233224215, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.26733398, "step": 10807, "time_per_iteration": 2.669102430343628 }, { "auxiliary_loss_clip": 0.0125141, "auxiliary_loss_mlp": 0.00077813, "balance_loss_clip": 1.10896492, "balance_loss_mlp": 0.07004017, "epoch": 0.6498121148354126, "flos": 69093748287360.0, "grad_norm": 0.7798654109677798, "language_loss": 0.5826239, "learning_rate": 1.1541262179412745e-06, "loss": 0.59591615, "num_input_tokens_seen": 233294440, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.07763672, "step": 10808, "time_per_iteration": 3.3394484519958496 }, { "auxiliary_loss_clip": 0.0132107, "auxiliary_loss_mlp": 0.00271014, "balance_loss_clip": 1.09410596, "balance_loss_mlp": 0.2456466, "epoch": 0.6498722380880806, "flos": 36897135914880.0, "grad_norm": 40.23593847558313, "language_loss": 0.69967175, "learning_rate": 1.1537733197191415e-06, "loss": 0.71559262, "num_input_tokens_seen": 233316125, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.25378418, "step": 10809, "time_per_iteration": 2.8150007724761963 }, { "auxiliary_loss_clip": 0.01300553, "auxiliary_loss_mlp": 0.00255318, "balance_loss_clip": 1.07653749, "balance_loss_mlp": 0.230892, "epoch": 0.6499323613407485, "flos": 29017760688000.0, "grad_norm": 9.1422833826576, "language_loss": 0.86161268, "learning_rate": 1.153420453586008e-06, "loss": 0.8771714, "num_input_tokens_seen": 233336140, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.24438477, "step": 10810, "time_per_iteration": 2.7254974842071533 }, { "auxiliary_loss_clip": 0.01274905, "auxiliary_loss_mlp": 0.00258824, "balance_loss_clip": 1.05977416, "balance_loss_mlp": 0.23486295, "epoch": 0.6499924845934165, "flos": 20119240874880.0, "grad_norm": 445.03035216624124, "language_loss": 0.80536771, "learning_rate": 1.1530676195552561e-06, "loss": 0.820705, "num_input_tokens_seen": 233356095, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.23986816, "step": 10811, "time_per_iteration": 2.7155773639678955 }, { "auxiliary_loss_clip": 0.01292604, "auxiliary_loss_mlp": 0.00240633, "balance_loss_clip": 1.07647943, "balance_loss_mlp": 0.21545586, "epoch": 0.6500526078460844, "flos": 24421338760320.0, "grad_norm": 3.5987500807277457, "language_loss": 0.83604407, "learning_rate": 1.1527148176402649e-06, "loss": 0.85137641, "num_input_tokens_seen": 233376830, "router_z_loss_clip": 2.16113281, "router_z_loss_mlp": 0.25183105, "step": 10812, "time_per_iteration": 2.728624105453491 }, { "auxiliary_loss_clip": 0.0130738, "auxiliary_loss_mlp": 0.00244917, "balance_loss_clip": 1.07861483, "balance_loss_mlp": 0.22095615, "epoch": 0.6501127310987524, "flos": 23331019374720.0, "grad_norm": 57.38540825969017, "language_loss": 0.91851151, "learning_rate": 1.152362047854413e-06, "loss": 0.93403447, "num_input_tokens_seen": 233395275, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.23937988, "step": 10813, "time_per_iteration": 2.6959879398345947 }, { "auxiliary_loss_clip": 0.01300348, "auxiliary_loss_mlp": 0.00254417, "balance_loss_clip": 1.07556367, "balance_loss_mlp": 0.22900152, "epoch": 0.6501728543514204, "flos": 18697824898560.0, "grad_norm": 6.268670782662336, "language_loss": 0.87700891, "learning_rate": 1.1520093102110764e-06, "loss": 0.89255655, "num_input_tokens_seen": 233413345, "router_z_loss_clip": 2.24707031, "router_z_loss_mlp": 0.25427246, "step": 10814, "time_per_iteration": 2.624119281768799 }, { "auxiliary_loss_clip": 0.01295442, "auxiliary_loss_mlp": 0.00267859, "balance_loss_clip": 1.07126594, "balance_loss_mlp": 0.24131152, "epoch": 0.6502329776040884, "flos": 44199858199680.0, "grad_norm": 10.703023218735192, "language_loss": 0.74109316, "learning_rate": 1.1516566047236328e-06, "loss": 0.75672615, "num_input_tokens_seen": 233436105, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.26538086, "step": 10815, "time_per_iteration": 2.852877140045166 }, { "auxiliary_loss_clip": 0.01329269, "auxiliary_loss_mlp": 0.00266482, "balance_loss_clip": 1.09029078, "balance_loss_mlp": 0.23788375, "epoch": 0.6502931008567564, "flos": 14574741419520.0, "grad_norm": 2.0379314357948095, "language_loss": 0.85311669, "learning_rate": 1.1513039314054546e-06, "loss": 0.86907423, "num_input_tokens_seen": 233452320, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.28613281, "step": 10816, "time_per_iteration": 2.6741676330566406 }, { "auxiliary_loss_clip": 0.01306141, "auxiliary_loss_mlp": 0.00240187, "balance_loss_clip": 1.07985997, "balance_loss_mlp": 0.21493825, "epoch": 0.6503532241094243, "flos": 21395003201280.0, "grad_norm": 25.939950616979125, "language_loss": 0.79197395, "learning_rate": 1.1509512902699174e-06, "loss": 0.80743718, "num_input_tokens_seen": 233469920, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.25231934, "step": 10817, "time_per_iteration": 2.621716022491455 }, { "auxiliary_loss_clip": 0.01307837, "auxiliary_loss_mlp": 0.00260525, "balance_loss_clip": 1.07818115, "balance_loss_mlp": 0.23493078, "epoch": 0.6504133473620923, "flos": 74740840986240.0, "grad_norm": 18.665952518613842, "language_loss": 0.78025353, "learning_rate": 1.1505986813303916e-06, "loss": 0.79593718, "num_input_tokens_seen": 233499780, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.25610352, "step": 10818, "time_per_iteration": 3.0672836303710938 }, { "auxiliary_loss_clip": 0.01301771, "auxiliary_loss_mlp": 0.00239627, "balance_loss_clip": 1.07335925, "balance_loss_mlp": 0.21478328, "epoch": 0.6504734706147602, "flos": 19713270384000.0, "grad_norm": 16.941868268918817, "language_loss": 0.73588908, "learning_rate": 1.150246104600249e-06, "loss": 0.75130308, "num_input_tokens_seen": 233518235, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.24816895, "step": 10819, "time_per_iteration": 2.6197569370269775 }, { "auxiliary_loss_clip": 0.01300378, "auxiliary_loss_mlp": 0.00236663, "balance_loss_clip": 1.07193351, "balance_loss_mlp": 0.21030548, "epoch": 0.6505335938674283, "flos": 25556870390400.0, "grad_norm": 195.40005439179754, "language_loss": 0.90328932, "learning_rate": 1.14989356009286e-06, "loss": 0.91865969, "num_input_tokens_seen": 233535215, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.26391602, "step": 10820, "time_per_iteration": 2.68359112739563 }, { "auxiliary_loss_clip": 0.01296191, "auxiliary_loss_mlp": 0.00240695, "balance_loss_clip": 1.0693531, "balance_loss_mlp": 0.21370634, "epoch": 0.6505937171200962, "flos": 17821424960640.0, "grad_norm": 19.114068574446925, "language_loss": 0.88493919, "learning_rate": 1.1495410478215914e-06, "loss": 0.90030807, "num_input_tokens_seen": 233552775, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.27001953, "step": 10821, "time_per_iteration": 2.5827677249908447 }, { "auxiliary_loss_clip": 0.01288657, "auxiliary_loss_mlp": 0.00241711, "balance_loss_clip": 1.06992018, "balance_loss_mlp": 0.21740375, "epoch": 0.6506538403727642, "flos": 20668135582080.0, "grad_norm": 4.616899053142082, "language_loss": 0.8567943, "learning_rate": 1.1491885677998126e-06, "loss": 0.87209797, "num_input_tokens_seen": 233572080, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24316406, "step": 10822, "time_per_iteration": 2.640397787094116 }, { "auxiliary_loss_clip": 0.0127793, "auxiliary_loss_mlp": 0.00237102, "balance_loss_clip": 1.0601728, "balance_loss_mlp": 0.211555, "epoch": 0.6507139636254321, "flos": 11721422695680.0, "grad_norm": 3.142345118238088, "language_loss": 0.94891667, "learning_rate": 1.1488361200408883e-06, "loss": 0.96406704, "num_input_tokens_seen": 233589155, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25549316, "step": 10823, "time_per_iteration": 4.1874823570251465 }, { "auxiliary_loss_clip": 0.01272579, "auxiliary_loss_mlp": 0.00252436, "balance_loss_clip": 1.05208588, "balance_loss_mlp": 0.22727136, "epoch": 0.6507740868781001, "flos": 26761745226240.0, "grad_norm": 5.444236386355637, "language_loss": 0.73727709, "learning_rate": 1.148483704558183e-06, "loss": 0.7525273, "num_input_tokens_seen": 233608180, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25170898, "step": 10824, "time_per_iteration": 4.226592302322388 }, { "auxiliary_loss_clip": 0.01307538, "auxiliary_loss_mlp": 0.00279907, "balance_loss_clip": 1.07050574, "balance_loss_mlp": 0.25058109, "epoch": 0.650834210130768, "flos": 16471722487680.0, "grad_norm": 13.282082838102562, "language_loss": 0.95928645, "learning_rate": 1.1481313213650607e-06, "loss": 0.97516096, "num_input_tokens_seen": 233625750, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.29321289, "step": 10825, "time_per_iteration": 2.6215689182281494 }, { "auxiliary_loss_clip": 0.01300841, "auxiliary_loss_mlp": 0.00261407, "balance_loss_clip": 1.06924725, "balance_loss_mlp": 0.2328445, "epoch": 0.650894333383436, "flos": 17128672283520.0, "grad_norm": 4.144501009329079, "language_loss": 0.8600111, "learning_rate": 1.147778970474885e-06, "loss": 0.8756336, "num_input_tokens_seen": 233644235, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.2857666, "step": 10826, "time_per_iteration": 2.6049647331237793 }, { "auxiliary_loss_clip": 0.01295533, "auxiliary_loss_mlp": 0.00251067, "balance_loss_clip": 1.06661355, "balance_loss_mlp": 0.22535405, "epoch": 0.650954456636104, "flos": 18734238311040.0, "grad_norm": 249.5837689238325, "language_loss": 0.77834958, "learning_rate": 1.1474266519010157e-06, "loss": 0.79381555, "num_input_tokens_seen": 233662845, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.25744629, "step": 10827, "time_per_iteration": 2.583498001098633 }, { "auxiliary_loss_clip": 0.01285052, "auxiliary_loss_mlp": 0.00250878, "balance_loss_clip": 1.06051934, "balance_loss_mlp": 0.2261183, "epoch": 0.651014579888772, "flos": 24528244613760.0, "grad_norm": 29.205940852598506, "language_loss": 0.87294555, "learning_rate": 1.1470743656568136e-06, "loss": 0.88830489, "num_input_tokens_seen": 233681990, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.24780273, "step": 10828, "time_per_iteration": 4.0123419761657715 }, { "auxiliary_loss_clip": 0.01294181, "auxiliary_loss_mlp": 0.00238217, "balance_loss_clip": 1.07513452, "balance_loss_mlp": 0.21315941, "epoch": 0.65107470314144, "flos": 24061083304320.0, "grad_norm": 10.006285032163035, "language_loss": 0.95966613, "learning_rate": 1.1467221117556362e-06, "loss": 0.97499013, "num_input_tokens_seen": 233698930, "router_z_loss_clip": 2.19238281, "router_z_loss_mlp": 0.25024414, "step": 10829, "time_per_iteration": 2.6456191539764404 }, { "auxiliary_loss_clip": 0.01223824, "auxiliary_loss_mlp": 0.00075867, "balance_loss_clip": 1.0825969, "balance_loss_mlp": 0.06757022, "epoch": 0.6511348263941079, "flos": 72480734352000.0, "grad_norm": 0.6307632301644442, "language_loss": 0.5503993, "learning_rate": 1.1463698902108428e-06, "loss": 0.56339622, "num_input_tokens_seen": 233769825, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.08300781, "step": 10830, "time_per_iteration": 3.2903075218200684 }, { "auxiliary_loss_clip": 0.01314613, "auxiliary_loss_mlp": 0.00231533, "balance_loss_clip": 1.0787282, "balance_loss_mlp": 0.20435371, "epoch": 0.6511949496467759, "flos": 23367684182400.0, "grad_norm": 6.955038588159152, "language_loss": 0.8252368, "learning_rate": 1.1460177010357878e-06, "loss": 0.84069824, "num_input_tokens_seen": 233787095, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.27148438, "step": 10831, "time_per_iteration": 2.678997039794922 }, { "auxiliary_loss_clip": 0.0122712, "auxiliary_loss_mlp": 0.00068981, "balance_loss_clip": 1.08712292, "balance_loss_mlp": 0.06106578, "epoch": 0.6512550728994438, "flos": 67333191073920.0, "grad_norm": 2.251470749504655, "language_loss": 0.50390238, "learning_rate": 1.145665544243828e-06, "loss": 0.51686341, "num_input_tokens_seen": 233853050, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.07910156, "step": 10832, "time_per_iteration": 4.680277109146118 }, { "auxiliary_loss_clip": 0.01299993, "auxiliary_loss_mlp": 0.00266957, "balance_loss_clip": 1.0704875, "balance_loss_mlp": 0.2402299, "epoch": 0.6513151961521119, "flos": 21141689512320.0, "grad_norm": 7.526681362631786, "language_loss": 0.94736075, "learning_rate": 1.145313419848316e-06, "loss": 0.96303028, "num_input_tokens_seen": 233871385, "router_z_loss_clip": 2.29785156, "router_z_loss_mlp": 0.26721191, "step": 10833, "time_per_iteration": 2.6669704914093018 }, { "auxiliary_loss_clip": 0.01313473, "auxiliary_loss_mlp": 0.00231704, "balance_loss_clip": 1.08460236, "balance_loss_mlp": 0.20624103, "epoch": 0.6513753194047798, "flos": 15158828476800.0, "grad_norm": 7.70050729489033, "language_loss": 0.92257738, "learning_rate": 1.1449613278626049e-06, "loss": 0.93802917, "num_input_tokens_seen": 233888175, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.25439453, "step": 10834, "time_per_iteration": 2.7741682529449463 }, { "auxiliary_loss_clip": 0.01305665, "auxiliary_loss_mlp": 0.00228555, "balance_loss_clip": 1.07809997, "balance_loss_mlp": 0.20234078, "epoch": 0.6514354426574478, "flos": 30226621933440.0, "grad_norm": 69.16813707458631, "language_loss": 0.84736121, "learning_rate": 1.1446092683000455e-06, "loss": 0.86270338, "num_input_tokens_seen": 233911470, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.26245117, "step": 10835, "time_per_iteration": 2.7275969982147217 }, { "auxiliary_loss_clip": 0.01293411, "auxiliary_loss_mlp": 0.00254651, "balance_loss_clip": 1.07137311, "balance_loss_mlp": 0.22943833, "epoch": 0.6514955659101157, "flos": 24205587719040.0, "grad_norm": 14.075011727168357, "language_loss": 0.85170782, "learning_rate": 1.1442572411739882e-06, "loss": 0.86718845, "num_input_tokens_seen": 233932135, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25219727, "step": 10836, "time_per_iteration": 2.815384864807129 }, { "auxiliary_loss_clip": 0.01292613, "auxiliary_loss_mlp": 0.00249708, "balance_loss_clip": 1.06740022, "balance_loss_mlp": 0.22298093, "epoch": 0.6515556891627837, "flos": 12377761960320.0, "grad_norm": 7.849347601684581, "language_loss": 0.89417213, "learning_rate": 1.143905246497783e-06, "loss": 0.90959531, "num_input_tokens_seen": 233947880, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.26721191, "step": 10837, "time_per_iteration": 2.69932222366333 }, { "auxiliary_loss_clip": 0.01288782, "auxiliary_loss_mlp": 0.00246684, "balance_loss_clip": 1.06484973, "balance_loss_mlp": 0.22039844, "epoch": 0.6516158124154516, "flos": 49601217957120.0, "grad_norm": 33.29754927574795, "language_loss": 0.70345891, "learning_rate": 1.1435532842847758e-06, "loss": 0.71881354, "num_input_tokens_seen": 233971475, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26257324, "step": 10838, "time_per_iteration": 2.9870166778564453 }, { "auxiliary_loss_clip": 0.01229782, "auxiliary_loss_mlp": 0.0005363, "balance_loss_clip": 1.09023428, "balance_loss_mlp": 0.04557143, "epoch": 0.6516759356681197, "flos": 59702748076800.0, "grad_norm": 0.7145862166443198, "language_loss": 0.59511012, "learning_rate": 1.1432013545483147e-06, "loss": 0.60794425, "num_input_tokens_seen": 234030690, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08056641, "step": 10839, "time_per_iteration": 3.2294921875 }, { "auxiliary_loss_clip": 0.01303032, "auxiliary_loss_mlp": 0.00214501, "balance_loss_clip": 1.07881522, "balance_loss_mlp": 0.19120756, "epoch": 0.6517360589207876, "flos": 37450807130880.0, "grad_norm": 47.56012204937173, "language_loss": 0.74625266, "learning_rate": 1.1428494573017439e-06, "loss": 0.761428, "num_input_tokens_seen": 234052470, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.23291016, "step": 10840, "time_per_iteration": 2.785402297973633 }, { "auxiliary_loss_clip": 0.01292196, "auxiliary_loss_mlp": 0.00239624, "balance_loss_clip": 1.06625164, "balance_loss_mlp": 0.21525773, "epoch": 0.6517961821734556, "flos": 25374911068800.0, "grad_norm": 148.77497003546088, "language_loss": 0.83755374, "learning_rate": 1.1424975925584071e-06, "loss": 0.85287189, "num_input_tokens_seen": 234071495, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.24401855, "step": 10841, "time_per_iteration": 2.7215096950531006 }, { "auxiliary_loss_clip": 0.01283327, "auxiliary_loss_mlp": 0.00232953, "balance_loss_clip": 1.05874097, "balance_loss_mlp": 0.20651235, "epoch": 0.6518563054261236, "flos": 28766996864640.0, "grad_norm": 355.1152710825266, "language_loss": 0.70957315, "learning_rate": 1.142145760331648e-06, "loss": 0.72473598, "num_input_tokens_seen": 234092325, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.26403809, "step": 10842, "time_per_iteration": 2.710496187210083 }, { "auxiliary_loss_clip": 0.01225244, "auxiliary_loss_mlp": 0.00093816, "balance_loss_clip": 1.08697748, "balance_loss_mlp": 0.08690225, "epoch": 0.6519164286787915, "flos": 68924750797440.0, "grad_norm": 0.798550831622932, "language_loss": 0.55364186, "learning_rate": 1.141793960634807e-06, "loss": 0.56683248, "num_input_tokens_seen": 234148005, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.06933594, "step": 10843, "time_per_iteration": 2.972451686859131 }, { "auxiliary_loss_clip": 0.01317606, "auxiliary_loss_mlp": 0.00262029, "balance_loss_clip": 1.07867944, "balance_loss_mlp": 0.23254859, "epoch": 0.6519765519314595, "flos": 20441933683200.0, "grad_norm": 50.65795358658335, "language_loss": 0.89522696, "learning_rate": 1.1414421934812253e-06, "loss": 0.91102326, "num_input_tokens_seen": 234164280, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.2947998, "step": 10844, "time_per_iteration": 2.602994680404663 }, { "auxiliary_loss_clip": 0.01279171, "auxiliary_loss_mlp": 0.00232202, "balance_loss_clip": 1.05967951, "balance_loss_mlp": 0.20713212, "epoch": 0.6520366751841274, "flos": 28402970480640.0, "grad_norm": 14.618272414952026, "language_loss": 0.69535124, "learning_rate": 1.1410904588842421e-06, "loss": 0.71046489, "num_input_tokens_seen": 234185090, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25061035, "step": 10845, "time_per_iteration": 2.699364423751831 }, { "auxiliary_loss_clip": 0.01311695, "auxiliary_loss_mlp": 0.0026032, "balance_loss_clip": 1.07956731, "balance_loss_mlp": 0.23411788, "epoch": 0.6520967984367955, "flos": 22273414300800.0, "grad_norm": 42.89005955420811, "language_loss": 0.87229002, "learning_rate": 1.140738756857194e-06, "loss": 0.8880102, "num_input_tokens_seen": 234204050, "router_z_loss_clip": 2.32324219, "router_z_loss_mlp": 0.26208496, "step": 10846, "time_per_iteration": 2.6589903831481934 }, { "auxiliary_loss_clip": 0.01222547, "auxiliary_loss_mlp": 0.00080894, "balance_loss_clip": 1.08497393, "balance_loss_mlp": 0.07383708, "epoch": 0.6521569216894634, "flos": 68917140092160.0, "grad_norm": 0.7009818564968063, "language_loss": 0.59312677, "learning_rate": 1.1403870874134192e-06, "loss": 0.60616112, "num_input_tokens_seen": 234269790, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07080078, "step": 10847, "time_per_iteration": 3.221574544906616 }, { "auxiliary_loss_clip": 0.01321869, "auxiliary_loss_mlp": 0.00240488, "balance_loss_clip": 1.08798528, "balance_loss_mlp": 0.21372592, "epoch": 0.6522170449421314, "flos": 29130520458240.0, "grad_norm": 3.9761476451735263, "language_loss": 0.89747679, "learning_rate": 1.1400354505662514e-06, "loss": 0.91310036, "num_input_tokens_seen": 234290135, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.26782227, "step": 10848, "time_per_iteration": 2.7506299018859863 }, { "auxiliary_loss_clip": 0.01298486, "auxiliary_loss_mlp": 0.00278664, "balance_loss_clip": 1.07502723, "balance_loss_mlp": 0.25246161, "epoch": 0.6522771681947993, "flos": 26651930371200.0, "grad_norm": 9.925496980787841, "language_loss": 0.83815867, "learning_rate": 1.1396838463290263e-06, "loss": 0.85393018, "num_input_tokens_seen": 234309535, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26196289, "step": 10849, "time_per_iteration": 2.7233543395996094 }, { "auxiliary_loss_clip": 0.01282492, "auxiliary_loss_mlp": 0.00249557, "balance_loss_clip": 1.06497765, "balance_loss_mlp": 0.22212738, "epoch": 0.6523372914474673, "flos": 25739763465600.0, "grad_norm": 9.19941028012679, "language_loss": 0.75777763, "learning_rate": 1.1393322747150752e-06, "loss": 0.77309811, "num_input_tokens_seen": 234328755, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.27429199, "step": 10850, "time_per_iteration": 2.745002031326294 }, { "auxiliary_loss_clip": 0.01287758, "auxiliary_loss_mlp": 0.00227489, "balance_loss_clip": 1.0700624, "balance_loss_mlp": 0.20145406, "epoch": 0.6523974147001352, "flos": 24827345164800.0, "grad_norm": 2.369021006181498, "language_loss": 0.75944996, "learning_rate": 1.1389807357377313e-06, "loss": 0.77460241, "num_input_tokens_seen": 234348655, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.26037598, "step": 10851, "time_per_iteration": 2.7214741706848145 }, { "auxiliary_loss_clip": 0.01286302, "auxiliary_loss_mlp": 0.00254547, "balance_loss_clip": 1.06644654, "balance_loss_mlp": 0.22875053, "epoch": 0.6524575379528033, "flos": 26317637470080.0, "grad_norm": 8.969155720989066, "language_loss": 0.82404292, "learning_rate": 1.1386292294103235e-06, "loss": 0.83945149, "num_input_tokens_seen": 234367445, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25805664, "step": 10852, "time_per_iteration": 2.723938465118408 }, { "auxiliary_loss_clip": 0.01274717, "auxiliary_loss_mlp": 0.00245065, "balance_loss_clip": 1.05061221, "balance_loss_mlp": 0.21991169, "epoch": 0.6525176612054712, "flos": 19494143464320.0, "grad_norm": 75.09056126562844, "language_loss": 0.77297634, "learning_rate": 1.1382777557461812e-06, "loss": 0.78817415, "num_input_tokens_seen": 234384825, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.25158691, "step": 10853, "time_per_iteration": 2.6487016677856445 }, { "auxiliary_loss_clip": 0.01222563, "auxiliary_loss_mlp": 0.0006233, "balance_loss_clip": 1.08478498, "balance_loss_mlp": 0.05551087, "epoch": 0.6525777844581392, "flos": 71706894721920.0, "grad_norm": 0.6973560723838015, "language_loss": 0.62520504, "learning_rate": 1.137926314758634e-06, "loss": 0.63805389, "num_input_tokens_seen": 234450630, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06835938, "step": 10854, "time_per_iteration": 3.280423164367676 }, { "auxiliary_loss_clip": 0.01279398, "auxiliary_loss_mlp": 0.00231685, "balance_loss_clip": 1.06129956, "balance_loss_mlp": 0.20632935, "epoch": 0.6526379077108072, "flos": 26653115520000.0, "grad_norm": 3.103985738594293, "language_loss": 0.86076581, "learning_rate": 1.1375749064610072e-06, "loss": 0.87587667, "num_input_tokens_seen": 234473505, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25341797, "step": 10855, "time_per_iteration": 2.7648627758026123 }, { "auxiliary_loss_clip": 0.01288302, "auxiliary_loss_mlp": 0.00239359, "balance_loss_clip": 1.0703547, "balance_loss_mlp": 0.21408656, "epoch": 0.6526980309634751, "flos": 22820369673600.0, "grad_norm": 7.602607194479932, "language_loss": 0.87962317, "learning_rate": 1.1372235308666256e-06, "loss": 0.89489973, "num_input_tokens_seen": 234492485, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25292969, "step": 10856, "time_per_iteration": 2.6949477195739746 }, { "auxiliary_loss_clip": 0.01290298, "auxiliary_loss_mlp": 0.00251532, "balance_loss_clip": 1.06494153, "balance_loss_mlp": 0.2225765, "epoch": 0.6527581542161431, "flos": 28365048696960.0, "grad_norm": 35.58260499781198, "language_loss": 0.80950785, "learning_rate": 1.136872187988815e-06, "loss": 0.82492614, "num_input_tokens_seen": 234512645, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.28967285, "step": 10857, "time_per_iteration": 2.7428953647613525 }, { "auxiliary_loss_clip": 0.0127374, "auxiliary_loss_mlp": 0.00228445, "balance_loss_clip": 1.05705476, "balance_loss_mlp": 0.20308964, "epoch": 0.652818277468811, "flos": 18369206346240.0, "grad_norm": 7.941383962838423, "language_loss": 0.73880857, "learning_rate": 1.1365208778408965e-06, "loss": 0.75383043, "num_input_tokens_seen": 234529310, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.25366211, "step": 10858, "time_per_iteration": 2.64495849609375 }, { "auxiliary_loss_clip": 0.01262392, "auxiliary_loss_mlp": 0.002086, "balance_loss_clip": 1.05052924, "balance_loss_mlp": 0.1858307, "epoch": 0.6528784007214791, "flos": 18036170421120.0, "grad_norm": 41.214894556864316, "language_loss": 0.84832132, "learning_rate": 1.1361696004361939e-06, "loss": 0.86303127, "num_input_tokens_seen": 234546685, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.22753906, "step": 10859, "time_per_iteration": 2.683326482772827 }, { "auxiliary_loss_clip": 0.0128133, "auxiliary_loss_mlp": 0.00240958, "balance_loss_clip": 1.0593698, "balance_loss_mlp": 0.21671087, "epoch": 0.652938523974147, "flos": 22382008093440.0, "grad_norm": 12.065000921612917, "language_loss": 0.74920744, "learning_rate": 1.1358183557880256e-06, "loss": 0.76443034, "num_input_tokens_seen": 234566255, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24230957, "step": 10860, "time_per_iteration": 2.714423418045044 }, { "auxiliary_loss_clip": 0.01294689, "auxiliary_loss_mlp": 0.00232013, "balance_loss_clip": 1.0665282, "balance_loss_mlp": 0.20504773, "epoch": 0.652998647226815, "flos": 16764035368320.0, "grad_norm": 195.00025949255462, "language_loss": 0.76952171, "learning_rate": 1.135467143909712e-06, "loss": 0.78478873, "num_input_tokens_seen": 234585405, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.27001953, "step": 10861, "time_per_iteration": 2.6455233097076416 }, { "auxiliary_loss_clip": 0.01281819, "auxiliary_loss_mlp": 0.00276504, "balance_loss_clip": 1.05797172, "balance_loss_mlp": 0.25002784, "epoch": 0.6530587704794829, "flos": 35772522019200.0, "grad_norm": 3.5289487713951955, "language_loss": 0.73840225, "learning_rate": 1.135115964814572e-06, "loss": 0.75398552, "num_input_tokens_seen": 234608095, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26489258, "step": 10862, "time_per_iteration": 2.7689011096954346 }, { "auxiliary_loss_clip": 0.01283393, "auxiliary_loss_mlp": 0.00242434, "balance_loss_clip": 1.0593338, "balance_loss_mlp": 0.21685168, "epoch": 0.6531188937321509, "flos": 19316134638720.0, "grad_norm": 25.94521152131059, "language_loss": 0.84953922, "learning_rate": 1.13476481851592e-06, "loss": 0.86479747, "num_input_tokens_seen": 234627335, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.25598145, "step": 10863, "time_per_iteration": 2.6877613067626953 }, { "auxiliary_loss_clip": 0.01245649, "auxiliary_loss_mlp": 0.00250211, "balance_loss_clip": 1.03676891, "balance_loss_mlp": 0.22782359, "epoch": 0.6531790169848188, "flos": 22893771116160.0, "grad_norm": 26.342126279011076, "language_loss": 0.8174206, "learning_rate": 1.1344137050270739e-06, "loss": 0.83237922, "num_input_tokens_seen": 234646540, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.22399902, "step": 10864, "time_per_iteration": 2.637723684310913 }, { "auxiliary_loss_clip": 0.012788, "auxiliary_loss_mlp": 0.00240179, "balance_loss_clip": 1.05818832, "balance_loss_mlp": 0.21491827, "epoch": 0.6532391402374869, "flos": 29563530912000.0, "grad_norm": 570.0223015947254, "language_loss": 0.93262708, "learning_rate": 1.1340626243613458e-06, "loss": 0.94781685, "num_input_tokens_seen": 234665470, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25244141, "step": 10865, "time_per_iteration": 4.09613561630249 }, { "auxiliary_loss_clip": 0.01256346, "auxiliary_loss_mlp": 0.00274787, "balance_loss_clip": 1.04206967, "balance_loss_mlp": 0.24989566, "epoch": 0.6532992634901548, "flos": 23105463920640.0, "grad_norm": 14.008411349440369, "language_loss": 0.89589578, "learning_rate": 1.133711576532051e-06, "loss": 0.9112072, "num_input_tokens_seen": 234683955, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24853516, "step": 10866, "time_per_iteration": 4.1007304191589355 }, { "auxiliary_loss_clip": 0.01261706, "auxiliary_loss_mlp": 0.00277374, "balance_loss_clip": 1.04998112, "balance_loss_mlp": 0.25403258, "epoch": 0.6533593867428228, "flos": 26067340523520.0, "grad_norm": 7.596231931343413, "language_loss": 0.87043715, "learning_rate": 1.1333605615524995e-06, "loss": 0.8858279, "num_input_tokens_seen": 234704595, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.23352051, "step": 10867, "time_per_iteration": 2.7413864135742188 }, { "auxiliary_loss_clip": 0.01274013, "auxiliary_loss_mlp": 0.00253397, "balance_loss_clip": 1.05289078, "balance_loss_mlp": 0.22787458, "epoch": 0.6534195099954908, "flos": 21212469262080.0, "grad_norm": 3.624982020097747, "language_loss": 0.91373181, "learning_rate": 1.1330095794360016e-06, "loss": 0.92900598, "num_input_tokens_seen": 234724090, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25524902, "step": 10868, "time_per_iteration": 2.6586413383483887 }, { "auxiliary_loss_clip": 0.01275886, "auxiliary_loss_mlp": 0.00264243, "balance_loss_clip": 1.05205798, "balance_loss_mlp": 0.23773122, "epoch": 0.6534796332481587, "flos": 19646584784640.0, "grad_norm": 101.50793370570032, "language_loss": 0.89552003, "learning_rate": 1.1326586301958675e-06, "loss": 0.91092134, "num_input_tokens_seen": 234742560, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26525879, "step": 10869, "time_per_iteration": 2.7347662448883057 }, { "auxiliary_loss_clip": 0.01278368, "auxiliary_loss_mlp": 0.0023664, "balance_loss_clip": 1.05649662, "balance_loss_mlp": 0.21160582, "epoch": 0.6535397565008267, "flos": 24022479162240.0, "grad_norm": 11.191159583769872, "language_loss": 0.80640614, "learning_rate": 1.1323077138454063e-06, "loss": 0.82155621, "num_input_tokens_seen": 234762315, "router_z_loss_clip": 2.21777344, "router_z_loss_mlp": 0.25036621, "step": 10870, "time_per_iteration": 4.124929666519165 }, { "auxiliary_loss_clip": 0.01282197, "auxiliary_loss_mlp": 0.00244609, "balance_loss_clip": 1.06227899, "balance_loss_mlp": 0.21889579, "epoch": 0.6535998797534947, "flos": 24602759377920.0, "grad_norm": 15.031720567961912, "language_loss": 0.83353996, "learning_rate": 1.1319568303979221e-06, "loss": 0.84880805, "num_input_tokens_seen": 234781300, "router_z_loss_clip": 2.19824219, "router_z_loss_mlp": 0.25732422, "step": 10871, "time_per_iteration": 2.761345386505127 }, { "auxiliary_loss_clip": 0.01262372, "auxiliary_loss_mlp": 0.00248098, "balance_loss_clip": 1.04933739, "balance_loss_mlp": 0.22269417, "epoch": 0.6536600030061627, "flos": 23364164649600.0, "grad_norm": 6.777374970670331, "language_loss": 0.63868195, "learning_rate": 1.1316059798667227e-06, "loss": 0.6537866, "num_input_tokens_seen": 234801040, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.25402832, "step": 10872, "time_per_iteration": 2.723134756088257 }, { "auxiliary_loss_clip": 0.01294994, "auxiliary_loss_mlp": 0.00231035, "balance_loss_clip": 1.07190061, "balance_loss_mlp": 0.20765772, "epoch": 0.6537201262588306, "flos": 23878477537920.0, "grad_norm": 151.6286478481261, "language_loss": 0.81741726, "learning_rate": 1.1312551622651112e-06, "loss": 0.8326776, "num_input_tokens_seen": 234821415, "router_z_loss_clip": 2.23144531, "router_z_loss_mlp": 0.23364258, "step": 10873, "time_per_iteration": 2.735045909881592 }, { "auxiliary_loss_clip": 0.0127064, "auxiliary_loss_mlp": 0.00252806, "balance_loss_clip": 1.05174458, "balance_loss_mlp": 0.22710408, "epoch": 0.6537802495114986, "flos": 24354760901760.0, "grad_norm": 3.4160412792141113, "language_loss": 0.82272679, "learning_rate": 1.1309043776063917e-06, "loss": 0.83796126, "num_input_tokens_seen": 234843795, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25695801, "step": 10874, "time_per_iteration": 4.147740602493286 }, { "auxiliary_loss_clip": 0.01304264, "auxiliary_loss_mlp": 0.00238089, "balance_loss_clip": 1.07497644, "balance_loss_mlp": 0.21326992, "epoch": 0.6538403727641665, "flos": 27996892248960.0, "grad_norm": 18.202891541769056, "language_loss": 0.87526995, "learning_rate": 1.1305536259038642e-06, "loss": 0.89069349, "num_input_tokens_seen": 234862350, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.24816895, "step": 10875, "time_per_iteration": 2.755470037460327 }, { "auxiliary_loss_clip": 0.0127696, "auxiliary_loss_mlp": 0.00235264, "balance_loss_clip": 1.05871844, "balance_loss_mlp": 0.21058796, "epoch": 0.6539004960168345, "flos": 27563594486400.0, "grad_norm": 7.6632994093100315, "language_loss": 0.76511437, "learning_rate": 1.1302029071708314e-06, "loss": 0.7802366, "num_input_tokens_seen": 234881790, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24694824, "step": 10876, "time_per_iteration": 2.7656705379486084 }, { "auxiliary_loss_clip": 0.01306393, "auxiliary_loss_mlp": 0.00251464, "balance_loss_clip": 1.07597184, "balance_loss_mlp": 0.22547595, "epoch": 0.6539606192695024, "flos": 14530067879040.0, "grad_norm": 32.326042237307156, "language_loss": 0.871952, "learning_rate": 1.1298522214205908e-06, "loss": 0.88753057, "num_input_tokens_seen": 234897775, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.2598877, "step": 10877, "time_per_iteration": 2.753566265106201 }, { "auxiliary_loss_clip": 0.01295032, "auxiliary_loss_mlp": 0.00260711, "balance_loss_clip": 1.07344127, "balance_loss_mlp": 0.23707192, "epoch": 0.6540207425221705, "flos": 21616356764160.0, "grad_norm": 24.19132912524503, "language_loss": 0.88866556, "learning_rate": 1.1295015686664408e-06, "loss": 0.90422308, "num_input_tokens_seen": 234918395, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.2364502, "step": 10878, "time_per_iteration": 2.6627676486968994 }, { "auxiliary_loss_clip": 0.0129288, "auxiliary_loss_mlp": 0.00250057, "balance_loss_clip": 1.06390071, "balance_loss_mlp": 0.22367632, "epoch": 0.6540808657748384, "flos": 17668983640320.0, "grad_norm": 6.753474428707174, "language_loss": 0.93942773, "learning_rate": 1.1291509489216797e-06, "loss": 0.95485705, "num_input_tokens_seen": 234936260, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.26416016, "step": 10879, "time_per_iteration": 2.7481799125671387 }, { "auxiliary_loss_clip": 0.01284997, "auxiliary_loss_mlp": 0.0026615, "balance_loss_clip": 1.05776119, "balance_loss_mlp": 0.23987672, "epoch": 0.6541409890275064, "flos": 14538292093440.0, "grad_norm": 39.384032894606925, "language_loss": 0.81656718, "learning_rate": 1.128800362199601e-06, "loss": 0.83207864, "num_input_tokens_seen": 234952110, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.26269531, "step": 10880, "time_per_iteration": 2.6587209701538086 }, { "auxiliary_loss_clip": 0.0126964, "auxiliary_loss_mlp": 0.00241536, "balance_loss_clip": 1.05515063, "balance_loss_mlp": 0.21813522, "epoch": 0.6542011122801744, "flos": 17165301177600.0, "grad_norm": 28.87966381069261, "language_loss": 0.92245233, "learning_rate": 1.1284498085135005e-06, "loss": 0.93756407, "num_input_tokens_seen": 234970810, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.23425293, "step": 10881, "time_per_iteration": 2.681338310241699 }, { "auxiliary_loss_clip": 0.01263386, "auxiliary_loss_mlp": 0.00249294, "balance_loss_clip": 1.04325151, "balance_loss_mlp": 0.2252848, "epoch": 0.6542612355328423, "flos": 18186600579840.0, "grad_norm": 2.4609817992105563, "language_loss": 0.86997545, "learning_rate": 1.1280992878766699e-06, "loss": 0.88510221, "num_input_tokens_seen": 234989565, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.24023438, "step": 10882, "time_per_iteration": 2.6458663940429688 }, { "auxiliary_loss_clip": 0.01295012, "auxiliary_loss_mlp": 0.00246287, "balance_loss_clip": 1.0626266, "balance_loss_mlp": 0.221861, "epoch": 0.6543213587855103, "flos": 19792453916160.0, "grad_norm": 14.450162427544912, "language_loss": 0.88762236, "learning_rate": 1.1277488003024024e-06, "loss": 0.90303534, "num_input_tokens_seen": 235007955, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.24450684, "step": 10883, "time_per_iteration": 2.721550226211548 }, { "auxiliary_loss_clip": 0.01290626, "auxiliary_loss_mlp": 0.00243798, "balance_loss_clip": 1.06385863, "balance_loss_mlp": 0.22017065, "epoch": 0.6543814820381783, "flos": 21105096531840.0, "grad_norm": 11.487302224720281, "language_loss": 0.96227038, "learning_rate": 1.127398345803988e-06, "loss": 0.97761464, "num_input_tokens_seen": 235024860, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.2364502, "step": 10884, "time_per_iteration": 2.6525158882141113 }, { "auxiliary_loss_clip": 0.01312682, "auxiliary_loss_mlp": 0.00244969, "balance_loss_clip": 1.07974529, "balance_loss_mlp": 0.21997052, "epoch": 0.6544416052908463, "flos": 20194042947840.0, "grad_norm": 172.97193056660404, "language_loss": 0.93572617, "learning_rate": 1.127047924394715e-06, "loss": 0.95130265, "num_input_tokens_seen": 235043815, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.25, "step": 10885, "time_per_iteration": 2.689781427383423 }, { "auxiliary_loss_clip": 0.0128043, "auxiliary_loss_mlp": 0.00236048, "balance_loss_clip": 1.05666947, "balance_loss_mlp": 0.21207532, "epoch": 0.6545017285435142, "flos": 23368258800000.0, "grad_norm": 10.207961806352907, "language_loss": 0.81304878, "learning_rate": 1.1266975360878722e-06, "loss": 0.82821357, "num_input_tokens_seen": 235062985, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.23986816, "step": 10886, "time_per_iteration": 2.6667215824127197 }, { "auxiliary_loss_clip": 0.01264493, "auxiliary_loss_mlp": 0.0021361, "balance_loss_clip": 1.04882336, "balance_loss_mlp": 0.1903169, "epoch": 0.6545618517961822, "flos": 19134714021120.0, "grad_norm": 33.35775084237621, "language_loss": 0.84717309, "learning_rate": 1.1263471808967468e-06, "loss": 0.86195409, "num_input_tokens_seen": 235081670, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.23303223, "step": 10887, "time_per_iteration": 2.68330454826355 }, { "auxiliary_loss_clip": 0.01271209, "auxiliary_loss_mlp": 0.0025897, "balance_loss_clip": 1.05113912, "balance_loss_mlp": 0.23471075, "epoch": 0.6546219750488501, "flos": 14938624149120.0, "grad_norm": 12.55933496050534, "language_loss": 0.87230873, "learning_rate": 1.1259968588346234e-06, "loss": 0.88761055, "num_input_tokens_seen": 235098510, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24267578, "step": 10888, "time_per_iteration": 2.589668035507202 }, { "auxiliary_loss_clip": 0.01270934, "auxiliary_loss_mlp": 0.00252172, "balance_loss_clip": 1.05471718, "balance_loss_mlp": 0.2292596, "epoch": 0.6546820983015181, "flos": 36320518886400.0, "grad_norm": 52.45069538924412, "language_loss": 0.73613513, "learning_rate": 1.1256465699147874e-06, "loss": 0.75136614, "num_input_tokens_seen": 235119990, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.22912598, "step": 10889, "time_per_iteration": 2.8222599029541016 }, { "auxiliary_loss_clip": 0.01285453, "auxiliary_loss_mlp": 0.00259404, "balance_loss_clip": 1.05499232, "balance_loss_mlp": 0.23199777, "epoch": 0.654742221554186, "flos": 20411446014720.0, "grad_norm": 2.48036852256772, "language_loss": 0.86782783, "learning_rate": 1.1252963141505203e-06, "loss": 0.8832764, "num_input_tokens_seen": 235139255, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.27416992, "step": 10890, "time_per_iteration": 2.6636970043182373 }, { "auxiliary_loss_clip": 0.01280255, "auxiliary_loss_mlp": 0.00275465, "balance_loss_clip": 1.0580585, "balance_loss_mlp": 0.24950102, "epoch": 0.6548023448068541, "flos": 24863650836480.0, "grad_norm": 28.147015632290092, "language_loss": 0.76915932, "learning_rate": 1.1249460915551052e-06, "loss": 0.78471649, "num_input_tokens_seen": 235158455, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.2598877, "step": 10891, "time_per_iteration": 2.6773128509521484 }, { "auxiliary_loss_clip": 0.01268136, "auxiliary_loss_mlp": 0.00221296, "balance_loss_clip": 1.05103266, "balance_loss_mlp": 0.19894443, "epoch": 0.654862468059522, "flos": 21427573858560.0, "grad_norm": 130.20932052948413, "language_loss": 0.86174089, "learning_rate": 1.1245959021418214e-06, "loss": 0.87663519, "num_input_tokens_seen": 235177350, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.22363281, "step": 10892, "time_per_iteration": 2.68110990524292 }, { "auxiliary_loss_clip": 0.01285467, "auxiliary_loss_mlp": 0.00249194, "balance_loss_clip": 1.06006312, "balance_loss_mlp": 0.22518542, "epoch": 0.65492259131219, "flos": 26577846570240.0, "grad_norm": 9.075569114855618, "language_loss": 0.86423004, "learning_rate": 1.1242457459239497e-06, "loss": 0.87957662, "num_input_tokens_seen": 235196435, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.2401123, "step": 10893, "time_per_iteration": 2.7410171031951904 }, { "auxiliary_loss_clip": 0.01282963, "auxiliary_loss_mlp": 0.0021491, "balance_loss_clip": 1.05641222, "balance_loss_mlp": 0.19110423, "epoch": 0.6549827145648579, "flos": 21501334437120.0, "grad_norm": 7.9243298832215885, "language_loss": 0.7744534, "learning_rate": 1.123895622914766e-06, "loss": 0.78943217, "num_input_tokens_seen": 235215430, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.23791504, "step": 10894, "time_per_iteration": 2.66680645942688 }, { "auxiliary_loss_clip": 0.01287132, "auxiliary_loss_mlp": 0.00218249, "balance_loss_clip": 1.05959558, "balance_loss_mlp": 0.19332188, "epoch": 0.6550428378175259, "flos": 22594275515520.0, "grad_norm": 207.53590672908507, "language_loss": 0.76451582, "learning_rate": 1.123545533127549e-06, "loss": 0.77956963, "num_input_tokens_seen": 235232015, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.24938965, "step": 10895, "time_per_iteration": 2.694896936416626 }, { "auxiliary_loss_clip": 0.01267342, "auxiliary_loss_mlp": 0.00208987, "balance_loss_clip": 1.05093527, "balance_loss_mlp": 0.18551511, "epoch": 0.655102961070194, "flos": 12823809050880.0, "grad_norm": 46.72849748378113, "language_loss": 0.85037613, "learning_rate": 1.1231954765755722e-06, "loss": 0.86513948, "num_input_tokens_seen": 235248115, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.23474121, "step": 10896, "time_per_iteration": 2.631767511367798 }, { "auxiliary_loss_clip": 0.01262139, "auxiliary_loss_mlp": 0.00217296, "balance_loss_clip": 1.04549205, "balance_loss_mlp": 0.19359674, "epoch": 0.6551630843228619, "flos": 24791075406720.0, "grad_norm": 4.185231757657812, "language_loss": 0.76614445, "learning_rate": 1.1228454532721111e-06, "loss": 0.78093874, "num_input_tokens_seen": 235270785, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.23706055, "step": 10897, "time_per_iteration": 2.754528045654297 }, { "auxiliary_loss_clip": 0.01273615, "auxiliary_loss_mlp": 0.00234587, "balance_loss_clip": 1.05341506, "balance_loss_mlp": 0.21110269, "epoch": 0.6552232075755299, "flos": 16724461559040.0, "grad_norm": 186.3534395257103, "language_loss": 0.82690084, "learning_rate": 1.1224954632304391e-06, "loss": 0.8419829, "num_input_tokens_seen": 235287905, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.23474121, "step": 10898, "time_per_iteration": 2.692232370376587 }, { "auxiliary_loss_clip": 0.01283559, "auxiliary_loss_mlp": 0.00224523, "balance_loss_clip": 1.06355453, "balance_loss_mlp": 0.20069274, "epoch": 0.6552833308281978, "flos": 22016473338240.0, "grad_norm": 46.320242335365464, "language_loss": 0.84008509, "learning_rate": 1.122145506463827e-06, "loss": 0.85516596, "num_input_tokens_seen": 235305525, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.23852539, "step": 10899, "time_per_iteration": 2.677253246307373 }, { "auxiliary_loss_clip": 0.01274211, "auxiliary_loss_mlp": 0.00229432, "balance_loss_clip": 1.05045724, "balance_loss_mlp": 0.20594731, "epoch": 0.6553434540808658, "flos": 24863399441280.0, "grad_norm": 2.9410986951762164, "language_loss": 0.6429019, "learning_rate": 1.1217955829855443e-06, "loss": 0.6579383, "num_input_tokens_seen": 235324415, "router_z_loss_clip": 2.23730469, "router_z_loss_mlp": 0.23461914, "step": 10900, "time_per_iteration": 2.6710598468780518 }, { "auxiliary_loss_clip": 0.01283385, "auxiliary_loss_mlp": 0.00214801, "balance_loss_clip": 1.05804515, "balance_loss_mlp": 0.19100709, "epoch": 0.6554035773335337, "flos": 23221060865280.0, "grad_norm": 4.321381735483956, "language_loss": 0.83396643, "learning_rate": 1.1214456928088622e-06, "loss": 0.8489483, "num_input_tokens_seen": 235341595, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.23779297, "step": 10901, "time_per_iteration": 2.676638126373291 }, { "auxiliary_loss_clip": 0.01283699, "auxiliary_loss_mlp": 0.00200524, "balance_loss_clip": 1.05755949, "balance_loss_mlp": 0.17610994, "epoch": 0.6554637005862017, "flos": 22783597125120.0, "grad_norm": 7.4955332566519015, "language_loss": 0.809811, "learning_rate": 1.1210958359470463e-06, "loss": 0.82465327, "num_input_tokens_seen": 235361700, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.2442627, "step": 10902, "time_per_iteration": 2.6722092628479004 }, { "auxiliary_loss_clip": 0.01268227, "auxiliary_loss_mlp": 0.00192497, "balance_loss_clip": 1.0517993, "balance_loss_mlp": 0.16860729, "epoch": 0.6555238238388696, "flos": 21507224267520.0, "grad_norm": 4.506528977657844, "language_loss": 0.77858722, "learning_rate": 1.1207460124133645e-06, "loss": 0.79319453, "num_input_tokens_seen": 235382065, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.23876953, "step": 10903, "time_per_iteration": 2.7329742908477783 }, { "auxiliary_loss_clip": 0.01302214, "auxiliary_loss_mlp": 0.002142, "balance_loss_clip": 1.06954694, "balance_loss_mlp": 0.18977419, "epoch": 0.6555839470915377, "flos": 30519473518080.0, "grad_norm": 6.452241327104623, "language_loss": 0.76953816, "learning_rate": 1.1203962222210832e-06, "loss": 0.7847023, "num_input_tokens_seen": 235402130, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.2442627, "step": 10904, "time_per_iteration": 2.710002899169922 }, { "auxiliary_loss_clip": 0.01289509, "auxiliary_loss_mlp": 0.00243696, "balance_loss_clip": 1.06266308, "balance_loss_mlp": 0.2180897, "epoch": 0.6556440703442056, "flos": 24642943718400.0, "grad_norm": 4.0679568045962835, "language_loss": 0.96902895, "learning_rate": 1.120046465383464e-06, "loss": 0.98436099, "num_input_tokens_seen": 235420435, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.25610352, "step": 10905, "time_per_iteration": 2.7390940189361572 }, { "auxiliary_loss_clip": 0.01289449, "auxiliary_loss_mlp": 0.00209828, "balance_loss_clip": 1.06724226, "balance_loss_mlp": 0.18564025, "epoch": 0.6557041935968736, "flos": 23732464752000.0, "grad_norm": 5.2061903983510875, "language_loss": 0.84607613, "learning_rate": 1.1196967419137721e-06, "loss": 0.86106884, "num_input_tokens_seen": 235439960, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.24206543, "step": 10906, "time_per_iteration": 2.6646931171417236 }, { "auxiliary_loss_clip": 0.01284026, "auxiliary_loss_mlp": 0.00221858, "balance_loss_clip": 1.05888224, "balance_loss_mlp": 0.19727743, "epoch": 0.6557643168495415, "flos": 11102753819520.0, "grad_norm": 4.98764644262817, "language_loss": 0.88555413, "learning_rate": 1.119347051825267e-06, "loss": 0.90061301, "num_input_tokens_seen": 235457495, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.24584961, "step": 10907, "time_per_iteration": 2.655547857284546 }, { "auxiliary_loss_clip": 0.01288601, "auxiliary_loss_mlp": 0.00218752, "balance_loss_clip": 1.06228352, "balance_loss_mlp": 0.19374156, "epoch": 0.6558244401022095, "flos": 30191034533760.0, "grad_norm": 4.240968373265685, "language_loss": 0.80724388, "learning_rate": 1.118997395131211e-06, "loss": 0.82231736, "num_input_tokens_seen": 235479525, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.25, "step": 10908, "time_per_iteration": 4.108297824859619 }, { "auxiliary_loss_clip": 0.012895, "auxiliary_loss_mlp": 0.00216868, "balance_loss_clip": 1.06229877, "balance_loss_mlp": 0.19263314, "epoch": 0.6558845633548775, "flos": 17931060247680.0, "grad_norm": 131.77238661328542, "language_loss": 0.91522723, "learning_rate": 1.118647771844861e-06, "loss": 0.93029094, "num_input_tokens_seen": 235496305, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.24243164, "step": 10909, "time_per_iteration": 4.07070779800415 }, { "auxiliary_loss_clip": 0.01297113, "auxiliary_loss_mlp": 0.00215572, "balance_loss_clip": 1.06669688, "balance_loss_mlp": 0.19088383, "epoch": 0.6559446866075455, "flos": 21904144531200.0, "grad_norm": 18.387072844437405, "language_loss": 0.75924754, "learning_rate": 1.1182981819794767e-06, "loss": 0.77437437, "num_input_tokens_seen": 235512545, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.24682617, "step": 10910, "time_per_iteration": 2.723598003387451 }, { "auxiliary_loss_clip": 0.01300166, "auxiliary_loss_mlp": 0.00202458, "balance_loss_clip": 1.06556511, "balance_loss_mlp": 0.17687619, "epoch": 0.6560048098602135, "flos": 14127976056960.0, "grad_norm": 38.10937597344694, "language_loss": 0.91661435, "learning_rate": 1.117948625548313e-06, "loss": 0.93164062, "num_input_tokens_seen": 235526045, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.25561523, "step": 10911, "time_per_iteration": 2.6993765830993652 }, { "auxiliary_loss_clip": 0.0126964, "auxiliary_loss_mlp": 0.00215191, "balance_loss_clip": 1.05499494, "balance_loss_mlp": 0.19083694, "epoch": 0.6560649331128814, "flos": 18807567926400.0, "grad_norm": 3.014057129800051, "language_loss": 0.81293213, "learning_rate": 1.1175991025646265e-06, "loss": 0.82778049, "num_input_tokens_seen": 235545285, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24365234, "step": 10912, "time_per_iteration": 4.177293539047241 }, { "auxiliary_loss_clip": 0.01318694, "auxiliary_loss_mlp": 0.00241726, "balance_loss_clip": 1.0843178, "balance_loss_mlp": 0.21490347, "epoch": 0.6561250563655494, "flos": 17053618815360.0, "grad_norm": 9.418710906181504, "language_loss": 0.87184262, "learning_rate": 1.1172496130416697e-06, "loss": 0.88744682, "num_input_tokens_seen": 235563150, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.26855469, "step": 10913, "time_per_iteration": 2.6818864345550537 }, { "auxiliary_loss_clip": 0.01270391, "auxiliary_loss_mlp": 0.00195331, "balance_loss_clip": 1.05455065, "balance_loss_mlp": 0.17121519, "epoch": 0.6561851796182173, "flos": 22637656166400.0, "grad_norm": 14.940546540433687, "language_loss": 0.80994129, "learning_rate": 1.1169001569926961e-06, "loss": 0.82459843, "num_input_tokens_seen": 235582535, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.24121094, "step": 10914, "time_per_iteration": 2.665663003921509 }, { "auxiliary_loss_clip": 0.01302609, "auxiliary_loss_mlp": 0.00213902, "balance_loss_clip": 1.07183075, "balance_loss_mlp": 0.18870132, "epoch": 0.6562453028708853, "flos": 19239213663360.0, "grad_norm": 5.80505734613378, "language_loss": 0.81891096, "learning_rate": 1.116550734430958e-06, "loss": 0.83407611, "num_input_tokens_seen": 235601490, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.25219727, "step": 10915, "time_per_iteration": 2.655879259109497 }, { "auxiliary_loss_clip": 0.01309884, "auxiliary_loss_mlp": 0.0023805, "balance_loss_clip": 1.08088863, "balance_loss_mlp": 0.21052468, "epoch": 0.6563054261235532, "flos": 23801305167360.0, "grad_norm": 156.76837143944917, "language_loss": 0.84858847, "learning_rate": 1.1162013453697042e-06, "loss": 0.86406779, "num_input_tokens_seen": 235619165, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.2755127, "step": 10916, "time_per_iteration": 2.6404120922088623 }, { "auxiliary_loss_clip": 0.01286008, "auxiliary_loss_mlp": 0.0019984, "balance_loss_clip": 1.06224084, "balance_loss_mlp": 0.17576019, "epoch": 0.6563655493762213, "flos": 19240039676160.0, "grad_norm": 47.968771543372576, "language_loss": 0.82472956, "learning_rate": 1.1158519898221831e-06, "loss": 0.83958805, "num_input_tokens_seen": 235637115, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.24084473, "step": 10917, "time_per_iteration": 4.133657217025757 }, { "auxiliary_loss_clip": 0.01301438, "auxiliary_loss_mlp": 0.00203818, "balance_loss_clip": 1.07371092, "balance_loss_mlp": 0.1782712, "epoch": 0.6564256726288892, "flos": 25556439427200.0, "grad_norm": 36.690825657906665, "language_loss": 0.81118125, "learning_rate": 1.1155026678016445e-06, "loss": 0.82623386, "num_input_tokens_seen": 235656330, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.25537109, "step": 10918, "time_per_iteration": 2.709566354751587 }, { "auxiliary_loss_clip": 0.01295058, "auxiliary_loss_mlp": 0.00198213, "balance_loss_clip": 1.07504964, "balance_loss_mlp": 0.17547995, "epoch": 0.6564857958815572, "flos": 22200623389440.0, "grad_norm": 12.956843602357612, "language_loss": 0.81405604, "learning_rate": 1.115153379321332e-06, "loss": 0.82898879, "num_input_tokens_seen": 235674510, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.22753906, "step": 10919, "time_per_iteration": 2.6647236347198486 }, { "auxiliary_loss_clip": 0.01202549, "auxiliary_loss_mlp": 0.00082534, "balance_loss_clip": 1.06641889, "balance_loss_mlp": 0.07452312, "epoch": 0.6565459191342251, "flos": 58123144604160.0, "grad_norm": 0.7187803542447121, "language_loss": 0.52438414, "learning_rate": 1.1148041243944931e-06, "loss": 0.5372349, "num_input_tokens_seen": 235735050, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.08007812, "step": 10920, "time_per_iteration": 3.1762983798980713 }, { "auxiliary_loss_clip": 0.01284827, "auxiliary_loss_mlp": 0.00215558, "balance_loss_clip": 1.06644917, "balance_loss_mlp": 0.19085824, "epoch": 0.6566060423868931, "flos": 30809631582720.0, "grad_norm": 2.3507194405737066, "language_loss": 0.7147544, "learning_rate": 1.1144549030343697e-06, "loss": 0.72975826, "num_input_tokens_seen": 235757545, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24707031, "step": 10921, "time_per_iteration": 2.7751264572143555 }, { "auxiliary_loss_clip": 0.01287271, "auxiliary_loss_mlp": 0.00204914, "balance_loss_clip": 1.06796837, "balance_loss_mlp": 0.17931968, "epoch": 0.6566661656395612, "flos": 23367432787200.0, "grad_norm": 74.50217093529716, "language_loss": 0.9043014, "learning_rate": 1.114105715254205e-06, "loss": 0.91922325, "num_input_tokens_seen": 235777265, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.25634766, "step": 10922, "time_per_iteration": 2.7428367137908936 }, { "auxiliary_loss_clip": 0.01292817, "auxiliary_loss_mlp": 0.00226402, "balance_loss_clip": 1.06719589, "balance_loss_mlp": 0.20261931, "epoch": 0.6567262888922291, "flos": 25735597488000.0, "grad_norm": 33.03753580718833, "language_loss": 0.81696653, "learning_rate": 1.1137565610672414e-06, "loss": 0.83215874, "num_input_tokens_seen": 235796565, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.2376709, "step": 10923, "time_per_iteration": 2.7667367458343506 }, { "auxiliary_loss_clip": 0.01299726, "auxiliary_loss_mlp": 0.00223101, "balance_loss_clip": 1.07447791, "balance_loss_mlp": 0.19819795, "epoch": 0.6567864121448971, "flos": 17123716206720.0, "grad_norm": 6.581455141610187, "language_loss": 0.88879943, "learning_rate": 1.1134074404867169e-06, "loss": 0.9040277, "num_input_tokens_seen": 235814805, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.24890137, "step": 10924, "time_per_iteration": 2.677077054977417 }, { "auxiliary_loss_clip": 0.01279698, "auxiliary_loss_mlp": 0.00216196, "balance_loss_clip": 1.06231809, "balance_loss_mlp": 0.1914244, "epoch": 0.656846535397565, "flos": 22419319345920.0, "grad_norm": 6.939945101015875, "language_loss": 0.80811834, "learning_rate": 1.1130583535258717e-06, "loss": 0.82307726, "num_input_tokens_seen": 235833405, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24780273, "step": 10925, "time_per_iteration": 2.759979248046875 }, { "auxiliary_loss_clip": 0.01285524, "auxiliary_loss_mlp": 0.00220033, "balance_loss_clip": 1.05921364, "balance_loss_mlp": 0.19676325, "epoch": 0.656906658650233, "flos": 17704535126400.0, "grad_norm": 5.988322027613412, "language_loss": 0.81441486, "learning_rate": 1.112709300197942e-06, "loss": 0.82947046, "num_input_tokens_seen": 235848530, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.23254395, "step": 10926, "time_per_iteration": 2.631743907928467 }, { "auxiliary_loss_clip": 0.01312846, "auxiliary_loss_mlp": 0.00234734, "balance_loss_clip": 1.07759929, "balance_loss_mlp": 0.20819815, "epoch": 0.6569667819029009, "flos": 21175158009600.0, "grad_norm": 25.39745908411096, "language_loss": 0.79045188, "learning_rate": 1.1123602805161656e-06, "loss": 0.80592763, "num_input_tokens_seen": 235867225, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.26574707, "step": 10927, "time_per_iteration": 2.707012414932251 }, { "auxiliary_loss_clip": 0.01215989, "auxiliary_loss_mlp": 0.0011259, "balance_loss_clip": 1.07749724, "balance_loss_mlp": 0.10472205, "epoch": 0.6570269051555689, "flos": 68761897511040.0, "grad_norm": 0.7912194219971501, "language_loss": 0.63856637, "learning_rate": 1.112011294493775e-06, "loss": 0.65185213, "num_input_tokens_seen": 235932925, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.07861328, "step": 10928, "time_per_iteration": 3.141432046890259 }, { "auxiliary_loss_clip": 0.01285861, "auxiliary_loss_mlp": 0.00221001, "balance_loss_clip": 1.06227183, "balance_loss_mlp": 0.19708753, "epoch": 0.6570870284082369, "flos": 26319289495680.0, "grad_norm": 9.139986345643585, "language_loss": 0.82578886, "learning_rate": 1.1116623421440063e-06, "loss": 0.84085751, "num_input_tokens_seen": 235952680, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.23913574, "step": 10929, "time_per_iteration": 2.887753486633301 }, { "auxiliary_loss_clip": 0.01320132, "auxiliary_loss_mlp": 0.00238939, "balance_loss_clip": 1.08936954, "balance_loss_mlp": 0.21133026, "epoch": 0.6571471516609049, "flos": 26174749167360.0, "grad_norm": 5.154416789691215, "language_loss": 0.75122964, "learning_rate": 1.1113134234800895e-06, "loss": 0.76682037, "num_input_tokens_seen": 235972075, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.27587891, "step": 10930, "time_per_iteration": 2.7492318153381348 }, { "auxiliary_loss_clip": 0.01300785, "auxiliary_loss_mlp": 0.00224395, "balance_loss_clip": 1.06822634, "balance_loss_mlp": 0.19953945, "epoch": 0.6572072749135728, "flos": 20376253664640.0, "grad_norm": 21.246466224239313, "language_loss": 0.80233473, "learning_rate": 1.110964538515258e-06, "loss": 0.81758648, "num_input_tokens_seen": 235990340, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.2487793, "step": 10931, "time_per_iteration": 2.7322959899902344 }, { "auxiliary_loss_clip": 0.01300482, "auxiliary_loss_mlp": 0.00213414, "balance_loss_clip": 1.07229435, "balance_loss_mlp": 0.18883292, "epoch": 0.6572673981662408, "flos": 17128744110720.0, "grad_norm": 29.4599427204182, "language_loss": 0.78084165, "learning_rate": 1.1106156872627393e-06, "loss": 0.79598057, "num_input_tokens_seen": 236007470, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.24584961, "step": 10932, "time_per_iteration": 2.7350847721099854 }, { "auxiliary_loss_clip": 0.01308846, "auxiliary_loss_mlp": 0.0023622, "balance_loss_clip": 1.07833552, "balance_loss_mlp": 0.21085247, "epoch": 0.6573275214189087, "flos": 41275113281280.0, "grad_norm": 13.417427592693649, "language_loss": 0.88223577, "learning_rate": 1.1102668697357626e-06, "loss": 0.89768642, "num_input_tokens_seen": 236029030, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.25378418, "step": 10933, "time_per_iteration": 2.8791182041168213 }, { "auxiliary_loss_clip": 0.01315128, "auxiliary_loss_mlp": 0.00238117, "balance_loss_clip": 1.07944679, "balance_loss_mlp": 0.21111655, "epoch": 0.6573876446715767, "flos": 22890143842560.0, "grad_norm": 5.3218745488158286, "language_loss": 0.82210588, "learning_rate": 1.1099180859475571e-06, "loss": 0.83763832, "num_input_tokens_seen": 236047160, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.26989746, "step": 10934, "time_per_iteration": 2.6923482418060303 }, { "auxiliary_loss_clip": 0.01289023, "auxiliary_loss_mlp": 0.0024096, "balance_loss_clip": 1.06614304, "balance_loss_mlp": 0.21597332, "epoch": 0.6574477679242448, "flos": 44018150273280.0, "grad_norm": 8387.726704944685, "language_loss": 0.82980549, "learning_rate": 1.1095693359113454e-06, "loss": 0.84510541, "num_input_tokens_seen": 236069215, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.24987793, "step": 10935, "time_per_iteration": 2.8770923614501953 }, { "auxiliary_loss_clip": 0.01312224, "auxiliary_loss_mlp": 0.00237899, "balance_loss_clip": 1.07944572, "balance_loss_mlp": 0.2106479, "epoch": 0.6575078911769127, "flos": 24571517523840.0, "grad_norm": 140.56214639922322, "language_loss": 0.88265508, "learning_rate": 1.1092206196403538e-06, "loss": 0.89815629, "num_input_tokens_seen": 236088335, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.27258301, "step": 10936, "time_per_iteration": 2.714184045791626 }, { "auxiliary_loss_clip": 0.01310243, "auxiliary_loss_mlp": 0.0021881, "balance_loss_clip": 1.07775915, "balance_loss_mlp": 0.19292969, "epoch": 0.6575680144295807, "flos": 20924035050240.0, "grad_norm": 22.389530552888452, "language_loss": 0.76574105, "learning_rate": 1.1088719371478056e-06, "loss": 0.78103155, "num_input_tokens_seen": 236108540, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.25891113, "step": 10937, "time_per_iteration": 2.698570966720581 }, { "auxiliary_loss_clip": 0.01311373, "auxiliary_loss_mlp": 0.0024463, "balance_loss_clip": 1.07815361, "balance_loss_mlp": 0.21883342, "epoch": 0.6576281376822486, "flos": 10925642833920.0, "grad_norm": 700.1767097064842, "language_loss": 0.81609976, "learning_rate": 1.1085232884469236e-06, "loss": 0.83165979, "num_input_tokens_seen": 236124495, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.25817871, "step": 10938, "time_per_iteration": 2.6148123741149902 }, { "auxiliary_loss_clip": 0.01304209, "auxiliary_loss_mlp": 0.00236436, "balance_loss_clip": 1.07250428, "balance_loss_mlp": 0.21241572, "epoch": 0.6576882609349166, "flos": 19281552819840.0, "grad_norm": 15.077558527055661, "language_loss": 0.83161622, "learning_rate": 1.108174673550927e-06, "loss": 0.84702265, "num_input_tokens_seen": 236142550, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.2401123, "step": 10939, "time_per_iteration": 2.6682143211364746 }, { "auxiliary_loss_clip": 0.01327751, "auxiliary_loss_mlp": 0.00231306, "balance_loss_clip": 1.08482003, "balance_loss_mlp": 0.20424551, "epoch": 0.6577483841875845, "flos": 20220544206720.0, "grad_norm": 22.213872852261165, "language_loss": 0.91226584, "learning_rate": 1.107826092473037e-06, "loss": 0.92785645, "num_input_tokens_seen": 236156620, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.27050781, "step": 10940, "time_per_iteration": 2.649789333343506 }, { "auxiliary_loss_clip": 0.01304211, "auxiliary_loss_mlp": 0.00244998, "balance_loss_clip": 1.07020426, "balance_loss_mlp": 0.21819958, "epoch": 0.6578085074402525, "flos": 34751078962560.0, "grad_norm": 496.7056901195356, "language_loss": 0.76998961, "learning_rate": 1.107477545226471e-06, "loss": 0.78548175, "num_input_tokens_seen": 236177095, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.26794434, "step": 10941, "time_per_iteration": 2.7449278831481934 }, { "auxiliary_loss_clip": 0.01305738, "auxiliary_loss_mlp": 0.00243476, "balance_loss_clip": 1.07374656, "balance_loss_mlp": 0.21797673, "epoch": 0.6578686306929205, "flos": 23470998675840.0, "grad_norm": 100.44742258074554, "language_loss": 0.77556753, "learning_rate": 1.1071290318244448e-06, "loss": 0.79105967, "num_input_tokens_seen": 236194695, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.25476074, "step": 10942, "time_per_iteration": 2.651858329772949 }, { "auxiliary_loss_clip": 0.01337894, "auxiliary_loss_mlp": 0.00244135, "balance_loss_clip": 1.09581709, "balance_loss_mlp": 0.21752764, "epoch": 0.6579287539455885, "flos": 18077073033600.0, "grad_norm": 74.04648305235268, "language_loss": 0.80645269, "learning_rate": 1.1067805522801753e-06, "loss": 0.82227302, "num_input_tokens_seen": 236213885, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.26623535, "step": 10943, "time_per_iteration": 2.6836698055267334 }, { "auxiliary_loss_clip": 0.01304352, "auxiliary_loss_mlp": 0.00235115, "balance_loss_clip": 1.07166183, "balance_loss_mlp": 0.21095133, "epoch": 0.6579888771982564, "flos": 28661383900800.0, "grad_norm": 28.77079501849207, "language_loss": 0.64981973, "learning_rate": 1.1064321066068778e-06, "loss": 0.66521442, "num_input_tokens_seen": 236237315, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.24133301, "step": 10944, "time_per_iteration": 2.717470169067383 }, { "auxiliary_loss_clip": 0.01319699, "auxiliary_loss_mlp": 0.00224005, "balance_loss_clip": 1.08069968, "balance_loss_mlp": 0.19633655, "epoch": 0.6580490004509244, "flos": 25046543911680.0, "grad_norm": 2.7080146769026228, "language_loss": 0.81569117, "learning_rate": 1.1060836948177646e-06, "loss": 0.83112824, "num_input_tokens_seen": 236256345, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.27685547, "step": 10945, "time_per_iteration": 2.705613136291504 }, { "auxiliary_loss_clip": 0.01311821, "auxiliary_loss_mlp": 0.002293, "balance_loss_clip": 1.07361221, "balance_loss_mlp": 0.2041229, "epoch": 0.6581091237035923, "flos": 43508793461760.0, "grad_norm": 9.432859633658497, "language_loss": 0.76627636, "learning_rate": 1.105735316926046e-06, "loss": 0.78168762, "num_input_tokens_seen": 236281890, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.25170898, "step": 10946, "time_per_iteration": 2.874081611633301 }, { "auxiliary_loss_clip": 0.01296448, "auxiliary_loss_mlp": 0.0023116, "balance_loss_clip": 1.06893384, "balance_loss_mlp": 0.20548244, "epoch": 0.6581692469562603, "flos": 22415404763520.0, "grad_norm": 133.22929229123142, "language_loss": 0.89085436, "learning_rate": 1.105386972944934e-06, "loss": 0.90613043, "num_input_tokens_seen": 236298370, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.25683594, "step": 10947, "time_per_iteration": 2.6572890281677246 }, { "auxiliary_loss_clip": 0.01323954, "auxiliary_loss_mlp": 0.00226035, "balance_loss_clip": 1.08761859, "balance_loss_mlp": 0.20036954, "epoch": 0.6582293702089284, "flos": 24859772167680.0, "grad_norm": 16.568192679209684, "language_loss": 0.85295796, "learning_rate": 1.1050386628876385e-06, "loss": 0.86845791, "num_input_tokens_seen": 236317380, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.25695801, "step": 10948, "time_per_iteration": 2.6326494216918945 }, { "auxiliary_loss_clip": 0.01314524, "auxiliary_loss_mlp": 0.00242353, "balance_loss_clip": 1.07990599, "balance_loss_mlp": 0.21647251, "epoch": 0.6582894934615963, "flos": 23039676161280.0, "grad_norm": 58.691884632954086, "language_loss": 0.86490542, "learning_rate": 1.1046903867673655e-06, "loss": 0.88047421, "num_input_tokens_seen": 236336210, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.25854492, "step": 10949, "time_per_iteration": 2.654794216156006 }, { "auxiliary_loss_clip": 0.01257047, "auxiliary_loss_mlp": 0.00104372, "balance_loss_clip": 1.11039734, "balance_loss_mlp": 0.09693316, "epoch": 0.6583496167142643, "flos": 72551980978560.0, "grad_norm": 0.7145695678965415, "language_loss": 0.61218536, "learning_rate": 1.104342144597323e-06, "loss": 0.62579954, "num_input_tokens_seen": 236403090, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.07421875, "step": 10950, "time_per_iteration": 4.595803737640381 }, { "auxiliary_loss_clip": 0.01319523, "auxiliary_loss_mlp": 0.00213135, "balance_loss_clip": 1.08640945, "balance_loss_mlp": 0.18942389, "epoch": 0.6584097399669322, "flos": 13078846592640.0, "grad_norm": 12.776267337934986, "language_loss": 0.75873339, "learning_rate": 1.1039939363907178e-06, "loss": 0.77405989, "num_input_tokens_seen": 236420475, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.23730469, "step": 10951, "time_per_iteration": 4.06748366355896 }, { "auxiliary_loss_clip": 0.01308648, "auxiliary_loss_mlp": 0.00249415, "balance_loss_clip": 1.07509482, "balance_loss_mlp": 0.22402298, "epoch": 0.6584698632196002, "flos": 28693164458880.0, "grad_norm": 202.2166719986399, "language_loss": 0.82806444, "learning_rate": 1.1036457621607504e-06, "loss": 0.84364504, "num_input_tokens_seen": 236441915, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.25390625, "step": 10952, "time_per_iteration": 2.684678554534912 }, { "auxiliary_loss_clip": 0.01327985, "auxiliary_loss_mlp": 0.00253549, "balance_loss_clip": 1.08828282, "balance_loss_mlp": 0.22613122, "epoch": 0.6585299864722681, "flos": 14319272914560.0, "grad_norm": 13.356693260247768, "language_loss": 0.81189388, "learning_rate": 1.1032976219206257e-06, "loss": 0.8277092, "num_input_tokens_seen": 236460340, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.27404785, "step": 10953, "time_per_iteration": 2.67913818359375 }, { "auxiliary_loss_clip": 0.01315402, "auxiliary_loss_mlp": 0.00247293, "balance_loss_clip": 1.08243358, "balance_loss_mlp": 0.22149593, "epoch": 0.6585901097249361, "flos": 26797907243520.0, "grad_norm": 164.7818173694376, "language_loss": 0.86002386, "learning_rate": 1.102949515683546e-06, "loss": 0.87565082, "num_input_tokens_seen": 236478280, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.25817871, "step": 10954, "time_per_iteration": 4.184727191925049 }, { "auxiliary_loss_clip": 0.01327734, "auxiliary_loss_mlp": 0.00232521, "balance_loss_clip": 1.08555329, "balance_loss_mlp": 0.20565167, "epoch": 0.658650232977604, "flos": 18733124989440.0, "grad_norm": 9.719339438860779, "language_loss": 0.78728443, "learning_rate": 1.1026014434627096e-06, "loss": 0.80288696, "num_input_tokens_seen": 236493225, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.26855469, "step": 10955, "time_per_iteration": 2.6890621185302734 }, { "auxiliary_loss_clip": 0.01324585, "auxiliary_loss_mlp": 0.00237878, "balance_loss_clip": 1.08719659, "balance_loss_mlp": 0.21389294, "epoch": 0.6587103562302721, "flos": 24753440931840.0, "grad_norm": 6.486147543655871, "language_loss": 0.89267361, "learning_rate": 1.1022534052713172e-06, "loss": 0.90829831, "num_input_tokens_seen": 236514420, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.23974609, "step": 10956, "time_per_iteration": 2.80676007270813 }, { "auxiliary_loss_clip": 0.01303168, "auxiliary_loss_mlp": 0.0024208, "balance_loss_clip": 1.06881762, "balance_loss_mlp": 0.2168791, "epoch": 0.65877047948294, "flos": 22346133384960.0, "grad_norm": 15.33282434247926, "language_loss": 0.89873546, "learning_rate": 1.1019054011225648e-06, "loss": 0.91418791, "num_input_tokens_seen": 236532785, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.2520752, "step": 10957, "time_per_iteration": 2.695183515548706 }, { "auxiliary_loss_clip": 0.0130266, "auxiliary_loss_mlp": 0.00241369, "balance_loss_clip": 1.06982148, "balance_loss_mlp": 0.21563169, "epoch": 0.658830602735608, "flos": 45180542298240.0, "grad_norm": 40.4774447479131, "language_loss": 0.83112955, "learning_rate": 1.1015574310296506e-06, "loss": 0.8465699, "num_input_tokens_seen": 236553330, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.25756836, "step": 10958, "time_per_iteration": 2.8857944011688232 }, { "auxiliary_loss_clip": 0.01293729, "auxiliary_loss_mlp": 0.00239634, "balance_loss_clip": 1.06610072, "balance_loss_mlp": 0.21454012, "epoch": 0.6588907259882759, "flos": 19901622326400.0, "grad_norm": 17.23603380923493, "language_loss": 0.8231886, "learning_rate": 1.1012094950057678e-06, "loss": 0.83852226, "num_input_tokens_seen": 236572960, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.25109863, "step": 10959, "time_per_iteration": 4.007513046264648 }, { "auxiliary_loss_clip": 0.01296512, "auxiliary_loss_mlp": 0.00220527, "balance_loss_clip": 1.06243372, "balance_loss_mlp": 0.19618435, "epoch": 0.6589508492409439, "flos": 24133766474880.0, "grad_norm": 10.263285414051381, "language_loss": 0.71028233, "learning_rate": 1.1008615930641107e-06, "loss": 0.72545266, "num_input_tokens_seen": 236594090, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.2434082, "step": 10960, "time_per_iteration": 2.670806884765625 }, { "auxiliary_loss_clip": 0.01329313, "auxiliary_loss_mlp": 0.00236477, "balance_loss_clip": 1.08509994, "balance_loss_mlp": 0.20889214, "epoch": 0.659010972493612, "flos": 18222906251520.0, "grad_norm": 3.6560036581069273, "language_loss": 0.92299986, "learning_rate": 1.1005137252178734e-06, "loss": 0.93865776, "num_input_tokens_seen": 236610190, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.27624512, "step": 10961, "time_per_iteration": 2.6301345825195312 }, { "auxiliary_loss_clip": 0.0133678, "auxiliary_loss_mlp": 0.00227304, "balance_loss_clip": 1.09766269, "balance_loss_mlp": 0.20141193, "epoch": 0.6590710957462799, "flos": 27600007898880.0, "grad_norm": 17.381411880198446, "language_loss": 0.8267144, "learning_rate": 1.1001658914802453e-06, "loss": 0.84235525, "num_input_tokens_seen": 236631575, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.25878906, "step": 10962, "time_per_iteration": 2.684784173965454 }, { "auxiliary_loss_clip": 0.01308322, "auxiliary_loss_mlp": 0.00244281, "balance_loss_clip": 1.0752883, "balance_loss_mlp": 0.21997431, "epoch": 0.6591312189989479, "flos": 20302959962880.0, "grad_norm": 1467.8770176333844, "language_loss": 0.87297487, "learning_rate": 1.0998180918644165e-06, "loss": 0.88850093, "num_input_tokens_seen": 236649815, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.24328613, "step": 10963, "time_per_iteration": 2.6317331790924072 }, { "auxiliary_loss_clip": 0.01306148, "auxiliary_loss_mlp": 0.00234045, "balance_loss_clip": 1.07489681, "balance_loss_mlp": 0.2096073, "epoch": 0.6591913422516158, "flos": 12312943868160.0, "grad_norm": 3.8741011551764006, "language_loss": 0.87432832, "learning_rate": 1.0994703263835754e-06, "loss": 0.88973022, "num_input_tokens_seen": 236668335, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.24462891, "step": 10964, "time_per_iteration": 2.604593276977539 }, { "auxiliary_loss_clip": 0.01307227, "auxiliary_loss_mlp": 0.00268989, "balance_loss_clip": 1.07038546, "balance_loss_mlp": 0.24240568, "epoch": 0.6592514655042838, "flos": 25884591102720.0, "grad_norm": 28.073641793747406, "language_loss": 0.82838809, "learning_rate": 1.0991225950509106e-06, "loss": 0.8441503, "num_input_tokens_seen": 236688945, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.26586914, "step": 10965, "time_per_iteration": 2.6874279975891113 }, { "auxiliary_loss_clip": 0.01333385, "auxiliary_loss_mlp": 0.00249656, "balance_loss_clip": 1.08606446, "balance_loss_mlp": 0.22223793, "epoch": 0.6593115887569517, "flos": 14063624841600.0, "grad_norm": 131.684138310424, "language_loss": 0.84028006, "learning_rate": 1.0987748978796067e-06, "loss": 0.85611045, "num_input_tokens_seen": 236707055, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.27404785, "step": 10966, "time_per_iteration": 2.6165521144866943 }, { "auxiliary_loss_clip": 0.01309586, "auxiliary_loss_mlp": 0.00246039, "balance_loss_clip": 1.07396841, "balance_loss_mlp": 0.21963379, "epoch": 0.6593717120096197, "flos": 24717925359360.0, "grad_norm": 40.880823829445, "language_loss": 0.84490502, "learning_rate": 1.0984272348828487e-06, "loss": 0.86046124, "num_input_tokens_seen": 236725900, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.26379395, "step": 10967, "time_per_iteration": 2.7135274410247803 }, { "auxiliary_loss_clip": 0.01243186, "auxiliary_loss_mlp": 0.00114221, "balance_loss_clip": 1.09459949, "balance_loss_mlp": 0.1051136, "epoch": 0.6594318352622877, "flos": 55558083502080.0, "grad_norm": 1.8944556541337019, "language_loss": 0.47547817, "learning_rate": 1.0980796060738221e-06, "loss": 0.48905224, "num_input_tokens_seen": 236788415, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.09130859, "step": 10968, "time_per_iteration": 3.141249418258667 }, { "auxiliary_loss_clip": 0.01317666, "auxiliary_loss_mlp": 0.00271758, "balance_loss_clip": 1.0785625, "balance_loss_mlp": 0.2446851, "epoch": 0.6594919585149557, "flos": 17456931699840.0, "grad_norm": 50.056463031135934, "language_loss": 0.88547486, "learning_rate": 1.0977320114657058e-06, "loss": 0.90136909, "num_input_tokens_seen": 236805155, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.27075195, "step": 10969, "time_per_iteration": 2.641716718673706 }, { "auxiliary_loss_clip": 0.01298271, "auxiliary_loss_mlp": 0.00228286, "balance_loss_clip": 1.06389475, "balance_loss_mlp": 0.20389557, "epoch": 0.6595520817676236, "flos": 18223229473920.0, "grad_norm": 19.477118489527673, "language_loss": 0.77019823, "learning_rate": 1.0973844510716817e-06, "loss": 0.78546381, "num_input_tokens_seen": 236824360, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.24401855, "step": 10970, "time_per_iteration": 2.602297306060791 }, { "auxiliary_loss_clip": 0.01311053, "auxiliary_loss_mlp": 0.00234884, "balance_loss_clip": 1.07178116, "balance_loss_mlp": 0.21035039, "epoch": 0.6596122050202916, "flos": 22199761463040.0, "grad_norm": 62.23048812967663, "language_loss": 0.84235513, "learning_rate": 1.0970369249049308e-06, "loss": 0.85781449, "num_input_tokens_seen": 236844640, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.24523926, "step": 10971, "time_per_iteration": 2.69478440284729 }, { "auxiliary_loss_clip": 0.01332136, "auxiliary_loss_mlp": 0.00226151, "balance_loss_clip": 1.0837965, "balance_loss_mlp": 0.20052072, "epoch": 0.6596723282729595, "flos": 14173834746240.0, "grad_norm": 35.580097653748204, "language_loss": 0.82171786, "learning_rate": 1.096689432978629e-06, "loss": 0.83730072, "num_input_tokens_seen": 236861160, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.25646973, "step": 10972, "time_per_iteration": 2.5835957527160645 }, { "auxiliary_loss_clip": 0.01305104, "auxiliary_loss_mlp": 0.00245696, "balance_loss_clip": 1.06649804, "balance_loss_mlp": 0.21851644, "epoch": 0.6597324515256275, "flos": 30553193410560.0, "grad_norm": 6.1068476374105645, "language_loss": 0.66510361, "learning_rate": 1.0963419753059556e-06, "loss": 0.68061161, "num_input_tokens_seen": 236880465, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.27160645, "step": 10973, "time_per_iteration": 2.693239450454712 }, { "auxiliary_loss_clip": 0.01339925, "auxiliary_loss_mlp": 0.00247567, "balance_loss_clip": 1.08513534, "balance_loss_mlp": 0.22113863, "epoch": 0.6597925747782956, "flos": 17639860688640.0, "grad_norm": 24.41482389402415, "language_loss": 0.87404919, "learning_rate": 1.0959945519000839e-06, "loss": 0.88992417, "num_input_tokens_seen": 236897730, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.26428223, "step": 10974, "time_per_iteration": 2.6741955280303955 }, { "auxiliary_loss_clip": 0.01333976, "auxiliary_loss_mlp": 0.00240417, "balance_loss_clip": 1.08381391, "balance_loss_mlp": 0.21390441, "epoch": 0.6598526980309635, "flos": 22819112697600.0, "grad_norm": 20.11649840461087, "language_loss": 0.81470156, "learning_rate": 1.0956471627741906e-06, "loss": 0.83044547, "num_input_tokens_seen": 236917300, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.26525879, "step": 10975, "time_per_iteration": 2.6635823249816895 }, { "auxiliary_loss_clip": 0.0132096, "auxiliary_loss_mlp": 0.00250154, "balance_loss_clip": 1.08096313, "balance_loss_mlp": 0.22416642, "epoch": 0.6599128212836315, "flos": 21068036674560.0, "grad_norm": 312.94091025780955, "language_loss": 0.81301957, "learning_rate": 1.0952998079414464e-06, "loss": 0.8287307, "num_input_tokens_seen": 236935590, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.26000977, "step": 10976, "time_per_iteration": 2.677762031555176 }, { "auxiliary_loss_clip": 0.01298929, "auxiliary_loss_mlp": 0.00234819, "balance_loss_clip": 1.06302679, "balance_loss_mlp": 0.20980918, "epoch": 0.6599729445362994, "flos": 22163527618560.0, "grad_norm": 38.35996794752393, "language_loss": 0.75009131, "learning_rate": 1.0949524874150243e-06, "loss": 0.76542878, "num_input_tokens_seen": 236952830, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.25024414, "step": 10977, "time_per_iteration": 2.652109384536743 }, { "auxiliary_loss_clip": 0.01327283, "auxiliary_loss_mlp": 0.00228849, "balance_loss_clip": 1.08054781, "balance_loss_mlp": 0.20187162, "epoch": 0.6600330677889674, "flos": 18150079426560.0, "grad_norm": 49.05650727122837, "language_loss": 0.91626191, "learning_rate": 1.0946052012080952e-06, "loss": 0.93182319, "num_input_tokens_seen": 236971930, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.26965332, "step": 10978, "time_per_iteration": 2.6904234886169434 }, { "auxiliary_loss_clip": 0.01315537, "auxiliary_loss_mlp": 0.00243523, "balance_loss_clip": 1.07053185, "balance_loss_mlp": 0.21829835, "epoch": 0.6600931910416353, "flos": 18150115340160.0, "grad_norm": 480.1024350451201, "language_loss": 0.78188384, "learning_rate": 1.0942579493338278e-06, "loss": 0.79747438, "num_input_tokens_seen": 236989920, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.25219727, "step": 10979, "time_per_iteration": 2.6528327465057373 }, { "auxiliary_loss_clip": 0.01319572, "auxiliary_loss_mlp": 0.00230828, "balance_loss_clip": 1.07364786, "balance_loss_mlp": 0.20561558, "epoch": 0.6601533142943034, "flos": 17420733768960.0, "grad_norm": 177.7227014423668, "language_loss": 0.81824052, "learning_rate": 1.0939107318053889e-06, "loss": 0.83374453, "num_input_tokens_seen": 237006570, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.25219727, "step": 10980, "time_per_iteration": 2.652216911315918 }, { "auxiliary_loss_clip": 0.01308203, "auxiliary_loss_mlp": 0.00224118, "balance_loss_clip": 1.07251525, "balance_loss_mlp": 0.19915567, "epoch": 0.6602134375469713, "flos": 28219574615040.0, "grad_norm": 5.082797096114045, "language_loss": 0.80255759, "learning_rate": 1.0935635486359459e-06, "loss": 0.81788081, "num_input_tokens_seen": 237028415, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.24987793, "step": 10981, "time_per_iteration": 2.707818031311035 }, { "auxiliary_loss_clip": 0.01303242, "auxiliary_loss_mlp": 0.00234895, "balance_loss_clip": 1.06438661, "balance_loss_mlp": 0.20946792, "epoch": 0.6602735607996393, "flos": 29418056830080.0, "grad_norm": 135.31517356213328, "language_loss": 0.77355814, "learning_rate": 1.0932163998386647e-06, "loss": 0.78893948, "num_input_tokens_seen": 237046595, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.25427246, "step": 10982, "time_per_iteration": 2.7649765014648438 }, { "auxiliary_loss_clip": 0.01311725, "auxiliary_loss_mlp": 0.00235034, "balance_loss_clip": 1.07347763, "balance_loss_mlp": 0.2104297, "epoch": 0.6603336840523072, "flos": 18588045957120.0, "grad_norm": 95.74729539028526, "language_loss": 0.76812482, "learning_rate": 1.0928692854267075e-06, "loss": 0.78359234, "num_input_tokens_seen": 237066150, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.24609375, "step": 10983, "time_per_iteration": 2.713827133178711 }, { "auxiliary_loss_clip": 0.0130095, "auxiliary_loss_mlp": 0.00192223, "balance_loss_clip": 1.06499481, "balance_loss_mlp": 0.16815445, "epoch": 0.6603938073049752, "flos": 33254860913280.0, "grad_norm": 13.977818283010057, "language_loss": 0.78977567, "learning_rate": 1.092522205413239e-06, "loss": 0.80470741, "num_input_tokens_seen": 237087060, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.2409668, "step": 10984, "time_per_iteration": 2.764824151992798 }, { "auxiliary_loss_clip": 0.01311758, "auxiliary_loss_mlp": 0.00249769, "balance_loss_clip": 1.07171023, "balance_loss_mlp": 0.22285204, "epoch": 0.6604539305576431, "flos": 17384284442880.0, "grad_norm": 4.74309316608751, "language_loss": 0.91286904, "learning_rate": 1.0921751598114193e-06, "loss": 0.92848432, "num_input_tokens_seen": 237103825, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.26928711, "step": 10985, "time_per_iteration": 2.594558000564575 }, { "auxiliary_loss_clip": 0.01317484, "auxiliary_loss_mlp": 0.00233768, "balance_loss_clip": 1.07228851, "balance_loss_mlp": 0.2066007, "epoch": 0.6605140538103111, "flos": 21251145231360.0, "grad_norm": 8.573416578956435, "language_loss": 0.8073436, "learning_rate": 1.0918281486344077e-06, "loss": 0.82285619, "num_input_tokens_seen": 237121740, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.2713623, "step": 10986, "time_per_iteration": 2.6564717292785645 }, { "auxiliary_loss_clip": 0.01305338, "auxiliary_loss_mlp": 0.00220486, "balance_loss_clip": 1.07079828, "balance_loss_mlp": 0.19566658, "epoch": 0.6605741770629792, "flos": 13881701433600.0, "grad_norm": 53.640535409635504, "language_loss": 0.87401903, "learning_rate": 1.0914811718953636e-06, "loss": 0.88927734, "num_input_tokens_seen": 237139565, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.24816895, "step": 10987, "time_per_iteration": 2.618987560272217 }, { "auxiliary_loss_clip": 0.01233583, "auxiliary_loss_mlp": 0.00083578, "balance_loss_clip": 1.073259, "balance_loss_mlp": 0.07394554, "epoch": 0.6606343003156471, "flos": 69316215171840.0, "grad_norm": 0.7906003898804458, "language_loss": 0.53605723, "learning_rate": 1.0911342296074454e-06, "loss": 0.54922885, "num_input_tokens_seen": 237201055, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.09619141, "step": 10988, "time_per_iteration": 3.2334463596343994 }, { "auxiliary_loss_clip": 0.01294592, "auxiliary_loss_mlp": 0.00203535, "balance_loss_clip": 1.06407237, "balance_loss_mlp": 0.18003868, "epoch": 0.6606944235683151, "flos": 27272394927360.0, "grad_norm": 100.37237298079094, "language_loss": 0.8505165, "learning_rate": 1.0907873217838077e-06, "loss": 0.86549777, "num_input_tokens_seen": 237221805, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.23486328, "step": 10989, "time_per_iteration": 2.717146635055542 }, { "auxiliary_loss_clip": 0.0130773, "auxiliary_loss_mlp": 0.00196435, "balance_loss_clip": 1.06903589, "balance_loss_mlp": 0.17173481, "epoch": 0.660754546820983, "flos": 13772820332160.0, "grad_norm": 4.740401414373968, "language_loss": 0.85838544, "learning_rate": 1.0904404484376064e-06, "loss": 0.87342715, "num_input_tokens_seen": 237238270, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.24682617, "step": 10990, "time_per_iteration": 2.577291250228882 }, { "auxiliary_loss_clip": 0.01314972, "auxiliary_loss_mlp": 0.0021506, "balance_loss_clip": 1.07301581, "balance_loss_mlp": 0.18989488, "epoch": 0.660814670073651, "flos": 15705209232000.0, "grad_norm": 10.799305458272546, "language_loss": 0.70793259, "learning_rate": 1.0900936095819937e-06, "loss": 0.72323292, "num_input_tokens_seen": 237255400, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.25195312, "step": 10991, "time_per_iteration": 2.6064372062683105 }, { "auxiliary_loss_clip": 0.01322659, "auxiliary_loss_mlp": 0.00206373, "balance_loss_clip": 1.0761199, "balance_loss_mlp": 0.17866868, "epoch": 0.6608747933263189, "flos": 20850023076480.0, "grad_norm": 80.47784600718673, "language_loss": 0.78198504, "learning_rate": 1.0897468052301234e-06, "loss": 0.7972753, "num_input_tokens_seen": 237273105, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.27685547, "step": 10992, "time_per_iteration": 4.077103137969971 }, { "auxiliary_loss_clip": 0.01323213, "auxiliary_loss_mlp": 0.00210364, "balance_loss_clip": 1.07228875, "balance_loss_mlp": 0.1823616, "epoch": 0.660934916578987, "flos": 20632117219200.0, "grad_norm": 9.50510170739613, "language_loss": 0.95711279, "learning_rate": 1.0894000353951444e-06, "loss": 0.97244859, "num_input_tokens_seen": 237292650, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.2800293, "step": 10993, "time_per_iteration": 4.027516841888428 }, { "auxiliary_loss_clip": 0.01346513, "auxiliary_loss_mlp": 0.00200172, "balance_loss_clip": 1.0881567, "balance_loss_mlp": 0.1711323, "epoch": 0.6609950398316549, "flos": 25113588647040.0, "grad_norm": 17.918226289318532, "language_loss": 0.73076451, "learning_rate": 1.0890533000902078e-06, "loss": 0.74623138, "num_input_tokens_seen": 237312865, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.29052734, "step": 10994, "time_per_iteration": 2.687933921813965 }, { "auxiliary_loss_clip": 0.01307271, "auxiliary_loss_mlp": 0.00217734, "balance_loss_clip": 1.06743073, "balance_loss_mlp": 0.19145995, "epoch": 0.6610551630843229, "flos": 18661196004480.0, "grad_norm": 5.263128290323231, "language_loss": 0.86475456, "learning_rate": 1.0887065993284626e-06, "loss": 0.88000458, "num_input_tokens_seen": 237331210, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.26306152, "step": 10995, "time_per_iteration": 2.616635322570801 }, { "auxiliary_loss_clip": 0.01310948, "auxiliary_loss_mlp": 0.00203989, "balance_loss_clip": 1.0694983, "balance_loss_mlp": 0.17903873, "epoch": 0.6611152863369908, "flos": 23258192549760.0, "grad_norm": 1417.8763854982528, "language_loss": 0.81085432, "learning_rate": 1.088359933123053e-06, "loss": 0.82600367, "num_input_tokens_seen": 237349455, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.24951172, "step": 10996, "time_per_iteration": 2.665783166885376 }, { "auxiliary_loss_clip": 0.01317735, "auxiliary_loss_mlp": 0.00206645, "balance_loss_clip": 1.07689905, "balance_loss_mlp": 0.18124199, "epoch": 0.6611754095896588, "flos": 22159720776960.0, "grad_norm": 4.19632665026702, "language_loss": 0.7679745, "learning_rate": 1.088013301487126e-06, "loss": 0.78321832, "num_input_tokens_seen": 237367100, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.25378418, "step": 10997, "time_per_iteration": 4.0714099407196045 }, { "auxiliary_loss_clip": 0.01320009, "auxiliary_loss_mlp": 0.00201079, "balance_loss_clip": 1.07257915, "balance_loss_mlp": 0.17419702, "epoch": 0.6612355328423267, "flos": 13991228979840.0, "grad_norm": 31.162974855774134, "language_loss": 0.79015601, "learning_rate": 1.0876667044338269e-06, "loss": 0.80536687, "num_input_tokens_seen": 237384840, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.26879883, "step": 10998, "time_per_iteration": 2.596104145050049 }, { "auxiliary_loss_clip": 0.01215036, "auxiliary_loss_mlp": 0.00121651, "balance_loss_clip": 1.05878472, "balance_loss_mlp": 0.11220992, "epoch": 0.6612956560949947, "flos": 61453716359040.0, "grad_norm": 0.6459925793216112, "language_loss": 0.50474203, "learning_rate": 1.087320141976297e-06, "loss": 0.51810884, "num_input_tokens_seen": 237443355, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.09423828, "step": 10999, "time_per_iteration": 3.1296231746673584 }, { "auxiliary_loss_clip": 0.01311147, "auxiliary_loss_mlp": 0.00217273, "balance_loss_clip": 1.06902969, "balance_loss_mlp": 0.19283488, "epoch": 0.6613557793476627, "flos": 21616644072960.0, "grad_norm": 57.00656657509133, "language_loss": 0.7752493, "learning_rate": 1.086973614127679e-06, "loss": 0.79053348, "num_input_tokens_seen": 237459205, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.2442627, "step": 11000, "time_per_iteration": 2.6392862796783447 }, { "auxiliary_loss_clip": 0.01298982, "auxiliary_loss_mlp": 0.00202406, "balance_loss_clip": 1.0637877, "balance_loss_mlp": 0.17778933, "epoch": 0.6614159026003307, "flos": 34020117192960.0, "grad_norm": 1.7262664204774734, "language_loss": 0.71182984, "learning_rate": 1.0866271209011133e-06, "loss": 0.72684371, "num_input_tokens_seen": 237483580, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.24621582, "step": 11001, "time_per_iteration": 4.1706671714782715 }, { "auxiliary_loss_clip": 0.01309917, "auxiliary_loss_mlp": 0.00190059, "balance_loss_clip": 1.0708313, "balance_loss_mlp": 0.16348717, "epoch": 0.6614760258529987, "flos": 24097281235200.0, "grad_norm": 136.72045858319976, "language_loss": 0.79074585, "learning_rate": 1.086280662309739e-06, "loss": 0.8057456, "num_input_tokens_seen": 237502860, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.26538086, "step": 11002, "time_per_iteration": 2.6834259033203125 }, { "auxiliary_loss_clip": 0.01304313, "auxiliary_loss_mlp": 0.00206421, "balance_loss_clip": 1.06887889, "balance_loss_mlp": 0.18213849, "epoch": 0.6615361491056666, "flos": 14903790935040.0, "grad_norm": 6.95275925639977, "language_loss": 0.86796963, "learning_rate": 1.0859342383666928e-06, "loss": 0.88307703, "num_input_tokens_seen": 237521030, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.24304199, "step": 11003, "time_per_iteration": 2.6476757526397705 }, { "auxiliary_loss_clip": 0.01340398, "auxiliary_loss_mlp": 0.00217356, "balance_loss_clip": 1.08949184, "balance_loss_mlp": 0.18915114, "epoch": 0.6615962723583346, "flos": 15304877176320.0, "grad_norm": 26.212404148920797, "language_loss": 0.80156469, "learning_rate": 1.0855878490851119e-06, "loss": 0.81714225, "num_input_tokens_seen": 237539585, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.28198242, "step": 11004, "time_per_iteration": 2.6818130016326904 }, { "auxiliary_loss_clip": 0.01326474, "auxiliary_loss_mlp": 0.00220258, "balance_loss_clip": 1.0752387, "balance_loss_mlp": 0.1930311, "epoch": 0.6616563956110025, "flos": 18732586285440.0, "grad_norm": 69.88220206356237, "language_loss": 0.80191159, "learning_rate": 1.085241494478132e-06, "loss": 0.81737888, "num_input_tokens_seen": 237557655, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.2722168, "step": 11005, "time_per_iteration": 2.6136467456817627 }, { "auxiliary_loss_clip": 0.01304904, "auxiliary_loss_mlp": 0.00205018, "balance_loss_clip": 1.06656539, "balance_loss_mlp": 0.18114041, "epoch": 0.6617165188636706, "flos": 24495063425280.0, "grad_norm": 3.3884146505108594, "language_loss": 0.84034801, "learning_rate": 1.0848951745588855e-06, "loss": 0.85544729, "num_input_tokens_seen": 237577000, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.2388916, "step": 11006, "time_per_iteration": 2.713775873184204 }, { "auxiliary_loss_clip": 0.01314688, "auxiliary_loss_mlp": 0.00201662, "balance_loss_clip": 1.06915593, "balance_loss_mlp": 0.17559059, "epoch": 0.6617766421163385, "flos": 22379673709440.0, "grad_norm": 16.716259497089375, "language_loss": 0.83501899, "learning_rate": 1.0845488893405068e-06, "loss": 0.85018253, "num_input_tokens_seen": 237597960, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.26098633, "step": 11007, "time_per_iteration": 2.6609649658203125 }, { "auxiliary_loss_clip": 0.01332409, "auxiliary_loss_mlp": 0.00209655, "balance_loss_clip": 1.08098364, "balance_loss_mlp": 0.18402538, "epoch": 0.6618367653690065, "flos": 20850418126080.0, "grad_norm": 8.099382482980749, "language_loss": 0.86976707, "learning_rate": 1.0842026388361248e-06, "loss": 0.88518775, "num_input_tokens_seen": 237616385, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.2565918, "step": 11008, "time_per_iteration": 2.6517550945281982 }, { "auxiliary_loss_clip": 0.0132632, "auxiliary_loss_mlp": 0.00218429, "balance_loss_clip": 1.07614291, "balance_loss_mlp": 0.19109425, "epoch": 0.6618968886216744, "flos": 17712328377600.0, "grad_norm": 54.17045749775656, "language_loss": 0.90103018, "learning_rate": 1.0838564230588715e-06, "loss": 0.91647762, "num_input_tokens_seen": 237634930, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.2734375, "step": 11009, "time_per_iteration": 2.6340255737304688 }, { "auxiliary_loss_clip": 0.0119621, "auxiliary_loss_mlp": 0.00090149, "balance_loss_clip": 1.03999138, "balance_loss_mlp": 0.08128002, "epoch": 0.6619570118743424, "flos": 67035347498880.0, "grad_norm": 0.9775835445113337, "language_loss": 0.67004734, "learning_rate": 1.0835102420218735e-06, "loss": 0.68291098, "num_input_tokens_seen": 237693175, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.08886719, "step": 11010, "time_per_iteration": 3.117248773574829 }, { "auxiliary_loss_clip": 0.0132341, "auxiliary_loss_mlp": 0.00213666, "balance_loss_clip": 1.07631004, "balance_loss_mlp": 0.18603356, "epoch": 0.6620171351270103, "flos": 18660908695680.0, "grad_norm": 12.8246900283568, "language_loss": 0.80893147, "learning_rate": 1.0831640957382593e-06, "loss": 0.82430226, "num_input_tokens_seen": 237713160, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.27648926, "step": 11011, "time_per_iteration": 2.6529664993286133 }, { "auxiliary_loss_clip": 0.01311887, "auxiliary_loss_mlp": 0.00219961, "balance_loss_clip": 1.06801009, "balance_loss_mlp": 0.19390163, "epoch": 0.6620772583796783, "flos": 24170503109760.0, "grad_norm": 11.673914333388911, "language_loss": 0.7977379, "learning_rate": 1.0828179842211557e-06, "loss": 0.81305635, "num_input_tokens_seen": 237733600, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.26062012, "step": 11012, "time_per_iteration": 2.6502583026885986 }, { "auxiliary_loss_clip": 0.01291868, "auxiliary_loss_mlp": 0.00197872, "balance_loss_clip": 1.06116235, "balance_loss_mlp": 0.17602131, "epoch": 0.6621373816323463, "flos": 23623547736960.0, "grad_norm": 3.410284685492743, "language_loss": 0.86377776, "learning_rate": 1.0824719074836845e-06, "loss": 0.8786751, "num_input_tokens_seen": 237752135, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.21862793, "step": 11013, "time_per_iteration": 2.6754016876220703 }, { "auxiliary_loss_clip": 0.01322366, "auxiliary_loss_mlp": 0.00215225, "balance_loss_clip": 1.08142602, "balance_loss_mlp": 0.18972582, "epoch": 0.6621975048850143, "flos": 18442212739200.0, "grad_norm": 171.59903115980063, "language_loss": 0.79553342, "learning_rate": 1.082125865538971e-06, "loss": 0.81090933, "num_input_tokens_seen": 237770735, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.25488281, "step": 11014, "time_per_iteration": 2.5936050415039062 }, { "auxiliary_loss_clip": 0.01309146, "auxiliary_loss_mlp": 0.00214715, "balance_loss_clip": 1.0709269, "balance_loss_mlp": 0.18941867, "epoch": 0.6622576281376823, "flos": 14063876236800.0, "grad_norm": 4010.0250004299874, "language_loss": 0.85255289, "learning_rate": 1.081779858400137e-06, "loss": 0.86779153, "num_input_tokens_seen": 237789005, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.25317383, "step": 11015, "time_per_iteration": 2.632601261138916 }, { "auxiliary_loss_clip": 0.01327547, "auxiliary_loss_mlp": 0.0020398, "balance_loss_clip": 1.0788213, "balance_loss_mlp": 0.17737275, "epoch": 0.6623177513903502, "flos": 17018965169280.0, "grad_norm": 13.144387668284084, "language_loss": 0.90288651, "learning_rate": 1.0814338860803021e-06, "loss": 0.9182018, "num_input_tokens_seen": 237807740, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.26611328, "step": 11016, "time_per_iteration": 2.6051058769226074 }, { "auxiliary_loss_clip": 0.01340209, "auxiliary_loss_mlp": 0.00217527, "balance_loss_clip": 1.0878818, "balance_loss_mlp": 0.18895274, "epoch": 0.6623778746430182, "flos": 17271021882240.0, "grad_norm": 34.517595497635405, "language_loss": 0.83487284, "learning_rate": 1.0810879485925864e-06, "loss": 0.85045016, "num_input_tokens_seen": 237826340, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.28540039, "step": 11017, "time_per_iteration": 2.643122434616089 }, { "auxiliary_loss_clip": 0.01329295, "auxiliary_loss_mlp": 0.00217423, "balance_loss_clip": 1.08531141, "balance_loss_mlp": 0.19126859, "epoch": 0.6624379978956861, "flos": 48792688767360.0, "grad_norm": 4.953408435792612, "language_loss": 0.84034628, "learning_rate": 1.0807420459501084e-06, "loss": 0.8558135, "num_input_tokens_seen": 237848305, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.26184082, "step": 11018, "time_per_iteration": 2.936920404434204 }, { "auxiliary_loss_clip": 0.01305146, "auxiliary_loss_mlp": 0.00206217, "balance_loss_clip": 1.0669893, "balance_loss_mlp": 0.18195805, "epoch": 0.6624981211483542, "flos": 18952431477120.0, "grad_norm": 59.63110331423388, "language_loss": 0.90634918, "learning_rate": 1.0803961781659841e-06, "loss": 0.92146277, "num_input_tokens_seen": 237867020, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.24255371, "step": 11019, "time_per_iteration": 2.680730104446411 }, { "auxiliary_loss_clip": 0.01304717, "auxiliary_loss_mlp": 0.00231849, "balance_loss_clip": 1.07030869, "balance_loss_mlp": 0.20605251, "epoch": 0.6625582444010221, "flos": 23256576437760.0, "grad_norm": 3.345653983912836, "language_loss": 0.77589941, "learning_rate": 1.080050345253328e-06, "loss": 0.79126513, "num_input_tokens_seen": 237886710, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.25805664, "step": 11020, "time_per_iteration": 2.687253713607788 }, { "auxiliary_loss_clip": 0.01342455, "auxiliary_loss_mlp": 0.00224092, "balance_loss_clip": 1.0912137, "balance_loss_mlp": 0.19582756, "epoch": 0.6626183676536901, "flos": 21394823633280.0, "grad_norm": 2.279079471693407, "language_loss": 0.79273897, "learning_rate": 1.0797045472252554e-06, "loss": 0.80840445, "num_input_tokens_seen": 237904795, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.28271484, "step": 11021, "time_per_iteration": 2.7399682998657227 }, { "auxiliary_loss_clip": 0.01333922, "auxiliary_loss_mlp": 0.00227853, "balance_loss_clip": 1.09148836, "balance_loss_mlp": 0.19938551, "epoch": 0.662678490906358, "flos": 14571293713920.0, "grad_norm": 64.35818947220129, "language_loss": 0.90711522, "learning_rate": 1.0793587840948793e-06, "loss": 0.92273301, "num_input_tokens_seen": 237921320, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.2845459, "step": 11022, "time_per_iteration": 2.736555576324463 }, { "auxiliary_loss_clip": 0.01363694, "auxiliary_loss_mlp": 0.00228242, "balance_loss_clip": 1.09674442, "balance_loss_mlp": 0.19715185, "epoch": 0.662738614159026, "flos": 15992350554240.0, "grad_norm": 9.266504744567998, "language_loss": 0.88706428, "learning_rate": 1.0790130558753099e-06, "loss": 0.90298361, "num_input_tokens_seen": 237933525, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.31103516, "step": 11023, "time_per_iteration": 2.6191444396972656 }, { "auxiliary_loss_clip": 0.01330342, "auxiliary_loss_mlp": 0.00224946, "balance_loss_clip": 1.09027624, "balance_loss_mlp": 0.19810009, "epoch": 0.6627987374116939, "flos": 19536338966400.0, "grad_norm": 4.839216360389303, "language_loss": 0.81343567, "learning_rate": 1.0786673625796574e-06, "loss": 0.82898855, "num_input_tokens_seen": 237953395, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.2689209, "step": 11024, "time_per_iteration": 2.6421637535095215 }, { "auxiliary_loss_clip": 0.01360013, "auxiliary_loss_mlp": 0.00234272, "balance_loss_clip": 1.1035502, "balance_loss_mlp": 0.20438653, "epoch": 0.662858860664362, "flos": 15702838934400.0, "grad_norm": 142.88119463158276, "language_loss": 0.81744641, "learning_rate": 1.0783217042210306e-06, "loss": 0.83338928, "num_input_tokens_seen": 237971445, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.29870605, "step": 11025, "time_per_iteration": 2.7105233669281006 }, { "auxiliary_loss_clip": 0.01362168, "auxiliary_loss_mlp": 0.00220634, "balance_loss_clip": 1.10154486, "balance_loss_mlp": 0.18994993, "epoch": 0.6629189839170299, "flos": 20154289570560.0, "grad_norm": 4000.398566753828, "language_loss": 0.86473113, "learning_rate": 1.0779760808125379e-06, "loss": 0.88055921, "num_input_tokens_seen": 237989965, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.30688477, "step": 11026, "time_per_iteration": 2.6520049571990967 }, { "auxiliary_loss_clip": 0.01324224, "auxiliary_loss_mlp": 0.00215253, "balance_loss_clip": 1.07702327, "balance_loss_mlp": 0.18700041, "epoch": 0.6629791071696979, "flos": 20915415786240.0, "grad_norm": 5.1897221259831285, "language_loss": 0.83979452, "learning_rate": 1.0776304923672842e-06, "loss": 0.85518932, "num_input_tokens_seen": 238006820, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.28259277, "step": 11027, "time_per_iteration": 2.6647684574127197 }, { "auxiliary_loss_clip": 0.01328454, "auxiliary_loss_mlp": 0.00224332, "balance_loss_clip": 1.08557153, "balance_loss_mlp": 0.19586453, "epoch": 0.6630392304223659, "flos": 20846898593280.0, "grad_norm": 6.02540474170768, "language_loss": 0.81762862, "learning_rate": 1.0772849388983742e-06, "loss": 0.83315647, "num_input_tokens_seen": 238022560, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.28503418, "step": 11028, "time_per_iteration": 2.6518003940582275 }, { "auxiliary_loss_clip": 0.01325464, "auxiliary_loss_mlp": 0.00207194, "balance_loss_clip": 1.0827527, "balance_loss_mlp": 0.18000214, "epoch": 0.6630993536750338, "flos": 20995820380800.0, "grad_norm": 431.64723499318114, "language_loss": 0.8744908, "learning_rate": 1.0769394204189138e-06, "loss": 0.88981742, "num_input_tokens_seen": 238041895, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.2722168, "step": 11029, "time_per_iteration": 2.6660776138305664 }, { "auxiliary_loss_clip": 0.0132888, "auxiliary_loss_mlp": 0.00212975, "balance_loss_clip": 1.08609676, "balance_loss_mlp": 0.18606925, "epoch": 0.6631594769277018, "flos": 18259032355200.0, "grad_norm": 7.858498183882519, "language_loss": 0.87515771, "learning_rate": 1.0765939369420012e-06, "loss": 0.89057624, "num_input_tokens_seen": 238060445, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.26904297, "step": 11030, "time_per_iteration": 2.644345760345459 }, { "auxiliary_loss_clip": 0.01363678, "auxiliary_loss_mlp": 0.0022296, "balance_loss_clip": 1.1091181, "balance_loss_mlp": 0.19518416, "epoch": 0.6632196001803697, "flos": 17820491207040.0, "grad_norm": 267.651267033203, "language_loss": 0.87287939, "learning_rate": 1.0762484884807391e-06, "loss": 0.88874578, "num_input_tokens_seen": 238077080, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.27783203, "step": 11031, "time_per_iteration": 2.6109282970428467 }, { "auxiliary_loss_clip": 0.0134285, "auxiliary_loss_mlp": 0.00218074, "balance_loss_clip": 1.09337044, "balance_loss_mlp": 0.19071537, "epoch": 0.6632797234330378, "flos": 12670182581760.0, "grad_norm": 6.44422893466026, "language_loss": 0.8637504, "learning_rate": 1.075903075048228e-06, "loss": 0.8793596, "num_input_tokens_seen": 238091045, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.27380371, "step": 11032, "time_per_iteration": 2.6281681060791016 }, { "auxiliary_loss_clip": 0.01315055, "auxiliary_loss_mlp": 0.00194812, "balance_loss_clip": 1.07463562, "balance_loss_mlp": 0.16840698, "epoch": 0.6633398466857057, "flos": 23584728113280.0, "grad_norm": 6.979344023250589, "language_loss": 0.88499534, "learning_rate": 1.0755576966575635e-06, "loss": 0.90009403, "num_input_tokens_seen": 238110220, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.26403809, "step": 11033, "time_per_iteration": 2.7141945362091064 }, { "auxiliary_loss_clip": 0.01353551, "auxiliary_loss_mlp": 0.00236076, "balance_loss_clip": 1.09809518, "balance_loss_mlp": 0.20559445, "epoch": 0.6633999699383737, "flos": 20631686256000.0, "grad_norm": 5.088107633825604, "language_loss": 0.89340174, "learning_rate": 1.0752123533218451e-06, "loss": 0.909298, "num_input_tokens_seen": 238130400, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.30480957, "step": 11034, "time_per_iteration": 4.079174995422363 }, { "auxiliary_loss_clip": 0.0130167, "auxiliary_loss_mlp": 0.00204928, "balance_loss_clip": 1.07167554, "balance_loss_mlp": 0.18116991, "epoch": 0.6634600931910416, "flos": 21797095023360.0, "grad_norm": 13.385411810843099, "language_loss": 0.82952487, "learning_rate": 1.074867045054166e-06, "loss": 0.84459078, "num_input_tokens_seen": 238148165, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.23754883, "step": 11035, "time_per_iteration": 4.090306520462036 }, { "auxiliary_loss_clip": 0.01345896, "auxiliary_loss_mlp": 0.00200781, "balance_loss_clip": 1.09283519, "balance_loss_mlp": 0.17193222, "epoch": 0.6635202164437096, "flos": 18732873594240.0, "grad_norm": 2.7276029236074284, "language_loss": 0.91539776, "learning_rate": 1.074521771867622e-06, "loss": 0.93086451, "num_input_tokens_seen": 238166360, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.28857422, "step": 11036, "time_per_iteration": 2.657353639602661 }, { "auxiliary_loss_clip": 0.01270705, "auxiliary_loss_mlp": 0.00077486, "balance_loss_clip": 1.11072636, "balance_loss_mlp": 0.06847345, "epoch": 0.6635803396963775, "flos": 60222771227520.0, "grad_norm": 0.7435380376249618, "language_loss": 0.51429021, "learning_rate": 1.0741765337753044e-06, "loss": 0.52777207, "num_input_tokens_seen": 238227630, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.09033203, "step": 11037, "time_per_iteration": 3.1461713314056396 }, { "auxiliary_loss_clip": 0.01319014, "auxiliary_loss_mlp": 0.00195681, "balance_loss_clip": 1.07755542, "balance_loss_mlp": 0.16728488, "epoch": 0.6636404629490456, "flos": 29167041611520.0, "grad_norm": 167.3148044735357, "language_loss": 0.86027896, "learning_rate": 1.0738313307903052e-06, "loss": 0.87542588, "num_input_tokens_seen": 238248435, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.28393555, "step": 11038, "time_per_iteration": 2.7862954139709473 }, { "auxiliary_loss_clip": 0.01328164, "auxiliary_loss_mlp": 0.00249127, "balance_loss_clip": 1.0874753, "balance_loss_mlp": 0.22102985, "epoch": 0.6637005862017135, "flos": 38907702766080.0, "grad_norm": 64.67917318920603, "language_loss": 0.74512303, "learning_rate": 1.073486162925716e-06, "loss": 0.76089597, "num_input_tokens_seen": 238268755, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.28088379, "step": 11039, "time_per_iteration": 4.272482872009277 }, { "auxiliary_loss_clip": 0.01343551, "auxiliary_loss_mlp": 0.00229736, "balance_loss_clip": 1.09476781, "balance_loss_mlp": 0.20088738, "epoch": 0.6637607094543815, "flos": 22783345729920.0, "grad_norm": 56.57298763913959, "language_loss": 0.72021407, "learning_rate": 1.0731410301946237e-06, "loss": 0.73594701, "num_input_tokens_seen": 238290120, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.28857422, "step": 11040, "time_per_iteration": 2.727445125579834 }, { "auxiliary_loss_clip": 0.01323888, "auxiliary_loss_mlp": 0.00200063, "balance_loss_clip": 1.08265293, "balance_loss_mlp": 0.1740633, "epoch": 0.6638208327070495, "flos": 18114096977280.0, "grad_norm": 12.905604380074468, "language_loss": 0.81175852, "learning_rate": 1.0727959326101161e-06, "loss": 0.826998, "num_input_tokens_seen": 238309290, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.2598877, "step": 11041, "time_per_iteration": 2.7263808250427246 }, { "auxiliary_loss_clip": 0.01333424, "auxiliary_loss_mlp": 0.00225087, "balance_loss_clip": 1.0920434, "balance_loss_mlp": 0.19734702, "epoch": 0.6638809559597174, "flos": 29424880414080.0, "grad_norm": 21.489010189778124, "language_loss": 0.70112062, "learning_rate": 1.0724508701852806e-06, "loss": 0.71670568, "num_input_tokens_seen": 238327280, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.27758789, "step": 11042, "time_per_iteration": 2.741267442703247 }, { "auxiliary_loss_clip": 0.01343871, "auxiliary_loss_mlp": 0.00219225, "balance_loss_clip": 1.09252191, "balance_loss_mlp": 0.18854007, "epoch": 0.6639410792123854, "flos": 28072699902720.0, "grad_norm": 10.173305716744704, "language_loss": 0.78910041, "learning_rate": 1.0721058429331998e-06, "loss": 0.80473131, "num_input_tokens_seen": 238346330, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.30725098, "step": 11043, "time_per_iteration": 4.111525535583496 }, { "auxiliary_loss_clip": 0.01309862, "auxiliary_loss_mlp": 0.0021331, "balance_loss_clip": 1.07616389, "balance_loss_mlp": 0.18859768, "epoch": 0.6640012024650533, "flos": 25556367600000.0, "grad_norm": 3.4964327291312283, "language_loss": 0.88713145, "learning_rate": 1.0717608508669587e-06, "loss": 0.90236318, "num_input_tokens_seen": 238364650, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.24731445, "step": 11044, "time_per_iteration": 2.7383153438568115 }, { "auxiliary_loss_clip": 0.01353126, "auxiliary_loss_mlp": 0.00222285, "balance_loss_clip": 1.10284746, "balance_loss_mlp": 0.19336483, "epoch": 0.6640613257177214, "flos": 14866946559360.0, "grad_norm": 3.818935525689628, "language_loss": 0.80193698, "learning_rate": 1.0714158939996392e-06, "loss": 0.81769109, "num_input_tokens_seen": 238381630, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.28930664, "step": 11045, "time_per_iteration": 2.677267551422119 }, { "auxiliary_loss_clip": 0.01338662, "auxiliary_loss_mlp": 0.00220162, "balance_loss_clip": 1.09320331, "balance_loss_mlp": 0.19206476, "epoch": 0.6641214489703893, "flos": 23221096778880.0, "grad_norm": 4.964796694512858, "language_loss": 0.72012931, "learning_rate": 1.0710709723443235e-06, "loss": 0.73571754, "num_input_tokens_seen": 238402595, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.28063965, "step": 11046, "time_per_iteration": 2.6837244033813477 }, { "auxiliary_loss_clip": 0.01317976, "auxiliary_loss_mlp": 0.00209184, "balance_loss_clip": 1.07988834, "balance_loss_mlp": 0.18378031, "epoch": 0.6641815722230573, "flos": 37742617221120.0, "grad_norm": 14.55161845947497, "language_loss": 0.77583802, "learning_rate": 1.070726085914088e-06, "loss": 0.79110956, "num_input_tokens_seen": 238426860, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.25366211, "step": 11047, "time_per_iteration": 2.764198064804077 }, { "auxiliary_loss_clip": 0.01346259, "auxiliary_loss_mlp": 0.00224767, "balance_loss_clip": 1.10057473, "balance_loss_mlp": 0.1959663, "epoch": 0.6642416954757252, "flos": 17931132074880.0, "grad_norm": 12.91122996596213, "language_loss": 0.83730817, "learning_rate": 1.0703812347220126e-06, "loss": 0.8530184, "num_input_tokens_seen": 238443990, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.28808594, "step": 11048, "time_per_iteration": 2.6463725566864014 }, { "auxiliary_loss_clip": 0.01278829, "auxiliary_loss_mlp": 0.00113365, "balance_loss_clip": 1.11699009, "balance_loss_mlp": 0.10525852, "epoch": 0.6643018187283932, "flos": 51995384104320.0, "grad_norm": 0.7220635702194952, "language_loss": 0.54553211, "learning_rate": 1.0700364187811745e-06, "loss": 0.55945402, "num_input_tokens_seen": 238503045, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.08105469, "step": 11049, "time_per_iteration": 3.155355930328369 }, { "auxiliary_loss_clip": 0.013384, "auxiliary_loss_mlp": 0.00214381, "balance_loss_clip": 1.09545112, "balance_loss_mlp": 0.18649827, "epoch": 0.6643619419810611, "flos": 30226657847040.0, "grad_norm": 4.576893290811261, "language_loss": 0.71977448, "learning_rate": 1.069691638104648e-06, "loss": 0.73530233, "num_input_tokens_seen": 238527320, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.27893066, "step": 11050, "time_per_iteration": 2.7248833179473877 }, { "auxiliary_loss_clip": 0.0132625, "auxiliary_loss_mlp": 0.00217681, "balance_loss_clip": 1.08579183, "balance_loss_mlp": 0.18965478, "epoch": 0.6644220652337292, "flos": 22966131064320.0, "grad_norm": 16.64397957206263, "language_loss": 0.89886206, "learning_rate": 1.0693468927055085e-06, "loss": 0.9143014, "num_input_tokens_seen": 238546030, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.28027344, "step": 11051, "time_per_iteration": 2.6803629398345947 }, { "auxiliary_loss_clip": 0.01342429, "auxiliary_loss_mlp": 0.00226117, "balance_loss_clip": 1.09663796, "balance_loss_mlp": 0.19655341, "epoch": 0.6644821884863971, "flos": 21142228216320.0, "grad_norm": 16.690932316827727, "language_loss": 0.92355686, "learning_rate": 1.0690021825968276e-06, "loss": 0.93924236, "num_input_tokens_seen": 238564175, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.29541016, "step": 11052, "time_per_iteration": 2.660831928253174 }, { "auxiliary_loss_clip": 0.01347572, "auxiliary_loss_mlp": 0.00225497, "balance_loss_clip": 1.0945102, "balance_loss_mlp": 0.19705349, "epoch": 0.6645423117390651, "flos": 20192821885440.0, "grad_norm": 39.628675196494356, "language_loss": 0.84187144, "learning_rate": 1.0686575077916776e-06, "loss": 0.85760218, "num_input_tokens_seen": 238581010, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.28491211, "step": 11053, "time_per_iteration": 2.690481424331665 }, { "auxiliary_loss_clip": 0.01309955, "auxiliary_loss_mlp": 0.00227512, "balance_loss_clip": 1.07614756, "balance_loss_mlp": 0.20276463, "epoch": 0.6646024349917331, "flos": 24351959640960.0, "grad_norm": 2.016946055326757, "language_loss": 0.86517549, "learning_rate": 1.0683128683031278e-06, "loss": 0.88055015, "num_input_tokens_seen": 238601365, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.24768066, "step": 11054, "time_per_iteration": 2.677443265914917 }, { "auxiliary_loss_clip": 0.01307472, "auxiliary_loss_mlp": 0.002151, "balance_loss_clip": 1.077461, "balance_loss_mlp": 0.18746775, "epoch": 0.664662558244401, "flos": 18806706000000.0, "grad_norm": 2.673033141844444, "language_loss": 0.80205917, "learning_rate": 1.0679682641442472e-06, "loss": 0.81728482, "num_input_tokens_seen": 238619850, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.27636719, "step": 11055, "time_per_iteration": 2.648564338684082 }, { "auxiliary_loss_clip": 0.01349794, "auxiliary_loss_mlp": 0.00221601, "balance_loss_clip": 1.10213411, "balance_loss_mlp": 0.19176316, "epoch": 0.664722681497069, "flos": 18952790613120.0, "grad_norm": 9.98820160521406, "language_loss": 0.84486455, "learning_rate": 1.0676236953281042e-06, "loss": 0.86057854, "num_input_tokens_seen": 238637635, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.29870605, "step": 11056, "time_per_iteration": 2.6633214950561523 }, { "auxiliary_loss_clip": 0.01326039, "auxiliary_loss_mlp": 0.00236661, "balance_loss_clip": 1.08595324, "balance_loss_mlp": 0.206954, "epoch": 0.6647828047497369, "flos": 19571279921280.0, "grad_norm": 8.190865522167034, "language_loss": 0.79716694, "learning_rate": 1.0672791618677641e-06, "loss": 0.81279397, "num_input_tokens_seen": 238656200, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.29711914, "step": 11057, "time_per_iteration": 2.730844736099243 }, { "auxiliary_loss_clip": 0.01342792, "auxiliary_loss_mlp": 0.00200609, "balance_loss_clip": 1.09464884, "balance_loss_mlp": 0.17294037, "epoch": 0.664842928002405, "flos": 23149455102720.0, "grad_norm": 9.75676712505621, "language_loss": 0.88838124, "learning_rate": 1.066934663776291e-06, "loss": 0.90381521, "num_input_tokens_seen": 238675005, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.27685547, "step": 11058, "time_per_iteration": 2.6712958812713623 }, { "auxiliary_loss_clip": 0.01260405, "auxiliary_loss_mlp": 0.00080382, "balance_loss_clip": 1.10241556, "balance_loss_mlp": 0.07294323, "epoch": 0.6649030512550729, "flos": 65244913148160.0, "grad_norm": 0.7529408138088636, "language_loss": 0.62004191, "learning_rate": 1.0665902010667496e-06, "loss": 0.63344979, "num_input_tokens_seen": 238731425, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.07421875, "step": 11059, "time_per_iteration": 3.0669233798980713 }, { "auxiliary_loss_clip": 0.01301186, "auxiliary_loss_mlp": 0.00221929, "balance_loss_clip": 1.07162642, "balance_loss_mlp": 0.1962629, "epoch": 0.6649631745077409, "flos": 20194797133440.0, "grad_norm": 7.5310791373914885, "language_loss": 0.84936416, "learning_rate": 1.0662457737522008e-06, "loss": 0.86459535, "num_input_tokens_seen": 238752020, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.25634766, "step": 11060, "time_per_iteration": 2.636038064956665 }, { "auxiliary_loss_clip": 0.0131728, "auxiliary_loss_mlp": 0.0021398, "balance_loss_clip": 1.07885671, "balance_loss_mlp": 0.18709852, "epoch": 0.6650232977604088, "flos": 17238558965760.0, "grad_norm": 4.739421964250357, "language_loss": 0.86255348, "learning_rate": 1.0659013818457055e-06, "loss": 0.87786609, "num_input_tokens_seen": 238769665, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.26855469, "step": 11061, "time_per_iteration": 2.6438612937927246 }, { "auxiliary_loss_clip": 0.01320037, "auxiliary_loss_mlp": 0.00208231, "balance_loss_clip": 1.08242416, "balance_loss_mlp": 0.18009731, "epoch": 0.6650834210130768, "flos": 10006867825920.0, "grad_norm": 16.70752121773909, "language_loss": 0.65535951, "learning_rate": 1.0655570253603243e-06, "loss": 0.67064214, "num_input_tokens_seen": 238782180, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.28125, "step": 11062, "time_per_iteration": 2.6328303813934326 }, { "auxiliary_loss_clip": 0.01314815, "auxiliary_loss_mlp": 0.00215248, "balance_loss_clip": 1.07255006, "balance_loss_mlp": 0.18717435, "epoch": 0.6651435442657447, "flos": 10452088903680.0, "grad_norm": 7.549475212894057, "language_loss": 0.86532158, "learning_rate": 1.0652127043091144e-06, "loss": 0.88062215, "num_input_tokens_seen": 238800315, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.28088379, "step": 11063, "time_per_iteration": 2.623905658721924 }, { "auxiliary_loss_clip": 0.01317553, "auxiliary_loss_mlp": 0.00196395, "balance_loss_clip": 1.07950068, "balance_loss_mlp": 0.16865513, "epoch": 0.6652036675184128, "flos": 22344229964160.0, "grad_norm": 194.68605213943553, "language_loss": 0.7767669, "learning_rate": 1.0648684187051316e-06, "loss": 0.79190642, "num_input_tokens_seen": 238822250, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.27709961, "step": 11064, "time_per_iteration": 2.808529853820801 }, { "auxiliary_loss_clip": 0.01258142, "auxiliary_loss_mlp": 0.00122799, "balance_loss_clip": 1.10071039, "balance_loss_mlp": 0.11464508, "epoch": 0.6652637907710807, "flos": 52909633998720.0, "grad_norm": 1.104478105494719, "language_loss": 0.62396896, "learning_rate": 1.0645241685614322e-06, "loss": 0.6377784, "num_input_tokens_seen": 238877190, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.08154297, "step": 11065, "time_per_iteration": 3.0941576957702637 }, { "auxiliary_loss_clip": 0.01335858, "auxiliary_loss_mlp": 0.00231172, "balance_loss_clip": 1.09126985, "balance_loss_mlp": 0.20258537, "epoch": 0.6653239140237487, "flos": 23104637907840.0, "grad_norm": 5.42160028619434, "language_loss": 0.71016169, "learning_rate": 1.0641799538910708e-06, "loss": 0.72583193, "num_input_tokens_seen": 238896010, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.28588867, "step": 11066, "time_per_iteration": 2.7121167182922363 }, { "auxiliary_loss_clip": 0.01310432, "auxiliary_loss_mlp": 0.00225557, "balance_loss_clip": 1.07764697, "balance_loss_mlp": 0.19592187, "epoch": 0.6653840372764167, "flos": 25959393175680.0, "grad_norm": 7.807055371604854, "language_loss": 0.74738038, "learning_rate": 1.0638357747070985e-06, "loss": 0.76274025, "num_input_tokens_seen": 238918990, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.29663086, "step": 11067, "time_per_iteration": 2.7526328563690186 }, { "auxiliary_loss_clip": 0.01250534, "auxiliary_loss_mlp": 0.00081958, "balance_loss_clip": 1.09468865, "balance_loss_mlp": 0.0733752, "epoch": 0.6654441605290846, "flos": 66041985899520.0, "grad_norm": 0.8889512278614532, "language_loss": 0.7135253, "learning_rate": 1.0634916310225684e-06, "loss": 0.72685015, "num_input_tokens_seen": 238975735, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.0859375, "step": 11068, "time_per_iteration": 3.163757801055908 }, { "auxiliary_loss_clip": 0.01249606, "auxiliary_loss_mlp": 0.00109474, "balance_loss_clip": 1.0952673, "balance_loss_mlp": 0.10265537, "epoch": 0.6655042837817526, "flos": 65196112521600.0, "grad_norm": 0.7018646067512857, "language_loss": 0.57425451, "learning_rate": 1.0631475228505285e-06, "loss": 0.58784533, "num_input_tokens_seen": 239042360, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.06835938, "step": 11069, "time_per_iteration": 3.2369637489318848 }, { "auxiliary_loss_clip": 0.01251835, "auxiliary_loss_mlp": 0.00070717, "balance_loss_clip": 1.09750175, "balance_loss_mlp": 0.06327787, "epoch": 0.6655644070344205, "flos": 69008746752000.0, "grad_norm": 0.7615894395703928, "language_loss": 0.62520063, "learning_rate": 1.062803450204029e-06, "loss": 0.63842607, "num_input_tokens_seen": 239109410, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.07421875, "step": 11070, "time_per_iteration": 3.2223222255706787 }, { "auxiliary_loss_clip": 0.01305645, "auxiliary_loss_mlp": 0.00200235, "balance_loss_clip": 1.07565129, "balance_loss_mlp": 0.1740445, "epoch": 0.6656245302870886, "flos": 36315562809600.0, "grad_norm": 4487.2493810219, "language_loss": 0.68167925, "learning_rate": 1.062459413096116e-06, "loss": 0.696738, "num_input_tokens_seen": 239135345, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.26171875, "step": 11071, "time_per_iteration": 2.8197529315948486 }, { "auxiliary_loss_clip": 0.01331726, "auxiliary_loss_mlp": 0.0020697, "balance_loss_clip": 1.09714079, "balance_loss_mlp": 0.18097015, "epoch": 0.6656846535397565, "flos": 21794832466560.0, "grad_norm": 18.159954072695946, "language_loss": 0.79407573, "learning_rate": 1.0621154115398364e-06, "loss": 0.80946267, "num_input_tokens_seen": 239154340, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.26000977, "step": 11072, "time_per_iteration": 2.6498849391937256 }, { "auxiliary_loss_clip": 0.01311318, "auxiliary_loss_mlp": 0.00214606, "balance_loss_clip": 1.07624209, "balance_loss_mlp": 0.18718764, "epoch": 0.6657447767924245, "flos": 37487615592960.0, "grad_norm": 16.924933307414022, "language_loss": 0.76907802, "learning_rate": 1.0617714455482353e-06, "loss": 0.78433722, "num_input_tokens_seen": 239177815, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.27429199, "step": 11073, "time_per_iteration": 2.7898452281951904 }, { "auxiliary_loss_clip": 0.01333971, "auxiliary_loss_mlp": 0.00194516, "balance_loss_clip": 1.08811975, "balance_loss_mlp": 0.16505969, "epoch": 0.6658049000450924, "flos": 16837688206080.0, "grad_norm": 5.998503280476906, "language_loss": 0.67883027, "learning_rate": 1.061427515134354e-06, "loss": 0.69411516, "num_input_tokens_seen": 239195735, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.29431152, "step": 11074, "time_per_iteration": 2.6236162185668945 }, { "auxiliary_loss_clip": 0.01297754, "auxiliary_loss_mlp": 0.00188976, "balance_loss_clip": 1.06922174, "balance_loss_mlp": 0.16313094, "epoch": 0.6658650232977604, "flos": 33510975863040.0, "grad_norm": 54.94338736780369, "language_loss": 0.78335035, "learning_rate": 1.061083620311235e-06, "loss": 0.79821765, "num_input_tokens_seen": 239217535, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.25842285, "step": 11075, "time_per_iteration": 2.7730679512023926 }, { "auxiliary_loss_clip": 0.01312984, "auxiliary_loss_mlp": 0.00204702, "balance_loss_clip": 1.0770936, "balance_loss_mlp": 0.17787966, "epoch": 0.6659251465504283, "flos": 37706311549440.0, "grad_norm": 43.62880066545068, "language_loss": 0.72613734, "learning_rate": 1.0607397610919202e-06, "loss": 0.74131417, "num_input_tokens_seen": 239241975, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.26806641, "step": 11076, "time_per_iteration": 4.205381870269775 }, { "auxiliary_loss_clip": 0.01338323, "auxiliary_loss_mlp": 0.00180675, "balance_loss_clip": 1.08985543, "balance_loss_mlp": 0.14906095, "epoch": 0.6659852698030964, "flos": 24893420232960.0, "grad_norm": 3.7513189059400585, "language_loss": 0.83584177, "learning_rate": 1.0603959374894468e-06, "loss": 0.85103178, "num_input_tokens_seen": 239262025, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.31616211, "step": 11077, "time_per_iteration": 4.214414358139038 }, { "auxiliary_loss_clip": 0.01320126, "auxiliary_loss_mlp": 0.00200548, "balance_loss_clip": 1.08035088, "balance_loss_mlp": 0.1706382, "epoch": 0.6660453930557643, "flos": 24352821567360.0, "grad_norm": 3.6882647767335794, "language_loss": 0.76146972, "learning_rate": 1.0600521495168538e-06, "loss": 0.77667642, "num_input_tokens_seen": 239282775, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.29882812, "step": 11078, "time_per_iteration": 2.729682683944702 }, { "auxiliary_loss_clip": 0.0132357, "auxiliary_loss_mlp": 0.00196999, "balance_loss_clip": 1.08137536, "balance_loss_mlp": 0.16894889, "epoch": 0.6661055163084323, "flos": 10597814380800.0, "grad_norm": 13.291610269706066, "language_loss": 0.79701859, "learning_rate": 1.0597083971871783e-06, "loss": 0.81222427, "num_input_tokens_seen": 239299775, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.28063965, "step": 11079, "time_per_iteration": 2.6364195346832275 }, { "auxiliary_loss_clip": 0.01328776, "auxiliary_loss_mlp": 0.00195573, "balance_loss_clip": 1.08835828, "balance_loss_mlp": 0.16784523, "epoch": 0.6661656395611003, "flos": 24057491944320.0, "grad_norm": 14.40899469674049, "language_loss": 0.86634582, "learning_rate": 1.0593646805134544e-06, "loss": 0.88158929, "num_input_tokens_seen": 239319660, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.27709961, "step": 11080, "time_per_iteration": 2.7041614055633545 }, { "auxiliary_loss_clip": 0.01312844, "auxiliary_loss_mlp": 0.00185565, "balance_loss_clip": 1.08262396, "balance_loss_mlp": 0.16090098, "epoch": 0.6662257628137682, "flos": 23036192542080.0, "grad_norm": 71.88135040582911, "language_loss": 0.84642279, "learning_rate": 1.0590209995087157e-06, "loss": 0.86140686, "num_input_tokens_seen": 239339215, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.24658203, "step": 11081, "time_per_iteration": 4.141645193099976 }, { "auxiliary_loss_clip": 0.01333128, "auxiliary_loss_mlp": 0.00201035, "balance_loss_clip": 1.09021413, "balance_loss_mlp": 0.17297332, "epoch": 0.6662858860664362, "flos": 24754446512640.0, "grad_norm": 2.6529501221131477, "language_loss": 0.88095307, "learning_rate": 1.0586773541859946e-06, "loss": 0.89629471, "num_input_tokens_seen": 239358545, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.28076172, "step": 11082, "time_per_iteration": 2.7488412857055664 }, { "auxiliary_loss_clip": 0.01301449, "auxiliary_loss_mlp": 0.00208948, "balance_loss_clip": 1.07121634, "balance_loss_mlp": 0.18343765, "epoch": 0.6663460093191041, "flos": 20009066883840.0, "grad_norm": 49.59402567509883, "language_loss": 0.89077979, "learning_rate": 1.0583337445583234e-06, "loss": 0.90588379, "num_input_tokens_seen": 239376665, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.25488281, "step": 11083, "time_per_iteration": 2.66951322555542 }, { "auxiliary_loss_clip": 0.01320706, "auxiliary_loss_mlp": 0.00208678, "balance_loss_clip": 1.08130312, "balance_loss_mlp": 0.18062803, "epoch": 0.6664061325717722, "flos": 17821389047040.0, "grad_norm": 136.51337356621013, "language_loss": 0.96703744, "learning_rate": 1.057990170638731e-06, "loss": 0.98233128, "num_input_tokens_seen": 239394345, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.28039551, "step": 11084, "time_per_iteration": 2.667663097381592 }, { "auxiliary_loss_clip": 0.01342587, "auxiliary_loss_mlp": 0.0020289, "balance_loss_clip": 1.09533024, "balance_loss_mlp": 0.17484003, "epoch": 0.6664662558244401, "flos": 18076893465600.0, "grad_norm": 6.850261241113876, "language_loss": 0.86874938, "learning_rate": 1.0576466324402452e-06, "loss": 0.88420421, "num_input_tokens_seen": 239410605, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.28027344, "step": 11085, "time_per_iteration": 4.084920406341553 }, { "auxiliary_loss_clip": 0.01315556, "auxiliary_loss_mlp": 0.00189184, "balance_loss_clip": 1.08029866, "balance_loss_mlp": 0.16285038, "epoch": 0.6665263790771081, "flos": 21574197175680.0, "grad_norm": 4.116345513658924, "language_loss": 0.88226295, "learning_rate": 1.057303129975894e-06, "loss": 0.89731038, "num_input_tokens_seen": 239427155, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.26367188, "step": 11086, "time_per_iteration": 2.710196018218994 }, { "auxiliary_loss_clip": 0.01336157, "auxiliary_loss_mlp": 0.00205891, "balance_loss_clip": 1.09105086, "balance_loss_mlp": 0.17703076, "epoch": 0.666586502329776, "flos": 24206629213440.0, "grad_norm": 572.6338760507011, "language_loss": 0.8415755, "learning_rate": 1.056959663258702e-06, "loss": 0.85699594, "num_input_tokens_seen": 239445510, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.28869629, "step": 11087, "time_per_iteration": 2.6710333824157715 }, { "auxiliary_loss_clip": 0.01301696, "auxiliary_loss_mlp": 0.00231223, "balance_loss_clip": 1.072788, "balance_loss_mlp": 0.2055814, "epoch": 0.666646625582444, "flos": 22200515648640.0, "grad_norm": 21309.640937763797, "language_loss": 0.72633266, "learning_rate": 1.0566162323016939e-06, "loss": 0.74166191, "num_input_tokens_seen": 239464805, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.25634766, "step": 11088, "time_per_iteration": 2.6456172466278076 }, { "auxiliary_loss_clip": 0.01348143, "auxiliary_loss_mlp": 0.00209566, "balance_loss_clip": 1.10506248, "balance_loss_mlp": 0.18210003, "epoch": 0.6667067488351119, "flos": 18259930195200.0, "grad_norm": 14.148509666635064, "language_loss": 0.74393022, "learning_rate": 1.0562728371178928e-06, "loss": 0.7595073, "num_input_tokens_seen": 239483890, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.27441406, "step": 11089, "time_per_iteration": 2.619051694869995 }, { "auxiliary_loss_clip": 0.01324247, "auxiliary_loss_mlp": 0.00204829, "balance_loss_clip": 1.09322, "balance_loss_mlp": 0.17919894, "epoch": 0.66676687208778, "flos": 17236547804160.0, "grad_norm": 5.639536938968374, "language_loss": 0.88286352, "learning_rate": 1.0559294777203221e-06, "loss": 0.89815432, "num_input_tokens_seen": 239500080, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.25622559, "step": 11090, "time_per_iteration": 2.5965497493743896 }, { "auxiliary_loss_clip": 0.01354949, "auxiliary_loss_mlp": 0.002151, "balance_loss_clip": 1.10258651, "balance_loss_mlp": 0.18522595, "epoch": 0.6668269953404479, "flos": 19752197748480.0, "grad_norm": 16.49088350283455, "language_loss": 0.87806815, "learning_rate": 1.0555861541219984e-06, "loss": 0.89376867, "num_input_tokens_seen": 239517335, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.29858398, "step": 11091, "time_per_iteration": 2.6317131519317627 }, { "auxiliary_loss_clip": 0.01329173, "auxiliary_loss_mlp": 0.00197128, "balance_loss_clip": 1.08531487, "balance_loss_mlp": 0.16706333, "epoch": 0.6668871185931159, "flos": 20558428467840.0, "grad_norm": 51.896271976057314, "language_loss": 0.89036846, "learning_rate": 1.0552428663359425e-06, "loss": 0.90563142, "num_input_tokens_seen": 239536240, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.30078125, "step": 11092, "time_per_iteration": 2.605680465698242 }, { "auxiliary_loss_clip": 0.01259865, "auxiliary_loss_mlp": 0.00069026, "balance_loss_clip": 1.1029191, "balance_loss_mlp": 0.06139618, "epoch": 0.6669472418457839, "flos": 58088167735680.0, "grad_norm": 0.7386861047862212, "language_loss": 0.57403326, "learning_rate": 1.0548996143751724e-06, "loss": 0.58732218, "num_input_tokens_seen": 239598000, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.07617188, "step": 11093, "time_per_iteration": 3.178612470626831 }, { "auxiliary_loss_clip": 0.01332201, "auxiliary_loss_mlp": 0.00206973, "balance_loss_clip": 1.09629369, "balance_loss_mlp": 0.17979333, "epoch": 0.6670073650984518, "flos": 26065113880320.0, "grad_norm": 32.46576738181697, "language_loss": 0.83531493, "learning_rate": 1.054556398252703e-06, "loss": 0.85070664, "num_input_tokens_seen": 239617650, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.27172852, "step": 11094, "time_per_iteration": 2.7155601978302 }, { "auxiliary_loss_clip": 0.01325022, "auxiliary_loss_mlp": 0.0021249, "balance_loss_clip": 1.08382905, "balance_loss_mlp": 0.18360591, "epoch": 0.6670674883511198, "flos": 32416849635840.0, "grad_norm": 13.045531330639157, "language_loss": 0.82024777, "learning_rate": 1.05421321798155e-06, "loss": 0.83562291, "num_input_tokens_seen": 239639825, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.28894043, "step": 11095, "time_per_iteration": 2.7791244983673096 }, { "auxiliary_loss_clip": 0.01320421, "auxiliary_loss_mlp": 0.00188036, "balance_loss_clip": 1.08649921, "balance_loss_mlp": 0.16185766, "epoch": 0.6671276116037878, "flos": 18037786533120.0, "grad_norm": 17.444688079774846, "language_loss": 0.83250791, "learning_rate": 1.053870073574727e-06, "loss": 0.84759247, "num_input_tokens_seen": 239656300, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.26147461, "step": 11096, "time_per_iteration": 2.639589309692383 }, { "auxiliary_loss_clip": 0.01312894, "auxiliary_loss_mlp": 0.00204797, "balance_loss_clip": 1.0797956, "balance_loss_mlp": 0.1795601, "epoch": 0.6671877348564558, "flos": 23767046570880.0, "grad_norm": 5.704983351897757, "language_loss": 0.72099102, "learning_rate": 1.0535269650452456e-06, "loss": 0.73616791, "num_input_tokens_seen": 239676655, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.25219727, "step": 11097, "time_per_iteration": 2.669074058532715 }, { "auxiliary_loss_clip": 0.01317717, "auxiliary_loss_mlp": 0.00219107, "balance_loss_clip": 1.08482218, "balance_loss_mlp": 0.19173642, "epoch": 0.6672478581091237, "flos": 20918360701440.0, "grad_norm": 8.470894060410712, "language_loss": 0.83871341, "learning_rate": 1.0531838924061158e-06, "loss": 0.85408163, "num_input_tokens_seen": 239695430, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.27368164, "step": 11098, "time_per_iteration": 2.6763150691986084 }, { "auxiliary_loss_clip": 0.01318168, "auxiliary_loss_mlp": 0.00214346, "balance_loss_clip": 1.08470082, "balance_loss_mlp": 0.18589067, "epoch": 0.6673079813617917, "flos": 27855799626240.0, "grad_norm": 7.184630282611995, "language_loss": 0.82195252, "learning_rate": 1.0528408556703476e-06, "loss": 0.83727765, "num_input_tokens_seen": 239717070, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.28430176, "step": 11099, "time_per_iteration": 2.735762596130371 }, { "auxiliary_loss_clip": 0.01312954, "auxiliary_loss_mlp": 0.00210594, "balance_loss_clip": 1.07816482, "balance_loss_mlp": 0.18455821, "epoch": 0.6673681046144596, "flos": 21616859554560.0, "grad_norm": 7.482101810110598, "language_loss": 0.85700184, "learning_rate": 1.0524978548509502e-06, "loss": 0.87223727, "num_input_tokens_seen": 239737105, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.26074219, "step": 11100, "time_per_iteration": 2.681617259979248 }, { "auxiliary_loss_clip": 0.01309243, "auxiliary_loss_mlp": 0.00197878, "balance_loss_clip": 1.07835829, "balance_loss_mlp": 0.17290333, "epoch": 0.6674282278671276, "flos": 20889884194560.0, "grad_norm": 25.506406302488685, "language_loss": 0.67203647, "learning_rate": 1.0521548899609288e-06, "loss": 0.68710768, "num_input_tokens_seen": 239757835, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.24987793, "step": 11101, "time_per_iteration": 2.69958233833313 }, { "auxiliary_loss_clip": 0.01347545, "auxiliary_loss_mlp": 0.00223254, "balance_loss_clip": 1.09571695, "balance_loss_mlp": 0.19333267, "epoch": 0.6674883511197955, "flos": 23624194181760.0, "grad_norm": 7.003055670503995, "language_loss": 0.82108068, "learning_rate": 1.0518119610132884e-06, "loss": 0.83678865, "num_input_tokens_seen": 239775425, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.29919434, "step": 11102, "time_per_iteration": 2.678558826446533 }, { "auxiliary_loss_clip": 0.01315001, "auxiliary_loss_mlp": 0.00200844, "balance_loss_clip": 1.07688498, "balance_loss_mlp": 0.1756787, "epoch": 0.6675484743724636, "flos": 19609668581760.0, "grad_norm": 2.6948460577015205, "language_loss": 0.90081221, "learning_rate": 1.051469068021034e-06, "loss": 0.91597068, "num_input_tokens_seen": 239794605, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.25195312, "step": 11103, "time_per_iteration": 2.6501219272613525 }, { "auxiliary_loss_clip": 0.01318945, "auxiliary_loss_mlp": 0.00208936, "balance_loss_clip": 1.07591355, "balance_loss_mlp": 0.18164852, "epoch": 0.6676085976251315, "flos": 14319452482560.0, "grad_norm": 2.8721578807070047, "language_loss": 0.86369944, "learning_rate": 1.0511262109971668e-06, "loss": 0.87897813, "num_input_tokens_seen": 239812135, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.27270508, "step": 11104, "time_per_iteration": 2.6603171825408936 }, { "auxiliary_loss_clip": 0.01325105, "auxiliary_loss_mlp": 0.00209222, "balance_loss_clip": 1.08781636, "balance_loss_mlp": 0.18121944, "epoch": 0.6676687208777995, "flos": 38104596529920.0, "grad_norm": 83.30280752928206, "language_loss": 0.65825939, "learning_rate": 1.0507833899546889e-06, "loss": 0.6736027, "num_input_tokens_seen": 239835845, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.28039551, "step": 11105, "time_per_iteration": 2.806737184524536 }, { "auxiliary_loss_clip": 0.01335285, "auxiliary_loss_mlp": 0.00203197, "balance_loss_clip": 1.08751881, "balance_loss_mlp": 0.1718931, "epoch": 0.6677288441304675, "flos": 23981576549760.0, "grad_norm": 99.13133577267284, "language_loss": 0.84299159, "learning_rate": 1.0504406049066e-06, "loss": 0.85837638, "num_input_tokens_seen": 239853820, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.31323242, "step": 11106, "time_per_iteration": 2.6882996559143066 }, { "auxiliary_loss_clip": 0.01311336, "auxiliary_loss_mlp": 0.00208662, "balance_loss_clip": 1.07787359, "balance_loss_mlp": 0.1815419, "epoch": 0.6677889673831354, "flos": 24170682677760.0, "grad_norm": 94.49071588201153, "language_loss": 0.84850132, "learning_rate": 1.0500978558659e-06, "loss": 0.86370134, "num_input_tokens_seen": 239873365, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.27124023, "step": 11107, "time_per_iteration": 2.8059704303741455 }, { "auxiliary_loss_clip": 0.01314509, "auxiliary_loss_mlp": 0.00186347, "balance_loss_clip": 1.07883501, "balance_loss_mlp": 0.15939346, "epoch": 0.6678490906358034, "flos": 22309648145280.0, "grad_norm": 12.566571384626034, "language_loss": 0.97566247, "learning_rate": 1.049755142845583e-06, "loss": 0.99067098, "num_input_tokens_seen": 239891215, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.26965332, "step": 11108, "time_per_iteration": 2.716533660888672 }, { "auxiliary_loss_clip": 0.01295367, "auxiliary_loss_mlp": 0.00199134, "balance_loss_clip": 1.06362414, "balance_loss_mlp": 0.1751256, "epoch": 0.6679092138884714, "flos": 36898752026880.0, "grad_norm": 55.56784468847424, "language_loss": 0.88102758, "learning_rate": 1.049412465858646e-06, "loss": 0.89597261, "num_input_tokens_seen": 239913490, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.2401123, "step": 11109, "time_per_iteration": 2.8695895671844482 }, { "auxiliary_loss_clip": 0.01330776, "auxiliary_loss_mlp": 0.00217463, "balance_loss_clip": 1.0863235, "balance_loss_mlp": 0.18764897, "epoch": 0.6679693371411394, "flos": 18150294908160.0, "grad_norm": 9.017437806504473, "language_loss": 0.79914677, "learning_rate": 1.0490698249180847e-06, "loss": 0.81462908, "num_input_tokens_seen": 239931565, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.2980957, "step": 11110, "time_per_iteration": 2.610011577606201 }, { "auxiliary_loss_clip": 0.01336959, "auxiliary_loss_mlp": 0.0022371, "balance_loss_clip": 1.08530641, "balance_loss_mlp": 0.19016463, "epoch": 0.6680294603938073, "flos": 27198167472000.0, "grad_norm": 3.2974133694201213, "language_loss": 0.82343972, "learning_rate": 1.04872722003689e-06, "loss": 0.83904648, "num_input_tokens_seen": 239952395, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.3359375, "step": 11111, "time_per_iteration": 2.7356085777282715 }, { "auxiliary_loss_clip": 0.01313772, "auxiliary_loss_mlp": 0.00201354, "balance_loss_clip": 1.07758522, "balance_loss_mlp": 0.17274374, "epoch": 0.6680895836464753, "flos": 21725309692800.0, "grad_norm": 60.321112960300894, "language_loss": 0.75017422, "learning_rate": 1.0483846512280553e-06, "loss": 0.76532555, "num_input_tokens_seen": 239968910, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.28601074, "step": 11112, "time_per_iteration": 2.627368927001953 }, { "auxiliary_loss_clip": 0.01315317, "auxiliary_loss_mlp": 0.00231732, "balance_loss_clip": 1.0774579, "balance_loss_mlp": 0.20506452, "epoch": 0.6681497068991432, "flos": 19646477043840.0, "grad_norm": 15.389212339500732, "language_loss": 0.71247816, "learning_rate": 1.048042118504569e-06, "loss": 0.72794867, "num_input_tokens_seen": 239987680, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.26672363, "step": 11113, "time_per_iteration": 2.794199228286743 }, { "auxiliary_loss_clip": 0.01301784, "auxiliary_loss_mlp": 0.00194226, "balance_loss_clip": 1.07128799, "balance_loss_mlp": 0.16872764, "epoch": 0.6682098301518112, "flos": 17419153570560.0, "grad_norm": 836.2031743434474, "language_loss": 0.75061989, "learning_rate": 1.047699621879422e-06, "loss": 0.76558, "num_input_tokens_seen": 240005790, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.25512695, "step": 11114, "time_per_iteration": 2.6205427646636963 }, { "auxiliary_loss_clip": 0.01305907, "auxiliary_loss_mlp": 0.00226428, "balance_loss_clip": 1.07565737, "balance_loss_mlp": 0.19948708, "epoch": 0.6682699534044791, "flos": 22599016110720.0, "grad_norm": 39.257555293671246, "language_loss": 0.8568027, "learning_rate": 1.0473571613655998e-06, "loss": 0.87212598, "num_input_tokens_seen": 240025895, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.26916504, "step": 11115, "time_per_iteration": 2.6771557331085205 }, { "auxiliary_loss_clip": 0.01323149, "auxiliary_loss_mlp": 0.00214554, "balance_loss_clip": 1.07749939, "balance_loss_mlp": 0.18482274, "epoch": 0.6683300766571472, "flos": 24863686750080.0, "grad_norm": 25.94574468096145, "language_loss": 0.87993014, "learning_rate": 1.0470147369760896e-06, "loss": 0.89530718, "num_input_tokens_seen": 240044880, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.29736328, "step": 11116, "time_per_iteration": 2.6875 }, { "auxiliary_loss_clip": 0.01318369, "auxiliary_loss_mlp": 0.00214739, "balance_loss_clip": 1.07876205, "balance_loss_mlp": 0.18705896, "epoch": 0.6683901999098151, "flos": 27126633536640.0, "grad_norm": 2.1403193181815654, "language_loss": 0.87234044, "learning_rate": 1.0466723487238768e-06, "loss": 0.88767159, "num_input_tokens_seen": 240065785, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.27722168, "step": 11117, "time_per_iteration": 2.6962273120880127 }, { "auxiliary_loss_clip": 0.01320599, "auxiliary_loss_mlp": 0.00183809, "balance_loss_clip": 1.07633781, "balance_loss_mlp": 0.15529433, "epoch": 0.6684503231624831, "flos": 20739023072640.0, "grad_norm": 79.70186049883748, "language_loss": 0.72723031, "learning_rate": 1.0463299966219441e-06, "loss": 0.7422744, "num_input_tokens_seen": 240085130, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.28491211, "step": 11118, "time_per_iteration": 2.663332462310791 }, { "auxiliary_loss_clip": 0.01308865, "auxiliary_loss_mlp": 0.00219591, "balance_loss_clip": 1.07405233, "balance_loss_mlp": 0.19251896, "epoch": 0.668510446415151, "flos": 21762189982080.0, "grad_norm": 5.823710943337122, "language_loss": 0.77157974, "learning_rate": 1.0459876806832727e-06, "loss": 0.7868644, "num_input_tokens_seen": 240105495, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.27075195, "step": 11119, "time_per_iteration": 4.132906913757324 }, { "auxiliary_loss_clip": 0.01325467, "auxiliary_loss_mlp": 0.00220301, "balance_loss_clip": 1.084095, "balance_loss_mlp": 0.19282301, "epoch": 0.668570569667819, "flos": 30191250015360.0, "grad_norm": 5.2350465745689405, "language_loss": 0.75270492, "learning_rate": 1.0456454009208448e-06, "loss": 0.76816261, "num_input_tokens_seen": 240125455, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.27478027, "step": 11120, "time_per_iteration": 4.179814100265503 }, { "auxiliary_loss_clip": 0.01324784, "auxiliary_loss_mlp": 0.00238188, "balance_loss_clip": 1.08054221, "balance_loss_mlp": 0.21063852, "epoch": 0.668630692920487, "flos": 24170646764160.0, "grad_norm": 34.87265951134468, "language_loss": 0.79572982, "learning_rate": 1.045303157347638e-06, "loss": 0.81135952, "num_input_tokens_seen": 240143870, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.27563477, "step": 11121, "time_per_iteration": 2.7327256202697754 }, { "auxiliary_loss_clip": 0.01313256, "auxiliary_loss_mlp": 0.00232732, "balance_loss_clip": 1.07154477, "balance_loss_mlp": 0.20590973, "epoch": 0.668690816173155, "flos": 17457147181440.0, "grad_norm": 27.538550582892153, "language_loss": 0.80934107, "learning_rate": 1.0449609499766316e-06, "loss": 0.82480097, "num_input_tokens_seen": 240161020, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.26806641, "step": 11122, "time_per_iteration": 2.6584713459014893 }, { "auxiliary_loss_clip": 0.01311803, "auxiliary_loss_mlp": 0.00213238, "balance_loss_clip": 1.07400084, "balance_loss_mlp": 0.18638057, "epoch": 0.668750939425823, "flos": 25005102595200.0, "grad_norm": 16.93209366373838, "language_loss": 0.78807271, "learning_rate": 1.0446187788208015e-06, "loss": 0.80332315, "num_input_tokens_seen": 240179820, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.26867676, "step": 11123, "time_per_iteration": 4.1461341381073 }, { "auxiliary_loss_clip": 0.01322354, "auxiliary_loss_mlp": 0.00232309, "balance_loss_clip": 1.08337045, "balance_loss_mlp": 0.20511785, "epoch": 0.6688110626784909, "flos": 24096778444800.0, "grad_norm": 15.058782835814498, "language_loss": 0.88433719, "learning_rate": 1.0442766438931244e-06, "loss": 0.89988387, "num_input_tokens_seen": 240200130, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.27185059, "step": 11124, "time_per_iteration": 2.702263116836548 }, { "auxiliary_loss_clip": 0.0132587, "auxiliary_loss_mlp": 0.00208562, "balance_loss_clip": 1.08840537, "balance_loss_mlp": 0.18302739, "epoch": 0.6688711859311589, "flos": 21759532375680.0, "grad_norm": 4.686264789007286, "language_loss": 0.80359197, "learning_rate": 1.0439345452065716e-06, "loss": 0.81893629, "num_input_tokens_seen": 240217945, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.25537109, "step": 11125, "time_per_iteration": 2.664653778076172 }, { "auxiliary_loss_clip": 0.01310936, "auxiliary_loss_mlp": 0.00221057, "balance_loss_clip": 1.07628167, "balance_loss_mlp": 0.19307905, "epoch": 0.6689313091838268, "flos": 22929645824640.0, "grad_norm": 11.897673305683146, "language_loss": 0.76183593, "learning_rate": 1.043592482774116e-06, "loss": 0.77715582, "num_input_tokens_seen": 240237220, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.27966309, "step": 11126, "time_per_iteration": 2.6937265396118164 }, { "auxiliary_loss_clip": 0.01302463, "auxiliary_loss_mlp": 0.00227183, "balance_loss_clip": 1.06960869, "balance_loss_mlp": 0.20045656, "epoch": 0.6689914324364948, "flos": 20886149180160.0, "grad_norm": 3.9871094588706377, "language_loss": 0.78618526, "learning_rate": 1.0432504566087305e-06, "loss": 0.80148172, "num_input_tokens_seen": 240256000, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.26721191, "step": 11127, "time_per_iteration": 4.171586990356445 }, { "auxiliary_loss_clip": 0.01351041, "auxiliary_loss_mlp": 0.00223555, "balance_loss_clip": 1.09441113, "balance_loss_mlp": 0.19380051, "epoch": 0.6690515556891627, "flos": 22748225207040.0, "grad_norm": 29370.680498656297, "language_loss": 0.90261739, "learning_rate": 1.0429084667233827e-06, "loss": 0.91836333, "num_input_tokens_seen": 240275845, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.29785156, "step": 11128, "time_per_iteration": 2.7912137508392334 }, { "auxiliary_loss_clip": 0.01322946, "auxiliary_loss_mlp": 0.00226613, "balance_loss_clip": 1.08219504, "balance_loss_mlp": 0.19991052, "epoch": 0.6691116789418308, "flos": 23331450337920.0, "grad_norm": 59.63433753567568, "language_loss": 0.87685573, "learning_rate": 1.0425665131310427e-06, "loss": 0.89235133, "num_input_tokens_seen": 240294095, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.26708984, "step": 11129, "time_per_iteration": 2.6986677646636963 }, { "auxiliary_loss_clip": 0.01299244, "auxiliary_loss_mlp": 0.00223407, "balance_loss_clip": 1.0658927, "balance_loss_mlp": 0.19601291, "epoch": 0.6691718021944987, "flos": 32447014081920.0, "grad_norm": 70.74143421719654, "language_loss": 0.77160859, "learning_rate": 1.0422245958446762e-06, "loss": 0.78683507, "num_input_tokens_seen": 240313460, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.27416992, "step": 11130, "time_per_iteration": 2.7375078201293945 }, { "auxiliary_loss_clip": 0.01304307, "auxiliary_loss_mlp": 0.00203511, "balance_loss_clip": 1.07301176, "balance_loss_mlp": 0.17958587, "epoch": 0.6692319254471667, "flos": 23731602825600.0, "grad_norm": 303.49438202720097, "language_loss": 0.77707672, "learning_rate": 1.0418827148772486e-06, "loss": 0.79215491, "num_input_tokens_seen": 240333540, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.23925781, "step": 11131, "time_per_iteration": 2.692816734313965 }, { "auxiliary_loss_clip": 0.0132581, "auxiliary_loss_mlp": 0.00212334, "balance_loss_clip": 1.08032918, "balance_loss_mlp": 0.18431999, "epoch": 0.6692920486998346, "flos": 14427902620800.0, "grad_norm": 89.21277387851875, "language_loss": 0.81553006, "learning_rate": 1.0415408702417243e-06, "loss": 0.83091152, "num_input_tokens_seen": 240350085, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.2800293, "step": 11132, "time_per_iteration": 2.6575253009796143 }, { "auxiliary_loss_clip": 0.0132818, "auxiliary_loss_mlp": 0.00217392, "balance_loss_clip": 1.08811736, "balance_loss_mlp": 0.18955675, "epoch": 0.6693521719525026, "flos": 21507475662720.0, "grad_norm": 39.51931179937368, "language_loss": 0.84746683, "learning_rate": 1.0411990619510661e-06, "loss": 0.86292255, "num_input_tokens_seen": 240370015, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.2779541, "step": 11133, "time_per_iteration": 2.740431785583496 }, { "auxiliary_loss_clip": 0.01339219, "auxiliary_loss_mlp": 0.00232521, "balance_loss_clip": 1.08943141, "balance_loss_mlp": 0.2047694, "epoch": 0.6694122952051706, "flos": 25406943022080.0, "grad_norm": 15.087878586981141, "language_loss": 0.75975776, "learning_rate": 1.0408572900182363e-06, "loss": 0.77547514, "num_input_tokens_seen": 240390770, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.27734375, "step": 11134, "time_per_iteration": 2.6746294498443604 }, { "auxiliary_loss_clip": 0.0133751, "auxiliary_loss_mlp": 0.00235386, "balance_loss_clip": 1.09002805, "balance_loss_mlp": 0.20701423, "epoch": 0.6694724184578386, "flos": 25661729168640.0, "grad_norm": 15.50452115507411, "language_loss": 0.87496895, "learning_rate": 1.0405155544561943e-06, "loss": 0.89069796, "num_input_tokens_seen": 240409590, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.28381348, "step": 11135, "time_per_iteration": 2.6890697479248047 }, { "auxiliary_loss_clip": 0.01299115, "auxiliary_loss_mlp": 0.00213938, "balance_loss_clip": 1.06960583, "balance_loss_mlp": 0.18753341, "epoch": 0.6695325417105066, "flos": 17709311635200.0, "grad_norm": 10.35013586630069, "language_loss": 0.80971396, "learning_rate": 1.040173855277898e-06, "loss": 0.82484448, "num_input_tokens_seen": 240428180, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.26416016, "step": 11136, "time_per_iteration": 2.614917516708374 }, { "auxiliary_loss_clip": 0.01308536, "auxiliary_loss_mlp": 0.0020966, "balance_loss_clip": 1.06853032, "balance_loss_mlp": 0.18205138, "epoch": 0.6695926649631745, "flos": 24460050643200.0, "grad_norm": 3.3721020561873285, "language_loss": 0.73050344, "learning_rate": 1.0398321924963061e-06, "loss": 0.74568546, "num_input_tokens_seen": 240447815, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.27612305, "step": 11137, "time_per_iteration": 2.6895534992218018 }, { "auxiliary_loss_clip": 0.01318249, "auxiliary_loss_mlp": 0.00221247, "balance_loss_clip": 1.08054447, "balance_loss_mlp": 0.19424629, "epoch": 0.6696527882158425, "flos": 24280138396800.0, "grad_norm": 10.84939039972831, "language_loss": 0.76255822, "learning_rate": 1.0394905661243724e-06, "loss": 0.77795315, "num_input_tokens_seen": 240468635, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.27026367, "step": 11138, "time_per_iteration": 2.650568962097168 }, { "auxiliary_loss_clip": 0.01298273, "auxiliary_loss_mlp": 0.00220836, "balance_loss_clip": 1.06370175, "balance_loss_mlp": 0.19408543, "epoch": 0.6697129114685104, "flos": 23002759958400.0, "grad_norm": 81.75616939443583, "language_loss": 0.80177253, "learning_rate": 1.039148976175053e-06, "loss": 0.81696355, "num_input_tokens_seen": 240488550, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.26733398, "step": 11139, "time_per_iteration": 2.6941585540771484 }, { "auxiliary_loss_clip": 0.01304608, "auxiliary_loss_mlp": 0.00207029, "balance_loss_clip": 1.07178128, "balance_loss_mlp": 0.18257928, "epoch": 0.6697730347211784, "flos": 22638123043200.0, "grad_norm": 86.34494938487205, "language_loss": 0.79949015, "learning_rate": 1.0388074226613016e-06, "loss": 0.81460655, "num_input_tokens_seen": 240508330, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.24462891, "step": 11140, "time_per_iteration": 2.6625864505767822 }, { "auxiliary_loss_clip": 0.01330353, "auxiliary_loss_mlp": 0.0023131, "balance_loss_clip": 1.07981801, "balance_loss_mlp": 0.20131667, "epoch": 0.6698331579738463, "flos": 28877242682880.0, "grad_norm": 37.37632069177336, "language_loss": 0.83002639, "learning_rate": 1.0384659055960691e-06, "loss": 0.84564304, "num_input_tokens_seen": 240528470, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.29980469, "step": 11141, "time_per_iteration": 2.7214157581329346 }, { "auxiliary_loss_clip": 0.01319799, "auxiliary_loss_mlp": 0.00222197, "balance_loss_clip": 1.07701063, "balance_loss_mlp": 0.19409905, "epoch": 0.6698932812265144, "flos": 24207096090240.0, "grad_norm": 1.9154356018265815, "language_loss": 0.89778113, "learning_rate": 1.0381244249923052e-06, "loss": 0.91320109, "num_input_tokens_seen": 240547815, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.28100586, "step": 11142, "time_per_iteration": 2.7078449726104736 }, { "auxiliary_loss_clip": 0.01299201, "auxiliary_loss_mlp": 0.00219681, "balance_loss_clip": 1.06559968, "balance_loss_mlp": 0.19350302, "epoch": 0.6699534044791823, "flos": 22090269830400.0, "grad_norm": 4.079309301758657, "language_loss": 0.7705518, "learning_rate": 1.037782980862959e-06, "loss": 0.78574067, "num_input_tokens_seen": 240567765, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.26196289, "step": 11143, "time_per_iteration": 2.6819558143615723 }, { "auxiliary_loss_clip": 0.01304142, "auxiliary_loss_mlp": 0.00219671, "balance_loss_clip": 1.06657827, "balance_loss_mlp": 0.19412413, "epoch": 0.6700135277318503, "flos": 25192377129600.0, "grad_norm": 11.126824571937712, "language_loss": 0.76712799, "learning_rate": 1.0374415732209796e-06, "loss": 0.78236616, "num_input_tokens_seen": 240590750, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.25561523, "step": 11144, "time_per_iteration": 2.7973074913024902 }, { "auxiliary_loss_clip": 0.01312998, "auxiliary_loss_mlp": 0.00251638, "balance_loss_clip": 1.07850671, "balance_loss_mlp": 0.2248043, "epoch": 0.6700736509845182, "flos": 23440187784960.0, "grad_norm": 21.517507189600842, "language_loss": 0.80583131, "learning_rate": 1.0371002020793114e-06, "loss": 0.82147765, "num_input_tokens_seen": 240608875, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.26831055, "step": 11145, "time_per_iteration": 2.730835437774658 }, { "auxiliary_loss_clip": 0.01327684, "auxiliary_loss_mlp": 0.0021291, "balance_loss_clip": 1.08515453, "balance_loss_mlp": 0.18592066, "epoch": 0.6701337742371862, "flos": 24389953251840.0, "grad_norm": 70.07832585781325, "language_loss": 0.80427998, "learning_rate": 1.0367588674509008e-06, "loss": 0.81968594, "num_input_tokens_seen": 240628565, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.27001953, "step": 11146, "time_per_iteration": 2.7705130577087402 }, { "auxiliary_loss_clip": 0.01307497, "auxiliary_loss_mlp": 0.00225625, "balance_loss_clip": 1.07075119, "balance_loss_mlp": 0.19944689, "epoch": 0.6701938974898543, "flos": 14793652857600.0, "grad_norm": 42.14412677715837, "language_loss": 0.88102555, "learning_rate": 1.0364175693486905e-06, "loss": 0.89635682, "num_input_tokens_seen": 240646325, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.26171875, "step": 11147, "time_per_iteration": 2.6426525115966797 }, { "auxiliary_loss_clip": 0.01329189, "auxiliary_loss_mlp": 0.00223919, "balance_loss_clip": 1.08586931, "balance_loss_mlp": 0.1975261, "epoch": 0.6702540207425222, "flos": 20154002261760.0, "grad_norm": 50.0046244054322, "language_loss": 0.78156543, "learning_rate": 1.0360763077856218e-06, "loss": 0.79709655, "num_input_tokens_seen": 240666145, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.26403809, "step": 11148, "time_per_iteration": 2.676154851913452 }, { "auxiliary_loss_clip": 0.01304097, "auxiliary_loss_mlp": 0.00222921, "balance_loss_clip": 1.07158685, "balance_loss_mlp": 0.19956753, "epoch": 0.6703141439951902, "flos": 21214157201280.0, "grad_norm": 5.552412564141966, "language_loss": 0.78365409, "learning_rate": 1.035735082774636e-06, "loss": 0.79892427, "num_input_tokens_seen": 240685570, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.23364258, "step": 11149, "time_per_iteration": 2.672943115234375 }, { "auxiliary_loss_clip": 0.01289192, "auxiliary_loss_mlp": 0.00231151, "balance_loss_clip": 1.05793822, "balance_loss_mlp": 0.20683183, "epoch": 0.6703742672478581, "flos": 23112538899840.0, "grad_norm": 2.8394804695397107, "language_loss": 0.82109839, "learning_rate": 1.0353938943286727e-06, "loss": 0.8363018, "num_input_tokens_seen": 240706945, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.24316406, "step": 11150, "time_per_iteration": 2.763502836227417 }, { "auxiliary_loss_clip": 0.01327913, "auxiliary_loss_mlp": 0.00221915, "balance_loss_clip": 1.08658934, "balance_loss_mlp": 0.19583249, "epoch": 0.6704343905005261, "flos": 22528918719360.0, "grad_norm": 18.442359967408713, "language_loss": 0.86991942, "learning_rate": 1.035052742460671e-06, "loss": 0.8854177, "num_input_tokens_seen": 240727990, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.26086426, "step": 11151, "time_per_iteration": 2.7157938480377197 }, { "auxiliary_loss_clip": 0.01295024, "auxiliary_loss_mlp": 0.00171155, "balance_loss_clip": 1.13366318, "balance_loss_mlp": 0.1614753, "epoch": 0.670494513753194, "flos": 64793158773120.0, "grad_norm": 0.7653275710376538, "language_loss": 0.55300856, "learning_rate": 1.0347116271835643e-06, "loss": 0.56767035, "num_input_tokens_seen": 240790380, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.09667969, "step": 11152, "time_per_iteration": 3.2375335693359375 }, { "auxiliary_loss_clip": 0.01310746, "auxiliary_loss_mlp": 0.00229638, "balance_loss_clip": 1.06665266, "balance_loss_mlp": 0.20124227, "epoch": 0.670554637005862, "flos": 23511506238720.0, "grad_norm": 25.932823779974033, "language_loss": 0.90204805, "learning_rate": 1.0343705485102896e-06, "loss": 0.91745186, "num_input_tokens_seen": 240811545, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.28442383, "step": 11153, "time_per_iteration": 2.7074735164642334 }, { "auxiliary_loss_clip": 0.01312386, "auxiliary_loss_mlp": 0.00215123, "balance_loss_clip": 1.0714947, "balance_loss_mlp": 0.18794341, "epoch": 0.67061476025853, "flos": 19463404400640.0, "grad_norm": 3.7851652937530322, "language_loss": 0.82565665, "learning_rate": 1.0340295064537814e-06, "loss": 0.84093177, "num_input_tokens_seen": 240831380, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.27172852, "step": 11154, "time_per_iteration": 2.6449761390686035 }, { "auxiliary_loss_clip": 0.01324564, "auxiliary_loss_mlp": 0.00222543, "balance_loss_clip": 1.08121371, "balance_loss_mlp": 0.19707972, "epoch": 0.670674883511198, "flos": 20519967980160.0, "grad_norm": 7.603343091415059, "language_loss": 0.83139914, "learning_rate": 1.0336885010269702e-06, "loss": 0.84687024, "num_input_tokens_seen": 240851855, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.25488281, "step": 11155, "time_per_iteration": 2.7192580699920654 }, { "auxiliary_loss_clip": 0.01300291, "auxiliary_loss_mlp": 0.00232091, "balance_loss_clip": 1.07050157, "balance_loss_mlp": 0.20719999, "epoch": 0.6707350067638659, "flos": 25483971738240.0, "grad_norm": 25.743831235299304, "language_loss": 0.87111139, "learning_rate": 1.0333475322427878e-06, "loss": 0.88643527, "num_input_tokens_seen": 240869980, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.24902344, "step": 11156, "time_per_iteration": 2.727607488632202 }, { "auxiliary_loss_clip": 0.01287585, "auxiliary_loss_mlp": 0.00224908, "balance_loss_clip": 1.05688679, "balance_loss_mlp": 0.19931388, "epoch": 0.6707951300165339, "flos": 22273450214400.0, "grad_norm": 10.500207549851753, "language_loss": 0.82196856, "learning_rate": 1.033006600114165e-06, "loss": 0.83709353, "num_input_tokens_seen": 240888680, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.25585938, "step": 11157, "time_per_iteration": 2.701732635498047 }, { "auxiliary_loss_clip": 0.0130142, "auxiliary_loss_mlp": 0.0020927, "balance_loss_clip": 1.06456459, "balance_loss_mlp": 0.18414101, "epoch": 0.6708552532692018, "flos": 23984593292160.0, "grad_norm": 7.006803858516912, "language_loss": 0.81962526, "learning_rate": 1.0326657046540282e-06, "loss": 0.83473217, "num_input_tokens_seen": 240909050, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.25158691, "step": 11158, "time_per_iteration": 2.6897823810577393 }, { "auxiliary_loss_clip": 0.01285535, "auxiliary_loss_mlp": 0.00220889, "balance_loss_clip": 1.05700874, "balance_loss_mlp": 0.19598572, "epoch": 0.6709153765218698, "flos": 24937519155840.0, "grad_norm": 3.0327757934913593, "language_loss": 0.887137, "learning_rate": 1.0323248458753044e-06, "loss": 0.90220124, "num_input_tokens_seen": 240930035, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.24914551, "step": 11159, "time_per_iteration": 2.908073902130127 }, { "auxiliary_loss_clip": 0.01321718, "auxiliary_loss_mlp": 0.00232846, "balance_loss_clip": 1.07968867, "balance_loss_mlp": 0.20626272, "epoch": 0.6709754997745379, "flos": 17530225401600.0, "grad_norm": 51.89727699016217, "language_loss": 0.8589862, "learning_rate": 1.0319840237909193e-06, "loss": 0.87453187, "num_input_tokens_seen": 240948895, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.26586914, "step": 11160, "time_per_iteration": 2.653062105178833 }, { "auxiliary_loss_clip": 0.012906, "auxiliary_loss_mlp": 0.00231904, "balance_loss_clip": 1.05886316, "balance_loss_mlp": 0.20781192, "epoch": 0.6710356230272058, "flos": 22090880361600.0, "grad_norm": 7.398265330509629, "language_loss": 0.80798954, "learning_rate": 1.0316432384137978e-06, "loss": 0.82321459, "num_input_tokens_seen": 240967770, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.2409668, "step": 11161, "time_per_iteration": 4.079337120056152 }, { "auxiliary_loss_clip": 0.01318755, "auxiliary_loss_mlp": 0.00239053, "balance_loss_clip": 1.07195854, "balance_loss_mlp": 0.21313679, "epoch": 0.6710957462798738, "flos": 24206449645440.0, "grad_norm": 5.170603133659091, "language_loss": 0.77752888, "learning_rate": 1.0313024897568618e-06, "loss": 0.79310697, "num_input_tokens_seen": 240988985, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.25915527, "step": 11162, "time_per_iteration": 4.1115453243255615 }, { "auxiliary_loss_clip": 0.01281767, "auxiliary_loss_mlp": 0.00215832, "balance_loss_clip": 1.05368853, "balance_loss_mlp": 0.19194284, "epoch": 0.6711558695325417, "flos": 19093955063040.0, "grad_norm": 1.9691619794633932, "language_loss": 0.7583847, "learning_rate": 1.030961777833032e-06, "loss": 0.77336073, "num_input_tokens_seen": 241005455, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.23913574, "step": 11163, "time_per_iteration": 2.6660752296447754 }, { "auxiliary_loss_clip": 0.01292569, "auxiliary_loss_mlp": 0.00235627, "balance_loss_clip": 1.06611109, "balance_loss_mlp": 0.21037829, "epoch": 0.6712159927852097, "flos": 25557875971200.0, "grad_norm": 96.02624115754256, "language_loss": 0.82285243, "learning_rate": 1.0306211026552291e-06, "loss": 0.83813441, "num_input_tokens_seen": 241026175, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.25244141, "step": 11164, "time_per_iteration": 2.727402687072754 }, { "auxiliary_loss_clip": 0.01318483, "auxiliary_loss_mlp": 0.00235922, "balance_loss_clip": 1.07845807, "balance_loss_mlp": 0.20833676, "epoch": 0.6712761160378776, "flos": 22228812587520.0, "grad_norm": 8.542920958716882, "language_loss": 0.77385592, "learning_rate": 1.0302804642363704e-06, "loss": 0.78939998, "num_input_tokens_seen": 241044040, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.27563477, "step": 11165, "time_per_iteration": 2.683198928833008 }, { "auxiliary_loss_clip": 0.01305686, "auxiliary_loss_mlp": 0.00230589, "balance_loss_clip": 1.07147026, "balance_loss_mlp": 0.20441058, "epoch": 0.6713362392905456, "flos": 22455517276800.0, "grad_norm": 6.872094082588257, "language_loss": 0.80635619, "learning_rate": 1.0299398625893738e-06, "loss": 0.82171893, "num_input_tokens_seen": 241063615, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.26171875, "step": 11166, "time_per_iteration": 4.1220855712890625 }, { "auxiliary_loss_clip": 0.01301295, "auxiliary_loss_mlp": 0.00202661, "balance_loss_clip": 1.07058787, "balance_loss_mlp": 0.1801185, "epoch": 0.6713963625432136, "flos": 25630200005760.0, "grad_norm": 32.212513337903594, "language_loss": 0.86242837, "learning_rate": 1.0295992977271546e-06, "loss": 0.87746793, "num_input_tokens_seen": 241082520, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.22546387, "step": 11167, "time_per_iteration": 2.752408981323242 }, { "auxiliary_loss_clip": 0.01301529, "auxiliary_loss_mlp": 0.00242807, "balance_loss_clip": 1.06565118, "balance_loss_mlp": 0.21810687, "epoch": 0.6714564857958816, "flos": 35006475640320.0, "grad_norm": 80.8812172122069, "language_loss": 0.77141017, "learning_rate": 1.029258769662629e-06, "loss": 0.78685355, "num_input_tokens_seen": 241103505, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.24694824, "step": 11168, "time_per_iteration": 2.834103584289551 }, { "auxiliary_loss_clip": 0.01312613, "auxiliary_loss_mlp": 0.00215109, "balance_loss_clip": 1.07785714, "balance_loss_mlp": 0.18986005, "epoch": 0.6715166090485495, "flos": 26279931168000.0, "grad_norm": 29.58170497517036, "language_loss": 0.82572567, "learning_rate": 1.0289182784087068e-06, "loss": 0.84100294, "num_input_tokens_seen": 241122885, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.25231934, "step": 11169, "time_per_iteration": 2.7379508018493652 }, { "auxiliary_loss_clip": 0.01304174, "auxiliary_loss_mlp": 0.00256016, "balance_loss_clip": 1.07173753, "balance_loss_mlp": 0.23062465, "epoch": 0.6715767323012175, "flos": 15924156583680.0, "grad_norm": 57.717844886732344, "language_loss": 0.83811641, "learning_rate": 1.0285778239783005e-06, "loss": 0.85371828, "num_input_tokens_seen": 241140865, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.25402832, "step": 11170, "time_per_iteration": 4.177717924118042 }, { "auxiliary_loss_clip": 0.01308827, "auxiliary_loss_mlp": 0.00252284, "balance_loss_clip": 1.07298625, "balance_loss_mlp": 0.22664186, "epoch": 0.6716368555538854, "flos": 17491441691520.0, "grad_norm": 733.7462503720692, "language_loss": 0.84388578, "learning_rate": 1.0282374063843212e-06, "loss": 0.85949689, "num_input_tokens_seen": 241158225, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.25646973, "step": 11171, "time_per_iteration": 2.732903003692627 }, { "auxiliary_loss_clip": 0.01306764, "auxiliary_loss_mlp": 0.00238739, "balance_loss_clip": 1.06903815, "balance_loss_mlp": 0.21153502, "epoch": 0.6716969788065534, "flos": 16761521416320.0, "grad_norm": 6.772319733547146, "language_loss": 0.9219479, "learning_rate": 1.0278970256396762e-06, "loss": 0.9374029, "num_input_tokens_seen": 241175215, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.27185059, "step": 11172, "time_per_iteration": 2.639415979385376 }, { "auxiliary_loss_clip": 0.01290413, "auxiliary_loss_mlp": 0.00226622, "balance_loss_clip": 1.05911851, "balance_loss_mlp": 0.20107546, "epoch": 0.6717571020592215, "flos": 22709800632960.0, "grad_norm": 31.01093720871742, "language_loss": 0.71029425, "learning_rate": 1.0275566817572733e-06, "loss": 0.72546458, "num_input_tokens_seen": 241195250, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.25537109, "step": 11173, "time_per_iteration": 2.698124885559082 }, { "auxiliary_loss_clip": 0.01355391, "auxiliary_loss_mlp": 0.00241722, "balance_loss_clip": 1.09554827, "balance_loss_mlp": 0.21227738, "epoch": 0.6718172253118894, "flos": 18734094656640.0, "grad_norm": 2441.0965571876413, "language_loss": 0.8459897, "learning_rate": 1.02721637475002e-06, "loss": 0.86196077, "num_input_tokens_seen": 241210720, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.29431152, "step": 11174, "time_per_iteration": 2.641655206680298 }, { "auxiliary_loss_clip": 0.01284579, "auxiliary_loss_mlp": 0.00240028, "balance_loss_clip": 1.0619334, "balance_loss_mlp": 0.21728238, "epoch": 0.6718773485645574, "flos": 15632526061440.0, "grad_norm": 107.69167646268538, "language_loss": 0.78150815, "learning_rate": 1.0268761046308178e-06, "loss": 0.79675424, "num_input_tokens_seen": 241227395, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.22741699, "step": 11175, "time_per_iteration": 2.638228178024292 }, { "auxiliary_loss_clip": 0.01308585, "auxiliary_loss_mlp": 0.00243061, "balance_loss_clip": 1.07635081, "balance_loss_mlp": 0.21908842, "epoch": 0.6719374718172253, "flos": 19354774694400.0, "grad_norm": 134.6125037434017, "language_loss": 0.81516433, "learning_rate": 1.0265358714125714e-06, "loss": 0.83068079, "num_input_tokens_seen": 241246355, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.23999023, "step": 11176, "time_per_iteration": 2.642571210861206 }, { "auxiliary_loss_clip": 0.01329842, "auxiliary_loss_mlp": 0.00227503, "balance_loss_clip": 1.08468199, "balance_loss_mlp": 0.20031121, "epoch": 0.6719975950698933, "flos": 21981316901760.0, "grad_norm": 895.2823886853358, "language_loss": 0.81995833, "learning_rate": 1.026195675108182e-06, "loss": 0.83553183, "num_input_tokens_seen": 241264180, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.27185059, "step": 11177, "time_per_iteration": 2.6667635440826416 }, { "auxiliary_loss_clip": 0.0133293, "auxiliary_loss_mlp": 0.00259906, "balance_loss_clip": 1.08594823, "balance_loss_mlp": 0.23310791, "epoch": 0.6720577183225612, "flos": 25228072270080.0, "grad_norm": 20.080070173550794, "language_loss": 0.84501922, "learning_rate": 1.025855515730551e-06, "loss": 0.86094755, "num_input_tokens_seen": 241282245, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.2677002, "step": 11178, "time_per_iteration": 2.678718328475952 }, { "auxiliary_loss_clip": 0.01332226, "auxiliary_loss_mlp": 0.00258423, "balance_loss_clip": 1.09080184, "balance_loss_mlp": 0.23138602, "epoch": 0.6721178415752292, "flos": 16945886949120.0, "grad_norm": 149.11966519640944, "language_loss": 0.804452, "learning_rate": 1.0255153932925766e-06, "loss": 0.82035851, "num_input_tokens_seen": 241300745, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.27038574, "step": 11179, "time_per_iteration": 2.6362802982330322 }, { "auxiliary_loss_clip": 0.01304602, "auxiliary_loss_mlp": 0.00233707, "balance_loss_clip": 1.0784744, "balance_loss_mlp": 0.209126, "epoch": 0.6721779648278972, "flos": 21541375123200.0, "grad_norm": 7.3295992695664145, "language_loss": 0.81544036, "learning_rate": 1.0251753078071557e-06, "loss": 0.83082342, "num_input_tokens_seen": 241319320, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.24560547, "step": 11180, "time_per_iteration": 2.633653163909912 }, { "auxiliary_loss_clip": 0.01295047, "auxiliary_loss_mlp": 0.00233745, "balance_loss_clip": 1.06583261, "balance_loss_mlp": 0.20921159, "epoch": 0.6722380880805652, "flos": 22605444645120.0, "grad_norm": 12.187054888855709, "language_loss": 0.82421541, "learning_rate": 1.0248352592871848e-06, "loss": 0.83950329, "num_input_tokens_seen": 241342225, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.24536133, "step": 11181, "time_per_iteration": 2.7172436714172363 }, { "auxiliary_loss_clip": 0.01297595, "auxiliary_loss_mlp": 0.00239285, "balance_loss_clip": 1.06309187, "balance_loss_mlp": 0.2151688, "epoch": 0.6722982113332331, "flos": 15925269905280.0, "grad_norm": 46.70879263312581, "language_loss": 0.84384191, "learning_rate": 1.0244952477455585e-06, "loss": 0.85921079, "num_input_tokens_seen": 241358240, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.24133301, "step": 11182, "time_per_iteration": 2.6725025177001953 }, { "auxiliary_loss_clip": 0.01298019, "auxiliary_loss_mlp": 0.00242355, "balance_loss_clip": 1.06790876, "balance_loss_mlp": 0.2186321, "epoch": 0.6723583345859011, "flos": 20596170683520.0, "grad_norm": 10.374757353583368, "language_loss": 0.76454103, "learning_rate": 1.0241552731951699e-06, "loss": 0.77994478, "num_input_tokens_seen": 241378420, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.23718262, "step": 11183, "time_per_iteration": 2.75481915473938 }, { "auxiliary_loss_clip": 0.01299911, "auxiliary_loss_mlp": 0.00229237, "balance_loss_clip": 1.06682992, "balance_loss_mlp": 0.20423898, "epoch": 0.672418457838569, "flos": 21725848396800.0, "grad_norm": 37.87933611582405, "language_loss": 0.86010969, "learning_rate": 1.0238153356489112e-06, "loss": 0.87540114, "num_input_tokens_seen": 241397185, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.25024414, "step": 11184, "time_per_iteration": 2.6840507984161377 }, { "auxiliary_loss_clip": 0.01329063, "auxiliary_loss_mlp": 0.0022966, "balance_loss_clip": 1.09002566, "balance_loss_mlp": 0.20032257, "epoch": 0.672478581091237, "flos": 21470379891840.0, "grad_norm": 27.24076439100641, "language_loss": 0.77170384, "learning_rate": 1.0234754351196743e-06, "loss": 0.78729099, "num_input_tokens_seen": 241415785, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.2935791, "step": 11185, "time_per_iteration": 2.6839869022369385 }, { "auxiliary_loss_clip": 0.01312333, "auxiliary_loss_mlp": 0.00232469, "balance_loss_clip": 1.0757035, "balance_loss_mlp": 0.20768562, "epoch": 0.6725387043439051, "flos": 30846763267200.0, "grad_norm": 2.336651280486047, "language_loss": 0.88213193, "learning_rate": 1.023135571620345e-06, "loss": 0.89758003, "num_input_tokens_seen": 241437390, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.24780273, "step": 11186, "time_per_iteration": 2.776339292526245 }, { "auxiliary_loss_clip": 0.01290655, "auxiliary_loss_mlp": 0.00222937, "balance_loss_clip": 1.06559384, "balance_loss_mlp": 0.1997866, "epoch": 0.672598827596573, "flos": 24055947659520.0, "grad_norm": 29.03659291746407, "language_loss": 0.86465454, "learning_rate": 1.022795745163813e-06, "loss": 0.87979043, "num_input_tokens_seen": 241458085, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.23144531, "step": 11187, "time_per_iteration": 2.6979331970214844 }, { "auxiliary_loss_clip": 0.0131591, "auxiliary_loss_mlp": 0.00247805, "balance_loss_clip": 1.07596147, "balance_loss_mlp": 0.22100694, "epoch": 0.672658950849241, "flos": 21871861182720.0, "grad_norm": 11.116828611075734, "language_loss": 0.79061115, "learning_rate": 1.022455955762965e-06, "loss": 0.80624837, "num_input_tokens_seen": 241476880, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.26806641, "step": 11188, "time_per_iteration": 2.7001514434814453 }, { "auxiliary_loss_clip": 0.01298323, "auxiliary_loss_mlp": 0.00232267, "balance_loss_clip": 1.06988394, "balance_loss_mlp": 0.20831782, "epoch": 0.6727190741019089, "flos": 23222102359680.0, "grad_norm": 73.11593431435512, "language_loss": 0.83189559, "learning_rate": 1.0221162034306842e-06, "loss": 0.84720147, "num_input_tokens_seen": 241496535, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.23950195, "step": 11189, "time_per_iteration": 2.730586528778076 }, { "auxiliary_loss_clip": 0.01325971, "auxiliary_loss_mlp": 0.00220794, "balance_loss_clip": 1.08023143, "balance_loss_mlp": 0.1933282, "epoch": 0.6727791973545769, "flos": 15778610674560.0, "grad_norm": 3.158940472573889, "language_loss": 0.85707736, "learning_rate": 1.0217764881798562e-06, "loss": 0.872545, "num_input_tokens_seen": 241513465, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.27453613, "step": 11190, "time_per_iteration": 2.6533353328704834 }, { "auxiliary_loss_clip": 0.0131784, "auxiliary_loss_mlp": 0.00243074, "balance_loss_clip": 1.07440543, "balance_loss_mlp": 0.21446377, "epoch": 0.6728393206072448, "flos": 21249852341760.0, "grad_norm": 6.975194801086423, "language_loss": 0.84600842, "learning_rate": 1.0214368100233612e-06, "loss": 0.86161757, "num_input_tokens_seen": 241534125, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.28637695, "step": 11191, "time_per_iteration": 2.6581954956054688 }, { "auxiliary_loss_clip": 0.01299849, "auxiliary_loss_mlp": 0.00208175, "balance_loss_clip": 1.06998098, "balance_loss_mlp": 0.18419032, "epoch": 0.6728994438599128, "flos": 32123279779200.0, "grad_norm": 2.6161481655427306, "language_loss": 0.91869473, "learning_rate": 1.0210971689740802e-06, "loss": 0.93377507, "num_input_tokens_seen": 241556340, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.23962402, "step": 11192, "time_per_iteration": 2.769777297973633 }, { "auxiliary_loss_clip": 0.01320469, "auxiliary_loss_mlp": 0.00238929, "balance_loss_clip": 1.08372831, "balance_loss_mlp": 0.21246472, "epoch": 0.6729595671125808, "flos": 23112359331840.0, "grad_norm": 73.67056136592683, "language_loss": 0.83245581, "learning_rate": 1.0207575650448923e-06, "loss": 0.84804976, "num_input_tokens_seen": 241575185, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.2644043, "step": 11193, "time_per_iteration": 2.6710476875305176 }, { "auxiliary_loss_clip": 0.01285587, "auxiliary_loss_mlp": 0.00213632, "balance_loss_clip": 1.05345035, "balance_loss_mlp": 0.18734622, "epoch": 0.6730196903652488, "flos": 14611406227200.0, "grad_norm": 323.7458337071312, "language_loss": 0.8733561, "learning_rate": 1.0204179982486758e-06, "loss": 0.88834834, "num_input_tokens_seen": 241592970, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.26306152, "step": 11194, "time_per_iteration": 2.704383373260498 }, { "auxiliary_loss_clip": 0.01306125, "auxiliary_loss_mlp": 0.00234157, "balance_loss_clip": 1.06479001, "balance_loss_mlp": 0.20794335, "epoch": 0.6730798136179167, "flos": 21105922544640.0, "grad_norm": 132.2576801637099, "language_loss": 0.99689412, "learning_rate": 1.0200784685983075e-06, "loss": 1.01229692, "num_input_tokens_seen": 241610245, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.26220703, "step": 11195, "time_per_iteration": 2.6532254219055176 }, { "auxiliary_loss_clip": 0.01291221, "auxiliary_loss_mlp": 0.00204549, "balance_loss_clip": 1.06333947, "balance_loss_mlp": 0.17903787, "epoch": 0.6731399368705847, "flos": 28986267438720.0, "grad_norm": 5.0177471510128315, "language_loss": 0.79570413, "learning_rate": 1.019738976106662e-06, "loss": 0.81066191, "num_input_tokens_seen": 241630350, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.25488281, "step": 11196, "time_per_iteration": 2.724848747253418 }, { "auxiliary_loss_clip": 0.01285756, "auxiliary_loss_mlp": 0.0014176, "balance_loss_clip": 1.12766671, "balance_loss_mlp": 0.13222323, "epoch": 0.6732000601232526, "flos": 64743708723840.0, "grad_norm": 0.7640002041131015, "language_loss": 0.55892313, "learning_rate": 1.0193995207866123e-06, "loss": 0.57319832, "num_input_tokens_seen": 241692380, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.09521484, "step": 11197, "time_per_iteration": 3.1000730991363525 }, { "auxiliary_loss_clip": 0.01296749, "auxiliary_loss_mlp": 0.00219859, "balance_loss_clip": 1.06849587, "balance_loss_mlp": 0.19538517, "epoch": 0.6732601833759206, "flos": 17201642762880.0, "grad_norm": 6.236577681339241, "language_loss": 0.84329802, "learning_rate": 1.0190601026510312e-06, "loss": 0.85846412, "num_input_tokens_seen": 241710430, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.24499512, "step": 11198, "time_per_iteration": 2.6077024936676025 }, { "auxiliary_loss_clip": 0.01293736, "auxiliary_loss_mlp": 0.00237604, "balance_loss_clip": 1.06176114, "balance_loss_mlp": 0.20989954, "epoch": 0.6733203066285887, "flos": 18658861620480.0, "grad_norm": 19.091696384327854, "language_loss": 0.8945744, "learning_rate": 1.0187207217127892e-06, "loss": 0.90988779, "num_input_tokens_seen": 241724775, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.27685547, "step": 11199, "time_per_iteration": 2.636986494064331 }, { "auxiliary_loss_clip": 0.01300395, "auxiliary_loss_mlp": 0.00245473, "balance_loss_clip": 1.06099725, "balance_loss_mlp": 0.21717232, "epoch": 0.6733804298812566, "flos": 35809330481280.0, "grad_norm": 52.92740060880868, "language_loss": 0.77838266, "learning_rate": 1.0183813779847552e-06, "loss": 0.79384136, "num_input_tokens_seen": 241744440, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.28295898, "step": 11200, "time_per_iteration": 2.7594549655914307 }, { "auxiliary_loss_clip": 0.01301699, "auxiliary_loss_mlp": 0.00231539, "balance_loss_clip": 1.07031977, "balance_loss_mlp": 0.20648101, "epoch": 0.6734405531339246, "flos": 61638833099520.0, "grad_norm": 93.03523221876395, "language_loss": 0.70245671, "learning_rate": 1.0180420714797987e-06, "loss": 0.71778911, "num_input_tokens_seen": 241771705, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.25036621, "step": 11201, "time_per_iteration": 3.0999417304992676 }, { "auxiliary_loss_clip": 0.01300747, "auxiliary_loss_mlp": 0.00232774, "balance_loss_clip": 1.06679022, "balance_loss_mlp": 0.20647672, "epoch": 0.6735006763865925, "flos": 20522338277760.0, "grad_norm": 5.335154939501229, "language_loss": 0.72587299, "learning_rate": 1.0177028022107856e-06, "loss": 0.7412082, "num_input_tokens_seen": 241790830, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.26269531, "step": 11202, "time_per_iteration": 2.7085816860198975 }, { "auxiliary_loss_clip": 0.01290599, "auxiliary_loss_mlp": 0.00225402, "balance_loss_clip": 1.05947876, "balance_loss_mlp": 0.1995215, "epoch": 0.6735607996392605, "flos": 13918869031680.0, "grad_norm": 53.21245041917134, "language_loss": 0.8348155, "learning_rate": 1.0173635701905796e-06, "loss": 0.84997547, "num_input_tokens_seen": 241808165, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.25878906, "step": 11203, "time_per_iteration": 4.047514200210571 }, { "auxiliary_loss_clip": 0.01324868, "auxiliary_loss_mlp": 0.00244827, "balance_loss_clip": 1.07979214, "balance_loss_mlp": 0.21669312, "epoch": 0.6736209228919284, "flos": 18807244704000.0, "grad_norm": 282.89564674482324, "language_loss": 0.76716226, "learning_rate": 1.0170243754320456e-06, "loss": 0.78285921, "num_input_tokens_seen": 241826925, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.28112793, "step": 11204, "time_per_iteration": 4.178461074829102 }, { "auxiliary_loss_clip": 0.01315272, "auxiliary_loss_mlp": 0.00225324, "balance_loss_clip": 1.07457805, "balance_loss_mlp": 0.19763178, "epoch": 0.6736810461445965, "flos": 20373129181440.0, "grad_norm": 190.32500804307804, "language_loss": 0.81085718, "learning_rate": 1.0166852179480465e-06, "loss": 0.82626307, "num_input_tokens_seen": 241845525, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.27697754, "step": 11205, "time_per_iteration": 2.723565101623535 }, { "auxiliary_loss_clip": 0.01299501, "auxiliary_loss_mlp": 0.00216269, "balance_loss_clip": 1.06566763, "balance_loss_mlp": 0.18999511, "epoch": 0.6737411693972644, "flos": 30007530927360.0, "grad_norm": 15.619446346773517, "language_loss": 0.80414432, "learning_rate": 1.0163460977514416e-06, "loss": 0.81930196, "num_input_tokens_seen": 241866815, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.26269531, "step": 11206, "time_per_iteration": 2.7050068378448486 }, { "auxiliary_loss_clip": 0.01333846, "auxiliary_loss_mlp": 0.00239511, "balance_loss_clip": 1.08939242, "balance_loss_mlp": 0.2116634, "epoch": 0.6738012926499324, "flos": 25447342844160.0, "grad_norm": 8.978462294900178, "language_loss": 0.78417063, "learning_rate": 1.016007014855092e-06, "loss": 0.79990417, "num_input_tokens_seen": 241887050, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.27880859, "step": 11207, "time_per_iteration": 2.7164790630340576 }, { "auxiliary_loss_clip": 0.01277901, "auxiliary_loss_mlp": 0.00207694, "balance_loss_clip": 1.05443978, "balance_loss_mlp": 0.18442425, "epoch": 0.6738614159026003, "flos": 20776873029120.0, "grad_norm": 26.87810196210832, "language_loss": 0.82907987, "learning_rate": 1.0156679692718553e-06, "loss": 0.84393579, "num_input_tokens_seen": 241904280, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.23266602, "step": 11208, "time_per_iteration": 4.086644172668457 }, { "auxiliary_loss_clip": 0.01296945, "auxiliary_loss_mlp": 0.0023074, "balance_loss_clip": 1.06009579, "balance_loss_mlp": 0.20267785, "epoch": 0.6739215391552683, "flos": 19566898462080.0, "grad_norm": 6.545634156104539, "language_loss": 0.84514874, "learning_rate": 1.0153289610145867e-06, "loss": 0.86042559, "num_input_tokens_seen": 241919190, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.28088379, "step": 11209, "time_per_iteration": 2.6883926391601562 }, { "auxiliary_loss_clip": 0.01276144, "auxiliary_loss_mlp": 0.00197979, "balance_loss_clip": 1.05097556, "balance_loss_mlp": 0.17306437, "epoch": 0.6739816624079362, "flos": 24388193485440.0, "grad_norm": 72.91196250060742, "language_loss": 0.77097631, "learning_rate": 1.0149899900961428e-06, "loss": 0.78571755, "num_input_tokens_seen": 241940525, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.24926758, "step": 11210, "time_per_iteration": 2.698976993560791 }, { "auxiliary_loss_clip": 0.01275776, "auxiliary_loss_mlp": 0.0020364, "balance_loss_clip": 1.0537796, "balance_loss_mlp": 0.18085892, "epoch": 0.6740417856606042, "flos": 22528164533760.0, "grad_norm": 163.47028337840828, "language_loss": 0.87599576, "learning_rate": 1.014651056529377e-06, "loss": 0.89078987, "num_input_tokens_seen": 241959290, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.22790527, "step": 11211, "time_per_iteration": 2.6795995235443115 }, { "auxiliary_loss_clip": 0.01291122, "auxiliary_loss_mlp": 0.00228971, "balance_loss_clip": 1.0600332, "balance_loss_mlp": 0.20408005, "epoch": 0.6741019089132723, "flos": 25775458606080.0, "grad_norm": 69.66147143645453, "language_loss": 0.82254744, "learning_rate": 1.014312160327143e-06, "loss": 0.83774835, "num_input_tokens_seen": 241980715, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.2487793, "step": 11212, "time_per_iteration": 4.104069948196411 }, { "auxiliary_loss_clip": 0.01300148, "auxiliary_loss_mlp": 0.00225793, "balance_loss_clip": 1.06230092, "balance_loss_mlp": 0.19879256, "epoch": 0.6741620321659402, "flos": 21105671149440.0, "grad_norm": 10.703458552470371, "language_loss": 0.87071037, "learning_rate": 1.0139733015022905e-06, "loss": 0.88596982, "num_input_tokens_seen": 241999985, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.26977539, "step": 11213, "time_per_iteration": 2.6865451335906982 }, { "auxiliary_loss_clip": 0.01281681, "auxiliary_loss_mlp": 0.00217096, "balance_loss_clip": 1.05103159, "balance_loss_mlp": 0.19201475, "epoch": 0.6742221554186082, "flos": 20740423703040.0, "grad_norm": 509.5370735052831, "language_loss": 0.76606941, "learning_rate": 1.0136344800676685e-06, "loss": 0.78105724, "num_input_tokens_seen": 242018990, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.25097656, "step": 11214, "time_per_iteration": 2.76403546333313 }, { "auxiliary_loss_clip": 0.01289843, "auxiliary_loss_mlp": 0.00228712, "balance_loss_clip": 1.05741894, "balance_loss_mlp": 0.20335665, "epoch": 0.6742822786712761, "flos": 37774146384000.0, "grad_norm": 132.8866380919774, "language_loss": 0.78431439, "learning_rate": 1.0132956960361263e-06, "loss": 0.79949993, "num_input_tokens_seen": 242039340, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.25354004, "step": 11215, "time_per_iteration": 2.825324773788452 }, { "auxiliary_loss_clip": 0.01298552, "auxiliary_loss_mlp": 0.00195974, "balance_loss_clip": 1.05976915, "balance_loss_mlp": 0.17063032, "epoch": 0.6743424019239441, "flos": 37263891732480.0, "grad_norm": 3.943185180035349, "language_loss": 0.75674945, "learning_rate": 1.0129569494205096e-06, "loss": 0.77169466, "num_input_tokens_seen": 242062215, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.25341797, "step": 11216, "time_per_iteration": 2.8052830696105957 }, { "auxiliary_loss_clip": 0.01259489, "auxiliary_loss_mlp": 0.00086099, "balance_loss_clip": 1.1066761, "balance_loss_mlp": 0.07908985, "epoch": 0.674402525176612, "flos": 65997746300160.0, "grad_norm": 0.6755282063509241, "language_loss": 0.5594821, "learning_rate": 1.0126182402336646e-06, "loss": 0.57293797, "num_input_tokens_seen": 242131130, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.0703125, "step": 11217, "time_per_iteration": 3.246645450592041 }, { "auxiliary_loss_clip": 0.01297422, "auxiliary_loss_mlp": 0.00210943, "balance_loss_clip": 1.06312394, "balance_loss_mlp": 0.18516967, "epoch": 0.67446264842928, "flos": 26461208131200.0, "grad_norm": 115.2914297863557, "language_loss": 0.82990873, "learning_rate": 1.0122795684884363e-06, "loss": 0.8449924, "num_input_tokens_seen": 242149720, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.25769043, "step": 11218, "time_per_iteration": 2.655461072921753 }, { "auxiliary_loss_clip": 0.01323506, "auxiliary_loss_mlp": 0.00224308, "balance_loss_clip": 1.07618332, "balance_loss_mlp": 0.19734278, "epoch": 0.674522771681948, "flos": 23732392924800.0, "grad_norm": 206.36283447635344, "language_loss": 0.75383103, "learning_rate": 1.0119409341976639e-06, "loss": 0.76930916, "num_input_tokens_seen": 242168875, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.26977539, "step": 11219, "time_per_iteration": 2.699918508529663 }, { "auxiliary_loss_clip": 0.01286701, "auxiliary_loss_mlp": 0.00218437, "balance_loss_clip": 1.05775905, "balance_loss_mlp": 0.19308147, "epoch": 0.674582894934616, "flos": 24754338771840.0, "grad_norm": 7.738309191391277, "language_loss": 0.83573312, "learning_rate": 1.0116023373741904e-06, "loss": 0.85078448, "num_input_tokens_seen": 242188465, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.25366211, "step": 11220, "time_per_iteration": 2.6912553310394287 }, { "auxiliary_loss_clip": 0.01267055, "auxiliary_loss_mlp": 0.00215277, "balance_loss_clip": 1.0424422, "balance_loss_mlp": 0.1898137, "epoch": 0.6746430181872839, "flos": 24826626892800.0, "grad_norm": 131.47479151310202, "language_loss": 0.79646844, "learning_rate": 1.0112637780308554e-06, "loss": 0.81129175, "num_input_tokens_seen": 242208675, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.25476074, "step": 11221, "time_per_iteration": 2.7170398235321045 }, { "auxiliary_loss_clip": 0.0127352, "auxiliary_loss_mlp": 0.00202385, "balance_loss_clip": 1.04535341, "balance_loss_mlp": 0.17907922, "epoch": 0.6747031414399519, "flos": 16873491087360.0, "grad_norm": 148.23225497609852, "language_loss": 0.68243349, "learning_rate": 1.010925256180498e-06, "loss": 0.69719255, "num_input_tokens_seen": 242227440, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.23291016, "step": 11222, "time_per_iteration": 2.63856840133667 }, { "auxiliary_loss_clip": 0.0128996, "auxiliary_loss_mlp": 0.00240388, "balance_loss_clip": 1.05994058, "balance_loss_mlp": 0.21455535, "epoch": 0.6747632646926198, "flos": 22784925928320.0, "grad_norm": 2.091110316342292, "language_loss": 0.84203786, "learning_rate": 1.0105867718359528e-06, "loss": 0.85734141, "num_input_tokens_seen": 242245240, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.25805664, "step": 11223, "time_per_iteration": 2.645334005355835 }, { "auxiliary_loss_clip": 0.01286318, "auxiliary_loss_mlp": 0.00240035, "balance_loss_clip": 1.05398595, "balance_loss_mlp": 0.21398789, "epoch": 0.6748233879452878, "flos": 20046090827520.0, "grad_norm": 12.857476862051659, "language_loss": 0.82169771, "learning_rate": 1.0102483250100574e-06, "loss": 0.83696127, "num_input_tokens_seen": 242263435, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.26037598, "step": 11224, "time_per_iteration": 2.644181489944458 }, { "auxiliary_loss_clip": 0.01286945, "auxiliary_loss_mlp": 0.00202064, "balance_loss_clip": 1.05796456, "balance_loss_mlp": 0.1789135, "epoch": 0.6748835111979558, "flos": 23002831785600.0, "grad_norm": 5.7970593878259935, "language_loss": 0.69243896, "learning_rate": 1.0099099157156445e-06, "loss": 0.70732903, "num_input_tokens_seen": 242282765, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.23168945, "step": 11225, "time_per_iteration": 2.67832612991333 }, { "auxiliary_loss_clip": 0.01267537, "auxiliary_loss_mlp": 0.00210036, "balance_loss_clip": 1.04456508, "balance_loss_mlp": 0.18746983, "epoch": 0.6749436344506238, "flos": 12197311009920.0, "grad_norm": 5.118617176185002, "language_loss": 0.70526987, "learning_rate": 1.0095715439655462e-06, "loss": 0.72004557, "num_input_tokens_seen": 242298980, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.22570801, "step": 11226, "time_per_iteration": 2.6403989791870117 }, { "auxiliary_loss_clip": 0.01280401, "auxiliary_loss_mlp": 0.00226755, "balance_loss_clip": 1.05323482, "balance_loss_mlp": 0.20056452, "epoch": 0.6750037577032918, "flos": 11873720361600.0, "grad_norm": 17.00704244151231, "language_loss": 0.81830925, "learning_rate": 1.0092332097725945e-06, "loss": 0.83338082, "num_input_tokens_seen": 242315420, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.26220703, "step": 11227, "time_per_iteration": 2.6202220916748047 }, { "auxiliary_loss_clip": 0.01266553, "auxiliary_loss_mlp": 0.00219998, "balance_loss_clip": 1.04112554, "balance_loss_mlp": 0.19670437, "epoch": 0.6750638809559597, "flos": 17019611614080.0, "grad_norm": 151.35065950682036, "language_loss": 0.79755819, "learning_rate": 1.0088949131496183e-06, "loss": 0.81242365, "num_input_tokens_seen": 242332805, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.2331543, "step": 11228, "time_per_iteration": 2.6803903579711914 }, { "auxiliary_loss_clip": 0.01265982, "auxiliary_loss_mlp": 0.00090376, "balance_loss_clip": 1.10507941, "balance_loss_mlp": 0.08217438, "epoch": 0.6751240042086277, "flos": 70951011891840.0, "grad_norm": 0.7561516964718535, "language_loss": 0.52518928, "learning_rate": 1.0085566541094482e-06, "loss": 0.53875291, "num_input_tokens_seen": 242396160, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.08203125, "step": 11229, "time_per_iteration": 3.240126848220825 }, { "auxiliary_loss_clip": 0.01290531, "auxiliary_loss_mlp": 0.00202459, "balance_loss_clip": 1.05854654, "balance_loss_mlp": 0.17575563, "epoch": 0.6751841274612956, "flos": 22675146986880.0, "grad_norm": 4.569299161972507, "language_loss": 0.86787391, "learning_rate": 1.0082184326649072e-06, "loss": 0.8828038, "num_input_tokens_seen": 242414660, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.26721191, "step": 11230, "time_per_iteration": 2.696629285812378 }, { "auxiliary_loss_clip": 0.01277966, "auxiliary_loss_mlp": 0.00203423, "balance_loss_clip": 1.05212879, "balance_loss_mlp": 0.1796051, "epoch": 0.6752442507139637, "flos": 21288636051840.0, "grad_norm": 513.3641229552558, "language_loss": 0.74183339, "learning_rate": 1.0078802488288228e-06, "loss": 0.75664729, "num_input_tokens_seen": 242434225, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.23828125, "step": 11231, "time_per_iteration": 2.6920838356018066 }, { "auxiliary_loss_clip": 0.01348831, "auxiliary_loss_mlp": 0.00248204, "balance_loss_clip": 1.09460807, "balance_loss_mlp": 0.21911699, "epoch": 0.6753043739666316, "flos": 28256921781120.0, "grad_norm": 137.1191303029634, "language_loss": 0.74597096, "learning_rate": 1.0075421026140198e-06, "loss": 0.76194131, "num_input_tokens_seen": 242454355, "router_z_loss_clip": 2.5390625, "router_z_loss_mlp": 0.29077148, "step": 11232, "time_per_iteration": 2.831052541732788 }, { "auxiliary_loss_clip": 0.01305869, "auxiliary_loss_mlp": 0.0023058, "balance_loss_clip": 1.07137322, "balance_loss_mlp": 0.20481853, "epoch": 0.6753644972192996, "flos": 21360349555200.0, "grad_norm": 15.280151081469556, "language_loss": 0.7979033, "learning_rate": 1.0072039940333188e-06, "loss": 0.81326783, "num_input_tokens_seen": 242474935, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.25720215, "step": 11233, "time_per_iteration": 2.6672158241271973 }, { "auxiliary_loss_clip": 0.01287524, "auxiliary_loss_mlp": 0.00236873, "balance_loss_clip": 1.05835927, "balance_loss_mlp": 0.2103014, "epoch": 0.6754246204719675, "flos": 26541971861760.0, "grad_norm": 11.420094268039191, "language_loss": 0.84996253, "learning_rate": 1.0068659230995418e-06, "loss": 0.86520648, "num_input_tokens_seen": 242495530, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.26599121, "step": 11234, "time_per_iteration": 2.762568235397339 }, { "auxiliary_loss_clip": 0.01314089, "auxiliary_loss_mlp": 0.00232099, "balance_loss_clip": 1.07786465, "balance_loss_mlp": 0.20423998, "epoch": 0.6754847437246355, "flos": 25556690822400.0, "grad_norm": 3.711701044595577, "language_loss": 0.82746947, "learning_rate": 1.0065278898255101e-06, "loss": 0.84293139, "num_input_tokens_seen": 242514550, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.27868652, "step": 11235, "time_per_iteration": 2.724761486053467 }, { "auxiliary_loss_clip": 0.01268559, "auxiliary_loss_mlp": 0.00098104, "balance_loss_clip": 1.11155069, "balance_loss_mlp": 0.08966371, "epoch": 0.6755448669773034, "flos": 59513318726400.0, "grad_norm": 0.7701769422053569, "language_loss": 0.50637114, "learning_rate": 1.0061898942240387e-06, "loss": 0.52003777, "num_input_tokens_seen": 242569200, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.08447266, "step": 11236, "time_per_iteration": 3.120445966720581 }, { "auxiliary_loss_clip": 0.01301247, "auxiliary_loss_mlp": 0.00223249, "balance_loss_clip": 1.06360793, "balance_loss_mlp": 0.19707045, "epoch": 0.6756049902299714, "flos": 23294534135040.0, "grad_norm": 2.6211664404715767, "language_loss": 0.84217304, "learning_rate": 1.0058519363079464e-06, "loss": 0.857418, "num_input_tokens_seen": 242586950, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.26196289, "step": 11237, "time_per_iteration": 2.6781067848205566 }, { "auxiliary_loss_clip": 0.01327772, "auxiliary_loss_mlp": 0.00222063, "balance_loss_clip": 1.08601165, "balance_loss_mlp": 0.19477563, "epoch": 0.6756651134826394, "flos": 31575426566400.0, "grad_norm": 24.203126446412714, "language_loss": 0.87107146, "learning_rate": 1.0055140160900482e-06, "loss": 0.88656974, "num_input_tokens_seen": 242607380, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.27282715, "step": 11238, "time_per_iteration": 2.7278542518615723 }, { "auxiliary_loss_clip": 0.01299646, "auxiliary_loss_mlp": 0.00228279, "balance_loss_clip": 1.06509101, "balance_loss_mlp": 0.20033616, "epoch": 0.6757252367353074, "flos": 27272287186560.0, "grad_norm": 82.4615143611275, "language_loss": 0.7579577, "learning_rate": 1.0051761335831587e-06, "loss": 0.77323693, "num_input_tokens_seen": 242628025, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.27966309, "step": 11239, "time_per_iteration": 2.6949501037597656 }, { "auxiliary_loss_clip": 0.01287329, "auxiliary_loss_mlp": 0.00207811, "balance_loss_clip": 1.06155765, "balance_loss_mlp": 0.18237215, "epoch": 0.6757853599879754, "flos": 16830900535680.0, "grad_norm": 50.99771384188272, "language_loss": 0.89265299, "learning_rate": 1.0048382888000898e-06, "loss": 0.90760446, "num_input_tokens_seen": 242643825, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.25402832, "step": 11240, "time_per_iteration": 2.5914981365203857 }, { "auxiliary_loss_clip": 0.0136126, "auxiliary_loss_mlp": 0.00229392, "balance_loss_clip": 1.10285079, "balance_loss_mlp": 0.19899383, "epoch": 0.6758454832406433, "flos": 23220055284480.0, "grad_norm": 5.95678381639936, "language_loss": 0.90519518, "learning_rate": 1.0045004817536525e-06, "loss": 0.92110169, "num_input_tokens_seen": 242661820, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.30395508, "step": 11241, "time_per_iteration": 2.6538054943084717 }, { "auxiliary_loss_clip": 0.01311054, "auxiliary_loss_mlp": 0.00231149, "balance_loss_clip": 1.07782602, "balance_loss_mlp": 0.20574597, "epoch": 0.6759056064933113, "flos": 16289547684480.0, "grad_norm": 29.674028975161615, "language_loss": 0.89524812, "learning_rate": 1.0041627124566572e-06, "loss": 0.91067016, "num_input_tokens_seen": 242679890, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.25415039, "step": 11242, "time_per_iteration": 2.622302532196045 }, { "auxiliary_loss_clip": 0.01295658, "auxiliary_loss_mlp": 0.00226701, "balance_loss_clip": 1.06593955, "balance_loss_mlp": 0.20022488, "epoch": 0.6759657297459792, "flos": 25922297404800.0, "grad_norm": 47.5442822572552, "language_loss": 0.8046267, "learning_rate": 1.0038249809219109e-06, "loss": 0.81985033, "num_input_tokens_seen": 242699495, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.26501465, "step": 11243, "time_per_iteration": 2.7488772869110107 }, { "auxiliary_loss_clip": 0.01292892, "auxiliary_loss_mlp": 0.0020828, "balance_loss_clip": 1.06154251, "balance_loss_mlp": 0.18388994, "epoch": 0.6760258529986473, "flos": 23000820624000.0, "grad_norm": 14.78352165159696, "language_loss": 0.80294251, "learning_rate": 1.003487287162221e-06, "loss": 0.81795424, "num_input_tokens_seen": 242719500, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.24365234, "step": 11244, "time_per_iteration": 2.6461992263793945 }, { "auxiliary_loss_clip": 0.01291896, "auxiliary_loss_mlp": 0.00199837, "balance_loss_clip": 1.06276941, "balance_loss_mlp": 0.17524418, "epoch": 0.6760859762513152, "flos": 20959335141120.0, "grad_norm": 3.097029647035086, "language_loss": 0.94757545, "learning_rate": 1.003149631190393e-06, "loss": 0.96249282, "num_input_tokens_seen": 242738325, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.24609375, "step": 11245, "time_per_iteration": 4.072397708892822 }, { "auxiliary_loss_clip": 0.01308007, "auxiliary_loss_mlp": 0.00213578, "balance_loss_clip": 1.07433724, "balance_loss_mlp": 0.18735172, "epoch": 0.6761460995039832, "flos": 23622937205760.0, "grad_norm": 6.154526126456195, "language_loss": 0.82243657, "learning_rate": 1.0028120130192327e-06, "loss": 0.83765239, "num_input_tokens_seen": 242756620, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.2623291, "step": 11246, "time_per_iteration": 4.251142978668213 }, { "auxiliary_loss_clip": 0.0129426, "auxiliary_loss_mlp": 0.00211815, "balance_loss_clip": 1.06401968, "balance_loss_mlp": 0.18742456, "epoch": 0.6762062227566511, "flos": 20770875457920.0, "grad_norm": 3.325448848588042, "language_loss": 0.943434, "learning_rate": 1.002474432661539e-06, "loss": 0.95849478, "num_input_tokens_seen": 242774505, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.24401855, "step": 11247, "time_per_iteration": 2.734910488128662 }, { "auxiliary_loss_clip": 0.01284792, "auxiliary_loss_mlp": 0.00216099, "balance_loss_clip": 1.12481809, "balance_loss_mlp": 0.20689617, "epoch": 0.6762663460093191, "flos": 52818099166080.0, "grad_norm": 0.8088087879989856, "language_loss": 0.53630304, "learning_rate": 1.002136890130115e-06, "loss": 0.55131191, "num_input_tokens_seen": 242828645, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.09179688, "step": 11248, "time_per_iteration": 3.173039674758911 }, { "auxiliary_loss_clip": 0.01299586, "auxiliary_loss_mlp": 0.00215846, "balance_loss_clip": 1.06996012, "balance_loss_mlp": 0.19078809, "epoch": 0.676326469261987, "flos": 23696302734720.0, "grad_norm": 11.256525265876824, "language_loss": 0.818546, "learning_rate": 1.001799385437761e-06, "loss": 0.8337003, "num_input_tokens_seen": 242850100, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.25048828, "step": 11249, "time_per_iteration": 2.707488536834717 }, { "auxiliary_loss_clip": 0.0132297, "auxiliary_loss_mlp": 0.00218591, "balance_loss_clip": 1.07952595, "balance_loss_mlp": 0.19001648, "epoch": 0.676386592514655, "flos": 14063732582400.0, "grad_norm": 23.921609418666314, "language_loss": 0.8387078, "learning_rate": 1.0014619185972732e-06, "loss": 0.85412347, "num_input_tokens_seen": 242867775, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.28552246, "step": 11250, "time_per_iteration": 4.140531778335571 }, { "auxiliary_loss_clip": 0.01304699, "auxiliary_loss_mlp": 0.00245014, "balance_loss_clip": 1.07163429, "balance_loss_mlp": 0.21872875, "epoch": 0.676446715767323, "flos": 20412236113920.0, "grad_norm": 1211.709643533875, "language_loss": 0.81856942, "learning_rate": 1.0011244896214497e-06, "loss": 0.83406657, "num_input_tokens_seen": 242886865, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.26293945, "step": 11251, "time_per_iteration": 2.6566455364227295 }, { "auxiliary_loss_clip": 0.01332642, "auxiliary_loss_mlp": 0.00213477, "balance_loss_clip": 1.08879435, "balance_loss_mlp": 0.18591629, "epoch": 0.676506839019991, "flos": 21288241002240.0, "grad_norm": 118.297584514589, "language_loss": 0.80952287, "learning_rate": 1.0007870985230873e-06, "loss": 0.82498407, "num_input_tokens_seen": 242906705, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.27563477, "step": 11252, "time_per_iteration": 2.6716134548187256 }, { "auxiliary_loss_clip": 0.01293741, "auxiliary_loss_mlp": 0.00200295, "balance_loss_clip": 1.06821823, "balance_loss_mlp": 0.17675166, "epoch": 0.676566962272659, "flos": 29932477459200.0, "grad_norm": 21.117730126635536, "language_loss": 0.75473469, "learning_rate": 1.0004497453149765e-06, "loss": 0.76967514, "num_input_tokens_seen": 242925215, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.23547363, "step": 11253, "time_per_iteration": 2.7073874473571777 }, { "auxiliary_loss_clip": 0.0132758, "auxiliary_loss_mlp": 0.00236335, "balance_loss_clip": 1.0827179, "balance_loss_mlp": 0.20674753, "epoch": 0.6766270855253269, "flos": 17931203902080.0, "grad_norm": 4.630326378458993, "language_loss": 0.8581233, "learning_rate": 1.0001124300099115e-06, "loss": 0.87376237, "num_input_tokens_seen": 242944750, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.29614258, "step": 11254, "time_per_iteration": 4.0635294914245605 }, { "auxiliary_loss_clip": 0.01321265, "auxiliary_loss_mlp": 0.00210824, "balance_loss_clip": 1.0840261, "balance_loss_mlp": 0.18314409, "epoch": 0.6766872087779949, "flos": 23104853389440.0, "grad_norm": 7.81418434837909, "language_loss": 0.80079079, "learning_rate": 9.997751526206835e-07, "loss": 0.81611168, "num_input_tokens_seen": 242963860, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.27709961, "step": 11255, "time_per_iteration": 2.6828413009643555 }, { "auxiliary_loss_clip": 0.01326996, "auxiliary_loss_mlp": 0.00231535, "balance_loss_clip": 1.08142185, "balance_loss_mlp": 0.20355611, "epoch": 0.6767473320306628, "flos": 26213137827840.0, "grad_norm": 32.36280186990526, "language_loss": 0.85633969, "learning_rate": 9.994379131600828e-07, "loss": 0.871925, "num_input_tokens_seen": 242983050, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.27966309, "step": 11256, "time_per_iteration": 2.701913356781006 }, { "auxiliary_loss_clip": 0.01313479, "auxiliary_loss_mlp": 0.0020358, "balance_loss_clip": 1.07653534, "balance_loss_mlp": 0.17727031, "epoch": 0.6768074552833309, "flos": 18368739469440.0, "grad_norm": 44.33197210832063, "language_loss": 0.77429003, "learning_rate": 9.991007116408965e-07, "loss": 0.78946066, "num_input_tokens_seen": 243001125, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.26330566, "step": 11257, "time_per_iteration": 2.6664254665374756 }, { "auxiliary_loss_clip": 0.01286041, "auxiliary_loss_mlp": 0.00218575, "balance_loss_clip": 1.05911541, "balance_loss_mlp": 0.1944114, "epoch": 0.6768675785359988, "flos": 23039927556480.0, "grad_norm": 116.03369803766219, "language_loss": 0.81370461, "learning_rate": 9.987635480759109e-07, "loss": 0.82875073, "num_input_tokens_seen": 243021865, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.24157715, "step": 11258, "time_per_iteration": 2.6875154972076416 }, { "auxiliary_loss_clip": 0.01289812, "auxiliary_loss_mlp": 0.0021406, "balance_loss_clip": 1.06337559, "balance_loss_mlp": 0.19053976, "epoch": 0.6769277017886668, "flos": 33036524092800.0, "grad_norm": 3.442452905056381, "language_loss": 0.74364018, "learning_rate": 9.984264224779127e-07, "loss": 0.75867891, "num_input_tokens_seen": 243042970, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.23535156, "step": 11259, "time_per_iteration": 2.809744119644165 }, { "auxiliary_loss_clip": 0.01286401, "auxiliary_loss_mlp": 0.00197924, "balance_loss_clip": 1.0555532, "balance_loss_mlp": 0.17205521, "epoch": 0.6769878250413347, "flos": 20848406964480.0, "grad_norm": 9.371611021795106, "language_loss": 0.9336834, "learning_rate": 9.980893348596839e-07, "loss": 0.94852662, "num_input_tokens_seen": 243058470, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.25842285, "step": 11260, "time_per_iteration": 2.645406484603882 }, { "auxiliary_loss_clip": 0.013201, "auxiliary_loss_mlp": 0.00237704, "balance_loss_clip": 1.07588589, "balance_loss_mlp": 0.21011904, "epoch": 0.6770479482940027, "flos": 15595968994560.0, "grad_norm": 186.54146122843073, "language_loss": 0.87568104, "learning_rate": 9.977522852340081e-07, "loss": 0.89125907, "num_input_tokens_seen": 243076630, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.27612305, "step": 11261, "time_per_iteration": 2.636981248855591 }, { "auxiliary_loss_clip": 0.01288065, "auxiliary_loss_mlp": 0.00205718, "balance_loss_clip": 1.05966091, "balance_loss_mlp": 0.18054113, "epoch": 0.6771080715466706, "flos": 18621011664000.0, "grad_norm": 1.9344882801306478, "language_loss": 0.9472822, "learning_rate": 9.97415273613666e-07, "loss": 0.96222007, "num_input_tokens_seen": 243092260, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.2520752, "step": 11262, "time_per_iteration": 2.619154453277588 }, { "auxiliary_loss_clip": 0.01296493, "auxiliary_loss_mlp": 0.00201708, "balance_loss_clip": 1.06491542, "balance_loss_mlp": 0.17625666, "epoch": 0.6771681947993387, "flos": 12495441893760.0, "grad_norm": 55.76934331525783, "language_loss": 0.82805419, "learning_rate": 9.97078300011439e-07, "loss": 0.84303617, "num_input_tokens_seen": 243109405, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.2545166, "step": 11263, "time_per_iteration": 2.6332430839538574 }, { "auxiliary_loss_clip": 0.01306044, "auxiliary_loss_mlp": 0.00213262, "balance_loss_clip": 1.06775928, "balance_loss_mlp": 0.18449718, "epoch": 0.6772283180520066, "flos": 22236964974720.0, "grad_norm": 2.9676118471325044, "language_loss": 0.79353553, "learning_rate": 9.967413644401016e-07, "loss": 0.80872858, "num_input_tokens_seen": 243128135, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.28747559, "step": 11264, "time_per_iteration": 2.6286263465881348 }, { "auxiliary_loss_clip": 0.01309012, "auxiliary_loss_mlp": 0.00207027, "balance_loss_clip": 1.07407391, "balance_loss_mlp": 0.17853622, "epoch": 0.6772884413046746, "flos": 16143139848960.0, "grad_norm": 4.539254012112613, "language_loss": 0.84984136, "learning_rate": 9.964044669124324e-07, "loss": 0.8650018, "num_input_tokens_seen": 243146785, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.28466797, "step": 11265, "time_per_iteration": 2.6215333938598633 }, { "auxiliary_loss_clip": 0.01295043, "auxiliary_loss_mlp": 0.00212036, "balance_loss_clip": 1.06685257, "balance_loss_mlp": 0.18716902, "epoch": 0.6773485645573426, "flos": 19135755515520.0, "grad_norm": 193.91547836874702, "language_loss": 0.69858325, "learning_rate": 9.96067607441207e-07, "loss": 0.71365404, "num_input_tokens_seen": 243165275, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.24853516, "step": 11266, "time_per_iteration": 2.6507067680358887 }, { "auxiliary_loss_clip": 0.01299726, "auxiliary_loss_mlp": 0.00223054, "balance_loss_clip": 1.0698086, "balance_loss_mlp": 0.19649437, "epoch": 0.6774086878100105, "flos": 14136918543360.0, "grad_norm": 6.073911330931136, "language_loss": 0.79655826, "learning_rate": 9.957307860391976e-07, "loss": 0.81178606, "num_input_tokens_seen": 243182845, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.26574707, "step": 11267, "time_per_iteration": 2.608888864517212 }, { "auxiliary_loss_clip": 0.01324473, "auxiliary_loss_mlp": 0.00218028, "balance_loss_clip": 1.07969809, "balance_loss_mlp": 0.18952471, "epoch": 0.6774688110626785, "flos": 22197067943040.0, "grad_norm": 9.966181283008698, "language_loss": 0.77148485, "learning_rate": 9.953940027191785e-07, "loss": 0.78690988, "num_input_tokens_seen": 243201475, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.28540039, "step": 11268, "time_per_iteration": 2.723778486251831 }, { "auxiliary_loss_clip": 0.0132533, "auxiliary_loss_mlp": 0.00219414, "balance_loss_clip": 1.08728361, "balance_loss_mlp": 0.19073229, "epoch": 0.6775289343153464, "flos": 23039963470080.0, "grad_norm": 31.458423882901357, "language_loss": 0.8415426, "learning_rate": 9.950572574939194e-07, "loss": 0.85698998, "num_input_tokens_seen": 243221850, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.28686523, "step": 11269, "time_per_iteration": 2.660388469696045 }, { "auxiliary_loss_clip": 0.01299854, "auxiliary_loss_mlp": 0.00210996, "balance_loss_clip": 1.06879091, "balance_loss_mlp": 0.18317288, "epoch": 0.6775890575680145, "flos": 18293506433280.0, "grad_norm": 170.92684964360814, "language_loss": 0.83930421, "learning_rate": 9.94720550376189e-07, "loss": 0.85441267, "num_input_tokens_seen": 243239855, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.27844238, "step": 11270, "time_per_iteration": 2.6789307594299316 }, { "auxiliary_loss_clip": 0.01301153, "auxiliary_loss_mlp": 0.00222821, "balance_loss_clip": 1.06892323, "balance_loss_mlp": 0.19506958, "epoch": 0.6776491808206824, "flos": 25336450581120.0, "grad_norm": 3.692228090660706, "language_loss": 0.79510987, "learning_rate": 9.94383881378756e-07, "loss": 0.81034958, "num_input_tokens_seen": 243260085, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.27734375, "step": 11271, "time_per_iteration": 2.6820244789123535 }, { "auxiliary_loss_clip": 0.01314575, "auxiliary_loss_mlp": 0.00188953, "balance_loss_clip": 1.08086634, "balance_loss_mlp": 0.16160686, "epoch": 0.6777093040733504, "flos": 26028233591040.0, "grad_norm": 21.29938881010118, "language_loss": 0.76385337, "learning_rate": 9.94047250514387e-07, "loss": 0.77888864, "num_input_tokens_seen": 243280065, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.2734375, "step": 11272, "time_per_iteration": 2.7630057334899902 }, { "auxiliary_loss_clip": 0.01306895, "auxiliary_loss_mlp": 0.00210071, "balance_loss_clip": 1.07330513, "balance_loss_mlp": 0.18007761, "epoch": 0.6777694273260183, "flos": 18003599763840.0, "grad_norm": 4.311547946234995, "language_loss": 0.81897163, "learning_rate": 9.937106577958481e-07, "loss": 0.83414137, "num_input_tokens_seen": 243297775, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.29980469, "step": 11273, "time_per_iteration": 2.663538694381714 }, { "auxiliary_loss_clip": 0.01300425, "auxiliary_loss_mlp": 0.00221647, "balance_loss_clip": 1.06985056, "balance_loss_mlp": 0.19283403, "epoch": 0.6778295505786863, "flos": 23441085624960.0, "grad_norm": 748.7022476552714, "language_loss": 0.77919465, "learning_rate": 9.933741032359015e-07, "loss": 0.79441535, "num_input_tokens_seen": 243315760, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.28820801, "step": 11274, "time_per_iteration": 2.753005027770996 }, { "auxiliary_loss_clip": 0.01307842, "auxiliary_loss_mlp": 0.00201453, "balance_loss_clip": 1.07115901, "balance_loss_mlp": 0.17343906, "epoch": 0.6778896738313542, "flos": 19098408349440.0, "grad_norm": 60.24082870717384, "language_loss": 0.74781477, "learning_rate": 9.930375868473093e-07, "loss": 0.76290768, "num_input_tokens_seen": 243335715, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.27990723, "step": 11275, "time_per_iteration": 2.6674227714538574 }, { "auxiliary_loss_clip": 0.01295039, "auxiliary_loss_mlp": 0.00220688, "balance_loss_clip": 1.07061398, "balance_loss_mlp": 0.19574933, "epoch": 0.6779497970840223, "flos": 26103933504000.0, "grad_norm": 81.46913128112362, "language_loss": 0.80885571, "learning_rate": 9.927011086428335e-07, "loss": 0.82401299, "num_input_tokens_seen": 243356935, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.24926758, "step": 11276, "time_per_iteration": 2.754884719848633 }, { "auxiliary_loss_clip": 0.01292634, "auxiliary_loss_mlp": 0.00201544, "balance_loss_clip": 1.06803954, "balance_loss_mlp": 0.17503206, "epoch": 0.6780099203366902, "flos": 19719232041600.0, "grad_norm": 33.89928082546112, "language_loss": 0.85210454, "learning_rate": 9.923646686352317e-07, "loss": 0.86704636, "num_input_tokens_seen": 243375625, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26550293, "step": 11277, "time_per_iteration": 2.631744146347046 }, { "auxiliary_loss_clip": 0.01337403, "auxiliary_loss_mlp": 0.0022117, "balance_loss_clip": 1.09417415, "balance_loss_mlp": 0.19214261, "epoch": 0.6780700435893582, "flos": 18214538382720.0, "grad_norm": 103.35896418452627, "language_loss": 0.9295696, "learning_rate": 9.920282668372627e-07, "loss": 0.94515538, "num_input_tokens_seen": 243390195, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.29016113, "step": 11278, "time_per_iteration": 2.642486572265625 }, { "auxiliary_loss_clip": 0.01300989, "auxiliary_loss_mlp": 0.00211436, "balance_loss_clip": 1.07519364, "balance_loss_mlp": 0.18568656, "epoch": 0.6781301668420262, "flos": 25376239872000.0, "grad_norm": 5.398722570520613, "language_loss": 0.76297128, "learning_rate": 9.916919032616844e-07, "loss": 0.77809548, "num_input_tokens_seen": 243411690, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.25744629, "step": 11279, "time_per_iteration": 2.7287485599517822 }, { "auxiliary_loss_clip": 0.01315107, "auxiliary_loss_mlp": 0.00208959, "balance_loss_clip": 1.0800308, "balance_loss_mlp": 0.18281616, "epoch": 0.6781902900946941, "flos": 24020432087040.0, "grad_norm": 855.4106859756599, "language_loss": 0.8327291, "learning_rate": 9.913555779212485e-07, "loss": 0.84796971, "num_input_tokens_seen": 243430280, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.26159668, "step": 11280, "time_per_iteration": 2.6987435817718506 }, { "auxiliary_loss_clip": 0.01321982, "auxiliary_loss_mlp": 0.00219451, "balance_loss_clip": 1.08347535, "balance_loss_mlp": 0.19283199, "epoch": 0.6782504133473621, "flos": 19646764352640.0, "grad_norm": 29.511036963529197, "language_loss": 0.80397546, "learning_rate": 9.910192908287104e-07, "loss": 0.81938982, "num_input_tokens_seen": 243448690, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.26623535, "step": 11281, "time_per_iteration": 2.7068095207214355 }, { "auxiliary_loss_clip": 0.01330775, "auxiliary_loss_mlp": 0.0021149, "balance_loss_clip": 1.09729505, "balance_loss_mlp": 0.18651575, "epoch": 0.67831053660003, "flos": 24932742647040.0, "grad_norm": 101.57531249571609, "language_loss": 0.70746022, "learning_rate": 9.906830419968217e-07, "loss": 0.72288287, "num_input_tokens_seen": 243470695, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.24975586, "step": 11282, "time_per_iteration": 2.714597225189209 }, { "auxiliary_loss_clip": 0.01344551, "auxiliary_loss_mlp": 0.00236908, "balance_loss_clip": 1.09721136, "balance_loss_mlp": 0.20850082, "epoch": 0.6783706598526981, "flos": 31208383440000.0, "grad_norm": 4.191243079076073, "language_loss": 0.81996882, "learning_rate": 9.90346831438334e-07, "loss": 0.83578336, "num_input_tokens_seen": 243493345, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.28417969, "step": 11283, "time_per_iteration": 2.768838882446289 }, { "auxiliary_loss_clip": 0.01306603, "auxiliary_loss_mlp": 0.00220294, "balance_loss_clip": 1.0757277, "balance_loss_mlp": 0.19456902, "epoch": 0.678430783105366, "flos": 35441317687680.0, "grad_norm": 9.038976015586, "language_loss": 0.62665313, "learning_rate": 9.900106591659948e-07, "loss": 0.64192212, "num_input_tokens_seen": 243515670, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.25732422, "step": 11284, "time_per_iteration": 2.806441068649292 }, { "auxiliary_loss_clip": 0.01317183, "auxiliary_loss_mlp": 0.00218698, "balance_loss_clip": 1.08059013, "balance_loss_mlp": 0.19248353, "epoch": 0.678490906358034, "flos": 14428800460800.0, "grad_norm": 2.772937886423487, "language_loss": 0.8447901, "learning_rate": 9.896745251925535e-07, "loss": 0.86014891, "num_input_tokens_seen": 243533625, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.26196289, "step": 11285, "time_per_iteration": 2.6513357162475586 }, { "auxiliary_loss_clip": 0.01309576, "auxiliary_loss_mlp": 0.0021389, "balance_loss_clip": 1.07677889, "balance_loss_mlp": 0.18682925, "epoch": 0.6785510296107019, "flos": 24311236596480.0, "grad_norm": 28.42528101944517, "language_loss": 0.73731726, "learning_rate": 9.893384295307557e-07, "loss": 0.75255191, "num_input_tokens_seen": 243553040, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.27026367, "step": 11286, "time_per_iteration": 2.6624886989593506 }, { "auxiliary_loss_clip": 0.01335215, "auxiliary_loss_mlp": 0.00223106, "balance_loss_clip": 1.08952641, "balance_loss_mlp": 0.19426952, "epoch": 0.6786111528633699, "flos": 26977244872320.0, "grad_norm": 10.122499328015232, "language_loss": 0.65500963, "learning_rate": 9.890023721933447e-07, "loss": 0.6705929, "num_input_tokens_seen": 243572590, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.28808594, "step": 11287, "time_per_iteration": 4.075164794921875 }, { "auxiliary_loss_clip": 0.01297356, "auxiliary_loss_mlp": 0.00241007, "balance_loss_clip": 1.07133138, "balance_loss_mlp": 0.21437603, "epoch": 0.6786712761160378, "flos": 24317557390080.0, "grad_norm": 4.808566460524855, "language_loss": 0.82946181, "learning_rate": 9.886663531930655e-07, "loss": 0.84484541, "num_input_tokens_seen": 243594140, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26635742, "step": 11288, "time_per_iteration": 2.7399380207061768 }, { "auxiliary_loss_clip": 0.0134779, "auxiliary_loss_mlp": 0.00222315, "balance_loss_clip": 1.10494685, "balance_loss_mlp": 0.19493297, "epoch": 0.6787313993687059, "flos": 22930435923840.0, "grad_norm": 12.800271192634636, "language_loss": 0.80086672, "learning_rate": 9.883303725426593e-07, "loss": 0.81656778, "num_input_tokens_seen": 243615170, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.2734375, "step": 11289, "time_per_iteration": 4.155390739440918 }, { "auxiliary_loss_clip": 0.01304629, "auxiliary_loss_mlp": 0.00212805, "balance_loss_clip": 1.07342029, "balance_loss_mlp": 0.18568458, "epoch": 0.6787915226213738, "flos": 26868435598080.0, "grad_norm": 5.047538352736781, "language_loss": 0.86163843, "learning_rate": 9.879944302548682e-07, "loss": 0.87681276, "num_input_tokens_seen": 243635675, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.27111816, "step": 11290, "time_per_iteration": 2.746690511703491 }, { "auxiliary_loss_clip": 0.01309427, "auxiliary_loss_mlp": 0.00231763, "balance_loss_clip": 1.08349967, "balance_loss_mlp": 0.20718208, "epoch": 0.6788516458740418, "flos": 20008851402240.0, "grad_norm": 32.34611754149499, "language_loss": 0.80958533, "learning_rate": 9.87658526342428e-07, "loss": 0.82499725, "num_input_tokens_seen": 243654950, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.24584961, "step": 11291, "time_per_iteration": 2.705991744995117 }, { "auxiliary_loss_clip": 0.01312834, "auxiliary_loss_mlp": 0.00211017, "balance_loss_clip": 1.07986045, "balance_loss_mlp": 0.18531564, "epoch": 0.6789117691267098, "flos": 28727099832960.0, "grad_norm": 4.16007525496291, "language_loss": 0.82633233, "learning_rate": 9.873226608180785e-07, "loss": 0.84157085, "num_input_tokens_seen": 243674970, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.25695801, "step": 11292, "time_per_iteration": 4.213390588760376 }, { "auxiliary_loss_clip": 0.01332597, "auxiliary_loss_mlp": 0.00260498, "balance_loss_clip": 1.09197164, "balance_loss_mlp": 0.23175633, "epoch": 0.6789718923793777, "flos": 23403451150080.0, "grad_norm": 8.580508335467105, "language_loss": 0.91306704, "learning_rate": 9.869868336945556e-07, "loss": 0.92899799, "num_input_tokens_seen": 243693440, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.28747559, "step": 11293, "time_per_iteration": 2.721670150756836 }, { "auxiliary_loss_clip": 0.01359747, "auxiliary_loss_mlp": 0.00232706, "balance_loss_clip": 1.11129522, "balance_loss_mlp": 0.20271252, "epoch": 0.6790320156320457, "flos": 20448865008000.0, "grad_norm": 3.8682650109924035, "language_loss": 0.91052288, "learning_rate": 9.866510449845929e-07, "loss": 0.92644745, "num_input_tokens_seen": 243710055, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.29968262, "step": 11294, "time_per_iteration": 2.7330214977264404 }, { "auxiliary_loss_clip": 0.01322434, "auxiliary_loss_mlp": 0.00214486, "balance_loss_clip": 1.08948255, "balance_loss_mlp": 0.18921322, "epoch": 0.6790921388847136, "flos": 24167199058560.0, "grad_norm": 50.93826601598886, "language_loss": 0.85482961, "learning_rate": 9.86315294700924e-07, "loss": 0.87019879, "num_input_tokens_seen": 243728635, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.25280762, "step": 11295, "time_per_iteration": 2.6926283836364746 }, { "auxiliary_loss_clip": 0.01294288, "auxiliary_loss_mlp": 0.00208455, "balance_loss_clip": 1.07394433, "balance_loss_mlp": 0.18439864, "epoch": 0.6791522621373817, "flos": 21908095027200.0, "grad_norm": 3.7104553850605106, "language_loss": 0.79499489, "learning_rate": 9.859795828562823e-07, "loss": 0.81002235, "num_input_tokens_seen": 243748330, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.24047852, "step": 11296, "time_per_iteration": 4.196814298629761 }, { "auxiliary_loss_clip": 0.01308648, "auxiliary_loss_mlp": 0.00209024, "balance_loss_clip": 1.08158088, "balance_loss_mlp": 0.18347792, "epoch": 0.6792123853900496, "flos": 24826519152000.0, "grad_norm": 48.418470308297486, "language_loss": 0.79253566, "learning_rate": 9.856439094633949e-07, "loss": 0.80771244, "num_input_tokens_seen": 243769380, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.25561523, "step": 11297, "time_per_iteration": 2.71862530708313 }, { "auxiliary_loss_clip": 0.01334393, "auxiliary_loss_mlp": 0.00231705, "balance_loss_clip": 1.08842838, "balance_loss_mlp": 0.20220038, "epoch": 0.6792725086427176, "flos": 17566279678080.0, "grad_norm": 10.395497677228484, "language_loss": 0.79382652, "learning_rate": 9.853082745349918e-07, "loss": 0.80948752, "num_input_tokens_seen": 243785510, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.29492188, "step": 11298, "time_per_iteration": 2.6678709983825684 }, { "auxiliary_loss_clip": 0.01323229, "auxiliary_loss_mlp": 0.00207873, "balance_loss_clip": 1.08893299, "balance_loss_mlp": 0.18262455, "epoch": 0.6793326318953855, "flos": 26941837040640.0, "grad_norm": 220.97177703423156, "language_loss": 0.7953952, "learning_rate": 9.84972678083801e-07, "loss": 0.81070626, "num_input_tokens_seen": 243805545, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.25256348, "step": 11299, "time_per_iteration": 2.7395448684692383 }, { "auxiliary_loss_clip": 0.01319469, "auxiliary_loss_mlp": 0.00246936, "balance_loss_clip": 1.08403754, "balance_loss_mlp": 0.2201378, "epoch": 0.6793927551480535, "flos": 24318275662080.0, "grad_norm": 59.382884595424066, "language_loss": 0.82772326, "learning_rate": 9.846371201225488e-07, "loss": 0.84338737, "num_input_tokens_seen": 243825185, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.26806641, "step": 11300, "time_per_iteration": 2.7566795349121094 }, { "auxiliary_loss_clip": 0.01318948, "auxiliary_loss_mlp": 0.00242405, "balance_loss_clip": 1.0859015, "balance_loss_mlp": 0.2138305, "epoch": 0.6794528784007214, "flos": 11436615757440.0, "grad_norm": 92.33796106141028, "language_loss": 0.73054093, "learning_rate": 9.843016006639577e-07, "loss": 0.74615443, "num_input_tokens_seen": 243841600, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.28552246, "step": 11301, "time_per_iteration": 2.6600849628448486 }, { "auxiliary_loss_clip": 0.01314419, "auxiliary_loss_mlp": 0.00216771, "balance_loss_clip": 1.08179462, "balance_loss_mlp": 0.18979371, "epoch": 0.6795130016533895, "flos": 25229688382080.0, "grad_norm": 42.61519046938056, "language_loss": 0.89500463, "learning_rate": 9.839661197207525e-07, "loss": 0.91031653, "num_input_tokens_seen": 243862250, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.26977539, "step": 11302, "time_per_iteration": 2.6935997009277344 }, { "auxiliary_loss_clip": 0.01321088, "auxiliary_loss_mlp": 0.00213752, "balance_loss_clip": 1.08478713, "balance_loss_mlp": 0.18677504, "epoch": 0.6795731249060574, "flos": 18296415434880.0, "grad_norm": 101.81306409398864, "language_loss": 0.78176069, "learning_rate": 9.83630677305654e-07, "loss": 0.79710907, "num_input_tokens_seen": 243880560, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.26977539, "step": 11303, "time_per_iteration": 2.6947760581970215 }, { "auxiliary_loss_clip": 0.01357681, "auxiliary_loss_mlp": 0.00210743, "balance_loss_clip": 1.10815465, "balance_loss_mlp": 0.1826694, "epoch": 0.6796332481587254, "flos": 20300374183680.0, "grad_norm": 2.655632736186707, "language_loss": 0.80241382, "learning_rate": 9.832952734313813e-07, "loss": 0.81809807, "num_input_tokens_seen": 243900635, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.28088379, "step": 11304, "time_per_iteration": 2.668069362640381 }, { "auxiliary_loss_clip": 0.01339115, "auxiliary_loss_mlp": 0.00239059, "balance_loss_clip": 1.10018468, "balance_loss_mlp": 0.21028209, "epoch": 0.6796933714113934, "flos": 23586847015680.0, "grad_norm": 24.265924647726155, "language_loss": 0.82209778, "learning_rate": 9.829599081106536e-07, "loss": 0.83787954, "num_input_tokens_seen": 243920160, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.28747559, "step": 11305, "time_per_iteration": 2.7086780071258545 }, { "auxiliary_loss_clip": 0.01333451, "auxiliary_loss_mlp": 0.0023132, "balance_loss_clip": 1.09788966, "balance_loss_mlp": 0.20489147, "epoch": 0.6797534946640613, "flos": 27119917693440.0, "grad_norm": 34.25944412804717, "language_loss": 0.76861006, "learning_rate": 9.826245813561882e-07, "loss": 0.78425777, "num_input_tokens_seen": 243939015, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.26428223, "step": 11306, "time_per_iteration": 2.6930007934570312 }, { "auxiliary_loss_clip": 0.01310056, "auxiliary_loss_mlp": 0.00218521, "balance_loss_clip": 1.08562684, "balance_loss_mlp": 0.19328472, "epoch": 0.6798136179167293, "flos": 22127437428480.0, "grad_norm": 39.614563097428835, "language_loss": 0.86693501, "learning_rate": 9.822892931807021e-07, "loss": 0.88222086, "num_input_tokens_seen": 243958470, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.25231934, "step": 11307, "time_per_iteration": 2.6930887699127197 }, { "auxiliary_loss_clip": 0.01327176, "auxiliary_loss_mlp": 0.00230771, "balance_loss_clip": 1.09396303, "balance_loss_mlp": 0.20427109, "epoch": 0.6798737411693972, "flos": 17488640430720.0, "grad_norm": 3.113852430876501, "language_loss": 0.94993961, "learning_rate": 9.819540435969066e-07, "loss": 0.96551907, "num_input_tokens_seen": 243975450, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.26501465, "step": 11308, "time_per_iteration": 2.6412384510040283 }, { "auxiliary_loss_clip": 0.01322778, "auxiliary_loss_mlp": 0.00219867, "balance_loss_clip": 1.08764672, "balance_loss_mlp": 0.19185312, "epoch": 0.6799338644220653, "flos": 22892262744960.0, "grad_norm": 1970.862576569574, "language_loss": 0.81521916, "learning_rate": 9.816188326175154e-07, "loss": 0.83064568, "num_input_tokens_seen": 243994355, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.27990723, "step": 11309, "time_per_iteration": 2.6705515384674072 }, { "auxiliary_loss_clip": 0.01314644, "auxiliary_loss_mlp": 0.00219119, "balance_loss_clip": 1.0840137, "balance_loss_mlp": 0.19116473, "epoch": 0.6799939876747332, "flos": 23180409648000.0, "grad_norm": 2.0986490342564346, "language_loss": 0.92172986, "learning_rate": 9.812836602552411e-07, "loss": 0.93706745, "num_input_tokens_seen": 244011620, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.27954102, "step": 11310, "time_per_iteration": 2.6523659229278564 }, { "auxiliary_loss_clip": 0.01301225, "auxiliary_loss_mlp": 0.00204937, "balance_loss_clip": 1.0768702, "balance_loss_mlp": 0.18014117, "epoch": 0.6800541109274012, "flos": 19499925553920.0, "grad_norm": 28.29991023288008, "language_loss": 0.9104625, "learning_rate": 9.80948526522792e-07, "loss": 0.92552412, "num_input_tokens_seen": 244029925, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.24780273, "step": 11311, "time_per_iteration": 2.711142063140869 }, { "auxiliary_loss_clip": 0.01334849, "auxiliary_loss_mlp": 0.00234293, "balance_loss_clip": 1.09002721, "balance_loss_mlp": 0.20477691, "epoch": 0.6801142341800691, "flos": 22277652105600.0, "grad_norm": 33.71440914258333, "language_loss": 0.8423481, "learning_rate": 9.806134314328767e-07, "loss": 0.85803956, "num_input_tokens_seen": 244051225, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.29504395, "step": 11312, "time_per_iteration": 2.744414806365967 }, { "auxiliary_loss_clip": 0.01200189, "auxiliary_loss_mlp": 0.00174708, "balance_loss_clip": 1.04871559, "balance_loss_mlp": 0.16359723, "epoch": 0.6801743574327371, "flos": 68714817759360.0, "grad_norm": 0.974939239236735, "language_loss": 0.56720746, "learning_rate": 9.802783749982038e-07, "loss": 0.5809564, "num_input_tokens_seen": 244115930, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.11132812, "step": 11313, "time_per_iteration": 3.2752668857574463 }, { "auxiliary_loss_clip": 0.01322041, "auxiliary_loss_mlp": 0.00228143, "balance_loss_clip": 1.0875268, "balance_loss_mlp": 0.20282267, "epoch": 0.680234480685405, "flos": 29460467813760.0, "grad_norm": 33.91793621755427, "language_loss": 0.75060344, "learning_rate": 9.799433572314754e-07, "loss": 0.76610529, "num_input_tokens_seen": 244137320, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.25341797, "step": 11314, "time_per_iteration": 2.7429568767547607 }, { "auxiliary_loss_clip": 0.01320048, "auxiliary_loss_mlp": 0.00225395, "balance_loss_clip": 1.0877316, "balance_loss_mlp": 0.19845358, "epoch": 0.6802946039380731, "flos": 15916866122880.0, "grad_norm": 937.3890076649097, "language_loss": 0.87088001, "learning_rate": 9.796083781453972e-07, "loss": 0.88633442, "num_input_tokens_seen": 244152755, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.26916504, "step": 11315, "time_per_iteration": 2.601853847503662 }, { "auxiliary_loss_clip": 0.01319458, "auxiliary_loss_mlp": 0.00231848, "balance_loss_clip": 1.08410025, "balance_loss_mlp": 0.20512152, "epoch": 0.680354727190741, "flos": 22018664067840.0, "grad_norm": 25.556567692300085, "language_loss": 0.77385712, "learning_rate": 9.792734377526718e-07, "loss": 0.78937018, "num_input_tokens_seen": 244171480, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.26757812, "step": 11316, "time_per_iteration": 2.695786714553833 }, { "auxiliary_loss_clip": 0.01324297, "auxiliary_loss_mlp": 0.0024933, "balance_loss_clip": 1.09068692, "balance_loss_mlp": 0.22138724, "epoch": 0.680414850443409, "flos": 18441494467200.0, "grad_norm": 4.618903987695485, "language_loss": 0.75081897, "learning_rate": 9.789385360660003e-07, "loss": 0.76655531, "num_input_tokens_seen": 244187920, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.27954102, "step": 11317, "time_per_iteration": 2.6427836418151855 }, { "auxiliary_loss_clip": 0.01327075, "auxiliary_loss_mlp": 0.00223803, "balance_loss_clip": 1.0897119, "balance_loss_mlp": 0.19665933, "epoch": 0.680474973696077, "flos": 26358611909760.0, "grad_norm": 4.1378780023082, "language_loss": 0.83072293, "learning_rate": 9.78603673098082e-07, "loss": 0.84623176, "num_input_tokens_seen": 244209565, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.27148438, "step": 11318, "time_per_iteration": 2.711071729660034 }, { "auxiliary_loss_clip": 0.01289979, "auxiliary_loss_mlp": 0.0021701, "balance_loss_clip": 1.06784201, "balance_loss_mlp": 0.19357356, "epoch": 0.6805350969487449, "flos": 18333116156160.0, "grad_norm": 2.9069256391834766, "language_loss": 0.74351805, "learning_rate": 9.782688488616143e-07, "loss": 0.75858796, "num_input_tokens_seen": 244228015, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.234375, "step": 11319, "time_per_iteration": 2.6153879165649414 }, { "auxiliary_loss_clip": 0.01312093, "auxiliary_loss_mlp": 0.00205154, "balance_loss_clip": 1.08154595, "balance_loss_mlp": 0.17873746, "epoch": 0.6805952202014129, "flos": 19937497034880.0, "grad_norm": 488.93365288879244, "language_loss": 0.83895481, "learning_rate": 9.779340633692945e-07, "loss": 0.85412729, "num_input_tokens_seen": 244245615, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.26416016, "step": 11320, "time_per_iteration": 2.717703342437744 }, { "auxiliary_loss_clip": 0.01327164, "auxiliary_loss_mlp": 0.00232659, "balance_loss_clip": 1.09228742, "balance_loss_mlp": 0.20558643, "epoch": 0.6806553434540809, "flos": 25224301342080.0, "grad_norm": 38.394007313471306, "language_loss": 0.81871909, "learning_rate": 9.77599316633817e-07, "loss": 0.83431733, "num_input_tokens_seen": 244263625, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.27062988, "step": 11321, "time_per_iteration": 2.665618419647217 }, { "auxiliary_loss_clip": 0.01326938, "auxiliary_loss_mlp": 0.00245909, "balance_loss_clip": 1.09138346, "balance_loss_mlp": 0.21906283, "epoch": 0.6807154667067489, "flos": 17785586165760.0, "grad_norm": 2.439947391372625, "language_loss": 0.7961266, "learning_rate": 9.772646086678758e-07, "loss": 0.81185508, "num_input_tokens_seen": 244282745, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.26843262, "step": 11322, "time_per_iteration": 2.760840654373169 }, { "auxiliary_loss_clip": 0.01315106, "auxiliary_loss_mlp": 0.00229435, "balance_loss_clip": 1.08031905, "balance_loss_mlp": 0.2026011, "epoch": 0.6807755899594168, "flos": 22199905117440.0, "grad_norm": 21.081301451474275, "language_loss": 0.86804044, "learning_rate": 9.769299394841638e-07, "loss": 0.88348585, "num_input_tokens_seen": 244303770, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.26818848, "step": 11323, "time_per_iteration": 2.7937393188476562 }, { "auxiliary_loss_clip": 0.0117847, "auxiliary_loss_mlp": 0.00161085, "balance_loss_clip": 1.02806103, "balance_loss_mlp": 0.15107137, "epoch": 0.6808357132120848, "flos": 68631073200000.0, "grad_norm": 0.7786063341856444, "language_loss": 0.56555778, "learning_rate": 9.765953090953714e-07, "loss": 0.57895333, "num_input_tokens_seen": 244355910, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.10009766, "step": 11324, "time_per_iteration": 2.9567527770996094 }, { "auxiliary_loss_clip": 0.01335103, "auxiliary_loss_mlp": 0.00234431, "balance_loss_clip": 1.09821689, "balance_loss_mlp": 0.20704909, "epoch": 0.6808958364647527, "flos": 23843357015040.0, "grad_norm": 3.7369134854781287, "language_loss": 0.7840873, "learning_rate": 9.76260717514186e-07, "loss": 0.79978263, "num_input_tokens_seen": 244376610, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.27380371, "step": 11325, "time_per_iteration": 2.688129186630249 }, { "auxiliary_loss_clip": 0.01327312, "auxiliary_loss_mlp": 0.00227113, "balance_loss_clip": 1.08764386, "balance_loss_mlp": 0.19679812, "epoch": 0.6809559597174207, "flos": 17711717846400.0, "grad_norm": 12.87350082893165, "language_loss": 0.77954024, "learning_rate": 9.759261647532974e-07, "loss": 0.79508448, "num_input_tokens_seen": 244393000, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.30285645, "step": 11326, "time_per_iteration": 2.6052799224853516 }, { "auxiliary_loss_clip": 0.01318836, "auxiliary_loss_mlp": 0.0022485, "balance_loss_clip": 1.08550775, "balance_loss_mlp": 0.19901715, "epoch": 0.6810160829700886, "flos": 22491894775680.0, "grad_norm": 131.43899559331504, "language_loss": 0.79321003, "learning_rate": 9.75591650825392e-07, "loss": 0.80864692, "num_input_tokens_seen": 244409515, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.25854492, "step": 11327, "time_per_iteration": 2.670639991760254 }, { "auxiliary_loss_clip": 0.01332608, "auxiliary_loss_mlp": 0.00230138, "balance_loss_clip": 1.09620059, "balance_loss_mlp": 0.20167072, "epoch": 0.6810762062227567, "flos": 16832875783680.0, "grad_norm": 294.0619182677346, "language_loss": 0.84627366, "learning_rate": 9.752571757431526e-07, "loss": 0.86190116, "num_input_tokens_seen": 244427165, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.28491211, "step": 11328, "time_per_iteration": 2.6032655239105225 }, { "auxiliary_loss_clip": 0.01322629, "auxiliary_loss_mlp": 0.00230872, "balance_loss_clip": 1.08326077, "balance_loss_mlp": 0.20316795, "epoch": 0.6811363294754246, "flos": 12714676554240.0, "grad_norm": 26.96706487279339, "language_loss": 0.73656809, "learning_rate": 9.74922739519265e-07, "loss": 0.75210315, "num_input_tokens_seen": 244445705, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.27697754, "step": 11329, "time_per_iteration": 4.0281922817230225 }, { "auxiliary_loss_clip": 0.01325922, "auxiliary_loss_mlp": 0.00223582, "balance_loss_clip": 1.08471787, "balance_loss_mlp": 0.19649783, "epoch": 0.6811964527280926, "flos": 17711969241600.0, "grad_norm": 19.541979627151445, "language_loss": 0.8559655, "learning_rate": 9.745883421664096e-07, "loss": 0.87146056, "num_input_tokens_seen": 244460415, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.27050781, "step": 11330, "time_per_iteration": 2.6189651489257812 }, { "auxiliary_loss_clip": 0.01320902, "auxiliary_loss_mlp": 0.00225218, "balance_loss_clip": 1.08924818, "balance_loss_mlp": 0.19758505, "epoch": 0.6812565759807605, "flos": 24863471268480.0, "grad_norm": 29.8491246882194, "language_loss": 0.71888965, "learning_rate": 9.742539836972665e-07, "loss": 0.73435086, "num_input_tokens_seen": 244480555, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.27636719, "step": 11331, "time_per_iteration": 4.166785001754761 }, { "auxiliary_loss_clip": 0.01326166, "auxiliary_loss_mlp": 0.00221086, "balance_loss_clip": 1.09001553, "balance_loss_mlp": 0.19387066, "epoch": 0.6813166992334285, "flos": 17166019449600.0, "grad_norm": 60.32069536375336, "language_loss": 0.80901849, "learning_rate": 9.739196641245148e-07, "loss": 0.82449102, "num_input_tokens_seen": 244498540, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.27233887, "step": 11332, "time_per_iteration": 2.6404590606689453 }, { "auxiliary_loss_clip": 0.01318961, "auxiliary_loss_mlp": 0.00239364, "balance_loss_clip": 1.0839808, "balance_loss_mlp": 0.21410345, "epoch": 0.6813768224860965, "flos": 18843550375680.0, "grad_norm": 89.75654103361083, "language_loss": 0.83005482, "learning_rate": 9.735853834608326e-07, "loss": 0.8456381, "num_input_tokens_seen": 244517015, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.25280762, "step": 11333, "time_per_iteration": 2.6837387084960938 }, { "auxiliary_loss_clip": 0.01347256, "auxiliary_loss_mlp": 0.00249319, "balance_loss_clip": 1.10680425, "balance_loss_mlp": 0.22166267, "epoch": 0.6814369457387645, "flos": 24532733813760.0, "grad_norm": 11.328477005705668, "language_loss": 0.77963328, "learning_rate": 9.732511417188963e-07, "loss": 0.79559898, "num_input_tokens_seen": 244537450, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.27648926, "step": 11334, "time_per_iteration": 2.6901237964630127 }, { "auxiliary_loss_clip": 0.01315, "auxiliary_loss_mlp": 0.00230105, "balance_loss_clip": 1.08718622, "balance_loss_mlp": 0.20571497, "epoch": 0.6814970689914325, "flos": 18222978078720.0, "grad_norm": 31.200579773330176, "language_loss": 0.92576712, "learning_rate": 9.729169389113791e-07, "loss": 0.94121814, "num_input_tokens_seen": 244555640, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.24389648, "step": 11335, "time_per_iteration": 4.090856313705444 }, { "auxiliary_loss_clip": 0.01313851, "auxiliary_loss_mlp": 0.00219836, "balance_loss_clip": 1.08630514, "balance_loss_mlp": 0.19438514, "epoch": 0.6815571922441004, "flos": 25228790542080.0, "grad_norm": 28.67166025867169, "language_loss": 0.89645892, "learning_rate": 9.725827750509542e-07, "loss": 0.91179574, "num_input_tokens_seen": 244574005, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.25463867, "step": 11336, "time_per_iteration": 2.684993028640747 }, { "auxiliary_loss_clip": 0.01300377, "auxiliary_loss_mlp": 0.00237905, "balance_loss_clip": 1.07524395, "balance_loss_mlp": 0.2123228, "epoch": 0.6816173154967684, "flos": 19456078026240.0, "grad_norm": 6.2520092393676885, "language_loss": 0.88289475, "learning_rate": 9.72248650150294e-07, "loss": 0.89827752, "num_input_tokens_seen": 244591395, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.2557373, "step": 11337, "time_per_iteration": 2.6603710651397705 }, { "auxiliary_loss_clip": 0.01290707, "auxiliary_loss_mlp": 0.00206683, "balance_loss_clip": 1.06719398, "balance_loss_mlp": 0.18092155, "epoch": 0.6816774387494363, "flos": 17931455297280.0, "grad_norm": 31.692101224895776, "language_loss": 0.78996348, "learning_rate": 9.719145642220673e-07, "loss": 0.80493742, "num_input_tokens_seen": 244610400, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25756836, "step": 11338, "time_per_iteration": 4.083198547363281 }, { "auxiliary_loss_clip": 0.01306936, "auxiliary_loss_mlp": 0.00229136, "balance_loss_clip": 1.07847929, "balance_loss_mlp": 0.2047455, "epoch": 0.6817375620021043, "flos": 22233014478720.0, "grad_norm": 6.200340893422323, "language_loss": 0.84643358, "learning_rate": 9.715805172789435e-07, "loss": 0.86179429, "num_input_tokens_seen": 244630400, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.24389648, "step": 11339, "time_per_iteration": 2.6369268894195557 }, { "auxiliary_loss_clip": 0.01328998, "auxiliary_loss_mlp": 0.00244791, "balance_loss_clip": 1.0965687, "balance_loss_mlp": 0.21794489, "epoch": 0.6817976852547722, "flos": 25374408278400.0, "grad_norm": 12.272080836668996, "language_loss": 0.7915557, "learning_rate": 9.712465093335901e-07, "loss": 0.80729353, "num_input_tokens_seen": 244649155, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.26855469, "step": 11340, "time_per_iteration": 2.7062699794769287 }, { "auxiliary_loss_clip": 0.01350481, "auxiliary_loss_mlp": 0.00213112, "balance_loss_clip": 1.10318327, "balance_loss_mlp": 0.18278527, "epoch": 0.6818578085074403, "flos": 22265764704000.0, "grad_norm": 5.079924212906128, "language_loss": 0.97659248, "learning_rate": 9.709125403986722e-07, "loss": 0.99222845, "num_input_tokens_seen": 244665470, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.30322266, "step": 11341, "time_per_iteration": 2.6394901275634766 }, { "auxiliary_loss_clip": 0.01310797, "auxiliary_loss_mlp": 0.00224593, "balance_loss_clip": 1.07766902, "balance_loss_mlp": 0.19685295, "epoch": 0.6819179317601082, "flos": 19318145800320.0, "grad_norm": 7.757833861847879, "language_loss": 0.76598269, "learning_rate": 9.705786104868531e-07, "loss": 0.78133655, "num_input_tokens_seen": 244684390, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.27746582, "step": 11342, "time_per_iteration": 2.782640218734741 }, { "auxiliary_loss_clip": 0.01301721, "auxiliary_loss_mlp": 0.00233328, "balance_loss_clip": 1.07561302, "balance_loss_mlp": 0.2066485, "epoch": 0.6819780550127762, "flos": 21104126864640.0, "grad_norm": 350.5037640289769, "language_loss": 0.81567067, "learning_rate": 9.702447196107963e-07, "loss": 0.83102113, "num_input_tokens_seen": 244703370, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26672363, "step": 11343, "time_per_iteration": 2.6717472076416016 }, { "auxiliary_loss_clip": 0.01326219, "auxiliary_loss_mlp": 0.00221837, "balance_loss_clip": 1.09693468, "balance_loss_mlp": 0.19558708, "epoch": 0.6820381782654441, "flos": 29716403195520.0, "grad_norm": 4.5198485772932635, "language_loss": 0.86427736, "learning_rate": 9.699108677831639e-07, "loss": 0.87975788, "num_input_tokens_seen": 244723325, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.2623291, "step": 11344, "time_per_iteration": 2.7020976543426514 }, { "auxiliary_loss_clip": 0.01315913, "auxiliary_loss_mlp": 0.00237559, "balance_loss_clip": 1.08190072, "balance_loss_mlp": 0.21036755, "epoch": 0.6820983015181121, "flos": 29242130993280.0, "grad_norm": 18.31676394500806, "language_loss": 0.72755826, "learning_rate": 9.695770550166136e-07, "loss": 0.74309295, "num_input_tokens_seen": 244745650, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.27172852, "step": 11345, "time_per_iteration": 2.7447431087493896 }, { "auxiliary_loss_clip": 0.01343647, "auxiliary_loss_mlp": 0.00215412, "balance_loss_clip": 1.10537243, "balance_loss_mlp": 0.18639693, "epoch": 0.6821584247707801, "flos": 18871775487360.0, "grad_norm": 108.89467184277882, "language_loss": 0.75636524, "learning_rate": 9.692432813238054e-07, "loss": 0.77195579, "num_input_tokens_seen": 244760270, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.2902832, "step": 11346, "time_per_iteration": 2.6062843799591064 }, { "auxiliary_loss_clip": 0.0134568, "auxiliary_loss_mlp": 0.00228426, "balance_loss_clip": 1.10518646, "balance_loss_mlp": 0.20079292, "epoch": 0.6822185480234481, "flos": 21324582587520.0, "grad_norm": 3.6535129832131066, "language_loss": 0.85008955, "learning_rate": 9.689095467173952e-07, "loss": 0.8658306, "num_input_tokens_seen": 244779565, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.27648926, "step": 11347, "time_per_iteration": 2.6937942504882812 }, { "auxiliary_loss_clip": 0.01170686, "auxiliary_loss_mlp": 0.00099261, "balance_loss_clip": 1.01774716, "balance_loss_mlp": 0.09234683, "epoch": 0.6822786712761161, "flos": 63488306430720.0, "grad_norm": 0.7094703262582747, "language_loss": 0.51985794, "learning_rate": 9.685758512100378e-07, "loss": 0.53255737, "num_input_tokens_seen": 244838480, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.06933594, "step": 11348, "time_per_iteration": 3.148404359817505 }, { "auxiliary_loss_clip": 0.01328772, "auxiliary_loss_mlp": 0.00244963, "balance_loss_clip": 1.09647274, "balance_loss_mlp": 0.21651936, "epoch": 0.682338794528784, "flos": 21068934514560.0, "grad_norm": 14.922133044727996, "language_loss": 0.85822189, "learning_rate": 9.682421948143873e-07, "loss": 0.87395924, "num_input_tokens_seen": 244855265, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.2845459, "step": 11349, "time_per_iteration": 2.6567888259887695 }, { "auxiliary_loss_clip": 0.01376108, "auxiliary_loss_mlp": 0.00254506, "balance_loss_clip": 1.12300181, "balance_loss_mlp": 0.22241443, "epoch": 0.682398917781452, "flos": 36283243547520.0, "grad_norm": 7.228374132193785, "language_loss": 0.83979142, "learning_rate": 9.67908577543096e-07, "loss": 0.85609758, "num_input_tokens_seen": 244875555, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.32104492, "step": 11350, "time_per_iteration": 2.7415237426757812 }, { "auxiliary_loss_clip": 0.01329801, "auxiliary_loss_mlp": 0.00235319, "balance_loss_clip": 1.0942204, "balance_loss_mlp": 0.20894971, "epoch": 0.6824590410341199, "flos": 24859197550080.0, "grad_norm": 4.663517850575071, "language_loss": 0.8563695, "learning_rate": 9.675749994088161e-07, "loss": 0.87202072, "num_input_tokens_seen": 244895270, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.26391602, "step": 11351, "time_per_iteration": 2.6771934032440186 }, { "auxiliary_loss_clip": 0.01320978, "auxiliary_loss_mlp": 0.00220047, "balance_loss_clip": 1.08888996, "balance_loss_mlp": 0.19408366, "epoch": 0.6825191642867879, "flos": 22452392793600.0, "grad_norm": 8.209618898660578, "language_loss": 0.79626787, "learning_rate": 9.672414604241954e-07, "loss": 0.81167817, "num_input_tokens_seen": 244914535, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.2598877, "step": 11352, "time_per_iteration": 2.6202590465545654 }, { "auxiliary_loss_clip": 0.01339779, "auxiliary_loss_mlp": 0.00219549, "balance_loss_clip": 1.10274887, "balance_loss_mlp": 0.19164234, "epoch": 0.6825792875394558, "flos": 29424377623680.0, "grad_norm": 8.481243143774462, "language_loss": 0.87259305, "learning_rate": 9.669079606018814e-07, "loss": 0.88818634, "num_input_tokens_seen": 244936095, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.2791748, "step": 11353, "time_per_iteration": 2.7030444145202637 }, { "auxiliary_loss_clip": 0.0134183, "auxiliary_loss_mlp": 0.00208366, "balance_loss_clip": 1.10182726, "balance_loss_mlp": 0.18079241, "epoch": 0.6826394107921239, "flos": 18770974945920.0, "grad_norm": 4.3522275685777, "language_loss": 0.86253417, "learning_rate": 9.665744999545218e-07, "loss": 0.87803614, "num_input_tokens_seen": 244955290, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.27575684, "step": 11354, "time_per_iteration": 2.6290395259857178 }, { "auxiliary_loss_clip": 0.01334799, "auxiliary_loss_mlp": 0.00217497, "balance_loss_clip": 1.09897757, "balance_loss_mlp": 0.18980497, "epoch": 0.6826995340447918, "flos": 16617591619200.0, "grad_norm": 8.318967449939317, "language_loss": 0.71902382, "learning_rate": 9.662410784947599e-07, "loss": 0.73454678, "num_input_tokens_seen": 244972935, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.27697754, "step": 11355, "time_per_iteration": 2.659524917602539 }, { "auxiliary_loss_clip": 0.01323209, "auxiliary_loss_mlp": 0.00207773, "balance_loss_clip": 1.08947253, "balance_loss_mlp": 0.18139245, "epoch": 0.6827596572974598, "flos": 20848299223680.0, "grad_norm": 6.983634545652979, "language_loss": 0.89719748, "learning_rate": 9.659076962352398e-07, "loss": 0.9125073, "num_input_tokens_seen": 244989440, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.26379395, "step": 11356, "time_per_iteration": 2.638920545578003 }, { "auxiliary_loss_clip": 0.01352767, "auxiliary_loss_mlp": 0.0022392, "balance_loss_clip": 1.10921419, "balance_loss_mlp": 0.19600117, "epoch": 0.6828197805501277, "flos": 22748081552640.0, "grad_norm": 24.05930095201894, "language_loss": 0.85512888, "learning_rate": 9.655743531886052e-07, "loss": 0.87089574, "num_input_tokens_seen": 245007830, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.27929688, "step": 11357, "time_per_iteration": 2.6437485218048096 }, { "auxiliary_loss_clip": 0.01152857, "auxiliary_loss_mlp": 0.00123835, "balance_loss_clip": 1.00159669, "balance_loss_mlp": 0.11630123, "epoch": 0.6828799038027957, "flos": 71646565829760.0, "grad_norm": 0.8085626736641585, "language_loss": 0.59049946, "learning_rate": 9.65241049367493e-07, "loss": 0.60326636, "num_input_tokens_seen": 245070720, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.07519531, "step": 11358, "time_per_iteration": 3.210094928741455 }, { "auxiliary_loss_clip": 0.0132394, "auxiliary_loss_mlp": 0.00233212, "balance_loss_clip": 1.08705711, "balance_loss_mlp": 0.20475665, "epoch": 0.6829400270554637, "flos": 19829154637440.0, "grad_norm": 27.771926705006624, "language_loss": 0.88635445, "learning_rate": 9.64907784784544e-07, "loss": 0.90192604, "num_input_tokens_seen": 245089070, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.28430176, "step": 11359, "time_per_iteration": 2.679016351699829 }, { "auxiliary_loss_clip": 0.01350268, "auxiliary_loss_mlp": 0.00240355, "balance_loss_clip": 1.10180569, "balance_loss_mlp": 0.2100763, "epoch": 0.6830001503081317, "flos": 21980634543360.0, "grad_norm": 11.285460129189289, "language_loss": 0.91048473, "learning_rate": 9.645745594523958e-07, "loss": 0.92639101, "num_input_tokens_seen": 245106500, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.30273438, "step": 11360, "time_per_iteration": 2.7041373252868652 }, { "auxiliary_loss_clip": 0.01320147, "auxiliary_loss_mlp": 0.00216439, "balance_loss_clip": 1.08792067, "balance_loss_mlp": 0.18823454, "epoch": 0.6830602735607997, "flos": 24316767290880.0, "grad_norm": 24.022104849084034, "language_loss": 0.80748415, "learning_rate": 9.642413733836844e-07, "loss": 0.82285005, "num_input_tokens_seen": 245125260, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.28222656, "step": 11361, "time_per_iteration": 2.679302453994751 }, { "auxiliary_loss_clip": 0.01163024, "auxiliary_loss_mlp": 0.00105032, "balance_loss_clip": 1.0107224, "balance_loss_mlp": 0.09678274, "epoch": 0.6831203968134676, "flos": 57690062323200.0, "grad_norm": 0.8537423254233002, "language_loss": 0.58059853, "learning_rate": 9.639082265910437e-07, "loss": 0.59327906, "num_input_tokens_seen": 245188730, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.08251953, "step": 11362, "time_per_iteration": 3.212855577468872 }, { "auxiliary_loss_clip": 0.01323737, "auxiliary_loss_mlp": 0.0022489, "balance_loss_clip": 1.08710849, "balance_loss_mlp": 0.19328766, "epoch": 0.6831805200661356, "flos": 14388436552320.0, "grad_norm": 66.45598451697782, "language_loss": 0.86763436, "learning_rate": 9.635751190871074e-07, "loss": 0.88312066, "num_input_tokens_seen": 245205065, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.31567383, "step": 11363, "time_per_iteration": 2.6062698364257812 }, { "auxiliary_loss_clip": 0.01323461, "auxiliary_loss_mlp": 0.00235961, "balance_loss_clip": 1.09624028, "balance_loss_mlp": 0.20677876, "epoch": 0.6832406433188035, "flos": 22820297846400.0, "grad_norm": 14.78062015443939, "language_loss": 0.99370527, "learning_rate": 9.632420508845063e-07, "loss": 1.00929952, "num_input_tokens_seen": 245224265, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.29150391, "step": 11364, "time_per_iteration": 2.712170124053955 }, { "auxiliary_loss_clip": 0.01330578, "auxiliary_loss_mlp": 0.00215283, "balance_loss_clip": 1.09652936, "balance_loss_mlp": 0.18841347, "epoch": 0.6833007665714715, "flos": 17561718650880.0, "grad_norm": 6.596449783504465, "language_loss": 0.9476493, "learning_rate": 9.629090219958697e-07, "loss": 0.96310788, "num_input_tokens_seen": 245243360, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.26843262, "step": 11365, "time_per_iteration": 2.6384382247924805 }, { "auxiliary_loss_clip": 0.01350287, "auxiliary_loss_mlp": 0.00214927, "balance_loss_clip": 1.10680246, "balance_loss_mlp": 0.18499368, "epoch": 0.6833608898241395, "flos": 22445928345600.0, "grad_norm": 146.52456472484948, "language_loss": 0.92077136, "learning_rate": 9.625760324338272e-07, "loss": 0.93642348, "num_input_tokens_seen": 245256350, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.29919434, "step": 11366, "time_per_iteration": 2.6060526371002197 }, { "auxiliary_loss_clip": 0.01327476, "auxiliary_loss_mlp": 0.00218719, "balance_loss_clip": 1.09189796, "balance_loss_mlp": 0.18977532, "epoch": 0.6834210130768075, "flos": 24534637234560.0, "grad_norm": 5.405924753602497, "language_loss": 0.83705574, "learning_rate": 9.622430822110062e-07, "loss": 0.85251766, "num_input_tokens_seen": 245277575, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.28930664, "step": 11367, "time_per_iteration": 2.790229320526123 }, { "auxiliary_loss_clip": 0.01339277, "auxiliary_loss_mlp": 0.0022667, "balance_loss_clip": 1.09930801, "balance_loss_mlp": 0.19711784, "epoch": 0.6834811363294754, "flos": 20047132321920.0, "grad_norm": 21.72607634374216, "language_loss": 0.77226353, "learning_rate": 9.619101713400312e-07, "loss": 0.78792298, "num_input_tokens_seen": 245296615, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.29528809, "step": 11368, "time_per_iteration": 2.6209146976470947 }, { "auxiliary_loss_clip": 0.01319464, "auxiliary_loss_mlp": 0.00210065, "balance_loss_clip": 1.08541298, "balance_loss_mlp": 0.18253976, "epoch": 0.6835412595821434, "flos": 24790752184320.0, "grad_norm": 7.468361554133743, "language_loss": 0.81972206, "learning_rate": 9.615772998335261e-07, "loss": 0.83501738, "num_input_tokens_seen": 245316275, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.27514648, "step": 11369, "time_per_iteration": 2.6648309230804443 }, { "auxiliary_loss_clip": 0.01345137, "auxiliary_loss_mlp": 0.0021839, "balance_loss_clip": 1.10420227, "balance_loss_mlp": 0.18688326, "epoch": 0.6836013828348113, "flos": 19500356517120.0, "grad_norm": 11.566002239938404, "language_loss": 0.87424445, "learning_rate": 9.612444677041138e-07, "loss": 0.88987976, "num_input_tokens_seen": 245334595, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.31506348, "step": 11370, "time_per_iteration": 2.6069774627685547 }, { "auxiliary_loss_clip": 0.01156683, "auxiliary_loss_mlp": 0.00127264, "balance_loss_clip": 1.00525641, "balance_loss_mlp": 0.11849048, "epoch": 0.6836615060874793, "flos": 58363999251840.0, "grad_norm": 0.7846285360047448, "language_loss": 0.59606874, "learning_rate": 9.609116749644162e-07, "loss": 0.60890818, "num_input_tokens_seen": 245389750, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.08789062, "step": 11371, "time_per_iteration": 3.049044132232666 }, { "auxiliary_loss_clip": 0.0130859, "auxiliary_loss_mlp": 0.00212791, "balance_loss_clip": 1.08044767, "balance_loss_mlp": 0.18596914, "epoch": 0.6837216293401474, "flos": 12166895168640.0, "grad_norm": 9617.15315793117, "language_loss": 0.71199423, "learning_rate": 9.605789216270511e-07, "loss": 0.72720802, "num_input_tokens_seen": 245407530, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.26806641, "step": 11372, "time_per_iteration": 4.057488203048706 }, { "auxiliary_loss_clip": 0.01334152, "auxiliary_loss_mlp": 0.0022438, "balance_loss_clip": 1.1017971, "balance_loss_mlp": 0.19762969, "epoch": 0.6837817525928153, "flos": 22127581082880.0, "grad_norm": 20.63110418043011, "language_loss": 0.79280746, "learning_rate": 9.602462077046375e-07, "loss": 0.80839282, "num_input_tokens_seen": 245427000, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.26757812, "step": 11373, "time_per_iteration": 4.1181488037109375 }, { "auxiliary_loss_clip": 0.01142562, "auxiliary_loss_mlp": 0.00088774, "balance_loss_clip": 0.99122751, "balance_loss_mlp": 0.07980987, "epoch": 0.6838418758454833, "flos": 65005928985600.0, "grad_norm": 1.1859871045653942, "language_loss": 0.56307733, "learning_rate": 9.599135332097935e-07, "loss": 0.5753907, "num_input_tokens_seen": 245491620, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.08984375, "step": 11374, "time_per_iteration": 3.2921030521392822 }, { "auxiliary_loss_clip": 0.01341109, "auxiliary_loss_mlp": 0.0021781, "balance_loss_clip": 1.10525179, "balance_loss_mlp": 0.1908807, "epoch": 0.6839019990981512, "flos": 21030833162880.0, "grad_norm": 336.473778664986, "language_loss": 0.82802689, "learning_rate": 9.595808981551312e-07, "loss": 0.84361607, "num_input_tokens_seen": 245511285, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.26928711, "step": 11375, "time_per_iteration": 2.697084426879883 }, { "auxiliary_loss_clip": 0.01324292, "auxiliary_loss_mlp": 0.00215984, "balance_loss_clip": 1.09170496, "balance_loss_mlp": 0.18696821, "epoch": 0.6839621223508192, "flos": 24935543907840.0, "grad_norm": 178.41062312990812, "language_loss": 0.76421791, "learning_rate": 9.592483025532651e-07, "loss": 0.77962071, "num_input_tokens_seen": 245532910, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.29003906, "step": 11376, "time_per_iteration": 2.6964218616485596 }, { "auxiliary_loss_clip": 0.01338336, "auxiliary_loss_mlp": 0.00232872, "balance_loss_clip": 1.09691238, "balance_loss_mlp": 0.20272407, "epoch": 0.6840222456034871, "flos": 26358827391360.0, "grad_norm": 980.3329566996595, "language_loss": 0.81719315, "learning_rate": 9.58915746416808e-07, "loss": 0.83290529, "num_input_tokens_seen": 245550540, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.30151367, "step": 11377, "time_per_iteration": 4.113945960998535 }, { "auxiliary_loss_clip": 0.01142874, "auxiliary_loss_mlp": 0.00102171, "balance_loss_clip": 0.98944914, "balance_loss_mlp": 0.09477954, "epoch": 0.6840823688561551, "flos": 65988336936960.0, "grad_norm": 0.7412685148472757, "language_loss": 0.561064, "learning_rate": 9.585832297583707e-07, "loss": 0.5735144, "num_input_tokens_seen": 245619570, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.07373047, "step": 11378, "time_per_iteration": 3.2214090824127197 }, { "auxiliary_loss_clip": 0.01321601, "auxiliary_loss_mlp": 0.0021634, "balance_loss_clip": 1.08924901, "balance_loss_mlp": 0.18812314, "epoch": 0.684142492108823, "flos": 21397588980480.0, "grad_norm": 57.02778976334184, "language_loss": 0.85029751, "learning_rate": 9.58250752590561e-07, "loss": 0.86567688, "num_input_tokens_seen": 245637980, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.28222656, "step": 11379, "time_per_iteration": 2.640737533569336 }, { "auxiliary_loss_clip": 0.01300038, "auxiliary_loss_mlp": 0.00201014, "balance_loss_clip": 1.08041763, "balance_loss_mlp": 0.17550288, "epoch": 0.6842026153614911, "flos": 18801426700800.0, "grad_norm": 6.837824797734919, "language_loss": 0.77458, "learning_rate": 9.57918314925988e-07, "loss": 0.78959054, "num_input_tokens_seen": 245655690, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25512695, "step": 11380, "time_per_iteration": 2.6612961292266846 }, { "auxiliary_loss_clip": 0.01338528, "auxiliary_loss_mlp": 0.00214643, "balance_loss_clip": 1.10065103, "balance_loss_mlp": 0.1854371, "epoch": 0.684262738614159, "flos": 19646405216640.0, "grad_norm": 232.71335384445288, "language_loss": 0.86598891, "learning_rate": 9.575859167772568e-07, "loss": 0.88152063, "num_input_tokens_seen": 245671525, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.29223633, "step": 11381, "time_per_iteration": 4.029786586761475 }, { "auxiliary_loss_clip": 0.01127161, "auxiliary_loss_mlp": 0.00095351, "balance_loss_clip": 0.97805715, "balance_loss_mlp": 0.08791246, "epoch": 0.684322861866827, "flos": 62354462739840.0, "grad_norm": 0.8723047349986254, "language_loss": 0.66622567, "learning_rate": 9.572535581569713e-07, "loss": 0.67845076, "num_input_tokens_seen": 245724115, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.07421875, "step": 11382, "time_per_iteration": 2.9936907291412354 }, { "auxiliary_loss_clip": 0.01122984, "auxiliary_loss_mlp": 0.00077902, "balance_loss_clip": 0.97599769, "balance_loss_mlp": 0.07060599, "epoch": 0.6843829851194949, "flos": 65805048812160.0, "grad_norm": 0.8142352532481685, "language_loss": 0.57476544, "learning_rate": 9.569212390777356e-07, "loss": 0.58677429, "num_input_tokens_seen": 245789245, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.07275391, "step": 11383, "time_per_iteration": 3.241542100906372 }, { "auxiliary_loss_clip": 0.01303615, "auxiliary_loss_mlp": 0.00201128, "balance_loss_clip": 1.07366729, "balance_loss_mlp": 0.17295843, "epoch": 0.6844431083721629, "flos": 27855153181440.0, "grad_norm": 6.200664654815156, "language_loss": 0.86023569, "learning_rate": 9.565889595521517e-07, "loss": 0.87528312, "num_input_tokens_seen": 245812420, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.28173828, "step": 11384, "time_per_iteration": 2.7454993724823 }, { "auxiliary_loss_clip": 0.01315681, "auxiliary_loss_mlp": 0.00222788, "balance_loss_clip": 1.08185744, "balance_loss_mlp": 0.19458283, "epoch": 0.684503231624831, "flos": 18255010032000.0, "grad_norm": 17.424319530594193, "language_loss": 0.86133575, "learning_rate": 9.562567195928187e-07, "loss": 0.87672043, "num_input_tokens_seen": 245829135, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.28173828, "step": 11385, "time_per_iteration": 2.6432549953460693 }, { "auxiliary_loss_clip": 0.01341802, "auxiliary_loss_mlp": 0.00230963, "balance_loss_clip": 1.09680796, "balance_loss_mlp": 0.19880068, "epoch": 0.6845633548774989, "flos": 17639681120640.0, "grad_norm": 4.049947944977753, "language_loss": 0.9180429, "learning_rate": 9.55924519212335e-07, "loss": 0.93377054, "num_input_tokens_seen": 245847140, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.3215332, "step": 11386, "time_per_iteration": 2.6324450969696045 }, { "auxiliary_loss_clip": 0.01322191, "auxiliary_loss_mlp": 0.00196552, "balance_loss_clip": 1.09165668, "balance_loss_mlp": 0.16945602, "epoch": 0.6846234781301669, "flos": 20807576179200.0, "grad_norm": 291.86929868731335, "language_loss": 0.89135408, "learning_rate": 9.555923584232984e-07, "loss": 0.90654153, "num_input_tokens_seen": 245862855, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.27111816, "step": 11387, "time_per_iteration": 2.7209436893463135 }, { "auxiliary_loss_clip": 0.01315768, "auxiliary_loss_mlp": 0.00227502, "balance_loss_clip": 1.09011602, "balance_loss_mlp": 0.20163396, "epoch": 0.6846836013828348, "flos": 36101176485120.0, "grad_norm": 183.31036655861934, "language_loss": 0.78040683, "learning_rate": 9.552602372383047e-07, "loss": 0.79583955, "num_input_tokens_seen": 245885415, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.25866699, "step": 11388, "time_per_iteration": 2.826472043991089 }, { "auxiliary_loss_clip": 0.01315398, "auxiliary_loss_mlp": 0.00205803, "balance_loss_clip": 1.08315706, "balance_loss_mlp": 0.17999467, "epoch": 0.6847437246355028, "flos": 43142468607360.0, "grad_norm": 65.28604128838909, "language_loss": 0.70094407, "learning_rate": 9.549281556699469e-07, "loss": 0.71615613, "num_input_tokens_seen": 245906285, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.25793457, "step": 11389, "time_per_iteration": 2.8460347652435303 }, { "auxiliary_loss_clip": 0.01126228, "auxiliary_loss_mlp": 0.00052319, "balance_loss_clip": 0.97536141, "balance_loss_mlp": 0.04430835, "epoch": 0.6848038478881707, "flos": 71663729552640.0, "grad_norm": 0.7088352766433441, "language_loss": 0.55422103, "learning_rate": 9.54596113730818e-07, "loss": 0.56600654, "num_input_tokens_seen": 245967620, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.08007812, "step": 11390, "time_per_iteration": 3.2279646396636963 }, { "auxiliary_loss_clip": 0.01300576, "auxiliary_loss_mlp": 0.00203543, "balance_loss_clip": 1.07629502, "balance_loss_mlp": 0.17730513, "epoch": 0.6848639711408387, "flos": 19937820257280.0, "grad_norm": 7.6436062038874955, "language_loss": 0.95745444, "learning_rate": 9.542641114335109e-07, "loss": 0.97249568, "num_input_tokens_seen": 245985075, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26269531, "step": 11391, "time_per_iteration": 2.656298875808716 }, { "auxiliary_loss_clip": 0.01312479, "auxiliary_loss_mlp": 0.00217588, "balance_loss_clip": 1.07865834, "balance_loss_mlp": 0.19114703, "epoch": 0.6849240943935067, "flos": 26867501844480.0, "grad_norm": 10.482021994306256, "language_loss": 0.85160601, "learning_rate": 9.539321487906117e-07, "loss": 0.86690664, "num_input_tokens_seen": 246003560, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.2644043, "step": 11392, "time_per_iteration": 2.705381155014038 }, { "auxiliary_loss_clip": 0.013029, "auxiliary_loss_mlp": 0.0021586, "balance_loss_clip": 1.07577372, "balance_loss_mlp": 0.18891835, "epoch": 0.6849842176461747, "flos": 13735365425280.0, "grad_norm": 51.59990266304212, "language_loss": 0.80846453, "learning_rate": 9.536002258147104e-07, "loss": 0.82365215, "num_input_tokens_seen": 246019600, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26928711, "step": 11393, "time_per_iteration": 2.5993099212646484 }, { "auxiliary_loss_clip": 0.01323789, "auxiliary_loss_mlp": 0.0022222, "balance_loss_clip": 1.08896661, "balance_loss_mlp": 0.19490942, "epoch": 0.6850443408988426, "flos": 24973070641920.0, "grad_norm": 57.42161687920957, "language_loss": 0.72971022, "learning_rate": 9.532683425183936e-07, "loss": 0.74517024, "num_input_tokens_seen": 246038920, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.27294922, "step": 11394, "time_per_iteration": 2.679710865020752 }, { "auxiliary_loss_clip": 0.0129639, "auxiliary_loss_mlp": 0.00228987, "balance_loss_clip": 1.07435131, "balance_loss_mlp": 0.20271346, "epoch": 0.6851044641515106, "flos": 27744225004800.0, "grad_norm": 3.1061484806775246, "language_loss": 0.86327988, "learning_rate": 9.529364989142468e-07, "loss": 0.87853366, "num_input_tokens_seen": 246060490, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.26269531, "step": 11395, "time_per_iteration": 2.7165331840515137 }, { "auxiliary_loss_clip": 0.01307431, "auxiliary_loss_mlp": 0.00226719, "balance_loss_clip": 1.08200693, "balance_loss_mlp": 0.20189922, "epoch": 0.6851645874041785, "flos": 24351061800960.0, "grad_norm": 24.407280164036642, "language_loss": 0.79482216, "learning_rate": 9.526046950148527e-07, "loss": 0.81016362, "num_input_tokens_seen": 246081465, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.24841309, "step": 11396, "time_per_iteration": 2.714905261993408 }, { "auxiliary_loss_clip": 0.01314507, "auxiliary_loss_mlp": 0.00223512, "balance_loss_clip": 1.08423615, "balance_loss_mlp": 0.19533128, "epoch": 0.6852247106568465, "flos": 15077849264640.0, "grad_norm": 7.80191187678357, "language_loss": 0.8897692, "learning_rate": 9.522729308327931e-07, "loss": 0.90514946, "num_input_tokens_seen": 246096110, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.28198242, "step": 11397, "time_per_iteration": 2.6716971397399902 }, { "auxiliary_loss_clip": 0.01305861, "auxiliary_loss_mlp": 0.00220458, "balance_loss_clip": 1.07964933, "balance_loss_mlp": 0.19414845, "epoch": 0.6852848339095146, "flos": 18770005278720.0, "grad_norm": 29.886382012136686, "language_loss": 0.78346556, "learning_rate": 9.519412063806493e-07, "loss": 0.7987287, "num_input_tokens_seen": 246114785, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26318359, "step": 11398, "time_per_iteration": 2.6406800746917725 }, { "auxiliary_loss_clip": 0.01284762, "auxiliary_loss_mlp": 0.00215241, "balance_loss_clip": 1.06599474, "balance_loss_mlp": 0.18907444, "epoch": 0.6853449571621825, "flos": 27854363082240.0, "grad_norm": 7.023816377315614, "language_loss": 0.77568412, "learning_rate": 9.516095216709996e-07, "loss": 0.79068416, "num_input_tokens_seen": 246136375, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26171875, "step": 11399, "time_per_iteration": 2.734245777130127 }, { "auxiliary_loss_clip": 0.01294316, "auxiliary_loss_mlp": 0.00237131, "balance_loss_clip": 1.07242477, "balance_loss_mlp": 0.21109605, "epoch": 0.6854050804148505, "flos": 18150510389760.0, "grad_norm": 155.96721789712078, "language_loss": 0.7929548, "learning_rate": 9.512778767164217e-07, "loss": 0.80826932, "num_input_tokens_seen": 246155090, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.26062012, "step": 11400, "time_per_iteration": 2.639631509780884 }, { "auxiliary_loss_clip": 0.01329642, "auxiliary_loss_mlp": 0.00222714, "balance_loss_clip": 1.08656001, "balance_loss_mlp": 0.19284041, "epoch": 0.6854652036675184, "flos": 16326212492160.0, "grad_norm": 2.311747045619686, "language_loss": 0.87323302, "learning_rate": 9.509462715294927e-07, "loss": 0.88875651, "num_input_tokens_seen": 246172645, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.29882812, "step": 11401, "time_per_iteration": 2.6340627670288086 }, { "auxiliary_loss_clip": 0.01298828, "auxiliary_loss_mlp": 0.00198166, "balance_loss_clip": 1.07182455, "balance_loss_mlp": 0.17179731, "epoch": 0.6855253269201864, "flos": 14940814878720.0, "grad_norm": 3.5644813293294, "language_loss": 0.85316801, "learning_rate": 9.50614706122786e-07, "loss": 0.86813796, "num_input_tokens_seen": 246189055, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26342773, "step": 11402, "time_per_iteration": 2.6001222133636475 }, { "auxiliary_loss_clip": 0.01320984, "auxiliary_loss_mlp": 0.00223365, "balance_loss_clip": 1.08800292, "balance_loss_mlp": 0.19638826, "epoch": 0.6855854501728543, "flos": 23037736826880.0, "grad_norm": 3.1189103137928984, "language_loss": 0.78643751, "learning_rate": 9.502831805088742e-07, "loss": 0.80188107, "num_input_tokens_seen": 246207990, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.26989746, "step": 11403, "time_per_iteration": 2.706217050552368 }, { "auxiliary_loss_clip": 0.01301826, "auxiliary_loss_mlp": 0.00231407, "balance_loss_clip": 1.07848895, "balance_loss_mlp": 0.20493068, "epoch": 0.6856455734255223, "flos": 13253623194240.0, "grad_norm": 18.297737702311643, "language_loss": 0.91324198, "learning_rate": 9.499516947003294e-07, "loss": 0.92857432, "num_input_tokens_seen": 246221595, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.26464844, "step": 11404, "time_per_iteration": 2.633016586303711 }, { "auxiliary_loss_clip": 0.01326881, "auxiliary_loss_mlp": 0.00218202, "balance_loss_clip": 1.09741902, "balance_loss_mlp": 0.19233397, "epoch": 0.6857056966781903, "flos": 23333461499520.0, "grad_norm": 2.2190170737494523, "language_loss": 0.83188021, "learning_rate": 9.496202487097222e-07, "loss": 0.84733099, "num_input_tokens_seen": 246242970, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.25854492, "step": 11405, "time_per_iteration": 2.7015390396118164 }, { "auxiliary_loss_clip": 0.01135956, "auxiliary_loss_mlp": 0.00088138, "balance_loss_clip": 0.98579991, "balance_loss_mlp": 0.07988854, "epoch": 0.6857658199308583, "flos": 61852647784320.0, "grad_norm": 0.7668166622478042, "language_loss": 0.60166395, "learning_rate": 9.492888425496199e-07, "loss": 0.61390489, "num_input_tokens_seen": 246300405, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.08251953, "step": 11406, "time_per_iteration": 3.2171072959899902 }, { "auxiliary_loss_clip": 0.01314297, "auxiliary_loss_mlp": 0.00232053, "balance_loss_clip": 1.08457565, "balance_loss_mlp": 0.20420602, "epoch": 0.6858259431835262, "flos": 16654543735680.0, "grad_norm": 29.30960763978246, "language_loss": 0.85970145, "learning_rate": 9.489574762325907e-07, "loss": 0.87516499, "num_input_tokens_seen": 246318780, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.27844238, "step": 11407, "time_per_iteration": 2.6490190029144287 }, { "auxiliary_loss_clip": 0.01309612, "auxiliary_loss_mlp": 0.00213048, "balance_loss_clip": 1.0779829, "balance_loss_mlp": 0.18409257, "epoch": 0.6858860664361942, "flos": 21872974504320.0, "grad_norm": 729.1861968259099, "language_loss": 0.78635478, "learning_rate": 9.486261497711991e-07, "loss": 0.80158138, "num_input_tokens_seen": 246339405, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.28930664, "step": 11408, "time_per_iteration": 2.7114598751068115 }, { "auxiliary_loss_clip": 0.01312029, "auxiliary_loss_mlp": 0.0022864, "balance_loss_clip": 1.07925069, "balance_loss_mlp": 0.20244947, "epoch": 0.6859461896888621, "flos": 15267637751040.0, "grad_norm": 3.0154352531304083, "language_loss": 0.79146254, "learning_rate": 9.482948631780087e-07, "loss": 0.80686921, "num_input_tokens_seen": 246357055, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.26196289, "step": 11409, "time_per_iteration": 2.670933723449707 }, { "auxiliary_loss_clip": 0.01291838, "auxiliary_loss_mlp": 0.00237765, "balance_loss_clip": 1.07363713, "balance_loss_mlp": 0.21282688, "epoch": 0.6860063129415301, "flos": 18620293392000.0, "grad_norm": 24.142668884263877, "language_loss": 0.82742667, "learning_rate": 9.479636164655825e-07, "loss": 0.84272265, "num_input_tokens_seen": 246374050, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24963379, "step": 11410, "time_per_iteration": 2.655996084213257 }, { "auxiliary_loss_clip": 0.01308851, "auxiliary_loss_mlp": 0.00235948, "balance_loss_clip": 1.07706535, "balance_loss_mlp": 0.20843467, "epoch": 0.6860664361941982, "flos": 23951376190080.0, "grad_norm": 212.0217761035595, "language_loss": 0.79691046, "learning_rate": 9.476324096464821e-07, "loss": 0.8123585, "num_input_tokens_seen": 246392910, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.27502441, "step": 11411, "time_per_iteration": 2.6714401245117188 }, { "auxiliary_loss_clip": 0.01303021, "auxiliary_loss_mlp": 0.00211325, "balance_loss_clip": 1.06847548, "balance_loss_mlp": 0.18220216, "epoch": 0.6861265594468661, "flos": 20407782827520.0, "grad_norm": 57.67596830213775, "language_loss": 0.800192, "learning_rate": 9.473012427332654e-07, "loss": 0.81533551, "num_input_tokens_seen": 246411540, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.29150391, "step": 11412, "time_per_iteration": 2.675710678100586 }, { "auxiliary_loss_clip": 0.0128875, "auxiliary_loss_mlp": 0.00227181, "balance_loss_clip": 1.05943418, "balance_loss_mlp": 0.19932142, "epoch": 0.6861866826995341, "flos": 11428571111040.0, "grad_norm": 23.344048857108735, "language_loss": 0.8084355, "learning_rate": 9.469701157384919e-07, "loss": 0.82359481, "num_input_tokens_seen": 246423295, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.27856445, "step": 11413, "time_per_iteration": 2.5663914680480957 }, { "auxiliary_loss_clip": 0.01297023, "auxiliary_loss_mlp": 0.00241558, "balance_loss_clip": 1.07139754, "balance_loss_mlp": 0.21464014, "epoch": 0.686246805952202, "flos": 15997593939840.0, "grad_norm": 3.7755445038198725, "language_loss": 0.81121528, "learning_rate": 9.466390286747164e-07, "loss": 0.82660103, "num_input_tokens_seen": 246441045, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26953125, "step": 11414, "time_per_iteration": 3.9959309101104736 }, { "auxiliary_loss_clip": 0.01325681, "auxiliary_loss_mlp": 0.00237793, "balance_loss_clip": 1.08960092, "balance_loss_mlp": 0.20825326, "epoch": 0.68630692920487, "flos": 19826712512640.0, "grad_norm": 3.217131902400526, "language_loss": 0.97033232, "learning_rate": 9.46307981554495e-07, "loss": 0.98596704, "num_input_tokens_seen": 246456905, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.29516602, "step": 11415, "time_per_iteration": 4.103572845458984 }, { "auxiliary_loss_clip": 0.01317206, "auxiliary_loss_mlp": 0.00242153, "balance_loss_clip": 1.07944095, "balance_loss_mlp": 0.21392423, "epoch": 0.6863670524575379, "flos": 26286216048000.0, "grad_norm": 151.11963573318604, "language_loss": 0.74313694, "learning_rate": 9.459769743903801e-07, "loss": 0.75873053, "num_input_tokens_seen": 246477545, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.2824707, "step": 11416, "time_per_iteration": 2.684231996536255 }, { "auxiliary_loss_clip": 0.01283374, "auxiliary_loss_mlp": 0.00224961, "balance_loss_clip": 1.05845869, "balance_loss_mlp": 0.19899735, "epoch": 0.686427175710206, "flos": 19173138595200.0, "grad_norm": 67.70781745444148, "language_loss": 0.82942605, "learning_rate": 9.456460071949237e-07, "loss": 0.84450948, "num_input_tokens_seen": 246496705, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.25939941, "step": 11417, "time_per_iteration": 2.689422369003296 }, { "auxiliary_loss_clip": 0.01319676, "auxiliary_loss_mlp": 0.0023757, "balance_loss_clip": 1.0887444, "balance_loss_mlp": 0.21000887, "epoch": 0.6864872989628739, "flos": 18916628595840.0, "grad_norm": 14.130640764940376, "language_loss": 0.85158789, "learning_rate": 9.45315079980678e-07, "loss": 0.86716032, "num_input_tokens_seen": 246514860, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.27575684, "step": 11418, "time_per_iteration": 2.647162914276123 }, { "auxiliary_loss_clip": 0.01317399, "auxiliary_loss_mlp": 0.00211285, "balance_loss_clip": 1.0830307, "balance_loss_mlp": 0.18410495, "epoch": 0.6865474222155419, "flos": 25956196865280.0, "grad_norm": 16.90431747077494, "language_loss": 0.84242475, "learning_rate": 9.449841927601887e-07, "loss": 0.85771155, "num_input_tokens_seen": 246536145, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.27185059, "step": 11419, "time_per_iteration": 4.206003427505493 }, { "auxiliary_loss_clip": 0.01279721, "auxiliary_loss_mlp": 0.00228565, "balance_loss_clip": 1.06157827, "balance_loss_mlp": 0.2047466, "epoch": 0.6866075454682098, "flos": 18478087447680.0, "grad_norm": 3.174319829220743, "language_loss": 0.79345661, "learning_rate": 9.446533455460044e-07, "loss": 0.80853945, "num_input_tokens_seen": 246553265, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.23815918, "step": 11420, "time_per_iteration": 2.6605639457702637 }, { "auxiliary_loss_clip": 0.01296151, "auxiliary_loss_mlp": 0.00238648, "balance_loss_clip": 1.0688616, "balance_loss_mlp": 0.21244612, "epoch": 0.6866676687208778, "flos": 34239998298240.0, "grad_norm": 3.4984242446274476, "language_loss": 0.80891299, "learning_rate": 9.443225383506712e-07, "loss": 0.82426101, "num_input_tokens_seen": 246575130, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.26184082, "step": 11421, "time_per_iteration": 2.7796339988708496 }, { "auxiliary_loss_clip": 0.01285989, "auxiliary_loss_mlp": 0.00215982, "balance_loss_clip": 1.0666883, "balance_loss_mlp": 0.19144903, "epoch": 0.6867277919735457, "flos": 21721754246400.0, "grad_norm": 386.212655751545, "language_loss": 0.83543682, "learning_rate": 9.439917711867338e-07, "loss": 0.85045654, "num_input_tokens_seen": 246593095, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.24560547, "step": 11422, "time_per_iteration": 2.6969165802001953 }, { "auxiliary_loss_clip": 0.01301859, "auxiliary_loss_mlp": 0.00236353, "balance_loss_clip": 1.06897879, "balance_loss_mlp": 0.20629999, "epoch": 0.6867879152262137, "flos": 24097999507200.0, "grad_norm": 2.0561006585753887, "language_loss": 0.83225816, "learning_rate": 9.436610440667334e-07, "loss": 0.84764028, "num_input_tokens_seen": 246612165, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.30053711, "step": 11423, "time_per_iteration": 4.090404272079468 }, { "auxiliary_loss_clip": 0.01301754, "auxiliary_loss_mlp": 0.00242925, "balance_loss_clip": 1.07724702, "balance_loss_mlp": 0.21713987, "epoch": 0.6868480384788818, "flos": 21615818060160.0, "grad_norm": 1237.0640902245777, "language_loss": 0.80364835, "learning_rate": 9.433303570032129e-07, "loss": 0.81909513, "num_input_tokens_seen": 246632065, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.25769043, "step": 11424, "time_per_iteration": 2.666814088821411 }, { "auxiliary_loss_clip": 0.01285303, "auxiliary_loss_mlp": 0.00228496, "balance_loss_clip": 1.06063366, "balance_loss_mlp": 0.20119685, "epoch": 0.6869081617315497, "flos": 26286144220800.0, "grad_norm": 6.957102040376679, "language_loss": 0.72288632, "learning_rate": 9.429997100087112e-07, "loss": 0.73802429, "num_input_tokens_seen": 246651245, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.27294922, "step": 11425, "time_per_iteration": 2.7278475761413574 }, { "auxiliary_loss_clip": 0.01282721, "auxiliary_loss_mlp": 0.00212387, "balance_loss_clip": 1.05600369, "balance_loss_mlp": 0.1871857, "epoch": 0.6869682849842177, "flos": 21105096531840.0, "grad_norm": 4.21643719903377, "language_loss": 0.78518486, "learning_rate": 9.426691030957657e-07, "loss": 0.80013597, "num_input_tokens_seen": 246672225, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.2520752, "step": 11426, "time_per_iteration": 2.659825563430786 }, { "auxiliary_loss_clip": 0.01294055, "auxiliary_loss_mlp": 0.00223845, "balance_loss_clip": 1.06646717, "balance_loss_mlp": 0.1960932, "epoch": 0.6870284082368856, "flos": 17092653920640.0, "grad_norm": 31.160708811583568, "language_loss": 0.92369866, "learning_rate": 9.423385362769136e-07, "loss": 0.9388777, "num_input_tokens_seen": 246688385, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.27758789, "step": 11427, "time_per_iteration": 2.6106672286987305 }, { "auxiliary_loss_clip": 0.01273237, "auxiliary_loss_mlp": 0.00207968, "balance_loss_clip": 1.05358458, "balance_loss_mlp": 0.18322045, "epoch": 0.6870885314895536, "flos": 27308090067840.0, "grad_norm": 10.113187348863718, "language_loss": 0.84098458, "learning_rate": 9.420080095646909e-07, "loss": 0.85579669, "num_input_tokens_seen": 246710730, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24755859, "step": 11428, "time_per_iteration": 2.683837413787842 }, { "auxiliary_loss_clip": 0.01315341, "auxiliary_loss_mlp": 0.00236779, "balance_loss_clip": 1.08288431, "balance_loss_mlp": 0.20752494, "epoch": 0.6871486547422215, "flos": 20814543417600.0, "grad_norm": 20.023143724926324, "language_loss": 0.8116352, "learning_rate": 9.4167752297163e-07, "loss": 0.82715642, "num_input_tokens_seen": 246730350, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.29284668, "step": 11429, "time_per_iteration": 2.707099199295044 }, { "auxiliary_loss_clip": 0.01310961, "auxiliary_loss_mlp": 0.00216918, "balance_loss_clip": 1.07395792, "balance_loss_mlp": 0.18966669, "epoch": 0.6872087779948896, "flos": 30154118330880.0, "grad_norm": 6.7264972139513235, "language_loss": 0.91320485, "learning_rate": 9.413470765102643e-07, "loss": 0.92848361, "num_input_tokens_seen": 246751700, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.27209473, "step": 11430, "time_per_iteration": 2.757507085800171 }, { "auxiliary_loss_clip": 0.01295505, "auxiliary_loss_mlp": 0.00224687, "balance_loss_clip": 1.0705049, "balance_loss_mlp": 0.19819853, "epoch": 0.6872689012475575, "flos": 20704584908160.0, "grad_norm": 25.23189301479444, "language_loss": 0.77919334, "learning_rate": 9.410166701931225e-07, "loss": 0.79439527, "num_input_tokens_seen": 246769860, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.26525879, "step": 11431, "time_per_iteration": 2.654498815536499 }, { "auxiliary_loss_clip": 0.01283948, "auxiliary_loss_mlp": 0.00217167, "balance_loss_clip": 1.06463718, "balance_loss_mlp": 0.19337302, "epoch": 0.6873290245002255, "flos": 25520852027520.0, "grad_norm": 2.8009744688603475, "language_loss": 0.87896317, "learning_rate": 9.406863040327355e-07, "loss": 0.8939743, "num_input_tokens_seen": 246789905, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.23779297, "step": 11432, "time_per_iteration": 2.6764862537384033 }, { "auxiliary_loss_clip": 0.01285565, "auxiliary_loss_mlp": 0.00209901, "balance_loss_clip": 1.06626046, "balance_loss_mlp": 0.18495058, "epoch": 0.6873891477528934, "flos": 25191479289600.0, "grad_norm": 13.98807157182821, "language_loss": 0.73497874, "learning_rate": 9.403559780416295e-07, "loss": 0.74993336, "num_input_tokens_seen": 246808815, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24938965, "step": 11433, "time_per_iteration": 2.7252845764160156 }, { "auxiliary_loss_clip": 0.01297058, "auxiliary_loss_mlp": 0.00220511, "balance_loss_clip": 1.06904054, "balance_loss_mlp": 0.19510713, "epoch": 0.6874492710055614, "flos": 35152380685440.0, "grad_norm": 12.055244355180793, "language_loss": 0.81512797, "learning_rate": 9.400256922323309e-07, "loss": 0.83030367, "num_input_tokens_seen": 246829775, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.25378418, "step": 11434, "time_per_iteration": 2.893059492111206 }, { "auxiliary_loss_clip": 0.01284894, "auxiliary_loss_mlp": 0.00195909, "balance_loss_clip": 1.06240487, "balance_loss_mlp": 0.1710775, "epoch": 0.6875093942582293, "flos": 17822215059840.0, "grad_norm": 20.18970825482068, "language_loss": 0.88963187, "learning_rate": 9.396954466173657e-07, "loss": 0.90443987, "num_input_tokens_seen": 246848045, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.24853516, "step": 11435, "time_per_iteration": 2.663299560546875 }, { "auxiliary_loss_clip": 0.01315451, "auxiliary_loss_mlp": 0.00229667, "balance_loss_clip": 1.08188188, "balance_loss_mlp": 0.20207021, "epoch": 0.6875695175108973, "flos": 20704548994560.0, "grad_norm": 6.7215323853994695, "language_loss": 0.90158951, "learning_rate": 9.393652412092538e-07, "loss": 0.91704065, "num_input_tokens_seen": 246866095, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.27575684, "step": 11436, "time_per_iteration": 2.6802542209625244 }, { "auxiliary_loss_clip": 0.01279014, "auxiliary_loss_mlp": 0.00218049, "balance_loss_clip": 1.05745292, "balance_loss_mlp": 0.19274041, "epoch": 0.6876296407635654, "flos": 25374013228800.0, "grad_norm": 10.996698648074721, "language_loss": 0.87551749, "learning_rate": 9.390350760205183e-07, "loss": 0.89048809, "num_input_tokens_seen": 246883975, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25317383, "step": 11437, "time_per_iteration": 2.677687883377075 }, { "auxiliary_loss_clip": 0.01330847, "auxiliary_loss_mlp": 0.00245151, "balance_loss_clip": 1.0939486, "balance_loss_mlp": 0.21717215, "epoch": 0.6876897640162333, "flos": 23222317841280.0, "grad_norm": 31.871484200819832, "language_loss": 0.86550176, "learning_rate": 9.387049510636793e-07, "loss": 0.88126177, "num_input_tokens_seen": 246901560, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.27966309, "step": 11438, "time_per_iteration": 2.67934250831604 }, { "auxiliary_loss_clip": 0.01276323, "auxiliary_loss_mlp": 0.00240806, "balance_loss_clip": 1.05895817, "balance_loss_mlp": 0.21560492, "epoch": 0.6877498872689013, "flos": 27124335066240.0, "grad_norm": 61.5684580711111, "language_loss": 0.79143393, "learning_rate": 9.383748663512554e-07, "loss": 0.80660522, "num_input_tokens_seen": 246922655, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.2520752, "step": 11439, "time_per_iteration": 2.736802339553833 }, { "auxiliary_loss_clip": 0.01289722, "auxiliary_loss_mlp": 0.00224492, "balance_loss_clip": 1.06664872, "balance_loss_mlp": 0.1988744, "epoch": 0.6878100105215692, "flos": 11581658876160.0, "grad_norm": 9.829076454592645, "language_loss": 0.84273624, "learning_rate": 9.380448218957623e-07, "loss": 0.85787845, "num_input_tokens_seen": 246940100, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.25634766, "step": 11440, "time_per_iteration": 2.688310146331787 }, { "auxiliary_loss_clip": 0.01287884, "auxiliary_loss_mlp": 0.00220178, "balance_loss_clip": 1.06799459, "balance_loss_mlp": 0.19482201, "epoch": 0.6878701337742372, "flos": 20303175444480.0, "grad_norm": 6.577072896244068, "language_loss": 0.78997684, "learning_rate": 9.377148177097167e-07, "loss": 0.80505753, "num_input_tokens_seen": 246958545, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25341797, "step": 11441, "time_per_iteration": 2.6336488723754883 }, { "auxiliary_loss_clip": 0.01318852, "auxiliary_loss_mlp": 0.00238361, "balance_loss_clip": 1.08664632, "balance_loss_mlp": 0.20903541, "epoch": 0.6879302570269051, "flos": 13840080549120.0, "grad_norm": 56.1304409299792, "language_loss": 0.7588312, "learning_rate": 9.373848538056317e-07, "loss": 0.77440333, "num_input_tokens_seen": 246974805, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.29345703, "step": 11442, "time_per_iteration": 2.662461042404175 }, { "auxiliary_loss_clip": 0.01294645, "auxiliary_loss_mlp": 0.00218895, "balance_loss_clip": 1.07156277, "balance_loss_mlp": 0.19415948, "epoch": 0.6879903802795732, "flos": 21324654414720.0, "grad_norm": 13.21480983694487, "language_loss": 0.82287359, "learning_rate": 9.370549301960189e-07, "loss": 0.838009, "num_input_tokens_seen": 246992505, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.24731445, "step": 11443, "time_per_iteration": 2.662371873855591 }, { "auxiliary_loss_clip": 0.01295606, "auxiliary_loss_mlp": 0.00202261, "balance_loss_clip": 1.07200408, "balance_loss_mlp": 0.17597565, "epoch": 0.6880505035322411, "flos": 25152049134720.0, "grad_norm": 25.630543726804504, "language_loss": 0.82421803, "learning_rate": 9.367250468933893e-07, "loss": 0.83919668, "num_input_tokens_seen": 247013370, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.26257324, "step": 11444, "time_per_iteration": 2.719097852706909 }, { "auxiliary_loss_clip": 0.01287885, "auxiliary_loss_mlp": 0.00220638, "balance_loss_clip": 1.06894398, "balance_loss_mlp": 0.19449519, "epoch": 0.6881106267849091, "flos": 23215530170880.0, "grad_norm": 147.1924596783848, "language_loss": 0.86017859, "learning_rate": 9.363952039102536e-07, "loss": 0.87526387, "num_input_tokens_seen": 247029855, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26123047, "step": 11445, "time_per_iteration": 2.6388022899627686 }, { "auxiliary_loss_clip": 0.01129502, "auxiliary_loss_mlp": 0.00062938, "balance_loss_clip": 0.9812699, "balance_loss_mlp": 0.05497492, "epoch": 0.688170750037577, "flos": 48484397312640.0, "grad_norm": 0.7925604633907269, "language_loss": 0.5766995, "learning_rate": 9.360654012591183e-07, "loss": 0.58862388, "num_input_tokens_seen": 247085030, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.07958984, "step": 11446, "time_per_iteration": 3.2068605422973633 }, { "auxiliary_loss_clip": 0.01309983, "auxiliary_loss_mlp": 0.00241424, "balance_loss_clip": 1.08053744, "balance_loss_mlp": 0.21543661, "epoch": 0.688230873290245, "flos": 22783633038720.0, "grad_norm": 419.85862117588727, "language_loss": 0.83841741, "learning_rate": 9.357356389524886e-07, "loss": 0.85393143, "num_input_tokens_seen": 247104840, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.25964355, "step": 11447, "time_per_iteration": 2.7774975299835205 }, { "auxiliary_loss_clip": 0.0131424, "auxiliary_loss_mlp": 0.00228082, "balance_loss_clip": 1.0862906, "balance_loss_mlp": 0.20099753, "epoch": 0.6882909965429129, "flos": 22455660931200.0, "grad_norm": 2.9850701924980143, "language_loss": 0.80482721, "learning_rate": 9.354059170028705e-07, "loss": 0.82025045, "num_input_tokens_seen": 247121905, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.27111816, "step": 11448, "time_per_iteration": 2.727586507797241 }, { "auxiliary_loss_clip": 0.01311303, "auxiliary_loss_mlp": 0.00231982, "balance_loss_clip": 1.08065033, "balance_loss_mlp": 0.20365755, "epoch": 0.688351119795581, "flos": 26214143408640.0, "grad_norm": 29.28177465043976, "language_loss": 0.84857488, "learning_rate": 9.350762354227673e-07, "loss": 0.86400771, "num_input_tokens_seen": 247142375, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.2833252, "step": 11449, "time_per_iteration": 2.759615421295166 }, { "auxiliary_loss_clip": 0.01294732, "auxiliary_loss_mlp": 0.00216645, "balance_loss_clip": 1.07520628, "balance_loss_mlp": 0.1908128, "epoch": 0.6884112430482489, "flos": 22565260304640.0, "grad_norm": 128.36708985493803, "language_loss": 0.8013497, "learning_rate": 9.34746594224679e-07, "loss": 0.81646347, "num_input_tokens_seen": 247161095, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25830078, "step": 11450, "time_per_iteration": 2.6574368476867676 }, { "auxiliary_loss_clip": 0.01335668, "auxiliary_loss_mlp": 0.00240482, "balance_loss_clip": 1.10048592, "balance_loss_mlp": 0.21208677, "epoch": 0.6884713663009169, "flos": 17341047446400.0, "grad_norm": 3.0134411078306163, "language_loss": 0.86955214, "learning_rate": 9.344169934211068e-07, "loss": 0.88531363, "num_input_tokens_seen": 247178565, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.28369141, "step": 11451, "time_per_iteration": 2.6486563682556152 }, { "auxiliary_loss_clip": 0.01332841, "auxiliary_loss_mlp": 0.00227438, "balance_loss_clip": 1.09991384, "balance_loss_mlp": 0.20091403, "epoch": 0.6885314895535849, "flos": 26470832976000.0, "grad_norm": 5.820247415981641, "language_loss": 0.7530145, "learning_rate": 9.340874330245505e-07, "loss": 0.76861727, "num_input_tokens_seen": 247202345, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.26574707, "step": 11452, "time_per_iteration": 2.7194902896881104 }, { "auxiliary_loss_clip": 0.01322129, "auxiliary_loss_mlp": 0.00236674, "balance_loss_clip": 1.09399652, "balance_loss_mlp": 0.21082938, "epoch": 0.6885916128062528, "flos": 20521548178560.0, "grad_norm": 2.580366958045525, "language_loss": 0.79788196, "learning_rate": 9.337579130475042e-07, "loss": 0.81347001, "num_input_tokens_seen": 247219240, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.25805664, "step": 11453, "time_per_iteration": 2.6723976135253906 }, { "auxiliary_loss_clip": 0.01143268, "auxiliary_loss_mlp": 0.00119669, "balance_loss_clip": 0.9980197, "balance_loss_mlp": 0.11132425, "epoch": 0.6886517360589208, "flos": 70715795679360.0, "grad_norm": 0.7631602870708566, "language_loss": 0.50005144, "learning_rate": 9.334284335024644e-07, "loss": 0.51268077, "num_input_tokens_seen": 247272010, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.08349609, "step": 11454, "time_per_iteration": 2.998389720916748 }, { "auxiliary_loss_clip": 0.01296947, "auxiliary_loss_mlp": 0.00220319, "balance_loss_clip": 1.08374417, "balance_loss_mlp": 0.19516611, "epoch": 0.6887118593115887, "flos": 17893533513600.0, "grad_norm": 7.760866431362832, "language_loss": 0.8357693, "learning_rate": 9.330989944019263e-07, "loss": 0.85094196, "num_input_tokens_seen": 247290630, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.25146484, "step": 11455, "time_per_iteration": 2.644127130508423 }, { "auxiliary_loss_clip": 0.01337928, "auxiliary_loss_mlp": 0.00229822, "balance_loss_clip": 1.10098863, "balance_loss_mlp": 0.19913781, "epoch": 0.6887719825642568, "flos": 17453017117440.0, "grad_norm": 51.9491973848988, "language_loss": 0.83988905, "learning_rate": 9.327695957583803e-07, "loss": 0.85556662, "num_input_tokens_seen": 247304800, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.30688477, "step": 11456, "time_per_iteration": 4.009915590286255 }, { "auxiliary_loss_clip": 0.01295133, "auxiliary_loss_mlp": 0.00230052, "balance_loss_clip": 1.07794356, "balance_loss_mlp": 0.20518512, "epoch": 0.6888321058169247, "flos": 23070199743360.0, "grad_norm": 8.070151590330056, "language_loss": 0.87776458, "learning_rate": 9.32440237584319e-07, "loss": 0.8930164, "num_input_tokens_seen": 247323450, "router_z_loss_clip": 2.17285156, "router_z_loss_mlp": 0.24865723, "step": 11457, "time_per_iteration": 2.6713366508483887 }, { "auxiliary_loss_clip": 0.01312592, "auxiliary_loss_mlp": 0.00229157, "balance_loss_clip": 1.08710098, "balance_loss_mlp": 0.20371723, "epoch": 0.6888922290695927, "flos": 23368833417600.0, "grad_norm": 9.651104730989799, "language_loss": 0.86572939, "learning_rate": 9.321109198922301e-07, "loss": 0.88114691, "num_input_tokens_seen": 247343845, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.2545166, "step": 11458, "time_per_iteration": 4.1068079471588135 }, { "auxiliary_loss_clip": 0.01303158, "auxiliary_loss_mlp": 0.00229841, "balance_loss_clip": 1.08345807, "balance_loss_mlp": 0.20225623, "epoch": 0.6889523523222606, "flos": 17631636474240.0, "grad_norm": 5.047134182062818, "language_loss": 0.76095968, "learning_rate": 9.31781642694603e-07, "loss": 0.7762897, "num_input_tokens_seen": 247356650, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.27575684, "step": 11459, "time_per_iteration": 2.6859982013702393 }, { "auxiliary_loss_clip": 0.01323321, "auxiliary_loss_mlp": 0.00226324, "balance_loss_clip": 1.09428346, "balance_loss_mlp": 0.19869165, "epoch": 0.6890124755749286, "flos": 25228144097280.0, "grad_norm": 296.97508855727773, "language_loss": 0.75759238, "learning_rate": 9.314524060039221e-07, "loss": 0.77308881, "num_input_tokens_seen": 247377340, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.27648926, "step": 11460, "time_per_iteration": 2.747553825378418 }, { "auxiliary_loss_clip": 0.01334853, "auxiliary_loss_mlp": 0.00249872, "balance_loss_clip": 1.09870076, "balance_loss_mlp": 0.2182336, "epoch": 0.6890725988275965, "flos": 20230240878720.0, "grad_norm": 10.001968078305456, "language_loss": 0.8805567, "learning_rate": 9.311232098326731e-07, "loss": 0.89640397, "num_input_tokens_seen": 247395805, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.31640625, "step": 11461, "time_per_iteration": 4.19239616394043 }, { "auxiliary_loss_clip": 0.01305923, "auxiliary_loss_mlp": 0.00213396, "balance_loss_clip": 1.07981074, "balance_loss_mlp": 0.18701524, "epoch": 0.6891327220802645, "flos": 14535311264640.0, "grad_norm": 56.64484727732651, "language_loss": 0.7736237, "learning_rate": 9.307940541933401e-07, "loss": 0.78881687, "num_input_tokens_seen": 247413165, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26367188, "step": 11462, "time_per_iteration": 2.6811599731445312 }, { "auxiliary_loss_clip": 0.0131152, "auxiliary_loss_mlp": 0.00238822, "balance_loss_clip": 1.09128034, "balance_loss_mlp": 0.21054545, "epoch": 0.6891928453329325, "flos": 21139139646720.0, "grad_norm": 158.26326199128758, "language_loss": 0.93672299, "learning_rate": 9.304649390984034e-07, "loss": 0.9522264, "num_input_tokens_seen": 247433140, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.28271484, "step": 11463, "time_per_iteration": 2.6819827556610107 }, { "auxiliary_loss_clip": 0.01295716, "auxiliary_loss_mlp": 0.00217814, "balance_loss_clip": 1.07814062, "balance_loss_mlp": 0.19365063, "epoch": 0.6892529685856005, "flos": 17858520731520.0, "grad_norm": 2.3634371562557837, "language_loss": 0.7575919, "learning_rate": 9.301358645603428e-07, "loss": 0.77272725, "num_input_tokens_seen": 247451265, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.24157715, "step": 11464, "time_per_iteration": 2.6504580974578857 }, { "auxiliary_loss_clip": 0.01309307, "auxiliary_loss_mlp": 0.00230408, "balance_loss_clip": 1.08251476, "balance_loss_mlp": 0.20256107, "epoch": 0.6893130918382685, "flos": 29934811843200.0, "grad_norm": 8.248440116904419, "language_loss": 0.75306797, "learning_rate": 9.298068305916373e-07, "loss": 0.76846516, "num_input_tokens_seen": 247471645, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.27844238, "step": 11465, "time_per_iteration": 4.1993982791900635 }, { "auxiliary_loss_clip": 0.01327243, "auxiliary_loss_mlp": 0.00255242, "balance_loss_clip": 1.0944407, "balance_loss_mlp": 0.22555935, "epoch": 0.6893732150909364, "flos": 24388516707840.0, "grad_norm": 3.114898435955582, "language_loss": 0.8127569, "learning_rate": 9.294778372047649e-07, "loss": 0.82858169, "num_input_tokens_seen": 247491170, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.29699707, "step": 11466, "time_per_iteration": 2.6897146701812744 }, { "auxiliary_loss_clip": 0.01314433, "auxiliary_loss_mlp": 0.00228497, "balance_loss_clip": 1.09010816, "balance_loss_mlp": 0.20095937, "epoch": 0.6894333383436044, "flos": 16982874979200.0, "grad_norm": 10.136319153429584, "language_loss": 0.79929268, "learning_rate": 9.291488844121995e-07, "loss": 0.814722, "num_input_tokens_seen": 247509005, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.27539062, "step": 11467, "time_per_iteration": 2.718916416168213 }, { "auxiliary_loss_clip": 0.01322769, "auxiliary_loss_mlp": 0.00233806, "balance_loss_clip": 1.09065104, "balance_loss_mlp": 0.20485018, "epoch": 0.6894934615962723, "flos": 18985540838400.0, "grad_norm": 15.521384417608077, "language_loss": 0.89928752, "learning_rate": 9.288199722264156e-07, "loss": 0.91485322, "num_input_tokens_seen": 247527050, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.28942871, "step": 11468, "time_per_iteration": 2.636523962020874 }, { "auxiliary_loss_clip": 0.01351062, "auxiliary_loss_mlp": 0.00221488, "balance_loss_clip": 1.11380863, "balance_loss_mlp": 0.19181724, "epoch": 0.6895535848489404, "flos": 34531664734080.0, "grad_norm": 15.486063178333204, "language_loss": 0.73255074, "learning_rate": 9.284911006598875e-07, "loss": 0.74827623, "num_input_tokens_seen": 247547765, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.29711914, "step": 11469, "time_per_iteration": 2.81756329536438 }, { "auxiliary_loss_clip": 0.01157977, "auxiliary_loss_mlp": 0.00076395, "balance_loss_clip": 1.01172388, "balance_loss_mlp": 0.06747842, "epoch": 0.6896137081016083, "flos": 50075852273280.0, "grad_norm": 0.7737481553979402, "language_loss": 0.54489303, "learning_rate": 9.281622697250824e-07, "loss": 0.55723679, "num_input_tokens_seen": 247603515, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.08935547, "step": 11470, "time_per_iteration": 3.01958966255188 }, { "auxiliary_loss_clip": 0.01316814, "auxiliary_loss_mlp": 0.00216855, "balance_loss_clip": 1.09283054, "balance_loss_mlp": 0.19161856, "epoch": 0.6896738313542763, "flos": 19938215306880.0, "grad_norm": 13.588526669961501, "language_loss": 0.84462506, "learning_rate": 9.278334794344715e-07, "loss": 0.85996175, "num_input_tokens_seen": 247622110, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.25244141, "step": 11471, "time_per_iteration": 2.662492036819458 }, { "auxiliary_loss_clip": 0.01311183, "auxiliary_loss_mlp": 0.00223916, "balance_loss_clip": 1.0852282, "balance_loss_mlp": 0.19730853, "epoch": 0.6897339546069442, "flos": 21725489260800.0, "grad_norm": 175.97009524285994, "language_loss": 0.86096746, "learning_rate": 9.275047298005232e-07, "loss": 0.87631845, "num_input_tokens_seen": 247641905, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26635742, "step": 11472, "time_per_iteration": 2.6898367404937744 }, { "auxiliary_loss_clip": 0.01317316, "auxiliary_loss_mlp": 0.00226635, "balance_loss_clip": 1.0890255, "balance_loss_mlp": 0.19986045, "epoch": 0.6897940778596122, "flos": 19826497031040.0, "grad_norm": 6.093336395081715, "language_loss": 0.83861232, "learning_rate": 9.271760208357024e-07, "loss": 0.85405183, "num_input_tokens_seen": 247660945, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.26794434, "step": 11473, "time_per_iteration": 2.6662890911102295 }, { "auxiliary_loss_clip": 0.0133379, "auxiliary_loss_mlp": 0.00233931, "balance_loss_clip": 1.0977478, "balance_loss_mlp": 0.20437893, "epoch": 0.6898542011122801, "flos": 17310056987520.0, "grad_norm": 4.394009387243449, "language_loss": 0.82559717, "learning_rate": 9.268473525524751e-07, "loss": 0.84127438, "num_input_tokens_seen": 247678395, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.29541016, "step": 11474, "time_per_iteration": 2.6429173946380615 }, { "auxiliary_loss_clip": 0.01322662, "auxiliary_loss_mlp": 0.0021127, "balance_loss_clip": 1.09119391, "balance_loss_mlp": 0.18342283, "epoch": 0.6899143243649482, "flos": 24754051463040.0, "grad_norm": 37.454293746270615, "language_loss": 0.82711792, "learning_rate": 9.26518724963303e-07, "loss": 0.84245729, "num_input_tokens_seen": 247698380, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.27844238, "step": 11475, "time_per_iteration": 2.789879560470581 }, { "auxiliary_loss_clip": 0.01320289, "auxiliary_loss_mlp": 0.00206093, "balance_loss_clip": 1.08943391, "balance_loss_mlp": 0.17900863, "epoch": 0.6899744476176161, "flos": 17234536642560.0, "grad_norm": 5.201716087866568, "language_loss": 0.96656942, "learning_rate": 9.261901380806491e-07, "loss": 0.98183322, "num_input_tokens_seen": 247716370, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.27075195, "step": 11476, "time_per_iteration": 2.722194194793701 }, { "auxiliary_loss_clip": 0.01291812, "auxiliary_loss_mlp": 0.0022147, "balance_loss_clip": 1.06959844, "balance_loss_mlp": 0.19545884, "epoch": 0.6900345708702841, "flos": 25410678036480.0, "grad_norm": 102.78020507535624, "language_loss": 0.7775588, "learning_rate": 9.258615919169724e-07, "loss": 0.79269165, "num_input_tokens_seen": 247737335, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.26013184, "step": 11477, "time_per_iteration": 2.675520658493042 }, { "auxiliary_loss_clip": 0.01338302, "auxiliary_loss_mlp": 0.00227612, "balance_loss_clip": 1.10480499, "balance_loss_mlp": 0.19969282, "epoch": 0.6900946941229521, "flos": 23434190213760.0, "grad_norm": 15.036246598504665, "language_loss": 0.7715174, "learning_rate": 9.255330864847313e-07, "loss": 0.78717649, "num_input_tokens_seen": 247756680, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.2791748, "step": 11478, "time_per_iteration": 2.748300075531006 }, { "auxiliary_loss_clip": 0.01315724, "auxiliary_loss_mlp": 0.00240406, "balance_loss_clip": 1.08830714, "balance_loss_mlp": 0.21406046, "epoch": 0.69015481737562, "flos": 17820096157440.0, "grad_norm": 3.013988505457007, "language_loss": 0.8462944, "learning_rate": 9.252046217963843e-07, "loss": 0.86185569, "num_input_tokens_seen": 247774265, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.26391602, "step": 11479, "time_per_iteration": 2.7033936977386475 }, { "auxiliary_loss_clip": 0.0132255, "auxiliary_loss_mlp": 0.00232283, "balance_loss_clip": 1.09022689, "balance_loss_mlp": 0.20290965, "epoch": 0.690214940628288, "flos": 17456500736640.0, "grad_norm": 6.321882654911184, "language_loss": 0.87872303, "learning_rate": 9.248761978643856e-07, "loss": 0.89427137, "num_input_tokens_seen": 247792395, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.29345703, "step": 11480, "time_per_iteration": 2.751217842102051 }, { "auxiliary_loss_clip": 0.01294607, "auxiliary_loss_mlp": 0.00235298, "balance_loss_clip": 1.06974864, "balance_loss_mlp": 0.20942938, "epoch": 0.6902750638809559, "flos": 29566691308800.0, "grad_norm": 4.40063162056622, "language_loss": 0.82171911, "learning_rate": 9.245478147011885e-07, "loss": 0.83701813, "num_input_tokens_seen": 247811985, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.25842285, "step": 11481, "time_per_iteration": 2.7223100662231445 }, { "auxiliary_loss_clip": 0.01314608, "auxiliary_loss_mlp": 0.00231216, "balance_loss_clip": 1.08924234, "balance_loss_mlp": 0.20674194, "epoch": 0.690335187133624, "flos": 25557121785600.0, "grad_norm": 8.075145040513439, "language_loss": 0.78740585, "learning_rate": 9.24219472319246e-07, "loss": 0.80286413, "num_input_tokens_seen": 247831880, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.24487305, "step": 11482, "time_per_iteration": 2.720813035964966 }, { "auxiliary_loss_clip": 0.01314744, "auxiliary_loss_mlp": 0.00254791, "balance_loss_clip": 1.08359361, "balance_loss_mlp": 0.22732529, "epoch": 0.6903953103862919, "flos": 22488447070080.0, "grad_norm": 5.8900817641606436, "language_loss": 0.88369155, "learning_rate": 9.238911707310096e-07, "loss": 0.89938688, "num_input_tokens_seen": 247851170, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.27478027, "step": 11483, "time_per_iteration": 2.697443723678589 }, { "auxiliary_loss_clip": 0.01306334, "auxiliary_loss_mlp": 0.00242448, "balance_loss_clip": 1.08919477, "balance_loss_mlp": 0.21654424, "epoch": 0.6904554336389599, "flos": 26100521712000.0, "grad_norm": 8.037775621594268, "language_loss": 0.73514903, "learning_rate": 9.235629099489273e-07, "loss": 0.75063682, "num_input_tokens_seen": 247868950, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25939941, "step": 11484, "time_per_iteration": 2.718510389328003 }, { "auxiliary_loss_clip": 0.01290372, "auxiliary_loss_mlp": 0.0024684, "balance_loss_clip": 1.06971526, "balance_loss_mlp": 0.21884987, "epoch": 0.6905155568916278, "flos": 31171754545920.0, "grad_norm": 10.921594418241574, "language_loss": 0.797225, "learning_rate": 9.232346899854479e-07, "loss": 0.81259716, "num_input_tokens_seen": 247889805, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.27990723, "step": 11485, "time_per_iteration": 2.7111570835113525 }, { "auxiliary_loss_clip": 0.01302909, "auxiliary_loss_mlp": 0.00252427, "balance_loss_clip": 1.07802296, "balance_loss_mlp": 0.22528341, "epoch": 0.6905756801442958, "flos": 17639681120640.0, "grad_norm": 3.499777026043928, "language_loss": 0.91353023, "learning_rate": 9.22906510853017e-07, "loss": 0.92908359, "num_input_tokens_seen": 247908585, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.27111816, "step": 11486, "time_per_iteration": 2.7135114669799805 }, { "auxiliary_loss_clip": 0.01317215, "auxiliary_loss_mlp": 0.00249888, "balance_loss_clip": 1.09068191, "balance_loss_mlp": 0.22365046, "epoch": 0.6906358033969637, "flos": 22343691260160.0, "grad_norm": 12.651417772532204, "language_loss": 0.7960971, "learning_rate": 9.225783725640786e-07, "loss": 0.81176811, "num_input_tokens_seen": 247928480, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.26269531, "step": 11487, "time_per_iteration": 2.708281993865967 }, { "auxiliary_loss_clip": 0.01122203, "auxiliary_loss_mlp": 0.00068168, "balance_loss_clip": 0.9795078, "balance_loss_mlp": 0.06125404, "epoch": 0.6906959266496318, "flos": 69747789081600.0, "grad_norm": 0.8590823011672896, "language_loss": 0.6569944, "learning_rate": 9.222502751310759e-07, "loss": 0.66889811, "num_input_tokens_seen": 247988855, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.06933594, "step": 11488, "time_per_iteration": 3.1898462772369385 }, { "auxiliary_loss_clip": 0.01320258, "auxiliary_loss_mlp": 0.00295142, "balance_loss_clip": 1.08697355, "balance_loss_mlp": 0.2672351, "epoch": 0.6907560499022997, "flos": 21434253788160.0, "grad_norm": 904.8165761070784, "language_loss": 0.85116631, "learning_rate": 9.219222185664519e-07, "loss": 0.8673203, "num_input_tokens_seen": 248007685, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.27880859, "step": 11489, "time_per_iteration": 2.6812314987182617 }, { "auxiliary_loss_clip": 0.01308019, "auxiliary_loss_mlp": 0.00273204, "balance_loss_clip": 1.08388162, "balance_loss_mlp": 0.24565518, "epoch": 0.6908161731549677, "flos": 14392207480320.0, "grad_norm": 19.325312891714866, "language_loss": 0.72146189, "learning_rate": 9.215942028826445e-07, "loss": 0.73727405, "num_input_tokens_seen": 248025145, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.2755127, "step": 11490, "time_per_iteration": 2.628631591796875 }, { "auxiliary_loss_clip": 0.01304875, "auxiliary_loss_mlp": 0.00248708, "balance_loss_clip": 1.08219779, "balance_loss_mlp": 0.22336459, "epoch": 0.6908762964076357, "flos": 20010970304640.0, "grad_norm": 6.866291061733434, "language_loss": 0.77613151, "learning_rate": 9.212662280920937e-07, "loss": 0.79166734, "num_input_tokens_seen": 248043750, "router_z_loss_clip": 2.22558594, "router_z_loss_mlp": 0.25341797, "step": 11491, "time_per_iteration": 2.650214195251465 }, { "auxiliary_loss_clip": 0.01306502, "auxiliary_loss_mlp": 0.00263167, "balance_loss_clip": 1.08157241, "balance_loss_mlp": 0.23478284, "epoch": 0.6909364196603036, "flos": 28769079853440.0, "grad_norm": 6.9653540392790765, "language_loss": 0.75853342, "learning_rate": 9.20938294207235e-07, "loss": 0.77423012, "num_input_tokens_seen": 248065765, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.28417969, "step": 11492, "time_per_iteration": 2.7249984741210938 }, { "auxiliary_loss_clip": 0.01326675, "auxiliary_loss_mlp": 0.00255226, "balance_loss_clip": 1.09049749, "balance_loss_mlp": 0.22586495, "epoch": 0.6909965429129716, "flos": 22528128620160.0, "grad_norm": 9.119265070271943, "language_loss": 0.82883114, "learning_rate": 9.206104012405049e-07, "loss": 0.84465015, "num_input_tokens_seen": 248083810, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.2935791, "step": 11493, "time_per_iteration": 2.7061524391174316 }, { "auxiliary_loss_clip": 0.01307963, "auxiliary_loss_mlp": 0.00265394, "balance_loss_clip": 1.08132935, "balance_loss_mlp": 0.23745167, "epoch": 0.6910566661656395, "flos": 18405942981120.0, "grad_norm": 45.33405724894164, "language_loss": 0.81686914, "learning_rate": 9.20282549204336e-07, "loss": 0.83260268, "num_input_tokens_seen": 248103185, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.27929688, "step": 11494, "time_per_iteration": 2.6435508728027344 }, { "auxiliary_loss_clip": 0.01314605, "auxiliary_loss_mlp": 0.00279742, "balance_loss_clip": 1.08554626, "balance_loss_mlp": 0.25089356, "epoch": 0.6911167894183076, "flos": 30773972355840.0, "grad_norm": 26.104037383112463, "language_loss": 0.76974726, "learning_rate": 9.19954738111161e-07, "loss": 0.78569067, "num_input_tokens_seen": 248125665, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.28881836, "step": 11495, "time_per_iteration": 2.7480101585388184 }, { "auxiliary_loss_clip": 0.01305114, "auxiliary_loss_mlp": 0.00293233, "balance_loss_clip": 1.08226109, "balance_loss_mlp": 0.26647067, "epoch": 0.6911769126709755, "flos": 13735724561280.0, "grad_norm": 50.2413971594535, "language_loss": 0.82752627, "learning_rate": 9.196269679734119e-07, "loss": 0.84350979, "num_input_tokens_seen": 248142545, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26745605, "step": 11496, "time_per_iteration": 2.658576250076294 }, { "auxiliary_loss_clip": 0.01292189, "auxiliary_loss_mlp": 0.00256694, "balance_loss_clip": 1.07615983, "balance_loss_mlp": 0.23021811, "epoch": 0.6912370359236435, "flos": 17566854295680.0, "grad_norm": 26.61395978688353, "language_loss": 0.87774134, "learning_rate": 9.19299238803515e-07, "loss": 0.89323014, "num_input_tokens_seen": 248160225, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.26452637, "step": 11497, "time_per_iteration": 2.663674831390381 }, { "auxiliary_loss_clip": 0.01324258, "auxiliary_loss_mlp": 0.00259775, "balance_loss_clip": 1.099823, "balance_loss_mlp": 0.23302434, "epoch": 0.6912971591763114, "flos": 22090772620800.0, "grad_norm": 30.570038290386222, "language_loss": 0.88344979, "learning_rate": 9.189715506138993e-07, "loss": 0.89929014, "num_input_tokens_seen": 248180430, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.26745605, "step": 11498, "time_per_iteration": 2.700890064239502 }, { "auxiliary_loss_clip": 0.012983, "auxiliary_loss_mlp": 0.00263596, "balance_loss_clip": 1.08118415, "balance_loss_mlp": 0.23865694, "epoch": 0.6913572824289794, "flos": 29971476650880.0, "grad_norm": 8.679129564184658, "language_loss": 0.91112882, "learning_rate": 9.186439034169915e-07, "loss": 0.9267478, "num_input_tokens_seen": 248202365, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24951172, "step": 11499, "time_per_iteration": 4.13749098777771 }, { "auxiliary_loss_clip": 0.01292083, "auxiliary_loss_mlp": 0.00264741, "balance_loss_clip": 1.07734871, "balance_loss_mlp": 0.23995745, "epoch": 0.6914174056816473, "flos": 20448936835200.0, "grad_norm": 6.366804741517456, "language_loss": 0.82265496, "learning_rate": 9.183162972252145e-07, "loss": 0.83822322, "num_input_tokens_seen": 248221750, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.24755859, "step": 11500, "time_per_iteration": 4.1396262645721436 }, { "auxiliary_loss_clip": 0.01304076, "auxiliary_loss_mlp": 0.00288907, "balance_loss_clip": 1.07701707, "balance_loss_mlp": 0.26126266, "epoch": 0.6914775289343154, "flos": 21282530739840.0, "grad_norm": 9.237510588595658, "language_loss": 0.85505563, "learning_rate": 9.179887320509921e-07, "loss": 0.87098545, "num_input_tokens_seen": 248239535, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.27636719, "step": 11501, "time_per_iteration": 2.7013654708862305 }, { "auxiliary_loss_clip": 0.01299511, "auxiliary_loss_mlp": 0.00279798, "balance_loss_clip": 1.07636714, "balance_loss_mlp": 0.25081828, "epoch": 0.6915376521869833, "flos": 23878118401920.0, "grad_norm": 6.181522988266709, "language_loss": 0.81134921, "learning_rate": 9.176612079067458e-07, "loss": 0.8271423, "num_input_tokens_seen": 248259055, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.29003906, "step": 11502, "time_per_iteration": 2.6507809162139893 }, { "auxiliary_loss_clip": 0.01321349, "auxiliary_loss_mlp": 0.00271048, "balance_loss_clip": 1.09201598, "balance_loss_mlp": 0.24218714, "epoch": 0.6915977754396513, "flos": 11510268595200.0, "grad_norm": 12.07756844021101, "language_loss": 0.84570503, "learning_rate": 9.173337248048953e-07, "loss": 0.86162901, "num_input_tokens_seen": 248276765, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.28857422, "step": 11503, "time_per_iteration": 2.6662471294403076 }, { "auxiliary_loss_clip": 0.01306542, "auxiliary_loss_mlp": 0.0023097, "balance_loss_clip": 1.07951152, "balance_loss_mlp": 0.2057333, "epoch": 0.6916578986923193, "flos": 22601278667520.0, "grad_norm": 118.05737838361628, "language_loss": 0.86023146, "learning_rate": 9.170062827578575e-07, "loss": 0.8756066, "num_input_tokens_seen": 248295310, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.25231934, "step": 11504, "time_per_iteration": 4.190938234329224 }, { "auxiliary_loss_clip": 0.01332385, "auxiliary_loss_mlp": 0.00280069, "balance_loss_clip": 1.10184383, "balance_loss_mlp": 0.25159025, "epoch": 0.6917180219449872, "flos": 23477355383040.0, "grad_norm": 27.94622750539555, "language_loss": 0.81244922, "learning_rate": 9.166788817780499e-07, "loss": 0.8285737, "num_input_tokens_seen": 248315230, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.28479004, "step": 11505, "time_per_iteration": 2.6970748901367188 }, { "auxiliary_loss_clip": 0.01289587, "auxiliary_loss_mlp": 0.00276031, "balance_loss_clip": 1.0713104, "balance_loss_mlp": 0.25105637, "epoch": 0.6917781451976552, "flos": 23732536579200.0, "grad_norm": 15.363115713337606, "language_loss": 0.94425434, "learning_rate": 9.163515218778886e-07, "loss": 0.95991051, "num_input_tokens_seen": 248332980, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25, "step": 11506, "time_per_iteration": 2.7147128582000732 }, { "auxiliary_loss_clip": 0.01303077, "auxiliary_loss_mlp": 0.00269282, "balance_loss_clip": 1.08090591, "balance_loss_mlp": 0.24341348, "epoch": 0.6918382684503231, "flos": 31466760946560.0, "grad_norm": 35.6326349563974, "language_loss": 0.82166803, "learning_rate": 9.160242030697856e-07, "loss": 0.83739161, "num_input_tokens_seen": 248352865, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25878906, "step": 11507, "time_per_iteration": 4.168775796890259 }, { "auxiliary_loss_clip": 0.0127711, "auxiliary_loss_mlp": 0.00265851, "balance_loss_clip": 1.06028605, "balance_loss_mlp": 0.24130537, "epoch": 0.6918983917029912, "flos": 21650471706240.0, "grad_norm": 3.2057044630949285, "language_loss": 0.84432274, "learning_rate": 9.156969253661538e-07, "loss": 0.85975236, "num_input_tokens_seen": 248371125, "router_z_loss_clip": 2.16894531, "router_z_loss_mlp": 0.24536133, "step": 11508, "time_per_iteration": 2.646319627761841 }, { "auxiliary_loss_clip": 0.01276492, "auxiliary_loss_mlp": 0.00283192, "balance_loss_clip": 1.06356227, "balance_loss_mlp": 0.25825295, "epoch": 0.6919585149556591, "flos": 25550082720000.0, "grad_norm": 16.330980557234295, "language_loss": 0.80053985, "learning_rate": 9.153696887794027e-07, "loss": 0.81613672, "num_input_tokens_seen": 248390455, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.24963379, "step": 11509, "time_per_iteration": 2.7215075492858887 }, { "auxiliary_loss_clip": 0.01286562, "auxiliary_loss_mlp": 0.00273405, "balance_loss_clip": 1.07140517, "balance_loss_mlp": 0.24677365, "epoch": 0.6920186382083271, "flos": 23659781581440.0, "grad_norm": 4.149489660433431, "language_loss": 0.73249662, "learning_rate": 9.150424933219425e-07, "loss": 0.74809635, "num_input_tokens_seen": 248411305, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.26611328, "step": 11510, "time_per_iteration": 2.7804603576660156 }, { "auxiliary_loss_clip": 0.01308722, "auxiliary_loss_mlp": 0.00246921, "balance_loss_clip": 1.08261693, "balance_loss_mlp": 0.22056374, "epoch": 0.692078761460995, "flos": 19061959023360.0, "grad_norm": 7.677232079248114, "language_loss": 0.84240043, "learning_rate": 9.147153390061788e-07, "loss": 0.85795683, "num_input_tokens_seen": 248430190, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26342773, "step": 11511, "time_per_iteration": 2.764129877090454 }, { "auxiliary_loss_clip": 0.01308477, "auxiliary_loss_mlp": 0.00259838, "balance_loss_clip": 1.08615541, "balance_loss_mlp": 0.23200309, "epoch": 0.692138884713663, "flos": 29023291382400.0, "grad_norm": 3.126908786534533, "language_loss": 0.71768296, "learning_rate": 9.143882258445184e-07, "loss": 0.73336613, "num_input_tokens_seen": 248450830, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.27844238, "step": 11512, "time_per_iteration": 2.688924551010132 }, { "auxiliary_loss_clip": 0.01309872, "auxiliary_loss_mlp": 0.00265953, "balance_loss_clip": 1.08624554, "balance_loss_mlp": 0.24007289, "epoch": 0.6921990079663309, "flos": 14757849976320.0, "grad_norm": 2.71942559268146, "language_loss": 0.92555869, "learning_rate": 9.140611538493666e-07, "loss": 0.9413169, "num_input_tokens_seen": 248468585, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25878906, "step": 11513, "time_per_iteration": 2.6588733196258545 }, { "auxiliary_loss_clip": 0.0128917, "auxiliary_loss_mlp": 0.00237328, "balance_loss_clip": 1.07374787, "balance_loss_mlp": 0.21498813, "epoch": 0.692259131218999, "flos": 23841848643840.0, "grad_norm": 377.7788695046926, "language_loss": 0.83747172, "learning_rate": 9.137341230331233e-07, "loss": 0.85273671, "num_input_tokens_seen": 248490535, "router_z_loss_clip": 2.15527344, "router_z_loss_mlp": 0.22363281, "step": 11514, "time_per_iteration": 2.6802823543548584 }, { "auxiliary_loss_clip": 0.0132334, "auxiliary_loss_mlp": 0.00257487, "balance_loss_clip": 1.08943009, "balance_loss_mlp": 0.23084328, "epoch": 0.6923192544716669, "flos": 19135073157120.0, "grad_norm": 85.9287146165235, "language_loss": 0.837098, "learning_rate": 9.134071334081907e-07, "loss": 0.85290629, "num_input_tokens_seen": 248508575, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.26611328, "step": 11515, "time_per_iteration": 2.669571876525879 }, { "auxiliary_loss_clip": 0.01273745, "auxiliary_loss_mlp": 0.00229127, "balance_loss_clip": 1.06588495, "balance_loss_mlp": 0.2062266, "epoch": 0.6923793777243349, "flos": 28074639237120.0, "grad_norm": 158.26266522000012, "language_loss": 0.62308425, "learning_rate": 9.130801849869694e-07, "loss": 0.6381129, "num_input_tokens_seen": 248527025, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.22900391, "step": 11516, "time_per_iteration": 2.690290689468384 }, { "auxiliary_loss_clip": 0.0128466, "auxiliary_loss_mlp": 0.0027538, "balance_loss_clip": 1.07294619, "balance_loss_mlp": 0.25063232, "epoch": 0.6924395009770029, "flos": 16581250033920.0, "grad_norm": 7.2642672588330255, "language_loss": 0.82342666, "learning_rate": 9.127532777818557e-07, "loss": 0.83902717, "num_input_tokens_seen": 248544275, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.24755859, "step": 11517, "time_per_iteration": 2.6340889930725098 }, { "auxiliary_loss_clip": 0.01310751, "auxiliary_loss_mlp": 0.00274762, "balance_loss_clip": 1.08347082, "balance_loss_mlp": 0.2475944, "epoch": 0.6924996242296708, "flos": 16655297921280.0, "grad_norm": 1297.328809777429, "language_loss": 0.83356225, "learning_rate": 9.124264118052465e-07, "loss": 0.84941733, "num_input_tokens_seen": 248561870, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.27197266, "step": 11518, "time_per_iteration": 2.609740734100342 }, { "auxiliary_loss_clip": 0.01323837, "auxiliary_loss_mlp": 0.00284965, "balance_loss_clip": 1.08927631, "balance_loss_mlp": 0.25659341, "epoch": 0.6925597474823388, "flos": 34754167532160.0, "grad_norm": 14.075376235934375, "language_loss": 0.72459018, "learning_rate": 9.120995870695376e-07, "loss": 0.74067819, "num_input_tokens_seen": 248588190, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.28344727, "step": 11519, "time_per_iteration": 2.8003461360931396 }, { "auxiliary_loss_clip": 0.01293514, "auxiliary_loss_mlp": 0.00272941, "balance_loss_clip": 1.07369697, "balance_loss_mlp": 0.24825232, "epoch": 0.6926198707350067, "flos": 21871717528320.0, "grad_norm": 19.650124046436865, "language_loss": 0.70870024, "learning_rate": 9.117728035871212e-07, "loss": 0.72436476, "num_input_tokens_seen": 248606460, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24694824, "step": 11520, "time_per_iteration": 2.696540594100952 }, { "auxiliary_loss_clip": 0.01318114, "auxiliary_loss_mlp": 0.00291636, "balance_loss_clip": 1.08332229, "balance_loss_mlp": 0.26209545, "epoch": 0.6926799939876748, "flos": 13006271162880.0, "grad_norm": 100.2489245075827, "language_loss": 0.85879612, "learning_rate": 9.114460613703887e-07, "loss": 0.87489367, "num_input_tokens_seen": 248623715, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.29541016, "step": 11521, "time_per_iteration": 2.6229183673858643 }, { "auxiliary_loss_clip": 0.01295731, "auxiliary_loss_mlp": 0.00282723, "balance_loss_clip": 1.07679379, "balance_loss_mlp": 0.25777262, "epoch": 0.6927401172403427, "flos": 16761234107520.0, "grad_norm": 85.52897290668318, "language_loss": 0.89349103, "learning_rate": 9.111193604317304e-07, "loss": 0.90927553, "num_input_tokens_seen": 248640575, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24951172, "step": 11522, "time_per_iteration": 2.7441492080688477 }, { "auxiliary_loss_clip": 0.01285943, "auxiliary_loss_mlp": 0.00285793, "balance_loss_clip": 1.07009864, "balance_loss_mlp": 0.26010299, "epoch": 0.6928002404930107, "flos": 25705648523520.0, "grad_norm": 3.1386390383044516, "language_loss": 0.81322896, "learning_rate": 9.107927007835361e-07, "loss": 0.82894635, "num_input_tokens_seen": 248663535, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25708008, "step": 11523, "time_per_iteration": 2.7605044841766357 }, { "auxiliary_loss_clip": 0.01283857, "auxiliary_loss_mlp": 0.00283119, "balance_loss_clip": 1.07013512, "balance_loss_mlp": 0.25901496, "epoch": 0.6928603637456786, "flos": 18588261438720.0, "grad_norm": 22.09733384874968, "language_loss": 0.74744618, "learning_rate": 9.104660824381915e-07, "loss": 0.76311594, "num_input_tokens_seen": 248681125, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24133301, "step": 11524, "time_per_iteration": 2.702228307723999 }, { "auxiliary_loss_clip": 0.01321411, "auxiliary_loss_mlp": 0.00268831, "balance_loss_clip": 1.09485483, "balance_loss_mlp": 0.24236678, "epoch": 0.6929204869983466, "flos": 22200874784640.0, "grad_norm": 911.1556551453043, "language_loss": 0.73478997, "learning_rate": 9.101395054080815e-07, "loss": 0.75069237, "num_input_tokens_seen": 248700555, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26477051, "step": 11525, "time_per_iteration": 2.675136089324951 }, { "auxiliary_loss_clip": 0.01301136, "auxiliary_loss_mlp": 0.00283094, "balance_loss_clip": 1.07711029, "balance_loss_mlp": 0.25701076, "epoch": 0.6929806102510145, "flos": 17894754576000.0, "grad_norm": 64.10652842486756, "language_loss": 0.80345589, "learning_rate": 9.098129697055907e-07, "loss": 0.81929815, "num_input_tokens_seen": 248716095, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.26086426, "step": 11526, "time_per_iteration": 2.826284885406494 }, { "auxiliary_loss_clip": 0.012791, "auxiliary_loss_mlp": 0.00286869, "balance_loss_clip": 1.06456995, "balance_loss_mlp": 0.26266918, "epoch": 0.6930407335036826, "flos": 19755178577280.0, "grad_norm": 13.27052534398285, "language_loss": 0.82914507, "learning_rate": 9.094864753431022e-07, "loss": 0.84480482, "num_input_tokens_seen": 248735330, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.24206543, "step": 11527, "time_per_iteration": 2.687304735183716 }, { "auxiliary_loss_clip": 0.01301908, "auxiliary_loss_mlp": 0.00281144, "balance_loss_clip": 1.08015049, "balance_loss_mlp": 0.25470322, "epoch": 0.6931008567563505, "flos": 21544248211200.0, "grad_norm": 376.6884635237403, "language_loss": 0.86169451, "learning_rate": 9.091600223329952e-07, "loss": 0.87752503, "num_input_tokens_seen": 248754530, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26464844, "step": 11528, "time_per_iteration": 2.6772499084472656 }, { "auxiliary_loss_clip": 0.01265581, "auxiliary_loss_mlp": 0.00255385, "balance_loss_clip": 1.05433106, "balance_loss_mlp": 0.2320078, "epoch": 0.6931609800090185, "flos": 26250018117120.0, "grad_norm": 97.99514989415712, "language_loss": 0.80891138, "learning_rate": 9.088336106876491e-07, "loss": 0.824121, "num_input_tokens_seen": 248775825, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.23364258, "step": 11529, "time_per_iteration": 2.6894993782043457 }, { "auxiliary_loss_clip": 0.01281166, "auxiliary_loss_mlp": 0.00280905, "balance_loss_clip": 1.06849575, "balance_loss_mlp": 0.25667021, "epoch": 0.6932211032616865, "flos": 32343376366080.0, "grad_norm": 2.5734612003580066, "language_loss": 0.80186015, "learning_rate": 9.085072404194436e-07, "loss": 0.81748086, "num_input_tokens_seen": 248796180, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.24243164, "step": 11530, "time_per_iteration": 2.770237684249878 }, { "auxiliary_loss_clip": 0.01306827, "auxiliary_loss_mlp": 0.00280672, "balance_loss_clip": 1.08110881, "balance_loss_mlp": 0.25328997, "epoch": 0.6932812265143544, "flos": 22049079909120.0, "grad_norm": 3.8436117916387373, "language_loss": 0.85055965, "learning_rate": 9.081809115407513e-07, "loss": 0.86643469, "num_input_tokens_seen": 248814735, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.27355957, "step": 11531, "time_per_iteration": 2.6862869262695312 }, { "auxiliary_loss_clip": 0.01287135, "auxiliary_loss_mlp": 0.00261949, "balance_loss_clip": 1.07043123, "balance_loss_mlp": 0.23895399, "epoch": 0.6933413497670224, "flos": 26256626219520.0, "grad_norm": 12.230624014864734, "language_loss": 0.75474524, "learning_rate": 9.078546240639484e-07, "loss": 0.77023613, "num_input_tokens_seen": 248839140, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.2298584, "step": 11532, "time_per_iteration": 2.769974708557129 }, { "auxiliary_loss_clip": 0.01304991, "auxiliary_loss_mlp": 0.00286053, "balance_loss_clip": 1.08096123, "balance_loss_mlp": 0.25972, "epoch": 0.6934014730196904, "flos": 19573003774080.0, "grad_norm": 10.123490041728155, "language_loss": 0.74096489, "learning_rate": 9.075283780014082e-07, "loss": 0.75687534, "num_input_tokens_seen": 248858300, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26342773, "step": 11533, "time_per_iteration": 2.680494546890259 }, { "auxiliary_loss_clip": 0.01300122, "auxiliary_loss_mlp": 0.00290187, "balance_loss_clip": 1.07447779, "balance_loss_mlp": 0.26434252, "epoch": 0.6934615962723584, "flos": 22119249127680.0, "grad_norm": 16.665870622920476, "language_loss": 0.70214134, "learning_rate": 9.072021733655007e-07, "loss": 0.71804446, "num_input_tokens_seen": 248876310, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.25866699, "step": 11534, "time_per_iteration": 2.6382734775543213 }, { "auxiliary_loss_clip": 0.01312722, "auxiliary_loss_mlp": 0.00276576, "balance_loss_clip": 1.08582258, "balance_loss_mlp": 0.24992032, "epoch": 0.6935217195250263, "flos": 21360816432000.0, "grad_norm": 19.84737164819511, "language_loss": 0.81443524, "learning_rate": 9.068760101685971e-07, "loss": 0.83032823, "num_input_tokens_seen": 248895650, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26660156, "step": 11535, "time_per_iteration": 2.70357608795166 }, { "auxiliary_loss_clip": 0.01203329, "auxiliary_loss_mlp": 0.00129336, "balance_loss_clip": 1.05727029, "balance_loss_mlp": 0.12089604, "epoch": 0.6935818427776943, "flos": 64063813115520.0, "grad_norm": 0.70360812406411, "language_loss": 0.58621407, "learning_rate": 9.065498884230638e-07, "loss": 0.59954071, "num_input_tokens_seen": 248963920, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.08447266, "step": 11536, "time_per_iteration": 3.2685277462005615 }, { "auxiliary_loss_clip": 0.01315174, "auxiliary_loss_mlp": 0.00287459, "balance_loss_clip": 1.08594334, "balance_loss_mlp": 0.25982636, "epoch": 0.6936419660303622, "flos": 20302564913280.0, "grad_norm": 9.723397091747158, "language_loss": 0.79783762, "learning_rate": 9.062238081412692e-07, "loss": 0.81386387, "num_input_tokens_seen": 248983380, "router_z_loss_clip": 2.29394531, "router_z_loss_mlp": 0.27624512, "step": 11537, "time_per_iteration": 2.682908058166504 }, { "auxiliary_loss_clip": 0.01206323, "auxiliary_loss_mlp": 0.00111312, "balance_loss_clip": 1.06132936, "balance_loss_mlp": 0.10339644, "epoch": 0.6937020892830302, "flos": 67182581347200.0, "grad_norm": 0.7723814349692734, "language_loss": 0.55247545, "learning_rate": 9.058977693355767e-07, "loss": 0.56565177, "num_input_tokens_seen": 249044680, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.07910156, "step": 11538, "time_per_iteration": 3.1421449184417725 }, { "auxiliary_loss_clip": 0.01265707, "auxiliary_loss_mlp": 0.00246089, "balance_loss_clip": 1.05723953, "balance_loss_mlp": 0.22438097, "epoch": 0.6937622125356981, "flos": 23878190229120.0, "grad_norm": 140.7407802381563, "language_loss": 0.82710326, "learning_rate": 9.055717720183505e-07, "loss": 0.84222126, "num_input_tokens_seen": 249061060, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.21716309, "step": 11539, "time_per_iteration": 2.6570234298706055 }, { "auxiliary_loss_clip": 0.0127501, "auxiliary_loss_mlp": 0.0027453, "balance_loss_clip": 1.05974507, "balance_loss_mlp": 0.25052142, "epoch": 0.6938223357883662, "flos": 28730619365760.0, "grad_norm": 36.40069699376993, "language_loss": 0.7194488, "learning_rate": 9.05245816201953e-07, "loss": 0.73494422, "num_input_tokens_seen": 249081430, "router_z_loss_clip": 2.15527344, "router_z_loss_mlp": 0.2401123, "step": 11540, "time_per_iteration": 2.7153890132904053 }, { "auxiliary_loss_clip": 0.01284998, "auxiliary_loss_mlp": 0.00265338, "balance_loss_clip": 1.07114089, "balance_loss_mlp": 0.24018469, "epoch": 0.6938824590410341, "flos": 28655027193600.0, "grad_norm": 31.679835958077117, "language_loss": 0.92464095, "learning_rate": 9.049199018987437e-07, "loss": 0.9401443, "num_input_tokens_seen": 249103020, "router_z_loss_clip": 2.13964844, "router_z_loss_mlp": 0.25158691, "step": 11541, "time_per_iteration": 4.12665581703186 }, { "auxiliary_loss_clip": 0.01292333, "auxiliary_loss_mlp": 0.00288158, "balance_loss_clip": 1.07033074, "balance_loss_mlp": 0.26273036, "epoch": 0.6939425822937021, "flos": 18983062800000.0, "grad_norm": 4.500303178715709, "language_loss": 0.91832942, "learning_rate": 9.04594029121081e-07, "loss": 0.93413436, "num_input_tokens_seen": 249120810, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25415039, "step": 11542, "time_per_iteration": 2.6150009632110596 }, { "auxiliary_loss_clip": 0.01313657, "auxiliary_loss_mlp": 0.00298177, "balance_loss_clip": 1.08547032, "balance_loss_mlp": 0.27019882, "epoch": 0.6940027055463701, "flos": 23075838178560.0, "grad_norm": 9.674469223226875, "language_loss": 0.82523537, "learning_rate": 9.04268197881323e-07, "loss": 0.84135377, "num_input_tokens_seen": 249138050, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.2800293, "step": 11543, "time_per_iteration": 4.138035774230957 }, { "auxiliary_loss_clip": 0.01287896, "auxiliary_loss_mlp": 0.00272683, "balance_loss_clip": 1.06964254, "balance_loss_mlp": 0.24726713, "epoch": 0.694062828799038, "flos": 18186564666240.0, "grad_norm": 8.504606202653616, "language_loss": 0.81592315, "learning_rate": 9.039424081918241e-07, "loss": 0.8315289, "num_input_tokens_seen": 249155570, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25427246, "step": 11544, "time_per_iteration": 2.66316819190979 }, { "auxiliary_loss_clip": 0.01287591, "auxiliary_loss_mlp": 0.00278593, "balance_loss_clip": 1.06696463, "balance_loss_mlp": 0.25456005, "epoch": 0.694122952051706, "flos": 17821532701440.0, "grad_norm": 38.40593630046677, "language_loss": 0.79157138, "learning_rate": 9.036166600649388e-07, "loss": 0.80723321, "num_input_tokens_seen": 249172960, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24035645, "step": 11545, "time_per_iteration": 2.6780457496643066 }, { "auxiliary_loss_clip": 0.01274024, "auxiliary_loss_mlp": 0.00243569, "balance_loss_clip": 1.06284881, "balance_loss_mlp": 0.22001363, "epoch": 0.694183075304374, "flos": 21215306436480.0, "grad_norm": 11.921560979595178, "language_loss": 0.8603459, "learning_rate": 9.0329095351302e-07, "loss": 0.87552184, "num_input_tokens_seen": 249192450, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.23535156, "step": 11546, "time_per_iteration": 4.195306777954102 }, { "auxiliary_loss_clip": 0.0126741, "auxiliary_loss_mlp": 0.00288273, "balance_loss_clip": 1.05986381, "balance_loss_mlp": 0.2655876, "epoch": 0.694243198557042, "flos": 24060508686720.0, "grad_norm": 11.416251023040264, "language_loss": 0.84371287, "learning_rate": 9.029652885484194e-07, "loss": 0.85926974, "num_input_tokens_seen": 249214320, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.22692871, "step": 11547, "time_per_iteration": 2.754603385925293 }, { "auxiliary_loss_clip": 0.01265616, "auxiliary_loss_mlp": 0.00265915, "balance_loss_clip": 1.05521727, "balance_loss_mlp": 0.24283652, "epoch": 0.6943033218097099, "flos": 21141869080320.0, "grad_norm": 11.7015599488726, "language_loss": 0.89103782, "learning_rate": 9.026396651834834e-07, "loss": 0.90635312, "num_input_tokens_seen": 249230925, "router_z_loss_clip": 2.10253906, "router_z_loss_mlp": 0.23071289, "step": 11548, "time_per_iteration": 2.7024788856506348 }, { "auxiliary_loss_clip": 0.01168362, "auxiliary_loss_mlp": 0.0018477, "balance_loss_clip": 1.02625871, "balance_loss_mlp": 0.17404091, "epoch": 0.6943634450623779, "flos": 57812015975040.0, "grad_norm": 0.6778900724852694, "language_loss": 0.52989888, "learning_rate": 9.023140834305613e-07, "loss": 0.54343021, "num_input_tokens_seen": 249293975, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10742188, "step": 11549, "time_per_iteration": 4.53424334526062 }, { "auxiliary_loss_clip": 0.01289753, "auxiliary_loss_mlp": 0.00286806, "balance_loss_clip": 1.06772852, "balance_loss_mlp": 0.26010299, "epoch": 0.6944235683150458, "flos": 30590684231040.0, "grad_norm": 16.492252485559327, "language_loss": 0.80389428, "learning_rate": 9.01988543302e-07, "loss": 0.81965989, "num_input_tokens_seen": 249315285, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.26708984, "step": 11550, "time_per_iteration": 2.75132155418396 }, { "auxiliary_loss_clip": 0.01300146, "auxiliary_loss_mlp": 0.00264391, "balance_loss_clip": 1.07709289, "balance_loss_mlp": 0.23833185, "epoch": 0.6944836915677138, "flos": 19719447523200.0, "grad_norm": 159.66552169303074, "language_loss": 0.82757831, "learning_rate": 9.016630448101425e-07, "loss": 0.84322369, "num_input_tokens_seen": 249333505, "router_z_loss_clip": 2.22949219, "router_z_loss_mlp": 0.26049805, "step": 11551, "time_per_iteration": 2.67439603805542 }, { "auxiliary_loss_clip": 0.0128735, "auxiliary_loss_mlp": 0.00279959, "balance_loss_clip": 1.07017446, "balance_loss_mlp": 0.2549485, "epoch": 0.6945438148203817, "flos": 24863579009280.0, "grad_norm": 8.868287160742073, "language_loss": 0.9062829, "learning_rate": 9.01337587967333e-07, "loss": 0.921956, "num_input_tokens_seen": 249354180, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.25, "step": 11552, "time_per_iteration": 2.7214698791503906 }, { "auxiliary_loss_clip": 0.01272683, "auxiliary_loss_mlp": 0.00268299, "balance_loss_clip": 1.06061316, "balance_loss_mlp": 0.2441356, "epoch": 0.6946039380730498, "flos": 33326646243840.0, "grad_norm": 26.468980376466835, "language_loss": 0.74093509, "learning_rate": 9.010121727859117e-07, "loss": 0.75634491, "num_input_tokens_seen": 249377035, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.24182129, "step": 11553, "time_per_iteration": 2.7499876022338867 }, { "auxiliary_loss_clip": 0.01324134, "auxiliary_loss_mlp": 0.002553, "balance_loss_clip": 1.09325838, "balance_loss_mlp": 0.2276554, "epoch": 0.6946640613257177, "flos": 20850956830080.0, "grad_norm": 8.778987608949201, "language_loss": 0.86931819, "learning_rate": 9.006867992782195e-07, "loss": 0.88511252, "num_input_tokens_seen": 249396155, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.27648926, "step": 11554, "time_per_iteration": 2.6144003868103027 }, { "auxiliary_loss_clip": 0.01285157, "auxiliary_loss_mlp": 0.00266138, "balance_loss_clip": 1.06744599, "balance_loss_mlp": 0.24130671, "epoch": 0.6947241845783857, "flos": 19354846521600.0, "grad_norm": 34.265753755812305, "language_loss": 0.82078964, "learning_rate": 9.003614674565934e-07, "loss": 0.83630258, "num_input_tokens_seen": 249414555, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.24829102, "step": 11555, "time_per_iteration": 2.642665147781372 }, { "auxiliary_loss_clip": 0.01282588, "auxiliary_loss_mlp": 0.00278824, "balance_loss_clip": 1.0636183, "balance_loss_mlp": 0.25328934, "epoch": 0.6947843078310536, "flos": 27120240915840.0, "grad_norm": 12.786429228100987, "language_loss": 0.86000997, "learning_rate": 9.000361773333705e-07, "loss": 0.87562406, "num_input_tokens_seen": 249433570, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25500488, "step": 11556, "time_per_iteration": 2.6957011222839355 }, { "auxiliary_loss_clip": 0.0129113, "auxiliary_loss_mlp": 0.00271873, "balance_loss_clip": 1.06796217, "balance_loss_mlp": 0.24594478, "epoch": 0.6948444310837216, "flos": 28585109370240.0, "grad_norm": 612.184728107731, "language_loss": 0.71128988, "learning_rate": 8.997109289208869e-07, "loss": 0.72691989, "num_input_tokens_seen": 249453735, "router_z_loss_clip": 2.23144531, "router_z_loss_mlp": 0.25915527, "step": 11557, "time_per_iteration": 2.7108612060546875 }, { "auxiliary_loss_clip": 0.01277492, "auxiliary_loss_mlp": 0.00260633, "balance_loss_clip": 1.06406939, "balance_loss_mlp": 0.23643313, "epoch": 0.6949045543363896, "flos": 15669262696320.0, "grad_norm": 4.37388067809844, "language_loss": 0.93432152, "learning_rate": 8.993857222314752e-07, "loss": 0.94970274, "num_input_tokens_seen": 249470805, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.24194336, "step": 11558, "time_per_iteration": 2.6287479400634766 }, { "auxiliary_loss_clip": 0.01296031, "auxiliary_loss_mlp": 0.00253619, "balance_loss_clip": 1.07310212, "balance_loss_mlp": 0.22853768, "epoch": 0.6949646775890576, "flos": 23259413612160.0, "grad_norm": 36.06726203903377, "language_loss": 0.76844513, "learning_rate": 8.990605572774664e-07, "loss": 0.78394163, "num_input_tokens_seen": 249491150, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.25061035, "step": 11559, "time_per_iteration": 2.70719313621521 }, { "auxiliary_loss_clip": 0.01281943, "auxiliary_loss_mlp": 0.00257209, "balance_loss_clip": 1.06521749, "balance_loss_mlp": 0.23385634, "epoch": 0.6950248008417256, "flos": 22382546797440.0, "grad_norm": 15.232175611036396, "language_loss": 0.87534386, "learning_rate": 8.987354340711921e-07, "loss": 0.89073539, "num_input_tokens_seen": 249511560, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.23352051, "step": 11560, "time_per_iteration": 2.667597532272339 }, { "auxiliary_loss_clip": 0.01277952, "auxiliary_loss_mlp": 0.00229896, "balance_loss_clip": 1.0630455, "balance_loss_mlp": 0.20462318, "epoch": 0.6950849240943935, "flos": 23477355383040.0, "grad_norm": 433.98191344196493, "language_loss": 0.83370721, "learning_rate": 8.9841035262498e-07, "loss": 0.84878564, "num_input_tokens_seen": 249531910, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.25280762, "step": 11561, "time_per_iteration": 2.6866745948791504 }, { "auxiliary_loss_clip": 0.01290082, "auxiliary_loss_mlp": 0.0027791, "balance_loss_clip": 1.07095909, "balance_loss_mlp": 0.25175595, "epoch": 0.6951450473470615, "flos": 17420554200960.0, "grad_norm": 5.033589334401177, "language_loss": 0.87175465, "learning_rate": 8.980853129511577e-07, "loss": 0.88743448, "num_input_tokens_seen": 249550300, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.26159668, "step": 11562, "time_per_iteration": 2.628737688064575 }, { "auxiliary_loss_clip": 0.01299517, "auxiliary_loss_mlp": 0.00284839, "balance_loss_clip": 1.07518899, "balance_loss_mlp": 0.25749287, "epoch": 0.6952051705997294, "flos": 20485745297280.0, "grad_norm": 44.8587900534598, "language_loss": 0.7716617, "learning_rate": 8.977603150620515e-07, "loss": 0.78750515, "num_input_tokens_seen": 249567740, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.2734375, "step": 11563, "time_per_iteration": 2.630321502685547 }, { "auxiliary_loss_clip": 0.0127929, "auxiliary_loss_mlp": 0.00266027, "balance_loss_clip": 1.06417322, "balance_loss_mlp": 0.24168466, "epoch": 0.6952652938523974, "flos": 13989541040640.0, "grad_norm": 9.724799628348563, "language_loss": 0.8135947, "learning_rate": 8.974353589699846e-07, "loss": 0.82904792, "num_input_tokens_seen": 249582700, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.2434082, "step": 11564, "time_per_iteration": 2.659019947052002 }, { "auxiliary_loss_clip": 0.01333262, "auxiliary_loss_mlp": 0.00262656, "balance_loss_clip": 1.09225178, "balance_loss_mlp": 0.23323521, "epoch": 0.6953254171050653, "flos": 30953956429440.0, "grad_norm": 49.5064919801215, "language_loss": 0.80632931, "learning_rate": 8.971104446872785e-07, "loss": 0.82228851, "num_input_tokens_seen": 249602920, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.29418945, "step": 11565, "time_per_iteration": 2.7365176677703857 }, { "auxiliary_loss_clip": 0.01099798, "auxiliary_loss_mlp": 0.00097539, "balance_loss_clip": 0.96075493, "balance_loss_mlp": 0.08938526, "epoch": 0.6953855403577334, "flos": 61670257499520.0, "grad_norm": 0.9686945605819843, "language_loss": 0.57513863, "learning_rate": 8.96785572226255e-07, "loss": 0.58711201, "num_input_tokens_seen": 249660400, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08154297, "step": 11566, "time_per_iteration": 3.0226728916168213 }, { "auxiliary_loss_clip": 0.01313287, "auxiliary_loss_mlp": 0.00257598, "balance_loss_clip": 1.08098078, "balance_loss_mlp": 0.23032275, "epoch": 0.6954456636104013, "flos": 23039029716480.0, "grad_norm": 9.856553428060108, "language_loss": 0.85986859, "learning_rate": 8.964607415992338e-07, "loss": 0.87557745, "num_input_tokens_seen": 249679335, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.27294922, "step": 11567, "time_per_iteration": 2.664696455001831 }, { "auxiliary_loss_clip": 0.0130707, "auxiliary_loss_mlp": 0.00268994, "balance_loss_clip": 1.08264685, "balance_loss_mlp": 0.24350691, "epoch": 0.6955057868630693, "flos": 23918518224000.0, "grad_norm": 14.01189704846884, "language_loss": 0.81899816, "learning_rate": 8.961359528185313e-07, "loss": 0.83475876, "num_input_tokens_seen": 249701805, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.25476074, "step": 11568, "time_per_iteration": 2.829275608062744 }, { "auxiliary_loss_clip": 0.01307153, "auxiliary_loss_mlp": 0.0023615, "balance_loss_clip": 1.07965851, "balance_loss_mlp": 0.21245071, "epoch": 0.6955659101157372, "flos": 22594634651520.0, "grad_norm": 9.20240329755627, "language_loss": 0.79655039, "learning_rate": 8.958112058964649e-07, "loss": 0.81198347, "num_input_tokens_seen": 249720550, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.23706055, "step": 11569, "time_per_iteration": 2.7245430946350098 }, { "auxiliary_loss_clip": 0.0127551, "auxiliary_loss_mlp": 0.002605, "balance_loss_clip": 1.0589509, "balance_loss_mlp": 0.23528756, "epoch": 0.6956260333684052, "flos": 24572523104640.0, "grad_norm": 22.565521571802737, "language_loss": 0.8379181, "learning_rate": 8.954865008453471e-07, "loss": 0.85327816, "num_input_tokens_seen": 249740325, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.2520752, "step": 11570, "time_per_iteration": 2.708599805831909 }, { "auxiliary_loss_clip": 0.01289488, "auxiliary_loss_mlp": 0.00272472, "balance_loss_clip": 1.07106805, "balance_loss_mlp": 0.24648434, "epoch": 0.6956861566210732, "flos": 25846058787840.0, "grad_norm": 8.536648881124693, "language_loss": 0.81551456, "learning_rate": 8.95161837677493e-07, "loss": 0.8311342, "num_input_tokens_seen": 249760570, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.2598877, "step": 11571, "time_per_iteration": 2.7924716472625732 }, { "auxiliary_loss_clip": 0.01261003, "auxiliary_loss_mlp": 0.00238926, "balance_loss_clip": 1.05816984, "balance_loss_mlp": 0.21640712, "epoch": 0.6957462798737412, "flos": 15301393557120.0, "grad_norm": 646.7788151917458, "language_loss": 0.82709289, "learning_rate": 8.948372164052118e-07, "loss": 0.84209216, "num_input_tokens_seen": 249778290, "router_z_loss_clip": 2.02832031, "router_z_loss_mlp": 0.22521973, "step": 11572, "time_per_iteration": 2.7000858783721924 }, { "auxiliary_loss_clip": 0.01274207, "auxiliary_loss_mlp": 0.00264139, "balance_loss_clip": 1.05746484, "balance_loss_mlp": 0.23601753, "epoch": 0.6958064031264092, "flos": 36246830135040.0, "grad_norm": 32.21237181777202, "language_loss": 0.77558964, "learning_rate": 8.94512637040814e-07, "loss": 0.79097313, "num_input_tokens_seen": 249800925, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.28112793, "step": 11573, "time_per_iteration": 2.8843913078308105 }, { "auxiliary_loss_clip": 0.01311631, "auxiliary_loss_mlp": 0.0027946, "balance_loss_clip": 1.0895741, "balance_loss_mlp": 0.25168464, "epoch": 0.6958665263790771, "flos": 19208725994880.0, "grad_norm": 74.57106615894776, "language_loss": 0.82960743, "learning_rate": 8.941880995966095e-07, "loss": 0.84551835, "num_input_tokens_seen": 249820500, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.27783203, "step": 11574, "time_per_iteration": 2.7093420028686523 }, { "auxiliary_loss_clip": 0.01286083, "auxiliary_loss_mlp": 0.00223072, "balance_loss_clip": 1.06405556, "balance_loss_mlp": 0.19654818, "epoch": 0.6959266496317451, "flos": 21795838047360.0, "grad_norm": 9926.656857561158, "language_loss": 0.81979674, "learning_rate": 8.938636040849014e-07, "loss": 0.83488834, "num_input_tokens_seen": 249839845, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.26538086, "step": 11575, "time_per_iteration": 2.7470943927764893 }, { "auxiliary_loss_clip": 0.01294164, "auxiliary_loss_mlp": 0.0026411, "balance_loss_clip": 1.0732286, "balance_loss_mlp": 0.23714501, "epoch": 0.695986772884413, "flos": 20558248899840.0, "grad_norm": 21.37178932750013, "language_loss": 0.87346935, "learning_rate": 8.935391505179966e-07, "loss": 0.88905215, "num_input_tokens_seen": 249857400, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26989746, "step": 11576, "time_per_iteration": 2.641602039337158 }, { "auxiliary_loss_clip": 0.01316012, "auxiliary_loss_mlp": 0.0029725, "balance_loss_clip": 1.07997561, "balance_loss_mlp": 0.26962882, "epoch": 0.696046896137081, "flos": 14936217937920.0, "grad_norm": 200.11487955809358, "language_loss": 0.67937887, "learning_rate": 8.932147389081985e-07, "loss": 0.69551152, "num_input_tokens_seen": 249871645, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.27624512, "step": 11577, "time_per_iteration": 2.6142749786376953 }, { "auxiliary_loss_clip": 0.01286297, "auxiliary_loss_mlp": 0.00260182, "balance_loss_clip": 1.07233906, "balance_loss_mlp": 0.23514792, "epoch": 0.696107019389749, "flos": 30740216549760.0, "grad_norm": 44.69229931361354, "language_loss": 0.81236827, "learning_rate": 8.928903692678081e-07, "loss": 0.82783306, "num_input_tokens_seen": 249894215, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.25024414, "step": 11578, "time_per_iteration": 2.768449068069458 }, { "auxiliary_loss_clip": 0.01297751, "auxiliary_loss_mlp": 0.00259658, "balance_loss_clip": 1.0767889, "balance_loss_mlp": 0.23345554, "epoch": 0.696167142642417, "flos": 20776729374720.0, "grad_norm": 1.976882549499725, "language_loss": 0.88561594, "learning_rate": 8.925660416091254e-07, "loss": 0.90119004, "num_input_tokens_seen": 249912850, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26208496, "step": 11579, "time_per_iteration": 2.6877195835113525 }, { "auxiliary_loss_clip": 0.01296159, "auxiliary_loss_mlp": 0.00262058, "balance_loss_clip": 1.07404017, "balance_loss_mlp": 0.23458028, "epoch": 0.6962272658950849, "flos": 22565152563840.0, "grad_norm": 831.7374460481063, "language_loss": 0.80521083, "learning_rate": 8.922417559444502e-07, "loss": 0.82079297, "num_input_tokens_seen": 249932650, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.27478027, "step": 11580, "time_per_iteration": 2.710693836212158 }, { "auxiliary_loss_clip": 0.01310908, "auxiliary_loss_mlp": 0.00255526, "balance_loss_clip": 1.08205593, "balance_loss_mlp": 0.22753549, "epoch": 0.6962873891477529, "flos": 22200156512640.0, "grad_norm": 13.644704821110995, "language_loss": 0.7474578, "learning_rate": 8.919175122860787e-07, "loss": 0.76312214, "num_input_tokens_seen": 249951205, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27978516, "step": 11581, "time_per_iteration": 2.664170026779175 }, { "auxiliary_loss_clip": 0.01280841, "auxiliary_loss_mlp": 0.00243112, "balance_loss_clip": 1.06221437, "balance_loss_mlp": 0.2187573, "epoch": 0.6963475124004208, "flos": 12489695717760.0, "grad_norm": 2.335573893877376, "language_loss": 0.82963055, "learning_rate": 8.915933106463056e-07, "loss": 0.84487009, "num_input_tokens_seen": 249967045, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24328613, "step": 11582, "time_per_iteration": 2.640676975250244 }, { "auxiliary_loss_clip": 0.01294347, "auxiliary_loss_mlp": 0.0025676, "balance_loss_clip": 1.07182372, "balance_loss_mlp": 0.23172641, "epoch": 0.6964076356530888, "flos": 17165085696000.0, "grad_norm": 9.739230513551702, "language_loss": 0.77081335, "learning_rate": 8.91269151037425e-07, "loss": 0.78632438, "num_input_tokens_seen": 249984565, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25036621, "step": 11583, "time_per_iteration": 4.093315839767456 }, { "auxiliary_loss_clip": 0.012916, "auxiliary_loss_mlp": 0.00235724, "balance_loss_clip": 1.07445979, "balance_loss_mlp": 0.20997484, "epoch": 0.6964677589057569, "flos": 19937317466880.0, "grad_norm": 7.3255550116584605, "language_loss": 0.90564007, "learning_rate": 8.909450334717301e-07, "loss": 0.92091334, "num_input_tokens_seen": 250004235, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25756836, "step": 11584, "time_per_iteration": 2.695594310760498 }, { "auxiliary_loss_clip": 0.01318985, "auxiliary_loss_mlp": 0.00255685, "balance_loss_clip": 1.09014976, "balance_loss_mlp": 0.22753954, "epoch": 0.6965278821584248, "flos": 22784064001920.0, "grad_norm": 122.1428154674752, "language_loss": 0.89815623, "learning_rate": 8.906209579615107e-07, "loss": 0.913903, "num_input_tokens_seen": 250017645, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.28112793, "step": 11585, "time_per_iteration": 4.143929719924927 }, { "auxiliary_loss_clip": 0.01286089, "auxiliary_loss_mlp": 0.00238227, "balance_loss_clip": 1.07045984, "balance_loss_mlp": 0.21324088, "epoch": 0.6965880054110928, "flos": 20047563285120.0, "grad_norm": 2.3369090410392466, "language_loss": 0.86410159, "learning_rate": 8.90296924519055e-07, "loss": 0.8793447, "num_input_tokens_seen": 250037640, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.25012207, "step": 11586, "time_per_iteration": 2.645862102508545 }, { "auxiliary_loss_clip": 0.01269011, "auxiliary_loss_mlp": 0.002872, "balance_loss_clip": 1.05791426, "balance_loss_mlp": 0.26068833, "epoch": 0.6966481286637607, "flos": 21908238681600.0, "grad_norm": 5417.557564947026, "language_loss": 0.86292994, "learning_rate": 8.899729331566519e-07, "loss": 0.87849212, "num_input_tokens_seen": 250056490, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.26513672, "step": 11587, "time_per_iteration": 2.7160658836364746 }, { "auxiliary_loss_clip": 0.01275381, "auxiliary_loss_mlp": 0.0022482, "balance_loss_clip": 1.06502223, "balance_loss_mlp": 0.20012023, "epoch": 0.6967082519164287, "flos": 15633172506240.0, "grad_norm": 28.13757085737976, "language_loss": 0.8198297, "learning_rate": 8.896489838865857e-07, "loss": 0.83483171, "num_input_tokens_seen": 250074285, "router_z_loss_clip": 2.10644531, "router_z_loss_mlp": 0.24694824, "step": 11588, "time_per_iteration": 4.143243312835693 }, { "auxiliary_loss_clip": 0.01286577, "auxiliary_loss_mlp": 0.00246185, "balance_loss_clip": 1.06497145, "balance_loss_mlp": 0.22102007, "epoch": 0.6967683751690966, "flos": 24024598064640.0, "grad_norm": 16.86093753578077, "language_loss": 0.83650494, "learning_rate": 8.893250767211413e-07, "loss": 0.85183263, "num_input_tokens_seen": 250093350, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.25170898, "step": 11589, "time_per_iteration": 2.705061674118042 }, { "auxiliary_loss_clip": 0.01309524, "auxiliary_loss_mlp": 0.00264188, "balance_loss_clip": 1.08133912, "balance_loss_mlp": 0.23746106, "epoch": 0.6968284984217646, "flos": 31024700265600.0, "grad_norm": 42.006055150064, "language_loss": 0.71743941, "learning_rate": 8.890012116726012e-07, "loss": 0.73317659, "num_input_tokens_seen": 250114170, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.26745605, "step": 11590, "time_per_iteration": 2.799466133117676 }, { "auxiliary_loss_clip": 0.01112646, "auxiliary_loss_mlp": 0.00087347, "balance_loss_clip": 0.97337615, "balance_loss_mlp": 0.07938349, "epoch": 0.6968886216744326, "flos": 67622990002560.0, "grad_norm": 110.62832568099462, "language_loss": 0.60842067, "learning_rate": 8.88677388753248e-07, "loss": 0.62042063, "num_input_tokens_seen": 250178250, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.07958984, "step": 11591, "time_per_iteration": 3.2509963512420654 }, { "auxiliary_loss_clip": 0.01311449, "auxiliary_loss_mlp": 0.00241282, "balance_loss_clip": 1.08770108, "balance_loss_mlp": 0.2136018, "epoch": 0.6969487449271006, "flos": 24863686750080.0, "grad_norm": 10.892331944954357, "language_loss": 0.78043956, "learning_rate": 8.883536079753582e-07, "loss": 0.7959668, "num_input_tokens_seen": 250198420, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.27709961, "step": 11592, "time_per_iteration": 4.15174412727356 }, { "auxiliary_loss_clip": 0.01297378, "auxiliary_loss_mlp": 0.00238522, "balance_loss_clip": 1.08008456, "balance_loss_mlp": 0.21355975, "epoch": 0.6970088681797685, "flos": 28767858791040.0, "grad_norm": 5.046033818867354, "language_loss": 0.70891917, "learning_rate": 8.880298693512109e-07, "loss": 0.72427821, "num_input_tokens_seen": 250220650, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25, "step": 11593, "time_per_iteration": 2.825120210647583 }, { "auxiliary_loss_clip": 0.01290676, "auxiliary_loss_mlp": 0.00232736, "balance_loss_clip": 1.07197762, "balance_loss_mlp": 0.20736852, "epoch": 0.6970689914324365, "flos": 27308556944640.0, "grad_norm": 7.171725400821005, "language_loss": 0.62179899, "learning_rate": 8.877061728930832e-07, "loss": 0.6370331, "num_input_tokens_seen": 250241750, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25378418, "step": 11594, "time_per_iteration": 2.7425458431243896 }, { "auxiliary_loss_clip": 0.01282157, "auxiliary_loss_mlp": 0.00223667, "balance_loss_clip": 1.06572127, "balance_loss_mlp": 0.19840702, "epoch": 0.6971291146851044, "flos": 19136258305920.0, "grad_norm": 52.191303744747515, "language_loss": 0.85527509, "learning_rate": 8.87382518613248e-07, "loss": 0.87033331, "num_input_tokens_seen": 250259445, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.25256348, "step": 11595, "time_per_iteration": 2.747504234313965 }, { "auxiliary_loss_clip": 0.01321353, "auxiliary_loss_mlp": 0.00256172, "balance_loss_clip": 1.08694196, "balance_loss_mlp": 0.22796667, "epoch": 0.6971892379377724, "flos": 14610508387200.0, "grad_norm": 37.43101383925927, "language_loss": 0.81434894, "learning_rate": 8.870589065239793e-07, "loss": 0.83012414, "num_input_tokens_seen": 250275640, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.28198242, "step": 11596, "time_per_iteration": 2.595853090286255 }, { "auxiliary_loss_clip": 0.01311298, "auxiliary_loss_mlp": 0.0025316, "balance_loss_clip": 1.0851264, "balance_loss_mlp": 0.22668314, "epoch": 0.6972493611904405, "flos": 22307457415680.0, "grad_norm": 3.9015003814296065, "language_loss": 0.83983612, "learning_rate": 8.867353366375492e-07, "loss": 0.85548067, "num_input_tokens_seen": 250296435, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26464844, "step": 11597, "time_per_iteration": 2.6793556213378906 }, { "auxiliary_loss_clip": 0.01309016, "auxiliary_loss_mlp": 0.00272803, "balance_loss_clip": 1.08648598, "balance_loss_mlp": 0.24577829, "epoch": 0.6973094844431084, "flos": 17420374632960.0, "grad_norm": 9.832967467763396, "language_loss": 0.82973194, "learning_rate": 8.864118089662267e-07, "loss": 0.84555018, "num_input_tokens_seen": 250314035, "router_z_loss_clip": 2.22363281, "router_z_loss_mlp": 0.27001953, "step": 11598, "time_per_iteration": 2.7300751209259033 }, { "auxiliary_loss_clip": 0.01333338, "auxiliary_loss_mlp": 0.00260279, "balance_loss_clip": 1.09470737, "balance_loss_mlp": 0.23176399, "epoch": 0.6973696076957764, "flos": 27235370983680.0, "grad_norm": 59.86397808374666, "language_loss": 0.96971887, "learning_rate": 8.860883235222791e-07, "loss": 0.98565507, "num_input_tokens_seen": 250332995, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.28503418, "step": 11599, "time_per_iteration": 2.7195026874542236 }, { "auxiliary_loss_clip": 0.0132864, "auxiliary_loss_mlp": 0.00265254, "balance_loss_clip": 1.0947274, "balance_loss_mlp": 0.23627406, "epoch": 0.6974297309484443, "flos": 22018089450240.0, "grad_norm": 4.286044615335168, "language_loss": 0.79549825, "learning_rate": 8.85764880317974e-07, "loss": 0.81143719, "num_input_tokens_seen": 250352120, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.28967285, "step": 11600, "time_per_iteration": 2.688734531402588 }, { "auxiliary_loss_clip": 0.01297189, "auxiliary_loss_mlp": 0.00258396, "balance_loss_clip": 1.07651997, "balance_loss_mlp": 0.23308802, "epoch": 0.6974898542011123, "flos": 28366449327360.0, "grad_norm": 211.55427917820833, "language_loss": 0.84414446, "learning_rate": 8.854414793655771e-07, "loss": 0.85970032, "num_input_tokens_seen": 250371705, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.2532959, "step": 11601, "time_per_iteration": 2.730921983718872 }, { "auxiliary_loss_clip": 0.01287664, "auxiliary_loss_mlp": 0.00227612, "balance_loss_clip": 1.07198644, "balance_loss_mlp": 0.20331705, "epoch": 0.6975499774537802, "flos": 15232050351360.0, "grad_norm": 38.751905237347, "language_loss": 0.81477702, "learning_rate": 8.851181206773508e-07, "loss": 0.82992971, "num_input_tokens_seen": 250390485, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.24291992, "step": 11602, "time_per_iteration": 2.6343679428100586 }, { "auxiliary_loss_clip": 0.01313076, "auxiliary_loss_mlp": 0.00253165, "balance_loss_clip": 1.09008121, "balance_loss_mlp": 0.22972879, "epoch": 0.6976101007064482, "flos": 22157422306560.0, "grad_norm": 571.0638020979759, "language_loss": 0.83622706, "learning_rate": 8.847948042655567e-07, "loss": 0.85188949, "num_input_tokens_seen": 250407020, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.234375, "step": 11603, "time_per_iteration": 2.6371474266052246 }, { "auxiliary_loss_clip": 0.01304961, "auxiliary_loss_mlp": 0.00227883, "balance_loss_clip": 1.08270741, "balance_loss_mlp": 0.20184797, "epoch": 0.6976702239591162, "flos": 22273522041600.0, "grad_norm": 4.570734110541665, "language_loss": 0.7036798, "learning_rate": 8.844715301424557e-07, "loss": 0.71900821, "num_input_tokens_seen": 250425880, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.26049805, "step": 11604, "time_per_iteration": 2.6659657955169678 }, { "auxiliary_loss_clip": 0.01309819, "auxiliary_loss_mlp": 0.00242048, "balance_loss_clip": 1.08536172, "balance_loss_mlp": 0.21447469, "epoch": 0.6977303472117842, "flos": 25848608653440.0, "grad_norm": 19.412551594633, "language_loss": 0.87725925, "learning_rate": 8.841482983203057e-07, "loss": 0.89277792, "num_input_tokens_seen": 250442925, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.27575684, "step": 11605, "time_per_iteration": 2.6619906425476074 }, { "auxiliary_loss_clip": 0.01305137, "auxiliary_loss_mlp": 0.00232367, "balance_loss_clip": 1.08265674, "balance_loss_mlp": 0.20672509, "epoch": 0.6977904704644521, "flos": 20959586536320.0, "grad_norm": 22.17364236194721, "language_loss": 0.76789093, "learning_rate": 8.838251088113638e-07, "loss": 0.78326601, "num_input_tokens_seen": 250461220, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25646973, "step": 11606, "time_per_iteration": 2.6805639266967773 }, { "auxiliary_loss_clip": 0.0131932, "auxiliary_loss_mlp": 0.00226688, "balance_loss_clip": 1.09146237, "balance_loss_mlp": 0.20003247, "epoch": 0.6978505937171201, "flos": 22055041566720.0, "grad_norm": 92.43521896483705, "language_loss": 0.90048945, "learning_rate": 8.835019616278856e-07, "loss": 0.91594958, "num_input_tokens_seen": 250480975, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.26647949, "step": 11607, "time_per_iteration": 2.6860251426696777 }, { "auxiliary_loss_clip": 0.01321572, "auxiliary_loss_mlp": 0.00273909, "balance_loss_clip": 1.08936143, "balance_loss_mlp": 0.247421, "epoch": 0.697910716969788, "flos": 20043720529920.0, "grad_norm": 30227.619900663347, "language_loss": 0.87079179, "learning_rate": 8.831788567821265e-07, "loss": 0.88674664, "num_input_tokens_seen": 250497980, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.26477051, "step": 11608, "time_per_iteration": 2.7198586463928223 }, { "auxiliary_loss_clip": 0.01292183, "auxiliary_loss_mlp": 0.00240056, "balance_loss_clip": 1.07176399, "balance_loss_mlp": 0.21539119, "epoch": 0.697970840222456, "flos": 15888245961600.0, "grad_norm": 15.220661875825252, "language_loss": 0.99072599, "learning_rate": 8.828557942863357e-07, "loss": 1.00604844, "num_input_tokens_seen": 250511910, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.2467041, "step": 11609, "time_per_iteration": 2.6847362518310547 }, { "auxiliary_loss_clip": 0.01309718, "auxiliary_loss_mlp": 0.00248966, "balance_loss_clip": 1.08363056, "balance_loss_mlp": 0.22216761, "epoch": 0.698030963475124, "flos": 21215629658880.0, "grad_norm": 50.61888558893641, "language_loss": 0.73246491, "learning_rate": 8.82532774152765e-07, "loss": 0.74805176, "num_input_tokens_seen": 250531090, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26831055, "step": 11610, "time_per_iteration": 2.711545944213867 }, { "auxiliary_loss_clip": 0.01296018, "auxiliary_loss_mlp": 0.00245495, "balance_loss_clip": 1.07844853, "balance_loss_mlp": 0.22053289, "epoch": 0.698091086727792, "flos": 33759728524800.0, "grad_norm": 149.23390057467813, "language_loss": 0.91233128, "learning_rate": 8.822097963936643e-07, "loss": 0.92774642, "num_input_tokens_seen": 250551565, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.24951172, "step": 11611, "time_per_iteration": 2.860970973968506 }, { "auxiliary_loss_clip": 0.01304627, "auxiliary_loss_mlp": 0.00224007, "balance_loss_clip": 1.08079839, "balance_loss_mlp": 0.1991404, "epoch": 0.69815120998046, "flos": 15887850912000.0, "grad_norm": 11.460937148735658, "language_loss": 0.79710072, "learning_rate": 8.818868610212793e-07, "loss": 0.81238711, "num_input_tokens_seen": 250569625, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.24829102, "step": 11612, "time_per_iteration": 2.748175621032715 }, { "auxiliary_loss_clip": 0.01308347, "auxiliary_loss_mlp": 0.00235734, "balance_loss_clip": 1.08936858, "balance_loss_mlp": 0.20955601, "epoch": 0.6982113332331279, "flos": 18947044437120.0, "grad_norm": 2.747968249744425, "language_loss": 0.88038969, "learning_rate": 8.815639680478573e-07, "loss": 0.89583051, "num_input_tokens_seen": 250586960, "router_z_loss_clip": 2.18457031, "router_z_loss_mlp": 0.26184082, "step": 11613, "time_per_iteration": 2.705920457839966 }, { "auxiliary_loss_clip": 0.01313769, "auxiliary_loss_mlp": 0.00256016, "balance_loss_clip": 1.08791995, "balance_loss_mlp": 0.22886053, "epoch": 0.6982714564857959, "flos": 24389594115840.0, "grad_norm": 11.166335424784496, "language_loss": 0.83072996, "learning_rate": 8.812411174856411e-07, "loss": 0.8464278, "num_input_tokens_seen": 250605080, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.27185059, "step": 11614, "time_per_iteration": 2.776323080062866 }, { "auxiliary_loss_clip": 0.01296321, "auxiliary_loss_mlp": 0.00258371, "balance_loss_clip": 1.07846117, "balance_loss_mlp": 0.23195384, "epoch": 0.6983315797384638, "flos": 20083725302400.0, "grad_norm": 7.12871923323161, "language_loss": 0.85466999, "learning_rate": 8.809183093468746e-07, "loss": 0.87021685, "num_input_tokens_seen": 250623965, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.26428223, "step": 11615, "time_per_iteration": 2.706289052963257 }, { "auxiliary_loss_clip": 0.01303919, "auxiliary_loss_mlp": 0.00228384, "balance_loss_clip": 1.08291698, "balance_loss_mlp": 0.20290911, "epoch": 0.6983917029911318, "flos": 13512431664000.0, "grad_norm": 18.529615515496587, "language_loss": 0.80607045, "learning_rate": 8.80595543643797e-07, "loss": 0.82139337, "num_input_tokens_seen": 250640675, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25488281, "step": 11616, "time_per_iteration": 2.638949155807495 }, { "auxiliary_loss_clip": 0.01295533, "auxiliary_loss_mlp": 0.00211921, "balance_loss_clip": 1.07847941, "balance_loss_mlp": 0.18613623, "epoch": 0.6984518262437998, "flos": 22018412672640.0, "grad_norm": 18.766983593462218, "language_loss": 0.9188478, "learning_rate": 8.802728203886487e-07, "loss": 0.93392229, "num_input_tokens_seen": 250660295, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25769043, "step": 11617, "time_per_iteration": 2.7625527381896973 }, { "auxiliary_loss_clip": 0.01333784, "auxiliary_loss_mlp": 0.00267457, "balance_loss_clip": 1.09932852, "balance_loss_mlp": 0.23672503, "epoch": 0.6985119494964678, "flos": 18770615809920.0, "grad_norm": 7.2074438760974715, "language_loss": 0.71430898, "learning_rate": 8.799501395936682e-07, "loss": 0.73032141, "num_input_tokens_seen": 250678155, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.30725098, "step": 11618, "time_per_iteration": 2.7730109691619873 }, { "auxiliary_loss_clip": 0.01292729, "auxiliary_loss_mlp": 0.00257858, "balance_loss_clip": 1.07629216, "balance_loss_mlp": 0.23108372, "epoch": 0.6985720727491357, "flos": 22382834106240.0, "grad_norm": 3.47547254924256, "language_loss": 0.91574353, "learning_rate": 8.796275012710903e-07, "loss": 0.93124938, "num_input_tokens_seen": 250697230, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.26745605, "step": 11619, "time_per_iteration": 2.663734197616577 }, { "auxiliary_loss_clip": 0.01313394, "auxiliary_loss_mlp": 0.00234288, "balance_loss_clip": 1.09086156, "balance_loss_mlp": 0.20929028, "epoch": 0.6986321960018037, "flos": 39567884785920.0, "grad_norm": 156.49808108863385, "language_loss": 0.74839717, "learning_rate": 8.793049054331494e-07, "loss": 0.76387399, "num_input_tokens_seen": 250719865, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.24987793, "step": 11620, "time_per_iteration": 2.812398910522461 }, { "auxiliary_loss_clip": 0.01338664, "auxiliary_loss_mlp": 0.00255896, "balance_loss_clip": 1.10368967, "balance_loss_mlp": 0.22813249, "epoch": 0.6986923192544716, "flos": 17967725055360.0, "grad_norm": 4.396020378632075, "language_loss": 0.83254671, "learning_rate": 8.789823520920794e-07, "loss": 0.84849226, "num_input_tokens_seen": 250736565, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.27783203, "step": 11621, "time_per_iteration": 2.62652325630188 }, { "auxiliary_loss_clip": 0.01322773, "auxiliary_loss_mlp": 0.00253854, "balance_loss_clip": 1.09412527, "balance_loss_mlp": 0.22523215, "epoch": 0.6987524425071396, "flos": 25594325297280.0, "grad_norm": 10.812802379644133, "language_loss": 0.77497709, "learning_rate": 8.7865984126011e-07, "loss": 0.79074335, "num_input_tokens_seen": 250757235, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.28625488, "step": 11622, "time_per_iteration": 2.696106195449829 }, { "auxiliary_loss_clip": 0.01325198, "auxiliary_loss_mlp": 0.0022108, "balance_loss_clip": 1.0964669, "balance_loss_mlp": 0.19406709, "epoch": 0.6988125657598077, "flos": 17530081747200.0, "grad_norm": 3.523260552001237, "language_loss": 0.70515937, "learning_rate": 8.783373729494721e-07, "loss": 0.72062218, "num_input_tokens_seen": 250775585, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.2701416, "step": 11623, "time_per_iteration": 2.6642913818359375 }, { "auxiliary_loss_clip": 0.01326026, "auxiliary_loss_mlp": 0.00231306, "balance_loss_clip": 1.08964455, "balance_loss_mlp": 0.20214784, "epoch": 0.6988726890124756, "flos": 39165721136640.0, "grad_norm": 50.029928844056094, "language_loss": 0.68231297, "learning_rate": 8.780149471723932e-07, "loss": 0.69788623, "num_input_tokens_seen": 250795725, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.29150391, "step": 11624, "time_per_iteration": 2.9272568225860596 }, { "auxiliary_loss_clip": 0.01323179, "auxiliary_loss_mlp": 0.00234155, "balance_loss_clip": 1.09448612, "balance_loss_mlp": 0.20635541, "epoch": 0.6989328122651436, "flos": 20193468330240.0, "grad_norm": 12.017261367937008, "language_loss": 0.85732478, "learning_rate": 8.776925639411017e-07, "loss": 0.8728981, "num_input_tokens_seen": 250814555, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.27819824, "step": 11625, "time_per_iteration": 4.1252405643463135 }, { "auxiliary_loss_clip": 0.01313187, "auxiliary_loss_mlp": 0.00233468, "balance_loss_clip": 1.08870339, "balance_loss_mlp": 0.20743248, "epoch": 0.6989929355178115, "flos": 21834873152640.0, "grad_norm": 7.300717445748576, "language_loss": 0.76181597, "learning_rate": 8.773702232678188e-07, "loss": 0.77728248, "num_input_tokens_seen": 250833105, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26025391, "step": 11626, "time_per_iteration": 2.653622627258301 }, { "auxiliary_loss_clip": 0.01310597, "auxiliary_loss_mlp": 0.0022246, "balance_loss_clip": 1.0850718, "balance_loss_mlp": 0.19361147, "epoch": 0.6990530587704795, "flos": 26322880855680.0, "grad_norm": 10.038082266061474, "language_loss": 0.80559611, "learning_rate": 8.770479251647697e-07, "loss": 0.82092673, "num_input_tokens_seen": 250852570, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.28869629, "step": 11627, "time_per_iteration": 4.113186359405518 }, { "auxiliary_loss_clip": 0.01287764, "auxiliary_loss_mlp": 0.00229214, "balance_loss_clip": 1.07361078, "balance_loss_mlp": 0.20257035, "epoch": 0.6991131820231474, "flos": 19828975069440.0, "grad_norm": 213.34408602385665, "language_loss": 0.70517671, "learning_rate": 8.767256696441768e-07, "loss": 0.72034645, "num_input_tokens_seen": 250870500, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.26611328, "step": 11628, "time_per_iteration": 2.6422126293182373 }, { "auxiliary_loss_clip": 0.01302693, "auxiliary_loss_mlp": 0.00248859, "balance_loss_clip": 1.07807577, "balance_loss_mlp": 0.22152387, "epoch": 0.6991733052758154, "flos": 33984817102080.0, "grad_norm": 97.65910319278507, "language_loss": 0.76003712, "learning_rate": 8.764034567182581e-07, "loss": 0.77555269, "num_input_tokens_seen": 250892745, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.27307129, "step": 11629, "time_per_iteration": 2.7795052528381348 }, { "auxiliary_loss_clip": 0.01302595, "auxiliary_loss_mlp": 0.0024338, "balance_loss_clip": 1.08028042, "balance_loss_mlp": 0.21531841, "epoch": 0.6992334285284834, "flos": 15633136592640.0, "grad_norm": 44.998773626826214, "language_loss": 0.79473895, "learning_rate": 8.760812863992337e-07, "loss": 0.81019866, "num_input_tokens_seen": 250910225, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.28063965, "step": 11630, "time_per_iteration": 4.147046804428101 }, { "auxiliary_loss_clip": 0.01305602, "auxiliary_loss_mlp": 0.00238802, "balance_loss_clip": 1.0842793, "balance_loss_mlp": 0.21298155, "epoch": 0.6992935517811514, "flos": 21726279360000.0, "grad_norm": 13.554491626142193, "language_loss": 0.81130159, "learning_rate": 8.757591586993196e-07, "loss": 0.82674563, "num_input_tokens_seen": 250929715, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25830078, "step": 11631, "time_per_iteration": 2.6665046215057373 }, { "auxiliary_loss_clip": 0.01353006, "auxiliary_loss_mlp": 0.00249862, "balance_loss_clip": 1.10736477, "balance_loss_mlp": 0.2197497, "epoch": 0.6993536750338193, "flos": 20115254465280.0, "grad_norm": 93.71521491821949, "language_loss": 0.98277402, "learning_rate": 8.7543707363073e-07, "loss": 0.99880272, "num_input_tokens_seen": 250944230, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.30114746, "step": 11632, "time_per_iteration": 2.6804420948028564 }, { "auxiliary_loss_clip": 0.01317002, "auxiliary_loss_mlp": 0.00255881, "balance_loss_clip": 1.09397316, "balance_loss_mlp": 0.22911859, "epoch": 0.6994137982864873, "flos": 22010547594240.0, "grad_norm": 9.87550507147234, "language_loss": 0.86164057, "learning_rate": 8.751150312056792e-07, "loss": 0.8773694, "num_input_tokens_seen": 250961865, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26794434, "step": 11633, "time_per_iteration": 2.698359251022339 }, { "auxiliary_loss_clip": 0.01333914, "auxiliary_loss_mlp": 0.00246051, "balance_loss_clip": 1.09615552, "balance_loss_mlp": 0.21672554, "epoch": 0.6994739215391552, "flos": 25519020433920.0, "grad_norm": 10.89829014589353, "language_loss": 0.79708385, "learning_rate": 8.747930314363794e-07, "loss": 0.8128835, "num_input_tokens_seen": 250982025, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.29309082, "step": 11634, "time_per_iteration": 4.0990142822265625 }, { "auxiliary_loss_clip": 0.0116623, "auxiliary_loss_mlp": 0.00059865, "balance_loss_clip": 1.02436411, "balance_loss_mlp": 0.05252205, "epoch": 0.6995340447918232, "flos": 59128357691520.0, "grad_norm": 0.6575285437797487, "language_loss": 0.52785188, "learning_rate": 8.744710743350412e-07, "loss": 0.54011285, "num_input_tokens_seen": 251046900, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.07324219, "step": 11635, "time_per_iteration": 3.3541340827941895 }, { "auxiliary_loss_clip": 0.01321406, "auxiliary_loss_mlp": 0.00232455, "balance_loss_clip": 1.09496069, "balance_loss_mlp": 0.20556137, "epoch": 0.6995941680444913, "flos": 17967832796160.0, "grad_norm": 199.7813421496229, "language_loss": 0.8705588, "learning_rate": 8.741491599138726e-07, "loss": 0.88609743, "num_input_tokens_seen": 251065050, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.26916504, "step": 11636, "time_per_iteration": 2.7324116230010986 }, { "auxiliary_loss_clip": 0.01303378, "auxiliary_loss_mlp": 0.00213857, "balance_loss_clip": 1.079615, "balance_loss_mlp": 0.18678461, "epoch": 0.6996542912971592, "flos": 21980095839360.0, "grad_norm": 5.136567454497571, "language_loss": 0.91567683, "learning_rate": 8.738272881850801e-07, "loss": 0.93084925, "num_input_tokens_seen": 251083355, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.27087402, "step": 11637, "time_per_iteration": 2.711207866668701 }, { "auxiliary_loss_clip": 0.01335218, "auxiliary_loss_mlp": 0.00250357, "balance_loss_clip": 1.10503817, "balance_loss_mlp": 0.22292675, "epoch": 0.6997144145498272, "flos": 11686158518400.0, "grad_norm": 29.557613024811893, "language_loss": 0.78578746, "learning_rate": 8.735054591608704e-07, "loss": 0.80164325, "num_input_tokens_seen": 251096420, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.27429199, "step": 11638, "time_per_iteration": 2.676647424697876 }, { "auxiliary_loss_clip": 0.01325551, "auxiliary_loss_mlp": 0.00240186, "balance_loss_clip": 1.0930655, "balance_loss_mlp": 0.20979983, "epoch": 0.6997745378024951, "flos": 29607162958080.0, "grad_norm": 8.705756134485972, "language_loss": 0.85307497, "learning_rate": 8.731836728534459e-07, "loss": 0.86873233, "num_input_tokens_seen": 251115410, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.30407715, "step": 11639, "time_per_iteration": 2.7238106727600098 }, { "auxiliary_loss_clip": 0.01318988, "auxiliary_loss_mlp": 0.00226685, "balance_loss_clip": 1.08858538, "balance_loss_mlp": 0.1990048, "epoch": 0.6998346610551631, "flos": 20886616056960.0, "grad_norm": 214.1430496779924, "language_loss": 0.90713298, "learning_rate": 8.728619292750093e-07, "loss": 0.92258972, "num_input_tokens_seen": 251133530, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.27697754, "step": 11640, "time_per_iteration": 2.652945041656494 }, { "auxiliary_loss_clip": 0.0128748, "auxiliary_loss_mlp": 0.00224376, "balance_loss_clip": 1.07074511, "balance_loss_mlp": 0.19837648, "epoch": 0.699894784307831, "flos": 27163046949120.0, "grad_norm": 13.093506115514211, "language_loss": 0.8482216, "learning_rate": 8.725402284377619e-07, "loss": 0.86334014, "num_input_tokens_seen": 251153985, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.2598877, "step": 11641, "time_per_iteration": 2.676967144012451 }, { "auxiliary_loss_clip": 0.01313616, "auxiliary_loss_mlp": 0.00226745, "balance_loss_clip": 1.08747435, "balance_loss_mlp": 0.19880247, "epoch": 0.699954907560499, "flos": 20923640000640.0, "grad_norm": 2.9278682398823572, "language_loss": 0.88381433, "learning_rate": 8.722185703539022e-07, "loss": 0.89921796, "num_input_tokens_seen": 251173225, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.27929688, "step": 11642, "time_per_iteration": 2.6657252311706543 }, { "auxiliary_loss_clip": 0.01359525, "auxiliary_loss_mlp": 0.0024995, "balance_loss_clip": 1.11479068, "balance_loss_mlp": 0.21927774, "epoch": 0.700015030813167, "flos": 28657792540800.0, "grad_norm": 26.690227191939602, "language_loss": 0.8398481, "learning_rate": 8.718969550356266e-07, "loss": 0.85594285, "num_input_tokens_seen": 251192485, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.30651855, "step": 11643, "time_per_iteration": 2.7600436210632324 }, { "auxiliary_loss_clip": 0.01327381, "auxiliary_loss_mlp": 0.00252635, "balance_loss_clip": 1.09230638, "balance_loss_mlp": 0.22311832, "epoch": 0.700075154065835, "flos": 29205286617600.0, "grad_norm": 23.553946160485978, "language_loss": 0.67224503, "learning_rate": 8.715753824951315e-07, "loss": 0.68804514, "num_input_tokens_seen": 251214965, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.29504395, "step": 11644, "time_per_iteration": 2.7750518321990967 }, { "auxiliary_loss_clip": 0.01323691, "auxiliary_loss_mlp": 0.00268128, "balance_loss_clip": 1.09639907, "balance_loss_mlp": 0.24041153, "epoch": 0.7001352773185029, "flos": 23112431159040.0, "grad_norm": 6.978426704557098, "language_loss": 0.88433588, "learning_rate": 8.712538527446119e-07, "loss": 0.90025413, "num_input_tokens_seen": 251234500, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.27709961, "step": 11645, "time_per_iteration": 2.67975115776062 }, { "auxiliary_loss_clip": 0.01320831, "auxiliary_loss_mlp": 0.0024532, "balance_loss_clip": 1.0930171, "balance_loss_mlp": 0.21543472, "epoch": 0.7001954005711709, "flos": 21322858734720.0, "grad_norm": 648.8115189432855, "language_loss": 0.75412035, "learning_rate": 8.709323657962584e-07, "loss": 0.76978189, "num_input_tokens_seen": 251254360, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.29907227, "step": 11646, "time_per_iteration": 2.6506471633911133 }, { "auxiliary_loss_clip": 0.01304137, "auxiliary_loss_mlp": 0.00226729, "balance_loss_clip": 1.08103848, "balance_loss_mlp": 0.19973981, "epoch": 0.7002555238238388, "flos": 24535822383360.0, "grad_norm": 2.923809909581497, "language_loss": 0.77856195, "learning_rate": 8.706109216622635e-07, "loss": 0.79387057, "num_input_tokens_seen": 251274790, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26977539, "step": 11647, "time_per_iteration": 2.674961805343628 }, { "auxiliary_loss_clip": 0.01319062, "auxiliary_loss_mlp": 0.00200688, "balance_loss_clip": 1.09230149, "balance_loss_mlp": 0.17682266, "epoch": 0.7003156470765068, "flos": 39056552726400.0, "grad_norm": 8.416306262752876, "language_loss": 0.81521565, "learning_rate": 8.702895203548155e-07, "loss": 0.8304131, "num_input_tokens_seen": 251296275, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.23864746, "step": 11648, "time_per_iteration": 2.802321672439575 }, { "auxiliary_loss_clip": 0.01307147, "auxiliary_loss_mlp": 0.00208628, "balance_loss_clip": 1.08472657, "balance_loss_mlp": 0.18284345, "epoch": 0.7003757703291749, "flos": 28804092635520.0, "grad_norm": 15.004134061053056, "language_loss": 0.84959239, "learning_rate": 8.699681618861014e-07, "loss": 0.86475015, "num_input_tokens_seen": 251317375, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25793457, "step": 11649, "time_per_iteration": 2.7266275882720947 }, { "auxiliary_loss_clip": 0.01317337, "auxiliary_loss_mlp": 0.00226785, "balance_loss_clip": 1.08860934, "balance_loss_mlp": 0.19900928, "epoch": 0.7004358935818428, "flos": 15953854152960.0, "grad_norm": 9.281049190072922, "language_loss": 0.84888721, "learning_rate": 8.69646846268308e-07, "loss": 0.86432844, "num_input_tokens_seen": 251333570, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.27807617, "step": 11650, "time_per_iteration": 2.708414316177368 }, { "auxiliary_loss_clip": 0.01305397, "auxiliary_loss_mlp": 0.0024505, "balance_loss_clip": 1.0839963, "balance_loss_mlp": 0.21845399, "epoch": 0.7004960168345108, "flos": 20411984718720.0, "grad_norm": 55.63922341323825, "language_loss": 0.85771924, "learning_rate": 8.693255735136194e-07, "loss": 0.87322378, "num_input_tokens_seen": 251351070, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.26574707, "step": 11651, "time_per_iteration": 2.6482927799224854 }, { "auxiliary_loss_clip": 0.0134054, "auxiliary_loss_mlp": 0.00242229, "balance_loss_clip": 1.09833336, "balance_loss_mlp": 0.21307014, "epoch": 0.7005561400871787, "flos": 17347547808000.0, "grad_norm": 33.26775123222925, "language_loss": 0.78738892, "learning_rate": 8.690043436342198e-07, "loss": 0.80321658, "num_input_tokens_seen": 251370005, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.29150391, "step": 11652, "time_per_iteration": 2.658904790878296 }, { "auxiliary_loss_clip": 0.01312984, "auxiliary_loss_mlp": 0.00242624, "balance_loss_clip": 1.08720446, "balance_loss_mlp": 0.21462129, "epoch": 0.7006162633398467, "flos": 25302120157440.0, "grad_norm": 1.9405637777040976, "language_loss": 0.80430275, "learning_rate": 8.686831566422874e-07, "loss": 0.81985879, "num_input_tokens_seen": 251391210, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.2800293, "step": 11653, "time_per_iteration": 2.718663215637207 }, { "auxiliary_loss_clip": 0.01346137, "auxiliary_loss_mlp": 0.00233235, "balance_loss_clip": 1.10103869, "balance_loss_mlp": 0.20171601, "epoch": 0.7006763865925146, "flos": 20668997508480.0, "grad_norm": 23.747329461607578, "language_loss": 0.81266439, "learning_rate": 8.68362012550003e-07, "loss": 0.82845819, "num_input_tokens_seen": 251411505, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.31494141, "step": 11654, "time_per_iteration": 2.758901834487915 }, { "auxiliary_loss_clip": 0.01340043, "auxiliary_loss_mlp": 0.00265732, "balance_loss_clip": 1.10157323, "balance_loss_mlp": 0.23656186, "epoch": 0.7007365098451827, "flos": 20046449963520.0, "grad_norm": 44.192815979898114, "language_loss": 0.85167617, "learning_rate": 8.680409113695453e-07, "loss": 0.86773384, "num_input_tokens_seen": 251428975, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.29187012, "step": 11655, "time_per_iteration": 2.6764109134674072 }, { "auxiliary_loss_clip": 0.01340696, "auxiliary_loss_mlp": 0.002519, "balance_loss_clip": 1.09821689, "balance_loss_mlp": 0.22373088, "epoch": 0.7007966330978506, "flos": 20777375819520.0, "grad_norm": 23.449489548791338, "language_loss": 0.78931576, "learning_rate": 8.677198531130889e-07, "loss": 0.8052417, "num_input_tokens_seen": 251446940, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.28137207, "step": 11656, "time_per_iteration": 2.685081958770752 }, { "auxiliary_loss_clip": 0.01317644, "auxiliary_loss_mlp": 0.00221561, "balance_loss_clip": 1.08907604, "balance_loss_mlp": 0.19272405, "epoch": 0.7008567563505186, "flos": 29638189330560.0, "grad_norm": 4.513124834494237, "language_loss": 0.85405034, "learning_rate": 8.673988377928092e-07, "loss": 0.86944234, "num_input_tokens_seen": 251466205, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.28808594, "step": 11657, "time_per_iteration": 2.7515408992767334 }, { "auxiliary_loss_clip": 0.01342975, "auxiliary_loss_mlp": 0.00231104, "balance_loss_clip": 1.1030066, "balance_loss_mlp": 0.19898915, "epoch": 0.7009168796031865, "flos": 17092007475840.0, "grad_norm": 27.67714548999825, "language_loss": 0.88662696, "learning_rate": 8.670778654208797e-07, "loss": 0.90236783, "num_input_tokens_seen": 251484820, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.32116699, "step": 11658, "time_per_iteration": 2.620649576187134 }, { "auxiliary_loss_clip": 0.01307203, "auxiliary_loss_mlp": 0.0023092, "balance_loss_clip": 1.08722842, "balance_loss_mlp": 0.20463455, "epoch": 0.7009770028558545, "flos": 20448972748800.0, "grad_norm": 33.60606379453234, "language_loss": 0.89475942, "learning_rate": 8.667569360094713e-07, "loss": 0.91014063, "num_input_tokens_seen": 251502670, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26293945, "step": 11659, "time_per_iteration": 2.7887327671051025 }, { "auxiliary_loss_clip": 0.01295383, "auxiliary_loss_mlp": 0.00207662, "balance_loss_clip": 1.07755494, "balance_loss_mlp": 0.18052956, "epoch": 0.7010371261085224, "flos": 19245139407360.0, "grad_norm": 4.5528509019015075, "language_loss": 0.77040106, "learning_rate": 8.664360495707526e-07, "loss": 0.7854315, "num_input_tokens_seen": 251521630, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.27124023, "step": 11660, "time_per_iteration": 2.7761759757995605 }, { "auxiliary_loss_clip": 0.01312247, "auxiliary_loss_mlp": 0.00228586, "balance_loss_clip": 1.08812857, "balance_loss_mlp": 0.20098928, "epoch": 0.7010972493611904, "flos": 22127581082880.0, "grad_norm": 6.457399249868733, "language_loss": 0.87940824, "learning_rate": 8.661152061168924e-07, "loss": 0.89481652, "num_input_tokens_seen": 251540105, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.27587891, "step": 11661, "time_per_iteration": 2.7300469875335693 }, { "auxiliary_loss_clip": 0.01318492, "auxiliary_loss_mlp": 0.00217746, "balance_loss_clip": 1.092731, "balance_loss_mlp": 0.18961266, "epoch": 0.7011573726138585, "flos": 31391132860800.0, "grad_norm": 7276.132771548174, "language_loss": 0.8524704, "learning_rate": 8.657944056600579e-07, "loss": 0.86783278, "num_input_tokens_seen": 251560530, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.28149414, "step": 11662, "time_per_iteration": 2.74375057220459 }, { "auxiliary_loss_clip": 0.01326428, "auxiliary_loss_mlp": 0.00233453, "balance_loss_clip": 1.0976491, "balance_loss_mlp": 0.20683353, "epoch": 0.7012174958665264, "flos": 18150582216960.0, "grad_norm": 8.92457066673001, "language_loss": 0.91757965, "learning_rate": 8.654736482124134e-07, "loss": 0.93317842, "num_input_tokens_seen": 251577930, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.26586914, "step": 11663, "time_per_iteration": 2.684138536453247 }, { "auxiliary_loss_clip": 0.01229761, "auxiliary_loss_mlp": 0.00121707, "balance_loss_clip": 1.07882893, "balance_loss_mlp": 0.11202722, "epoch": 0.7012776191191944, "flos": 60651256567680.0, "grad_norm": 0.7918717568430268, "language_loss": 0.53599918, "learning_rate": 8.651529337861209e-07, "loss": 0.54951382, "num_input_tokens_seen": 251638820, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.09667969, "step": 11664, "time_per_iteration": 3.1592986583709717 }, { "auxiliary_loss_clip": 0.01337299, "auxiliary_loss_mlp": 0.00234934, "balance_loss_clip": 1.10293567, "balance_loss_mlp": 0.20681244, "epoch": 0.7013377423718623, "flos": 27198598435200.0, "grad_norm": 9.878759453427575, "language_loss": 0.90925407, "learning_rate": 8.64832262393344e-07, "loss": 0.92497647, "num_input_tokens_seen": 251658070, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.28137207, "step": 11665, "time_per_iteration": 2.6933584213256836 }, { "auxiliary_loss_clip": 0.01336425, "auxiliary_loss_mlp": 0.00216375, "balance_loss_clip": 1.0995028, "balance_loss_mlp": 0.18874224, "epoch": 0.7013978656245303, "flos": 16543543731840.0, "grad_norm": 877.4929760936716, "language_loss": 0.86383212, "learning_rate": 8.645116340462404e-07, "loss": 0.87936008, "num_input_tokens_seen": 251671575, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.27612305, "step": 11666, "time_per_iteration": 2.658090829849243 }, { "auxiliary_loss_clip": 0.01344087, "auxiliary_loss_mlp": 0.00222153, "balance_loss_clip": 1.10629058, "balance_loss_mlp": 0.19471067, "epoch": 0.7014579888771982, "flos": 23143780753920.0, "grad_norm": 2.7742263673426657, "language_loss": 0.87328041, "learning_rate": 8.641910487569695e-07, "loss": 0.88894284, "num_input_tokens_seen": 251689350, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.27453613, "step": 11667, "time_per_iteration": 4.126591682434082 }, { "auxiliary_loss_clip": 0.01317407, "auxiliary_loss_mlp": 0.00225937, "balance_loss_clip": 1.08837581, "balance_loss_mlp": 0.19811311, "epoch": 0.7015181121298663, "flos": 25082095397760.0, "grad_norm": 43.62373346979314, "language_loss": 0.75295365, "learning_rate": 8.638705065376879e-07, "loss": 0.76838708, "num_input_tokens_seen": 251704635, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.27856445, "step": 11668, "time_per_iteration": 2.6964621543884277 }, { "auxiliary_loss_clip": 0.01322329, "auxiliary_loss_mlp": 0.00252634, "balance_loss_clip": 1.09370542, "balance_loss_mlp": 0.22545415, "epoch": 0.7015782353825342, "flos": 23327894891520.0, "grad_norm": 126.83534057536173, "language_loss": 0.85276854, "learning_rate": 8.635500074005519e-07, "loss": 0.86851817, "num_input_tokens_seen": 251723035, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27172852, "step": 11669, "time_per_iteration": 4.103363275527954 }, { "auxiliary_loss_clip": 0.01223842, "auxiliary_loss_mlp": 0.00094179, "balance_loss_clip": 1.07288122, "balance_loss_mlp": 0.08535793, "epoch": 0.7016383586352022, "flos": 70397161107840.0, "grad_norm": 0.7301794816138979, "language_loss": 0.54027987, "learning_rate": 8.632295513577122e-07, "loss": 0.55346012, "num_input_tokens_seen": 251791630, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.08837891, "step": 11670, "time_per_iteration": 3.263697862625122 }, { "auxiliary_loss_clip": 0.0130224, "auxiliary_loss_mlp": 0.00250197, "balance_loss_clip": 1.07986248, "balance_loss_mlp": 0.22249281, "epoch": 0.7016984818878701, "flos": 19792274348160.0, "grad_norm": 6.20706387279615, "language_loss": 0.8969236, "learning_rate": 8.629091384213218e-07, "loss": 0.91244805, "num_input_tokens_seen": 251809840, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.27709961, "step": 11671, "time_per_iteration": 2.6400206089019775 }, { "auxiliary_loss_clip": 0.01339456, "auxiliary_loss_mlp": 0.00241384, "balance_loss_clip": 1.10699201, "balance_loss_mlp": 0.21378738, "epoch": 0.7017586051405381, "flos": 12896923184640.0, "grad_norm": 125.45969303259349, "language_loss": 0.85002911, "learning_rate": 8.625887686035313e-07, "loss": 0.86583751, "num_input_tokens_seen": 251827550, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.27600098, "step": 11672, "time_per_iteration": 4.107734441757202 }, { "auxiliary_loss_clip": 0.01327859, "auxiliary_loss_mlp": 0.00243899, "balance_loss_clip": 1.09231639, "balance_loss_mlp": 0.2145258, "epoch": 0.701818728393206, "flos": 18332828847360.0, "grad_norm": 7.481867002414591, "language_loss": 0.93051362, "learning_rate": 8.622684419164883e-07, "loss": 0.94623119, "num_input_tokens_seen": 251844880, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.29382324, "step": 11673, "time_per_iteration": 2.6551406383514404 }, { "auxiliary_loss_clip": 0.01311887, "auxiliary_loss_mlp": 0.00235299, "balance_loss_clip": 1.08491898, "balance_loss_mlp": 0.20697439, "epoch": 0.701878851645874, "flos": 17384212615680.0, "grad_norm": 14.623282361587567, "language_loss": 0.8168571, "learning_rate": 8.619481583723399e-07, "loss": 0.83232892, "num_input_tokens_seen": 251861025, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.2833252, "step": 11674, "time_per_iteration": 2.6849820613861084 }, { "auxiliary_loss_clip": 0.0131536, "auxiliary_loss_mlp": 0.0022559, "balance_loss_clip": 1.09025431, "balance_loss_mlp": 0.19936347, "epoch": 0.701938974898542, "flos": 23915501481600.0, "grad_norm": 188.4768488618571, "language_loss": 0.78091323, "learning_rate": 8.616279179832329e-07, "loss": 0.7963227, "num_input_tokens_seen": 251880175, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.26220703, "step": 11675, "time_per_iteration": 2.7338595390319824 }, { "auxiliary_loss_clip": 0.01353477, "auxiliary_loss_mlp": 0.00242419, "balance_loss_clip": 1.11170697, "balance_loss_mlp": 0.21347526, "epoch": 0.70199909815121, "flos": 21795586652160.0, "grad_norm": 62.16736915348335, "language_loss": 0.61342102, "learning_rate": 8.613077207613078e-07, "loss": 0.62937999, "num_input_tokens_seen": 251899005, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.28955078, "step": 11676, "time_per_iteration": 4.161667585372925 }, { "auxiliary_loss_clip": 0.01207725, "auxiliary_loss_mlp": 0.0009836, "balance_loss_clip": 1.05719435, "balance_loss_mlp": 0.08982471, "epoch": 0.702059221403878, "flos": 71715047109120.0, "grad_norm": 0.7124634283139019, "language_loss": 0.58557308, "learning_rate": 8.609875667187079e-07, "loss": 0.59863394, "num_input_tokens_seen": 251966790, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.08544922, "step": 11677, "time_per_iteration": 3.241692543029785 }, { "auxiliary_loss_clip": 0.01323668, "auxiliary_loss_mlp": 0.00237443, "balance_loss_clip": 1.0905242, "balance_loss_mlp": 0.20928627, "epoch": 0.7021193446565459, "flos": 28111052649600.0, "grad_norm": 5.6204315281460815, "language_loss": 0.70921403, "learning_rate": 8.606674558675737e-07, "loss": 0.72482514, "num_input_tokens_seen": 251989315, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.28186035, "step": 11678, "time_per_iteration": 2.7664196491241455 }, { "auxiliary_loss_clip": 0.01323936, "auxiliary_loss_mlp": 0.00251399, "balance_loss_clip": 1.09771585, "balance_loss_mlp": 0.22411212, "epoch": 0.7021794679092139, "flos": 22924905229440.0, "grad_norm": 14.160294846085067, "language_loss": 0.8456012, "learning_rate": 8.603473882200444e-07, "loss": 0.86135459, "num_input_tokens_seen": 252006620, "router_z_loss_clip": 2.26464844, "router_z_loss_mlp": 0.27258301, "step": 11679, "time_per_iteration": 2.685023069381714 }, { "auxiliary_loss_clip": 0.01326193, "auxiliary_loss_mlp": 0.00206133, "balance_loss_clip": 1.10198545, "balance_loss_mlp": 0.1808008, "epoch": 0.7022395911618818, "flos": 18077827219200.0, "grad_norm": 7.905375875381045, "language_loss": 0.81314653, "learning_rate": 8.600273637882567e-07, "loss": 0.82846975, "num_input_tokens_seen": 252024570, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.25341797, "step": 11680, "time_per_iteration": 2.6626663208007812 }, { "auxiliary_loss_clip": 0.0135234, "auxiliary_loss_mlp": 0.00259828, "balance_loss_clip": 1.11087489, "balance_loss_mlp": 0.22965586, "epoch": 0.7022997144145499, "flos": 16034294661120.0, "grad_norm": 78.61715038692307, "language_loss": 0.83418208, "learning_rate": 8.597073825843446e-07, "loss": 0.85030377, "num_input_tokens_seen": 252042775, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.30163574, "step": 11681, "time_per_iteration": 2.6673402786254883 }, { "auxiliary_loss_clip": 0.01305988, "auxiliary_loss_mlp": 0.00229545, "balance_loss_clip": 1.08566785, "balance_loss_mlp": 0.20209104, "epoch": 0.7023598376672178, "flos": 26468678160000.0, "grad_norm": 228.57749837659802, "language_loss": 0.83644915, "learning_rate": 8.593874446204434e-07, "loss": 0.85180449, "num_input_tokens_seen": 252063690, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.2746582, "step": 11682, "time_per_iteration": 2.7109694480895996 }, { "auxiliary_loss_clip": 0.0134591, "auxiliary_loss_mlp": 0.00259228, "balance_loss_clip": 1.10606432, "balance_loss_mlp": 0.22977164, "epoch": 0.7024199609198858, "flos": 17055917285760.0, "grad_norm": 34.78708347901931, "language_loss": 0.84089476, "learning_rate": 8.590675499086841e-07, "loss": 0.85694611, "num_input_tokens_seen": 252080335, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.29455566, "step": 11683, "time_per_iteration": 2.6066231727600098 }, { "auxiliary_loss_clip": 0.0135689, "auxiliary_loss_mlp": 0.00262841, "balance_loss_clip": 1.11442196, "balance_loss_mlp": 0.23306251, "epoch": 0.7024800841725537, "flos": 25849039616640.0, "grad_norm": 10.726741627333086, "language_loss": 0.82668078, "learning_rate": 8.587476984611976e-07, "loss": 0.84287804, "num_input_tokens_seen": 252101075, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.29785156, "step": 11684, "time_per_iteration": 2.7096107006073 }, { "auxiliary_loss_clip": 0.01316845, "auxiliary_loss_mlp": 0.00246006, "balance_loss_clip": 1.08983564, "balance_loss_mlp": 0.21812263, "epoch": 0.7025402074252217, "flos": 23513014609920.0, "grad_norm": 3.884303840129664, "language_loss": 0.81274045, "learning_rate": 8.584278902901128e-07, "loss": 0.82836896, "num_input_tokens_seen": 252120510, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.27905273, "step": 11685, "time_per_iteration": 2.6517906188964844 }, { "auxiliary_loss_clip": 0.01310065, "auxiliary_loss_mlp": 0.00234835, "balance_loss_clip": 1.08278561, "balance_loss_mlp": 0.20810872, "epoch": 0.7026003306778896, "flos": 20150985519360.0, "grad_norm": 487.6545545311212, "language_loss": 0.90787542, "learning_rate": 8.581081254075582e-07, "loss": 0.92332435, "num_input_tokens_seen": 252137590, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.26733398, "step": 11686, "time_per_iteration": 2.6723053455352783 }, { "auxiliary_loss_clip": 0.01191238, "auxiliary_loss_mlp": 0.00108204, "balance_loss_clip": 1.0427537, "balance_loss_mlp": 0.09995484, "epoch": 0.7026604539305576, "flos": 64772400712320.0, "grad_norm": 0.9613418414631639, "language_loss": 0.69182831, "learning_rate": 8.577884038256566e-07, "loss": 0.70482278, "num_input_tokens_seen": 252199830, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.08251953, "step": 11687, "time_per_iteration": 3.322389841079712 }, { "auxiliary_loss_clip": 0.01314231, "auxiliary_loss_mlp": 0.0026263, "balance_loss_clip": 1.08434069, "balance_loss_mlp": 0.23392409, "epoch": 0.7027205771832256, "flos": 21871466133120.0, "grad_norm": 6.408008998817926, "language_loss": 0.86102438, "learning_rate": 8.574687255565329e-07, "loss": 0.87679291, "num_input_tokens_seen": 252217200, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.2869873, "step": 11688, "time_per_iteration": 2.660240411758423 }, { "auxiliary_loss_clip": 0.01326162, "auxiliary_loss_mlp": 0.00276363, "balance_loss_clip": 1.08914804, "balance_loss_mlp": 0.24725235, "epoch": 0.7027807004358936, "flos": 23367791923200.0, "grad_norm": 2.2219332605155007, "language_loss": 0.75302064, "learning_rate": 8.571490906123107e-07, "loss": 0.76904595, "num_input_tokens_seen": 252236105, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.29125977, "step": 11689, "time_per_iteration": 2.682840347290039 }, { "auxiliary_loss_clip": 0.0132404, "auxiliary_loss_mlp": 0.00248816, "balance_loss_clip": 1.08934522, "balance_loss_mlp": 0.22006254, "epoch": 0.7028408236885616, "flos": 15304266645120.0, "grad_norm": 16.83030651929631, "language_loss": 0.89897859, "learning_rate": 8.568294990051086e-07, "loss": 0.91470718, "num_input_tokens_seen": 252253315, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.28710938, "step": 11690, "time_per_iteration": 2.6385533809661865 }, { "auxiliary_loss_clip": 0.01318776, "auxiliary_loss_mlp": 0.00248342, "balance_loss_clip": 1.08615899, "balance_loss_mlp": 0.21934988, "epoch": 0.7029009469412295, "flos": 22018197191040.0, "grad_norm": 564.8394970409294, "language_loss": 0.83475941, "learning_rate": 8.56509950747047e-07, "loss": 0.85043061, "num_input_tokens_seen": 252272765, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.28991699, "step": 11691, "time_per_iteration": 2.679696559906006 }, { "auxiliary_loss_clip": 0.0129996, "auxiliary_loss_mlp": 0.00221494, "balance_loss_clip": 1.07930541, "balance_loss_mlp": 0.19579235, "epoch": 0.7029610701938975, "flos": 21835519597440.0, "grad_norm": 3.728597496619534, "language_loss": 0.87768555, "learning_rate": 8.561904458502429e-07, "loss": 0.89290011, "num_input_tokens_seen": 252290510, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25720215, "step": 11692, "time_per_iteration": 2.6516358852386475 }, { "auxiliary_loss_clip": 0.01307524, "auxiliary_loss_mlp": 0.00235712, "balance_loss_clip": 1.08328772, "balance_loss_mlp": 0.20900902, "epoch": 0.7030211934465654, "flos": 19135647774720.0, "grad_norm": 14.059582892251306, "language_loss": 0.83444768, "learning_rate": 8.558709843268111e-07, "loss": 0.8498801, "num_input_tokens_seen": 252309365, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26721191, "step": 11693, "time_per_iteration": 2.67405366897583 }, { "auxiliary_loss_clip": 0.01341414, "auxiliary_loss_mlp": 0.0024898, "balance_loss_clip": 1.10374928, "balance_loss_mlp": 0.22002393, "epoch": 0.7030813166992335, "flos": 38546010766080.0, "grad_norm": 16.67464394212993, "language_loss": 0.76519525, "learning_rate": 8.55551566188866e-07, "loss": 0.78109914, "num_input_tokens_seen": 252333010, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.29003906, "step": 11694, "time_per_iteration": 2.8117544651031494 }, { "auxiliary_loss_clip": 0.01336262, "auxiliary_loss_mlp": 0.00243222, "balance_loss_clip": 1.09843314, "balance_loss_mlp": 0.21355127, "epoch": 0.7031414399519014, "flos": 14720897859840.0, "grad_norm": 9.720224042527823, "language_loss": 0.86681819, "learning_rate": 8.552321914485203e-07, "loss": 0.88261306, "num_input_tokens_seen": 252351330, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.296875, "step": 11695, "time_per_iteration": 2.6800084114074707 }, { "auxiliary_loss_clip": 0.01320046, "auxiliary_loss_mlp": 0.00233713, "balance_loss_clip": 1.09479523, "balance_loss_mlp": 0.20860752, "epoch": 0.7032015632045694, "flos": 14027247342720.0, "grad_norm": 109.0917691305427, "language_loss": 0.82383567, "learning_rate": 8.549128601178852e-07, "loss": 0.83937323, "num_input_tokens_seen": 252369580, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.25109863, "step": 11696, "time_per_iteration": 2.697777509689331 }, { "auxiliary_loss_clip": 0.01310709, "auxiliary_loss_mlp": 0.00237588, "balance_loss_clip": 1.08289218, "balance_loss_mlp": 0.20987152, "epoch": 0.7032616864572373, "flos": 27637175496960.0, "grad_norm": 2.190951765886415, "language_loss": 0.82078892, "learning_rate": 8.545935722090693e-07, "loss": 0.83627188, "num_input_tokens_seen": 252390525, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.27758789, "step": 11697, "time_per_iteration": 2.735732316970825 }, { "auxiliary_loss_clip": 0.01319326, "auxiliary_loss_mlp": 0.00251372, "balance_loss_clip": 1.08688414, "balance_loss_mlp": 0.22242841, "epoch": 0.7033218097099053, "flos": 17967294092160.0, "grad_norm": 3.5968193162284825, "language_loss": 0.8687942, "learning_rate": 8.542743277341793e-07, "loss": 0.88450116, "num_input_tokens_seen": 252407470, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.28955078, "step": 11698, "time_per_iteration": 2.6966915130615234 }, { "auxiliary_loss_clip": 0.01317074, "auxiliary_loss_mlp": 0.00246025, "balance_loss_clip": 1.08201146, "balance_loss_mlp": 0.21658072, "epoch": 0.7033819329625732, "flos": 19501721233920.0, "grad_norm": 32.06662519110118, "language_loss": 0.91213053, "learning_rate": 8.539551267053222e-07, "loss": 0.92776155, "num_input_tokens_seen": 252427025, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.29443359, "step": 11699, "time_per_iteration": 2.7040956020355225 }, { "auxiliary_loss_clip": 0.01325223, "auxiliary_loss_mlp": 0.00244987, "balance_loss_clip": 1.09637189, "balance_loss_mlp": 0.21883294, "epoch": 0.7034420562152413, "flos": 23987645948160.0, "grad_norm": 15.413865507802537, "language_loss": 0.87578392, "learning_rate": 8.53635969134601e-07, "loss": 0.89148605, "num_input_tokens_seen": 252445410, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.26159668, "step": 11700, "time_per_iteration": 2.749657392501831 }, { "auxiliary_loss_clip": 0.01312332, "auxiliary_loss_mlp": 0.00271446, "balance_loss_clip": 1.08097541, "balance_loss_mlp": 0.24097607, "epoch": 0.7035021794679092, "flos": 35043427756800.0, "grad_norm": 2.3315473357736773, "language_loss": 0.82577407, "learning_rate": 8.533168550341186e-07, "loss": 0.84161192, "num_input_tokens_seen": 252463905, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.3046875, "step": 11701, "time_per_iteration": 2.820862293243408 }, { "auxiliary_loss_clip": 0.01341693, "auxiliary_loss_mlp": 0.00224941, "balance_loss_clip": 1.09823859, "balance_loss_mlp": 0.19563891, "epoch": 0.7035623027205772, "flos": 10997428164480.0, "grad_norm": 15.444516257875412, "language_loss": 0.94879824, "learning_rate": 8.529977844159769e-07, "loss": 0.9644646, "num_input_tokens_seen": 252478655, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.29296875, "step": 11702, "time_per_iteration": 2.7030301094055176 }, { "auxiliary_loss_clip": 0.01320072, "auxiliary_loss_mlp": 0.00251985, "balance_loss_clip": 1.08684635, "balance_loss_mlp": 0.22354189, "epoch": 0.7036224259732452, "flos": 23623727304960.0, "grad_norm": 7.048535511243155, "language_loss": 0.68671012, "learning_rate": 8.526787572922738e-07, "loss": 0.70243073, "num_input_tokens_seen": 252498740, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.2845459, "step": 11703, "time_per_iteration": 2.7521889209747314 }, { "auxiliary_loss_clip": 0.01326608, "auxiliary_loss_mlp": 0.00251341, "balance_loss_clip": 1.09000587, "balance_loss_mlp": 0.22183666, "epoch": 0.7036825492259131, "flos": 31686175175040.0, "grad_norm": 13.170309643139255, "language_loss": 0.71039307, "learning_rate": 8.523597736751067e-07, "loss": 0.72617245, "num_input_tokens_seen": 252517800, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.29492188, "step": 11704, "time_per_iteration": 2.7554047107696533 }, { "auxiliary_loss_clip": 0.01305925, "auxiliary_loss_mlp": 0.00287263, "balance_loss_clip": 1.08352602, "balance_loss_mlp": 0.26088223, "epoch": 0.7037426724785811, "flos": 30192866127360.0, "grad_norm": 233.88713233125634, "language_loss": 0.77621615, "learning_rate": 8.520408335765719e-07, "loss": 0.79214805, "num_input_tokens_seen": 252539620, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26379395, "step": 11705, "time_per_iteration": 2.714193820953369 }, { "auxiliary_loss_clip": 0.01306334, "auxiliary_loss_mlp": 0.00232228, "balance_loss_clip": 1.08286262, "balance_loss_mlp": 0.20674139, "epoch": 0.703802795731249, "flos": 24311523905280.0, "grad_norm": 2.7437645383781217, "language_loss": 0.6983937, "learning_rate": 8.517219370087645e-07, "loss": 0.71377933, "num_input_tokens_seen": 252557300, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25512695, "step": 11706, "time_per_iteration": 2.6913063526153564 }, { "auxiliary_loss_clip": 0.01327118, "auxiliary_loss_mlp": 0.00253749, "balance_loss_clip": 1.09640789, "balance_loss_mlp": 0.22716531, "epoch": 0.7038629189839171, "flos": 22528954632960.0, "grad_norm": 107.75134668138257, "language_loss": 0.77105629, "learning_rate": 8.514030839837756e-07, "loss": 0.78686488, "num_input_tokens_seen": 252576715, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.26574707, "step": 11707, "time_per_iteration": 2.647080659866333 }, { "auxiliary_loss_clip": 0.01313748, "auxiliary_loss_mlp": 0.00227592, "balance_loss_clip": 1.08975935, "balance_loss_mlp": 0.20042384, "epoch": 0.703923042236585, "flos": 26250484993920.0, "grad_norm": 9.098448514624287, "language_loss": 0.82875156, "learning_rate": 8.510842745136974e-07, "loss": 0.84416497, "num_input_tokens_seen": 252596190, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.27172852, "step": 11708, "time_per_iteration": 2.6897099018096924 }, { "auxiliary_loss_clip": 0.013212, "auxiliary_loss_mlp": 0.00229509, "balance_loss_clip": 1.09399998, "balance_loss_mlp": 0.20265129, "epoch": 0.703983165489253, "flos": 19390254353280.0, "grad_norm": 6.275333996928926, "language_loss": 0.80044556, "learning_rate": 8.50765508610619e-07, "loss": 0.8159526, "num_input_tokens_seen": 252613410, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.26818848, "step": 11709, "time_per_iteration": 4.172357559204102 }, { "auxiliary_loss_clip": 0.01322265, "auxiliary_loss_mlp": 0.0025041, "balance_loss_clip": 1.09076929, "balance_loss_mlp": 0.22112057, "epoch": 0.7040432887419209, "flos": 16683630773760.0, "grad_norm": 28.696795050176263, "language_loss": 0.88182455, "learning_rate": 8.504467862866267e-07, "loss": 0.8975513, "num_input_tokens_seen": 252629150, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.29284668, "step": 11710, "time_per_iteration": 2.6173975467681885 }, { "auxiliary_loss_clip": 0.01312818, "auxiliary_loss_mlp": 0.00248064, "balance_loss_clip": 1.07825851, "balance_loss_mlp": 0.2194659, "epoch": 0.7041034119945889, "flos": 21141402203520.0, "grad_norm": 1.8958269495038547, "language_loss": 0.8516022, "learning_rate": 8.501281075538076e-07, "loss": 0.86721104, "num_input_tokens_seen": 252648225, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.2857666, "step": 11711, "time_per_iteration": 4.1078102588653564 }, { "auxiliary_loss_clip": 0.01329073, "auxiliary_loss_mlp": 0.00253856, "balance_loss_clip": 1.09532356, "balance_loss_mlp": 0.22684358, "epoch": 0.7041635352472568, "flos": 16910299549440.0, "grad_norm": 4.067309756814337, "language_loss": 0.84680963, "learning_rate": 8.498094724242457e-07, "loss": 0.86263895, "num_input_tokens_seen": 252665380, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.26977539, "step": 11712, "time_per_iteration": 2.6760692596435547 }, { "auxiliary_loss_clip": 0.01155791, "auxiliary_loss_mlp": 0.00076112, "balance_loss_clip": 1.00832593, "balance_loss_mlp": 0.06724288, "epoch": 0.7042236584999249, "flos": 71681219475840.0, "grad_norm": 0.855935011190982, "language_loss": 0.63870609, "learning_rate": 8.494908809100247e-07, "loss": 0.65102506, "num_input_tokens_seen": 252727950, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.08886719, "step": 11713, "time_per_iteration": 3.213409185409546 }, { "auxiliary_loss_clip": 0.01295079, "auxiliary_loss_mlp": 0.00235854, "balance_loss_clip": 1.0741415, "balance_loss_mlp": 0.20943692, "epoch": 0.7042837817525928, "flos": 28658187590400.0, "grad_norm": 78.5787318600898, "language_loss": 0.79747212, "learning_rate": 8.49172333023225e-07, "loss": 0.81278145, "num_input_tokens_seen": 252746770, "router_z_loss_clip": 2.21191406, "router_z_loss_mlp": 0.26428223, "step": 11714, "time_per_iteration": 4.125335216522217 }, { "auxiliary_loss_clip": 0.01318555, "auxiliary_loss_mlp": 0.00238014, "balance_loss_clip": 1.08986795, "balance_loss_mlp": 0.20963044, "epoch": 0.7043439050052608, "flos": 19753562465280.0, "grad_norm": 22.42929641167915, "language_loss": 0.86562717, "learning_rate": 8.488538287759248e-07, "loss": 0.88119292, "num_input_tokens_seen": 252765610, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.28356934, "step": 11715, "time_per_iteration": 2.6931378841400146 }, { "auxiliary_loss_clip": 0.01331531, "auxiliary_loss_mlp": 0.00253548, "balance_loss_clip": 1.09589267, "balance_loss_mlp": 0.22413862, "epoch": 0.7044040282579288, "flos": 11538529620480.0, "grad_norm": 7.139489232483213, "language_loss": 0.82123238, "learning_rate": 8.485353681802037e-07, "loss": 0.83708322, "num_input_tokens_seen": 252781610, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.29394531, "step": 11716, "time_per_iteration": 2.6041252613067627 }, { "auxiliary_loss_clip": 0.01336551, "auxiliary_loss_mlp": 0.00268123, "balance_loss_clip": 1.09819496, "balance_loss_mlp": 0.24063377, "epoch": 0.7044641515105967, "flos": 33656126722560.0, "grad_norm": 44.30289417993636, "language_loss": 0.75555289, "learning_rate": 8.482169512481358e-07, "loss": 0.77159965, "num_input_tokens_seen": 252800600, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.27490234, "step": 11717, "time_per_iteration": 2.77431321144104 }, { "auxiliary_loss_clip": 0.01306803, "auxiliary_loss_mlp": 0.00242705, "balance_loss_clip": 1.07968259, "balance_loss_mlp": 0.21756384, "epoch": 0.7045242747632647, "flos": 26723859356160.0, "grad_norm": 13.197460340871222, "language_loss": 0.80155879, "learning_rate": 8.478985779917967e-07, "loss": 0.81705385, "num_input_tokens_seen": 252822310, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.25146484, "step": 11718, "time_per_iteration": 4.227887153625488 }, { "auxiliary_loss_clip": 0.01312114, "auxiliary_loss_mlp": 0.00234008, "balance_loss_clip": 1.08426023, "balance_loss_mlp": 0.20684057, "epoch": 0.7045843980159326, "flos": 26797655848320.0, "grad_norm": 5.474984375858945, "language_loss": 0.85110229, "learning_rate": 8.475802484232606e-07, "loss": 0.86656356, "num_input_tokens_seen": 252842355, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.27185059, "step": 11719, "time_per_iteration": 2.7273921966552734 }, { "auxiliary_loss_clip": 0.013184, "auxiliary_loss_mlp": 0.00251984, "balance_loss_clip": 1.08964539, "balance_loss_mlp": 0.22358853, "epoch": 0.7046445212686007, "flos": 41574824363520.0, "grad_norm": 6.1303110187322645, "language_loss": 0.72973633, "learning_rate": 8.472619625545951e-07, "loss": 0.74544024, "num_input_tokens_seen": 252866785, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.28393555, "step": 11720, "time_per_iteration": 2.8446900844573975 }, { "auxiliary_loss_clip": 0.01325549, "auxiliary_loss_mlp": 0.00246485, "balance_loss_clip": 1.09359372, "balance_loss_mlp": 0.21937671, "epoch": 0.7047046445212686, "flos": 15560166113280.0, "grad_norm": 24.839738710101877, "language_loss": 0.88498962, "learning_rate": 8.46943720397872e-07, "loss": 0.90070993, "num_input_tokens_seen": 252881870, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.27087402, "step": 11721, "time_per_iteration": 2.6057255268096924 }, { "auxiliary_loss_clip": 0.01148274, "auxiliary_loss_mlp": 0.00090179, "balance_loss_clip": 1.00112152, "balance_loss_mlp": 0.08092834, "epoch": 0.7047647677739366, "flos": 70410269571840.0, "grad_norm": 0.7485611837238091, "language_loss": 0.64404643, "learning_rate": 8.466255219651582e-07, "loss": 0.65643096, "num_input_tokens_seen": 252951300, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.09228516, "step": 11722, "time_per_iteration": 3.3200857639312744 }, { "auxiliary_loss_clip": 0.01306996, "auxiliary_loss_mlp": 0.00230338, "balance_loss_clip": 1.08157897, "balance_loss_mlp": 0.20439778, "epoch": 0.7048248910266045, "flos": 23660032976640.0, "grad_norm": 3.2987490481092876, "language_loss": 0.74042159, "learning_rate": 8.463073672685211e-07, "loss": 0.75579494, "num_input_tokens_seen": 252971400, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.25952148, "step": 11723, "time_per_iteration": 2.7141129970550537 }, { "auxiliary_loss_clip": 0.01294557, "auxiliary_loss_mlp": 0.00249873, "balance_loss_clip": 1.072896, "balance_loss_mlp": 0.22251439, "epoch": 0.7048850142792725, "flos": 21397158017280.0, "grad_norm": 4.414000866810696, "language_loss": 0.8776021, "learning_rate": 8.459892563200235e-07, "loss": 0.89304638, "num_input_tokens_seen": 252989475, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.27331543, "step": 11724, "time_per_iteration": 2.770357847213745 }, { "auxiliary_loss_clip": 0.01314291, "auxiliary_loss_mlp": 0.00232149, "balance_loss_clip": 1.08776498, "balance_loss_mlp": 0.20457634, "epoch": 0.7049451375319404, "flos": 21648101408640.0, "grad_norm": 313.73858153889097, "language_loss": 0.80119449, "learning_rate": 8.456711891317296e-07, "loss": 0.81665885, "num_input_tokens_seen": 253007220, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.27575684, "step": 11725, "time_per_iteration": 2.6550889015197754 }, { "auxiliary_loss_clip": 0.01310509, "auxiliary_loss_mlp": 0.00216837, "balance_loss_clip": 1.08416402, "balance_loss_mlp": 0.1899316, "epoch": 0.7050052607846085, "flos": 14866802904960.0, "grad_norm": 6.566165361528535, "language_loss": 0.86872017, "learning_rate": 8.453531657156998e-07, "loss": 0.88399357, "num_input_tokens_seen": 253025410, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.26940918, "step": 11726, "time_per_iteration": 2.60640025138855 }, { "auxiliary_loss_clip": 0.01320555, "auxiliary_loss_mlp": 0.00260016, "balance_loss_clip": 1.08888531, "balance_loss_mlp": 0.23278855, "epoch": 0.7050653840372764, "flos": 19241763528960.0, "grad_norm": 2.7977418610118123, "language_loss": 0.77398586, "learning_rate": 8.450351860839931e-07, "loss": 0.78979164, "num_input_tokens_seen": 253043305, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.27209473, "step": 11727, "time_per_iteration": 2.6946444511413574 }, { "auxiliary_loss_clip": 0.01267666, "auxiliary_loss_mlp": 0.00222224, "balance_loss_clip": 1.05684149, "balance_loss_mlp": 0.19838229, "epoch": 0.7051255072899444, "flos": 27780422935680.0, "grad_norm": 2.4528021432575664, "language_loss": 0.76114142, "learning_rate": 8.44717250248668e-07, "loss": 0.77604032, "num_input_tokens_seen": 253062790, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.23852539, "step": 11728, "time_per_iteration": 2.7293336391448975 }, { "auxiliary_loss_clip": 0.01309001, "auxiliary_loss_mlp": 0.00206411, "balance_loss_clip": 1.08035278, "balance_loss_mlp": 0.17884955, "epoch": 0.7051856305426124, "flos": 27892033470720.0, "grad_norm": 15.64677542454446, "language_loss": 0.79671085, "learning_rate": 8.443993582217803e-07, "loss": 0.81186485, "num_input_tokens_seen": 253082055, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27563477, "step": 11729, "time_per_iteration": 2.733164072036743 }, { "auxiliary_loss_clip": 0.01327964, "auxiliary_loss_mlp": 0.00230899, "balance_loss_clip": 1.09201169, "balance_loss_mlp": 0.20191908, "epoch": 0.7052457537952803, "flos": 25043563082880.0, "grad_norm": 19.83512566636921, "language_loss": 0.85310328, "learning_rate": 8.440815100153862e-07, "loss": 0.86869192, "num_input_tokens_seen": 253102575, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.28991699, "step": 11730, "time_per_iteration": 2.6579298973083496 }, { "auxiliary_loss_clip": 0.01303831, "auxiliary_loss_mlp": 0.0021693, "balance_loss_clip": 1.07934999, "balance_loss_mlp": 0.19032297, "epoch": 0.7053058770479483, "flos": 21871717528320.0, "grad_norm": 34.21991092124688, "language_loss": 0.74223852, "learning_rate": 8.437637056415359e-07, "loss": 0.75744617, "num_input_tokens_seen": 253121290, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26611328, "step": 11731, "time_per_iteration": 2.6538937091827393 }, { "auxiliary_loss_clip": 0.01302487, "auxiliary_loss_mlp": 0.00248768, "balance_loss_clip": 1.08030844, "balance_loss_mlp": 0.22313754, "epoch": 0.7053660003006162, "flos": 16398716094720.0, "grad_norm": 60.94869609901482, "language_loss": 0.82757443, "learning_rate": 8.434459451122815e-07, "loss": 0.84308696, "num_input_tokens_seen": 253139720, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.2565918, "step": 11732, "time_per_iteration": 2.6105875968933105 }, { "auxiliary_loss_clip": 0.01318774, "auxiliary_loss_mlp": 0.00226691, "balance_loss_clip": 1.09651589, "balance_loss_mlp": 0.19978571, "epoch": 0.7054261235532843, "flos": 22711560399360.0, "grad_norm": 230.39928559629897, "language_loss": 0.77418, "learning_rate": 8.431282284396735e-07, "loss": 0.78963464, "num_input_tokens_seen": 253160250, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26928711, "step": 11733, "time_per_iteration": 2.6692514419555664 }, { "auxiliary_loss_clip": 0.01287465, "auxiliary_loss_mlp": 0.00233922, "balance_loss_clip": 1.06664228, "balance_loss_mlp": 0.2060028, "epoch": 0.7054862468059522, "flos": 13589711775360.0, "grad_norm": 2.906144032403962, "language_loss": 0.81351227, "learning_rate": 8.428105556357583e-07, "loss": 0.82872611, "num_input_tokens_seen": 253178710, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.27941895, "step": 11734, "time_per_iteration": 2.617344379425049 }, { "auxiliary_loss_clip": 0.01335906, "auxiliary_loss_mlp": 0.00234844, "balance_loss_clip": 1.09389019, "balance_loss_mlp": 0.20663913, "epoch": 0.7055463700586202, "flos": 15880704105600.0, "grad_norm": 3.628348902361387, "language_loss": 0.82472265, "learning_rate": 8.424929267125829e-07, "loss": 0.84043014, "num_input_tokens_seen": 253194805, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.28186035, "step": 11735, "time_per_iteration": 2.658045768737793 }, { "auxiliary_loss_clip": 0.01335467, "auxiliary_loss_mlp": 0.00219686, "balance_loss_clip": 1.09663975, "balance_loss_mlp": 0.1872611, "epoch": 0.7056064933112881, "flos": 23076161400960.0, "grad_norm": 22.46104892926085, "language_loss": 0.81119752, "learning_rate": 8.421753416821933e-07, "loss": 0.82674909, "num_input_tokens_seen": 253213895, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.32421875, "step": 11736, "time_per_iteration": 2.689769983291626 }, { "auxiliary_loss_clip": 0.01305399, "auxiliary_loss_mlp": 0.00219753, "balance_loss_clip": 1.08131957, "balance_loss_mlp": 0.19271615, "epoch": 0.7056666165639561, "flos": 24057168721920.0, "grad_norm": 17.39489262798504, "language_loss": 0.75964153, "learning_rate": 8.41857800556629e-07, "loss": 0.77489305, "num_input_tokens_seen": 253231620, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.27062988, "step": 11737, "time_per_iteration": 2.7491648197174072 }, { "auxiliary_loss_clip": 0.01335472, "auxiliary_loss_mlp": 0.00235325, "balance_loss_clip": 1.09588861, "balance_loss_mlp": 0.2057377, "epoch": 0.705726739816624, "flos": 17493237371520.0, "grad_norm": 15.081980981608442, "language_loss": 0.78224897, "learning_rate": 8.415403033479332e-07, "loss": 0.797957, "num_input_tokens_seen": 253249590, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.29577637, "step": 11738, "time_per_iteration": 2.6664719581604004 }, { "auxiliary_loss_clip": 0.0132925, "auxiliary_loss_mlp": 0.00220611, "balance_loss_clip": 1.09363246, "balance_loss_mlp": 0.19048658, "epoch": 0.7057868630692921, "flos": 51350426472960.0, "grad_norm": 32.42702375695564, "language_loss": 0.83672082, "learning_rate": 8.41222850068145e-07, "loss": 0.85221946, "num_input_tokens_seen": 253273870, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.30114746, "step": 11739, "time_per_iteration": 2.9577319622039795 }, { "auxiliary_loss_clip": 0.01283925, "auxiliary_loss_mlp": 0.00219629, "balance_loss_clip": 1.062837, "balance_loss_mlp": 0.19330731, "epoch": 0.70584698632196, "flos": 26102963836800.0, "grad_norm": 28.762442675812515, "language_loss": 0.78459811, "learning_rate": 8.409054407293032e-07, "loss": 0.79963362, "num_input_tokens_seen": 253293720, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26281738, "step": 11740, "time_per_iteration": 2.7515838146209717 }, { "auxiliary_loss_clip": 0.01287157, "auxiliary_loss_mlp": 0.00212783, "balance_loss_clip": 1.06664169, "balance_loss_mlp": 0.18723664, "epoch": 0.705907109574628, "flos": 21543134889600.0, "grad_norm": 196.53078447009077, "language_loss": 0.89626062, "learning_rate": 8.405880753434434e-07, "loss": 0.91126001, "num_input_tokens_seen": 253313700, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25537109, "step": 11741, "time_per_iteration": 2.6962642669677734 }, { "auxiliary_loss_clip": 0.01309614, "auxiliary_loss_mlp": 0.00231083, "balance_loss_clip": 1.08166611, "balance_loss_mlp": 0.20182911, "epoch": 0.705967232827296, "flos": 22710842127360.0, "grad_norm": 1441.677851573194, "language_loss": 0.87249136, "learning_rate": 8.402707539225993e-07, "loss": 0.88789827, "num_input_tokens_seen": 253332425, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.29284668, "step": 11742, "time_per_iteration": 2.6889233589172363 }, { "auxiliary_loss_clip": 0.0131415, "auxiliary_loss_mlp": 0.00257398, "balance_loss_clip": 1.07974052, "balance_loss_mlp": 0.22783412, "epoch": 0.7060273560799639, "flos": 28691225124480.0, "grad_norm": 3.728993143273088, "language_loss": 0.72883701, "learning_rate": 8.39953476478805e-07, "loss": 0.74455249, "num_input_tokens_seen": 253353620, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.2956543, "step": 11743, "time_per_iteration": 2.7079827785491943 }, { "auxiliary_loss_clip": 0.01321113, "auxiliary_loss_mlp": 0.00240838, "balance_loss_clip": 1.08624458, "balance_loss_mlp": 0.21296667, "epoch": 0.7060874793326319, "flos": 15706178899200.0, "grad_norm": 21.459992160720404, "language_loss": 0.73772317, "learning_rate": 8.396362430240902e-07, "loss": 0.75334275, "num_input_tokens_seen": 253370930, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.27832031, "step": 11744, "time_per_iteration": 2.6887686252593994 }, { "auxiliary_loss_clip": 0.01307566, "auxiliary_loss_mlp": 0.00231633, "balance_loss_clip": 1.08528531, "balance_loss_mlp": 0.20575297, "epoch": 0.7061476025852998, "flos": 21506757390720.0, "grad_norm": 4.629380566651849, "language_loss": 0.72482294, "learning_rate": 8.393190535704857e-07, "loss": 0.74021494, "num_input_tokens_seen": 253389810, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25866699, "step": 11745, "time_per_iteration": 2.632183074951172 }, { "auxiliary_loss_clip": 0.01310914, "auxiliary_loss_mlp": 0.00233282, "balance_loss_clip": 1.08735764, "balance_loss_mlp": 0.20460053, "epoch": 0.7062077258379679, "flos": 28181832399360.0, "grad_norm": 2.5529946609183995, "language_loss": 0.7719962, "learning_rate": 8.390019081300188e-07, "loss": 0.78743815, "num_input_tokens_seen": 253408685, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.28674316, "step": 11746, "time_per_iteration": 2.778212070465088 }, { "auxiliary_loss_clip": 0.01293444, "auxiliary_loss_mlp": 0.00219416, "balance_loss_clip": 1.06996655, "balance_loss_mlp": 0.19090064, "epoch": 0.7062678490906358, "flos": 27853680723840.0, "grad_norm": 7.565634419161394, "language_loss": 0.84946394, "learning_rate": 8.386848067147175e-07, "loss": 0.86459249, "num_input_tokens_seen": 253429685, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.28491211, "step": 11747, "time_per_iteration": 2.7247676849365234 }, { "auxiliary_loss_clip": 0.0128311, "auxiliary_loss_mlp": 0.00223814, "balance_loss_clip": 1.06855845, "balance_loss_mlp": 0.19920962, "epoch": 0.7063279723433038, "flos": 23184862934400.0, "grad_norm": 60.60335225987283, "language_loss": 0.72048271, "learning_rate": 8.383677493366031e-07, "loss": 0.73555195, "num_input_tokens_seen": 253448260, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24621582, "step": 11748, "time_per_iteration": 2.6634669303894043 }, { "auxiliary_loss_clip": 0.01334053, "auxiliary_loss_mlp": 0.00260521, "balance_loss_clip": 1.10058284, "balance_loss_mlp": 0.23281638, "epoch": 0.7063880955959717, "flos": 20188655907840.0, "grad_norm": 15.83648273847332, "language_loss": 0.89145207, "learning_rate": 8.380507360077003e-07, "loss": 0.90739775, "num_input_tokens_seen": 253467725, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.27709961, "step": 11749, "time_per_iteration": 2.7296640872955322 }, { "auxiliary_loss_clip": 0.01144363, "auxiliary_loss_mlp": 0.00071639, "balance_loss_clip": 0.99257708, "balance_loss_mlp": 0.06372344, "epoch": 0.7064482188486397, "flos": 63668182763520.0, "grad_norm": 0.7821520493582876, "language_loss": 0.53388321, "learning_rate": 8.377337667400304e-07, "loss": 0.54604316, "num_input_tokens_seen": 253526940, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.07910156, "step": 11750, "time_per_iteration": 3.1415154933929443 }, { "auxiliary_loss_clip": 0.01309889, "auxiliary_loss_mlp": 0.00231918, "balance_loss_clip": 1.0841428, "balance_loss_mlp": 0.20672946, "epoch": 0.7065083421013076, "flos": 25191227894400.0, "grad_norm": 50.22901524468032, "language_loss": 0.88308692, "learning_rate": 8.37416841545612e-07, "loss": 0.89850503, "num_input_tokens_seen": 253546160, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.25170898, "step": 11751, "time_per_iteration": 2.730597734451294 }, { "auxiliary_loss_clip": 0.0130193, "auxiliary_loss_mlp": 0.00201096, "balance_loss_clip": 1.07811129, "balance_loss_mlp": 0.17444067, "epoch": 0.7065684653539757, "flos": 22893699288960.0, "grad_norm": 4.8794327112594855, "language_loss": 0.7584548, "learning_rate": 8.370999604364634e-07, "loss": 0.77348506, "num_input_tokens_seen": 253565505, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26660156, "step": 11752, "time_per_iteration": 4.065606355667114 }, { "auxiliary_loss_clip": 0.01317368, "auxiliary_loss_mlp": 0.00239564, "balance_loss_clip": 1.08648133, "balance_loss_mlp": 0.21042901, "epoch": 0.7066285886066436, "flos": 23550254035200.0, "grad_norm": 3.4223805384649832, "language_loss": 0.87047994, "learning_rate": 8.367831234246025e-07, "loss": 0.88604927, "num_input_tokens_seen": 253585125, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.29125977, "step": 11753, "time_per_iteration": 4.157552003860474 }, { "auxiliary_loss_clip": 0.0130819, "auxiliary_loss_mlp": 0.00221703, "balance_loss_clip": 1.0860424, "balance_loss_mlp": 0.1966095, "epoch": 0.7066887118593116, "flos": 21069293650560.0, "grad_norm": 17.561498410063667, "language_loss": 0.7656002, "learning_rate": 8.364663305220405e-07, "loss": 0.78089917, "num_input_tokens_seen": 253604815, "router_z_loss_clip": 2.22753906, "router_z_loss_mlp": 0.25085449, "step": 11754, "time_per_iteration": 2.735171318054199 }, { "auxiliary_loss_clip": 0.01312966, "auxiliary_loss_mlp": 0.00222381, "balance_loss_clip": 1.08722937, "balance_loss_mlp": 0.19577345, "epoch": 0.7067488351119796, "flos": 21176307244800.0, "grad_norm": 16.025329457683053, "language_loss": 0.94642526, "learning_rate": 8.361495817407919e-07, "loss": 0.96177876, "num_input_tokens_seen": 253622855, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26599121, "step": 11755, "time_per_iteration": 2.6536967754364014 }, { "auxiliary_loss_clip": 0.01291818, "auxiliary_loss_mlp": 0.00235595, "balance_loss_clip": 1.07401145, "balance_loss_mlp": 0.20986995, "epoch": 0.7068089583646475, "flos": 20449224144000.0, "grad_norm": 2.952421877152004, "language_loss": 0.88228846, "learning_rate": 8.358328770928678e-07, "loss": 0.89756262, "num_input_tokens_seen": 253642760, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25708008, "step": 11756, "time_per_iteration": 4.105769872665405 }, { "auxiliary_loss_clip": 0.01148786, "auxiliary_loss_mlp": 0.00061118, "balance_loss_clip": 0.99933922, "balance_loss_mlp": 0.05124793, "epoch": 0.7068690816173155, "flos": 59109179829120.0, "grad_norm": 0.823877723369298, "language_loss": 0.59542727, "learning_rate": 8.355162165902785e-07, "loss": 0.6075263, "num_input_tokens_seen": 253695685, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.09863281, "step": 11757, "time_per_iteration": 2.967616558074951 }, { "auxiliary_loss_clip": 0.01316745, "auxiliary_loss_mlp": 0.00265007, "balance_loss_clip": 1.09149361, "balance_loss_mlp": 0.23794615, "epoch": 0.7069292048699835, "flos": 16251554073600.0, "grad_norm": 7.8541397859173605, "language_loss": 0.89002889, "learning_rate": 8.351996002450307e-07, "loss": 0.90584636, "num_input_tokens_seen": 253713305, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.27050781, "step": 11758, "time_per_iteration": 2.6295113563537598 }, { "auxiliary_loss_clip": 0.01306234, "auxiliary_loss_mlp": 0.00243387, "balance_loss_clip": 1.08393776, "balance_loss_mlp": 0.21575466, "epoch": 0.7069893281226515, "flos": 41172768455040.0, "grad_norm": 4.190992112918625, "language_loss": 0.84691751, "learning_rate": 8.348830280691304e-07, "loss": 0.8624137, "num_input_tokens_seen": 253736100, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.27648926, "step": 11759, "time_per_iteration": 2.8372421264648438 }, { "auxiliary_loss_clip": 0.01306034, "auxiliary_loss_mlp": 0.0020472, "balance_loss_clip": 1.08065867, "balance_loss_mlp": 0.17839819, "epoch": 0.7070494513753194, "flos": 24207275658240.0, "grad_norm": 30.358461241256776, "language_loss": 0.76064342, "learning_rate": 8.34566500074583e-07, "loss": 0.77575094, "num_input_tokens_seen": 253757350, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.26318359, "step": 11760, "time_per_iteration": 4.148841619491577 }, { "auxiliary_loss_clip": 0.01316615, "auxiliary_loss_mlp": 0.00229283, "balance_loss_clip": 1.08502531, "balance_loss_mlp": 0.20200787, "epoch": 0.7071095746279874, "flos": 20185675079040.0, "grad_norm": 735.3354563176207, "language_loss": 0.856749, "learning_rate": 8.342500162733899e-07, "loss": 0.872208, "num_input_tokens_seen": 253772855, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.27270508, "step": 11761, "time_per_iteration": 2.6712989807128906 }, { "auxiliary_loss_clip": 0.01313266, "auxiliary_loss_mlp": 0.00233881, "balance_loss_clip": 1.08972406, "balance_loss_mlp": 0.20746401, "epoch": 0.7071696978806553, "flos": 18183045133440.0, "grad_norm": 7.372245888337569, "language_loss": 0.82323182, "learning_rate": 8.33933576677553e-07, "loss": 0.83870327, "num_input_tokens_seen": 253790360, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26403809, "step": 11762, "time_per_iteration": 2.675609588623047 }, { "auxiliary_loss_clip": 0.01299462, "auxiliary_loss_mlp": 0.00226695, "balance_loss_clip": 1.08113039, "balance_loss_mlp": 0.20088658, "epoch": 0.7072298211333233, "flos": 24131719399680.0, "grad_norm": 7.651089139557024, "language_loss": 0.84264308, "learning_rate": 8.336171812990724e-07, "loss": 0.85790467, "num_input_tokens_seen": 253810585, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25793457, "step": 11763, "time_per_iteration": 2.663773775100708 }, { "auxiliary_loss_clip": 0.0131509, "auxiliary_loss_mlp": 0.00244894, "balance_loss_clip": 1.0856005, "balance_loss_mlp": 0.21600926, "epoch": 0.7072899443859912, "flos": 27198418867200.0, "grad_norm": 145.79413998867074, "language_loss": 0.87113613, "learning_rate": 8.333008301499453e-07, "loss": 0.88673598, "num_input_tokens_seen": 253829080, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.28894043, "step": 11764, "time_per_iteration": 2.7702624797821045 }, { "auxiliary_loss_clip": 0.01302811, "auxiliary_loss_mlp": 0.0023128, "balance_loss_clip": 1.08003044, "balance_loss_mlp": 0.20437488, "epoch": 0.7073500676386593, "flos": 16435596384000.0, "grad_norm": 7.610852136064226, "language_loss": 0.87211096, "learning_rate": 8.32984523242167e-07, "loss": 0.88745183, "num_input_tokens_seen": 253846780, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.26940918, "step": 11765, "time_per_iteration": 2.6413052082061768 }, { "auxiliary_loss_clip": 0.01312833, "auxiliary_loss_mlp": 0.00247549, "balance_loss_clip": 1.08924675, "balance_loss_mlp": 0.22028607, "epoch": 0.7074101908913272, "flos": 27673732563840.0, "grad_norm": 200.82270234390404, "language_loss": 0.75362372, "learning_rate": 8.326682605877324e-07, "loss": 0.76922756, "num_input_tokens_seen": 253867075, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.27270508, "step": 11766, "time_per_iteration": 2.772528886795044 }, { "auxiliary_loss_clip": 0.0133011, "auxiliary_loss_mlp": 0.00244982, "balance_loss_clip": 1.09978426, "balance_loss_mlp": 0.21665801, "epoch": 0.7074703141439952, "flos": 22238078296320.0, "grad_norm": 15.652506593538734, "language_loss": 0.72292864, "learning_rate": 8.323520421986352e-07, "loss": 0.73867953, "num_input_tokens_seen": 253885790, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.28295898, "step": 11767, "time_per_iteration": 2.7287936210632324 }, { "auxiliary_loss_clip": 0.01315436, "auxiliary_loss_mlp": 0.0022021, "balance_loss_clip": 1.08451641, "balance_loss_mlp": 0.19174254, "epoch": 0.7075304373966632, "flos": 29643217234560.0, "grad_norm": 173.5241611600249, "language_loss": 0.61132354, "learning_rate": 8.320358680868646e-07, "loss": 0.62667996, "num_input_tokens_seen": 253907070, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.28417969, "step": 11768, "time_per_iteration": 2.7615857124328613 }, { "auxiliary_loss_clip": 0.01290216, "auxiliary_loss_mlp": 0.00208498, "balance_loss_clip": 1.07181776, "balance_loss_mlp": 0.18206942, "epoch": 0.7075905606493311, "flos": 19755214490880.0, "grad_norm": 2.71677225170037, "language_loss": 0.82408965, "learning_rate": 8.317197382644119e-07, "loss": 0.83907682, "num_input_tokens_seen": 253927290, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.26428223, "step": 11769, "time_per_iteration": 2.692446708679199 }, { "auxiliary_loss_clip": 0.01174854, "auxiliary_loss_mlp": 0.00093828, "balance_loss_clip": 1.02405286, "balance_loss_mlp": 0.08543548, "epoch": 0.7076506839019991, "flos": 65716132694400.0, "grad_norm": 0.8362946786902048, "language_loss": 0.61487353, "learning_rate": 8.314036527432637e-07, "loss": 0.62756038, "num_input_tokens_seen": 253983440, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.08398438, "step": 11770, "time_per_iteration": 3.1053028106689453 }, { "auxiliary_loss_clip": 0.0133586, "auxiliary_loss_mlp": 0.00235104, "balance_loss_clip": 1.10499501, "balance_loss_mlp": 0.20796019, "epoch": 0.707710807154667, "flos": 23765286804480.0, "grad_norm": 5.5328402097002245, "language_loss": 0.82802981, "learning_rate": 8.310876115354055e-07, "loss": 0.84373939, "num_input_tokens_seen": 254003825, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.27160645, "step": 11771, "time_per_iteration": 2.751283884048462 }, { "auxiliary_loss_clip": 0.01305564, "auxiliary_loss_mlp": 0.00236505, "balance_loss_clip": 1.08369279, "balance_loss_mlp": 0.20772775, "epoch": 0.7077709304073351, "flos": 21251360712960.0, "grad_norm": 163.39558080410222, "language_loss": 0.79734862, "learning_rate": 8.307716146528221e-07, "loss": 0.81276929, "num_input_tokens_seen": 254023345, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.28808594, "step": 11772, "time_per_iteration": 2.6969833374023438 }, { "auxiliary_loss_clip": 0.01315169, "auxiliary_loss_mlp": 0.00217284, "balance_loss_clip": 1.08569837, "balance_loss_mlp": 0.18991369, "epoch": 0.707831053660003, "flos": 20740746925440.0, "grad_norm": 25.933654063051723, "language_loss": 0.80892909, "learning_rate": 8.30455662107496e-07, "loss": 0.82425362, "num_input_tokens_seen": 254041815, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.27380371, "step": 11773, "time_per_iteration": 2.692239284515381 }, { "auxiliary_loss_clip": 0.01332292, "auxiliary_loss_mlp": 0.00255219, "balance_loss_clip": 1.10147107, "balance_loss_mlp": 0.22802764, "epoch": 0.707891176912671, "flos": 21980993679360.0, "grad_norm": 30.936194627872844, "language_loss": 0.77894199, "learning_rate": 8.301397539114095e-07, "loss": 0.79481709, "num_input_tokens_seen": 254062065, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.27197266, "step": 11774, "time_per_iteration": 2.6728408336639404 }, { "auxiliary_loss_clip": 0.01297431, "auxiliary_loss_mlp": 0.00235735, "balance_loss_clip": 1.08267379, "balance_loss_mlp": 0.21004577, "epoch": 0.7079513001653389, "flos": 21068970428160.0, "grad_norm": 1364.1114897444427, "language_loss": 0.81514478, "learning_rate": 8.298238900765407e-07, "loss": 0.8304764, "num_input_tokens_seen": 254080605, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.25695801, "step": 11775, "time_per_iteration": 2.6735169887542725 }, { "auxiliary_loss_clip": 0.01318235, "auxiliary_loss_mlp": 0.00262022, "balance_loss_clip": 1.09220397, "balance_loss_mlp": 0.23413911, "epoch": 0.7080114234180069, "flos": 18040659621120.0, "grad_norm": 123.84558346286963, "language_loss": 0.93919879, "learning_rate": 8.295080706148665e-07, "loss": 0.95500135, "num_input_tokens_seen": 254098710, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.27856445, "step": 11776, "time_per_iteration": 2.626718759536743 }, { "auxiliary_loss_clip": 0.0130368, "auxiliary_loss_mlp": 0.00227973, "balance_loss_clip": 1.07993364, "balance_loss_mlp": 0.19998288, "epoch": 0.7080715466706748, "flos": 15122271409920.0, "grad_norm": 68.63603195095418, "language_loss": 0.8186906, "learning_rate": 8.291922955383641e-07, "loss": 0.83400714, "num_input_tokens_seen": 254117200, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.28015137, "step": 11777, "time_per_iteration": 2.6779003143310547 }, { "auxiliary_loss_clip": 0.01343658, "auxiliary_loss_mlp": 0.00254166, "balance_loss_clip": 1.10791063, "balance_loss_mlp": 0.22535291, "epoch": 0.7081316699233429, "flos": 14422802889600.0, "grad_norm": 6.3698679744293765, "language_loss": 0.90174735, "learning_rate": 8.288765648590066e-07, "loss": 0.91772568, "num_input_tokens_seen": 254132115, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.28845215, "step": 11778, "time_per_iteration": 2.611539602279663 }, { "auxiliary_loss_clip": 0.01311736, "auxiliary_loss_mlp": 0.00223513, "balance_loss_clip": 1.08899486, "balance_loss_mlp": 0.19822881, "epoch": 0.7081917931760108, "flos": 23222389668480.0, "grad_norm": 2.0723821926668036, "language_loss": 0.906708, "learning_rate": 8.285608785887673e-07, "loss": 0.92206055, "num_input_tokens_seen": 254152285, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25280762, "step": 11779, "time_per_iteration": 2.7006278038024902 }, { "auxiliary_loss_clip": 0.01329387, "auxiliary_loss_mlp": 0.00238015, "balance_loss_clip": 1.09784985, "balance_loss_mlp": 0.21171729, "epoch": 0.7082519164286788, "flos": 39308429871360.0, "grad_norm": 10.121596812940599, "language_loss": 0.78394639, "learning_rate": 8.28245236739618e-07, "loss": 0.79962045, "num_input_tokens_seen": 254172805, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.26293945, "step": 11780, "time_per_iteration": 2.848233699798584 }, { "auxiliary_loss_clip": 0.01319947, "auxiliary_loss_mlp": 0.00205106, "balance_loss_clip": 1.09083533, "balance_loss_mlp": 0.17856991, "epoch": 0.7083120396813467, "flos": 21651154064640.0, "grad_norm": 244.0619349576848, "language_loss": 0.80012584, "learning_rate": 8.279296393235256e-07, "loss": 0.81537634, "num_input_tokens_seen": 254191890, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.26538086, "step": 11781, "time_per_iteration": 2.704643487930298 }, { "auxiliary_loss_clip": 0.01299745, "auxiliary_loss_mlp": 0.00217526, "balance_loss_clip": 1.07875276, "balance_loss_mlp": 0.19258785, "epoch": 0.7083721629340147, "flos": 17567033863680.0, "grad_norm": 12.916079052207705, "language_loss": 0.85229313, "learning_rate": 8.276140863524585e-07, "loss": 0.86746585, "num_input_tokens_seen": 254210150, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.24975586, "step": 11782, "time_per_iteration": 2.6962387561798096 }, { "auxiliary_loss_clip": 0.01297354, "auxiliary_loss_mlp": 0.00214264, "balance_loss_clip": 1.07921827, "balance_loss_mlp": 0.19081597, "epoch": 0.7084322861866827, "flos": 29350509304320.0, "grad_norm": 2.1652447612010053, "language_loss": 0.75775671, "learning_rate": 8.272985778383828e-07, "loss": 0.77287292, "num_input_tokens_seen": 254233015, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.23449707, "step": 11783, "time_per_iteration": 2.747670888900757 }, { "auxiliary_loss_clip": 0.01320161, "auxiliary_loss_mlp": 0.0024293, "balance_loss_clip": 1.09177828, "balance_loss_mlp": 0.21575026, "epoch": 0.7084924094393507, "flos": 20194294343040.0, "grad_norm": 74.80175211946299, "language_loss": 0.85525769, "learning_rate": 8.269831137932632e-07, "loss": 0.87088865, "num_input_tokens_seen": 254251345, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27185059, "step": 11784, "time_per_iteration": 2.701885461807251 }, { "auxiliary_loss_clip": 0.01317298, "auxiliary_loss_mlp": 0.00212169, "balance_loss_clip": 1.09566522, "balance_loss_mlp": 0.18513253, "epoch": 0.7085525326920187, "flos": 23477211728640.0, "grad_norm": 17.08869940962551, "language_loss": 0.85103178, "learning_rate": 8.266676942290609e-07, "loss": 0.86632645, "num_input_tokens_seen": 254269905, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.27038574, "step": 11785, "time_per_iteration": 2.6550745964050293 }, { "auxiliary_loss_clip": 0.0131782, "auxiliary_loss_mlp": 0.00238683, "balance_loss_clip": 1.08891094, "balance_loss_mlp": 0.21084802, "epoch": 0.7086126559446866, "flos": 25958818558080.0, "grad_norm": 4.989566989739853, "language_loss": 0.84086692, "learning_rate": 8.26352319157738e-07, "loss": 0.85643196, "num_input_tokens_seen": 254289990, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.27832031, "step": 11786, "time_per_iteration": 2.75182843208313 }, { "auxiliary_loss_clip": 0.01344697, "auxiliary_loss_mlp": 0.00250227, "balance_loss_clip": 1.10377717, "balance_loss_mlp": 0.21985272, "epoch": 0.7086727791973546, "flos": 26724793109760.0, "grad_norm": 1657.7014686987259, "language_loss": 0.86547232, "learning_rate": 8.260369885912526e-07, "loss": 0.88142151, "num_input_tokens_seen": 254309085, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.30383301, "step": 11787, "time_per_iteration": 2.710374593734741 }, { "auxiliary_loss_clip": 0.01319059, "auxiliary_loss_mlp": 0.00239937, "balance_loss_clip": 1.08884549, "balance_loss_mlp": 0.21325818, "epoch": 0.7087329024500225, "flos": 21683365585920.0, "grad_norm": 51.61730053589012, "language_loss": 0.84958708, "learning_rate": 8.257217025415615e-07, "loss": 0.86517704, "num_input_tokens_seen": 254327045, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.26660156, "step": 11788, "time_per_iteration": 2.6813700199127197 }, { "auxiliary_loss_clip": 0.01360167, "auxiliary_loss_mlp": 0.00257817, "balance_loss_clip": 1.11741817, "balance_loss_mlp": 0.22662038, "epoch": 0.7087930257026905, "flos": 17931060247680.0, "grad_norm": 15.412505344874223, "language_loss": 0.7955755, "learning_rate": 8.254064610206212e-07, "loss": 0.8117553, "num_input_tokens_seen": 254344585, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.31188965, "step": 11789, "time_per_iteration": 2.6369714736938477 }, { "auxiliary_loss_clip": 0.01332695, "auxiliary_loss_mlp": 0.00251012, "balance_loss_clip": 1.10202718, "balance_loss_mlp": 0.22255713, "epoch": 0.7088531489553584, "flos": 18911528864640.0, "grad_norm": 9.018995081709546, "language_loss": 0.84803426, "learning_rate": 8.250912640403858e-07, "loss": 0.86387134, "num_input_tokens_seen": 254362470, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.2845459, "step": 11790, "time_per_iteration": 2.678720235824585 }, { "auxiliary_loss_clip": 0.01335088, "auxiliary_loss_mlp": 0.00257669, "balance_loss_clip": 1.09826136, "balance_loss_mlp": 0.22909483, "epoch": 0.7089132722080265, "flos": 27380880979200.0, "grad_norm": 189.95478257218957, "language_loss": 0.81768656, "learning_rate": 8.247761116128085e-07, "loss": 0.83361411, "num_input_tokens_seen": 254383190, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.28540039, "step": 11791, "time_per_iteration": 2.728337049484253 }, { "auxiliary_loss_clip": 0.0131831, "auxiliary_loss_mlp": 0.00251133, "balance_loss_clip": 1.09359479, "balance_loss_mlp": 0.2238695, "epoch": 0.7089733954606944, "flos": 22162917087360.0, "grad_norm": 52.750406647008674, "language_loss": 0.89692217, "learning_rate": 8.244610037498376e-07, "loss": 0.91261661, "num_input_tokens_seen": 254403115, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.27282715, "step": 11792, "time_per_iteration": 2.649060010910034 }, { "auxiliary_loss_clip": 0.01333196, "auxiliary_loss_mlp": 0.00234937, "balance_loss_clip": 1.09487367, "balance_loss_mlp": 0.20501527, "epoch": 0.7090335187133624, "flos": 24425827960320.0, "grad_norm": 8.639870072237386, "language_loss": 0.73816812, "learning_rate": 8.241459404634232e-07, "loss": 0.75384951, "num_input_tokens_seen": 254421875, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.29931641, "step": 11793, "time_per_iteration": 2.676018714904785 }, { "auxiliary_loss_clip": 0.012995, "auxiliary_loss_mlp": 0.00250214, "balance_loss_clip": 1.07636631, "balance_loss_mlp": 0.2238811, "epoch": 0.7090936419660303, "flos": 21835232288640.0, "grad_norm": 13.30590651833809, "language_loss": 0.77432221, "learning_rate": 8.238309217655133e-07, "loss": 0.78981942, "num_input_tokens_seen": 254440765, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26342773, "step": 11794, "time_per_iteration": 4.08112907409668 }, { "auxiliary_loss_clip": 0.01322997, "auxiliary_loss_mlp": 0.00244229, "balance_loss_clip": 1.09356093, "balance_loss_mlp": 0.21777654, "epoch": 0.7091537652186983, "flos": 20082360585600.0, "grad_norm": 6.699976676901574, "language_loss": 0.81575048, "learning_rate": 8.23515947668052e-07, "loss": 0.83142269, "num_input_tokens_seen": 254459480, "router_z_loss_clip": 2.29589844, "router_z_loss_mlp": 0.26452637, "step": 11795, "time_per_iteration": 4.156214714050293 }, { "auxiliary_loss_clip": 0.01312425, "auxiliary_loss_mlp": 0.0024266, "balance_loss_clip": 1.08181763, "balance_loss_mlp": 0.21559973, "epoch": 0.7092138884713663, "flos": 13151565676800.0, "grad_norm": 99.41797874437958, "language_loss": 0.84658235, "learning_rate": 8.232010181829838e-07, "loss": 0.8621332, "num_input_tokens_seen": 254473985, "router_z_loss_clip": 2.30566406, "router_z_loss_mlp": 0.27062988, "step": 11796, "time_per_iteration": 2.656310796737671 }, { "auxiliary_loss_clip": 0.01325841, "auxiliary_loss_mlp": 0.00266719, "balance_loss_clip": 1.08801007, "balance_loss_mlp": 0.23595136, "epoch": 0.7092740117240343, "flos": 21645982506240.0, "grad_norm": 3.3170342866029996, "language_loss": 0.8135649, "learning_rate": 8.228861333222523e-07, "loss": 0.82949054, "num_input_tokens_seen": 254492135, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.30786133, "step": 11797, "time_per_iteration": 2.627934455871582 }, { "auxiliary_loss_clip": 0.0131857, "auxiliary_loss_mlp": 0.00233344, "balance_loss_clip": 1.0920707, "balance_loss_mlp": 0.20627198, "epoch": 0.7093341349767023, "flos": 21032521102080.0, "grad_norm": 6.062186458002524, "language_loss": 0.86124688, "learning_rate": 8.225712930977953e-07, "loss": 0.87676609, "num_input_tokens_seen": 254512865, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.27087402, "step": 11798, "time_per_iteration": 4.1195783615112305 }, { "auxiliary_loss_clip": 0.01295147, "auxiliary_loss_mlp": 0.00237158, "balance_loss_clip": 1.07578409, "balance_loss_mlp": 0.21189737, "epoch": 0.7093942582293702, "flos": 22017658487040.0, "grad_norm": 367.14708351276334, "language_loss": 0.73859, "learning_rate": 8.222564975215529e-07, "loss": 0.7539131, "num_input_tokens_seen": 254532605, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25268555, "step": 11799, "time_per_iteration": 2.7658236026763916 }, { "auxiliary_loss_clip": 0.01306503, "auxiliary_loss_mlp": 0.00231256, "balance_loss_clip": 1.08147573, "balance_loss_mlp": 0.20367077, "epoch": 0.7094543814820382, "flos": 27235586465280.0, "grad_norm": 51.69963050462977, "language_loss": 0.89106786, "learning_rate": 8.219417466054622e-07, "loss": 0.90644544, "num_input_tokens_seen": 254553780, "router_z_loss_clip": 2.24902344, "router_z_loss_mlp": 0.27624512, "step": 11800, "time_per_iteration": 2.759904146194458 }, { "auxiliary_loss_clip": 0.01302847, "auxiliary_loss_mlp": 0.00233453, "balance_loss_clip": 1.08015442, "balance_loss_mlp": 0.2060115, "epoch": 0.7095145047347061, "flos": 12089148180480.0, "grad_norm": 5.529268432127161, "language_loss": 0.94055223, "learning_rate": 8.21627040361459e-07, "loss": 0.95591527, "num_input_tokens_seen": 254567510, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.27441406, "step": 11801, "time_per_iteration": 2.6875314712524414 }, { "auxiliary_loss_clip": 0.01303428, "auxiliary_loss_mlp": 0.00221785, "balance_loss_clip": 1.07665348, "balance_loss_mlp": 0.19498691, "epoch": 0.7095746279873741, "flos": 19383789905280.0, "grad_norm": 15.311859806525256, "language_loss": 0.84082711, "learning_rate": 8.213123788014758e-07, "loss": 0.85607922, "num_input_tokens_seen": 254585565, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26782227, "step": 11802, "time_per_iteration": 2.6406784057617188 }, { "auxiliary_loss_clip": 0.01307315, "auxiliary_loss_mlp": 0.00259105, "balance_loss_clip": 1.07991576, "balance_loss_mlp": 0.23037511, "epoch": 0.709634751240042, "flos": 21360600950400.0, "grad_norm": 4.100908213751981, "language_loss": 0.90386236, "learning_rate": 8.209977619374462e-07, "loss": 0.91952658, "num_input_tokens_seen": 254603465, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.28735352, "step": 11803, "time_per_iteration": 4.037668943405151 }, { "auxiliary_loss_clip": 0.01303068, "auxiliary_loss_mlp": 0.00212109, "balance_loss_clip": 1.07581019, "balance_loss_mlp": 0.18593046, "epoch": 0.7096948744927101, "flos": 13917037438080.0, "grad_norm": 17.77275933572515, "language_loss": 0.77895284, "learning_rate": 8.206831897812995e-07, "loss": 0.7941047, "num_input_tokens_seen": 254620500, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26196289, "step": 11804, "time_per_iteration": 2.7181293964385986 }, { "auxiliary_loss_clip": 0.01284986, "auxiliary_loss_mlp": 0.0023195, "balance_loss_clip": 1.07100308, "balance_loss_mlp": 0.20666611, "epoch": 0.709754997745378, "flos": 30298335436800.0, "grad_norm": 22.154516640777494, "language_loss": 0.85323536, "learning_rate": 8.203686623449637e-07, "loss": 0.86840475, "num_input_tokens_seen": 254638565, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.25292969, "step": 11805, "time_per_iteration": 2.7226836681365967 }, { "auxiliary_loss_clip": 0.0129799, "auxiliary_loss_mlp": 0.00239087, "balance_loss_clip": 1.0788151, "balance_loss_mlp": 0.21079835, "epoch": 0.709815120998046, "flos": 18515147304960.0, "grad_norm": 7.129100229469187, "language_loss": 0.89134413, "learning_rate": 8.200541796403667e-07, "loss": 0.90671492, "num_input_tokens_seen": 254657505, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.28259277, "step": 11806, "time_per_iteration": 2.6834018230438232 }, { "auxiliary_loss_clip": 0.01328154, "auxiliary_loss_mlp": 0.0023504, "balance_loss_clip": 1.09248495, "balance_loss_mlp": 0.20606007, "epoch": 0.7098752442507139, "flos": 22272588288000.0, "grad_norm": 43.02001618844886, "language_loss": 0.69695556, "learning_rate": 8.197397416794332e-07, "loss": 0.71258754, "num_input_tokens_seen": 254674730, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.29003906, "step": 11807, "time_per_iteration": 2.648043155670166 }, { "auxiliary_loss_clip": 0.01317784, "auxiliary_loss_mlp": 0.00253206, "balance_loss_clip": 1.08190334, "balance_loss_mlp": 0.22379705, "epoch": 0.7099353675033819, "flos": 19275447507840.0, "grad_norm": 4.9955948137257495, "language_loss": 0.79677719, "learning_rate": 8.194253484740882e-07, "loss": 0.81248707, "num_input_tokens_seen": 254691665, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.29406738, "step": 11808, "time_per_iteration": 2.6850717067718506 }, { "auxiliary_loss_clip": 0.01302351, "auxiliary_loss_mlp": 0.00245787, "balance_loss_clip": 1.07675552, "balance_loss_mlp": 0.21972759, "epoch": 0.70999549075605, "flos": 21908525990400.0, "grad_norm": 23.450290510449364, "language_loss": 0.79483259, "learning_rate": 8.191110000362513e-07, "loss": 0.810314, "num_input_tokens_seen": 254711610, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26049805, "step": 11809, "time_per_iteration": 2.6580810546875 }, { "auxiliary_loss_clip": 0.01138217, "auxiliary_loss_mlp": 0.00110738, "balance_loss_clip": 0.98690641, "balance_loss_mlp": 0.10339484, "epoch": 0.7100556140087179, "flos": 70456053456000.0, "grad_norm": 0.8189236744616847, "language_loss": 0.58730751, "learning_rate": 8.187966963778435e-07, "loss": 0.59979707, "num_input_tokens_seen": 254772615, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.07324219, "step": 11810, "time_per_iteration": 3.241323232650757 }, { "auxiliary_loss_clip": 0.01298781, "auxiliary_loss_mlp": 0.00236374, "balance_loss_clip": 1.07861161, "balance_loss_mlp": 0.21005303, "epoch": 0.7101157372613859, "flos": 23039568420480.0, "grad_norm": 5.593104852310323, "language_loss": 0.81009841, "learning_rate": 8.18482437510784e-07, "loss": 0.82544994, "num_input_tokens_seen": 254791375, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.26318359, "step": 11811, "time_per_iteration": 2.7064311504364014 }, { "auxiliary_loss_clip": 0.01289432, "auxiliary_loss_mlp": 0.00235322, "balance_loss_clip": 1.07041764, "balance_loss_mlp": 0.20848814, "epoch": 0.7101758605140538, "flos": 23185329811200.0, "grad_norm": 478.68339422123296, "language_loss": 0.89451277, "learning_rate": 8.181682234469882e-07, "loss": 0.90976036, "num_input_tokens_seen": 254809300, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.26843262, "step": 11812, "time_per_iteration": 2.683307647705078 }, { "auxiliary_loss_clip": 0.01308027, "auxiliary_loss_mlp": 0.00234162, "balance_loss_clip": 1.07913661, "balance_loss_mlp": 0.20754215, "epoch": 0.7102359837667218, "flos": 23696123166720.0, "grad_norm": 9.538017043412362, "language_loss": 0.78298199, "learning_rate": 8.178540541983716e-07, "loss": 0.79840392, "num_input_tokens_seen": 254829325, "router_z_loss_clip": 2.29199219, "router_z_loss_mlp": 0.26623535, "step": 11813, "time_per_iteration": 2.692432165145874 }, { "auxiliary_loss_clip": 0.01280867, "auxiliary_loss_mlp": 0.00231497, "balance_loss_clip": 1.06379867, "balance_loss_mlp": 0.20510413, "epoch": 0.7102961070193897, "flos": 19391116279680.0, "grad_norm": 3.1039371274621796, "language_loss": 0.89275843, "learning_rate": 8.175399297768495e-07, "loss": 0.90788209, "num_input_tokens_seen": 254847690, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.26379395, "step": 11814, "time_per_iteration": 2.6304638385772705 }, { "auxiliary_loss_clip": 0.01309877, "auxiliary_loss_mlp": 0.00237565, "balance_loss_clip": 1.0812943, "balance_loss_mlp": 0.21098107, "epoch": 0.7103562302720577, "flos": 21507511576320.0, "grad_norm": 5.1568719017404225, "language_loss": 0.84473801, "learning_rate": 8.172258501943301e-07, "loss": 0.86021245, "num_input_tokens_seen": 254865960, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.26574707, "step": 11815, "time_per_iteration": 2.6646881103515625 }, { "auxiliary_loss_clip": 0.01273851, "auxiliary_loss_mlp": 0.00239697, "balance_loss_clip": 1.06197822, "balance_loss_mlp": 0.21513981, "epoch": 0.7104163535247257, "flos": 14535059869440.0, "grad_norm": 22.050838542330087, "language_loss": 0.86587441, "learning_rate": 8.16911815462725e-07, "loss": 0.88100988, "num_input_tokens_seen": 254882815, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.2454834, "step": 11816, "time_per_iteration": 2.6475400924682617 }, { "auxiliary_loss_clip": 0.01264185, "auxiliary_loss_mlp": 0.00236192, "balance_loss_clip": 1.05082464, "balance_loss_mlp": 0.21106282, "epoch": 0.7104764767773937, "flos": 11400310085760.0, "grad_norm": 7.529657391722591, "language_loss": 0.93358308, "learning_rate": 8.165978255939426e-07, "loss": 0.94858682, "num_input_tokens_seen": 254898705, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.25146484, "step": 11817, "time_per_iteration": 2.623936653137207 }, { "auxiliary_loss_clip": 0.01279408, "auxiliary_loss_mlp": 0.00231503, "balance_loss_clip": 1.05989003, "balance_loss_mlp": 0.2066716, "epoch": 0.7105366000300616, "flos": 11690432236800.0, "grad_norm": 7.0885683922989395, "language_loss": 0.94240069, "learning_rate": 8.162838805998897e-07, "loss": 0.95750976, "num_input_tokens_seen": 254913665, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24853516, "step": 11818, "time_per_iteration": 2.7172231674194336 }, { "auxiliary_loss_clip": 0.01271691, "auxiliary_loss_mlp": 0.00235114, "balance_loss_clip": 1.05837846, "balance_loss_mlp": 0.21092631, "epoch": 0.7105967232827296, "flos": 19354020508800.0, "grad_norm": 14.665763364046901, "language_loss": 0.82658303, "learning_rate": 8.159699804924709e-07, "loss": 0.84165108, "num_input_tokens_seen": 254932140, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24206543, "step": 11819, "time_per_iteration": 2.6298696994781494 }, { "auxiliary_loss_clip": 0.01307785, "auxiliary_loss_mlp": 0.00249537, "balance_loss_clip": 1.08462465, "balance_loss_mlp": 0.22425261, "epoch": 0.7106568465353975, "flos": 22930400010240.0, "grad_norm": 45.92688921509662, "language_loss": 0.78506815, "learning_rate": 8.156561252835883e-07, "loss": 0.80064142, "num_input_tokens_seen": 254951580, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.25305176, "step": 11820, "time_per_iteration": 2.6420037746429443 }, { "auxiliary_loss_clip": 0.01298431, "auxiliary_loss_mlp": 0.00273271, "balance_loss_clip": 1.07215142, "balance_loss_mlp": 0.24548347, "epoch": 0.7107169697880655, "flos": 19099665325440.0, "grad_norm": 5.737542674377961, "language_loss": 0.85577929, "learning_rate": 8.153423149851449e-07, "loss": 0.87149632, "num_input_tokens_seen": 254969425, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.27807617, "step": 11821, "time_per_iteration": 2.6810569763183594 }, { "auxiliary_loss_clip": 0.01137292, "auxiliary_loss_mlp": 0.00161742, "balance_loss_clip": 0.98758823, "balance_loss_mlp": 0.15363584, "epoch": 0.7107770930407336, "flos": 63638054231040.0, "grad_norm": 0.7423283992107796, "language_loss": 0.54645759, "learning_rate": 8.150285496090388e-07, "loss": 0.55944794, "num_input_tokens_seen": 255032680, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.08105469, "step": 11822, "time_per_iteration": 3.195051670074463 }, { "auxiliary_loss_clip": 0.01265238, "auxiliary_loss_mlp": 0.00224653, "balance_loss_clip": 1.0574038, "balance_loss_mlp": 0.20075171, "epoch": 0.7108372162934015, "flos": 22054466949120.0, "grad_norm": 14.307011450028615, "language_loss": 0.69920397, "learning_rate": 8.147148291671688e-07, "loss": 0.71410286, "num_input_tokens_seen": 255054400, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.23876953, "step": 11823, "time_per_iteration": 2.7069122791290283 }, { "auxiliary_loss_clip": 0.01280458, "auxiliary_loss_mlp": 0.00255443, "balance_loss_clip": 1.06215882, "balance_loss_mlp": 0.22963434, "epoch": 0.7108973395460695, "flos": 19135144984320.0, "grad_norm": 64.92627333505916, "language_loss": 0.80971861, "learning_rate": 8.144011536714322e-07, "loss": 0.82507759, "num_input_tokens_seen": 255072785, "router_z_loss_clip": 2.18261719, "router_z_loss_mlp": 0.25830078, "step": 11824, "time_per_iteration": 2.744117498397827 }, { "auxiliary_loss_clip": 0.01270469, "auxiliary_loss_mlp": 0.00241619, "balance_loss_clip": 1.05627465, "balance_loss_mlp": 0.21831334, "epoch": 0.7109574627987374, "flos": 17894431353600.0, "grad_norm": 4.78040171196326, "language_loss": 0.78983933, "learning_rate": 8.140875231337223e-07, "loss": 0.80496019, "num_input_tokens_seen": 255091820, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.23278809, "step": 11825, "time_per_iteration": 2.6786391735076904 }, { "auxiliary_loss_clip": 0.0129474, "auxiliary_loss_mlp": 0.00223079, "balance_loss_clip": 1.07213485, "balance_loss_mlp": 0.19743708, "epoch": 0.7110175860514054, "flos": 28979623422720.0, "grad_norm": 4.831099879472472, "language_loss": 0.8573302, "learning_rate": 8.137739375659321e-07, "loss": 0.87250841, "num_input_tokens_seen": 255111720, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25646973, "step": 11826, "time_per_iteration": 2.7674899101257324 }, { "auxiliary_loss_clip": 0.01286027, "auxiliary_loss_mlp": 0.0026403, "balance_loss_clip": 1.06656241, "balance_loss_mlp": 0.23754169, "epoch": 0.7110777093040733, "flos": 26173312623360.0, "grad_norm": 19.56752080799484, "language_loss": 0.88714719, "learning_rate": 8.134603969799527e-07, "loss": 0.90264773, "num_input_tokens_seen": 255133495, "router_z_loss_clip": 2.19238281, "router_z_loss_mlp": 0.26501465, "step": 11827, "time_per_iteration": 2.7827465534210205 }, { "auxiliary_loss_clip": 0.01281431, "auxiliary_loss_mlp": 0.00245756, "balance_loss_clip": 1.06237221, "balance_loss_mlp": 0.21998283, "epoch": 0.7111378325567413, "flos": 26869943969280.0, "grad_norm": 3.525313372649869, "language_loss": 0.69932628, "learning_rate": 8.131469013876748e-07, "loss": 0.71459806, "num_input_tokens_seen": 255156880, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.2578125, "step": 11828, "time_per_iteration": 2.720226287841797 }, { "auxiliary_loss_clip": 0.01280886, "auxiliary_loss_mlp": 0.00239231, "balance_loss_clip": 1.06122541, "balance_loss_mlp": 0.21403056, "epoch": 0.7111979558094093, "flos": 27271820309760.0, "grad_norm": 2.7471545017884225, "language_loss": 0.79053861, "learning_rate": 8.128334508009846e-07, "loss": 0.80573976, "num_input_tokens_seen": 255178920, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25195312, "step": 11829, "time_per_iteration": 2.693220615386963 }, { "auxiliary_loss_clip": 0.01275794, "auxiliary_loss_mlp": 0.00246234, "balance_loss_clip": 1.05893147, "balance_loss_mlp": 0.2209495, "epoch": 0.7112580790620773, "flos": 25046938961280.0, "grad_norm": 10.010772882863414, "language_loss": 0.88176221, "learning_rate": 8.125200452317697e-07, "loss": 0.89698255, "num_input_tokens_seen": 255198095, "router_z_loss_clip": 2.17089844, "router_z_loss_mlp": 0.25268555, "step": 11830, "time_per_iteration": 2.708855628967285 }, { "auxiliary_loss_clip": 0.01279903, "auxiliary_loss_mlp": 0.00232115, "balance_loss_clip": 1.06499267, "balance_loss_mlp": 0.2069979, "epoch": 0.7113182023147452, "flos": 21646628951040.0, "grad_norm": 24.018989400200926, "language_loss": 0.90403134, "learning_rate": 8.122066846919138e-07, "loss": 0.91915154, "num_input_tokens_seen": 255215860, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.2512207, "step": 11831, "time_per_iteration": 2.6565804481506348 }, { "auxiliary_loss_clip": 0.01257268, "auxiliary_loss_mlp": 0.0020592, "balance_loss_clip": 1.04293799, "balance_loss_mlp": 0.18277003, "epoch": 0.7113783255674132, "flos": 20996287257600.0, "grad_norm": 34.87712364751467, "language_loss": 0.84586287, "learning_rate": 8.118933691932985e-07, "loss": 0.86049473, "num_input_tokens_seen": 255235425, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.23156738, "step": 11832, "time_per_iteration": 2.6915061473846436 }, { "auxiliary_loss_clip": 0.01153801, "auxiliary_loss_mlp": 0.00116108, "balance_loss_clip": 1.00534821, "balance_loss_mlp": 0.10890808, "epoch": 0.7114384488200811, "flos": 66771080161920.0, "grad_norm": 0.7446925512026675, "language_loss": 0.56104136, "learning_rate": 8.115800987478059e-07, "loss": 0.57374048, "num_input_tokens_seen": 255291680, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.07177734, "step": 11833, "time_per_iteration": 3.0766937732696533 }, { "auxiliary_loss_clip": 0.01261284, "auxiliary_loss_mlp": 0.00242031, "balance_loss_clip": 1.04798818, "balance_loss_mlp": 0.21725938, "epoch": 0.7114985720727491, "flos": 25010058672000.0, "grad_norm": 164.59082177685957, "language_loss": 0.78650331, "learning_rate": 8.11266873367315e-07, "loss": 0.80153644, "num_input_tokens_seen": 255313880, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24768066, "step": 11834, "time_per_iteration": 2.7964184284210205 }, { "auxiliary_loss_clip": 0.01293159, "auxiliary_loss_mlp": 0.00242862, "balance_loss_clip": 1.06641793, "balance_loss_mlp": 0.21741037, "epoch": 0.7115586953254172, "flos": 21470128496640.0, "grad_norm": 127.5517436536806, "language_loss": 0.87514263, "learning_rate": 8.10953693063704e-07, "loss": 0.89050281, "num_input_tokens_seen": 255332390, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.25439453, "step": 11835, "time_per_iteration": 2.689601421356201 }, { "auxiliary_loss_clip": 0.01259749, "auxiliary_loss_mlp": 0.00235548, "balance_loss_clip": 1.04818177, "balance_loss_mlp": 0.21060941, "epoch": 0.7116188185780851, "flos": 28622600190720.0, "grad_norm": 29.92542143175671, "language_loss": 0.83210486, "learning_rate": 8.10640557848848e-07, "loss": 0.84705776, "num_input_tokens_seen": 255354025, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.24963379, "step": 11836, "time_per_iteration": 4.166890859603882 }, { "auxiliary_loss_clip": 0.01245875, "auxiliary_loss_mlp": 0.00207909, "balance_loss_clip": 1.03423977, "balance_loss_mlp": 0.18475816, "epoch": 0.7116789418307531, "flos": 25293608634240.0, "grad_norm": 1547.0627220984154, "language_loss": 0.7631861, "learning_rate": 8.103274677346208e-07, "loss": 0.77772391, "num_input_tokens_seen": 255371400, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.23168945, "step": 11837, "time_per_iteration": 2.713808059692383 }, { "auxiliary_loss_clip": 0.01312325, "auxiliary_loss_mlp": 0.00277977, "balance_loss_clip": 1.0821085, "balance_loss_mlp": 0.25020093, "epoch": 0.711739065083421, "flos": 25557301353600.0, "grad_norm": 73.97627968694856, "language_loss": 0.70998228, "learning_rate": 8.100144227328958e-07, "loss": 0.72588527, "num_input_tokens_seen": 255390710, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.27758789, "step": 11838, "time_per_iteration": 4.191413879394531 }, { "auxiliary_loss_clip": 0.01272934, "auxiliary_loss_mlp": 0.00273094, "balance_loss_clip": 1.0595963, "balance_loss_mlp": 0.24794063, "epoch": 0.711799188336089, "flos": 26140993361280.0, "grad_norm": 125.02997867215538, "language_loss": 0.75171351, "learning_rate": 8.097014228555426e-07, "loss": 0.76717383, "num_input_tokens_seen": 255408790, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.25158691, "step": 11839, "time_per_iteration": 2.6903140544891357 }, { "auxiliary_loss_clip": 0.01279361, "auxiliary_loss_mlp": 0.00249951, "balance_loss_clip": 1.06308126, "balance_loss_mlp": 0.22541822, "epoch": 0.7118593115887569, "flos": 21140648017920.0, "grad_norm": 19.006471311540068, "language_loss": 0.92159569, "learning_rate": 8.093884681144305e-07, "loss": 0.93688881, "num_input_tokens_seen": 255426280, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24536133, "step": 11840, "time_per_iteration": 4.078336477279663 }, { "auxiliary_loss_clip": 0.01290042, "auxiliary_loss_mlp": 0.00250374, "balance_loss_clip": 1.06446135, "balance_loss_mlp": 0.22336107, "epoch": 0.711919434841425, "flos": 14975684006400.0, "grad_norm": 7.295192867157736, "language_loss": 0.86120081, "learning_rate": 8.090755585214277e-07, "loss": 0.87660497, "num_input_tokens_seen": 255442935, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.27038574, "step": 11841, "time_per_iteration": 2.6544392108917236 }, { "auxiliary_loss_clip": 0.01271654, "auxiliary_loss_mlp": 0.00244951, "balance_loss_clip": 1.05924404, "balance_loss_mlp": 0.22000009, "epoch": 0.7119795580940929, "flos": 16508997826560.0, "grad_norm": 5.659541085534239, "language_loss": 0.82240331, "learning_rate": 8.087626940883994e-07, "loss": 0.8375693, "num_input_tokens_seen": 255460925, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.24938965, "step": 11842, "time_per_iteration": 2.693629503250122 }, { "auxiliary_loss_clip": 0.01148192, "auxiliary_loss_mlp": 0.00167125, "balance_loss_clip": 0.99981904, "balance_loss_mlp": 0.15739775, "epoch": 0.7120396813467609, "flos": 66570736055040.0, "grad_norm": 0.7759872523398607, "language_loss": 0.61048806, "learning_rate": 8.084498748272082e-07, "loss": 0.62364125, "num_input_tokens_seen": 255521360, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.09716797, "step": 11843, "time_per_iteration": 3.1240131855010986 }, { "auxiliary_loss_clip": 0.0126324, "auxiliary_loss_mlp": 0.00237256, "balance_loss_clip": 1.05098021, "balance_loss_mlp": 0.21219784, "epoch": 0.7120998045994288, "flos": 26432731624320.0, "grad_norm": 222.66528375189867, "language_loss": 0.8803097, "learning_rate": 8.081371007497171e-07, "loss": 0.89531469, "num_input_tokens_seen": 255541435, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.25061035, "step": 11844, "time_per_iteration": 2.686566114425659 }, { "auxiliary_loss_clip": 0.01277385, "auxiliary_loss_mlp": 0.00245656, "balance_loss_clip": 1.05601549, "balance_loss_mlp": 0.21876253, "epoch": 0.7121599278520968, "flos": 16427982700800.0, "grad_norm": 15.92153091131029, "language_loss": 0.86214697, "learning_rate": 8.078243718677873e-07, "loss": 0.87737745, "num_input_tokens_seen": 255558505, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.26928711, "step": 11845, "time_per_iteration": 4.0865888595581055 }, { "auxiliary_loss_clip": 0.01259933, "auxiliary_loss_mlp": 0.00237445, "balance_loss_clip": 1.04729533, "balance_loss_mlp": 0.21158886, "epoch": 0.7122200511047647, "flos": 28949889939840.0, "grad_norm": 30.46951875105981, "language_loss": 0.85683751, "learning_rate": 8.075116881932762e-07, "loss": 0.87181127, "num_input_tokens_seen": 255577815, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.25854492, "step": 11846, "time_per_iteration": 2.7531375885009766 }, { "auxiliary_loss_clip": 0.01258073, "auxiliary_loss_mlp": 0.00256262, "balance_loss_clip": 1.04605448, "balance_loss_mlp": 0.23114485, "epoch": 0.7122801743574327, "flos": 16471866142080.0, "grad_norm": 28.04507569741064, "language_loss": 0.67009115, "learning_rate": 8.071990497380421e-07, "loss": 0.68523455, "num_input_tokens_seen": 255595885, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.25109863, "step": 11847, "time_per_iteration": 2.677217483520508 }, { "auxiliary_loss_clip": 0.01264927, "auxiliary_loss_mlp": 0.00241516, "balance_loss_clip": 1.05498052, "balance_loss_mlp": 0.21648213, "epoch": 0.7123402976101008, "flos": 20631039811200.0, "grad_norm": 3.801206611753666, "language_loss": 0.77665049, "learning_rate": 8.068864565139395e-07, "loss": 0.79171497, "num_input_tokens_seen": 255616750, "router_z_loss_clip": 2.10253906, "router_z_loss_mlp": 0.25024414, "step": 11848, "time_per_iteration": 2.696023464202881 }, { "auxiliary_loss_clip": 0.01141828, "auxiliary_loss_mlp": 0.00120446, "balance_loss_clip": 0.99276054, "balance_loss_mlp": 0.11071836, "epoch": 0.7124004208627687, "flos": 62325734837760.0, "grad_norm": 0.8172684682466103, "language_loss": 0.61412293, "learning_rate": 8.065739085328211e-07, "loss": 0.62674564, "num_input_tokens_seen": 255677900, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.09716797, "step": 11849, "time_per_iteration": 3.16585373878479 }, { "auxiliary_loss_clip": 0.01265861, "auxiliary_loss_mlp": 0.00232461, "balance_loss_clip": 1.04902875, "balance_loss_mlp": 0.2049236, "epoch": 0.7124605441154367, "flos": 39675975788160.0, "grad_norm": 2.1272482297112414, "language_loss": 0.70605141, "learning_rate": 8.0626140580654e-07, "loss": 0.72103465, "num_input_tokens_seen": 255699140, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.2755127, "step": 11850, "time_per_iteration": 2.88264536857605 }, { "auxiliary_loss_clip": 0.01277557, "auxiliary_loss_mlp": 0.00234697, "balance_loss_clip": 1.05865097, "balance_loss_mlp": 0.2088283, "epoch": 0.7125206673681046, "flos": 28181868312960.0, "grad_norm": 105.07111302683754, "language_loss": 0.77226412, "learning_rate": 8.05948948346946e-07, "loss": 0.78738666, "num_input_tokens_seen": 255719640, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25854492, "step": 11851, "time_per_iteration": 2.7177817821502686 }, { "auxiliary_loss_clip": 0.0124309, "auxiliary_loss_mlp": 0.00219849, "balance_loss_clip": 1.03737426, "balance_loss_mlp": 0.19653204, "epoch": 0.7125807906207726, "flos": 26176939896960.0, "grad_norm": 25.456657458017784, "language_loss": 0.88378161, "learning_rate": 8.056365361658882e-07, "loss": 0.89841098, "num_input_tokens_seen": 255740450, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.23291016, "step": 11852, "time_per_iteration": 2.6728756427764893 }, { "auxiliary_loss_clip": 0.01291491, "auxiliary_loss_mlp": 0.00246187, "balance_loss_clip": 1.06233895, "balance_loss_mlp": 0.21652734, "epoch": 0.7126409138734405, "flos": 17157328358400.0, "grad_norm": 90.70486721673967, "language_loss": 0.84299767, "learning_rate": 8.053241692752126e-07, "loss": 0.85837442, "num_input_tokens_seen": 255758070, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.29650879, "step": 11853, "time_per_iteration": 2.7249643802642822 }, { "auxiliary_loss_clip": 0.0124291, "auxiliary_loss_mlp": 0.00214689, "balance_loss_clip": 1.03495979, "balance_loss_mlp": 0.19152692, "epoch": 0.7127010371261085, "flos": 18769933451520.0, "grad_norm": 131.22078910061552, "language_loss": 1.00342584, "learning_rate": 8.050118476867635e-07, "loss": 1.01800179, "num_input_tokens_seen": 255775685, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.23181152, "step": 11854, "time_per_iteration": 2.618648052215576 }, { "auxiliary_loss_clip": 0.01239656, "auxiliary_loss_mlp": 0.00240753, "balance_loss_clip": 1.0360918, "balance_loss_mlp": 0.21730411, "epoch": 0.7127611603787765, "flos": 20376433232640.0, "grad_norm": 2381.655795776995, "language_loss": 0.84391654, "learning_rate": 8.046995714123856e-07, "loss": 0.85872066, "num_input_tokens_seen": 255794750, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.23449707, "step": 11855, "time_per_iteration": 2.6915297508239746 }, { "auxiliary_loss_clip": 0.01280084, "auxiliary_loss_mlp": 0.00243886, "balance_loss_clip": 1.06152654, "balance_loss_mlp": 0.21799409, "epoch": 0.7128212836314445, "flos": 20449008662400.0, "grad_norm": 13.927002641833509, "language_loss": 0.82375711, "learning_rate": 8.043873404639192e-07, "loss": 0.83899677, "num_input_tokens_seen": 255813325, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25927734, "step": 11856, "time_per_iteration": 2.659921407699585 }, { "auxiliary_loss_clip": 0.01277876, "auxiliary_loss_mlp": 0.00233068, "balance_loss_clip": 1.05628347, "balance_loss_mlp": 0.20816565, "epoch": 0.7128814068841124, "flos": 23440834229760.0, "grad_norm": 9.777537997000486, "language_loss": 0.78051168, "learning_rate": 8.040751548532046e-07, "loss": 0.79562104, "num_input_tokens_seen": 255832470, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.24926758, "step": 11857, "time_per_iteration": 2.7028539180755615 }, { "auxiliary_loss_clip": 0.01245092, "auxiliary_loss_mlp": 0.00230586, "balance_loss_clip": 1.0368495, "balance_loss_mlp": 0.20602918, "epoch": 0.7129415301367804, "flos": 18222942165120.0, "grad_norm": 759.3069654840982, "language_loss": 0.92750919, "learning_rate": 8.03763014592081e-07, "loss": 0.94226593, "num_input_tokens_seen": 255849740, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.24536133, "step": 11858, "time_per_iteration": 2.645064353942871 }, { "auxiliary_loss_clip": 0.01264737, "auxiliary_loss_mlp": 0.00208552, "balance_loss_clip": 1.04824293, "balance_loss_mlp": 0.18355414, "epoch": 0.7130016533894483, "flos": 15523896355200.0, "grad_norm": 5.604398731106021, "language_loss": 0.87606448, "learning_rate": 8.034509196923829e-07, "loss": 0.89079738, "num_input_tokens_seen": 255866975, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.25, "step": 11859, "time_per_iteration": 2.6407277584075928 }, { "auxiliary_loss_clip": 0.01254789, "auxiliary_loss_mlp": 0.00229623, "balance_loss_clip": 1.0444746, "balance_loss_mlp": 0.20535222, "epoch": 0.7130617766421163, "flos": 57115668960000.0, "grad_norm": 3.1704172784753943, "language_loss": 0.74061662, "learning_rate": 8.031388701659456e-07, "loss": 0.75546074, "num_input_tokens_seen": 255892915, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.24291992, "step": 11860, "time_per_iteration": 2.98284912109375 }, { "auxiliary_loss_clip": 0.01277054, "auxiliary_loss_mlp": 0.00252683, "balance_loss_clip": 1.05309129, "balance_loss_mlp": 0.22564626, "epoch": 0.7131218998947844, "flos": 19788252024960.0, "grad_norm": 81.59333778989303, "language_loss": 0.71766853, "learning_rate": 8.028268660246023e-07, "loss": 0.73296589, "num_input_tokens_seen": 255911480, "router_z_loss_clip": 2.23925781, "router_z_loss_mlp": 0.27026367, "step": 11861, "time_per_iteration": 2.662611722946167 }, { "auxiliary_loss_clip": 0.01276246, "auxiliary_loss_mlp": 0.002345, "balance_loss_clip": 1.06038141, "balance_loss_mlp": 0.20935857, "epoch": 0.7131820231474523, "flos": 26651894457600.0, "grad_norm": 26.459816216299515, "language_loss": 0.75748026, "learning_rate": 8.025149072801849e-07, "loss": 0.77258778, "num_input_tokens_seen": 255931140, "router_z_loss_clip": 2.16113281, "router_z_loss_mlp": 0.25146484, "step": 11862, "time_per_iteration": 2.734743118286133 }, { "auxiliary_loss_clip": 0.0126215, "auxiliary_loss_mlp": 0.0021549, "balance_loss_clip": 1.04649734, "balance_loss_mlp": 0.19013372, "epoch": 0.7132421464001203, "flos": 29205609840000.0, "grad_norm": 24.67354832644342, "language_loss": 0.77065599, "learning_rate": 8.022029939445214e-07, "loss": 0.7854324, "num_input_tokens_seen": 255951665, "router_z_loss_clip": 2.15332031, "router_z_loss_mlp": 0.25366211, "step": 11863, "time_per_iteration": 2.7892401218414307 }, { "auxiliary_loss_clip": 0.01283125, "auxiliary_loss_mlp": 0.00215533, "balance_loss_clip": 1.06308913, "balance_loss_mlp": 0.19070138, "epoch": 0.7133022696527882, "flos": 23073611535360.0, "grad_norm": 6.014648925128437, "language_loss": 0.74023563, "learning_rate": 8.018911260294414e-07, "loss": 0.7552222, "num_input_tokens_seen": 255970055, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.24841309, "step": 11864, "time_per_iteration": 2.777473211288452 }, { "auxiliary_loss_clip": 0.01270052, "auxiliary_loss_mlp": 0.00246243, "balance_loss_clip": 1.05304718, "balance_loss_mlp": 0.22104195, "epoch": 0.7133623929054562, "flos": 17457111267840.0, "grad_norm": 2.7344733741812637, "language_loss": 0.9412626, "learning_rate": 8.015793035467697e-07, "loss": 0.95642555, "num_input_tokens_seen": 255987720, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25219727, "step": 11865, "time_per_iteration": 2.6319243907928467 }, { "auxiliary_loss_clip": 0.0126455, "auxiliary_loss_mlp": 0.00235705, "balance_loss_clip": 1.04429901, "balance_loss_mlp": 0.20951448, "epoch": 0.7134225161581241, "flos": 19536554448000.0, "grad_norm": 7.1582059509409, "language_loss": 0.83978724, "learning_rate": 8.012675265083304e-07, "loss": 0.85478985, "num_input_tokens_seen": 256005490, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.26196289, "step": 11866, "time_per_iteration": 2.6633992195129395 }, { "auxiliary_loss_clip": 0.01285911, "auxiliary_loss_mlp": 0.0020777, "balance_loss_clip": 1.06485641, "balance_loss_mlp": 0.18370193, "epoch": 0.7134826394107922, "flos": 26250089944320.0, "grad_norm": 16.826330433995448, "language_loss": 0.81603765, "learning_rate": 8.009557949259464e-07, "loss": 0.83097452, "num_input_tokens_seen": 256026030, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24072266, "step": 11867, "time_per_iteration": 2.676706314086914 }, { "auxiliary_loss_clip": 0.01246773, "auxiliary_loss_mlp": 0.00228992, "balance_loss_clip": 1.03675795, "balance_loss_mlp": 0.20344529, "epoch": 0.7135427626634601, "flos": 15815311395840.0, "grad_norm": 14.918312061975374, "language_loss": 0.78725874, "learning_rate": 8.006441088114397e-07, "loss": 0.80201638, "num_input_tokens_seen": 256043680, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.25561523, "step": 11868, "time_per_iteration": 2.69899845123291 }, { "auxiliary_loss_clip": 0.01275725, "auxiliary_loss_mlp": 0.00231874, "balance_loss_clip": 1.05612624, "balance_loss_mlp": 0.20381194, "epoch": 0.7136028859161281, "flos": 18223409041920.0, "grad_norm": 2.830523440944727, "language_loss": 0.74946964, "learning_rate": 8.003324681766286e-07, "loss": 0.76454568, "num_input_tokens_seen": 256059705, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.28063965, "step": 11869, "time_per_iteration": 2.631502151489258 }, { "auxiliary_loss_clip": 0.01268007, "auxiliary_loss_mlp": 0.00242037, "balance_loss_clip": 1.04881299, "balance_loss_mlp": 0.21595421, "epoch": 0.713663009168796, "flos": 24314827956480.0, "grad_norm": 29.211077191918694, "language_loss": 0.83624315, "learning_rate": 8.000208730333298e-07, "loss": 0.85134363, "num_input_tokens_seen": 256079785, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26098633, "step": 11870, "time_per_iteration": 2.7220349311828613 }, { "auxiliary_loss_clip": 0.0125619, "auxiliary_loss_mlp": 0.0021277, "balance_loss_clip": 1.04481411, "balance_loss_mlp": 0.18748602, "epoch": 0.713723132421464, "flos": 26538488242560.0, "grad_norm": 5.558874879147666, "language_loss": 0.87984353, "learning_rate": 7.997093233933597e-07, "loss": 0.89453316, "num_input_tokens_seen": 256099000, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.25292969, "step": 11871, "time_per_iteration": 2.6989307403564453 }, { "auxiliary_loss_clip": 0.01275654, "auxiliary_loss_mlp": 0.00257414, "balance_loss_clip": 1.05822229, "balance_loss_mlp": 0.22878017, "epoch": 0.7137832556741319, "flos": 19865675790720.0, "grad_norm": 2.1061822791508513, "language_loss": 0.86417615, "learning_rate": 7.993978192685331e-07, "loss": 0.87950689, "num_input_tokens_seen": 256117985, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.28625488, "step": 11872, "time_per_iteration": 2.751570701599121 }, { "auxiliary_loss_clip": 0.01277149, "auxiliary_loss_mlp": 0.00234379, "balance_loss_clip": 1.05957198, "balance_loss_mlp": 0.20700818, "epoch": 0.7138433789267999, "flos": 21688932193920.0, "grad_norm": 30.43630265853651, "language_loss": 0.94419801, "learning_rate": 7.990863606706606e-07, "loss": 0.95931333, "num_input_tokens_seen": 256134350, "router_z_loss_clip": 2.17871094, "router_z_loss_mlp": 0.27392578, "step": 11873, "time_per_iteration": 2.727137804031372 }, { "auxiliary_loss_clip": 0.01247021, "auxiliary_loss_mlp": 0.00203735, "balance_loss_clip": 1.03883553, "balance_loss_mlp": 0.17996505, "epoch": 0.713903502179468, "flos": 17602729004160.0, "grad_norm": 14.943641467380465, "language_loss": 0.94634032, "learning_rate": 7.987749476115539e-07, "loss": 0.96084791, "num_input_tokens_seen": 256150610, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.23742676, "step": 11874, "time_per_iteration": 2.625796318054199 }, { "auxiliary_loss_clip": 0.01274303, "auxiliary_loss_mlp": 0.0020825, "balance_loss_clip": 1.05546188, "balance_loss_mlp": 0.18186875, "epoch": 0.7139636254321359, "flos": 18040336398720.0, "grad_norm": 8.930798948928826, "language_loss": 0.92674279, "learning_rate": 7.984635801030228e-07, "loss": 0.94156832, "num_input_tokens_seen": 256168620, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.26367188, "step": 11875, "time_per_iteration": 2.6625149250030518 }, { "auxiliary_loss_clip": 0.01271912, "auxiliary_loss_mlp": 0.00248202, "balance_loss_clip": 1.05048513, "balance_loss_mlp": 0.2198181, "epoch": 0.7140237486848039, "flos": 23331127115520.0, "grad_norm": 10.808462844116736, "language_loss": 0.79541701, "learning_rate": 7.981522581568721e-07, "loss": 0.81061816, "num_input_tokens_seen": 256186700, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.28393555, "step": 11876, "time_per_iteration": 2.6693646907806396 }, { "auxiliary_loss_clip": 0.01257502, "auxiliary_loss_mlp": 0.0023419, "balance_loss_clip": 1.04416847, "balance_loss_mlp": 0.20772564, "epoch": 0.7140838719374718, "flos": 16837077674880.0, "grad_norm": 19.283066876119175, "language_loss": 0.86992192, "learning_rate": 7.978409817849079e-07, "loss": 0.88483882, "num_input_tokens_seen": 256205390, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.26501465, "step": 11877, "time_per_iteration": 2.6685664653778076 }, { "auxiliary_loss_clip": 0.01259093, "auxiliary_loss_mlp": 0.00234487, "balance_loss_clip": 1.04267073, "balance_loss_mlp": 0.2093934, "epoch": 0.7141439951901398, "flos": 21142012734720.0, "grad_norm": 13.292210243408844, "language_loss": 0.77064592, "learning_rate": 7.97529750998934e-07, "loss": 0.78558171, "num_input_tokens_seen": 256224575, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25097656, "step": 11878, "time_per_iteration": 4.146072864532471 }, { "auxiliary_loss_clip": 0.01260961, "auxiliary_loss_mlp": 0.00208266, "balance_loss_clip": 1.04362762, "balance_loss_mlp": 0.18388772, "epoch": 0.7142041184428077, "flos": 24717709877760.0, "grad_norm": 17.025928246028684, "language_loss": 0.76790094, "learning_rate": 7.972185658107535e-07, "loss": 0.78259313, "num_input_tokens_seen": 256242130, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24377441, "step": 11879, "time_per_iteration": 2.6682848930358887 }, { "auxiliary_loss_clip": 0.01267764, "auxiliary_loss_mlp": 0.00221591, "balance_loss_clip": 1.04848838, "balance_loss_mlp": 0.19516227, "epoch": 0.7142642416954758, "flos": 21908202768000.0, "grad_norm": 8.38540381463372, "language_loss": 0.78211689, "learning_rate": 7.969074262321646e-07, "loss": 0.79701042, "num_input_tokens_seen": 256261920, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26416016, "step": 11880, "time_per_iteration": 4.1915342807769775 }, { "auxiliary_loss_clip": 0.01277478, "auxiliary_loss_mlp": 0.00221945, "balance_loss_clip": 1.05591345, "balance_loss_mlp": 0.19620734, "epoch": 0.7143243649481437, "flos": 20805636844800.0, "grad_norm": 35.965645444652736, "language_loss": 0.90640676, "learning_rate": 7.965963322749674e-07, "loss": 0.92140102, "num_input_tokens_seen": 256277970, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25744629, "step": 11881, "time_per_iteration": 2.6142094135284424 }, { "auxiliary_loss_clip": 0.01254184, "auxiliary_loss_mlp": 0.00204132, "balance_loss_clip": 1.04209733, "balance_loss_mlp": 0.17914605, "epoch": 0.7143844882008117, "flos": 27235011847680.0, "grad_norm": 9.725998550650056, "language_loss": 0.70498842, "learning_rate": 7.962852839509579e-07, "loss": 0.71957159, "num_input_tokens_seen": 256298205, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.24987793, "step": 11882, "time_per_iteration": 4.116163492202759 }, { "auxiliary_loss_clip": 0.01264404, "auxiliary_loss_mlp": 0.00203547, "balance_loss_clip": 1.04254198, "balance_loss_mlp": 0.17666522, "epoch": 0.7144446114534796, "flos": 17929623703680.0, "grad_norm": 8.084659908894222, "language_loss": 0.78842235, "learning_rate": 7.959742812719304e-07, "loss": 0.8031019, "num_input_tokens_seen": 256316685, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.26867676, "step": 11883, "time_per_iteration": 2.657250165939331 }, { "auxiliary_loss_clip": 0.01266311, "auxiliary_loss_mlp": 0.00220972, "balance_loss_clip": 1.05439281, "balance_loss_mlp": 0.19772655, "epoch": 0.7145047347061476, "flos": 20740962407040.0, "grad_norm": 18.546685640048075, "language_loss": 0.85822642, "learning_rate": 7.956633242496788e-07, "loss": 0.87309921, "num_input_tokens_seen": 256334205, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.23242188, "step": 11884, "time_per_iteration": 2.65487003326416 }, { "auxiliary_loss_clip": 0.01284388, "auxiliary_loss_mlp": 0.00235088, "balance_loss_clip": 1.05765331, "balance_loss_mlp": 0.20615613, "epoch": 0.7145648579588155, "flos": 21178605715200.0, "grad_norm": 19.636169756243543, "language_loss": 0.84497678, "learning_rate": 7.953524128959954e-07, "loss": 0.86017156, "num_input_tokens_seen": 256353340, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.28918457, "step": 11885, "time_per_iteration": 2.72526478767395 }, { "auxiliary_loss_clip": 0.01150098, "auxiliary_loss_mlp": 0.00055123, "balance_loss_clip": 1.00522554, "balance_loss_mlp": 0.04754145, "epoch": 0.7146249812114835, "flos": 64784539509120.0, "grad_norm": 0.8680475333707608, "language_loss": 0.65902877, "learning_rate": 7.95041547222669e-07, "loss": 0.67108095, "num_input_tokens_seen": 256411550, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.07568359, "step": 11886, "time_per_iteration": 3.123544454574585 }, { "auxiliary_loss_clip": 0.01248985, "auxiliary_loss_mlp": 0.00204516, "balance_loss_clip": 1.03756428, "balance_loss_mlp": 0.17969659, "epoch": 0.7146851044641516, "flos": 18113881495680.0, "grad_norm": 213.60631635354207, "language_loss": 0.84467685, "learning_rate": 7.947307272414874e-07, "loss": 0.85921186, "num_input_tokens_seen": 256430360, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.2479248, "step": 11887, "time_per_iteration": 4.177378177642822 }, { "auxiliary_loss_clip": 0.01254731, "auxiliary_loss_mlp": 0.00223374, "balance_loss_clip": 1.0429852, "balance_loss_mlp": 0.19816151, "epoch": 0.7147452277168195, "flos": 19243846517760.0, "grad_norm": 232.52600638433896, "language_loss": 0.78150272, "learning_rate": 7.944199529642372e-07, "loss": 0.79628378, "num_input_tokens_seen": 256449750, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.25183105, "step": 11888, "time_per_iteration": 2.7120697498321533 }, { "auxiliary_loss_clip": 0.01242683, "auxiliary_loss_mlp": 0.0021817, "balance_loss_clip": 1.03116477, "balance_loss_mlp": 0.19269545, "epoch": 0.7148053509694875, "flos": 23764712186880.0, "grad_norm": 7.94569194090063, "language_loss": 0.92294937, "learning_rate": 7.941092244027041e-07, "loss": 0.93755794, "num_input_tokens_seen": 256467330, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.25500488, "step": 11889, "time_per_iteration": 2.654223918914795 }, { "auxiliary_loss_clip": 0.01260051, "auxiliary_loss_mlp": 0.00208632, "balance_loss_clip": 1.04492295, "balance_loss_mlp": 0.18211968, "epoch": 0.7148654742221554, "flos": 22485322586880.0, "grad_norm": 5.443867285171067, "language_loss": 0.8477695, "learning_rate": 7.937985415686695e-07, "loss": 0.86245638, "num_input_tokens_seen": 256485705, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.26550293, "step": 11890, "time_per_iteration": 2.700242757797241 }, { "auxiliary_loss_clip": 0.01252159, "auxiliary_loss_mlp": 0.00193136, "balance_loss_clip": 1.04245806, "balance_loss_mlp": 0.1702601, "epoch": 0.7149255974748234, "flos": 24679213476480.0, "grad_norm": 8.46228822807439, "language_loss": 0.81110811, "learning_rate": 7.934879044739147e-07, "loss": 0.82556105, "num_input_tokens_seen": 256504755, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.22875977, "step": 11891, "time_per_iteration": 2.7379469871520996 }, { "auxiliary_loss_clip": 0.01266126, "auxiliary_loss_mlp": 0.00237414, "balance_loss_clip": 1.04849863, "balance_loss_mlp": 0.21210569, "epoch": 0.7149857207274913, "flos": 18405583845120.0, "grad_norm": 6.628701029653289, "language_loss": 0.77453387, "learning_rate": 7.931773131302211e-07, "loss": 0.78956926, "num_input_tokens_seen": 256523670, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.2532959, "step": 11892, "time_per_iteration": 2.6510062217712402 }, { "auxiliary_loss_clip": 0.01273863, "auxiliary_loss_mlp": 0.00235197, "balance_loss_clip": 1.05498338, "balance_loss_mlp": 0.20723069, "epoch": 0.7150458439801594, "flos": 24969515195520.0, "grad_norm": 744.2274721602425, "language_loss": 0.80616105, "learning_rate": 7.928667675493632e-07, "loss": 0.82125163, "num_input_tokens_seen": 256542225, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.27954102, "step": 11893, "time_per_iteration": 2.7355520725250244 }, { "auxiliary_loss_clip": 0.01273695, "auxiliary_loss_mlp": 0.0022779, "balance_loss_clip": 1.05404377, "balance_loss_mlp": 0.2012538, "epoch": 0.7151059672328273, "flos": 16690777580160.0, "grad_norm": 22.689845751680505, "language_loss": 0.79345465, "learning_rate": 7.925562677431185e-07, "loss": 0.80846953, "num_input_tokens_seen": 256560730, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.26550293, "step": 11894, "time_per_iteration": 2.6429600715637207 }, { "auxiliary_loss_clip": 0.01264935, "auxiliary_loss_mlp": 0.00208926, "balance_loss_clip": 1.04521501, "balance_loss_mlp": 0.18365389, "epoch": 0.7151660904854953, "flos": 27271820309760.0, "grad_norm": 14.784022717953684, "language_loss": 0.84911275, "learning_rate": 7.922458137232613e-07, "loss": 0.86385137, "num_input_tokens_seen": 256580505, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25256348, "step": 11895, "time_per_iteration": 2.735880136489868 }, { "auxiliary_loss_clip": 0.01273099, "auxiliary_loss_mlp": 0.0023551, "balance_loss_clip": 1.05164599, "balance_loss_mlp": 0.20766284, "epoch": 0.7152262137381632, "flos": 18332254229760.0, "grad_norm": 10.662360226954874, "language_loss": 0.7740047, "learning_rate": 7.919354055015643e-07, "loss": 0.78909075, "num_input_tokens_seen": 256597330, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.27844238, "step": 11896, "time_per_iteration": 2.620870351791382 }, { "auxiliary_loss_clip": 0.01258522, "auxiliary_loss_mlp": 0.00228478, "balance_loss_clip": 1.04067671, "balance_loss_mlp": 0.20018944, "epoch": 0.7152863369908312, "flos": 21799285752960.0, "grad_norm": 49.794996365429995, "language_loss": 0.94091928, "learning_rate": 7.91625043089798e-07, "loss": 0.95578933, "num_input_tokens_seen": 256616030, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.28295898, "step": 11897, "time_per_iteration": 2.6797690391540527 }, { "auxiliary_loss_clip": 0.01268442, "auxiliary_loss_mlp": 0.00211907, "balance_loss_clip": 1.05356431, "balance_loss_mlp": 0.18681353, "epoch": 0.7153464602434991, "flos": 22158427887360.0, "grad_norm": 4.758699032591646, "language_loss": 0.86554211, "learning_rate": 7.913147264997304e-07, "loss": 0.88034558, "num_input_tokens_seen": 256635570, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.25085449, "step": 11898, "time_per_iteration": 2.6576380729675293 }, { "auxiliary_loss_clip": 0.01268806, "auxiliary_loss_mlp": 0.00202047, "balance_loss_clip": 1.04604244, "balance_loss_mlp": 0.17552334, "epoch": 0.7154065834961671, "flos": 24716057852160.0, "grad_norm": 60.35270207578106, "language_loss": 0.83088797, "learning_rate": 7.910044557431302e-07, "loss": 0.84559643, "num_input_tokens_seen": 256655290, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26513672, "step": 11899, "time_per_iteration": 2.722740888595581 }, { "auxiliary_loss_clip": 0.01255993, "auxiliary_loss_mlp": 0.00226396, "balance_loss_clip": 1.04568672, "balance_loss_mlp": 0.20231603, "epoch": 0.7154667067488351, "flos": 22601494149120.0, "grad_norm": 14.633255648291215, "language_loss": 0.84605372, "learning_rate": 7.906942308317614e-07, "loss": 0.86087757, "num_input_tokens_seen": 256671605, "router_z_loss_clip": 2.10058594, "router_z_loss_mlp": 0.24084473, "step": 11900, "time_per_iteration": 2.77054500579834 }, { "auxiliary_loss_clip": 0.01252181, "auxiliary_loss_mlp": 0.00208875, "balance_loss_clip": 1.04094946, "balance_loss_mlp": 0.18455622, "epoch": 0.7155268300015031, "flos": 18771154513920.0, "grad_norm": 45.949619770470285, "language_loss": 0.90657091, "learning_rate": 7.903840517773886e-07, "loss": 0.92118144, "num_input_tokens_seen": 256689680, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.24316406, "step": 11901, "time_per_iteration": 2.7028958797454834 }, { "auxiliary_loss_clip": 0.01278047, "auxiliary_loss_mlp": 0.00240053, "balance_loss_clip": 1.05471039, "balance_loss_mlp": 0.21195579, "epoch": 0.7155869532541711, "flos": 18296343607680.0, "grad_norm": 2.6095953929097346, "language_loss": 0.89547253, "learning_rate": 7.900739185917744e-07, "loss": 0.91065347, "num_input_tokens_seen": 256707760, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.28076172, "step": 11902, "time_per_iteration": 2.6463263034820557 }, { "auxiliary_loss_clip": 0.01263134, "auxiliary_loss_mlp": 0.00205042, "balance_loss_clip": 1.04559529, "balance_loss_mlp": 0.17863756, "epoch": 0.715647076506839, "flos": 11980805783040.0, "grad_norm": 3.3729937887601893, "language_loss": 0.78673697, "learning_rate": 7.897638312866785e-07, "loss": 0.80141866, "num_input_tokens_seen": 256724150, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.26416016, "step": 11903, "time_per_iteration": 2.636934518814087 }, { "auxiliary_loss_clip": 0.01244981, "auxiliary_loss_mlp": 0.00236749, "balance_loss_clip": 1.03751171, "balance_loss_mlp": 0.21279959, "epoch": 0.715707199759507, "flos": 18951641377920.0, "grad_norm": 12.666410514764433, "language_loss": 0.80936444, "learning_rate": 7.894537898738589e-07, "loss": 0.82418168, "num_input_tokens_seen": 256742780, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.23950195, "step": 11904, "time_per_iteration": 2.6337482929229736 }, { "auxiliary_loss_clip": 0.01264309, "auxiliary_loss_mlp": 0.00220712, "balance_loss_clip": 1.04385495, "balance_loss_mlp": 0.19484398, "epoch": 0.7157673230121749, "flos": 15304410299520.0, "grad_norm": 12.340101771157052, "language_loss": 0.83252442, "learning_rate": 7.891437943650727e-07, "loss": 0.84737468, "num_input_tokens_seen": 256761355, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.25854492, "step": 11905, "time_per_iteration": 2.6642301082611084 }, { "auxiliary_loss_clip": 0.01253056, "auxiliary_loss_mlp": 0.00239458, "balance_loss_clip": 1.0422616, "balance_loss_mlp": 0.21450716, "epoch": 0.715827446264843, "flos": 23221850964480.0, "grad_norm": 9.676597489466346, "language_loss": 0.87730426, "learning_rate": 7.88833844772076e-07, "loss": 0.89222944, "num_input_tokens_seen": 256781335, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.24975586, "step": 11906, "time_per_iteration": 2.6924445629119873 }, { "auxiliary_loss_clip": 0.01156371, "auxiliary_loss_mlp": 0.00081476, "balance_loss_clip": 1.00821066, "balance_loss_mlp": 0.07394207, "epoch": 0.7158875695175109, "flos": 60975421833600.0, "grad_norm": 0.7139877342393994, "language_loss": 0.54484075, "learning_rate": 7.885239411066205e-07, "loss": 0.55721921, "num_input_tokens_seen": 256838890, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.07519531, "step": 11907, "time_per_iteration": 3.0979459285736084 }, { "auxiliary_loss_clip": 0.01250868, "auxiliary_loss_mlp": 0.00220317, "balance_loss_clip": 1.03863072, "balance_loss_mlp": 0.19580777, "epoch": 0.7159476927701789, "flos": 17128780024320.0, "grad_norm": 8.90857974602353, "language_loss": 0.78343278, "learning_rate": 7.882140833804593e-07, "loss": 0.79814464, "num_input_tokens_seen": 256858145, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.24487305, "step": 11908, "time_per_iteration": 2.6786036491394043 }, { "auxiliary_loss_clip": 0.01265346, "auxiliary_loss_mlp": 0.00217189, "balance_loss_clip": 1.04540873, "balance_loss_mlp": 0.18826842, "epoch": 0.7160078160228468, "flos": 22490601886080.0, "grad_norm": 20.550082684250587, "language_loss": 0.79345298, "learning_rate": 7.879042716053415e-07, "loss": 0.80827832, "num_input_tokens_seen": 256878545, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.2890625, "step": 11909, "time_per_iteration": 2.6847121715545654 }, { "auxiliary_loss_clip": 0.01271979, "auxiliary_loss_mlp": 0.00227631, "balance_loss_clip": 1.04966998, "balance_loss_mlp": 0.19990245, "epoch": 0.7160679392755148, "flos": 30590935626240.0, "grad_norm": 26.92875375630976, "language_loss": 0.80603832, "learning_rate": 7.875945057930144e-07, "loss": 0.82103443, "num_input_tokens_seen": 256899920, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.27734375, "step": 11910, "time_per_iteration": 2.8244686126708984 }, { "auxiliary_loss_clip": 0.01254635, "auxiliary_loss_mlp": 0.00214491, "balance_loss_clip": 1.04168344, "balance_loss_mlp": 0.18907547, "epoch": 0.7161280625281827, "flos": 21323648833920.0, "grad_norm": 2.8697259191627458, "language_loss": 0.81939328, "learning_rate": 7.872847859552251e-07, "loss": 0.83408457, "num_input_tokens_seen": 256918460, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.25402832, "step": 11911, "time_per_iteration": 2.6515073776245117 }, { "auxiliary_loss_clip": 0.01279842, "auxiliary_loss_mlp": 0.00214594, "balance_loss_clip": 1.05246496, "balance_loss_mlp": 0.18665171, "epoch": 0.7161881857808508, "flos": 61860078921600.0, "grad_norm": 32.631607560757566, "language_loss": 0.69410354, "learning_rate": 7.869751121037192e-07, "loss": 0.70904791, "num_input_tokens_seen": 256942015, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.27941895, "step": 11912, "time_per_iteration": 3.08223819732666 }, { "auxiliary_loss_clip": 0.01262134, "auxiliary_loss_mlp": 0.00226588, "balance_loss_clip": 1.04938984, "balance_loss_mlp": 0.20218542, "epoch": 0.7162483090335187, "flos": 20812101292800.0, "grad_norm": 32.39114378128236, "language_loss": 0.86930037, "learning_rate": 7.866654842502376e-07, "loss": 0.88418758, "num_input_tokens_seen": 256961065, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.24401855, "step": 11913, "time_per_iteration": 2.7596442699432373 }, { "auxiliary_loss_clip": 0.01235702, "auxiliary_loss_mlp": 0.00227623, "balance_loss_clip": 1.03135467, "balance_loss_mlp": 0.20422173, "epoch": 0.7163084322861867, "flos": 24097532630400.0, "grad_norm": 4.125145605369417, "language_loss": 0.81531912, "learning_rate": 7.863559024065234e-07, "loss": 0.82995236, "num_input_tokens_seen": 256982165, "router_z_loss_clip": 2.04394531, "router_z_loss_mlp": 0.23425293, "step": 11914, "time_per_iteration": 2.753490924835205 }, { "auxiliary_loss_clip": 0.01257115, "auxiliary_loss_mlp": 0.00209209, "balance_loss_clip": 1.0509423, "balance_loss_mlp": 0.18489072, "epoch": 0.7163685555388547, "flos": 20080888128000.0, "grad_norm": 5.132760735389259, "language_loss": 0.8006829, "learning_rate": 7.860463665843143e-07, "loss": 0.81534618, "num_input_tokens_seen": 256999825, "router_z_loss_clip": 2.05761719, "router_z_loss_mlp": 0.24328613, "step": 11915, "time_per_iteration": 2.752250909805298 }, { "auxiliary_loss_clip": 0.01263253, "auxiliary_loss_mlp": 0.00229128, "balance_loss_clip": 1.04668188, "balance_loss_mlp": 0.20212674, "epoch": 0.7164286787915226, "flos": 17456967613440.0, "grad_norm": 98.05236979291158, "language_loss": 0.86818063, "learning_rate": 7.85736876795349e-07, "loss": 0.88310438, "num_input_tokens_seen": 257017450, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.27026367, "step": 11916, "time_per_iteration": 2.6448185443878174 }, { "auxiliary_loss_clip": 0.01277141, "auxiliary_loss_mlp": 0.00233216, "balance_loss_clip": 1.06015348, "balance_loss_mlp": 0.20634601, "epoch": 0.7164888020441906, "flos": 19718908819200.0, "grad_norm": 84.34837181805183, "language_loss": 0.76712942, "learning_rate": 7.854274330513626e-07, "loss": 0.78223294, "num_input_tokens_seen": 257035465, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.26843262, "step": 11917, "time_per_iteration": 2.6621766090393066 }, { "auxiliary_loss_clip": 0.01269178, "auxiliary_loss_mlp": 0.00227563, "balance_loss_clip": 1.05154824, "balance_loss_mlp": 0.20076516, "epoch": 0.7165489252968585, "flos": 21470523546240.0, "grad_norm": 449.56883407250615, "language_loss": 0.84539956, "learning_rate": 7.851180353640896e-07, "loss": 0.86036694, "num_input_tokens_seen": 257053750, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.26794434, "step": 11918, "time_per_iteration": 2.6377766132354736 }, { "auxiliary_loss_clip": 0.0117663, "auxiliary_loss_mlp": 0.00060356, "balance_loss_clip": 1.0258801, "balance_loss_mlp": 0.05301319, "epoch": 0.7166090485495266, "flos": 69928060464000.0, "grad_norm": 0.6377753369933555, "language_loss": 0.52944678, "learning_rate": 7.848086837452639e-07, "loss": 0.54181665, "num_input_tokens_seen": 257121215, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.07324219, "step": 11919, "time_per_iteration": 3.208137035369873 }, { "auxiliary_loss_clip": 0.01294544, "auxiliary_loss_mlp": 0.00231933, "balance_loss_clip": 1.07156801, "balance_loss_mlp": 0.20561175, "epoch": 0.7166691718021945, "flos": 27343892949120.0, "grad_norm": 10.029736608656009, "language_loss": 0.73919046, "learning_rate": 7.844993782066132e-07, "loss": 0.75445521, "num_input_tokens_seen": 257143370, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26379395, "step": 11920, "time_per_iteration": 4.231455564498901 }, { "auxiliary_loss_clip": 0.01267635, "auxiliary_loss_mlp": 0.00213123, "balance_loss_clip": 1.04934192, "balance_loss_mlp": 0.18470398, "epoch": 0.7167292950548625, "flos": 30408868563840.0, "grad_norm": 8.262721794304813, "language_loss": 0.82358849, "learning_rate": 7.841901187598678e-07, "loss": 0.83839607, "num_input_tokens_seen": 257162160, "router_z_loss_clip": 2.18457031, "router_z_loss_mlp": 0.28442383, "step": 11921, "time_per_iteration": 2.7021360397338867 }, { "auxiliary_loss_clip": 0.01287914, "auxiliary_loss_mlp": 0.00250266, "balance_loss_clip": 1.06517196, "balance_loss_mlp": 0.22078547, "epoch": 0.7167894183075304, "flos": 14571257800320.0, "grad_norm": 33.28079654794878, "language_loss": 0.86775792, "learning_rate": 7.83880905416755e-07, "loss": 0.88313973, "num_input_tokens_seen": 257179300, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.2947998, "step": 11922, "time_per_iteration": 4.082406997680664 }, { "auxiliary_loss_clip": 0.01191035, "auxiliary_loss_mlp": 0.00071154, "balance_loss_clip": 1.03875232, "balance_loss_mlp": 0.06290495, "epoch": 0.7168495415601984, "flos": 64110674407680.0, "grad_norm": 0.7511269270598219, "language_loss": 0.54569602, "learning_rate": 7.83571738189001e-07, "loss": 0.5583179, "num_input_tokens_seen": 257235470, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.08251953, "step": 11923, "time_per_iteration": 2.9721767902374268 }, { "auxiliary_loss_clip": 0.0128907, "auxiliary_loss_mlp": 0.00215849, "balance_loss_clip": 1.06755817, "balance_loss_mlp": 0.18889561, "epoch": 0.7169096648128663, "flos": 24681440119680.0, "grad_norm": 7.629939222497074, "language_loss": 0.84847987, "learning_rate": 7.832626170883279e-07, "loss": 0.86352903, "num_input_tokens_seen": 257255850, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.26940918, "step": 11924, "time_per_iteration": 2.6907904148101807 }, { "auxiliary_loss_clip": 0.01246472, "auxiliary_loss_mlp": 0.00216752, "balance_loss_clip": 1.03681493, "balance_loss_mlp": 0.19248073, "epoch": 0.7169697880655344, "flos": 20667525050880.0, "grad_norm": 5.477023946022965, "language_loss": 0.77063525, "learning_rate": 7.829535421264588e-07, "loss": 0.78526747, "num_input_tokens_seen": 257275425, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.24267578, "step": 11925, "time_per_iteration": 4.1361799240112305 }, { "auxiliary_loss_clip": 0.01248798, "auxiliary_loss_mlp": 0.00193376, "balance_loss_clip": 1.03850234, "balance_loss_mlp": 0.16912889, "epoch": 0.7170299113182023, "flos": 21032700670080.0, "grad_norm": 10.637210698786488, "language_loss": 0.84144413, "learning_rate": 7.826445133151133e-07, "loss": 0.8558659, "num_input_tokens_seen": 257295740, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.24255371, "step": 11926, "time_per_iteration": 2.782789468765259 }, { "auxiliary_loss_clip": 0.01268474, "auxiliary_loss_mlp": 0.00221548, "balance_loss_clip": 1.04856217, "balance_loss_mlp": 0.19569129, "epoch": 0.7170900345708703, "flos": 22893304239360.0, "grad_norm": 9.014022248344915, "language_loss": 0.85278666, "learning_rate": 7.823355306660093e-07, "loss": 0.86768693, "num_input_tokens_seen": 257315970, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.25866699, "step": 11927, "time_per_iteration": 2.692638397216797 }, { "auxiliary_loss_clip": 0.01277987, "auxiliary_loss_mlp": 0.00221798, "balance_loss_clip": 1.05713391, "balance_loss_mlp": 0.19125679, "epoch": 0.7171501578235383, "flos": 15518688883200.0, "grad_norm": 266.91809568443483, "language_loss": 0.78218329, "learning_rate": 7.820265941908642e-07, "loss": 0.79718113, "num_input_tokens_seen": 257334230, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.30505371, "step": 11928, "time_per_iteration": 2.686121940612793 }, { "auxiliary_loss_clip": 0.01254692, "auxiliary_loss_mlp": 0.00212798, "balance_loss_clip": 1.04575753, "balance_loss_mlp": 0.1870247, "epoch": 0.7172102810762062, "flos": 26104292640000.0, "grad_norm": 12.646325122161398, "language_loss": 0.71942115, "learning_rate": 7.817177039013931e-07, "loss": 0.73409605, "num_input_tokens_seen": 257352145, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.25769043, "step": 11929, "time_per_iteration": 4.119197368621826 }, { "auxiliary_loss_clip": 0.01277903, "auxiliary_loss_mlp": 0.00214275, "balance_loss_clip": 1.05448365, "balance_loss_mlp": 0.18798964, "epoch": 0.7172704043288742, "flos": 21506649649920.0, "grad_norm": 12.879138468175295, "language_loss": 0.81176162, "learning_rate": 7.81408859809308e-07, "loss": 0.8266834, "num_input_tokens_seen": 257371460, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26306152, "step": 11930, "time_per_iteration": 2.7036526203155518 }, { "auxiliary_loss_clip": 0.01270081, "auxiliary_loss_mlp": 0.00197007, "balance_loss_clip": 1.05743527, "balance_loss_mlp": 0.17203259, "epoch": 0.7173305275815421, "flos": 18770939032320.0, "grad_norm": 22.95463412717161, "language_loss": 0.90153819, "learning_rate": 7.811000619263219e-07, "loss": 0.9162091, "num_input_tokens_seen": 257390800, "router_z_loss_clip": 2.12597656, "router_z_loss_mlp": 0.24987793, "step": 11931, "time_per_iteration": 2.647714138031006 }, { "auxiliary_loss_clip": 0.01250316, "auxiliary_loss_mlp": 0.00220043, "balance_loss_clip": 1.04033279, "balance_loss_mlp": 0.19579619, "epoch": 0.7173906508342102, "flos": 16179876483840.0, "grad_norm": 481.2374985030743, "language_loss": 0.86100888, "learning_rate": 7.80791310264143e-07, "loss": 0.87571251, "num_input_tokens_seen": 257407495, "router_z_loss_clip": 2.10253906, "router_z_loss_mlp": 0.24267578, "step": 11932, "time_per_iteration": 2.6424949169158936 }, { "auxiliary_loss_clip": 0.01259525, "auxiliary_loss_mlp": 0.00211827, "balance_loss_clip": 1.04499197, "balance_loss_mlp": 0.18656684, "epoch": 0.7174507740868781, "flos": 26613864933120.0, "grad_norm": 13.85324772332025, "language_loss": 0.81430089, "learning_rate": 7.804826048344803e-07, "loss": 0.82901442, "num_input_tokens_seen": 257429675, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.25256348, "step": 11933, "time_per_iteration": 2.7740328311920166 }, { "auxiliary_loss_clip": 0.01288486, "auxiliary_loss_mlp": 0.00239329, "balance_loss_clip": 1.0577879, "balance_loss_mlp": 0.20938373, "epoch": 0.7175108973395461, "flos": 18432911116800.0, "grad_norm": 8.959877738238635, "language_loss": 0.84645581, "learning_rate": 7.801739456490388e-07, "loss": 0.86173403, "num_input_tokens_seen": 257442765, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.29956055, "step": 11934, "time_per_iteration": 2.5996956825256348 }, { "auxiliary_loss_clip": 0.01261466, "auxiliary_loss_mlp": 0.00206236, "balance_loss_clip": 1.04674792, "balance_loss_mlp": 0.17828137, "epoch": 0.717571020592214, "flos": 23914962777600.0, "grad_norm": 6.618132174380191, "language_loss": 0.93629003, "learning_rate": 7.798653327195237e-07, "loss": 0.95096701, "num_input_tokens_seen": 257459310, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.2791748, "step": 11935, "time_per_iteration": 2.6732680797576904 }, { "auxiliary_loss_clip": 0.01272979, "auxiliary_loss_mlp": 0.00225615, "balance_loss_clip": 1.06047201, "balance_loss_mlp": 0.19951968, "epoch": 0.717631143844882, "flos": 38256930109440.0, "grad_norm": 8.154293989888002, "language_loss": 0.81803894, "learning_rate": 7.795567660576388e-07, "loss": 0.83302486, "num_input_tokens_seen": 257484750, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.2611084, "step": 11936, "time_per_iteration": 2.833138942718506 }, { "auxiliary_loss_clip": 0.01191469, "auxiliary_loss_mlp": 0.00059612, "balance_loss_clip": 1.03945732, "balance_loss_mlp": 0.05241149, "epoch": 0.7176912670975499, "flos": 65515896328320.0, "grad_norm": 0.7550919204081958, "language_loss": 0.54969656, "learning_rate": 7.79248245675082e-07, "loss": 0.56220734, "num_input_tokens_seen": 257543110, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.07177734, "step": 11937, "time_per_iteration": 3.2498767375946045 }, { "auxiliary_loss_clip": 0.01275128, "auxiliary_loss_mlp": 0.00229831, "balance_loss_clip": 1.05531311, "balance_loss_mlp": 0.20172133, "epoch": 0.717751390350218, "flos": 31281066610560.0, "grad_norm": 12.2428878961128, "language_loss": 0.61375093, "learning_rate": 7.789397715835542e-07, "loss": 0.62880051, "num_input_tokens_seen": 257567410, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.28137207, "step": 11938, "time_per_iteration": 2.7231028079986572 }, { "auxiliary_loss_clip": 0.01260255, "auxiliary_loss_mlp": 0.00218438, "balance_loss_clip": 1.0467056, "balance_loss_mlp": 0.19290309, "epoch": 0.7178115136028859, "flos": 19859031774720.0, "grad_norm": 28.084082258614814, "language_loss": 0.84142596, "learning_rate": 7.786313437947527e-07, "loss": 0.85621285, "num_input_tokens_seen": 257586270, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.25549316, "step": 11939, "time_per_iteration": 2.6487059593200684 }, { "auxiliary_loss_clip": 0.0118519, "auxiliary_loss_mlp": 0.00077497, "balance_loss_clip": 1.03395808, "balance_loss_mlp": 0.0699634, "epoch": 0.7178716368555539, "flos": 64348655967360.0, "grad_norm": 4.143817666274519, "language_loss": 0.6049704, "learning_rate": 7.783229623203738e-07, "loss": 0.61759728, "num_input_tokens_seen": 257647415, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.07519531, "step": 11940, "time_per_iteration": 3.125281572341919 }, { "auxiliary_loss_clip": 0.01273334, "auxiliary_loss_mlp": 0.00197705, "balance_loss_clip": 1.06084824, "balance_loss_mlp": 0.17339832, "epoch": 0.7179317601082219, "flos": 26762607152640.0, "grad_norm": 3.168053533812203, "language_loss": 0.65411437, "learning_rate": 7.780146271721097e-07, "loss": 0.66882479, "num_input_tokens_seen": 257669795, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.24316406, "step": 11941, "time_per_iteration": 2.7014000415802 }, { "auxiliary_loss_clip": 0.01274934, "auxiliary_loss_mlp": 0.00230627, "balance_loss_clip": 1.0620079, "balance_loss_mlp": 0.20570049, "epoch": 0.7179918833608898, "flos": 23513804709120.0, "grad_norm": 4.598870124580506, "language_loss": 0.86272597, "learning_rate": 7.777063383616543e-07, "loss": 0.87778163, "num_input_tokens_seen": 257687415, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.24938965, "step": 11942, "time_per_iteration": 2.6897284984588623 }, { "auxiliary_loss_clip": 0.01267906, "auxiliary_loss_mlp": 0.00211709, "balance_loss_clip": 1.05214334, "balance_loss_mlp": 0.18681842, "epoch": 0.7180520066135578, "flos": 17165588486400.0, "grad_norm": 11.195815188512125, "language_loss": 0.75243604, "learning_rate": 7.773980959006968e-07, "loss": 0.76723218, "num_input_tokens_seen": 257706215, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24865723, "step": 11943, "time_per_iteration": 2.6274774074554443 }, { "auxiliary_loss_clip": 0.01272674, "auxiliary_loss_mlp": 0.00247864, "balance_loss_clip": 1.05353844, "balance_loss_mlp": 0.22346207, "epoch": 0.7181121298662257, "flos": 17566638814080.0, "grad_norm": 1723.7773482036296, "language_loss": 0.85947657, "learning_rate": 7.770898998009254e-07, "loss": 0.87468195, "num_input_tokens_seen": 257724740, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24438477, "step": 11944, "time_per_iteration": 2.694152593612671 }, { "auxiliary_loss_clip": 0.01287535, "auxiliary_loss_mlp": 0.00223375, "balance_loss_clip": 1.06398845, "balance_loss_mlp": 0.19494364, "epoch": 0.7181722531188938, "flos": 11947660508160.0, "grad_norm": 10.15681520764219, "language_loss": 0.75445485, "learning_rate": 7.767817500740277e-07, "loss": 0.76956391, "num_input_tokens_seen": 257742060, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.28442383, "step": 11945, "time_per_iteration": 2.6331703662872314 }, { "auxiliary_loss_clip": 0.01167605, "auxiliary_loss_mlp": 0.00103727, "balance_loss_clip": 1.01610684, "balance_loss_mlp": 0.09671738, "epoch": 0.7182323763715617, "flos": 65503649790720.0, "grad_norm": 0.6835535749883511, "language_loss": 0.50433487, "learning_rate": 7.76473646731689e-07, "loss": 0.51704818, "num_input_tokens_seen": 257802250, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.0703125, "step": 11946, "time_per_iteration": 3.096867799758911 }, { "auxiliary_loss_clip": 0.01285294, "auxiliary_loss_mlp": 0.00214961, "balance_loss_clip": 1.06409395, "balance_loss_mlp": 0.18786471, "epoch": 0.7182924996242297, "flos": 20630932070400.0, "grad_norm": 5.742867556199986, "language_loss": 0.82683671, "learning_rate": 7.761655897855925e-07, "loss": 0.84183925, "num_input_tokens_seen": 257821155, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.27087402, "step": 11947, "time_per_iteration": 2.635847806930542 }, { "auxiliary_loss_clip": 0.01260844, "auxiliary_loss_mlp": 0.00219633, "balance_loss_clip": 1.05024827, "balance_loss_mlp": 0.19414642, "epoch": 0.7183526228768976, "flos": 16216433550720.0, "grad_norm": 7.866271554400097, "language_loss": 0.8059808, "learning_rate": 7.758575792474187e-07, "loss": 0.82078552, "num_input_tokens_seen": 257839905, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.25512695, "step": 11948, "time_per_iteration": 2.6462740898132324 }, { "auxiliary_loss_clip": 0.01281404, "auxiliary_loss_mlp": 0.00254019, "balance_loss_clip": 1.06648111, "balance_loss_mlp": 0.22723305, "epoch": 0.7184127461295656, "flos": 22232655342720.0, "grad_norm": 3.7174083021791846, "language_loss": 0.78485847, "learning_rate": 7.755496151288483e-07, "loss": 0.80021274, "num_input_tokens_seen": 257860055, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.26757812, "step": 11949, "time_per_iteration": 2.6439085006713867 }, { "auxiliary_loss_clip": 0.01267572, "auxiliary_loss_mlp": 0.00224553, "balance_loss_clip": 1.04945612, "balance_loss_mlp": 0.19847031, "epoch": 0.7184728693822335, "flos": 27344503480320.0, "grad_norm": 2867.13708883593, "language_loss": 0.85918683, "learning_rate": 7.752416974415598e-07, "loss": 0.87410808, "num_input_tokens_seen": 257879315, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.26074219, "step": 11950, "time_per_iteration": 2.8285813331604004 }, { "auxiliary_loss_clip": 0.0128229, "auxiliary_loss_mlp": 0.0024866, "balance_loss_clip": 1.06408191, "balance_loss_mlp": 0.22196887, "epoch": 0.7185329926349016, "flos": 16508530949760.0, "grad_norm": 17.53060593532323, "language_loss": 0.77840233, "learning_rate": 7.749338261972282e-07, "loss": 0.79371184, "num_input_tokens_seen": 257896570, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.2668457, "step": 11951, "time_per_iteration": 2.704913854598999 }, { "auxiliary_loss_clip": 0.01281651, "auxiliary_loss_mlp": 0.00232361, "balance_loss_clip": 1.0582664, "balance_loss_mlp": 0.20514566, "epoch": 0.7185931158875695, "flos": 23951052967680.0, "grad_norm": 17.271431252928643, "language_loss": 0.86208498, "learning_rate": 7.746260014075286e-07, "loss": 0.8772251, "num_input_tokens_seen": 257916855, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.27209473, "step": 11952, "time_per_iteration": 2.7066574096679688 }, { "auxiliary_loss_clip": 0.01285418, "auxiliary_loss_mlp": 0.0023973, "balance_loss_clip": 1.06518388, "balance_loss_mlp": 0.21359886, "epoch": 0.7186532391402375, "flos": 26542007775360.0, "grad_norm": 48.16844378588123, "language_loss": 0.82661736, "learning_rate": 7.743182230841352e-07, "loss": 0.84186876, "num_input_tokens_seen": 257937140, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.26159668, "step": 11953, "time_per_iteration": 2.7572388648986816 }, { "auxiliary_loss_clip": 0.01292815, "auxiliary_loss_mlp": 0.00251596, "balance_loss_clip": 1.0655787, "balance_loss_mlp": 0.22365382, "epoch": 0.7187133623929055, "flos": 22383049587840.0, "grad_norm": 3.357789755071188, "language_loss": 0.82308835, "learning_rate": 7.740104912387164e-07, "loss": 0.83853245, "num_input_tokens_seen": 257956785, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.27966309, "step": 11954, "time_per_iteration": 2.6839640140533447 }, { "auxiliary_loss_clip": 0.012823, "auxiliary_loss_mlp": 0.00240461, "balance_loss_clip": 1.06385612, "balance_loss_mlp": 0.21376991, "epoch": 0.7187734856455734, "flos": 15779580341760.0, "grad_norm": 8.247387421709513, "language_loss": 0.81956148, "learning_rate": 7.737028058829425e-07, "loss": 0.83478904, "num_input_tokens_seen": 257975455, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.26672363, "step": 11955, "time_per_iteration": 2.843238592147827 }, { "auxiliary_loss_clip": 0.0126268, "auxiliary_loss_mlp": 0.00236231, "balance_loss_clip": 1.04996705, "balance_loss_mlp": 0.21216276, "epoch": 0.7188336088982414, "flos": 31759612531200.0, "grad_norm": 10.352524305193992, "language_loss": 0.81142282, "learning_rate": 7.733951670284817e-07, "loss": 0.82641196, "num_input_tokens_seen": 257996850, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.24072266, "step": 11956, "time_per_iteration": 2.7955563068389893 }, { "auxiliary_loss_clip": 0.01287323, "auxiliary_loss_mlp": 0.00235469, "balance_loss_clip": 1.06312215, "balance_loss_mlp": 0.20750269, "epoch": 0.7188937321509093, "flos": 21465208333440.0, "grad_norm": 5.46743126248071, "language_loss": 0.79256272, "learning_rate": 7.730875746869987e-07, "loss": 0.80779064, "num_input_tokens_seen": 258016145, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.27978516, "step": 11957, "time_per_iteration": 2.7715535163879395 }, { "auxiliary_loss_clip": 0.01284077, "auxiliary_loss_mlp": 0.00249224, "balance_loss_clip": 1.06129372, "balance_loss_mlp": 0.21896912, "epoch": 0.7189538554035774, "flos": 27271497087360.0, "grad_norm": 3.5168923581034885, "language_loss": 0.83218426, "learning_rate": 7.727800288701582e-07, "loss": 0.84751725, "num_input_tokens_seen": 258035420, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.30285645, "step": 11958, "time_per_iteration": 2.7629034519195557 }, { "auxiliary_loss_clip": 0.01259904, "auxiliary_loss_mlp": 0.00219078, "balance_loss_clip": 1.04090214, "balance_loss_mlp": 0.193663, "epoch": 0.7190139786562453, "flos": 21580625710080.0, "grad_norm": 16.794277923719616, "language_loss": 0.91561925, "learning_rate": 7.724725295896215e-07, "loss": 0.93040907, "num_input_tokens_seen": 258053520, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25402832, "step": 11959, "time_per_iteration": 2.669038772583008 }, { "auxiliary_loss_clip": 0.01281339, "auxiliary_loss_mlp": 0.00223452, "balance_loss_clip": 1.05732512, "balance_loss_mlp": 0.19647522, "epoch": 0.7190741019089133, "flos": 26721237663360.0, "grad_norm": 5.744386962422641, "language_loss": 0.89042354, "learning_rate": 7.7216507685705e-07, "loss": 0.90547156, "num_input_tokens_seen": 258073020, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.26953125, "step": 11960, "time_per_iteration": 2.6460134983062744 }, { "auxiliary_loss_clip": 0.01278968, "auxiliary_loss_mlp": 0.00249463, "balance_loss_clip": 1.05835605, "balance_loss_mlp": 0.22048274, "epoch": 0.7191342251615812, "flos": 26104759516800.0, "grad_norm": 61.44568339644394, "language_loss": 0.861283, "learning_rate": 7.718576706841013e-07, "loss": 0.8765673, "num_input_tokens_seen": 258093155, "router_z_loss_clip": 2.20410156, "router_z_loss_mlp": 0.28979492, "step": 11961, "time_per_iteration": 2.6783933639526367 }, { "auxiliary_loss_clip": 0.01270645, "auxiliary_loss_mlp": 0.00220876, "balance_loss_clip": 1.06094491, "balance_loss_mlp": 0.19717672, "epoch": 0.7191943484142492, "flos": 22967028904320.0, "grad_norm": 21.2956373254159, "language_loss": 0.80758417, "learning_rate": 7.715503110824326e-07, "loss": 0.82249933, "num_input_tokens_seen": 258113905, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.23693848, "step": 11962, "time_per_iteration": 4.067600250244141 }, { "auxiliary_loss_clip": 0.01277967, "auxiliary_loss_mlp": 0.00222357, "balance_loss_clip": 1.0540688, "balance_loss_mlp": 0.19571325, "epoch": 0.7192544716669171, "flos": 22565332131840.0, "grad_norm": 151.07154464750238, "language_loss": 0.82572287, "learning_rate": 7.712429980637001e-07, "loss": 0.84072614, "num_input_tokens_seen": 258132820, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26635742, "step": 11963, "time_per_iteration": 2.663830041885376 }, { "auxiliary_loss_clip": 0.01319084, "auxiliary_loss_mlp": 0.00242357, "balance_loss_clip": 1.08314097, "balance_loss_mlp": 0.21334137, "epoch": 0.7193145949195852, "flos": 18982200873600.0, "grad_norm": 40.65007559096879, "language_loss": 0.94660389, "learning_rate": 7.709357316395564e-07, "loss": 0.96221828, "num_input_tokens_seen": 258148055, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.28979492, "step": 11964, "time_per_iteration": 4.069465398788452 }, { "auxiliary_loss_clip": 0.01279608, "auxiliary_loss_mlp": 0.00253329, "balance_loss_clip": 1.05891967, "balance_loss_mlp": 0.22532724, "epoch": 0.7193747181722531, "flos": 18004246208640.0, "grad_norm": 108.46581405345161, "language_loss": 0.81560117, "learning_rate": 7.70628511821652e-07, "loss": 0.83093053, "num_input_tokens_seen": 258165995, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.27966309, "step": 11965, "time_per_iteration": 2.6524858474731445 }, { "auxiliary_loss_clip": 0.01288687, "auxiliary_loss_mlp": 0.00222494, "balance_loss_clip": 1.06413031, "balance_loss_mlp": 0.19579124, "epoch": 0.7194348414249211, "flos": 24389414547840.0, "grad_norm": 11.657674583756176, "language_loss": 0.86127383, "learning_rate": 7.703213386216377e-07, "loss": 0.87638563, "num_input_tokens_seen": 258186165, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26708984, "step": 11966, "time_per_iteration": 2.6706488132476807 }, { "auxiliary_loss_clip": 0.01265701, "auxiliary_loss_mlp": 0.00219864, "balance_loss_clip": 1.05122292, "balance_loss_mlp": 0.19649848, "epoch": 0.7194949646775891, "flos": 22163455791360.0, "grad_norm": 6.665520391476738, "language_loss": 0.80880672, "learning_rate": 7.700142120511619e-07, "loss": 0.8236624, "num_input_tokens_seen": 258204595, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.23376465, "step": 11967, "time_per_iteration": 4.0812389850616455 }, { "auxiliary_loss_clip": 0.01271883, "auxiliary_loss_mlp": 0.00211527, "balance_loss_clip": 1.05991709, "balance_loss_mlp": 0.18701792, "epoch": 0.719555087930257, "flos": 20266366982400.0, "grad_norm": 8.429505443655739, "language_loss": 0.88601232, "learning_rate": 7.6970713212187e-07, "loss": 0.90084636, "num_input_tokens_seen": 258223110, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.24499512, "step": 11968, "time_per_iteration": 2.685732126235962 }, { "auxiliary_loss_clip": 0.01278588, "auxiliary_loss_mlp": 0.00235189, "balance_loss_clip": 1.05891609, "balance_loss_mlp": 0.20984487, "epoch": 0.719615211182925, "flos": 24716309247360.0, "grad_norm": 26.40673377375941, "language_loss": 0.85160196, "learning_rate": 7.69400098845407e-07, "loss": 0.86673975, "num_input_tokens_seen": 258242660, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25354004, "step": 11969, "time_per_iteration": 2.7035915851593018 }, { "auxiliary_loss_clip": 0.01273618, "auxiliary_loss_mlp": 0.00236032, "balance_loss_clip": 1.05364263, "balance_loss_mlp": 0.20935297, "epoch": 0.719675334435593, "flos": 20009641501440.0, "grad_norm": 4.077165123040124, "language_loss": 0.78019047, "learning_rate": 7.69093112233417e-07, "loss": 0.79528701, "num_input_tokens_seen": 258261850, "router_z_loss_clip": 2.20410156, "router_z_loss_mlp": 0.2668457, "step": 11970, "time_per_iteration": 2.706408977508545 }, { "auxiliary_loss_clip": 0.01177366, "auxiliary_loss_mlp": 0.00148605, "balance_loss_clip": 1.01754701, "balance_loss_mlp": 0.14097515, "epoch": 0.719735457688261, "flos": 44199861177600.0, "grad_norm": 0.9253839073123786, "language_loss": 0.5985446, "learning_rate": 7.68786172297538e-07, "loss": 0.61180431, "num_input_tokens_seen": 258312570, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.07617188, "step": 11971, "time_per_iteration": 4.455890417098999 }, { "auxiliary_loss_clip": 0.01275942, "auxiliary_loss_mlp": 0.00238057, "balance_loss_clip": 1.05220294, "balance_loss_mlp": 0.21032888, "epoch": 0.7197955809409289, "flos": 16802890905600.0, "grad_norm": 204.16617404685942, "language_loss": 0.8965435, "learning_rate": 7.684792790494105e-07, "loss": 0.9116835, "num_input_tokens_seen": 258331600, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.27722168, "step": 11972, "time_per_iteration": 2.687319755554199 }, { "auxiliary_loss_clip": 0.01291396, "auxiliary_loss_mlp": 0.00235537, "balance_loss_clip": 1.07060146, "balance_loss_mlp": 0.20944166, "epoch": 0.7198557041935969, "flos": 24535391420160.0, "grad_norm": 4.312155147429244, "language_loss": 0.82065988, "learning_rate": 7.681724325006733e-07, "loss": 0.83592921, "num_input_tokens_seen": 258351785, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26086426, "step": 11973, "time_per_iteration": 2.73226261138916 }, { "auxiliary_loss_clip": 0.01177082, "auxiliary_loss_mlp": 0.00083242, "balance_loss_clip": 1.01614666, "balance_loss_mlp": 0.07599403, "epoch": 0.7199158274462648, "flos": 70710839602560.0, "grad_norm": 0.8313854656387055, "language_loss": 0.55821055, "learning_rate": 7.6786563266296e-07, "loss": 0.57081383, "num_input_tokens_seen": 258404035, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.07226562, "step": 11974, "time_per_iteration": 3.005429983139038 }, { "auxiliary_loss_clip": 0.01269321, "auxiliary_loss_mlp": 0.00220181, "balance_loss_clip": 1.05064058, "balance_loss_mlp": 0.19532628, "epoch": 0.7199759506989328, "flos": 29347995352320.0, "grad_norm": 7.08770582997476, "language_loss": 0.69264615, "learning_rate": 7.675588795479062e-07, "loss": 0.70754117, "num_input_tokens_seen": 258424850, "router_z_loss_clip": 2.18847656, "router_z_loss_mlp": 0.24841309, "step": 11975, "time_per_iteration": 2.7801480293273926 }, { "auxiliary_loss_clip": 0.01254887, "auxiliary_loss_mlp": 0.00208499, "balance_loss_clip": 1.03538144, "balance_loss_mlp": 0.18373913, "epoch": 0.7200360739516007, "flos": 24640465680000.0, "grad_norm": 21.052868106043128, "language_loss": 0.75716591, "learning_rate": 7.672521731671425e-07, "loss": 0.7717998, "num_input_tokens_seen": 258445485, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24755859, "step": 11976, "time_per_iteration": 2.705580711364746 }, { "auxiliary_loss_clip": 0.01269313, "auxiliary_loss_mlp": 0.00227527, "balance_loss_clip": 1.05587435, "balance_loss_mlp": 0.20229049, "epoch": 0.7200961972042688, "flos": 20812855478400.0, "grad_norm": 21.055087082594326, "language_loss": 0.74992645, "learning_rate": 7.669455135323004e-07, "loss": 0.76489484, "num_input_tokens_seen": 258464505, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.25244141, "step": 11977, "time_per_iteration": 2.686563014984131 }, { "auxiliary_loss_clip": 0.01284247, "auxiliary_loss_mlp": 0.00236879, "balance_loss_clip": 1.06343853, "balance_loss_mlp": 0.21147573, "epoch": 0.7201563204569367, "flos": 31245910174080.0, "grad_norm": 2.883417268537661, "language_loss": 0.83641291, "learning_rate": 7.666389006550074e-07, "loss": 0.85162419, "num_input_tokens_seen": 258487190, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25378418, "step": 11978, "time_per_iteration": 2.7775189876556396 }, { "auxiliary_loss_clip": 0.01275724, "auxiliary_loss_mlp": 0.00238939, "balance_loss_clip": 1.05908537, "balance_loss_mlp": 0.21311848, "epoch": 0.7202164437096047, "flos": 26651391667200.0, "grad_norm": 71.5673744653802, "language_loss": 0.85707206, "learning_rate": 7.663323345468908e-07, "loss": 0.87221873, "num_input_tokens_seen": 258503790, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.25805664, "step": 11979, "time_per_iteration": 2.7086057662963867 }, { "auxiliary_loss_clip": 0.01282397, "auxiliary_loss_mlp": 0.00242937, "balance_loss_clip": 1.05881512, "balance_loss_mlp": 0.21612713, "epoch": 0.7202765669622727, "flos": 25959608657280.0, "grad_norm": 2.34155609923816, "language_loss": 0.71218228, "learning_rate": 7.660258152195767e-07, "loss": 0.72743559, "num_input_tokens_seen": 258527335, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.2677002, "step": 11980, "time_per_iteration": 2.7212605476379395 }, { "auxiliary_loss_clip": 0.01279393, "auxiliary_loss_mlp": 0.00238517, "balance_loss_clip": 1.05585742, "balance_loss_mlp": 0.21227857, "epoch": 0.7203366902149406, "flos": 28512354372480.0, "grad_norm": 16.82305431011846, "language_loss": 0.76674664, "learning_rate": 7.657193426846871e-07, "loss": 0.78192574, "num_input_tokens_seen": 258546690, "router_z_loss_clip": 2.23535156, "router_z_loss_mlp": 0.2623291, "step": 11981, "time_per_iteration": 2.7050836086273193 }, { "auxiliary_loss_clip": 0.01288222, "auxiliary_loss_mlp": 0.00236004, "balance_loss_clip": 1.06398034, "balance_loss_mlp": 0.20765644, "epoch": 0.7203968134676086, "flos": 21106030285440.0, "grad_norm": 39.996194041635505, "language_loss": 0.81953472, "learning_rate": 7.65412916953843e-07, "loss": 0.834777, "num_input_tokens_seen": 258566340, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.2833252, "step": 11982, "time_per_iteration": 2.695765972137451 }, { "auxiliary_loss_clip": 0.01271304, "auxiliary_loss_mlp": 0.00227577, "balance_loss_clip": 1.05238104, "balance_loss_mlp": 0.20186332, "epoch": 0.7204569367202766, "flos": 18332146488960.0, "grad_norm": 65.72578191298544, "language_loss": 0.7538923, "learning_rate": 7.65106538038665e-07, "loss": 0.76888108, "num_input_tokens_seen": 258584455, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25695801, "step": 11983, "time_per_iteration": 2.6308202743530273 }, { "auxiliary_loss_clip": 0.01278897, "auxiliary_loss_mlp": 0.00254672, "balance_loss_clip": 1.05486679, "balance_loss_mlp": 0.22744417, "epoch": 0.7205170599729446, "flos": 23255103980160.0, "grad_norm": 27.819114363776198, "language_loss": 0.73824, "learning_rate": 7.648002059507715e-07, "loss": 0.75357574, "num_input_tokens_seen": 258604725, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.27233887, "step": 11984, "time_per_iteration": 2.701201915740967 }, { "auxiliary_loss_clip": 0.01316322, "auxiliary_loss_mlp": 0.00216763, "balance_loss_clip": 1.08471966, "balance_loss_mlp": 0.18861744, "epoch": 0.7205771832256125, "flos": 20120892900480.0, "grad_norm": 31.333632469713486, "language_loss": 0.82802451, "learning_rate": 7.644939207017771e-07, "loss": 0.84335536, "num_input_tokens_seen": 258622885, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.28161621, "step": 11985, "time_per_iteration": 2.641064405441284 }, { "auxiliary_loss_clip": 0.01270582, "auxiliary_loss_mlp": 0.00213202, "balance_loss_clip": 1.05429995, "balance_loss_mlp": 0.18754837, "epoch": 0.7206373064782805, "flos": 27703250565120.0, "grad_norm": 11.52233170752679, "language_loss": 0.68589664, "learning_rate": 7.641876823032977e-07, "loss": 0.7007345, "num_input_tokens_seen": 258644305, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25634766, "step": 11986, "time_per_iteration": 2.7199172973632812 }, { "auxiliary_loss_clip": 0.01284087, "auxiliary_loss_mlp": 0.00239347, "balance_loss_clip": 1.05869222, "balance_loss_mlp": 0.21129669, "epoch": 0.7206974297309484, "flos": 17968156018560.0, "grad_norm": 19.069202993554043, "language_loss": 0.78884959, "learning_rate": 7.638814907669455e-07, "loss": 0.80408382, "num_input_tokens_seen": 258661775, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.28039551, "step": 11987, "time_per_iteration": 2.6895010471343994 }, { "auxiliary_loss_clip": 0.01283081, "auxiliary_loss_mlp": 0.00239176, "balance_loss_clip": 1.05851269, "balance_loss_mlp": 0.21153107, "epoch": 0.7207575529836164, "flos": 16983162288000.0, "grad_norm": 46.19099880940014, "language_loss": 0.86691052, "learning_rate": 7.635753461043301e-07, "loss": 0.88213307, "num_input_tokens_seen": 258679830, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.27648926, "step": 11988, "time_per_iteration": 2.698059320449829 }, { "auxiliary_loss_clip": 0.01268328, "auxiliary_loss_mlp": 0.00221811, "balance_loss_clip": 1.05020583, "balance_loss_mlp": 0.19683644, "epoch": 0.7208176762362843, "flos": 18727594295040.0, "grad_norm": 6.410814048068348, "language_loss": 0.85214162, "learning_rate": 7.632692483270618e-07, "loss": 0.86704296, "num_input_tokens_seen": 258697415, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25, "step": 11989, "time_per_iteration": 2.6740670204162598 }, { "auxiliary_loss_clip": 0.01255594, "auxiliary_loss_mlp": 0.00248045, "balance_loss_clip": 1.03864741, "balance_loss_mlp": 0.2223081, "epoch": 0.7208777994889524, "flos": 18734489706240.0, "grad_norm": 8.572349543196555, "language_loss": 0.90791595, "learning_rate": 7.629631974467481e-07, "loss": 0.92295235, "num_input_tokens_seen": 258716755, "router_z_loss_clip": 2.17285156, "router_z_loss_mlp": 0.25756836, "step": 11990, "time_per_iteration": 2.685666799545288 }, { "auxiliary_loss_clip": 0.01289538, "auxiliary_loss_mlp": 0.00247137, "balance_loss_clip": 1.0622952, "balance_loss_mlp": 0.22005308, "epoch": 0.7209379227416203, "flos": 14793437376000.0, "grad_norm": 18.331604295828267, "language_loss": 0.86277282, "learning_rate": 7.626571934749931e-07, "loss": 0.87813962, "num_input_tokens_seen": 258733270, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.27075195, "step": 11991, "time_per_iteration": 2.617421865463257 }, { "auxiliary_loss_clip": 0.01274107, "auxiliary_loss_mlp": 0.00229003, "balance_loss_clip": 1.05062568, "balance_loss_mlp": 0.20262162, "epoch": 0.7209980459942883, "flos": 29636860527360.0, "grad_norm": 6.388065775798868, "language_loss": 0.78805083, "learning_rate": 7.623512364234022e-07, "loss": 0.80308193, "num_input_tokens_seen": 258755270, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.2635498, "step": 11992, "time_per_iteration": 2.746302604675293 }, { "auxiliary_loss_clip": 0.01292702, "auxiliary_loss_mlp": 0.00241478, "balance_loss_clip": 1.06250489, "balance_loss_mlp": 0.21308249, "epoch": 0.7210581692469563, "flos": 23477175815040.0, "grad_norm": 8.660892457368531, "language_loss": 0.73977697, "learning_rate": 7.620453263035755e-07, "loss": 0.75511873, "num_input_tokens_seen": 258775340, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.28356934, "step": 11993, "time_per_iteration": 2.72078275680542 }, { "auxiliary_loss_clip": 0.01275699, "auxiliary_loss_mlp": 0.00234308, "balance_loss_clip": 1.05721712, "balance_loss_mlp": 0.21037072, "epoch": 0.7211182924996242, "flos": 26099839353600.0, "grad_norm": 12.133936881067596, "language_loss": 0.7320618, "learning_rate": 7.61739463127115e-07, "loss": 0.74716187, "num_input_tokens_seen": 258794580, "router_z_loss_clip": 2.18847656, "router_z_loss_mlp": 0.23950195, "step": 11994, "time_per_iteration": 2.71977162361145 }, { "auxiliary_loss_clip": 0.0131178, "auxiliary_loss_mlp": 0.00227499, "balance_loss_clip": 1.07593036, "balance_loss_mlp": 0.19862671, "epoch": 0.7211784157522922, "flos": 17712076982400.0, "grad_norm": 15.29109309836306, "language_loss": 0.7661289, "learning_rate": 7.614336469056172e-07, "loss": 0.78152168, "num_input_tokens_seen": 258812330, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.28857422, "step": 11995, "time_per_iteration": 2.6521332263946533 }, { "auxiliary_loss_clip": 0.01288454, "auxiliary_loss_mlp": 0.00233967, "balance_loss_clip": 1.06649494, "balance_loss_mlp": 0.20671612, "epoch": 0.7212385390049602, "flos": 24423637230720.0, "grad_norm": 26.061642915747093, "language_loss": 0.86751246, "learning_rate": 7.6112787765068e-07, "loss": 0.88273668, "num_input_tokens_seen": 258831770, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.27233887, "step": 11996, "time_per_iteration": 2.692399263381958 }, { "auxiliary_loss_clip": 0.01300197, "auxiliary_loss_mlp": 0.00237727, "balance_loss_clip": 1.07074058, "balance_loss_mlp": 0.21058334, "epoch": 0.7212986622576282, "flos": 28147250580480.0, "grad_norm": 3.8818114038970344, "language_loss": 0.91973388, "learning_rate": 7.60822155373899e-07, "loss": 0.93511313, "num_input_tokens_seen": 258849090, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.27124023, "step": 11997, "time_per_iteration": 2.744286298751831 }, { "auxiliary_loss_clip": 0.01287665, "auxiliary_loss_mlp": 0.0022072, "balance_loss_clip": 1.06028461, "balance_loss_mlp": 0.1936473, "epoch": 0.7213587855102961, "flos": 21835770992640.0, "grad_norm": 1238.4657391461449, "language_loss": 0.78906381, "learning_rate": 7.605164800868646e-07, "loss": 0.80414772, "num_input_tokens_seen": 258868230, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.27062988, "step": 11998, "time_per_iteration": 2.687098503112793 }, { "auxiliary_loss_clip": 0.01263077, "auxiliary_loss_mlp": 0.00211432, "balance_loss_clip": 1.04519546, "balance_loss_mlp": 0.18553934, "epoch": 0.7214189087629641, "flos": 14611549881600.0, "grad_norm": 29.24405151464381, "language_loss": 0.79672396, "learning_rate": 7.602108518011696e-07, "loss": 0.81146908, "num_input_tokens_seen": 258885525, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.2590332, "step": 11999, "time_per_iteration": 2.707030773162842 }, { "auxiliary_loss_clip": 0.01285329, "auxiliary_loss_mlp": 0.00233863, "balance_loss_clip": 1.06151283, "balance_loss_mlp": 0.20628986, "epoch": 0.721479032015632, "flos": 19390864884480.0, "grad_norm": 32.891450241636356, "language_loss": 0.9069469, "learning_rate": 7.599052705284039e-07, "loss": 0.92213887, "num_input_tokens_seen": 258903245, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.27587891, "step": 12000, "time_per_iteration": 2.666729211807251 }, { "auxiliary_loss_clip": 0.01273804, "auxiliary_loss_mlp": 0.00226903, "balance_loss_clip": 1.05559194, "balance_loss_mlp": 0.20268011, "epoch": 0.7215391552683, "flos": 18512884748160.0, "grad_norm": 164.78102882984254, "language_loss": 0.85328454, "learning_rate": 7.59599736280154e-07, "loss": 0.86829156, "num_input_tokens_seen": 258921245, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24230957, "step": 12001, "time_per_iteration": 2.8500304222106934 }, { "auxiliary_loss_clip": 0.01269797, "auxiliary_loss_mlp": 0.00216622, "balance_loss_clip": 1.05096877, "balance_loss_mlp": 0.19101554, "epoch": 0.721599278520968, "flos": 23258731253760.0, "grad_norm": 37.1926755820163, "language_loss": 0.87862599, "learning_rate": 7.592942490680066e-07, "loss": 0.8934902, "num_input_tokens_seen": 258939425, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.25598145, "step": 12002, "time_per_iteration": 2.729344606399536 }, { "auxiliary_loss_clip": 0.01294095, "auxiliary_loss_mlp": 0.00213469, "balance_loss_clip": 1.06669271, "balance_loss_mlp": 0.18640804, "epoch": 0.721659401773636, "flos": 39199045979520.0, "grad_norm": 4.02729078884529, "language_loss": 0.72336805, "learning_rate": 7.589888089035462e-07, "loss": 0.73844367, "num_input_tokens_seen": 258960710, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.27062988, "step": 12003, "time_per_iteration": 2.8136518001556396 }, { "auxiliary_loss_clip": 0.01298767, "auxiliary_loss_mlp": 0.00245831, "balance_loss_clip": 1.07014298, "balance_loss_mlp": 0.21775705, "epoch": 0.7217195250263039, "flos": 14939917038720.0, "grad_norm": 14.252274180273691, "language_loss": 0.81094903, "learning_rate": 7.586834157983544e-07, "loss": 0.82639503, "num_input_tokens_seen": 258978475, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.28088379, "step": 12004, "time_per_iteration": 2.613208770751953 }, { "auxiliary_loss_clip": 0.01151325, "auxiliary_loss_mlp": 0.0006073, "balance_loss_clip": 0.99559057, "balance_loss_mlp": 0.0521468, "epoch": 0.7217796482789719, "flos": 70869206666880.0, "grad_norm": 0.8543799041770398, "language_loss": 0.53192043, "learning_rate": 7.583780697640112e-07, "loss": 0.54404098, "num_input_tokens_seen": 259037520, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.0859375, "step": 12005, "time_per_iteration": 4.473542928695679 }, { "auxiliary_loss_clip": 0.01287666, "auxiliary_loss_mlp": 0.00218609, "balance_loss_clip": 1.0618, "balance_loss_mlp": 0.19394496, "epoch": 0.7218397715316398, "flos": 37451525402880.0, "grad_norm": 36.761025856332225, "language_loss": 0.71219629, "learning_rate": 7.580727708120962e-07, "loss": 0.7272591, "num_input_tokens_seen": 259061325, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.24645996, "step": 12006, "time_per_iteration": 4.225692510604858 }, { "auxiliary_loss_clip": 0.01276949, "auxiliary_loss_mlp": 0.00217214, "balance_loss_clip": 1.0558089, "balance_loss_mlp": 0.19216776, "epoch": 0.7218998947843078, "flos": 22710662559360.0, "grad_norm": 173.99190768631524, "language_loss": 0.97985613, "learning_rate": 7.577675189541865e-07, "loss": 0.99479777, "num_input_tokens_seen": 259078135, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.25024414, "step": 12007, "time_per_iteration": 2.6619932651519775 }, { "auxiliary_loss_clip": 0.01274404, "auxiliary_loss_mlp": 0.00239581, "balance_loss_clip": 1.05065525, "balance_loss_mlp": 0.21352234, "epoch": 0.7219600180369758, "flos": 12167182477440.0, "grad_norm": 44.4385224785224, "language_loss": 0.74539208, "learning_rate": 7.574623142018568e-07, "loss": 0.7605319, "num_input_tokens_seen": 259095910, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.26074219, "step": 12008, "time_per_iteration": 2.6073551177978516 }, { "auxiliary_loss_clip": 0.01270335, "auxiliary_loss_mlp": 0.00221083, "balance_loss_clip": 1.04697537, "balance_loss_mlp": 0.19454658, "epoch": 0.7220201412896438, "flos": 22596573985920.0, "grad_norm": 33.74395203792997, "language_loss": 0.88282555, "learning_rate": 7.57157156566681e-07, "loss": 0.89773977, "num_input_tokens_seen": 259114225, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26538086, "step": 12009, "time_per_iteration": 4.125509738922119 }, { "auxiliary_loss_clip": 0.01300366, "auxiliary_loss_mlp": 0.00254465, "balance_loss_clip": 1.06899786, "balance_loss_mlp": 0.22742838, "epoch": 0.7220802645423118, "flos": 26718651884160.0, "grad_norm": 212.31168893122904, "language_loss": 0.71120381, "learning_rate": 7.568520460602297e-07, "loss": 0.7267521, "num_input_tokens_seen": 259134660, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.27038574, "step": 12010, "time_per_iteration": 2.7128357887268066 }, { "auxiliary_loss_clip": 0.01280105, "auxiliary_loss_mlp": 0.00223307, "balance_loss_clip": 1.0625205, "balance_loss_mlp": 0.19786739, "epoch": 0.7221403877949797, "flos": 24420548661120.0, "grad_norm": 94.05798383279836, "language_loss": 0.83583134, "learning_rate": 7.565469826940742e-07, "loss": 0.85086548, "num_input_tokens_seen": 259153300, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.2545166, "step": 12011, "time_per_iteration": 2.7237677574157715 }, { "auxiliary_loss_clip": 0.01281516, "auxiliary_loss_mlp": 0.00223253, "balance_loss_clip": 1.06051099, "balance_loss_mlp": 0.19717011, "epoch": 0.7222005110476477, "flos": 23514379326720.0, "grad_norm": 32.05653979635765, "language_loss": 0.86258161, "learning_rate": 7.56241966479781e-07, "loss": 0.87762934, "num_input_tokens_seen": 259172115, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.26074219, "step": 12012, "time_per_iteration": 2.7390973567962646 }, { "auxiliary_loss_clip": 0.01276219, "auxiliary_loss_mlp": 0.00223573, "balance_loss_clip": 1.04847956, "balance_loss_mlp": 0.19716819, "epoch": 0.7222606343003156, "flos": 23112538899840.0, "grad_norm": 5.714869652721236, "language_loss": 0.86140341, "learning_rate": 7.559369974289171e-07, "loss": 0.87640131, "num_input_tokens_seen": 259191345, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.26416016, "step": 12013, "time_per_iteration": 4.215647220611572 }, { "auxiliary_loss_clip": 0.01292628, "auxiliary_loss_mlp": 0.00232485, "balance_loss_clip": 1.07058883, "balance_loss_mlp": 0.20729646, "epoch": 0.7223207575529836, "flos": 24351169541760.0, "grad_norm": 188.28572466052273, "language_loss": 0.82655311, "learning_rate": 7.556320755530484e-07, "loss": 0.84180427, "num_input_tokens_seen": 259211700, "router_z_loss_clip": 2.22167969, "router_z_loss_mlp": 0.25170898, "step": 12014, "time_per_iteration": 2.738389015197754 }, { "auxiliary_loss_clip": 0.0128476, "auxiliary_loss_mlp": 0.0021688, "balance_loss_clip": 1.06307495, "balance_loss_mlp": 0.19027224, "epoch": 0.7223808808056515, "flos": 28330179569280.0, "grad_norm": 6.068376206425522, "language_loss": 0.92309165, "learning_rate": 7.553272008637346e-07, "loss": 0.93810809, "num_input_tokens_seen": 259233825, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.26599121, "step": 12015, "time_per_iteration": 2.772022247314453 }, { "auxiliary_loss_clip": 0.01270758, "auxiliary_loss_mlp": 0.0021099, "balance_loss_clip": 1.04875052, "balance_loss_mlp": 0.18643326, "epoch": 0.7224410040583196, "flos": 21069437304960.0, "grad_norm": 44.072045500354896, "language_loss": 0.86542636, "learning_rate": 7.55022373372538e-07, "loss": 0.88024384, "num_input_tokens_seen": 259253055, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.24560547, "step": 12016, "time_per_iteration": 2.6919190883636475 }, { "auxiliary_loss_clip": 0.01273208, "auxiliary_loss_mlp": 0.00223728, "balance_loss_clip": 1.05526495, "balance_loss_mlp": 0.19968396, "epoch": 0.7225011273109875, "flos": 26795429205120.0, "grad_norm": 69.01418673848607, "language_loss": 0.84320527, "learning_rate": 7.547175930910186e-07, "loss": 0.85817462, "num_input_tokens_seen": 259273420, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24084473, "step": 12017, "time_per_iteration": 2.7903904914855957 }, { "auxiliary_loss_clip": 0.01256766, "auxiliary_loss_mlp": 0.00203895, "balance_loss_clip": 1.04009604, "balance_loss_mlp": 0.17930201, "epoch": 0.7225612505636555, "flos": 23583578878080.0, "grad_norm": 33.28488959675894, "language_loss": 0.81191063, "learning_rate": 7.54412860030732e-07, "loss": 0.82651728, "num_input_tokens_seen": 259291000, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.24597168, "step": 12018, "time_per_iteration": 2.6843619346618652 }, { "auxiliary_loss_clip": 0.01255222, "auxiliary_loss_mlp": 0.00235381, "balance_loss_clip": 1.04184294, "balance_loss_mlp": 0.21010903, "epoch": 0.7226213738163234, "flos": 20777627214720.0, "grad_norm": 7.513429103109149, "language_loss": 0.84272408, "learning_rate": 7.541081742032347e-07, "loss": 0.85763013, "num_input_tokens_seen": 259312390, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.25256348, "step": 12019, "time_per_iteration": 2.87785267829895 }, { "auxiliary_loss_clip": 0.01262795, "auxiliary_loss_mlp": 0.00220087, "balance_loss_clip": 1.04975808, "balance_loss_mlp": 0.19470698, "epoch": 0.7226814970689914, "flos": 32635832901120.0, "grad_norm": 259.8497637730788, "language_loss": 0.82196414, "learning_rate": 7.53803535620081e-07, "loss": 0.83679295, "num_input_tokens_seen": 259332645, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.25378418, "step": 12020, "time_per_iteration": 2.924428701400757 }, { "auxiliary_loss_clip": 0.0126936, "auxiliary_loss_mlp": 0.0024594, "balance_loss_clip": 1.0465467, "balance_loss_mlp": 0.22227678, "epoch": 0.7227416203216595, "flos": 22454368041600.0, "grad_norm": 4.644978591650585, "language_loss": 0.83384532, "learning_rate": 7.534989442928219e-07, "loss": 0.84899831, "num_input_tokens_seen": 259353810, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.23669434, "step": 12021, "time_per_iteration": 2.898688554763794 }, { "auxiliary_loss_clip": 0.01282823, "auxiliary_loss_mlp": 0.00224909, "balance_loss_clip": 1.05838585, "balance_loss_mlp": 0.19724092, "epoch": 0.7228017435743274, "flos": 21652303299840.0, "grad_norm": 18.05328117920469, "language_loss": 0.74715316, "learning_rate": 7.531944002330073e-07, "loss": 0.76223052, "num_input_tokens_seen": 259372460, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.27661133, "step": 12022, "time_per_iteration": 2.7517173290252686 }, { "auxiliary_loss_clip": 0.01269642, "auxiliary_loss_mlp": 0.00246908, "balance_loss_clip": 1.04883754, "balance_loss_mlp": 0.22002634, "epoch": 0.7228618668269954, "flos": 29533474206720.0, "grad_norm": 10.098462373193954, "language_loss": 0.74963611, "learning_rate": 7.528899034521858e-07, "loss": 0.76480162, "num_input_tokens_seen": 259393275, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26879883, "step": 12023, "time_per_iteration": 2.7015674114227295 }, { "auxiliary_loss_clip": 0.01268836, "auxiliary_loss_mlp": 0.00225601, "balance_loss_clip": 1.0461843, "balance_loss_mlp": 0.19947004, "epoch": 0.7229219900796633, "flos": 27453815544960.0, "grad_norm": 33.59654137288184, "language_loss": 0.76428324, "learning_rate": 7.525854539619052e-07, "loss": 0.77922761, "num_input_tokens_seen": 259416205, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26123047, "step": 12024, "time_per_iteration": 2.707451105117798 }, { "auxiliary_loss_clip": 0.01262658, "auxiliary_loss_mlp": 0.00230141, "balance_loss_clip": 1.03713965, "balance_loss_mlp": 0.20298535, "epoch": 0.7229821133323313, "flos": 16289368116480.0, "grad_norm": 239.46872495358159, "language_loss": 0.83759469, "learning_rate": 7.522810517737089e-07, "loss": 0.85252273, "num_input_tokens_seen": 259433115, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.27148438, "step": 12025, "time_per_iteration": 2.656254768371582 }, { "auxiliary_loss_clip": 0.0126115, "auxiliary_loss_mlp": 0.0022059, "balance_loss_clip": 1.04206836, "balance_loss_mlp": 0.19431628, "epoch": 0.7230422365849992, "flos": 20412343854720.0, "grad_norm": 108.03002347626064, "language_loss": 0.8331632, "learning_rate": 7.519766968991395e-07, "loss": 0.84798062, "num_input_tokens_seen": 259450475, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.26257324, "step": 12026, "time_per_iteration": 2.697174072265625 }, { "auxiliary_loss_clip": 0.01269011, "auxiliary_loss_mlp": 0.00220287, "balance_loss_clip": 1.05357909, "balance_loss_mlp": 0.19266585, "epoch": 0.7231023598376672, "flos": 25593499284480.0, "grad_norm": 7.966147561993866, "language_loss": 0.78598845, "learning_rate": 7.516723893497388e-07, "loss": 0.80088139, "num_input_tokens_seen": 259469355, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.27624512, "step": 12027, "time_per_iteration": 2.720675468444824 }, { "auxiliary_loss_clip": 0.01272616, "auxiliary_loss_mlp": 0.00234768, "balance_loss_clip": 1.05283761, "balance_loss_mlp": 0.2095907, "epoch": 0.7231624830903352, "flos": 25149607009920.0, "grad_norm": 7.968967093858625, "language_loss": 0.87871087, "learning_rate": 7.513681291370469e-07, "loss": 0.89378476, "num_input_tokens_seen": 259486565, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.25170898, "step": 12028, "time_per_iteration": 2.823854446411133 }, { "auxiliary_loss_clip": 0.01279634, "auxiliary_loss_mlp": 0.00222456, "balance_loss_clip": 1.05619586, "balance_loss_mlp": 0.19766034, "epoch": 0.7232226063430032, "flos": 21725740656000.0, "grad_norm": 42.96263471786511, "language_loss": 0.88930637, "learning_rate": 7.510639162726e-07, "loss": 0.90432727, "num_input_tokens_seen": 259505070, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.24816895, "step": 12029, "time_per_iteration": 2.7128186225891113 }, { "auxiliary_loss_clip": 0.01163123, "auxiliary_loss_mlp": 0.0011819, "balance_loss_clip": 1.00681901, "balance_loss_mlp": 0.1101796, "epoch": 0.7232827295956711, "flos": 68436798491520.0, "grad_norm": 2.144642586085176, "language_loss": 0.61344457, "learning_rate": 7.507597507679347e-07, "loss": 0.62625766, "num_input_tokens_seen": 259569135, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.08007812, "step": 12030, "time_per_iteration": 3.311546564102173 }, { "auxiliary_loss_clip": 0.01267744, "auxiliary_loss_mlp": 0.00214628, "balance_loss_clip": 1.04630268, "balance_loss_mlp": 0.18681654, "epoch": 0.7233428528483391, "flos": 20192642317440.0, "grad_norm": 44.85885372039228, "language_loss": 0.86251915, "learning_rate": 7.504556326345859e-07, "loss": 0.87734282, "num_input_tokens_seen": 259587035, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.27819824, "step": 12031, "time_per_iteration": 2.7131011486053467 }, { "auxiliary_loss_clip": 0.01281894, "auxiliary_loss_mlp": 0.00253826, "balance_loss_clip": 1.05859089, "balance_loss_mlp": 0.22676536, "epoch": 0.723402976101007, "flos": 23949472769280.0, "grad_norm": 12.418482663991782, "language_loss": 0.89295501, "learning_rate": 7.501515618840834e-07, "loss": 0.9083122, "num_input_tokens_seen": 259606140, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.27075195, "step": 12032, "time_per_iteration": 2.705920457839966 }, { "auxiliary_loss_clip": 0.01274944, "auxiliary_loss_mlp": 0.00233493, "balance_loss_clip": 1.05012405, "balance_loss_mlp": 0.20633674, "epoch": 0.723463099353675, "flos": 20813394182400.0, "grad_norm": 5.2748871415768415, "language_loss": 0.85430598, "learning_rate": 7.498475385279592e-07, "loss": 0.86939037, "num_input_tokens_seen": 259624275, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.27148438, "step": 12033, "time_per_iteration": 2.6838576793670654 }, { "auxiliary_loss_clip": 0.01268165, "auxiliary_loss_mlp": 0.00206552, "balance_loss_clip": 1.04913151, "balance_loss_mlp": 0.18158942, "epoch": 0.723523222606343, "flos": 19098013299840.0, "grad_norm": 2.0629415054743667, "language_loss": 0.80835652, "learning_rate": 7.495435625777423e-07, "loss": 0.82310373, "num_input_tokens_seen": 259643465, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24963379, "step": 12034, "time_per_iteration": 2.6600399017333984 }, { "auxiliary_loss_clip": 0.0124278, "auxiliary_loss_mlp": 0.00225557, "balance_loss_clip": 1.02486813, "balance_loss_mlp": 0.20018888, "epoch": 0.723583345859011, "flos": 26506994993280.0, "grad_norm": 92.99637885367927, "language_loss": 0.89139611, "learning_rate": 7.492396340449578e-07, "loss": 0.90607947, "num_input_tokens_seen": 259662500, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25378418, "step": 12035, "time_per_iteration": 2.7290596961975098 }, { "auxiliary_loss_clip": 0.01286496, "auxiliary_loss_mlp": 0.00226932, "balance_loss_clip": 1.05626869, "balance_loss_mlp": 0.19852439, "epoch": 0.723643469111679, "flos": 16033863697920.0, "grad_norm": 3386.8964651176993, "language_loss": 0.69518805, "learning_rate": 7.489357529411326e-07, "loss": 0.71032238, "num_input_tokens_seen": 259680140, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.28417969, "step": 12036, "time_per_iteration": 2.6330206394195557 }, { "auxiliary_loss_clip": 0.01236461, "auxiliary_loss_mlp": 0.00206852, "balance_loss_clip": 1.02612507, "balance_loss_mlp": 0.18291432, "epoch": 0.7237035923643469, "flos": 21945549934080.0, "grad_norm": 4.8235916718369705, "language_loss": 0.75975448, "learning_rate": 7.486319192777883e-07, "loss": 0.77418756, "num_input_tokens_seen": 259700160, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.23925781, "step": 12037, "time_per_iteration": 2.667886972427368 }, { "auxiliary_loss_clip": 0.01271382, "auxiliary_loss_mlp": 0.00238989, "balance_loss_clip": 1.04620934, "balance_loss_mlp": 0.21165408, "epoch": 0.7237637156170149, "flos": 23583112001280.0, "grad_norm": 390.93073834807353, "language_loss": 0.81991184, "learning_rate": 7.483281330664479e-07, "loss": 0.83501548, "num_input_tokens_seen": 259720525, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.27331543, "step": 12038, "time_per_iteration": 2.7076480388641357 }, { "auxiliary_loss_clip": 0.012825, "auxiliary_loss_mlp": 0.00225504, "balance_loss_clip": 1.05601335, "balance_loss_mlp": 0.19846702, "epoch": 0.7238238388696828, "flos": 20594698225920.0, "grad_norm": 26.562260516497325, "language_loss": 0.80992603, "learning_rate": 7.480243943186293e-07, "loss": 0.82500601, "num_input_tokens_seen": 259738680, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.27038574, "step": 12039, "time_per_iteration": 2.739544153213501 }, { "auxiliary_loss_clip": 0.01250462, "auxiliary_loss_mlp": 0.00220608, "balance_loss_clip": 1.03822732, "balance_loss_mlp": 0.19777903, "epoch": 0.7238839621223508, "flos": 24207024263040.0, "grad_norm": 3.7866550103913323, "language_loss": 0.85539603, "learning_rate": 7.477207030458513e-07, "loss": 0.87010682, "num_input_tokens_seen": 259758790, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.2286377, "step": 12040, "time_per_iteration": 2.720722198486328 }, { "auxiliary_loss_clip": 0.01265887, "auxiliary_loss_mlp": 0.0021832, "balance_loss_clip": 1.03996968, "balance_loss_mlp": 0.19229637, "epoch": 0.7239440853750188, "flos": 14209745368320.0, "grad_norm": 3.3638649095431097, "language_loss": 0.85208243, "learning_rate": 7.474170592596301e-07, "loss": 0.86692446, "num_input_tokens_seen": 259777370, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26013184, "step": 12041, "time_per_iteration": 2.6597347259521484 }, { "auxiliary_loss_clip": 0.01252894, "auxiliary_loss_mlp": 0.00217207, "balance_loss_clip": 1.03300285, "balance_loss_mlp": 0.19223225, "epoch": 0.7240042086276868, "flos": 21614812479360.0, "grad_norm": 18.3092018530224, "language_loss": 0.74152803, "learning_rate": 7.471134629714797e-07, "loss": 0.75622904, "num_input_tokens_seen": 259794665, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24951172, "step": 12042, "time_per_iteration": 2.656081438064575 }, { "auxiliary_loss_clip": 0.01283594, "auxiliary_loss_mlp": 0.00247752, "balance_loss_clip": 1.05690813, "balance_loss_mlp": 0.22047734, "epoch": 0.7240643318803547, "flos": 23331450337920.0, "grad_norm": 6.049910069362165, "language_loss": 0.90746987, "learning_rate": 7.468099141929116e-07, "loss": 0.92278332, "num_input_tokens_seen": 259811110, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.27282715, "step": 12043, "time_per_iteration": 2.659255027770996 }, { "auxiliary_loss_clip": 0.01262366, "auxiliary_loss_mlp": 0.0023571, "balance_loss_clip": 1.04054368, "balance_loss_mlp": 0.20774373, "epoch": 0.7241244551330227, "flos": 24024849459840.0, "grad_norm": 28.09313221973264, "language_loss": 0.7282508, "learning_rate": 7.465064129354379e-07, "loss": 0.74323153, "num_input_tokens_seen": 259831080, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.27966309, "step": 12044, "time_per_iteration": 2.7109737396240234 }, { "auxiliary_loss_clip": 0.01268604, "auxiliary_loss_mlp": 0.00246281, "balance_loss_clip": 1.04796469, "balance_loss_mlp": 0.22018635, "epoch": 0.7241845783856906, "flos": 18730323728640.0, "grad_norm": 13.42573846496366, "language_loss": 0.88042843, "learning_rate": 7.462029592105658e-07, "loss": 0.89557731, "num_input_tokens_seen": 259850135, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26123047, "step": 12045, "time_per_iteration": 2.6442127227783203 }, { "auxiliary_loss_clip": 0.01251475, "auxiliary_loss_mlp": 0.00217862, "balance_loss_clip": 1.03407288, "balance_loss_mlp": 0.19435346, "epoch": 0.7242447016383586, "flos": 19498668577920.0, "grad_norm": 1.8255434433947455, "language_loss": 0.79080719, "learning_rate": 7.458995530298034e-07, "loss": 0.80550057, "num_input_tokens_seen": 259868185, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.23498535, "step": 12046, "time_per_iteration": 2.6434571743011475 }, { "auxiliary_loss_clip": 0.01256295, "auxiliary_loss_mlp": 0.00229185, "balance_loss_clip": 1.03838575, "balance_loss_mlp": 0.20398375, "epoch": 0.7243048248910267, "flos": 22163491704960.0, "grad_norm": 5.726548600967069, "language_loss": 0.8046295, "learning_rate": 7.455961944046553e-07, "loss": 0.81948423, "num_input_tokens_seen": 259887055, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.2520752, "step": 12047, "time_per_iteration": 4.071962356567383 }, { "auxiliary_loss_clip": 0.01278115, "auxiliary_loss_mlp": 0.00210867, "balance_loss_clip": 1.05233371, "balance_loss_mlp": 0.1853919, "epoch": 0.7243649481436946, "flos": 27672762896640.0, "grad_norm": 7.161236229036705, "language_loss": 0.79496324, "learning_rate": 7.45292883346627e-07, "loss": 0.80985308, "num_input_tokens_seen": 259908295, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.25463867, "step": 12048, "time_per_iteration": 4.130521535873413 }, { "auxiliary_loss_clip": 0.0112476, "auxiliary_loss_mlp": 0.00081424, "balance_loss_clip": 0.97220814, "balance_loss_mlp": 0.07365181, "epoch": 0.7244250713963626, "flos": 63244545759360.0, "grad_norm": 0.9352396133428663, "language_loss": 0.53099358, "learning_rate": 7.449896198672168e-07, "loss": 0.54305542, "num_input_tokens_seen": 259968475, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.07763672, "step": 12049, "time_per_iteration": 3.17824387550354 }, { "auxiliary_loss_clip": 0.0127365, "auxiliary_loss_mlp": 0.00258874, "balance_loss_clip": 1.04749703, "balance_loss_mlp": 0.23298165, "epoch": 0.7244851946490305, "flos": 17967114524160.0, "grad_norm": 7.607850557255949, "language_loss": 0.71443379, "learning_rate": 7.446864039779258e-07, "loss": 0.72975904, "num_input_tokens_seen": 259984865, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.25891113, "step": 12050, "time_per_iteration": 2.6474475860595703 }, { "auxiliary_loss_clip": 0.01116972, "auxiliary_loss_mlp": 0.00108793, "balance_loss_clip": 0.96234995, "balance_loss_mlp": 0.10149699, "epoch": 0.7245453179016985, "flos": 70943649603840.0, "grad_norm": 0.7148443151744304, "language_loss": 0.52273166, "learning_rate": 7.443832356902528e-07, "loss": 0.53498936, "num_input_tokens_seen": 260046735, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.07275391, "step": 12051, "time_per_iteration": 4.623785972595215 }, { "auxiliary_loss_clip": 0.01243553, "auxiliary_loss_mlp": 0.00252252, "balance_loss_clip": 1.02909935, "balance_loss_mlp": 0.22670534, "epoch": 0.7246054411543664, "flos": 24568464867840.0, "grad_norm": 3.378749155927843, "language_loss": 0.78021979, "learning_rate": 7.440801150156927e-07, "loss": 0.79517782, "num_input_tokens_seen": 260067950, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.25549316, "step": 12052, "time_per_iteration": 2.6937434673309326 }, { "auxiliary_loss_clip": 0.01251612, "auxiliary_loss_mlp": 0.00238259, "balance_loss_clip": 1.0302484, "balance_loss_mlp": 0.21113876, "epoch": 0.7246655644070344, "flos": 32338312548480.0, "grad_norm": 11.282816127506127, "language_loss": 0.80879098, "learning_rate": 7.437770419657415e-07, "loss": 0.8236897, "num_input_tokens_seen": 260087730, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.2713623, "step": 12053, "time_per_iteration": 2.7403669357299805 }, { "auxiliary_loss_clip": 0.01269423, "auxiliary_loss_mlp": 0.00230777, "balance_loss_clip": 1.0451355, "balance_loss_mlp": 0.20484917, "epoch": 0.7247256876597024, "flos": 21872471713920.0, "grad_norm": 4.3815430562959055, "language_loss": 0.87816846, "learning_rate": 7.434740165518898e-07, "loss": 0.89317054, "num_input_tokens_seen": 260107760, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.2590332, "step": 12054, "time_per_iteration": 2.653071641921997 }, { "auxiliary_loss_clip": 0.01249155, "auxiliary_loss_mlp": 0.00241343, "balance_loss_clip": 1.03106713, "balance_loss_mlp": 0.21621388, "epoch": 0.7247858109123704, "flos": 16213093585920.0, "grad_norm": 9.556909149608172, "language_loss": 0.79154909, "learning_rate": 7.431710387856301e-07, "loss": 0.80645406, "num_input_tokens_seen": 260123660, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.2512207, "step": 12055, "time_per_iteration": 2.6758198738098145 }, { "auxiliary_loss_clip": 0.01252814, "auxiliary_loss_mlp": 0.00242606, "balance_loss_clip": 1.03506732, "balance_loss_mlp": 0.21845451, "epoch": 0.7248459341650383, "flos": 20850705434880.0, "grad_norm": 68.77276886785229, "language_loss": 0.81238157, "learning_rate": 7.428681086784496e-07, "loss": 0.82733577, "num_input_tokens_seen": 260142690, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24157715, "step": 12056, "time_per_iteration": 4.080258131027222 }, { "auxiliary_loss_clip": 0.0125064, "auxiliary_loss_mlp": 0.00225557, "balance_loss_clip": 1.03224885, "balance_loss_mlp": 0.20105913, "epoch": 0.7249060574177063, "flos": 25921794614400.0, "grad_norm": 6.303727030519149, "language_loss": 0.77806783, "learning_rate": 7.425652262418368e-07, "loss": 0.79282987, "num_input_tokens_seen": 260162590, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24487305, "step": 12057, "time_per_iteration": 2.7776424884796143 }, { "auxiliary_loss_clip": 0.01263628, "auxiliary_loss_mlp": 0.00247192, "balance_loss_clip": 1.04024279, "balance_loss_mlp": 0.22104931, "epoch": 0.7249661806703742, "flos": 17345536646400.0, "grad_norm": 4.345539200625184, "language_loss": 0.70928895, "learning_rate": 7.42262391487277e-07, "loss": 0.72439706, "num_input_tokens_seen": 260181065, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26135254, "step": 12058, "time_per_iteration": 2.69521164894104 }, { "auxiliary_loss_clip": 0.01266325, "auxiliary_loss_mlp": 0.00248036, "balance_loss_clip": 1.042153, "balance_loss_mlp": 0.22296628, "epoch": 0.7250263039230422, "flos": 19574153009280.0, "grad_norm": 26.976626070991806, "language_loss": 0.81024867, "learning_rate": 7.419596044262535e-07, "loss": 0.82539225, "num_input_tokens_seen": 260200330, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.25073242, "step": 12059, "time_per_iteration": 2.6649599075317383 }, { "auxiliary_loss_clip": 0.01227025, "auxiliary_loss_mlp": 0.00233844, "balance_loss_clip": 1.01880765, "balance_loss_mlp": 0.21046749, "epoch": 0.7250864271757103, "flos": 21976648133760.0, "grad_norm": 12.63449340063702, "language_loss": 0.84425902, "learning_rate": 7.416568650702472e-07, "loss": 0.8588677, "num_input_tokens_seen": 260219975, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.23388672, "step": 12060, "time_per_iteration": 2.7001724243164062 }, { "auxiliary_loss_clip": 0.01238865, "auxiliary_loss_mlp": 0.0022635, "balance_loss_clip": 1.01957941, "balance_loss_mlp": 0.20201969, "epoch": 0.7251465504283782, "flos": 25012608537600.0, "grad_norm": 65.27919795730367, "language_loss": 0.82523793, "learning_rate": 7.413541734307393e-07, "loss": 0.83989006, "num_input_tokens_seen": 260242025, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24353027, "step": 12061, "time_per_iteration": 2.8386805057525635 }, { "auxiliary_loss_clip": 0.01248402, "auxiliary_loss_mlp": 0.00248076, "balance_loss_clip": 1.03756571, "balance_loss_mlp": 0.22400814, "epoch": 0.7252066736810462, "flos": 16690131135360.0, "grad_norm": 81.45767526515074, "language_loss": 0.86743212, "learning_rate": 7.410515295192068e-07, "loss": 0.88239688, "num_input_tokens_seen": 260260015, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.24072266, "step": 12062, "time_per_iteration": 2.675107955932617 }, { "auxiliary_loss_clip": 0.01285405, "auxiliary_loss_mlp": 0.0026109, "balance_loss_clip": 1.05607128, "balance_loss_mlp": 0.23342109, "epoch": 0.7252667969337141, "flos": 25703026830720.0, "grad_norm": 13.638672484690883, "language_loss": 0.8057844, "learning_rate": 7.407489333471262e-07, "loss": 0.82124937, "num_input_tokens_seen": 260278635, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.27661133, "step": 12063, "time_per_iteration": 2.690988063812256 }, { "auxiliary_loss_clip": 0.01240893, "auxiliary_loss_mlp": 0.00235042, "balance_loss_clip": 1.02335787, "balance_loss_mlp": 0.20914981, "epoch": 0.7253269201863821, "flos": 18259930195200.0, "grad_norm": 1001.2335629438238, "language_loss": 0.77250719, "learning_rate": 7.40446384925973e-07, "loss": 0.78726649, "num_input_tokens_seen": 260298510, "router_z_loss_clip": 2.17480469, "router_z_loss_mlp": 0.25891113, "step": 12064, "time_per_iteration": 2.6365723609924316 }, { "auxiliary_loss_clip": 0.01246427, "auxiliary_loss_mlp": 0.002567, "balance_loss_clip": 1.02759624, "balance_loss_mlp": 0.23170227, "epoch": 0.72538704343905, "flos": 20411805150720.0, "grad_norm": 9.056626644501517, "language_loss": 0.98317873, "learning_rate": 7.401438842672192e-07, "loss": 0.99821001, "num_input_tokens_seen": 260317405, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24975586, "step": 12065, "time_per_iteration": 2.589937210083008 }, { "auxiliary_loss_clip": 0.01089864, "auxiliary_loss_mlp": 0.0006551, "balance_loss_clip": 0.9403888, "balance_loss_mlp": 0.05897757, "epoch": 0.725447166691718, "flos": 70151209706880.0, "grad_norm": 3.6546765283462435, "language_loss": 0.55665648, "learning_rate": 7.398414313823349e-07, "loss": 0.56821024, "num_input_tokens_seen": 260388085, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.06542969, "step": 12066, "time_per_iteration": 3.334608793258667 }, { "auxiliary_loss_clip": 0.01250917, "auxiliary_loss_mlp": 0.00246661, "balance_loss_clip": 1.03526139, "balance_loss_mlp": 0.22178163, "epoch": 0.725507289944386, "flos": 27052334254080.0, "grad_norm": 13.573734116625271, "language_loss": 0.83150285, "learning_rate": 7.395390262827897e-07, "loss": 0.84647858, "num_input_tokens_seen": 260406165, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24865723, "step": 12067, "time_per_iteration": 2.669015407562256 }, { "auxiliary_loss_clip": 0.01093861, "auxiliary_loss_mlp": 0.00121592, "balance_loss_clip": 0.94269562, "balance_loss_mlp": 0.11453526, "epoch": 0.725567413197054, "flos": 62921924778240.0, "grad_norm": 0.7150106117422902, "language_loss": 0.56329787, "learning_rate": 7.392366689800515e-07, "loss": 0.57545245, "num_input_tokens_seen": 260461365, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.07080078, "step": 12068, "time_per_iteration": 3.0415451526641846 }, { "auxiliary_loss_clip": 0.01094892, "auxiliary_loss_mlp": 0.00129063, "balance_loss_clip": 0.9435392, "balance_loss_mlp": 0.12210134, "epoch": 0.7256275364497219, "flos": 60295957188480.0, "grad_norm": 0.6555718883480811, "language_loss": 0.55106884, "learning_rate": 7.389343594855848e-07, "loss": 0.56330836, "num_input_tokens_seen": 260523795, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.06982422, "step": 12069, "time_per_iteration": 3.1353037357330322 }, { "auxiliary_loss_clip": 0.0123998, "auxiliary_loss_mlp": 0.00232529, "balance_loss_clip": 1.02880526, "balance_loss_mlp": 0.20966437, "epoch": 0.7256876597023899, "flos": 24498511130880.0, "grad_norm": 3.22236282966861, "language_loss": 0.87662464, "learning_rate": 7.38632097810854e-07, "loss": 0.89134973, "num_input_tokens_seen": 260544765, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.2286377, "step": 12070, "time_per_iteration": 2.6457509994506836 }, { "auxiliary_loss_clip": 0.01243859, "auxiliary_loss_mlp": 0.00224169, "balance_loss_clip": 1.02886677, "balance_loss_mlp": 0.20024347, "epoch": 0.7257477829550578, "flos": 24352749740160.0, "grad_norm": 2.2172236758770354, "language_loss": 0.79382384, "learning_rate": 7.383298839673197e-07, "loss": 0.8085041, "num_input_tokens_seen": 260564340, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.23925781, "step": 12071, "time_per_iteration": 2.6753342151641846 }, { "auxiliary_loss_clip": 0.01242259, "auxiliary_loss_mlp": 0.00238651, "balance_loss_clip": 1.03244984, "balance_loss_mlp": 0.21469031, "epoch": 0.7258079062077258, "flos": 17202217380480.0, "grad_norm": 7.945172261506096, "language_loss": 0.75695908, "learning_rate": 7.380277179664436e-07, "loss": 0.77176821, "num_input_tokens_seen": 260582565, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.23962402, "step": 12072, "time_per_iteration": 2.727036237716675 }, { "auxiliary_loss_clip": 0.01258998, "auxiliary_loss_mlp": 0.00236676, "balance_loss_clip": 1.0352037, "balance_loss_mlp": 0.21186891, "epoch": 0.7258680294603939, "flos": 21580338401280.0, "grad_norm": 10.299992987992185, "language_loss": 0.83817196, "learning_rate": 7.377255998196821e-07, "loss": 0.85312873, "num_input_tokens_seen": 260601700, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.2479248, "step": 12073, "time_per_iteration": 2.738556385040283 }, { "auxiliary_loss_clip": 0.0126266, "auxiliary_loss_mlp": 0.0025568, "balance_loss_clip": 1.04257727, "balance_loss_mlp": 0.23010972, "epoch": 0.7259281527130618, "flos": 34855399036800.0, "grad_norm": 11.678413389618562, "language_loss": 0.77105212, "learning_rate": 7.374235295384923e-07, "loss": 0.78623557, "num_input_tokens_seen": 260623040, "router_z_loss_clip": 2.19628906, "router_z_loss_mlp": 0.25585938, "step": 12074, "time_per_iteration": 2.9134140014648438 }, { "auxiliary_loss_clip": 0.01254104, "auxiliary_loss_mlp": 0.00238183, "balance_loss_clip": 1.03729093, "balance_loss_mlp": 0.21277916, "epoch": 0.7259882759657298, "flos": 25404644551680.0, "grad_norm": 8.185945630682946, "language_loss": 0.80456036, "learning_rate": 7.371215071343302e-07, "loss": 0.81948316, "num_input_tokens_seen": 260642735, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.25402832, "step": 12075, "time_per_iteration": 2.74428653717041 }, { "auxiliary_loss_clip": 0.01256035, "auxiliary_loss_mlp": 0.00260344, "balance_loss_clip": 1.03316331, "balance_loss_mlp": 0.23304555, "epoch": 0.7260483992183977, "flos": 62953630531200.0, "grad_norm": 23.584045513066417, "language_loss": 0.70743024, "learning_rate": 7.368195326186458e-07, "loss": 0.72259402, "num_input_tokens_seen": 260669935, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.27355957, "step": 12076, "time_per_iteration": 3.070871114730835 }, { "auxiliary_loss_clip": 0.01253907, "auxiliary_loss_mlp": 0.00241512, "balance_loss_clip": 1.03565693, "balance_loss_mlp": 0.21735997, "epoch": 0.7261085224710657, "flos": 26467528924800.0, "grad_norm": 19.495163140102978, "language_loss": 0.86088657, "learning_rate": 7.365176060028912e-07, "loss": 0.87584072, "num_input_tokens_seen": 260689605, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24157715, "step": 12077, "time_per_iteration": 2.712242364883423 }, { "auxiliary_loss_clip": 0.01097362, "auxiliary_loss_mlp": 0.00092578, "balance_loss_clip": 0.94802356, "balance_loss_mlp": 0.08470994, "epoch": 0.7261686457237336, "flos": 66772732187520.0, "grad_norm": 0.8789776441714001, "language_loss": 0.64651084, "learning_rate": 7.362157272985163e-07, "loss": 0.65841031, "num_input_tokens_seen": 260748265, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.07861328, "step": 12078, "time_per_iteration": 3.171923875808716 }, { "auxiliary_loss_clip": 0.01101216, "auxiliary_loss_mlp": 0.00092906, "balance_loss_clip": 0.94899428, "balance_loss_mlp": 0.08575378, "epoch": 0.7262287689764017, "flos": 69999594399360.0, "grad_norm": 0.7023879364798159, "language_loss": 0.58860755, "learning_rate": 7.359138965169671e-07, "loss": 0.60054874, "num_input_tokens_seen": 260816715, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.07128906, "step": 12079, "time_per_iteration": 3.2756714820861816 }, { "auxiliary_loss_clip": 0.01255845, "auxiliary_loss_mlp": 0.00267513, "balance_loss_clip": 1.03924131, "balance_loss_mlp": 0.24170418, "epoch": 0.7262888922290696, "flos": 23805435231360.0, "grad_norm": 11.697391168651972, "language_loss": 0.74452758, "learning_rate": 7.356121136696895e-07, "loss": 0.75976121, "num_input_tokens_seen": 260836765, "router_z_loss_clip": 2.16894531, "router_z_loss_mlp": 0.25805664, "step": 12080, "time_per_iteration": 2.681201696395874 }, { "auxiliary_loss_clip": 0.01254445, "auxiliary_loss_mlp": 0.00267848, "balance_loss_clip": 1.03028321, "balance_loss_mlp": 0.24156255, "epoch": 0.7263490154817376, "flos": 19500320603520.0, "grad_norm": 1353.7176604243023, "language_loss": 0.81053519, "learning_rate": 7.35310378768128e-07, "loss": 0.8257581, "num_input_tokens_seen": 260854610, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.26306152, "step": 12081, "time_per_iteration": 2.6446871757507324 }, { "auxiliary_loss_clip": 0.01241769, "auxiliary_loss_mlp": 0.00232757, "balance_loss_clip": 1.02619338, "balance_loss_mlp": 0.20719807, "epoch": 0.7264091387344055, "flos": 16286243633280.0, "grad_norm": 11.711055465487718, "language_loss": 0.89375991, "learning_rate": 7.350086918237237e-07, "loss": 0.9085052, "num_input_tokens_seen": 260871620, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.25549316, "step": 12082, "time_per_iteration": 2.6734776496887207 }, { "auxiliary_loss_clip": 0.0127715, "auxiliary_loss_mlp": 0.00235681, "balance_loss_clip": 1.04515159, "balance_loss_mlp": 0.20757174, "epoch": 0.7264692619870735, "flos": 24352031468160.0, "grad_norm": 10.6639709717113, "language_loss": 0.86087918, "learning_rate": 7.347070528479158e-07, "loss": 0.87600756, "num_input_tokens_seen": 260890490, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.28125, "step": 12083, "time_per_iteration": 2.7450032234191895 }, { "auxiliary_loss_clip": 0.01266354, "auxiliary_loss_mlp": 0.00259185, "balance_loss_clip": 1.04331684, "balance_loss_mlp": 0.23328078, "epoch": 0.7265293852397414, "flos": 25119478477440.0, "grad_norm": 3.665624236241452, "language_loss": 0.80432618, "learning_rate": 7.344054618521433e-07, "loss": 0.81958157, "num_input_tokens_seen": 260909700, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.2590332, "step": 12084, "time_per_iteration": 2.6886215209960938 }, { "auxiliary_loss_clip": 0.01261847, "auxiliary_loss_mlp": 0.00239779, "balance_loss_clip": 1.04004526, "balance_loss_mlp": 0.21448243, "epoch": 0.7265895084924094, "flos": 22638230784000.0, "grad_norm": 3.098585695528353, "language_loss": 0.85272646, "learning_rate": 7.34103918847843e-07, "loss": 0.86774266, "num_input_tokens_seen": 260929090, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25292969, "step": 12085, "time_per_iteration": 2.688434600830078 }, { "auxiliary_loss_clip": 0.01264753, "auxiliary_loss_mlp": 0.00237633, "balance_loss_clip": 1.0411768, "balance_loss_mlp": 0.21207437, "epoch": 0.7266496317450775, "flos": 23368222886400.0, "grad_norm": 8.714331915310572, "language_loss": 0.791704, "learning_rate": 7.338024238464493e-07, "loss": 0.80672789, "num_input_tokens_seen": 260946615, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.25598145, "step": 12086, "time_per_iteration": 2.633610248565674 }, { "auxiliary_loss_clip": 0.01253618, "auxiliary_loss_mlp": 0.00234968, "balance_loss_clip": 1.03811932, "balance_loss_mlp": 0.20977867, "epoch": 0.7267097549977454, "flos": 28074603323520.0, "grad_norm": 11.323773298052332, "language_loss": 0.77264804, "learning_rate": 7.335009768593938e-07, "loss": 0.78753388, "num_input_tokens_seen": 260968515, "router_z_loss_clip": 2.15332031, "router_z_loss_mlp": 0.25158691, "step": 12087, "time_per_iteration": 2.726531982421875 }, { "auxiliary_loss_clip": 0.0128475, "auxiliary_loss_mlp": 0.00276867, "balance_loss_clip": 1.05714989, "balance_loss_mlp": 0.24903116, "epoch": 0.7267698782504134, "flos": 22195523658240.0, "grad_norm": 7.21447526576546, "language_loss": 0.86662984, "learning_rate": 7.331995778981088e-07, "loss": 0.88224602, "num_input_tokens_seen": 260986790, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.27832031, "step": 12088, "time_per_iteration": 2.672757148742676 }, { "auxiliary_loss_clip": 0.01261395, "auxiliary_loss_mlp": 0.00248747, "balance_loss_clip": 1.03851187, "balance_loss_mlp": 0.22136509, "epoch": 0.7268300015030813, "flos": 18514859996160.0, "grad_norm": 12.923488893392822, "language_loss": 0.81380951, "learning_rate": 7.328982269740221e-07, "loss": 0.82891089, "num_input_tokens_seen": 261004925, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.27355957, "step": 12089, "time_per_iteration": 4.1307213306427 }, { "auxiliary_loss_clip": 0.01261456, "auxiliary_loss_mlp": 0.00233135, "balance_loss_clip": 1.04430008, "balance_loss_mlp": 0.20849448, "epoch": 0.7268901247557493, "flos": 23986029836160.0, "grad_norm": 35.654303371393006, "language_loss": 0.79146791, "learning_rate": 7.325969240985616e-07, "loss": 0.80641389, "num_input_tokens_seen": 261023895, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.24633789, "step": 12090, "time_per_iteration": 4.115847826004028 }, { "auxiliary_loss_clip": 0.01253213, "auxiliary_loss_mlp": 0.00241123, "balance_loss_clip": 1.03807831, "balance_loss_mlp": 0.21471855, "epoch": 0.7269502480084172, "flos": 32088087429120.0, "grad_norm": 9.928690696610657, "language_loss": 0.84656465, "learning_rate": 7.322956692831528e-07, "loss": 0.86150807, "num_input_tokens_seen": 261045445, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.26403809, "step": 12091, "time_per_iteration": 2.818190813064575 }, { "auxiliary_loss_clip": 0.01259447, "auxiliary_loss_mlp": 0.0022384, "balance_loss_clip": 1.04024971, "balance_loss_mlp": 0.1995337, "epoch": 0.7270103712610853, "flos": 19062785036160.0, "grad_norm": 13.834609043654309, "language_loss": 0.79517519, "learning_rate": 7.319944625392205e-07, "loss": 0.81000811, "num_input_tokens_seen": 261064275, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24316406, "step": 12092, "time_per_iteration": 2.6594552993774414 }, { "auxiliary_loss_clip": 0.01267909, "auxiliary_loss_mlp": 0.00227182, "balance_loss_clip": 1.04900861, "balance_loss_mlp": 0.20355485, "epoch": 0.7270704945137532, "flos": 34532921710080.0, "grad_norm": 25.099596956599733, "language_loss": 0.70042276, "learning_rate": 7.31693303878184e-07, "loss": 0.71537369, "num_input_tokens_seen": 261083310, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.23620605, "step": 12093, "time_per_iteration": 4.21579384803772 }, { "auxiliary_loss_clip": 0.01269741, "auxiliary_loss_mlp": 0.00241915, "balance_loss_clip": 1.0534668, "balance_loss_mlp": 0.21839526, "epoch": 0.7271306177664212, "flos": 21507583403520.0, "grad_norm": 8.501364100824022, "language_loss": 0.80477196, "learning_rate": 7.313921933114644e-07, "loss": 0.81988853, "num_input_tokens_seen": 261103460, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.23522949, "step": 12094, "time_per_iteration": 2.6825344562530518 }, { "auxiliary_loss_clip": 0.01240656, "auxiliary_loss_mlp": 0.00212874, "balance_loss_clip": 1.03211808, "balance_loss_mlp": 0.1905939, "epoch": 0.7271907410190891, "flos": 22272444633600.0, "grad_norm": 44.84208940070094, "language_loss": 0.93193215, "learning_rate": 7.310911308504808e-07, "loss": 0.9464674, "num_input_tokens_seen": 261121375, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.22265625, "step": 12095, "time_per_iteration": 2.671861410140991 }, { "auxiliary_loss_clip": 0.01277523, "auxiliary_loss_mlp": 0.00237402, "balance_loss_clip": 1.05369389, "balance_loss_mlp": 0.20981744, "epoch": 0.7272508642717571, "flos": 22893124671360.0, "grad_norm": 4.664460477296896, "language_loss": 0.87002295, "learning_rate": 7.307901165066479e-07, "loss": 0.88517225, "num_input_tokens_seen": 261141105, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.27600098, "step": 12096, "time_per_iteration": 2.673971176147461 }, { "auxiliary_loss_clip": 0.01271215, "auxiliary_loss_mlp": 0.00227754, "balance_loss_clip": 1.05008245, "balance_loss_mlp": 0.20261317, "epoch": 0.727310987524425, "flos": 11655886331520.0, "grad_norm": 6.731083439545469, "language_loss": 0.8216356, "learning_rate": 7.30489150291381e-07, "loss": 0.83662534, "num_input_tokens_seen": 261159255, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.25109863, "step": 12097, "time_per_iteration": 2.7495741844177246 }, { "auxiliary_loss_clip": 0.01285212, "auxiliary_loss_mlp": 0.0026012, "balance_loss_clip": 1.05725908, "balance_loss_mlp": 0.23121193, "epoch": 0.727371110777093, "flos": 24535319592960.0, "grad_norm": 2.238590421368581, "language_loss": 0.86884624, "learning_rate": 7.301882322160935e-07, "loss": 0.88429952, "num_input_tokens_seen": 261177960, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.28918457, "step": 12098, "time_per_iteration": 4.096341371536255 }, { "auxiliary_loss_clip": 0.01275339, "auxiliary_loss_mlp": 0.00261135, "balance_loss_clip": 1.04681957, "balance_loss_mlp": 0.23409879, "epoch": 0.7274312340297611, "flos": 74739835405440.0, "grad_norm": 39.489435095921756, "language_loss": 0.76474637, "learning_rate": 7.298873622921952e-07, "loss": 0.78011107, "num_input_tokens_seen": 261205660, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.27038574, "step": 12099, "time_per_iteration": 3.1368422508239746 }, { "auxiliary_loss_clip": 0.01291622, "auxiliary_loss_mlp": 0.00264759, "balance_loss_clip": 1.05453777, "balance_loss_mlp": 0.2349565, "epoch": 0.727491357282429, "flos": 22342865247360.0, "grad_norm": 411.16012032047, "language_loss": 0.82548571, "learning_rate": 7.29586540531095e-07, "loss": 0.84104949, "num_input_tokens_seen": 261225185, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.2980957, "step": 12100, "time_per_iteration": 2.7505297660827637 }, { "auxiliary_loss_clip": 0.0128041, "auxiliary_loss_mlp": 0.00242168, "balance_loss_clip": 1.06242013, "balance_loss_mlp": 0.21672916, "epoch": 0.727551480535097, "flos": 23297550877440.0, "grad_norm": 2.972072811239429, "language_loss": 0.81146657, "learning_rate": 7.292857669442005e-07, "loss": 0.82669234, "num_input_tokens_seen": 261247965, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25439453, "step": 12101, "time_per_iteration": 2.7899773120880127 }, { "auxiliary_loss_clip": 0.01258146, "auxiliary_loss_mlp": 0.00218798, "balance_loss_clip": 1.04257107, "balance_loss_mlp": 0.19592139, "epoch": 0.7276116037877649, "flos": 21470559459840.0, "grad_norm": 15.479879239277544, "language_loss": 0.89413977, "learning_rate": 7.289850415429177e-07, "loss": 0.9089092, "num_input_tokens_seen": 261267585, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.22888184, "step": 12102, "time_per_iteration": 2.6946518421173096 }, { "auxiliary_loss_clip": 0.01258904, "auxiliary_loss_mlp": 0.00225409, "balance_loss_clip": 1.042539, "balance_loss_mlp": 0.20069724, "epoch": 0.7276717270404329, "flos": 21464059098240.0, "grad_norm": 7.11643406185164, "language_loss": 0.87777305, "learning_rate": 7.286843643386495e-07, "loss": 0.89261615, "num_input_tokens_seen": 261285200, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.24707031, "step": 12103, "time_per_iteration": 2.725360631942749 }, { "auxiliary_loss_clip": 0.01273293, "auxiliary_loss_mlp": 0.0021955, "balance_loss_clip": 1.05027652, "balance_loss_mlp": 0.19424143, "epoch": 0.7277318502931008, "flos": 16837221329280.0, "grad_norm": 23.181462925983205, "language_loss": 0.76702893, "learning_rate": 7.283837353427968e-07, "loss": 0.78195739, "num_input_tokens_seen": 261303645, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25305176, "step": 12104, "time_per_iteration": 2.643568515777588 }, { "auxiliary_loss_clip": 0.01264537, "auxiliary_loss_mlp": 0.00243948, "balance_loss_clip": 1.04474926, "balance_loss_mlp": 0.21843696, "epoch": 0.7277919735457689, "flos": 33400550476800.0, "grad_norm": 3.4796838070257308, "language_loss": 0.75092298, "learning_rate": 7.280831545667611e-07, "loss": 0.76600778, "num_input_tokens_seen": 261323265, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25537109, "step": 12105, "time_per_iteration": 2.7491822242736816 }, { "auxiliary_loss_clip": 0.01267182, "auxiliary_loss_mlp": 0.00239707, "balance_loss_clip": 1.04561365, "balance_loss_mlp": 0.21227714, "epoch": 0.7278520967984368, "flos": 19206499351680.0, "grad_norm": 3.547716866409734, "language_loss": 0.82550406, "learning_rate": 7.27782622021939e-07, "loss": 0.84057295, "num_input_tokens_seen": 261339745, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.27429199, "step": 12106, "time_per_iteration": 2.607492208480835 }, { "auxiliary_loss_clip": 0.01285073, "auxiliary_loss_mlp": 0.00243678, "balance_loss_clip": 1.06138706, "balance_loss_mlp": 0.21699874, "epoch": 0.7279122200511048, "flos": 34094667870720.0, "grad_norm": 334.13393317773574, "language_loss": 0.79613519, "learning_rate": 7.274821377197273e-07, "loss": 0.81142271, "num_input_tokens_seen": 261359310, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26660156, "step": 12107, "time_per_iteration": 2.752103328704834 }, { "auxiliary_loss_clip": 0.01260349, "auxiliary_loss_mlp": 0.00219, "balance_loss_clip": 1.04326308, "balance_loss_mlp": 0.19369218, "epoch": 0.7279723433037727, "flos": 54599049348480.0, "grad_norm": 28.11152081961286, "language_loss": 0.82603687, "learning_rate": 7.271817016715205e-07, "loss": 0.84083039, "num_input_tokens_seen": 261384640, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25280762, "step": 12108, "time_per_iteration": 2.936485767364502 }, { "auxiliary_loss_clip": 0.01287245, "auxiliary_loss_mlp": 0.00238711, "balance_loss_clip": 1.06352949, "balance_loss_mlp": 0.21148404, "epoch": 0.7280324665564407, "flos": 36137482156800.0, "grad_norm": 13.903603057504498, "language_loss": 0.7230953, "learning_rate": 7.268813138887124e-07, "loss": 0.7383548, "num_input_tokens_seen": 261405290, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.27209473, "step": 12109, "time_per_iteration": 2.768984317779541 }, { "auxiliary_loss_clip": 0.01272367, "auxiliary_loss_mlp": 0.00260698, "balance_loss_clip": 1.05097234, "balance_loss_mlp": 0.23305303, "epoch": 0.7280925898091086, "flos": 11618539165440.0, "grad_norm": 9.346265747416028, "language_loss": 0.74570274, "learning_rate": 7.265809743826912e-07, "loss": 0.76103342, "num_input_tokens_seen": 261419710, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.27661133, "step": 12110, "time_per_iteration": 2.5977776050567627 }, { "auxiliary_loss_clip": 0.01280578, "auxiliary_loss_mlp": 0.0024082, "balance_loss_clip": 1.05403578, "balance_loss_mlp": 0.21527302, "epoch": 0.7281527130617766, "flos": 34277094069120.0, "grad_norm": 30.24648436924254, "language_loss": 0.68184721, "learning_rate": 7.26280683164847e-07, "loss": 0.69706118, "num_input_tokens_seen": 261442385, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.25524902, "step": 12111, "time_per_iteration": 2.747955322265625 }, { "auxiliary_loss_clip": 0.01294927, "auxiliary_loss_mlp": 0.00229834, "balance_loss_clip": 1.06058788, "balance_loss_mlp": 0.20237993, "epoch": 0.7282128363144446, "flos": 13918043018880.0, "grad_norm": 2400.037514326801, "language_loss": 0.86554652, "learning_rate": 7.259804402465677e-07, "loss": 0.88079411, "num_input_tokens_seen": 261459805, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.27453613, "step": 12112, "time_per_iteration": 2.6613457202911377 }, { "auxiliary_loss_clip": 0.01267185, "auxiliary_loss_mlp": 0.00231431, "balance_loss_clip": 1.04775906, "balance_loss_mlp": 0.20652843, "epoch": 0.7282729595671126, "flos": 20777627214720.0, "grad_norm": 16.89572788049301, "language_loss": 0.74356925, "learning_rate": 7.25680245639237e-07, "loss": 0.75855541, "num_input_tokens_seen": 261477175, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.24914551, "step": 12113, "time_per_iteration": 2.702554702758789 }, { "auxiliary_loss_clip": 0.01272549, "auxiliary_loss_mlp": 0.00233295, "balance_loss_clip": 1.04925847, "balance_loss_mlp": 0.20668702, "epoch": 0.7283330828197806, "flos": 16325422392960.0, "grad_norm": 44.369101287803964, "language_loss": 0.80753446, "learning_rate": 7.253800993542399e-07, "loss": 0.82259291, "num_input_tokens_seen": 261494990, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26635742, "step": 12114, "time_per_iteration": 2.623629093170166 }, { "auxiliary_loss_clip": 0.01257588, "auxiliary_loss_mlp": 0.00245327, "balance_loss_clip": 1.04145026, "balance_loss_mlp": 0.21794438, "epoch": 0.7283932060724485, "flos": 27490193043840.0, "grad_norm": 36.993132534421, "language_loss": 0.76103586, "learning_rate": 7.250800014029564e-07, "loss": 0.77606499, "num_input_tokens_seen": 261514445, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.27380371, "step": 12115, "time_per_iteration": 2.779085874557495 }, { "auxiliary_loss_clip": 0.01286753, "auxiliary_loss_mlp": 0.00227305, "balance_loss_clip": 1.0594728, "balance_loss_mlp": 0.20019686, "epoch": 0.7284533293251165, "flos": 18367877543040.0, "grad_norm": 405.09157506531295, "language_loss": 0.68120849, "learning_rate": 7.247799517967674e-07, "loss": 0.69634908, "num_input_tokens_seen": 261533565, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.27087402, "step": 12116, "time_per_iteration": 2.732276439666748 }, { "auxiliary_loss_clip": 0.01295119, "auxiliary_loss_mlp": 0.00244203, "balance_loss_clip": 1.068488, "balance_loss_mlp": 0.21434067, "epoch": 0.7285134525777844, "flos": 21725525174400.0, "grad_norm": 9.506354351601134, "language_loss": 0.81813252, "learning_rate": 7.2447995054705e-07, "loss": 0.83352578, "num_input_tokens_seen": 261553795, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.29870605, "step": 12117, "time_per_iteration": 2.682467460632324 }, { "auxiliary_loss_clip": 0.01280491, "auxiliary_loss_mlp": 0.00245108, "balance_loss_clip": 1.0593195, "balance_loss_mlp": 0.21898963, "epoch": 0.7285735758304525, "flos": 20741357456640.0, "grad_norm": 160.7165542498642, "language_loss": 0.77343655, "learning_rate": 7.241799976651807e-07, "loss": 0.78869247, "num_input_tokens_seen": 261572565, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26135254, "step": 12118, "time_per_iteration": 2.736356735229492 }, { "auxiliary_loss_clip": 0.01267246, "auxiliary_loss_mlp": 0.00256369, "balance_loss_clip": 1.05497098, "balance_loss_mlp": 0.23300368, "epoch": 0.7286336990831204, "flos": 17310954827520.0, "grad_norm": 146.89779056020166, "language_loss": 0.91368973, "learning_rate": 7.238800931625346e-07, "loss": 0.92892587, "num_input_tokens_seen": 261590910, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.23376465, "step": 12119, "time_per_iteration": 2.7646474838256836 }, { "auxiliary_loss_clip": 0.0125879, "auxiliary_loss_mlp": 0.00215246, "balance_loss_clip": 1.04147148, "balance_loss_mlp": 0.19060591, "epoch": 0.7286938223357884, "flos": 19787390098560.0, "grad_norm": 117.53962686666505, "language_loss": 0.92112124, "learning_rate": 7.235802370504831e-07, "loss": 0.93586153, "num_input_tokens_seen": 261606005, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.24645996, "step": 12120, "time_per_iteration": 2.678436756134033 }, { "auxiliary_loss_clip": 0.0126597, "auxiliary_loss_mlp": 0.00215232, "balance_loss_clip": 1.04909921, "balance_loss_mlp": 0.19040096, "epoch": 0.7287539455884563, "flos": 15340859625600.0, "grad_norm": 106.98424610063344, "language_loss": 0.86961353, "learning_rate": 7.232804293403963e-07, "loss": 0.88442552, "num_input_tokens_seen": 261622305, "router_z_loss_clip": 2.17089844, "router_z_loss_mlp": 0.24841309, "step": 12121, "time_per_iteration": 2.6567625999450684 }, { "auxiliary_loss_clip": 0.01288645, "auxiliary_loss_mlp": 0.00234258, "balance_loss_clip": 1.05922365, "balance_loss_mlp": 0.20760277, "epoch": 0.7288140688411243, "flos": 25192484870400.0, "grad_norm": 8.186345709562667, "language_loss": 0.78831816, "learning_rate": 7.229806700436441e-07, "loss": 0.80354726, "num_input_tokens_seen": 261642465, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.26672363, "step": 12122, "time_per_iteration": 2.7476046085357666 }, { "auxiliary_loss_clip": 0.01236315, "auxiliary_loss_mlp": 0.00227351, "balance_loss_clip": 1.02819109, "balance_loss_mlp": 0.20381942, "epoch": 0.7288741920937922, "flos": 23984162328960.0, "grad_norm": 20.501816134195007, "language_loss": 0.93881905, "learning_rate": 7.226809591715923e-07, "loss": 0.95345581, "num_input_tokens_seen": 261661420, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.23547363, "step": 12123, "time_per_iteration": 2.7230021953582764 }, { "auxiliary_loss_clip": 0.01262324, "auxiliary_loss_mlp": 0.00234401, "balance_loss_clip": 1.04146433, "balance_loss_mlp": 0.208652, "epoch": 0.7289343153464602, "flos": 22744921155840.0, "grad_norm": 4.896195921757098, "language_loss": 0.89278805, "learning_rate": 7.223812967356065e-07, "loss": 0.90775532, "num_input_tokens_seen": 261680865, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25756836, "step": 12124, "time_per_iteration": 2.665356159210205 }, { "auxiliary_loss_clip": 0.01276073, "auxiliary_loss_mlp": 0.00255583, "balance_loss_clip": 1.05493462, "balance_loss_mlp": 0.23100173, "epoch": 0.7289944385991282, "flos": 24900028335360.0, "grad_norm": 2.5127645729155597, "language_loss": 0.75379413, "learning_rate": 7.220816827470499e-07, "loss": 0.76911068, "num_input_tokens_seen": 261701455, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.24572754, "step": 12125, "time_per_iteration": 2.7525475025177 }, { "auxiliary_loss_clip": 0.01294423, "auxiliary_loss_mlp": 0.00257196, "balance_loss_clip": 1.0609014, "balance_loss_mlp": 0.22921745, "epoch": 0.7290545618517962, "flos": 22967064817920.0, "grad_norm": 1.90918260976406, "language_loss": 0.82583708, "learning_rate": 7.217821172172855e-07, "loss": 0.8413533, "num_input_tokens_seen": 261721260, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.27978516, "step": 12126, "time_per_iteration": 2.8510475158691406 }, { "auxiliary_loss_clip": 0.0109545, "auxiliary_loss_mlp": 0.00054484, "balance_loss_clip": 0.94712412, "balance_loss_mlp": 0.04747497, "epoch": 0.7291146851044642, "flos": 61901523216000.0, "grad_norm": 0.8272256417272956, "language_loss": 0.57929534, "learning_rate": 7.2148260015767e-07, "loss": 0.59079468, "num_input_tokens_seen": 261779370, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.0703125, "step": 12127, "time_per_iteration": 3.1270980834960938 }, { "auxiliary_loss_clip": 0.012569, "auxiliary_loss_mlp": 0.00203943, "balance_loss_clip": 1.0408206, "balance_loss_mlp": 0.18145992, "epoch": 0.7291748083571321, "flos": 23330947547520.0, "grad_norm": 195.3503783303676, "language_loss": 0.78244293, "learning_rate": 7.21183131579562e-07, "loss": 0.79705143, "num_input_tokens_seen": 261798050, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.22497559, "step": 12128, "time_per_iteration": 2.658074140548706 }, { "auxiliary_loss_clip": 0.01256408, "auxiliary_loss_mlp": 0.00210009, "balance_loss_clip": 1.03474355, "balance_loss_mlp": 0.18414059, "epoch": 0.7292349316098001, "flos": 28330000001280.0, "grad_norm": 5.324848188690332, "language_loss": 0.74013877, "learning_rate": 7.20883711494319e-07, "loss": 0.75480294, "num_input_tokens_seen": 261817660, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.25866699, "step": 12129, "time_per_iteration": 2.712419271469116 }, { "auxiliary_loss_clip": 0.01251271, "auxiliary_loss_mlp": 0.0024342, "balance_loss_clip": 1.03789699, "balance_loss_mlp": 0.21871966, "epoch": 0.729295054862468, "flos": 24132222190080.0, "grad_norm": 5.418545663474037, "language_loss": 0.81653589, "learning_rate": 7.205843399132927e-07, "loss": 0.83148277, "num_input_tokens_seen": 261837935, "router_z_loss_clip": 2.13378906, "router_z_loss_mlp": 0.24731445, "step": 12130, "time_per_iteration": 2.7416512966156006 }, { "auxiliary_loss_clip": 0.01268837, "auxiliary_loss_mlp": 0.00215431, "balance_loss_clip": 1.04671371, "balance_loss_mlp": 0.18922891, "epoch": 0.7293551781151361, "flos": 22816239609600.0, "grad_norm": 2.952511640622813, "language_loss": 0.78170836, "learning_rate": 7.202850168478374e-07, "loss": 0.79655111, "num_input_tokens_seen": 261857575, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.26196289, "step": 12131, "time_per_iteration": 4.110085964202881 }, { "auxiliary_loss_clip": 0.01252354, "auxiliary_loss_mlp": 0.00228204, "balance_loss_clip": 1.03707969, "balance_loss_mlp": 0.20287196, "epoch": 0.729415301367804, "flos": 22126683242880.0, "grad_norm": 5.709691268912697, "language_loss": 0.84980553, "learning_rate": 7.199857423093025e-07, "loss": 0.86461115, "num_input_tokens_seen": 261877265, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.2532959, "step": 12132, "time_per_iteration": 2.6924636363983154 }, { "auxiliary_loss_clip": 0.0125906, "auxiliary_loss_mlp": 0.00199988, "balance_loss_clip": 1.04413831, "balance_loss_mlp": 0.17533529, "epoch": 0.729475424620472, "flos": 12349608675840.0, "grad_norm": 57.45528915869271, "language_loss": 0.88451552, "learning_rate": 7.196865163090358e-07, "loss": 0.89910603, "num_input_tokens_seen": 261893695, "router_z_loss_clip": 2.15136719, "router_z_loss_mlp": 0.24658203, "step": 12133, "time_per_iteration": 4.1014063358306885 }, { "auxiliary_loss_clip": 0.01254546, "auxiliary_loss_mlp": 0.00225997, "balance_loss_clip": 1.04056239, "balance_loss_mlp": 0.19980642, "epoch": 0.7295355478731399, "flos": 22195308176640.0, "grad_norm": 21.99897843270376, "language_loss": 0.80623585, "learning_rate": 7.193873388583846e-07, "loss": 0.82104135, "num_input_tokens_seen": 261911825, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.26171875, "step": 12134, "time_per_iteration": 2.641759157180786 }, { "auxiliary_loss_clip": 0.01282985, "auxiliary_loss_mlp": 0.00238043, "balance_loss_clip": 1.06008005, "balance_loss_mlp": 0.21226948, "epoch": 0.7295956711258079, "flos": 23222030532480.0, "grad_norm": 3.3257628560320813, "language_loss": 0.78871799, "learning_rate": 7.190882099686939e-07, "loss": 0.80392826, "num_input_tokens_seen": 261931190, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25769043, "step": 12135, "time_per_iteration": 2.6655220985412598 }, { "auxiliary_loss_clip": 0.0125792, "auxiliary_loss_mlp": 0.0024135, "balance_loss_clip": 1.0364908, "balance_loss_mlp": 0.21637535, "epoch": 0.7296557943784758, "flos": 31869104163840.0, "grad_norm": 17.645662666385217, "language_loss": 0.7283504, "learning_rate": 7.187891296513075e-07, "loss": 0.74334311, "num_input_tokens_seen": 261951240, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.24963379, "step": 12136, "time_per_iteration": 4.3085548877716064 }, { "auxiliary_loss_clip": 0.01263145, "auxiliary_loss_mlp": 0.00238278, "balance_loss_clip": 1.04467785, "balance_loss_mlp": 0.21422121, "epoch": 0.7297159176311439, "flos": 26651714889600.0, "grad_norm": 33.888076463111965, "language_loss": 0.81474495, "learning_rate": 7.184900979175654e-07, "loss": 0.82975912, "num_input_tokens_seen": 261971605, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24060059, "step": 12137, "time_per_iteration": 2.729807138442993 }, { "auxiliary_loss_clip": 0.01281273, "auxiliary_loss_mlp": 0.00220139, "balance_loss_clip": 1.05800378, "balance_loss_mlp": 0.19430621, "epoch": 0.7297760408838118, "flos": 24749562263040.0, "grad_norm": 5.483370348608696, "language_loss": 0.82298255, "learning_rate": 7.181911147788069e-07, "loss": 0.8379966, "num_input_tokens_seen": 261990830, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.25817871, "step": 12138, "time_per_iteration": 2.7305657863616943 }, { "auxiliary_loss_clip": 0.01260243, "auxiliary_loss_mlp": 0.0022573, "balance_loss_clip": 1.04041719, "balance_loss_mlp": 0.20070788, "epoch": 0.7298361641364798, "flos": 18073768982400.0, "grad_norm": 62.106252395720624, "language_loss": 0.82268447, "learning_rate": 7.178921802463702e-07, "loss": 0.8375442, "num_input_tokens_seen": 262008190, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25024414, "step": 12139, "time_per_iteration": 2.6892127990722656 }, { "auxiliary_loss_clip": 0.01259101, "auxiliary_loss_mlp": 0.00236483, "balance_loss_clip": 1.04480171, "balance_loss_mlp": 0.21247463, "epoch": 0.7298962873891478, "flos": 29895597169920.0, "grad_norm": 3.2464281567487423, "language_loss": 0.79798996, "learning_rate": 7.175932943315898e-07, "loss": 0.81294584, "num_input_tokens_seen": 262030460, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.24023438, "step": 12140, "time_per_iteration": 4.135778903961182 }, { "auxiliary_loss_clip": 0.01277539, "auxiliary_loss_mlp": 0.00242378, "balance_loss_clip": 1.05390823, "balance_loss_mlp": 0.21759447, "epoch": 0.7299564106418157, "flos": 32266096254720.0, "grad_norm": 11.939390105143806, "language_loss": 0.63581771, "learning_rate": 7.172944570458003e-07, "loss": 0.65101689, "num_input_tokens_seen": 262050830, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.24768066, "step": 12141, "time_per_iteration": 2.7798407077789307 }, { "auxiliary_loss_clip": 0.01276991, "auxiliary_loss_mlp": 0.00204162, "balance_loss_clip": 1.05970395, "balance_loss_mlp": 0.180177, "epoch": 0.7300165338944837, "flos": 22930292269440.0, "grad_norm": 4.528240783951071, "language_loss": 0.80143863, "learning_rate": 7.169956684003342e-07, "loss": 0.81625009, "num_input_tokens_seen": 262071245, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.23986816, "step": 12142, "time_per_iteration": 2.682826042175293 }, { "auxiliary_loss_clip": 0.01248634, "auxiliary_loss_mlp": 0.00201696, "balance_loss_clip": 1.03736854, "balance_loss_mlp": 0.17787784, "epoch": 0.7300766571471516, "flos": 19828795501440.0, "grad_norm": 31.202185643079506, "language_loss": 0.79662842, "learning_rate": 7.16696928406521e-07, "loss": 0.81113172, "num_input_tokens_seen": 262087525, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.23803711, "step": 12143, "time_per_iteration": 2.7022361755371094 }, { "auxiliary_loss_clip": 0.01279051, "auxiliary_loss_mlp": 0.00229261, "balance_loss_clip": 1.06027055, "balance_loss_mlp": 0.20296367, "epoch": 0.7301367803998197, "flos": 24347829576960.0, "grad_norm": 56.01705730540878, "language_loss": 0.74144959, "learning_rate": 7.163982370756882e-07, "loss": 0.75653267, "num_input_tokens_seen": 262107355, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.26318359, "step": 12144, "time_per_iteration": 2.709817409515381 }, { "auxiliary_loss_clip": 0.01292461, "auxiliary_loss_mlp": 0.00249697, "balance_loss_clip": 1.06854498, "balance_loss_mlp": 0.22423397, "epoch": 0.7301969036524876, "flos": 15304518040320.0, "grad_norm": 47.16023626613984, "language_loss": 0.86033523, "learning_rate": 7.160995944191627e-07, "loss": 0.87575674, "num_input_tokens_seen": 262125645, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.25488281, "step": 12145, "time_per_iteration": 2.67399263381958 }, { "auxiliary_loss_clip": 0.0125892, "auxiliary_loss_mlp": 0.00259122, "balance_loss_clip": 1.03997445, "balance_loss_mlp": 0.23266964, "epoch": 0.7302570269051556, "flos": 23507268433920.0, "grad_norm": 11.66439997265892, "language_loss": 0.98708725, "learning_rate": 7.158010004482702e-07, "loss": 1.0022676, "num_input_tokens_seen": 262144075, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.2644043, "step": 12146, "time_per_iteration": 2.666377544403076 }, { "auxiliary_loss_clip": 0.01276984, "auxiliary_loss_mlp": 0.00227848, "balance_loss_clip": 1.05465388, "balance_loss_mlp": 0.20197991, "epoch": 0.7303171501578235, "flos": 20523056549760.0, "grad_norm": 12.743594054439544, "language_loss": 0.67436433, "learning_rate": 7.155024551743316e-07, "loss": 0.68941265, "num_input_tokens_seen": 262165940, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25866699, "step": 12147, "time_per_iteration": 2.79508900642395 }, { "auxiliary_loss_clip": 0.01271738, "auxiliary_loss_mlp": 0.00239774, "balance_loss_clip": 1.05278039, "balance_loss_mlp": 0.21539578, "epoch": 0.7303772734104915, "flos": 18332613365760.0, "grad_norm": 30.759902019143446, "language_loss": 0.82518113, "learning_rate": 7.152039586086693e-07, "loss": 0.84029627, "num_input_tokens_seen": 262184520, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24353027, "step": 12148, "time_per_iteration": 2.634248733520508 }, { "auxiliary_loss_clip": 0.01140812, "auxiliary_loss_mlp": 0.00099413, "balance_loss_clip": 0.98543155, "balance_loss_mlp": 0.09235552, "epoch": 0.7304373966631594, "flos": 60654776100480.0, "grad_norm": 0.7445455277795336, "language_loss": 0.56129277, "learning_rate": 7.149055107626017e-07, "loss": 0.57369506, "num_input_tokens_seen": 262247070, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.07080078, "step": 12149, "time_per_iteration": 3.118022918701172 }, { "auxiliary_loss_clip": 0.01277485, "auxiliary_loss_mlp": 0.00221265, "balance_loss_clip": 1.06115007, "balance_loss_mlp": 0.19698198, "epoch": 0.7304975199158275, "flos": 19828077229440.0, "grad_norm": 24.4361253284381, "language_loss": 0.82950854, "learning_rate": 7.146071116474451e-07, "loss": 0.84449601, "num_input_tokens_seen": 262266605, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24304199, "step": 12150, "time_per_iteration": 2.6864802837371826 }, { "auxiliary_loss_clip": 0.01272267, "auxiliary_loss_mlp": 0.00218958, "balance_loss_clip": 1.05141258, "balance_loss_mlp": 0.19449672, "epoch": 0.7305576431684954, "flos": 13223997452160.0, "grad_norm": 342.07665874240166, "language_loss": 0.93157494, "learning_rate": 7.143087612745158e-07, "loss": 0.94648719, "num_input_tokens_seen": 262283880, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.24462891, "step": 12151, "time_per_iteration": 2.6406688690185547 }, { "auxiliary_loss_clip": 0.01278782, "auxiliary_loss_mlp": 0.0023185, "balance_loss_clip": 1.05816436, "balance_loss_mlp": 0.206864, "epoch": 0.7306177664211634, "flos": 24060472773120.0, "grad_norm": 26.780871819756527, "language_loss": 0.85945439, "learning_rate": 7.14010459655127e-07, "loss": 0.87456071, "num_input_tokens_seen": 262304155, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.24987793, "step": 12152, "time_per_iteration": 2.671936273574829 }, { "auxiliary_loss_clip": 0.01275775, "auxiliary_loss_mlp": 0.00239622, "balance_loss_clip": 1.06043971, "balance_loss_mlp": 0.21510062, "epoch": 0.7306778896738314, "flos": 27089106802560.0, "grad_norm": 3.964205802481182, "language_loss": 0.86515439, "learning_rate": 7.137122068005919e-07, "loss": 0.88030833, "num_input_tokens_seen": 262325660, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.2454834, "step": 12153, "time_per_iteration": 2.6952896118164062 }, { "auxiliary_loss_clip": 0.01278821, "auxiliary_loss_mlp": 0.00255096, "balance_loss_clip": 1.0590694, "balance_loss_mlp": 0.22909603, "epoch": 0.7307380129264993, "flos": 16690669839360.0, "grad_norm": 13.605643333848507, "language_loss": 0.75762582, "learning_rate": 7.134140027222173e-07, "loss": 0.77296501, "num_input_tokens_seen": 262344075, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.26000977, "step": 12154, "time_per_iteration": 2.6677534580230713 }, { "auxiliary_loss_clip": 0.01288358, "auxiliary_loss_mlp": 0.00227959, "balance_loss_clip": 1.06170559, "balance_loss_mlp": 0.20209068, "epoch": 0.7307981361791673, "flos": 21725740656000.0, "grad_norm": 134.8578983523739, "language_loss": 0.73469603, "learning_rate": 7.131158474313128e-07, "loss": 0.74985915, "num_input_tokens_seen": 262363305, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.25891113, "step": 12155, "time_per_iteration": 2.672449827194214 }, { "auxiliary_loss_clip": 0.01238045, "auxiliary_loss_mlp": 0.00213469, "balance_loss_clip": 1.03113317, "balance_loss_mlp": 0.1896987, "epoch": 0.7308582594318352, "flos": 18040659621120.0, "grad_norm": 70.83322231286152, "language_loss": 0.90266061, "learning_rate": 7.128177409391851e-07, "loss": 0.91717577, "num_input_tokens_seen": 262380730, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.23791504, "step": 12156, "time_per_iteration": 2.653552293777466 }, { "auxiliary_loss_clip": 0.01267039, "auxiliary_loss_mlp": 0.00244476, "balance_loss_clip": 1.05033934, "balance_loss_mlp": 0.22041979, "epoch": 0.7309183826845033, "flos": 13844964798720.0, "grad_norm": 8.408825542314908, "language_loss": 0.83606195, "learning_rate": 7.125196832571367e-07, "loss": 0.8511771, "num_input_tokens_seen": 262395480, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24072266, "step": 12157, "time_per_iteration": 2.5982203483581543 }, { "auxiliary_loss_clip": 0.01257122, "auxiliary_loss_mlp": 0.00230656, "balance_loss_clip": 1.042907, "balance_loss_mlp": 0.20806572, "epoch": 0.7309785059371712, "flos": 17019216564480.0, "grad_norm": 3.963865641527486, "language_loss": 0.80986285, "learning_rate": 7.122216743964713e-07, "loss": 0.82474053, "num_input_tokens_seen": 262413340, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.22607422, "step": 12158, "time_per_iteration": 2.772718906402588 }, { "auxiliary_loss_clip": 0.01271984, "auxiliary_loss_mlp": 0.00254722, "balance_loss_clip": 1.05377889, "balance_loss_mlp": 0.22770953, "epoch": 0.7310386291898392, "flos": 26502398052480.0, "grad_norm": 53.408192145423634, "language_loss": 0.92519587, "learning_rate": 7.119237143684896e-07, "loss": 0.94046295, "num_input_tokens_seen": 262433455, "router_z_loss_clip": 2.18066406, "router_z_loss_mlp": 0.27026367, "step": 12159, "time_per_iteration": 2.7398695945739746 }, { "auxiliary_loss_clip": 0.01283192, "auxiliary_loss_mlp": 0.00227394, "balance_loss_clip": 1.05536687, "balance_loss_mlp": 0.20060745, "epoch": 0.7310987524425071, "flos": 16945922862720.0, "grad_norm": 97.29042053401565, "language_loss": 0.83942783, "learning_rate": 7.116258031844895e-07, "loss": 0.85453367, "num_input_tokens_seen": 262450335, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.26782227, "step": 12160, "time_per_iteration": 2.6845180988311768 }, { "auxiliary_loss_clip": 0.01301116, "auxiliary_loss_mlp": 0.00233843, "balance_loss_clip": 1.06780934, "balance_loss_mlp": 0.205888, "epoch": 0.7311588756951751, "flos": 13845288021120.0, "grad_norm": 16.752251947680907, "language_loss": 0.85269803, "learning_rate": 7.113279408557675e-07, "loss": 0.8680476, "num_input_tokens_seen": 262468240, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.27954102, "step": 12161, "time_per_iteration": 2.716829299926758 }, { "auxiliary_loss_clip": 0.01317814, "auxiliary_loss_mlp": 0.0022959, "balance_loss_clip": 1.08046627, "balance_loss_mlp": 0.20151666, "epoch": 0.731218998947843, "flos": 28767894704640.0, "grad_norm": 8.323234721399743, "language_loss": 0.79385924, "learning_rate": 7.110301273936192e-07, "loss": 0.80933326, "num_input_tokens_seen": 262487045, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.28076172, "step": 12162, "time_per_iteration": 2.741257667541504 }, { "auxiliary_loss_clip": 0.01294616, "auxiliary_loss_mlp": 0.00228259, "balance_loss_clip": 1.06719136, "balance_loss_mlp": 0.20110363, "epoch": 0.7312791222005111, "flos": 27088783580160.0, "grad_norm": 2.4667831219465746, "language_loss": 0.75072217, "learning_rate": 7.107323628093382e-07, "loss": 0.76595092, "num_input_tokens_seen": 262504855, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.2713623, "step": 12163, "time_per_iteration": 2.805196523666382 }, { "auxiliary_loss_clip": 0.01272227, "auxiliary_loss_mlp": 0.00254396, "balance_loss_clip": 1.0485363, "balance_loss_mlp": 0.22683522, "epoch": 0.731339245453179, "flos": 20924035050240.0, "grad_norm": 4.775194471855563, "language_loss": 0.76254904, "learning_rate": 7.104346471142153e-07, "loss": 0.77781528, "num_input_tokens_seen": 262524920, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.2755127, "step": 12164, "time_per_iteration": 2.801875114440918 }, { "auxiliary_loss_clip": 0.01258309, "auxiliary_loss_mlp": 0.00217241, "balance_loss_clip": 1.04501724, "balance_loss_mlp": 0.19233775, "epoch": 0.731399368705847, "flos": 23075694524160.0, "grad_norm": 11.3491048727447, "language_loss": 0.83028877, "learning_rate": 7.101369803195391e-07, "loss": 0.84504426, "num_input_tokens_seen": 262545725, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24914551, "step": 12165, "time_per_iteration": 2.913952589035034 }, { "auxiliary_loss_clip": 0.0127009, "auxiliary_loss_mlp": 0.0022034, "balance_loss_clip": 1.05070865, "balance_loss_mlp": 0.19508001, "epoch": 0.731459491958515, "flos": 23582681038080.0, "grad_norm": 4.637734226455805, "language_loss": 0.84056485, "learning_rate": 7.098393624365988e-07, "loss": 0.85546911, "num_input_tokens_seen": 262565480, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25280762, "step": 12166, "time_per_iteration": 2.6711525917053223 }, { "auxiliary_loss_clip": 0.01263408, "auxiliary_loss_mlp": 0.00213778, "balance_loss_clip": 1.04532099, "balance_loss_mlp": 0.18994758, "epoch": 0.7315196152111829, "flos": 22379278659840.0, "grad_norm": 10.142232686521435, "language_loss": 0.85356009, "learning_rate": 7.095417934766781e-07, "loss": 0.86833197, "num_input_tokens_seen": 262584145, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.23840332, "step": 12167, "time_per_iteration": 2.654996633529663 }, { "auxiliary_loss_clip": 0.01258735, "auxiliary_loss_mlp": 0.00216753, "balance_loss_clip": 1.04230118, "balance_loss_mlp": 0.19231471, "epoch": 0.7315797384638509, "flos": 26177047637760.0, "grad_norm": 323.4310408174081, "language_loss": 0.83065581, "learning_rate": 7.092442734510622e-07, "loss": 0.8454107, "num_input_tokens_seen": 262604045, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24438477, "step": 12168, "time_per_iteration": 2.6749041080474854 }, { "auxiliary_loss_clip": 0.01286397, "auxiliary_loss_mlp": 0.00246034, "balance_loss_clip": 1.0573802, "balance_loss_mlp": 0.21935451, "epoch": 0.7316398617165188, "flos": 21506326427520.0, "grad_norm": 2.2619963381915604, "language_loss": 0.885867, "learning_rate": 7.089468023710326e-07, "loss": 0.90119135, "num_input_tokens_seen": 262624540, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.26696777, "step": 12169, "time_per_iteration": 2.7780306339263916 }, { "auxiliary_loss_clip": 0.01280988, "auxiliary_loss_mlp": 0.00218518, "balance_loss_clip": 1.05864859, "balance_loss_mlp": 0.19388901, "epoch": 0.7316999849691869, "flos": 30482557315200.0, "grad_norm": 2.1851908474256687, "language_loss": 0.79907095, "learning_rate": 7.08649380247871e-07, "loss": 0.81406605, "num_input_tokens_seen": 262644545, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.24633789, "step": 12170, "time_per_iteration": 2.7666494846343994 }, { "auxiliary_loss_clip": 0.01261713, "auxiliary_loss_mlp": 0.00212773, "balance_loss_clip": 1.04254615, "balance_loss_mlp": 0.18741766, "epoch": 0.7317601082218548, "flos": 21543781334400.0, "grad_norm": 469.80286867599347, "language_loss": 0.78160077, "learning_rate": 7.083520070928533e-07, "loss": 0.79634571, "num_input_tokens_seen": 262662570, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25341797, "step": 12171, "time_per_iteration": 2.644022226333618 }, { "auxiliary_loss_clip": 0.01279001, "auxiliary_loss_mlp": 0.00256153, "balance_loss_clip": 1.05425644, "balance_loss_mlp": 0.22878218, "epoch": 0.7318202314745228, "flos": 33251592775680.0, "grad_norm": 7.887623573219813, "language_loss": 0.73686242, "learning_rate": 7.080546829172564e-07, "loss": 0.75221395, "num_input_tokens_seen": 262683245, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.27380371, "step": 12172, "time_per_iteration": 2.714815616607666 }, { "auxiliary_loss_clip": 0.01304492, "auxiliary_loss_mlp": 0.00246365, "balance_loss_clip": 1.06990004, "balance_loss_mlp": 0.21869631, "epoch": 0.7318803547271907, "flos": 20157054917760.0, "grad_norm": 71.33397749093749, "language_loss": 0.72884965, "learning_rate": 7.077574077323564e-07, "loss": 0.74435818, "num_input_tokens_seen": 262701585, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.27685547, "step": 12173, "time_per_iteration": 4.131450176239014 }, { "auxiliary_loss_clip": 0.01258729, "auxiliary_loss_mlp": 0.00216844, "balance_loss_clip": 1.04371715, "balance_loss_mlp": 0.19409913, "epoch": 0.7319404779798587, "flos": 20558536208640.0, "grad_norm": 5.603741076141377, "language_loss": 0.83231294, "learning_rate": 7.074601815494243e-07, "loss": 0.84706867, "num_input_tokens_seen": 262719295, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.22741699, "step": 12174, "time_per_iteration": 2.7303740978240967 }, { "auxiliary_loss_clip": 0.01267554, "auxiliary_loss_mlp": 0.00226459, "balance_loss_clip": 1.05161023, "balance_loss_mlp": 0.20212884, "epoch": 0.7320006012325266, "flos": 28695391102080.0, "grad_norm": 780.7006529920528, "language_loss": 0.8703745, "learning_rate": 7.071630043797317e-07, "loss": 0.88531458, "num_input_tokens_seen": 262739995, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.24316406, "step": 12175, "time_per_iteration": 4.141963005065918 }, { "auxiliary_loss_clip": 0.01254214, "auxiliary_loss_mlp": 0.00222725, "balance_loss_clip": 1.03667653, "balance_loss_mlp": 0.19703519, "epoch": 0.7320607244851947, "flos": 16362697731840.0, "grad_norm": 8.10099619468523, "language_loss": 0.85550559, "learning_rate": 7.068658762345488e-07, "loss": 0.87027502, "num_input_tokens_seen": 262757680, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25708008, "step": 12176, "time_per_iteration": 2.6761393547058105 }, { "auxiliary_loss_clip": 0.01284451, "auxiliary_loss_mlp": 0.00212368, "balance_loss_clip": 1.05937147, "balance_loss_mlp": 0.18764408, "epoch": 0.7321208477378626, "flos": 20955097336320.0, "grad_norm": 14.24967872280871, "language_loss": 0.83636749, "learning_rate": 7.065687971251399e-07, "loss": 0.85133564, "num_input_tokens_seen": 262776990, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.24719238, "step": 12177, "time_per_iteration": 2.6815991401672363 }, { "auxiliary_loss_clip": 0.01240888, "auxiliary_loss_mlp": 0.00214544, "balance_loss_clip": 1.02842069, "balance_loss_mlp": 0.19110793, "epoch": 0.7321809709905306, "flos": 13845072539520.0, "grad_norm": 11.042537130963975, "language_loss": 0.83400738, "learning_rate": 7.06271767062772e-07, "loss": 0.84856176, "num_input_tokens_seen": 262795440, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.23461914, "step": 12178, "time_per_iteration": 4.016367435455322 }, { "auxiliary_loss_clip": 0.01266145, "auxiliary_loss_mlp": 0.00240069, "balance_loss_clip": 1.04301989, "balance_loss_mlp": 0.21409304, "epoch": 0.7322410942431986, "flos": 26979938392320.0, "grad_norm": 67.55388268571134, "language_loss": 0.91153467, "learning_rate": 7.059747860587084e-07, "loss": 0.92659682, "num_input_tokens_seen": 262816385, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26000977, "step": 12179, "time_per_iteration": 2.8037075996398926 }, { "auxiliary_loss_clip": 0.01264007, "auxiliary_loss_mlp": 0.00218374, "balance_loss_clip": 1.04673243, "balance_loss_mlp": 0.19422197, "epoch": 0.7323012174958665, "flos": 17639717034240.0, "grad_norm": 103.44882232937525, "language_loss": 0.82304072, "learning_rate": 7.056778541242115e-07, "loss": 0.83786452, "num_input_tokens_seen": 262834955, "router_z_loss_clip": 2.17675781, "router_z_loss_mlp": 0.24169922, "step": 12180, "time_per_iteration": 2.682004690170288 }, { "auxiliary_loss_clip": 0.01285535, "auxiliary_loss_mlp": 0.00255643, "balance_loss_clip": 1.05356669, "balance_loss_mlp": 0.22702043, "epoch": 0.7323613407485345, "flos": 32342765834880.0, "grad_norm": 5.939929901887024, "language_loss": 0.88825202, "learning_rate": 7.053809712705396e-07, "loss": 0.90366375, "num_input_tokens_seen": 262853555, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.28601074, "step": 12181, "time_per_iteration": 2.7430319786071777 }, { "auxiliary_loss_clip": 0.01283357, "auxiliary_loss_mlp": 0.00239218, "balance_loss_clip": 1.06197321, "balance_loss_mlp": 0.21308698, "epoch": 0.7324214640012024, "flos": 18362777811840.0, "grad_norm": 42.91402216894036, "language_loss": 0.8140195, "learning_rate": 7.050841375089506e-07, "loss": 0.82924521, "num_input_tokens_seen": 262870975, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26123047, "step": 12182, "time_per_iteration": 2.743802785873413 }, { "auxiliary_loss_clip": 0.01281285, "auxiliary_loss_mlp": 0.00237065, "balance_loss_clip": 1.05710793, "balance_loss_mlp": 0.21114917, "epoch": 0.7324815872538705, "flos": 30812289189120.0, "grad_norm": 614.1968038986224, "language_loss": 0.78858173, "learning_rate": 7.047873528507015e-07, "loss": 0.8037653, "num_input_tokens_seen": 262892635, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.25952148, "step": 12183, "time_per_iteration": 4.1820068359375 }, { "auxiliary_loss_clip": 0.01277743, "auxiliary_loss_mlp": 0.00234202, "balance_loss_clip": 1.04870903, "balance_loss_mlp": 0.20814292, "epoch": 0.7325417105065384, "flos": 21505069451520.0, "grad_norm": 67.96327613811766, "language_loss": 0.81205863, "learning_rate": 7.04490617307045e-07, "loss": 0.82717812, "num_input_tokens_seen": 262910725, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.26074219, "step": 12184, "time_per_iteration": 2.6547834873199463 }, { "auxiliary_loss_clip": 0.01106719, "auxiliary_loss_mlp": 0.00065223, "balance_loss_clip": 0.95135605, "balance_loss_mlp": 0.05892899, "epoch": 0.7326018337592064, "flos": 67257742556160.0, "grad_norm": 0.7509799004240644, "language_loss": 0.6467225, "learning_rate": 7.041939308892344e-07, "loss": 0.65844196, "num_input_tokens_seen": 262974150, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.06298828, "step": 12185, "time_per_iteration": 3.1555488109588623 }, { "auxiliary_loss_clip": 0.01264927, "auxiliary_loss_mlp": 0.00224355, "balance_loss_clip": 1.04145074, "balance_loss_mlp": 0.19717555, "epoch": 0.7326619570118743, "flos": 22857070394880.0, "grad_norm": 196.23057870594036, "language_loss": 0.91929048, "learning_rate": 7.038972936085197e-07, "loss": 0.9341833, "num_input_tokens_seen": 262993370, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.27185059, "step": 12186, "time_per_iteration": 2.6969404220581055 }, { "auxiliary_loss_clip": 0.01273289, "auxiliary_loss_mlp": 0.0025166, "balance_loss_clip": 1.04388297, "balance_loss_mlp": 0.2242177, "epoch": 0.7327220802645423, "flos": 23327499841920.0, "grad_norm": 17.847073571981564, "language_loss": 0.83119917, "learning_rate": 7.036007054761508e-07, "loss": 0.8464486, "num_input_tokens_seen": 263012665, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.2746582, "step": 12187, "time_per_iteration": 2.83419132232666 }, { "auxiliary_loss_clip": 0.01279156, "auxiliary_loss_mlp": 0.00205811, "balance_loss_clip": 1.05269766, "balance_loss_mlp": 0.18144508, "epoch": 0.7327822035172102, "flos": 23180661043200.0, "grad_norm": 281.889061545406, "language_loss": 0.9766494, "learning_rate": 7.033041665033716e-07, "loss": 0.99149907, "num_input_tokens_seen": 263031475, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.24377441, "step": 12188, "time_per_iteration": 2.7800092697143555 }, { "auxiliary_loss_clip": 0.01255528, "auxiliary_loss_mlp": 0.00241614, "balance_loss_clip": 1.03438497, "balance_loss_mlp": 0.21597221, "epoch": 0.7328423267698783, "flos": 21066600130560.0, "grad_norm": 16.85740685926344, "language_loss": 0.8419463, "learning_rate": 7.030076767014284e-07, "loss": 0.85691774, "num_input_tokens_seen": 263051445, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.2565918, "step": 12189, "time_per_iteration": 2.729074478149414 }, { "auxiliary_loss_clip": 0.01261451, "auxiliary_loss_mlp": 0.00220687, "balance_loss_clip": 1.03563344, "balance_loss_mlp": 0.19701242, "epoch": 0.7329024500225462, "flos": 21689578638720.0, "grad_norm": 6.726875137754761, "language_loss": 0.90031266, "learning_rate": 7.027112360815648e-07, "loss": 0.91513407, "num_input_tokens_seen": 263070835, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.23657227, "step": 12190, "time_per_iteration": 2.799989938735962 }, { "auxiliary_loss_clip": 0.01267386, "auxiliary_loss_mlp": 0.00246187, "balance_loss_clip": 1.04563653, "balance_loss_mlp": 0.22062883, "epoch": 0.7329625732752142, "flos": 24164038661760.0, "grad_norm": 69.61236772254864, "language_loss": 0.79888946, "learning_rate": 7.024148446550204e-07, "loss": 0.81402522, "num_input_tokens_seen": 263090070, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.2557373, "step": 12191, "time_per_iteration": 2.6681509017944336 }, { "auxiliary_loss_clip": 0.01247066, "auxiliary_loss_mlp": 0.00221881, "balance_loss_clip": 1.03003633, "balance_loss_mlp": 0.1978122, "epoch": 0.7330226965278822, "flos": 30077915627520.0, "grad_norm": 113.43333543725988, "language_loss": 0.77050745, "learning_rate": 7.021185024330361e-07, "loss": 0.7851969, "num_input_tokens_seen": 263110030, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24072266, "step": 12192, "time_per_iteration": 2.7498695850372314 }, { "auxiliary_loss_clip": 0.01259162, "auxiliary_loss_mlp": 0.00225464, "balance_loss_clip": 1.03853345, "balance_loss_mlp": 0.20217048, "epoch": 0.7330828197805501, "flos": 23368294713600.0, "grad_norm": 12.84282469944012, "language_loss": 0.8311457, "learning_rate": 7.01822209426848e-07, "loss": 0.84599197, "num_input_tokens_seen": 263129735, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.23278809, "step": 12193, "time_per_iteration": 2.6332759857177734 }, { "auxiliary_loss_clip": 0.01256932, "auxiliary_loss_mlp": 0.00219056, "balance_loss_clip": 1.03406835, "balance_loss_mlp": 0.19319907, "epoch": 0.7331429430332181, "flos": 21032808410880.0, "grad_norm": 9.671674478962926, "language_loss": 0.86712027, "learning_rate": 7.015259656476911e-07, "loss": 0.8818801, "num_input_tokens_seen": 263149100, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25866699, "step": 12194, "time_per_iteration": 2.656219482421875 }, { "auxiliary_loss_clip": 0.01262493, "auxiliary_loss_mlp": 0.00206023, "balance_loss_clip": 1.04265618, "balance_loss_mlp": 0.18091738, "epoch": 0.733203066285886, "flos": 14647891466880.0, "grad_norm": 333.6559140077547, "language_loss": 0.78979433, "learning_rate": 7.012297711067998e-07, "loss": 0.80447954, "num_input_tokens_seen": 263166620, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25097656, "step": 12195, "time_per_iteration": 2.6064982414245605 }, { "auxiliary_loss_clip": 0.01234148, "auxiliary_loss_mlp": 0.00213088, "balance_loss_clip": 1.01904356, "balance_loss_mlp": 0.18977109, "epoch": 0.7332631895385541, "flos": 17165301177600.0, "grad_norm": 11.733237573276215, "language_loss": 0.8046937, "learning_rate": 7.009336258154057e-07, "loss": 0.81916606, "num_input_tokens_seen": 263184780, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.23291016, "step": 12196, "time_per_iteration": 2.643955945968628 }, { "auxiliary_loss_clip": 0.01250739, "auxiliary_loss_mlp": 0.00233229, "balance_loss_clip": 1.03438807, "balance_loss_mlp": 0.20957758, "epoch": 0.733323312791222, "flos": 28658151676800.0, "grad_norm": 7.126596125794442, "language_loss": 0.80327433, "learning_rate": 7.006375297847394e-07, "loss": 0.81811404, "num_input_tokens_seen": 263204625, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.23620605, "step": 12197, "time_per_iteration": 2.6887831687927246 }, { "auxiliary_loss_clip": 0.01269759, "auxiliary_loss_mlp": 0.00241738, "balance_loss_clip": 1.04090214, "balance_loss_mlp": 0.21403354, "epoch": 0.73338343604389, "flos": 16618417632000.0, "grad_norm": 6.411773261507132, "language_loss": 0.86613178, "learning_rate": 7.003414830260282e-07, "loss": 0.88124669, "num_input_tokens_seen": 263221565, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.27746582, "step": 12198, "time_per_iteration": 2.6116392612457275 }, { "auxiliary_loss_clip": 0.01237183, "auxiliary_loss_mlp": 0.0020892, "balance_loss_clip": 1.02175963, "balance_loss_mlp": 0.18599597, "epoch": 0.7334435592965579, "flos": 21142084561920.0, "grad_norm": 239251.0291879866, "language_loss": 0.83455348, "learning_rate": 7.000454855504974e-07, "loss": 0.84901452, "num_input_tokens_seen": 263240620, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.22937012, "step": 12199, "time_per_iteration": 2.628767490386963 }, { "auxiliary_loss_clip": 0.01254743, "auxiliary_loss_mlp": 0.00203604, "balance_loss_clip": 1.0278126, "balance_loss_mlp": 0.1761265, "epoch": 0.7335036825492259, "flos": 17125332318720.0, "grad_norm": 6.958209072996161, "language_loss": 0.89729589, "learning_rate": 6.997495373693729e-07, "loss": 0.91187936, "num_input_tokens_seen": 263254365, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.27490234, "step": 12200, "time_per_iteration": 2.6664860248565674 }, { "auxiliary_loss_clip": 0.01245045, "auxiliary_loss_mlp": 0.00233915, "balance_loss_clip": 1.02357662, "balance_loss_mlp": 0.20922714, "epoch": 0.7335638058018938, "flos": 23731818307200.0, "grad_norm": 8.00007684665419, "language_loss": 0.71158361, "learning_rate": 6.994536384938754e-07, "loss": 0.7263732, "num_input_tokens_seen": 263275880, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.24682617, "step": 12201, "time_per_iteration": 2.6951498985290527 }, { "auxiliary_loss_clip": 0.01251487, "auxiliary_loss_mlp": 0.00218167, "balance_loss_clip": 1.03392673, "balance_loss_mlp": 0.19563606, "epoch": 0.7336239290545619, "flos": 34933289679360.0, "grad_norm": 8.373727642710868, "language_loss": 0.60221046, "learning_rate": 6.991577889352264e-07, "loss": 0.616907, "num_input_tokens_seen": 263298315, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.22521973, "step": 12202, "time_per_iteration": 2.8626182079315186 }, { "auxiliary_loss_clip": 0.01257428, "auxiliary_loss_mlp": 0.00221719, "balance_loss_clip": 1.03579271, "balance_loss_mlp": 0.19775796, "epoch": 0.7336840523072298, "flos": 21103049456640.0, "grad_norm": 70.88155269797817, "language_loss": 0.77400792, "learning_rate": 6.98861988704645e-07, "loss": 0.78879941, "num_input_tokens_seen": 263318615, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.23962402, "step": 12203, "time_per_iteration": 2.6359329223632812 }, { "auxiliary_loss_clip": 0.01279326, "auxiliary_loss_mlp": 0.00222506, "balance_loss_clip": 1.05555391, "balance_loss_mlp": 0.19766238, "epoch": 0.7337441755598978, "flos": 24024418496640.0, "grad_norm": 1367.2553819419488, "language_loss": 0.75781077, "learning_rate": 6.985662378133474e-07, "loss": 0.77282912, "num_input_tokens_seen": 263336705, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.24829102, "step": 12204, "time_per_iteration": 2.690642833709717 }, { "auxiliary_loss_clip": 0.01236832, "auxiliary_loss_mlp": 0.00197067, "balance_loss_clip": 1.0209862, "balance_loss_mlp": 0.17452441, "epoch": 0.7338042988125658, "flos": 22711309004160.0, "grad_norm": 11.726102968434004, "language_loss": 0.85817206, "learning_rate": 6.982705362725479e-07, "loss": 0.87251103, "num_input_tokens_seen": 263355065, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.22546387, "step": 12205, "time_per_iteration": 2.687021017074585 }, { "auxiliary_loss_clip": 0.0126336, "auxiliary_loss_mlp": 0.00205971, "balance_loss_clip": 1.04496169, "balance_loss_mlp": 0.1823799, "epoch": 0.7338644220652337, "flos": 21360996000000.0, "grad_norm": 3.1268134892893085, "language_loss": 0.87110847, "learning_rate": 6.979748840934601e-07, "loss": 0.88580179, "num_input_tokens_seen": 263374460, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.23596191, "step": 12206, "time_per_iteration": 2.649634838104248 }, { "auxiliary_loss_clip": 0.01246771, "auxiliary_loss_mlp": 0.00208928, "balance_loss_clip": 1.02573454, "balance_loss_mlp": 0.18463324, "epoch": 0.7339245453179017, "flos": 30920236536960.0, "grad_norm": 6.541478045751026, "language_loss": 0.80366164, "learning_rate": 6.976792812872958e-07, "loss": 0.81821859, "num_input_tokens_seen": 263393610, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24316406, "step": 12207, "time_per_iteration": 2.7695722579956055 }, { "auxiliary_loss_clip": 0.01102047, "auxiliary_loss_mlp": 0.00068776, "balance_loss_clip": 0.94867128, "balance_loss_mlp": 0.06252918, "epoch": 0.7339846685705697, "flos": 67899429072000.0, "grad_norm": 0.7581809320744195, "language_loss": 0.54125774, "learning_rate": 6.97383727865263e-07, "loss": 0.552966, "num_input_tokens_seen": 263450340, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.0625, "step": 12208, "time_per_iteration": 3.2372446060180664 }, { "auxiliary_loss_clip": 0.01253109, "auxiliary_loss_mlp": 0.00202862, "balance_loss_clip": 1.03301466, "balance_loss_mlp": 0.17736267, "epoch": 0.7340447918232377, "flos": 22236749493120.0, "grad_norm": 3.814983797700515, "language_loss": 0.86065829, "learning_rate": 6.970882238385703e-07, "loss": 0.87521797, "num_input_tokens_seen": 263471735, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25512695, "step": 12209, "time_per_iteration": 2.7253241539001465 }, { "auxiliary_loss_clip": 0.01251215, "auxiliary_loss_mlp": 0.00235788, "balance_loss_clip": 1.03127217, "balance_loss_mlp": 0.21278061, "epoch": 0.7341049150759056, "flos": 23764784014080.0, "grad_norm": 256.62531283024583, "language_loss": 0.85295904, "learning_rate": 6.96792769218423e-07, "loss": 0.86782902, "num_input_tokens_seen": 263493245, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.23010254, "step": 12210, "time_per_iteration": 2.695286273956299 }, { "auxiliary_loss_clip": 0.01256626, "auxiliary_loss_mlp": 0.0022581, "balance_loss_clip": 1.03647196, "balance_loss_mlp": 0.20025128, "epoch": 0.7341650383285736, "flos": 17236547804160.0, "grad_norm": 16.09707281974319, "language_loss": 0.85869777, "learning_rate": 6.964973640160236e-07, "loss": 0.87352216, "num_input_tokens_seen": 263511660, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25549316, "step": 12211, "time_per_iteration": 2.8366668224334717 }, { "auxiliary_loss_clip": 0.01249513, "auxiliary_loss_mlp": 0.00229781, "balance_loss_clip": 1.03117752, "balance_loss_mlp": 0.20597512, "epoch": 0.7342251615812415, "flos": 23403953940480.0, "grad_norm": 31.586438709728164, "language_loss": 0.80335844, "learning_rate": 6.962020082425748e-07, "loss": 0.81815135, "num_input_tokens_seen": 263530875, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.23803711, "step": 12212, "time_per_iteration": 2.703235387802124 }, { "auxiliary_loss_clip": 0.01259261, "auxiliary_loss_mlp": 0.00219448, "balance_loss_clip": 1.03468311, "balance_loss_mlp": 0.19335268, "epoch": 0.7342852848339095, "flos": 22747183712640.0, "grad_norm": 12.473432506520094, "language_loss": 0.7771523, "learning_rate": 6.959067019092766e-07, "loss": 0.79193938, "num_input_tokens_seen": 263551585, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26074219, "step": 12213, "time_per_iteration": 2.7583351135253906 }, { "auxiliary_loss_clip": 0.01127181, "auxiliary_loss_mlp": 0.0005275, "balance_loss_clip": 0.9695009, "balance_loss_mlp": 0.04683714, "epoch": 0.7343454080865774, "flos": 53942353925760.0, "grad_norm": 0.6990467631280239, "language_loss": 0.53397632, "learning_rate": 6.956114450273276e-07, "loss": 0.54577565, "num_input_tokens_seen": 263609545, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.05908203, "step": 12214, "time_per_iteration": 3.0913808345794678 }, { "auxiliary_loss_clip": 0.01278366, "auxiliary_loss_mlp": 0.00229165, "balance_loss_clip": 1.04895651, "balance_loss_mlp": 0.20267701, "epoch": 0.7344055313392455, "flos": 12166859255040.0, "grad_norm": 8.957653096051944, "language_loss": 0.81924713, "learning_rate": 6.953162376079233e-07, "loss": 0.83432245, "num_input_tokens_seen": 263627880, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.26489258, "step": 12215, "time_per_iteration": 4.0502448081970215 }, { "auxiliary_loss_clip": 0.01252465, "auxiliary_loss_mlp": 0.00233662, "balance_loss_clip": 1.03781128, "balance_loss_mlp": 0.20960578, "epoch": 0.7344656545919134, "flos": 18550052346240.0, "grad_norm": 6.307657519449156, "language_loss": 0.79802155, "learning_rate": 6.950210796622573e-07, "loss": 0.81288284, "num_input_tokens_seen": 263645665, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24060059, "step": 12216, "time_per_iteration": 2.6409900188446045 }, { "auxiliary_loss_clip": 0.0130708, "auxiliary_loss_mlp": 0.00235184, "balance_loss_clip": 1.06701159, "balance_loss_mlp": 0.20746818, "epoch": 0.7345257778445814, "flos": 23661649088640.0, "grad_norm": 14.056888863624494, "language_loss": 0.85474944, "learning_rate": 6.947259712015236e-07, "loss": 0.87017202, "num_input_tokens_seen": 263668170, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.27722168, "step": 12217, "time_per_iteration": 4.1446638107299805 }, { "auxiliary_loss_clip": 0.01243839, "auxiliary_loss_mlp": 0.00212813, "balance_loss_clip": 1.02753735, "balance_loss_mlp": 0.18913837, "epoch": 0.7345859010972494, "flos": 13808659127040.0, "grad_norm": 38.96895091069983, "language_loss": 0.85401666, "learning_rate": 6.94430912236911e-07, "loss": 0.8685832, "num_input_tokens_seen": 263684190, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.23718262, "step": 12218, "time_per_iteration": 2.6875252723693848 }, { "auxiliary_loss_clip": 0.0126522, "auxiliary_loss_mlp": 0.00222183, "balance_loss_clip": 1.04405284, "balance_loss_mlp": 0.19757792, "epoch": 0.7346460243499173, "flos": 22272731942400.0, "grad_norm": 4.511590587235688, "language_loss": 0.81287509, "learning_rate": 6.941359027796092e-07, "loss": 0.82774913, "num_input_tokens_seen": 263702095, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.24633789, "step": 12219, "time_per_iteration": 2.672482967376709 }, { "auxiliary_loss_clip": 0.01250081, "auxiliary_loss_mlp": 0.00219944, "balance_loss_clip": 1.03445029, "balance_loss_mlp": 0.19505268, "epoch": 0.7347061476025853, "flos": 23255247634560.0, "grad_norm": 17.65719708544549, "language_loss": 0.82680631, "learning_rate": 6.938409428408061e-07, "loss": 0.8415066, "num_input_tokens_seen": 263721385, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.24902344, "step": 12220, "time_per_iteration": 5.093493700027466 }, { "auxiliary_loss_clip": 0.01263343, "auxiliary_loss_mlp": 0.00256998, "balance_loss_clip": 1.03845549, "balance_loss_mlp": 0.23115307, "epoch": 0.7347662708552533, "flos": 15267565923840.0, "grad_norm": 11.973503502536625, "language_loss": 0.7375415, "learning_rate": 6.93546032431684e-07, "loss": 0.75274485, "num_input_tokens_seen": 263737835, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.25878906, "step": 12221, "time_per_iteration": 2.620224714279175 }, { "auxiliary_loss_clip": 0.01269429, "auxiliary_loss_mlp": 0.00222492, "balance_loss_clip": 1.0473547, "balance_loss_mlp": 0.19816187, "epoch": 0.7348263941079213, "flos": 24859987649280.0, "grad_norm": 3.3630889793842984, "language_loss": 0.77436095, "learning_rate": 6.932511715634273e-07, "loss": 0.78928018, "num_input_tokens_seen": 263756480, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24316406, "step": 12222, "time_per_iteration": 2.7200722694396973 }, { "auxiliary_loss_clip": 0.01244837, "auxiliary_loss_mlp": 0.00210666, "balance_loss_clip": 1.03376508, "balance_loss_mlp": 0.18862469, "epoch": 0.7348865173605892, "flos": 24352103295360.0, "grad_norm": 21.688633760418604, "language_loss": 0.74086571, "learning_rate": 6.92956360247217e-07, "loss": 0.75542068, "num_input_tokens_seen": 263776440, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.22009277, "step": 12223, "time_per_iteration": 2.6943159103393555 }, { "auxiliary_loss_clip": 0.01261103, "auxiliary_loss_mlp": 0.00223432, "balance_loss_clip": 1.04229546, "balance_loss_mlp": 0.1988984, "epoch": 0.7349466406132572, "flos": 20004613597440.0, "grad_norm": 8.444874809512603, "language_loss": 0.83091146, "learning_rate": 6.926615984942332e-07, "loss": 0.84575689, "num_input_tokens_seen": 263793700, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24523926, "step": 12224, "time_per_iteration": 2.645768165588379 }, { "auxiliary_loss_clip": 0.0126054, "auxiliary_loss_mlp": 0.00222041, "balance_loss_clip": 1.04120553, "balance_loss_mlp": 0.19704258, "epoch": 0.7350067638659251, "flos": 29825068815360.0, "grad_norm": 302.36220094809863, "language_loss": 0.81954235, "learning_rate": 6.92366886315652e-07, "loss": 0.83436811, "num_input_tokens_seen": 263814620, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25024414, "step": 12225, "time_per_iteration": 4.283280611038208 }, { "auxiliary_loss_clip": 0.01276311, "auxiliary_loss_mlp": 0.00218037, "balance_loss_clip": 1.04549336, "balance_loss_mlp": 0.1910121, "epoch": 0.7350668871185931, "flos": 21866150920320.0, "grad_norm": 133.67544045158786, "language_loss": 0.84840178, "learning_rate": 6.920722237226501e-07, "loss": 0.86334527, "num_input_tokens_seen": 263832725, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.27026367, "step": 12226, "time_per_iteration": 2.6750435829162598 }, { "auxiliary_loss_clip": 0.01261556, "auxiliary_loss_mlp": 0.00248888, "balance_loss_clip": 1.04276633, "balance_loss_mlp": 0.22219706, "epoch": 0.735127010371261, "flos": 22566122231040.0, "grad_norm": 50.88158691581524, "language_loss": 0.72746432, "learning_rate": 6.917776107264008e-07, "loss": 0.74256873, "num_input_tokens_seen": 263853850, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.2668457, "step": 12227, "time_per_iteration": 2.8338546752929688 }, { "auxiliary_loss_clip": 0.01268912, "auxiliary_loss_mlp": 0.0022113, "balance_loss_clip": 1.04984307, "balance_loss_mlp": 0.19635789, "epoch": 0.7351871336239291, "flos": 25884339707520.0, "grad_norm": 194.11611459073075, "language_loss": 0.71510661, "learning_rate": 6.914830473380749e-07, "loss": 0.73000699, "num_input_tokens_seen": 263874760, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24768066, "step": 12228, "time_per_iteration": 2.7638187408447266 }, { "auxiliary_loss_clip": 0.01262472, "auxiliary_loss_mlp": 0.0022036, "balance_loss_clip": 1.04187822, "balance_loss_mlp": 0.1953263, "epoch": 0.735247256876597, "flos": 17932173569280.0, "grad_norm": 42.22187265005329, "language_loss": 0.71574557, "learning_rate": 6.911885335688427e-07, "loss": 0.73057389, "num_input_tokens_seen": 263893390, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25036621, "step": 12229, "time_per_iteration": 2.665720224380493 }, { "auxiliary_loss_clip": 0.01275328, "auxiliary_loss_mlp": 0.002302, "balance_loss_clip": 1.04597902, "balance_loss_mlp": 0.20355719, "epoch": 0.735307380129265, "flos": 28875159694080.0, "grad_norm": 55.77157764527744, "language_loss": 0.82570344, "learning_rate": 6.908940694298726e-07, "loss": 0.84075874, "num_input_tokens_seen": 263911180, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.26660156, "step": 12230, "time_per_iteration": 2.7665233612060547 }, { "auxiliary_loss_clip": 0.01298793, "auxiliary_loss_mlp": 0.00244973, "balance_loss_clip": 1.06391835, "balance_loss_mlp": 0.21588579, "epoch": 0.7353675033819329, "flos": 13625658311040.0, "grad_norm": 7.822588390996206, "language_loss": 0.85930753, "learning_rate": 6.90599654932332e-07, "loss": 0.87474513, "num_input_tokens_seen": 263928975, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.29064941, "step": 12231, "time_per_iteration": 2.694552421569824 }, { "auxiliary_loss_clip": 0.0128441, "auxiliary_loss_mlp": 0.00246278, "balance_loss_clip": 1.05896544, "balance_loss_mlp": 0.21974234, "epoch": 0.7354276266346009, "flos": 19463081178240.0, "grad_norm": 35.51980100489979, "language_loss": 0.75331759, "learning_rate": 6.903052900873823e-07, "loss": 0.76862442, "num_input_tokens_seen": 263944495, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.26550293, "step": 12232, "time_per_iteration": 2.75712513923645 }, { "auxiliary_loss_clip": 0.01271946, "auxiliary_loss_mlp": 0.00210207, "balance_loss_clip": 1.04557455, "balance_loss_mlp": 0.18549535, "epoch": 0.735487749887269, "flos": 15771858917760.0, "grad_norm": 5.443384656002718, "language_loss": 0.8333686, "learning_rate": 6.900109749061874e-07, "loss": 0.84819013, "num_input_tokens_seen": 263961325, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.24707031, "step": 12233, "time_per_iteration": 2.691032648086548 }, { "auxiliary_loss_clip": 0.0128442, "auxiliary_loss_mlp": 0.00253925, "balance_loss_clip": 1.05261266, "balance_loss_mlp": 0.22709076, "epoch": 0.7355478731399369, "flos": 18260648467200.0, "grad_norm": 7.922902665656344, "language_loss": 0.80829012, "learning_rate": 6.897167093999079e-07, "loss": 0.82367355, "num_input_tokens_seen": 263980445, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.26843262, "step": 12234, "time_per_iteration": 2.741530179977417 }, { "auxiliary_loss_clip": 0.01272458, "auxiliary_loss_mlp": 0.0021435, "balance_loss_clip": 1.05261934, "balance_loss_mlp": 0.18892264, "epoch": 0.7356079963926049, "flos": 26542043688960.0, "grad_norm": 79.38112799000211, "language_loss": 0.71639633, "learning_rate": 6.894224935797017e-07, "loss": 0.73126435, "num_input_tokens_seen": 263999330, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25427246, "step": 12235, "time_per_iteration": 2.729766368865967 }, { "auxiliary_loss_clip": 0.0126282, "auxiliary_loss_mlp": 0.00233053, "balance_loss_clip": 1.04017735, "balance_loss_mlp": 0.2076378, "epoch": 0.7356681196452728, "flos": 10778624467200.0, "grad_norm": 8.520145619201891, "language_loss": 0.94865608, "learning_rate": 6.891283274567259e-07, "loss": 0.9636147, "num_input_tokens_seen": 264014150, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25415039, "step": 12236, "time_per_iteration": 2.6683595180511475 }, { "auxiliary_loss_clip": 0.012584, "auxiliary_loss_mlp": 0.00234542, "balance_loss_clip": 1.03573036, "balance_loss_mlp": 0.20963949, "epoch": 0.7357282428979408, "flos": 19718693337600.0, "grad_norm": 16.37717794990735, "language_loss": 0.76231301, "learning_rate": 6.888342110421364e-07, "loss": 0.77724242, "num_input_tokens_seen": 264033140, "router_z_loss_clip": 2.22753906, "router_z_loss_mlp": 0.2487793, "step": 12237, "time_per_iteration": 2.6573774814605713 }, { "auxiliary_loss_clip": 0.01275953, "auxiliary_loss_mlp": 0.00230275, "balance_loss_clip": 1.04905462, "balance_loss_mlp": 0.20447797, "epoch": 0.7357883661506087, "flos": 19464014931840.0, "grad_norm": 27.347344206863042, "language_loss": 0.79293674, "learning_rate": 6.885401443470839e-07, "loss": 0.80799901, "num_input_tokens_seen": 264052105, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.25793457, "step": 12238, "time_per_iteration": 2.754409074783325 }, { "auxiliary_loss_clip": 0.01289567, "auxiliary_loss_mlp": 0.00241605, "balance_loss_clip": 1.06276691, "balance_loss_mlp": 0.21448524, "epoch": 0.7358484894032767, "flos": 27123006263040.0, "grad_norm": 18.08212878230921, "language_loss": 0.80910665, "learning_rate": 6.882461273827205e-07, "loss": 0.82441843, "num_input_tokens_seen": 264070690, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.27111816, "step": 12239, "time_per_iteration": 2.7685506343841553 }, { "auxiliary_loss_clip": 0.01243972, "auxiliary_loss_mlp": 0.00195166, "balance_loss_clip": 1.02618229, "balance_loss_mlp": 0.17114535, "epoch": 0.7359086126559446, "flos": 24502282058880.0, "grad_norm": 22.413252839123324, "language_loss": 0.84449756, "learning_rate": 6.879521601601954e-07, "loss": 0.85888892, "num_input_tokens_seen": 264094225, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.2401123, "step": 12240, "time_per_iteration": 2.7528834342956543 }, { "auxiliary_loss_clip": 0.01250583, "auxiliary_loss_mlp": 0.00243607, "balance_loss_clip": 1.03741741, "balance_loss_mlp": 0.21798879, "epoch": 0.7359687359086127, "flos": 23331270769920.0, "grad_norm": 4.239121124459072, "language_loss": 0.91398537, "learning_rate": 6.876582426906565e-07, "loss": 0.9289273, "num_input_tokens_seen": 264113190, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.25610352, "step": 12241, "time_per_iteration": 2.7032997608184814 }, { "auxiliary_loss_clip": 0.01256251, "auxiliary_loss_mlp": 0.00217863, "balance_loss_clip": 1.03619242, "balance_loss_mlp": 0.19164938, "epoch": 0.7360288591612806, "flos": 20193396503040.0, "grad_norm": 8.581886236059724, "language_loss": 0.86556304, "learning_rate": 6.873643749852484e-07, "loss": 0.88030416, "num_input_tokens_seen": 264132050, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.26196289, "step": 12242, "time_per_iteration": 2.682818651199341 }, { "auxiliary_loss_clip": 0.01255408, "auxiliary_loss_mlp": 0.00227713, "balance_loss_clip": 1.03914332, "balance_loss_mlp": 0.20134404, "epoch": 0.7360889824139486, "flos": 24972783333120.0, "grad_norm": 65.50764383341328, "language_loss": 0.85587615, "learning_rate": 6.870705570551145e-07, "loss": 0.87070733, "num_input_tokens_seen": 264152800, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.26391602, "step": 12243, "time_per_iteration": 2.7442879676818848 }, { "auxiliary_loss_clip": 0.01264216, "auxiliary_loss_mlp": 0.00209392, "balance_loss_clip": 1.04589295, "balance_loss_mlp": 0.18248695, "epoch": 0.7361491056666165, "flos": 15012312900480.0, "grad_norm": 48.845477013922654, "language_loss": 0.85658562, "learning_rate": 6.867767889113969e-07, "loss": 0.87132168, "num_input_tokens_seen": 264169650, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.26916504, "step": 12244, "time_per_iteration": 2.760718822479248 }, { "auxiliary_loss_clip": 0.01269564, "auxiliary_loss_mlp": 0.00232674, "balance_loss_clip": 1.04654789, "balance_loss_mlp": 0.20759188, "epoch": 0.7362092289192845, "flos": 22930400010240.0, "grad_norm": 19.286526711407813, "language_loss": 0.80209786, "learning_rate": 6.864830705652347e-07, "loss": 0.81712019, "num_input_tokens_seen": 264190530, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25061035, "step": 12245, "time_per_iteration": 2.7162952423095703 }, { "auxiliary_loss_clip": 0.01253265, "auxiliary_loss_mlp": 0.00235442, "balance_loss_clip": 1.04460156, "balance_loss_mlp": 0.21042004, "epoch": 0.7362693521719526, "flos": 20702681487360.0, "grad_norm": 29.47710665905208, "language_loss": 0.81483656, "learning_rate": 6.861894020277658e-07, "loss": 0.82972366, "num_input_tokens_seen": 264210820, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.25012207, "step": 12246, "time_per_iteration": 2.6916232109069824 }, { "auxiliary_loss_clip": 0.01246749, "auxiliary_loss_mlp": 0.00218126, "balance_loss_clip": 1.03444624, "balance_loss_mlp": 0.19229308, "epoch": 0.7363294754246205, "flos": 13111381336320.0, "grad_norm": 66.78115809199367, "language_loss": 0.79229516, "learning_rate": 6.858957833101266e-07, "loss": 0.80694389, "num_input_tokens_seen": 264227430, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.25830078, "step": 12247, "time_per_iteration": 2.6968791484832764 }, { "auxiliary_loss_clip": 0.01265549, "auxiliary_loss_mlp": 0.00196554, "balance_loss_clip": 1.04399419, "balance_loss_mlp": 0.17159148, "epoch": 0.7363895986772885, "flos": 14027426910720.0, "grad_norm": 58.29796735175838, "language_loss": 0.80837429, "learning_rate": 6.856022144234526e-07, "loss": 0.82299531, "num_input_tokens_seen": 264245230, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.24951172, "step": 12248, "time_per_iteration": 2.672248125076294 }, { "auxiliary_loss_clip": 0.01280096, "auxiliary_loss_mlp": 0.00220817, "balance_loss_clip": 1.05487525, "balance_loss_mlp": 0.19548526, "epoch": 0.7364497219299564, "flos": 19719986227200.0, "grad_norm": 12.911212611115042, "language_loss": 0.82963467, "learning_rate": 6.853086953788727e-07, "loss": 0.84464377, "num_input_tokens_seen": 264263945, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.25354004, "step": 12249, "time_per_iteration": 2.694141387939453 }, { "auxiliary_loss_clip": 0.01279544, "auxiliary_loss_mlp": 0.0023667, "balance_loss_clip": 1.05527186, "balance_loss_mlp": 0.2096695, "epoch": 0.7365098451826244, "flos": 21361391049600.0, "grad_norm": 8.137717936342712, "language_loss": 0.86324573, "learning_rate": 6.850152261875189e-07, "loss": 0.87840784, "num_input_tokens_seen": 264281500, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.27001953, "step": 12250, "time_per_iteration": 2.700054407119751 }, { "auxiliary_loss_clip": 0.0126658, "auxiliary_loss_mlp": 0.00217077, "balance_loss_clip": 1.04075527, "balance_loss_mlp": 0.19006424, "epoch": 0.7365699684352923, "flos": 23368222886400.0, "grad_norm": 10.453047546331705, "language_loss": 0.79185766, "learning_rate": 6.8472180686052e-07, "loss": 0.80669427, "num_input_tokens_seen": 264301625, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26989746, "step": 12251, "time_per_iteration": 2.7506723403930664 }, { "auxiliary_loss_clip": 0.01245708, "auxiliary_loss_mlp": 0.00208095, "balance_loss_clip": 1.03491604, "balance_loss_mlp": 0.18322781, "epoch": 0.7366300916879603, "flos": 59524879927680.0, "grad_norm": 6.429797757639255, "language_loss": 0.7217083, "learning_rate": 6.844284374090015e-07, "loss": 0.73624635, "num_input_tokens_seen": 264323975, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.24865723, "step": 12252, "time_per_iteration": 3.099086284637451 }, { "auxiliary_loss_clip": 0.01272283, "auxiliary_loss_mlp": 0.00215835, "balance_loss_clip": 1.05416679, "balance_loss_mlp": 0.19075368, "epoch": 0.7366902149406283, "flos": 20923137210240.0, "grad_norm": 5.581459251337042, "language_loss": 0.85783219, "learning_rate": 6.841351178440884e-07, "loss": 0.87271339, "num_input_tokens_seen": 264343785, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.25097656, "step": 12253, "time_per_iteration": 2.6945266723632812 }, { "auxiliary_loss_clip": 0.01242517, "auxiliary_loss_mlp": 0.0021622, "balance_loss_clip": 1.02738881, "balance_loss_mlp": 0.19159172, "epoch": 0.7367503381932963, "flos": 17348158339200.0, "grad_norm": 49.658501464984745, "language_loss": 0.84993726, "learning_rate": 6.83841848176905e-07, "loss": 0.86452466, "num_input_tokens_seen": 264361130, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24621582, "step": 12254, "time_per_iteration": 2.725686550140381 }, { "auxiliary_loss_clip": 0.0126258, "auxiliary_loss_mlp": 0.00225532, "balance_loss_clip": 1.04435849, "balance_loss_mlp": 0.1987696, "epoch": 0.7368104614459642, "flos": 17821317219840.0, "grad_norm": 103.06309669692513, "language_loss": 0.80921793, "learning_rate": 6.835486284185692e-07, "loss": 0.824099, "num_input_tokens_seen": 264376965, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.26757812, "step": 12255, "time_per_iteration": 2.67702054977417 }, { "auxiliary_loss_clip": 0.01284419, "auxiliary_loss_mlp": 0.00242107, "balance_loss_clip": 1.05895603, "balance_loss_mlp": 0.21622637, "epoch": 0.7368705846986322, "flos": 24606099342720.0, "grad_norm": 3.4084831402422093, "language_loss": 0.83403742, "learning_rate": 6.832554585802012e-07, "loss": 0.84930265, "num_input_tokens_seen": 264396310, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.25878906, "step": 12256, "time_per_iteration": 2.7805869579315186 }, { "auxiliary_loss_clip": 0.01249566, "auxiliary_loss_mlp": 0.00219453, "balance_loss_clip": 1.03456879, "balance_loss_mlp": 0.19482456, "epoch": 0.7369307079513001, "flos": 34970169968640.0, "grad_norm": 39.36458451922757, "language_loss": 0.80820298, "learning_rate": 6.829623386729182e-07, "loss": 0.82289314, "num_input_tokens_seen": 264418085, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.24597168, "step": 12257, "time_per_iteration": 4.218636512756348 }, { "auxiliary_loss_clip": 0.01268385, "auxiliary_loss_mlp": 0.00219858, "balance_loss_clip": 1.05114746, "balance_loss_mlp": 0.19440649, "epoch": 0.7369908312039681, "flos": 21214588164480.0, "grad_norm": 18.97561532502363, "language_loss": 0.8515234, "learning_rate": 6.826692687078362e-07, "loss": 0.86640584, "num_input_tokens_seen": 264437595, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.2545166, "step": 12258, "time_per_iteration": 2.675668239593506 }, { "auxiliary_loss_clip": 0.01255636, "auxiliary_loss_mlp": 0.00215023, "balance_loss_clip": 1.03875804, "balance_loss_mlp": 0.18904749, "epoch": 0.7370509544566362, "flos": 23623655477760.0, "grad_norm": 4.43346054533031, "language_loss": 0.74266171, "learning_rate": 6.823762486960674e-07, "loss": 0.75736833, "num_input_tokens_seen": 264457385, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.25976562, "step": 12259, "time_per_iteration": 4.159494400024414 }, { "auxiliary_loss_clip": 0.01258834, "auxiliary_loss_mlp": 0.00226602, "balance_loss_clip": 1.04239488, "balance_loss_mlp": 0.20260543, "epoch": 0.7371110777093041, "flos": 24827704300800.0, "grad_norm": 18.795991038646026, "language_loss": 0.81597346, "learning_rate": 6.820832786487225e-07, "loss": 0.83082783, "num_input_tokens_seen": 264477205, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.23999023, "step": 12260, "time_per_iteration": 2.8093581199645996 }, { "auxiliary_loss_clip": 0.01271884, "auxiliary_loss_mlp": 0.00228302, "balance_loss_clip": 1.05089712, "balance_loss_mlp": 0.20111074, "epoch": 0.7371712009619721, "flos": 23149491016320.0, "grad_norm": 54.59212498556115, "language_loss": 0.80190647, "learning_rate": 6.817903585769125e-07, "loss": 0.81690836, "num_input_tokens_seen": 264497195, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.27160645, "step": 12261, "time_per_iteration": 2.665038585662842 }, { "auxiliary_loss_clip": 0.01262975, "auxiliary_loss_mlp": 0.00239868, "balance_loss_clip": 1.04378593, "balance_loss_mlp": 0.21432137, "epoch": 0.73723132421464, "flos": 23112898035840.0, "grad_norm": 100.14797100938209, "language_loss": 0.80279839, "learning_rate": 6.814974884917438e-07, "loss": 0.81782681, "num_input_tokens_seen": 264516950, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25512695, "step": 12262, "time_per_iteration": 4.3127405643463135 }, { "auxiliary_loss_clip": 0.01243933, "auxiliary_loss_mlp": 0.00223468, "balance_loss_clip": 1.02967763, "balance_loss_mlp": 0.20029336, "epoch": 0.737291447467308, "flos": 19273328605440.0, "grad_norm": 12.025232204211028, "language_loss": 0.95066363, "learning_rate": 6.81204668404322e-07, "loss": 0.96533763, "num_input_tokens_seen": 264532675, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.23181152, "step": 12263, "time_per_iteration": 2.6801939010620117 }, { "auxiliary_loss_clip": 0.01247732, "auxiliary_loss_mlp": 0.00204938, "balance_loss_clip": 1.03684068, "balance_loss_mlp": 0.18138236, "epoch": 0.7373515707199759, "flos": 25118257415040.0, "grad_norm": 5.028312613136822, "language_loss": 0.7357589, "learning_rate": 6.809118983257522e-07, "loss": 0.75028563, "num_input_tokens_seen": 264555635, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.23583984, "step": 12264, "time_per_iteration": 2.7366175651550293 }, { "auxiliary_loss_clip": 0.01242374, "auxiliary_loss_mlp": 0.00217348, "balance_loss_clip": 1.02962935, "balance_loss_mlp": 0.19367269, "epoch": 0.737411693972644, "flos": 32408481767040.0, "grad_norm": 28.678861410749903, "language_loss": 0.87356931, "learning_rate": 6.806191782671356e-07, "loss": 0.88816655, "num_input_tokens_seen": 264573140, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.23693848, "step": 12265, "time_per_iteration": 2.8255434036254883 }, { "auxiliary_loss_clip": 0.01285659, "auxiliary_loss_mlp": 0.00228911, "balance_loss_clip": 1.05307961, "balance_loss_mlp": 0.20180288, "epoch": 0.7374718172253119, "flos": 24315797623680.0, "grad_norm": 18.420178953401184, "language_loss": 0.80040622, "learning_rate": 6.803265082395711e-07, "loss": 0.81555194, "num_input_tokens_seen": 264591610, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.27111816, "step": 12266, "time_per_iteration": 2.7268521785736084 }, { "auxiliary_loss_clip": 0.01245628, "auxiliary_loss_mlp": 0.0023059, "balance_loss_clip": 1.03062201, "balance_loss_mlp": 0.20611608, "epoch": 0.7375319404779799, "flos": 27156115624320.0, "grad_norm": 6.097709618072031, "language_loss": 0.82546854, "learning_rate": 6.800338882541576e-07, "loss": 0.8402307, "num_input_tokens_seen": 264611170, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.24499512, "step": 12267, "time_per_iteration": 4.131671667098999 }, { "auxiliary_loss_clip": 0.01238468, "auxiliary_loss_mlp": 0.00227645, "balance_loss_clip": 1.02487183, "balance_loss_mlp": 0.20437524, "epoch": 0.7375920637306478, "flos": 18879999701760.0, "grad_norm": 74.53478304120783, "language_loss": 0.9193908, "learning_rate": 6.797413183219923e-07, "loss": 0.93405187, "num_input_tokens_seen": 264629365, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.23254395, "step": 12268, "time_per_iteration": 2.709599018096924 }, { "auxiliary_loss_clip": 0.01263877, "auxiliary_loss_mlp": 0.00237554, "balance_loss_clip": 1.04738247, "balance_loss_mlp": 0.21116114, "epoch": 0.7376521869833158, "flos": 15669765486720.0, "grad_norm": 18.0545236519316, "language_loss": 0.82559305, "learning_rate": 6.794487984541677e-07, "loss": 0.84060735, "num_input_tokens_seen": 264647915, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.26367188, "step": 12269, "time_per_iteration": 2.6595513820648193 }, { "auxiliary_loss_clip": 0.01288952, "auxiliary_loss_mlp": 0.00251844, "balance_loss_clip": 1.06072092, "balance_loss_mlp": 0.22573701, "epoch": 0.7377123102359837, "flos": 36971973901440.0, "grad_norm": 21.540670517611055, "language_loss": 0.77562159, "learning_rate": 6.791563286617776e-07, "loss": 0.79102951, "num_input_tokens_seen": 264669620, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.26123047, "step": 12270, "time_per_iteration": 2.8648931980133057 }, { "auxiliary_loss_clip": 0.01272303, "auxiliary_loss_mlp": 0.00239467, "balance_loss_clip": 1.05074787, "balance_loss_mlp": 0.21603025, "epoch": 0.7377724334886517, "flos": 24496284487680.0, "grad_norm": 7.907991593908506, "language_loss": 0.76789749, "learning_rate": 6.788639089559119e-07, "loss": 0.78301513, "num_input_tokens_seen": 264689345, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.234375, "step": 12271, "time_per_iteration": 2.689436435699463 }, { "auxiliary_loss_clip": 0.01275713, "auxiliary_loss_mlp": 0.00215179, "balance_loss_clip": 1.0491749, "balance_loss_mlp": 0.19115818, "epoch": 0.7378325567413198, "flos": 24390025079040.0, "grad_norm": 9.651710794190384, "language_loss": 0.78309858, "learning_rate": 6.785715393476586e-07, "loss": 0.79800749, "num_input_tokens_seen": 264707625, "router_z_loss_clip": 2.26269531, "router_z_loss_mlp": 0.2401123, "step": 12272, "time_per_iteration": 2.7671549320220947 }, { "auxiliary_loss_clip": 0.01251069, "auxiliary_loss_mlp": 0.00222435, "balance_loss_clip": 1.03780067, "balance_loss_mlp": 0.19935597, "epoch": 0.7378926799939877, "flos": 17416388223360.0, "grad_norm": 34.057397833390596, "language_loss": 0.84933525, "learning_rate": 6.782792198481049e-07, "loss": 0.8640703, "num_input_tokens_seen": 264725575, "router_z_loss_clip": 2.13183594, "router_z_loss_mlp": 0.23071289, "step": 12273, "time_per_iteration": 2.841933012008667 }, { "auxiliary_loss_clip": 0.0124326, "auxiliary_loss_mlp": 0.0022524, "balance_loss_clip": 1.03170455, "balance_loss_mlp": 0.20181563, "epoch": 0.7379528032466557, "flos": 18474208778880.0, "grad_norm": 1698.6095848733887, "language_loss": 0.89481866, "learning_rate": 6.779869504683355e-07, "loss": 0.9095037, "num_input_tokens_seen": 264742855, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.23388672, "step": 12274, "time_per_iteration": 2.704786539077759 }, { "auxiliary_loss_clip": 0.01283354, "auxiliary_loss_mlp": 0.00241358, "balance_loss_clip": 1.05621898, "balance_loss_mlp": 0.2143811, "epoch": 0.7380129264993236, "flos": 17821999578240.0, "grad_norm": 52.27709756389123, "language_loss": 0.85283536, "learning_rate": 6.776947312194341e-07, "loss": 0.86808252, "num_input_tokens_seen": 264761155, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.26977539, "step": 12275, "time_per_iteration": 2.8179750442504883 }, { "auxiliary_loss_clip": 0.01281855, "auxiliary_loss_mlp": 0.00244288, "balance_loss_clip": 1.05314374, "balance_loss_mlp": 0.21851538, "epoch": 0.7380730497519916, "flos": 22997372918400.0, "grad_norm": 8.648323019593299, "language_loss": 0.82045901, "learning_rate": 6.774025621124813e-07, "loss": 0.83572048, "num_input_tokens_seen": 264780660, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.25769043, "step": 12276, "time_per_iteration": 2.9179494380950928 }, { "auxiliary_loss_clip": 0.01271118, "auxiliary_loss_mlp": 0.00250761, "balance_loss_clip": 1.05032635, "balance_loss_mlp": 0.2261202, "epoch": 0.7381331730046595, "flos": 20266259241600.0, "grad_norm": 83.0724155161785, "language_loss": 0.85610789, "learning_rate": 6.771104431585551e-07, "loss": 0.87132668, "num_input_tokens_seen": 264798850, "router_z_loss_clip": 2.20996094, "router_z_loss_mlp": 0.24682617, "step": 12277, "time_per_iteration": 2.7011587619781494 }, { "auxiliary_loss_clip": 0.01254575, "auxiliary_loss_mlp": 0.00221872, "balance_loss_clip": 1.03911924, "balance_loss_mlp": 0.19782773, "epoch": 0.7381932962573275, "flos": 19754532132480.0, "grad_norm": 491.14116470886864, "language_loss": 0.87245536, "learning_rate": 6.768183743687338e-07, "loss": 0.88721979, "num_input_tokens_seen": 264816795, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24060059, "step": 12278, "time_per_iteration": 2.656581163406372 }, { "auxiliary_loss_clip": 0.01268226, "auxiliary_loss_mlp": 0.00235979, "balance_loss_clip": 1.04583597, "balance_loss_mlp": 0.2112309, "epoch": 0.7382534195099955, "flos": 17305316392320.0, "grad_norm": 6.10151170029027, "language_loss": 0.80919015, "learning_rate": 6.765263557540921e-07, "loss": 0.82423222, "num_input_tokens_seen": 264834105, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.24755859, "step": 12279, "time_per_iteration": 2.675281524658203 }, { "auxiliary_loss_clip": 0.01251746, "auxiliary_loss_mlp": 0.00222206, "balance_loss_clip": 1.03133798, "balance_loss_mlp": 0.19717184, "epoch": 0.7383135427626635, "flos": 18697358021760.0, "grad_norm": 99.61362887716027, "language_loss": 0.93835306, "learning_rate": 6.762343873257034e-07, "loss": 0.95309258, "num_input_tokens_seen": 264850895, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25024414, "step": 12280, "time_per_iteration": 2.6653239727020264 }, { "auxiliary_loss_clip": 0.01265731, "auxiliary_loss_mlp": 0.00249372, "balance_loss_clip": 1.04545259, "balance_loss_mlp": 0.2250298, "epoch": 0.7383736660153314, "flos": 20881300844160.0, "grad_norm": 19.500705371729683, "language_loss": 0.80838352, "learning_rate": 6.759424690946408e-07, "loss": 0.82353455, "num_input_tokens_seen": 264869505, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.2434082, "step": 12281, "time_per_iteration": 2.713010311126709 }, { "auxiliary_loss_clip": 0.01272837, "auxiliary_loss_mlp": 0.00235428, "balance_loss_clip": 1.04801512, "balance_loss_mlp": 0.20780769, "epoch": 0.7384337892679994, "flos": 20663215418880.0, "grad_norm": 24.65418401691631, "language_loss": 0.72160614, "learning_rate": 6.756506010719711e-07, "loss": 0.73668879, "num_input_tokens_seen": 264886915, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.27612305, "step": 12282, "time_per_iteration": 2.6481986045837402 }, { "auxiliary_loss_clip": 0.01288764, "auxiliary_loss_mlp": 0.00238928, "balance_loss_clip": 1.05980253, "balance_loss_mlp": 0.21090215, "epoch": 0.7384939125206673, "flos": 29169627390720.0, "grad_norm": 7.163832964918908, "language_loss": 0.76737976, "learning_rate": 6.753587832687632e-07, "loss": 0.78265667, "num_input_tokens_seen": 264910350, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.28039551, "step": 12283, "time_per_iteration": 2.7424838542938232 }, { "auxiliary_loss_clip": 0.01249694, "auxiliary_loss_mlp": 0.00260917, "balance_loss_clip": 1.0344255, "balance_loss_mlp": 0.23699155, "epoch": 0.7385540357733353, "flos": 36312833376000.0, "grad_norm": 10.627468975322424, "language_loss": 0.82942367, "learning_rate": 6.750670156960832e-07, "loss": 0.84452981, "num_input_tokens_seen": 264930705, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23937988, "step": 12284, "time_per_iteration": 2.768044948577881 }, { "auxiliary_loss_clip": 0.01263038, "auxiliary_loss_mlp": 0.00233724, "balance_loss_clip": 1.04163456, "balance_loss_mlp": 0.2076173, "epoch": 0.7386141590260034, "flos": 20302600826880.0, "grad_norm": 144.40660542347334, "language_loss": 0.79421377, "learning_rate": 6.747752983649954e-07, "loss": 0.80918139, "num_input_tokens_seen": 264946975, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26098633, "step": 12285, "time_per_iteration": 2.692070960998535 }, { "auxiliary_loss_clip": 0.01283501, "auxiliary_loss_mlp": 0.00258466, "balance_loss_clip": 1.05635345, "balance_loss_mlp": 0.23338485, "epoch": 0.7386742822786713, "flos": 25483792170240.0, "grad_norm": 1270.9824745499013, "language_loss": 0.86845928, "learning_rate": 6.744836312865602e-07, "loss": 0.88387901, "num_input_tokens_seen": 264967665, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.25085449, "step": 12286, "time_per_iteration": 2.698803663253784 }, { "auxiliary_loss_clip": 0.01266553, "auxiliary_loss_mlp": 0.00231125, "balance_loss_clip": 1.04712057, "balance_loss_mlp": 0.20725946, "epoch": 0.7387344055313393, "flos": 13771958405760.0, "grad_norm": 8.221250154825556, "language_loss": 0.74456543, "learning_rate": 6.741920144718396e-07, "loss": 0.75954217, "num_input_tokens_seen": 264985480, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.23852539, "step": 12287, "time_per_iteration": 2.71813702583313 }, { "auxiliary_loss_clip": 0.01273176, "auxiliary_loss_mlp": 0.00247231, "balance_loss_clip": 1.05340946, "balance_loss_mlp": 0.2247127, "epoch": 0.7387945287840072, "flos": 27855189095040.0, "grad_norm": 6.610940292304719, "language_loss": 0.83611321, "learning_rate": 6.739004479318903e-07, "loss": 0.85131729, "num_input_tokens_seen": 265004790, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.22509766, "step": 12288, "time_per_iteration": 2.780893325805664 }, { "auxiliary_loss_clip": 0.01290982, "auxiliary_loss_mlp": 0.00231034, "balance_loss_clip": 1.05982661, "balance_loss_mlp": 0.20630975, "epoch": 0.7388546520366752, "flos": 44233039388160.0, "grad_norm": 3.663666178520096, "language_loss": 0.6531955, "learning_rate": 6.736089316777684e-07, "loss": 0.66841567, "num_input_tokens_seen": 265028790, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.24707031, "step": 12289, "time_per_iteration": 2.921173095703125 }, { "auxiliary_loss_clip": 0.01182236, "auxiliary_loss_mlp": 0.00084526, "balance_loss_clip": 1.02642405, "balance_loss_mlp": 0.0781362, "epoch": 0.7389147752893431, "flos": 70680890638080.0, "grad_norm": 0.6368397395786621, "language_loss": 0.48806775, "learning_rate": 6.733174657205287e-07, "loss": 0.5007354, "num_input_tokens_seen": 265096660, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.06396484, "step": 12290, "time_per_iteration": 3.355513334274292 }, { "auxiliary_loss_clip": 0.01276853, "auxiliary_loss_mlp": 0.00237956, "balance_loss_clip": 1.05529928, "balance_loss_mlp": 0.21231395, "epoch": 0.7389748985420111, "flos": 25994980575360.0, "grad_norm": 71.85350166931549, "language_loss": 0.78167808, "learning_rate": 6.730260500712237e-07, "loss": 0.79682612, "num_input_tokens_seen": 265116375, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25634766, "step": 12291, "time_per_iteration": 2.7469441890716553 }, { "auxiliary_loss_clip": 0.01182127, "auxiliary_loss_mlp": 0.00079594, "balance_loss_clip": 1.02547216, "balance_loss_mlp": 0.07287049, "epoch": 0.7390350217946791, "flos": 54403661318400.0, "grad_norm": 0.9859024954114693, "language_loss": 0.60809267, "learning_rate": 6.727346847409052e-07, "loss": 0.6207099, "num_input_tokens_seen": 265161230, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.06738281, "step": 12292, "time_per_iteration": 2.833390474319458 }, { "auxiliary_loss_clip": 0.01262156, "auxiliary_loss_mlp": 0.00242475, "balance_loss_clip": 1.04064691, "balance_loss_mlp": 0.21652311, "epoch": 0.7390951450473471, "flos": 32196968530560.0, "grad_norm": 5.959172625707286, "language_loss": 0.74711865, "learning_rate": 6.724433697406191e-07, "loss": 0.76216495, "num_input_tokens_seen": 265182515, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.25952148, "step": 12293, "time_per_iteration": 2.7658944129943848 }, { "auxiliary_loss_clip": 0.01283685, "auxiliary_loss_mlp": 0.00216719, "balance_loss_clip": 1.0599376, "balance_loss_mlp": 0.19246016, "epoch": 0.739155268300015, "flos": 16684241304960.0, "grad_norm": 8.730579500182467, "language_loss": 0.90405679, "learning_rate": 6.721521050814134e-07, "loss": 0.91906095, "num_input_tokens_seen": 265198160, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.24267578, "step": 12294, "time_per_iteration": 2.7008557319641113 }, { "auxiliary_loss_clip": 0.01256504, "auxiliary_loss_mlp": 0.00220483, "balance_loss_clip": 1.03518474, "balance_loss_mlp": 0.19590238, "epoch": 0.739215391552683, "flos": 31649761762560.0, "grad_norm": 2751.298062486261, "language_loss": 0.80066085, "learning_rate": 6.718608907743337e-07, "loss": 0.81543076, "num_input_tokens_seen": 265218480, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.24584961, "step": 12295, "time_per_iteration": 2.756542921066284 }, { "auxiliary_loss_clip": 0.01249615, "auxiliary_loss_mlp": 0.00251775, "balance_loss_clip": 1.0364728, "balance_loss_mlp": 0.22807637, "epoch": 0.7392755148053509, "flos": 29718522097920.0, "grad_norm": 2.962426504495809, "language_loss": 0.83596742, "learning_rate": 6.715697268304215e-07, "loss": 0.85098135, "num_input_tokens_seen": 265240165, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23693848, "step": 12296, "time_per_iteration": 2.757657051086426 }, { "auxiliary_loss_clip": 0.01255579, "auxiliary_loss_mlp": 0.0022476, "balance_loss_clip": 1.03605461, "balance_loss_mlp": 0.19922489, "epoch": 0.7393356380580189, "flos": 37050475075200.0, "grad_norm": 7.591795453232218, "language_loss": 0.76146621, "learning_rate": 6.712786132607182e-07, "loss": 0.77626961, "num_input_tokens_seen": 265263295, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25524902, "step": 12297, "time_per_iteration": 2.8081440925598145 }, { "auxiliary_loss_clip": 0.01249793, "auxiliary_loss_mlp": 0.00233276, "balance_loss_clip": 1.03533149, "balance_loss_mlp": 0.20731206, "epoch": 0.739395761310687, "flos": 19719627091200.0, "grad_norm": 46.22600802887809, "language_loss": 0.76785457, "learning_rate": 6.709875500762645e-07, "loss": 0.78268522, "num_input_tokens_seen": 265282740, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.25964355, "step": 12298, "time_per_iteration": 2.6965579986572266 }, { "auxiliary_loss_clip": 0.01267205, "auxiliary_loss_mlp": 0.00255648, "balance_loss_clip": 1.0432874, "balance_loss_mlp": 0.22925499, "epoch": 0.7394558845633549, "flos": 11801504067840.0, "grad_norm": 4.182858771359563, "language_loss": 0.8288182, "learning_rate": 6.706965372880946e-07, "loss": 0.84404671, "num_input_tokens_seen": 265300175, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26403809, "step": 12299, "time_per_iteration": 2.6821885108947754 }, { "auxiliary_loss_clip": 0.01187769, "auxiliary_loss_mlp": 0.00071432, "balance_loss_clip": 1.03289342, "balance_loss_mlp": 0.06385061, "epoch": 0.7395160078160229, "flos": 66195827850240.0, "grad_norm": 0.7342932719568352, "language_loss": 0.59923506, "learning_rate": 6.704055749072455e-07, "loss": 0.61182708, "num_input_tokens_seen": 265363275, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.07568359, "step": 12300, "time_per_iteration": 4.596611976623535 }, { "auxiliary_loss_clip": 0.01271922, "auxiliary_loss_mlp": 0.00235956, "balance_loss_clip": 1.0516479, "balance_loss_mlp": 0.21145865, "epoch": 0.7395761310686908, "flos": 21249708687360.0, "grad_norm": 3.1507778748855384, "language_loss": 0.86943674, "learning_rate": 6.7011466294475e-07, "loss": 0.88451552, "num_input_tokens_seen": 265382935, "router_z_loss_clip": 2.20019531, "router_z_loss_mlp": 0.24475098, "step": 12301, "time_per_iteration": 4.1425461769104 }, { "auxiliary_loss_clip": 0.0125001, "auxiliary_loss_mlp": 0.00240975, "balance_loss_clip": 1.03897214, "balance_loss_mlp": 0.21811107, "epoch": 0.7396362543213588, "flos": 25955299025280.0, "grad_norm": 37.06473847077831, "language_loss": 0.78968471, "learning_rate": 6.698238014116406e-07, "loss": 0.80459452, "num_input_tokens_seen": 265403245, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.22875977, "step": 12302, "time_per_iteration": 2.790836811065674 }, { "auxiliary_loss_clip": 0.01278539, "auxiliary_loss_mlp": 0.00228829, "balance_loss_clip": 1.05216038, "balance_loss_mlp": 0.20471299, "epoch": 0.7396963775740267, "flos": 27377936064000.0, "grad_norm": 35.62010155916454, "language_loss": 0.82004601, "learning_rate": 6.695329903189451e-07, "loss": 0.83511966, "num_input_tokens_seen": 265423105, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.24145508, "step": 12303, "time_per_iteration": 2.725104570388794 }, { "auxiliary_loss_clip": 0.01249287, "auxiliary_loss_mlp": 0.00232886, "balance_loss_clip": 1.03585553, "balance_loss_mlp": 0.20813853, "epoch": 0.7397565008266948, "flos": 25520133755520.0, "grad_norm": 8.614418395697774, "language_loss": 0.60980415, "learning_rate": 6.692422296776927e-07, "loss": 0.62462592, "num_input_tokens_seen": 265443445, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24755859, "step": 12304, "time_per_iteration": 4.96955680847168 }, { "auxiliary_loss_clip": 0.01261649, "auxiliary_loss_mlp": 0.0024685, "balance_loss_clip": 1.04144168, "balance_loss_mlp": 0.22093381, "epoch": 0.7398166240793627, "flos": 23727760070400.0, "grad_norm": 28.143512243882352, "language_loss": 0.90468025, "learning_rate": 6.689515194989084e-07, "loss": 0.91976523, "num_input_tokens_seen": 265462085, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.2590332, "step": 12305, "time_per_iteration": 2.683502197265625 }, { "auxiliary_loss_clip": 0.01201364, "auxiliary_loss_mlp": 0.00068651, "balance_loss_clip": 1.04203963, "balance_loss_mlp": 0.06202263, "epoch": 0.7398767473320307, "flos": 67267582882560.0, "grad_norm": 0.8631616631077789, "language_loss": 0.574377, "learning_rate": 6.68660859793615e-07, "loss": 0.58707714, "num_input_tokens_seen": 265521190, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.06640625, "step": 12306, "time_per_iteration": 3.205760955810547 }, { "auxiliary_loss_clip": 0.01278385, "auxiliary_loss_mlp": 0.00210827, "balance_loss_clip": 1.0519197, "balance_loss_mlp": 0.18368325, "epoch": 0.7399368705846986, "flos": 22018699981440.0, "grad_norm": 12.261800877866241, "language_loss": 0.90508467, "learning_rate": 6.683702505728355e-07, "loss": 0.91997677, "num_input_tokens_seen": 265539705, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.2713623, "step": 12307, "time_per_iteration": 2.7140860557556152 }, { "auxiliary_loss_clip": 0.0124699, "auxiliary_loss_mlp": 0.00233962, "balance_loss_clip": 1.03276002, "balance_loss_mlp": 0.21027523, "epoch": 0.7399969938373666, "flos": 14173870659840.0, "grad_norm": 41.94612032727318, "language_loss": 0.78864467, "learning_rate": 6.680796918475893e-07, "loss": 0.80345416, "num_input_tokens_seen": 265555855, "router_z_loss_clip": 2.13964844, "router_z_loss_mlp": 0.23681641, "step": 12308, "time_per_iteration": 2.680783271789551 }, { "auxiliary_loss_clip": 0.01249394, "auxiliary_loss_mlp": 0.00246044, "balance_loss_clip": 1.03488445, "balance_loss_mlp": 0.22300103, "epoch": 0.7400571170900345, "flos": 25301473712640.0, "grad_norm": 88.13468179700475, "language_loss": 0.89085853, "learning_rate": 6.67789183628896e-07, "loss": 0.90581292, "num_input_tokens_seen": 265575455, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.23046875, "step": 12309, "time_per_iteration": 4.143434286117554 }, { "auxiliary_loss_clip": 0.0127052, "auxiliary_loss_mlp": 0.00246445, "balance_loss_clip": 1.04760885, "balance_loss_mlp": 0.2202428, "epoch": 0.7401172403427025, "flos": 22711344917760.0, "grad_norm": 2.131792418124988, "language_loss": 0.80502224, "learning_rate": 6.674987259277692e-07, "loss": 0.82019192, "num_input_tokens_seen": 265595250, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.26220703, "step": 12310, "time_per_iteration": 2.70511531829834 }, { "auxiliary_loss_clip": 0.01271008, "auxiliary_loss_mlp": 0.00245146, "balance_loss_clip": 1.05160046, "balance_loss_mlp": 0.22015955, "epoch": 0.7401773635953706, "flos": 18067448188800.0, "grad_norm": 6.525572646256948, "language_loss": 0.9635663, "learning_rate": 6.672083187552239e-07, "loss": 0.97872782, "num_input_tokens_seen": 265606945, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25, "step": 12311, "time_per_iteration": 2.6612541675567627 }, { "auxiliary_loss_clip": 0.0127298, "auxiliary_loss_mlp": 0.00249347, "balance_loss_clip": 1.0553329, "balance_loss_mlp": 0.2237291, "epoch": 0.7402374868480385, "flos": 22712135016960.0, "grad_norm": 18.776523666994663, "language_loss": 0.86374891, "learning_rate": 6.669179621222738e-07, "loss": 0.87897211, "num_input_tokens_seen": 265626115, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25646973, "step": 12312, "time_per_iteration": 2.7320120334625244 }, { "auxiliary_loss_clip": 0.0126986, "auxiliary_loss_mlp": 0.00240103, "balance_loss_clip": 1.05643106, "balance_loss_mlp": 0.21617815, "epoch": 0.7402976101007065, "flos": 22856675345280.0, "grad_norm": 6.934692127465287, "language_loss": 0.84799212, "learning_rate": 6.666276560399273e-07, "loss": 0.86309177, "num_input_tokens_seen": 265646520, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.23925781, "step": 12313, "time_per_iteration": 2.760917901992798 }, { "auxiliary_loss_clip": 0.01298998, "auxiliary_loss_mlp": 0.00245384, "balance_loss_clip": 1.06722915, "balance_loss_mlp": 0.21887223, "epoch": 0.7403577333533744, "flos": 12345801834240.0, "grad_norm": 24.84446095604338, "language_loss": 0.85565025, "learning_rate": 6.663374005191937e-07, "loss": 0.87109405, "num_input_tokens_seen": 265661875, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.26489258, "step": 12314, "time_per_iteration": 2.6825313568115234 }, { "auxiliary_loss_clip": 0.01180918, "auxiliary_loss_mlp": 0.0006326, "balance_loss_clip": 1.02496767, "balance_loss_mlp": 0.05644093, "epoch": 0.7404178566060424, "flos": 60327270869760.0, "grad_norm": 0.8222137382726563, "language_loss": 0.54746795, "learning_rate": 6.660471955710809e-07, "loss": 0.5599097, "num_input_tokens_seen": 265721255, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.06835938, "step": 12315, "time_per_iteration": 3.180222511291504 }, { "auxiliary_loss_clip": 0.01261389, "auxiliary_loss_mlp": 0.00237329, "balance_loss_clip": 1.04536736, "balance_loss_mlp": 0.21372585, "epoch": 0.7404779798587103, "flos": 32014650072960.0, "grad_norm": 27.87922019102722, "language_loss": 0.85977137, "learning_rate": 6.65757041206591e-07, "loss": 0.8747586, "num_input_tokens_seen": 265743970, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23596191, "step": 12316, "time_per_iteration": 2.7724194526672363 }, { "auxiliary_loss_clip": 0.0126291, "auxiliary_loss_mlp": 0.00264082, "balance_loss_clip": 1.0444746, "balance_loss_mlp": 0.23892914, "epoch": 0.7405381031113784, "flos": 12889704551040.0, "grad_norm": 9.211984609038568, "language_loss": 0.84669304, "learning_rate": 6.654669374367275e-07, "loss": 0.86196297, "num_input_tokens_seen": 265760890, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25134277, "step": 12317, "time_per_iteration": 2.6675539016723633 }, { "auxiliary_loss_clip": 0.0124324, "auxiliary_loss_mlp": 0.00243878, "balance_loss_clip": 1.03436852, "balance_loss_mlp": 0.22021475, "epoch": 0.7405982263640463, "flos": 20229127557120.0, "grad_norm": 3.8344483651347314, "language_loss": 0.88487709, "learning_rate": 6.651768842724917e-07, "loss": 0.89974827, "num_input_tokens_seen": 265779600, "router_z_loss_clip": 2.08691406, "router_z_loss_mlp": 0.23657227, "step": 12318, "time_per_iteration": 2.7700178623199463 }, { "auxiliary_loss_clip": 0.01279761, "auxiliary_loss_mlp": 0.00219607, "balance_loss_clip": 1.05663967, "balance_loss_mlp": 0.19603959, "epoch": 0.7406583496167143, "flos": 17567213431680.0, "grad_norm": 50.350757860801636, "language_loss": 0.85339892, "learning_rate": 6.648868817248827e-07, "loss": 0.86839259, "num_input_tokens_seen": 265797030, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.23571777, "step": 12319, "time_per_iteration": 2.645869016647339 }, { "auxiliary_loss_clip": 0.01264088, "auxiliary_loss_mlp": 0.00252986, "balance_loss_clip": 1.04723525, "balance_loss_mlp": 0.23008566, "epoch": 0.7407184728693822, "flos": 18295733076480.0, "grad_norm": 9.4393129128646, "language_loss": 0.73316777, "learning_rate": 6.64596929804897e-07, "loss": 0.74833852, "num_input_tokens_seen": 265815055, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.22912598, "step": 12320, "time_per_iteration": 2.6533761024475098 }, { "auxiliary_loss_clip": 0.01290181, "auxiliary_loss_mlp": 0.00231631, "balance_loss_clip": 1.06447577, "balance_loss_mlp": 0.20510654, "epoch": 0.7407785961220502, "flos": 16690562098560.0, "grad_norm": 12.206445506125954, "language_loss": 0.93864954, "learning_rate": 6.643070285235288e-07, "loss": 0.95386755, "num_input_tokens_seen": 265828480, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26525879, "step": 12321, "time_per_iteration": 2.6335253715515137 }, { "auxiliary_loss_clip": 0.01311885, "auxiliary_loss_mlp": 0.00259949, "balance_loss_clip": 1.07730031, "balance_loss_mlp": 0.23176795, "epoch": 0.7408387193747181, "flos": 22088330496000.0, "grad_norm": 114.08934743370307, "language_loss": 0.80890179, "learning_rate": 6.640171778917727e-07, "loss": 0.82462025, "num_input_tokens_seen": 265845825, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.28186035, "step": 12322, "time_per_iteration": 2.6503727436065674 }, { "auxiliary_loss_clip": 0.01273417, "auxiliary_loss_mlp": 0.00221632, "balance_loss_clip": 1.05282807, "balance_loss_mlp": 0.19689611, "epoch": 0.7408988426273861, "flos": 24236721832320.0, "grad_norm": 144.39479727663962, "language_loss": 0.71623683, "learning_rate": 6.637273779206183e-07, "loss": 0.73118734, "num_input_tokens_seen": 265866335, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24743652, "step": 12323, "time_per_iteration": 2.6856162548065186 }, { "auxiliary_loss_clip": 0.01274207, "auxiliary_loss_mlp": 0.00237923, "balance_loss_clip": 1.04965639, "balance_loss_mlp": 0.21180406, "epoch": 0.7409589658800542, "flos": 29023004073600.0, "grad_norm": 1152.7223428246532, "language_loss": 0.82406098, "learning_rate": 6.634376286210559e-07, "loss": 0.83918226, "num_input_tokens_seen": 265888945, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.26123047, "step": 12324, "time_per_iteration": 2.7285168170928955 }, { "auxiliary_loss_clip": 0.01259609, "auxiliary_loss_mlp": 0.00254554, "balance_loss_clip": 1.03711486, "balance_loss_mlp": 0.23116533, "epoch": 0.7410190891327221, "flos": 19351362902400.0, "grad_norm": 12.753875034701235, "language_loss": 0.8112638, "learning_rate": 6.63147930004073e-07, "loss": 0.82640541, "num_input_tokens_seen": 265908030, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.23388672, "step": 12325, "time_per_iteration": 2.6304686069488525 }, { "auxiliary_loss_clip": 0.01272615, "auxiliary_loss_mlp": 0.00232745, "balance_loss_clip": 1.04718268, "balance_loss_mlp": 0.20691228, "epoch": 0.7410792123853901, "flos": 22747650589440.0, "grad_norm": 51.61392070808152, "language_loss": 0.79252505, "learning_rate": 6.628582820806545e-07, "loss": 0.80757862, "num_input_tokens_seen": 265927030, "router_z_loss_clip": 2.25488281, "router_z_loss_mlp": 0.25842285, "step": 12326, "time_per_iteration": 2.6819567680358887 }, { "auxiliary_loss_clip": 0.01268437, "auxiliary_loss_mlp": 0.00223752, "balance_loss_clip": 1.04557002, "balance_loss_mlp": 0.20054199, "epoch": 0.741139335638058, "flos": 25372433030400.0, "grad_norm": 212.69080790820658, "language_loss": 0.94677323, "learning_rate": 6.625686848617835e-07, "loss": 0.96169513, "num_input_tokens_seen": 265945490, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.2322998, "step": 12327, "time_per_iteration": 2.7108561992645264 }, { "auxiliary_loss_clip": 0.01292858, "auxiliary_loss_mlp": 0.00236716, "balance_loss_clip": 1.06301355, "balance_loss_mlp": 0.20919031, "epoch": 0.741199458890726, "flos": 18585639745920.0, "grad_norm": 16.131904842700603, "language_loss": 0.94526333, "learning_rate": 6.62279138358442e-07, "loss": 0.96055907, "num_input_tokens_seen": 265963265, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.27526855, "step": 12328, "time_per_iteration": 2.6415703296661377 }, { "auxiliary_loss_clip": 0.01267565, "auxiliary_loss_mlp": 0.00251723, "balance_loss_clip": 1.04815733, "balance_loss_mlp": 0.22618826, "epoch": 0.7412595821433939, "flos": 22127078292480.0, "grad_norm": 58.537342001614725, "language_loss": 0.73510647, "learning_rate": 6.619896425816103e-07, "loss": 0.75029939, "num_input_tokens_seen": 265982270, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25524902, "step": 12329, "time_per_iteration": 2.747649908065796 }, { "auxiliary_loss_clip": 0.01307838, "auxiliary_loss_mlp": 0.00282692, "balance_loss_clip": 1.07589221, "balance_loss_mlp": 0.25651357, "epoch": 0.741319705396062, "flos": 29169699217920.0, "grad_norm": 19.722489259761467, "language_loss": 0.73948866, "learning_rate": 6.617001975422647e-07, "loss": 0.75539398, "num_input_tokens_seen": 266003835, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.26196289, "step": 12330, "time_per_iteration": 2.708306074142456 }, { "auxiliary_loss_clip": 0.012972, "auxiliary_loss_mlp": 0.00266641, "balance_loss_clip": 1.06849432, "balance_loss_mlp": 0.23894837, "epoch": 0.7413798286487299, "flos": 20667489137280.0, "grad_norm": 1907.6995867907722, "language_loss": 0.95358223, "learning_rate": 6.614108032513823e-07, "loss": 0.96922064, "num_input_tokens_seen": 266021595, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27697754, "step": 12331, "time_per_iteration": 2.6816039085388184 }, { "auxiliary_loss_clip": 0.0126682, "auxiliary_loss_mlp": 0.00247874, "balance_loss_clip": 1.04758477, "balance_loss_mlp": 0.22437817, "epoch": 0.7414399519013979, "flos": 16398895662720.0, "grad_norm": 13.610097619950784, "language_loss": 0.77901328, "learning_rate": 6.611214597199364e-07, "loss": 0.79416019, "num_input_tokens_seen": 266039860, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.23474121, "step": 12332, "time_per_iteration": 2.6564199924468994 }, { "auxiliary_loss_clip": 0.01269705, "auxiliary_loss_mlp": 0.00243835, "balance_loss_clip": 1.047719, "balance_loss_mlp": 0.21684563, "epoch": 0.7415000751540658, "flos": 25630235919360.0, "grad_norm": 52.988527000328595, "language_loss": 0.73411912, "learning_rate": 6.608321669588984e-07, "loss": 0.74925447, "num_input_tokens_seen": 266058050, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.27001953, "step": 12333, "time_per_iteration": 2.7015342712402344 }, { "auxiliary_loss_clip": 0.01260696, "auxiliary_loss_mlp": 0.00231982, "balance_loss_clip": 1.04454207, "balance_loss_mlp": 0.20750877, "epoch": 0.7415601984067338, "flos": 24499732193280.0, "grad_norm": 3.870097106515917, "language_loss": 0.77805012, "learning_rate": 6.605429249792387e-07, "loss": 0.79297698, "num_input_tokens_seen": 266078060, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24499512, "step": 12334, "time_per_iteration": 2.7093048095703125 }, { "auxiliary_loss_clip": 0.01286268, "auxiliary_loss_mlp": 0.00251532, "balance_loss_clip": 1.06230116, "balance_loss_mlp": 0.22714148, "epoch": 0.7416203216594017, "flos": 20887154760960.0, "grad_norm": 6.0518320904831056, "language_loss": 0.88798124, "learning_rate": 6.602537337919257e-07, "loss": 0.90335923, "num_input_tokens_seen": 266097110, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.24401855, "step": 12335, "time_per_iteration": 2.7232179641723633 }, { "auxiliary_loss_clip": 0.01282102, "auxiliary_loss_mlp": 0.0025603, "balance_loss_clip": 1.05613661, "balance_loss_mlp": 0.22916003, "epoch": 0.7416804449120697, "flos": 15624265933440.0, "grad_norm": 49.1363773580001, "language_loss": 0.85179108, "learning_rate": 6.599645934079259e-07, "loss": 0.86717236, "num_input_tokens_seen": 266110870, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26855469, "step": 12336, "time_per_iteration": 2.588747501373291 }, { "auxiliary_loss_clip": 0.01290757, "auxiliary_loss_mlp": 0.00229421, "balance_loss_clip": 1.06101012, "balance_loss_mlp": 0.20373128, "epoch": 0.7417405681647377, "flos": 17120483982720.0, "grad_norm": 13.1220028417644, "language_loss": 0.82790154, "learning_rate": 6.596755038382029e-07, "loss": 0.84310329, "num_input_tokens_seen": 266127845, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.25708008, "step": 12337, "time_per_iteration": 2.7125251293182373 }, { "auxiliary_loss_clip": 0.01288277, "auxiliary_loss_mlp": 0.00255678, "balance_loss_clip": 1.06232846, "balance_loss_mlp": 0.23076329, "epoch": 0.7418006914174057, "flos": 18880322924160.0, "grad_norm": 7.855338871381207, "language_loss": 0.83500534, "learning_rate": 6.593864650937186e-07, "loss": 0.85044491, "num_input_tokens_seen": 266145400, "router_z_loss_clip": 2.25878906, "router_z_loss_mlp": 0.24890137, "step": 12338, "time_per_iteration": 2.6354596614837646 }, { "auxiliary_loss_clip": 0.01259641, "auxiliary_loss_mlp": 0.00220942, "balance_loss_clip": 1.04086375, "balance_loss_mlp": 0.1967068, "epoch": 0.7418608146700737, "flos": 21580733450880.0, "grad_norm": 10.402588911340676, "language_loss": 0.79122186, "learning_rate": 6.590974771854345e-07, "loss": 0.80602771, "num_input_tokens_seen": 266164430, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24230957, "step": 12339, "time_per_iteration": 2.713149309158325 }, { "auxiliary_loss_clip": 0.01263644, "auxiliary_loss_mlp": 0.00248999, "balance_loss_clip": 1.04104662, "balance_loss_mlp": 0.22375041, "epoch": 0.7419209379227416, "flos": 22340459036160.0, "grad_norm": 6.226652093977864, "language_loss": 0.88181746, "learning_rate": 6.588085401243077e-07, "loss": 0.89694393, "num_input_tokens_seen": 266183855, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25280762, "step": 12340, "time_per_iteration": 2.7532715797424316 }, { "auxiliary_loss_clip": 0.0126604, "auxiliary_loss_mlp": 0.00240102, "balance_loss_clip": 1.04737139, "balance_loss_mlp": 0.21486507, "epoch": 0.7419810611754096, "flos": 16762275601920.0, "grad_norm": 6.058950939536265, "language_loss": 0.81424636, "learning_rate": 6.585196539212958e-07, "loss": 0.82930779, "num_input_tokens_seen": 266202085, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25244141, "step": 12341, "time_per_iteration": 2.6485252380371094 }, { "auxiliary_loss_clip": 0.01247024, "auxiliary_loss_mlp": 0.00254853, "balance_loss_clip": 1.0386281, "balance_loss_mlp": 0.23052262, "epoch": 0.7420411844280775, "flos": 26212958259840.0, "grad_norm": 12.425008886197322, "language_loss": 0.84882271, "learning_rate": 6.582308185873535e-07, "loss": 0.86384153, "num_input_tokens_seen": 266223445, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.24328613, "step": 12342, "time_per_iteration": 4.115522861480713 }, { "auxiliary_loss_clip": 0.01276249, "auxiliary_loss_mlp": 0.00234358, "balance_loss_clip": 1.05618191, "balance_loss_mlp": 0.20921716, "epoch": 0.7421013076807456, "flos": 68529371840640.0, "grad_norm": 2.419586108572292, "language_loss": 0.84504092, "learning_rate": 6.57942034133433e-07, "loss": 0.860147, "num_input_tokens_seen": 266246575, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.25146484, "step": 12343, "time_per_iteration": 3.0371694564819336 }, { "auxiliary_loss_clip": 0.01275644, "auxiliary_loss_mlp": 0.00234607, "balance_loss_clip": 1.05202043, "balance_loss_mlp": 0.20954902, "epoch": 0.7421614309334135, "flos": 24425325169920.0, "grad_norm": 9.241459295385848, "language_loss": 0.74975848, "learning_rate": 6.576533005704843e-07, "loss": 0.76486099, "num_input_tokens_seen": 266266055, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.25061035, "step": 12344, "time_per_iteration": 4.090264320373535 }, { "auxiliary_loss_clip": 0.01280745, "auxiliary_loss_mlp": 0.00239145, "balance_loss_clip": 1.05608773, "balance_loss_mlp": 0.21345538, "epoch": 0.7422215541860815, "flos": 12311076360960.0, "grad_norm": 26.787580886230938, "language_loss": 0.92116559, "learning_rate": 6.573646179094572e-07, "loss": 0.93636447, "num_input_tokens_seen": 266282240, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.25708008, "step": 12345, "time_per_iteration": 2.674975872039795 }, { "auxiliary_loss_clip": 0.01291667, "auxiliary_loss_mlp": 0.00273714, "balance_loss_clip": 1.06229436, "balance_loss_mlp": 0.24769068, "epoch": 0.7422816774387494, "flos": 19645579203840.0, "grad_norm": 37.43950005349612, "language_loss": 0.81339109, "learning_rate": 6.570759861612988e-07, "loss": 0.82904494, "num_input_tokens_seen": 266300980, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.26000977, "step": 12346, "time_per_iteration": 2.7408361434936523 }, { "auxiliary_loss_clip": 0.01263868, "auxiliary_loss_mlp": 0.00236924, "balance_loss_clip": 1.04140091, "balance_loss_mlp": 0.21183026, "epoch": 0.7423418006914174, "flos": 32015978876160.0, "grad_norm": 10.113905430718432, "language_loss": 0.8014791, "learning_rate": 6.56787405336953e-07, "loss": 0.81648695, "num_input_tokens_seen": 266322215, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25097656, "step": 12347, "time_per_iteration": 4.377183198928833 }, { "auxiliary_loss_clip": 0.01304925, "auxiliary_loss_mlp": 0.00233684, "balance_loss_clip": 1.07192802, "balance_loss_mlp": 0.20799448, "epoch": 0.7424019239440853, "flos": 18916951818240.0, "grad_norm": 9.218649138874824, "language_loss": 0.89985913, "learning_rate": 6.564988754473642e-07, "loss": 0.91524518, "num_input_tokens_seen": 266341600, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.2565918, "step": 12348, "time_per_iteration": 2.6815731525421143 }, { "auxiliary_loss_clip": 0.01282174, "auxiliary_loss_mlp": 0.00240599, "balance_loss_clip": 1.05747008, "balance_loss_mlp": 0.21477875, "epoch": 0.7424620471967533, "flos": 35876518871040.0, "grad_norm": 9.864778043461547, "language_loss": 0.7859416, "learning_rate": 6.562103965034724e-07, "loss": 0.80116934, "num_input_tokens_seen": 266362895, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.25817871, "step": 12349, "time_per_iteration": 2.7848119735717773 }, { "auxiliary_loss_clip": 0.01297856, "auxiliary_loss_mlp": 0.00267954, "balance_loss_clip": 1.06320059, "balance_loss_mlp": 0.24028581, "epoch": 0.7425221704494213, "flos": 27016603200000.0, "grad_norm": 15.764120348421692, "language_loss": 0.87475222, "learning_rate": 6.559219685162165e-07, "loss": 0.8904103, "num_input_tokens_seen": 266384015, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.27697754, "step": 12350, "time_per_iteration": 2.812830924987793 }, { "auxiliary_loss_clip": 0.01283197, "auxiliary_loss_mlp": 0.0025219, "balance_loss_clip": 1.06045175, "balance_loss_mlp": 0.22727554, "epoch": 0.7425822937020893, "flos": 34167135559680.0, "grad_norm": 3.790930262190436, "language_loss": 0.81908989, "learning_rate": 6.556335914965343e-07, "loss": 0.83444381, "num_input_tokens_seen": 266405990, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.24926758, "step": 12351, "time_per_iteration": 4.25305962562561 }, { "auxiliary_loss_clip": 0.01281712, "auxiliary_loss_mlp": 0.00246324, "balance_loss_clip": 1.05209661, "balance_loss_mlp": 0.22001421, "epoch": 0.7426424169547573, "flos": 21283572234240.0, "grad_norm": 7.3218196399769395, "language_loss": 0.90289998, "learning_rate": 6.553452654553611e-07, "loss": 0.91818035, "num_input_tokens_seen": 266424260, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.26318359, "step": 12352, "time_per_iteration": 2.653043270111084 }, { "auxiliary_loss_clip": 0.01295569, "auxiliary_loss_mlp": 0.0024227, "balance_loss_clip": 1.06440449, "balance_loss_mlp": 0.21561509, "epoch": 0.7427025402074252, "flos": 22448442297600.0, "grad_norm": 48.575608289907755, "language_loss": 0.79113585, "learning_rate": 6.550569904036307e-07, "loss": 0.8065142, "num_input_tokens_seen": 266444580, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.26647949, "step": 12353, "time_per_iteration": 2.705878496170044 }, { "auxiliary_loss_clip": 0.01292525, "auxiliary_loss_mlp": 0.00251562, "balance_loss_clip": 1.07066965, "balance_loss_mlp": 0.22539514, "epoch": 0.7427626634600932, "flos": 22524609087360.0, "grad_norm": 276.0480346472409, "language_loss": 0.7935183, "learning_rate": 6.547687663522739e-07, "loss": 0.80895913, "num_input_tokens_seen": 266465640, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.26135254, "step": 12354, "time_per_iteration": 2.6751458644866943 }, { "auxiliary_loss_clip": 0.01193975, "auxiliary_loss_mlp": 0.00130749, "balance_loss_clip": 1.04040718, "balance_loss_mlp": 0.12192784, "epoch": 0.7428227867127611, "flos": 67209477655680.0, "grad_norm": 0.6817748025487731, "language_loss": 0.58936971, "learning_rate": 6.544805933122199e-07, "loss": 0.60261697, "num_input_tokens_seen": 266531950, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.08837891, "step": 12355, "time_per_iteration": 3.273772716522217 }, { "auxiliary_loss_clip": 0.01285365, "auxiliary_loss_mlp": 0.00243105, "balance_loss_clip": 1.05826378, "balance_loss_mlp": 0.21575797, "epoch": 0.7428829099654292, "flos": 14721221082240.0, "grad_norm": 10.105509002819252, "language_loss": 0.76742649, "learning_rate": 6.541924712943971e-07, "loss": 0.78271121, "num_input_tokens_seen": 266550665, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.27307129, "step": 12356, "time_per_iteration": 2.6638338565826416 }, { "auxiliary_loss_clip": 0.01290599, "auxiliary_loss_mlp": 0.00246597, "balance_loss_clip": 1.06291962, "balance_loss_mlp": 0.22109783, "epoch": 0.7429430332180971, "flos": 48646496413440.0, "grad_norm": 12.248818908404452, "language_loss": 0.80559218, "learning_rate": 6.539044003097301e-07, "loss": 0.82096416, "num_input_tokens_seen": 266572455, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.25500488, "step": 12357, "time_per_iteration": 2.9150428771972656 }, { "auxiliary_loss_clip": 0.01270678, "auxiliary_loss_mlp": 0.00239347, "balance_loss_clip": 1.05195463, "balance_loss_mlp": 0.21517158, "epoch": 0.7430031564707651, "flos": 16764071281920.0, "grad_norm": 27.279979428767117, "language_loss": 0.72751474, "learning_rate": 6.53616380369143e-07, "loss": 0.74261498, "num_input_tokens_seen": 266590895, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24157715, "step": 12358, "time_per_iteration": 2.635324716567993 }, { "auxiliary_loss_clip": 0.01292273, "auxiliary_loss_mlp": 0.00240456, "balance_loss_clip": 1.06031752, "balance_loss_mlp": 0.21367016, "epoch": 0.743063279723433, "flos": 23870576545920.0, "grad_norm": 6.8226567260965725, "language_loss": 0.88673192, "learning_rate": 6.533284114835591e-07, "loss": 0.90205926, "num_input_tokens_seen": 266607660, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.26818848, "step": 12359, "time_per_iteration": 2.6608457565307617 }, { "auxiliary_loss_clip": 0.01276022, "auxiliary_loss_mlp": 0.00233179, "balance_loss_clip": 1.05333138, "balance_loss_mlp": 0.2072387, "epoch": 0.743123402976101, "flos": 14391704689920.0, "grad_norm": 14.263246819026817, "language_loss": 0.76515102, "learning_rate": 6.530404936638956e-07, "loss": 0.78024298, "num_input_tokens_seen": 266624260, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25939941, "step": 12360, "time_per_iteration": 2.7221288681030273 }, { "auxiliary_loss_clip": 0.01271665, "auxiliary_loss_mlp": 0.00248721, "balance_loss_clip": 1.04852223, "balance_loss_mlp": 0.22213764, "epoch": 0.7431835262287689, "flos": 27454318335360.0, "grad_norm": 3.3429178163519744, "language_loss": 0.80355501, "learning_rate": 6.527526269210715e-07, "loss": 0.81875885, "num_input_tokens_seen": 266644210, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26611328, "step": 12361, "time_per_iteration": 2.6906986236572266 }, { "auxiliary_loss_clip": 0.0129508, "auxiliary_loss_mlp": 0.0023771, "balance_loss_clip": 1.06704521, "balance_loss_mlp": 0.21129346, "epoch": 0.743243649481437, "flos": 20959514709120.0, "grad_norm": 3.900412409653168, "language_loss": 0.66069233, "learning_rate": 6.524648112660027e-07, "loss": 0.67602026, "num_input_tokens_seen": 266664230, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.26428223, "step": 12362, "time_per_iteration": 2.7057554721832275 }, { "auxiliary_loss_clip": 0.01288297, "auxiliary_loss_mlp": 0.00272153, "balance_loss_clip": 1.0560267, "balance_loss_mlp": 0.24342361, "epoch": 0.7433037727341049, "flos": 22783166161920.0, "grad_norm": 78.43477996246736, "language_loss": 0.84827423, "learning_rate": 6.521770467096039e-07, "loss": 0.86387873, "num_input_tokens_seen": 266683270, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.28759766, "step": 12363, "time_per_iteration": 2.6924140453338623 }, { "auxiliary_loss_clip": 0.01265658, "auxiliary_loss_mlp": 0.00245473, "balance_loss_clip": 1.04470825, "balance_loss_mlp": 0.22023611, "epoch": 0.7433638959867729, "flos": 22196708807040.0, "grad_norm": 189.29369760420508, "language_loss": 0.83714485, "learning_rate": 6.518893332627862e-07, "loss": 0.85225606, "num_input_tokens_seen": 266701235, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25244141, "step": 12364, "time_per_iteration": 2.7528648376464844 }, { "auxiliary_loss_clip": 0.01285204, "auxiliary_loss_mlp": 0.00272287, "balance_loss_clip": 1.05910265, "balance_loss_mlp": 0.24380806, "epoch": 0.7434240192394409, "flos": 23296760778240.0, "grad_norm": 59.61805312879276, "language_loss": 0.85393119, "learning_rate": 6.516016709364604e-07, "loss": 0.86950606, "num_input_tokens_seen": 266721495, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.28466797, "step": 12365, "time_per_iteration": 2.6700475215911865 }, { "auxiliary_loss_clip": 0.01283363, "auxiliary_loss_mlp": 0.00267221, "balance_loss_clip": 1.05538273, "balance_loss_mlp": 0.24030387, "epoch": 0.7434841424921088, "flos": 54009575251200.0, "grad_norm": 45.02109916581162, "language_loss": 0.82717741, "learning_rate": 6.513140597415346e-07, "loss": 0.8426832, "num_input_tokens_seen": 266747400, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.26904297, "step": 12366, "time_per_iteration": 2.9759387969970703 }, { "auxiliary_loss_clip": 0.01278904, "auxiliary_loss_mlp": 0.00263262, "balance_loss_clip": 1.05798829, "balance_loss_mlp": 0.23818043, "epoch": 0.7435442657447768, "flos": 21433966479360.0, "grad_norm": 14.352109722750543, "language_loss": 0.77897823, "learning_rate": 6.510264996889141e-07, "loss": 0.79439986, "num_input_tokens_seen": 266767630, "router_z_loss_clip": 2.20996094, "router_z_loss_mlp": 0.25073242, "step": 12367, "time_per_iteration": 2.671523332595825 }, { "auxiliary_loss_clip": 0.0129871, "auxiliary_loss_mlp": 0.0025631, "balance_loss_clip": 1.06542468, "balance_loss_mlp": 0.22781873, "epoch": 0.7436043889974447, "flos": 24499408970880.0, "grad_norm": 43.78658361438385, "language_loss": 0.82542038, "learning_rate": 6.507389907895038e-07, "loss": 0.84097058, "num_input_tokens_seen": 266788015, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.28491211, "step": 12368, "time_per_iteration": 2.706244468688965 }, { "auxiliary_loss_clip": 0.01265705, "auxiliary_loss_mlp": 0.0022569, "balance_loss_clip": 1.04983366, "balance_loss_mlp": 0.20288537, "epoch": 0.7436645122501128, "flos": 40698388512000.0, "grad_norm": 5.241704048347659, "language_loss": 0.76046824, "learning_rate": 6.50451533054207e-07, "loss": 0.77538216, "num_input_tokens_seen": 266809010, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.22814941, "step": 12369, "time_per_iteration": 2.8648805618286133 }, { "auxiliary_loss_clip": 0.0128662, "auxiliary_loss_mlp": 0.00268538, "balance_loss_clip": 1.05788827, "balance_loss_mlp": 0.24257454, "epoch": 0.7437246355027807, "flos": 18908835344640.0, "grad_norm": 6.1420304554672045, "language_loss": 0.81279796, "learning_rate": 6.501641264939233e-07, "loss": 0.82834953, "num_input_tokens_seen": 266825390, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.25964355, "step": 12370, "time_per_iteration": 2.620091438293457 }, { "auxiliary_loss_clip": 0.01267887, "auxiliary_loss_mlp": 0.00241462, "balance_loss_clip": 1.04701579, "balance_loss_mlp": 0.21733364, "epoch": 0.7437847587554487, "flos": 21543817248000.0, "grad_norm": 4.497211577674663, "language_loss": 0.84656221, "learning_rate": 6.498767711195503e-07, "loss": 0.86165571, "num_input_tokens_seen": 266844675, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24133301, "step": 12371, "time_per_iteration": 2.716639280319214 }, { "auxiliary_loss_clip": 0.01271885, "auxiliary_loss_mlp": 0.00252943, "balance_loss_clip": 1.04447174, "balance_loss_mlp": 0.22737269, "epoch": 0.7438448820081166, "flos": 27782470010880.0, "grad_norm": 75.66619695928526, "language_loss": 0.79598427, "learning_rate": 6.495894669419857e-07, "loss": 0.81123257, "num_input_tokens_seen": 266865160, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.25585938, "step": 12372, "time_per_iteration": 2.7481513023376465 }, { "auxiliary_loss_clip": 0.01279511, "auxiliary_loss_mlp": 0.00231568, "balance_loss_clip": 1.05363297, "balance_loss_mlp": 0.20614058, "epoch": 0.7439050052607846, "flos": 17967832796160.0, "grad_norm": 110.88070416135915, "language_loss": 0.85569847, "learning_rate": 6.493022139721245e-07, "loss": 0.87080932, "num_input_tokens_seen": 266883285, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.25439453, "step": 12373, "time_per_iteration": 2.680837392807007 }, { "auxiliary_loss_clip": 0.01298554, "auxiliary_loss_mlp": 0.00229455, "balance_loss_clip": 1.06582522, "balance_loss_mlp": 0.20104736, "epoch": 0.7439651285134525, "flos": 22958696949120.0, "grad_norm": 12.226254449920933, "language_loss": 0.85953283, "learning_rate": 6.49015012220858e-07, "loss": 0.87481284, "num_input_tokens_seen": 266900960, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.28381348, "step": 12374, "time_per_iteration": 2.651879072189331 }, { "auxiliary_loss_clip": 0.01284219, "auxiliary_loss_mlp": 0.00253195, "balance_loss_clip": 1.05427241, "balance_loss_mlp": 0.22681428, "epoch": 0.7440252517661206, "flos": 18806777827200.0, "grad_norm": 27.562222894382376, "language_loss": 0.85499287, "learning_rate": 6.487278616990774e-07, "loss": 0.87036705, "num_input_tokens_seen": 266917710, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.26367188, "step": 12375, "time_per_iteration": 2.686805248260498 }, { "auxiliary_loss_clip": 0.01266241, "auxiliary_loss_mlp": 0.00222396, "balance_loss_clip": 1.04675543, "balance_loss_mlp": 0.19793417, "epoch": 0.7440853750187885, "flos": 20266295155200.0, "grad_norm": 230.2157192614666, "language_loss": 0.83396804, "learning_rate": 6.484407624176733e-07, "loss": 0.84885442, "num_input_tokens_seen": 266934220, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.24462891, "step": 12376, "time_per_iteration": 2.730562925338745 }, { "auxiliary_loss_clip": 0.01275455, "auxiliary_loss_mlp": 0.00228869, "balance_loss_clip": 1.04596317, "balance_loss_mlp": 0.20315555, "epoch": 0.7441454982714565, "flos": 25337276593920.0, "grad_norm": 32.79194256582312, "language_loss": 0.8724041, "learning_rate": 6.481537143875296e-07, "loss": 0.88744736, "num_input_tokens_seen": 266955210, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.25720215, "step": 12377, "time_per_iteration": 2.816422700881958 }, { "auxiliary_loss_clip": 0.01278755, "auxiliary_loss_mlp": 0.00237772, "balance_loss_clip": 1.05042863, "balance_loss_mlp": 0.21219015, "epoch": 0.7442056215241245, "flos": 64480910866560.0, "grad_norm": 19.934309582573576, "language_loss": 0.77322984, "learning_rate": 6.478667176195322e-07, "loss": 0.78839505, "num_input_tokens_seen": 266976555, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.2557373, "step": 12378, "time_per_iteration": 3.0794167518615723 }, { "auxiliary_loss_clip": 0.01284742, "auxiliary_loss_mlp": 0.00235941, "balance_loss_clip": 1.05257106, "balance_loss_mlp": 0.20988205, "epoch": 0.7442657447767924, "flos": 31285376242560.0, "grad_norm": 4.344385977652243, "language_loss": 0.79314446, "learning_rate": 6.475797721245648e-07, "loss": 0.80835128, "num_input_tokens_seen": 266997640, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.26049805, "step": 12379, "time_per_iteration": 2.768495798110962 }, { "auxiliary_loss_clip": 0.01288495, "auxiliary_loss_mlp": 0.00217756, "balance_loss_clip": 1.05734742, "balance_loss_mlp": 0.19364002, "epoch": 0.7443258680294604, "flos": 20807899401600.0, "grad_norm": 51.77284803597454, "language_loss": 0.72787005, "learning_rate": 6.472928779135085e-07, "loss": 0.74293256, "num_input_tokens_seen": 267016165, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.2409668, "step": 12380, "time_per_iteration": 2.667865753173828 }, { "auxiliary_loss_clip": 0.01301483, "auxiliary_loss_mlp": 0.00234768, "balance_loss_clip": 1.06299663, "balance_loss_mlp": 0.20657468, "epoch": 0.7443859912821283, "flos": 22199833290240.0, "grad_norm": 9.923404981933457, "language_loss": 0.88546801, "learning_rate": 6.470060349972411e-07, "loss": 0.90083051, "num_input_tokens_seen": 267034075, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.28198242, "step": 12381, "time_per_iteration": 2.627962589263916 }, { "auxiliary_loss_clip": 0.01309854, "auxiliary_loss_mlp": 0.00236682, "balance_loss_clip": 1.06616163, "balance_loss_mlp": 0.20806003, "epoch": 0.7444461145347964, "flos": 22017838055040.0, "grad_norm": 80.39967161813905, "language_loss": 0.81887794, "learning_rate": 6.467192433866411e-07, "loss": 0.83434325, "num_input_tokens_seen": 267053645, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.28625488, "step": 12382, "time_per_iteration": 2.693512201309204 }, { "auxiliary_loss_clip": 0.01193766, "auxiliary_loss_mlp": 0.00107046, "balance_loss_clip": 1.03070068, "balance_loss_mlp": 0.09922616, "epoch": 0.7445062377874643, "flos": 70559047704960.0, "grad_norm": 0.6450958150143737, "language_loss": 0.54216093, "learning_rate": 6.464325030925831e-07, "loss": 0.55516905, "num_input_tokens_seen": 267121830, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.078125, "step": 12383, "time_per_iteration": 3.310544967651367 }, { "auxiliary_loss_clip": 0.01293448, "auxiliary_loss_mlp": 0.00239641, "balance_loss_clip": 1.05959868, "balance_loss_mlp": 0.21291403, "epoch": 0.7445663610401323, "flos": 22164425458560.0, "grad_norm": 5.726818046322329, "language_loss": 0.86037654, "learning_rate": 6.461458141259395e-07, "loss": 0.87570745, "num_input_tokens_seen": 267141145, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.26733398, "step": 12384, "time_per_iteration": 4.026790380477905 }, { "auxiliary_loss_clip": 0.01264071, "auxiliary_loss_mlp": 0.00238835, "balance_loss_clip": 1.04010928, "balance_loss_mlp": 0.21437326, "epoch": 0.7446264842928002, "flos": 24170251714560.0, "grad_norm": 4.0005015649022955, "language_loss": 0.87229031, "learning_rate": 6.458591764975823e-07, "loss": 0.88731933, "num_input_tokens_seen": 267159280, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.24438477, "step": 12385, "time_per_iteration": 2.692669630050659 }, { "auxiliary_loss_clip": 0.01290889, "auxiliary_loss_mlp": 0.00254687, "balance_loss_clip": 1.06226802, "balance_loss_mlp": 0.22794867, "epoch": 0.7446866075454682, "flos": 24134556574080.0, "grad_norm": 19.81489543662182, "language_loss": 0.88985926, "learning_rate": 6.455725902183813e-07, "loss": 0.90531492, "num_input_tokens_seen": 267179390, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.26757812, "step": 12386, "time_per_iteration": 4.113672256469727 }, { "auxiliary_loss_clip": 0.01277621, "auxiliary_loss_mlp": 0.00243938, "balance_loss_clip": 1.05296683, "balance_loss_mlp": 0.21805757, "epoch": 0.7447467307981361, "flos": 23548063305600.0, "grad_norm": 5.92792226246144, "language_loss": 0.78860497, "learning_rate": 6.452860552992037e-07, "loss": 0.80382055, "num_input_tokens_seen": 267198165, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.25891113, "step": 12387, "time_per_iteration": 2.66705060005188 }, { "auxiliary_loss_clip": 0.0126685, "auxiliary_loss_mlp": 0.00232102, "balance_loss_clip": 1.04342568, "balance_loss_mlp": 0.20409974, "epoch": 0.7448068540508042, "flos": 19567832215680.0, "grad_norm": 742.1653866570434, "language_loss": 0.77712011, "learning_rate": 6.449995717509138e-07, "loss": 0.79210973, "num_input_tokens_seen": 267214520, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.27990723, "step": 12388, "time_per_iteration": 2.6976475715637207 }, { "auxiliary_loss_clip": 0.01293874, "auxiliary_loss_mlp": 0.00240577, "balance_loss_clip": 1.0617367, "balance_loss_mlp": 0.21416013, "epoch": 0.7448669773034721, "flos": 21839721488640.0, "grad_norm": 1.7014135736479443, "language_loss": 0.9127807, "learning_rate": 6.447131395843761e-07, "loss": 0.92812514, "num_input_tokens_seen": 267236555, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.26428223, "step": 12389, "time_per_iteration": 4.328755140304565 }, { "auxiliary_loss_clip": 0.01302026, "auxiliary_loss_mlp": 0.00250697, "balance_loss_clip": 1.06577468, "balance_loss_mlp": 0.22165796, "epoch": 0.7449271005561401, "flos": 25155389099520.0, "grad_norm": 3.8722107796805805, "language_loss": 0.85906965, "learning_rate": 6.444267588104526e-07, "loss": 0.87459689, "num_input_tokens_seen": 267254800, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.29052734, "step": 12390, "time_per_iteration": 2.6958730220794678 }, { "auxiliary_loss_clip": 0.01278131, "auxiliary_loss_mlp": 0.00254236, "balance_loss_clip": 1.04984343, "balance_loss_mlp": 0.22796208, "epoch": 0.7449872238088081, "flos": 22273342473600.0, "grad_norm": 18.855089381267018, "language_loss": 0.92224497, "learning_rate": 6.441404294400014e-07, "loss": 0.93756866, "num_input_tokens_seen": 267274610, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.26306152, "step": 12391, "time_per_iteration": 2.883460760116577 }, { "auxiliary_loss_clip": 0.01288717, "auxiliary_loss_mlp": 0.00242433, "balance_loss_clip": 1.05871189, "balance_loss_mlp": 0.21588509, "epoch": 0.745047347061476, "flos": 20594805966720.0, "grad_norm": 44.992353707124785, "language_loss": 0.80435711, "learning_rate": 6.438541514838811e-07, "loss": 0.81966865, "num_input_tokens_seen": 267292600, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.26574707, "step": 12392, "time_per_iteration": 2.7058165073394775 }, { "auxiliary_loss_clip": 0.01251459, "auxiliary_loss_mlp": 0.00223474, "balance_loss_clip": 1.03113675, "balance_loss_mlp": 0.19873855, "epoch": 0.745107470314144, "flos": 22127545169280.0, "grad_norm": 3.348419013263166, "language_loss": 0.83503616, "learning_rate": 6.435679249529487e-07, "loss": 0.84978551, "num_input_tokens_seen": 267311295, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.24731445, "step": 12393, "time_per_iteration": 4.093194484710693 }, { "auxiliary_loss_clip": 0.01297842, "auxiliary_loss_mlp": 0.00242586, "balance_loss_clip": 1.06504297, "balance_loss_mlp": 0.21594289, "epoch": 0.745167593566812, "flos": 22236498097920.0, "grad_norm": 2.262205135280922, "language_loss": 0.80751741, "learning_rate": 6.432817498580552e-07, "loss": 0.82292163, "num_input_tokens_seen": 267328390, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.26635742, "step": 12394, "time_per_iteration": 2.69937801361084 }, { "auxiliary_loss_clip": 0.01277901, "auxiliary_loss_mlp": 0.00246, "balance_loss_clip": 1.05307567, "balance_loss_mlp": 0.22141866, "epoch": 0.74522771681948, "flos": 20666232161280.0, "grad_norm": 240.3197244175798, "language_loss": 0.8965801, "learning_rate": 6.429956262100535e-07, "loss": 0.9118191, "num_input_tokens_seen": 267348185, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.24572754, "step": 12395, "time_per_iteration": 2.666973829269409 }, { "auxiliary_loss_clip": 0.01297445, "auxiliary_loss_mlp": 0.00225705, "balance_loss_clip": 1.06316793, "balance_loss_mlp": 0.19750038, "epoch": 0.7452878400721479, "flos": 21106999952640.0, "grad_norm": 3.5553123204209647, "language_loss": 0.81212288, "learning_rate": 6.427095540197937e-07, "loss": 0.82735443, "num_input_tokens_seen": 267367010, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.28234863, "step": 12396, "time_per_iteration": 2.721423387527466 }, { "auxiliary_loss_clip": 0.01292979, "auxiliary_loss_mlp": 0.00251085, "balance_loss_clip": 1.05974543, "balance_loss_mlp": 0.22369048, "epoch": 0.7453479633248159, "flos": 26688056474880.0, "grad_norm": 7.009803116813096, "language_loss": 0.76768494, "learning_rate": 6.424235332981245e-07, "loss": 0.78312564, "num_input_tokens_seen": 267386605, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.27392578, "step": 12397, "time_per_iteration": 2.7134506702423096 }, { "auxiliary_loss_clip": 0.01268432, "auxiliary_loss_mlp": 0.00248522, "balance_loss_clip": 1.04440212, "balance_loss_mlp": 0.22259434, "epoch": 0.7454080865774838, "flos": 17016056167680.0, "grad_norm": 9.552688690261743, "language_loss": 0.83439463, "learning_rate": 6.421375640558908e-07, "loss": 0.84956419, "num_input_tokens_seen": 267404135, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.25915527, "step": 12398, "time_per_iteration": 2.6651053428649902 }, { "auxiliary_loss_clip": 0.01260576, "auxiliary_loss_mlp": 0.00231011, "balance_loss_clip": 1.03970695, "balance_loss_mlp": 0.20642973, "epoch": 0.7454682098301518, "flos": 21323900229120.0, "grad_norm": 49.356317118422396, "language_loss": 0.84524423, "learning_rate": 6.418516463039363e-07, "loss": 0.86016011, "num_input_tokens_seen": 267423120, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24584961, "step": 12399, "time_per_iteration": 2.7186365127563477 }, { "auxiliary_loss_clip": 0.01268781, "auxiliary_loss_mlp": 0.00218168, "balance_loss_clip": 1.05316925, "balance_loss_mlp": 0.19423096, "epoch": 0.7455283330828197, "flos": 17858341163520.0, "grad_norm": 2276.7774202439846, "language_loss": 0.83101988, "learning_rate": 6.415657800531038e-07, "loss": 0.84588945, "num_input_tokens_seen": 267441250, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23937988, "step": 12400, "time_per_iteration": 2.64639949798584 }, { "auxiliary_loss_clip": 0.01290704, "auxiliary_loss_mlp": 0.00248645, "balance_loss_clip": 1.06010342, "balance_loss_mlp": 0.22254997, "epoch": 0.7455884563354878, "flos": 30774259664640.0, "grad_norm": 7.192174103342073, "language_loss": 0.90471488, "learning_rate": 6.412799653142327e-07, "loss": 0.92010838, "num_input_tokens_seen": 267462820, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.26098633, "step": 12401, "time_per_iteration": 2.753737211227417 }, { "auxiliary_loss_clip": 0.01274145, "auxiliary_loss_mlp": 0.002348, "balance_loss_clip": 1.04820061, "balance_loss_mlp": 0.20943213, "epoch": 0.7456485795881557, "flos": 23185545292800.0, "grad_norm": 13.452444529356061, "language_loss": 0.74011016, "learning_rate": 6.409942020981611e-07, "loss": 0.75519967, "num_input_tokens_seen": 267483065, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.25390625, "step": 12402, "time_per_iteration": 2.7102510929107666 }, { "auxiliary_loss_clip": 0.01271939, "auxiliary_loss_mlp": 0.00244476, "balance_loss_clip": 1.04390335, "balance_loss_mlp": 0.21909648, "epoch": 0.7457087028408237, "flos": 38727144074880.0, "grad_norm": 30.01593075038537, "language_loss": 0.78173125, "learning_rate": 6.407084904157265e-07, "loss": 0.79689538, "num_input_tokens_seen": 267504825, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.25402832, "step": 12403, "time_per_iteration": 2.867724657058716 }, { "auxiliary_loss_clip": 0.01190411, "auxiliary_loss_mlp": 0.00109701, "balance_loss_clip": 1.02783716, "balance_loss_mlp": 0.10097501, "epoch": 0.7457688260934917, "flos": 56043737337600.0, "grad_norm": 0.8292392905431775, "language_loss": 0.57982534, "learning_rate": 6.404228302777621e-07, "loss": 0.59282649, "num_input_tokens_seen": 267559260, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.08740234, "step": 12404, "time_per_iteration": 3.0095009803771973 }, { "auxiliary_loss_clip": 0.01273114, "auxiliary_loss_mlp": 0.00231975, "balance_loss_clip": 1.0476433, "balance_loss_mlp": 0.20673881, "epoch": 0.7458289493461596, "flos": 20116152305280.0, "grad_norm": 14.577041088594148, "language_loss": 0.82479846, "learning_rate": 6.401372216950995e-07, "loss": 0.83984941, "num_input_tokens_seen": 267578720, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.25268555, "step": 12405, "time_per_iteration": 2.700167655944824 }, { "auxiliary_loss_clip": 0.01285501, "auxiliary_loss_mlp": 0.00233246, "balance_loss_clip": 1.05791855, "balance_loss_mlp": 0.20740189, "epoch": 0.7458890725988276, "flos": 20193073280640.0, "grad_norm": 2.5200607139981797, "language_loss": 0.75775492, "learning_rate": 6.398516646785698e-07, "loss": 0.77294242, "num_input_tokens_seen": 267598250, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.25830078, "step": 12406, "time_per_iteration": 2.637152671813965 }, { "auxiliary_loss_clip": 0.01304577, "auxiliary_loss_mlp": 0.00263168, "balance_loss_clip": 1.06675076, "balance_loss_mlp": 0.2362625, "epoch": 0.7459491958514956, "flos": 17018749687680.0, "grad_norm": 56.100494575367264, "language_loss": 0.74431694, "learning_rate": 6.39566159239002e-07, "loss": 0.75999439, "num_input_tokens_seen": 267615430, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.26953125, "step": 12407, "time_per_iteration": 2.6780710220336914 }, { "auxiliary_loss_clip": 0.01297081, "auxiliary_loss_mlp": 0.00261815, "balance_loss_clip": 1.06239498, "balance_loss_mlp": 0.23307331, "epoch": 0.7460093191041636, "flos": 25078719519360.0, "grad_norm": 18.79105783653529, "language_loss": 0.80049407, "learning_rate": 6.392807053872212e-07, "loss": 0.81608301, "num_input_tokens_seen": 267635075, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.28771973, "step": 12408, "time_per_iteration": 2.680427312850952 }, { "auxiliary_loss_clip": 0.01311472, "auxiliary_loss_mlp": 0.00253502, "balance_loss_clip": 1.06838894, "balance_loss_mlp": 0.22399761, "epoch": 0.7460694423568315, "flos": 21908525990400.0, "grad_norm": 78.38220481612734, "language_loss": 0.81576025, "learning_rate": 6.38995303134053e-07, "loss": 0.83141005, "num_input_tokens_seen": 267654105, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.29492188, "step": 12409, "time_per_iteration": 2.741215705871582 }, { "auxiliary_loss_clip": 0.01267308, "auxiliary_loss_mlp": 0.00228251, "balance_loss_clip": 1.04708946, "balance_loss_mlp": 0.2032885, "epoch": 0.7461295656094995, "flos": 21215737399680.0, "grad_norm": 2.6778373324063804, "language_loss": 0.7350111, "learning_rate": 6.38709952490319e-07, "loss": 0.74996674, "num_input_tokens_seen": 267673090, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.24951172, "step": 12410, "time_per_iteration": 2.6356825828552246 }, { "auxiliary_loss_clip": 0.01281969, "auxiliary_loss_mlp": 0.00246556, "balance_loss_clip": 1.05749989, "balance_loss_mlp": 0.21820824, "epoch": 0.7461896888621674, "flos": 22346851656960.0, "grad_norm": 65.04907313168127, "language_loss": 0.92111272, "learning_rate": 6.384246534668396e-07, "loss": 0.93639803, "num_input_tokens_seen": 267690605, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.28356934, "step": 12411, "time_per_iteration": 2.7312209606170654 }, { "auxiliary_loss_clip": 0.01284468, "auxiliary_loss_mlp": 0.00254435, "balance_loss_clip": 1.05698776, "balance_loss_mlp": 0.22880477, "epoch": 0.7462498121148354, "flos": 25482930243840.0, "grad_norm": 11.199930551329091, "language_loss": 0.83679616, "learning_rate": 6.381394060744339e-07, "loss": 0.85218513, "num_input_tokens_seen": 267710540, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.2565918, "step": 12412, "time_per_iteration": 2.697864532470703 }, { "auxiliary_loss_clip": 0.01281042, "auxiliary_loss_mlp": 0.00245284, "balance_loss_clip": 1.05516672, "balance_loss_mlp": 0.21923685, "epoch": 0.7463099353675033, "flos": 33947936812800.0, "grad_norm": 6.391154669273732, "language_loss": 0.69768006, "learning_rate": 6.378542103239188e-07, "loss": 0.71294332, "num_input_tokens_seen": 267730780, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26049805, "step": 12413, "time_per_iteration": 2.78680157661438 }, { "auxiliary_loss_clip": 0.01187104, "auxiliary_loss_mlp": 0.0017087, "balance_loss_clip": 1.02489126, "balance_loss_mlp": 0.16138119, "epoch": 0.7463700586201714, "flos": 62767723691520.0, "grad_norm": 0.6955332882386145, "language_loss": 0.53928834, "learning_rate": 6.375690662261082e-07, "loss": 0.55286807, "num_input_tokens_seen": 267794240, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.09472656, "step": 12414, "time_per_iteration": 3.2042486667633057 }, { "auxiliary_loss_clip": 0.01289319, "auxiliary_loss_mlp": 0.0022769, "balance_loss_clip": 1.06502759, "balance_loss_mlp": 0.20121375, "epoch": 0.7464301818728393, "flos": 33432654257280.0, "grad_norm": 2.3321665912276783, "language_loss": 0.62606871, "learning_rate": 6.372839737918154e-07, "loss": 0.64123881, "num_input_tokens_seen": 267817190, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26464844, "step": 12415, "time_per_iteration": 2.8101730346679688 }, { "auxiliary_loss_clip": 0.01285883, "auxiliary_loss_mlp": 0.00257619, "balance_loss_clip": 1.05857885, "balance_loss_mlp": 0.23141706, "epoch": 0.7464903051255073, "flos": 26869872142080.0, "grad_norm": 11.028694911496736, "language_loss": 0.79957098, "learning_rate": 6.369989330318506e-07, "loss": 0.81500602, "num_input_tokens_seen": 267836245, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.26208496, "step": 12416, "time_per_iteration": 2.74772310256958 }, { "auxiliary_loss_clip": 0.01286547, "auxiliary_loss_mlp": 0.00250269, "balance_loss_clip": 1.05757499, "balance_loss_mlp": 0.22418566, "epoch": 0.7465504283781753, "flos": 44086954775040.0, "grad_norm": 8.254002445508346, "language_loss": 0.74568462, "learning_rate": 6.367139439570233e-07, "loss": 0.76105273, "num_input_tokens_seen": 267858310, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.2611084, "step": 12417, "time_per_iteration": 2.868565559387207 }, { "auxiliary_loss_clip": 0.01311202, "auxiliary_loss_mlp": 0.00263687, "balance_loss_clip": 1.0754962, "balance_loss_mlp": 0.23488617, "epoch": 0.7466105516308432, "flos": 19676102785920.0, "grad_norm": 95.52785367263712, "language_loss": 0.82160151, "learning_rate": 6.364290065781392e-07, "loss": 0.83735037, "num_input_tokens_seen": 267876345, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.28771973, "step": 12418, "time_per_iteration": 2.693126678466797 }, { "auxiliary_loss_clip": 0.01299607, "auxiliary_loss_mlp": 0.00243018, "balance_loss_clip": 1.06589246, "balance_loss_mlp": 0.2165532, "epoch": 0.7466706748835112, "flos": 20520722165760.0, "grad_norm": 6.0430711610527394, "language_loss": 0.75715959, "learning_rate": 6.361441209060039e-07, "loss": 0.77258581, "num_input_tokens_seen": 267896740, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.26428223, "step": 12419, "time_per_iteration": 2.713282585144043 }, { "auxiliary_loss_clip": 0.01296149, "auxiliary_loss_mlp": 0.00269517, "balance_loss_clip": 1.06967402, "balance_loss_mlp": 0.24131219, "epoch": 0.7467307981361792, "flos": 21690260997120.0, "grad_norm": 70.1823341488199, "language_loss": 0.80930758, "learning_rate": 6.358592869514216e-07, "loss": 0.82496428, "num_input_tokens_seen": 267914765, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.28222656, "step": 12420, "time_per_iteration": 2.7412707805633545 }, { "auxiliary_loss_clip": 0.01326187, "auxiliary_loss_mlp": 0.00245494, "balance_loss_clip": 1.08379841, "balance_loss_mlp": 0.21867147, "epoch": 0.7467909213888472, "flos": 19573686132480.0, "grad_norm": 37.431306091564174, "language_loss": 0.7650528, "learning_rate": 6.355745047251904e-07, "loss": 0.78076959, "num_input_tokens_seen": 267934085, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.26818848, "step": 12421, "time_per_iteration": 2.662130117416382 }, { "auxiliary_loss_clip": 0.01312654, "auxiliary_loss_mlp": 0.00254735, "balance_loss_clip": 1.07094622, "balance_loss_mlp": 0.22732881, "epoch": 0.7468510446415151, "flos": 23695225326720.0, "grad_norm": 6.26495430022639, "language_loss": 0.79937553, "learning_rate": 6.352897742381107e-07, "loss": 0.81504935, "num_input_tokens_seen": 267955170, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.27429199, "step": 12422, "time_per_iteration": 2.7077560424804688 }, { "auxiliary_loss_clip": 0.01282305, "auxiliary_loss_mlp": 0.00244064, "balance_loss_clip": 1.05937862, "balance_loss_mlp": 0.21936424, "epoch": 0.7469111678941831, "flos": 29315783831040.0, "grad_norm": 26.663280231138664, "language_loss": 0.8167643, "learning_rate": 6.350050955009796e-07, "loss": 0.83202803, "num_input_tokens_seen": 267974980, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.24682617, "step": 12423, "time_per_iteration": 2.7593326568603516 }, { "auxiliary_loss_clip": 0.0126573, "auxiliary_loss_mlp": 0.00237009, "balance_loss_clip": 1.05032659, "balance_loss_mlp": 0.21385877, "epoch": 0.746971291146851, "flos": 21798639308160.0, "grad_norm": 351.72251038160005, "language_loss": 0.73658907, "learning_rate": 6.347204685245929e-07, "loss": 0.75161648, "num_input_tokens_seen": 267994985, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23144531, "step": 12424, "time_per_iteration": 2.647552490234375 }, { "auxiliary_loss_clip": 0.01301523, "auxiliary_loss_mlp": 0.00225925, "balance_loss_clip": 1.06561923, "balance_loss_mlp": 0.20022357, "epoch": 0.747031414399519, "flos": 36245070368640.0, "grad_norm": 9.253019840254478, "language_loss": 0.83324164, "learning_rate": 6.344358933197418e-07, "loss": 0.84851611, "num_input_tokens_seen": 268014985, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.25695801, "step": 12425, "time_per_iteration": 2.8415040969848633 }, { "auxiliary_loss_clip": 0.01304204, "auxiliary_loss_mlp": 0.00244615, "balance_loss_clip": 1.0711937, "balance_loss_mlp": 0.21727985, "epoch": 0.7470915376521869, "flos": 19974916028160.0, "grad_norm": 5.311829231920295, "language_loss": 0.78847528, "learning_rate": 6.341513698972194e-07, "loss": 0.80396342, "num_input_tokens_seen": 268034395, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.27355957, "step": 12426, "time_per_iteration": 4.128875255584717 }, { "auxiliary_loss_clip": 0.01301592, "auxiliary_loss_mlp": 0.00268904, "balance_loss_clip": 1.07128525, "balance_loss_mlp": 0.24178348, "epoch": 0.747151660904855, "flos": 20084299920000.0, "grad_norm": 3.85979496344621, "language_loss": 0.70673186, "learning_rate": 6.338668982678139e-07, "loss": 0.72243679, "num_input_tokens_seen": 268054485, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.27111816, "step": 12427, "time_per_iteration": 2.671505928039551 }, { "auxiliary_loss_clip": 0.01283496, "auxiliary_loss_mlp": 0.00270577, "balance_loss_clip": 1.05641925, "balance_loss_mlp": 0.24269393, "epoch": 0.7472117841575229, "flos": 16290373697280.0, "grad_norm": 248.06693845235992, "language_loss": 0.81249487, "learning_rate": 6.335824784423118e-07, "loss": 0.82803571, "num_input_tokens_seen": 268072250, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.27868652, "step": 12428, "time_per_iteration": 4.04669976234436 }, { "auxiliary_loss_clip": 0.01309002, "auxiliary_loss_mlp": 0.002583, "balance_loss_clip": 1.06995523, "balance_loss_mlp": 0.23150128, "epoch": 0.7472719074101909, "flos": 21389939383680.0, "grad_norm": 96.669855394889, "language_loss": 0.67841893, "learning_rate": 6.33298110431499e-07, "loss": 0.69409192, "num_input_tokens_seen": 268089840, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.26782227, "step": 12429, "time_per_iteration": 2.65919828414917 }, { "auxiliary_loss_clip": 0.01318963, "auxiliary_loss_mlp": 0.00262488, "balance_loss_clip": 1.08271956, "balance_loss_mlp": 0.23502181, "epoch": 0.7473320306628589, "flos": 29643289061760.0, "grad_norm": 38.88807674918153, "language_loss": 0.70444399, "learning_rate": 6.330137942461595e-07, "loss": 0.72025847, "num_input_tokens_seen": 268109360, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.27478027, "step": 12430, "time_per_iteration": 2.7940196990966797 }, { "auxiliary_loss_clip": 0.01281924, "auxiliary_loss_mlp": 0.00229488, "balance_loss_clip": 1.05595946, "balance_loss_mlp": 0.20569423, "epoch": 0.7473921539155268, "flos": 24136100858880.0, "grad_norm": 2.904324735805579, "language_loss": 0.81321746, "learning_rate": 6.327295298970734e-07, "loss": 0.82833159, "num_input_tokens_seen": 268131840, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.23815918, "step": 12431, "time_per_iteration": 4.268065929412842 }, { "auxiliary_loss_clip": 0.01293511, "auxiliary_loss_mlp": 0.00225509, "balance_loss_clip": 1.06304407, "balance_loss_mlp": 0.19989048, "epoch": 0.7474522771681948, "flos": 17487958072320.0, "grad_norm": 17.012543359342867, "language_loss": 0.82843435, "learning_rate": 6.32445317395021e-07, "loss": 0.84362447, "num_input_tokens_seen": 268148300, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.25634766, "step": 12432, "time_per_iteration": 2.6053214073181152 }, { "auxiliary_loss_clip": 0.01316533, "auxiliary_loss_mlp": 0.00269219, "balance_loss_clip": 1.07738245, "balance_loss_mlp": 0.23982242, "epoch": 0.7475124004208628, "flos": 16727298733440.0, "grad_norm": 181.96907705238738, "language_loss": 0.82488036, "learning_rate": 6.321611567507787e-07, "loss": 0.84073794, "num_input_tokens_seen": 268166450, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.29394531, "step": 12433, "time_per_iteration": 2.7276611328125 }, { "auxiliary_loss_clip": 0.01295057, "auxiliary_loss_mlp": 0.00256894, "balance_loss_clip": 1.06436563, "balance_loss_mlp": 0.2306318, "epoch": 0.7475725236735308, "flos": 19720237622400.0, "grad_norm": 6.192264673162611, "language_loss": 0.75323588, "learning_rate": 6.318770479751232e-07, "loss": 0.76875538, "num_input_tokens_seen": 268186165, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.26257324, "step": 12434, "time_per_iteration": 2.67352557182312 }, { "auxiliary_loss_clip": 0.01279982, "auxiliary_loss_mlp": 0.0026715, "balance_loss_clip": 1.05288982, "balance_loss_mlp": 0.2413051, "epoch": 0.7476326469261987, "flos": 26286000566400.0, "grad_norm": 29.164364973577765, "language_loss": 0.8484478, "learning_rate": 6.315929910788263e-07, "loss": 0.86391914, "num_input_tokens_seen": 268208145, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.25842285, "step": 12435, "time_per_iteration": 4.334566354751587 }, { "auxiliary_loss_clip": 0.01299365, "auxiliary_loss_mlp": 0.00245918, "balance_loss_clip": 1.05941975, "balance_loss_mlp": 0.21911931, "epoch": 0.7476927701788667, "flos": 31831828824960.0, "grad_norm": 6.025619875154452, "language_loss": 0.76502311, "learning_rate": 6.313089860726604e-07, "loss": 0.78047597, "num_input_tokens_seen": 268228345, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.26831055, "step": 12436, "time_per_iteration": 2.846079111099243 }, { "auxiliary_loss_clip": 0.01286189, "auxiliary_loss_mlp": 0.00261829, "balance_loss_clip": 1.05283809, "balance_loss_mlp": 0.2342554, "epoch": 0.7477528934315346, "flos": 31795487239680.0, "grad_norm": 13.71546663338658, "language_loss": 0.77624583, "learning_rate": 6.31025032967396e-07, "loss": 0.79172593, "num_input_tokens_seen": 268250260, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.27575684, "step": 12437, "time_per_iteration": 2.8230979442596436 }, { "auxiliary_loss_clip": 0.01289427, "auxiliary_loss_mlp": 0.00251839, "balance_loss_clip": 1.06222093, "balance_loss_mlp": 0.22542182, "epoch": 0.7478130166842026, "flos": 20371979946240.0, "grad_norm": 20.905735110851772, "language_loss": 0.73955882, "learning_rate": 6.307411317737986e-07, "loss": 0.7549715, "num_input_tokens_seen": 268268440, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.2644043, "step": 12438, "time_per_iteration": 2.7023043632507324 }, { "auxiliary_loss_clip": 0.01280721, "auxiliary_loss_mlp": 0.00246922, "balance_loss_clip": 1.05633533, "balance_loss_mlp": 0.22108956, "epoch": 0.7478731399368705, "flos": 18148930191360.0, "grad_norm": 19.469690781636253, "language_loss": 0.86717588, "learning_rate": 6.304572825026344e-07, "loss": 0.88245237, "num_input_tokens_seen": 268285765, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.25866699, "step": 12439, "time_per_iteration": 2.657804250717163 }, { "auxiliary_loss_clip": 0.01278915, "auxiliary_loss_mlp": 0.0025599, "balance_loss_clip": 1.05518389, "balance_loss_mlp": 0.22954978, "epoch": 0.7479332631895386, "flos": 15267889146240.0, "grad_norm": 30.335116169125083, "language_loss": 0.80764031, "learning_rate": 6.301734851646674e-07, "loss": 0.82298934, "num_input_tokens_seen": 268304015, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26464844, "step": 12440, "time_per_iteration": 2.691300630569458 }, { "auxiliary_loss_clip": 0.01284818, "auxiliary_loss_mlp": 0.00240947, "balance_loss_clip": 1.05764151, "balance_loss_mlp": 0.21581808, "epoch": 0.7479933864422065, "flos": 21142515525120.0, "grad_norm": 3.4695347437616553, "language_loss": 0.80233204, "learning_rate": 6.298897397706597e-07, "loss": 0.8175897, "num_input_tokens_seen": 268323290, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.25134277, "step": 12441, "time_per_iteration": 2.702354669570923 }, { "auxiliary_loss_clip": 0.01289677, "auxiliary_loss_mlp": 0.00233658, "balance_loss_clip": 1.05759811, "balance_loss_mlp": 0.20552427, "epoch": 0.7480535096948745, "flos": 14392027912320.0, "grad_norm": 61.08820436121606, "language_loss": 0.92358434, "learning_rate": 6.296060463313698e-07, "loss": 0.93881768, "num_input_tokens_seen": 268339490, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.28137207, "step": 12442, "time_per_iteration": 2.715532064437866 }, { "auxiliary_loss_clip": 0.01323655, "auxiliary_loss_mlp": 0.00251724, "balance_loss_clip": 1.0865407, "balance_loss_mlp": 0.22218373, "epoch": 0.7481136329475425, "flos": 27344683048320.0, "grad_norm": 633.9063510640644, "language_loss": 0.73650146, "learning_rate": 6.293224048575565e-07, "loss": 0.7522552, "num_input_tokens_seen": 268359865, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.29541016, "step": 12443, "time_per_iteration": 2.761531352996826 }, { "auxiliary_loss_clip": 0.01277485, "auxiliary_loss_mlp": 0.00245653, "balance_loss_clip": 1.05500722, "balance_loss_mlp": 0.22076166, "epoch": 0.7481737562002104, "flos": 19531454716800.0, "grad_norm": 4.229804734088964, "language_loss": 0.79228228, "learning_rate": 6.29038815359975e-07, "loss": 0.80751365, "num_input_tokens_seen": 268377065, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.24865723, "step": 12444, "time_per_iteration": 2.616154193878174 }, { "auxiliary_loss_clip": 0.01301418, "auxiliary_loss_mlp": 0.00260726, "balance_loss_clip": 1.06554246, "balance_loss_mlp": 0.2323541, "epoch": 0.7482338794528784, "flos": 21760035166080.0, "grad_norm": 2.582532633697953, "language_loss": 0.75342238, "learning_rate": 6.287552778493786e-07, "loss": 0.76904392, "num_input_tokens_seen": 268396935, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.28369141, "step": 12445, "time_per_iteration": 2.6954331398010254 }, { "auxiliary_loss_clip": 0.01292614, "auxiliary_loss_mlp": 0.00255013, "balance_loss_clip": 1.06419539, "balance_loss_mlp": 0.22828668, "epoch": 0.7482940027055464, "flos": 18697358021760.0, "grad_norm": 3.970613056152344, "language_loss": 0.82249463, "learning_rate": 6.28471792336519e-07, "loss": 0.83797091, "num_input_tokens_seen": 268414460, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.26721191, "step": 12446, "time_per_iteration": 2.675278425216675 }, { "auxiliary_loss_clip": 0.01307459, "auxiliary_loss_mlp": 0.00246838, "balance_loss_clip": 1.06894207, "balance_loss_mlp": 0.21747719, "epoch": 0.7483541259582144, "flos": 15998024903040.0, "grad_norm": 3.0063082973114192, "language_loss": 0.81684119, "learning_rate": 6.281883588321475e-07, "loss": 0.83238411, "num_input_tokens_seen": 268432225, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.29382324, "step": 12447, "time_per_iteration": 2.6692981719970703 }, { "auxiliary_loss_clip": 0.0129974, "auxiliary_loss_mlp": 0.00252928, "balance_loss_clip": 1.06376588, "balance_loss_mlp": 0.22565269, "epoch": 0.7484142492108823, "flos": 25556295772800.0, "grad_norm": 21.10796898332828, "language_loss": 0.85289109, "learning_rate": 6.279049773470109e-07, "loss": 0.8684178, "num_input_tokens_seen": 268449270, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.27294922, "step": 12448, "time_per_iteration": 2.6656105518341064 }, { "auxiliary_loss_clip": 0.01291148, "auxiliary_loss_mlp": 0.00231799, "balance_loss_clip": 1.05845881, "balance_loss_mlp": 0.20608565, "epoch": 0.7484743724635503, "flos": 22887737631360.0, "grad_norm": 5.308758201220384, "language_loss": 0.82126027, "learning_rate": 6.276216478918543e-07, "loss": 0.83648974, "num_input_tokens_seen": 268467250, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.25671387, "step": 12449, "time_per_iteration": 2.697805404663086 }, { "auxiliary_loss_clip": 0.01318559, "auxiliary_loss_mlp": 0.00284347, "balance_loss_clip": 1.07292449, "balance_loss_mlp": 0.2567139, "epoch": 0.7485344957162182, "flos": 25300288563840.0, "grad_norm": 26.40440656248313, "language_loss": 0.69458467, "learning_rate": 6.273383704774225e-07, "loss": 0.71061373, "num_input_tokens_seen": 268487270, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.27636719, "step": 12450, "time_per_iteration": 2.6614151000976562 }, { "auxiliary_loss_clip": 0.01286867, "auxiliary_loss_mlp": 0.00234283, "balance_loss_clip": 1.06566, "balance_loss_mlp": 0.20870087, "epoch": 0.7485946189688862, "flos": 27053016612480.0, "grad_norm": 4.3254156307020875, "language_loss": 0.78348982, "learning_rate": 6.270551451144577e-07, "loss": 0.79870129, "num_input_tokens_seen": 268508020, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.25598145, "step": 12451, "time_per_iteration": 2.784985303878784 }, { "auxiliary_loss_clip": 0.01330058, "auxiliary_loss_mlp": 0.00271919, "balance_loss_clip": 1.08226073, "balance_loss_mlp": 0.24296349, "epoch": 0.7486547422215541, "flos": 26906752431360.0, "grad_norm": 7.8391319733960625, "language_loss": 0.8972227, "learning_rate": 6.267719718136988e-07, "loss": 0.91324246, "num_input_tokens_seen": 268527375, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.28967285, "step": 12452, "time_per_iteration": 2.7311184406280518 }, { "auxiliary_loss_clip": 0.01352617, "auxiliary_loss_mlp": 0.00269555, "balance_loss_clip": 1.10264874, "balance_loss_mlp": 0.24015787, "epoch": 0.7487148654742222, "flos": 22346277039360.0, "grad_norm": 4.224820433888867, "language_loss": 0.79306298, "learning_rate": 6.264888505858843e-07, "loss": 0.80928469, "num_input_tokens_seen": 268544870, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.29345703, "step": 12453, "time_per_iteration": 2.7425343990325928 }, { "auxiliary_loss_clip": 0.01302447, "auxiliary_loss_mlp": 0.00236681, "balance_loss_clip": 1.06788874, "balance_loss_mlp": 0.20978743, "epoch": 0.7487749887268901, "flos": 23038814234880.0, "grad_norm": 4.872183837831684, "language_loss": 0.81901705, "learning_rate": 6.262057814417517e-07, "loss": 0.8344084, "num_input_tokens_seen": 268564580, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.26928711, "step": 12454, "time_per_iteration": 2.694247245788574 }, { "auxiliary_loss_clip": 0.01202878, "auxiliary_loss_mlp": 0.00123985, "balance_loss_clip": 1.03798032, "balance_loss_mlp": 0.11511607, "epoch": 0.7488351119795581, "flos": 71525294536320.0, "grad_norm": 0.7183918615209325, "language_loss": 0.58715725, "learning_rate": 6.259227643920322e-07, "loss": 0.60042584, "num_input_tokens_seen": 268629550, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.08886719, "step": 12455, "time_per_iteration": 3.423179864883423 }, { "auxiliary_loss_clip": 0.01280562, "auxiliary_loss_mlp": 0.00228411, "balance_loss_clip": 1.05695462, "balance_loss_mlp": 0.20281699, "epoch": 0.748895235232226, "flos": 17196255722880.0, "grad_norm": 71.8676819485333, "language_loss": 0.88212538, "learning_rate": 6.256397994474592e-07, "loss": 0.89721513, "num_input_tokens_seen": 268646645, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.25634766, "step": 12456, "time_per_iteration": 2.6498279571533203 }, { "auxiliary_loss_clip": 0.0119982, "auxiliary_loss_mlp": 0.001679, "balance_loss_clip": 1.03432882, "balance_loss_mlp": 0.15774377, "epoch": 0.748955358484894, "flos": 58979256336000.0, "grad_norm": 0.8906802383869041, "language_loss": 0.60946584, "learning_rate": 6.25356886618763e-07, "loss": 0.62314302, "num_input_tokens_seen": 268702275, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.1015625, "step": 12457, "time_per_iteration": 3.0612950325012207 }, { "auxiliary_loss_clip": 0.01307413, "auxiliary_loss_mlp": 0.00236567, "balance_loss_clip": 1.07175362, "balance_loss_mlp": 0.21144973, "epoch": 0.749015481737562, "flos": 11360413054080.0, "grad_norm": 242.61831801093996, "language_loss": 0.78290343, "learning_rate": 6.250740259166711e-07, "loss": 0.79834318, "num_input_tokens_seen": 268716265, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.25134277, "step": 12458, "time_per_iteration": 2.6288180351257324 }, { "auxiliary_loss_clip": 0.01284408, "auxiliary_loss_mlp": 0.00247584, "balance_loss_clip": 1.06072474, "balance_loss_mlp": 0.22194239, "epoch": 0.74907560499023, "flos": 21106497162240.0, "grad_norm": 19.929779847057716, "language_loss": 0.83523142, "learning_rate": 6.247912173519106e-07, "loss": 0.85055137, "num_input_tokens_seen": 268734330, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.2565918, "step": 12459, "time_per_iteration": 2.6447603702545166 }, { "auxiliary_loss_clip": 0.0129858, "auxiliary_loss_mlp": 0.00240002, "balance_loss_clip": 1.06886172, "balance_loss_mlp": 0.21295339, "epoch": 0.749135728242898, "flos": 22268027260800.0, "grad_norm": 3.973942140528668, "language_loss": 0.87779355, "learning_rate": 6.245084609352043e-07, "loss": 0.89317942, "num_input_tokens_seen": 268753500, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.27062988, "step": 12460, "time_per_iteration": 2.6817595958709717 }, { "auxiliary_loss_clip": 0.01293962, "auxiliary_loss_mlp": 0.0025254, "balance_loss_clip": 1.06412017, "balance_loss_mlp": 0.22594473, "epoch": 0.7491958514955659, "flos": 24057527857920.0, "grad_norm": 29.63831961050431, "language_loss": 0.93207854, "learning_rate": 6.242257566772755e-07, "loss": 0.9475435, "num_input_tokens_seen": 268772055, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.26611328, "step": 12461, "time_per_iteration": 2.6822867393493652 }, { "auxiliary_loss_clip": 0.01281368, "auxiliary_loss_mlp": 0.00224036, "balance_loss_clip": 1.06000102, "balance_loss_mlp": 0.20018236, "epoch": 0.7492559747482339, "flos": 24492118510080.0, "grad_norm": 24.70839756217448, "language_loss": 0.77877223, "learning_rate": 6.239431045888435e-07, "loss": 0.79382634, "num_input_tokens_seen": 268792265, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.2388916, "step": 12462, "time_per_iteration": 2.6966402530670166 }, { "auxiliary_loss_clip": 0.01301773, "auxiliary_loss_mlp": 0.00272502, "balance_loss_clip": 1.06982982, "balance_loss_mlp": 0.24535772, "epoch": 0.7493160980009018, "flos": 27745338326400.0, "grad_norm": 15.424165432874085, "language_loss": 0.79411924, "learning_rate": 6.236605046806267e-07, "loss": 0.80986202, "num_input_tokens_seen": 268812735, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.27111816, "step": 12463, "time_per_iteration": 2.789400100708008 }, { "auxiliary_loss_clip": 0.01275622, "auxiliary_loss_mlp": 0.00243021, "balance_loss_clip": 1.05285597, "balance_loss_mlp": 0.21838091, "epoch": 0.7493762212535698, "flos": 30226190970240.0, "grad_norm": 12.00355858197271, "language_loss": 0.84720933, "learning_rate": 6.233779569633419e-07, "loss": 0.86239576, "num_input_tokens_seen": 268833090, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.24645996, "step": 12464, "time_per_iteration": 2.80342435836792 }, { "auxiliary_loss_clip": 0.01283159, "auxiliary_loss_mlp": 0.00253574, "balance_loss_clip": 1.05407906, "balance_loss_mlp": 0.22756258, "epoch": 0.7494363445062378, "flos": 21944472526080.0, "grad_norm": 290.1565301506593, "language_loss": 0.86069226, "learning_rate": 6.230954614477034e-07, "loss": 0.87605959, "num_input_tokens_seen": 268851880, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.26062012, "step": 12465, "time_per_iteration": 2.722250461578369 }, { "auxiliary_loss_clip": 0.01336415, "auxiliary_loss_mlp": 0.00261003, "balance_loss_clip": 1.08316112, "balance_loss_mlp": 0.23105793, "epoch": 0.7494964677589058, "flos": 12490342162560.0, "grad_norm": 4.999096187530889, "language_loss": 0.85676384, "learning_rate": 6.22813018144422e-07, "loss": 0.87273806, "num_input_tokens_seen": 268867910, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.29931641, "step": 12466, "time_per_iteration": 2.6257405281066895 }, { "auxiliary_loss_clip": 0.01318774, "auxiliary_loss_mlp": 0.00269332, "balance_loss_clip": 1.08019865, "balance_loss_mlp": 0.24166401, "epoch": 0.7495565910115737, "flos": 21653057485440.0, "grad_norm": 37.22856354617648, "language_loss": 0.74850595, "learning_rate": 6.22530627064209e-07, "loss": 0.76438701, "num_input_tokens_seen": 268887260, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.27648926, "step": 12467, "time_per_iteration": 2.643465757369995 }, { "auxiliary_loss_clip": 0.0131683, "auxiliary_loss_mlp": 0.00248706, "balance_loss_clip": 1.07432556, "balance_loss_mlp": 0.22021481, "epoch": 0.7496167142642417, "flos": 15268535591040.0, "grad_norm": 14.814807135000269, "language_loss": 0.86569273, "learning_rate": 6.222482882177735e-07, "loss": 0.88134813, "num_input_tokens_seen": 268902520, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.28503418, "step": 12468, "time_per_iteration": 4.113052845001221 }, { "auxiliary_loss_clip": 0.01299763, "auxiliary_loss_mlp": 0.00229603, "balance_loss_clip": 1.06468976, "balance_loss_mlp": 0.20305556, "epoch": 0.7496768375169096, "flos": 22054933825920.0, "grad_norm": 11.517291407864052, "language_loss": 0.78028494, "learning_rate": 6.219660016158201e-07, "loss": 0.7955786, "num_input_tokens_seen": 268920970, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.26501465, "step": 12469, "time_per_iteration": 2.6290242671966553 }, { "auxiliary_loss_clip": 0.01313349, "auxiliary_loss_mlp": 0.00235918, "balance_loss_clip": 1.07408953, "balance_loss_mlp": 0.20798752, "epoch": 0.7497369607695776, "flos": 19057038860160.0, "grad_norm": 6.490601707817686, "language_loss": 0.77652812, "learning_rate": 6.216837672690543e-07, "loss": 0.7920208, "num_input_tokens_seen": 268936600, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.2791748, "step": 12470, "time_per_iteration": 4.075212001800537 }, { "auxiliary_loss_clip": 0.01305011, "auxiliary_loss_mlp": 0.00269618, "balance_loss_clip": 1.06188774, "balance_loss_mlp": 0.23914862, "epoch": 0.7497970840222457, "flos": 21617434172160.0, "grad_norm": 13.297279286671179, "language_loss": 0.85294437, "learning_rate": 6.214015851881793e-07, "loss": 0.86869067, "num_input_tokens_seen": 268956560, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.30419922, "step": 12471, "time_per_iteration": 2.6477580070495605 }, { "auxiliary_loss_clip": 0.01288424, "auxiliary_loss_mlp": 0.002364, "balance_loss_clip": 1.05461848, "balance_loss_mlp": 0.2089697, "epoch": 0.7498572072749136, "flos": 13735580906880.0, "grad_norm": 13.17358645911383, "language_loss": 0.91231173, "learning_rate": 6.211194553838929e-07, "loss": 0.92755997, "num_input_tokens_seen": 268973945, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.27441406, "step": 12472, "time_per_iteration": 2.6355648040771484 }, { "auxiliary_loss_clip": 0.012938, "auxiliary_loss_mlp": 0.00244721, "balance_loss_clip": 1.06477666, "balance_loss_mlp": 0.21886453, "epoch": 0.7499173305275816, "flos": 22966526113920.0, "grad_norm": 9.70357782466278, "language_loss": 0.91755533, "learning_rate": 6.208373778668951e-07, "loss": 0.9329406, "num_input_tokens_seen": 268993245, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.25891113, "step": 12473, "time_per_iteration": 2.65549635887146 }, { "auxiliary_loss_clip": 0.01322755, "auxiliary_loss_mlp": 0.0026459, "balance_loss_clip": 1.07973015, "balance_loss_mlp": 0.2375415, "epoch": 0.7499774537802495, "flos": 22740467869440.0, "grad_norm": 5.020482024584094, "language_loss": 0.82620728, "learning_rate": 6.205553526478829e-07, "loss": 0.84208071, "num_input_tokens_seen": 269012125, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.27026367, "step": 12474, "time_per_iteration": 4.17205286026001 }, { "auxiliary_loss_clip": 0.0131679, "auxiliary_loss_mlp": 0.00285978, "balance_loss_clip": 1.07670116, "balance_loss_mlp": 0.25767735, "epoch": 0.7500375770329175, "flos": 18296559089280.0, "grad_norm": 14.7656479144248, "language_loss": 0.84062934, "learning_rate": 6.202733797375492e-07, "loss": 0.85665703, "num_input_tokens_seen": 269030545, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.28308105, "step": 12475, "time_per_iteration": 2.7308402061462402 }, { "auxiliary_loss_clip": 0.01307308, "auxiliary_loss_mlp": 0.00266274, "balance_loss_clip": 1.06817961, "balance_loss_mlp": 0.23697272, "epoch": 0.7500977002855854, "flos": 19169978198400.0, "grad_norm": 4.001296191613778, "language_loss": 0.90004545, "learning_rate": 6.199914591465878e-07, "loss": 0.91578126, "num_input_tokens_seen": 269048180, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.29296875, "step": 12476, "time_per_iteration": 2.7059619426727295 }, { "auxiliary_loss_clip": 0.01306546, "auxiliary_loss_mlp": 0.00247041, "balance_loss_clip": 1.06416786, "balance_loss_mlp": 0.22025496, "epoch": 0.7501578235382534, "flos": 22163886754560.0, "grad_norm": 6.048106329043422, "language_loss": 0.84372127, "learning_rate": 6.19709590885688e-07, "loss": 0.8592571, "num_input_tokens_seen": 269068600, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.26794434, "step": 12477, "time_per_iteration": 2.7879600524902344 }, { "auxiliary_loss_clip": 0.01205541, "auxiliary_loss_mlp": 0.00154205, "balance_loss_clip": 1.04353869, "balance_loss_mlp": 0.14347588, "epoch": 0.7502179467909214, "flos": 64465040033280.0, "grad_norm": 0.8004231533549595, "language_loss": 0.53527057, "learning_rate": 6.194277749655394e-07, "loss": 0.54886806, "num_input_tokens_seen": 269119045, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.10742188, "step": 12478, "time_per_iteration": 4.65759015083313 }, { "auxiliary_loss_clip": 0.01277932, "auxiliary_loss_mlp": 0.00223087, "balance_loss_clip": 1.05358434, "balance_loss_mlp": 0.19889933, "epoch": 0.7502780700435894, "flos": 20478275268480.0, "grad_norm": 2.868545559724577, "language_loss": 0.88888717, "learning_rate": 6.191460113968272e-07, "loss": 0.9038974, "num_input_tokens_seen": 269136755, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.24157715, "step": 12479, "time_per_iteration": 2.6429803371429443 }, { "auxiliary_loss_clip": 0.01301853, "auxiliary_loss_mlp": 0.00286126, "balance_loss_clip": 1.06565082, "balance_loss_mlp": 0.25632399, "epoch": 0.7503381932962573, "flos": 20445273648000.0, "grad_norm": 2.811533641580032, "language_loss": 0.74383867, "learning_rate": 6.188643001902369e-07, "loss": 0.75971854, "num_input_tokens_seen": 269156120, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.2980957, "step": 12480, "time_per_iteration": 2.672666072845459 }, { "auxiliary_loss_clip": 0.01268096, "auxiliary_loss_mlp": 0.00238407, "balance_loss_clip": 1.04691315, "balance_loss_mlp": 0.21338516, "epoch": 0.7503983165489253, "flos": 22381936266240.0, "grad_norm": 7.870035776576065, "language_loss": 0.84729689, "learning_rate": 6.185826413564512e-07, "loss": 0.86236191, "num_input_tokens_seen": 269175650, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25024414, "step": 12481, "time_per_iteration": 2.6653196811676025 }, { "auxiliary_loss_clip": 0.0129337, "auxiliary_loss_mlp": 0.00261026, "balance_loss_clip": 1.05579889, "balance_loss_mlp": 0.23257121, "epoch": 0.7504584398015932, "flos": 24899453717760.0, "grad_norm": 288.5599406885729, "language_loss": 0.79852653, "learning_rate": 6.183010349061501e-07, "loss": 0.81407046, "num_input_tokens_seen": 269197080, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.28417969, "step": 12482, "time_per_iteration": 2.8244144916534424 }, { "auxiliary_loss_clip": 0.01284348, "auxiliary_loss_mlp": 0.00296765, "balance_loss_clip": 1.05648255, "balance_loss_mlp": 0.26859558, "epoch": 0.7505185630542612, "flos": 25885237547520.0, "grad_norm": 24.31309298517661, "language_loss": 0.78583848, "learning_rate": 6.180194808500118e-07, "loss": 0.80164957, "num_input_tokens_seen": 269218600, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.28186035, "step": 12483, "time_per_iteration": 2.769561290740967 }, { "auxiliary_loss_clip": 0.01282325, "auxiliary_loss_mlp": 0.00271098, "balance_loss_clip": 1.054829, "balance_loss_mlp": 0.24508618, "epoch": 0.7505786863069293, "flos": 23143852581120.0, "grad_norm": 6.132587859533722, "language_loss": 0.8102119, "learning_rate": 6.177379791987131e-07, "loss": 0.82574606, "num_input_tokens_seen": 269239245, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.26000977, "step": 12484, "time_per_iteration": 2.7096052169799805 }, { "auxiliary_loss_clip": 0.01286151, "auxiliary_loss_mlp": 0.00274137, "balance_loss_clip": 1.05648732, "balance_loss_mlp": 0.2474346, "epoch": 0.7506388095595972, "flos": 16983377769600.0, "grad_norm": 60.29314029086098, "language_loss": 0.9203254, "learning_rate": 6.174565299629295e-07, "loss": 0.93592823, "num_input_tokens_seen": 269258520, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.26708984, "step": 12485, "time_per_iteration": 2.6579833030700684 }, { "auxiliary_loss_clip": 0.01270257, "auxiliary_loss_mlp": 0.00235856, "balance_loss_clip": 1.05176163, "balance_loss_mlp": 0.211513, "epoch": 0.7506989328122652, "flos": 22344984149760.0, "grad_norm": 27.486481467458674, "language_loss": 0.84489346, "learning_rate": 6.171751331533323e-07, "loss": 0.8599546, "num_input_tokens_seen": 269278320, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24353027, "step": 12486, "time_per_iteration": 2.650078535079956 }, { "auxiliary_loss_clip": 0.01293108, "auxiliary_loss_mlp": 0.00275052, "balance_loss_clip": 1.06411994, "balance_loss_mlp": 0.24774104, "epoch": 0.7507590560649331, "flos": 25776069137280.0, "grad_norm": 3.47378740253378, "language_loss": 0.80091572, "learning_rate": 6.168937887805932e-07, "loss": 0.81659722, "num_input_tokens_seen": 269298025, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.27282715, "step": 12487, "time_per_iteration": 2.7029314041137695 }, { "auxiliary_loss_clip": 0.01298912, "auxiliary_loss_mlp": 0.00252273, "balance_loss_clip": 1.06399584, "balance_loss_mlp": 0.22585654, "epoch": 0.7508191793176011, "flos": 24279420124800.0, "grad_norm": 27.92391169644297, "language_loss": 0.77902055, "learning_rate": 6.166124968553801e-07, "loss": 0.79453236, "num_input_tokens_seen": 269316770, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.26428223, "step": 12488, "time_per_iteration": 2.672670841217041 }, { "auxiliary_loss_clip": 0.01291467, "auxiliary_loss_mlp": 0.00274003, "balance_loss_clip": 1.06560755, "balance_loss_mlp": 0.24665666, "epoch": 0.750879302570269, "flos": 19899575251200.0, "grad_norm": 726.1232372969641, "language_loss": 0.83207226, "learning_rate": 6.163312573883592e-07, "loss": 0.84772694, "num_input_tokens_seen": 269334755, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.2734375, "step": 12489, "time_per_iteration": 2.7408132553100586 }, { "auxiliary_loss_clip": 0.01291517, "auxiliary_loss_mlp": 0.00286601, "balance_loss_clip": 1.06298804, "balance_loss_mlp": 0.2585634, "epoch": 0.750939425822937, "flos": 29205681667200.0, "grad_norm": 18.042870867786547, "language_loss": 0.8295821, "learning_rate": 6.160500703901956e-07, "loss": 0.84536326, "num_input_tokens_seen": 269353810, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.28027344, "step": 12490, "time_per_iteration": 2.7784836292266846 }, { "auxiliary_loss_clip": 0.01305439, "auxiliary_loss_mlp": 0.00267094, "balance_loss_clip": 1.06732559, "balance_loss_mlp": 0.23989066, "epoch": 0.750999549075605, "flos": 21142300043520.0, "grad_norm": 55.00373735662941, "language_loss": 0.86044168, "learning_rate": 6.157689358715527e-07, "loss": 0.876167, "num_input_tokens_seen": 269372910, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.27233887, "step": 12491, "time_per_iteration": 2.725717782974243 }, { "auxiliary_loss_clip": 0.01288395, "auxiliary_loss_mlp": 0.00258993, "balance_loss_clip": 1.05843759, "balance_loss_mlp": 0.23275481, "epoch": 0.751059672328273, "flos": 23547740083200.0, "grad_norm": 79.15258017327297, "language_loss": 0.83069456, "learning_rate": 6.154878538430899e-07, "loss": 0.84616846, "num_input_tokens_seen": 269391545, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.26245117, "step": 12492, "time_per_iteration": 2.653517007827759 }, { "auxiliary_loss_clip": 0.01289564, "auxiliary_loss_mlp": 0.00267695, "balance_loss_clip": 1.05643845, "balance_loss_mlp": 0.2414809, "epoch": 0.7511197955809409, "flos": 18989742729600.0, "grad_norm": 311.69382688040923, "language_loss": 0.78475136, "learning_rate": 6.152068243154671e-07, "loss": 0.80032396, "num_input_tokens_seen": 269408530, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.26245117, "step": 12493, "time_per_iteration": 2.655421495437622 }, { "auxiliary_loss_clip": 0.01279525, "auxiliary_loss_mlp": 0.00258024, "balance_loss_clip": 1.04832113, "balance_loss_mlp": 0.23047486, "epoch": 0.7511799188336089, "flos": 22046961006720.0, "grad_norm": 6.877721721534211, "language_loss": 0.87289226, "learning_rate": 6.149258472993395e-07, "loss": 0.8882677, "num_input_tokens_seen": 269425930, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.27526855, "step": 12494, "time_per_iteration": 2.6517417430877686 }, { "auxiliary_loss_clip": 0.0129887, "auxiliary_loss_mlp": 0.00259407, "balance_loss_clip": 1.06207478, "balance_loss_mlp": 0.23163159, "epoch": 0.7512400420862768, "flos": 16467125546880.0, "grad_norm": 43.441595010049866, "language_loss": 0.85959351, "learning_rate": 6.146449228053634e-07, "loss": 0.87517631, "num_input_tokens_seen": 269443945, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.27746582, "step": 12495, "time_per_iteration": 2.660835027694702 }, { "auxiliary_loss_clip": 0.01290041, "auxiliary_loss_mlp": 0.00260641, "balance_loss_clip": 1.05773044, "balance_loss_mlp": 0.23416433, "epoch": 0.7513001653389448, "flos": 20448326304000.0, "grad_norm": 11.343752149526738, "language_loss": 0.77690554, "learning_rate": 6.143640508441898e-07, "loss": 0.7924124, "num_input_tokens_seen": 269463625, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.26513672, "step": 12496, "time_per_iteration": 2.746290445327759 }, { "auxiliary_loss_clip": 0.01293642, "auxiliary_loss_mlp": 0.00291954, "balance_loss_clip": 1.05971646, "balance_loss_mlp": 0.26391554, "epoch": 0.7513602885916129, "flos": 23476816679040.0, "grad_norm": 6.647271950431287, "language_loss": 0.83953071, "learning_rate": 6.140832314264705e-07, "loss": 0.85538673, "num_input_tokens_seen": 269483415, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.28063965, "step": 12497, "time_per_iteration": 2.670529842376709 }, { "auxiliary_loss_clip": 0.01278589, "auxiliary_loss_mlp": 0.0027212, "balance_loss_clip": 1.04744637, "balance_loss_mlp": 0.24463038, "epoch": 0.7514204118442808, "flos": 26797224885120.0, "grad_norm": 94.4727330028092, "language_loss": 0.83975422, "learning_rate": 6.13802464562855e-07, "loss": 0.85526133, "num_input_tokens_seen": 269504635, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.27502441, "step": 12498, "time_per_iteration": 2.739048480987549 }, { "auxiliary_loss_clip": 0.01276354, "auxiliary_loss_mlp": 0.00278369, "balance_loss_clip": 1.05811501, "balance_loss_mlp": 0.25344205, "epoch": 0.7514805350969488, "flos": 19865639877120.0, "grad_norm": 31.528500084232327, "language_loss": 0.81650496, "learning_rate": 6.135217502639878e-07, "loss": 0.83205223, "num_input_tokens_seen": 269523955, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24914551, "step": 12499, "time_per_iteration": 2.673887014389038 }, { "auxiliary_loss_clip": 0.01272023, "auxiliary_loss_mlp": 0.00247632, "balance_loss_clip": 1.05097985, "balance_loss_mlp": 0.22392094, "epoch": 0.7515406583496167, "flos": 24571553437440.0, "grad_norm": 3.1076282040512657, "language_loss": 0.86736447, "learning_rate": 6.132410885405148e-07, "loss": 0.88256097, "num_input_tokens_seen": 269544410, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.23706055, "step": 12500, "time_per_iteration": 2.740064859390259 }, { "auxiliary_loss_clip": 0.01329509, "auxiliary_loss_mlp": 0.00281974, "balance_loss_clip": 1.08002019, "balance_loss_mlp": 0.25291044, "epoch": 0.7516007816022847, "flos": 20120246455680.0, "grad_norm": 8.565206812812418, "language_loss": 0.82216161, "learning_rate": 6.129604794030794e-07, "loss": 0.83827645, "num_input_tokens_seen": 269563315, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.29064941, "step": 12501, "time_per_iteration": 2.686832904815674 }, { "auxiliary_loss_clip": 0.01271144, "auxiliary_loss_mlp": 0.00291392, "balance_loss_clip": 1.04486442, "balance_loss_mlp": 0.26498717, "epoch": 0.7516609048549526, "flos": 22784638619520.0, "grad_norm": 7.29073159647736, "language_loss": 0.85006958, "learning_rate": 6.126799228623207e-07, "loss": 0.86569494, "num_input_tokens_seen": 269583950, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.26403809, "step": 12502, "time_per_iteration": 2.7141616344451904 }, { "auxiliary_loss_clip": 0.0131533, "auxiliary_loss_mlp": 0.00309306, "balance_loss_clip": 1.0738889, "balance_loss_mlp": 0.2800279, "epoch": 0.7517210281076206, "flos": 10634012311680.0, "grad_norm": 41.083480787449474, "language_loss": 0.81287336, "learning_rate": 6.123994189288786e-07, "loss": 0.82911974, "num_input_tokens_seen": 269600120, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.29321289, "step": 12503, "time_per_iteration": 2.6630327701568604 }, { "auxiliary_loss_clip": 0.01208753, "auxiliary_loss_mlp": 0.00132461, "balance_loss_clip": 1.05116463, "balance_loss_mlp": 0.12158871, "epoch": 0.7517811513602886, "flos": 66052221275520.0, "grad_norm": 1.0450516033114592, "language_loss": 0.63403547, "learning_rate": 6.121189676133903e-07, "loss": 0.64744759, "num_input_tokens_seen": 269659815, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.10888672, "step": 12504, "time_per_iteration": 3.122180938720703 }, { "auxiliary_loss_clip": 0.01258409, "auxiliary_loss_mlp": 0.00251675, "balance_loss_clip": 1.03749323, "balance_loss_mlp": 0.22660543, "epoch": 0.7518412746129566, "flos": 37268345018880.0, "grad_norm": 2.223151359437166, "language_loss": 0.7471661, "learning_rate": 6.118385689264896e-07, "loss": 0.76226693, "num_input_tokens_seen": 269684565, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25097656, "step": 12505, "time_per_iteration": 2.9461798667907715 }, { "auxiliary_loss_clip": 0.01201916, "auxiliary_loss_mlp": 0.00159854, "balance_loss_clip": 1.04451919, "balance_loss_mlp": 0.14869569, "epoch": 0.7519013978656245, "flos": 60518567727360.0, "grad_norm": 0.6312058656466595, "language_loss": 0.54449302, "learning_rate": 6.11558222878809e-07, "loss": 0.55811071, "num_input_tokens_seen": 269752325, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.11181641, "step": 12506, "time_per_iteration": 3.262106418609619 }, { "auxiliary_loss_clip": 0.01321564, "auxiliary_loss_mlp": 0.00274025, "balance_loss_clip": 1.07706225, "balance_loss_mlp": 0.24487823, "epoch": 0.7519615211182925, "flos": 18806885568000.0, "grad_norm": 7.161409393359197, "language_loss": 0.87816656, "learning_rate": 6.112779294809796e-07, "loss": 0.89412242, "num_input_tokens_seen": 269770630, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.29138184, "step": 12507, "time_per_iteration": 2.687849998474121 }, { "auxiliary_loss_clip": 0.01256848, "auxiliary_loss_mlp": 0.00274981, "balance_loss_clip": 1.04216886, "balance_loss_mlp": 0.24976851, "epoch": 0.7520216443709604, "flos": 14575244209920.0, "grad_norm": 8.834018747233726, "language_loss": 0.77965659, "learning_rate": 6.10997688743631e-07, "loss": 0.79497486, "num_input_tokens_seen": 269787280, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.25231934, "step": 12508, "time_per_iteration": 2.6248879432678223 }, { "auxiliary_loss_clip": 0.01280049, "auxiliary_loss_mlp": 0.00273786, "balance_loss_clip": 1.05550373, "balance_loss_mlp": 0.24746479, "epoch": 0.7520817676236284, "flos": 17056599644160.0, "grad_norm": 2.1860823527440867, "language_loss": 0.78590477, "learning_rate": 6.107175006773885e-07, "loss": 0.8014431, "num_input_tokens_seen": 269805205, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26318359, "step": 12509, "time_per_iteration": 2.679819345474243 }, { "auxiliary_loss_clip": 0.01296876, "auxiliary_loss_mlp": 0.00284961, "balance_loss_clip": 1.0597986, "balance_loss_mlp": 0.25525451, "epoch": 0.7521418908762965, "flos": 25666397936640.0, "grad_norm": 5.43598499479025, "language_loss": 0.7035917, "learning_rate": 6.104373652928785e-07, "loss": 0.71941012, "num_input_tokens_seen": 269824820, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.29736328, "step": 12510, "time_per_iteration": 2.6889264583587646 }, { "auxiliary_loss_clip": 0.01281272, "auxiliary_loss_mlp": 0.00273775, "balance_loss_clip": 1.05777895, "balance_loss_mlp": 0.2468572, "epoch": 0.7522020141289644, "flos": 20886759711360.0, "grad_norm": 74.46407791765485, "language_loss": 0.88450778, "learning_rate": 6.10157282600722e-07, "loss": 0.90005827, "num_input_tokens_seen": 269842825, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26940918, "step": 12511, "time_per_iteration": 4.0594775676727295 }, { "auxiliary_loss_clip": 0.01285776, "auxiliary_loss_mlp": 0.00279549, "balance_loss_clip": 1.05185652, "balance_loss_mlp": 0.25235745, "epoch": 0.7522621373816324, "flos": 12640305444480.0, "grad_norm": 27.423544250743607, "language_loss": 0.84609419, "learning_rate": 6.098772526115412e-07, "loss": 0.86174744, "num_input_tokens_seen": 269859000, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.27172852, "step": 12512, "time_per_iteration": 4.028371572494507 }, { "auxiliary_loss_clip": 0.01261673, "auxiliary_loss_mlp": 0.00263273, "balance_loss_clip": 1.04122293, "balance_loss_mlp": 0.23761943, "epoch": 0.7523222606343003, "flos": 25626141768960.0, "grad_norm": 497.15509450032374, "language_loss": 0.88279641, "learning_rate": 6.095972753359537e-07, "loss": 0.8980459, "num_input_tokens_seen": 269878895, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25683594, "step": 12513, "time_per_iteration": 2.6903774738311768 }, { "auxiliary_loss_clip": 0.01289522, "auxiliary_loss_mlp": 0.00249272, "balance_loss_clip": 1.0565114, "balance_loss_mlp": 0.21895757, "epoch": 0.7523823838869683, "flos": 20448900921600.0, "grad_norm": 10.297269541552067, "language_loss": 0.82456231, "learning_rate": 6.093173507845771e-07, "loss": 0.8399502, "num_input_tokens_seen": 269897280, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.30285645, "step": 12514, "time_per_iteration": 2.6166131496429443 }, { "auxiliary_loss_clip": 0.01260069, "auxiliary_loss_mlp": 0.00252455, "balance_loss_clip": 1.04254842, "balance_loss_mlp": 0.22763579, "epoch": 0.7524425071396362, "flos": 14720610551040.0, "grad_norm": 368.2859388430732, "language_loss": 0.76657128, "learning_rate": 6.090374789680271e-07, "loss": 0.7816965, "num_input_tokens_seen": 269914640, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24829102, "step": 12515, "time_per_iteration": 2.6741838455200195 }, { "auxiliary_loss_clip": 0.01287164, "auxiliary_loss_mlp": 0.00262893, "balance_loss_clip": 1.05463862, "balance_loss_mlp": 0.23702507, "epoch": 0.7525026303923043, "flos": 30592048947840.0, "grad_norm": 11.022183281724217, "language_loss": 0.801332, "learning_rate": 6.087576598969137e-07, "loss": 0.81683254, "num_input_tokens_seen": 269934960, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.25854492, "step": 12516, "time_per_iteration": 4.233945369720459 }, { "auxiliary_loss_clip": 0.01270501, "auxiliary_loss_mlp": 0.00263315, "balance_loss_clip": 1.05038011, "balance_loss_mlp": 0.23588482, "epoch": 0.7525627536449722, "flos": 24791757765120.0, "grad_norm": 1.6634787136932223, "language_loss": 0.94560826, "learning_rate": 6.084778935818495e-07, "loss": 0.96094638, "num_input_tokens_seen": 269956655, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.27404785, "step": 12517, "time_per_iteration": 2.7058098316192627 }, { "auxiliary_loss_clip": 0.01300128, "auxiliary_loss_mlp": 0.00245428, "balance_loss_clip": 1.06806004, "balance_loss_mlp": 0.2192972, "epoch": 0.7526228768976402, "flos": 20779782030720.0, "grad_norm": 438.0698607567295, "language_loss": 0.82059324, "learning_rate": 6.081981800334437e-07, "loss": 0.83604884, "num_input_tokens_seen": 269976835, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.26123047, "step": 12518, "time_per_iteration": 2.722987651824951 }, { "auxiliary_loss_clip": 0.01189494, "auxiliary_loss_mlp": 0.00105926, "balance_loss_clip": 1.03650677, "balance_loss_mlp": 0.09724726, "epoch": 0.7526830001503081, "flos": 66559243703040.0, "grad_norm": 0.6869482806994321, "language_loss": 0.54946476, "learning_rate": 6.079185192623017e-07, "loss": 0.56241894, "num_input_tokens_seen": 270040630, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.08691406, "step": 12519, "time_per_iteration": 3.253899097442627 }, { "auxiliary_loss_clip": 0.01277343, "auxiliary_loss_mlp": 0.00295608, "balance_loss_clip": 1.0510025, "balance_loss_mlp": 0.2677722, "epoch": 0.7527431234029761, "flos": 23477894087040.0, "grad_norm": 14.68673891562467, "language_loss": 0.8293404, "learning_rate": 6.07638911279029e-07, "loss": 0.84506989, "num_input_tokens_seen": 270059695, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.27832031, "step": 12520, "time_per_iteration": 4.172483682632446 }, { "auxiliary_loss_clip": 0.01261482, "auxiliary_loss_mlp": 0.00246523, "balance_loss_clip": 1.0380336, "balance_loss_mlp": 0.21955773, "epoch": 0.752803246655644, "flos": 22049546785920.0, "grad_norm": 13.618211179454727, "language_loss": 0.8071208, "learning_rate": 6.07359356094229e-07, "loss": 0.82220083, "num_input_tokens_seen": 270078420, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.27001953, "step": 12521, "time_per_iteration": 2.7003703117370605 }, { "auxiliary_loss_clip": 0.01305562, "auxiliary_loss_mlp": 0.0027563, "balance_loss_clip": 1.06695807, "balance_loss_mlp": 0.24643555, "epoch": 0.752863369908312, "flos": 30153795108480.0, "grad_norm": 2.843850777901777, "language_loss": 0.75286829, "learning_rate": 6.070798537185016e-07, "loss": 0.76868021, "num_input_tokens_seen": 270097040, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.29187012, "step": 12522, "time_per_iteration": 2.7394142150878906 }, { "auxiliary_loss_clip": 0.01292765, "auxiliary_loss_mlp": 0.0026829, "balance_loss_clip": 1.05979729, "balance_loss_mlp": 0.24095546, "epoch": 0.7529234931609801, "flos": 24567638855040.0, "grad_norm": 13.137864832565525, "language_loss": 0.84102857, "learning_rate": 6.068004041624453e-07, "loss": 0.85663915, "num_input_tokens_seen": 270116365, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.2734375, "step": 12523, "time_per_iteration": 2.7307329177856445 }, { "auxiliary_loss_clip": 0.01270207, "auxiliary_loss_mlp": 0.00275655, "balance_loss_clip": 1.04512155, "balance_loss_mlp": 0.24946511, "epoch": 0.752983616413648, "flos": 23112395245440.0, "grad_norm": 76.4800201302663, "language_loss": 0.87523299, "learning_rate": 6.065210074366571e-07, "loss": 0.89069158, "num_input_tokens_seen": 270135395, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.26208496, "step": 12524, "time_per_iteration": 2.6739490032196045 }, { "auxiliary_loss_clip": 0.01267706, "auxiliary_loss_mlp": 0.00270431, "balance_loss_clip": 1.04447746, "balance_loss_mlp": 0.2429176, "epoch": 0.753043739666316, "flos": 24316946858880.0, "grad_norm": 3.5212036375808347, "language_loss": 0.80610561, "learning_rate": 6.062416635517326e-07, "loss": 0.82148695, "num_input_tokens_seen": 270156425, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.27526855, "step": 12525, "time_per_iteration": 2.724648952484131 }, { "auxiliary_loss_clip": 0.0129138, "auxiliary_loss_mlp": 0.00269133, "balance_loss_clip": 1.0628463, "balance_loss_mlp": 0.24265708, "epoch": 0.7531038629189839, "flos": 24243294021120.0, "grad_norm": 5.9211812250633855, "language_loss": 0.82918763, "learning_rate": 6.059623725182641e-07, "loss": 0.84479272, "num_input_tokens_seen": 270176905, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.26477051, "step": 12526, "time_per_iteration": 2.692064046859741 }, { "auxiliary_loss_clip": 0.01264706, "auxiliary_loss_mlp": 0.00233559, "balance_loss_clip": 1.03973317, "balance_loss_mlp": 0.20668897, "epoch": 0.7531639861716519, "flos": 30188807890560.0, "grad_norm": 22078.638196319032, "language_loss": 0.80052811, "learning_rate": 6.056831343468414e-07, "loss": 0.81551075, "num_input_tokens_seen": 270196640, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.2689209, "step": 12527, "time_per_iteration": 2.783945083618164 }, { "auxiliary_loss_clip": 0.01262449, "auxiliary_loss_mlp": 0.00258853, "balance_loss_clip": 1.04267716, "balance_loss_mlp": 0.23352104, "epoch": 0.7532241094243198, "flos": 18223193560320.0, "grad_norm": 8.776421866891916, "language_loss": 0.89242077, "learning_rate": 6.054039490480539e-07, "loss": 0.90763384, "num_input_tokens_seen": 270213905, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.2532959, "step": 12528, "time_per_iteration": 2.7970943450927734 }, { "auxiliary_loss_clip": 0.01298147, "auxiliary_loss_mlp": 0.00242171, "balance_loss_clip": 1.06018329, "balance_loss_mlp": 0.21524188, "epoch": 0.7532842326769879, "flos": 20881049448960.0, "grad_norm": 135.24776129383724, "language_loss": 0.92432958, "learning_rate": 6.051248166324892e-07, "loss": 0.93973279, "num_input_tokens_seen": 270231995, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.26940918, "step": 12529, "time_per_iteration": 2.6813876628875732 }, { "auxiliary_loss_clip": 0.01306175, "auxiliary_loss_mlp": 0.00238553, "balance_loss_clip": 1.06689787, "balance_loss_mlp": 0.20939425, "epoch": 0.7533443559296558, "flos": 18078689145600.0, "grad_norm": 8.903961227739362, "language_loss": 0.86469889, "learning_rate": 6.048457371107303e-07, "loss": 0.88014615, "num_input_tokens_seen": 270251480, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.29174805, "step": 12530, "time_per_iteration": 2.638803243637085 }, { "auxiliary_loss_clip": 0.01174664, "auxiliary_loss_mlp": 0.00093409, "balance_loss_clip": 1.02113891, "balance_loss_mlp": 0.08382466, "epoch": 0.7534044791823238, "flos": 50254830766080.0, "grad_norm": 1.2527930130117408, "language_loss": 0.63276398, "learning_rate": 6.045667104933612e-07, "loss": 0.64544475, "num_input_tokens_seen": 270306480, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.09570312, "step": 12531, "time_per_iteration": 3.053140163421631 }, { "auxiliary_loss_clip": 0.01299357, "auxiliary_loss_mlp": 0.00239048, "balance_loss_clip": 1.06436253, "balance_loss_mlp": 0.21267864, "epoch": 0.7534646024349917, "flos": 20850274471680.0, "grad_norm": 61.40977783520651, "language_loss": 0.80663502, "learning_rate": 6.042877367909633e-07, "loss": 0.8220191, "num_input_tokens_seen": 270324595, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.2635498, "step": 12532, "time_per_iteration": 2.660615921020508 }, { "auxiliary_loss_clip": 0.01253328, "auxiliary_loss_mlp": 0.00255399, "balance_loss_clip": 1.03461874, "balance_loss_mlp": 0.23083034, "epoch": 0.7535247256876597, "flos": 23071779941760.0, "grad_norm": 89.48599880176604, "language_loss": 0.82729959, "learning_rate": 6.040088160141132e-07, "loss": 0.84238684, "num_input_tokens_seen": 270344375, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24572754, "step": 12533, "time_per_iteration": 2.645151376724243 }, { "auxiliary_loss_clip": 0.011794, "auxiliary_loss_mlp": 0.00144149, "balance_loss_clip": 1.02438033, "balance_loss_mlp": 0.13308619, "epoch": 0.7535848489403276, "flos": 58623418252800.0, "grad_norm": 52.961888828116145, "language_loss": 0.56878179, "learning_rate": 6.037299481733886e-07, "loss": 0.58201724, "num_input_tokens_seen": 270405235, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.11083984, "step": 12534, "time_per_iteration": 3.174124002456665 }, { "auxiliary_loss_clip": 0.01275219, "auxiliary_loss_mlp": 0.00265573, "balance_loss_clip": 1.04858744, "balance_loss_mlp": 0.23858392, "epoch": 0.7536449721929956, "flos": 26577882483840.0, "grad_norm": 14.209548779578869, "language_loss": 0.78306687, "learning_rate": 6.03451133279365e-07, "loss": 0.79847479, "num_input_tokens_seen": 270425820, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.27026367, "step": 12535, "time_per_iteration": 2.6726419925689697 }, { "auxiliary_loss_clip": 0.01293014, "auxiliary_loss_mlp": 0.0026946, "balance_loss_clip": 1.0580883, "balance_loss_mlp": 0.24109977, "epoch": 0.7537050954456637, "flos": 25735992537600.0, "grad_norm": 2.508772958061081, "language_loss": 0.87550139, "learning_rate": 6.031723713426135e-07, "loss": 0.8911261, "num_input_tokens_seen": 270447120, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.2833252, "step": 12536, "time_per_iteration": 2.780121088027954 }, { "auxiliary_loss_clip": 0.01300985, "auxiliary_loss_mlp": 0.00300677, "balance_loss_clip": 1.06703949, "balance_loss_mlp": 0.27101731, "epoch": 0.7537652186983316, "flos": 30224431203840.0, "grad_norm": 17.30369915258467, "language_loss": 0.8193472, "learning_rate": 6.028936623737067e-07, "loss": 0.83536386, "num_input_tokens_seen": 270468680, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.29675293, "step": 12537, "time_per_iteration": 2.735922336578369 }, { "auxiliary_loss_clip": 0.0129252, "auxiliary_loss_mlp": 0.00285615, "balance_loss_clip": 1.06010818, "balance_loss_mlp": 0.25661093, "epoch": 0.7538253419509996, "flos": 12641239198080.0, "grad_norm": 39.5538844123437, "language_loss": 0.82553971, "learning_rate": 6.026150063832111e-07, "loss": 0.84132099, "num_input_tokens_seen": 270486310, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.29003906, "step": 12538, "time_per_iteration": 2.644432306289673 }, { "auxiliary_loss_clip": 0.0129547, "auxiliary_loss_mlp": 0.00261515, "balance_loss_clip": 1.06125176, "balance_loss_mlp": 0.23253497, "epoch": 0.7538854652036675, "flos": 23185976256000.0, "grad_norm": 4.6002542754149, "language_loss": 0.75051856, "learning_rate": 6.023364033816956e-07, "loss": 0.76608843, "num_input_tokens_seen": 270507210, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.29016113, "step": 12539, "time_per_iteration": 2.6887168884277344 }, { "auxiliary_loss_clip": 0.01260781, "auxiliary_loss_mlp": 0.00260483, "balance_loss_clip": 1.0433532, "balance_loss_mlp": 0.23518707, "epoch": 0.7539455884563355, "flos": 23186227651200.0, "grad_norm": 52.010915548921965, "language_loss": 0.82014805, "learning_rate": 6.020578533797229e-07, "loss": 0.83536065, "num_input_tokens_seen": 270525250, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25292969, "step": 12540, "time_per_iteration": 2.729484796524048 }, { "auxiliary_loss_clip": 0.01282964, "auxiliary_loss_mlp": 0.00247675, "balance_loss_clip": 1.05145824, "balance_loss_mlp": 0.21865904, "epoch": 0.7540057117090034, "flos": 13181155505280.0, "grad_norm": 56.20360401116509, "language_loss": 0.83856404, "learning_rate": 6.017793563878566e-07, "loss": 0.85387045, "num_input_tokens_seen": 270539295, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.29052734, "step": 12541, "time_per_iteration": 2.6377341747283936 }, { "auxiliary_loss_clip": 0.01262733, "auxiliary_loss_mlp": 0.00259378, "balance_loss_clip": 1.03940952, "balance_loss_mlp": 0.23308066, "epoch": 0.7540658349616715, "flos": 45478134478080.0, "grad_norm": 3.6197868505049327, "language_loss": 0.80541992, "learning_rate": 6.015009124166576e-07, "loss": 0.82064098, "num_input_tokens_seen": 270562815, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26306152, "step": 12542, "time_per_iteration": 2.924018383026123 }, { "auxiliary_loss_clip": 0.01285554, "auxiliary_loss_mlp": 0.00284719, "balance_loss_clip": 1.05707562, "balance_loss_mlp": 0.25830221, "epoch": 0.7541259582143394, "flos": 19930817105280.0, "grad_norm": 18.167900916337516, "language_loss": 0.91760945, "learning_rate": 6.012225214766844e-07, "loss": 0.93331218, "num_input_tokens_seen": 270579055, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.26391602, "step": 12543, "time_per_iteration": 2.633511781692505 }, { "auxiliary_loss_clip": 0.01290186, "auxiliary_loss_mlp": 0.00260785, "balance_loss_clip": 1.06228173, "balance_loss_mlp": 0.23575103, "epoch": 0.7541860814670074, "flos": 27198239299200.0, "grad_norm": 3.4645240274852496, "language_loss": 0.82237589, "learning_rate": 6.009441835784927e-07, "loss": 0.83788568, "num_input_tokens_seen": 270599080, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.25036621, "step": 12544, "time_per_iteration": 2.701096296310425 }, { "auxiliary_loss_clip": 0.0125423, "auxiliary_loss_mlp": 0.00262276, "balance_loss_clip": 1.03582418, "balance_loss_mlp": 0.23559725, "epoch": 0.7542462047196753, "flos": 21324151624320.0, "grad_norm": 7.108094156774898, "language_loss": 0.76123655, "learning_rate": 6.006658987326383e-07, "loss": 0.77640164, "num_input_tokens_seen": 270618715, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.26672363, "step": 12545, "time_per_iteration": 2.7560884952545166 }, { "auxiliary_loss_clip": 0.01278962, "auxiliary_loss_mlp": 0.00253756, "balance_loss_clip": 1.05050683, "balance_loss_mlp": 0.2282453, "epoch": 0.7543063279723433, "flos": 11940944664960.0, "grad_norm": 27.469529816908917, "language_loss": 0.78644276, "learning_rate": 6.003876669496728e-07, "loss": 0.80176985, "num_input_tokens_seen": 270635695, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.25524902, "step": 12546, "time_per_iteration": 2.688429117202759 }, { "auxiliary_loss_clip": 0.01288387, "auxiliary_loss_mlp": 0.00265212, "balance_loss_clip": 1.06065989, "balance_loss_mlp": 0.23763841, "epoch": 0.7543664512250112, "flos": 22819974624000.0, "grad_norm": 12.208186040597882, "language_loss": 0.83310521, "learning_rate": 6.00109488240147e-07, "loss": 0.84864116, "num_input_tokens_seen": 270654325, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.27587891, "step": 12547, "time_per_iteration": 2.7195005416870117 }, { "auxiliary_loss_clip": 0.01277849, "auxiliary_loss_mlp": 0.00279761, "balance_loss_clip": 1.04689026, "balance_loss_mlp": 0.25060257, "epoch": 0.7544265744776792, "flos": 20923855482240.0, "grad_norm": 22.016145584227246, "language_loss": 0.76250511, "learning_rate": 5.998313626146099e-07, "loss": 0.77808118, "num_input_tokens_seen": 270674260, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.29162598, "step": 12548, "time_per_iteration": 2.703922748565674 }, { "auxiliary_loss_clip": 0.01263001, "auxiliary_loss_mlp": 0.00253722, "balance_loss_clip": 1.04117882, "balance_loss_mlp": 0.22883126, "epoch": 0.7544866977303473, "flos": 15195493284480.0, "grad_norm": 14.19015353989516, "language_loss": 0.94910872, "learning_rate": 5.995532900836088e-07, "loss": 0.9642759, "num_input_tokens_seen": 270692200, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24902344, "step": 12549, "time_per_iteration": 2.662792921066284 }, { "auxiliary_loss_clip": 0.01258432, "auxiliary_loss_mlp": 0.00291913, "balance_loss_clip": 1.03951645, "balance_loss_mlp": 0.26616424, "epoch": 0.7545468209830152, "flos": 27083683848960.0, "grad_norm": 10.842840776974802, "language_loss": 0.83559668, "learning_rate": 5.992752706576865e-07, "loss": 0.85110009, "num_input_tokens_seen": 270709675, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.25744629, "step": 12550, "time_per_iteration": 2.6908328533172607 }, { "auxiliary_loss_clip": 0.01280867, "auxiliary_loss_mlp": 0.00265079, "balance_loss_clip": 1.05433667, "balance_loss_mlp": 0.24074849, "epoch": 0.7546069442356832, "flos": 26871703735680.0, "grad_norm": 2.927849313669267, "language_loss": 0.74807966, "learning_rate": 5.98997304347386e-07, "loss": 0.76353908, "num_input_tokens_seen": 270733055, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.24328613, "step": 12551, "time_per_iteration": 2.7266247272491455 }, { "auxiliary_loss_clip": 0.01267254, "auxiliary_loss_mlp": 0.00279397, "balance_loss_clip": 1.04925203, "balance_loss_mlp": 0.25394559, "epoch": 0.7546670674883511, "flos": 15743131015680.0, "grad_norm": 4.09233882679482, "language_loss": 0.93833673, "learning_rate": 5.987193911632487e-07, "loss": 0.9538033, "num_input_tokens_seen": 270749275, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25463867, "step": 12552, "time_per_iteration": 2.6632258892059326 }, { "auxiliary_loss_clip": 0.0127739, "auxiliary_loss_mlp": 0.00273798, "balance_loss_clip": 1.05166101, "balance_loss_mlp": 0.24446087, "epoch": 0.7547271907410191, "flos": 23477714519040.0, "grad_norm": 7.322244541135616, "language_loss": 0.8736338, "learning_rate": 5.98441531115812e-07, "loss": 0.88914573, "num_input_tokens_seen": 270768230, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.29333496, "step": 12553, "time_per_iteration": 4.13367772102356 }, { "auxiliary_loss_clip": 0.01278944, "auxiliary_loss_mlp": 0.00280686, "balance_loss_clip": 1.05279756, "balance_loss_mlp": 0.25291035, "epoch": 0.754787313993687, "flos": 31722804069120.0, "grad_norm": 8.549356686428661, "language_loss": 0.71913821, "learning_rate": 5.981637242156135e-07, "loss": 0.73473454, "num_input_tokens_seen": 270786285, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.2779541, "step": 12554, "time_per_iteration": 4.186604022979736 }, { "auxiliary_loss_clip": 0.01268286, "auxiliary_loss_mlp": 0.00249743, "balance_loss_clip": 1.04278708, "balance_loss_mlp": 0.22313583, "epoch": 0.7548474372463551, "flos": 27563055782400.0, "grad_norm": 7.407070774084945, "language_loss": 0.79990304, "learning_rate": 5.978859704731864e-07, "loss": 0.81508338, "num_input_tokens_seen": 270805505, "router_z_loss_clip": 2.25097656, "router_z_loss_mlp": 0.26599121, "step": 12555, "time_per_iteration": 2.7251288890838623 }, { "auxiliary_loss_clip": 0.01294224, "auxiliary_loss_mlp": 0.00303189, "balance_loss_clip": 1.06109297, "balance_loss_mlp": 0.27479362, "epoch": 0.754907560499023, "flos": 19318576763520.0, "grad_norm": 6.53011522799755, "language_loss": 0.86766225, "learning_rate": 5.976082698990645e-07, "loss": 0.88363642, "num_input_tokens_seen": 270824610, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.28405762, "step": 12556, "time_per_iteration": 2.6453497409820557 }, { "auxiliary_loss_clip": 0.0113702, "auxiliary_loss_mlp": 0.00119119, "balance_loss_clip": 0.98849726, "balance_loss_mlp": 0.10939135, "epoch": 0.754967683751691, "flos": 69744628684800.0, "grad_norm": 0.6889693573164366, "language_loss": 0.49941641, "learning_rate": 5.973306225037769e-07, "loss": 0.51197779, "num_input_tokens_seen": 270886155, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.09716797, "step": 12557, "time_per_iteration": 3.1473546028137207 }, { "auxiliary_loss_clip": 0.01271145, "auxiliary_loss_mlp": 0.00276877, "balance_loss_clip": 1.04742861, "balance_loss_mlp": 0.25011468, "epoch": 0.7550278070043589, "flos": 24421913377920.0, "grad_norm": 100.52683122455679, "language_loss": 0.78067625, "learning_rate": 5.970530282978525e-07, "loss": 0.79615647, "num_input_tokens_seen": 270905325, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.2677002, "step": 12558, "time_per_iteration": 4.23699688911438 }, { "auxiliary_loss_clip": 0.01249345, "auxiliary_loss_mlp": 0.00276625, "balance_loss_clip": 1.02962708, "balance_loss_mlp": 0.25090003, "epoch": 0.7550879302570269, "flos": 32634611838720.0, "grad_norm": 25.711680231316258, "language_loss": 0.86599177, "learning_rate": 5.967754872918187e-07, "loss": 0.88125145, "num_input_tokens_seen": 270927535, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25756836, "step": 12559, "time_per_iteration": 2.7501254081726074 }, { "auxiliary_loss_clip": 0.01265397, "auxiliary_loss_mlp": 0.00294187, "balance_loss_clip": 1.04664207, "balance_loss_mlp": 0.26736513, "epoch": 0.7551480535096948, "flos": 21795550738560.0, "grad_norm": 166.87885934552375, "language_loss": 0.84781212, "learning_rate": 5.96497999496199e-07, "loss": 0.86340791, "num_input_tokens_seen": 270946920, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.26843262, "step": 12560, "time_per_iteration": 2.7232937812805176 }, { "auxiliary_loss_clip": 0.01256736, "auxiliary_loss_mlp": 0.00243705, "balance_loss_clip": 1.03952837, "balance_loss_mlp": 0.21906416, "epoch": 0.7552081767623628, "flos": 18515111391360.0, "grad_norm": 20.19498346912948, "language_loss": 0.78798455, "learning_rate": 5.96220564921515e-07, "loss": 0.80298895, "num_input_tokens_seen": 270965705, "router_z_loss_clip": 2.17089844, "router_z_loss_mlp": 0.24633789, "step": 12561, "time_per_iteration": 2.636481285095215 }, { "auxiliary_loss_clip": 0.01268747, "auxiliary_loss_mlp": 0.00282554, "balance_loss_clip": 1.04913831, "balance_loss_mlp": 0.25648266, "epoch": 0.7552683000150308, "flos": 27634805199360.0, "grad_norm": 5.241544416770227, "language_loss": 0.81670702, "learning_rate": 5.959431835782889e-07, "loss": 0.83222008, "num_input_tokens_seen": 270986550, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26049805, "step": 12562, "time_per_iteration": 2.724472761154175 }, { "auxiliary_loss_clip": 0.01266133, "auxiliary_loss_mlp": 0.00286783, "balance_loss_clip": 1.04428625, "balance_loss_mlp": 0.26030686, "epoch": 0.7553284232676988, "flos": 20302924049280.0, "grad_norm": 6.871216483248377, "language_loss": 0.83931816, "learning_rate": 5.956658554770371e-07, "loss": 0.85484731, "num_input_tokens_seen": 271006250, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26489258, "step": 12563, "time_per_iteration": 4.192183494567871 }, { "auxiliary_loss_clip": 0.01323538, "auxiliary_loss_mlp": 0.00302228, "balance_loss_clip": 1.08272994, "balance_loss_mlp": 0.27297398, "epoch": 0.7553885465203668, "flos": 33255471444480.0, "grad_norm": 45.04692056439169, "language_loss": 0.80006814, "learning_rate": 5.953885806282768e-07, "loss": 0.81632578, "num_input_tokens_seen": 271025575, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.29272461, "step": 12564, "time_per_iteration": 2.8034846782684326 }, { "auxiliary_loss_clip": 0.01287776, "auxiliary_loss_mlp": 0.00247257, "balance_loss_clip": 1.05433059, "balance_loss_mlp": 0.22119831, "epoch": 0.7554486697730347, "flos": 21616249023360.0, "grad_norm": 76.28864582420485, "language_loss": 0.75403601, "learning_rate": 5.951113590425228e-07, "loss": 0.76938629, "num_input_tokens_seen": 271045805, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.26062012, "step": 12565, "time_per_iteration": 2.7463457584381104 }, { "auxiliary_loss_clip": 0.01284181, "auxiliary_loss_mlp": 0.00292228, "balance_loss_clip": 1.05259383, "balance_loss_mlp": 0.26321211, "epoch": 0.7555087930257027, "flos": 27632973605760.0, "grad_norm": 5.648656447087321, "language_loss": 0.81027186, "learning_rate": 5.94834190730287e-07, "loss": 0.82603592, "num_input_tokens_seen": 271066065, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.2902832, "step": 12566, "time_per_iteration": 2.7320828437805176 }, { "auxiliary_loss_clip": 0.01306097, "auxiliary_loss_mlp": 0.00274594, "balance_loss_clip": 1.07022619, "balance_loss_mlp": 0.24513784, "epoch": 0.7555689162783706, "flos": 23621644316160.0, "grad_norm": 22.477733111148375, "language_loss": 0.83153403, "learning_rate": 5.945570757020789e-07, "loss": 0.84734094, "num_input_tokens_seen": 271085870, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.29455566, "step": 12567, "time_per_iteration": 2.6708714962005615 }, { "auxiliary_loss_clip": 0.01263047, "auxiliary_loss_mlp": 0.00288346, "balance_loss_clip": 1.0424459, "balance_loss_mlp": 0.26190504, "epoch": 0.7556290395310387, "flos": 24863076218880.0, "grad_norm": 3.490698674764972, "language_loss": 0.72501671, "learning_rate": 5.942800139684073e-07, "loss": 0.74053073, "num_input_tokens_seen": 271104260, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.26452637, "step": 12568, "time_per_iteration": 2.6525750160217285 }, { "auxiliary_loss_clip": 0.01272095, "auxiliary_loss_mlp": 0.00275494, "balance_loss_clip": 1.04968023, "balance_loss_mlp": 0.24933949, "epoch": 0.7556891627837066, "flos": 43543770330240.0, "grad_norm": 638.7975334153308, "language_loss": 0.74204254, "learning_rate": 5.940030055397789e-07, "loss": 0.75751841, "num_input_tokens_seen": 271125745, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.26147461, "step": 12569, "time_per_iteration": 2.8404653072357178 }, { "auxiliary_loss_clip": 0.0131165, "auxiliary_loss_mlp": 0.0027282, "balance_loss_clip": 1.06767452, "balance_loss_mlp": 0.24448434, "epoch": 0.7557492860363746, "flos": 26650924790400.0, "grad_norm": 238.6576073557996, "language_loss": 0.76253539, "learning_rate": 5.93726050426697e-07, "loss": 0.7783801, "num_input_tokens_seen": 271147145, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.2833252, "step": 12570, "time_per_iteration": 2.75126576423645 }, { "auxiliary_loss_clip": 0.01291792, "auxiliary_loss_mlp": 0.00258513, "balance_loss_clip": 1.05851102, "balance_loss_mlp": 0.2297121, "epoch": 0.7558094092890425, "flos": 55182885010560.0, "grad_norm": 4.3016094254758075, "language_loss": 0.80007285, "learning_rate": 5.934491486396647e-07, "loss": 0.8155759, "num_input_tokens_seen": 271170865, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.28796387, "step": 12571, "time_per_iteration": 2.928222179412842 }, { "auxiliary_loss_clip": 0.01293595, "auxiliary_loss_mlp": 0.00262335, "balance_loss_clip": 1.05892646, "balance_loss_mlp": 0.23658538, "epoch": 0.7558695325417105, "flos": 23988292392960.0, "grad_norm": 23.0514948782356, "language_loss": 0.83836651, "learning_rate": 5.931723001891811e-07, "loss": 0.85392576, "num_input_tokens_seen": 271191450, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.25744629, "step": 12572, "time_per_iteration": 2.696108341217041 }, { "auxiliary_loss_clip": 0.01296738, "auxiliary_loss_mlp": 0.00257288, "balance_loss_clip": 1.06761456, "balance_loss_mlp": 0.23242137, "epoch": 0.7559296557943784, "flos": 14611262572800.0, "grad_norm": 14.960785294166346, "language_loss": 0.83309734, "learning_rate": 5.928955050857456e-07, "loss": 0.84863764, "num_input_tokens_seen": 271207335, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.24890137, "step": 12573, "time_per_iteration": 2.650101900100708 }, { "auxiliary_loss_clip": 0.01280719, "auxiliary_loss_mlp": 0.00267023, "balance_loss_clip": 1.05063748, "balance_loss_mlp": 0.24072564, "epoch": 0.7559897790470465, "flos": 18550483309440.0, "grad_norm": 3.5564087991668676, "language_loss": 0.75676733, "learning_rate": 5.926187633398527e-07, "loss": 0.77224475, "num_input_tokens_seen": 271226895, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.26318359, "step": 12574, "time_per_iteration": 2.8007190227508545 }, { "auxiliary_loss_clip": 0.01278192, "auxiliary_loss_mlp": 0.00268934, "balance_loss_clip": 1.05106759, "balance_loss_mlp": 0.24230258, "epoch": 0.7560499022997144, "flos": 17967868709760.0, "grad_norm": 39.15720658549678, "language_loss": 0.81326264, "learning_rate": 5.923420749619974e-07, "loss": 0.82873386, "num_input_tokens_seen": 271244375, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.26623535, "step": 12575, "time_per_iteration": 2.624263048171997 }, { "auxiliary_loss_clip": 0.01268369, "auxiliary_loss_mlp": 0.00268048, "balance_loss_clip": 1.04317641, "balance_loss_mlp": 0.24129739, "epoch": 0.7561100255523824, "flos": 15737815802880.0, "grad_norm": 502.9876403381837, "language_loss": 0.80762756, "learning_rate": 5.92065439962673e-07, "loss": 0.82299173, "num_input_tokens_seen": 271259530, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.26757812, "step": 12576, "time_per_iteration": 2.696828603744507 }, { "auxiliary_loss_clip": 0.01278936, "auxiliary_loss_mlp": 0.00243727, "balance_loss_clip": 1.05451381, "balance_loss_mlp": 0.21866971, "epoch": 0.7561701488050504, "flos": 15888102307200.0, "grad_norm": 34.68879549866784, "language_loss": 0.75470036, "learning_rate": 5.917888583523669e-07, "loss": 0.76992702, "num_input_tokens_seen": 271276835, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.25048828, "step": 12577, "time_per_iteration": 2.6409530639648438 }, { "auxiliary_loss_clip": 0.01269531, "auxiliary_loss_mlp": 0.00290093, "balance_loss_clip": 1.04831672, "balance_loss_mlp": 0.26355749, "epoch": 0.7562302720577183, "flos": 20339157893760.0, "grad_norm": 4.635938863816431, "language_loss": 0.84905559, "learning_rate": 5.915123301415685e-07, "loss": 0.8646518, "num_input_tokens_seen": 271296275, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26525879, "step": 12578, "time_per_iteration": 2.7034356594085693 }, { "auxiliary_loss_clip": 0.01265784, "auxiliary_loss_mlp": 0.00270896, "balance_loss_clip": 1.04065955, "balance_loss_mlp": 0.24381204, "epoch": 0.7562903953103863, "flos": 20812209033600.0, "grad_norm": 39.136947299815056, "language_loss": 0.82099438, "learning_rate": 5.912358553407641e-07, "loss": 0.83636117, "num_input_tokens_seen": 271315685, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.27099609, "step": 12579, "time_per_iteration": 2.675410747528076 }, { "auxiliary_loss_clip": 0.0128751, "auxiliary_loss_mlp": 0.00246635, "balance_loss_clip": 1.05344439, "balance_loss_mlp": 0.21865697, "epoch": 0.7563505185630542, "flos": 37596999484800.0, "grad_norm": 37.30202361841782, "language_loss": 0.73043334, "learning_rate": 5.90959433960437e-07, "loss": 0.74577475, "num_input_tokens_seen": 271336790, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.27966309, "step": 12580, "time_per_iteration": 2.8137407302856445 }, { "auxiliary_loss_clip": 0.01284534, "auxiliary_loss_mlp": 0.00269217, "balance_loss_clip": 1.05865526, "balance_loss_mlp": 0.2422995, "epoch": 0.7564106418157223, "flos": 20230995064320.0, "grad_norm": 4.968352436654494, "language_loss": 0.82333988, "learning_rate": 5.906830660110691e-07, "loss": 0.83887738, "num_input_tokens_seen": 271355470, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.26904297, "step": 12581, "time_per_iteration": 2.6574835777282715 }, { "auxiliary_loss_clip": 0.01283779, "auxiliary_loss_mlp": 0.0026938, "balance_loss_clip": 1.05549562, "balance_loss_mlp": 0.24205756, "epoch": 0.7564707650683902, "flos": 24754877475840.0, "grad_norm": 57.20042768766058, "language_loss": 0.70908368, "learning_rate": 5.904067515031412e-07, "loss": 0.72461528, "num_input_tokens_seen": 271375810, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27331543, "step": 12582, "time_per_iteration": 2.705822706222534 }, { "auxiliary_loss_clip": 0.01148343, "auxiliary_loss_mlp": 0.00099745, "balance_loss_clip": 0.99493361, "balance_loss_mlp": 0.08963581, "epoch": 0.7565308883210582, "flos": 48530076433920.0, "grad_norm": 0.9784831049619438, "language_loss": 0.59936279, "learning_rate": 5.901304904471307e-07, "loss": 0.61184365, "num_input_tokens_seen": 271424775, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.10107422, "step": 12583, "time_per_iteration": 2.937648057937622 }, { "auxiliary_loss_clip": 0.01277807, "auxiliary_loss_mlp": 0.00256743, "balance_loss_clip": 1.05547667, "balance_loss_mlp": 0.23102966, "epoch": 0.7565910115737261, "flos": 12495082757760.0, "grad_norm": 39.040208325157366, "language_loss": 0.86805999, "learning_rate": 5.898542828535125e-07, "loss": 0.88340545, "num_input_tokens_seen": 271440500, "router_z_loss_clip": 2.22167969, "router_z_loss_mlp": 0.25732422, "step": 12584, "time_per_iteration": 2.63339900970459 }, { "auxiliary_loss_clip": 0.01273121, "auxiliary_loss_mlp": 0.002681, "balance_loss_clip": 1.05493593, "balance_loss_mlp": 0.24353062, "epoch": 0.7566511348263941, "flos": 21173003193600.0, "grad_norm": 55.50629824096023, "language_loss": 0.83621335, "learning_rate": 5.895781287327612e-07, "loss": 0.85162556, "num_input_tokens_seen": 271458180, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24572754, "step": 12585, "time_per_iteration": 2.735262870788574 }, { "auxiliary_loss_clip": 0.0128889, "auxiliary_loss_mlp": 0.00269298, "balance_loss_clip": 1.06136286, "balance_loss_mlp": 0.24091466, "epoch": 0.756711258079062, "flos": 21754827694080.0, "grad_norm": 3.0701566017468713, "language_loss": 0.91759735, "learning_rate": 5.893020280953493e-07, "loss": 0.93317926, "num_input_tokens_seen": 271475730, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.28369141, "step": 12586, "time_per_iteration": 2.6567022800445557 }, { "auxiliary_loss_clip": 0.01282714, "auxiliary_loss_mlp": 0.00264983, "balance_loss_clip": 1.0556401, "balance_loss_mlp": 0.23861401, "epoch": 0.75677138133173, "flos": 22382905933440.0, "grad_norm": 38.83871699993469, "language_loss": 0.90284902, "learning_rate": 5.890259809517459e-07, "loss": 0.91832602, "num_input_tokens_seen": 271495030, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.2635498, "step": 12587, "time_per_iteration": 2.725794792175293 }, { "auxiliary_loss_clip": 0.01268969, "auxiliary_loss_mlp": 0.00262621, "balance_loss_clip": 1.04268456, "balance_loss_mlp": 0.23619233, "epoch": 0.756831504584398, "flos": 22708974620160.0, "grad_norm": 33.240489860849316, "language_loss": 0.77507627, "learning_rate": 5.88749987312418e-07, "loss": 0.79039228, "num_input_tokens_seen": 271515355, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.2644043, "step": 12588, "time_per_iteration": 2.6744072437286377 }, { "auxiliary_loss_clip": 0.01274744, "auxiliary_loss_mlp": 0.00277852, "balance_loss_clip": 1.04738772, "balance_loss_mlp": 0.24927762, "epoch": 0.756891627837066, "flos": 24098358643200.0, "grad_norm": 57.770339864760835, "language_loss": 0.77647471, "learning_rate": 5.884740471878327e-07, "loss": 0.79200065, "num_input_tokens_seen": 271535090, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.2857666, "step": 12589, "time_per_iteration": 2.7360613346099854 }, { "auxiliary_loss_clip": 0.01256179, "auxiliary_loss_mlp": 0.00252045, "balance_loss_clip": 1.0351932, "balance_loss_mlp": 0.22778553, "epoch": 0.756951751089734, "flos": 19749001438080.0, "grad_norm": 7.0240027113802315, "language_loss": 0.98409581, "learning_rate": 5.881981605884522e-07, "loss": 0.99917805, "num_input_tokens_seen": 271551075, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24267578, "step": 12590, "time_per_iteration": 2.687446355819702 }, { "auxiliary_loss_clip": 0.01256725, "auxiliary_loss_mlp": 0.00246167, "balance_loss_clip": 1.03906274, "balance_loss_mlp": 0.22288562, "epoch": 0.7570118743424019, "flos": 35079266551680.0, "grad_norm": 6.521065267318732, "language_loss": 0.73348808, "learning_rate": 5.879223275247391e-07, "loss": 0.74851692, "num_input_tokens_seen": 271571035, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.2331543, "step": 12591, "time_per_iteration": 2.81270432472229 }, { "auxiliary_loss_clip": 0.01289671, "auxiliary_loss_mlp": 0.0026128, "balance_loss_clip": 1.0650785, "balance_loss_mlp": 0.23719969, "epoch": 0.7570719975950699, "flos": 25594540778880.0, "grad_norm": 23.680648251269407, "language_loss": 0.82076234, "learning_rate": 5.876465480071528e-07, "loss": 0.83627188, "num_input_tokens_seen": 271592950, "router_z_loss_clip": 2.24707031, "router_z_loss_mlp": 0.24084473, "step": 12592, "time_per_iteration": 2.7410266399383545 }, { "auxiliary_loss_clip": 0.01272408, "auxiliary_loss_mlp": 0.00268506, "balance_loss_clip": 1.0492835, "balance_loss_mlp": 0.24286392, "epoch": 0.7571321208477378, "flos": 10816223028480.0, "grad_norm": 32.371427686539455, "language_loss": 0.79320997, "learning_rate": 5.873708220461522e-07, "loss": 0.80861914, "num_input_tokens_seen": 271608835, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25683594, "step": 12593, "time_per_iteration": 2.634935140609741 }, { "auxiliary_loss_clip": 0.01299561, "auxiliary_loss_mlp": 0.00295063, "balance_loss_clip": 1.06778622, "balance_loss_mlp": 0.26781133, "epoch": 0.7571922441004059, "flos": 18260109763200.0, "grad_norm": 3.925951052265435, "language_loss": 0.7572028, "learning_rate": 5.870951496521903e-07, "loss": 0.77314901, "num_input_tokens_seen": 271627730, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.27270508, "step": 12594, "time_per_iteration": 2.6398978233337402 }, { "auxiliary_loss_clip": 0.01293201, "auxiliary_loss_mlp": 0.00268497, "balance_loss_clip": 1.05990028, "balance_loss_mlp": 0.24051866, "epoch": 0.7572523673530738, "flos": 22890502978560.0, "grad_norm": 8.653490560480336, "language_loss": 0.86076421, "learning_rate": 5.86819530835722e-07, "loss": 0.87638116, "num_input_tokens_seen": 271646415, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.27978516, "step": 12595, "time_per_iteration": 4.041696786880493 }, { "auxiliary_loss_clip": 0.01267362, "auxiliary_loss_mlp": 0.0025428, "balance_loss_clip": 1.04447174, "balance_loss_mlp": 0.22828063, "epoch": 0.7573124906057418, "flos": 20996323171200.0, "grad_norm": 2.2734712116758544, "language_loss": 0.8071245, "learning_rate": 5.865439656071993e-07, "loss": 0.82234085, "num_input_tokens_seen": 271666240, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26000977, "step": 12596, "time_per_iteration": 2.695888042449951 }, { "auxiliary_loss_clip": 0.01258592, "auxiliary_loss_mlp": 0.00252769, "balance_loss_clip": 1.03999555, "balance_loss_mlp": 0.22989321, "epoch": 0.7573726138584097, "flos": 20886292834560.0, "grad_norm": 3.257173489484663, "language_loss": 0.87189436, "learning_rate": 5.862684539770706e-07, "loss": 0.88700801, "num_input_tokens_seen": 271686370, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.22900391, "step": 12597, "time_per_iteration": 4.169264554977417 }, { "auxiliary_loss_clip": 0.01304768, "auxiliary_loss_mlp": 0.00258672, "balance_loss_clip": 1.0709374, "balance_loss_mlp": 0.23075327, "epoch": 0.7574327371110777, "flos": 24530507170560.0, "grad_norm": 5.332323887170776, "language_loss": 0.90126687, "learning_rate": 5.859929959557835e-07, "loss": 0.91690129, "num_input_tokens_seen": 271705050, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.27929688, "step": 12598, "time_per_iteration": 2.6514058113098145 }, { "auxiliary_loss_clip": 0.0125961, "auxiliary_loss_mlp": 0.00239529, "balance_loss_clip": 1.04087782, "balance_loss_mlp": 0.21577027, "epoch": 0.7574928603637456, "flos": 23364523785600.0, "grad_norm": 15.122121758953583, "language_loss": 0.71768618, "learning_rate": 5.857175915537845e-07, "loss": 0.73267758, "num_input_tokens_seen": 271724915, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.2376709, "step": 12599, "time_per_iteration": 2.658071756362915 }, { "auxiliary_loss_clip": 0.01299192, "auxiliary_loss_mlp": 0.0027053, "balance_loss_clip": 1.06284833, "balance_loss_mlp": 0.24304014, "epoch": 0.7575529836164137, "flos": 13516274419200.0, "grad_norm": 1377.5974003012827, "language_loss": 0.74537373, "learning_rate": 5.854422407815161e-07, "loss": 0.76107097, "num_input_tokens_seen": 271742410, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.27478027, "step": 12600, "time_per_iteration": 4.143930196762085 }, { "auxiliary_loss_clip": 0.01258436, "auxiliary_loss_mlp": 0.00258371, "balance_loss_clip": 1.04450476, "balance_loss_mlp": 0.2339924, "epoch": 0.7576131068690816, "flos": 19646584784640.0, "grad_norm": 19.00681050888721, "language_loss": 0.72866392, "learning_rate": 5.851669436494191e-07, "loss": 0.74383199, "num_input_tokens_seen": 271761425, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24365234, "step": 12601, "time_per_iteration": 2.6707918643951416 }, { "auxiliary_loss_clip": 0.01263784, "auxiliary_loss_mlp": 0.00275171, "balance_loss_clip": 1.04520547, "balance_loss_mlp": 0.24921891, "epoch": 0.7576732301217496, "flos": 20048245643520.0, "grad_norm": 6.362487913297703, "language_loss": 0.75549436, "learning_rate": 5.848917001679335e-07, "loss": 0.77088392, "num_input_tokens_seen": 271780875, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25964355, "step": 12602, "time_per_iteration": 2.7143306732177734 }, { "auxiliary_loss_clip": 0.01277474, "auxiliary_loss_mlp": 0.00268849, "balance_loss_clip": 1.04946852, "balance_loss_mlp": 0.24189578, "epoch": 0.7577333533744176, "flos": 15377093470080.0, "grad_norm": 120.4616137320142, "language_loss": 0.78941083, "learning_rate": 5.846165103474967e-07, "loss": 0.80487406, "num_input_tokens_seen": 271799490, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.26965332, "step": 12603, "time_per_iteration": 2.652600049972534 }, { "auxiliary_loss_clip": 0.0125423, "auxiliary_loss_mlp": 0.0026171, "balance_loss_clip": 1.03539145, "balance_loss_mlp": 0.23619951, "epoch": 0.7577934766270855, "flos": 17894862316800.0, "grad_norm": 283.60226430667416, "language_loss": 0.71249551, "learning_rate": 5.843413741985439e-07, "loss": 0.72765493, "num_input_tokens_seen": 271817040, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.25524902, "step": 12604, "time_per_iteration": 2.6781163215637207 }, { "auxiliary_loss_clip": 0.01270425, "auxiliary_loss_mlp": 0.00250694, "balance_loss_clip": 1.05086613, "balance_loss_mlp": 0.22552937, "epoch": 0.7578535998797535, "flos": 21613770984960.0, "grad_norm": 11.855018287092424, "language_loss": 0.86560136, "learning_rate": 5.840662917315076e-07, "loss": 0.88081253, "num_input_tokens_seen": 271835480, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25158691, "step": 12605, "time_per_iteration": 4.124801158905029 }, { "auxiliary_loss_clip": 0.01291651, "auxiliary_loss_mlp": 0.00276388, "balance_loss_clip": 1.05555749, "balance_loss_mlp": 0.2490537, "epoch": 0.7579137231324214, "flos": 18478374756480.0, "grad_norm": 67.2861050938665, "language_loss": 0.8986423, "learning_rate": 5.837912629568198e-07, "loss": 0.91432261, "num_input_tokens_seen": 271849835, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.2734375, "step": 12606, "time_per_iteration": 2.7105586528778076 }, { "auxiliary_loss_clip": 0.01247936, "auxiliary_loss_mlp": 0.00252308, "balance_loss_clip": 1.03410411, "balance_loss_mlp": 0.22797781, "epoch": 0.7579738463850895, "flos": 23255032152960.0, "grad_norm": 1940.682417772098, "language_loss": 0.77309573, "learning_rate": 5.835162878849087e-07, "loss": 0.7880981, "num_input_tokens_seen": 271869560, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.2434082, "step": 12607, "time_per_iteration": 2.7186691761016846 }, { "auxiliary_loss_clip": 0.01290886, "auxiliary_loss_mlp": 0.00264587, "balance_loss_clip": 1.06013417, "balance_loss_mlp": 0.23802763, "epoch": 0.7580339696377574, "flos": 14027031861120.0, "grad_norm": 357.2157857184072, "language_loss": 0.83808541, "learning_rate": 5.83241366526202e-07, "loss": 0.85364014, "num_input_tokens_seen": 271887950, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.26574707, "step": 12608, "time_per_iteration": 2.639857053756714 }, { "auxiliary_loss_clip": 0.01260125, "auxiliary_loss_mlp": 0.0026024, "balance_loss_clip": 1.03633499, "balance_loss_mlp": 0.23395406, "epoch": 0.7580940928904254, "flos": 25082777756160.0, "grad_norm": 12.991315400472335, "language_loss": 0.78760445, "learning_rate": 5.829664988911245e-07, "loss": 0.80280817, "num_input_tokens_seen": 271907700, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.26281738, "step": 12609, "time_per_iteration": 2.7654175758361816 }, { "auxiliary_loss_clip": 0.01286621, "auxiliary_loss_mlp": 0.00266975, "balance_loss_clip": 1.05747306, "balance_loss_mlp": 0.23893689, "epoch": 0.7581542161430933, "flos": 23836425690240.0, "grad_norm": 50.44850031646102, "language_loss": 0.8809157, "learning_rate": 5.826916849901007e-07, "loss": 0.89645171, "num_input_tokens_seen": 271926840, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.28051758, "step": 12610, "time_per_iteration": 2.6466822624206543 }, { "auxiliary_loss_clip": 0.01300169, "auxiliary_loss_mlp": 0.00251779, "balance_loss_clip": 1.07140255, "balance_loss_mlp": 0.22689992, "epoch": 0.7582143393957613, "flos": 22237000888320.0, "grad_norm": 12.661808579729744, "language_loss": 0.78265715, "learning_rate": 5.824169248335488e-07, "loss": 0.79817665, "num_input_tokens_seen": 271946465, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.2487793, "step": 12611, "time_per_iteration": 2.645425319671631 }, { "auxiliary_loss_clip": 0.01276062, "auxiliary_loss_mlp": 0.00283668, "balance_loss_clip": 1.05458832, "balance_loss_mlp": 0.25728744, "epoch": 0.7582744626484292, "flos": 21106389421440.0, "grad_norm": 173.99691862823158, "language_loss": 0.78785467, "learning_rate": 5.821422184318893e-07, "loss": 0.80345201, "num_input_tokens_seen": 271967295, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26367188, "step": 12612, "time_per_iteration": 2.6826963424682617 }, { "auxiliary_loss_clip": 0.01284326, "auxiliary_loss_mlp": 0.00270409, "balance_loss_clip": 1.05636096, "balance_loss_mlp": 0.2406545, "epoch": 0.7583345859010973, "flos": 24604770539520.0, "grad_norm": 5.00513809428715, "language_loss": 0.66623664, "learning_rate": 5.818675657955397e-07, "loss": 0.68178397, "num_input_tokens_seen": 271987960, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.29760742, "step": 12613, "time_per_iteration": 2.756448268890381 }, { "auxiliary_loss_clip": 0.01286203, "auxiliary_loss_mlp": 0.00276426, "balance_loss_clip": 1.05910778, "balance_loss_mlp": 0.24918643, "epoch": 0.7583947091537652, "flos": 33546814657920.0, "grad_norm": 20.448332639058837, "language_loss": 0.67450863, "learning_rate": 5.815929669349135e-07, "loss": 0.69013494, "num_input_tokens_seen": 272011780, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.2722168, "step": 12614, "time_per_iteration": 2.844142198562622 }, { "auxiliary_loss_clip": 0.01286181, "auxiliary_loss_mlp": 0.00250118, "balance_loss_clip": 1.05913627, "balance_loss_mlp": 0.22204378, "epoch": 0.7584548324064332, "flos": 20121000641280.0, "grad_norm": 2.6642660521020476, "language_loss": 0.82105446, "learning_rate": 5.813184218604246e-07, "loss": 0.83641744, "num_input_tokens_seen": 272030825, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.28039551, "step": 12615, "time_per_iteration": 2.653377056121826 }, { "auxiliary_loss_clip": 0.01155203, "auxiliary_loss_mlp": 0.0014425, "balance_loss_clip": 1.00228357, "balance_loss_mlp": 0.13604794, "epoch": 0.7585149556591012, "flos": 70402584061440.0, "grad_norm": 0.7947148579938159, "language_loss": 0.66735852, "learning_rate": 5.810439305824828e-07, "loss": 0.68035305, "num_input_tokens_seen": 272095825, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.08203125, "step": 12616, "time_per_iteration": 3.18774151802063 }, { "auxiliary_loss_clip": 0.01289107, "auxiliary_loss_mlp": 0.00272185, "balance_loss_clip": 1.06086469, "balance_loss_mlp": 0.24519598, "epoch": 0.7585750789117691, "flos": 16143786293760.0, "grad_norm": 12.831242012564738, "language_loss": 0.93436027, "learning_rate": 5.807694931114979e-07, "loss": 0.94997311, "num_input_tokens_seen": 272113950, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.26989746, "step": 12617, "time_per_iteration": 2.691293954849243 }, { "auxiliary_loss_clip": 0.01287759, "auxiliary_loss_mlp": 0.00263153, "balance_loss_clip": 1.05862057, "balance_loss_mlp": 0.23768964, "epoch": 0.7586352021644371, "flos": 17493165544320.0, "grad_norm": 45.27324442694777, "language_loss": 0.87045026, "learning_rate": 5.804951094578757e-07, "loss": 0.88595939, "num_input_tokens_seen": 272130315, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.25500488, "step": 12618, "time_per_iteration": 2.622885227203369 }, { "auxiliary_loss_clip": 0.013212, "auxiliary_loss_mlp": 0.00289755, "balance_loss_clip": 1.08122754, "balance_loss_mlp": 0.26026216, "epoch": 0.758695325417105, "flos": 17275187859840.0, "grad_norm": 2.841622946630219, "language_loss": 0.85881615, "learning_rate": 5.802207796320209e-07, "loss": 0.87492573, "num_input_tokens_seen": 272149080, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.29516602, "step": 12619, "time_per_iteration": 2.6291160583496094 }, { "auxiliary_loss_clip": 0.01272431, "auxiliary_loss_mlp": 0.00267998, "balance_loss_clip": 1.04869795, "balance_loss_mlp": 0.24228457, "epoch": 0.7587554486697731, "flos": 29495660163840.0, "grad_norm": 21.134744816363824, "language_loss": 0.89424145, "learning_rate": 5.79946503644337e-07, "loss": 0.90964574, "num_input_tokens_seen": 272168285, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.25720215, "step": 12620, "time_per_iteration": 2.8846263885498047 }, { "auxiliary_loss_clip": 0.01310052, "auxiliary_loss_mlp": 0.0027743, "balance_loss_clip": 1.07201004, "balance_loss_mlp": 0.24972543, "epoch": 0.758815571922441, "flos": 16100800692480.0, "grad_norm": 4.765544676669795, "language_loss": 0.93704116, "learning_rate": 5.796722815052242e-07, "loss": 0.95291603, "num_input_tokens_seen": 272184585, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.27685547, "step": 12621, "time_per_iteration": 2.633944272994995 }, { "auxiliary_loss_clip": 0.01261424, "auxiliary_loss_mlp": 0.00249991, "balance_loss_clip": 1.04121304, "balance_loss_mlp": 0.22525528, "epoch": 0.758875695175109, "flos": 16143714466560.0, "grad_norm": 7.506009552144079, "language_loss": 0.80361575, "learning_rate": 5.7939811322508e-07, "loss": 0.81872988, "num_input_tokens_seen": 272200205, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.24743652, "step": 12622, "time_per_iteration": 2.660008192062378 }, { "auxiliary_loss_clip": 0.01151874, "auxiliary_loss_mlp": 0.00109612, "balance_loss_clip": 0.99706173, "balance_loss_mlp": 0.10222103, "epoch": 0.7589358184277769, "flos": 68462006860800.0, "grad_norm": 0.8254282923909504, "language_loss": 0.6035229, "learning_rate": 5.791239988143024e-07, "loss": 0.61613774, "num_input_tokens_seen": 272259670, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.07373047, "step": 12623, "time_per_iteration": 3.2179250717163086 }, { "auxiliary_loss_clip": 0.01268189, "auxiliary_loss_mlp": 0.00249212, "balance_loss_clip": 1.05212879, "balance_loss_mlp": 0.22491759, "epoch": 0.7589959416804449, "flos": 20047311889920.0, "grad_norm": 9.339506264648017, "language_loss": 0.75714028, "learning_rate": 5.788499382832847e-07, "loss": 0.77231431, "num_input_tokens_seen": 272277925, "router_z_loss_clip": 2.16113281, "router_z_loss_mlp": 0.24316406, "step": 12624, "time_per_iteration": 2.6495935916900635 }, { "auxiliary_loss_clip": 0.01282629, "auxiliary_loss_mlp": 0.00253033, "balance_loss_clip": 1.05373907, "balance_loss_mlp": 0.22760592, "epoch": 0.7590560649331128, "flos": 18771800958720.0, "grad_norm": 3.707521337048524, "language_loss": 0.83972198, "learning_rate": 5.785759316424196e-07, "loss": 0.85507858, "num_input_tokens_seen": 272296010, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.25439453, "step": 12625, "time_per_iteration": 2.6595163345336914 }, { "auxiliary_loss_clip": 0.012789, "auxiliary_loss_mlp": 0.00255717, "balance_loss_clip": 1.05914307, "balance_loss_mlp": 0.23003972, "epoch": 0.7591161881857809, "flos": 29825284296960.0, "grad_norm": 22.9179340869197, "language_loss": 0.70150089, "learning_rate": 5.783019789020977e-07, "loss": 0.71684712, "num_input_tokens_seen": 272318330, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25683594, "step": 12626, "time_per_iteration": 2.7379026412963867 }, { "auxiliary_loss_clip": 0.01319321, "auxiliary_loss_mlp": 0.00264171, "balance_loss_clip": 1.07746744, "balance_loss_mlp": 0.23508358, "epoch": 0.7591763114384488, "flos": 20302708567680.0, "grad_norm": 5.530054788789839, "language_loss": 0.85549206, "learning_rate": 5.780280800727084e-07, "loss": 0.87132698, "num_input_tokens_seen": 272335265, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.29101562, "step": 12627, "time_per_iteration": 2.7339189052581787 }, { "auxiliary_loss_clip": 0.01277781, "auxiliary_loss_mlp": 0.00263399, "balance_loss_clip": 1.05247498, "balance_loss_mlp": 0.23663664, "epoch": 0.7592364346911168, "flos": 20813609664000.0, "grad_norm": 16.72962765557675, "language_loss": 0.77677977, "learning_rate": 5.777542351646356e-07, "loss": 0.79219151, "num_input_tokens_seen": 272354795, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.2677002, "step": 12628, "time_per_iteration": 2.78608775138855 }, { "auxiliary_loss_clip": 0.01335463, "auxiliary_loss_mlp": 0.00239361, "balance_loss_clip": 1.09049714, "balance_loss_mlp": 0.21168074, "epoch": 0.7592965579437848, "flos": 21251504367360.0, "grad_norm": 26.778772125990404, "language_loss": 0.71522987, "learning_rate": 5.774804441882648e-07, "loss": 0.73097813, "num_input_tokens_seen": 272372875, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.27697754, "step": 12629, "time_per_iteration": 2.683120012283325 }, { "auxiliary_loss_clip": 0.01266568, "auxiliary_loss_mlp": 0.00248088, "balance_loss_clip": 1.04522228, "balance_loss_mlp": 0.22386429, "epoch": 0.7593566811964527, "flos": 26213604704640.0, "grad_norm": 4.876445934758837, "language_loss": 0.84743214, "learning_rate": 5.772067071539786e-07, "loss": 0.86257869, "num_input_tokens_seen": 272394715, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.24243164, "step": 12630, "time_per_iteration": 2.712991237640381 }, { "auxiliary_loss_clip": 0.01130936, "auxiliary_loss_mlp": 0.00081728, "balance_loss_clip": 0.98053002, "balance_loss_mlp": 0.0743847, "epoch": 0.7594168044491207, "flos": 71237255374080.0, "grad_norm": 0.9229598482375224, "language_loss": 0.60790735, "learning_rate": 5.769330240721562e-07, "loss": 0.62003404, "num_input_tokens_seen": 272458775, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.07324219, "step": 12631, "time_per_iteration": 3.2272579669952393 }, { "auxiliary_loss_clip": 0.01316104, "auxiliary_loss_mlp": 0.00268762, "balance_loss_clip": 1.07628369, "balance_loss_mlp": 0.2395077, "epoch": 0.7594769277017887, "flos": 26613326229120.0, "grad_norm": 40.33735774445904, "language_loss": 0.81648147, "learning_rate": 5.766593949531767e-07, "loss": 0.83233011, "num_input_tokens_seen": 272479355, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.29260254, "step": 12632, "time_per_iteration": 2.6948697566986084 }, { "auxiliary_loss_clip": 0.01283622, "auxiliary_loss_mlp": 0.00253244, "balance_loss_clip": 1.05537271, "balance_loss_mlp": 0.22680296, "epoch": 0.7595370509544567, "flos": 17595941333760.0, "grad_norm": 16.656169885579107, "language_loss": 0.8086071, "learning_rate": 5.763858198074154e-07, "loss": 0.82397574, "num_input_tokens_seen": 272493555, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.26464844, "step": 12633, "time_per_iteration": 2.597329616546631 }, { "auxiliary_loss_clip": 0.01279593, "auxiliary_loss_mlp": 0.00255621, "balance_loss_clip": 1.05361652, "balance_loss_mlp": 0.23124278, "epoch": 0.7595971742071246, "flos": 18002953319040.0, "grad_norm": 26.41190958748143, "language_loss": 0.81153089, "learning_rate": 5.76112298645246e-07, "loss": 0.82688302, "num_input_tokens_seen": 272508925, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.24377441, "step": 12634, "time_per_iteration": 2.634786605834961 }, { "auxiliary_loss_clip": 0.01272389, "auxiliary_loss_mlp": 0.00271048, "balance_loss_clip": 1.05065203, "balance_loss_mlp": 0.24607371, "epoch": 0.7596572974597926, "flos": 28840326480000.0, "grad_norm": 114.58533133703563, "language_loss": 0.73354024, "learning_rate": 5.758388314770408e-07, "loss": 0.74897462, "num_input_tokens_seen": 272528805, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24987793, "step": 12635, "time_per_iteration": 2.713315725326538 }, { "auxiliary_loss_clip": 0.01276518, "auxiliary_loss_mlp": 0.00271325, "balance_loss_clip": 1.04679203, "balance_loss_mlp": 0.2457072, "epoch": 0.7597174207124605, "flos": 14282823588480.0, "grad_norm": 45.822509611410574, "language_loss": 0.77312601, "learning_rate": 5.7556541831317e-07, "loss": 0.78860438, "num_input_tokens_seen": 272546655, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.25622559, "step": 12636, "time_per_iteration": 2.650327444076538 }, { "auxiliary_loss_clip": 0.01291774, "auxiliary_loss_mlp": 0.00273509, "balance_loss_clip": 1.06379139, "balance_loss_mlp": 0.24691375, "epoch": 0.7597775439651285, "flos": 21688932193920.0, "grad_norm": 42.08502502354406, "language_loss": 0.89285862, "learning_rate": 5.752920591640018e-07, "loss": 0.9085114, "num_input_tokens_seen": 272564010, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.26635742, "step": 12637, "time_per_iteration": 4.087862968444824 }, { "auxiliary_loss_clip": 0.01257843, "auxiliary_loss_mlp": 0.00267085, "balance_loss_clip": 1.0385505, "balance_loss_mlp": 0.24137157, "epoch": 0.7598376672177964, "flos": 36101248312320.0, "grad_norm": 77.76543173872622, "language_loss": 0.73894453, "learning_rate": 5.750187540399017e-07, "loss": 0.75419384, "num_input_tokens_seen": 272585840, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25732422, "step": 12638, "time_per_iteration": 2.7655234336853027 }, { "auxiliary_loss_clip": 0.0127536, "auxiliary_loss_mlp": 0.0028638, "balance_loss_clip": 1.0484786, "balance_loss_mlp": 0.25943831, "epoch": 0.7598977904704645, "flos": 18332326056960.0, "grad_norm": 212.26535849888418, "language_loss": 0.7605871, "learning_rate": 5.747455029512323e-07, "loss": 0.77620453, "num_input_tokens_seen": 272602300, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.26928711, "step": 12639, "time_per_iteration": 4.165729522705078 }, { "auxiliary_loss_clip": 0.01283776, "auxiliary_loss_mlp": 0.00266291, "balance_loss_clip": 1.05757165, "balance_loss_mlp": 0.23983884, "epoch": 0.7599579137231324, "flos": 20192642317440.0, "grad_norm": 6.207100336406484, "language_loss": 0.80703259, "learning_rate": 5.744723059083572e-07, "loss": 0.82253325, "num_input_tokens_seen": 272619595, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26464844, "step": 12640, "time_per_iteration": 2.6329703330993652 }, { "auxiliary_loss_clip": 0.01298569, "auxiliary_loss_mlp": 0.00280582, "balance_loss_clip": 1.06817937, "balance_loss_mlp": 0.25263932, "epoch": 0.7600180369758004, "flos": 24024849459840.0, "grad_norm": 24.075018322635245, "language_loss": 0.74821579, "learning_rate": 5.741991629216343e-07, "loss": 0.76400721, "num_input_tokens_seen": 272638825, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.27954102, "step": 12641, "time_per_iteration": 2.664675712585449 }, { "auxiliary_loss_clip": 0.01306156, "auxiliary_loss_mlp": 0.00266602, "balance_loss_clip": 1.07398677, "balance_loss_mlp": 0.2375747, "epoch": 0.7600781602284684, "flos": 18989527248000.0, "grad_norm": 34.21978896635616, "language_loss": 0.76469827, "learning_rate": 5.73926074001422e-07, "loss": 0.78042591, "num_input_tokens_seen": 272657240, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.2902832, "step": 12642, "time_per_iteration": 4.229751110076904 }, { "auxiliary_loss_clip": 0.01269324, "auxiliary_loss_mlp": 0.00253097, "balance_loss_clip": 1.04378843, "balance_loss_mlp": 0.22765759, "epoch": 0.7601382834811363, "flos": 26067520091520.0, "grad_norm": 9.081573487650436, "language_loss": 0.8439492, "learning_rate": 5.736530391580765e-07, "loss": 0.85917336, "num_input_tokens_seen": 272677520, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.25476074, "step": 12643, "time_per_iteration": 2.699097156524658 }, { "auxiliary_loss_clip": 0.01283213, "auxiliary_loss_mlp": 0.00274452, "balance_loss_clip": 1.05398703, "balance_loss_mlp": 0.24511465, "epoch": 0.7601984067338043, "flos": 18844232734080.0, "grad_norm": 13.611206102780388, "language_loss": 0.8581003, "learning_rate": 5.733800584019508e-07, "loss": 0.8736769, "num_input_tokens_seen": 272696770, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.29370117, "step": 12644, "time_per_iteration": 2.6784486770629883 }, { "auxiliary_loss_clip": 0.0127139, "auxiliary_loss_mlp": 0.00250781, "balance_loss_clip": 1.04880834, "balance_loss_mlp": 0.22453126, "epoch": 0.7602585299864723, "flos": 24646391424000.0, "grad_norm": 2.3854985964746147, "language_loss": 0.8597672, "learning_rate": 5.731071317433957e-07, "loss": 0.87498885, "num_input_tokens_seen": 272718340, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.2623291, "step": 12645, "time_per_iteration": 2.692143201828003 }, { "auxiliary_loss_clip": 0.01284925, "auxiliary_loss_mlp": 0.00270207, "balance_loss_clip": 1.0599364, "balance_loss_mlp": 0.24289608, "epoch": 0.7603186532391403, "flos": 23842100039040.0, "grad_norm": 9.401164668335099, "language_loss": 0.79878932, "learning_rate": 5.728342591927611e-07, "loss": 0.81434065, "num_input_tokens_seen": 272739575, "router_z_loss_clip": 2.25097656, "router_z_loss_mlp": 0.27319336, "step": 12646, "time_per_iteration": 2.684187173843384 }, { "auxiliary_loss_clip": 0.01272319, "auxiliary_loss_mlp": 0.0027154, "balance_loss_clip": 1.04854774, "balance_loss_mlp": 0.24562384, "epoch": 0.7603787764918082, "flos": 22199905117440.0, "grad_norm": 6.955633531893365, "language_loss": 0.76680887, "learning_rate": 5.725614407603949e-07, "loss": 0.78224748, "num_input_tokens_seen": 272758710, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.25927734, "step": 12647, "time_per_iteration": 4.080297231674194 }, { "auxiliary_loss_clip": 0.01117088, "auxiliary_loss_mlp": 0.00101286, "balance_loss_clip": 0.96495438, "balance_loss_mlp": 0.09303628, "epoch": 0.7604388997444762, "flos": 54086894254080.0, "grad_norm": 0.6637491776424435, "language_loss": 0.48739842, "learning_rate": 5.722886764566415e-07, "loss": 0.49958214, "num_input_tokens_seen": 272814855, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.08251953, "step": 12648, "time_per_iteration": 3.1484665870666504 }, { "auxiliary_loss_clip": 0.01248064, "auxiliary_loss_mlp": 0.00249525, "balance_loss_clip": 1.03371191, "balance_loss_mlp": 0.22539718, "epoch": 0.7604990229971441, "flos": 19681920789120.0, "grad_norm": 6.520246822194471, "language_loss": 0.83053517, "learning_rate": 5.720159662918451e-07, "loss": 0.84551102, "num_input_tokens_seen": 272834400, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.24145508, "step": 12649, "time_per_iteration": 2.7060465812683105 }, { "auxiliary_loss_clip": 0.01260904, "auxiliary_loss_mlp": 0.00256847, "balance_loss_clip": 1.04230392, "balance_loss_mlp": 0.23057294, "epoch": 0.7605591462498121, "flos": 25228036356480.0, "grad_norm": 7.298749613808355, "language_loss": 0.73971081, "learning_rate": 5.717433102763462e-07, "loss": 0.7548883, "num_input_tokens_seen": 272854760, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.26293945, "step": 12650, "time_per_iteration": 2.7326879501342773 }, { "auxiliary_loss_clip": 0.01113344, "auxiliary_loss_mlp": 0.00089957, "balance_loss_clip": 0.96237659, "balance_loss_mlp": 0.08270862, "epoch": 0.76061926950248, "flos": 66783757662720.0, "grad_norm": 0.7736981918977436, "language_loss": 0.62260449, "learning_rate": 5.714707084204838e-07, "loss": 0.63463748, "num_input_tokens_seen": 272919030, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.07226562, "step": 12651, "time_per_iteration": 3.1622626781463623 }, { "auxiliary_loss_clip": 0.01272256, "auxiliary_loss_mlp": 0.00262288, "balance_loss_clip": 1.04965115, "balance_loss_mlp": 0.23682462, "epoch": 0.7606793927551481, "flos": 25338354001920.0, "grad_norm": 4.646937430167028, "language_loss": 0.79676628, "learning_rate": 5.711981607345951e-07, "loss": 0.81211174, "num_input_tokens_seen": 272938925, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25488281, "step": 12652, "time_per_iteration": 2.73091721534729 }, { "auxiliary_loss_clip": 0.01281946, "auxiliary_loss_mlp": 0.0027673, "balance_loss_clip": 1.05346537, "balance_loss_mlp": 0.24910918, "epoch": 0.760739516007816, "flos": 18223624523520.0, "grad_norm": 201.46983676024675, "language_loss": 0.85167873, "learning_rate": 5.709256672290152e-07, "loss": 0.86726546, "num_input_tokens_seen": 272954945, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27636719, "step": 12653, "time_per_iteration": 2.602353811264038 }, { "auxiliary_loss_clip": 0.01278103, "auxiliary_loss_mlp": 0.00270844, "balance_loss_clip": 1.05287051, "balance_loss_mlp": 0.24507064, "epoch": 0.760799639260484, "flos": 22559119079040.0, "grad_norm": 41.408303239484404, "language_loss": 0.87368715, "learning_rate": 5.706532279140785e-07, "loss": 0.88917661, "num_input_tokens_seen": 272972855, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.25817871, "step": 12654, "time_per_iteration": 2.750601291656494 }, { "auxiliary_loss_clip": 0.01279952, "auxiliary_loss_mlp": 0.00273694, "balance_loss_clip": 1.05165052, "balance_loss_mlp": 0.24548939, "epoch": 0.760859762513152, "flos": 22309324922880.0, "grad_norm": 6.10955388938357, "language_loss": 0.87304151, "learning_rate": 5.703808428001136e-07, "loss": 0.88857806, "num_input_tokens_seen": 272989895, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.28173828, "step": 12655, "time_per_iteration": 2.694735288619995 }, { "auxiliary_loss_clip": 0.01265614, "auxiliary_loss_mlp": 0.00265036, "balance_loss_clip": 1.04605639, "balance_loss_mlp": 0.24088424, "epoch": 0.7609198857658199, "flos": 24863902231680.0, "grad_norm": 95.45395242506218, "language_loss": 0.74830914, "learning_rate": 5.701085118974505e-07, "loss": 0.76361567, "num_input_tokens_seen": 273011695, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.24157715, "step": 12656, "time_per_iteration": 2.7514352798461914 }, { "auxiliary_loss_clip": 0.0128328, "auxiliary_loss_mlp": 0.00295887, "balance_loss_clip": 1.05648553, "balance_loss_mlp": 0.26809952, "epoch": 0.760980009018488, "flos": 16836790366080.0, "grad_norm": 163.06514292778806, "language_loss": 0.82111639, "learning_rate": 5.698362352164164e-07, "loss": 0.83690804, "num_input_tokens_seen": 273028815, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.27807617, "step": 12657, "time_per_iteration": 2.6058833599090576 }, { "auxiliary_loss_clip": 0.01109289, "auxiliary_loss_mlp": 0.00074764, "balance_loss_clip": 0.95733702, "balance_loss_mlp": 0.06665795, "epoch": 0.7610401322711559, "flos": 61230603029760.0, "grad_norm": 0.8528212645030652, "language_loss": 0.64307958, "learning_rate": 5.695640127673347e-07, "loss": 0.6549201, "num_input_tokens_seen": 273084080, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.08105469, "step": 12658, "time_per_iteration": 3.1242079734802246 }, { "auxiliary_loss_clip": 0.01271993, "auxiliary_loss_mlp": 0.00258595, "balance_loss_clip": 1.0530467, "balance_loss_mlp": 0.23177341, "epoch": 0.7611002555238239, "flos": 19640730867840.0, "grad_norm": 9.664089974298758, "language_loss": 0.86907721, "learning_rate": 5.692918445605293e-07, "loss": 0.88438308, "num_input_tokens_seen": 273102295, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26831055, "step": 12659, "time_per_iteration": 2.716341257095337 }, { "auxiliary_loss_clip": 0.01259726, "auxiliary_loss_mlp": 0.00281491, "balance_loss_clip": 1.04040647, "balance_loss_mlp": 0.2549426, "epoch": 0.7611603787764918, "flos": 26872206526080.0, "grad_norm": 52.066370294333396, "language_loss": 0.75392377, "learning_rate": 5.690197306063209e-07, "loss": 0.76933599, "num_input_tokens_seen": 273123400, "router_z_loss_clip": 2.19433594, "router_z_loss_mlp": 0.26538086, "step": 12660, "time_per_iteration": 2.7115330696105957 }, { "auxiliary_loss_clip": 0.01276889, "auxiliary_loss_mlp": 0.00247881, "balance_loss_clip": 1.05101657, "balance_loss_mlp": 0.22178647, "epoch": 0.7612205020291598, "flos": 27344252085120.0, "grad_norm": 9.117382853551334, "language_loss": 0.7732867, "learning_rate": 5.687476709150281e-07, "loss": 0.7885344, "num_input_tokens_seen": 273145150, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.2611084, "step": 12661, "time_per_iteration": 2.695098876953125 }, { "auxiliary_loss_clip": 0.01265023, "auxiliary_loss_mlp": 0.00270499, "balance_loss_clip": 1.04531229, "balance_loss_mlp": 0.24360535, "epoch": 0.7612806252818277, "flos": 29314598682240.0, "grad_norm": 40.14297210166979, "language_loss": 0.88851571, "learning_rate": 5.68475665496966e-07, "loss": 0.90387082, "num_input_tokens_seen": 273165180, "router_z_loss_clip": 2.20019531, "router_z_loss_mlp": 0.26867676, "step": 12662, "time_per_iteration": 2.70656418800354 }, { "auxiliary_loss_clip": 0.01265797, "auxiliary_loss_mlp": 0.00271589, "balance_loss_clip": 1.04287803, "balance_loss_mlp": 0.24531564, "epoch": 0.7613407485344957, "flos": 19026048401280.0, "grad_norm": 60.62080104282488, "language_loss": 0.77893865, "learning_rate": 5.682037143624505e-07, "loss": 0.79431248, "num_input_tokens_seen": 273184005, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.26269531, "step": 12663, "time_per_iteration": 2.693967819213867 }, { "auxiliary_loss_clip": 0.01286552, "auxiliary_loss_mlp": 0.00248802, "balance_loss_clip": 1.06293273, "balance_loss_mlp": 0.22472212, "epoch": 0.7614008717871636, "flos": 23256037733760.0, "grad_norm": 3.598346751993354, "language_loss": 0.76048672, "learning_rate": 5.67931817521794e-07, "loss": 0.77584022, "num_input_tokens_seen": 273203565, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.2409668, "step": 12664, "time_per_iteration": 2.7175943851470947 }, { "auxiliary_loss_clip": 0.01300724, "auxiliary_loss_mlp": 0.0025871, "balance_loss_clip": 1.06969571, "balance_loss_mlp": 0.23318724, "epoch": 0.7614609950398317, "flos": 21579907438080.0, "grad_norm": 80.05537205254802, "language_loss": 0.86880851, "learning_rate": 5.676599749853066e-07, "loss": 0.88440287, "num_input_tokens_seen": 273221645, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.25537109, "step": 12665, "time_per_iteration": 2.83748459815979 }, { "auxiliary_loss_clip": 0.01284264, "auxiliary_loss_mlp": 0.00266606, "balance_loss_clip": 1.06446576, "balance_loss_mlp": 0.24136898, "epoch": 0.7615211182924996, "flos": 29277897960960.0, "grad_norm": 5.28276408912887, "language_loss": 0.94114959, "learning_rate": 5.673881867632959e-07, "loss": 0.95665824, "num_input_tokens_seen": 273242040, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25231934, "step": 12666, "time_per_iteration": 2.7577264308929443 }, { "auxiliary_loss_clip": 0.01276448, "auxiliary_loss_mlp": 0.00256307, "balance_loss_clip": 1.05534613, "balance_loss_mlp": 0.23126142, "epoch": 0.7615812415451676, "flos": 13261129136640.0, "grad_norm": 6.9619219239999, "language_loss": 0.90980184, "learning_rate": 5.671164528660693e-07, "loss": 0.92512941, "num_input_tokens_seen": 273257365, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25036621, "step": 12667, "time_per_iteration": 2.6422979831695557 }, { "auxiliary_loss_clip": 0.01261752, "auxiliary_loss_mlp": 0.00257461, "balance_loss_clip": 1.04759967, "balance_loss_mlp": 0.23230821, "epoch": 0.7616413647978356, "flos": 18584741905920.0, "grad_norm": 20.24145768387223, "language_loss": 0.85315037, "learning_rate": 5.668447733039296e-07, "loss": 0.86834246, "num_input_tokens_seen": 273274710, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.25183105, "step": 12668, "time_per_iteration": 2.6042826175689697 }, { "auxiliary_loss_clip": 0.01262361, "auxiliary_loss_mlp": 0.00260202, "balance_loss_clip": 1.04206777, "balance_loss_mlp": 0.23439345, "epoch": 0.7617014880505035, "flos": 18516188799360.0, "grad_norm": 405.1028242604791, "language_loss": 0.71782362, "learning_rate": 5.6657314808718e-07, "loss": 0.73304927, "num_input_tokens_seen": 273292870, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.25805664, "step": 12669, "time_per_iteration": 2.662238836288452 }, { "auxiliary_loss_clip": 0.01298241, "auxiliary_loss_mlp": 0.00265572, "balance_loss_clip": 1.06825781, "balance_loss_mlp": 0.24024013, "epoch": 0.7617616113031715, "flos": 24973178382720.0, "grad_norm": 141.2824557550159, "language_loss": 0.72387612, "learning_rate": 5.663015772261202e-07, "loss": 0.73951423, "num_input_tokens_seen": 273312375, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.25317383, "step": 12670, "time_per_iteration": 2.6809232234954834 }, { "auxiliary_loss_clip": 0.01285112, "auxiliary_loss_mlp": 0.00258062, "balance_loss_clip": 1.05987263, "balance_loss_mlp": 0.23264615, "epoch": 0.7618217345558395, "flos": 23295036925440.0, "grad_norm": 8.908977739018141, "language_loss": 0.79546922, "learning_rate": 5.660300607310493e-07, "loss": 0.81090099, "num_input_tokens_seen": 273332590, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.25402832, "step": 12671, "time_per_iteration": 2.718287706375122 }, { "auxiliary_loss_clip": 0.01270738, "auxiliary_loss_mlp": 0.00275106, "balance_loss_clip": 1.04946101, "balance_loss_mlp": 0.24957107, "epoch": 0.7618818578085075, "flos": 25482894330240.0, "grad_norm": 14.662966719418453, "language_loss": 0.78192908, "learning_rate": 5.657585986122613e-07, "loss": 0.79738754, "num_input_tokens_seen": 273352885, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.25537109, "step": 12672, "time_per_iteration": 2.724764108657837 }, { "auxiliary_loss_clip": 0.01111847, "auxiliary_loss_mlp": 0.00090118, "balance_loss_clip": 0.96094966, "balance_loss_mlp": 0.08201206, "epoch": 0.7619419810611754, "flos": 61151994115200.0, "grad_norm": 0.7369080867025939, "language_loss": 0.55961502, "learning_rate": 5.654871908800506e-07, "loss": 0.57163465, "num_input_tokens_seen": 273411730, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.08105469, "step": 12673, "time_per_iteration": 3.147416114807129 }, { "auxiliary_loss_clip": 0.01289936, "auxiliary_loss_mlp": 0.00284863, "balance_loss_clip": 1.05865884, "balance_loss_mlp": 0.2559672, "epoch": 0.7620021043138434, "flos": 23258659426560.0, "grad_norm": 7.194931304048916, "language_loss": 0.82926953, "learning_rate": 5.652158375447102e-07, "loss": 0.84501755, "num_input_tokens_seen": 273430020, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.28918457, "step": 12674, "time_per_iteration": 2.6847071647644043 }, { "auxiliary_loss_clip": 0.01264105, "auxiliary_loss_mlp": 0.0025168, "balance_loss_clip": 1.05056691, "balance_loss_mlp": 0.22812402, "epoch": 0.7620622275665113, "flos": 25082490447360.0, "grad_norm": 32.151163972297255, "language_loss": 0.81374478, "learning_rate": 5.649445386165286e-07, "loss": 0.8289026, "num_input_tokens_seen": 273448690, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.23535156, "step": 12675, "time_per_iteration": 2.6854469776153564 }, { "auxiliary_loss_clip": 0.01271661, "auxiliary_loss_mlp": 0.00278195, "balance_loss_clip": 1.05157578, "balance_loss_mlp": 0.25165898, "epoch": 0.7621223508191793, "flos": 20155007842560.0, "grad_norm": 21.91298783165497, "language_loss": 0.79442024, "learning_rate": 5.646732941057936e-07, "loss": 0.80991882, "num_input_tokens_seen": 273465190, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.26513672, "step": 12676, "time_per_iteration": 2.6442995071411133 }, { "auxiliary_loss_clip": 0.01309317, "auxiliary_loss_mlp": 0.00262022, "balance_loss_clip": 1.07254887, "balance_loss_mlp": 0.23461571, "epoch": 0.7621824740718472, "flos": 18000187971840.0, "grad_norm": 11.096268159823534, "language_loss": 0.686059, "learning_rate": 5.644021040227927e-07, "loss": 0.70177239, "num_input_tokens_seen": 273478620, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.27404785, "step": 12677, "time_per_iteration": 2.609644651412964 }, { "auxiliary_loss_clip": 0.01289093, "auxiliary_loss_mlp": 0.00258249, "balance_loss_clip": 1.06207156, "balance_loss_mlp": 0.23375136, "epoch": 0.7622425973245153, "flos": 21725668828800.0, "grad_norm": 22.316303070910823, "language_loss": 0.87141776, "learning_rate": 5.641309683778064e-07, "loss": 0.88689125, "num_input_tokens_seen": 273497635, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.24487305, "step": 12678, "time_per_iteration": 2.686627149581909 }, { "auxiliary_loss_clip": 0.01263653, "auxiliary_loss_mlp": 0.0022637, "balance_loss_clip": 1.043823, "balance_loss_mlp": 0.20207542, "epoch": 0.7623027205771832, "flos": 19718549683200.0, "grad_norm": 107.04999422075765, "language_loss": 0.84730542, "learning_rate": 5.638598871811175e-07, "loss": 0.86220568, "num_input_tokens_seen": 273513955, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.24316406, "step": 12679, "time_per_iteration": 4.0583176612854 }, { "auxiliary_loss_clip": 0.01266933, "auxiliary_loss_mlp": 0.00243007, "balance_loss_clip": 1.04432011, "balance_loss_mlp": 0.21655481, "epoch": 0.7623628438298512, "flos": 23988831096960.0, "grad_norm": 18.349422831314328, "language_loss": 0.85801864, "learning_rate": 5.635888604430059e-07, "loss": 0.87311804, "num_input_tokens_seen": 273533970, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.26452637, "step": 12680, "time_per_iteration": 2.6646339893341064 }, { "auxiliary_loss_clip": 0.01288256, "auxiliary_loss_mlp": 0.00250237, "balance_loss_clip": 1.06416309, "balance_loss_mlp": 0.22576348, "epoch": 0.7624229670825191, "flos": 22345702421760.0, "grad_norm": 6.95107515506403, "language_loss": 0.73563689, "learning_rate": 5.633178881737493e-07, "loss": 0.75102186, "num_input_tokens_seen": 273553090, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.24487305, "step": 12681, "time_per_iteration": 4.145059108734131 }, { "auxiliary_loss_clip": 0.0126622, "auxiliary_loss_mlp": 0.00245147, "balance_loss_clip": 1.04822731, "balance_loss_mlp": 0.21827716, "epoch": 0.7624830903351871, "flos": 22711775880960.0, "grad_norm": 323.65011493987265, "language_loss": 0.83983397, "learning_rate": 5.63046970383622e-07, "loss": 0.85494757, "num_input_tokens_seen": 273572460, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.26879883, "step": 12682, "time_per_iteration": 2.6562650203704834 }, { "auxiliary_loss_clip": 0.01275874, "auxiliary_loss_mlp": 0.00238138, "balance_loss_clip": 1.05261517, "balance_loss_mlp": 0.21352145, "epoch": 0.7625432135878552, "flos": 25593714766080.0, "grad_norm": 5.722497589168924, "language_loss": 0.75437444, "learning_rate": 5.627761070828974e-07, "loss": 0.76951456, "num_input_tokens_seen": 273592815, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.24633789, "step": 12683, "time_per_iteration": 2.684774160385132 }, { "auxiliary_loss_clip": 0.01264959, "auxiliary_loss_mlp": 0.00272959, "balance_loss_clip": 1.04152226, "balance_loss_mlp": 0.24550475, "epoch": 0.7626033368405231, "flos": 23987645948160.0, "grad_norm": 4.2393519134954625, "language_loss": 0.90975869, "learning_rate": 5.625052982818472e-07, "loss": 0.92513788, "num_input_tokens_seen": 273611790, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.2746582, "step": 12684, "time_per_iteration": 2.695608139038086 }, { "auxiliary_loss_clip": 0.01269426, "auxiliary_loss_mlp": 0.00236577, "balance_loss_clip": 1.05039001, "balance_loss_mlp": 0.2122938, "epoch": 0.7626634600931911, "flos": 12599115523200.0, "grad_norm": 34.77029736604417, "language_loss": 0.90014654, "learning_rate": 5.622345439907396e-07, "loss": 0.91520655, "num_input_tokens_seen": 273628340, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24267578, "step": 12685, "time_per_iteration": 4.18950343132019 }, { "auxiliary_loss_clip": 0.01286823, "auxiliary_loss_mlp": 0.00241017, "balance_loss_clip": 1.06376743, "balance_loss_mlp": 0.21600702, "epoch": 0.762723583345859, "flos": 26322593546880.0, "grad_norm": 8.987446404860817, "language_loss": 0.84869647, "learning_rate": 5.619638442198422e-07, "loss": 0.86397481, "num_input_tokens_seen": 273646585, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.25012207, "step": 12686, "time_per_iteration": 2.6924245357513428 }, { "auxiliary_loss_clip": 0.01276487, "auxiliary_loss_mlp": 0.00244735, "balance_loss_clip": 1.04911041, "balance_loss_mlp": 0.2182703, "epoch": 0.762783706598527, "flos": 21907053532800.0, "grad_norm": 502.954820396623, "language_loss": 0.81199074, "learning_rate": 5.616931989794198e-07, "loss": 0.82720292, "num_input_tokens_seen": 273665410, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.26477051, "step": 12687, "time_per_iteration": 2.6972367763519287 }, { "auxiliary_loss_clip": 0.01273548, "auxiliary_loss_mlp": 0.00235935, "balance_loss_clip": 1.05090976, "balance_loss_mlp": 0.21088867, "epoch": 0.7628438298511949, "flos": 15339782217600.0, "grad_norm": 14.665675381086281, "language_loss": 0.74053788, "learning_rate": 5.614226082797369e-07, "loss": 0.7556327, "num_input_tokens_seen": 273683035, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25061035, "step": 12688, "time_per_iteration": 2.6413238048553467 }, { "auxiliary_loss_clip": 0.01267946, "auxiliary_loss_mlp": 0.00239779, "balance_loss_clip": 1.0488596, "balance_loss_mlp": 0.21504264, "epoch": 0.7629039531038629, "flos": 13006307076480.0, "grad_norm": 23.48138802857968, "language_loss": 0.78331769, "learning_rate": 5.611520721310515e-07, "loss": 0.79839498, "num_input_tokens_seen": 273700130, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24768066, "step": 12689, "time_per_iteration": 4.041190147399902 }, { "auxiliary_loss_clip": 0.01290872, "auxiliary_loss_mlp": 0.00264731, "balance_loss_clip": 1.05914736, "balance_loss_mlp": 0.23719397, "epoch": 0.7629640763565309, "flos": 26171660597760.0, "grad_norm": 49.167961375443426, "language_loss": 0.78509408, "learning_rate": 5.608815905436238e-07, "loss": 0.80065012, "num_input_tokens_seen": 273720310, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.27539062, "step": 12690, "time_per_iteration": 2.721982717514038 }, { "auxiliary_loss_clip": 0.0128425, "auxiliary_loss_mlp": 0.00271096, "balance_loss_clip": 1.05851793, "balance_loss_mlp": 0.24341583, "epoch": 0.7630241996091989, "flos": 36793713680640.0, "grad_norm": 9.449557159645515, "language_loss": 0.76459599, "learning_rate": 5.606111635277109e-07, "loss": 0.78014934, "num_input_tokens_seen": 273744475, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.27697754, "step": 12691, "time_per_iteration": 2.90408992767334 }, { "auxiliary_loss_clip": 0.01278683, "auxiliary_loss_mlp": 0.00244073, "balance_loss_clip": 1.05329657, "balance_loss_mlp": 0.2170368, "epoch": 0.7630843228618668, "flos": 21835160461440.0, "grad_norm": 32.35369533795213, "language_loss": 0.92016912, "learning_rate": 5.603407910935662e-07, "loss": 0.93539667, "num_input_tokens_seen": 273764635, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.27026367, "step": 12692, "time_per_iteration": 2.704926013946533 }, { "auxiliary_loss_clip": 0.01283587, "auxiliary_loss_mlp": 0.00219272, "balance_loss_clip": 1.05760288, "balance_loss_mlp": 0.19629988, "epoch": 0.7631444461145348, "flos": 12640520926080.0, "grad_norm": 84.43506609685507, "language_loss": 0.88809896, "learning_rate": 5.600704732514438e-07, "loss": 0.90312755, "num_input_tokens_seen": 273780115, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.22961426, "step": 12693, "time_per_iteration": 2.6399025917053223 }, { "auxiliary_loss_clip": 0.01308596, "auxiliary_loss_mlp": 0.00242748, "balance_loss_clip": 1.07464433, "balance_loss_mlp": 0.21500841, "epoch": 0.7632045693672027, "flos": 16836610798080.0, "grad_norm": 4.616658233314432, "language_loss": 0.83732796, "learning_rate": 5.598002100115933e-07, "loss": 0.85284138, "num_input_tokens_seen": 273796605, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.27734375, "step": 12694, "time_per_iteration": 2.7257773876190186 }, { "auxiliary_loss_clip": 0.01270864, "auxiliary_loss_mlp": 0.00243649, "balance_loss_clip": 1.04878175, "balance_loss_mlp": 0.21818569, "epoch": 0.7632646926198707, "flos": 22017335264640.0, "grad_norm": 19.912793942683418, "language_loss": 0.77847421, "learning_rate": 5.595300013842625e-07, "loss": 0.79361939, "num_input_tokens_seen": 273816515, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25488281, "step": 12695, "time_per_iteration": 2.6410582065582275 }, { "auxiliary_loss_clip": 0.01275267, "auxiliary_loss_mlp": 0.00243775, "balance_loss_clip": 1.05210817, "balance_loss_mlp": 0.21924129, "epoch": 0.7633248158725388, "flos": 23114011357440.0, "grad_norm": 55.45335177371007, "language_loss": 0.8049857, "learning_rate": 5.592598473796985e-07, "loss": 0.82017612, "num_input_tokens_seen": 273837060, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.24536133, "step": 12696, "time_per_iteration": 2.6885006427764893 }, { "auxiliary_loss_clip": 0.01282975, "auxiliary_loss_mlp": 0.00248455, "balance_loss_clip": 1.05732501, "balance_loss_mlp": 0.22180003, "epoch": 0.7633849391252067, "flos": 10889839952640.0, "grad_norm": 10.786900769057517, "language_loss": 0.81736147, "learning_rate": 5.589897480081453e-07, "loss": 0.83267581, "num_input_tokens_seen": 273853365, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26623535, "step": 12697, "time_per_iteration": 2.6312382221221924 }, { "auxiliary_loss_clip": 0.01274418, "auxiliary_loss_mlp": 0.00245323, "balance_loss_clip": 1.058375, "balance_loss_mlp": 0.22022918, "epoch": 0.7634450623778747, "flos": 20994168355200.0, "grad_norm": 3.09226679857225, "language_loss": 0.75251657, "learning_rate": 5.587197032798461e-07, "loss": 0.76771396, "num_input_tokens_seen": 273870750, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.25097656, "step": 12698, "time_per_iteration": 2.6544764041900635 }, { "auxiliary_loss_clip": 0.01272114, "auxiliary_loss_mlp": 0.00272343, "balance_loss_clip": 1.04830515, "balance_loss_mlp": 0.24575932, "epoch": 0.7635051856305426, "flos": 18882046776960.0, "grad_norm": 10.136195151070405, "language_loss": 0.80456507, "learning_rate": 5.5844971320504e-07, "loss": 0.82000959, "num_input_tokens_seen": 273890890, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.26550293, "step": 12699, "time_per_iteration": 2.6564512252807617 }, { "auxiliary_loss_clip": 0.01271575, "auxiliary_loss_mlp": 0.00253628, "balance_loss_clip": 1.05176628, "balance_loss_mlp": 0.22789089, "epoch": 0.7635653088832106, "flos": 34786989584640.0, "grad_norm": 11.121182999298837, "language_loss": 0.79414809, "learning_rate": 5.581797777939648e-07, "loss": 0.80940008, "num_input_tokens_seen": 273914015, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25756836, "step": 12700, "time_per_iteration": 2.771686315536499 }, { "auxiliary_loss_clip": 0.01290633, "auxiliary_loss_mlp": 0.00250093, "balance_loss_clip": 1.06334662, "balance_loss_mlp": 0.22263895, "epoch": 0.7636254321358785, "flos": 23178434400000.0, "grad_norm": 615.6056670487602, "language_loss": 0.77938581, "learning_rate": 5.579098970568574e-07, "loss": 0.79479301, "num_input_tokens_seen": 273927415, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.27453613, "step": 12701, "time_per_iteration": 2.627302408218384 }, { "auxiliary_loss_clip": 0.01276414, "auxiliary_loss_mlp": 0.00235115, "balance_loss_clip": 1.04677916, "balance_loss_mlp": 0.20747024, "epoch": 0.7636855553885465, "flos": 21325229032320.0, "grad_norm": 17.529886644067208, "language_loss": 0.73591101, "learning_rate": 5.576400710039508e-07, "loss": 0.75102627, "num_input_tokens_seen": 273946690, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.27661133, "step": 12702, "time_per_iteration": 2.6611268520355225 }, { "auxiliary_loss_clip": 0.012812, "auxiliary_loss_mlp": 0.00238816, "balance_loss_clip": 1.05792093, "balance_loss_mlp": 0.21150544, "epoch": 0.7637456786412145, "flos": 28658079849600.0, "grad_norm": 10.753575628267152, "language_loss": 0.74613917, "learning_rate": 5.57370299645477e-07, "loss": 0.76133937, "num_input_tokens_seen": 273966870, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.27282715, "step": 12703, "time_per_iteration": 2.744915008544922 }, { "auxiliary_loss_clip": 0.01269462, "auxiliary_loss_mlp": 0.00259686, "balance_loss_clip": 1.04492784, "balance_loss_mlp": 0.23294729, "epoch": 0.7638058018938825, "flos": 21907269014400.0, "grad_norm": 5.375119436500927, "language_loss": 0.90944815, "learning_rate": 5.571005829916668e-07, "loss": 0.9247396, "num_input_tokens_seen": 273986360, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26745605, "step": 12704, "time_per_iteration": 2.654968500137329 }, { "auxiliary_loss_clip": 0.01262819, "auxiliary_loss_mlp": 0.00228144, "balance_loss_clip": 1.03703642, "balance_loss_mlp": 0.20359899, "epoch": 0.7638659251465504, "flos": 29643899592960.0, "grad_norm": 1084.79590878452, "language_loss": 0.73452079, "learning_rate": 5.568309210527469e-07, "loss": 0.74943042, "num_input_tokens_seen": 274009745, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.24572754, "step": 12705, "time_per_iteration": 2.796926736831665 }, { "auxiliary_loss_clip": 0.01285243, "auxiliary_loss_mlp": 0.00247589, "balance_loss_clip": 1.05447924, "balance_loss_mlp": 0.22098112, "epoch": 0.7639260483992184, "flos": 26141172929280.0, "grad_norm": 2.027374859401982, "language_loss": 0.82405746, "learning_rate": 5.565613138389427e-07, "loss": 0.83938575, "num_input_tokens_seen": 274028775, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.26611328, "step": 12706, "time_per_iteration": 2.696748733520508 }, { "auxiliary_loss_clip": 0.01283742, "auxiliary_loss_mlp": 0.00225739, "balance_loss_clip": 1.05814266, "balance_loss_mlp": 0.20050268, "epoch": 0.7639861716518863, "flos": 20156695781760.0, "grad_norm": 27.113752520732504, "language_loss": 0.86145842, "learning_rate": 5.562917613604781e-07, "loss": 0.8765533, "num_input_tokens_seen": 274047520, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.25256348, "step": 12707, "time_per_iteration": 2.668889045715332 }, { "auxiliary_loss_clip": 0.01279369, "auxiliary_loss_mlp": 0.00233706, "balance_loss_clip": 1.05518937, "balance_loss_mlp": 0.20821877, "epoch": 0.7640462949045543, "flos": 18583125793920.0, "grad_norm": 56.16637415276084, "language_loss": 0.86474192, "learning_rate": 5.560222636275751e-07, "loss": 0.87987268, "num_input_tokens_seen": 274065350, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.25476074, "step": 12708, "time_per_iteration": 2.62629771232605 }, { "auxiliary_loss_clip": 0.01110695, "auxiliary_loss_mlp": 0.00060206, "balance_loss_clip": 0.95458335, "balance_loss_mlp": 0.0542698, "epoch": 0.7641064181572224, "flos": 68321991646080.0, "grad_norm": 0.8679497125263992, "language_loss": 0.55385458, "learning_rate": 5.557528206504521e-07, "loss": 0.56556368, "num_input_tokens_seen": 274122315, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.05932617, "step": 12709, "time_per_iteration": 3.225857734680176 }, { "auxiliary_loss_clip": 0.01286009, "auxiliary_loss_mlp": 0.00269587, "balance_loss_clip": 1.05525255, "balance_loss_mlp": 0.24233535, "epoch": 0.7641665414098903, "flos": 17968982031360.0, "grad_norm": 1562.6074145075663, "language_loss": 0.73497874, "learning_rate": 5.554834324393271e-07, "loss": 0.75053465, "num_input_tokens_seen": 274140555, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.27233887, "step": 12710, "time_per_iteration": 2.621039867401123 }, { "auxiliary_loss_clip": 0.01296989, "auxiliary_loss_mlp": 0.00262345, "balance_loss_clip": 1.0657177, "balance_loss_mlp": 0.23324564, "epoch": 0.7642266646625583, "flos": 21252078984960.0, "grad_norm": 5.014917441913007, "language_loss": 0.77292621, "learning_rate": 5.552140990044154e-07, "loss": 0.78851956, "num_input_tokens_seen": 274161125, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.2911377, "step": 12711, "time_per_iteration": 2.7208259105682373 }, { "auxiliary_loss_clip": 0.01272561, "auxiliary_loss_mlp": 0.00241193, "balance_loss_clip": 1.04996061, "balance_loss_mlp": 0.21263005, "epoch": 0.7642867879152262, "flos": 22747794243840.0, "grad_norm": 44.25669233893523, "language_loss": 0.7947675, "learning_rate": 5.549448203559293e-07, "loss": 0.80990505, "num_input_tokens_seen": 274180835, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.28527832, "step": 12712, "time_per_iteration": 2.743647575378418 }, { "auxiliary_loss_clip": 0.01278613, "auxiliary_loss_mlp": 0.0023695, "balance_loss_clip": 1.05612421, "balance_loss_mlp": 0.21154688, "epoch": 0.7643469111678942, "flos": 23332132696320.0, "grad_norm": 7.590128367457276, "language_loss": 0.88161051, "learning_rate": 5.546755965040804e-07, "loss": 0.89676613, "num_input_tokens_seen": 274201190, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25415039, "step": 12713, "time_per_iteration": 2.7385001182556152 }, { "auxiliary_loss_clip": 0.0129941, "auxiliary_loss_mlp": 0.00242214, "balance_loss_clip": 1.06469238, "balance_loss_mlp": 0.21330589, "epoch": 0.7644070344205621, "flos": 19857092440320.0, "grad_norm": 82.01749561819447, "language_loss": 0.91353804, "learning_rate": 5.544064274590776e-07, "loss": 0.9289543, "num_input_tokens_seen": 274217595, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.2890625, "step": 12714, "time_per_iteration": 2.7090137004852295 }, { "auxiliary_loss_clip": 0.01275007, "auxiliary_loss_mlp": 0.00243453, "balance_loss_clip": 1.04885674, "balance_loss_mlp": 0.21694098, "epoch": 0.7644671576732301, "flos": 22090628966400.0, "grad_norm": 38.02727192804331, "language_loss": 0.81575394, "learning_rate": 5.541373132311287e-07, "loss": 0.83093858, "num_input_tokens_seen": 274237885, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26525879, "step": 12715, "time_per_iteration": 2.6875364780426025 }, { "auxiliary_loss_clip": 0.01259474, "auxiliary_loss_mlp": 0.00259965, "balance_loss_clip": 1.03612244, "balance_loss_mlp": 0.2340486, "epoch": 0.7645272809258981, "flos": 25481421872640.0, "grad_norm": 2.2656378483611017, "language_loss": 0.71774685, "learning_rate": 5.538682538304376e-07, "loss": 0.73294127, "num_input_tokens_seen": 274258820, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.25915527, "step": 12716, "time_per_iteration": 2.7384514808654785 }, { "auxiliary_loss_clip": 0.01287503, "auxiliary_loss_mlp": 0.00253005, "balance_loss_clip": 1.0624764, "balance_loss_mlp": 0.22695763, "epoch": 0.7645874041785661, "flos": 21541877913600.0, "grad_norm": 11.324470998960981, "language_loss": 0.87118495, "learning_rate": 5.535992492672068e-07, "loss": 0.88659, "num_input_tokens_seen": 274278835, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.26086426, "step": 12717, "time_per_iteration": 2.7705345153808594 }, { "auxiliary_loss_clip": 0.0125312, "auxiliary_loss_mlp": 0.0022754, "balance_loss_clip": 1.03434587, "balance_loss_mlp": 0.2047708, "epoch": 0.764647527431234, "flos": 20630896156800.0, "grad_norm": 67.24863182386605, "language_loss": 0.77688497, "learning_rate": 5.53330299551638e-07, "loss": 0.79169154, "num_input_tokens_seen": 274297110, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.2277832, "step": 12718, "time_per_iteration": 2.7335708141326904 }, { "auxiliary_loss_clip": 0.01267337, "auxiliary_loss_mlp": 0.00243088, "balance_loss_clip": 1.04746997, "balance_loss_mlp": 0.21650398, "epoch": 0.764707650683902, "flos": 21434074220160.0, "grad_norm": 65.99842473659072, "language_loss": 0.88414466, "learning_rate": 5.530614046939286e-07, "loss": 0.89924884, "num_input_tokens_seen": 274315610, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.26611328, "step": 12719, "time_per_iteration": 2.737826108932495 }, { "auxiliary_loss_clip": 0.01280435, "auxiliary_loss_mlp": 0.00239035, "balance_loss_clip": 1.05423522, "balance_loss_mlp": 0.21299939, "epoch": 0.7647677739365699, "flos": 22711201263360.0, "grad_norm": 15.414486890632242, "language_loss": 0.77339286, "learning_rate": 5.527925647042754e-07, "loss": 0.78858757, "num_input_tokens_seen": 274333975, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26049805, "step": 12720, "time_per_iteration": 2.7290849685668945 }, { "auxiliary_loss_clip": 0.01292575, "auxiliary_loss_mlp": 0.00234336, "balance_loss_clip": 1.06210136, "balance_loss_mlp": 0.20890856, "epoch": 0.7648278971892379, "flos": 21324115710720.0, "grad_norm": 124.49053410507939, "language_loss": 0.79750955, "learning_rate": 5.52523779592875e-07, "loss": 0.81277865, "num_input_tokens_seen": 274353695, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.25402832, "step": 12721, "time_per_iteration": 4.193897724151611 }, { "auxiliary_loss_clip": 0.01270147, "auxiliary_loss_mlp": 0.00235622, "balance_loss_clip": 1.04014158, "balance_loss_mlp": 0.20759566, "epoch": 0.764888020441906, "flos": 20667345482880.0, "grad_norm": 786.9545835555739, "language_loss": 0.82688308, "learning_rate": 5.522550493699163e-07, "loss": 0.84194082, "num_input_tokens_seen": 274371120, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.28015137, "step": 12722, "time_per_iteration": 2.6784377098083496 }, { "auxiliary_loss_clip": 0.01249505, "auxiliary_loss_mlp": 0.00242784, "balance_loss_clip": 1.03159904, "balance_loss_mlp": 0.217154, "epoch": 0.7649481436945739, "flos": 25082526360960.0, "grad_norm": 3.157824594671958, "language_loss": 0.81988513, "learning_rate": 5.519863740455912e-07, "loss": 0.83480799, "num_input_tokens_seen": 274389665, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25610352, "step": 12723, "time_per_iteration": 4.115354776382446 }, { "auxiliary_loss_clip": 0.01277163, "auxiliary_loss_mlp": 0.00246097, "balance_loss_clip": 1.04906011, "balance_loss_mlp": 0.21919173, "epoch": 0.7650082669472419, "flos": 24900890261760.0, "grad_norm": 26.635030133750274, "language_loss": 0.79632115, "learning_rate": 5.517177536300881e-07, "loss": 0.81155372, "num_input_tokens_seen": 274408750, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.26953125, "step": 12724, "time_per_iteration": 2.7480392456054688 }, { "auxiliary_loss_clip": 0.01256614, "auxiliary_loss_mlp": 0.00241887, "balance_loss_clip": 1.04081035, "balance_loss_mlp": 0.21708001, "epoch": 0.7650683901999098, "flos": 14647388676480.0, "grad_norm": 12.331624723612606, "language_loss": 0.90284145, "learning_rate": 5.514491881335935e-07, "loss": 0.91782641, "num_input_tokens_seen": 274424600, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.2479248, "step": 12725, "time_per_iteration": 2.5969669818878174 }, { "auxiliary_loss_clip": 0.01256535, "auxiliary_loss_mlp": 0.00241947, "balance_loss_clip": 1.03298926, "balance_loss_mlp": 0.21607873, "epoch": 0.7651285134525778, "flos": 26352434770560.0, "grad_norm": 5.319726394561995, "language_loss": 0.87116241, "learning_rate": 5.511806775662901e-07, "loss": 0.88614726, "num_input_tokens_seen": 274443075, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25878906, "step": 12726, "time_per_iteration": 2.736837148666382 }, { "auxiliary_loss_clip": 0.0128045, "auxiliary_loss_mlp": 0.00232887, "balance_loss_clip": 1.05383837, "balance_loss_mlp": 0.20849666, "epoch": 0.7651886367052457, "flos": 26646866553600.0, "grad_norm": 24.876471836161723, "language_loss": 0.77201301, "learning_rate": 5.509122219383615e-07, "loss": 0.78714633, "num_input_tokens_seen": 274463240, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.24401855, "step": 12727, "time_per_iteration": 4.272839784622192 }, { "auxiliary_loss_clip": 0.01235149, "auxiliary_loss_mlp": 0.0024854, "balance_loss_clip": 1.02609611, "balance_loss_mlp": 0.22358923, "epoch": 0.7652487599579137, "flos": 25702847262720.0, "grad_norm": 5.938427740939801, "language_loss": 0.86825329, "learning_rate": 5.506438212599864e-07, "loss": 0.8830902, "num_input_tokens_seen": 274482750, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.24938965, "step": 12728, "time_per_iteration": 2.6754238605499268 }, { "auxiliary_loss_clip": 0.01294576, "auxiliary_loss_mlp": 0.00268453, "balance_loss_clip": 1.06279206, "balance_loss_mlp": 0.24049857, "epoch": 0.7653088832105817, "flos": 28585576247040.0, "grad_norm": 5.1412127835635975, "language_loss": 0.6720134, "learning_rate": 5.503754755413424e-07, "loss": 0.68764365, "num_input_tokens_seen": 274503545, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.27941895, "step": 12729, "time_per_iteration": 2.7920708656311035 }, { "auxiliary_loss_clip": 0.01258531, "auxiliary_loss_mlp": 0.00239111, "balance_loss_clip": 1.04177094, "balance_loss_mlp": 0.21419594, "epoch": 0.7653690064632497, "flos": 23366750428800.0, "grad_norm": 1.6766076977020532, "language_loss": 0.85918164, "learning_rate": 5.501071847926055e-07, "loss": 0.87415808, "num_input_tokens_seen": 274523825, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.24926758, "step": 12730, "time_per_iteration": 2.67775297164917 }, { "auxiliary_loss_clip": 0.01269265, "auxiliary_loss_mlp": 0.00233231, "balance_loss_clip": 1.04637349, "balance_loss_mlp": 0.20655197, "epoch": 0.7654291297159176, "flos": 15773905992960.0, "grad_norm": 40.693625287112404, "language_loss": 0.79025698, "learning_rate": 5.498389490239495e-07, "loss": 0.80528188, "num_input_tokens_seen": 274541625, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26660156, "step": 12731, "time_per_iteration": 2.675168752670288 }, { "auxiliary_loss_clip": 0.01271209, "auxiliary_loss_mlp": 0.00243559, "balance_loss_clip": 1.04941106, "balance_loss_mlp": 0.21838206, "epoch": 0.7654892529685856, "flos": 18033800123520.0, "grad_norm": 22.84271856590658, "language_loss": 0.79287505, "learning_rate": 5.495707682455471e-07, "loss": 0.80802274, "num_input_tokens_seen": 274557580, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25158691, "step": 12732, "time_per_iteration": 4.00103497505188 }, { "auxiliary_loss_clip": 0.01261296, "auxiliary_loss_mlp": 0.00265011, "balance_loss_clip": 1.04081023, "balance_loss_mlp": 0.23901157, "epoch": 0.7655493762212535, "flos": 27236017428480.0, "grad_norm": 380.5987507806138, "language_loss": 0.84754133, "learning_rate": 5.493026424675653e-07, "loss": 0.86280435, "num_input_tokens_seen": 274578135, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.26037598, "step": 12733, "time_per_iteration": 2.6978349685668945 }, { "auxiliary_loss_clip": 0.01262183, "auxiliary_loss_mlp": 0.00218941, "balance_loss_clip": 1.04018617, "balance_loss_mlp": 0.19415781, "epoch": 0.7656094994739215, "flos": 20773964027520.0, "grad_norm": 13.649359085294176, "language_loss": 0.83450544, "learning_rate": 5.490345717001726e-07, "loss": 0.8493166, "num_input_tokens_seen": 274595655, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24780273, "step": 12734, "time_per_iteration": 2.7412548065185547 }, { "auxiliary_loss_clip": 0.01290823, "auxiliary_loss_mlp": 0.00241746, "balance_loss_clip": 1.05789089, "balance_loss_mlp": 0.21372037, "epoch": 0.7656696227265896, "flos": 23039245198080.0, "grad_norm": 1166.7471497503159, "language_loss": 0.84016854, "learning_rate": 5.48766555953535e-07, "loss": 0.85549426, "num_input_tokens_seen": 274616305, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.28039551, "step": 12735, "time_per_iteration": 2.7041659355163574 }, { "auxiliary_loss_clip": 0.01250975, "auxiliary_loss_mlp": 0.00258053, "balance_loss_clip": 1.03314424, "balance_loss_mlp": 0.23170736, "epoch": 0.7657297459792575, "flos": 27525636789120.0, "grad_norm": 33.62679524328525, "language_loss": 0.78910881, "learning_rate": 5.484985952378145e-07, "loss": 0.8041991, "num_input_tokens_seen": 274638110, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.26342773, "step": 12736, "time_per_iteration": 2.7407126426696777 }, { "auxiliary_loss_clip": 0.01291567, "auxiliary_loss_mlp": 0.00266699, "balance_loss_clip": 1.05670094, "balance_loss_mlp": 0.23783794, "epoch": 0.7657898692319255, "flos": 17128456801920.0, "grad_norm": 18.022030605774834, "language_loss": 0.85962236, "learning_rate": 5.482306895631728e-07, "loss": 0.87520504, "num_input_tokens_seen": 274656565, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.28845215, "step": 12737, "time_per_iteration": 2.650926113128662 }, { "auxiliary_loss_clip": 0.01257372, "auxiliary_loss_mlp": 0.00255671, "balance_loss_clip": 1.0319289, "balance_loss_mlp": 0.22819373, "epoch": 0.7658499924845934, "flos": 21465747037440.0, "grad_norm": 8.187678095109984, "language_loss": 0.84014106, "learning_rate": 5.479628389397699e-07, "loss": 0.85527146, "num_input_tokens_seen": 274674215, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.2746582, "step": 12738, "time_per_iteration": 2.698997974395752 }, { "auxiliary_loss_clip": 0.0128569, "auxiliary_loss_mlp": 0.00240278, "balance_loss_clip": 1.05474877, "balance_loss_mlp": 0.21260919, "epoch": 0.7659101157372614, "flos": 29496665744640.0, "grad_norm": 7.275680849010905, "language_loss": 0.72027612, "learning_rate": 5.476950433777603e-07, "loss": 0.7355358, "num_input_tokens_seen": 274693445, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.2767334, "step": 12739, "time_per_iteration": 2.6929633617401123 }, { "auxiliary_loss_clip": 0.01268896, "auxiliary_loss_mlp": 0.00257463, "balance_loss_clip": 1.04310226, "balance_loss_mlp": 0.22966355, "epoch": 0.7659702389899293, "flos": 18551812112640.0, "grad_norm": 4.721233462991793, "language_loss": 0.91074562, "learning_rate": 5.474273028873004e-07, "loss": 0.92600918, "num_input_tokens_seen": 274712815, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.27783203, "step": 12740, "time_per_iteration": 2.6642727851867676 }, { "auxiliary_loss_clip": 0.01280059, "auxiliary_loss_mlp": 0.00241045, "balance_loss_clip": 1.05366015, "balance_loss_mlp": 0.21464038, "epoch": 0.7660303622425974, "flos": 23549176627200.0, "grad_norm": 2.5096109225099874, "language_loss": 0.73922372, "learning_rate": 5.471596174785429e-07, "loss": 0.75443476, "num_input_tokens_seen": 274732690, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.26416016, "step": 12741, "time_per_iteration": 2.7189760208129883 }, { "auxiliary_loss_clip": 0.01276061, "auxiliary_loss_mlp": 0.00256909, "balance_loss_clip": 1.05047274, "balance_loss_mlp": 0.23045608, "epoch": 0.7660904854952653, "flos": 18916736336640.0, "grad_norm": 9.530630448051854, "language_loss": 0.82543206, "learning_rate": 5.468919871616386e-07, "loss": 0.84076178, "num_input_tokens_seen": 274752460, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.26464844, "step": 12742, "time_per_iteration": 2.694627523422241 }, { "auxiliary_loss_clip": 0.01256592, "auxiliary_loss_mlp": 0.002597, "balance_loss_clip": 1.03885436, "balance_loss_mlp": 0.23527369, "epoch": 0.7661506087479333, "flos": 23147515768320.0, "grad_norm": 26.52227494653124, "language_loss": 0.82514238, "learning_rate": 5.46624411946736e-07, "loss": 0.84030533, "num_input_tokens_seen": 274773070, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.24438477, "step": 12743, "time_per_iteration": 2.6362171173095703 }, { "auxiliary_loss_clip": 0.01242203, "auxiliary_loss_mlp": 0.00219153, "balance_loss_clip": 1.02869725, "balance_loss_mlp": 0.19515643, "epoch": 0.7662107320006012, "flos": 17565776887680.0, "grad_norm": 6.491529606636794, "language_loss": 0.83096552, "learning_rate": 5.463568918439805e-07, "loss": 0.84557915, "num_input_tokens_seen": 274790220, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.23999023, "step": 12744, "time_per_iteration": 2.625882863998413 }, { "auxiliary_loss_clip": 0.01278073, "auxiliary_loss_mlp": 0.00248683, "balance_loss_clip": 1.04803503, "balance_loss_mlp": 0.22312406, "epoch": 0.7662708552532692, "flos": 22303075956480.0, "grad_norm": 12.430210442778753, "language_loss": 0.83055806, "learning_rate": 5.460894268635181e-07, "loss": 0.84582567, "num_input_tokens_seen": 274805095, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.2557373, "step": 12745, "time_per_iteration": 2.619189500808716 }, { "auxiliary_loss_clip": 0.01265823, "auxiliary_loss_mlp": 0.00232247, "balance_loss_clip": 1.04093623, "balance_loss_mlp": 0.20585383, "epoch": 0.7663309785059371, "flos": 15742053607680.0, "grad_norm": 5.937222466819913, "language_loss": 0.86783063, "learning_rate": 5.458220170154896e-07, "loss": 0.88281137, "num_input_tokens_seen": 274821800, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.26416016, "step": 12746, "time_per_iteration": 2.660491943359375 }, { "auxiliary_loss_clip": 0.01114477, "auxiliary_loss_mlp": 0.00102529, "balance_loss_clip": 0.95968866, "balance_loss_mlp": 0.09470846, "epoch": 0.7663911017586051, "flos": 62163312514560.0, "grad_norm": 0.6828071238977172, "language_loss": 0.56251711, "learning_rate": 5.455546623100362e-07, "loss": 0.57468712, "num_input_tokens_seen": 274886970, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.078125, "step": 12747, "time_per_iteration": 3.1987884044647217 }, { "auxiliary_loss_clip": 0.01256404, "auxiliary_loss_mlp": 0.0022519, "balance_loss_clip": 1.0385989, "balance_loss_mlp": 0.20146723, "epoch": 0.7664512250112732, "flos": 26506025326080.0, "grad_norm": 22.496614400677295, "language_loss": 0.77997625, "learning_rate": 5.452873627572956e-07, "loss": 0.79479218, "num_input_tokens_seen": 274907240, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.23706055, "step": 12748, "time_per_iteration": 2.677819013595581 }, { "auxiliary_loss_clip": 0.01253444, "auxiliary_loss_mlp": 0.00244907, "balance_loss_clip": 1.03091586, "balance_loss_mlp": 0.22008777, "epoch": 0.7665113482639411, "flos": 16249542912000.0, "grad_norm": 21.440581026298105, "language_loss": 0.78483546, "learning_rate": 5.450201183674052e-07, "loss": 0.79981893, "num_input_tokens_seen": 274924650, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.24816895, "step": 12749, "time_per_iteration": 2.6415834426879883 }, { "auxiliary_loss_clip": 0.01269609, "auxiliary_loss_mlp": 0.00255309, "balance_loss_clip": 1.04483795, "balance_loss_mlp": 0.22853467, "epoch": 0.7665714715166091, "flos": 27197880163200.0, "grad_norm": 9.114735132843384, "language_loss": 0.80513895, "learning_rate": 5.447529291504967e-07, "loss": 0.82038808, "num_input_tokens_seen": 274944550, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.26794434, "step": 12750, "time_per_iteration": 2.691922426223755 }, { "auxiliary_loss_clip": 0.01251441, "auxiliary_loss_mlp": 0.00224304, "balance_loss_clip": 1.02965724, "balance_loss_mlp": 0.20005706, "epoch": 0.766631594769277, "flos": 21067785279360.0, "grad_norm": 169.1865444981657, "language_loss": 0.83856225, "learning_rate": 5.444857951167026e-07, "loss": 0.85331964, "num_input_tokens_seen": 274961330, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.24243164, "step": 12751, "time_per_iteration": 2.7842702865600586 }, { "auxiliary_loss_clip": 0.01247276, "auxiliary_loss_mlp": 0.00219848, "balance_loss_clip": 1.03050613, "balance_loss_mlp": 0.19607788, "epoch": 0.766691718021945, "flos": 24097963593600.0, "grad_norm": 17.82081923906857, "language_loss": 0.69288468, "learning_rate": 5.442187162761537e-07, "loss": 0.70755595, "num_input_tokens_seen": 274981655, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.23791504, "step": 12752, "time_per_iteration": 2.7001657485961914 }, { "auxiliary_loss_clip": 0.01294148, "auxiliary_loss_mlp": 0.00230022, "balance_loss_clip": 1.06202507, "balance_loss_mlp": 0.20386712, "epoch": 0.7667518412746129, "flos": 23440654661760.0, "grad_norm": 104.38853276876567, "language_loss": 0.78793627, "learning_rate": 5.439516926389767e-07, "loss": 0.80317795, "num_input_tokens_seen": 274999970, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.26135254, "step": 12753, "time_per_iteration": 2.6957192420959473 }, { "auxiliary_loss_clip": 0.01267438, "auxiliary_loss_mlp": 0.00230183, "balance_loss_clip": 1.04411399, "balance_loss_mlp": 0.20516106, "epoch": 0.766811964527281, "flos": 18148786536960.0, "grad_norm": 5.898997145829721, "language_loss": 0.69458109, "learning_rate": 5.436847242152971e-07, "loss": 0.70955729, "num_input_tokens_seen": 275015805, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.25024414, "step": 12754, "time_per_iteration": 2.652019739151001 }, { "auxiliary_loss_clip": 0.01267333, "auxiliary_loss_mlp": 0.00242097, "balance_loss_clip": 1.0463506, "balance_loss_mlp": 0.21663356, "epoch": 0.7668720877799489, "flos": 19536051657600.0, "grad_norm": 3.373882237469164, "language_loss": 0.88521528, "learning_rate": 5.434178110152401e-07, "loss": 0.90030956, "num_input_tokens_seen": 275031810, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.2545166, "step": 12755, "time_per_iteration": 2.6875650882720947 }, { "auxiliary_loss_clip": 0.01256224, "auxiliary_loss_mlp": 0.00251833, "balance_loss_clip": 1.03609157, "balance_loss_mlp": 0.22746709, "epoch": 0.7669322110326169, "flos": 22674320974080.0, "grad_norm": 47.01766026647117, "language_loss": 0.78717953, "learning_rate": 5.431509530489242e-07, "loss": 0.8022601, "num_input_tokens_seen": 275049325, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.24353027, "step": 12756, "time_per_iteration": 2.668938159942627 }, { "auxiliary_loss_clip": 0.01271031, "auxiliary_loss_mlp": 0.00258082, "balance_loss_clip": 1.04951322, "balance_loss_mlp": 0.23308364, "epoch": 0.7669923342852848, "flos": 26469396432000.0, "grad_norm": 2.721122924380877, "language_loss": 0.75982213, "learning_rate": 5.428841503264706e-07, "loss": 0.77511322, "num_input_tokens_seen": 275070865, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25, "step": 12757, "time_per_iteration": 2.7869956493377686 }, { "auxiliary_loss_clip": 0.01279347, "auxiliary_loss_mlp": 0.00250541, "balance_loss_clip": 1.05030942, "balance_loss_mlp": 0.22252665, "epoch": 0.7670524575379528, "flos": 22856136641280.0, "grad_norm": 4.827882037691112, "language_loss": 0.85117406, "learning_rate": 5.426174028579955e-07, "loss": 0.8664729, "num_input_tokens_seen": 275088015, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.28027344, "step": 12758, "time_per_iteration": 2.8065409660339355 }, { "auxiliary_loss_clip": 0.01256663, "auxiliary_loss_mlp": 0.00211963, "balance_loss_clip": 1.03689814, "balance_loss_mlp": 0.18620166, "epoch": 0.7671125807906207, "flos": 22452141398400.0, "grad_norm": 59.11108311295037, "language_loss": 0.83566093, "learning_rate": 5.423507106536156e-07, "loss": 0.85034722, "num_input_tokens_seen": 275106975, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.25744629, "step": 12759, "time_per_iteration": 2.678190231323242 }, { "auxiliary_loss_clip": 0.01252208, "auxiliary_loss_mlp": 0.00245098, "balance_loss_clip": 1.03189945, "balance_loss_mlp": 0.2176802, "epoch": 0.7671727040432887, "flos": 35371543518720.0, "grad_norm": 5.794891571131718, "language_loss": 0.76103199, "learning_rate": 5.420840737234425e-07, "loss": 0.77600503, "num_input_tokens_seen": 275129560, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.27416992, "step": 12760, "time_per_iteration": 2.80131459236145 }, { "auxiliary_loss_clip": 0.01263752, "auxiliary_loss_mlp": 0.00237329, "balance_loss_clip": 1.0398314, "balance_loss_mlp": 0.2121879, "epoch": 0.7672328272959568, "flos": 22494947431680.0, "grad_norm": 5.477351256672551, "language_loss": 0.85129118, "learning_rate": 5.418174920775871e-07, "loss": 0.86630201, "num_input_tokens_seen": 275151180, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.25134277, "step": 12761, "time_per_iteration": 2.74959397315979 }, { "auxiliary_loss_clip": 0.01251971, "auxiliary_loss_mlp": 0.00260744, "balance_loss_clip": 1.03325737, "balance_loss_mlp": 0.23478061, "epoch": 0.7672929505486247, "flos": 22815557251200.0, "grad_norm": 24.75385870329156, "language_loss": 0.74162734, "learning_rate": 5.415509657261589e-07, "loss": 0.75675452, "num_input_tokens_seen": 275170605, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.25964355, "step": 12762, "time_per_iteration": 2.689042329788208 }, { "auxiliary_loss_clip": 0.01256268, "auxiliary_loss_mlp": 0.00240576, "balance_loss_clip": 1.03616214, "balance_loss_mlp": 0.21504137, "epoch": 0.7673530738012927, "flos": 20338834671360.0, "grad_norm": 33.28088805144394, "language_loss": 0.82671475, "learning_rate": 5.412844946792639e-07, "loss": 0.84168315, "num_input_tokens_seen": 275188750, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25537109, "step": 12763, "time_per_iteration": 4.099484205245972 }, { "auxiliary_loss_clip": 0.01258688, "auxiliary_loss_mlp": 0.00242249, "balance_loss_clip": 1.03772998, "balance_loss_mlp": 0.21740536, "epoch": 0.7674131970539606, "flos": 34933576988160.0, "grad_norm": 43.36362877368881, "language_loss": 0.77780318, "learning_rate": 5.410180789470067e-07, "loss": 0.79281253, "num_input_tokens_seen": 275211365, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24841309, "step": 12764, "time_per_iteration": 2.769517183303833 }, { "auxiliary_loss_clip": 0.01248571, "auxiliary_loss_mlp": 0.00252455, "balance_loss_clip": 1.02458501, "balance_loss_mlp": 0.22455966, "epoch": 0.7674733203066286, "flos": 28328850766080.0, "grad_norm": 7.68558741050245, "language_loss": 0.7582649, "learning_rate": 5.40751718539491e-07, "loss": 0.77327514, "num_input_tokens_seen": 275231670, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.2791748, "step": 12765, "time_per_iteration": 4.150561571121216 }, { "auxiliary_loss_clip": 0.01240176, "auxiliary_loss_mlp": 0.00241069, "balance_loss_clip": 1.02504456, "balance_loss_mlp": 0.21616621, "epoch": 0.7675334435592965, "flos": 16289727252480.0, "grad_norm": 76.95806779318754, "language_loss": 0.6739161, "learning_rate": 5.404854134668162e-07, "loss": 0.68872857, "num_input_tokens_seen": 275249425, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24902344, "step": 12766, "time_per_iteration": 2.6815874576568604 }, { "auxiliary_loss_clip": 0.01115284, "auxiliary_loss_mlp": 0.00079075, "balance_loss_clip": 0.96415013, "balance_loss_mlp": 0.07101665, "epoch": 0.7675935668119646, "flos": 64826232220800.0, "grad_norm": 0.7146374040159169, "language_loss": 0.5997687, "learning_rate": 5.402191637390803e-07, "loss": 0.61171234, "num_input_tokens_seen": 275312485, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.08056641, "step": 12767, "time_per_iteration": 3.322126626968384 }, { "auxiliary_loss_clip": 0.01250942, "auxiliary_loss_mlp": 0.00234575, "balance_loss_clip": 1.03345978, "balance_loss_mlp": 0.20862299, "epoch": 0.7676536900646325, "flos": 22675398382080.0, "grad_norm": 29.32571923397089, "language_loss": 0.75578046, "learning_rate": 5.399529693663801e-07, "loss": 0.7706356, "num_input_tokens_seen": 275331680, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.25964355, "step": 12768, "time_per_iteration": 2.678720712661743 }, { "auxiliary_loss_clip": 0.01284966, "auxiliary_loss_mlp": 0.00263423, "balance_loss_clip": 1.0596174, "balance_loss_mlp": 0.23773351, "epoch": 0.7677138133173005, "flos": 26939682224640.0, "grad_norm": 161.8080709229524, "language_loss": 0.77301073, "learning_rate": 5.3968683035881e-07, "loss": 0.78849465, "num_input_tokens_seen": 275351615, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.25720215, "step": 12769, "time_per_iteration": 4.329387187957764 }, { "auxiliary_loss_clip": 0.0127335, "auxiliary_loss_mlp": 0.00265324, "balance_loss_clip": 1.04595065, "balance_loss_mlp": 0.23808505, "epoch": 0.7677739365699684, "flos": 23799545400960.0, "grad_norm": 9.157587945078788, "language_loss": 0.87461472, "learning_rate": 5.394207467264611e-07, "loss": 0.89000142, "num_input_tokens_seen": 275368815, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.27233887, "step": 12770, "time_per_iteration": 2.6807053089141846 }, { "auxiliary_loss_clip": 0.01245268, "auxiliary_loss_mlp": 0.00239003, "balance_loss_clip": 1.03429174, "balance_loss_mlp": 0.21471974, "epoch": 0.7678340598226364, "flos": 34455497944320.0, "grad_norm": 5.451477807429747, "language_loss": 0.83993512, "learning_rate": 5.391547184794245e-07, "loss": 0.85477781, "num_input_tokens_seen": 275389345, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.24316406, "step": 12771, "time_per_iteration": 2.8015010356903076 }, { "auxiliary_loss_clip": 0.01243942, "auxiliary_loss_mlp": 0.00251948, "balance_loss_clip": 1.02079487, "balance_loss_mlp": 0.22501831, "epoch": 0.7678941830753043, "flos": 23841740903040.0, "grad_norm": 16.04302037539354, "language_loss": 0.75150484, "learning_rate": 5.388887456277876e-07, "loss": 0.7664637, "num_input_tokens_seen": 275411240, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26916504, "step": 12772, "time_per_iteration": 2.7077622413635254 }, { "auxiliary_loss_clip": 0.01242806, "auxiliary_loss_mlp": 0.00247381, "balance_loss_clip": 1.03008413, "balance_loss_mlp": 0.22460055, "epoch": 0.7679543063279723, "flos": 25410929431680.0, "grad_norm": 4.447219982833494, "language_loss": 0.81149346, "learning_rate": 5.386228281816349e-07, "loss": 0.82639533, "num_input_tokens_seen": 275432010, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.22790527, "step": 12773, "time_per_iteration": 2.742201089859009 }, { "auxiliary_loss_clip": 0.01231313, "auxiliary_loss_mlp": 0.00247216, "balance_loss_clip": 1.02138758, "balance_loss_mlp": 0.22243203, "epoch": 0.7680144295806404, "flos": 27962382257280.0, "grad_norm": 3.731964194899917, "language_loss": 0.8717888, "learning_rate": 5.383569661510512e-07, "loss": 0.88657403, "num_input_tokens_seen": 275453710, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.24780273, "step": 12774, "time_per_iteration": 4.227788686752319 }, { "auxiliary_loss_clip": 0.01239843, "auxiliary_loss_mlp": 0.00234787, "balance_loss_clip": 1.02231514, "balance_loss_mlp": 0.20974085, "epoch": 0.7680745528333083, "flos": 20412810731520.0, "grad_norm": 13.011509617105395, "language_loss": 0.7883333, "learning_rate": 5.380911595461177e-07, "loss": 0.80307955, "num_input_tokens_seen": 275472915, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.25036621, "step": 12775, "time_per_iteration": 2.6849753856658936 }, { "auxiliary_loss_clip": 0.01109838, "auxiliary_loss_mlp": 0.00154538, "balance_loss_clip": 0.96294641, "balance_loss_mlp": 0.14624126, "epoch": 0.7681346760859763, "flos": 68401103351040.0, "grad_norm": 1.6920924159462867, "language_loss": 0.56240648, "learning_rate": 5.378254083769147e-07, "loss": 0.57505029, "num_input_tokens_seen": 275534785, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.08300781, "step": 12776, "time_per_iteration": 3.2437031269073486 }, { "auxiliary_loss_clip": 0.01242781, "auxiliary_loss_mlp": 0.00242188, "balance_loss_clip": 1.02857709, "balance_loss_mlp": 0.21715453, "epoch": 0.7681947993386442, "flos": 21251468453760.0, "grad_norm": 10.363392722661375, "language_loss": 0.81097567, "learning_rate": 5.375597126535188e-07, "loss": 0.82582545, "num_input_tokens_seen": 275553205, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.25012207, "step": 12777, "time_per_iteration": 2.7425320148468018 }, { "auxiliary_loss_clip": 0.01245754, "auxiliary_loss_mlp": 0.00266371, "balance_loss_clip": 1.02885389, "balance_loss_mlp": 0.24153966, "epoch": 0.7682549225913122, "flos": 21397696721280.0, "grad_norm": 1502.4512965038634, "language_loss": 0.78464556, "learning_rate": 5.372940723860043e-07, "loss": 0.79976678, "num_input_tokens_seen": 275571490, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24841309, "step": 12778, "time_per_iteration": 2.751849412918091 }, { "auxiliary_loss_clip": 0.0124717, "auxiliary_loss_mlp": 0.00254668, "balance_loss_clip": 1.02384222, "balance_loss_mlp": 0.2271069, "epoch": 0.7683150458439801, "flos": 23038921975680.0, "grad_norm": 9.371413915076598, "language_loss": 0.7808255, "learning_rate": 5.37028487584446e-07, "loss": 0.79584384, "num_input_tokens_seen": 275589665, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.27575684, "step": 12779, "time_per_iteration": 2.7256972789764404 }, { "auxiliary_loss_clip": 0.01249397, "auxiliary_loss_mlp": 0.00235687, "balance_loss_clip": 1.02964473, "balance_loss_mlp": 0.21163017, "epoch": 0.7683751690966482, "flos": 67332397996800.0, "grad_norm": 34.47332643898498, "language_loss": 0.6667282, "learning_rate": 5.367629582589133e-07, "loss": 0.68157899, "num_input_tokens_seen": 275615605, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.24060059, "step": 12780, "time_per_iteration": 3.1422266960144043 }, { "auxiliary_loss_clip": 0.01260624, "auxiliary_loss_mlp": 0.00242225, "balance_loss_clip": 1.03852618, "balance_loss_mlp": 0.21692899, "epoch": 0.7684352923493161, "flos": 21798890703360.0, "grad_norm": 8.054529717520078, "language_loss": 0.76280546, "learning_rate": 5.364974844194759e-07, "loss": 0.77783394, "num_input_tokens_seen": 275634965, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25305176, "step": 12781, "time_per_iteration": 2.682741641998291 }, { "auxiliary_loss_clip": 0.01256131, "auxiliary_loss_mlp": 0.00250875, "balance_loss_clip": 1.03852296, "balance_loss_mlp": 0.22574615, "epoch": 0.7684954156019841, "flos": 25847603072640.0, "grad_norm": 56.455211922491735, "language_loss": 0.84407294, "learning_rate": 5.362320660762016e-07, "loss": 0.85914296, "num_input_tokens_seen": 275655785, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.2512207, "step": 12782, "time_per_iteration": 2.668347120285034 }, { "auxiliary_loss_clip": 0.01247444, "auxiliary_loss_mlp": 0.00242906, "balance_loss_clip": 1.02475286, "balance_loss_mlp": 0.21678728, "epoch": 0.768555538854652, "flos": 25447378757760.0, "grad_norm": 8.460205142153198, "language_loss": 0.72366273, "learning_rate": 5.35966703239153e-07, "loss": 0.73856628, "num_input_tokens_seen": 275676160, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26098633, "step": 12783, "time_per_iteration": 2.6936240196228027 }, { "auxiliary_loss_clip": 0.01241992, "auxiliary_loss_mlp": 0.00234109, "balance_loss_clip": 1.02128315, "balance_loss_mlp": 0.20937276, "epoch": 0.76861566210732, "flos": 19646369303040.0, "grad_norm": 56.299664212602245, "language_loss": 0.78881752, "learning_rate": 5.357013959183938e-07, "loss": 0.80357856, "num_input_tokens_seen": 275695660, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.24743652, "step": 12784, "time_per_iteration": 2.6265580654144287 }, { "auxiliary_loss_clip": 0.01226024, "auxiliary_loss_mlp": 0.00234172, "balance_loss_clip": 1.01468849, "balance_loss_mlp": 0.2113438, "epoch": 0.7686757853599879, "flos": 22419032037120.0, "grad_norm": 13.466406444856814, "language_loss": 0.86019087, "learning_rate": 5.354361441239843e-07, "loss": 0.87479281, "num_input_tokens_seen": 275714025, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.22839355, "step": 12785, "time_per_iteration": 2.6630618572235107 }, { "auxiliary_loss_clip": 0.01248811, "auxiliary_loss_mlp": 0.00241306, "balance_loss_clip": 1.02921677, "balance_loss_mlp": 0.21683228, "epoch": 0.768735908612656, "flos": 47774262453120.0, "grad_norm": 848.0115892992427, "language_loss": 0.83202511, "learning_rate": 5.351709478659836e-07, "loss": 0.84692633, "num_input_tokens_seen": 275737300, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24475098, "step": 12786, "time_per_iteration": 2.867196798324585 }, { "auxiliary_loss_clip": 0.01235084, "auxiliary_loss_mlp": 0.00240973, "balance_loss_clip": 1.01598954, "balance_loss_mlp": 0.21620145, "epoch": 0.7687960318653239, "flos": 30263179000320.0, "grad_norm": 29.137828322949566, "language_loss": 0.66260588, "learning_rate": 5.349058071544468e-07, "loss": 0.6773665, "num_input_tokens_seen": 275757895, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24780273, "step": 12787, "time_per_iteration": 2.7086031436920166 }, { "auxiliary_loss_clip": 0.01229445, "auxiliary_loss_mlp": 0.002381, "balance_loss_clip": 1.01959097, "balance_loss_mlp": 0.21431811, "epoch": 0.7688561551179919, "flos": 19573434737280.0, "grad_norm": 32.584040721325294, "language_loss": 0.82144153, "learning_rate": 5.346407219994292e-07, "loss": 0.83611697, "num_input_tokens_seen": 275776745, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.23779297, "step": 12788, "time_per_iteration": 2.6373941898345947 }, { "auxiliary_loss_clip": 0.01256185, "auxiliary_loss_mlp": 0.00230168, "balance_loss_clip": 1.0329566, "balance_loss_mlp": 0.20466903, "epoch": 0.7689162783706599, "flos": 22783776693120.0, "grad_norm": 4.254322809034814, "language_loss": 0.7646172, "learning_rate": 5.343756924109821e-07, "loss": 0.77948076, "num_input_tokens_seen": 275797205, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25488281, "step": 12789, "time_per_iteration": 2.6536967754364014 }, { "auxiliary_loss_clip": 0.01256, "auxiliary_loss_mlp": 0.00259081, "balance_loss_clip": 1.03314495, "balance_loss_mlp": 0.23330814, "epoch": 0.7689764016233278, "flos": 34204195416960.0, "grad_norm": 677.5004762750588, "language_loss": 0.79197329, "learning_rate": 5.341107183991553e-07, "loss": 0.80712408, "num_input_tokens_seen": 275817935, "router_z_loss_clip": 2.22558594, "router_z_loss_mlp": 0.25769043, "step": 12790, "time_per_iteration": 2.7505643367767334 }, { "auxiliary_loss_clip": 0.01237223, "auxiliary_loss_mlp": 0.00225463, "balance_loss_clip": 1.0203464, "balance_loss_mlp": 0.2021699, "epoch": 0.7690365248759958, "flos": 17274469587840.0, "grad_norm": 103.32874232176789, "language_loss": 0.76718795, "learning_rate": 5.338457999739969e-07, "loss": 0.78181481, "num_input_tokens_seen": 275837145, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.2331543, "step": 12791, "time_per_iteration": 2.6691551208496094 }, { "auxiliary_loss_clip": 0.01224948, "auxiliary_loss_mlp": 0.00227389, "balance_loss_clip": 1.01738763, "balance_loss_mlp": 0.20388064, "epoch": 0.7690966481286637, "flos": 18223157646720.0, "grad_norm": 22.345017699084416, "language_loss": 0.86684692, "learning_rate": 5.335809371455526e-07, "loss": 0.88137025, "num_input_tokens_seen": 275855705, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.23510742, "step": 12792, "time_per_iteration": 2.683351993560791 }, { "auxiliary_loss_clip": 0.01267852, "auxiliary_loss_mlp": 0.00258303, "balance_loss_clip": 1.03658795, "balance_loss_mlp": 0.2321005, "epoch": 0.7691567713813318, "flos": 21537568281600.0, "grad_norm": 20.245631456603423, "language_loss": 0.80971551, "learning_rate": 5.333161299238673e-07, "loss": 0.82497704, "num_input_tokens_seen": 275873930, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.26220703, "step": 12793, "time_per_iteration": 2.698831796646118 }, { "auxiliary_loss_clip": 0.01251237, "auxiliary_loss_mlp": 0.00254092, "balance_loss_clip": 1.03178692, "balance_loss_mlp": 0.22760388, "epoch": 0.7692168946339997, "flos": 39379999720320.0, "grad_norm": 20.334903625162124, "language_loss": 0.70203805, "learning_rate": 5.330513783189803e-07, "loss": 0.71709132, "num_input_tokens_seen": 275895895, "router_z_loss_clip": 2.19433594, "router_z_loss_mlp": 0.26489258, "step": 12794, "time_per_iteration": 2.8655974864959717 }, { "auxiliary_loss_clip": 0.01246201, "auxiliary_loss_mlp": 0.00250928, "balance_loss_clip": 1.02845645, "balance_loss_mlp": 0.22625154, "epoch": 0.7692770178866677, "flos": 25009950931200.0, "grad_norm": 2.5167676398897307, "language_loss": 0.82426447, "learning_rate": 5.327866823409319e-07, "loss": 0.83923578, "num_input_tokens_seen": 275917825, "router_z_loss_clip": 2.17480469, "router_z_loss_mlp": 0.2467041, "step": 12795, "time_per_iteration": 2.7106339931488037 }, { "auxiliary_loss_clip": 0.01260355, "auxiliary_loss_mlp": 0.00238246, "balance_loss_clip": 1.03918695, "balance_loss_mlp": 0.21334332, "epoch": 0.7693371411393356, "flos": 24716273333760.0, "grad_norm": 7.1433705368088045, "language_loss": 0.78210145, "learning_rate": 5.325220419997601e-07, "loss": 0.79708743, "num_input_tokens_seen": 275937890, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.24890137, "step": 12796, "time_per_iteration": 2.7988545894622803 }, { "auxiliary_loss_clip": 0.0125417, "auxiliary_loss_mlp": 0.00225719, "balance_loss_clip": 1.03386736, "balance_loss_mlp": 0.20094767, "epoch": 0.7693972643920036, "flos": 15924803028480.0, "grad_norm": 25.771394593026752, "language_loss": 0.7316137, "learning_rate": 5.32257457305499e-07, "loss": 0.74641258, "num_input_tokens_seen": 275954495, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.24755859, "step": 12797, "time_per_iteration": 2.648627281188965 }, { "auxiliary_loss_clip": 0.01265608, "auxiliary_loss_mlp": 0.00273659, "balance_loss_clip": 1.03974771, "balance_loss_mlp": 0.24711077, "epoch": 0.7694573876446715, "flos": 25405901527680.0, "grad_norm": 17.25864488602954, "language_loss": 0.99742502, "learning_rate": 5.319929282681823e-07, "loss": 1.01281762, "num_input_tokens_seen": 275972395, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26550293, "step": 12798, "time_per_iteration": 2.73410964012146 }, { "auxiliary_loss_clip": 0.0124246, "auxiliary_loss_mlp": 0.0025183, "balance_loss_clip": 1.02800894, "balance_loss_mlp": 0.22692743, "epoch": 0.7695175108973396, "flos": 16654220513280.0, "grad_norm": 16.722873040305767, "language_loss": 0.90120739, "learning_rate": 5.317284548978418e-07, "loss": 0.91615033, "num_input_tokens_seen": 275989020, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.24914551, "step": 12799, "time_per_iteration": 2.652498960494995 }, { "auxiliary_loss_clip": 0.01274251, "auxiliary_loss_mlp": 0.00242206, "balance_loss_clip": 1.0536294, "balance_loss_mlp": 0.21673059, "epoch": 0.7695776341500075, "flos": 13626520237440.0, "grad_norm": 37.93211436444691, "language_loss": 0.89368302, "learning_rate": 5.314640372045045e-07, "loss": 0.90884757, "num_input_tokens_seen": 276006525, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25488281, "step": 12800, "time_per_iteration": 2.687779426574707 }, { "auxiliary_loss_clip": 0.01272011, "auxiliary_loss_mlp": 0.00244378, "balance_loss_clip": 1.03965878, "balance_loss_mlp": 0.21771087, "epoch": 0.7696377574026755, "flos": 24276690691200.0, "grad_norm": 38.423968930710124, "language_loss": 0.91974056, "learning_rate": 5.31199675198198e-07, "loss": 0.93490446, "num_input_tokens_seen": 276027130, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.26647949, "step": 12801, "time_per_iteration": 2.6648640632629395 }, { "auxiliary_loss_clip": 0.01239167, "auxiliary_loss_mlp": 0.00219941, "balance_loss_clip": 1.02163815, "balance_loss_mlp": 0.19543186, "epoch": 0.7696978806553435, "flos": 20923137210240.0, "grad_norm": 92.60099329687579, "language_loss": 0.79604137, "learning_rate": 5.30935368888947e-07, "loss": 0.81063247, "num_input_tokens_seen": 276045715, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.24499512, "step": 12802, "time_per_iteration": 2.696763753890991 }, { "auxiliary_loss_clip": 0.01223515, "auxiliary_loss_mlp": 0.002538, "balance_loss_clip": 1.01327872, "balance_loss_mlp": 0.22976753, "epoch": 0.7697580039080114, "flos": 22929609911040.0, "grad_norm": 5.101028591003924, "language_loss": 0.83631873, "learning_rate": 5.306711182867747e-07, "loss": 0.85109186, "num_input_tokens_seen": 276065375, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.24023438, "step": 12803, "time_per_iteration": 2.7735984325408936 }, { "auxiliary_loss_clip": 0.01090598, "auxiliary_loss_mlp": 0.0008457, "balance_loss_clip": 0.94428754, "balance_loss_mlp": 0.07751252, "epoch": 0.7698181271606794, "flos": 68717654933760.0, "grad_norm": 14.12486255727315, "language_loss": 0.5484736, "learning_rate": 5.304069234017001e-07, "loss": 0.56022525, "num_input_tokens_seen": 276131405, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.07080078, "step": 12804, "time_per_iteration": 3.169898271560669 }, { "auxiliary_loss_clip": 0.01089941, "auxiliary_loss_mlp": 0.00122016, "balance_loss_clip": 0.94300497, "balance_loss_mlp": 0.11452992, "epoch": 0.7698782504133473, "flos": 67409716999680.0, "grad_norm": 0.7186985638953662, "language_loss": 0.53442359, "learning_rate": 5.301427842437429e-07, "loss": 0.54654312, "num_input_tokens_seen": 276200755, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.07470703, "step": 12805, "time_per_iteration": 4.705320835113525 }, { "auxiliary_loss_clip": 0.01245336, "auxiliary_loss_mlp": 0.00244739, "balance_loss_clip": 1.02985287, "balance_loss_mlp": 0.22030109, "epoch": 0.7699383736660154, "flos": 22488842119680.0, "grad_norm": 95.88905504619305, "language_loss": 0.80587304, "learning_rate": 5.298787008229187e-07, "loss": 0.82077372, "num_input_tokens_seen": 276217880, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24438477, "step": 12806, "time_per_iteration": 2.6428346633911133 }, { "auxiliary_loss_clip": 0.0124617, "auxiliary_loss_mlp": 0.00254248, "balance_loss_clip": 1.02663159, "balance_loss_mlp": 0.22841486, "epoch": 0.7699984969186833, "flos": 21539723097600.0, "grad_norm": 12.666929612873329, "language_loss": 0.79728353, "learning_rate": 5.296146731492408e-07, "loss": 0.81228769, "num_input_tokens_seen": 276234810, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25842285, "step": 12807, "time_per_iteration": 4.037569046020508 }, { "auxiliary_loss_clip": 0.01257979, "auxiliary_loss_mlp": 0.00236382, "balance_loss_clip": 1.03667402, "balance_loss_mlp": 0.21141946, "epoch": 0.7700586201713513, "flos": 21719096640000.0, "grad_norm": 3.377475029599023, "language_loss": 0.87583435, "learning_rate": 5.293507012327218e-07, "loss": 0.89077795, "num_input_tokens_seen": 276252850, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.24951172, "step": 12808, "time_per_iteration": 2.652200937271118 }, { "auxiliary_loss_clip": 0.01279304, "auxiliary_loss_mlp": 0.0024346, "balance_loss_clip": 1.05462325, "balance_loss_mlp": 0.21811619, "epoch": 0.7701187434240192, "flos": 27856015107840.0, "grad_norm": 7.907202183619345, "language_loss": 0.8897723, "learning_rate": 5.290867850833718e-07, "loss": 0.90499997, "num_input_tokens_seen": 276272525, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.25341797, "step": 12809, "time_per_iteration": 2.756920337677002 }, { "auxiliary_loss_clip": 0.01235024, "auxiliary_loss_mlp": 0.00240931, "balance_loss_clip": 1.02069736, "balance_loss_mlp": 0.21503851, "epoch": 0.7701788666766872, "flos": 28621307301120.0, "grad_norm": 60.22562829298659, "language_loss": 0.75515711, "learning_rate": 5.288229247111993e-07, "loss": 0.76991671, "num_input_tokens_seen": 276294210, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.25891113, "step": 12810, "time_per_iteration": 2.755636692047119 }, { "auxiliary_loss_clip": 0.01263043, "auxiliary_loss_mlp": 0.00259015, "balance_loss_clip": 1.0359472, "balance_loss_mlp": 0.23336154, "epoch": 0.7702389899293551, "flos": 14246446089600.0, "grad_norm": 95.36877056906921, "language_loss": 0.8636421, "learning_rate": 5.285591201262079e-07, "loss": 0.87886262, "num_input_tokens_seen": 276310290, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.25646973, "step": 12811, "time_per_iteration": 4.175467491149902 }, { "auxiliary_loss_clip": 0.01106857, "auxiliary_loss_mlp": 0.00065002, "balance_loss_clip": 0.96334791, "balance_loss_mlp": 0.05851712, "epoch": 0.7702991131820232, "flos": 70574128439040.0, "grad_norm": 0.7824459749948809, "language_loss": 0.56284046, "learning_rate": 5.28295371338402e-07, "loss": 0.57455909, "num_input_tokens_seen": 276371715, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.06494141, "step": 12812, "time_per_iteration": 3.224522352218628 }, { "auxiliary_loss_clip": 0.0125949, "auxiliary_loss_mlp": 0.002223, "balance_loss_clip": 1.03889394, "balance_loss_mlp": 0.19702789, "epoch": 0.7703592364346911, "flos": 25480021242240.0, "grad_norm": 5.191329591667375, "language_loss": 0.78299904, "learning_rate": 5.280316783577836e-07, "loss": 0.79781699, "num_input_tokens_seen": 276389895, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25268555, "step": 12813, "time_per_iteration": 2.684845447540283 }, { "auxiliary_loss_clip": 0.01255821, "auxiliary_loss_mlp": 0.002301, "balance_loss_clip": 1.03662276, "balance_loss_mlp": 0.20401676, "epoch": 0.7704193596873591, "flos": 19280906375040.0, "grad_norm": 9.671210290650016, "language_loss": 0.74771637, "learning_rate": 5.27768041194351e-07, "loss": 0.76257557, "num_input_tokens_seen": 276408990, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.2611084, "step": 12814, "time_per_iteration": 2.6678366661071777 }, { "auxiliary_loss_clip": 0.01240973, "auxiliary_loss_mlp": 0.00250283, "balance_loss_clip": 1.0235796, "balance_loss_mlp": 0.2253084, "epoch": 0.7704794829400271, "flos": 23658452778240.0, "grad_norm": 231.99494554801817, "language_loss": 0.73251247, "learning_rate": 5.275044598581018e-07, "loss": 0.74742496, "num_input_tokens_seen": 276428190, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.24975586, "step": 12815, "time_per_iteration": 2.6907424926757812 }, { "auxiliary_loss_clip": 0.01262281, "auxiliary_loss_mlp": 0.00227385, "balance_loss_clip": 1.03436816, "balance_loss_mlp": 0.20092073, "epoch": 0.770539606192695, "flos": 18989311766400.0, "grad_norm": 171.41782999605877, "language_loss": 0.7744323, "learning_rate": 5.272409343590322e-07, "loss": 0.78932899, "num_input_tokens_seen": 276446855, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.26428223, "step": 12816, "time_per_iteration": 4.143450736999512 }, { "auxiliary_loss_clip": 0.01268433, "auxiliary_loss_mlp": 0.00239017, "balance_loss_clip": 1.04677951, "balance_loss_mlp": 0.21038258, "epoch": 0.770599729445363, "flos": 11830160142720.0, "grad_norm": 4.912886046158793, "language_loss": 0.82197392, "learning_rate": 5.26977464707133e-07, "loss": 0.83704841, "num_input_tokens_seen": 276462000, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.28649902, "step": 12817, "time_per_iteration": 2.679966688156128 }, { "auxiliary_loss_clip": 0.01265389, "auxiliary_loss_mlp": 0.00252555, "balance_loss_clip": 1.04155219, "balance_loss_mlp": 0.22641255, "epoch": 0.770659852698031, "flos": 17822610109440.0, "grad_norm": 12.765487179601418, "language_loss": 0.72021091, "learning_rate": 5.267140509123957e-07, "loss": 0.73539031, "num_input_tokens_seen": 276481190, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.26123047, "step": 12818, "time_per_iteration": 2.677358627319336 }, { "auxiliary_loss_clip": 0.01260737, "auxiliary_loss_mlp": 0.0024163, "balance_loss_clip": 1.04541934, "balance_loss_mlp": 0.2186821, "epoch": 0.770719975950699, "flos": 21871968923520.0, "grad_norm": 56.19294915430246, "language_loss": 0.74881297, "learning_rate": 5.264506929848093e-07, "loss": 0.76383662, "num_input_tokens_seen": 276499520, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.22924805, "step": 12819, "time_per_iteration": 2.736767053604126 }, { "auxiliary_loss_clip": 0.01259278, "auxiliary_loss_mlp": 0.00242181, "balance_loss_clip": 1.03816032, "balance_loss_mlp": 0.21454839, "epoch": 0.7707800992033669, "flos": 21325049464320.0, "grad_norm": 8.875041999697002, "language_loss": 0.64651591, "learning_rate": 5.261873909343608e-07, "loss": 0.66153049, "num_input_tokens_seen": 276519110, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.27636719, "step": 12820, "time_per_iteration": 2.715465784072876 }, { "auxiliary_loss_clip": 0.01249562, "auxiliary_loss_mlp": 0.00243273, "balance_loss_clip": 1.03094065, "balance_loss_mlp": 0.21772684, "epoch": 0.7708402224560349, "flos": 28179426188160.0, "grad_norm": 5.8549475423893265, "language_loss": 0.87290388, "learning_rate": 5.259241447710343e-07, "loss": 0.88783216, "num_input_tokens_seen": 276538805, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.2557373, "step": 12821, "time_per_iteration": 2.7437856197357178 }, { "auxiliary_loss_clip": 0.01263862, "auxiliary_loss_mlp": 0.0024577, "balance_loss_clip": 1.04219222, "balance_loss_mlp": 0.22215472, "epoch": 0.7709003457087028, "flos": 15377057556480.0, "grad_norm": 596.48649033779, "language_loss": 0.75355709, "learning_rate": 5.256609545048114e-07, "loss": 0.76865339, "num_input_tokens_seen": 276554770, "router_z_loss_clip": 2.21777344, "router_z_loss_mlp": 0.23620605, "step": 12822, "time_per_iteration": 2.626981496810913 }, { "auxiliary_loss_clip": 0.01248649, "auxiliary_loss_mlp": 0.00242897, "balance_loss_clip": 1.03384018, "balance_loss_mlp": 0.21755311, "epoch": 0.7709604689613708, "flos": 30621854257920.0, "grad_norm": 43.22973834210171, "language_loss": 0.78331959, "learning_rate": 5.253978201456733e-07, "loss": 0.798235, "num_input_tokens_seen": 276574535, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.25366211, "step": 12823, "time_per_iteration": 2.799835443496704 }, { "auxiliary_loss_clip": 0.01268138, "auxiliary_loss_mlp": 0.00232351, "balance_loss_clip": 1.03773236, "balance_loss_mlp": 0.20320465, "epoch": 0.7710205922140387, "flos": 20301272023680.0, "grad_norm": 6.994742579040156, "language_loss": 0.84538782, "learning_rate": 5.251347417035969e-07, "loss": 0.86039275, "num_input_tokens_seen": 276592925, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.29138184, "step": 12824, "time_per_iteration": 2.638211250305176 }, { "auxiliary_loss_clip": 0.01264593, "auxiliary_loss_mlp": 0.00244935, "balance_loss_clip": 1.04784942, "balance_loss_mlp": 0.21918556, "epoch": 0.7710807154667068, "flos": 19644214487040.0, "grad_norm": 7727.985272493031, "language_loss": 0.80743349, "learning_rate": 5.248717191885592e-07, "loss": 0.82252884, "num_input_tokens_seen": 276610540, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.25756836, "step": 12825, "time_per_iteration": 2.648982524871826 }, { "auxiliary_loss_clip": 0.01224659, "auxiliary_loss_mlp": 0.00251678, "balance_loss_clip": 1.0155226, "balance_loss_mlp": 0.22741912, "epoch": 0.7711408387193747, "flos": 20006337450240.0, "grad_norm": 10.63089926117252, "language_loss": 0.79380828, "learning_rate": 5.246087526105343e-07, "loss": 0.8085717, "num_input_tokens_seen": 276629200, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.24243164, "step": 12826, "time_per_iteration": 2.638233184814453 }, { "auxiliary_loss_clip": 0.01242415, "auxiliary_loss_mlp": 0.00229366, "balance_loss_clip": 1.02582037, "balance_loss_mlp": 0.20485696, "epoch": 0.7712009619720427, "flos": 24971131307520.0, "grad_norm": 6.5452617396536725, "language_loss": 0.87831104, "learning_rate": 5.243458419794933e-07, "loss": 0.89302886, "num_input_tokens_seen": 276648655, "router_z_loss_clip": 2.16894531, "router_z_loss_mlp": 0.24511719, "step": 12827, "time_per_iteration": 2.6716842651367188 }, { "auxiliary_loss_clip": 0.01103828, "auxiliary_loss_mlp": 0.00074637, "balance_loss_clip": 0.96020627, "balance_loss_mlp": 0.06819975, "epoch": 0.7712610852247107, "flos": 63249681404160.0, "grad_norm": 0.8652510827315144, "language_loss": 0.54834723, "learning_rate": 5.240829873054051e-07, "loss": 0.56013191, "num_input_tokens_seen": 276716500, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.06445312, "step": 12828, "time_per_iteration": 3.325289487838745 }, { "auxiliary_loss_clip": 0.0124603, "auxiliary_loss_mlp": 0.0023572, "balance_loss_clip": 1.03263152, "balance_loss_mlp": 0.21299908, "epoch": 0.7713212084773786, "flos": 18697860812160.0, "grad_norm": 16.10629913990984, "language_loss": 0.76313281, "learning_rate": 5.23820188598238e-07, "loss": 0.77795035, "num_input_tokens_seen": 276733535, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.22729492, "step": 12829, "time_per_iteration": 2.6299617290496826 }, { "auxiliary_loss_clip": 0.01252897, "auxiliary_loss_mlp": 0.00233517, "balance_loss_clip": 1.03210759, "balance_loss_mlp": 0.20808931, "epoch": 0.7713813317300466, "flos": 14173367869440.0, "grad_norm": 17.65675293336867, "language_loss": 0.89257574, "learning_rate": 5.235574458679579e-07, "loss": 0.90743983, "num_input_tokens_seen": 276749575, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25476074, "step": 12830, "time_per_iteration": 2.618144989013672 }, { "auxiliary_loss_clip": 0.0126284, "auxiliary_loss_mlp": 0.00239348, "balance_loss_clip": 1.03452277, "balance_loss_mlp": 0.21095249, "epoch": 0.7714414549827145, "flos": 25703960584320.0, "grad_norm": 11.267234802506424, "language_loss": 0.86726522, "learning_rate": 5.232947591245269e-07, "loss": 0.88228714, "num_input_tokens_seen": 276769460, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.28430176, "step": 12831, "time_per_iteration": 2.7140369415283203 }, { "auxiliary_loss_clip": 0.01248467, "auxiliary_loss_mlp": 0.00264307, "balance_loss_clip": 1.02920675, "balance_loss_mlp": 0.23827131, "epoch": 0.7715015782353826, "flos": 30555312312960.0, "grad_norm": 6.224111820153066, "language_loss": 0.67697883, "learning_rate": 5.230321283779071e-07, "loss": 0.6921066, "num_input_tokens_seen": 276790820, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.26037598, "step": 12832, "time_per_iteration": 2.7435131072998047 }, { "auxiliary_loss_clip": 0.01271558, "auxiliary_loss_mlp": 0.00261734, "balance_loss_clip": 1.04828501, "balance_loss_mlp": 0.23537712, "epoch": 0.7715617014880505, "flos": 20229343038720.0, "grad_norm": 20.224523601322257, "language_loss": 0.86119008, "learning_rate": 5.227695536380572e-07, "loss": 0.87652302, "num_input_tokens_seen": 276811345, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.2635498, "step": 12833, "time_per_iteration": 2.639610528945923 }, { "auxiliary_loss_clip": 0.01096235, "auxiliary_loss_mlp": 0.0007292, "balance_loss_clip": 0.95523751, "balance_loss_mlp": 0.06638706, "epoch": 0.7716218247407185, "flos": 63664770971520.0, "grad_norm": 0.8052558704835378, "language_loss": 0.54174203, "learning_rate": 5.22507034914933e-07, "loss": 0.5534336, "num_input_tokens_seen": 276870950, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.06542969, "step": 12834, "time_per_iteration": 3.150012731552124 }, { "auxiliary_loss_clip": 0.01250611, "auxiliary_loss_mlp": 0.0028307, "balance_loss_clip": 1.03174353, "balance_loss_mlp": 0.25645041, "epoch": 0.7716819479933864, "flos": 19791807471360.0, "grad_norm": 30.25373318608111, "language_loss": 0.82050693, "learning_rate": 5.222445722184903e-07, "loss": 0.8358438, "num_input_tokens_seen": 276890760, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.26611328, "step": 12835, "time_per_iteration": 2.6310555934906006 }, { "auxiliary_loss_clip": 0.01261926, "auxiliary_loss_mlp": 0.00246856, "balance_loss_clip": 1.04230297, "balance_loss_mlp": 0.22153637, "epoch": 0.7717420712460544, "flos": 18442176825600.0, "grad_norm": 562.5870816413186, "language_loss": 0.81561518, "learning_rate": 5.219821655586814e-07, "loss": 0.83070296, "num_input_tokens_seen": 276909625, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25341797, "step": 12836, "time_per_iteration": 2.66184139251709 }, { "auxiliary_loss_clip": 0.01257812, "auxiliary_loss_mlp": 0.00238728, "balance_loss_clip": 1.04291272, "balance_loss_mlp": 0.21367061, "epoch": 0.7718021944987223, "flos": 35189476456320.0, "grad_norm": 14.38765697721291, "language_loss": 0.68767595, "learning_rate": 5.217198149454575e-07, "loss": 0.70264137, "num_input_tokens_seen": 276930760, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.25073242, "step": 12837, "time_per_iteration": 2.8509364128112793 }, { "auxiliary_loss_clip": 0.01092524, "auxiliary_loss_mlp": 0.00060548, "balance_loss_clip": 0.94791818, "balance_loss_mlp": 0.054611, "epoch": 0.7718623177513904, "flos": 67923167961600.0, "grad_norm": 0.8201031157277134, "language_loss": 0.55044687, "learning_rate": 5.214575203887666e-07, "loss": 0.56197762, "num_input_tokens_seen": 276989580, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.05932617, "step": 12838, "time_per_iteration": 3.0957891941070557 }, { "auxiliary_loss_clip": 0.01251353, "auxiliary_loss_mlp": 0.0022867, "balance_loss_clip": 1.03492177, "balance_loss_mlp": 0.20476812, "epoch": 0.7719224410040583, "flos": 18581401941120.0, "grad_norm": 1815.8216406940726, "language_loss": 0.77343464, "learning_rate": 5.211952818985538e-07, "loss": 0.78823495, "num_input_tokens_seen": 277005450, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.23913574, "step": 12839, "time_per_iteration": 2.635469675064087 }, { "auxiliary_loss_clip": 0.01242992, "auxiliary_loss_mlp": 0.00241635, "balance_loss_clip": 1.02984405, "balance_loss_mlp": 0.21638671, "epoch": 0.7719825642567263, "flos": 23075802264960.0, "grad_norm": 5.398951413699886, "language_loss": 0.87118489, "learning_rate": 5.209330994847647e-07, "loss": 0.88603115, "num_input_tokens_seen": 277023055, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.25256348, "step": 12840, "time_per_iteration": 2.655405044555664 }, { "auxiliary_loss_clip": 0.01237591, "auxiliary_loss_mlp": 0.00239968, "balance_loss_clip": 1.02275515, "balance_loss_mlp": 0.21616162, "epoch": 0.7720426875093943, "flos": 20339086066560.0, "grad_norm": 3.40554664932091, "language_loss": 0.86430502, "learning_rate": 5.206709731573402e-07, "loss": 0.87908059, "num_input_tokens_seen": 277041150, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.23828125, "step": 12841, "time_per_iteration": 2.7080938816070557 }, { "auxiliary_loss_clip": 0.01253386, "auxiliary_loss_mlp": 0.00243955, "balance_loss_clip": 1.03272057, "balance_loss_mlp": 0.21875432, "epoch": 0.7721028107620622, "flos": 23880704181120.0, "grad_norm": 7.732467457494695, "language_loss": 0.82000911, "learning_rate": 5.204089029262208e-07, "loss": 0.83498251, "num_input_tokens_seen": 277063895, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25219727, "step": 12842, "time_per_iteration": 2.835198402404785 }, { "auxiliary_loss_clip": 0.01256554, "auxiliary_loss_mlp": 0.00251219, "balance_loss_clip": 1.03572512, "balance_loss_mlp": 0.22600672, "epoch": 0.7721629340147302, "flos": 26651571235200.0, "grad_norm": 78.87335811360593, "language_loss": 0.75566822, "learning_rate": 5.201468888013445e-07, "loss": 0.77074599, "num_input_tokens_seen": 277084045, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25231934, "step": 12843, "time_per_iteration": 2.823942184448242 }, { "auxiliary_loss_clip": 0.01256806, "auxiliary_loss_mlp": 0.00266694, "balance_loss_clip": 1.03622162, "balance_loss_mlp": 0.24103998, "epoch": 0.7722230572673981, "flos": 21178857110400.0, "grad_norm": 102.7039147772605, "language_loss": 0.83170915, "learning_rate": 5.198849307926465e-07, "loss": 0.84694409, "num_input_tokens_seen": 277102625, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.25634766, "step": 12844, "time_per_iteration": 2.663763999938965 }, { "auxiliary_loss_clip": 0.01231494, "auxiliary_loss_mlp": 0.00243468, "balance_loss_clip": 1.02236283, "balance_loss_mlp": 0.22012734, "epoch": 0.7722831805200662, "flos": 27964644814080.0, "grad_norm": 45.246408589918914, "language_loss": 0.78400964, "learning_rate": 5.196230289100596e-07, "loss": 0.79875928, "num_input_tokens_seen": 277123210, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.2331543, "step": 12845, "time_per_iteration": 2.716320514678955 }, { "auxiliary_loss_clip": 0.01221911, "auxiliary_loss_mlp": 0.00227264, "balance_loss_clip": 1.01300359, "balance_loss_mlp": 0.20466161, "epoch": 0.7723433037727341, "flos": 33875576864640.0, "grad_norm": 5.611056447270926, "language_loss": 0.71425295, "learning_rate": 5.193611831635159e-07, "loss": 0.72874475, "num_input_tokens_seen": 277144895, "router_z_loss_clip": 2.08886719, "router_z_loss_mlp": 0.22583008, "step": 12846, "time_per_iteration": 2.753615617752075 }, { "auxiliary_loss_clip": 0.01100416, "auxiliary_loss_mlp": 0.00032445, "balance_loss_clip": 0.95630026, "balance_loss_mlp": 0.02646067, "epoch": 0.7724034270254021, "flos": 62848271940480.0, "grad_norm": 67.97940199374975, "language_loss": 0.60842174, "learning_rate": 5.19099393562945e-07, "loss": 0.61975032, "num_input_tokens_seen": 277205160, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.05981445, "step": 12847, "time_per_iteration": 4.572547197341919 }, { "auxiliary_loss_clip": 0.01235022, "auxiliary_loss_mlp": 0.00268119, "balance_loss_clip": 1.02058935, "balance_loss_mlp": 0.24357402, "epoch": 0.77246355027807, "flos": 23295467888640.0, "grad_norm": 4.624236479668811, "language_loss": 0.86120176, "learning_rate": 5.188376601182732e-07, "loss": 0.8762331, "num_input_tokens_seen": 277223005, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.2454834, "step": 12848, "time_per_iteration": 2.77409029006958 }, { "auxiliary_loss_clip": 0.0126538, "auxiliary_loss_mlp": 0.00249479, "balance_loss_clip": 1.03962719, "balance_loss_mlp": 0.22325256, "epoch": 0.772523673530738, "flos": 20121287950080.0, "grad_norm": 38.36378532454981, "language_loss": 0.79855978, "learning_rate": 5.185759828394261e-07, "loss": 0.81370831, "num_input_tokens_seen": 277241785, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26257324, "step": 12849, "time_per_iteration": 2.648344039916992 }, { "auxiliary_loss_clip": 0.01258105, "auxiliary_loss_mlp": 0.00236208, "balance_loss_clip": 1.04068947, "balance_loss_mlp": 0.21156734, "epoch": 0.7725837967834059, "flos": 17820096157440.0, "grad_norm": 13.351689425649063, "language_loss": 0.8722592, "learning_rate": 5.183143617363261e-07, "loss": 0.88720232, "num_input_tokens_seen": 277259050, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24658203, "step": 12850, "time_per_iteration": 4.033052921295166 }, { "auxiliary_loss_clip": 0.01253329, "auxiliary_loss_mlp": 0.0023898, "balance_loss_clip": 1.03426695, "balance_loss_mlp": 0.21255097, "epoch": 0.772643920036074, "flos": 27198921657600.0, "grad_norm": 12.392744519384172, "language_loss": 0.87751281, "learning_rate": 5.180527968188935e-07, "loss": 0.89243591, "num_input_tokens_seen": 277278235, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.26452637, "step": 12851, "time_per_iteration": 2.7129900455474854 }, { "auxiliary_loss_clip": 0.01240523, "auxiliary_loss_mlp": 0.00222649, "balance_loss_clip": 1.02472639, "balance_loss_mlp": 0.19848561, "epoch": 0.7727040432887419, "flos": 21579512388480.0, "grad_norm": 198.11304910982685, "language_loss": 0.81199026, "learning_rate": 5.177912880970474e-07, "loss": 0.82662201, "num_input_tokens_seen": 277298355, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.24157715, "step": 12852, "time_per_iteration": 2.7089927196502686 }, { "auxiliary_loss_clip": 0.01226376, "auxiliary_loss_mlp": 0.00238518, "balance_loss_clip": 1.0154314, "balance_loss_mlp": 0.21363913, "epoch": 0.7727641665414099, "flos": 22236641752320.0, "grad_norm": 13.575638587301421, "language_loss": 0.89070535, "learning_rate": 5.17529835580704e-07, "loss": 0.90535426, "num_input_tokens_seen": 277316095, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.24890137, "step": 12853, "time_per_iteration": 4.197924852371216 }, { "auxiliary_loss_clip": 0.01100769, "auxiliary_loss_mlp": 0.00048537, "balance_loss_clip": 0.95566106, "balance_loss_mlp": 0.04205186, "epoch": 0.7728242897940779, "flos": 54832221463680.0, "grad_norm": 0.7826021142164474, "language_loss": 0.5357641, "learning_rate": 5.172684392797786e-07, "loss": 0.54725718, "num_input_tokens_seen": 277380130, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.06494141, "step": 12854, "time_per_iteration": 3.2553722858428955 }, { "auxiliary_loss_clip": 0.01274509, "auxiliary_loss_mlp": 0.0023304, "balance_loss_clip": 1.05084336, "balance_loss_mlp": 0.20525248, "epoch": 0.7728844130467458, "flos": 34461962392320.0, "grad_norm": 15.500718585212713, "language_loss": 0.8212567, "learning_rate": 5.170070992041826e-07, "loss": 0.8363322, "num_input_tokens_seen": 277404015, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.27783203, "step": 12855, "time_per_iteration": 2.7610433101654053 }, { "auxiliary_loss_clip": 0.01261009, "auxiliary_loss_mlp": 0.00216057, "balance_loss_clip": 1.03454733, "balance_loss_mlp": 0.19012864, "epoch": 0.7729445362994138, "flos": 18916341287040.0, "grad_norm": 202.5300899485053, "language_loss": 0.78207308, "learning_rate": 5.167458153638254e-07, "loss": 0.79684377, "num_input_tokens_seen": 277421375, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.25964355, "step": 12856, "time_per_iteration": 2.6504650115966797 }, { "auxiliary_loss_clip": 0.01254497, "auxiliary_loss_mlp": 0.00218478, "balance_loss_clip": 1.03470612, "balance_loss_mlp": 0.19314599, "epoch": 0.7730046595520818, "flos": 22200048771840.0, "grad_norm": 5.670016457707896, "language_loss": 0.86097628, "learning_rate": 5.164845877686162e-07, "loss": 0.87570608, "num_input_tokens_seen": 277440170, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25317383, "step": 12857, "time_per_iteration": 2.6666340827941895 }, { "auxiliary_loss_clip": 0.01228201, "auxiliary_loss_mlp": 0.00230501, "balance_loss_clip": 1.01768041, "balance_loss_mlp": 0.20680171, "epoch": 0.7730647828047498, "flos": 13552328695680.0, "grad_norm": 14.467035711416884, "language_loss": 0.85646433, "learning_rate": 5.162234164284591e-07, "loss": 0.87105131, "num_input_tokens_seen": 277456880, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.23718262, "step": 12858, "time_per_iteration": 4.027954578399658 }, { "auxiliary_loss_clip": 0.01250913, "auxiliary_loss_mlp": 0.00244081, "balance_loss_clip": 1.03131199, "balance_loss_mlp": 0.21746175, "epoch": 0.7731249060574177, "flos": 21976037602560.0, "grad_norm": 12.087293472504545, "language_loss": 0.85196459, "learning_rate": 5.159623013532591e-07, "loss": 0.86691451, "num_input_tokens_seen": 277475365, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.26586914, "step": 12859, "time_per_iteration": 2.7285454273223877 }, { "auxiliary_loss_clip": 0.01243081, "auxiliary_loss_mlp": 0.00202088, "balance_loss_clip": 1.03400326, "balance_loss_mlp": 0.18061796, "epoch": 0.7731850293100857, "flos": 22601817371520.0, "grad_norm": 7.523682611188777, "language_loss": 0.74703711, "learning_rate": 5.157012425529186e-07, "loss": 0.7614888, "num_input_tokens_seen": 277494975, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.21484375, "step": 12860, "time_per_iteration": 2.6868538856506348 }, { "auxiliary_loss_clip": 0.01269117, "auxiliary_loss_mlp": 0.00221612, "balance_loss_clip": 1.04273272, "balance_loss_mlp": 0.19401537, "epoch": 0.7732451525627536, "flos": 14098422142080.0, "grad_norm": 97.88251598367559, "language_loss": 0.88067472, "learning_rate": 5.154402400373343e-07, "loss": 0.89558196, "num_input_tokens_seen": 277510520, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.27612305, "step": 12861, "time_per_iteration": 2.6787497997283936 }, { "auxiliary_loss_clip": 0.01274055, "auxiliary_loss_mlp": 0.00235257, "balance_loss_clip": 1.0511055, "balance_loss_mlp": 0.20856632, "epoch": 0.7733052758154216, "flos": 21470020755840.0, "grad_norm": 2.932567642750793, "language_loss": 0.83150017, "learning_rate": 5.15179293816405e-07, "loss": 0.84659326, "num_input_tokens_seen": 277530505, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26647949, "step": 12862, "time_per_iteration": 2.7404251098632812 }, { "auxiliary_loss_clip": 0.01238265, "auxiliary_loss_mlp": 0.00235647, "balance_loss_clip": 1.02582455, "balance_loss_mlp": 0.21076852, "epoch": 0.7733653990680895, "flos": 21394284929280.0, "grad_norm": 9.76090935719173, "language_loss": 0.8750608, "learning_rate": 5.149184039000256e-07, "loss": 0.88979995, "num_input_tokens_seen": 277550810, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.24902344, "step": 12863, "time_per_iteration": 2.7067933082580566 }, { "auxiliary_loss_clip": 0.01252533, "auxiliary_loss_mlp": 0.00222971, "balance_loss_clip": 1.03417289, "balance_loss_mlp": 0.1982944, "epoch": 0.7734255223207576, "flos": 17676058619520.0, "grad_norm": 9.485917790931303, "language_loss": 0.79530221, "learning_rate": 5.146575702980898e-07, "loss": 0.81005728, "num_input_tokens_seen": 277567680, "router_z_loss_clip": 2.18457031, "router_z_loss_mlp": 0.24694824, "step": 12864, "time_per_iteration": 2.7051379680633545 }, { "auxiliary_loss_clip": 0.01246079, "auxiliary_loss_mlp": 0.00218817, "balance_loss_clip": 1.03176343, "balance_loss_mlp": 0.19483194, "epoch": 0.7734856455734255, "flos": 25230837617280.0, "grad_norm": 4.729697439862633, "language_loss": 0.86672622, "learning_rate": 5.143967930204871e-07, "loss": 0.88137519, "num_input_tokens_seen": 277588970, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.23986816, "step": 12865, "time_per_iteration": 2.742351770401001 }, { "auxiliary_loss_clip": 0.01273122, "auxiliary_loss_mlp": 0.00260245, "balance_loss_clip": 1.04525065, "balance_loss_mlp": 0.23208769, "epoch": 0.7735457688260935, "flos": 23433112805760.0, "grad_norm": 32.3943662400385, "language_loss": 0.80434179, "learning_rate": 5.141360720771077e-07, "loss": 0.81967545, "num_input_tokens_seen": 277605450, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.28173828, "step": 12866, "time_per_iteration": 2.7930376529693604 }, { "auxiliary_loss_clip": 0.01275507, "auxiliary_loss_mlp": 0.00225657, "balance_loss_clip": 1.05099916, "balance_loss_mlp": 0.19856054, "epoch": 0.7736058920787615, "flos": 18729246320640.0, "grad_norm": 150.04346315802664, "language_loss": 0.72458982, "learning_rate": 5.138754074778371e-07, "loss": 0.73960143, "num_input_tokens_seen": 277622530, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.27075195, "step": 12867, "time_per_iteration": 2.6135456562042236 }, { "auxiliary_loss_clip": 0.01236851, "auxiliary_loss_mlp": 0.00224775, "balance_loss_clip": 1.02354383, "balance_loss_mlp": 0.19850126, "epoch": 0.7736660153314294, "flos": 22893304239360.0, "grad_norm": 92.313965213826, "language_loss": 0.76230502, "learning_rate": 5.136147992325595e-07, "loss": 0.77692127, "num_input_tokens_seen": 277642700, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.26281738, "step": 12868, "time_per_iteration": 2.6858370304107666 }, { "auxiliary_loss_clip": 0.0125775, "auxiliary_loss_mlp": 0.00217617, "balance_loss_clip": 1.04190183, "balance_loss_mlp": 0.19178453, "epoch": 0.7737261385840974, "flos": 13800901789440.0, "grad_norm": 7.273265223688193, "language_loss": 0.865152, "learning_rate": 5.133542473511578e-07, "loss": 0.8799057, "num_input_tokens_seen": 277660005, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.25830078, "step": 12869, "time_per_iteration": 2.607187271118164 }, { "auxiliary_loss_clip": 0.01247175, "auxiliary_loss_mlp": 0.00250024, "balance_loss_clip": 1.03018761, "balance_loss_mlp": 0.22413133, "epoch": 0.7737862618367654, "flos": 28730727106560.0, "grad_norm": 4.39831510983338, "language_loss": 0.81009841, "learning_rate": 5.130937518435124e-07, "loss": 0.82507038, "num_input_tokens_seen": 277682890, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.25915527, "step": 12870, "time_per_iteration": 2.7541966438293457 }, { "auxiliary_loss_clip": 0.01256278, "auxiliary_loss_mlp": 0.00212046, "balance_loss_clip": 1.03877234, "balance_loss_mlp": 0.18671355, "epoch": 0.7738463850894334, "flos": 17018570119680.0, "grad_norm": 4.1264017274205536, "language_loss": 0.84140241, "learning_rate": 5.12833312719501e-07, "loss": 0.85608554, "num_input_tokens_seen": 277699330, "router_z_loss_clip": 2.17675781, "router_z_loss_mlp": 0.25341797, "step": 12871, "time_per_iteration": 2.6025922298431396 }, { "auxiliary_loss_clip": 0.01220366, "auxiliary_loss_mlp": 0.00244197, "balance_loss_clip": 1.01034617, "balance_loss_mlp": 0.22004545, "epoch": 0.7739065083421013, "flos": 20704010290560.0, "grad_norm": 40.57034174177533, "language_loss": 0.77485663, "learning_rate": 5.12572929988999e-07, "loss": 0.78950226, "num_input_tokens_seen": 277718750, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.24157715, "step": 12872, "time_per_iteration": 2.6933298110961914 }, { "auxiliary_loss_clip": 0.0125718, "auxiliary_loss_mlp": 0.0022768, "balance_loss_clip": 1.0372653, "balance_loss_mlp": 0.20207383, "epoch": 0.7739666315947693, "flos": 20697222620160.0, "grad_norm": 122.61065511138243, "language_loss": 0.93445128, "learning_rate": 5.123126036618804e-07, "loss": 0.94929981, "num_input_tokens_seen": 277734645, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25598145, "step": 12873, "time_per_iteration": 2.644747495651245 }, { "auxiliary_loss_clip": 0.01264844, "auxiliary_loss_mlp": 0.00226941, "balance_loss_clip": 1.04551315, "balance_loss_mlp": 0.20251517, "epoch": 0.7740267548474372, "flos": 29570677718400.0, "grad_norm": 252.929357831653, "language_loss": 0.74547589, "learning_rate": 5.120523337480174e-07, "loss": 0.76039374, "num_input_tokens_seen": 277755535, "router_z_loss_clip": 2.19628906, "router_z_loss_mlp": 0.24450684, "step": 12874, "time_per_iteration": 2.734696388244629 }, { "auxiliary_loss_clip": 0.01260723, "auxiliary_loss_mlp": 0.0022603, "balance_loss_clip": 1.0435648, "balance_loss_mlp": 0.20078164, "epoch": 0.7740868781001052, "flos": 23659099223040.0, "grad_norm": 63.76851603315143, "language_loss": 0.70320332, "learning_rate": 5.117921202572785e-07, "loss": 0.71807075, "num_input_tokens_seen": 277775585, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25256348, "step": 12875, "time_per_iteration": 2.674215078353882 }, { "auxiliary_loss_clip": 0.01254101, "auxiliary_loss_mlp": 0.00237563, "balance_loss_clip": 1.03203773, "balance_loss_mlp": 0.21056256, "epoch": 0.7741470013527731, "flos": 24717314828160.0, "grad_norm": 43.599243578066805, "language_loss": 0.72143799, "learning_rate": 5.115319631995318e-07, "loss": 0.73635465, "num_input_tokens_seen": 277794795, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.26977539, "step": 12876, "time_per_iteration": 2.702072858810425 }, { "auxiliary_loss_clip": 0.01248004, "auxiliary_loss_mlp": 0.00231079, "balance_loss_clip": 1.02965868, "balance_loss_mlp": 0.20642647, "epoch": 0.7742071246054412, "flos": 21871645701120.0, "grad_norm": 17.87293561592834, "language_loss": 0.7967748, "learning_rate": 5.112718625846433e-07, "loss": 0.81156558, "num_input_tokens_seen": 277813235, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24645996, "step": 12877, "time_per_iteration": 2.6738336086273193 }, { "auxiliary_loss_clip": 0.01263484, "auxiliary_loss_mlp": 0.00229647, "balance_loss_clip": 1.0399189, "balance_loss_mlp": 0.20289619, "epoch": 0.7742672478581091, "flos": 22674249146880.0, "grad_norm": 548.3668551485101, "language_loss": 0.8991099, "learning_rate": 5.110118184224736e-07, "loss": 0.91404116, "num_input_tokens_seen": 277832560, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.2677002, "step": 12878, "time_per_iteration": 2.7053792476654053 }, { "auxiliary_loss_clip": 0.01257652, "auxiliary_loss_mlp": 0.00239467, "balance_loss_clip": 1.0371865, "balance_loss_mlp": 0.21382508, "epoch": 0.7743273711107771, "flos": 18840892769280.0, "grad_norm": 8.65563500519384, "language_loss": 0.79984903, "learning_rate": 5.10751830722885e-07, "loss": 0.81482023, "num_input_tokens_seen": 277850120, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25646973, "step": 12879, "time_per_iteration": 2.7803151607513428 }, { "auxiliary_loss_clip": 0.0123907, "auxiliary_loss_mlp": 0.00246107, "balance_loss_clip": 1.02596426, "balance_loss_mlp": 0.22063223, "epoch": 0.7743874943634451, "flos": 28729326476160.0, "grad_norm": 292.72859435305935, "language_loss": 0.85317731, "learning_rate": 5.104918994957364e-07, "loss": 0.868029, "num_input_tokens_seen": 277871020, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.25463867, "step": 12880, "time_per_iteration": 2.812614917755127 }, { "auxiliary_loss_clip": 0.01245972, "auxiliary_loss_mlp": 0.00234347, "balance_loss_clip": 1.0286088, "balance_loss_mlp": 0.20834711, "epoch": 0.774447617616113, "flos": 21909639312000.0, "grad_norm": 3.8653551791372496, "language_loss": 0.78073287, "learning_rate": 5.102320247508847e-07, "loss": 0.79553604, "num_input_tokens_seen": 277891525, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.26000977, "step": 12881, "time_per_iteration": 2.7884013652801514 }, { "auxiliary_loss_clip": 0.01269778, "auxiliary_loss_mlp": 0.00264359, "balance_loss_clip": 1.04443073, "balance_loss_mlp": 0.23496228, "epoch": 0.774507740868781, "flos": 19500643825920.0, "grad_norm": 15.672923749609533, "language_loss": 0.91374868, "learning_rate": 5.099722064981832e-07, "loss": 0.92909002, "num_input_tokens_seen": 277910425, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.29418945, "step": 12882, "time_per_iteration": 2.6561548709869385 }, { "auxiliary_loss_clip": 0.01076604, "auxiliary_loss_mlp": 0.00050599, "balance_loss_clip": 0.92966306, "balance_loss_mlp": 0.04306469, "epoch": 0.774567864121449, "flos": 59426560402560.0, "grad_norm": 0.7544995522018929, "language_loss": 0.59548444, "learning_rate": 5.097124447474858e-07, "loss": 0.60675645, "num_input_tokens_seen": 277972795, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.07519531, "step": 12883, "time_per_iteration": 3.128261089324951 }, { "auxiliary_loss_clip": 0.01254095, "auxiliary_loss_mlp": 0.00231005, "balance_loss_clip": 1.03616977, "balance_loss_mlp": 0.20480278, "epoch": 0.774627987374117, "flos": 13225326255360.0, "grad_norm": 15.412545171402407, "language_loss": 0.82867396, "learning_rate": 5.094527395086416e-07, "loss": 0.84352493, "num_input_tokens_seen": 277990675, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.26171875, "step": 12884, "time_per_iteration": 2.6274242401123047 }, { "auxiliary_loss_clip": 0.01248067, "auxiliary_loss_mlp": 0.00228196, "balance_loss_clip": 1.03743172, "balance_loss_mlp": 0.20418699, "epoch": 0.7746881106267849, "flos": 21394033534080.0, "grad_norm": 84.65330546812044, "language_loss": 0.85873926, "learning_rate": 5.091930907914986e-07, "loss": 0.87350184, "num_input_tokens_seen": 278010050, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.23999023, "step": 12885, "time_per_iteration": 2.753474473953247 }, { "auxiliary_loss_clip": 0.01243964, "auxiliary_loss_mlp": 0.00214413, "balance_loss_clip": 1.02215183, "balance_loss_mlp": 0.18949872, "epoch": 0.7747482338794529, "flos": 25629338079360.0, "grad_norm": 11.768135936337396, "language_loss": 0.74697912, "learning_rate": 5.089334986059029e-07, "loss": 0.76156288, "num_input_tokens_seen": 278030660, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24926758, "step": 12886, "time_per_iteration": 2.723646640777588 }, { "auxiliary_loss_clip": 0.01245465, "auxiliary_loss_mlp": 0.00221034, "balance_loss_clip": 1.02917004, "balance_loss_mlp": 0.19695359, "epoch": 0.7748083571321208, "flos": 11546933402880.0, "grad_norm": 3.511893999971291, "language_loss": 0.76930857, "learning_rate": 5.086739629616987e-07, "loss": 0.78397352, "num_input_tokens_seen": 278047645, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.24084473, "step": 12887, "time_per_iteration": 2.681191921234131 }, { "auxiliary_loss_clip": 0.01221921, "auxiliary_loss_mlp": 0.00208193, "balance_loss_clip": 1.01107955, "balance_loss_mlp": 0.18506634, "epoch": 0.7748684803847888, "flos": 19062425900160.0, "grad_norm": 4.348727360250039, "language_loss": 0.80538106, "learning_rate": 5.084144838687275e-07, "loss": 0.81968218, "num_input_tokens_seen": 278066170, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.23132324, "step": 12888, "time_per_iteration": 2.689887523651123 }, { "auxiliary_loss_clip": 0.01244472, "auxiliary_loss_mlp": 0.002437, "balance_loss_clip": 1.02637637, "balance_loss_mlp": 0.21820083, "epoch": 0.7749286036374567, "flos": 22273162905600.0, "grad_norm": 49.732738148633715, "language_loss": 0.89527822, "learning_rate": 5.081550613368279e-07, "loss": 0.91015989, "num_input_tokens_seen": 278085545, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25512695, "step": 12889, "time_per_iteration": 4.058763742446899 }, { "auxiliary_loss_clip": 0.01273057, "auxiliary_loss_mlp": 0.00252601, "balance_loss_clip": 1.04872, "balance_loss_mlp": 0.22592232, "epoch": 0.7749887268901248, "flos": 20192462749440.0, "grad_norm": 15.78685214344116, "language_loss": 0.86309278, "learning_rate": 5.07895695375838e-07, "loss": 0.87834942, "num_input_tokens_seen": 278102995, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26672363, "step": 12890, "time_per_iteration": 2.6511127948760986 }, { "auxiliary_loss_clip": 0.01272772, "auxiliary_loss_mlp": 0.00241751, "balance_loss_clip": 1.0477531, "balance_loss_mlp": 0.2153701, "epoch": 0.7750488501427927, "flos": 20337541781760.0, "grad_norm": 11.571241039085566, "language_loss": 0.7536521, "learning_rate": 5.076363859955932e-07, "loss": 0.76879734, "num_input_tokens_seen": 278121460, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.26379395, "step": 12891, "time_per_iteration": 2.66204833984375 }, { "auxiliary_loss_clip": 0.01271856, "auxiliary_loss_mlp": 0.00216212, "balance_loss_clip": 1.04791903, "balance_loss_mlp": 0.19085652, "epoch": 0.7751089733954607, "flos": 28364043116160.0, "grad_norm": 28.187115181631835, "language_loss": 0.85474479, "learning_rate": 5.073771332059257e-07, "loss": 0.86962545, "num_input_tokens_seen": 278143905, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.25366211, "step": 12892, "time_per_iteration": 4.158003091812134 }, { "auxiliary_loss_clip": 0.01287846, "auxiliary_loss_mlp": 0.00243394, "balance_loss_clip": 1.05800271, "balance_loss_mlp": 0.2148906, "epoch": 0.7751690966481286, "flos": 16943803960320.0, "grad_norm": 27.914766266578305, "language_loss": 0.79496288, "learning_rate": 5.071179370166669e-07, "loss": 0.81027526, "num_input_tokens_seen": 278160850, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.28466797, "step": 12893, "time_per_iteration": 2.765023946762085 }, { "auxiliary_loss_clip": 0.01085096, "auxiliary_loss_mlp": 0.00023706, "balance_loss_clip": 0.93632233, "balance_loss_mlp": 0.01564701, "epoch": 0.7752292199007966, "flos": 65668050339840.0, "grad_norm": 2.0906801949986775, "language_loss": 0.57774091, "learning_rate": 5.068587974376468e-07, "loss": 0.58882892, "num_input_tokens_seen": 278219950, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.08056641, "step": 12894, "time_per_iteration": 3.3085410594940186 }, { "auxiliary_loss_clip": 0.01253448, "auxiliary_loss_mlp": 0.00223614, "balance_loss_clip": 1.0317471, "balance_loss_mlp": 0.19651753, "epoch": 0.7752893431534646, "flos": 20594662312320.0, "grad_norm": 11.22484025629422, "language_loss": 0.87018561, "learning_rate": 5.065997144786895e-07, "loss": 0.88495624, "num_input_tokens_seen": 278237805, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.27124023, "step": 12895, "time_per_iteration": 2.716477632522583 }, { "auxiliary_loss_clip": 0.01258332, "auxiliary_loss_mlp": 0.00233313, "balance_loss_clip": 1.03430784, "balance_loss_mlp": 0.20711058, "epoch": 0.7753494664061326, "flos": 20485350247680.0, "grad_norm": 5184.048696331857, "language_loss": 0.75281, "learning_rate": 5.063406881496209e-07, "loss": 0.76772648, "num_input_tokens_seen": 278257660, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.26220703, "step": 12896, "time_per_iteration": 4.217605113983154 }, { "auxiliary_loss_clip": 0.01266195, "auxiliary_loss_mlp": 0.00234265, "balance_loss_clip": 1.04849482, "balance_loss_mlp": 0.21038762, "epoch": 0.7754095896588006, "flos": 20265900105600.0, "grad_norm": 12.04671452490976, "language_loss": 0.74984336, "learning_rate": 5.060817184602629e-07, "loss": 0.76484793, "num_input_tokens_seen": 278275110, "router_z_loss_clip": 2.17675781, "router_z_loss_mlp": 0.23876953, "step": 12897, "time_per_iteration": 2.6508378982543945 }, { "auxiliary_loss_clip": 0.01259441, "auxiliary_loss_mlp": 0.00222146, "balance_loss_clip": 1.0395267, "balance_loss_mlp": 0.19544317, "epoch": 0.7754697129114685, "flos": 23331091201920.0, "grad_norm": 4.243216019129883, "language_loss": 0.8178463, "learning_rate": 5.058228054204364e-07, "loss": 0.83266217, "num_input_tokens_seen": 278293035, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.2668457, "step": 12898, "time_per_iteration": 2.7206027507781982 }, { "auxiliary_loss_clip": 0.01271332, "auxiliary_loss_mlp": 0.00228821, "balance_loss_clip": 1.04489064, "balance_loss_mlp": 0.20065166, "epoch": 0.7755298361641365, "flos": 17347619635200.0, "grad_norm": 15.815378932386963, "language_loss": 0.78160596, "learning_rate": 5.055639490399588e-07, "loss": 0.79660749, "num_input_tokens_seen": 278311010, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.28149414, "step": 12899, "time_per_iteration": 2.7522242069244385 }, { "auxiliary_loss_clip": 0.0127699, "auxiliary_loss_mlp": 0.00224502, "balance_loss_clip": 1.05230784, "balance_loss_mlp": 0.19754831, "epoch": 0.7755899594168044, "flos": 19645866512640.0, "grad_norm": 16.617663377278298, "language_loss": 0.85282987, "learning_rate": 5.053051493286453e-07, "loss": 0.86784482, "num_input_tokens_seen": 278329900, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.26977539, "step": 12900, "time_per_iteration": 4.044504404067993 }, { "auxiliary_loss_clip": 0.01242908, "auxiliary_loss_mlp": 0.00219852, "balance_loss_clip": 1.03070295, "balance_loss_mlp": 0.19533044, "epoch": 0.7756500826694724, "flos": 27414457217280.0, "grad_norm": 17.922385941984402, "language_loss": 0.83115011, "learning_rate": 5.050464062963113e-07, "loss": 0.84577775, "num_input_tokens_seen": 278349980, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.24523926, "step": 12901, "time_per_iteration": 2.70332670211792 }, { "auxiliary_loss_clip": 0.01246146, "auxiliary_loss_mlp": 0.00214791, "balance_loss_clip": 1.03430629, "balance_loss_mlp": 0.19066326, "epoch": 0.7757102059221404, "flos": 28730511624960.0, "grad_norm": 2.6693492486493993, "language_loss": 0.84294105, "learning_rate": 5.047877199527666e-07, "loss": 0.85755044, "num_input_tokens_seen": 278372485, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.24108887, "step": 12902, "time_per_iteration": 2.7551798820495605 }, { "auxiliary_loss_clip": 0.01258864, "auxiliary_loss_mlp": 0.00207996, "balance_loss_clip": 1.04043698, "balance_loss_mlp": 0.18279502, "epoch": 0.7757703291748084, "flos": 22486795044480.0, "grad_norm": 20.72398791325989, "language_loss": 0.80304325, "learning_rate": 5.045290903078215e-07, "loss": 0.81771177, "num_input_tokens_seen": 278391660, "router_z_loss_clip": 2.18457031, "router_z_loss_mlp": 0.25219727, "step": 12903, "time_per_iteration": 2.7267367839813232 }, { "auxiliary_loss_clip": 0.012726, "auxiliary_loss_mlp": 0.00233261, "balance_loss_clip": 1.04903889, "balance_loss_mlp": 0.20589103, "epoch": 0.7758304524274763, "flos": 21430159637760.0, "grad_norm": 9.44599644494363, "language_loss": 0.86103565, "learning_rate": 5.042705173712835e-07, "loss": 0.87609422, "num_input_tokens_seen": 278409125, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.27368164, "step": 12904, "time_per_iteration": 2.712670087814331 }, { "auxiliary_loss_clip": 0.01249326, "auxiliary_loss_mlp": 0.00219907, "balance_loss_clip": 1.03347564, "balance_loss_mlp": 0.19537354, "epoch": 0.7758905756801443, "flos": 23659242877440.0, "grad_norm": 145.01968003158225, "language_loss": 0.76398766, "learning_rate": 5.040120011529576e-07, "loss": 0.77867997, "num_input_tokens_seen": 278429450, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24523926, "step": 12905, "time_per_iteration": 2.7801804542541504 }, { "auxiliary_loss_clip": 0.01271277, "auxiliary_loss_mlp": 0.0023445, "balance_loss_clip": 1.04967964, "balance_loss_mlp": 0.2074133, "epoch": 0.7759506989328122, "flos": 28365479660160.0, "grad_norm": 12.369513367734433, "language_loss": 0.75895476, "learning_rate": 5.037535416626459e-07, "loss": 0.77401197, "num_input_tokens_seen": 278449925, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.27038574, "step": 12906, "time_per_iteration": 2.726865768432617 }, { "auxiliary_loss_clip": 0.01285394, "auxiliary_loss_mlp": 0.00239827, "balance_loss_clip": 1.05774939, "balance_loss_mlp": 0.20934498, "epoch": 0.7760108221854802, "flos": 14902785354240.0, "grad_norm": 8.730028173344728, "language_loss": 0.90438485, "learning_rate": 5.034951389101498e-07, "loss": 0.91963708, "num_input_tokens_seen": 278467255, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.30493164, "step": 12907, "time_per_iteration": 2.6092119216918945 }, { "auxiliary_loss_clip": 0.01242608, "auxiliary_loss_mlp": 0.00215528, "balance_loss_clip": 1.0313859, "balance_loss_mlp": 0.19113818, "epoch": 0.7760709454381483, "flos": 14792503622400.0, "grad_norm": 88.41672455471947, "language_loss": 0.77226973, "learning_rate": 5.032367929052685e-07, "loss": 0.78685105, "num_input_tokens_seen": 278484250, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.24377441, "step": 12908, "time_per_iteration": 2.660560369491577 }, { "auxiliary_loss_clip": 0.01252986, "auxiliary_loss_mlp": 0.00226527, "balance_loss_clip": 1.02843559, "balance_loss_mlp": 0.20046771, "epoch": 0.7761310686908162, "flos": 17379831156480.0, "grad_norm": 2.286484694379444, "language_loss": 0.79393625, "learning_rate": 5.029785036577976e-07, "loss": 0.80873132, "num_input_tokens_seen": 278502740, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26074219, "step": 12909, "time_per_iteration": 2.6452951431274414 }, { "auxiliary_loss_clip": 0.01240783, "auxiliary_loss_mlp": 0.00209074, "balance_loss_clip": 1.02746296, "balance_loss_mlp": 0.18443379, "epoch": 0.7761911919434842, "flos": 25556547168000.0, "grad_norm": 15.299219019368067, "language_loss": 0.74973387, "learning_rate": 5.027202711775324e-07, "loss": 0.76423252, "num_input_tokens_seen": 278523890, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24633789, "step": 12910, "time_per_iteration": 2.7392613887786865 }, { "auxiliary_loss_clip": 0.01258299, "auxiliary_loss_mlp": 0.00219211, "balance_loss_clip": 1.03491282, "balance_loss_mlp": 0.19361669, "epoch": 0.7762513151961521, "flos": 23179763203200.0, "grad_norm": 151.03230315895183, "language_loss": 0.79733413, "learning_rate": 5.024620954742646e-07, "loss": 0.81210923, "num_input_tokens_seen": 278543185, "router_z_loss_clip": 2.23144531, "router_z_loss_mlp": 0.25610352, "step": 12911, "time_per_iteration": 2.679025888442993 }, { "auxiliary_loss_clip": 0.0128329, "auxiliary_loss_mlp": 0.00230341, "balance_loss_clip": 1.05688322, "balance_loss_mlp": 0.20409098, "epoch": 0.7763114384488201, "flos": 21689614552320.0, "grad_norm": 10.750717420329636, "language_loss": 0.75663757, "learning_rate": 5.022039765577836e-07, "loss": 0.77177382, "num_input_tokens_seen": 278559220, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.2623291, "step": 12912, "time_per_iteration": 2.6502342224121094 }, { "auxiliary_loss_clip": 0.01085966, "auxiliary_loss_mlp": 0.00065452, "balance_loss_clip": 0.94050872, "balance_loss_mlp": 0.05624895, "epoch": 0.776371561701488, "flos": 69025554316800.0, "grad_norm": 0.765642590694399, "language_loss": 0.52766335, "learning_rate": 5.019459144378779e-07, "loss": 0.53917754, "num_input_tokens_seen": 278618185, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.09179688, "step": 12913, "time_per_iteration": 3.237210750579834 }, { "auxiliary_loss_clip": 0.01269822, "auxiliary_loss_mlp": 0.00224147, "balance_loss_clip": 1.04403853, "balance_loss_mlp": 0.19640742, "epoch": 0.776431684954156, "flos": 22893914770560.0, "grad_norm": 24.807566301629816, "language_loss": 0.71640491, "learning_rate": 5.016879091243338e-07, "loss": 0.73134458, "num_input_tokens_seen": 278636210, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.27758789, "step": 12914, "time_per_iteration": 2.680666208267212 }, { "auxiliary_loss_clip": 0.01257012, "auxiliary_loss_mlp": 0.00225214, "balance_loss_clip": 1.03812194, "balance_loss_mlp": 0.19973868, "epoch": 0.776491808206824, "flos": 20261554560000.0, "grad_norm": 1.8491742747754158, "language_loss": 0.88483894, "learning_rate": 5.014299606269339e-07, "loss": 0.89966118, "num_input_tokens_seen": 278653305, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.25500488, "step": 12915, "time_per_iteration": 2.655639171600342 }, { "auxiliary_loss_clip": 0.0125733, "auxiliary_loss_mlp": 0.00230356, "balance_loss_clip": 1.03608108, "balance_loss_mlp": 0.20362934, "epoch": 0.776551931459492, "flos": 26759051706240.0, "grad_norm": 331.1432131360405, "language_loss": 0.83203906, "learning_rate": 5.011720689554603e-07, "loss": 0.84691596, "num_input_tokens_seen": 278671850, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26745605, "step": 12916, "time_per_iteration": 2.7593867778778076 }, { "auxiliary_loss_clip": 0.01255338, "auxiliary_loss_mlp": 0.00221497, "balance_loss_clip": 1.03653932, "balance_loss_mlp": 0.19616537, "epoch": 0.7766120547121599, "flos": 52665080250240.0, "grad_norm": 10.859453580105704, "language_loss": 0.7156496, "learning_rate": 5.009142341196919e-07, "loss": 0.73041797, "num_input_tokens_seen": 278697860, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25354004, "step": 12917, "time_per_iteration": 3.0413341522216797 }, { "auxiliary_loss_clip": 0.0124412, "auxiliary_loss_mlp": 0.00213506, "balance_loss_clip": 1.03346169, "balance_loss_mlp": 0.18960452, "epoch": 0.7766721779648279, "flos": 25156215112320.0, "grad_norm": 34.22556083535453, "language_loss": 0.69995278, "learning_rate": 5.006564561294065e-07, "loss": 0.71452904, "num_input_tokens_seen": 278720655, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.23876953, "step": 12918, "time_per_iteration": 2.7178475856781006 }, { "auxiliary_loss_clip": 0.01249507, "auxiliary_loss_mlp": 0.00224154, "balance_loss_clip": 1.03486586, "balance_loss_mlp": 0.19882166, "epoch": 0.7767323012174958, "flos": 23760761690880.0, "grad_norm": 125.96937677754636, "language_loss": 0.80689877, "learning_rate": 5.003987349943777e-07, "loss": 0.82163537, "num_input_tokens_seen": 278737375, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.2532959, "step": 12919, "time_per_iteration": 2.6825029850006104 }, { "auxiliary_loss_clip": 0.01281837, "auxiliary_loss_mlp": 0.00258804, "balance_loss_clip": 1.05441177, "balance_loss_mlp": 0.22965762, "epoch": 0.7767924244701638, "flos": 22086642556800.0, "grad_norm": 79.62037085795713, "language_loss": 0.87163216, "learning_rate": 5.001410707243792e-07, "loss": 0.88703859, "num_input_tokens_seen": 278756510, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.29125977, "step": 12920, "time_per_iteration": 2.6795122623443604 }, { "auxiliary_loss_clip": 0.01265953, "auxiliary_loss_mlp": 0.00210512, "balance_loss_clip": 1.03912246, "balance_loss_mlp": 0.18224788, "epoch": 0.7768525477228319, "flos": 21981640124160.0, "grad_norm": 8.43449608872046, "language_loss": 0.77926683, "learning_rate": 4.998834633291829e-07, "loss": 0.7940315, "num_input_tokens_seen": 278775410, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.28259277, "step": 12921, "time_per_iteration": 2.655284881591797 }, { "auxiliary_loss_clip": 0.0127707, "auxiliary_loss_mlp": 0.002303, "balance_loss_clip": 1.04927933, "balance_loss_mlp": 0.20270288, "epoch": 0.7769126709754998, "flos": 21794581071360.0, "grad_norm": 470.82967839974094, "language_loss": 0.84249735, "learning_rate": 4.996259128185547e-07, "loss": 0.85757107, "num_input_tokens_seen": 278794260, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.27600098, "step": 12922, "time_per_iteration": 2.7056233882904053 }, { "auxiliary_loss_clip": 0.01250101, "auxiliary_loss_mlp": 0.00226739, "balance_loss_clip": 1.02947593, "balance_loss_mlp": 0.201252, "epoch": 0.7769727942281678, "flos": 20047994248320.0, "grad_norm": 18.312790646180392, "language_loss": 0.87952638, "learning_rate": 4.993684192022625e-07, "loss": 0.89429474, "num_input_tokens_seen": 278813290, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.25476074, "step": 12923, "time_per_iteration": 2.6798768043518066 }, { "auxiliary_loss_clip": 0.01246017, "auxiliary_loss_mlp": 0.00239879, "balance_loss_clip": 1.0312078, "balance_loss_mlp": 0.21303311, "epoch": 0.7770329174808357, "flos": 21686777377920.0, "grad_norm": 18.863714335620223, "language_loss": 0.98769879, "learning_rate": 4.991109824900699e-07, "loss": 1.00255775, "num_input_tokens_seen": 278830610, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.26867676, "step": 12924, "time_per_iteration": 2.693669319152832 }, { "auxiliary_loss_clip": 0.01255561, "auxiliary_loss_mlp": 0.00226561, "balance_loss_clip": 1.03395247, "balance_loss_mlp": 0.19982252, "epoch": 0.7770930407335037, "flos": 25849255098240.0, "grad_norm": 8.431127502711172, "language_loss": 0.74633074, "learning_rate": 4.988536026917401e-07, "loss": 0.76115197, "num_input_tokens_seen": 278849530, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.2677002, "step": 12925, "time_per_iteration": 2.71305513381958 }, { "auxiliary_loss_clip": 0.01265649, "auxiliary_loss_mlp": 0.00226469, "balance_loss_clip": 1.03910577, "balance_loss_mlp": 0.19948016, "epoch": 0.7771531639861716, "flos": 24347865490560.0, "grad_norm": 96.54782489989756, "language_loss": 0.80509764, "learning_rate": 4.985962798170314e-07, "loss": 0.82001883, "num_input_tokens_seen": 278869005, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26965332, "step": 12926, "time_per_iteration": 2.7438793182373047 }, { "auxiliary_loss_clip": 0.01275697, "auxiliary_loss_mlp": 0.00224276, "balance_loss_clip": 1.04665422, "balance_loss_mlp": 0.19579718, "epoch": 0.7772132872388396, "flos": 25629948610560.0, "grad_norm": 90.73486134156663, "language_loss": 0.76587486, "learning_rate": 4.983390138757027e-07, "loss": 0.78087461, "num_input_tokens_seen": 278888790, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.28491211, "step": 12927, "time_per_iteration": 2.7893826961517334 }, { "auxiliary_loss_clip": 0.0126902, "auxiliary_loss_mlp": 0.00236288, "balance_loss_clip": 1.04317927, "balance_loss_mlp": 0.20876263, "epoch": 0.7772734104915076, "flos": 26067412350720.0, "grad_norm": 35.11736696828609, "language_loss": 0.84290171, "learning_rate": 4.980818048775093e-07, "loss": 0.8579548, "num_input_tokens_seen": 278908150, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.27502441, "step": 12928, "time_per_iteration": 2.7146034240722656 }, { "auxiliary_loss_clip": 0.01235724, "auxiliary_loss_mlp": 0.00205411, "balance_loss_clip": 1.01890206, "balance_loss_mlp": 0.18099679, "epoch": 0.7773335337441756, "flos": 22925048883840.0, "grad_norm": 8.402953225994409, "language_loss": 0.82317048, "learning_rate": 4.978246528322036e-07, "loss": 0.83758181, "num_input_tokens_seen": 278927425, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24389648, "step": 12929, "time_per_iteration": 2.657606363296509 }, { "auxiliary_loss_clip": 0.01265275, "auxiliary_loss_mlp": 0.00227506, "balance_loss_clip": 1.04352915, "balance_loss_mlp": 0.20138764, "epoch": 0.7773936569968435, "flos": 20776765288320.0, "grad_norm": 11.003820954019623, "language_loss": 0.87013519, "learning_rate": 4.975675577495377e-07, "loss": 0.88506299, "num_input_tokens_seen": 278946475, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26135254, "step": 12930, "time_per_iteration": 2.6961331367492676 }, { "auxiliary_loss_clip": 0.01255047, "auxiliary_loss_mlp": 0.00224875, "balance_loss_clip": 1.03906083, "balance_loss_mlp": 0.20021111, "epoch": 0.7774537802495115, "flos": 20372267255040.0, "grad_norm": 22.861825214722263, "language_loss": 0.86518145, "learning_rate": 4.973105196392613e-07, "loss": 0.87998068, "num_input_tokens_seen": 278964345, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.2467041, "step": 12931, "time_per_iteration": 2.6199100017547607 }, { "auxiliary_loss_clip": 0.01081813, "auxiliary_loss_mlp": 0.00033296, "balance_loss_clip": 0.93627524, "balance_loss_mlp": 0.02380712, "epoch": 0.7775139035021794, "flos": 53912081738880.0, "grad_norm": 0.7757071626892396, "language_loss": 0.58954066, "learning_rate": 4.970535385111199e-07, "loss": 0.60069174, "num_input_tokens_seen": 279022380, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.09472656, "step": 12932, "time_per_iteration": 4.556074142456055 }, { "auxiliary_loss_clip": 0.01259647, "auxiliary_loss_mlp": 0.00197477, "balance_loss_clip": 1.03353846, "balance_loss_mlp": 0.17187075, "epoch": 0.7775740267548474, "flos": 28842481296000.0, "grad_norm": 13.834573187728951, "language_loss": 0.82843697, "learning_rate": 4.967966143748595e-07, "loss": 0.84300816, "num_input_tokens_seen": 279044275, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.25622559, "step": 12933, "time_per_iteration": 2.8430984020233154 }, { "auxiliary_loss_clip": 0.01245341, "auxiliary_loss_mlp": 0.00215289, "balance_loss_clip": 1.02513433, "balance_loss_mlp": 0.188944, "epoch": 0.7776341500075155, "flos": 21872471713920.0, "grad_norm": 209.9332184943851, "language_loss": 0.82001579, "learning_rate": 4.965397472402215e-07, "loss": 0.83462203, "num_input_tokens_seen": 279063375, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26330566, "step": 12934, "time_per_iteration": 4.1079394817352295 }, { "auxiliary_loss_clip": 0.01257879, "auxiliary_loss_mlp": 0.00215124, "balance_loss_clip": 1.03379285, "balance_loss_mlp": 0.18768221, "epoch": 0.7776942732601834, "flos": 20229845829120.0, "grad_norm": 33.36217034258308, "language_loss": 0.79719198, "learning_rate": 4.962829371169475e-07, "loss": 0.81192201, "num_input_tokens_seen": 279082680, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.27429199, "step": 12935, "time_per_iteration": 2.644845962524414 }, { "auxiliary_loss_clip": 0.01251427, "auxiliary_loss_mlp": 0.00219841, "balance_loss_clip": 1.03038359, "balance_loss_mlp": 0.19392487, "epoch": 0.7777543965128514, "flos": 22231829329920.0, "grad_norm": 85.07181807440844, "language_loss": 0.89775121, "learning_rate": 4.960261840147746e-07, "loss": 0.91246384, "num_input_tokens_seen": 279099805, "router_z_loss_clip": 2.21386719, "router_z_loss_mlp": 0.25915527, "step": 12936, "time_per_iteration": 2.6791605949401855 }, { "auxiliary_loss_clip": 0.01250215, "auxiliary_loss_mlp": 0.00222835, "balance_loss_clip": 1.03075504, "balance_loss_mlp": 0.19608428, "epoch": 0.7778145197655193, "flos": 14501950508160.0, "grad_norm": 25.013280593991354, "language_loss": 0.7850377, "learning_rate": 4.957694879434397e-07, "loss": 0.79976821, "num_input_tokens_seen": 279117975, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.26733398, "step": 12937, "time_per_iteration": 2.6188995838165283 }, { "auxiliary_loss_clip": 0.01252039, "auxiliary_loss_mlp": 0.00202465, "balance_loss_clip": 1.03184533, "balance_loss_mlp": 0.17760953, "epoch": 0.7778746430181873, "flos": 21140288881920.0, "grad_norm": 6.48571790881416, "language_loss": 0.940229, "learning_rate": 4.955128489126777e-07, "loss": 0.95477402, "num_input_tokens_seen": 279137255, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.24865723, "step": 12938, "time_per_iteration": 4.297469139099121 }, { "auxiliary_loss_clip": 0.01257314, "auxiliary_loss_mlp": 0.00213585, "balance_loss_clip": 1.03442109, "balance_loss_mlp": 0.18626186, "epoch": 0.7779347662708552, "flos": 20266366982400.0, "grad_norm": 9.324392793500177, "language_loss": 0.95170045, "learning_rate": 4.95256266932218e-07, "loss": 0.96640944, "num_input_tokens_seen": 279154500, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.2734375, "step": 12939, "time_per_iteration": 2.7526421546936035 }, { "auxiliary_loss_clip": 0.01248391, "auxiliary_loss_mlp": 0.00215915, "balance_loss_clip": 1.03437912, "balance_loss_mlp": 0.19055909, "epoch": 0.7779948895235232, "flos": 19209013303680.0, "grad_norm": 58.520284483370894, "language_loss": 0.79045373, "learning_rate": 4.949997420117915e-07, "loss": 0.80509675, "num_input_tokens_seen": 279173635, "router_z_loss_clip": 2.14355469, "router_z_loss_mlp": 0.25366211, "step": 12940, "time_per_iteration": 2.668226957321167 }, { "auxiliary_loss_clip": 0.0125539, "auxiliary_loss_mlp": 0.0022813, "balance_loss_clip": 1.02977371, "balance_loss_mlp": 0.20095035, "epoch": 0.7780550127761912, "flos": 23914711382400.0, "grad_norm": 14.499276255945697, "language_loss": 0.84494936, "learning_rate": 4.947432741611255e-07, "loss": 0.8597846, "num_input_tokens_seen": 279194430, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.27172852, "step": 12941, "time_per_iteration": 2.714939594268799 }, { "auxiliary_loss_clip": 0.01275798, "auxiliary_loss_mlp": 0.00215149, "balance_loss_clip": 1.04087067, "balance_loss_mlp": 0.18767163, "epoch": 0.7781151360288592, "flos": 32415951795840.0, "grad_norm": 159.18229052610346, "language_loss": 0.83205521, "learning_rate": 4.944868633899462e-07, "loss": 0.84696472, "num_input_tokens_seen": 279212920, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.2746582, "step": 12942, "time_per_iteration": 4.159067392349243 }, { "auxiliary_loss_clip": 0.01231383, "auxiliary_loss_mlp": 0.00224709, "balance_loss_clip": 1.0160296, "balance_loss_mlp": 0.1995559, "epoch": 0.7781752592815271, "flos": 22346384780160.0, "grad_norm": 7.21692366259594, "language_loss": 0.75777292, "learning_rate": 4.942305097079751e-07, "loss": 0.77233386, "num_input_tokens_seen": 279232310, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.25134277, "step": 12943, "time_per_iteration": 2.6589066982269287 }, { "auxiliary_loss_clip": 0.01070536, "auxiliary_loss_mlp": 0.00016015, "balance_loss_clip": 0.92267847, "balance_loss_mlp": 0.0056672, "epoch": 0.7782353825341951, "flos": 70460183520000.0, "grad_norm": 0.7806757165398607, "language_loss": 0.57897693, "learning_rate": 4.939742131249347e-07, "loss": 0.58984244, "num_input_tokens_seen": 279295375, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10351562, "step": 12944, "time_per_iteration": 3.2966887950897217 }, { "auxiliary_loss_clip": 0.01253556, "auxiliary_loss_mlp": 0.00231397, "balance_loss_clip": 1.02905142, "balance_loss_mlp": 0.20202357, "epoch": 0.778295505786863, "flos": 19062569554560.0, "grad_norm": 125.77300155175298, "language_loss": 0.78307807, "learning_rate": 4.937179736505428e-07, "loss": 0.79792756, "num_input_tokens_seen": 279313660, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.2935791, "step": 12945, "time_per_iteration": 2.7075395584106445 }, { "auxiliary_loss_clip": 0.01257527, "auxiliary_loss_mlp": 0.00227002, "balance_loss_clip": 1.03424597, "balance_loss_mlp": 0.19975048, "epoch": 0.778355629039531, "flos": 20999734963200.0, "grad_norm": 29.269429304424282, "language_loss": 0.75892615, "learning_rate": 4.93461791294516e-07, "loss": 0.77377146, "num_input_tokens_seen": 279334495, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.27246094, "step": 12946, "time_per_iteration": 2.7571144104003906 }, { "auxiliary_loss_clip": 0.01247471, "auxiliary_loss_mlp": 0.00207958, "balance_loss_clip": 1.02820122, "balance_loss_mlp": 0.18415204, "epoch": 0.7784157522921991, "flos": 21398091770880.0, "grad_norm": 3.5927227979435012, "language_loss": 0.71665233, "learning_rate": 4.932056660665689e-07, "loss": 0.73120666, "num_input_tokens_seen": 279352985, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.23791504, "step": 12947, "time_per_iteration": 2.683212995529175 }, { "auxiliary_loss_clip": 0.01231799, "auxiliary_loss_mlp": 0.00218739, "balance_loss_clip": 1.01541758, "balance_loss_mlp": 0.19272722, "epoch": 0.778475875544867, "flos": 20813861059200.0, "grad_norm": 36.38523109227311, "language_loss": 0.75843167, "learning_rate": 4.929495979764147e-07, "loss": 0.77293706, "num_input_tokens_seen": 279371360, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.26000977, "step": 12948, "time_per_iteration": 2.6894073486328125 }, { "auxiliary_loss_clip": 0.01240841, "auxiliary_loss_mlp": 0.00210164, "balance_loss_clip": 1.01845264, "balance_loss_mlp": 0.18377143, "epoch": 0.778535998797535, "flos": 14355363104640.0, "grad_norm": 19.909588198773193, "language_loss": 0.84644222, "learning_rate": 4.926935870337625e-07, "loss": 0.86095232, "num_input_tokens_seen": 279389400, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.26391602, "step": 12949, "time_per_iteration": 2.5946786403656006 }, { "auxiliary_loss_clip": 0.01282023, "auxiliary_loss_mlp": 0.00225854, "balance_loss_clip": 1.05426669, "balance_loss_mlp": 0.1989727, "epoch": 0.7785961220502029, "flos": 19209552007680.0, "grad_norm": 5.195926441817377, "language_loss": 0.73760796, "learning_rate": 4.924376332483202e-07, "loss": 0.75268674, "num_input_tokens_seen": 279409715, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.26879883, "step": 12950, "time_per_iteration": 2.706953287124634 }, { "auxiliary_loss_clip": 0.01251696, "auxiliary_loss_mlp": 0.00221118, "balance_loss_clip": 1.0297606, "balance_loss_mlp": 0.19513038, "epoch": 0.7786562453028709, "flos": 25738757884800.0, "grad_norm": 128.07106660444126, "language_loss": 0.80252647, "learning_rate": 4.921817366297938e-07, "loss": 0.8172546, "num_input_tokens_seen": 279427705, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.2598877, "step": 12951, "time_per_iteration": 2.640488386154175 }, { "auxiliary_loss_clip": 0.01242397, "auxiliary_loss_mlp": 0.00225071, "balance_loss_clip": 1.02335501, "balance_loss_mlp": 0.1993103, "epoch": 0.7787163685555388, "flos": 25739440243200.0, "grad_norm": 30.611847150403097, "language_loss": 0.7687552, "learning_rate": 4.919258971878877e-07, "loss": 0.78342992, "num_input_tokens_seen": 279448215, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25744629, "step": 12952, "time_per_iteration": 2.737388849258423 }, { "auxiliary_loss_clip": 0.01212014, "auxiliary_loss_mlp": 0.00191579, "balance_loss_clip": 1.00455987, "balance_loss_mlp": 0.16690287, "epoch": 0.7787764918082068, "flos": 22747722416640.0, "grad_norm": 52.40796550303848, "language_loss": 0.87100834, "learning_rate": 4.916701149323022e-07, "loss": 0.88504422, "num_input_tokens_seen": 279466260, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.24658203, "step": 12953, "time_per_iteration": 2.684340715408325 }, { "auxiliary_loss_clip": 0.01248908, "auxiliary_loss_mlp": 0.00196229, "balance_loss_clip": 1.02683055, "balance_loss_mlp": 0.17180267, "epoch": 0.7788366150608748, "flos": 15190860430080.0, "grad_norm": 22.125562718542316, "language_loss": 0.8595252, "learning_rate": 4.91414389872737e-07, "loss": 0.87397659, "num_input_tokens_seen": 279484520, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.2442627, "step": 12954, "time_per_iteration": 2.6449499130249023 }, { "auxiliary_loss_clip": 0.01240587, "auxiliary_loss_mlp": 0.00223882, "balance_loss_clip": 1.01812279, "balance_loss_mlp": 0.19870487, "epoch": 0.7788967383135428, "flos": 21210242618880.0, "grad_norm": 6.9378641327134964, "language_loss": 0.786762, "learning_rate": 4.911587220188905e-07, "loss": 0.80140674, "num_input_tokens_seen": 279503130, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25183105, "step": 12955, "time_per_iteration": 2.6917474269866943 }, { "auxiliary_loss_clip": 0.01227341, "auxiliary_loss_mlp": 0.0020817, "balance_loss_clip": 1.01394022, "balance_loss_mlp": 0.18268323, "epoch": 0.7789568615662107, "flos": 21682970536320.0, "grad_norm": 15.924674813752498, "language_loss": 0.75073588, "learning_rate": 4.909031113804551e-07, "loss": 0.76509094, "num_input_tokens_seen": 279521930, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.25488281, "step": 12956, "time_per_iteration": 2.6332123279571533 }, { "auxiliary_loss_clip": 0.01229458, "auxiliary_loss_mlp": 0.00236013, "balance_loss_clip": 1.01116848, "balance_loss_mlp": 0.21071669, "epoch": 0.7790169848188787, "flos": 26360371676160.0, "grad_norm": 216.30364465727632, "language_loss": 0.84790164, "learning_rate": 4.906475579671252e-07, "loss": 0.86255634, "num_input_tokens_seen": 279542375, "router_z_loss_clip": 2.18066406, "router_z_loss_mlp": 0.25292969, "step": 12957, "time_per_iteration": 2.6980607509613037 }, { "auxiliary_loss_clip": 0.0123716, "auxiliary_loss_mlp": 0.00228278, "balance_loss_clip": 1.01836574, "balance_loss_mlp": 0.2020279, "epoch": 0.7790771080715466, "flos": 25516183259520.0, "grad_norm": 3.388980000085202, "language_loss": 0.84069812, "learning_rate": 4.903920617885917e-07, "loss": 0.8553524, "num_input_tokens_seen": 279561885, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26245117, "step": 12958, "time_per_iteration": 2.675447940826416 }, { "auxiliary_loss_clip": 0.012428, "auxiliary_loss_mlp": 0.00219841, "balance_loss_clip": 1.02209425, "balance_loss_mlp": 0.1935908, "epoch": 0.7791372313242146, "flos": 16034186920320.0, "grad_norm": 6.4599828169369875, "language_loss": 0.79464966, "learning_rate": 4.901366228545418e-07, "loss": 0.80927604, "num_input_tokens_seen": 279579965, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26269531, "step": 12959, "time_per_iteration": 2.7068676948547363 }, { "auxiliary_loss_clip": 0.0124765, "auxiliary_loss_mlp": 0.00223672, "balance_loss_clip": 1.02352118, "balance_loss_mlp": 0.19805422, "epoch": 0.7791973545768827, "flos": 23842207779840.0, "grad_norm": 10.154486464000113, "language_loss": 0.84727383, "learning_rate": 4.898812411746632e-07, "loss": 0.86198705, "num_input_tokens_seen": 279599030, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.25610352, "step": 12960, "time_per_iteration": 2.655852794647217 }, { "auxiliary_loss_clip": 0.01270769, "auxiliary_loss_mlp": 0.00210845, "balance_loss_clip": 1.04062307, "balance_loss_mlp": 0.18610877, "epoch": 0.7792574778295506, "flos": 24168384207360.0, "grad_norm": 214.15558939047776, "language_loss": 0.84161729, "learning_rate": 4.896259167586385e-07, "loss": 0.85643339, "num_input_tokens_seen": 279614400, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.24731445, "step": 12961, "time_per_iteration": 2.686166763305664 }, { "auxiliary_loss_clip": 0.01229885, "auxiliary_loss_mlp": 0.00211233, "balance_loss_clip": 1.01827526, "balance_loss_mlp": 0.18771276, "epoch": 0.7793176010822186, "flos": 21464921024640.0, "grad_norm": 4.710212835537806, "language_loss": 0.79902619, "learning_rate": 4.893706496161511e-07, "loss": 0.81343734, "num_input_tokens_seen": 279633745, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.23522949, "step": 12962, "time_per_iteration": 2.6445977687835693 }, { "auxiliary_loss_clip": 0.01224431, "auxiliary_loss_mlp": 0.00193022, "balance_loss_clip": 1.00726497, "balance_loss_mlp": 0.16832227, "epoch": 0.7793777243348865, "flos": 20666699038080.0, "grad_norm": 176.05614417214215, "language_loss": 0.79526484, "learning_rate": 4.891154397568795e-07, "loss": 0.80943936, "num_input_tokens_seen": 279651165, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24694824, "step": 12963, "time_per_iteration": 2.6311538219451904 }, { "auxiliary_loss_clip": 0.01226244, "auxiliary_loss_mlp": 0.00205776, "balance_loss_clip": 1.01272655, "balance_loss_mlp": 0.18171927, "epoch": 0.7794378475875545, "flos": 27125771610240.0, "grad_norm": 61.57601564144057, "language_loss": 0.735165, "learning_rate": 4.888602871905019e-07, "loss": 0.74948519, "num_input_tokens_seen": 279671175, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24060059, "step": 12964, "time_per_iteration": 2.6878037452697754 }, { "auxiliary_loss_clip": 0.01238857, "auxiliary_loss_mlp": 0.00209892, "balance_loss_clip": 1.01887214, "balance_loss_mlp": 0.1834397, "epoch": 0.7794979708402224, "flos": 28074136446720.0, "grad_norm": 76.4365587660423, "language_loss": 0.82380384, "learning_rate": 4.88605191926694e-07, "loss": 0.83829129, "num_input_tokens_seen": 279688675, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.26452637, "step": 12965, "time_per_iteration": 2.7241263389587402 }, { "auxiliary_loss_clip": 0.01215041, "auxiliary_loss_mlp": 0.00208863, "balance_loss_clip": 1.00483048, "balance_loss_mlp": 0.18348336, "epoch": 0.7795580940928905, "flos": 26869548919680.0, "grad_norm": 64.23062658846396, "language_loss": 0.77093256, "learning_rate": 4.883501539751289e-07, "loss": 0.78517157, "num_input_tokens_seen": 279710245, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.25378418, "step": 12966, "time_per_iteration": 2.7625341415405273 }, { "auxiliary_loss_clip": 0.01239835, "auxiliary_loss_mlp": 0.00183471, "balance_loss_clip": 1.02278447, "balance_loss_mlp": 0.16016531, "epoch": 0.7796182173455584, "flos": 23835384195840.0, "grad_norm": 16.47102550936087, "language_loss": 0.81968176, "learning_rate": 4.880951733454768e-07, "loss": 0.83391482, "num_input_tokens_seen": 279729045, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.23303223, "step": 12967, "time_per_iteration": 2.732750177383423 }, { "auxiliary_loss_clip": 0.01236622, "auxiliary_loss_mlp": 0.00206884, "balance_loss_clip": 1.01646042, "balance_loss_mlp": 0.18146902, "epoch": 0.7796783405982264, "flos": 19792238434560.0, "grad_norm": 19.816011279085792, "language_loss": 0.83481407, "learning_rate": 4.878402500474073e-07, "loss": 0.84924912, "num_input_tokens_seen": 279748350, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25415039, "step": 12968, "time_per_iteration": 2.706890344619751 }, { "auxiliary_loss_clip": 0.01238233, "auxiliary_loss_mlp": 0.00206575, "balance_loss_clip": 1.01947582, "balance_loss_mlp": 0.18168369, "epoch": 0.7797384638508943, "flos": 15450207603840.0, "grad_norm": 1194.8211577592128, "language_loss": 0.7152757, "learning_rate": 4.875853840905874e-07, "loss": 0.72972375, "num_input_tokens_seen": 279765620, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24914551, "step": 12969, "time_per_iteration": 2.6947693824768066 }, { "auxiliary_loss_clip": 0.01204048, "auxiliary_loss_mlp": 0.00201587, "balance_loss_clip": 0.9946388, "balance_loss_mlp": 0.17854388, "epoch": 0.7797985871035623, "flos": 20922742160640.0, "grad_norm": 24.520613527178615, "language_loss": 0.77240443, "learning_rate": 4.873305754846811e-07, "loss": 0.78646076, "num_input_tokens_seen": 279782485, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.23059082, "step": 12970, "time_per_iteration": 2.675076723098755 }, { "auxiliary_loss_clip": 0.01240123, "auxiliary_loss_mlp": 0.00205205, "balance_loss_clip": 1.02601004, "balance_loss_mlp": 0.18007557, "epoch": 0.7798587103562302, "flos": 36937212514560.0, "grad_norm": 6.627705101011302, "language_loss": 0.80549705, "learning_rate": 4.870758242393507e-07, "loss": 0.81995034, "num_input_tokens_seen": 279804170, "router_z_loss_clip": 2.13964844, "router_z_loss_mlp": 0.2512207, "step": 12971, "time_per_iteration": 2.7911126613616943 }, { "auxiliary_loss_clip": 0.01247053, "auxiliary_loss_mlp": 0.00226441, "balance_loss_clip": 1.02416813, "balance_loss_mlp": 0.19857022, "epoch": 0.7799188336088982, "flos": 22419283432320.0, "grad_norm": 572.0207609201108, "language_loss": 0.82789207, "learning_rate": 4.868211303642578e-07, "loss": 0.84262699, "num_input_tokens_seen": 279823730, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.27880859, "step": 12972, "time_per_iteration": 2.6765475273132324 }, { "auxiliary_loss_clip": 0.01255881, "auxiliary_loss_mlp": 0.00218576, "balance_loss_clip": 1.03476477, "balance_loss_mlp": 0.19148004, "epoch": 0.7799789568615663, "flos": 18880466578560.0, "grad_norm": 5.244344554093659, "language_loss": 0.81381452, "learning_rate": 4.865664938690584e-07, "loss": 0.82855904, "num_input_tokens_seen": 279843035, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.27111816, "step": 12973, "time_per_iteration": 2.623101234436035 }, { "auxiliary_loss_clip": 0.01234888, "auxiliary_loss_mlp": 0.00202184, "balance_loss_clip": 1.02031136, "balance_loss_mlp": 0.17891453, "epoch": 0.7800390801142342, "flos": 20262272832000.0, "grad_norm": 53.311556130900925, "language_loss": 0.85902131, "learning_rate": 4.863119147634089e-07, "loss": 0.87339199, "num_input_tokens_seen": 279861450, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.23266602, "step": 12974, "time_per_iteration": 4.19887113571167 }, { "auxiliary_loss_clip": 0.01239219, "auxiliary_loss_mlp": 0.00214949, "balance_loss_clip": 1.02610302, "balance_loss_mlp": 0.19039237, "epoch": 0.7800992033669022, "flos": 16690310703360.0, "grad_norm": 23.134253299711727, "language_loss": 0.7643441, "learning_rate": 4.86057393056964e-07, "loss": 0.77888578, "num_input_tokens_seen": 279878660, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.24584961, "step": 12975, "time_per_iteration": 2.6556384563446045 }, { "auxiliary_loss_clip": 0.01235871, "auxiliary_loss_mlp": 0.00212663, "balance_loss_clip": 1.01733637, "balance_loss_mlp": 0.18771216, "epoch": 0.7801593266195701, "flos": 18585208782720.0, "grad_norm": 3966.892530472151, "language_loss": 0.89908206, "learning_rate": 4.858029287593739e-07, "loss": 0.91356742, "num_input_tokens_seen": 279895685, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24963379, "step": 12976, "time_per_iteration": 4.028537273406982 }, { "auxiliary_loss_clip": 0.01240761, "auxiliary_loss_mlp": 0.00196871, "balance_loss_clip": 1.02153945, "balance_loss_mlp": 0.17151567, "epoch": 0.7802194498722381, "flos": 25484941405440.0, "grad_norm": 7.270277207444462, "language_loss": 0.72818255, "learning_rate": 4.85548521880289e-07, "loss": 0.74255884, "num_input_tokens_seen": 279917240, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.2532959, "step": 12977, "time_per_iteration": 2.685765504837036 }, { "auxiliary_loss_clip": 0.01243859, "auxiliary_loss_mlp": 0.0021668, "balance_loss_clip": 1.02448618, "balance_loss_mlp": 0.18978646, "epoch": 0.780279573124906, "flos": 31176315573120.0, "grad_norm": 144.7963509457103, "language_loss": 0.81020421, "learning_rate": 4.852941724293554e-07, "loss": 0.82480961, "num_input_tokens_seen": 279938665, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26916504, "step": 12978, "time_per_iteration": 2.840775728225708 }, { "auxiliary_loss_clip": 0.01265322, "auxiliary_loss_mlp": 0.00208623, "balance_loss_clip": 1.03361225, "balance_loss_mlp": 0.18205127, "epoch": 0.780339696377574, "flos": 26944027770240.0, "grad_norm": 13.276446925128756, "language_loss": 0.71842343, "learning_rate": 4.85039880416219e-07, "loss": 0.73316288, "num_input_tokens_seen": 279957965, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.26623535, "step": 12979, "time_per_iteration": 2.690765380859375 }, { "auxiliary_loss_clip": 0.01226807, "auxiliary_loss_mlp": 0.00203819, "balance_loss_clip": 1.01396048, "balance_loss_mlp": 0.18033446, "epoch": 0.780399819630242, "flos": 27957426180480.0, "grad_norm": 42.83677708973445, "language_loss": 0.85718369, "learning_rate": 4.847856458505217e-07, "loss": 0.87148988, "num_input_tokens_seen": 279977490, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23486328, "step": 12980, "time_per_iteration": 4.339831829071045 }, { "auxiliary_loss_clip": 0.01248059, "auxiliary_loss_mlp": 0.00214052, "balance_loss_clip": 1.02365279, "balance_loss_mlp": 0.18727753, "epoch": 0.78045994288291, "flos": 22486795044480.0, "grad_norm": 11.340840567328787, "language_loss": 0.85048962, "learning_rate": 4.845314687419046e-07, "loss": 0.86511075, "num_input_tokens_seen": 279994220, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26794434, "step": 12981, "time_per_iteration": 2.688406229019165 }, { "auxiliary_loss_clip": 0.01236756, "auxiliary_loss_mlp": 0.00225578, "balance_loss_clip": 1.01801801, "balance_loss_mlp": 0.19903025, "epoch": 0.7805200661355779, "flos": 20850849089280.0, "grad_norm": 6.018409720981341, "language_loss": 0.82019055, "learning_rate": 4.842773491000067e-07, "loss": 0.83481389, "num_input_tokens_seen": 280012590, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26550293, "step": 12982, "time_per_iteration": 2.6811201572418213 }, { "auxiliary_loss_clip": 0.01234522, "auxiliary_loss_mlp": 0.00189131, "balance_loss_clip": 1.01832247, "balance_loss_mlp": 0.16486031, "epoch": 0.7805801893882459, "flos": 25665966973440.0, "grad_norm": 3.336172361233026, "language_loss": 0.79078364, "learning_rate": 4.840232869344636e-07, "loss": 0.80502015, "num_input_tokens_seen": 280033700, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24255371, "step": 12983, "time_per_iteration": 2.7027289867401123 }, { "auxiliary_loss_clip": 0.01238815, "auxiliary_loss_mlp": 0.00205977, "balance_loss_clip": 1.02448559, "balance_loss_mlp": 0.18212286, "epoch": 0.7806403126409138, "flos": 11327806483200.0, "grad_norm": 5.2158170134815665, "language_loss": 0.84790134, "learning_rate": 4.837692822549086e-07, "loss": 0.86234927, "num_input_tokens_seen": 280052215, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.23840332, "step": 12984, "time_per_iteration": 2.655400514602661 }, { "auxiliary_loss_clip": 0.01260578, "auxiliary_loss_mlp": 0.00207973, "balance_loss_clip": 1.03319991, "balance_loss_mlp": 0.18391635, "epoch": 0.7807004358935818, "flos": 19573362910080.0, "grad_norm": 31.016882211534934, "language_loss": 0.91902816, "learning_rate": 4.835153350709746e-07, "loss": 0.93371367, "num_input_tokens_seen": 280070525, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.24060059, "step": 12985, "time_per_iteration": 4.185423135757446 }, { "auxiliary_loss_clip": 0.01223625, "auxiliary_loss_mlp": 0.00201937, "balance_loss_clip": 1.00615382, "balance_loss_mlp": 0.17641392, "epoch": 0.7807605591462499, "flos": 19135827342720.0, "grad_norm": 8.515762815659226, "language_loss": 0.84195393, "learning_rate": 4.832614453922915e-07, "loss": 0.85620958, "num_input_tokens_seen": 280089855, "router_z_loss_clip": 2.17285156, "router_z_loss_mlp": 0.25512695, "step": 12986, "time_per_iteration": 2.68963885307312 }, { "auxiliary_loss_clip": 0.01240006, "auxiliary_loss_mlp": 0.00205923, "balance_loss_clip": 1.01931369, "balance_loss_mlp": 0.18091336, "epoch": 0.7808206823989178, "flos": 32374654133760.0, "grad_norm": 216.57746776625424, "language_loss": 0.82788873, "learning_rate": 4.830076132284859e-07, "loss": 0.84234804, "num_input_tokens_seen": 280109960, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25, "step": 12987, "time_per_iteration": 2.752666473388672 }, { "auxiliary_loss_clip": 0.01081914, "auxiliary_loss_mlp": 0.00092796, "balance_loss_clip": 0.92777896, "balance_loss_mlp": 0.08440325, "epoch": 0.7808808056515858, "flos": 55050235061760.0, "grad_norm": 0.7157086139320568, "language_loss": 0.54215562, "learning_rate": 4.82753838589184e-07, "loss": 0.55390275, "num_input_tokens_seen": 280169805, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.08398438, "step": 12988, "time_per_iteration": 3.160268783569336 }, { "auxiliary_loss_clip": 0.012414, "auxiliary_loss_mlp": 0.00226416, "balance_loss_clip": 1.02134717, "balance_loss_mlp": 0.20128691, "epoch": 0.7809409289042537, "flos": 12859468277760.0, "grad_norm": 348.35507267326375, "language_loss": 0.89019382, "learning_rate": 4.82500121484009e-07, "loss": 0.90487194, "num_input_tokens_seen": 280184630, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25146484, "step": 12989, "time_per_iteration": 2.6547319889068604 }, { "auxiliary_loss_clip": 0.01214229, "auxiliary_loss_mlp": 0.00215859, "balance_loss_clip": 1.00558805, "balance_loss_mlp": 0.19134969, "epoch": 0.7810010521569217, "flos": 21687244254720.0, "grad_norm": 27.594626530983863, "language_loss": 0.78495353, "learning_rate": 4.822464619225806e-07, "loss": 0.79925442, "num_input_tokens_seen": 280203880, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.24487305, "step": 12990, "time_per_iteration": 2.692359209060669 }, { "auxiliary_loss_clip": 0.0123957, "auxiliary_loss_mlp": 0.00224916, "balance_loss_clip": 1.02092791, "balance_loss_mlp": 0.19759315, "epoch": 0.7810611754095896, "flos": 16757068129920.0, "grad_norm": 65.40421069665753, "language_loss": 0.8392899, "learning_rate": 4.819928599145184e-07, "loss": 0.85393476, "num_input_tokens_seen": 280220460, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.27355957, "step": 12991, "time_per_iteration": 2.6360480785369873 }, { "auxiliary_loss_clip": 0.01240331, "auxiliary_loss_mlp": 0.00210779, "balance_loss_clip": 1.02231383, "balance_loss_mlp": 0.18620971, "epoch": 0.7811212986622577, "flos": 43507464658560.0, "grad_norm": 46.72349023065151, "language_loss": 0.74909538, "learning_rate": 4.817393154694398e-07, "loss": 0.76360643, "num_input_tokens_seen": 280242680, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24572754, "step": 12992, "time_per_iteration": 2.8671185970306396 }, { "auxiliary_loss_clip": 0.01244849, "auxiliary_loss_mlp": 0.00200938, "balance_loss_clip": 1.02108812, "balance_loss_mlp": 0.1743902, "epoch": 0.7811814219149256, "flos": 21757700782080.0, "grad_norm": 16.704388674453845, "language_loss": 0.7069115, "learning_rate": 4.814858285969578e-07, "loss": 0.72136939, "num_input_tokens_seen": 280260655, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.265625, "step": 12993, "time_per_iteration": 2.6308515071868896 }, { "auxiliary_loss_clip": 0.01224138, "auxiliary_loss_mlp": 0.00208969, "balance_loss_clip": 1.00886285, "balance_loss_mlp": 0.18305308, "epoch": 0.7812415451675936, "flos": 24061514267520.0, "grad_norm": 40.30184772489457, "language_loss": 0.77144712, "learning_rate": 4.812323993066862e-07, "loss": 0.78577816, "num_input_tokens_seen": 280281185, "router_z_loss_clip": 2.15722656, "router_z_loss_mlp": 0.25915527, "step": 12994, "time_per_iteration": 2.68083119392395 }, { "auxiliary_loss_clip": 0.01240716, "auxiliary_loss_mlp": 0.00213214, "balance_loss_clip": 1.02509665, "balance_loss_mlp": 0.18969376, "epoch": 0.7813016684202615, "flos": 18989706816000.0, "grad_norm": 21.700218086742687, "language_loss": 0.79209661, "learning_rate": 4.809790276082335e-07, "loss": 0.80663592, "num_input_tokens_seen": 280298255, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.23547363, "step": 12995, "time_per_iteration": 2.6370160579681396 }, { "auxiliary_loss_clip": 0.01213747, "auxiliary_loss_mlp": 0.00198187, "balance_loss_clip": 1.00520778, "balance_loss_mlp": 0.17379683, "epoch": 0.7813617916729295, "flos": 25260786581760.0, "grad_norm": 126.24769943570153, "language_loss": 0.80518121, "learning_rate": 4.807257135112088e-07, "loss": 0.81930053, "num_input_tokens_seen": 280319000, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.24389648, "step": 12996, "time_per_iteration": 2.6807973384857178 }, { "auxiliary_loss_clip": 0.01273483, "auxiliary_loss_mlp": 0.00219812, "balance_loss_clip": 1.04696953, "balance_loss_mlp": 0.19371663, "epoch": 0.7814219149255974, "flos": 17966037116160.0, "grad_norm": 79.12754483325044, "language_loss": 0.84496796, "learning_rate": 4.804724570252167e-07, "loss": 0.85990089, "num_input_tokens_seen": 280336375, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.26086426, "step": 12997, "time_per_iteration": 2.6297659873962402 }, { "auxiliary_loss_clip": 0.01264085, "auxiliary_loss_mlp": 0.00220584, "balance_loss_clip": 1.03296685, "balance_loss_mlp": 0.191688, "epoch": 0.7814820381782654, "flos": 25776176878080.0, "grad_norm": 3.1003174886589413, "language_loss": 0.89390993, "learning_rate": 4.802192581598614e-07, "loss": 0.90875655, "num_input_tokens_seen": 280358760, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.28857422, "step": 12998, "time_per_iteration": 2.6963419914245605 }, { "auxiliary_loss_clip": 0.01269822, "auxiliary_loss_mlp": 0.00230108, "balance_loss_clip": 1.03637862, "balance_loss_mlp": 0.20386991, "epoch": 0.7815421614309335, "flos": 20519572930560.0, "grad_norm": 66.53097259659732, "language_loss": 0.83727586, "learning_rate": 4.799661169247453e-07, "loss": 0.85227519, "num_input_tokens_seen": 280377085, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.2623291, "step": 12999, "time_per_iteration": 2.674636125564575 }, { "auxiliary_loss_clip": 0.01261524, "auxiliary_loss_mlp": 0.00213301, "balance_loss_clip": 1.03362322, "balance_loss_mlp": 0.18690783, "epoch": 0.7816022846836014, "flos": 21287666384640.0, "grad_norm": 4.5006573519597906, "language_loss": 0.92253995, "learning_rate": 4.797130333294652e-07, "loss": 0.93728817, "num_input_tokens_seen": 280395465, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.2635498, "step": 13000, "time_per_iteration": 2.630629062652588 }, { "auxiliary_loss_clip": 0.01263142, "auxiliary_loss_mlp": 0.00225536, "balance_loss_clip": 1.04046845, "balance_loss_mlp": 0.19956063, "epoch": 0.7816624079362694, "flos": 19208402772480.0, "grad_norm": 74.64648707972255, "language_loss": 0.74299186, "learning_rate": 4.794600073836192e-07, "loss": 0.7578786, "num_input_tokens_seen": 280412775, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25964355, "step": 13001, "time_per_iteration": 2.634693145751953 }, { "auxiliary_loss_clip": 0.01250455, "auxiliary_loss_mlp": 0.00207509, "balance_loss_clip": 1.0311811, "balance_loss_mlp": 0.18252289, "epoch": 0.7817225311889373, "flos": 26104687689600.0, "grad_norm": 5.850436765836123, "language_loss": 0.75330031, "learning_rate": 4.792070390968027e-07, "loss": 0.76787996, "num_input_tokens_seen": 280432905, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24987793, "step": 13002, "time_per_iteration": 2.7151730060577393 }, { "auxiliary_loss_clip": 0.01266662, "auxiliary_loss_mlp": 0.00206818, "balance_loss_clip": 1.04057598, "balance_loss_mlp": 0.1788041, "epoch": 0.7817826544416053, "flos": 21250929749760.0, "grad_norm": 41.13321969585581, "language_loss": 0.85435021, "learning_rate": 4.78954128478607e-07, "loss": 0.86908495, "num_input_tokens_seen": 280450785, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.2800293, "step": 13003, "time_per_iteration": 2.676223039627075 }, { "auxiliary_loss_clip": 0.01235943, "auxiliary_loss_mlp": 0.00212234, "balance_loss_clip": 1.01645446, "balance_loss_mlp": 0.18784334, "epoch": 0.7818427776942732, "flos": 19932181822080.0, "grad_norm": 7.402755922226686, "language_loss": 0.70831764, "learning_rate": 4.787012755386233e-07, "loss": 0.72279942, "num_input_tokens_seen": 280468400, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24401855, "step": 13004, "time_per_iteration": 2.6530966758728027 }, { "auxiliary_loss_clip": 0.01220766, "auxiliary_loss_mlp": 0.00194772, "balance_loss_clip": 1.00721192, "balance_loss_mlp": 0.16985707, "epoch": 0.7819029009469413, "flos": 11363753018880.0, "grad_norm": 7.087857769835801, "language_loss": 0.91359985, "learning_rate": 4.784484802864403e-07, "loss": 0.92775518, "num_input_tokens_seen": 280483930, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24951172, "step": 13005, "time_per_iteration": 2.679480791091919 }, { "auxiliary_loss_clip": 0.01232972, "auxiliary_loss_mlp": 0.00202456, "balance_loss_clip": 1.0177834, "balance_loss_mlp": 0.17841189, "epoch": 0.7819630241996092, "flos": 24279276470400.0, "grad_norm": 2.8750236232222655, "language_loss": 0.83666992, "learning_rate": 4.781957427316432e-07, "loss": 0.85102427, "num_input_tokens_seen": 280503465, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24060059, "step": 13006, "time_per_iteration": 2.713557720184326 }, { "auxiliary_loss_clip": 0.01263596, "auxiliary_loss_mlp": 0.0023736, "balance_loss_clip": 1.03612828, "balance_loss_mlp": 0.20952451, "epoch": 0.7820231474522772, "flos": 22708902792960.0, "grad_norm": 2.895107710596516, "language_loss": 0.79188609, "learning_rate": 4.779430628838157e-07, "loss": 0.80689561, "num_input_tokens_seen": 280523375, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.27844238, "step": 13007, "time_per_iteration": 2.6942782402038574 }, { "auxiliary_loss_clip": 0.01235858, "auxiliary_loss_mlp": 0.00210831, "balance_loss_clip": 1.01869071, "balance_loss_mlp": 0.18602347, "epoch": 0.7820832707049451, "flos": 20047419630720.0, "grad_norm": 45.3370219888529, "language_loss": 0.7908752, "learning_rate": 4.776904407525397e-07, "loss": 0.80534208, "num_input_tokens_seen": 280542920, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24804688, "step": 13008, "time_per_iteration": 2.6401000022888184 }, { "auxiliary_loss_clip": 0.01257086, "auxiliary_loss_mlp": 0.00206938, "balance_loss_clip": 1.03838873, "balance_loss_mlp": 0.1824764, "epoch": 0.7821433939576131, "flos": 27162795553920.0, "grad_norm": 39.13966341514411, "language_loss": 0.76791108, "learning_rate": 4.774378763473954e-07, "loss": 0.78255135, "num_input_tokens_seen": 280561700, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24487305, "step": 13009, "time_per_iteration": 2.7246603965759277 }, { "auxiliary_loss_clip": 0.01248991, "auxiliary_loss_mlp": 0.00202685, "balance_loss_clip": 1.02678323, "balance_loss_mlp": 0.17667331, "epoch": 0.782203517210281, "flos": 22602068766720.0, "grad_norm": 2.009513380332088, "language_loss": 0.89049268, "learning_rate": 4.771853696779586e-07, "loss": 0.90500951, "num_input_tokens_seen": 280580605, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26000977, "step": 13010, "time_per_iteration": 2.686748743057251 }, { "auxiliary_loss_clip": 0.01234883, "auxiliary_loss_mlp": 0.0020735, "balance_loss_clip": 1.02029049, "balance_loss_mlp": 0.18343671, "epoch": 0.782263640462949, "flos": 29059812535680.0, "grad_norm": 3.1547837062307393, "language_loss": 0.70061827, "learning_rate": 4.76932920753806e-07, "loss": 0.71504056, "num_input_tokens_seen": 280601495, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.23950195, "step": 13011, "time_per_iteration": 2.7727651596069336 }, { "auxiliary_loss_clip": 0.01249579, "auxiliary_loss_mlp": 0.00221657, "balance_loss_clip": 1.03215432, "balance_loss_mlp": 0.19863737, "epoch": 0.782323763715617, "flos": 25299498464640.0, "grad_norm": 54.66002192406816, "language_loss": 0.7620362, "learning_rate": 4.7668052958450913e-07, "loss": 0.77674854, "num_input_tokens_seen": 280622760, "router_z_loss_clip": 2.17285156, "router_z_loss_mlp": 0.22998047, "step": 13012, "time_per_iteration": 2.751333236694336 }, { "auxiliary_loss_clip": 0.01083054, "auxiliary_loss_mlp": 0.00037655, "balance_loss_clip": 0.92865467, "balance_loss_mlp": 0.03136034, "epoch": 0.782383886968285, "flos": 65194388668800.0, "grad_norm": 0.6884724403952267, "language_loss": 0.54362071, "learning_rate": 4.764281961796395e-07, "loss": 0.55482781, "num_input_tokens_seen": 280687115, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.06298828, "step": 13013, "time_per_iteration": 3.2975046634674072 }, { "auxiliary_loss_clip": 0.01258402, "auxiliary_loss_mlp": 0.00220531, "balance_loss_clip": 1.03880501, "balance_loss_mlp": 0.19527024, "epoch": 0.782444010220953, "flos": 18405440190720.0, "grad_norm": 12.226302738309105, "language_loss": 0.75910866, "learning_rate": 4.76175920548765e-07, "loss": 0.77389801, "num_input_tokens_seen": 280705000, "router_z_loss_clip": 2.19238281, "router_z_loss_mlp": 0.25280762, "step": 13014, "time_per_iteration": 2.6687326431274414 }, { "auxiliary_loss_clip": 0.01077512, "auxiliary_loss_mlp": 0.00052925, "balance_loss_clip": 0.92539084, "balance_loss_mlp": 0.04629658, "epoch": 0.7825041334736209, "flos": 63955003841280.0, "grad_norm": 0.6692692896484363, "language_loss": 0.57643449, "learning_rate": 4.759237027014524e-07, "loss": 0.58773881, "num_input_tokens_seen": 280773525, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.06640625, "step": 13015, "time_per_iteration": 3.2053847312927246 }, { "auxiliary_loss_clip": 0.0121904, "auxiliary_loss_mlp": 0.00210926, "balance_loss_clip": 1.00859833, "balance_loss_mlp": 0.18845467, "epoch": 0.7825642567262889, "flos": 20339373375360.0, "grad_norm": 6.656662790512305, "language_loss": 0.82373726, "learning_rate": 4.756715426472666e-07, "loss": 0.83803689, "num_input_tokens_seen": 280791915, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.22473145, "step": 13016, "time_per_iteration": 4.061009883880615 }, { "auxiliary_loss_clip": 0.01259278, "auxiliary_loss_mlp": 0.00227876, "balance_loss_clip": 1.03537583, "balance_loss_mlp": 0.20167403, "epoch": 0.7826243799789568, "flos": 20262955190400.0, "grad_norm": 39.2249696446195, "language_loss": 0.82910532, "learning_rate": 4.7541944039576766e-07, "loss": 0.84397686, "num_input_tokens_seen": 280811460, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.26196289, "step": 13017, "time_per_iteration": 2.6817708015441895 }, { "auxiliary_loss_clip": 0.01244934, "auxiliary_loss_mlp": 0.00233364, "balance_loss_clip": 1.02665389, "balance_loss_mlp": 0.20879519, "epoch": 0.7826845032316249, "flos": 21132926593920.0, "grad_norm": 4.399713972484514, "language_loss": 0.84374785, "learning_rate": 4.7516739595651636e-07, "loss": 0.85853082, "num_input_tokens_seen": 280825415, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24560547, "step": 13018, "time_per_iteration": 4.076875686645508 }, { "auxiliary_loss_clip": 0.01242933, "auxiliary_loss_mlp": 0.00203621, "balance_loss_clip": 1.02561557, "balance_loss_mlp": 0.17995746, "epoch": 0.7827446264842928, "flos": 22492253911680.0, "grad_norm": 2.5648425131604657, "language_loss": 0.82442319, "learning_rate": 4.749154093390708e-07, "loss": 0.83888865, "num_input_tokens_seen": 280845335, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.23693848, "step": 13019, "time_per_iteration": 2.6467161178588867 }, { "auxiliary_loss_clip": 0.01239388, "auxiliary_loss_mlp": 0.00233619, "balance_loss_clip": 1.02553689, "balance_loss_mlp": 0.20897821, "epoch": 0.7828047497369608, "flos": 28840649702400.0, "grad_norm": 7.780043470914796, "language_loss": 0.74295777, "learning_rate": 4.746634805529852e-07, "loss": 0.75768781, "num_input_tokens_seen": 280867145, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24621582, "step": 13020, "time_per_iteration": 2.722555160522461 }, { "auxiliary_loss_clip": 0.01238974, "auxiliary_loss_mlp": 0.00218469, "balance_loss_clip": 1.02777386, "balance_loss_mlp": 0.19546141, "epoch": 0.7828648729896287, "flos": 23257689759360.0, "grad_norm": 12.88693257840787, "language_loss": 0.70819384, "learning_rate": 4.7441160960781325e-07, "loss": 0.72276831, "num_input_tokens_seen": 280886185, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.23010254, "step": 13021, "time_per_iteration": 2.635263204574585 }, { "auxiliary_loss_clip": 0.01231098, "auxiliary_loss_mlp": 0.00196948, "balance_loss_clip": 1.01811063, "balance_loss_mlp": 0.17507286, "epoch": 0.7829249962422967, "flos": 25265670831360.0, "grad_norm": 34.12789000003095, "language_loss": 0.77330399, "learning_rate": 4.7415979651310636e-07, "loss": 0.78758448, "num_input_tokens_seen": 280907665, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.21887207, "step": 13022, "time_per_iteration": 4.201209545135498 }, { "auxiliary_loss_clip": 0.01074264, "auxiliary_loss_mlp": 0.00048265, "balance_loss_clip": 0.92379767, "balance_loss_mlp": 0.04039736, "epoch": 0.7829851194949646, "flos": 70722044645760.0, "grad_norm": 0.6325930228435039, "language_loss": 0.55720735, "learning_rate": 4.739080412784131e-07, "loss": 0.56843269, "num_input_tokens_seen": 280971405, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.07861328, "step": 13023, "time_per_iteration": 3.3985989093780518 }, { "auxiliary_loss_clip": 0.01218991, "auxiliary_loss_mlp": 0.00219868, "balance_loss_clip": 1.01012921, "balance_loss_mlp": 0.19674164, "epoch": 0.7830452427476327, "flos": 25660795415040.0, "grad_norm": 33.78440349031529, "language_loss": 0.71955562, "learning_rate": 4.736563439132792e-07, "loss": 0.73394418, "num_input_tokens_seen": 280989615, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.23120117, "step": 13024, "time_per_iteration": 2.7439496517181396 }, { "auxiliary_loss_clip": 0.01258648, "auxiliary_loss_mlp": 0.002068, "balance_loss_clip": 1.03457975, "balance_loss_mlp": 0.18364927, "epoch": 0.7831053660003006, "flos": 22784315397120.0, "grad_norm": 22.344008753407408, "language_loss": 0.8430829, "learning_rate": 4.734047044272498e-07, "loss": 0.85773742, "num_input_tokens_seen": 281009450, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.23132324, "step": 13025, "time_per_iteration": 2.732992649078369 }, { "auxiliary_loss_clip": 0.01244034, "auxiliary_loss_mlp": 0.00202701, "balance_loss_clip": 1.02824318, "balance_loss_mlp": 0.18039647, "epoch": 0.7831654892529686, "flos": 25812267068160.0, "grad_norm": 5.263063896171608, "language_loss": 0.87672424, "learning_rate": 4.731531228298673e-07, "loss": 0.8911916, "num_input_tokens_seen": 281028120, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.22302246, "step": 13026, "time_per_iteration": 2.696462631225586 }, { "auxiliary_loss_clip": 0.01242468, "auxiliary_loss_mlp": 0.00199341, "balance_loss_clip": 1.01849294, "balance_loss_mlp": 0.17492652, "epoch": 0.7832256125056366, "flos": 20771557816320.0, "grad_norm": 10.898651691031937, "language_loss": 0.85091627, "learning_rate": 4.729015991306715e-07, "loss": 0.86533439, "num_input_tokens_seen": 281042130, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.2442627, "step": 13027, "time_per_iteration": 4.0654919147491455 }, { "auxiliary_loss_clip": 0.0123033, "auxiliary_loss_mlp": 0.00212278, "balance_loss_clip": 1.01831841, "balance_loss_mlp": 0.18875808, "epoch": 0.7832857357583045, "flos": 21506541909120.0, "grad_norm": 2.3374400542342926, "language_loss": 0.77251518, "learning_rate": 4.726501333391997e-07, "loss": 0.78694117, "num_input_tokens_seen": 281060945, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.23522949, "step": 13028, "time_per_iteration": 2.730891704559326 }, { "auxiliary_loss_clip": 0.01257358, "auxiliary_loss_mlp": 0.00241685, "balance_loss_clip": 1.0342443, "balance_loss_mlp": 0.21473137, "epoch": 0.7833458590109725, "flos": 18077791305600.0, "grad_norm": 4.0330594424493516, "language_loss": 0.76213658, "learning_rate": 4.7239872546498774e-07, "loss": 0.77712703, "num_input_tokens_seen": 281079270, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26965332, "step": 13029, "time_per_iteration": 2.7300047874450684 }, { "auxiliary_loss_clip": 0.01256742, "auxiliary_loss_mlp": 0.0021924, "balance_loss_clip": 1.03525209, "balance_loss_mlp": 0.19397929, "epoch": 0.7834059822636404, "flos": 28288738252800.0, "grad_norm": 34.6268436411174, "language_loss": 0.88494962, "learning_rate": 4.721473755175698e-07, "loss": 0.89970934, "num_input_tokens_seen": 281099500, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25244141, "step": 13030, "time_per_iteration": 2.941351890563965 }, { "auxiliary_loss_clip": 0.01257371, "auxiliary_loss_mlp": 0.00207474, "balance_loss_clip": 1.0352658, "balance_loss_mlp": 0.18266661, "epoch": 0.7834661055163085, "flos": 31686211088640.0, "grad_norm": 9.172777555960634, "language_loss": 0.80518484, "learning_rate": 4.71896083506476e-07, "loss": 0.81983334, "num_input_tokens_seen": 281121250, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24816895, "step": 13031, "time_per_iteration": 2.805762767791748 }, { "auxiliary_loss_clip": 0.01254901, "auxiliary_loss_mlp": 0.00203685, "balance_loss_clip": 1.02784944, "balance_loss_mlp": 0.17972437, "epoch": 0.7835262287689764, "flos": 12933192942720.0, "grad_norm": 18.464868939818373, "language_loss": 0.88580614, "learning_rate": 4.7164484944123574e-07, "loss": 0.90039194, "num_input_tokens_seen": 281138760, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.23937988, "step": 13032, "time_per_iteration": 2.6394622325897217 }, { "auxiliary_loss_clip": 0.01251297, "auxiliary_loss_mlp": 0.0022326, "balance_loss_clip": 1.03220415, "balance_loss_mlp": 0.19742706, "epoch": 0.7835863520216444, "flos": 16143211676160.0, "grad_norm": 10.315943972139483, "language_loss": 0.71066725, "learning_rate": 4.7139367333137726e-07, "loss": 0.72541285, "num_input_tokens_seen": 281157420, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25842285, "step": 13033, "time_per_iteration": 2.609969139099121 }, { "auxiliary_loss_clip": 0.01248926, "auxiliary_loss_mlp": 0.00216945, "balance_loss_clip": 1.02854812, "balance_loss_mlp": 0.19036084, "epoch": 0.7836464752743123, "flos": 11509909459200.0, "grad_norm": 66.33187607592848, "language_loss": 0.80270666, "learning_rate": 4.7114255518642255e-07, "loss": 0.81736529, "num_input_tokens_seen": 281174620, "router_z_loss_clip": 2.20214844, "router_z_loss_mlp": 0.26599121, "step": 13034, "time_per_iteration": 2.6741387844085693 }, { "auxiliary_loss_clip": 0.01247765, "auxiliary_loss_mlp": 0.00206026, "balance_loss_clip": 1.03020334, "balance_loss_mlp": 0.18064663, "epoch": 0.7837065985269803, "flos": 18223696350720.0, "grad_norm": 4.670648793059117, "language_loss": 0.80269861, "learning_rate": 4.7089149501589555e-07, "loss": 0.81723654, "num_input_tokens_seen": 281193865, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25390625, "step": 13035, "time_per_iteration": 2.6385042667388916 }, { "auxiliary_loss_clip": 0.01267538, "auxiliary_loss_mlp": 0.00225532, "balance_loss_clip": 1.04205298, "balance_loss_mlp": 0.19978327, "epoch": 0.7837667217796482, "flos": 24754410599040.0, "grad_norm": 1372.0967908456291, "language_loss": 0.7580899, "learning_rate": 4.7064049282931664e-07, "loss": 0.77302063, "num_input_tokens_seen": 281212250, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.25769043, "step": 13036, "time_per_iteration": 2.665850877761841 }, { "auxiliary_loss_clip": 0.01279603, "auxiliary_loss_mlp": 0.00226324, "balance_loss_clip": 1.04964328, "balance_loss_mlp": 0.20039597, "epoch": 0.7838268450323163, "flos": 22383121415040.0, "grad_norm": 15.54750865131778, "language_loss": 0.80640656, "learning_rate": 4.703895486362031e-07, "loss": 0.82146585, "num_input_tokens_seen": 281230850, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.25927734, "step": 13037, "time_per_iteration": 2.695424795150757 }, { "auxiliary_loss_clip": 0.01248459, "auxiliary_loss_mlp": 0.00231957, "balance_loss_clip": 1.02813625, "balance_loss_mlp": 0.20719719, "epoch": 0.7838869682849842, "flos": 19500284689920.0, "grad_norm": 13.402209979701386, "language_loss": 0.70922554, "learning_rate": 4.701386624460717e-07, "loss": 0.72402966, "num_input_tokens_seen": 281249810, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.24780273, "step": 13038, "time_per_iteration": 2.60798716545105 }, { "auxiliary_loss_clip": 0.0124983, "auxiliary_loss_mlp": 0.00204265, "balance_loss_clip": 1.03196955, "balance_loss_mlp": 0.17954105, "epoch": 0.7839470915376522, "flos": 32892845690880.0, "grad_norm": 2.454463442078956, "language_loss": 0.76992869, "learning_rate": 4.698878342684349e-07, "loss": 0.78446966, "num_input_tokens_seen": 281273730, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24743652, "step": 13039, "time_per_iteration": 2.7684693336486816 }, { "auxiliary_loss_clip": 0.01223497, "auxiliary_loss_mlp": 0.00194096, "balance_loss_clip": 1.01106453, "balance_loss_mlp": 0.17143445, "epoch": 0.7840072147903202, "flos": 29676003373440.0, "grad_norm": 17.50059297099022, "language_loss": 0.75294912, "learning_rate": 4.6963706411280537e-07, "loss": 0.76712501, "num_input_tokens_seen": 281293670, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.2265625, "step": 13040, "time_per_iteration": 2.720083475112915 }, { "auxiliary_loss_clip": 0.01243058, "auxiliary_loss_mlp": 0.00223918, "balance_loss_clip": 1.02125573, "balance_loss_mlp": 0.19924167, "epoch": 0.7840673380429881, "flos": 18186744234240.0, "grad_norm": 22.435007703621835, "language_loss": 0.7550056, "learning_rate": 4.6938635198869116e-07, "loss": 0.76967537, "num_input_tokens_seen": 281313070, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.2467041, "step": 13041, "time_per_iteration": 2.7100839614868164 }, { "auxiliary_loss_clip": 0.01079114, "auxiliary_loss_mlp": 0.00054833, "balance_loss_clip": 0.92805266, "balance_loss_mlp": 0.04815757, "epoch": 0.7841274612956561, "flos": 66346006613760.0, "grad_norm": 0.6424405194281785, "language_loss": 0.56741017, "learning_rate": 4.691356979055998e-07, "loss": 0.57874966, "num_input_tokens_seen": 281374880, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.06689453, "step": 13042, "time_per_iteration": 3.1545472145080566 }, { "auxiliary_loss_clip": 0.01243556, "auxiliary_loss_mlp": 0.00219837, "balance_loss_clip": 1.02498233, "balance_loss_mlp": 0.19605453, "epoch": 0.784187584548324, "flos": 26648482665600.0, "grad_norm": 104.9232247249522, "language_loss": 0.93292975, "learning_rate": 4.688851018730369e-07, "loss": 0.94756365, "num_input_tokens_seen": 281392620, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.23791504, "step": 13043, "time_per_iteration": 2.7251017093658447 }, { "auxiliary_loss_clip": 0.01255686, "auxiliary_loss_mlp": 0.00203846, "balance_loss_clip": 1.03604794, "balance_loss_mlp": 0.17961107, "epoch": 0.7842477078009921, "flos": 25740158515200.0, "grad_norm": 8.969652152178636, "language_loss": 0.92640638, "learning_rate": 4.6863456390050425e-07, "loss": 0.94100171, "num_input_tokens_seen": 281413140, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24230957, "step": 13044, "time_per_iteration": 2.7122583389282227 }, { "auxiliary_loss_clip": 0.01265759, "auxiliary_loss_mlp": 0.00221516, "balance_loss_clip": 1.03971148, "balance_loss_mlp": 0.19695854, "epoch": 0.78430783105366, "flos": 21980957765760.0, "grad_norm": 19.717843262073053, "language_loss": 0.86739218, "learning_rate": 4.6838408399750195e-07, "loss": 0.88226491, "num_input_tokens_seen": 281430860, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.24560547, "step": 13045, "time_per_iteration": 2.7338812351226807 }, { "auxiliary_loss_clip": 0.01258803, "auxiliary_loss_mlp": 0.00201446, "balance_loss_clip": 1.04050756, "balance_loss_mlp": 0.17694855, "epoch": 0.784367954306328, "flos": 23842279607040.0, "grad_norm": 8.84356058119435, "language_loss": 0.80112714, "learning_rate": 4.6813366217352925e-07, "loss": 0.81572962, "num_input_tokens_seen": 281451385, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24475098, "step": 13046, "time_per_iteration": 2.7146244049072266 }, { "auxiliary_loss_clip": 0.01246176, "auxiliary_loss_mlp": 0.00219421, "balance_loss_clip": 1.02548015, "balance_loss_mlp": 0.19542462, "epoch": 0.7844280775589959, "flos": 24826662806400.0, "grad_norm": 119.57206159597919, "language_loss": 0.71731758, "learning_rate": 4.678832984380809e-07, "loss": 0.73197353, "num_input_tokens_seen": 281472255, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.2401123, "step": 13047, "time_per_iteration": 2.710035800933838 }, { "auxiliary_loss_clip": 0.01222737, "auxiliary_loss_mlp": 0.0020589, "balance_loss_clip": 1.01211011, "balance_loss_mlp": 0.18282318, "epoch": 0.7844882008116639, "flos": 22455660931200.0, "grad_norm": 25.165524483891314, "language_loss": 0.79723573, "learning_rate": 4.676329928006515e-07, "loss": 0.81152195, "num_input_tokens_seen": 281492860, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.23071289, "step": 13048, "time_per_iteration": 2.697352647781372 }, { "auxiliary_loss_clip": 0.01250691, "auxiliary_loss_mlp": 0.00228025, "balance_loss_clip": 1.02926397, "balance_loss_mlp": 0.20330065, "epoch": 0.7845483240643318, "flos": 26104041244800.0, "grad_norm": 2.5906656046106873, "language_loss": 0.82603014, "learning_rate": 4.6738274527073243e-07, "loss": 0.84081733, "num_input_tokens_seen": 281511815, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.24731445, "step": 13049, "time_per_iteration": 2.6875557899475098 }, { "auxiliary_loss_clip": 0.01274445, "auxiliary_loss_mlp": 0.00227156, "balance_loss_clip": 1.0449661, "balance_loss_mlp": 0.20031022, "epoch": 0.7846084473169999, "flos": 19354307817600.0, "grad_norm": 35.54122270729712, "language_loss": 0.83165973, "learning_rate": 4.6713255585781454e-07, "loss": 0.84667575, "num_input_tokens_seen": 281530090, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.26855469, "step": 13050, "time_per_iteration": 2.660062551498413 }, { "auxiliary_loss_clip": 0.01233323, "auxiliary_loss_mlp": 0.00212935, "balance_loss_clip": 1.01770568, "balance_loss_mlp": 0.18806756, "epoch": 0.7846685705696678, "flos": 23325811902720.0, "grad_norm": 20.36139741035254, "language_loss": 0.82327789, "learning_rate": 4.668824245713825e-07, "loss": 0.83774054, "num_input_tokens_seen": 281547075, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24853516, "step": 13051, "time_per_iteration": 2.7238576412200928 }, { "auxiliary_loss_clip": 0.01268143, "auxiliary_loss_mlp": 0.00254886, "balance_loss_clip": 1.04352582, "balance_loss_mlp": 0.22780119, "epoch": 0.7847286938223358, "flos": 35809545962880.0, "grad_norm": 2.6602771594759385, "language_loss": 0.80853069, "learning_rate": 4.666323514209227e-07, "loss": 0.82376093, "num_input_tokens_seen": 281568080, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.27075195, "step": 13052, "time_per_iteration": 2.8782267570495605 }, { "auxiliary_loss_clip": 0.01243015, "auxiliary_loss_mlp": 0.00218407, "balance_loss_clip": 1.02660441, "balance_loss_mlp": 0.19455352, "epoch": 0.7847888170750038, "flos": 18478159274880.0, "grad_norm": 54.66998068921527, "language_loss": 0.77906525, "learning_rate": 4.663823364159183e-07, "loss": 0.79367954, "num_input_tokens_seen": 281586925, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.23864746, "step": 13053, "time_per_iteration": 2.6475422382354736 }, { "auxiliary_loss_clip": 0.01237255, "auxiliary_loss_mlp": 0.00213047, "balance_loss_clip": 1.02235389, "balance_loss_mlp": 0.18952695, "epoch": 0.7848489403276717, "flos": 25119155255040.0, "grad_norm": 5.119599131863661, "language_loss": 0.78693271, "learning_rate": 4.6613237956584893e-07, "loss": 0.80143577, "num_input_tokens_seen": 281603915, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.23535156, "step": 13054, "time_per_iteration": 2.747056007385254 }, { "auxiliary_loss_clip": 0.01251052, "auxiliary_loss_mlp": 0.00203217, "balance_loss_clip": 1.03145981, "balance_loss_mlp": 0.17725322, "epoch": 0.7849090635803397, "flos": 26502433966080.0, "grad_norm": 6.89441626237142, "language_loss": 0.82416868, "learning_rate": 4.658824808801938e-07, "loss": 0.83871138, "num_input_tokens_seen": 281624220, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25952148, "step": 13055, "time_per_iteration": 2.699556350708008 }, { "auxiliary_loss_clip": 0.01261788, "auxiliary_loss_mlp": 0.00226374, "balance_loss_clip": 1.03651452, "balance_loss_mlp": 0.20069641, "epoch": 0.7849691868330076, "flos": 20959658363520.0, "grad_norm": 8.528910884071719, "language_loss": 0.82497597, "learning_rate": 4.656326403684283e-07, "loss": 0.83985752, "num_input_tokens_seen": 281642325, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.25671387, "step": 13056, "time_per_iteration": 2.720327854156494 }, { "auxiliary_loss_clip": 0.01256415, "auxiliary_loss_mlp": 0.00224227, "balance_loss_clip": 1.03729546, "balance_loss_mlp": 0.20001557, "epoch": 0.7850293100856757, "flos": 26067484177920.0, "grad_norm": 14.53267089711101, "language_loss": 0.77817178, "learning_rate": 4.6538285804002744e-07, "loss": 0.79297823, "num_input_tokens_seen": 281663065, "router_z_loss_clip": 2.19238281, "router_z_loss_mlp": 0.24230957, "step": 13057, "time_per_iteration": 2.6988909244537354 }, { "auxiliary_loss_clip": 0.01242186, "auxiliary_loss_mlp": 0.00210268, "balance_loss_clip": 1.02214932, "balance_loss_mlp": 0.18655753, "epoch": 0.7850894333383436, "flos": 22491894775680.0, "grad_norm": 15.62411640202038, "language_loss": 0.83583605, "learning_rate": 4.6513313390446175e-07, "loss": 0.85036057, "num_input_tokens_seen": 281681005, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.23730469, "step": 13058, "time_per_iteration": 4.021360874176025 }, { "auxiliary_loss_clip": 0.01256554, "auxiliary_loss_mlp": 0.00225808, "balance_loss_clip": 1.03612185, "balance_loss_mlp": 0.20318258, "epoch": 0.7851495565910116, "flos": 20558643949440.0, "grad_norm": 8.83705305339014, "language_loss": 0.77888083, "learning_rate": 4.6488346797120146e-07, "loss": 0.79370445, "num_input_tokens_seen": 281697965, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.22619629, "step": 13059, "time_per_iteration": 2.6702728271484375 }, { "auxiliary_loss_clip": 0.0127392, "auxiliary_loss_mlp": 0.00225076, "balance_loss_clip": 1.04111922, "balance_loss_mlp": 0.19772945, "epoch": 0.7852096798436795, "flos": 15924838942080.0, "grad_norm": 67.4797474011675, "language_loss": 0.84787637, "learning_rate": 4.646338602497144e-07, "loss": 0.86286628, "num_input_tokens_seen": 281716035, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.2734375, "step": 13060, "time_per_iteration": 4.072642803192139 }, { "auxiliary_loss_clip": 0.01240023, "auxiliary_loss_mlp": 0.0021619, "balance_loss_clip": 1.02268314, "balance_loss_mlp": 0.19123991, "epoch": 0.7852698030963475, "flos": 19062282245760.0, "grad_norm": 57.16406017676538, "language_loss": 0.83511078, "learning_rate": 4.643843107494654e-07, "loss": 0.84967291, "num_input_tokens_seen": 281732815, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24975586, "step": 13061, "time_per_iteration": 2.626573085784912 }, { "auxiliary_loss_clip": 0.0124453, "auxiliary_loss_mlp": 0.00225041, "balance_loss_clip": 1.02267694, "balance_loss_mlp": 0.19942263, "epoch": 0.7853299263490154, "flos": 24644380262400.0, "grad_norm": 3.3030291971987813, "language_loss": 0.82970297, "learning_rate": 4.641348194799164e-07, "loss": 0.84439862, "num_input_tokens_seen": 281751980, "router_z_loss_clip": 2.21777344, "router_z_loss_mlp": 0.25610352, "step": 13062, "time_per_iteration": 2.7087514400482178 }, { "auxiliary_loss_clip": 0.01251326, "auxiliary_loss_mlp": 0.00212026, "balance_loss_clip": 1.03339624, "balance_loss_mlp": 0.18913826, "epoch": 0.7853900496016835, "flos": 22017981709440.0, "grad_norm": 629.9631445142546, "language_loss": 0.76033688, "learning_rate": 4.638853864505297e-07, "loss": 0.77497041, "num_input_tokens_seen": 281772670, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.2286377, "step": 13063, "time_per_iteration": 2.6595499515533447 }, { "auxiliary_loss_clip": 0.01245062, "auxiliary_loss_mlp": 0.00194329, "balance_loss_clip": 1.02734733, "balance_loss_mlp": 0.17130992, "epoch": 0.7854501728543514, "flos": 30227412032640.0, "grad_norm": 10.251212742206356, "language_loss": 0.82266629, "learning_rate": 4.636360116707625e-07, "loss": 0.83706015, "num_input_tokens_seen": 281792930, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.22998047, "step": 13064, "time_per_iteration": 2.8023855686187744 }, { "auxiliary_loss_clip": 0.01251132, "auxiliary_loss_mlp": 0.00237278, "balance_loss_clip": 1.02927804, "balance_loss_mlp": 0.21016955, "epoch": 0.7855102961070194, "flos": 18843694030080.0, "grad_norm": 45.48408710665011, "language_loss": 0.76130617, "learning_rate": 4.633866951500718e-07, "loss": 0.77619028, "num_input_tokens_seen": 281811805, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.27124023, "step": 13065, "time_per_iteration": 4.26900577545166 }, { "auxiliary_loss_clip": 0.01252109, "auxiliary_loss_mlp": 0.00218849, "balance_loss_clip": 1.03359783, "balance_loss_mlp": 0.19358817, "epoch": 0.7855704193596874, "flos": 22309971367680.0, "grad_norm": 21.91147439367599, "language_loss": 0.86657178, "learning_rate": 4.6313743689791196e-07, "loss": 0.88128138, "num_input_tokens_seen": 281831885, "router_z_loss_clip": 2.18652344, "router_z_loss_mlp": 0.25256348, "step": 13066, "time_per_iteration": 2.802011251449585 }, { "auxiliary_loss_clip": 0.01095634, "auxiliary_loss_mlp": 0.00063961, "balance_loss_clip": 0.9420166, "balance_loss_mlp": 0.05752374, "epoch": 0.7856305426123553, "flos": 60004434407040.0, "grad_norm": 0.7741542542799573, "language_loss": 0.52848983, "learning_rate": 4.628882369237346e-07, "loss": 0.54008579, "num_input_tokens_seen": 281900310, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.06445312, "step": 13067, "time_per_iteration": 3.318704128265381 }, { "auxiliary_loss_clip": 0.01238884, "auxiliary_loss_mlp": 0.00214695, "balance_loss_clip": 1.02281761, "balance_loss_mlp": 0.18852885, "epoch": 0.7856906658650233, "flos": 21868593045120.0, "grad_norm": 23.93218982729872, "language_loss": 0.75047886, "learning_rate": 4.62639095236989e-07, "loss": 0.76501465, "num_input_tokens_seen": 281918870, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.26184082, "step": 13068, "time_per_iteration": 2.7167530059814453 }, { "auxiliary_loss_clip": 0.0123139, "auxiliary_loss_mlp": 0.00184386, "balance_loss_clip": 1.01737952, "balance_loss_mlp": 0.16165242, "epoch": 0.7857507891176913, "flos": 23622937205760.0, "grad_norm": 170.3444216191764, "language_loss": 0.75893724, "learning_rate": 4.6239001184712267e-07, "loss": 0.77309501, "num_input_tokens_seen": 281936905, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.22741699, "step": 13069, "time_per_iteration": 4.166734933853149 }, { "auxiliary_loss_clip": 0.01268275, "auxiliary_loss_mlp": 0.00231444, "balance_loss_clip": 1.04289544, "balance_loss_mlp": 0.20542045, "epoch": 0.7858109123703593, "flos": 25520061928320.0, "grad_norm": 4.347305228647228, "language_loss": 0.83049756, "learning_rate": 4.6214098676358195e-07, "loss": 0.84549475, "num_input_tokens_seen": 281955625, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.26037598, "step": 13070, "time_per_iteration": 2.700254201889038 }, { "auxiliary_loss_clip": 0.01245984, "auxiliary_loss_mlp": 0.001988, "balance_loss_clip": 1.02641809, "balance_loss_mlp": 0.17395645, "epoch": 0.7858710356230272, "flos": 17457398576640.0, "grad_norm": 82.31310919932727, "language_loss": 0.75251716, "learning_rate": 4.618920199958083e-07, "loss": 0.76696503, "num_input_tokens_seen": 281973285, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.24853516, "step": 13071, "time_per_iteration": 2.6336936950683594 }, { "auxiliary_loss_clip": 0.01253651, "auxiliary_loss_mlp": 0.0022711, "balance_loss_clip": 1.03150141, "balance_loss_mlp": 0.20114625, "epoch": 0.7859311588756952, "flos": 24679680353280.0, "grad_norm": 9.606408581346285, "language_loss": 0.81247544, "learning_rate": 4.616431115532442e-07, "loss": 0.82728314, "num_input_tokens_seen": 281991410, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25952148, "step": 13072, "time_per_iteration": 2.7319447994232178 }, { "auxiliary_loss_clip": 0.01275098, "auxiliary_loss_mlp": 0.00224839, "balance_loss_clip": 1.04585826, "balance_loss_mlp": 0.19888671, "epoch": 0.7859912821283631, "flos": 21799142098560.0, "grad_norm": 36.38784875649851, "language_loss": 0.79449034, "learning_rate": 4.613942614453268e-07, "loss": 0.80948973, "num_input_tokens_seen": 282010845, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.25952148, "step": 13073, "time_per_iteration": 2.678715467453003 }, { "auxiliary_loss_clip": 0.01257764, "auxiliary_loss_mlp": 0.00205981, "balance_loss_clip": 1.03239286, "balance_loss_mlp": 0.17937383, "epoch": 0.7860514053810311, "flos": 20847293642880.0, "grad_norm": 11.757560147393978, "language_loss": 0.83593822, "learning_rate": 4.611454696814938e-07, "loss": 0.85057569, "num_input_tokens_seen": 282029635, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26586914, "step": 13074, "time_per_iteration": 2.6301026344299316 }, { "auxiliary_loss_clip": 0.01234978, "auxiliary_loss_mlp": 0.0022508, "balance_loss_clip": 1.02272117, "balance_loss_mlp": 0.20057088, "epoch": 0.786111528633699, "flos": 24315689882880.0, "grad_norm": 28.5573641494948, "language_loss": 0.8124553, "learning_rate": 4.608967362711782e-07, "loss": 0.82705587, "num_input_tokens_seen": 282050285, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.24523926, "step": 13075, "time_per_iteration": 2.7052865028381348 }, { "auxiliary_loss_clip": 0.01280986, "auxiliary_loss_mlp": 0.00203081, "balance_loss_clip": 1.05631244, "balance_loss_mlp": 0.17830959, "epoch": 0.7861716518863671, "flos": 24353180703360.0, "grad_norm": 4.8978619358706705, "language_loss": 0.75557268, "learning_rate": 4.6064806122381283e-07, "loss": 0.77041334, "num_input_tokens_seen": 282071040, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.2479248, "step": 13076, "time_per_iteration": 2.7693278789520264 }, { "auxiliary_loss_clip": 0.0127347, "auxiliary_loss_mlp": 0.00212272, "balance_loss_clip": 1.05130911, "balance_loss_mlp": 0.18621311, "epoch": 0.786231775139035, "flos": 14022399006720.0, "grad_norm": 118.56588888833174, "language_loss": 0.8937996, "learning_rate": 4.603994445488282e-07, "loss": 0.90865701, "num_input_tokens_seen": 282086610, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26074219, "step": 13077, "time_per_iteration": 2.635636806488037 }, { "auxiliary_loss_clip": 0.0126148, "auxiliary_loss_mlp": 0.0022634, "balance_loss_clip": 1.0379374, "balance_loss_mlp": 0.20124659, "epoch": 0.786291898391703, "flos": 33724248865920.0, "grad_norm": 9.719030716110984, "language_loss": 0.78089058, "learning_rate": 4.6015088625564956e-07, "loss": 0.7957688, "num_input_tokens_seen": 282107440, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.25085449, "step": 13078, "time_per_iteration": 2.7345705032348633 }, { "auxiliary_loss_clip": 0.01231946, "auxiliary_loss_mlp": 0.00218679, "balance_loss_clip": 1.02042389, "balance_loss_mlp": 0.19593361, "epoch": 0.786352021644371, "flos": 25811476968960.0, "grad_norm": 4.613682705558808, "language_loss": 0.87161756, "learning_rate": 4.599023863537039e-07, "loss": 0.88612378, "num_input_tokens_seen": 282127290, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.22753906, "step": 13079, "time_per_iteration": 2.711592674255371 }, { "auxiliary_loss_clip": 0.01242254, "auxiliary_loss_mlp": 0.001999, "balance_loss_clip": 1.02938843, "balance_loss_mlp": 0.17655879, "epoch": 0.7864121448970389, "flos": 28910818920960.0, "grad_norm": 33.304927685251634, "language_loss": 0.74722457, "learning_rate": 4.596539448524146e-07, "loss": 0.76164615, "num_input_tokens_seen": 282147505, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23339844, "step": 13080, "time_per_iteration": 2.7042531967163086 }, { "auxiliary_loss_clip": 0.01257215, "auxiliary_loss_mlp": 0.00220437, "balance_loss_clip": 1.03587055, "balance_loss_mlp": 0.19498578, "epoch": 0.7864722681497069, "flos": 19208833735680.0, "grad_norm": 6.386351040065943, "language_loss": 0.77981174, "learning_rate": 4.594055617612016e-07, "loss": 0.79458827, "num_input_tokens_seen": 282166450, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25427246, "step": 13081, "time_per_iteration": 2.6706178188323975 }, { "auxiliary_loss_clip": 0.01246974, "auxiliary_loss_mlp": 0.00206747, "balance_loss_clip": 1.03098726, "balance_loss_mlp": 0.18192723, "epoch": 0.7865323914023749, "flos": 21871573873920.0, "grad_norm": 44.67230090614582, "language_loss": 0.75002217, "learning_rate": 4.591572370894838e-07, "loss": 0.76455939, "num_input_tokens_seen": 282186465, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24804688, "step": 13082, "time_per_iteration": 2.668178081512451 }, { "auxiliary_loss_clip": 0.01246228, "auxiliary_loss_mlp": 0.00207342, "balance_loss_clip": 1.02694869, "balance_loss_mlp": 0.18366739, "epoch": 0.7865925146550429, "flos": 25520313323520.0, "grad_norm": 21.4314543709459, "language_loss": 0.73138857, "learning_rate": 4.589089708466789e-07, "loss": 0.74592429, "num_input_tokens_seen": 282207180, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.23681641, "step": 13083, "time_per_iteration": 2.701592206954956 }, { "auxiliary_loss_clip": 0.01265691, "auxiliary_loss_mlp": 0.00208188, "balance_loss_clip": 1.03676045, "balance_loss_mlp": 0.1807934, "epoch": 0.7866526379077108, "flos": 19097366855040.0, "grad_norm": 15.756268684197083, "language_loss": 0.86132008, "learning_rate": 4.5866076304220015e-07, "loss": 0.87605882, "num_input_tokens_seen": 282225865, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.27429199, "step": 13084, "time_per_iteration": 2.595592498779297 }, { "auxiliary_loss_clip": 0.01239776, "auxiliary_loss_mlp": 0.00206039, "balance_loss_clip": 1.02471042, "balance_loss_mlp": 0.18356802, "epoch": 0.7867127611603788, "flos": 16173771171840.0, "grad_norm": 37.5908463465442, "language_loss": 0.76650655, "learning_rate": 4.584126136854591e-07, "loss": 0.78096461, "num_input_tokens_seen": 282242895, "router_z_loss_clip": 2.15136719, "router_z_loss_mlp": 0.22497559, "step": 13085, "time_per_iteration": 2.6232354640960693 }, { "auxiliary_loss_clip": 0.01263217, "auxiliary_loss_mlp": 0.00222143, "balance_loss_clip": 1.03492999, "balance_loss_mlp": 0.19694185, "epoch": 0.7867728844130467, "flos": 20773640805120.0, "grad_norm": 253.69442056822695, "language_loss": 0.81442571, "learning_rate": 4.5816452278586617e-07, "loss": 0.8292793, "num_input_tokens_seen": 282260425, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.25231934, "step": 13086, "time_per_iteration": 2.640385389328003 }, { "auxiliary_loss_clip": 0.01245619, "auxiliary_loss_mlp": 0.00195666, "balance_loss_clip": 1.02866435, "balance_loss_mlp": 0.17033441, "epoch": 0.7868330076657147, "flos": 21760106993280.0, "grad_norm": 97.8055042392618, "language_loss": 0.81574726, "learning_rate": 4.5791649035282965e-07, "loss": 0.83016014, "num_input_tokens_seen": 282279335, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.2532959, "step": 13087, "time_per_iteration": 2.6687729358673096 }, { "auxiliary_loss_clip": 0.0122832, "auxiliary_loss_mlp": 0.00224298, "balance_loss_clip": 1.01362491, "balance_loss_mlp": 0.20036089, "epoch": 0.7868931309183826, "flos": 25700692446720.0, "grad_norm": 10.3180264076929, "language_loss": 0.76497924, "learning_rate": 4.5766851639575456e-07, "loss": 0.77950549, "num_input_tokens_seen": 282299905, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.23937988, "step": 13088, "time_per_iteration": 2.7521519660949707 }, { "auxiliary_loss_clip": 0.01096319, "auxiliary_loss_mlp": 0.00057634, "balance_loss_clip": 0.94434589, "balance_loss_mlp": 0.05033831, "epoch": 0.7869532541710507, "flos": 64644883430400.0, "grad_norm": 0.7172201046128968, "language_loss": 0.54680371, "learning_rate": 4.574206009240431e-07, "loss": 0.55834317, "num_input_tokens_seen": 282367620, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.07275391, "step": 13089, "time_per_iteration": 3.2705702781677246 }, { "auxiliary_loss_clip": 0.01097265, "auxiliary_loss_mlp": 0.00067981, "balance_loss_clip": 0.94668317, "balance_loss_mlp": 0.06116207, "epoch": 0.7870133774237186, "flos": 67453600440960.0, "grad_norm": 0.7144238226787382, "language_loss": 0.49266508, "learning_rate": 4.571727439470976e-07, "loss": 0.50431752, "num_input_tokens_seen": 282435695, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.06835938, "step": 13090, "time_per_iteration": 3.2774274349212646 }, { "auxiliary_loss_clip": 0.0123296, "auxiliary_loss_mlp": 0.00195209, "balance_loss_clip": 1.01739967, "balance_loss_mlp": 0.17217743, "epoch": 0.7870735006763866, "flos": 26068310190720.0, "grad_norm": 3.548803143607071, "language_loss": 0.89414251, "learning_rate": 4.5692494547431583e-07, "loss": 0.9084242, "num_input_tokens_seen": 282456025, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.23034668, "step": 13091, "time_per_iteration": 2.6999571323394775 }, { "auxiliary_loss_clip": 0.01094368, "auxiliary_loss_mlp": 0.00066974, "balance_loss_clip": 0.9429391, "balance_loss_mlp": 0.06039357, "epoch": 0.7871336239290546, "flos": 70289572896000.0, "grad_norm": 0.8079349075148051, "language_loss": 0.63622558, "learning_rate": 4.566772055150947e-07, "loss": 0.64783901, "num_input_tokens_seen": 282520995, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.06591797, "step": 13092, "time_per_iteration": 3.172266721725464 }, { "auxiliary_loss_clip": 0.01246168, "auxiliary_loss_mlp": 0.00218712, "balance_loss_clip": 1.02521038, "balance_loss_mlp": 0.19316536, "epoch": 0.7871937471817225, "flos": 15778574760960.0, "grad_norm": 29.59435047614907, "language_loss": 0.88944411, "learning_rate": 4.564295240788285e-07, "loss": 0.90409291, "num_input_tokens_seen": 282539355, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25561523, "step": 13093, "time_per_iteration": 2.721970558166504 }, { "auxiliary_loss_clip": 0.01230744, "auxiliary_loss_mlp": 0.00188286, "balance_loss_clip": 1.0125078, "balance_loss_mlp": 0.1641935, "epoch": 0.7872538704343905, "flos": 20485242506880.0, "grad_norm": 81.92766753511157, "language_loss": 0.83719295, "learning_rate": 4.561819011749106e-07, "loss": 0.85138333, "num_input_tokens_seen": 282555735, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.2409668, "step": 13094, "time_per_iteration": 2.670842170715332 }, { "auxiliary_loss_clip": 0.01243754, "auxiliary_loss_mlp": 0.00224845, "balance_loss_clip": 1.02097404, "balance_loss_mlp": 0.19959605, "epoch": 0.7873139936870585, "flos": 25082670015360.0, "grad_norm": 56.51454818069856, "language_loss": 0.86204833, "learning_rate": 4.5593433681272884e-07, "loss": 0.87673432, "num_input_tokens_seen": 282574550, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25280762, "step": 13095, "time_per_iteration": 2.7483232021331787 }, { "auxiliary_loss_clip": 0.01255456, "auxiliary_loss_mlp": 0.00209981, "balance_loss_clip": 1.02951622, "balance_loss_mlp": 0.18458909, "epoch": 0.7873741169397265, "flos": 30883176679680.0, "grad_norm": 15.033749463980687, "language_loss": 0.75488102, "learning_rate": 4.556868310016715e-07, "loss": 0.76953542, "num_input_tokens_seen": 282596520, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.25402832, "step": 13096, "time_per_iteration": 2.7231228351593018 }, { "auxiliary_loss_clip": 0.01222488, "auxiliary_loss_mlp": 0.00191981, "balance_loss_clip": 1.01089239, "balance_loss_mlp": 0.16922414, "epoch": 0.7874342401923944, "flos": 46791962242560.0, "grad_norm": 14.02274298696027, "language_loss": 0.75246024, "learning_rate": 4.55439383751125e-07, "loss": 0.76660496, "num_input_tokens_seen": 282620560, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.22766113, "step": 13097, "time_per_iteration": 2.927507162094116 }, { "auxiliary_loss_clip": 0.01273149, "auxiliary_loss_mlp": 0.00207477, "balance_loss_clip": 1.04525805, "balance_loss_mlp": 0.18096499, "epoch": 0.7874943634450624, "flos": 23584548545280.0, "grad_norm": 13.943764542811582, "language_loss": 0.87328094, "learning_rate": 4.5519199507047126e-07, "loss": 0.88808721, "num_input_tokens_seen": 282639830, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.26513672, "step": 13098, "time_per_iteration": 2.684392213821411 }, { "auxiliary_loss_clip": 0.012314, "auxiliary_loss_mlp": 0.00195737, "balance_loss_clip": 1.01677382, "balance_loss_mlp": 0.17336093, "epoch": 0.7875544866977303, "flos": 20191169859840.0, "grad_norm": 3.762547610573993, "language_loss": 0.81369442, "learning_rate": 4.5494466496909177e-07, "loss": 0.82796574, "num_input_tokens_seen": 282660130, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.22399902, "step": 13099, "time_per_iteration": 2.7116940021514893 }, { "auxiliary_loss_clip": 0.01257023, "auxiliary_loss_mlp": 0.00212495, "balance_loss_clip": 1.03776729, "balance_loss_mlp": 0.18799752, "epoch": 0.7876146099503983, "flos": 22602571557120.0, "grad_norm": 5.3487021793504255, "language_loss": 0.84484303, "learning_rate": 4.5469739345636603e-07, "loss": 0.8595382, "num_input_tokens_seen": 282681125, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24511719, "step": 13100, "time_per_iteration": 4.147493124008179 }, { "auxiliary_loss_clip": 0.01265451, "auxiliary_loss_mlp": 0.00220247, "balance_loss_clip": 1.03575182, "balance_loss_mlp": 0.19349624, "epoch": 0.7876747332030662, "flos": 10705833555840.0, "grad_norm": 7.551240129450193, "language_loss": 0.78539664, "learning_rate": 4.5445018054167007e-07, "loss": 0.80025369, "num_input_tokens_seen": 282696690, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.26757812, "step": 13101, "time_per_iteration": 2.6222496032714844 }, { "auxiliary_loss_clip": 0.01243931, "auxiliary_loss_mlp": 0.00198407, "balance_loss_clip": 1.02555871, "balance_loss_mlp": 0.17415984, "epoch": 0.7877348564557343, "flos": 38399315621760.0, "grad_norm": 10.70158512516379, "language_loss": 0.83275193, "learning_rate": 4.5420302623437745e-07, "loss": 0.84717524, "num_input_tokens_seen": 282721210, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24243164, "step": 13102, "time_per_iteration": 4.23903751373291 }, { "auxiliary_loss_clip": 0.01228151, "auxiliary_loss_mlp": 0.00221666, "balance_loss_clip": 1.01357841, "balance_loss_mlp": 0.19784829, "epoch": 0.7877949797084022, "flos": 18329524796160.0, "grad_norm": 11.949221136122025, "language_loss": 0.87855607, "learning_rate": 4.5395593054386093e-07, "loss": 0.89305425, "num_input_tokens_seen": 282738505, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.23840332, "step": 13103, "time_per_iteration": 2.6434707641601562 }, { "auxiliary_loss_clip": 0.01271238, "auxiliary_loss_mlp": 0.00212824, "balance_loss_clip": 1.04433572, "balance_loss_mlp": 0.18843356, "epoch": 0.7878551029610702, "flos": 25806736373760.0, "grad_norm": 13.96612637455344, "language_loss": 0.88321817, "learning_rate": 4.537088934794913e-07, "loss": 0.89805877, "num_input_tokens_seen": 282756895, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.24401855, "step": 13104, "time_per_iteration": 2.7515950202941895 }, { "auxiliary_loss_clip": 0.01243135, "auxiliary_loss_mlp": 0.00205882, "balance_loss_clip": 1.02230227, "balance_loss_mlp": 0.18246968, "epoch": 0.7879152262137382, "flos": 22342685679360.0, "grad_norm": 137.18513435967049, "language_loss": 0.80301124, "learning_rate": 4.5346191505063515e-07, "loss": 0.81750143, "num_input_tokens_seen": 282774955, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.23388672, "step": 13105, "time_per_iteration": 2.7590067386627197 }, { "auxiliary_loss_clip": 0.01239488, "auxiliary_loss_mlp": 0.00214814, "balance_loss_clip": 1.01376462, "balance_loss_mlp": 0.18848091, "epoch": 0.7879753494664061, "flos": 24785329230720.0, "grad_norm": 9.0969655750624, "language_loss": 0.84954017, "learning_rate": 4.5321499526665776e-07, "loss": 0.86408317, "num_input_tokens_seen": 282793165, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26342773, "step": 13106, "time_per_iteration": 2.6987545490264893 }, { "auxiliary_loss_clip": 0.01241434, "auxiliary_loss_mlp": 0.00201656, "balance_loss_clip": 1.02001894, "balance_loss_mlp": 0.17712313, "epoch": 0.7880354727190741, "flos": 16909078487040.0, "grad_norm": 3867.6299648925824, "language_loss": 0.84275711, "learning_rate": 4.5296813413692337e-07, "loss": 0.85718799, "num_input_tokens_seen": 282809820, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.24523926, "step": 13107, "time_per_iteration": 4.21003532409668 }, { "auxiliary_loss_clip": 0.01231679, "auxiliary_loss_mlp": 0.00188028, "balance_loss_clip": 1.01895452, "balance_loss_mlp": 0.16609317, "epoch": 0.7880955959717421, "flos": 22230500526720.0, "grad_norm": 21.72220197822257, "language_loss": 0.79695749, "learning_rate": 4.5272133167079165e-07, "loss": 0.81115454, "num_input_tokens_seen": 282828600, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.21948242, "step": 13108, "time_per_iteration": 2.7177891731262207 }, { "auxiliary_loss_clip": 0.01101296, "auxiliary_loss_mlp": 0.00042951, "balance_loss_clip": 0.94741952, "balance_loss_mlp": 0.0355125, "epoch": 0.7881557192244101, "flos": 69183200131200.0, "grad_norm": 0.8672798269681281, "language_loss": 0.59915078, "learning_rate": 4.5247458787762216e-07, "loss": 0.61059326, "num_input_tokens_seen": 282882775, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.07421875, "step": 13109, "time_per_iteration": 3.145371913909912 }, { "auxiliary_loss_clip": 0.01253546, "auxiliary_loss_mlp": 0.00226862, "balance_loss_clip": 1.03490686, "balance_loss_mlp": 0.2012682, "epoch": 0.788215842477078, "flos": 24935436167040.0, "grad_norm": 4.682680974527127, "language_loss": 0.79875278, "learning_rate": 4.5222790276677126e-07, "loss": 0.81355691, "num_input_tokens_seen": 282902680, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25585938, "step": 13110, "time_per_iteration": 2.738661766052246 }, { "auxiliary_loss_clip": 0.01233867, "auxiliary_loss_mlp": 0.00200286, "balance_loss_clip": 1.01813579, "balance_loss_mlp": 0.1778146, "epoch": 0.788275965729746, "flos": 26106483369600.0, "grad_norm": 4.585450469427106, "language_loss": 0.80837965, "learning_rate": 4.5198127634759455e-07, "loss": 0.82272112, "num_input_tokens_seen": 282923625, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.22473145, "step": 13111, "time_per_iteration": 4.137788772583008 }, { "auxiliary_loss_clip": 0.01236055, "auxiliary_loss_mlp": 0.00206228, "balance_loss_clip": 1.01692128, "balance_loss_mlp": 0.18121809, "epoch": 0.7883360889824139, "flos": 21214803646080.0, "grad_norm": 54.55076188146769, "language_loss": 0.71983802, "learning_rate": 4.5173470862944206e-07, "loss": 0.73426086, "num_input_tokens_seen": 282941955, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25012207, "step": 13112, "time_per_iteration": 2.67777156829834 }, { "auxiliary_loss_clip": 0.01245348, "auxiliary_loss_mlp": 0.0020892, "balance_loss_clip": 1.0217247, "balance_loss_mlp": 0.18271767, "epoch": 0.7883962122350819, "flos": 21142551438720.0, "grad_norm": 6.176783462314498, "language_loss": 0.76971227, "learning_rate": 4.514881996216644e-07, "loss": 0.78425491, "num_input_tokens_seen": 282961280, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26196289, "step": 13113, "time_per_iteration": 2.7192180156707764 }, { "auxiliary_loss_clip": 0.01229696, "auxiliary_loss_mlp": 0.00195209, "balance_loss_clip": 1.01497841, "balance_loss_mlp": 0.17121205, "epoch": 0.7884563354877498, "flos": 15302901928320.0, "grad_norm": 39.82539489126125, "language_loss": 0.727, "learning_rate": 4.5124174933361e-07, "loss": 0.74124908, "num_input_tokens_seen": 282978210, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.23986816, "step": 13114, "time_per_iteration": 2.6741318702697754 }, { "auxiliary_loss_clip": 0.0123617, "auxiliary_loss_mlp": 0.00212697, "balance_loss_clip": 1.01674557, "balance_loss_mlp": 0.18687651, "epoch": 0.7885164587404179, "flos": 24388301226240.0, "grad_norm": 6.09079784322184, "language_loss": 0.72265738, "learning_rate": 4.5099535777462306e-07, "loss": 0.73714608, "num_input_tokens_seen": 282998845, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25830078, "step": 13115, "time_per_iteration": 2.768845796585083 }, { "auxiliary_loss_clip": 0.01242816, "auxiliary_loss_mlp": 0.00225846, "balance_loss_clip": 1.02059329, "balance_loss_mlp": 0.19991848, "epoch": 0.7885765819930858, "flos": 14385886686720.0, "grad_norm": 216.363041099254, "language_loss": 0.93706167, "learning_rate": 4.50749024954048e-07, "loss": 0.95174825, "num_input_tokens_seen": 283015200, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25915527, "step": 13116, "time_per_iteration": 2.6353321075439453 }, { "auxiliary_loss_clip": 0.01264942, "auxiliary_loss_mlp": 0.00232123, "balance_loss_clip": 1.03548419, "balance_loss_mlp": 0.20477596, "epoch": 0.7886367052457538, "flos": 18259930195200.0, "grad_norm": 8.405841604036748, "language_loss": 0.82281935, "learning_rate": 4.505027508812245e-07, "loss": 0.83779001, "num_input_tokens_seen": 283033680, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.27355957, "step": 13117, "time_per_iteration": 2.667958974838257 }, { "auxiliary_loss_clip": 0.0123191, "auxiliary_loss_mlp": 0.00232184, "balance_loss_clip": 1.01750243, "balance_loss_mlp": 0.20661348, "epoch": 0.7886968284984217, "flos": 15305092657920.0, "grad_norm": 261.08196233536484, "language_loss": 0.85610044, "learning_rate": 4.502565355654926e-07, "loss": 0.87074137, "num_input_tokens_seen": 283050620, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.25561523, "step": 13118, "time_per_iteration": 2.6074297428131104 }, { "auxiliary_loss_clip": 0.01244131, "auxiliary_loss_mlp": 0.002063, "balance_loss_clip": 1.02941346, "balance_loss_mlp": 0.18311419, "epoch": 0.7887569517510897, "flos": 21215450090880.0, "grad_norm": 12.318056355060778, "language_loss": 0.80504251, "learning_rate": 4.500103790161878e-07, "loss": 0.81954676, "num_input_tokens_seen": 283070215, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.23193359, "step": 13119, "time_per_iteration": 2.711944818496704 }, { "auxiliary_loss_clip": 0.01225992, "auxiliary_loss_mlp": 0.00203451, "balance_loss_clip": 1.01234937, "balance_loss_mlp": 0.18193324, "epoch": 0.7888170750037578, "flos": 22711237176960.0, "grad_norm": 9.50903849518193, "language_loss": 0.79511863, "learning_rate": 4.4976428124264454e-07, "loss": 0.80941302, "num_input_tokens_seen": 283091485, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.21508789, "step": 13120, "time_per_iteration": 2.6600284576416016 }, { "auxiliary_loss_clip": 0.0125422, "auxiliary_loss_mlp": 0.00207113, "balance_loss_clip": 1.03531528, "balance_loss_mlp": 0.18406957, "epoch": 0.7888771982564257, "flos": 36429148592640.0, "grad_norm": 22.90084099356108, "language_loss": 0.85500205, "learning_rate": 4.4951824225419564e-07, "loss": 0.86961538, "num_input_tokens_seen": 283115040, "router_z_loss_clip": 2.19042969, "router_z_loss_mlp": 0.23034668, "step": 13121, "time_per_iteration": 2.8684659004211426 }, { "auxiliary_loss_clip": 0.01230962, "auxiliary_loss_mlp": 0.00198764, "balance_loss_clip": 1.01230073, "balance_loss_mlp": 0.17587595, "epoch": 0.7889373215090937, "flos": 27309993488640.0, "grad_norm": 34.844666367351714, "language_loss": 0.85216224, "learning_rate": 4.4927226206017057e-07, "loss": 0.86645949, "num_input_tokens_seen": 283136925, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.22900391, "step": 13122, "time_per_iteration": 2.8993124961853027 }, { "auxiliary_loss_clip": 0.01230287, "auxiliary_loss_mlp": 0.00214074, "balance_loss_clip": 1.01449418, "balance_loss_mlp": 0.19155537, "epoch": 0.7889974447617616, "flos": 19829010983040.0, "grad_norm": 29.369414586679785, "language_loss": 0.85792381, "learning_rate": 4.4902634066989597e-07, "loss": 0.87236738, "num_input_tokens_seen": 283155725, "router_z_loss_clip": 2.15527344, "router_z_loss_mlp": 0.22521973, "step": 13123, "time_per_iteration": 2.6595842838287354 }, { "auxiliary_loss_clip": 0.01252011, "auxiliary_loss_mlp": 0.00241603, "balance_loss_clip": 1.02676809, "balance_loss_mlp": 0.21444687, "epoch": 0.7890575680144296, "flos": 17271201450240.0, "grad_norm": 4.012332436894762, "language_loss": 0.76117778, "learning_rate": 4.487804780926985e-07, "loss": 0.77611387, "num_input_tokens_seen": 283173845, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.27172852, "step": 13124, "time_per_iteration": 2.675924777984619 }, { "auxiliary_loss_clip": 0.01260255, "auxiliary_loss_mlp": 0.00214571, "balance_loss_clip": 1.03343773, "balance_loss_mlp": 0.19072931, "epoch": 0.7891176912670975, "flos": 27600151553280.0, "grad_norm": 13.56917860188531, "language_loss": 0.84415758, "learning_rate": 4.4853467433790036e-07, "loss": 0.85890579, "num_input_tokens_seen": 283191985, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.23852539, "step": 13125, "time_per_iteration": 2.7045187950134277 }, { "auxiliary_loss_clip": 0.01234826, "auxiliary_loss_mlp": 0.0019693, "balance_loss_clip": 1.01568198, "balance_loss_mlp": 0.17286195, "epoch": 0.7891778145197655, "flos": 22711668140160.0, "grad_norm": 4.238614892885009, "language_loss": 0.80538845, "learning_rate": 4.4828892941482267e-07, "loss": 0.81970608, "num_input_tokens_seen": 283210855, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.2409668, "step": 13126, "time_per_iteration": 2.706866979598999 }, { "auxiliary_loss_clip": 0.01250799, "auxiliary_loss_mlp": 0.00213781, "balance_loss_clip": 1.02960837, "balance_loss_mlp": 0.1884492, "epoch": 0.7892379377724335, "flos": 17310775259520.0, "grad_norm": 6.887634762417076, "language_loss": 0.85745382, "learning_rate": 4.480432433327845e-07, "loss": 0.87209964, "num_input_tokens_seen": 283229665, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.2532959, "step": 13127, "time_per_iteration": 2.6812210083007812 }, { "auxiliary_loss_clip": 0.01242287, "auxiliary_loss_mlp": 0.00201399, "balance_loss_clip": 1.02718496, "balance_loss_mlp": 0.179548, "epoch": 0.7892980610251015, "flos": 25775674087680.0, "grad_norm": 2.4039406194226496, "language_loss": 0.90982485, "learning_rate": 4.47797616101103e-07, "loss": 0.92426169, "num_input_tokens_seen": 283248615, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.21826172, "step": 13128, "time_per_iteration": 2.7352020740509033 }, { "auxiliary_loss_clip": 0.01231984, "auxiliary_loss_mlp": 0.00198285, "balance_loss_clip": 1.01847172, "balance_loss_mlp": 0.17602846, "epoch": 0.7893581842777694, "flos": 21579943351680.0, "grad_norm": 3.6184867666961296, "language_loss": 0.77697676, "learning_rate": 4.475520477290904e-07, "loss": 0.79127944, "num_input_tokens_seen": 283267135, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.22241211, "step": 13129, "time_per_iteration": 2.671686887741089 }, { "auxiliary_loss_clip": 0.01127858, "auxiliary_loss_mlp": 0.00116509, "balance_loss_clip": 0.97202402, "balance_loss_mlp": 0.1083555, "epoch": 0.7894183075304374, "flos": 69016468176000.0, "grad_norm": 0.7043971965011516, "language_loss": 0.60531402, "learning_rate": 4.473065382260597e-07, "loss": 0.61775768, "num_input_tokens_seen": 283328940, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.08154297, "step": 13130, "time_per_iteration": 3.2569127082824707 }, { "auxiliary_loss_clip": 0.0123912, "auxiliary_loss_mlp": 0.00199526, "balance_loss_clip": 1.02332401, "balance_loss_mlp": 0.17667392, "epoch": 0.7894784307831053, "flos": 24243258107520.0, "grad_norm": 34.485588000734914, "language_loss": 0.81149918, "learning_rate": 4.4706108760132124e-07, "loss": 0.82588565, "num_input_tokens_seen": 283350000, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.22875977, "step": 13131, "time_per_iteration": 2.7640187740325928 }, { "auxiliary_loss_clip": 0.01298344, "auxiliary_loss_mlp": 0.00232407, "balance_loss_clip": 1.05476594, "balance_loss_mlp": 0.20335577, "epoch": 0.7895385540357733, "flos": 20266546550400.0, "grad_norm": 12.027626235370969, "language_loss": 0.83917534, "learning_rate": 4.4681569586418153e-07, "loss": 0.85448283, "num_input_tokens_seen": 283368020, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.29040527, "step": 13132, "time_per_iteration": 2.6529951095581055 }, { "auxiliary_loss_clip": 0.01246538, "auxiliary_loss_mlp": 0.00215539, "balance_loss_clip": 1.02457094, "balance_loss_mlp": 0.18964721, "epoch": 0.7895986772884414, "flos": 20996574566400.0, "grad_norm": 109.54053729387593, "language_loss": 0.71526027, "learning_rate": 4.465703630239468e-07, "loss": 0.72988105, "num_input_tokens_seen": 283387030, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.25891113, "step": 13133, "time_per_iteration": 2.7360644340515137 }, { "auxiliary_loss_clip": 0.01281821, "auxiliary_loss_mlp": 0.00224337, "balance_loss_clip": 1.04716468, "balance_loss_mlp": 0.1971333, "epoch": 0.7896588005411093, "flos": 18657999694080.0, "grad_norm": 10.911504595053357, "language_loss": 0.93020445, "learning_rate": 4.463250890899195e-07, "loss": 0.94526613, "num_input_tokens_seen": 283402090, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.27233887, "step": 13134, "time_per_iteration": 2.6385700702667236 }, { "auxiliary_loss_clip": 0.01251974, "auxiliary_loss_mlp": 0.00207788, "balance_loss_clip": 1.03216195, "balance_loss_mlp": 0.18239652, "epoch": 0.7897189237937773, "flos": 18405907067520.0, "grad_norm": 21.440628764261056, "language_loss": 0.88818669, "learning_rate": 4.460798740713998e-07, "loss": 0.90278435, "num_input_tokens_seen": 283421035, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.25366211, "step": 13135, "time_per_iteration": 2.690096378326416 }, { "auxiliary_loss_clip": 0.01240022, "auxiliary_loss_mlp": 0.00211723, "balance_loss_clip": 1.02267337, "balance_loss_mlp": 0.18590248, "epoch": 0.7897790470464452, "flos": 23731602825600.0, "grad_norm": 7.297266625746801, "language_loss": 0.7847479, "learning_rate": 4.4583471797768733e-07, "loss": 0.79926533, "num_input_tokens_seen": 283441830, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25842285, "step": 13136, "time_per_iteration": 2.7181344032287598 }, { "auxiliary_loss_clip": 0.01259345, "auxiliary_loss_mlp": 0.0023445, "balance_loss_clip": 1.03295732, "balance_loss_mlp": 0.20853405, "epoch": 0.7898391702991132, "flos": 15918949111680.0, "grad_norm": 8.703370579101698, "language_loss": 0.82353795, "learning_rate": 4.455896208180778e-07, "loss": 0.83847582, "num_input_tokens_seen": 283459540, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.25915527, "step": 13137, "time_per_iteration": 2.6891896724700928 }, { "auxiliary_loss_clip": 0.01237656, "auxiliary_loss_mlp": 0.00191615, "balance_loss_clip": 1.02043045, "balance_loss_mlp": 0.16674845, "epoch": 0.7898992935517811, "flos": 19829046896640.0, "grad_norm": 5.157149637968335, "language_loss": 0.83120191, "learning_rate": 4.4534458260186645e-07, "loss": 0.84549463, "num_input_tokens_seen": 283478790, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24853516, "step": 13138, "time_per_iteration": 2.6567556858062744 }, { "auxiliary_loss_clip": 0.01236067, "auxiliary_loss_mlp": 0.00215149, "balance_loss_clip": 1.01839113, "balance_loss_mlp": 0.19296461, "epoch": 0.7899594168044491, "flos": 16216253982720.0, "grad_norm": 277.4102441836117, "language_loss": 0.76490963, "learning_rate": 4.4509960333834426e-07, "loss": 0.77942175, "num_input_tokens_seen": 283495720, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.22192383, "step": 13139, "time_per_iteration": 2.6820759773254395 }, { "auxiliary_loss_clip": 0.01116914, "auxiliary_loss_mlp": 0.00143769, "balance_loss_clip": 0.96133077, "balance_loss_mlp": 0.13594885, "epoch": 0.790019540057117, "flos": 68331005959680.0, "grad_norm": 0.8653991395872381, "language_loss": 0.59560609, "learning_rate": 4.448546830368003e-07, "loss": 0.60821289, "num_input_tokens_seen": 283558795, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.078125, "step": 13140, "time_per_iteration": 3.244492769241333 }, { "auxiliary_loss_clip": 0.01245114, "auxiliary_loss_mlp": 0.00214232, "balance_loss_clip": 1.0266788, "balance_loss_mlp": 0.18961535, "epoch": 0.7900796633097851, "flos": 30332773601280.0, "grad_norm": 6.965899795399672, "language_loss": 0.83940393, "learning_rate": 4.4460982170652304e-07, "loss": 0.85399741, "num_input_tokens_seen": 283579305, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24609375, "step": 13141, "time_per_iteration": 2.7538228034973145 }, { "auxiliary_loss_clip": 0.01245932, "auxiliary_loss_mlp": 0.00233424, "balance_loss_clip": 1.0266813, "balance_loss_mlp": 0.20930827, "epoch": 0.790139786562453, "flos": 22126790983680.0, "grad_norm": 224.6797147980219, "language_loss": 0.755557, "learning_rate": 4.4436501935679694e-07, "loss": 0.77035058, "num_input_tokens_seen": 283597840, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24145508, "step": 13142, "time_per_iteration": 2.6963860988616943 }, { "auxiliary_loss_clip": 0.01120505, "auxiliary_loss_mlp": 0.00130961, "balance_loss_clip": 0.96438807, "balance_loss_mlp": 0.12395179, "epoch": 0.790199909815121, "flos": 58207284213120.0, "grad_norm": 1.763560783369387, "language_loss": 0.59229243, "learning_rate": 4.441202759969049e-07, "loss": 0.60480714, "num_input_tokens_seen": 283647950, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.0703125, "step": 13143, "time_per_iteration": 4.3365209102630615 }, { "auxiliary_loss_clip": 0.01248984, "auxiliary_loss_mlp": 0.00212757, "balance_loss_clip": 1.0304693, "balance_loss_mlp": 0.18781862, "epoch": 0.7902600330677889, "flos": 34533316759680.0, "grad_norm": 5.112342301571589, "language_loss": 0.80957496, "learning_rate": 4.4387559163612875e-07, "loss": 0.82419235, "num_input_tokens_seen": 283670645, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24951172, "step": 13144, "time_per_iteration": 4.173142433166504 }, { "auxiliary_loss_clip": 0.01248011, "auxiliary_loss_mlp": 0.00216706, "balance_loss_clip": 1.02486897, "balance_loss_mlp": 0.19017038, "epoch": 0.7903201563204569, "flos": 22346384780160.0, "grad_norm": 86.44753568940614, "language_loss": 0.91272599, "learning_rate": 4.4363096628374605e-07, "loss": 0.92737317, "num_input_tokens_seen": 283688830, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26538086, "step": 13145, "time_per_iteration": 2.6889121532440186 }, { "auxiliary_loss_clip": 0.01232988, "auxiliary_loss_mlp": 0.00212511, "balance_loss_clip": 1.01718891, "balance_loss_mlp": 0.18870467, "epoch": 0.790380279573125, "flos": 22053533195520.0, "grad_norm": 38.16247682095744, "language_loss": 0.81929249, "learning_rate": 4.4338639994903235e-07, "loss": 0.83374751, "num_input_tokens_seen": 283708625, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.23791504, "step": 13146, "time_per_iteration": 2.683138370513916 }, { "auxiliary_loss_clip": 0.01243475, "auxiliary_loss_mlp": 0.00220908, "balance_loss_clip": 1.02109373, "balance_loss_mlp": 0.19745927, "epoch": 0.7904404028257929, "flos": 20302600826880.0, "grad_norm": 23.53552496340858, "language_loss": 0.84964561, "learning_rate": 4.4314189264126246e-07, "loss": 0.8642894, "num_input_tokens_seen": 283725710, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.23461914, "step": 13147, "time_per_iteration": 2.695978879928589 }, { "auxiliary_loss_clip": 0.01230792, "auxiliary_loss_mlp": 0.00215251, "balance_loss_clip": 1.01400828, "balance_loss_mlp": 0.19055064, "epoch": 0.7905005260784609, "flos": 20008923229440.0, "grad_norm": 39.2000993983356, "language_loss": 0.79277962, "learning_rate": 4.428974443697087e-07, "loss": 0.80724001, "num_input_tokens_seen": 283744150, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24682617, "step": 13148, "time_per_iteration": 2.671116352081299 }, { "auxiliary_loss_clip": 0.01249605, "auxiliary_loss_mlp": 0.00211103, "balance_loss_clip": 1.02770734, "balance_loss_mlp": 0.18625955, "epoch": 0.7905606493311288, "flos": 26905926418560.0, "grad_norm": 22.198634697221973, "language_loss": 0.80182499, "learning_rate": 4.4265305514363913e-07, "loss": 0.81643206, "num_input_tokens_seen": 283764170, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.24853516, "step": 13149, "time_per_iteration": 4.245004653930664 }, { "auxiliary_loss_clip": 0.01259075, "auxiliary_loss_mlp": 0.00232606, "balance_loss_clip": 1.02790499, "balance_loss_mlp": 0.20609362, "epoch": 0.7906207725837968, "flos": 23696230907520.0, "grad_norm": 120.1933606834315, "language_loss": 0.74069345, "learning_rate": 4.424087249723225e-07, "loss": 0.75561023, "num_input_tokens_seen": 283784305, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.26513672, "step": 13150, "time_per_iteration": 2.750566005706787 }, { "auxiliary_loss_clip": 0.01229731, "auxiliary_loss_mlp": 0.00206046, "balance_loss_clip": 1.01613855, "balance_loss_mlp": 0.18331242, "epoch": 0.7906808958364647, "flos": 20848837927680.0, "grad_norm": 5.366986753067599, "language_loss": 0.78122079, "learning_rate": 4.421644538650231e-07, "loss": 0.79557854, "num_input_tokens_seen": 283804040, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.22729492, "step": 13151, "time_per_iteration": 2.796795129776001 }, { "auxiliary_loss_clip": 0.01252112, "auxiliary_loss_mlp": 0.0022148, "balance_loss_clip": 1.02905989, "balance_loss_mlp": 0.1980079, "epoch": 0.7907410190891327, "flos": 40735196974080.0, "grad_norm": 2825.305667922748, "language_loss": 0.77175272, "learning_rate": 4.4192024183100306e-07, "loss": 0.78648865, "num_input_tokens_seen": 283827120, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.23498535, "step": 13152, "time_per_iteration": 2.885089635848999 }, { "auxiliary_loss_clip": 0.01252121, "auxiliary_loss_mlp": 0.00206774, "balance_loss_clip": 1.0329206, "balance_loss_mlp": 0.18368277, "epoch": 0.7908011423418007, "flos": 13261165050240.0, "grad_norm": 12.785550469853707, "language_loss": 0.81839502, "learning_rate": 4.4167608887952367e-07, "loss": 0.83298397, "num_input_tokens_seen": 283844820, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.2310791, "step": 13153, "time_per_iteration": 2.6655986309051514 }, { "auxiliary_loss_clip": 0.01257073, "auxiliary_loss_mlp": 0.00213389, "balance_loss_clip": 1.03523517, "balance_loss_mlp": 0.1884149, "epoch": 0.7908612655944687, "flos": 19754747614080.0, "grad_norm": 27.55603175311605, "language_loss": 0.85902524, "learning_rate": 4.4143199501984306e-07, "loss": 0.87372983, "num_input_tokens_seen": 283862870, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24951172, "step": 13154, "time_per_iteration": 4.128228425979614 }, { "auxiliary_loss_clip": 0.01273522, "auxiliary_loss_mlp": 0.00220697, "balance_loss_clip": 1.03573024, "balance_loss_mlp": 0.1944351, "epoch": 0.7909213888471366, "flos": 21287738211840.0, "grad_norm": 60.99277426490816, "language_loss": 0.82499558, "learning_rate": 4.411879602612185e-07, "loss": 0.83993781, "num_input_tokens_seen": 283882405, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.26269531, "step": 13155, "time_per_iteration": 2.714562177658081 }, { "auxiliary_loss_clip": 0.01253492, "auxiliary_loss_mlp": 0.00197817, "balance_loss_clip": 1.03034902, "balance_loss_mlp": 0.17235386, "epoch": 0.7909815120998046, "flos": 22528882805760.0, "grad_norm": 19.17917202892157, "language_loss": 0.8447305, "learning_rate": 4.4094398461290174e-07, "loss": 0.85924351, "num_input_tokens_seen": 283902070, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25476074, "step": 13156, "time_per_iteration": 2.755249261856079 }, { "auxiliary_loss_clip": 0.01235883, "auxiliary_loss_mlp": 0.00191411, "balance_loss_clip": 1.01635957, "balance_loss_mlp": 0.16654375, "epoch": 0.7910416353524725, "flos": 26727702111360.0, "grad_norm": 73.62265239592028, "language_loss": 0.71521187, "learning_rate": 4.4070006808414526e-07, "loss": 0.7294848, "num_input_tokens_seen": 283924100, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.24890137, "step": 13157, "time_per_iteration": 2.7404820919036865 }, { "auxiliary_loss_clip": 0.01248506, "auxiliary_loss_mlp": 0.00222514, "balance_loss_clip": 1.02491736, "balance_loss_mlp": 0.19724202, "epoch": 0.7911017586051405, "flos": 24644847139200.0, "grad_norm": 4.330373624204686, "language_loss": 0.82718408, "learning_rate": 4.4045621068419894e-07, "loss": 0.84189427, "num_input_tokens_seen": 283944955, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.25256348, "step": 13158, "time_per_iteration": 2.7005462646484375 }, { "auxiliary_loss_clip": 0.01224128, "auxiliary_loss_mlp": 0.00192748, "balance_loss_clip": 1.01337028, "balance_loss_mlp": 0.17033666, "epoch": 0.7911618818578086, "flos": 17565489578880.0, "grad_norm": 72.34876072229818, "language_loss": 0.78789663, "learning_rate": 4.40212412422309e-07, "loss": 0.80206537, "num_input_tokens_seen": 283963125, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.22412109, "step": 13159, "time_per_iteration": 2.626145362854004 }, { "auxiliary_loss_clip": 0.01243581, "auxiliary_loss_mlp": 0.00208023, "balance_loss_clip": 1.02116323, "balance_loss_mlp": 0.1827271, "epoch": 0.7912220051104765, "flos": 16721660298240.0, "grad_norm": 3.3629252908345726, "language_loss": 0.74565017, "learning_rate": 4.399686733077206e-07, "loss": 0.76016629, "num_input_tokens_seen": 283982850, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25305176, "step": 13160, "time_per_iteration": 2.6828629970550537 }, { "auxiliary_loss_clip": 0.01217916, "auxiliary_loss_mlp": 0.00207833, "balance_loss_clip": 1.00897551, "balance_loss_mlp": 0.18591064, "epoch": 0.7912821283631445, "flos": 13698736531200.0, "grad_norm": 18.340203382754183, "language_loss": 0.80170017, "learning_rate": 4.3972499334967694e-07, "loss": 0.81595773, "num_input_tokens_seen": 283998275, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.21923828, "step": 13161, "time_per_iteration": 2.702373504638672 }, { "auxiliary_loss_clip": 0.01232035, "auxiliary_loss_mlp": 0.0020021, "balance_loss_clip": 1.01702118, "balance_loss_mlp": 0.17769149, "epoch": 0.7913422516158124, "flos": 23769021818880.0, "grad_norm": 5.5870736735245785, "language_loss": 0.81671053, "learning_rate": 4.39481372557418e-07, "loss": 0.83103293, "num_input_tokens_seen": 284018750, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.22521973, "step": 13162, "time_per_iteration": 2.771310806274414 }, { "auxiliary_loss_clip": 0.01262062, "auxiliary_loss_mlp": 0.00207527, "balance_loss_clip": 1.03806627, "balance_loss_mlp": 0.18170607, "epoch": 0.7914023748684804, "flos": 19938251220480.0, "grad_norm": 56.46613946293597, "language_loss": 0.79814899, "learning_rate": 4.392378109401811e-07, "loss": 0.81284487, "num_input_tokens_seen": 284037850, "router_z_loss_clip": 2.24121094, "router_z_loss_mlp": 0.25842285, "step": 13163, "time_per_iteration": 2.6334853172302246 }, { "auxiliary_loss_clip": 0.01236097, "auxiliary_loss_mlp": 0.00192129, "balance_loss_clip": 1.02058673, "balance_loss_mlp": 0.16802511, "epoch": 0.7914624981211483, "flos": 20594805966720.0, "grad_norm": 6.670564976617729, "language_loss": 0.80734742, "learning_rate": 4.3899430850720296e-07, "loss": 0.8216297, "num_input_tokens_seen": 284056380, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.2409668, "step": 13164, "time_per_iteration": 2.6926498413085938 }, { "auxiliary_loss_clip": 0.01245384, "auxiliary_loss_mlp": 0.00210715, "balance_loss_clip": 1.02469134, "balance_loss_mlp": 0.18658713, "epoch": 0.7915226213738163, "flos": 21799465320960.0, "grad_norm": 27.839520021736906, "language_loss": 0.75325143, "learning_rate": 4.387508652677177e-07, "loss": 0.76781237, "num_input_tokens_seen": 284074945, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24108887, "step": 13165, "time_per_iteration": 2.6458113193511963 }, { "auxiliary_loss_clip": 0.01215843, "auxiliary_loss_mlp": 0.00198295, "balance_loss_clip": 1.0081296, "balance_loss_mlp": 0.17698073, "epoch": 0.7915827446264843, "flos": 16288362535680.0, "grad_norm": 23.01351783449072, "language_loss": 0.79140174, "learning_rate": 4.385074812309557e-07, "loss": 0.80554318, "num_input_tokens_seen": 284092070, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.21313477, "step": 13166, "time_per_iteration": 2.6624362468719482 }, { "auxiliary_loss_clip": 0.01251044, "auxiliary_loss_mlp": 0.00216196, "balance_loss_clip": 1.02996492, "balance_loss_mlp": 0.19335584, "epoch": 0.7916428678791523, "flos": 25702595867520.0, "grad_norm": 6.824687890987323, "language_loss": 0.86259949, "learning_rate": 4.382641564061462e-07, "loss": 0.87727189, "num_input_tokens_seen": 284112255, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.22839355, "step": 13167, "time_per_iteration": 2.89607310295105 }, { "auxiliary_loss_clip": 0.01230473, "auxiliary_loss_mlp": 0.00209847, "balance_loss_clip": 1.01242733, "balance_loss_mlp": 0.18545653, "epoch": 0.7917029911318202, "flos": 23878513451520.0, "grad_norm": 2.361629165066597, "language_loss": 0.92184865, "learning_rate": 4.3802089080251713e-07, "loss": 0.93625188, "num_input_tokens_seen": 284132330, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24389648, "step": 13168, "time_per_iteration": 2.948854446411133 }, { "auxiliary_loss_clip": 0.01249058, "auxiliary_loss_mlp": 0.00196824, "balance_loss_clip": 1.02704024, "balance_loss_mlp": 0.17173012, "epoch": 0.7917631143844882, "flos": 21646593037440.0, "grad_norm": 2.6795114093287844, "language_loss": 0.81002945, "learning_rate": 4.3777768442929155e-07, "loss": 0.82448828, "num_input_tokens_seen": 284150640, "router_z_loss_clip": 2.22167969, "router_z_loss_mlp": 0.25109863, "step": 13169, "time_per_iteration": 2.7106986045837402 }, { "auxiliary_loss_clip": 0.01252743, "auxiliary_loss_mlp": 0.00206429, "balance_loss_clip": 1.02745843, "balance_loss_mlp": 0.18252765, "epoch": 0.7918232376371561, "flos": 38874198355200.0, "grad_norm": 5.883455657817193, "language_loss": 0.77751338, "learning_rate": 4.3753453729569287e-07, "loss": 0.79210508, "num_input_tokens_seen": 284171910, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.23913574, "step": 13170, "time_per_iteration": 2.789294481277466 }, { "auxiliary_loss_clip": 0.01227806, "auxiliary_loss_mlp": 0.00200479, "balance_loss_clip": 1.01495171, "balance_loss_mlp": 0.17888981, "epoch": 0.7918833608898241, "flos": 20775544225920.0, "grad_norm": 14.732063678976541, "language_loss": 0.78285348, "learning_rate": 4.372914494109412e-07, "loss": 0.79713631, "num_input_tokens_seen": 284191340, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.21606445, "step": 13171, "time_per_iteration": 2.6892974376678467 }, { "auxiliary_loss_clip": 0.01239914, "auxiliary_loss_mlp": 0.0020925, "balance_loss_clip": 1.02061033, "balance_loss_mlp": 0.18512245, "epoch": 0.7919434841424922, "flos": 33910122769920.0, "grad_norm": 10.134422282176, "language_loss": 0.76717472, "learning_rate": 4.370484207842553e-07, "loss": 0.78166634, "num_input_tokens_seen": 284212495, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24157715, "step": 13172, "time_per_iteration": 2.7374603748321533 }, { "auxiliary_loss_clip": 0.01224686, "auxiliary_loss_mlp": 0.00199895, "balance_loss_clip": 1.00955963, "balance_loss_mlp": 0.17799667, "epoch": 0.7920036073951601, "flos": 21064660796160.0, "grad_norm": 29.289104506830444, "language_loss": 0.86052388, "learning_rate": 4.3680545142484893e-07, "loss": 0.87476969, "num_input_tokens_seen": 284230825, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.21899414, "step": 13173, "time_per_iteration": 2.7438483238220215 }, { "auxiliary_loss_clip": 0.01227213, "auxiliary_loss_mlp": 0.002092, "balance_loss_clip": 1.01473081, "balance_loss_mlp": 0.18653879, "epoch": 0.7920637306478281, "flos": 23655974739840.0, "grad_norm": 3.9399529907613378, "language_loss": 0.83291602, "learning_rate": 4.365625413419365e-07, "loss": 0.84728014, "num_input_tokens_seen": 284250365, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.22680664, "step": 13174, "time_per_iteration": 2.713083267211914 }, { "auxiliary_loss_clip": 0.01217537, "auxiliary_loss_mlp": 0.00193682, "balance_loss_clip": 1.00541139, "balance_loss_mlp": 0.17000748, "epoch": 0.792123853900496, "flos": 27195438038400.0, "grad_norm": 336.06490431539424, "language_loss": 0.76501799, "learning_rate": 4.363196905447297e-07, "loss": 0.77913022, "num_input_tokens_seen": 284269635, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.23669434, "step": 13175, "time_per_iteration": 2.7407984733581543 }, { "auxiliary_loss_clip": 0.01243053, "auxiliary_loss_mlp": 0.00208765, "balance_loss_clip": 1.02545094, "balance_loss_mlp": 0.1866276, "epoch": 0.792183977153164, "flos": 19098659744640.0, "grad_norm": 7.59355990813891, "language_loss": 0.69647914, "learning_rate": 4.360768990424364e-07, "loss": 0.71099734, "num_input_tokens_seen": 284288380, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.22155762, "step": 13176, "time_per_iteration": 2.6199002265930176 }, { "auxiliary_loss_clip": 0.01234879, "auxiliary_loss_mlp": 0.0018657, "balance_loss_clip": 1.02074289, "balance_loss_mlp": 0.16438517, "epoch": 0.7922441004058319, "flos": 17128851851520.0, "grad_norm": 2.305721745379097, "language_loss": 0.83627439, "learning_rate": 4.3583416684426376e-07, "loss": 0.8504889, "num_input_tokens_seen": 284306920, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.22192383, "step": 13177, "time_per_iteration": 2.695570468902588 }, { "auxiliary_loss_clip": 0.01236885, "auxiliary_loss_mlp": 0.00187289, "balance_loss_clip": 1.02313697, "balance_loss_mlp": 0.16467464, "epoch": 0.7923042236585, "flos": 17821640442240.0, "grad_norm": 15.416012550430255, "language_loss": 0.73075724, "learning_rate": 4.355914939594174e-07, "loss": 0.74499893, "num_input_tokens_seen": 284324700, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.22607422, "step": 13178, "time_per_iteration": 2.620849370956421 }, { "auxiliary_loss_clip": 0.01232952, "auxiliary_loss_mlp": 0.00190259, "balance_loss_clip": 1.01799262, "balance_loss_mlp": 0.16760913, "epoch": 0.7923643469111679, "flos": 29935206892800.0, "grad_norm": 112.6410339015841, "language_loss": 0.76272768, "learning_rate": 4.3534888039709726e-07, "loss": 0.77695984, "num_input_tokens_seen": 284345985, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.22680664, "step": 13179, "time_per_iteration": 2.7470221519470215 }, { "auxiliary_loss_clip": 0.01258509, "auxiliary_loss_mlp": 0.00227193, "balance_loss_clip": 1.03357983, "balance_loss_mlp": 0.19996569, "epoch": 0.7924244701638359, "flos": 22674716023680.0, "grad_norm": 5.928043640276092, "language_loss": 0.85032141, "learning_rate": 4.3510632616650444e-07, "loss": 0.86517847, "num_input_tokens_seen": 284364475, "router_z_loss_clip": 2.25097656, "router_z_loss_mlp": 0.2722168, "step": 13180, "time_per_iteration": 2.673217296600342 }, { "auxiliary_loss_clip": 0.01252826, "auxiliary_loss_mlp": 0.00210144, "balance_loss_clip": 1.03362322, "balance_loss_mlp": 0.1856699, "epoch": 0.7924845934165038, "flos": 17968156018560.0, "grad_norm": 10.196099968253366, "language_loss": 0.90316534, "learning_rate": 4.3486383127683646e-07, "loss": 0.91779506, "num_input_tokens_seen": 284382125, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24487305, "step": 13181, "time_per_iteration": 2.6615023612976074 }, { "auxiliary_loss_clip": 0.01228021, "auxiliary_loss_mlp": 0.00206846, "balance_loss_clip": 1.01442766, "balance_loss_mlp": 0.18327856, "epoch": 0.7925447166691718, "flos": 23476960333440.0, "grad_norm": 80.87985500191917, "language_loss": 0.84591544, "learning_rate": 4.346213957372895e-07, "loss": 0.86026406, "num_input_tokens_seen": 284401585, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.23571777, "step": 13182, "time_per_iteration": 2.708810567855835 }, { "auxiliary_loss_clip": 0.01277994, "auxiliary_loss_mlp": 0.00225701, "balance_loss_clip": 1.04214764, "balance_loss_mlp": 0.19843769, "epoch": 0.7926048399218397, "flos": 20447572118400.0, "grad_norm": 12.67059254104185, "language_loss": 0.84017777, "learning_rate": 4.34379019557056e-07, "loss": 0.85521472, "num_input_tokens_seen": 284419125, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.27258301, "step": 13183, "time_per_iteration": 2.663673162460327 }, { "auxiliary_loss_clip": 0.01257402, "auxiliary_loss_mlp": 0.00212196, "balance_loss_clip": 1.03160453, "balance_loss_mlp": 0.18796109, "epoch": 0.7926649631745077, "flos": 37160038535040.0, "grad_norm": 27.688113463298706, "language_loss": 0.78536052, "learning_rate": 4.341367027453264e-07, "loss": 0.80005652, "num_input_tokens_seen": 284440445, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.24267578, "step": 13184, "time_per_iteration": 2.9393982887268066 }, { "auxiliary_loss_clip": 0.01250622, "auxiliary_loss_mlp": 0.00209306, "balance_loss_clip": 1.02900505, "balance_loss_mlp": 0.18470135, "epoch": 0.7927250864271758, "flos": 17018606033280.0, "grad_norm": 58.15230256867171, "language_loss": 0.807877, "learning_rate": 4.338944453112907e-07, "loss": 0.82247627, "num_input_tokens_seen": 284459370, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.24621582, "step": 13185, "time_per_iteration": 4.127445220947266 }, { "auxiliary_loss_clip": 0.01246261, "auxiliary_loss_mlp": 0.00211811, "balance_loss_clip": 1.02436459, "balance_loss_mlp": 0.18705113, "epoch": 0.7927852096798437, "flos": 17749208666880.0, "grad_norm": 97.7198930677389, "language_loss": 0.74513733, "learning_rate": 4.3365224726413375e-07, "loss": 0.759718, "num_input_tokens_seen": 284477525, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24743652, "step": 13186, "time_per_iteration": 2.612457275390625 }, { "auxiliary_loss_clip": 0.01231266, "auxiliary_loss_mlp": 0.00210116, "balance_loss_clip": 1.01809263, "balance_loss_mlp": 0.18715589, "epoch": 0.7928453329325117, "flos": 23838436851840.0, "grad_norm": 62.64566417811416, "language_loss": 0.8240723, "learning_rate": 4.334101086130408e-07, "loss": 0.83848614, "num_input_tokens_seen": 284496590, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.22973633, "step": 13187, "time_per_iteration": 4.140455484390259 }, { "auxiliary_loss_clip": 0.01229977, "auxiliary_loss_mlp": 0.00208689, "balance_loss_clip": 1.01439929, "balance_loss_mlp": 0.18502587, "epoch": 0.7929054561851796, "flos": 17454920538240.0, "grad_norm": 23.054145446397193, "language_loss": 0.81074381, "learning_rate": 4.3316802936719334e-07, "loss": 0.82513052, "num_input_tokens_seen": 284511470, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.2364502, "step": 13188, "time_per_iteration": 2.587362051010132 }, { "auxiliary_loss_clip": 0.01246613, "auxiliary_loss_mlp": 0.0021818, "balance_loss_clip": 1.02142119, "balance_loss_mlp": 0.19080992, "epoch": 0.7929655794378476, "flos": 21981280988160.0, "grad_norm": 261.1744806867916, "language_loss": 0.74255085, "learning_rate": 4.329260095357725e-07, "loss": 0.75719875, "num_input_tokens_seen": 284531125, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.2734375, "step": 13189, "time_per_iteration": 2.6871156692504883 }, { "auxiliary_loss_clip": 0.01235591, "auxiliary_loss_mlp": 0.00210785, "balance_loss_clip": 1.01297092, "balance_loss_mlp": 0.18726525, "epoch": 0.7930257026905155, "flos": 17273930883840.0, "grad_norm": 19.950613366442127, "language_loss": 0.83568513, "learning_rate": 4.3268404912795307e-07, "loss": 0.85014886, "num_input_tokens_seen": 284549340, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.23510742, "step": 13190, "time_per_iteration": 2.6609575748443604 }, { "auxiliary_loss_clip": 0.01238991, "auxiliary_loss_mlp": 0.00203032, "balance_loss_clip": 1.02864206, "balance_loss_mlp": 0.18066819, "epoch": 0.7930858259431836, "flos": 27300584125440.0, "grad_norm": 18.692852160756857, "language_loss": 0.79087365, "learning_rate": 4.3244214815291166e-07, "loss": 0.8052938, "num_input_tokens_seen": 284567060, "router_z_loss_clip": 2.10253906, "router_z_loss_mlp": 0.22351074, "step": 13191, "time_per_iteration": 4.354615688323975 }, { "auxiliary_loss_clip": 0.01259762, "auxiliary_loss_mlp": 0.00221318, "balance_loss_clip": 1.03346813, "balance_loss_mlp": 0.19577141, "epoch": 0.7931459491958515, "flos": 19863736456320.0, "grad_norm": 5.748731444252395, "language_loss": 0.7596311, "learning_rate": 4.322003066198219e-07, "loss": 0.7744419, "num_input_tokens_seen": 284586600, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.25549316, "step": 13192, "time_per_iteration": 2.6866166591644287 }, { "auxiliary_loss_clip": 0.01255357, "auxiliary_loss_mlp": 0.00217831, "balance_loss_clip": 1.03312588, "balance_loss_mlp": 0.19357198, "epoch": 0.7932060724485195, "flos": 23147120718720.0, "grad_norm": 2.1316742100931996, "language_loss": 0.82553607, "learning_rate": 4.3195852453785274e-07, "loss": 0.84026796, "num_input_tokens_seen": 284605715, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.24279785, "step": 13193, "time_per_iteration": 2.709317684173584 }, { "auxiliary_loss_clip": 0.01249201, "auxiliary_loss_mlp": 0.00225588, "balance_loss_clip": 1.03111243, "balance_loss_mlp": 0.2017104, "epoch": 0.7932661957011874, "flos": 29934847756800.0, "grad_norm": 12.473432993562028, "language_loss": 0.79882789, "learning_rate": 4.317168019161741e-07, "loss": 0.81357574, "num_input_tokens_seen": 284628540, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.23876953, "step": 13194, "time_per_iteration": 2.8075287342071533 }, { "auxiliary_loss_clip": 0.01262409, "auxiliary_loss_mlp": 0.00223307, "balance_loss_clip": 1.03632534, "balance_loss_mlp": 0.19793949, "epoch": 0.7933263189538554, "flos": 22559119079040.0, "grad_norm": 66.74401134756553, "language_loss": 0.78635406, "learning_rate": 4.314751387639517e-07, "loss": 0.80121118, "num_input_tokens_seen": 284646040, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.25366211, "step": 13195, "time_per_iteration": 2.7143137454986572 }, { "auxiliary_loss_clip": 0.01252222, "auxiliary_loss_mlp": 0.00214503, "balance_loss_clip": 1.02815485, "balance_loss_mlp": 0.18883723, "epoch": 0.7933864422065233, "flos": 25479051575040.0, "grad_norm": 20.6456130027667, "language_loss": 0.85126543, "learning_rate": 4.3123353509034844e-07, "loss": 0.8659327, "num_input_tokens_seen": 284665110, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.2565918, "step": 13196, "time_per_iteration": 4.142575025558472 }, { "auxiliary_loss_clip": 0.01257809, "auxiliary_loss_mlp": 0.00212112, "balance_loss_clip": 1.03633583, "balance_loss_mlp": 0.185707, "epoch": 0.7934465654591913, "flos": 33583156243200.0, "grad_norm": 5.59741657169601, "language_loss": 0.76953828, "learning_rate": 4.309919909045268e-07, "loss": 0.7842375, "num_input_tokens_seen": 284686515, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.26403809, "step": 13197, "time_per_iteration": 2.8046646118164062 }, { "auxiliary_loss_clip": 0.0124231, "auxiliary_loss_mlp": 0.00208924, "balance_loss_clip": 1.02434397, "balance_loss_mlp": 0.18297246, "epoch": 0.7935066887118594, "flos": 31432538263680.0, "grad_norm": 17.536607378824073, "language_loss": 0.74391472, "learning_rate": 4.30750506215646e-07, "loss": 0.75842708, "num_input_tokens_seen": 284707300, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.25952148, "step": 13198, "time_per_iteration": 2.780355215072632 }, { "auxiliary_loss_clip": 0.01247987, "auxiliary_loss_mlp": 0.00213172, "balance_loss_clip": 1.0222739, "balance_loss_mlp": 0.18614721, "epoch": 0.7935668119645273, "flos": 14682616940160.0, "grad_norm": 1532.1523214880265, "language_loss": 0.83622634, "learning_rate": 4.30509081032864e-07, "loss": 0.85083795, "num_input_tokens_seen": 284723545, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.27038574, "step": 13199, "time_per_iteration": 2.644821882247925 }, { "auxiliary_loss_clip": 0.01247623, "auxiliary_loss_mlp": 0.00209656, "balance_loss_clip": 1.02907395, "balance_loss_mlp": 0.18545701, "epoch": 0.7936269352171953, "flos": 18004246208640.0, "grad_norm": 3.697888002567617, "language_loss": 0.88435066, "learning_rate": 4.302677153653349e-07, "loss": 0.89892352, "num_input_tokens_seen": 284742650, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24206543, "step": 13200, "time_per_iteration": 2.648928165435791 }, { "auxiliary_loss_clip": 0.01234573, "auxiliary_loss_mlp": 0.00199022, "balance_loss_clip": 1.02245903, "balance_loss_mlp": 0.17709893, "epoch": 0.7936870584698632, "flos": 18880215183360.0, "grad_norm": 24.47996604661523, "language_loss": 0.82951379, "learning_rate": 4.3002640922221077e-07, "loss": 0.84384978, "num_input_tokens_seen": 284760955, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.21923828, "step": 13201, "time_per_iteration": 2.610272169113159 }, { "auxiliary_loss_clip": 0.01245044, "auxiliary_loss_mlp": 0.00213825, "balance_loss_clip": 1.02506566, "balance_loss_mlp": 0.18889804, "epoch": 0.7937471817225312, "flos": 23367001824000.0, "grad_norm": 30.200425334299304, "language_loss": 0.75145042, "learning_rate": 4.2978516261264296e-07, "loss": 0.76603913, "num_input_tokens_seen": 284780745, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.24938965, "step": 13202, "time_per_iteration": 2.6895530223846436 }, { "auxiliary_loss_clip": 0.01241447, "auxiliary_loss_mlp": 0.00222387, "balance_loss_clip": 1.02177119, "balance_loss_mlp": 0.19666177, "epoch": 0.7938073049751991, "flos": 22674428714880.0, "grad_norm": 6.443916083163891, "language_loss": 0.82362688, "learning_rate": 4.2954397554577884e-07, "loss": 0.83826524, "num_input_tokens_seen": 284799000, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25732422, "step": 13203, "time_per_iteration": 2.634995460510254 }, { "auxiliary_loss_clip": 0.01261458, "auxiliary_loss_mlp": 0.00215912, "balance_loss_clip": 1.04227877, "balance_loss_mlp": 0.19172452, "epoch": 0.7938674282278672, "flos": 22851431959680.0, "grad_norm": 199.7582469653167, "language_loss": 0.77239949, "learning_rate": 4.293028480307643e-07, "loss": 0.78717315, "num_input_tokens_seen": 284817450, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24206543, "step": 13204, "time_per_iteration": 2.6831390857696533 }, { "auxiliary_loss_clip": 0.01232503, "auxiliary_loss_mlp": 0.00220603, "balance_loss_clip": 1.01867604, "balance_loss_mlp": 0.19690405, "epoch": 0.7939275514805351, "flos": 27012509049600.0, "grad_norm": 20.840402908972248, "language_loss": 0.83891642, "learning_rate": 4.290617800767438e-07, "loss": 0.85344756, "num_input_tokens_seen": 284838865, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.23681641, "step": 13205, "time_per_iteration": 2.7248647212982178 }, { "auxiliary_loss_clip": 0.01240337, "auxiliary_loss_mlp": 0.00208007, "balance_loss_clip": 1.02185595, "balance_loss_mlp": 0.18429592, "epoch": 0.7939876747332031, "flos": 21142838747520.0, "grad_norm": 4.426987165718106, "language_loss": 0.84820265, "learning_rate": 4.28820771692858e-07, "loss": 0.86268616, "num_input_tokens_seen": 284857975, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.23730469, "step": 13206, "time_per_iteration": 2.6756465435028076 }, { "auxiliary_loss_clip": 0.01262079, "auxiliary_loss_mlp": 0.00205245, "balance_loss_clip": 1.03710186, "balance_loss_mlp": 0.18031834, "epoch": 0.794047797985871, "flos": 23289075267840.0, "grad_norm": 4.393236190342007, "language_loss": 0.86152279, "learning_rate": 4.285798228882456e-07, "loss": 0.87619603, "num_input_tokens_seen": 284877145, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.24938965, "step": 13207, "time_per_iteration": 2.741239309310913 }, { "auxiliary_loss_clip": 0.01250608, "auxiliary_loss_mlp": 0.0020701, "balance_loss_clip": 1.03106022, "balance_loss_mlp": 0.18141584, "epoch": 0.794107921238539, "flos": 24608074590720.0, "grad_norm": 5.868907872537064, "language_loss": 0.90563428, "learning_rate": 4.2833893367204375e-07, "loss": 0.92021048, "num_input_tokens_seen": 284895560, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25622559, "step": 13208, "time_per_iteration": 2.651477336883545 }, { "auxiliary_loss_clip": 0.01123604, "auxiliary_loss_mlp": 0.00062154, "balance_loss_clip": 0.97412688, "balance_loss_mlp": 0.05414264, "epoch": 0.7941680444912069, "flos": 64093690252800.0, "grad_norm": 0.7252432645717167, "language_loss": 0.57905078, "learning_rate": 4.280981040533875e-07, "loss": 0.59090841, "num_input_tokens_seen": 284963135, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.08007812, "step": 13209, "time_per_iteration": 3.232123613357544 }, { "auxiliary_loss_clip": 0.01256523, "auxiliary_loss_mlp": 0.00219065, "balance_loss_clip": 1.03098381, "balance_loss_mlp": 0.19415006, "epoch": 0.794228167743875, "flos": 24388839930240.0, "grad_norm": 28.496231512800005, "language_loss": 0.72121608, "learning_rate": 4.2785733404140825e-07, "loss": 0.73597193, "num_input_tokens_seen": 284981755, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.24926758, "step": 13210, "time_per_iteration": 2.6655592918395996 }, { "auxiliary_loss_clip": 0.01231483, "auxiliary_loss_mlp": 0.00196521, "balance_loss_clip": 1.01876211, "balance_loss_mlp": 0.17349012, "epoch": 0.794288290996543, "flos": 28512498026880.0, "grad_norm": 162.71740213261276, "language_loss": 0.76888514, "learning_rate": 4.2761662364523676e-07, "loss": 0.78316516, "num_input_tokens_seen": 285003060, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.23022461, "step": 13211, "time_per_iteration": 2.7732200622558594 }, { "auxiliary_loss_clip": 0.01253916, "auxiliary_loss_mlp": 0.00231163, "balance_loss_clip": 1.02654088, "balance_loss_mlp": 0.20586649, "epoch": 0.7943484142492109, "flos": 25922117836800.0, "grad_norm": 32.546236792785756, "language_loss": 0.79482865, "learning_rate": 4.2737597287400074e-07, "loss": 0.80967945, "num_input_tokens_seen": 285021640, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.25280762, "step": 13212, "time_per_iteration": 2.7046329975128174 }, { "auxiliary_loss_clip": 0.01215762, "auxiliary_loss_mlp": 0.00198424, "balance_loss_clip": 1.00960922, "balance_loss_mlp": 0.17749101, "epoch": 0.7944085375018789, "flos": 23915286000000.0, "grad_norm": 8.352817809473706, "language_loss": 0.86004496, "learning_rate": 4.271353817368246e-07, "loss": 0.87418675, "num_input_tokens_seen": 285040490, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.20922852, "step": 13213, "time_per_iteration": 2.8237009048461914 }, { "auxiliary_loss_clip": 0.01259992, "auxiliary_loss_mlp": 0.00204988, "balance_loss_clip": 1.03695428, "balance_loss_mlp": 0.18112186, "epoch": 0.7944686607545468, "flos": 20229953569920.0, "grad_norm": 3747.4457343000236, "language_loss": 0.81679916, "learning_rate": 4.268948502428327e-07, "loss": 0.83144897, "num_input_tokens_seen": 285059270, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.23852539, "step": 13214, "time_per_iteration": 2.6512110233306885 }, { "auxiliary_loss_clip": 0.01224659, "auxiliary_loss_mlp": 0.00185573, "balance_loss_clip": 1.00659502, "balance_loss_mlp": 0.16251804, "epoch": 0.7945287840072148, "flos": 21980993679360.0, "grad_norm": 60.139786442736494, "language_loss": 0.80935949, "learning_rate": 4.2665437840114535e-07, "loss": 0.82346177, "num_input_tokens_seen": 285075390, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.23071289, "step": 13215, "time_per_iteration": 2.664173126220703 }, { "auxiliary_loss_clip": 0.0124367, "auxiliary_loss_mlp": 0.00212705, "balance_loss_clip": 1.02894068, "balance_loss_mlp": 0.18867245, "epoch": 0.7945889072598827, "flos": 26397718842240.0, "grad_norm": 2.177067793063461, "language_loss": 0.86096895, "learning_rate": 4.2641396622088253e-07, "loss": 0.87553269, "num_input_tokens_seen": 285096290, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24023438, "step": 13216, "time_per_iteration": 2.709200620651245 }, { "auxiliary_loss_clip": 0.01243606, "auxiliary_loss_mlp": 0.00214103, "balance_loss_clip": 1.02349782, "balance_loss_mlp": 0.18861577, "epoch": 0.7946490305125508, "flos": 25810255906560.0, "grad_norm": 8.098278788984295, "language_loss": 0.81597036, "learning_rate": 4.261736137111598e-07, "loss": 0.83054745, "num_input_tokens_seen": 285116020, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25500488, "step": 13217, "time_per_iteration": 2.8105363845825195 }, { "auxiliary_loss_clip": 0.01230251, "auxiliary_loss_mlp": 0.00201744, "balance_loss_clip": 1.01817632, "balance_loss_mlp": 0.17990527, "epoch": 0.7947091537652187, "flos": 15960965045760.0, "grad_norm": 28.35786312450135, "language_loss": 0.81684959, "learning_rate": 4.259333208810907e-07, "loss": 0.83116955, "num_input_tokens_seen": 285133510, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.21826172, "step": 13218, "time_per_iteration": 2.6333329677581787 }, { "auxiliary_loss_clip": 0.01253807, "auxiliary_loss_mlp": 0.00218518, "balance_loss_clip": 1.03306508, "balance_loss_mlp": 0.19452128, "epoch": 0.7947692770178867, "flos": 18587866389120.0, "grad_norm": 17.89620280451043, "language_loss": 0.94080561, "learning_rate": 4.2569308773978817e-07, "loss": 0.95552886, "num_input_tokens_seen": 285151690, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24023438, "step": 13219, "time_per_iteration": 2.712911605834961 }, { "auxiliary_loss_clip": 0.01248302, "auxiliary_loss_mlp": 0.00200053, "balance_loss_clip": 1.02847815, "balance_loss_mlp": 0.17429203, "epoch": 0.7948294002705546, "flos": 20442220992000.0, "grad_norm": 2.558484521118442, "language_loss": 0.85173559, "learning_rate": 4.2545291429636123e-07, "loss": 0.8662191, "num_input_tokens_seen": 285170485, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.2578125, "step": 13220, "time_per_iteration": 2.725888252258301 }, { "auxiliary_loss_clip": 0.01261714, "auxiliary_loss_mlp": 0.00223578, "balance_loss_clip": 1.03298163, "balance_loss_mlp": 0.1973879, "epoch": 0.7948895235232226, "flos": 38181194282880.0, "grad_norm": 171.99765494257588, "language_loss": 0.78144515, "learning_rate": 4.252128005599176e-07, "loss": 0.79629809, "num_input_tokens_seen": 285191050, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.26220703, "step": 13221, "time_per_iteration": 2.7997210025787354 }, { "auxiliary_loss_clip": 0.01246565, "auxiliary_loss_mlp": 0.0019741, "balance_loss_clip": 1.03223431, "balance_loss_mlp": 0.17357981, "epoch": 0.7949496467758905, "flos": 15559806977280.0, "grad_norm": 50.89350430174263, "language_loss": 0.83295095, "learning_rate": 4.249727465395634e-07, "loss": 0.84739065, "num_input_tokens_seen": 285208750, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.23852539, "step": 13222, "time_per_iteration": 2.64424729347229 }, { "auxiliary_loss_clip": 0.01128851, "auxiliary_loss_mlp": 0.00113711, "balance_loss_clip": 0.97379756, "balance_loss_mlp": 0.10522332, "epoch": 0.7950097700285585, "flos": 70897036728960.0, "grad_norm": 0.7655919837487744, "language_loss": 0.65554649, "learning_rate": 4.247327522443993e-07, "loss": 0.66797215, "num_input_tokens_seen": 285264605, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.08496094, "step": 13223, "time_per_iteration": 3.0239062309265137 }, { "auxiliary_loss_clip": 0.01237705, "auxiliary_loss_mlp": 0.00221256, "balance_loss_clip": 1.01925445, "balance_loss_mlp": 0.19607854, "epoch": 0.7950698932812266, "flos": 23951627585280.0, "grad_norm": 5.8261822460486075, "language_loss": 0.7997967, "learning_rate": 4.2449281768352717e-07, "loss": 0.81438631, "num_input_tokens_seen": 285283940, "router_z_loss_clip": 2.18457031, "router_z_loss_mlp": 0.25170898, "step": 13224, "time_per_iteration": 2.6759393215179443 }, { "auxiliary_loss_clip": 0.01118175, "auxiliary_loss_mlp": 0.00067091, "balance_loss_clip": 0.9702636, "balance_loss_mlp": 0.0595095, "epoch": 0.7951300165338945, "flos": 60282561415680.0, "grad_norm": 597.1156136479367, "language_loss": 0.54299188, "learning_rate": 4.2425294286604527e-07, "loss": 0.55484462, "num_input_tokens_seen": 285349525, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.07568359, "step": 13225, "time_per_iteration": 3.177720308303833 }, { "auxiliary_loss_clip": 0.01239284, "auxiliary_loss_mlp": 0.00199766, "balance_loss_clip": 1.02355361, "balance_loss_mlp": 0.17674688, "epoch": 0.7951901397865625, "flos": 22819004956800.0, "grad_norm": 172.89885551278357, "language_loss": 0.7294836, "learning_rate": 4.2401312780105034e-07, "loss": 0.74387407, "num_input_tokens_seen": 285367355, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.23010254, "step": 13226, "time_per_iteration": 2.636798143386841 }, { "auxiliary_loss_clip": 0.012577, "auxiliary_loss_mlp": 0.00196597, "balance_loss_clip": 1.03537607, "balance_loss_mlp": 0.17077598, "epoch": 0.7952502630392304, "flos": 35695672871040.0, "grad_norm": 8.864164496337763, "language_loss": 0.78903735, "learning_rate": 4.237733724976349e-07, "loss": 0.80358034, "num_input_tokens_seen": 285386190, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25830078, "step": 13227, "time_per_iteration": 4.30753231048584 }, { "auxiliary_loss_clip": 0.01232542, "auxiliary_loss_mlp": 0.00195765, "balance_loss_clip": 1.019449, "balance_loss_mlp": 0.17441472, "epoch": 0.7953103862918984, "flos": 25629840869760.0, "grad_norm": 76.95280039873832, "language_loss": 0.77834803, "learning_rate": 4.2353367696489184e-07, "loss": 0.79263109, "num_input_tokens_seen": 285406150, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.21325684, "step": 13228, "time_per_iteration": 2.643220901489258 }, { "auxiliary_loss_clip": 0.01237182, "auxiliary_loss_mlp": 0.0021127, "balance_loss_clip": 1.02140403, "balance_loss_mlp": 0.18760739, "epoch": 0.7953705095445663, "flos": 40551980676480.0, "grad_norm": 95.22465435190553, "language_loss": 0.7666254, "learning_rate": 4.232940412119095e-07, "loss": 0.78110993, "num_input_tokens_seen": 285429900, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.23669434, "step": 13229, "time_per_iteration": 4.18335747718811 }, { "auxiliary_loss_clip": 0.01278565, "auxiliary_loss_mlp": 0.00207421, "balance_loss_clip": 1.0498724, "balance_loss_mlp": 0.17928779, "epoch": 0.7954306327972344, "flos": 27636672706560.0, "grad_norm": 18.006080560959152, "language_loss": 0.7969172, "learning_rate": 4.2305446524777457e-07, "loss": 0.81177711, "num_input_tokens_seen": 285452555, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.28112793, "step": 13230, "time_per_iteration": 2.7070672512054443 }, { "auxiliary_loss_clip": 0.01156302, "auxiliary_loss_mlp": 0.00056207, "balance_loss_clip": 1.00118375, "balance_loss_mlp": 0.0491974, "epoch": 0.7954907560499023, "flos": 59504055995520.0, "grad_norm": 0.9696029197366931, "language_loss": 0.6290611, "learning_rate": 4.2281494908157247e-07, "loss": 0.64118612, "num_input_tokens_seen": 285515700, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.0703125, "step": 13231, "time_per_iteration": 3.2635040283203125 }, { "auxiliary_loss_clip": 0.01248838, "auxiliary_loss_mlp": 0.00219729, "balance_loss_clip": 1.02860308, "balance_loss_mlp": 0.19618568, "epoch": 0.7955508793025703, "flos": 20120533764480.0, "grad_norm": 3.516625775772951, "language_loss": 0.77261776, "learning_rate": 4.2257549272238566e-07, "loss": 0.78730345, "num_input_tokens_seen": 285533910, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.23547363, "step": 13232, "time_per_iteration": 2.8252370357513428 }, { "auxiliary_loss_clip": 0.01237148, "auxiliary_loss_mlp": 0.00198851, "balance_loss_clip": 1.0200963, "balance_loss_mlp": 0.17505634, "epoch": 0.7956110025552382, "flos": 26505378881280.0, "grad_norm": 211.27522839723017, "language_loss": 0.84432185, "learning_rate": 4.223360961792952e-07, "loss": 0.8586818, "num_input_tokens_seen": 285554080, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23803711, "step": 13233, "time_per_iteration": 4.2158613204956055 }, { "auxiliary_loss_clip": 0.0125078, "auxiliary_loss_mlp": 0.00205236, "balance_loss_clip": 1.0344069, "balance_loss_mlp": 0.18241939, "epoch": 0.7956711258079062, "flos": 22565475786240.0, "grad_norm": 12.849951536221331, "language_loss": 0.85120881, "learning_rate": 4.220967594613769e-07, "loss": 0.86576891, "num_input_tokens_seen": 285572325, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.2277832, "step": 13234, "time_per_iteration": 2.6405181884765625 }, { "auxiliary_loss_clip": 0.01225035, "auxiliary_loss_mlp": 0.00201068, "balance_loss_clip": 1.01100349, "balance_loss_mlp": 0.17809603, "epoch": 0.7957312490605741, "flos": 17379005143680.0, "grad_norm": 6.2163458496577375, "language_loss": 0.79627568, "learning_rate": 4.218574825777077e-07, "loss": 0.81053668, "num_input_tokens_seen": 285589770, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.22961426, "step": 13235, "time_per_iteration": 2.61320424079895 }, { "auxiliary_loss_clip": 0.01239627, "auxiliary_loss_mlp": 0.0019168, "balance_loss_clip": 1.02083945, "balance_loss_mlp": 0.16854101, "epoch": 0.7957913723132422, "flos": 22491427898880.0, "grad_norm": 6.994640592956855, "language_loss": 0.76434493, "learning_rate": 4.2161826553736145e-07, "loss": 0.77865803, "num_input_tokens_seen": 285610065, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.23144531, "step": 13236, "time_per_iteration": 2.6703128814697266 }, { "auxiliary_loss_clip": 0.01233962, "auxiliary_loss_mlp": 0.0020676, "balance_loss_clip": 1.01629305, "balance_loss_mlp": 0.18288217, "epoch": 0.7958514955659101, "flos": 22638087129600.0, "grad_norm": 143.18470435375372, "language_loss": 0.81850857, "learning_rate": 4.2137910834940826e-07, "loss": 0.83291578, "num_input_tokens_seen": 285628480, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.23852539, "step": 13237, "time_per_iteration": 2.705169439315796 }, { "auxiliary_loss_clip": 0.01269457, "auxiliary_loss_mlp": 0.00218972, "balance_loss_clip": 1.04133892, "balance_loss_mlp": 0.19217353, "epoch": 0.7959116188185781, "flos": 20704225772160.0, "grad_norm": 3.884567771219963, "language_loss": 0.82213843, "learning_rate": 4.211400110229175e-07, "loss": 0.83702266, "num_input_tokens_seen": 285647805, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.2677002, "step": 13238, "time_per_iteration": 4.120609521865845 }, { "auxiliary_loss_clip": 0.01247004, "auxiliary_loss_mlp": 0.00217391, "balance_loss_clip": 1.02469039, "balance_loss_mlp": 0.19147514, "epoch": 0.7959717420712461, "flos": 19024683684480.0, "grad_norm": 35.05938039467809, "language_loss": 0.83483315, "learning_rate": 4.2090097356695684e-07, "loss": 0.84947711, "num_input_tokens_seen": 285665505, "router_z_loss_clip": 2.22167969, "router_z_loss_mlp": 0.25866699, "step": 13239, "time_per_iteration": 2.6624863147735596 }, { "auxiliary_loss_clip": 0.01248082, "auxiliary_loss_mlp": 0.00218798, "balance_loss_clip": 1.02833486, "balance_loss_mlp": 0.19434775, "epoch": 0.796031865323914, "flos": 26356636661760.0, "grad_norm": 4.710936968232002, "language_loss": 0.78840083, "learning_rate": 4.2066199599058814e-07, "loss": 0.80306965, "num_input_tokens_seen": 285685855, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24450684, "step": 13240, "time_per_iteration": 2.727114677429199 }, { "auxiliary_loss_clip": 0.01163333, "auxiliary_loss_mlp": 0.00076045, "balance_loss_clip": 1.01387215, "balance_loss_mlp": 0.06903511, "epoch": 0.796091988576582, "flos": 62069440320000.0, "grad_norm": 0.8749732617329505, "language_loss": 0.57883584, "learning_rate": 4.2042307830287526e-07, "loss": 0.59122962, "num_input_tokens_seen": 285735710, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.0703125, "step": 13241, "time_per_iteration": 2.9187567234039307 }, { "auxiliary_loss_clip": 0.01226674, "auxiliary_loss_mlp": 0.00202949, "balance_loss_clip": 1.01931608, "balance_loss_mlp": 0.18180105, "epoch": 0.7961521118292499, "flos": 39020103400320.0, "grad_norm": 10.618924393334641, "language_loss": 0.72368616, "learning_rate": 4.201842205128772e-07, "loss": 0.73798239, "num_input_tokens_seen": 285757045, "router_z_loss_clip": 2.07324219, "router_z_loss_mlp": 0.21166992, "step": 13242, "time_per_iteration": 2.810734272003174 }, { "auxiliary_loss_clip": 0.0124891, "auxiliary_loss_mlp": 0.00227788, "balance_loss_clip": 1.02607584, "balance_loss_mlp": 0.20354125, "epoch": 0.796212235081918, "flos": 21762836426880.0, "grad_norm": 613.423243443754, "language_loss": 0.87043697, "learning_rate": 4.199454226296526e-07, "loss": 0.88520396, "num_input_tokens_seen": 285776050, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.24255371, "step": 13243, "time_per_iteration": 2.6423768997192383 }, { "auxiliary_loss_clip": 0.01236461, "auxiliary_loss_mlp": 0.00243101, "balance_loss_clip": 1.01945519, "balance_loss_mlp": 0.21741116, "epoch": 0.7962723583345859, "flos": 21178857110400.0, "grad_norm": 10.08545002060913, "language_loss": 0.85709357, "learning_rate": 4.1970668466225565e-07, "loss": 0.87188917, "num_input_tokens_seen": 285796830, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.25695801, "step": 13244, "time_per_iteration": 2.7063939571380615 }, { "auxiliary_loss_clip": 0.01237093, "auxiliary_loss_mlp": 0.00207172, "balance_loss_clip": 1.01740098, "balance_loss_mlp": 0.18203031, "epoch": 0.7963324815872539, "flos": 17128636369920.0, "grad_norm": 3.8905176519017783, "language_loss": 0.81318265, "learning_rate": 4.1946800661973934e-07, "loss": 0.82762527, "num_input_tokens_seen": 285814755, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.25146484, "step": 13245, "time_per_iteration": 2.592916965484619 }, { "auxiliary_loss_clip": 0.01256042, "auxiliary_loss_mlp": 0.00207187, "balance_loss_clip": 1.03243637, "balance_loss_mlp": 0.18043642, "epoch": 0.7963926048399218, "flos": 21397481239680.0, "grad_norm": 1.905362855459799, "language_loss": 0.84302843, "learning_rate": 4.192293885111549e-07, "loss": 0.85766071, "num_input_tokens_seen": 285834255, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26782227, "step": 13246, "time_per_iteration": 2.627763032913208 }, { "auxiliary_loss_clip": 0.01268426, "auxiliary_loss_mlp": 0.00212958, "balance_loss_clip": 1.04013824, "balance_loss_mlp": 0.18595758, "epoch": 0.7964527280925898, "flos": 25184188828800.0, "grad_norm": 36.689555643999014, "language_loss": 0.75735408, "learning_rate": 4.1899083034555007e-07, "loss": 0.77216792, "num_input_tokens_seen": 285853540, "router_z_loss_clip": 2.28808594, "router_z_loss_mlp": 0.27001953, "step": 13247, "time_per_iteration": 2.672680377960205 }, { "auxiliary_loss_clip": 0.0125334, "auxiliary_loss_mlp": 0.00218061, "balance_loss_clip": 1.03584433, "balance_loss_mlp": 0.19426635, "epoch": 0.7965128513452577, "flos": 27015884928000.0, "grad_norm": 5.99808075913781, "language_loss": 0.81285286, "learning_rate": 4.1875233213197123e-07, "loss": 0.82756686, "num_input_tokens_seen": 285872705, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.23791504, "step": 13248, "time_per_iteration": 2.6568360328674316 }, { "auxiliary_loss_clip": 0.01252524, "auxiliary_loss_mlp": 0.00204968, "balance_loss_clip": 1.02921319, "balance_loss_mlp": 0.17904013, "epoch": 0.7965729745979258, "flos": 24419578993920.0, "grad_norm": 28.348975452962573, "language_loss": 0.86089468, "learning_rate": 4.1851389387946255e-07, "loss": 0.87546962, "num_input_tokens_seen": 285890290, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.25891113, "step": 13249, "time_per_iteration": 2.669780731201172 }, { "auxiliary_loss_clip": 0.01241924, "auxiliary_loss_mlp": 0.00199298, "balance_loss_clip": 1.02697301, "balance_loss_mlp": 0.17725602, "epoch": 0.7966330978505937, "flos": 18840389978880.0, "grad_norm": 29.443996353918894, "language_loss": 0.71149516, "learning_rate": 4.1827551559706674e-07, "loss": 0.72590739, "num_input_tokens_seen": 285909190, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.22033691, "step": 13250, "time_per_iteration": 2.598623275756836 }, { "auxiliary_loss_clip": 0.01232331, "auxiliary_loss_mlp": 0.00201876, "balance_loss_clip": 1.01433635, "balance_loss_mlp": 0.17730746, "epoch": 0.7966932211032617, "flos": 13152319862400.0, "grad_norm": 5971.878748756533, "language_loss": 0.85414362, "learning_rate": 4.180371972938206e-07, "loss": 0.86848569, "num_input_tokens_seen": 285927570, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24572754, "step": 13251, "time_per_iteration": 2.6243185997009277 }, { "auxiliary_loss_clip": 0.01256545, "auxiliary_loss_mlp": 0.00224979, "balance_loss_clip": 1.03550673, "balance_loss_mlp": 0.20006457, "epoch": 0.7967533443559297, "flos": 23949760078080.0, "grad_norm": 18.410349546700377, "language_loss": 0.82523894, "learning_rate": 4.177989389787624e-07, "loss": 0.84005421, "num_input_tokens_seen": 285945810, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.24938965, "step": 13252, "time_per_iteration": 2.714676856994629 }, { "auxiliary_loss_clip": 0.01238074, "auxiliary_loss_mlp": 0.00224177, "balance_loss_clip": 1.02136123, "balance_loss_mlp": 0.19975097, "epoch": 0.7968134676085976, "flos": 30368791964160.0, "grad_norm": 92.33062904126038, "language_loss": 0.75979692, "learning_rate": 4.175607406609278e-07, "loss": 0.77441943, "num_input_tokens_seen": 285964235, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24401855, "step": 13253, "time_per_iteration": 2.788999080657959 }, { "auxiliary_loss_clip": 0.01260426, "auxiliary_loss_mlp": 0.00236851, "balance_loss_clip": 1.03470922, "balance_loss_mlp": 0.210053, "epoch": 0.7968735908612656, "flos": 23075048079360.0, "grad_norm": 7.027724365399763, "language_loss": 0.74032688, "learning_rate": 4.1732260234934767e-07, "loss": 0.75529963, "num_input_tokens_seen": 285983710, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26806641, "step": 13254, "time_per_iteration": 2.6461777687072754 }, { "auxiliary_loss_clip": 0.01249116, "auxiliary_loss_mlp": 0.00228008, "balance_loss_clip": 1.03081238, "balance_loss_mlp": 0.20318857, "epoch": 0.7969337141139335, "flos": 23582250074880.0, "grad_norm": 13.999731745913033, "language_loss": 0.7729373, "learning_rate": 4.1708452405305314e-07, "loss": 0.78770858, "num_input_tokens_seen": 286003425, "router_z_loss_clip": 2.18261719, "router_z_loss_mlp": 0.24829102, "step": 13255, "time_per_iteration": 2.708155393600464 }, { "auxiliary_loss_clip": 0.01240576, "auxiliary_loss_mlp": 0.00221637, "balance_loss_clip": 1.027596, "balance_loss_mlp": 0.19692516, "epoch": 0.7969938373666016, "flos": 19755860935680.0, "grad_norm": 3.6896892666756016, "language_loss": 0.86227167, "learning_rate": 4.168465057810733e-07, "loss": 0.87689376, "num_input_tokens_seen": 286020130, "router_z_loss_clip": 2.12792969, "router_z_loss_mlp": 0.24731445, "step": 13256, "time_per_iteration": 2.6654441356658936 }, { "auxiliary_loss_clip": 0.01246149, "auxiliary_loss_mlp": 0.00217313, "balance_loss_clip": 1.02855635, "balance_loss_mlp": 0.19407919, "epoch": 0.7970539606192695, "flos": 24134089697280.0, "grad_norm": 12.955025671501444, "language_loss": 0.73218632, "learning_rate": 4.166085475424315e-07, "loss": 0.74682093, "num_input_tokens_seen": 286040230, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.2322998, "step": 13257, "time_per_iteration": 2.72440767288208 }, { "auxiliary_loss_clip": 0.01262098, "auxiliary_loss_mlp": 0.00219857, "balance_loss_clip": 1.04333007, "balance_loss_mlp": 0.19577712, "epoch": 0.7971140838719375, "flos": 17968622895360.0, "grad_norm": 4.45887707512312, "language_loss": 0.81917036, "learning_rate": 4.163706493461523e-07, "loss": 0.83398998, "num_input_tokens_seen": 286059475, "router_z_loss_clip": 2.18652344, "router_z_loss_mlp": 0.24060059, "step": 13258, "time_per_iteration": 2.637598991394043 }, { "auxiliary_loss_clip": 0.0126559, "auxiliary_loss_mlp": 0.00225018, "balance_loss_clip": 1.03946364, "balance_loss_mlp": 0.20015061, "epoch": 0.7971742071246054, "flos": 19169547235200.0, "grad_norm": 57.18527935281728, "language_loss": 0.78919804, "learning_rate": 4.1613281120125655e-07, "loss": 0.80410415, "num_input_tokens_seen": 286077820, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.2487793, "step": 13259, "time_per_iteration": 2.6689794063568115 }, { "auxiliary_loss_clip": 0.01239908, "auxiliary_loss_mlp": 0.00210493, "balance_loss_clip": 1.02539289, "balance_loss_mlp": 0.18660372, "epoch": 0.7972343303772734, "flos": 27125951178240.0, "grad_norm": 308.72907683486204, "language_loss": 0.79128015, "learning_rate": 4.158950331167641e-07, "loss": 0.80578417, "num_input_tokens_seen": 286097285, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.23901367, "step": 13260, "time_per_iteration": 2.681950092315674 }, { "auxiliary_loss_clip": 0.01231801, "auxiliary_loss_mlp": 0.00220988, "balance_loss_clip": 1.01819932, "balance_loss_mlp": 0.19643098, "epoch": 0.7972944536299413, "flos": 20996646393600.0, "grad_norm": 15.39389867487627, "language_loss": 0.85922754, "learning_rate": 4.1565731510169065e-07, "loss": 0.8737554, "num_input_tokens_seen": 286116000, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24560547, "step": 13261, "time_per_iteration": 2.6668612957000732 }, { "auxiliary_loss_clip": 0.01230568, "auxiliary_loss_mlp": 0.00184488, "balance_loss_clip": 1.01966941, "balance_loss_mlp": 0.16100393, "epoch": 0.7973545768826094, "flos": 21580015178880.0, "grad_norm": 37.78197553372923, "language_loss": 0.82144725, "learning_rate": 4.154196571650501e-07, "loss": 0.83559787, "num_input_tokens_seen": 286135110, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.23486328, "step": 13262, "time_per_iteration": 2.6787497997283936 }, { "auxiliary_loss_clip": 0.01267038, "auxiliary_loss_mlp": 0.00204518, "balance_loss_clip": 1.04288781, "balance_loss_mlp": 0.17977056, "epoch": 0.7974147001352773, "flos": 20558536208640.0, "grad_norm": 3.2947057289004933, "language_loss": 0.82142001, "learning_rate": 4.1518205931585524e-07, "loss": 0.83613563, "num_input_tokens_seen": 286152835, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.24768066, "step": 13263, "time_per_iteration": 2.663862705230713 }, { "auxiliary_loss_clip": 0.01264221, "auxiliary_loss_mlp": 0.00214448, "balance_loss_clip": 1.03878093, "balance_loss_mlp": 0.18884197, "epoch": 0.7974748233879453, "flos": 20996790048000.0, "grad_norm": 13.556732671349918, "language_loss": 0.80818999, "learning_rate": 4.149445215631153e-07, "loss": 0.82297665, "num_input_tokens_seen": 286171785, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.25610352, "step": 13264, "time_per_iteration": 2.642371654510498 }, { "auxiliary_loss_clip": 0.01231332, "auxiliary_loss_mlp": 0.00217145, "balance_loss_clip": 1.01769638, "balance_loss_mlp": 0.19383919, "epoch": 0.7975349466406133, "flos": 22565188477440.0, "grad_norm": 3.174423683047426, "language_loss": 0.83249474, "learning_rate": 4.1470704391583776e-07, "loss": 0.8469795, "num_input_tokens_seen": 286190420, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.2331543, "step": 13265, "time_per_iteration": 2.6306214332580566 }, { "auxiliary_loss_clip": 0.01257571, "auxiliary_loss_mlp": 0.00205521, "balance_loss_clip": 1.03494143, "balance_loss_mlp": 0.17946199, "epoch": 0.7975950698932812, "flos": 21689542725120.0, "grad_norm": 10.192205676234801, "language_loss": 0.83780837, "learning_rate": 4.144696263830285e-07, "loss": 0.85243928, "num_input_tokens_seen": 286210105, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26037598, "step": 13266, "time_per_iteration": 2.6510586738586426 }, { "auxiliary_loss_clip": 0.01227841, "auxiliary_loss_mlp": 0.00201751, "balance_loss_clip": 1.0214808, "balance_loss_mlp": 0.17975655, "epoch": 0.7976551931459492, "flos": 19604568850560.0, "grad_norm": 6.581833675913213, "language_loss": 0.91058421, "learning_rate": 4.1423226897369015e-07, "loss": 0.92488015, "num_input_tokens_seen": 286228180, "router_z_loss_clip": 2.06152344, "router_z_loss_mlp": 0.2199707, "step": 13267, "time_per_iteration": 2.601616859436035 }, { "auxiliary_loss_clip": 0.0124195, "auxiliary_loss_mlp": 0.00209478, "balance_loss_clip": 1.02785587, "balance_loss_mlp": 0.18545707, "epoch": 0.7977153163986171, "flos": 21687603390720.0, "grad_norm": 5.085661535240184, "language_loss": 0.85334927, "learning_rate": 4.139949716968223e-07, "loss": 0.86786354, "num_input_tokens_seen": 286247305, "router_z_loss_clip": 2.14550781, "router_z_loss_mlp": 0.23999023, "step": 13268, "time_per_iteration": 2.650585651397705 }, { "auxiliary_loss_clip": 0.01243569, "auxiliary_loss_mlp": 0.00207279, "balance_loss_clip": 1.0267936, "balance_loss_mlp": 0.18319836, "epoch": 0.7977754396512852, "flos": 23476780765440.0, "grad_norm": 18.335315957361097, "language_loss": 0.85906976, "learning_rate": 4.1375773456142403e-07, "loss": 0.87357819, "num_input_tokens_seen": 286268145, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.2409668, "step": 13269, "time_per_iteration": 4.156991243362427 }, { "auxiliary_loss_clip": 0.01224028, "auxiliary_loss_mlp": 0.00202644, "balance_loss_clip": 1.01885498, "balance_loss_mlp": 0.18124625, "epoch": 0.7978355629039531, "flos": 22382223575040.0, "grad_norm": 10.933165859344685, "language_loss": 0.89740264, "learning_rate": 4.135205575764922e-07, "loss": 0.91166937, "num_input_tokens_seen": 286286775, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.21411133, "step": 13270, "time_per_iteration": 2.692385196685791 }, { "auxiliary_loss_clip": 0.01241541, "auxiliary_loss_mlp": 0.00212751, "balance_loss_clip": 1.02469683, "balance_loss_mlp": 0.18673998, "epoch": 0.7978956861566211, "flos": 20266331068800.0, "grad_norm": 14.904026597244542, "language_loss": 0.68777651, "learning_rate": 4.1328344075101905e-07, "loss": 0.70231944, "num_input_tokens_seen": 286305590, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.2598877, "step": 13271, "time_per_iteration": 4.041945695877075 }, { "auxiliary_loss_clip": 0.01261708, "auxiliary_loss_mlp": 0.00220989, "balance_loss_clip": 1.0445298, "balance_loss_mlp": 0.19770767, "epoch": 0.797955809409289, "flos": 28112417366400.0, "grad_norm": 4.544745822813709, "language_loss": 0.79141891, "learning_rate": 4.130463840939975e-07, "loss": 0.80624592, "num_input_tokens_seen": 286328050, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.23291016, "step": 13272, "time_per_iteration": 2.7085559368133545 }, { "auxiliary_loss_clip": 0.01241733, "auxiliary_loss_mlp": 0.00193647, "balance_loss_clip": 1.02284515, "balance_loss_mlp": 0.1694233, "epoch": 0.798015932661957, "flos": 15559591495680.0, "grad_norm": 23.355581968652665, "language_loss": 0.79204094, "learning_rate": 4.128093876144161e-07, "loss": 0.80639476, "num_input_tokens_seen": 286345265, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24230957, "step": 13273, "time_per_iteration": 2.608564853668213 }, { "auxiliary_loss_clip": 0.01259425, "auxiliary_loss_mlp": 0.00197025, "balance_loss_clip": 1.03751481, "balance_loss_mlp": 0.17158604, "epoch": 0.7980760559146249, "flos": 23951196622080.0, "grad_norm": 7.885431739925885, "language_loss": 0.85231531, "learning_rate": 4.1257245132126117e-07, "loss": 0.86687976, "num_input_tokens_seen": 286364465, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25463867, "step": 13274, "time_per_iteration": 2.7092010974884033 }, { "auxiliary_loss_clip": 0.01226261, "auxiliary_loss_mlp": 0.00200304, "balance_loss_clip": 1.01959479, "balance_loss_mlp": 0.17828585, "epoch": 0.798136179167293, "flos": 28038082170240.0, "grad_norm": 11.377340524631531, "language_loss": 0.83395571, "learning_rate": 4.12335575223518e-07, "loss": 0.8482213, "num_input_tokens_seen": 286385565, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.22021484, "step": 13275, "time_per_iteration": 2.676877021789551 }, { "auxiliary_loss_clip": 0.01249595, "auxiliary_loss_mlp": 0.0021799, "balance_loss_clip": 1.02726173, "balance_loss_mlp": 0.19083381, "epoch": 0.7981963024199609, "flos": 35984538046080.0, "grad_norm": 23.762643393299246, "language_loss": 0.73619854, "learning_rate": 4.1209875933016877e-07, "loss": 0.7508744, "num_input_tokens_seen": 286403950, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.27160645, "step": 13276, "time_per_iteration": 4.21284031867981 }, { "auxiliary_loss_clip": 0.01230608, "auxiliary_loss_mlp": 0.00210285, "balance_loss_clip": 1.02312124, "balance_loss_mlp": 0.18787396, "epoch": 0.7982564256726289, "flos": 25884914325120.0, "grad_norm": 7.656751525461095, "language_loss": 0.6824652, "learning_rate": 4.118620036501945e-07, "loss": 0.69687414, "num_input_tokens_seen": 286426160, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.22399902, "step": 13277, "time_per_iteration": 2.6657140254974365 }, { "auxiliary_loss_clip": 0.01260605, "auxiliary_loss_mlp": 0.00198557, "balance_loss_clip": 1.03984082, "balance_loss_mlp": 0.17501345, "epoch": 0.7983165489252969, "flos": 25739152934400.0, "grad_norm": 88.33504493823, "language_loss": 0.87225646, "learning_rate": 4.1162530819257227e-07, "loss": 0.88684809, "num_input_tokens_seen": 286446610, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.23547363, "step": 13278, "time_per_iteration": 2.6657557487487793 }, { "auxiliary_loss_clip": 0.01251856, "auxiliary_loss_mlp": 0.00207872, "balance_loss_clip": 1.02739692, "balance_loss_mlp": 0.18178937, "epoch": 0.7983766721779648, "flos": 21908202768000.0, "grad_norm": 8.684533412749031, "language_loss": 0.74095225, "learning_rate": 4.113886729662768e-07, "loss": 0.75554955, "num_input_tokens_seen": 286465460, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26098633, "step": 13279, "time_per_iteration": 2.656404972076416 }, { "auxiliary_loss_clip": 0.01213158, "auxiliary_loss_mlp": 0.00196755, "balance_loss_clip": 1.00886178, "balance_loss_mlp": 0.17514281, "epoch": 0.7984367954306328, "flos": 29347420734720.0, "grad_norm": 18.428940427105243, "language_loss": 0.76573193, "learning_rate": 4.111520979802825e-07, "loss": 0.77983105, "num_input_tokens_seen": 286485720, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.21618652, "step": 13280, "time_per_iteration": 2.8659298419952393 }, { "auxiliary_loss_clip": 0.01243522, "auxiliary_loss_mlp": 0.00199001, "balance_loss_clip": 1.02312231, "balance_loss_mlp": 0.17442033, "epoch": 0.7984969186833007, "flos": 31357772104320.0, "grad_norm": 515.4138826758893, "language_loss": 0.72482026, "learning_rate": 4.1091558324355955e-07, "loss": 0.73924541, "num_input_tokens_seen": 286507465, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.24572754, "step": 13281, "time_per_iteration": 4.318729877471924 }, { "auxiliary_loss_clip": 0.01258966, "auxiliary_loss_mlp": 0.00228273, "balance_loss_clip": 1.03815639, "balance_loss_mlp": 0.20226142, "epoch": 0.7985570419359688, "flos": 24312924535680.0, "grad_norm": 31.443577200114223, "language_loss": 0.88542247, "learning_rate": 4.1067912876507683e-07, "loss": 0.90029484, "num_input_tokens_seen": 286526345, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26000977, "step": 13282, "time_per_iteration": 2.7402892112731934 }, { "auxiliary_loss_clip": 0.01241226, "auxiliary_loss_mlp": 0.00204027, "balance_loss_clip": 1.01963496, "balance_loss_mlp": 0.17755023, "epoch": 0.7986171651886367, "flos": 15742233175680.0, "grad_norm": 3.3706505418667887, "language_loss": 0.82110262, "learning_rate": 4.10442734553802e-07, "loss": 0.83555508, "num_input_tokens_seen": 286544095, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.26501465, "step": 13283, "time_per_iteration": 2.6869969367980957 }, { "auxiliary_loss_clip": 0.01234921, "auxiliary_loss_mlp": 0.00199514, "balance_loss_clip": 1.02385449, "balance_loss_mlp": 0.17632751, "epoch": 0.7986772884413047, "flos": 11619401091840.0, "grad_norm": 5.440364273001664, "language_loss": 0.82918674, "learning_rate": 4.102064006186967e-07, "loss": 0.84353113, "num_input_tokens_seen": 286560960, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.23193359, "step": 13284, "time_per_iteration": 2.6739907264709473 }, { "auxiliary_loss_clip": 0.01236863, "auxiliary_loss_mlp": 0.00210331, "balance_loss_clip": 1.02268004, "balance_loss_mlp": 0.1869894, "epoch": 0.7987374116939726, "flos": 22091059929600.0, "grad_norm": 68.32600856545905, "language_loss": 0.78582126, "learning_rate": 4.0997012696872415e-07, "loss": 0.80029321, "num_input_tokens_seen": 286579865, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.23352051, "step": 13285, "time_per_iteration": 2.6498446464538574 }, { "auxiliary_loss_clip": 0.01247595, "auxiliary_loss_mlp": 0.00217143, "balance_loss_clip": 1.03112841, "balance_loss_mlp": 0.19309838, "epoch": 0.7987975349466406, "flos": 17890696339200.0, "grad_norm": 85.38747576390287, "language_loss": 0.82599223, "learning_rate": 4.097339136128437e-07, "loss": 0.84063965, "num_input_tokens_seen": 286597295, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24047852, "step": 13286, "time_per_iteration": 2.616816997528076 }, { "auxiliary_loss_clip": 0.01260792, "auxiliary_loss_mlp": 0.00198883, "balance_loss_clip": 1.04132032, "balance_loss_mlp": 0.175482, "epoch": 0.7988576581993085, "flos": 19719232041600.0, "grad_norm": 120.20844151050508, "language_loss": 0.8500905, "learning_rate": 4.0949776056001296e-07, "loss": 0.8646872, "num_input_tokens_seen": 286616270, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.23388672, "step": 13287, "time_per_iteration": 2.605144500732422 }, { "auxiliary_loss_clip": 0.01236683, "auxiliary_loss_mlp": 0.00211557, "balance_loss_clip": 1.0235188, "balance_loss_mlp": 0.18716723, "epoch": 0.7989177814519766, "flos": 28036358317440.0, "grad_norm": 21.582041640416687, "language_loss": 0.70979893, "learning_rate": 4.092616678191863e-07, "loss": 0.72428131, "num_input_tokens_seen": 286638315, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24414062, "step": 13288, "time_per_iteration": 2.7114150524139404 }, { "auxiliary_loss_clip": 0.01241629, "auxiliary_loss_mlp": 0.00194477, "balance_loss_clip": 1.02550125, "balance_loss_mlp": 0.17361543, "epoch": 0.7989779047046445, "flos": 28871029630080.0, "grad_norm": 81.99177668884035, "language_loss": 0.79208148, "learning_rate": 4.090256353993169e-07, "loss": 0.80644244, "num_input_tokens_seen": 286658630, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.20874023, "step": 13289, "time_per_iteration": 2.666494369506836 }, { "auxiliary_loss_clip": 0.01226885, "auxiliary_loss_mlp": 0.0019104, "balance_loss_clip": 1.01781988, "balance_loss_mlp": 0.16714999, "epoch": 0.7990380279573125, "flos": 18186887888640.0, "grad_norm": 14.443015563584334, "language_loss": 0.73252249, "learning_rate": 4.0878966330935506e-07, "loss": 0.74670172, "num_input_tokens_seen": 286676870, "router_z_loss_clip": 2.08886719, "router_z_loss_mlp": 0.23876953, "step": 13290, "time_per_iteration": 2.6194076538085938 }, { "auxiliary_loss_clip": 0.01250475, "auxiliary_loss_mlp": 0.00196182, "balance_loss_clip": 1.0329417, "balance_loss_mlp": 0.17322268, "epoch": 0.7990981512099805, "flos": 20879936127360.0, "grad_norm": 12.189201392432244, "language_loss": 0.79401398, "learning_rate": 4.08553751558248e-07, "loss": 0.8084805, "num_input_tokens_seen": 286694300, "router_z_loss_clip": 2.17480469, "router_z_loss_mlp": 0.22961426, "step": 13291, "time_per_iteration": 2.7054336071014404 }, { "auxiliary_loss_clip": 0.01237396, "auxiliary_loss_mlp": 0.00201053, "balance_loss_clip": 1.02459979, "balance_loss_mlp": 0.17759246, "epoch": 0.7991582744626484, "flos": 26099911180800.0, "grad_norm": 8.769158630480351, "language_loss": 0.70491695, "learning_rate": 4.083179001549422e-07, "loss": 0.71930146, "num_input_tokens_seen": 286714545, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.23461914, "step": 13292, "time_per_iteration": 2.6829683780670166 }, { "auxiliary_loss_clip": 0.0124162, "auxiliary_loss_mlp": 0.00227892, "balance_loss_clip": 1.02610767, "balance_loss_mlp": 0.20360877, "epoch": 0.7992183977153164, "flos": 35295843605760.0, "grad_norm": 9.850675043188652, "language_loss": 0.62243438, "learning_rate": 4.0808210910838105e-07, "loss": 0.63712949, "num_input_tokens_seen": 286734525, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.24291992, "step": 13293, "time_per_iteration": 2.7421510219573975 }, { "auxiliary_loss_clip": 0.01246654, "auxiliary_loss_mlp": 0.00206435, "balance_loss_clip": 1.02940965, "balance_loss_mlp": 0.18229511, "epoch": 0.7992785209679844, "flos": 51853426577280.0, "grad_norm": 10.660722634331274, "language_loss": 0.82678676, "learning_rate": 4.0784637842750704e-07, "loss": 0.84131765, "num_input_tokens_seen": 286753430, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.24157715, "step": 13294, "time_per_iteration": 2.8733322620391846 }, { "auxiliary_loss_clip": 0.01260881, "auxiliary_loss_mlp": 0.00218382, "balance_loss_clip": 1.03981543, "balance_loss_mlp": 0.19306178, "epoch": 0.7993386442206524, "flos": 22565116650240.0, "grad_norm": 87.16052505754796, "language_loss": 0.81738609, "learning_rate": 4.0761070812125675e-07, "loss": 0.83217871, "num_input_tokens_seen": 286771915, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.2532959, "step": 13295, "time_per_iteration": 2.6623427867889404 }, { "auxiliary_loss_clip": 0.01244481, "auxiliary_loss_mlp": 0.00220041, "balance_loss_clip": 1.03223658, "balance_loss_mlp": 0.19850026, "epoch": 0.7993987674733203, "flos": 18800277465600.0, "grad_norm": 12.605196911419164, "language_loss": 0.84981537, "learning_rate": 4.0737509819856797e-07, "loss": 0.86446059, "num_input_tokens_seen": 286789835, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.2154541, "step": 13296, "time_per_iteration": 2.6478521823883057 }, { "auxiliary_loss_clip": 0.01165275, "auxiliary_loss_mlp": 0.00097526, "balance_loss_clip": 1.02666259, "balance_loss_mlp": 0.08922907, "epoch": 0.7994588907259883, "flos": 69421720394880.0, "grad_norm": 0.6826416606507184, "language_loss": 0.60390925, "learning_rate": 4.0713954866837573e-07, "loss": 0.61653721, "num_input_tokens_seen": 286855580, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08300781, "step": 13297, "time_per_iteration": 3.286343812942505 }, { "auxiliary_loss_clip": 0.01234687, "auxiliary_loss_mlp": 0.00198198, "balance_loss_clip": 1.02513433, "balance_loss_mlp": 0.17621566, "epoch": 0.7995190139786562, "flos": 13480327883520.0, "grad_norm": 5.409362748078585, "language_loss": 0.80898333, "learning_rate": 4.0690405953961073e-07, "loss": 0.82331216, "num_input_tokens_seen": 286874360, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.21972656, "step": 13298, "time_per_iteration": 2.6769845485687256 }, { "auxiliary_loss_clip": 0.01270649, "auxiliary_loss_mlp": 0.00217794, "balance_loss_clip": 1.04625058, "balance_loss_mlp": 0.19206814, "epoch": 0.7995791372313242, "flos": 21652842003840.0, "grad_norm": 6.023256678626334, "language_loss": 0.83364415, "learning_rate": 4.066686308212037e-07, "loss": 0.84852862, "num_input_tokens_seen": 286891950, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.25720215, "step": 13299, "time_per_iteration": 2.781677722930908 }, { "auxiliary_loss_clip": 0.01246528, "auxiliary_loss_mlp": 0.00216899, "balance_loss_clip": 1.03461349, "balance_loss_mlp": 0.1943799, "epoch": 0.7996392604839921, "flos": 26068130622720.0, "grad_norm": 6.344094059529175, "language_loss": 0.84375632, "learning_rate": 4.064332625220828e-07, "loss": 0.85839057, "num_input_tokens_seen": 286911725, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.2253418, "step": 13300, "time_per_iteration": 2.7108616828918457 }, { "auxiliary_loss_clip": 0.01256214, "auxiliary_loss_mlp": 0.0022044, "balance_loss_clip": 1.03594065, "balance_loss_mlp": 0.19576378, "epoch": 0.7996993837366602, "flos": 24606889441920.0, "grad_norm": 23.92639154449928, "language_loss": 0.72664154, "learning_rate": 4.0619795465117115e-07, "loss": 0.74140811, "num_input_tokens_seen": 286931400, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24707031, "step": 13301, "time_per_iteration": 2.701855182647705 }, { "auxiliary_loss_clip": 0.01261743, "auxiliary_loss_mlp": 0.0024257, "balance_loss_clip": 1.04379177, "balance_loss_mlp": 0.2154264, "epoch": 0.7997595069893281, "flos": 20992049452800.0, "grad_norm": 18.535204670228858, "language_loss": 0.79499316, "learning_rate": 4.059627072173928e-07, "loss": 0.81003624, "num_input_tokens_seen": 286949795, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.27172852, "step": 13302, "time_per_iteration": 2.6812937259674072 }, { "auxiliary_loss_clip": 0.01256816, "auxiliary_loss_mlp": 0.00238105, "balance_loss_clip": 1.03721011, "balance_loss_mlp": 0.21327372, "epoch": 0.7998196302419961, "flos": 24426510318720.0, "grad_norm": 17.022303540396287, "language_loss": 0.91394842, "learning_rate": 4.057275202296684e-07, "loss": 0.92889762, "num_input_tokens_seen": 286968805, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24829102, "step": 13303, "time_per_iteration": 2.699009895324707 }, { "auxiliary_loss_clip": 0.0122414, "auxiliary_loss_mlp": 0.00212711, "balance_loss_clip": 1.01629722, "balance_loss_mlp": 0.1913729, "epoch": 0.7998797534946641, "flos": 30264651457920.0, "grad_norm": 3.110660103770848, "language_loss": 0.67668688, "learning_rate": 4.054923936969166e-07, "loss": 0.69105542, "num_input_tokens_seen": 286990235, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.21350098, "step": 13304, "time_per_iteration": 2.7082479000091553 }, { "auxiliary_loss_clip": 0.01233327, "auxiliary_loss_mlp": 0.00207822, "balance_loss_clip": 1.01777673, "balance_loss_mlp": 0.18467206, "epoch": 0.799939876747332, "flos": 23513984277120.0, "grad_norm": 38.885444674562834, "language_loss": 0.76425844, "learning_rate": 4.0525732762805265e-07, "loss": 0.77866995, "num_input_tokens_seen": 287011060, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23156738, "step": 13305, "time_per_iteration": 2.6777825355529785 }, { "auxiliary_loss_clip": 0.01236867, "auxiliary_loss_mlp": 0.0023555, "balance_loss_clip": 1.02734983, "balance_loss_mlp": 0.21251842, "epoch": 0.8, "flos": 19318109886720.0, "grad_norm": 6.234388367366129, "language_loss": 0.76192635, "learning_rate": 4.0502232203199107e-07, "loss": 0.77665055, "num_input_tokens_seen": 287029215, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.23022461, "step": 13306, "time_per_iteration": 2.6380460262298584 }, { "auxiliary_loss_clip": 0.01248651, "auxiliary_loss_mlp": 0.00236471, "balance_loss_clip": 1.03186142, "balance_loss_mlp": 0.21171147, "epoch": 0.800060123252668, "flos": 32412432263040.0, "grad_norm": 4725.8635618591225, "language_loss": 0.76579297, "learning_rate": 4.0478737691764286e-07, "loss": 0.78064418, "num_input_tokens_seen": 287050855, "router_z_loss_clip": 2.16894531, "router_z_loss_mlp": 0.24731445, "step": 13307, "time_per_iteration": 2.736438274383545 }, { "auxiliary_loss_clip": 0.01236823, "auxiliary_loss_mlp": 0.00232907, "balance_loss_clip": 1.02257633, "balance_loss_mlp": 0.2094464, "epoch": 0.800120246505336, "flos": 20010611168640.0, "grad_norm": 12.619989079260478, "language_loss": 0.85518718, "learning_rate": 4.0455249229391677e-07, "loss": 0.86988443, "num_input_tokens_seen": 287069915, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.23461914, "step": 13308, "time_per_iteration": 2.633929491043091 }, { "auxiliary_loss_clip": 0.01264141, "auxiliary_loss_mlp": 0.00229141, "balance_loss_clip": 1.03814209, "balance_loss_mlp": 0.20397568, "epoch": 0.8001803697580039, "flos": 31868278151040.0, "grad_norm": 5.514510663790233, "language_loss": 0.84383506, "learning_rate": 4.0431766816972e-07, "loss": 0.85876787, "num_input_tokens_seen": 287091450, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.25158691, "step": 13309, "time_per_iteration": 2.7503931522369385 }, { "auxiliary_loss_clip": 0.01169723, "auxiliary_loss_mlp": 0.00147439, "balance_loss_clip": 1.02973199, "balance_loss_mlp": 0.13866562, "epoch": 0.8002404930106719, "flos": 63392066916480.0, "grad_norm": 0.8903896692886496, "language_loss": 0.63918954, "learning_rate": 4.040829045539571e-07, "loss": 0.65236115, "num_input_tokens_seen": 287148365, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.08789062, "step": 13310, "time_per_iteration": 3.0954532623291016 }, { "auxiliary_loss_clip": 0.01250569, "auxiliary_loss_mlp": 0.00239907, "balance_loss_clip": 1.03254592, "balance_loss_mlp": 0.21587458, "epoch": 0.8003006162633398, "flos": 27855476403840.0, "grad_norm": 12.37433616509396, "language_loss": 0.91808617, "learning_rate": 4.0384820145553156e-07, "loss": 0.93299097, "num_input_tokens_seen": 287168280, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.24023438, "step": 13311, "time_per_iteration": 4.143649339675903 }, { "auxiliary_loss_clip": 0.01251982, "auxiliary_loss_mlp": 0.00217726, "balance_loss_clip": 1.03278494, "balance_loss_mlp": 0.1937415, "epoch": 0.8003607395160078, "flos": 18223337214720.0, "grad_norm": 14.19547829874809, "language_loss": 0.74998879, "learning_rate": 4.0361355888334116e-07, "loss": 0.76468581, "num_input_tokens_seen": 287185980, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.23986816, "step": 13312, "time_per_iteration": 2.5976626873016357 }, { "auxiliary_loss_clip": 0.01260148, "auxiliary_loss_mlp": 0.00236474, "balance_loss_clip": 1.03470135, "balance_loss_mlp": 0.21095164, "epoch": 0.8004208627686757, "flos": 20886975192960.0, "grad_norm": 19.623763667929808, "language_loss": 0.82742548, "learning_rate": 4.033789768462843e-07, "loss": 0.84239173, "num_input_tokens_seen": 287203875, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.25549316, "step": 13313, "time_per_iteration": 4.166054725646973 }, { "auxiliary_loss_clip": 0.01246592, "auxiliary_loss_mlp": 0.002576, "balance_loss_clip": 1.02906442, "balance_loss_mlp": 0.23251849, "epoch": 0.8004809860213438, "flos": 26436143416320.0, "grad_norm": 10.741185378911528, "language_loss": 0.81799328, "learning_rate": 4.031444553532575e-07, "loss": 0.83303523, "num_input_tokens_seen": 287226445, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.25085449, "step": 13314, "time_per_iteration": 2.8518967628479004 }, { "auxiliary_loss_clip": 0.01169184, "auxiliary_loss_mlp": 0.00067298, "balance_loss_clip": 1.03642321, "balance_loss_mlp": 0.0589054, "epoch": 0.8005411092740117, "flos": 63648612829440.0, "grad_norm": 0.7505227835315214, "language_loss": 0.53426361, "learning_rate": 4.029099944131522e-07, "loss": 0.54662842, "num_input_tokens_seen": 287286240, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.08398438, "step": 13315, "time_per_iteration": 3.0841362476348877 }, { "auxiliary_loss_clip": 0.01239473, "auxiliary_loss_mlp": 0.00233234, "balance_loss_clip": 1.02565324, "balance_loss_mlp": 0.20921339, "epoch": 0.8006012325266797, "flos": 36138056774400.0, "grad_norm": 139.2078915974386, "language_loss": 0.77404112, "learning_rate": 4.026755940348603e-07, "loss": 0.78876811, "num_input_tokens_seen": 287310265, "router_z_loss_clip": 2.13769531, "router_z_loss_mlp": 0.24047852, "step": 13316, "time_per_iteration": 2.8092644214630127 }, { "auxiliary_loss_clip": 0.01262383, "auxiliary_loss_mlp": 0.00234517, "balance_loss_clip": 1.04227352, "balance_loss_mlp": 0.21110471, "epoch": 0.8006613557793477, "flos": 33838947970560.0, "grad_norm": 9.200924746423478, "language_loss": 0.73046958, "learning_rate": 4.024412542272706e-07, "loss": 0.74543858, "num_input_tokens_seen": 287331610, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.23413086, "step": 13317, "time_per_iteration": 2.8291587829589844 }, { "auxiliary_loss_clip": 0.01156032, "auxiliary_loss_mlp": 0.00092233, "balance_loss_clip": 1.02565026, "balance_loss_mlp": 0.08264848, "epoch": 0.8007214790320156, "flos": 67348310699520.0, "grad_norm": 37.0108842143121, "language_loss": 0.57998139, "learning_rate": 4.0220697499926783e-07, "loss": 0.59246403, "num_input_tokens_seen": 287394795, "router_z_loss_clip": 1.3046875, "router_z_loss_mlp": 0.09570312, "step": 13318, "time_per_iteration": 4.730731010437012 }, { "auxiliary_loss_clip": 0.01248159, "auxiliary_loss_mlp": 0.00241581, "balance_loss_clip": 1.03264701, "balance_loss_mlp": 0.21736917, "epoch": 0.8007816022846836, "flos": 23185653033600.0, "grad_norm": 10.644071373184, "language_loss": 0.72693861, "learning_rate": 4.019727563597366e-07, "loss": 0.74183601, "num_input_tokens_seen": 287414595, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24243164, "step": 13319, "time_per_iteration": 2.7234225273132324 }, { "auxiliary_loss_clip": 0.01272517, "auxiliary_loss_mlp": 0.00244091, "balance_loss_clip": 1.04718542, "balance_loss_mlp": 0.21707842, "epoch": 0.8008417255373516, "flos": 21981388728960.0, "grad_norm": 5.50458695864645, "language_loss": 0.82008284, "learning_rate": 4.0173859831755873e-07, "loss": 0.83524895, "num_input_tokens_seen": 287434395, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.27026367, "step": 13320, "time_per_iteration": 2.6549532413482666 }, { "auxiliary_loss_clip": 0.01247052, "auxiliary_loss_mlp": 0.00246568, "balance_loss_clip": 1.02928138, "balance_loss_mlp": 0.21984088, "epoch": 0.8009018487900196, "flos": 16727334647040.0, "grad_norm": 9.865385512779099, "language_loss": 0.87542582, "learning_rate": 4.015045008816138e-07, "loss": 0.89036202, "num_input_tokens_seen": 287450590, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.26708984, "step": 13321, "time_per_iteration": 2.658527374267578 }, { "auxiliary_loss_clip": 0.01226318, "auxiliary_loss_mlp": 0.00218291, "balance_loss_clip": 1.01878607, "balance_loss_mlp": 0.19400841, "epoch": 0.8009619720426875, "flos": 20813609664000.0, "grad_norm": 13.794431951452433, "language_loss": 0.75439554, "learning_rate": 4.0127046406077825e-07, "loss": 0.76884162, "num_input_tokens_seen": 287468455, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.24279785, "step": 13322, "time_per_iteration": 4.093682527542114 }, { "auxiliary_loss_clip": 0.01255104, "auxiliary_loss_mlp": 0.00239711, "balance_loss_clip": 1.03626621, "balance_loss_mlp": 0.21507019, "epoch": 0.8010220952953555, "flos": 17931096161280.0, "grad_norm": 6.0271863287017435, "language_loss": 0.85842311, "learning_rate": 4.010364878639265e-07, "loss": 0.87337124, "num_input_tokens_seen": 287486485, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24645996, "step": 13323, "time_per_iteration": 2.638410806655884 }, { "auxiliary_loss_clip": 0.01254425, "auxiliary_loss_mlp": 0.00232007, "balance_loss_clip": 1.0309484, "balance_loss_mlp": 0.20663974, "epoch": 0.8010822185480234, "flos": 24572235795840.0, "grad_norm": 30.850466085548575, "language_loss": 0.81758034, "learning_rate": 4.00802572299932e-07, "loss": 0.83244467, "num_input_tokens_seen": 287503940, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25378418, "step": 13324, "time_per_iteration": 2.7644364833831787 }, { "auxiliary_loss_clip": 0.01271601, "auxiliary_loss_mlp": 0.00237172, "balance_loss_clip": 1.04468071, "balance_loss_mlp": 0.21120842, "epoch": 0.8011423418006914, "flos": 21829988903040.0, "grad_norm": 30.963216070949777, "language_loss": 0.84231734, "learning_rate": 4.005687173776635e-07, "loss": 0.85740507, "num_input_tokens_seen": 287521660, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.25939941, "step": 13325, "time_per_iteration": 2.6473357677459717 }, { "auxiliary_loss_clip": 0.01232688, "auxiliary_loss_mlp": 0.0023196, "balance_loss_clip": 1.02338266, "balance_loss_mlp": 0.210788, "epoch": 0.8012024650533593, "flos": 23915178259200.0, "grad_norm": 161.34635696752164, "language_loss": 0.86002553, "learning_rate": 4.003349231059898e-07, "loss": 0.87467194, "num_input_tokens_seen": 287541505, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.21179199, "step": 13326, "time_per_iteration": 2.7384510040283203 }, { "auxiliary_loss_clip": 0.01236845, "auxiliary_loss_mlp": 0.00210055, "balance_loss_clip": 1.02598953, "balance_loss_mlp": 0.18697609, "epoch": 0.8012625883060274, "flos": 23587062497280.0, "grad_norm": 25.35427691245447, "language_loss": 0.73929614, "learning_rate": 4.001011894937765e-07, "loss": 0.75376511, "num_input_tokens_seen": 287560015, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.2310791, "step": 13327, "time_per_iteration": 2.68664813041687 }, { "auxiliary_loss_clip": 0.01222487, "auxiliary_loss_mlp": 0.00220997, "balance_loss_clip": 1.01402974, "balance_loss_mlp": 0.19797784, "epoch": 0.8013227115586953, "flos": 20813932886400.0, "grad_norm": 46.80935288864243, "language_loss": 0.80416256, "learning_rate": 3.9986751654988636e-07, "loss": 0.81859744, "num_input_tokens_seen": 287579150, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.22998047, "step": 13328, "time_per_iteration": 2.6755244731903076 }, { "auxiliary_loss_clip": 0.01262573, "auxiliary_loss_mlp": 0.00222169, "balance_loss_clip": 1.0359478, "balance_loss_mlp": 0.19564503, "epoch": 0.8013828348113633, "flos": 15888317788800.0, "grad_norm": 23.857715401436934, "language_loss": 0.83983791, "learning_rate": 3.996339042831798e-07, "loss": 0.85468537, "num_input_tokens_seen": 287597420, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26525879, "step": 13329, "time_per_iteration": 2.643158435821533 }, { "auxiliary_loss_clip": 0.0114824, "auxiliary_loss_mlp": 0.0009657, "balance_loss_clip": 1.01178992, "balance_loss_mlp": 0.08874957, "epoch": 0.8014429580640313, "flos": 71062981562880.0, "grad_norm": 0.7281421273228869, "language_loss": 0.5176186, "learning_rate": 3.9940035270251605e-07, "loss": 0.53006667, "num_input_tokens_seen": 287667280, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.078125, "step": 13330, "time_per_iteration": 3.2616822719573975 }, { "auxiliary_loss_clip": 0.01273737, "auxiliary_loss_mlp": 0.00239242, "balance_loss_clip": 1.04442763, "balance_loss_mlp": 0.21081066, "epoch": 0.8015030813166992, "flos": 23076340968960.0, "grad_norm": 141.26607903430966, "language_loss": 0.82844937, "learning_rate": 3.991668618167519e-07, "loss": 0.84357917, "num_input_tokens_seen": 287687375, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.2845459, "step": 13331, "time_per_iteration": 2.76023006439209 }, { "auxiliary_loss_clip": 0.01253461, "auxiliary_loss_mlp": 0.00225075, "balance_loss_clip": 1.03474998, "balance_loss_mlp": 0.20241365, "epoch": 0.8015632045693672, "flos": 21872328059520.0, "grad_norm": 15.966703536998855, "language_loss": 0.85656261, "learning_rate": 3.989334316347401e-07, "loss": 0.87134796, "num_input_tokens_seen": 287707895, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.22668457, "step": 13332, "time_per_iteration": 2.7064309120178223 }, { "auxiliary_loss_clip": 0.01248391, "auxiliary_loss_mlp": 0.00210581, "balance_loss_clip": 1.03058589, "balance_loss_mlp": 0.1872994, "epoch": 0.8016233278220352, "flos": 23656728925440.0, "grad_norm": 550.6250363063909, "language_loss": 0.92105079, "learning_rate": 3.987000621653338e-07, "loss": 0.93564057, "num_input_tokens_seen": 287723990, "router_z_loss_clip": 2.17480469, "router_z_loss_mlp": 0.23291016, "step": 13333, "time_per_iteration": 2.6614484786987305 }, { "auxiliary_loss_clip": 0.01261054, "auxiliary_loss_mlp": 0.00237031, "balance_loss_clip": 1.04259574, "balance_loss_mlp": 0.21084026, "epoch": 0.8016834510747032, "flos": 16253170185600.0, "grad_norm": 3.1050735853742153, "language_loss": 0.83754075, "learning_rate": 3.9846675341738133e-07, "loss": 0.85252154, "num_input_tokens_seen": 287742380, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.26196289, "step": 13334, "time_per_iteration": 2.6660256385803223 }, { "auxiliary_loss_clip": 0.01243336, "auxiliary_loss_mlp": 0.00213839, "balance_loss_clip": 1.03162324, "balance_loss_mlp": 0.19189245, "epoch": 0.8017435743273711, "flos": 12276027665280.0, "grad_norm": 10.9289256331366, "language_loss": 0.84959412, "learning_rate": 3.9823350539972967e-07, "loss": 0.86416584, "num_input_tokens_seen": 287760130, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.21960449, "step": 13335, "time_per_iteration": 2.789978265762329 }, { "auxiliary_loss_clip": 0.01239604, "auxiliary_loss_mlp": 0.00238372, "balance_loss_clip": 1.02638066, "balance_loss_mlp": 0.21423197, "epoch": 0.8018036975800391, "flos": 17196112068480.0, "grad_norm": 100.94040730779551, "language_loss": 0.83407229, "learning_rate": 3.9800031812122416e-07, "loss": 0.84885204, "num_input_tokens_seen": 287777565, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24145508, "step": 13336, "time_per_iteration": 2.6316394805908203 }, { "auxiliary_loss_clip": 0.01289011, "auxiliary_loss_mlp": 0.00246087, "balance_loss_clip": 1.05656385, "balance_loss_mlp": 0.21826372, "epoch": 0.801863820832707, "flos": 20631865824000.0, "grad_norm": 41.0829768475869, "language_loss": 0.85865968, "learning_rate": 3.977671915907068e-07, "loss": 0.87401068, "num_input_tokens_seen": 287796310, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.27832031, "step": 13337, "time_per_iteration": 2.66713547706604 }, { "auxiliary_loss_clip": 0.01253497, "auxiliary_loss_mlp": 0.00238593, "balance_loss_clip": 1.03026509, "balance_loss_mlp": 0.21359463, "epoch": 0.801923944085375, "flos": 30445569285120.0, "grad_norm": 4.1154273739625475, "language_loss": 0.87915128, "learning_rate": 3.9753412581701883e-07, "loss": 0.89407218, "num_input_tokens_seen": 287817330, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.25024414, "step": 13338, "time_per_iteration": 2.7267210483551025 }, { "auxiliary_loss_clip": 0.01262631, "auxiliary_loss_mlp": 0.00243513, "balance_loss_clip": 1.03338671, "balance_loss_mlp": 0.21617785, "epoch": 0.801984067338043, "flos": 20010575255040.0, "grad_norm": 40.87659454273491, "language_loss": 0.83817852, "learning_rate": 3.9730112080899733e-07, "loss": 0.85323995, "num_input_tokens_seen": 287835095, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.27331543, "step": 13339, "time_per_iteration": 2.6773712635040283 }, { "auxiliary_loss_clip": 0.01230098, "auxiliary_loss_mlp": 0.00235633, "balance_loss_clip": 1.01570952, "balance_loss_mlp": 0.21189804, "epoch": 0.802044190590711, "flos": 22784028088320.0, "grad_norm": 28.600044238528948, "language_loss": 0.84815347, "learning_rate": 3.970681765754775e-07, "loss": 0.86281079, "num_input_tokens_seen": 287854595, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.23718262, "step": 13340, "time_per_iteration": 2.694871664047241 }, { "auxiliary_loss_clip": 0.01247634, "auxiliary_loss_mlp": 0.00228216, "balance_loss_clip": 1.03291106, "balance_loss_mlp": 0.20611452, "epoch": 0.8021043138433789, "flos": 27600115639680.0, "grad_norm": 8.035370125091926, "language_loss": 0.7485615, "learning_rate": 3.968352931252936e-07, "loss": 0.76332003, "num_input_tokens_seen": 287876960, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.22106934, "step": 13341, "time_per_iteration": 2.707085371017456 }, { "auxiliary_loss_clip": 0.01130056, "auxiliary_loss_mlp": 0.00091603, "balance_loss_clip": 0.9924407, "balance_loss_mlp": 0.08344873, "epoch": 0.8021644370960469, "flos": 62063730057600.0, "grad_norm": 0.7893093306031082, "language_loss": 0.60218072, "learning_rate": 3.9660247046727547e-07, "loss": 0.61439729, "num_input_tokens_seen": 287936530, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08154297, "step": 13342, "time_per_iteration": 3.132772922515869 }, { "auxiliary_loss_clip": 0.01267858, "auxiliary_loss_mlp": 0.00252464, "balance_loss_clip": 1.04352903, "balance_loss_mlp": 0.22663185, "epoch": 0.8022245603487148, "flos": 23361794352000.0, "grad_norm": 23.99942322000829, "language_loss": 0.70732796, "learning_rate": 3.963697086102522e-07, "loss": 0.72253114, "num_input_tokens_seen": 287954285, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.25805664, "step": 13343, "time_per_iteration": 2.681363582611084 }, { "auxiliary_loss_clip": 0.0124556, "auxiliary_loss_mlp": 0.00218167, "balance_loss_clip": 1.03194833, "balance_loss_mlp": 0.19690034, "epoch": 0.8022846836013828, "flos": 10853354712960.0, "grad_norm": 57.01862378702845, "language_loss": 0.78110051, "learning_rate": 3.96137007563051e-07, "loss": 0.7957378, "num_input_tokens_seen": 287971595, "router_z_loss_clip": 2.13574219, "router_z_loss_mlp": 0.21264648, "step": 13344, "time_per_iteration": 2.76662015914917 }, { "auxiliary_loss_clip": 0.01256586, "auxiliary_loss_mlp": 0.00236444, "balance_loss_clip": 1.03482771, "balance_loss_mlp": 0.21192294, "epoch": 0.8023448068540509, "flos": 29240443054080.0, "grad_norm": 5.9417861497591336, "language_loss": 0.77748871, "learning_rate": 3.9590436733449506e-07, "loss": 0.79241902, "num_input_tokens_seen": 287992540, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.24536133, "step": 13345, "time_per_iteration": 2.7502329349517822 }, { "auxiliary_loss_clip": 0.01135481, "auxiliary_loss_mlp": 0.00107311, "balance_loss_clip": 0.9968884, "balance_loss_mlp": 0.09844204, "epoch": 0.8024049301067188, "flos": 64153588181760.0, "grad_norm": 0.8927187641369104, "language_loss": 0.62292808, "learning_rate": 3.956717879334059e-07, "loss": 0.63535601, "num_input_tokens_seen": 288052810, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08886719, "step": 13346, "time_per_iteration": 3.198486089706421 }, { "auxiliary_loss_clip": 0.01245775, "auxiliary_loss_mlp": 0.00225083, "balance_loss_clip": 1.02880907, "balance_loss_mlp": 0.20183763, "epoch": 0.8024650533593868, "flos": 28585360765440.0, "grad_norm": 13.437239300558874, "language_loss": 0.82039237, "learning_rate": 3.9543926936860327e-07, "loss": 0.83510101, "num_input_tokens_seen": 288073045, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.23242188, "step": 13347, "time_per_iteration": 2.7025835514068604 }, { "auxiliary_loss_clip": 0.01266283, "auxiliary_loss_mlp": 0.00230912, "balance_loss_clip": 1.03794754, "balance_loss_mlp": 0.20530613, "epoch": 0.8025251766120547, "flos": 16982264448000.0, "grad_norm": 70.36423090610836, "language_loss": 0.83024544, "learning_rate": 3.9520681164890493e-07, "loss": 0.84521747, "num_input_tokens_seen": 288091165, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.25598145, "step": 13348, "time_per_iteration": 2.6590094566345215 }, { "auxiliary_loss_clip": 0.01261738, "auxiliary_loss_mlp": 0.00238203, "balance_loss_clip": 1.04118407, "balance_loss_mlp": 0.21322861, "epoch": 0.8025852998647227, "flos": 22163671272960.0, "grad_norm": 11.006415164490289, "language_loss": 0.85313785, "learning_rate": 3.9497441478312444e-07, "loss": 0.86813724, "num_input_tokens_seen": 288110595, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24963379, "step": 13349, "time_per_iteration": 2.712337017059326 }, { "auxiliary_loss_clip": 0.01245711, "auxiliary_loss_mlp": 0.00219925, "balance_loss_clip": 1.03271246, "balance_loss_mlp": 0.1979664, "epoch": 0.8026454231173906, "flos": 22017012042240.0, "grad_norm": 7.580473379573667, "language_loss": 0.90387166, "learning_rate": 3.947420787800755e-07, "loss": 0.91852802, "num_input_tokens_seen": 288128995, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.21960449, "step": 13350, "time_per_iteration": 2.6711838245391846 }, { "auxiliary_loss_clip": 0.01238474, "auxiliary_loss_mlp": 0.00196337, "balance_loss_clip": 1.02410853, "balance_loss_mlp": 0.17335302, "epoch": 0.8027055463700586, "flos": 22491320158080.0, "grad_norm": 27.22096605772479, "language_loss": 0.7778008, "learning_rate": 3.945098036485679e-07, "loss": 0.79214895, "num_input_tokens_seen": 288149265, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.2298584, "step": 13351, "time_per_iteration": 2.7243239879608154 }, { "auxiliary_loss_clip": 0.01257721, "auxiliary_loss_mlp": 0.00254811, "balance_loss_clip": 1.03902221, "balance_loss_mlp": 0.23024245, "epoch": 0.8027656696227266, "flos": 28912901909760.0, "grad_norm": 59.958548515775725, "language_loss": 0.68042499, "learning_rate": 3.9427758939740885e-07, "loss": 0.69555032, "num_input_tokens_seen": 288170745, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24584961, "step": 13352, "time_per_iteration": 2.7727866172790527 }, { "auxiliary_loss_clip": 0.0124882, "auxiliary_loss_mlp": 0.00238562, "balance_loss_clip": 1.03333926, "balance_loss_mlp": 0.21423134, "epoch": 0.8028257928753946, "flos": 18589374760320.0, "grad_norm": 180.43140815835739, "language_loss": 0.84499592, "learning_rate": 3.940454360354046e-07, "loss": 0.85986972, "num_input_tokens_seen": 288189415, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.24353027, "step": 13353, "time_per_iteration": 2.6403348445892334 }, { "auxiliary_loss_clip": 0.01290813, "auxiliary_loss_mlp": 0.00258958, "balance_loss_clip": 1.06163442, "balance_loss_mlp": 0.23235051, "epoch": 0.8028859161280625, "flos": 19130009339520.0, "grad_norm": 13.38424889889698, "language_loss": 0.82958329, "learning_rate": 3.938133435713582e-07, "loss": 0.84508103, "num_input_tokens_seen": 288206900, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.26623535, "step": 13354, "time_per_iteration": 4.05318546295166 }, { "auxiliary_loss_clip": 0.0123208, "auxiliary_loss_mlp": 0.00232791, "balance_loss_clip": 1.01566601, "balance_loss_mlp": 0.20863955, "epoch": 0.8029460393807305, "flos": 20229881742720.0, "grad_norm": 83.5248597671148, "language_loss": 0.74910235, "learning_rate": 3.935813120140714e-07, "loss": 0.76375109, "num_input_tokens_seen": 288224800, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24169922, "step": 13355, "time_per_iteration": 4.0628321170806885 }, { "auxiliary_loss_clip": 0.0127832, "auxiliary_loss_mlp": 0.00221015, "balance_loss_clip": 1.05140519, "balance_loss_mlp": 0.19651759, "epoch": 0.8030061626333984, "flos": 49783320933120.0, "grad_norm": 108.85149059242262, "language_loss": 0.78861445, "learning_rate": 3.9334934137234235e-07, "loss": 0.80360782, "num_input_tokens_seen": 288249400, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.24523926, "step": 13356, "time_per_iteration": 2.8984999656677246 }, { "auxiliary_loss_clip": 0.01230792, "auxiliary_loss_mlp": 0.00242097, "balance_loss_clip": 1.01560855, "balance_loss_mlp": 0.21969806, "epoch": 0.8030662858860664, "flos": 21615243442560.0, "grad_norm": 45.49889289481716, "language_loss": 0.8343066, "learning_rate": 3.931174316549666e-07, "loss": 0.8490355, "num_input_tokens_seen": 288268780, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.22399902, "step": 13357, "time_per_iteration": 2.6517579555511475 }, { "auxiliary_loss_clip": 0.01250197, "auxiliary_loss_mlp": 0.0025898, "balance_loss_clip": 1.02914035, "balance_loss_mlp": 0.23425552, "epoch": 0.8031264091387345, "flos": 25630056351360.0, "grad_norm": 47.7594139089667, "language_loss": 0.84940434, "learning_rate": 3.9288558287073937e-07, "loss": 0.86449617, "num_input_tokens_seen": 288290830, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.24719238, "step": 13358, "time_per_iteration": 2.6944169998168945 }, { "auxiliary_loss_clip": 0.01244925, "auxiliary_loss_mlp": 0.0023035, "balance_loss_clip": 1.03067589, "balance_loss_mlp": 0.20550701, "epoch": 0.8031865323914024, "flos": 19646225648640.0, "grad_norm": 7.223557595053155, "language_loss": 0.89886284, "learning_rate": 3.9265379502845143e-07, "loss": 0.91361558, "num_input_tokens_seen": 288308865, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.24829102, "step": 13359, "time_per_iteration": 2.6465065479278564 }, { "auxiliary_loss_clip": 0.01229162, "auxiliary_loss_mlp": 0.00222954, "balance_loss_clip": 1.01619279, "balance_loss_mlp": 0.20059049, "epoch": 0.8032466556440704, "flos": 26169110732160.0, "grad_norm": 5.69112239375238, "language_loss": 0.80982065, "learning_rate": 3.924220681368928e-07, "loss": 0.82434177, "num_input_tokens_seen": 288327325, "router_z_loss_clip": 2.12597656, "router_z_loss_mlp": 0.22375488, "step": 13360, "time_per_iteration": 4.235654354095459 }, { "auxiliary_loss_clip": 0.01248028, "auxiliary_loss_mlp": 0.00244401, "balance_loss_clip": 1.02648234, "balance_loss_mlp": 0.21972466, "epoch": 0.8033067788967383, "flos": 25520026014720.0, "grad_norm": 78.73548626470712, "language_loss": 0.77857566, "learning_rate": 3.921904022048512e-07, "loss": 0.79349989, "num_input_tokens_seen": 288347285, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.24694824, "step": 13361, "time_per_iteration": 2.684387683868408 }, { "auxiliary_loss_clip": 0.0127472, "auxiliary_loss_mlp": 0.00240106, "balance_loss_clip": 1.04683375, "balance_loss_mlp": 0.21440491, "epoch": 0.8033669021494063, "flos": 24024274842240.0, "grad_norm": 136.24253691707455, "language_loss": 0.76891148, "learning_rate": 3.919587972411098e-07, "loss": 0.7840597, "num_input_tokens_seen": 288367785, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.25708008, "step": 13362, "time_per_iteration": 2.6669068336486816 }, { "auxiliary_loss_clip": 0.01292887, "auxiliary_loss_mlp": 0.00239366, "balance_loss_clip": 1.05437064, "balance_loss_mlp": 0.21288951, "epoch": 0.8034270254020742, "flos": 13588059749760.0, "grad_norm": 55.04409575539353, "language_loss": 0.88798356, "learning_rate": 3.91727253254452e-07, "loss": 0.90330613, "num_input_tokens_seen": 288384135, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.26501465, "step": 13363, "time_per_iteration": 2.6693735122680664 }, { "auxiliary_loss_clip": 0.01253219, "auxiliary_loss_mlp": 0.00238332, "balance_loss_clip": 1.03125858, "balance_loss_mlp": 0.21215346, "epoch": 0.8034871486547422, "flos": 27412661537280.0, "grad_norm": 49.48004097073689, "language_loss": 0.83734316, "learning_rate": 3.9149577025365787e-07, "loss": 0.85225868, "num_input_tokens_seen": 288403805, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.26208496, "step": 13364, "time_per_iteration": 4.135625600814819 }, { "auxiliary_loss_clip": 0.01250596, "auxiliary_loss_mlp": 0.00211924, "balance_loss_clip": 1.03789449, "balance_loss_mlp": 0.18817732, "epoch": 0.8035472719074102, "flos": 32598593475840.0, "grad_norm": 7.316695139101207, "language_loss": 0.70202166, "learning_rate": 3.9126434824750596e-07, "loss": 0.71664685, "num_input_tokens_seen": 288424895, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.23754883, "step": 13365, "time_per_iteration": 2.7325398921966553 }, { "auxiliary_loss_clip": 0.01258917, "auxiliary_loss_mlp": 0.0023792, "balance_loss_clip": 1.03571105, "balance_loss_mlp": 0.2125161, "epoch": 0.8036073951600782, "flos": 21287989607040.0, "grad_norm": 190.10057254573832, "language_loss": 0.76276237, "learning_rate": 3.910329872447706e-07, "loss": 0.77773076, "num_input_tokens_seen": 288443865, "router_z_loss_clip": 2.23144531, "router_z_loss_mlp": 0.25390625, "step": 13366, "time_per_iteration": 2.7344837188720703 }, { "auxiliary_loss_clip": 0.01239281, "auxiliary_loss_mlp": 0.00227204, "balance_loss_clip": 1.02651763, "balance_loss_mlp": 0.20398186, "epoch": 0.8036675184127461, "flos": 18113845582080.0, "grad_norm": 25.887449963571843, "language_loss": 0.82626504, "learning_rate": 3.908016872542259e-07, "loss": 0.84092987, "num_input_tokens_seen": 288461065, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.2322998, "step": 13367, "time_per_iteration": 2.668088436126709 }, { "auxiliary_loss_clip": 0.01236995, "auxiliary_loss_mlp": 0.00255311, "balance_loss_clip": 1.02092957, "balance_loss_mlp": 0.23249485, "epoch": 0.8037276416654141, "flos": 26030280666240.0, "grad_norm": 31.075213940065026, "language_loss": 0.79836792, "learning_rate": 3.905704482846428e-07, "loss": 0.81329101, "num_input_tokens_seen": 288481865, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.22839355, "step": 13368, "time_per_iteration": 2.70517635345459 }, { "auxiliary_loss_clip": 0.01266416, "auxiliary_loss_mlp": 0.00232032, "balance_loss_clip": 1.04115427, "balance_loss_mlp": 0.2067356, "epoch": 0.803787764918082, "flos": 18802180886400.0, "grad_norm": 113.14697651812297, "language_loss": 0.77455181, "learning_rate": 3.90339270344789e-07, "loss": 0.78953624, "num_input_tokens_seen": 288499345, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.25268555, "step": 13369, "time_per_iteration": 2.637619733810425 }, { "auxiliary_loss_clip": 0.0124085, "auxiliary_loss_mlp": 0.00249227, "balance_loss_clip": 1.02665401, "balance_loss_mlp": 0.22390737, "epoch": 0.80384788817075, "flos": 20225787592320.0, "grad_norm": 1.8972763831888355, "language_loss": 0.80979264, "learning_rate": 3.901081534434312e-07, "loss": 0.82469338, "num_input_tokens_seen": 288517660, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.25341797, "step": 13370, "time_per_iteration": 2.657428503036499 }, { "auxiliary_loss_clip": 0.01259928, "auxiliary_loss_mlp": 0.00235682, "balance_loss_clip": 1.03652656, "balance_loss_mlp": 0.20988479, "epoch": 0.8039080114234181, "flos": 18515290959360.0, "grad_norm": 176.30559898324182, "language_loss": 0.98139298, "learning_rate": 3.898770975893342e-07, "loss": 0.99634904, "num_input_tokens_seen": 288534180, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.25805664, "step": 13371, "time_per_iteration": 2.6572108268737793 }, { "auxiliary_loss_clip": 0.01257792, "auxiliary_loss_mlp": 0.00251467, "balance_loss_clip": 1.03254783, "balance_loss_mlp": 0.22447772, "epoch": 0.803968134676086, "flos": 22382510883840.0, "grad_norm": 18.45679510602033, "language_loss": 0.82192779, "learning_rate": 3.89646102791259e-07, "loss": 0.83702034, "num_input_tokens_seen": 288553350, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.26965332, "step": 13372, "time_per_iteration": 2.6380958557128906 }, { "auxiliary_loss_clip": 0.01258101, "auxiliary_loss_mlp": 0.00216393, "balance_loss_clip": 1.03403425, "balance_loss_mlp": 0.19215785, "epoch": 0.804028257928754, "flos": 23842566915840.0, "grad_norm": 76.62934500683733, "language_loss": 0.89280176, "learning_rate": 3.894151690579646e-07, "loss": 0.90754664, "num_input_tokens_seen": 288571325, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.24255371, "step": 13373, "time_per_iteration": 2.717031717300415 }, { "auxiliary_loss_clip": 0.01238785, "auxiliary_loss_mlp": 0.00236687, "balance_loss_clip": 1.02605939, "balance_loss_mlp": 0.21263021, "epoch": 0.8040883811814219, "flos": 23550720912000.0, "grad_norm": 20.43892996652995, "language_loss": 0.83311069, "learning_rate": 3.8918429639820815e-07, "loss": 0.84786546, "num_input_tokens_seen": 288592100, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.24035645, "step": 13374, "time_per_iteration": 2.6644508838653564 }, { "auxiliary_loss_clip": 0.01273537, "auxiliary_loss_mlp": 0.00262213, "balance_loss_clip": 1.04318357, "balance_loss_mlp": 0.23524803, "epoch": 0.8041485044340899, "flos": 19026263882880.0, "grad_norm": 1343.961670477193, "language_loss": 0.8037284, "learning_rate": 3.889534848207452e-07, "loss": 0.81908584, "num_input_tokens_seen": 288612305, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.26965332, "step": 13375, "time_per_iteration": 2.6703829765319824 }, { "auxiliary_loss_clip": 0.01138887, "auxiliary_loss_mlp": 0.0011901, "balance_loss_clip": 1.00126243, "balance_loss_mlp": 0.11023647, "epoch": 0.8042086276867578, "flos": 70005663797760.0, "grad_norm": 8.922791356050814, "language_loss": 0.55136746, "learning_rate": 3.887227343343271e-07, "loss": 0.56394643, "num_input_tokens_seen": 288676015, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08789062, "step": 13376, "time_per_iteration": 3.22613263130188 }, { "auxiliary_loss_clip": 0.01259717, "auxiliary_loss_mlp": 0.00235692, "balance_loss_clip": 1.03685713, "balance_loss_mlp": 0.20977575, "epoch": 0.8042687509394258, "flos": 21872435800320.0, "grad_norm": 8.220750806448063, "language_loss": 0.81260562, "learning_rate": 3.8849204494770425e-07, "loss": 0.82755971, "num_input_tokens_seen": 288696455, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.2590332, "step": 13377, "time_per_iteration": 2.680286407470703 }, { "auxiliary_loss_clip": 0.01249995, "auxiliary_loss_mlp": 0.00224674, "balance_loss_clip": 1.03266716, "balance_loss_mlp": 0.20160744, "epoch": 0.8043288741920938, "flos": 26614870513920.0, "grad_norm": 84.97066140119793, "language_loss": 0.79709738, "learning_rate": 3.8826141666962567e-07, "loss": 0.81184411, "num_input_tokens_seen": 288715560, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.23059082, "step": 13378, "time_per_iteration": 2.813237190246582 }, { "auxiliary_loss_clip": 0.01262085, "auxiliary_loss_mlp": 0.00221871, "balance_loss_clip": 1.03893769, "balance_loss_mlp": 0.1978502, "epoch": 0.8043889974447618, "flos": 33403387651200.0, "grad_norm": 6.824453897003627, "language_loss": 0.75001442, "learning_rate": 3.880308495088347e-07, "loss": 0.76485395, "num_input_tokens_seen": 288739485, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.24023438, "step": 13379, "time_per_iteration": 2.7630767822265625 }, { "auxiliary_loss_clip": 0.01277426, "auxiliary_loss_mlp": 0.00265951, "balance_loss_clip": 1.04223514, "balance_loss_mlp": 0.23625553, "epoch": 0.8044491206974297, "flos": 20375966355840.0, "grad_norm": 28.812975997072872, "language_loss": 0.87465799, "learning_rate": 3.8780034347407533e-07, "loss": 0.89009178, "num_input_tokens_seen": 288757420, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.29663086, "step": 13380, "time_per_iteration": 2.713365316390991 }, { "auxiliary_loss_clip": 0.01250515, "auxiliary_loss_mlp": 0.00250771, "balance_loss_clip": 1.0334667, "balance_loss_mlp": 0.22558242, "epoch": 0.8045092439500977, "flos": 23403810286080.0, "grad_norm": 2.659097751368952, "language_loss": 0.76742923, "learning_rate": 3.875698985740887e-07, "loss": 0.78244209, "num_input_tokens_seen": 288775535, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.25183105, "step": 13381, "time_per_iteration": 2.671079397201538 }, { "auxiliary_loss_clip": 0.01241161, "auxiliary_loss_mlp": 0.00221605, "balance_loss_clip": 1.02440238, "balance_loss_mlp": 0.19787082, "epoch": 0.8045693672027656, "flos": 24097245321600.0, "grad_norm": 40.2647674719375, "language_loss": 0.74620104, "learning_rate": 3.873395148176135e-07, "loss": 0.76082873, "num_input_tokens_seen": 288795035, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.23706055, "step": 13382, "time_per_iteration": 2.687542200088501 }, { "auxiliary_loss_clip": 0.01250689, "auxiliary_loss_mlp": 0.00237702, "balance_loss_clip": 1.02764618, "balance_loss_mlp": 0.21369284, "epoch": 0.8046294904554336, "flos": 27707165147520.0, "grad_norm": 84.00647880272892, "language_loss": 0.85339087, "learning_rate": 3.8710919221338487e-07, "loss": 0.86827475, "num_input_tokens_seen": 288816270, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.24023438, "step": 13383, "time_per_iteration": 2.6875877380371094 }, { "auxiliary_loss_clip": 0.01252318, "auxiliary_loss_mlp": 0.00216847, "balance_loss_clip": 1.03127384, "balance_loss_mlp": 0.19542484, "epoch": 0.8046896137081017, "flos": 24972998814720.0, "grad_norm": 377.4702004881045, "language_loss": 0.7905826, "learning_rate": 3.868789307701381e-07, "loss": 0.80527425, "num_input_tokens_seen": 288836050, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.21435547, "step": 13384, "time_per_iteration": 2.693364143371582 }, { "auxiliary_loss_clip": 0.01275083, "auxiliary_loss_mlp": 0.0026227, "balance_loss_clip": 1.04468215, "balance_loss_mlp": 0.23625837, "epoch": 0.8047497369607696, "flos": 17675484001920.0, "grad_norm": 31.032553415830986, "language_loss": 0.91479015, "learning_rate": 3.8664873049660375e-07, "loss": 0.93016374, "num_input_tokens_seen": 288852900, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.26025391, "step": 13385, "time_per_iteration": 2.6032614707946777 }, { "auxiliary_loss_clip": 0.01251802, "auxiliary_loss_mlp": 0.00233957, "balance_loss_clip": 1.03419256, "balance_loss_mlp": 0.209746, "epoch": 0.8048098602134376, "flos": 22382079920640.0, "grad_norm": 90.19486286047395, "language_loss": 0.80491412, "learning_rate": 3.864185914015108e-07, "loss": 0.81977177, "num_input_tokens_seen": 288872625, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24230957, "step": 13386, "time_per_iteration": 2.6752798557281494 }, { "auxiliary_loss_clip": 0.01142112, "auxiliary_loss_mlp": 0.000791, "balance_loss_clip": 1.00483704, "balance_loss_mlp": 0.07118483, "epoch": 0.8048699834661055, "flos": 71200949702400.0, "grad_norm": 0.6384686952769673, "language_loss": 0.50608557, "learning_rate": 3.861885134935865e-07, "loss": 0.51829773, "num_input_tokens_seen": 288939180, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07910156, "step": 13387, "time_per_iteration": 3.2328684329986572 }, { "auxiliary_loss_clip": 0.0124773, "auxiliary_loss_mlp": 0.00237973, "balance_loss_clip": 1.03156507, "balance_loss_mlp": 0.21249783, "epoch": 0.8049301067187735, "flos": 23660320285440.0, "grad_norm": 23.020309983108582, "language_loss": 0.80344731, "learning_rate": 3.859584967815559e-07, "loss": 0.81830436, "num_input_tokens_seen": 288958925, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.25463867, "step": 13388, "time_per_iteration": 2.7053186893463135 }, { "auxiliary_loss_clip": 0.01273152, "auxiliary_loss_mlp": 0.00253128, "balance_loss_clip": 1.05116785, "balance_loss_mlp": 0.22820163, "epoch": 0.8049902299714414, "flos": 24426330750720.0, "grad_norm": 71.27830891852007, "language_loss": 0.80172086, "learning_rate": 3.857285412741411e-07, "loss": 0.81698364, "num_input_tokens_seen": 288980935, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.24902344, "step": 13389, "time_per_iteration": 2.7365095615386963 }, { "auxiliary_loss_clip": 0.01247211, "auxiliary_loss_mlp": 0.00225011, "balance_loss_clip": 1.03143477, "balance_loss_mlp": 0.20128885, "epoch": 0.8050503532241094, "flos": 17492626840320.0, "grad_norm": 56.10731137447746, "language_loss": 0.89683944, "learning_rate": 3.8549864698006097e-07, "loss": 0.91156167, "num_input_tokens_seen": 288996780, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.23706055, "step": 13390, "time_per_iteration": 2.6669185161590576 }, { "auxiliary_loss_clip": 0.01145253, "auxiliary_loss_mlp": 0.0009024, "balance_loss_clip": 1.0076654, "balance_loss_mlp": 0.08232447, "epoch": 0.8051104764767774, "flos": 57658030369920.0, "grad_norm": 0.7667739381795756, "language_loss": 0.55112123, "learning_rate": 3.8526881390803424e-07, "loss": 0.5634762, "num_input_tokens_seen": 289057590, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07910156, "step": 13391, "time_per_iteration": 3.1184849739074707 }, { "auxiliary_loss_clip": 0.01248163, "auxiliary_loss_mlp": 0.00237319, "balance_loss_clip": 1.03437996, "balance_loss_mlp": 0.21345332, "epoch": 0.8051705997294454, "flos": 18003456109440.0, "grad_norm": 450.2839108287048, "language_loss": 0.91094708, "learning_rate": 3.850390420667762e-07, "loss": 0.92580187, "num_input_tokens_seen": 289076285, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.23864746, "step": 13392, "time_per_iteration": 2.598285675048828 }, { "auxiliary_loss_clip": 0.01259479, "auxiliary_loss_mlp": 0.00225605, "balance_loss_clip": 1.03335166, "balance_loss_mlp": 0.20011824, "epoch": 0.8052307229821133, "flos": 26397754755840.0, "grad_norm": 1.865151941120161, "language_loss": 0.76227707, "learning_rate": 3.8480933146499914e-07, "loss": 0.77712786, "num_input_tokens_seen": 289097585, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.25463867, "step": 13393, "time_per_iteration": 2.7135493755340576 }, { "auxiliary_loss_clip": 0.01248537, "auxiliary_loss_mlp": 0.00219981, "balance_loss_clip": 1.02663803, "balance_loss_mlp": 0.195317, "epoch": 0.8052908462347813, "flos": 21757018423680.0, "grad_norm": 4.49333863306604, "language_loss": 0.85536569, "learning_rate": 3.84579682111414e-07, "loss": 0.87005085, "num_input_tokens_seen": 289116890, "router_z_loss_clip": 2.21777344, "router_z_loss_mlp": 0.24658203, "step": 13394, "time_per_iteration": 2.6541073322296143 }, { "auxiliary_loss_clip": 0.01240466, "auxiliary_loss_mlp": 0.00234984, "balance_loss_clip": 1.02460265, "balance_loss_mlp": 0.21270406, "epoch": 0.8053509694874492, "flos": 25442279026560.0, "grad_norm": 9.060740892138583, "language_loss": 0.71725887, "learning_rate": 3.843500940147304e-07, "loss": 0.73201334, "num_input_tokens_seen": 289136670, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.22265625, "step": 13395, "time_per_iteration": 2.6910953521728516 }, { "auxiliary_loss_clip": 0.01146215, "auxiliary_loss_mlp": 0.00092398, "balance_loss_clip": 1.00726295, "balance_loss_mlp": 0.08419637, "epoch": 0.8054110927401172, "flos": 57668122091520.0, "grad_norm": 0.7287275401128784, "language_loss": 0.56923556, "learning_rate": 3.8412056718365206e-07, "loss": 0.58162177, "num_input_tokens_seen": 289200150, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08203125, "step": 13396, "time_per_iteration": 4.699950933456421 }, { "auxiliary_loss_clip": 0.01255019, "auxiliary_loss_mlp": 0.00246364, "balance_loss_clip": 1.03394127, "balance_loss_mlp": 0.22130668, "epoch": 0.8054712159927853, "flos": 19276201693440.0, "grad_norm": 121.12208683514432, "language_loss": 0.85006285, "learning_rate": 3.8389110162688353e-07, "loss": 0.86507666, "num_input_tokens_seen": 289218125, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.25048828, "step": 13397, "time_per_iteration": 4.187336206436157 }, { "auxiliary_loss_clip": 0.01259086, "auxiliary_loss_mlp": 0.00235568, "balance_loss_clip": 1.03968883, "balance_loss_mlp": 0.21107073, "epoch": 0.8055313392454532, "flos": 17967617314560.0, "grad_norm": 57.827894557795126, "language_loss": 0.77488101, "learning_rate": 3.836616973531266e-07, "loss": 0.78982747, "num_input_tokens_seen": 289237115, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24499512, "step": 13398, "time_per_iteration": 2.6124346256256104 }, { "auxiliary_loss_clip": 0.01254785, "auxiliary_loss_mlp": 0.00236582, "balance_loss_clip": 1.03636885, "balance_loss_mlp": 0.21241796, "epoch": 0.8055914624981212, "flos": 13478352635520.0, "grad_norm": 3.2574701233077494, "language_loss": 0.79885721, "learning_rate": 3.834323543710805e-07, "loss": 0.81377089, "num_input_tokens_seen": 289253635, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24169922, "step": 13399, "time_per_iteration": 2.7147955894470215 }, { "auxiliary_loss_clip": 0.01264085, "auxiliary_loss_mlp": 0.0022882, "balance_loss_clip": 1.03995109, "balance_loss_mlp": 0.20496653, "epoch": 0.8056515857507891, "flos": 13224787551360.0, "grad_norm": 7.993489262583065, "language_loss": 0.81231809, "learning_rate": 3.8320307268944153e-07, "loss": 0.82724714, "num_input_tokens_seen": 289270085, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.23864746, "step": 13400, "time_per_iteration": 2.601632833480835 }, { "auxiliary_loss_clip": 0.01248706, "auxiliary_loss_mlp": 0.00255825, "balance_loss_clip": 1.03121829, "balance_loss_mlp": 0.23149444, "epoch": 0.8057117090034571, "flos": 23878190229120.0, "grad_norm": 90.83406184252226, "language_loss": 0.71074671, "learning_rate": 3.829738523169037e-07, "loss": 0.72579199, "num_input_tokens_seen": 289289645, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.2434082, "step": 13401, "time_per_iteration": 2.7417078018188477 }, { "auxiliary_loss_clip": 0.01264047, "auxiliary_loss_mlp": 0.0022414, "balance_loss_clip": 1.04393911, "balance_loss_mlp": 0.20046508, "epoch": 0.805771832256125, "flos": 21214300855680.0, "grad_norm": 16.430105362590332, "language_loss": 0.94417059, "learning_rate": 3.8274469326215985e-07, "loss": 0.95905244, "num_input_tokens_seen": 289306630, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.23669434, "step": 13402, "time_per_iteration": 4.176253318786621 }, { "auxiliary_loss_clip": 0.01279083, "auxiliary_loss_mlp": 0.00250118, "balance_loss_clip": 1.05207396, "balance_loss_mlp": 0.22339134, "epoch": 0.805831955508793, "flos": 17566818382080.0, "grad_norm": 172.07423420212947, "language_loss": 0.78020012, "learning_rate": 3.8251559553389876e-07, "loss": 0.79549217, "num_input_tokens_seen": 289324960, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.26745605, "step": 13403, "time_per_iteration": 2.6191420555114746 }, { "auxiliary_loss_clip": 0.01261014, "auxiliary_loss_mlp": 0.00255452, "balance_loss_clip": 1.04163229, "balance_loss_mlp": 0.23133603, "epoch": 0.805892078761461, "flos": 26907542530560.0, "grad_norm": 18.2882141141593, "language_loss": 0.9095974, "learning_rate": 3.822865591408084e-07, "loss": 0.92476213, "num_input_tokens_seen": 289344980, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.2409668, "step": 13404, "time_per_iteration": 2.7151122093200684 }, { "auxiliary_loss_clip": 0.01244529, "auxiliary_loss_mlp": 0.00226377, "balance_loss_clip": 1.02918839, "balance_loss_mlp": 0.20394158, "epoch": 0.805952202014129, "flos": 31506442496640.0, "grad_norm": 4.523370585123993, "language_loss": 0.78012609, "learning_rate": 3.820575840915743e-07, "loss": 0.79483509, "num_input_tokens_seen": 289367500, "router_z_loss_clip": 2.15136719, "router_z_loss_mlp": 0.22436523, "step": 13405, "time_per_iteration": 2.7438855171203613 }, { "auxiliary_loss_clip": 0.01257641, "auxiliary_loss_mlp": 0.00228806, "balance_loss_clip": 1.03842735, "balance_loss_mlp": 0.2045826, "epoch": 0.8060123252667969, "flos": 24389953251840.0, "grad_norm": 284.6279377324054, "language_loss": 0.84087789, "learning_rate": 3.818286703948788e-07, "loss": 0.85574234, "num_input_tokens_seen": 289385930, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24206543, "step": 13406, "time_per_iteration": 4.094002962112427 }, { "auxiliary_loss_clip": 0.01245383, "auxiliary_loss_mlp": 0.00228872, "balance_loss_clip": 1.02690315, "balance_loss_mlp": 0.20506588, "epoch": 0.8060724485194649, "flos": 23479941162240.0, "grad_norm": 2.285205576482345, "language_loss": 0.81345505, "learning_rate": 3.815998180594018e-07, "loss": 0.8281976, "num_input_tokens_seen": 289408025, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.23803711, "step": 13407, "time_per_iteration": 2.691359281539917 }, { "auxiliary_loss_clip": 0.01249637, "auxiliary_loss_mlp": 0.00227407, "balance_loss_clip": 1.0286411, "balance_loss_mlp": 0.20257543, "epoch": 0.8061325717721328, "flos": 18624495283200.0, "grad_norm": 22.84082826215416, "language_loss": 0.83983898, "learning_rate": 3.81371027093822e-07, "loss": 0.85460943, "num_input_tokens_seen": 289426575, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.2479248, "step": 13408, "time_per_iteration": 2.6235780715942383 }, { "auxiliary_loss_clip": 0.0124423, "auxiliary_loss_mlp": 0.00230476, "balance_loss_clip": 1.02462709, "balance_loss_mlp": 0.20579922, "epoch": 0.8061926950248008, "flos": 23582752865280.0, "grad_norm": 30.60667076260256, "language_loss": 0.78524947, "learning_rate": 3.8114229750681523e-07, "loss": 0.7999965, "num_input_tokens_seen": 289447760, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24682617, "step": 13409, "time_per_iteration": 2.7211503982543945 }, { "auxiliary_loss_clip": 0.01262282, "auxiliary_loss_mlp": 0.00231293, "balance_loss_clip": 1.03244066, "balance_loss_mlp": 0.20467384, "epoch": 0.8062528182774689, "flos": 11143333209600.0, "grad_norm": 23.054110592069275, "language_loss": 0.85434699, "learning_rate": 3.809136293070545e-07, "loss": 0.86928272, "num_input_tokens_seen": 289463920, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.26599121, "step": 13410, "time_per_iteration": 2.656146764755249 }, { "auxiliary_loss_clip": 0.01251801, "auxiliary_loss_mlp": 0.00238477, "balance_loss_clip": 1.03421736, "balance_loss_mlp": 0.21501659, "epoch": 0.8063129415301368, "flos": 22346815743360.0, "grad_norm": 11.28384225965076, "language_loss": 0.7496655, "learning_rate": 3.806850225032117e-07, "loss": 0.76456833, "num_input_tokens_seen": 289482635, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.234375, "step": 13411, "time_per_iteration": 2.6835126876831055 }, { "auxiliary_loss_clip": 0.01252042, "auxiliary_loss_mlp": 0.00249117, "balance_loss_clip": 1.03376555, "balance_loss_mlp": 0.22318941, "epoch": 0.8063730647828048, "flos": 23988400133760.0, "grad_norm": 32.174117926128616, "language_loss": 0.75467443, "learning_rate": 3.804564771039551e-07, "loss": 0.76968604, "num_input_tokens_seen": 289502040, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25915527, "step": 13412, "time_per_iteration": 2.7118241786956787 }, { "auxiliary_loss_clip": 0.01264257, "auxiliary_loss_mlp": 0.00268769, "balance_loss_clip": 1.03252089, "balance_loss_mlp": 0.24032545, "epoch": 0.8064331880354727, "flos": 21321494017920.0, "grad_norm": 12.08115915949616, "language_loss": 0.87558705, "learning_rate": 3.8022799311795064e-07, "loss": 0.8909173, "num_input_tokens_seen": 289520740, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.2845459, "step": 13413, "time_per_iteration": 2.6894147396087646 }, { "auxiliary_loss_clip": 0.01250243, "auxiliary_loss_mlp": 0.00244408, "balance_loss_clip": 1.03484488, "balance_loss_mlp": 0.22140096, "epoch": 0.8064933112881407, "flos": 19682890456320.0, "grad_norm": 3.687088213967103, "language_loss": 0.91515368, "learning_rate": 3.7999957055386303e-07, "loss": 0.9301002, "num_input_tokens_seen": 289535840, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.23010254, "step": 13414, "time_per_iteration": 2.641120672225952 }, { "auxiliary_loss_clip": 0.01245008, "auxiliary_loss_mlp": 0.00220543, "balance_loss_clip": 1.02949715, "balance_loss_mlp": 0.19748822, "epoch": 0.8065534345408086, "flos": 19279721226240.0, "grad_norm": 2.280350307751352, "language_loss": 0.74744481, "learning_rate": 3.7977120942035467e-07, "loss": 0.76210034, "num_input_tokens_seen": 289555205, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23071289, "step": 13415, "time_per_iteration": 2.6619577407836914 }, { "auxiliary_loss_clip": 0.0123263, "auxiliary_loss_mlp": 0.00222261, "balance_loss_clip": 1.02014446, "balance_loss_mlp": 0.19993338, "epoch": 0.8066135577934767, "flos": 19677718897920.0, "grad_norm": 24.11568833660313, "language_loss": 0.83011311, "learning_rate": 3.7954290972608383e-07, "loss": 0.84466195, "num_input_tokens_seen": 289573000, "router_z_loss_clip": 2.12792969, "router_z_loss_mlp": 0.2232666, "step": 13416, "time_per_iteration": 2.6445231437683105 }, { "auxiliary_loss_clip": 0.01252896, "auxiliary_loss_mlp": 0.00237302, "balance_loss_clip": 1.03214204, "balance_loss_mlp": 0.2140439, "epoch": 0.8066736810461446, "flos": 21143592933120.0, "grad_norm": 9.15248673146207, "language_loss": 0.74198294, "learning_rate": 3.793146714797086e-07, "loss": 0.75688493, "num_input_tokens_seen": 289592625, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.2322998, "step": 13417, "time_per_iteration": 2.681936740875244 }, { "auxiliary_loss_clip": 0.01251826, "auxiliary_loss_mlp": 0.00252742, "balance_loss_clip": 1.03338814, "balance_loss_mlp": 0.22729042, "epoch": 0.8067338042988126, "flos": 22598261925120.0, "grad_norm": 123.29715642451211, "language_loss": 0.88614208, "learning_rate": 3.7908649468988306e-07, "loss": 0.90118778, "num_input_tokens_seen": 289610780, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.2545166, "step": 13418, "time_per_iteration": 2.6361660957336426 }, { "auxiliary_loss_clip": 0.01254563, "auxiliary_loss_mlp": 0.00233818, "balance_loss_clip": 1.03731489, "balance_loss_mlp": 0.20935667, "epoch": 0.8067939275514805, "flos": 16508423208960.0, "grad_norm": 15.925859379614634, "language_loss": 0.92935592, "learning_rate": 3.7885837936526066e-07, "loss": 0.94423974, "num_input_tokens_seen": 289628890, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24462891, "step": 13419, "time_per_iteration": 2.6347508430480957 }, { "auxiliary_loss_clip": 0.01261074, "auxiliary_loss_mlp": 0.00227731, "balance_loss_clip": 1.03413439, "balance_loss_mlp": 0.20251787, "epoch": 0.8068540508041485, "flos": 28541836460160.0, "grad_norm": 40.16399378297997, "language_loss": 0.82942653, "learning_rate": 3.7863032551449047e-07, "loss": 0.84431458, "num_input_tokens_seen": 289647220, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.2520752, "step": 13420, "time_per_iteration": 2.665339708328247 }, { "auxiliary_loss_clip": 0.01238303, "auxiliary_loss_mlp": 0.00231488, "balance_loss_clip": 1.0232482, "balance_loss_mlp": 0.20854029, "epoch": 0.8069141740568164, "flos": 21652482867840.0, "grad_norm": 9.009966713304287, "language_loss": 0.86823291, "learning_rate": 3.784023331462207e-07, "loss": 0.88293087, "num_input_tokens_seen": 289665800, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.22949219, "step": 13421, "time_per_iteration": 2.669095039367676 }, { "auxiliary_loss_clip": 0.01256225, "auxiliary_loss_mlp": 0.00233196, "balance_loss_clip": 1.03630042, "balance_loss_mlp": 0.20903192, "epoch": 0.8069742973094844, "flos": 17529327561600.0, "grad_norm": 1716.2104624425883, "language_loss": 0.85891342, "learning_rate": 3.78174402269098e-07, "loss": 0.87380767, "num_input_tokens_seen": 289682705, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.24157715, "step": 13422, "time_per_iteration": 2.5828311443328857 }, { "auxiliary_loss_clip": 0.01234135, "auxiliary_loss_mlp": 0.00223859, "balance_loss_clip": 1.01759672, "balance_loss_mlp": 0.20196025, "epoch": 0.8070344205621525, "flos": 23367037737600.0, "grad_norm": 105.13362525872822, "language_loss": 0.74513805, "learning_rate": 3.7794653289176347e-07, "loss": 0.75971794, "num_input_tokens_seen": 289702920, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.21899414, "step": 13423, "time_per_iteration": 2.6549413204193115 }, { "auxiliary_loss_clip": 0.01268307, "auxiliary_loss_mlp": 0.00225823, "balance_loss_clip": 1.03782344, "balance_loss_mlp": 0.19897754, "epoch": 0.8070945438148204, "flos": 22930184528640.0, "grad_norm": 10.001271829873442, "language_loss": 0.87968123, "learning_rate": 3.7771872502285904e-07, "loss": 0.8946225, "num_input_tokens_seen": 289723280, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.26855469, "step": 13424, "time_per_iteration": 2.6913044452667236 }, { "auxiliary_loss_clip": 0.01245122, "auxiliary_loss_mlp": 0.00263436, "balance_loss_clip": 1.03073287, "balance_loss_mlp": 0.23883098, "epoch": 0.8071546670674884, "flos": 25300683613440.0, "grad_norm": 6.258808657573291, "language_loss": 0.86463463, "learning_rate": 3.774909786710232e-07, "loss": 0.87972021, "num_input_tokens_seen": 289743475, "router_z_loss_clip": 2.14355469, "router_z_loss_mlp": 0.24597168, "step": 13425, "time_per_iteration": 2.6728477478027344 }, { "auxiliary_loss_clip": 0.0124957, "auxiliary_loss_mlp": 0.00228333, "balance_loss_clip": 1.02787173, "balance_loss_mlp": 0.20451505, "epoch": 0.8072147903201563, "flos": 18113701927680.0, "grad_norm": 8558.256844419511, "language_loss": 0.89631563, "learning_rate": 3.772632938448923e-07, "loss": 0.91109467, "num_input_tokens_seen": 289761400, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.23803711, "step": 13426, "time_per_iteration": 2.731123447418213 }, { "auxiliary_loss_clip": 0.01262484, "auxiliary_loss_mlp": 0.00219362, "balance_loss_clip": 1.03890538, "balance_loss_mlp": 0.19579436, "epoch": 0.8072749135728243, "flos": 26688164215680.0, "grad_norm": 3.0227160680949874, "language_loss": 0.82497156, "learning_rate": 3.770356705530997e-07, "loss": 0.83978999, "num_input_tokens_seen": 289781025, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.23571777, "step": 13427, "time_per_iteration": 2.841608762741089 }, { "auxiliary_loss_clip": 0.01257025, "auxiliary_loss_mlp": 0.00235949, "balance_loss_clip": 1.03769422, "balance_loss_mlp": 0.21157047, "epoch": 0.8073350368254922, "flos": 19240291071360.0, "grad_norm": 320.01965037648597, "language_loss": 0.76852983, "learning_rate": 3.768081088042774e-07, "loss": 0.78345954, "num_input_tokens_seen": 289798380, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.24389648, "step": 13428, "time_per_iteration": 2.689815044403076 }, { "auxiliary_loss_clip": 0.01252751, "auxiliary_loss_mlp": 0.00243252, "balance_loss_clip": 1.02789426, "balance_loss_mlp": 0.21886221, "epoch": 0.8073951600781603, "flos": 13334530579200.0, "grad_norm": 2942.461241852755, "language_loss": 0.83870494, "learning_rate": 3.765806086070544e-07, "loss": 0.85366499, "num_input_tokens_seen": 289814515, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.24389648, "step": 13429, "time_per_iteration": 2.625699520111084 }, { "auxiliary_loss_clip": 0.01220719, "auxiliary_loss_mlp": 0.00206126, "balance_loss_clip": 1.01021051, "balance_loss_mlp": 0.18286823, "epoch": 0.8074552833308282, "flos": 22853191726080.0, "grad_norm": 23.499952320300277, "language_loss": 0.74466062, "learning_rate": 3.763531699700568e-07, "loss": 0.75892901, "num_input_tokens_seen": 289834315, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.23254395, "step": 13430, "time_per_iteration": 2.7014336585998535 }, { "auxiliary_loss_clip": 0.01262704, "auxiliary_loss_mlp": 0.00252309, "balance_loss_clip": 1.04387772, "balance_loss_mlp": 0.22791901, "epoch": 0.8075154065834962, "flos": 20339409288960.0, "grad_norm": 13.211270407531273, "language_loss": 0.8742609, "learning_rate": 3.7612579290190994e-07, "loss": 0.88941109, "num_input_tokens_seen": 289853770, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24377441, "step": 13431, "time_per_iteration": 2.6349031925201416 }, { "auxiliary_loss_clip": 0.01249615, "auxiliary_loss_mlp": 0.00234586, "balance_loss_clip": 1.03149223, "balance_loss_mlp": 0.20984988, "epoch": 0.8075755298361641, "flos": 21908059113600.0, "grad_norm": 4.6482697957036505, "language_loss": 0.8944813, "learning_rate": 3.7589847741123593e-07, "loss": 0.90932333, "num_input_tokens_seen": 289870480, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24743652, "step": 13432, "time_per_iteration": 2.718703508377075 }, { "auxiliary_loss_clip": 0.01261095, "auxiliary_loss_mlp": 0.0025805, "balance_loss_clip": 1.03807998, "balance_loss_mlp": 0.23085867, "epoch": 0.8076356530888321, "flos": 15669298609920.0, "grad_norm": 22.80120514440815, "language_loss": 0.79597014, "learning_rate": 3.7567122350665415e-07, "loss": 0.81116164, "num_input_tokens_seen": 289888275, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.27197266, "step": 13433, "time_per_iteration": 2.6393189430236816 }, { "auxiliary_loss_clip": 0.01236564, "auxiliary_loss_mlp": 0.00237755, "balance_loss_clip": 1.02514303, "balance_loss_mlp": 0.21402074, "epoch": 0.8076957763415, "flos": 37777414521600.0, "grad_norm": 4.748352995834898, "language_loss": 0.78486615, "learning_rate": 3.754440311967828e-07, "loss": 0.79960936, "num_input_tokens_seen": 289911495, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.23754883, "step": 13434, "time_per_iteration": 2.8145241737365723 }, { "auxiliary_loss_clip": 0.01271945, "auxiliary_loss_mlp": 0.00231238, "balance_loss_clip": 1.05169308, "balance_loss_mlp": 0.20745617, "epoch": 0.807755899594168, "flos": 19610781903360.0, "grad_norm": 19700.835075823325, "language_loss": 0.76070607, "learning_rate": 3.752169004902361e-07, "loss": 0.77573788, "num_input_tokens_seen": 289930045, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.23779297, "step": 13435, "time_per_iteration": 2.698435068130493 }, { "auxiliary_loss_clip": 0.0127035, "auxiliary_loss_mlp": 0.00241399, "balance_loss_clip": 1.04275489, "balance_loss_mlp": 0.21541128, "epoch": 0.8078160228468361, "flos": 23294893271040.0, "grad_norm": 3.2848726158730615, "language_loss": 0.81739044, "learning_rate": 3.749898313956279e-07, "loss": 0.83250797, "num_input_tokens_seen": 289950815, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.2598877, "step": 13436, "time_per_iteration": 2.6979031562805176 }, { "auxiliary_loss_clip": 0.01240251, "auxiliary_loss_mlp": 0.00243392, "balance_loss_clip": 1.02127647, "balance_loss_mlp": 0.21858504, "epoch": 0.807876146099504, "flos": 27162651899520.0, "grad_norm": 15.111051984232606, "language_loss": 0.78022313, "learning_rate": 3.747628239215674e-07, "loss": 0.79505956, "num_input_tokens_seen": 289971730, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24841309, "step": 13437, "time_per_iteration": 2.7338192462921143 }, { "auxiliary_loss_clip": 0.01254632, "auxiliary_loss_mlp": 0.00225478, "balance_loss_clip": 1.03665984, "balance_loss_mlp": 0.20187488, "epoch": 0.807936269352172, "flos": 27160030206720.0, "grad_norm": 3.25190176416486, "language_loss": 0.8013736, "learning_rate": 3.745358780766636e-07, "loss": 0.81617463, "num_input_tokens_seen": 289992995, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.23571777, "step": 13438, "time_per_iteration": 4.0928168296813965 }, { "auxiliary_loss_clip": 0.01247094, "auxiliary_loss_mlp": 0.00252324, "balance_loss_clip": 1.02789414, "balance_loss_mlp": 0.22717035, "epoch": 0.8079963926048399, "flos": 20740423703040.0, "grad_norm": 65.55299533951268, "language_loss": 0.84924513, "learning_rate": 3.7430899386952344e-07, "loss": 0.86423934, "num_input_tokens_seen": 290009405, "router_z_loss_clip": 2.19433594, "router_z_loss_mlp": 0.25158691, "step": 13439, "time_per_iteration": 4.104784965515137 }, { "auxiliary_loss_clip": 0.01254695, "auxiliary_loss_mlp": 0.00218037, "balance_loss_clip": 1.03483987, "balance_loss_mlp": 0.19495818, "epoch": 0.8080565158575079, "flos": 25009663622400.0, "grad_norm": 10.132168332826955, "language_loss": 0.85041952, "learning_rate": 3.7408217130874786e-07, "loss": 0.86514688, "num_input_tokens_seen": 290031085, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.23059082, "step": 13440, "time_per_iteration": 2.7142832279205322 }, { "auxiliary_loss_clip": 0.0123807, "auxiliary_loss_mlp": 0.00233642, "balance_loss_clip": 1.02188754, "balance_loss_mlp": 0.20842946, "epoch": 0.8081166391101758, "flos": 18698076293760.0, "grad_norm": 6.882992382563688, "language_loss": 0.69572771, "learning_rate": 3.7385541040293946e-07, "loss": 0.71044481, "num_input_tokens_seen": 290048670, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.25231934, "step": 13441, "time_per_iteration": 2.6069788932800293 }, { "auxiliary_loss_clip": 0.01245987, "auxiliary_loss_mlp": 0.00234686, "balance_loss_clip": 1.02653992, "balance_loss_mlp": 0.20875832, "epoch": 0.8081767623628439, "flos": 19828651847040.0, "grad_norm": 6.922772703735724, "language_loss": 0.83093703, "learning_rate": 3.7362871116069684e-07, "loss": 0.84574378, "num_input_tokens_seen": 290064085, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25915527, "step": 13442, "time_per_iteration": 2.712170362472534 }, { "auxiliary_loss_clip": 0.012328, "auxiliary_loss_mlp": 0.00247998, "balance_loss_clip": 1.01818562, "balance_loss_mlp": 0.22386965, "epoch": 0.8082368856155118, "flos": 35772952982400.0, "grad_norm": 5.111849033620663, "language_loss": 0.77814567, "learning_rate": 3.734020735906169e-07, "loss": 0.79295361, "num_input_tokens_seen": 290086255, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24108887, "step": 13443, "time_per_iteration": 2.797401189804077 }, { "auxiliary_loss_clip": 0.01244169, "auxiliary_loss_mlp": 0.00236437, "balance_loss_clip": 1.02836001, "balance_loss_mlp": 0.21290547, "epoch": 0.8082970088681798, "flos": 17198015489280.0, "grad_norm": 61.29089920834056, "language_loss": 0.88807452, "learning_rate": 3.7317549770129286e-07, "loss": 0.90288055, "num_input_tokens_seen": 290103995, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.23547363, "step": 13444, "time_per_iteration": 4.214062690734863 }, { "auxiliary_loss_clip": 0.01166099, "auxiliary_loss_mlp": 0.00060127, "balance_loss_clip": 1.01682425, "balance_loss_mlp": 0.05178274, "epoch": 0.8083571321208477, "flos": 63555207511680.0, "grad_norm": 0.7942436192855066, "language_loss": 0.53062141, "learning_rate": 3.7294898350131754e-07, "loss": 0.54288363, "num_input_tokens_seen": 290157245, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.08349609, "step": 13445, "time_per_iteration": 3.015890598297119 }, { "auxiliary_loss_clip": 0.01256207, "auxiliary_loss_mlp": 0.00218155, "balance_loss_clip": 1.03670609, "balance_loss_mlp": 0.19427773, "epoch": 0.8084172553735157, "flos": 17930701111680.0, "grad_norm": 27.272448777501047, "language_loss": 0.81041145, "learning_rate": 3.7272253099927964e-07, "loss": 0.82515514, "num_input_tokens_seen": 290174970, "router_z_loss_clip": 2.19433594, "router_z_loss_mlp": 0.23864746, "step": 13446, "time_per_iteration": 2.6572768688201904 }, { "auxiliary_loss_clip": 0.01257751, "auxiliary_loss_mlp": 0.00227467, "balance_loss_clip": 1.03479505, "balance_loss_mlp": 0.20236169, "epoch": 0.8084773786261836, "flos": 24097999507200.0, "grad_norm": 17.767417368868184, "language_loss": 0.79819924, "learning_rate": 3.7249614020376606e-07, "loss": 0.8130514, "num_input_tokens_seen": 290194395, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.25097656, "step": 13447, "time_per_iteration": 2.716792106628418 }, { "auxiliary_loss_clip": 0.01271721, "auxiliary_loss_mlp": 0.002428, "balance_loss_clip": 1.04046988, "balance_loss_mlp": 0.21657398, "epoch": 0.8085375018788516, "flos": 15588211656960.0, "grad_norm": 53.94329727117832, "language_loss": 0.86619854, "learning_rate": 3.7226981112336197e-07, "loss": 0.88134378, "num_input_tokens_seen": 290209200, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.26245117, "step": 13448, "time_per_iteration": 3.999521493911743 }, { "auxiliary_loss_clip": 0.01168722, "auxiliary_loss_mlp": 0.00052994, "balance_loss_clip": 1.01880574, "balance_loss_mlp": 0.0460319, "epoch": 0.8085976251315197, "flos": 67561296393600.0, "grad_norm": 0.7137493339030268, "language_loss": 0.63215363, "learning_rate": 3.7204354376665024e-07, "loss": 0.64437079, "num_input_tokens_seen": 290274565, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.06982422, "step": 13449, "time_per_iteration": 3.1849558353424072 }, { "auxiliary_loss_clip": 0.01251931, "auxiliary_loss_mlp": 0.00222075, "balance_loss_clip": 1.03155947, "balance_loss_mlp": 0.19809011, "epoch": 0.8086577483841876, "flos": 22561453463040.0, "grad_norm": 11.27390570574001, "language_loss": 0.81659973, "learning_rate": 3.718173381422105e-07, "loss": 0.83133972, "num_input_tokens_seen": 290293630, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.2401123, "step": 13450, "time_per_iteration": 2.6623189449310303 }, { "auxiliary_loss_clip": 0.01261065, "auxiliary_loss_mlp": 0.00245368, "balance_loss_clip": 1.03610611, "balance_loss_mlp": 0.21978578, "epoch": 0.8087178716368556, "flos": 17968084191360.0, "grad_norm": 120.09069910165458, "language_loss": 0.80615884, "learning_rate": 3.7159119425861986e-07, "loss": 0.82122314, "num_input_tokens_seen": 290311450, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.25561523, "step": 13451, "time_per_iteration": 2.640828847885132 }, { "auxiliary_loss_clip": 0.01268009, "auxiliary_loss_mlp": 0.00259933, "balance_loss_clip": 1.03761303, "balance_loss_mlp": 0.23382597, "epoch": 0.8087779948895235, "flos": 21719527603200.0, "grad_norm": 11.117121114421733, "language_loss": 0.89087069, "learning_rate": 3.713651121244543e-07, "loss": 0.9061501, "num_input_tokens_seen": 290330165, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.26147461, "step": 13452, "time_per_iteration": 2.6674225330352783 }, { "auxiliary_loss_clip": 0.01255948, "auxiliary_loss_mlp": 0.00231264, "balance_loss_clip": 1.03117549, "balance_loss_mlp": 0.20705296, "epoch": 0.8088381181421915, "flos": 29092885983360.0, "grad_norm": 129.28119465182687, "language_loss": 0.85309124, "learning_rate": 3.711390917482875e-07, "loss": 0.86796331, "num_input_tokens_seen": 290350815, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.24206543, "step": 13453, "time_per_iteration": 2.782118797302246 }, { "auxiliary_loss_clip": 0.01254033, "auxiliary_loss_mlp": 0.00221158, "balance_loss_clip": 1.0299964, "balance_loss_mlp": 0.19506305, "epoch": 0.8088982413948594, "flos": 22198432659840.0, "grad_norm": 25.01543612638971, "language_loss": 0.87455785, "learning_rate": 3.709131331386892e-07, "loss": 0.88930976, "num_input_tokens_seen": 290367380, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26098633, "step": 13454, "time_per_iteration": 2.640681028366089 }, { "auxiliary_loss_clip": 0.01248811, "auxiliary_loss_mlp": 0.00236798, "balance_loss_clip": 1.03214443, "balance_loss_mlp": 0.21258703, "epoch": 0.8089583646475275, "flos": 28036717453440.0, "grad_norm": 62.69702436607521, "language_loss": 0.83993047, "learning_rate": 3.7068723630422795e-07, "loss": 0.85478657, "num_input_tokens_seen": 290387965, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.2421875, "step": 13455, "time_per_iteration": 2.729024887084961 }, { "auxiliary_loss_clip": 0.01257772, "auxiliary_loss_mlp": 0.00244552, "balance_loss_clip": 1.03380275, "balance_loss_mlp": 0.2181707, "epoch": 0.8090184879001954, "flos": 16617735273600.0, "grad_norm": 75.55141927589207, "language_loss": 0.88773578, "learning_rate": 3.70461401253471e-07, "loss": 0.90275902, "num_input_tokens_seen": 290404150, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.26379395, "step": 13456, "time_per_iteration": 2.6238224506378174 }, { "auxiliary_loss_clip": 0.0124892, "auxiliary_loss_mlp": 0.00221462, "balance_loss_clip": 1.03315544, "balance_loss_mlp": 0.19828725, "epoch": 0.8090786111528634, "flos": 27340804379520.0, "grad_norm": 5.054235932248078, "language_loss": 0.81554586, "learning_rate": 3.702356279949801e-07, "loss": 0.83024967, "num_input_tokens_seen": 290422370, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.23156738, "step": 13457, "time_per_iteration": 2.7002880573272705 }, { "auxiliary_loss_clip": 0.01234952, "auxiliary_loss_mlp": 0.00224633, "balance_loss_clip": 1.01485848, "balance_loss_mlp": 0.20052855, "epoch": 0.8091387344055313, "flos": 21105742976640.0, "grad_norm": 37.360141017949616, "language_loss": 0.81626058, "learning_rate": 3.700099165373176e-07, "loss": 0.83085644, "num_input_tokens_seen": 290442645, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.24108887, "step": 13458, "time_per_iteration": 2.674607276916504 }, { "auxiliary_loss_clip": 0.01240459, "auxiliary_loss_mlp": 0.00259654, "balance_loss_clip": 1.02433372, "balance_loss_mlp": 0.23492986, "epoch": 0.8091988576581993, "flos": 11655060318720.0, "grad_norm": 12.502761481466584, "language_loss": 0.88668311, "learning_rate": 3.6978426688904275e-07, "loss": 0.90168428, "num_input_tokens_seen": 290458520, "router_z_loss_clip": 2.15917969, "router_z_loss_mlp": 0.24707031, "step": 13459, "time_per_iteration": 2.6068167686462402 }, { "auxiliary_loss_clip": 0.01270368, "auxiliary_loss_mlp": 0.00240305, "balance_loss_clip": 1.04043043, "balance_loss_mlp": 0.21407925, "epoch": 0.8092589809108672, "flos": 22963329803520.0, "grad_norm": 84.06053804408687, "language_loss": 0.87887156, "learning_rate": 3.695586790587113e-07, "loss": 0.8939783, "num_input_tokens_seen": 290474465, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.26220703, "step": 13460, "time_per_iteration": 2.651014566421509 }, { "auxiliary_loss_clip": 0.01258589, "auxiliary_loss_mlp": 0.0023657, "balance_loss_clip": 1.03490579, "balance_loss_mlp": 0.21035601, "epoch": 0.8093191041635353, "flos": 13260985482240.0, "grad_norm": 46.08530000831884, "language_loss": 0.9338938, "learning_rate": 3.693331530548789e-07, "loss": 0.94884539, "num_input_tokens_seen": 290492060, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26196289, "step": 13461, "time_per_iteration": 2.6302249431610107 }, { "auxiliary_loss_clip": 0.01274259, "auxiliary_loss_mlp": 0.00250197, "balance_loss_clip": 1.05459976, "balance_loss_mlp": 0.22531852, "epoch": 0.8093792274162032, "flos": 25516003691520.0, "grad_norm": 50.08833199619352, "language_loss": 0.8498919, "learning_rate": 3.69107688886096e-07, "loss": 0.8651365, "num_input_tokens_seen": 290511510, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24890137, "step": 13462, "time_per_iteration": 2.6727752685546875 }, { "auxiliary_loss_clip": 0.01259616, "auxiliary_loss_mlp": 0.00245837, "balance_loss_clip": 1.03407502, "balance_loss_mlp": 0.22101754, "epoch": 0.8094393506688712, "flos": 23546483107200.0, "grad_norm": 10.587487505662676, "language_loss": 0.90515327, "learning_rate": 3.6888228656091357e-07, "loss": 0.92020786, "num_input_tokens_seen": 290530035, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.24829102, "step": 13463, "time_per_iteration": 2.6616151332855225 }, { "auxiliary_loss_clip": 0.01244722, "auxiliary_loss_mlp": 0.002404, "balance_loss_clip": 1.02837491, "balance_loss_mlp": 0.2185128, "epoch": 0.8094994739215392, "flos": 17055917285760.0, "grad_norm": 10.358524995193848, "language_loss": 0.69346321, "learning_rate": 3.686569460878779e-07, "loss": 0.70831442, "num_input_tokens_seen": 290548245, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.21887207, "step": 13464, "time_per_iteration": 2.648324966430664 }, { "auxiliary_loss_clip": 0.01253712, "auxiliary_loss_mlp": 0.00249693, "balance_loss_clip": 1.03348136, "balance_loss_mlp": 0.22475445, "epoch": 0.8095595971742071, "flos": 23551223702400.0, "grad_norm": 2.266254937630501, "language_loss": 0.69911921, "learning_rate": 3.684316674755341e-07, "loss": 0.71415323, "num_input_tokens_seen": 290568625, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.24951172, "step": 13465, "time_per_iteration": 2.705716609954834 }, { "auxiliary_loss_clip": 0.01239152, "auxiliary_loss_mlp": 0.00243431, "balance_loss_clip": 1.02808952, "balance_loss_mlp": 0.21910049, "epoch": 0.8096197204268751, "flos": 20373201008640.0, "grad_norm": 6.528605361067902, "language_loss": 0.88202685, "learning_rate": 3.682064507324256e-07, "loss": 0.89685267, "num_input_tokens_seen": 290586575, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.24328613, "step": 13466, "time_per_iteration": 2.640038251876831 }, { "auxiliary_loss_clip": 0.01255895, "auxiliary_loss_mlp": 0.00228271, "balance_loss_clip": 1.03482461, "balance_loss_mlp": 0.20322526, "epoch": 0.809679843679543, "flos": 27818775682560.0, "grad_norm": 10102.504244300411, "language_loss": 0.8321321, "learning_rate": 3.6798129586709204e-07, "loss": 0.84697372, "num_input_tokens_seen": 290606790, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25048828, "step": 13467, "time_per_iteration": 2.7206127643585205 }, { "auxiliary_loss_clip": 0.01258224, "auxiliary_loss_mlp": 0.00256395, "balance_loss_clip": 1.03612494, "balance_loss_mlp": 0.22940651, "epoch": 0.8097399669322111, "flos": 22014103040640.0, "grad_norm": 8.818521342053408, "language_loss": 0.84682059, "learning_rate": 3.6775620288807073e-07, "loss": 0.86196679, "num_input_tokens_seen": 290625525, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.2701416, "step": 13468, "time_per_iteration": 2.6615004539489746 }, { "auxiliary_loss_clip": 0.01240269, "auxiliary_loss_mlp": 0.00226938, "balance_loss_clip": 1.027794, "balance_loss_mlp": 0.20383494, "epoch": 0.809800090184879, "flos": 18988988544000.0, "grad_norm": 12.989857332840073, "language_loss": 0.76582122, "learning_rate": 3.675311718038978e-07, "loss": 0.78049326, "num_input_tokens_seen": 290644935, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.2310791, "step": 13469, "time_per_iteration": 2.653299331665039 }, { "auxiliary_loss_clip": 0.01161235, "auxiliary_loss_mlp": 0.00079509, "balance_loss_clip": 1.01505029, "balance_loss_mlp": 0.07197522, "epoch": 0.809860213437547, "flos": 66099516508800.0, "grad_norm": 0.6683568957295799, "language_loss": 0.53867906, "learning_rate": 3.6730620262310683e-07, "loss": 0.55108649, "num_input_tokens_seen": 290710735, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.07519531, "step": 13470, "time_per_iteration": 3.243690252304077 }, { "auxiliary_loss_clip": 0.01251549, "auxiliary_loss_mlp": 0.0025069, "balance_loss_clip": 1.03085601, "balance_loss_mlp": 0.22615676, "epoch": 0.8099203366902149, "flos": 20882485992960.0, "grad_norm": 3630.459010881376, "language_loss": 0.76924002, "learning_rate": 3.670812953542279e-07, "loss": 0.78426242, "num_input_tokens_seen": 290729565, "router_z_loss_clip": 2.20800781, "router_z_loss_mlp": 0.24536133, "step": 13471, "time_per_iteration": 2.658543586730957 }, { "auxiliary_loss_clip": 0.01248919, "auxiliary_loss_mlp": 0.00229943, "balance_loss_clip": 1.03209651, "balance_loss_mlp": 0.20699526, "epoch": 0.8099804599428829, "flos": 26030927111040.0, "grad_norm": 44.05005542012157, "language_loss": 0.87078762, "learning_rate": 3.6685645000579003e-07, "loss": 0.88557625, "num_input_tokens_seen": 290749360, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.22949219, "step": 13472, "time_per_iteration": 2.7330634593963623 }, { "auxiliary_loss_clip": 0.01163377, "auxiliary_loss_mlp": 0.00070328, "balance_loss_clip": 1.01320767, "balance_loss_mlp": 0.06308004, "epoch": 0.8100405831955508, "flos": 69303573584640.0, "grad_norm": 0.7262934514640456, "language_loss": 0.56908512, "learning_rate": 3.666316665863201e-07, "loss": 0.58142221, "num_input_tokens_seen": 290812145, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.07226562, "step": 13473, "time_per_iteration": 3.180678606033325 }, { "auxiliary_loss_clip": 0.01254491, "auxiliary_loss_mlp": 0.00215422, "balance_loss_clip": 1.0359025, "balance_loss_mlp": 0.19184276, "epoch": 0.8101007064482189, "flos": 15012492468480.0, "grad_norm": 39.93573334662261, "language_loss": 0.84900194, "learning_rate": 3.664069451043399e-07, "loss": 0.86370111, "num_input_tokens_seen": 290829845, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.23608398, "step": 13474, "time_per_iteration": 2.667511463165283 }, { "auxiliary_loss_clip": 0.01254042, "auxiliary_loss_mlp": 0.00251525, "balance_loss_clip": 1.03554177, "balance_loss_mlp": 0.2252396, "epoch": 0.8101608297008868, "flos": 21067210661760.0, "grad_norm": 8.068543430294122, "language_loss": 0.84820509, "learning_rate": 3.661822855683723e-07, "loss": 0.86326075, "num_input_tokens_seen": 290848815, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.26318359, "step": 13475, "time_per_iteration": 2.7101924419403076 }, { "auxiliary_loss_clip": 0.01234668, "auxiliary_loss_mlp": 0.00228539, "balance_loss_clip": 1.02062738, "balance_loss_mlp": 0.20592535, "epoch": 0.8102209529535548, "flos": 23731279603200.0, "grad_norm": 46.92043242221108, "language_loss": 0.8151347, "learning_rate": 3.659576879869364e-07, "loss": 0.82976675, "num_input_tokens_seen": 290868580, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.22631836, "step": 13476, "time_per_iteration": 2.712204694747925 }, { "auxiliary_loss_clip": 0.01273496, "auxiliary_loss_mlp": 0.0021759, "balance_loss_clip": 1.04355752, "balance_loss_mlp": 0.19126877, "epoch": 0.8102810762062228, "flos": 10955879107200.0, "grad_norm": 62.36986046615791, "language_loss": 0.85849738, "learning_rate": 3.657331523685485e-07, "loss": 0.8734082, "num_input_tokens_seen": 290883540, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.26330566, "step": 13477, "time_per_iteration": 2.694314956665039 }, { "auxiliary_loss_clip": 0.01257463, "auxiliary_loss_mlp": 0.0023305, "balance_loss_clip": 1.03599453, "balance_loss_mlp": 0.20733692, "epoch": 0.8103411994588907, "flos": 14648825220480.0, "grad_norm": 315.63889248006694, "language_loss": 0.78968084, "learning_rate": 3.6550867872172365e-07, "loss": 0.80458599, "num_input_tokens_seen": 290901560, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25720215, "step": 13478, "time_per_iteration": 2.720038890838623 }, { "auxiliary_loss_clip": 0.01167602, "auxiliary_loss_mlp": 0.00071688, "balance_loss_clip": 1.01758814, "balance_loss_mlp": 0.06429717, "epoch": 0.8104013227115587, "flos": 59153314665600.0, "grad_norm": 0.6706795497544106, "language_loss": 0.51671851, "learning_rate": 3.6528426705497293e-07, "loss": 0.52911139, "num_input_tokens_seen": 290959185, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.07373047, "step": 13479, "time_per_iteration": 3.130262613296509 }, { "auxiliary_loss_clip": 0.01258519, "auxiliary_loss_mlp": 0.00233337, "balance_loss_clip": 1.04199517, "balance_loss_mlp": 0.20906594, "epoch": 0.8104614459642266, "flos": 19828687760640.0, "grad_norm": 7.293655027982415, "language_loss": 0.78299069, "learning_rate": 3.650599173768072e-07, "loss": 0.79790926, "num_input_tokens_seen": 290979585, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.24291992, "step": 13480, "time_per_iteration": 4.092426300048828 }, { "auxiliary_loss_clip": 0.01251655, "auxiliary_loss_mlp": 0.00247765, "balance_loss_clip": 1.0314492, "balance_loss_mlp": 0.22302948, "epoch": 0.8105215692168947, "flos": 25374264624000.0, "grad_norm": 4.327641230476764, "language_loss": 0.88600904, "learning_rate": 3.648356296957327e-07, "loss": 0.90100324, "num_input_tokens_seen": 291000865, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.24780273, "step": 13481, "time_per_iteration": 2.718433380126953 }, { "auxiliary_loss_clip": 0.0125255, "auxiliary_loss_mlp": 0.00234808, "balance_loss_clip": 1.029212, "balance_loss_mlp": 0.21091846, "epoch": 0.8105816924695626, "flos": 20481722974080.0, "grad_norm": 108.82492594393477, "language_loss": 0.82044578, "learning_rate": 3.646114040202548e-07, "loss": 0.8353194, "num_input_tokens_seen": 291018285, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.23876953, "step": 13482, "time_per_iteration": 4.042736768722534 }, { "auxiliary_loss_clip": 0.01251376, "auxiliary_loss_mlp": 0.00238652, "balance_loss_clip": 1.03075814, "balance_loss_mlp": 0.21414214, "epoch": 0.8106418157222306, "flos": 14538687143040.0, "grad_norm": 3.507069146609087, "language_loss": 0.75156182, "learning_rate": 3.6438724035887705e-07, "loss": 0.76646209, "num_input_tokens_seen": 291035745, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24487305, "step": 13483, "time_per_iteration": 2.5852248668670654 }, { "auxiliary_loss_clip": 0.01245531, "auxiliary_loss_mlp": 0.00241605, "balance_loss_clip": 1.02626669, "balance_loss_mlp": 0.21568921, "epoch": 0.8107019389748985, "flos": 22564470205440.0, "grad_norm": 3.523924127071528, "language_loss": 0.82763863, "learning_rate": 3.641631387200992e-07, "loss": 0.84250998, "num_input_tokens_seen": 291053280, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25878906, "step": 13484, "time_per_iteration": 2.6638400554656982 }, { "auxiliary_loss_clip": 0.01280877, "auxiliary_loss_mlp": 0.00219944, "balance_loss_clip": 1.05031717, "balance_loss_mlp": 0.19508937, "epoch": 0.8107620622275665, "flos": 19609560840960.0, "grad_norm": 8.298796791088785, "language_loss": 0.81433129, "learning_rate": 3.639390991124183e-07, "loss": 0.8293395, "num_input_tokens_seen": 291072855, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.24853516, "step": 13485, "time_per_iteration": 2.6407179832458496 }, { "auxiliary_loss_clip": 0.01252005, "auxiliary_loss_mlp": 0.00218867, "balance_loss_clip": 1.03444028, "balance_loss_mlp": 0.19625337, "epoch": 0.8108221854802344, "flos": 16143498984960.0, "grad_norm": 28.134921208988604, "language_loss": 0.83524251, "learning_rate": 3.637151215443308e-07, "loss": 0.84995121, "num_input_tokens_seen": 291090285, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.22607422, "step": 13486, "time_per_iteration": 2.6572391986846924 }, { "auxiliary_loss_clip": 0.01282986, "auxiliary_loss_mlp": 0.00233951, "balance_loss_clip": 1.0525105, "balance_loss_mlp": 0.20743914, "epoch": 0.8108823087329025, "flos": 21106209853440.0, "grad_norm": 9.704399337916424, "language_loss": 0.79523718, "learning_rate": 3.6349120602433045e-07, "loss": 0.81040657, "num_input_tokens_seen": 291107675, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.26513672, "step": 13487, "time_per_iteration": 4.175585985183716 }, { "auxiliary_loss_clip": 0.01240634, "auxiliary_loss_mlp": 0.00218999, "balance_loss_clip": 1.02551043, "balance_loss_mlp": 0.19742167, "epoch": 0.8109424319855704, "flos": 29199648182400.0, "grad_norm": 6.018014934480455, "language_loss": 0.90040052, "learning_rate": 3.6326735256090715e-07, "loss": 0.91499686, "num_input_tokens_seen": 291126900, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.21557617, "step": 13488, "time_per_iteration": 2.7000784873962402 }, { "auxiliary_loss_clip": 0.01255296, "auxiliary_loss_mlp": 0.00246364, "balance_loss_clip": 1.03717256, "balance_loss_mlp": 0.22140174, "epoch": 0.8110025552382384, "flos": 23111856541440.0, "grad_norm": 47.022319750060696, "language_loss": 0.81009358, "learning_rate": 3.630435611625502e-07, "loss": 0.8251102, "num_input_tokens_seen": 291145285, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24951172, "step": 13489, "time_per_iteration": 2.857109785079956 }, { "auxiliary_loss_clip": 0.01242099, "auxiliary_loss_mlp": 0.00229604, "balance_loss_clip": 1.02636719, "balance_loss_mlp": 0.2045466, "epoch": 0.8110626784909064, "flos": 22379961018240.0, "grad_norm": 7.672187099651406, "language_loss": 0.78728807, "learning_rate": 3.628198318377453e-07, "loss": 0.80200511, "num_input_tokens_seen": 291163485, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.25048828, "step": 13490, "time_per_iteration": 4.135535955429077 }, { "auxiliary_loss_clip": 0.01262773, "auxiliary_loss_mlp": 0.00237792, "balance_loss_clip": 1.04339886, "balance_loss_mlp": 0.2148326, "epoch": 0.8111228017435743, "flos": 23368043318400.0, "grad_norm": 17.62562101508556, "language_loss": 0.80451554, "learning_rate": 3.625961645949762e-07, "loss": 0.81952125, "num_input_tokens_seen": 291182215, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.22949219, "step": 13491, "time_per_iteration": 2.6563916206359863 }, { "auxiliary_loss_clip": 0.01257886, "auxiliary_loss_mlp": 0.00255024, "balance_loss_clip": 1.0354414, "balance_loss_mlp": 0.22962052, "epoch": 0.8111829249962423, "flos": 21286553063040.0, "grad_norm": 146.71934536026149, "language_loss": 0.74184042, "learning_rate": 3.623725594427245e-07, "loss": 0.75696957, "num_input_tokens_seen": 291203145, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25402832, "step": 13492, "time_per_iteration": 2.7401821613311768 }, { "auxiliary_loss_clip": 0.01252518, "auxiliary_loss_mlp": 0.00250427, "balance_loss_clip": 1.03022861, "balance_loss_mlp": 0.22522576, "epoch": 0.8112430482489102, "flos": 22345558767360.0, "grad_norm": 13.185582932723104, "language_loss": 0.79659981, "learning_rate": 3.6214901638947006e-07, "loss": 0.8116293, "num_input_tokens_seen": 291220600, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25183105, "step": 13493, "time_per_iteration": 2.6355457305908203 }, { "auxiliary_loss_clip": 0.01256699, "auxiliary_loss_mlp": 0.00220188, "balance_loss_clip": 1.03442144, "balance_loss_mlp": 0.19511791, "epoch": 0.8113031715015783, "flos": 31138321962240.0, "grad_norm": 2.5824214517009487, "language_loss": 0.79434687, "learning_rate": 3.619255354436885e-07, "loss": 0.80911577, "num_input_tokens_seen": 291241195, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.25085449, "step": 13494, "time_per_iteration": 2.7185592651367188 }, { "auxiliary_loss_clip": 0.01277924, "auxiliary_loss_mlp": 0.00260696, "balance_loss_clip": 1.04791403, "balance_loss_mlp": 0.23392203, "epoch": 0.8113632947542462, "flos": 25335445000320.0, "grad_norm": 5.148851858492411, "language_loss": 0.85103476, "learning_rate": 3.6170211661385543e-07, "loss": 0.86642098, "num_input_tokens_seen": 291258715, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.26782227, "step": 13495, "time_per_iteration": 2.672102928161621 }, { "auxiliary_loss_clip": 0.01270557, "auxiliary_loss_mlp": 0.00233225, "balance_loss_clip": 1.0428679, "balance_loss_mlp": 0.20585501, "epoch": 0.8114234180069142, "flos": 28439168411520.0, "grad_norm": 15.246778347304039, "language_loss": 0.88522822, "learning_rate": 3.614787599084417e-07, "loss": 0.90026605, "num_input_tokens_seen": 291278030, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.27380371, "step": 13496, "time_per_iteration": 2.742910861968994 }, { "auxiliary_loss_clip": 0.01261879, "auxiliary_loss_mlp": 0.00219011, "balance_loss_clip": 1.03656793, "balance_loss_mlp": 0.19526415, "epoch": 0.8114835412595821, "flos": 20338870584960.0, "grad_norm": 5.946275623870035, "language_loss": 0.79858351, "learning_rate": 3.6125546533591787e-07, "loss": 0.8133924, "num_input_tokens_seen": 291296740, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.23742676, "step": 13497, "time_per_iteration": 2.6759040355682373 }, { "auxiliary_loss_clip": 0.01262295, "auxiliary_loss_mlp": 0.0026259, "balance_loss_clip": 1.04006422, "balance_loss_mlp": 0.23700735, "epoch": 0.8115436645122501, "flos": 22490889194880.0, "grad_norm": 2.9476671653097664, "language_loss": 0.83227056, "learning_rate": 3.610322329047508e-07, "loss": 0.8475194, "num_input_tokens_seen": 291318730, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.2557373, "step": 13498, "time_per_iteration": 2.702817678451538 }, { "auxiliary_loss_clip": 0.01247738, "auxiliary_loss_mlp": 0.0023631, "balance_loss_clip": 1.02524114, "balance_loss_mlp": 0.21071552, "epoch": 0.811603787764918, "flos": 13845288021120.0, "grad_norm": 31.149791138252063, "language_loss": 0.93538445, "learning_rate": 3.608090626234055e-07, "loss": 0.95022488, "num_input_tokens_seen": 291336755, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25598145, "step": 13499, "time_per_iteration": 2.6531219482421875 }, { "auxiliary_loss_clip": 0.01264085, "auxiliary_loss_mlp": 0.00260407, "balance_loss_clip": 1.03901768, "balance_loss_mlp": 0.23236893, "epoch": 0.8116639110175861, "flos": 21614632911360.0, "grad_norm": 19.788418810415305, "language_loss": 0.82724428, "learning_rate": 3.6058595450034603e-07, "loss": 0.84248918, "num_input_tokens_seen": 291356795, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.28039551, "step": 13500, "time_per_iteration": 2.6419479846954346 }, { "auxiliary_loss_clip": 0.01147936, "auxiliary_loss_mlp": 0.00111114, "balance_loss_clip": 0.99212372, "balance_loss_mlp": 0.10286444, "epoch": 0.811724034270254, "flos": 64459799625600.0, "grad_norm": 0.8394028406525358, "language_loss": 0.59294742, "learning_rate": 3.603629085440303e-07, "loss": 0.60553795, "num_input_tokens_seen": 291416005, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.08251953, "step": 13501, "time_per_iteration": 3.202393054962158 }, { "auxiliary_loss_clip": 0.01235177, "auxiliary_loss_mlp": 0.00217401, "balance_loss_clip": 1.02623451, "balance_loss_mlp": 0.19420266, "epoch": 0.811784157522922, "flos": 24754123290240.0, "grad_norm": 14.430684631741803, "language_loss": 0.86995828, "learning_rate": 3.6013992476291753e-07, "loss": 0.88448411, "num_input_tokens_seen": 291434870, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.23217773, "step": 13502, "time_per_iteration": 2.716451406478882 }, { "auxiliary_loss_clip": 0.01245939, "auxiliary_loss_mlp": 0.00228484, "balance_loss_clip": 1.02616572, "balance_loss_mlp": 0.20560782, "epoch": 0.81184428077559, "flos": 12167146563840.0, "grad_norm": 13.38976488702154, "language_loss": 0.79775119, "learning_rate": 3.599170031654635e-07, "loss": 0.81249535, "num_input_tokens_seen": 291452230, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.22888184, "step": 13503, "time_per_iteration": 2.7014784812927246 }, { "auxiliary_loss_clip": 0.01262009, "auxiliary_loss_mlp": 0.00231519, "balance_loss_clip": 1.03855157, "balance_loss_mlp": 0.20631804, "epoch": 0.8119044040282579, "flos": 44422037775360.0, "grad_norm": 18.60518469167934, "language_loss": 0.7355299, "learning_rate": 3.5969414376012065e-07, "loss": 0.75046521, "num_input_tokens_seen": 291477425, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25195312, "step": 13504, "time_per_iteration": 2.917557716369629 }, { "auxiliary_loss_clip": 0.01254924, "auxiliary_loss_mlp": 0.00245292, "balance_loss_clip": 1.03387463, "balance_loss_mlp": 0.22024632, "epoch": 0.8119645272809259, "flos": 52155507957120.0, "grad_norm": 28.133769306885604, "language_loss": 0.81264168, "learning_rate": 3.594713465553403e-07, "loss": 0.82764387, "num_input_tokens_seen": 291501070, "router_z_loss_clip": 2.20996094, "router_z_loss_mlp": 0.25036621, "step": 13505, "time_per_iteration": 2.922914981842041 }, { "auxiliary_loss_clip": 0.01258322, "auxiliary_loss_mlp": 0.00246376, "balance_loss_clip": 1.03827882, "balance_loss_mlp": 0.22090077, "epoch": 0.8120246505335939, "flos": 30232978640640.0, "grad_norm": 37.19981979744804, "language_loss": 0.79809898, "learning_rate": 3.5924861155957123e-07, "loss": 0.813146, "num_input_tokens_seen": 291524945, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.25476074, "step": 13506, "time_per_iteration": 2.7759575843811035 }, { "auxiliary_loss_clip": 0.01268009, "auxiliary_loss_mlp": 0.00244, "balance_loss_clip": 1.04112148, "balance_loss_mlp": 0.2184175, "epoch": 0.8120847737862619, "flos": 22127652910080.0, "grad_norm": 7.700564288680627, "language_loss": 0.85545897, "learning_rate": 3.590259387812593e-07, "loss": 0.87057906, "num_input_tokens_seen": 291544605, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.2557373, "step": 13507, "time_per_iteration": 2.6925323009490967 }, { "auxiliary_loss_clip": 0.01265869, "auxiliary_loss_mlp": 0.00238473, "balance_loss_clip": 1.04006195, "balance_loss_mlp": 0.2140951, "epoch": 0.8121448970389298, "flos": 23295180579840.0, "grad_norm": 15.370127337959591, "language_loss": 0.78955495, "learning_rate": 3.5880332822884783e-07, "loss": 0.80459839, "num_input_tokens_seen": 291563850, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.24389648, "step": 13508, "time_per_iteration": 2.675222396850586 }, { "auxiliary_loss_clip": 0.01257579, "auxiliary_loss_mlp": 0.00230644, "balance_loss_clip": 1.03625321, "balance_loss_mlp": 0.20606278, "epoch": 0.8122050202915978, "flos": 22164138149760.0, "grad_norm": 2.3275325830914553, "language_loss": 0.83311975, "learning_rate": 3.585807799107785e-07, "loss": 0.84800196, "num_input_tokens_seen": 291581730, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.24560547, "step": 13509, "time_per_iteration": 2.656312942504883 }, { "auxiliary_loss_clip": 0.01271578, "auxiliary_loss_mlp": 0.00235412, "balance_loss_clip": 1.04183292, "balance_loss_mlp": 0.20898351, "epoch": 0.8122651435442657, "flos": 23258946735360.0, "grad_norm": 3120.5613210437077, "language_loss": 0.8447907, "learning_rate": 3.58358293835491e-07, "loss": 0.8598606, "num_input_tokens_seen": 291601225, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.26416016, "step": 13510, "time_per_iteration": 2.7643418312072754 }, { "auxiliary_loss_clip": 0.01266319, "auxiliary_loss_mlp": 0.00235606, "balance_loss_clip": 1.04228115, "balance_loss_mlp": 0.20966636, "epoch": 0.8123252667969337, "flos": 16140015365760.0, "grad_norm": 6.037697463409731, "language_loss": 0.78869438, "learning_rate": 3.581358700114212e-07, "loss": 0.80371362, "num_input_tokens_seen": 291616995, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.25952148, "step": 13511, "time_per_iteration": 2.640000104904175 }, { "auxiliary_loss_clip": 0.01243744, "auxiliary_loss_mlp": 0.00252353, "balance_loss_clip": 1.02436304, "balance_loss_mlp": 0.22779605, "epoch": 0.8123853900496016, "flos": 21245399055360.0, "grad_norm": 36.32082310917654, "language_loss": 0.87225795, "learning_rate": 3.57913508447004e-07, "loss": 0.88721901, "num_input_tokens_seen": 291636145, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24560547, "step": 13512, "time_per_iteration": 2.7179341316223145 }, { "auxiliary_loss_clip": 0.01250444, "auxiliary_loss_mlp": 0.00214029, "balance_loss_clip": 1.02953935, "balance_loss_mlp": 0.19027032, "epoch": 0.8124455133022697, "flos": 64377596373120.0, "grad_norm": 3.514086413204601, "language_loss": 0.72342539, "learning_rate": 3.5769120915067076e-07, "loss": 0.73807013, "num_input_tokens_seen": 291662440, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.23754883, "step": 13513, "time_per_iteration": 3.0391275882720947 }, { "auxiliary_loss_clip": 0.01254053, "auxiliary_loss_mlp": 0.00227965, "balance_loss_clip": 1.03053796, "balance_loss_mlp": 0.20091683, "epoch": 0.8125056365549376, "flos": 23842207779840.0, "grad_norm": 19.704419048971104, "language_loss": 0.80756259, "learning_rate": 3.5746897213085194e-07, "loss": 0.82238281, "num_input_tokens_seen": 291680950, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.27026367, "step": 13514, "time_per_iteration": 2.670525074005127 }, { "auxiliary_loss_clip": 0.01243149, "auxiliary_loss_mlp": 0.00227046, "balance_loss_clip": 1.0277276, "balance_loss_mlp": 0.20296581, "epoch": 0.8125657598076056, "flos": 23550325862400.0, "grad_norm": 52.36381222954289, "language_loss": 0.70208389, "learning_rate": 3.5724679739597364e-07, "loss": 0.71678579, "num_input_tokens_seen": 291702395, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.2409668, "step": 13515, "time_per_iteration": 2.7418272495269775 }, { "auxiliary_loss_clip": 0.0122438, "auxiliary_loss_mlp": 0.00228461, "balance_loss_clip": 1.01415718, "balance_loss_mlp": 0.20546573, "epoch": 0.8126258830602736, "flos": 20704225772160.0, "grad_norm": 27.059677875910104, "language_loss": 0.83650333, "learning_rate": 3.570246849544616e-07, "loss": 0.85103172, "num_input_tokens_seen": 291721135, "router_z_loss_clip": 2.10058594, "router_z_loss_mlp": 0.2298584, "step": 13516, "time_per_iteration": 2.737856864929199 }, { "auxiliary_loss_clip": 0.01266226, "auxiliary_loss_mlp": 0.00250108, "balance_loss_clip": 1.04750431, "balance_loss_mlp": 0.22683883, "epoch": 0.8126860063129415, "flos": 23618160696960.0, "grad_norm": 37.1275351238903, "language_loss": 0.97356945, "learning_rate": 3.5680263481473907e-07, "loss": 0.98873281, "num_input_tokens_seen": 291741235, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.23266602, "step": 13517, "time_per_iteration": 2.730692148208618 }, { "auxiliary_loss_clip": 0.01248602, "auxiliary_loss_mlp": 0.00239723, "balance_loss_clip": 1.0362941, "balance_loss_mlp": 0.21653718, "epoch": 0.8127461295656095, "flos": 25007149670400.0, "grad_norm": 16.297532896476618, "language_loss": 0.85162956, "learning_rate": 3.565806469852244e-07, "loss": 0.86651284, "num_input_tokens_seen": 291761430, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.23168945, "step": 13518, "time_per_iteration": 2.8045406341552734 }, { "auxiliary_loss_clip": 0.01251434, "auxiliary_loss_mlp": 0.00226166, "balance_loss_clip": 1.03558755, "balance_loss_mlp": 0.20311141, "epoch": 0.8128062528182775, "flos": 27342169096320.0, "grad_norm": 100.88082534939616, "language_loss": 0.86301112, "learning_rate": 3.56358721474336e-07, "loss": 0.87778717, "num_input_tokens_seen": 291781755, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23046875, "step": 13519, "time_per_iteration": 2.8490850925445557 }, { "auxiliary_loss_clip": 0.01263778, "auxiliary_loss_mlp": 0.00235977, "balance_loss_clip": 1.03726745, "balance_loss_mlp": 0.20815358, "epoch": 0.8128663760709455, "flos": 26506312634880.0, "grad_norm": 18269.55922526264, "language_loss": 0.80655849, "learning_rate": 3.561368582904905e-07, "loss": 0.82155603, "num_input_tokens_seen": 291804410, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.27819824, "step": 13520, "time_per_iteration": 2.7178120613098145 }, { "auxiliary_loss_clip": 0.01253184, "auxiliary_loss_mlp": 0.00247703, "balance_loss_clip": 1.03528988, "balance_loss_mlp": 0.22370584, "epoch": 0.8129264993236134, "flos": 17931239815680.0, "grad_norm": 32.71165318826856, "language_loss": 0.78387046, "learning_rate": 3.5591505744209925e-07, "loss": 0.79887938, "num_input_tokens_seen": 291823285, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24023438, "step": 13521, "time_per_iteration": 2.70349383354187 }, { "auxiliary_loss_clip": 0.01268956, "auxiliary_loss_mlp": 0.00246545, "balance_loss_clip": 1.04180801, "balance_loss_mlp": 0.22018735, "epoch": 0.8129866225762814, "flos": 26177694082560.0, "grad_norm": 39.94981684135408, "language_loss": 0.77574682, "learning_rate": 3.5569331893757394e-07, "loss": 0.79090184, "num_input_tokens_seen": 291845305, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.2635498, "step": 13522, "time_per_iteration": 4.14990758895874 }, { "auxiliary_loss_clip": 0.01250102, "auxiliary_loss_mlp": 0.00249203, "balance_loss_clip": 1.03625107, "balance_loss_mlp": 0.2257902, "epoch": 0.8130467458289493, "flos": 21032197879680.0, "grad_norm": 5.278705218801356, "language_loss": 0.75606155, "learning_rate": 3.554716427853233e-07, "loss": 0.77105457, "num_input_tokens_seen": 291863715, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.23413086, "step": 13523, "time_per_iteration": 2.6319921016693115 }, { "auxiliary_loss_clip": 0.01246901, "auxiliary_loss_mlp": 0.00232413, "balance_loss_clip": 1.02951372, "balance_loss_mlp": 0.20770073, "epoch": 0.8131068690816173, "flos": 15487051979520.0, "grad_norm": 7.362166997872346, "language_loss": 0.79548568, "learning_rate": 3.5525002899375256e-07, "loss": 0.81027877, "num_input_tokens_seen": 291880735, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.24731445, "step": 13524, "time_per_iteration": 4.1123669147491455 }, { "auxiliary_loss_clip": 0.01245566, "auxiliary_loss_mlp": 0.00228439, "balance_loss_clip": 1.02713466, "balance_loss_mlp": 0.20427507, "epoch": 0.8131669923342852, "flos": 29351227576320.0, "grad_norm": 7.652571253702531, "language_loss": 0.69684613, "learning_rate": 3.550284775712653e-07, "loss": 0.71158612, "num_input_tokens_seen": 291900535, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24157715, "step": 13525, "time_per_iteration": 2.736398458480835 }, { "auxiliary_loss_clip": 0.01244978, "auxiliary_loss_mlp": 0.00206285, "balance_loss_clip": 1.02882326, "balance_loss_mlp": 0.18293181, "epoch": 0.8132271155869533, "flos": 35256162055680.0, "grad_norm": 60.08572391206648, "language_loss": 0.71770406, "learning_rate": 3.548069885262628e-07, "loss": 0.73221666, "num_input_tokens_seen": 291919760, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.23364258, "step": 13526, "time_per_iteration": 2.751244068145752 }, { "auxiliary_loss_clip": 0.01233431, "auxiliary_loss_mlp": 0.00231526, "balance_loss_clip": 1.01870847, "balance_loss_mlp": 0.20739798, "epoch": 0.8132872388396212, "flos": 27781895393280.0, "grad_norm": 4.8078015626904635, "language_loss": 0.82497138, "learning_rate": 3.5458556186714473e-07, "loss": 0.83962095, "num_input_tokens_seen": 291938915, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24121094, "step": 13527, "time_per_iteration": 2.734947443008423 }, { "auxiliary_loss_clip": 0.01262773, "auxiliary_loss_mlp": 0.00248904, "balance_loss_clip": 1.04149556, "balance_loss_mlp": 0.22340514, "epoch": 0.8133473620922892, "flos": 27819601695360.0, "grad_norm": 56.61141286664804, "language_loss": 0.77315533, "learning_rate": 3.5436419760230706e-07, "loss": 0.78827214, "num_input_tokens_seen": 291958145, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25512695, "step": 13528, "time_per_iteration": 2.7453033924102783 }, { "auxiliary_loss_clip": 0.01248625, "auxiliary_loss_mlp": 0.00241789, "balance_loss_clip": 1.03063548, "balance_loss_mlp": 0.21630171, "epoch": 0.8134074853449572, "flos": 18989527248000.0, "grad_norm": 46.322956932422244, "language_loss": 0.79638535, "learning_rate": 3.5414289574014357e-07, "loss": 0.81128949, "num_input_tokens_seen": 291976860, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25463867, "step": 13529, "time_per_iteration": 4.208867311477661 }, { "auxiliary_loss_clip": 0.01241699, "auxiliary_loss_mlp": 0.00217009, "balance_loss_clip": 1.0257498, "balance_loss_mlp": 0.19444311, "epoch": 0.8134676085976251, "flos": 24242863057920.0, "grad_norm": 67.89003158133195, "language_loss": 0.84055507, "learning_rate": 3.5392165628904635e-07, "loss": 0.85514218, "num_input_tokens_seen": 291998085, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.22570801, "step": 13530, "time_per_iteration": 2.685152292251587 }, { "auxiliary_loss_clip": 0.01233501, "auxiliary_loss_mlp": 0.00220808, "balance_loss_clip": 1.02189028, "balance_loss_mlp": 0.19880182, "epoch": 0.8135277318502931, "flos": 19062389986560.0, "grad_norm": 3.1681398095051905, "language_loss": 0.90336001, "learning_rate": 3.537004792574052e-07, "loss": 0.91790307, "num_input_tokens_seen": 292016585, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.21984863, "step": 13531, "time_per_iteration": 2.625089645385742 }, { "auxiliary_loss_clip": 0.01264161, "auxiliary_loss_mlp": 0.00256322, "balance_loss_clip": 1.03897822, "balance_loss_mlp": 0.22960687, "epoch": 0.813587855102961, "flos": 17269728992640.0, "grad_norm": 52.915002268697386, "language_loss": 0.83871138, "learning_rate": 3.534793646536065e-07, "loss": 0.85391623, "num_input_tokens_seen": 292033255, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.26721191, "step": 13532, "time_per_iteration": 4.045852899551392 }, { "auxiliary_loss_clip": 0.01244592, "auxiliary_loss_mlp": 0.00213317, "balance_loss_clip": 1.02739048, "balance_loss_mlp": 0.18946359, "epoch": 0.8136479783556291, "flos": 20157593621760.0, "grad_norm": 29.089776172542667, "language_loss": 0.84558058, "learning_rate": 3.5325831248603533e-07, "loss": 0.86015964, "num_input_tokens_seen": 292051800, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.23840332, "step": 13533, "time_per_iteration": 2.6456258296966553 }, { "auxiliary_loss_clip": 0.01268743, "auxiliary_loss_mlp": 0.00246846, "balance_loss_clip": 1.0437994, "balance_loss_mlp": 0.22128725, "epoch": 0.813708101608297, "flos": 22052348046720.0, "grad_norm": 46.692679162384145, "language_loss": 0.83607823, "learning_rate": 3.5303732276307495e-07, "loss": 0.85123408, "num_input_tokens_seen": 292072215, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.25549316, "step": 13534, "time_per_iteration": 2.7209250926971436 }, { "auxiliary_loss_clip": 0.01253147, "auxiliary_loss_mlp": 0.00226829, "balance_loss_clip": 1.03528214, "balance_loss_mlp": 0.20413205, "epoch": 0.813768224860965, "flos": 16173412035840.0, "grad_norm": 185.24598784764217, "language_loss": 1.00213301, "learning_rate": 3.5281639549310336e-07, "loss": 1.01693273, "num_input_tokens_seen": 292088830, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.22680664, "step": 13535, "time_per_iteration": 2.6354846954345703 }, { "auxiliary_loss_clip": 0.01248226, "auxiliary_loss_mlp": 0.00219428, "balance_loss_clip": 1.03362453, "balance_loss_mlp": 0.19727927, "epoch": 0.8138283481136329, "flos": 24352318776960.0, "grad_norm": 53.770768321391905, "language_loss": 0.77499425, "learning_rate": 3.52595530684499e-07, "loss": 0.78967083, "num_input_tokens_seen": 292109225, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.22143555, "step": 13536, "time_per_iteration": 2.6855294704437256 }, { "auxiliary_loss_clip": 0.01255797, "auxiliary_loss_mlp": 0.00223386, "balance_loss_clip": 1.03305876, "balance_loss_mlp": 0.19904372, "epoch": 0.8138884713663009, "flos": 25516362827520.0, "grad_norm": 5.104108752004816, "language_loss": 0.83032143, "learning_rate": 3.5237472834563775e-07, "loss": 0.84511328, "num_input_tokens_seen": 292129660, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.24353027, "step": 13537, "time_per_iteration": 2.7276642322540283 }, { "auxiliary_loss_clip": 0.01230128, "auxiliary_loss_mlp": 0.00217179, "balance_loss_clip": 1.01595473, "balance_loss_mlp": 0.1943627, "epoch": 0.8139485946189688, "flos": 22454368041600.0, "grad_norm": 9.5294011274582, "language_loss": 0.82874942, "learning_rate": 3.5215398848489163e-07, "loss": 0.8432225, "num_input_tokens_seen": 292149090, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.22814941, "step": 13538, "time_per_iteration": 2.6948652267456055 }, { "auxiliary_loss_clip": 0.0126248, "auxiliary_loss_mlp": 0.00231052, "balance_loss_clip": 1.03921175, "balance_loss_mlp": 0.20682889, "epoch": 0.8140087178716369, "flos": 21250391045760.0, "grad_norm": 4.691297662153734, "language_loss": 0.84092104, "learning_rate": 3.5193331111063176e-07, "loss": 0.85585636, "num_input_tokens_seen": 292169260, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.2421875, "step": 13539, "time_per_iteration": 2.718554735183716 }, { "auxiliary_loss_clip": 0.01231138, "auxiliary_loss_mlp": 0.00217939, "balance_loss_clip": 1.01783752, "balance_loss_mlp": 0.19581358, "epoch": 0.8140688411243048, "flos": 39415730774400.0, "grad_norm": 20.362097618469463, "language_loss": 0.72214556, "learning_rate": 3.5171269623122533e-07, "loss": 0.73663634, "num_input_tokens_seen": 292188145, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.22143555, "step": 13540, "time_per_iteration": 2.784166097640991 }, { "auxiliary_loss_clip": 0.01247539, "auxiliary_loss_mlp": 0.00221948, "balance_loss_clip": 1.02965927, "balance_loss_mlp": 0.19882125, "epoch": 0.8141289643769728, "flos": 25415885508480.0, "grad_norm": 6780.4690583192, "language_loss": 0.72740412, "learning_rate": 3.5149214385503913e-07, "loss": 0.74209899, "num_input_tokens_seen": 292212135, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.2310791, "step": 13541, "time_per_iteration": 2.8120169639587402 }, { "auxiliary_loss_clip": 0.01269454, "auxiliary_loss_mlp": 0.00235907, "balance_loss_clip": 1.0427022, "balance_loss_mlp": 0.20954967, "epoch": 0.8141890876296408, "flos": 12568053237120.0, "grad_norm": 6.040106645177595, "language_loss": 0.80084497, "learning_rate": 3.512716539904355e-07, "loss": 0.81589854, "num_input_tokens_seen": 292230645, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26367188, "step": 13542, "time_per_iteration": 2.6247639656066895 }, { "auxiliary_loss_clip": 0.01266185, "auxiliary_loss_mlp": 0.00236832, "balance_loss_clip": 1.0423429, "balance_loss_mlp": 0.21219176, "epoch": 0.8142492108823087, "flos": 14967172483200.0, "grad_norm": 8.897830887097742, "language_loss": 0.91117233, "learning_rate": 3.5105122664577613e-07, "loss": 0.92620248, "num_input_tokens_seen": 292243540, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.2467041, "step": 13543, "time_per_iteration": 2.619767904281616 }, { "auxiliary_loss_clip": 0.01265477, "auxiliary_loss_mlp": 0.00222101, "balance_loss_clip": 1.04156959, "balance_loss_mlp": 0.19719842, "epoch": 0.8143093341349767, "flos": 12422004537600.0, "grad_norm": 27.09973744512186, "language_loss": 0.89328361, "learning_rate": 3.5083086182942003e-07, "loss": 0.90815938, "num_input_tokens_seen": 292261715, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.24902344, "step": 13544, "time_per_iteration": 2.5914998054504395 }, { "auxiliary_loss_clip": 0.01290137, "auxiliary_loss_mlp": 0.00238956, "balance_loss_clip": 1.05172324, "balance_loss_mlp": 0.21281339, "epoch": 0.8143694573876447, "flos": 11910564737280.0, "grad_norm": 5.448066730464954, "language_loss": 0.86469084, "learning_rate": 3.5061055954972264e-07, "loss": 0.87998176, "num_input_tokens_seen": 292275080, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.26159668, "step": 13545, "time_per_iteration": 2.61784029006958 }, { "auxiliary_loss_clip": 0.01222455, "auxiliary_loss_mlp": 0.00227487, "balance_loss_clip": 1.01280403, "balance_loss_mlp": 0.20470604, "epoch": 0.8144295806403127, "flos": 21212900225280.0, "grad_norm": 29.99562815298172, "language_loss": 0.8350544, "learning_rate": 3.5039031981503776e-07, "loss": 0.84955382, "num_input_tokens_seen": 292294635, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.22802734, "step": 13546, "time_per_iteration": 2.649095058441162 }, { "auxiliary_loss_clip": 0.01251074, "auxiliary_loss_mlp": 0.0022316, "balance_loss_clip": 1.03612459, "balance_loss_mlp": 0.19985431, "epoch": 0.8144897038929806, "flos": 19865280741120.0, "grad_norm": 208.5203966441145, "language_loss": 0.80781567, "learning_rate": 3.501701426337178e-07, "loss": 0.82255793, "num_input_tokens_seen": 292312695, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.23291016, "step": 13547, "time_per_iteration": 2.6567447185516357 }, { "auxiliary_loss_clip": 0.01276007, "auxiliary_loss_mlp": 0.00238432, "balance_loss_clip": 1.04938865, "balance_loss_mlp": 0.21259919, "epoch": 0.8145498271456486, "flos": 24571733005440.0, "grad_norm": 17.22913828946049, "language_loss": 0.80540508, "learning_rate": 3.49950028014111e-07, "loss": 0.82054949, "num_input_tokens_seen": 292332005, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.25805664, "step": 13548, "time_per_iteration": 2.678844690322876 }, { "auxiliary_loss_clip": 0.0126454, "auxiliary_loss_mlp": 0.00213653, "balance_loss_clip": 1.03935695, "balance_loss_mlp": 0.18964434, "epoch": 0.8146099503983165, "flos": 20193037367040.0, "grad_norm": 6.643646694199728, "language_loss": 0.86246407, "learning_rate": 3.4972997596456444e-07, "loss": 0.87724602, "num_input_tokens_seen": 292348365, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.23999023, "step": 13549, "time_per_iteration": 2.6857943534851074 }, { "auxiliary_loss_clip": 0.01251135, "auxiliary_loss_mlp": 0.00249869, "balance_loss_clip": 1.03351092, "balance_loss_mlp": 0.22452548, "epoch": 0.8146700736509845, "flos": 19536949497600.0, "grad_norm": 15.35205066156056, "language_loss": 0.80989861, "learning_rate": 3.4950998649342233e-07, "loss": 0.82490861, "num_input_tokens_seen": 292368050, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.25305176, "step": 13550, "time_per_iteration": 2.7402467727661133 }, { "auxiliary_loss_clip": 0.0122302, "auxiliary_loss_mlp": 0.00216701, "balance_loss_clip": 1.0188179, "balance_loss_mlp": 0.19411072, "epoch": 0.8147301969036524, "flos": 18041341979520.0, "grad_norm": 13.738913912677365, "language_loss": 0.79102993, "learning_rate": 3.4929005960902826e-07, "loss": 0.80542713, "num_input_tokens_seen": 292385315, "router_z_loss_clip": 2.04394531, "router_z_loss_mlp": 0.22607422, "step": 13551, "time_per_iteration": 2.618367910385132 }, { "auxiliary_loss_clip": 0.01289034, "auxiliary_loss_mlp": 0.00230997, "balance_loss_clip": 1.05350697, "balance_loss_mlp": 0.20403174, "epoch": 0.8147903201563205, "flos": 18004713085440.0, "grad_norm": 3.383675758626476, "language_loss": 0.80767846, "learning_rate": 3.4907019531971926e-07, "loss": 0.82287872, "num_input_tokens_seen": 292403375, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.26977539, "step": 13552, "time_per_iteration": 2.630678653717041 }, { "auxiliary_loss_clip": 0.01245952, "auxiliary_loss_mlp": 0.00219802, "balance_loss_clip": 1.02690101, "balance_loss_mlp": 0.19522101, "epoch": 0.8148504434089884, "flos": 20259327916800.0, "grad_norm": 182.160143087172, "language_loss": 0.89881992, "learning_rate": 3.4885039363383407e-07, "loss": 0.91347742, "num_input_tokens_seen": 292419260, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24584961, "step": 13553, "time_per_iteration": 2.622894763946533 }, { "auxiliary_loss_clip": 0.01254226, "auxiliary_loss_mlp": 0.00220035, "balance_loss_clip": 1.03327179, "balance_loss_mlp": 0.19603767, "epoch": 0.8149105666616564, "flos": 12494723621760.0, "grad_norm": 50.88388042489687, "language_loss": 0.78257149, "learning_rate": 3.4863065455970795e-07, "loss": 0.79731411, "num_input_tokens_seen": 292436095, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.23950195, "step": 13554, "time_per_iteration": 2.632939338684082 }, { "auxiliary_loss_clip": 0.01258477, "auxiliary_loss_mlp": 0.00219552, "balance_loss_clip": 1.03604412, "balance_loss_mlp": 0.19475633, "epoch": 0.8149706899143244, "flos": 32523683662080.0, "grad_norm": 152.0857071431594, "language_loss": 0.7466628, "learning_rate": 3.484109781056723e-07, "loss": 0.76144314, "num_input_tokens_seen": 292457190, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.24804688, "step": 13555, "time_per_iteration": 2.7388505935668945 }, { "auxiliary_loss_clip": 0.01247562, "auxiliary_loss_mlp": 0.00232719, "balance_loss_clip": 1.02454376, "balance_loss_mlp": 0.20770884, "epoch": 0.8150308131669923, "flos": 19386088375680.0, "grad_norm": 52.69184183143722, "language_loss": 0.83178413, "learning_rate": 3.4819136428005844e-07, "loss": 0.84658694, "num_input_tokens_seen": 292474300, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25, "step": 13556, "time_per_iteration": 2.688929319381714 }, { "auxiliary_loss_clip": 0.01240909, "auxiliary_loss_mlp": 0.0021488, "balance_loss_clip": 1.03059494, "balance_loss_mlp": 0.19285055, "epoch": 0.8150909364196604, "flos": 17421380213760.0, "grad_norm": 159.82228728202358, "language_loss": 0.86893535, "learning_rate": 3.4797181309119307e-07, "loss": 0.88349319, "num_input_tokens_seen": 292492420, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.2199707, "step": 13557, "time_per_iteration": 2.627779960632324 }, { "auxiliary_loss_clip": 0.01255051, "auxiliary_loss_mlp": 0.00244111, "balance_loss_clip": 1.03243959, "balance_loss_mlp": 0.21950589, "epoch": 0.8151510596723283, "flos": 27162795553920.0, "grad_norm": 7.883874783048359, "language_loss": 0.73363245, "learning_rate": 3.4775232454740255e-07, "loss": 0.74862409, "num_input_tokens_seen": 292512895, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.24609375, "step": 13558, "time_per_iteration": 2.711686372756958 }, { "auxiliary_loss_clip": 0.0113699, "auxiliary_loss_mlp": 0.00124357, "balance_loss_clip": 0.9798395, "balance_loss_mlp": 0.11625078, "epoch": 0.8152111829249963, "flos": 64219052718720.0, "grad_norm": 0.8814042396580454, "language_loss": 0.56171191, "learning_rate": 3.4753289865700896e-07, "loss": 0.57432532, "num_input_tokens_seen": 292566580, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.08105469, "step": 13559, "time_per_iteration": 3.138695240020752 }, { "auxiliary_loss_clip": 0.01141877, "auxiliary_loss_mlp": 0.00124073, "balance_loss_clip": 0.98326218, "balance_loss_mlp": 0.1168248, "epoch": 0.8152713061776642, "flos": 67072012306560.0, "grad_norm": 0.7451755935656752, "language_loss": 0.54796052, "learning_rate": 3.473135354283334e-07, "loss": 0.56061995, "num_input_tokens_seen": 292621490, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.07226562, "step": 13560, "time_per_iteration": 3.0047426223754883 }, { "auxiliary_loss_clip": 0.01239645, "auxiliary_loss_mlp": 0.00231474, "balance_loss_clip": 1.02363825, "balance_loss_mlp": 0.20785867, "epoch": 0.8153314294303322, "flos": 14391130072320.0, "grad_norm": 19.74038884258692, "language_loss": 0.75670397, "learning_rate": 3.470942348696948e-07, "loss": 0.77141517, "num_input_tokens_seen": 292638660, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.23608398, "step": 13561, "time_per_iteration": 2.7067933082580566 }, { "auxiliary_loss_clip": 0.01255107, "auxiliary_loss_mlp": 0.00224621, "balance_loss_clip": 1.03103495, "balance_loss_mlp": 0.20104158, "epoch": 0.8153915526830001, "flos": 25623520076160.0, "grad_norm": 25.807826330386735, "language_loss": 0.89785993, "learning_rate": 3.468749969894085e-07, "loss": 0.91265714, "num_input_tokens_seen": 292658545, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.23571777, "step": 13562, "time_per_iteration": 2.73317813873291 }, { "auxiliary_loss_clip": 0.01252275, "auxiliary_loss_mlp": 0.00238546, "balance_loss_clip": 1.03084004, "balance_loss_mlp": 0.21533608, "epoch": 0.8154516759356681, "flos": 23369156640000.0, "grad_norm": 62.71155646591662, "language_loss": 0.81014389, "learning_rate": 3.4665582179578734e-07, "loss": 0.82505214, "num_input_tokens_seen": 292678460, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.23217773, "step": 13563, "time_per_iteration": 2.688458204269409 }, { "auxiliary_loss_clip": 0.01237161, "auxiliary_loss_mlp": 0.00246645, "balance_loss_clip": 1.01733816, "balance_loss_mlp": 0.22078824, "epoch": 0.815511799188336, "flos": 28149189914880.0, "grad_norm": 180.43029810059005, "language_loss": 0.77392042, "learning_rate": 3.4643670929714387e-07, "loss": 0.78875852, "num_input_tokens_seen": 292699815, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25866699, "step": 13564, "time_per_iteration": 4.268319845199585 }, { "auxiliary_loss_clip": 0.01254612, "auxiliary_loss_mlp": 0.00229452, "balance_loss_clip": 1.03008997, "balance_loss_mlp": 0.20474008, "epoch": 0.8155719224410041, "flos": 16983413683200.0, "grad_norm": 8.919595631482935, "language_loss": 0.78983355, "learning_rate": 3.462176595017854e-07, "loss": 0.80467421, "num_input_tokens_seen": 292717370, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.24731445, "step": 13565, "time_per_iteration": 2.6007192134857178 }, { "auxiliary_loss_clip": 0.01247101, "auxiliary_loss_mlp": 0.00242206, "balance_loss_clip": 1.02832258, "balance_loss_mlp": 0.21570534, "epoch": 0.815632045693672, "flos": 24681727428480.0, "grad_norm": 13.571364120840183, "language_loss": 0.86213732, "learning_rate": 3.459986724180188e-07, "loss": 0.87703037, "num_input_tokens_seen": 292737110, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26477051, "step": 13566, "time_per_iteration": 4.108477592468262 }, { "auxiliary_loss_clip": 0.01235443, "auxiliary_loss_mlp": 0.00231436, "balance_loss_clip": 1.02697802, "balance_loss_mlp": 0.20993076, "epoch": 0.81569216894634, "flos": 19938323047680.0, "grad_norm": 3.0367861806755676, "language_loss": 0.88884473, "learning_rate": 3.457797480541491e-07, "loss": 0.90351355, "num_input_tokens_seen": 292756510, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.21520996, "step": 13567, "time_per_iteration": 2.632107973098755 }, { "auxiliary_loss_clip": 0.01231667, "auxiliary_loss_mlp": 0.00206362, "balance_loss_clip": 1.01802564, "balance_loss_mlp": 0.18365213, "epoch": 0.8157522921990079, "flos": 21799393493760.0, "grad_norm": 14.739333905282908, "language_loss": 0.88111109, "learning_rate": 3.455608864184771e-07, "loss": 0.89549136, "num_input_tokens_seen": 292776710, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.22729492, "step": 13568, "time_per_iteration": 2.6391797065734863 }, { "auxiliary_loss_clip": 0.01228165, "auxiliary_loss_mlp": 0.00234236, "balance_loss_clip": 1.01481688, "balance_loss_mlp": 0.21084669, "epoch": 0.8158124154516759, "flos": 18508323720960.0, "grad_norm": 26.06239235038625, "language_loss": 0.84840405, "learning_rate": 3.453420875193016e-07, "loss": 0.86302811, "num_input_tokens_seen": 292794350, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.23388672, "step": 13569, "time_per_iteration": 2.6246917247772217 }, { "auxiliary_loss_clip": 0.01245487, "auxiliary_loss_mlp": 0.00204075, "balance_loss_clip": 1.02761078, "balance_loss_mlp": 0.17945787, "epoch": 0.815872538704344, "flos": 26830801123200.0, "grad_norm": 318.78422639666746, "language_loss": 0.70006704, "learning_rate": 3.451233513649199e-07, "loss": 0.71456259, "num_input_tokens_seen": 292814005, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.24633789, "step": 13570, "time_per_iteration": 2.6746420860290527 }, { "auxiliary_loss_clip": 0.01283323, "auxiliary_loss_mlp": 0.00237169, "balance_loss_clip": 1.05420828, "balance_loss_mlp": 0.21281411, "epoch": 0.8159326619570119, "flos": 21725704742400.0, "grad_norm": 12.152664016810554, "language_loss": 0.89691299, "learning_rate": 3.4490467796362687e-07, "loss": 0.91211784, "num_input_tokens_seen": 292833485, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.24353027, "step": 13571, "time_per_iteration": 4.226847887039185 }, { "auxiliary_loss_clip": 0.01250047, "auxiliary_loss_mlp": 0.00249437, "balance_loss_clip": 1.02639675, "balance_loss_mlp": 0.22435494, "epoch": 0.8159927852096799, "flos": 13840726993920.0, "grad_norm": 43.00434104765732, "language_loss": 0.91894388, "learning_rate": 3.446860673237142e-07, "loss": 0.93393874, "num_input_tokens_seen": 292848045, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.25097656, "step": 13572, "time_per_iteration": 2.5651023387908936 }, { "auxiliary_loss_clip": 0.01237195, "auxiliary_loss_mlp": 0.00229114, "balance_loss_clip": 1.01704848, "balance_loss_mlp": 0.20582101, "epoch": 0.8160529084623478, "flos": 24499516711680.0, "grad_norm": 179.05023055646618, "language_loss": 0.73213691, "learning_rate": 3.4446751945347186e-07, "loss": 0.74680007, "num_input_tokens_seen": 292869965, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.23278809, "step": 13573, "time_per_iteration": 2.7188475131988525 }, { "auxiliary_loss_clip": 0.01234683, "auxiliary_loss_mlp": 0.00236156, "balance_loss_clip": 1.01657486, "balance_loss_mlp": 0.21215871, "epoch": 0.8161130317150158, "flos": 24826339584000.0, "grad_norm": 6.761420486317194, "language_loss": 0.837883, "learning_rate": 3.442490343611868e-07, "loss": 0.8525914, "num_input_tokens_seen": 292889680, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.23999023, "step": 13574, "time_per_iteration": 2.6817822456359863 }, { "auxiliary_loss_clip": 0.01260971, "auxiliary_loss_mlp": 0.00250538, "balance_loss_clip": 1.03393531, "balance_loss_mlp": 0.22357282, "epoch": 0.8161731549676837, "flos": 30956542208640.0, "grad_norm": 25.963301999029163, "language_loss": 0.68143773, "learning_rate": 3.4403061205514485e-07, "loss": 0.69655281, "num_input_tokens_seen": 292912360, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.26977539, "step": 13575, "time_per_iteration": 4.205013751983643 }, { "auxiliary_loss_clip": 0.01259114, "auxiliary_loss_mlp": 0.00251215, "balance_loss_clip": 1.03528738, "balance_loss_mlp": 0.22538272, "epoch": 0.8162332782203517, "flos": 18551991680640.0, "grad_norm": 5.731836077073053, "language_loss": 0.81350482, "learning_rate": 3.4381225254362736e-07, "loss": 0.82860816, "num_input_tokens_seen": 292928325, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.25830078, "step": 13576, "time_per_iteration": 2.7013890743255615 }, { "auxiliary_loss_clip": 0.01138795, "auxiliary_loss_mlp": 0.0012653, "balance_loss_clip": 0.97863472, "balance_loss_mlp": 0.11832883, "epoch": 0.8162934014730197, "flos": 70386853904640.0, "grad_norm": 0.8001935386036532, "language_loss": 0.58063459, "learning_rate": 3.435939558349155e-07, "loss": 0.59328783, "num_input_tokens_seen": 292992795, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.08203125, "step": 13577, "time_per_iteration": 3.1637229919433594 }, { "auxiliary_loss_clip": 0.0123236, "auxiliary_loss_mlp": 0.00214354, "balance_loss_clip": 1.0170964, "balance_loss_mlp": 0.19230068, "epoch": 0.8163535247256877, "flos": 21214839559680.0, "grad_norm": 178.10629830551696, "language_loss": 0.79151565, "learning_rate": 3.4337572193728747e-07, "loss": 0.80598283, "num_input_tokens_seen": 293011950, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.22058105, "step": 13578, "time_per_iteration": 2.706305503845215 }, { "auxiliary_loss_clip": 0.01243091, "auxiliary_loss_mlp": 0.00233693, "balance_loss_clip": 1.0265851, "balance_loss_mlp": 0.20896855, "epoch": 0.8164136479783556, "flos": 21098847565440.0, "grad_norm": 7.194957940691047, "language_loss": 0.81192064, "learning_rate": 3.431575508590172e-07, "loss": 0.82668847, "num_input_tokens_seen": 293030175, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.24731445, "step": 13579, "time_per_iteration": 2.7310259342193604 }, { "auxiliary_loss_clip": 0.0124422, "auxiliary_loss_mlp": 0.00219615, "balance_loss_clip": 1.02251065, "balance_loss_mlp": 0.19578488, "epoch": 0.8164737712310236, "flos": 21720640924800.0, "grad_norm": 17.678489813359178, "language_loss": 0.84979862, "learning_rate": 3.4293944260837873e-07, "loss": 0.86443698, "num_input_tokens_seen": 293047980, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.23840332, "step": 13580, "time_per_iteration": 2.7038021087646484 }, { "auxiliary_loss_clip": 0.01218598, "auxiliary_loss_mlp": 0.0021832, "balance_loss_clip": 1.00785804, "balance_loss_mlp": 0.19654022, "epoch": 0.8165338944836915, "flos": 19536805843200.0, "grad_norm": 39.928527283867055, "language_loss": 0.76388764, "learning_rate": 3.4272139719364314e-07, "loss": 0.77825689, "num_input_tokens_seen": 293067030, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.21789551, "step": 13581, "time_per_iteration": 2.6899499893188477 }, { "auxiliary_loss_clip": 0.01249858, "auxiliary_loss_mlp": 0.00233051, "balance_loss_clip": 1.03340292, "balance_loss_mlp": 0.20930448, "epoch": 0.8165940177363595, "flos": 22928568416640.0, "grad_norm": 29.207434561582446, "language_loss": 0.67811364, "learning_rate": 3.4250341462307786e-07, "loss": 0.69294274, "num_input_tokens_seen": 293085575, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.23754883, "step": 13582, "time_per_iteration": 2.6897215843200684 }, { "auxiliary_loss_clip": 0.01222838, "auxiliary_loss_mlp": 0.00203583, "balance_loss_clip": 1.013592, "balance_loss_mlp": 0.18152896, "epoch": 0.8166541409890276, "flos": 23370377702400.0, "grad_norm": 8.581124132876525, "language_loss": 0.86669493, "learning_rate": 3.4228549490494897e-07, "loss": 0.88095915, "num_input_tokens_seen": 293108200, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.22045898, "step": 13583, "time_per_iteration": 2.7488291263580322 }, { "auxiliary_loss_clip": 0.01245953, "auxiliary_loss_mlp": 0.00217859, "balance_loss_clip": 1.02464128, "balance_loss_mlp": 0.19337381, "epoch": 0.8167142642416955, "flos": 18441997257600.0, "grad_norm": 9.803979082924888, "language_loss": 0.81997538, "learning_rate": 3.4206763804752093e-07, "loss": 0.83461356, "num_input_tokens_seen": 293126020, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.24475098, "step": 13584, "time_per_iteration": 2.6698060035705566 }, { "auxiliary_loss_clip": 0.01262901, "auxiliary_loss_mlp": 0.00241194, "balance_loss_clip": 1.04090929, "balance_loss_mlp": 0.2158021, "epoch": 0.8167743874943635, "flos": 21214983214080.0, "grad_norm": 4.467093834016869, "language_loss": 0.81975222, "learning_rate": 3.4184984405905405e-07, "loss": 0.83479315, "num_input_tokens_seen": 293144620, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25402832, "step": 13585, "time_per_iteration": 2.6799745559692383 }, { "auxiliary_loss_clip": 0.01246515, "auxiliary_loss_mlp": 0.00216529, "balance_loss_clip": 1.02520871, "balance_loss_mlp": 0.1906372, "epoch": 0.8168345107470314, "flos": 18697681244160.0, "grad_norm": 33.48706628664923, "language_loss": 0.79981363, "learning_rate": 3.416321129478068e-07, "loss": 0.81444407, "num_input_tokens_seen": 293162850, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25878906, "step": 13586, "time_per_iteration": 2.590622663497925 }, { "auxiliary_loss_clip": 0.01234704, "auxiliary_loss_mlp": 0.00229572, "balance_loss_clip": 1.0162338, "balance_loss_mlp": 0.20645729, "epoch": 0.8168946339996994, "flos": 16253098358400.0, "grad_norm": 40.25104903012679, "language_loss": 0.68641245, "learning_rate": 3.4141444472203594e-07, "loss": 0.70105523, "num_input_tokens_seen": 293181620, "router_z_loss_clip": 2.18457031, "router_z_loss_mlp": 0.2310791, "step": 13587, "time_per_iteration": 2.6386237144470215 }, { "auxiliary_loss_clip": 0.01265695, "auxiliary_loss_mlp": 0.00248065, "balance_loss_clip": 1.04007125, "balance_loss_mlp": 0.22316191, "epoch": 0.8169547572523673, "flos": 26941585645440.0, "grad_norm": 24.991072115904384, "language_loss": 0.78783035, "learning_rate": 3.4119683938999624e-07, "loss": 0.80296797, "num_input_tokens_seen": 293200270, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.24914551, "step": 13588, "time_per_iteration": 2.6782288551330566 }, { "auxiliary_loss_clip": 0.01263433, "auxiliary_loss_mlp": 0.00234098, "balance_loss_clip": 1.03752255, "balance_loss_mlp": 0.20738363, "epoch": 0.8170148805050353, "flos": 18952323736320.0, "grad_norm": 10.732826624707533, "language_loss": 0.82071978, "learning_rate": 3.4097929695993854e-07, "loss": 0.83569509, "num_input_tokens_seen": 293218960, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26745605, "step": 13589, "time_per_iteration": 2.6707208156585693 }, { "auxiliary_loss_clip": 0.01234079, "auxiliary_loss_mlp": 0.00224027, "balance_loss_clip": 1.01830506, "balance_loss_mlp": 0.19952931, "epoch": 0.8170750037577033, "flos": 21834909066240.0, "grad_norm": 23.35409681224662, "language_loss": 0.80224895, "learning_rate": 3.4076181744011166e-07, "loss": 0.81683004, "num_input_tokens_seen": 293236450, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24511719, "step": 13590, "time_per_iteration": 2.6321022510528564 }, { "auxiliary_loss_clip": 0.01270869, "auxiliary_loss_mlp": 0.00236293, "balance_loss_clip": 1.04300213, "balance_loss_mlp": 0.20903006, "epoch": 0.8171351270103713, "flos": 33507169021440.0, "grad_norm": 62.07627934848193, "language_loss": 0.75208771, "learning_rate": 3.4054440083876345e-07, "loss": 0.76715934, "num_input_tokens_seen": 293256480, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.27270508, "step": 13591, "time_per_iteration": 2.7721383571624756 }, { "auxiliary_loss_clip": 0.01255017, "auxiliary_loss_mlp": 0.00234418, "balance_loss_clip": 1.02899265, "balance_loss_mlp": 0.20931205, "epoch": 0.8171952502630392, "flos": 22708184520960.0, "grad_norm": 20.53249106273902, "language_loss": 0.79607373, "learning_rate": 3.403270471641373e-07, "loss": 0.81096804, "num_input_tokens_seen": 293274960, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.25109863, "step": 13592, "time_per_iteration": 2.722709894180298 }, { "auxiliary_loss_clip": 0.01247602, "auxiliary_loss_mlp": 0.00239584, "balance_loss_clip": 1.02709758, "balance_loss_mlp": 0.21590948, "epoch": 0.8172553735157072, "flos": 26723715701760.0, "grad_norm": 14.577371307776817, "language_loss": 0.74675333, "learning_rate": 3.401097564244759e-07, "loss": 0.76162529, "num_input_tokens_seen": 293295945, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.23706055, "step": 13593, "time_per_iteration": 2.696730613708496 }, { "auxiliary_loss_clip": 0.01245893, "auxiliary_loss_mlp": 0.00210375, "balance_loss_clip": 1.02647471, "balance_loss_mlp": 0.1856395, "epoch": 0.8173154967683751, "flos": 15961072786560.0, "grad_norm": 2.115531180191339, "language_loss": 0.7745508, "learning_rate": 3.398925286280188e-07, "loss": 0.78911352, "num_input_tokens_seen": 293313300, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24731445, "step": 13594, "time_per_iteration": 2.651156425476074 }, { "auxiliary_loss_clip": 0.0125225, "auxiliary_loss_mlp": 0.00244313, "balance_loss_clip": 1.03416109, "balance_loss_mlp": 0.21861155, "epoch": 0.8173756200210431, "flos": 25986720447360.0, "grad_norm": 13.323615664979247, "language_loss": 0.75671184, "learning_rate": 3.3967536378300456e-07, "loss": 0.77167743, "num_input_tokens_seen": 293333085, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25671387, "step": 13595, "time_per_iteration": 2.676764965057373 }, { "auxiliary_loss_clip": 0.01265339, "auxiliary_loss_mlp": 0.00240987, "balance_loss_clip": 1.03218341, "balance_loss_mlp": 0.21412951, "epoch": 0.8174357432737112, "flos": 25664422688640.0, "grad_norm": 9.44417395291569, "language_loss": 0.8448534, "learning_rate": 3.394582618976658e-07, "loss": 0.85991663, "num_input_tokens_seen": 293351895, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.26855469, "step": 13596, "time_per_iteration": 2.821202516555786 }, { "auxiliary_loss_clip": 0.01222711, "auxiliary_loss_mlp": 0.00232833, "balance_loss_clip": 1.00785422, "balance_loss_mlp": 0.20893104, "epoch": 0.8174958665263791, "flos": 21835088634240.0, "grad_norm": 22.063513547718664, "language_loss": 0.70612109, "learning_rate": 3.392412229802362e-07, "loss": 0.72067654, "num_input_tokens_seen": 293371165, "router_z_loss_clip": 2.14941406, "router_z_loss_mlp": 0.2388916, "step": 13597, "time_per_iteration": 2.6506845951080322 }, { "auxiliary_loss_clip": 0.01225278, "auxiliary_loss_mlp": 0.00217049, "balance_loss_clip": 1.01099396, "balance_loss_mlp": 0.19356504, "epoch": 0.8175559897790471, "flos": 22455517276800.0, "grad_norm": 9.429242738081662, "language_loss": 0.88628447, "learning_rate": 3.390242470389462e-07, "loss": 0.90070772, "num_input_tokens_seen": 293391150, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.23474121, "step": 13598, "time_per_iteration": 2.707749605178833 }, { "auxiliary_loss_clip": 0.01256129, "auxiliary_loss_mlp": 0.00231405, "balance_loss_clip": 1.03481412, "balance_loss_mlp": 0.206478, "epoch": 0.817616113031715, "flos": 23615790399360.0, "grad_norm": 11.046739670019871, "language_loss": 0.89472878, "learning_rate": 3.3880733408202277e-07, "loss": 0.90960413, "num_input_tokens_seen": 293409440, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.24951172, "step": 13599, "time_per_iteration": 2.6525466442108154 }, { "auxiliary_loss_clip": 0.01225443, "auxiliary_loss_mlp": 0.00244937, "balance_loss_clip": 1.01437676, "balance_loss_mlp": 0.22090447, "epoch": 0.817676236284383, "flos": 27672260106240.0, "grad_norm": 18.18627347229968, "language_loss": 0.90638965, "learning_rate": 3.3859048411769186e-07, "loss": 0.92109346, "num_input_tokens_seen": 293428995, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.2401123, "step": 13600, "time_per_iteration": 2.7490286827087402 }, { "auxiliary_loss_clip": 0.01242171, "auxiliary_loss_mlp": 0.00218399, "balance_loss_clip": 1.02467167, "balance_loss_mlp": 0.1953322, "epoch": 0.8177363595370509, "flos": 24681009156480.0, "grad_norm": 92.63417949838149, "language_loss": 0.81137621, "learning_rate": 3.383736971541766e-07, "loss": 0.82598186, "num_input_tokens_seen": 293449155, "router_z_loss_clip": 2.17480469, "router_z_loss_mlp": 0.23095703, "step": 13601, "time_per_iteration": 2.6697206497192383 }, { "auxiliary_loss_clip": 0.01258376, "auxiliary_loss_mlp": 0.00240428, "balance_loss_clip": 1.03257287, "balance_loss_mlp": 0.21498901, "epoch": 0.817796482789719, "flos": 17346326745600.0, "grad_norm": 7.01483989835848, "language_loss": 0.78517038, "learning_rate": 3.3815697319969737e-07, "loss": 0.80015838, "num_input_tokens_seen": 293466125, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.2545166, "step": 13602, "time_per_iteration": 2.635955333709717 }, { "auxiliary_loss_clip": 0.01247971, "auxiliary_loss_mlp": 0.0022649, "balance_loss_clip": 1.02880645, "balance_loss_mlp": 0.20207639, "epoch": 0.8178566060423869, "flos": 17778475272960.0, "grad_norm": 239.31191441023182, "language_loss": 0.92977273, "learning_rate": 3.379403122624718e-07, "loss": 0.94451725, "num_input_tokens_seen": 293481345, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.2442627, "step": 13603, "time_per_iteration": 2.622375726699829 }, { "auxiliary_loss_clip": 0.01233078, "auxiliary_loss_mlp": 0.00249555, "balance_loss_clip": 1.01996124, "balance_loss_mlp": 0.22535545, "epoch": 0.8179167292950549, "flos": 24973250209920.0, "grad_norm": 43.46399181354795, "language_loss": 0.79644322, "learning_rate": 3.377237143507159e-07, "loss": 0.81126952, "num_input_tokens_seen": 293502330, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24194336, "step": 13604, "time_per_iteration": 2.735285520553589 }, { "auxiliary_loss_clip": 0.01242408, "auxiliary_loss_mlp": 0.00223806, "balance_loss_clip": 1.02613473, "balance_loss_mlp": 0.20180006, "epoch": 0.8179768525477228, "flos": 22856783086080.0, "grad_norm": 17.268153239676447, "language_loss": 0.82432437, "learning_rate": 3.3750717947264406e-07, "loss": 0.83898652, "num_input_tokens_seen": 293521415, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.22009277, "step": 13605, "time_per_iteration": 2.7484636306762695 }, { "auxiliary_loss_clip": 0.01244341, "auxiliary_loss_mlp": 0.00229201, "balance_loss_clip": 1.02103996, "balance_loss_mlp": 0.20515689, "epoch": 0.8180369758003908, "flos": 18515147304960.0, "grad_norm": 6.872064212114467, "language_loss": 0.83944142, "learning_rate": 3.372907076364666e-07, "loss": 0.85417694, "num_input_tokens_seen": 293539245, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.24047852, "step": 13606, "time_per_iteration": 4.157891035079956 }, { "auxiliary_loss_clip": 0.01229637, "auxiliary_loss_mlp": 0.00216448, "balance_loss_clip": 1.01210761, "balance_loss_mlp": 0.19296405, "epoch": 0.8180970990530587, "flos": 33182105915520.0, "grad_norm": 10.03262438283251, "language_loss": 0.75111127, "learning_rate": 3.370742988503916e-07, "loss": 0.76557207, "num_input_tokens_seen": 293560640, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.23486328, "step": 13607, "time_per_iteration": 2.7612736225128174 }, { "auxiliary_loss_clip": 0.01235232, "auxiliary_loss_mlp": 0.00226194, "balance_loss_clip": 1.0163734, "balance_loss_mlp": 0.20151812, "epoch": 0.8181572223057267, "flos": 25010022758400.0, "grad_norm": 42.01879452164553, "language_loss": 0.77668351, "learning_rate": 3.3685795312262634e-07, "loss": 0.79129773, "num_input_tokens_seen": 293579465, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24707031, "step": 13608, "time_per_iteration": 4.223840951919556 }, { "auxiliary_loss_clip": 0.01238061, "auxiliary_loss_mlp": 0.00232788, "balance_loss_clip": 1.02119541, "balance_loss_mlp": 0.2089695, "epoch": 0.8182173455583948, "flos": 28548731871360.0, "grad_norm": 6.872353997768753, "language_loss": 0.88280845, "learning_rate": 3.366416704613735e-07, "loss": 0.89751691, "num_input_tokens_seen": 293600540, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23815918, "step": 13609, "time_per_iteration": 2.6917612552642822 }, { "auxiliary_loss_clip": 0.01124973, "auxiliary_loss_mlp": 0.00123548, "balance_loss_clip": 0.96447891, "balance_loss_mlp": 0.11658575, "epoch": 0.8182774688110627, "flos": 72028043245440.0, "grad_norm": 0.7383993673933895, "language_loss": 0.55270004, "learning_rate": 3.3642545087483544e-07, "loss": 0.56518525, "num_input_tokens_seen": 293665160, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.06982422, "step": 13610, "time_per_iteration": 3.278946876525879 }, { "auxiliary_loss_clip": 0.01230903, "auxiliary_loss_mlp": 0.0023003, "balance_loss_clip": 1.01862049, "balance_loss_mlp": 0.20652156, "epoch": 0.8183375920637307, "flos": 19755358145280.0, "grad_norm": 7.907689107263748, "language_loss": 0.86046761, "learning_rate": 3.362092943712107e-07, "loss": 0.87507695, "num_input_tokens_seen": 293683995, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.23535156, "step": 13611, "time_per_iteration": 2.6614105701446533 }, { "auxiliary_loss_clip": 0.0127349, "auxiliary_loss_mlp": 0.00254554, "balance_loss_clip": 1.04217577, "balance_loss_mlp": 0.22782713, "epoch": 0.8183977153163986, "flos": 22341895580160.0, "grad_norm": 59.677783021721815, "language_loss": 0.84386289, "learning_rate": 3.3599320095869745e-07, "loss": 0.85914338, "num_input_tokens_seen": 293704115, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.26757812, "step": 13612, "time_per_iteration": 2.686920166015625 }, { "auxiliary_loss_clip": 0.01232405, "auxiliary_loss_mlp": 0.00216965, "balance_loss_clip": 1.01718616, "balance_loss_mlp": 0.19405346, "epoch": 0.8184578385690666, "flos": 17712472032000.0, "grad_norm": 4.573402672039386, "language_loss": 0.94057643, "learning_rate": 3.3577717064548793e-07, "loss": 0.95507014, "num_input_tokens_seen": 293722225, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.22900391, "step": 13613, "time_per_iteration": 4.182492733001709 }, { "auxiliary_loss_clip": 0.01252191, "auxiliary_loss_mlp": 0.00214689, "balance_loss_clip": 1.0335834, "balance_loss_mlp": 0.19174162, "epoch": 0.8185179618217345, "flos": 25701159323520.0, "grad_norm": 6.1772431034183, "language_loss": 0.78254831, "learning_rate": 3.355612034397746e-07, "loss": 0.79721713, "num_input_tokens_seen": 293743995, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.22949219, "step": 13614, "time_per_iteration": 2.702536106109619 }, { "auxiliary_loss_clip": 0.01250215, "auxiliary_loss_mlp": 0.00235987, "balance_loss_clip": 1.02632284, "balance_loss_mlp": 0.21046394, "epoch": 0.8185780850744026, "flos": 25960326929280.0, "grad_norm": 161.6201602206527, "language_loss": 0.885768, "learning_rate": 3.353452993497479e-07, "loss": 0.90063, "num_input_tokens_seen": 293764935, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.25524902, "step": 13615, "time_per_iteration": 2.73157000541687 }, { "auxiliary_loss_clip": 0.01233571, "auxiliary_loss_mlp": 0.00236708, "balance_loss_clip": 1.01857722, "balance_loss_mlp": 0.21207955, "epoch": 0.8186382083270705, "flos": 25228431406080.0, "grad_norm": 136.58700573919373, "language_loss": 0.81148791, "learning_rate": 3.3512945838359375e-07, "loss": 0.82619071, "num_input_tokens_seen": 293784035, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24645996, "step": 13616, "time_per_iteration": 2.6968929767608643 }, { "auxiliary_loss_clip": 0.01229197, "auxiliary_loss_mlp": 0.00212616, "balance_loss_clip": 1.01175332, "balance_loss_mlp": 0.18705788, "epoch": 0.8186983315797385, "flos": 22415009713920.0, "grad_norm": 138.82314073262458, "language_loss": 0.81712729, "learning_rate": 3.349136805494979e-07, "loss": 0.83154535, "num_input_tokens_seen": 293803360, "router_z_loss_clip": 2.17480469, "router_z_loss_mlp": 0.25524902, "step": 13617, "time_per_iteration": 4.097262144088745 }, { "auxiliary_loss_clip": 0.01215602, "auxiliary_loss_mlp": 0.00234467, "balance_loss_clip": 1.00343537, "balance_loss_mlp": 0.21175721, "epoch": 0.8187584548324064, "flos": 22018017623040.0, "grad_norm": 12.372772789887094, "language_loss": 0.77010036, "learning_rate": 3.346979658556415e-07, "loss": 0.78460109, "num_input_tokens_seen": 293821325, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.22705078, "step": 13618, "time_per_iteration": 2.7285943031311035 }, { "auxiliary_loss_clip": 0.012695, "auxiliary_loss_mlp": 0.00256034, "balance_loss_clip": 1.03741097, "balance_loss_mlp": 0.2294504, "epoch": 0.8188185780850744, "flos": 29241664116480.0, "grad_norm": 10.199501280667347, "language_loss": 0.77605677, "learning_rate": 3.344823143102058e-07, "loss": 0.7913121, "num_input_tokens_seen": 293840315, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.26550293, "step": 13619, "time_per_iteration": 2.7123286724090576 }, { "auxiliary_loss_clip": 0.01243323, "auxiliary_loss_mlp": 0.00229191, "balance_loss_clip": 1.02051568, "balance_loss_mlp": 0.20367992, "epoch": 0.8188787013377423, "flos": 20696504348160.0, "grad_norm": 170.26012773334634, "language_loss": 0.83210528, "learning_rate": 3.3426672592136694e-07, "loss": 0.84683043, "num_input_tokens_seen": 293855685, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25512695, "step": 13620, "time_per_iteration": 2.673306465148926 }, { "auxiliary_loss_clip": 0.01220856, "auxiliary_loss_mlp": 0.00199678, "balance_loss_clip": 1.00759304, "balance_loss_mlp": 0.17664658, "epoch": 0.8189388245904103, "flos": 23732967542400.0, "grad_norm": 31.292526239485994, "language_loss": 0.83273578, "learning_rate": 3.340512006973011e-07, "loss": 0.84694111, "num_input_tokens_seen": 293875540, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.23046875, "step": 13621, "time_per_iteration": 2.6782937049865723 }, { "auxiliary_loss_clip": 0.01240932, "auxiliary_loss_mlp": 0.00229672, "balance_loss_clip": 1.02131569, "balance_loss_mlp": 0.20648529, "epoch": 0.8189989478430784, "flos": 28255090187520.0, "grad_norm": 155.54383010912207, "language_loss": 0.7555353, "learning_rate": 3.3383573864618076e-07, "loss": 0.77024132, "num_input_tokens_seen": 293896570, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.23168945, "step": 13622, "time_per_iteration": 2.6988582611083984 }, { "auxiliary_loss_clip": 0.01253285, "auxiliary_loss_mlp": 0.00229265, "balance_loss_clip": 1.02757955, "balance_loss_mlp": 0.2031108, "epoch": 0.8190590710957463, "flos": 21397696721280.0, "grad_norm": 20.514331944934668, "language_loss": 0.83619738, "learning_rate": 3.3362033977617653e-07, "loss": 0.85102284, "num_input_tokens_seen": 293914680, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.26147461, "step": 13623, "time_per_iteration": 2.681013345718384 }, { "auxiliary_loss_clip": 0.01252441, "auxiliary_loss_mlp": 0.00234643, "balance_loss_clip": 1.03097248, "balance_loss_mlp": 0.20958543, "epoch": 0.8191191943484143, "flos": 38796451367040.0, "grad_norm": 578.633403426441, "language_loss": 0.7105062, "learning_rate": 3.3340500409545527e-07, "loss": 0.72537702, "num_input_tokens_seen": 293936480, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25048828, "step": 13624, "time_per_iteration": 2.7900772094726562 }, { "auxiliary_loss_clip": 0.01230671, "auxiliary_loss_mlp": 0.00217994, "balance_loss_clip": 1.01641321, "balance_loss_mlp": 0.19423531, "epoch": 0.8191793176010822, "flos": 25446516831360.0, "grad_norm": 142.43244871160977, "language_loss": 0.85976791, "learning_rate": 3.3318973161218386e-07, "loss": 0.87425458, "num_input_tokens_seen": 293957815, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.23779297, "step": 13625, "time_per_iteration": 2.7486565113067627 }, { "auxiliary_loss_clip": 0.01259118, "auxiliary_loss_mlp": 0.00229021, "balance_loss_clip": 1.03150463, "balance_loss_mlp": 0.20210335, "epoch": 0.8192394408537502, "flos": 25083029151360.0, "grad_norm": 6.637832144209108, "language_loss": 0.87028491, "learning_rate": 3.329745223345244e-07, "loss": 0.88516629, "num_input_tokens_seen": 293975440, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.26904297, "step": 13626, "time_per_iteration": 2.6799509525299072 }, { "auxiliary_loss_clip": 0.01239807, "auxiliary_loss_mlp": 0.00221908, "balance_loss_clip": 1.02225757, "balance_loss_mlp": 0.19921096, "epoch": 0.8192995641064181, "flos": 27673732563840.0, "grad_norm": 9.5238976292753, "language_loss": 0.80082422, "learning_rate": 3.3275937627063823e-07, "loss": 0.81544137, "num_input_tokens_seen": 293997540, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.22668457, "step": 13627, "time_per_iteration": 2.823000907897949 }, { "auxiliary_loss_clip": 0.01270842, "auxiliary_loss_mlp": 0.00223706, "balance_loss_clip": 1.038517, "balance_loss_mlp": 0.19782606, "epoch": 0.8193596873590862, "flos": 21288492397440.0, "grad_norm": 393.7156227157432, "language_loss": 0.76812488, "learning_rate": 3.3254429342868353e-07, "loss": 0.78307033, "num_input_tokens_seen": 294017030, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.2590332, "step": 13628, "time_per_iteration": 2.6596760749816895 }, { "auxiliary_loss_clip": 0.01254449, "auxiliary_loss_mlp": 0.00229448, "balance_loss_clip": 1.03008604, "balance_loss_mlp": 0.20439002, "epoch": 0.8194198106117541, "flos": 17492626840320.0, "grad_norm": 2.417840381882748, "language_loss": 0.92729759, "learning_rate": 3.323292738168171e-07, "loss": 0.94213653, "num_input_tokens_seen": 294035700, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.25061035, "step": 13629, "time_per_iteration": 2.724249839782715 }, { "auxiliary_loss_clip": 0.01251406, "auxiliary_loss_mlp": 0.00247312, "balance_loss_clip": 1.02811038, "balance_loss_mlp": 0.22122872, "epoch": 0.8194799338644221, "flos": 15267925059840.0, "grad_norm": 181.3411595057849, "language_loss": 0.83143896, "learning_rate": 3.3211431744319084e-07, "loss": 0.84642613, "num_input_tokens_seen": 294049730, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26086426, "step": 13630, "time_per_iteration": 2.66106915473938 }, { "auxiliary_loss_clip": 0.01248923, "auxiliary_loss_mlp": 0.00244801, "balance_loss_clip": 1.02701235, "balance_loss_mlp": 0.21880096, "epoch": 0.81954005711709, "flos": 14718814871040.0, "grad_norm": 614.7116099961211, "language_loss": 0.8014667, "learning_rate": 3.31899424315957e-07, "loss": 0.81640387, "num_input_tokens_seen": 294066545, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.2598877, "step": 13631, "time_per_iteration": 2.6554653644561768 }, { "auxiliary_loss_clip": 0.01244195, "auxiliary_loss_mlp": 0.0021925, "balance_loss_clip": 1.02374268, "balance_loss_mlp": 0.19493136, "epoch": 0.819600180369758, "flos": 23074042498560.0, "grad_norm": 56.88657449291879, "language_loss": 0.82867718, "learning_rate": 3.3168459444326447e-07, "loss": 0.84331167, "num_input_tokens_seen": 294087455, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24328613, "step": 13632, "time_per_iteration": 2.659135580062866 }, { "auxiliary_loss_clip": 0.01245434, "auxiliary_loss_mlp": 0.00210328, "balance_loss_clip": 1.02502787, "balance_loss_mlp": 0.18686718, "epoch": 0.8196603036224259, "flos": 27599792417280.0, "grad_norm": 1277.1665420570603, "language_loss": 0.73602366, "learning_rate": 3.314698278332588e-07, "loss": 0.75058126, "num_input_tokens_seen": 294107480, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.23449707, "step": 13633, "time_per_iteration": 2.7224810123443604 }, { "auxiliary_loss_clip": 0.01226567, "auxiliary_loss_mlp": 0.00215108, "balance_loss_clip": 1.01582181, "balance_loss_mlp": 0.19245851, "epoch": 0.8197204268750939, "flos": 28582020800640.0, "grad_norm": 16.838950097140543, "language_loss": 0.82175523, "learning_rate": 3.3125512449408513e-07, "loss": 0.83617198, "num_input_tokens_seen": 294130115, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.2265625, "step": 13634, "time_per_iteration": 2.746833086013794 }, { "auxiliary_loss_clip": 0.01249416, "auxiliary_loss_mlp": 0.00222323, "balance_loss_clip": 1.03161204, "balance_loss_mlp": 0.1983735, "epoch": 0.819780550127762, "flos": 23258300290560.0, "grad_norm": 23.25011803649152, "language_loss": 0.8891021, "learning_rate": 3.310404844338841e-07, "loss": 0.9038195, "num_input_tokens_seen": 294148495, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.23962402, "step": 13635, "time_per_iteration": 2.6425766944885254 }, { "auxiliary_loss_clip": 0.01237155, "auxiliary_loss_mlp": 0.00240142, "balance_loss_clip": 1.01828504, "balance_loss_mlp": 0.21531102, "epoch": 0.8198406733804299, "flos": 26685255214080.0, "grad_norm": 32.2447068419712, "language_loss": 0.81934237, "learning_rate": 3.308259076607949e-07, "loss": 0.83411539, "num_input_tokens_seen": 294169595, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24829102, "step": 13636, "time_per_iteration": 2.737190008163452 }, { "auxiliary_loss_clip": 0.01246973, "auxiliary_loss_mlp": 0.00229042, "balance_loss_clip": 1.02858293, "balance_loss_mlp": 0.20251754, "epoch": 0.8199007966330979, "flos": 20084084438400.0, "grad_norm": 7.296427976224378, "language_loss": 0.89477926, "learning_rate": 3.3061139418295445e-07, "loss": 0.9095394, "num_input_tokens_seen": 294183885, "router_z_loss_clip": 2.18261719, "router_z_loss_mlp": 0.26513672, "step": 13637, "time_per_iteration": 2.61794114112854 }, { "auxiliary_loss_clip": 0.01257128, "auxiliary_loss_mlp": 0.00214053, "balance_loss_clip": 1.0336715, "balance_loss_mlp": 0.18713582, "epoch": 0.8199609198857658, "flos": 31902788142720.0, "grad_norm": 13.16731069067959, "language_loss": 0.7988708, "learning_rate": 3.3039694400849725e-07, "loss": 0.8135826, "num_input_tokens_seen": 294200150, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26928711, "step": 13638, "time_per_iteration": 2.7373180389404297 }, { "auxiliary_loss_clip": 0.01257292, "auxiliary_loss_mlp": 0.00229786, "balance_loss_clip": 1.03119087, "balance_loss_mlp": 0.20381021, "epoch": 0.8200210431384338, "flos": 26470150617600.0, "grad_norm": 4.791338275655386, "language_loss": 0.89958721, "learning_rate": 3.3018255714555564e-07, "loss": 0.91445804, "num_input_tokens_seen": 294220385, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.2598877, "step": 13639, "time_per_iteration": 2.6779372692108154 }, { "auxiliary_loss_clip": 0.01224772, "auxiliary_loss_mlp": 0.0021258, "balance_loss_clip": 1.01015687, "balance_loss_mlp": 0.19045471, "epoch": 0.8200811663911017, "flos": 22091454979200.0, "grad_norm": 5.225023142683345, "language_loss": 0.87684983, "learning_rate": 3.299682336022589e-07, "loss": 0.89122337, "num_input_tokens_seen": 294239355, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.22155762, "step": 13640, "time_per_iteration": 2.679725170135498 }, { "auxiliary_loss_clip": 0.01267753, "auxiliary_loss_mlp": 0.00224948, "balance_loss_clip": 1.03786564, "balance_loss_mlp": 0.19959234, "epoch": 0.8201412896437698, "flos": 37593659520000.0, "grad_norm": 8.439023030084238, "language_loss": 0.70380116, "learning_rate": 3.297539733867336e-07, "loss": 0.71872818, "num_input_tokens_seen": 294259395, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.25354004, "step": 13641, "time_per_iteration": 2.782400131225586 }, { "auxiliary_loss_clip": 0.01239016, "auxiliary_loss_mlp": 0.0021963, "balance_loss_clip": 1.01888204, "balance_loss_mlp": 0.19560906, "epoch": 0.8202014128964377, "flos": 19646333389440.0, "grad_norm": 2.932030131352737, "language_loss": 0.81939214, "learning_rate": 3.295397765071055e-07, "loss": 0.83397859, "num_input_tokens_seen": 294277365, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.23999023, "step": 13642, "time_per_iteration": 2.7147645950317383 }, { "auxiliary_loss_clip": 0.01243662, "auxiliary_loss_mlp": 0.00222272, "balance_loss_clip": 1.02568531, "balance_loss_mlp": 0.20018294, "epoch": 0.8202615361491057, "flos": 31467335564160.0, "grad_norm": 14.787210729213797, "language_loss": 0.80226874, "learning_rate": 3.2932564297149615e-07, "loss": 0.81692809, "num_input_tokens_seen": 294297555, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.22094727, "step": 13643, "time_per_iteration": 2.7551143169403076 }, { "auxiliary_loss_clip": 0.01232984, "auxiliary_loss_mlp": 0.00236832, "balance_loss_clip": 1.01829958, "balance_loss_mlp": 0.2126442, "epoch": 0.8203216594017736, "flos": 24715555061760.0, "grad_norm": 6.8590470912508215, "language_loss": 0.72676098, "learning_rate": 3.291115727880256e-07, "loss": 0.74145919, "num_input_tokens_seen": 294317600, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.24169922, "step": 13644, "time_per_iteration": 2.6882035732269287 }, { "auxiliary_loss_clip": 0.01236148, "auxiliary_loss_mlp": 0.00243983, "balance_loss_clip": 1.02211404, "balance_loss_mlp": 0.22049864, "epoch": 0.8203817826544416, "flos": 26031824951040.0, "grad_norm": 23.820883903321622, "language_loss": 0.77343839, "learning_rate": 3.2889756596481234e-07, "loss": 0.78823966, "num_input_tokens_seen": 294340215, "router_z_loss_clip": 2.13769531, "router_z_loss_mlp": 0.23474121, "step": 13645, "time_per_iteration": 2.723560094833374 }, { "auxiliary_loss_clip": 0.0123103, "auxiliary_loss_mlp": 0.00212428, "balance_loss_clip": 1.01705456, "balance_loss_mlp": 0.18785886, "epoch": 0.8204419059071095, "flos": 25954544839680.0, "grad_norm": 14.35527184315142, "language_loss": 0.78650713, "learning_rate": 3.286836225099707e-07, "loss": 0.80094177, "num_input_tokens_seen": 294358590, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.24584961, "step": 13646, "time_per_iteration": 2.7297866344451904 }, { "auxiliary_loss_clip": 0.01271724, "auxiliary_loss_mlp": 0.00243041, "balance_loss_clip": 1.04672635, "balance_loss_mlp": 0.21491942, "epoch": 0.8205020291597775, "flos": 23580059345280.0, "grad_norm": 5.120213029104192, "language_loss": 0.86889851, "learning_rate": 3.284697424316132e-07, "loss": 0.88404614, "num_input_tokens_seen": 294375825, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.28137207, "step": 13647, "time_per_iteration": 2.6841328144073486 }, { "auxiliary_loss_clip": 0.01243732, "auxiliary_loss_mlp": 0.00218895, "balance_loss_clip": 1.02677619, "balance_loss_mlp": 0.19617419, "epoch": 0.8205621524124456, "flos": 26799164219520.0, "grad_norm": 15.570122333779041, "language_loss": 0.75166202, "learning_rate": 3.2825592573785034e-07, "loss": 0.76628828, "num_input_tokens_seen": 294398500, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.22753906, "step": 13648, "time_per_iteration": 2.7461588382720947 }, { "auxiliary_loss_clip": 0.01257447, "auxiliary_loss_mlp": 0.00228439, "balance_loss_clip": 1.03442454, "balance_loss_mlp": 0.20396492, "epoch": 0.8206222756651135, "flos": 27527863432320.0, "grad_norm": 5.399678894828169, "language_loss": 0.87199652, "learning_rate": 3.28042172436791e-07, "loss": 0.88685536, "num_input_tokens_seen": 294418840, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.24462891, "step": 13649, "time_per_iteration": 4.1821746826171875 }, { "auxiliary_loss_clip": 0.01260453, "auxiliary_loss_mlp": 0.00232517, "balance_loss_clip": 1.03754437, "balance_loss_mlp": 0.20716138, "epoch": 0.8206823989177815, "flos": 21178605715200.0, "grad_norm": 2.934184487465175, "language_loss": 0.76703769, "learning_rate": 3.278284825365396e-07, "loss": 0.7819674, "num_input_tokens_seen": 294438215, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25341797, "step": 13650, "time_per_iteration": 4.081311225891113 }, { "auxiliary_loss_clip": 0.01263501, "auxiliary_loss_mlp": 0.00221003, "balance_loss_clip": 1.04180765, "balance_loss_mlp": 0.19666037, "epoch": 0.8207425221704494, "flos": 11509622150400.0, "grad_norm": 68.8148422625759, "language_loss": 0.73510504, "learning_rate": 3.276148560452001e-07, "loss": 0.74995005, "num_input_tokens_seen": 294455260, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.2434082, "step": 13651, "time_per_iteration": 2.6158719062805176 }, { "auxiliary_loss_clip": 0.01272527, "auxiliary_loss_mlp": 0.00243324, "balance_loss_clip": 1.04788935, "balance_loss_mlp": 0.21757466, "epoch": 0.8208026454231174, "flos": 19791987039360.0, "grad_norm": 21.950805584674466, "language_loss": 0.79784769, "learning_rate": 3.2740129297087293e-07, "loss": 0.81300622, "num_input_tokens_seen": 294473205, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.25732422, "step": 13652, "time_per_iteration": 2.6387953758239746 }, { "auxiliary_loss_clip": 0.01219008, "auxiliary_loss_mlp": 0.00215244, "balance_loss_clip": 1.01057911, "balance_loss_mlp": 0.19483548, "epoch": 0.8208627686757853, "flos": 15667538843520.0, "grad_norm": 181.75661215739916, "language_loss": 0.80228043, "learning_rate": 3.271877933216558e-07, "loss": 0.81662297, "num_input_tokens_seen": 294490645, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.20422363, "step": 13653, "time_per_iteration": 2.6369502544403076 }, { "auxiliary_loss_clip": 0.01271833, "auxiliary_loss_mlp": 0.00227553, "balance_loss_clip": 1.04706132, "balance_loss_mlp": 0.20175579, "epoch": 0.8209228919284534, "flos": 37482659516160.0, "grad_norm": 17.004547955933646, "language_loss": 0.70912635, "learning_rate": 3.269743571056451e-07, "loss": 0.72412014, "num_input_tokens_seen": 294513500, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.25830078, "step": 13654, "time_per_iteration": 2.7935566902160645 }, { "auxiliary_loss_clip": 0.01254462, "auxiliary_loss_mlp": 0.00221711, "balance_loss_clip": 1.03050542, "balance_loss_mlp": 0.19722551, "epoch": 0.8209830151811213, "flos": 23112969863040.0, "grad_norm": 2.707119339929814, "language_loss": 0.76125014, "learning_rate": 3.2676098433093447e-07, "loss": 0.77601182, "num_input_tokens_seen": 294535710, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.24487305, "step": 13655, "time_per_iteration": 4.268051624298096 }, { "auxiliary_loss_clip": 0.01239907, "auxiliary_loss_mlp": 0.00218908, "balance_loss_clip": 1.02497005, "balance_loss_mlp": 0.19565003, "epoch": 0.8210431384337893, "flos": 21288169175040.0, "grad_norm": 81.889303743954, "language_loss": 0.89060974, "learning_rate": 3.265476750056162e-07, "loss": 0.90519786, "num_input_tokens_seen": 294554055, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.23242188, "step": 13656, "time_per_iteration": 2.804494857788086 }, { "auxiliary_loss_clip": 0.01245342, "auxiliary_loss_mlp": 0.0024035, "balance_loss_clip": 1.02566564, "balance_loss_mlp": 0.21424335, "epoch": 0.8211032616864572, "flos": 11502403516800.0, "grad_norm": 26.257839822303094, "language_loss": 0.82505226, "learning_rate": 3.2633442913777654e-07, "loss": 0.83990914, "num_input_tokens_seen": 294570390, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.2611084, "step": 13657, "time_per_iteration": 2.8462464809417725 }, { "auxiliary_loss_clip": 0.01238647, "auxiliary_loss_mlp": 0.0018821, "balance_loss_clip": 1.02005768, "balance_loss_mlp": 0.16380775, "epoch": 0.8211633849391252, "flos": 29821477455360.0, "grad_norm": 16.67974773419019, "language_loss": 0.64542651, "learning_rate": 3.2612124673550325e-07, "loss": 0.65969503, "num_input_tokens_seen": 294593050, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24401855, "step": 13658, "time_per_iteration": 2.7173759937286377 }, { "auxiliary_loss_clip": 0.01235294, "auxiliary_loss_mlp": 0.00213987, "balance_loss_clip": 1.01976478, "balance_loss_mlp": 0.1910753, "epoch": 0.8212235081917931, "flos": 13115439573120.0, "grad_norm": 41.57184216171161, "language_loss": 0.89648181, "learning_rate": 3.259081278068805e-07, "loss": 0.91097462, "num_input_tokens_seen": 294608550, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.22924805, "step": 13659, "time_per_iteration": 4.0472471714019775 }, { "auxiliary_loss_clip": 0.01216562, "auxiliary_loss_mlp": 0.00224556, "balance_loss_clip": 1.01067019, "balance_loss_mlp": 0.20283625, "epoch": 0.8212836314444611, "flos": 40515351782400.0, "grad_norm": 7.205116579726877, "language_loss": 0.65633023, "learning_rate": 3.256950723599887e-07, "loss": 0.67074138, "num_input_tokens_seen": 294630380, "router_z_loss_clip": 2.05761719, "router_z_loss_mlp": 0.21716309, "step": 13660, "time_per_iteration": 2.860856056213379 }, { "auxiliary_loss_clip": 0.01250479, "auxiliary_loss_mlp": 0.0024773, "balance_loss_clip": 1.03152609, "balance_loss_mlp": 0.22205274, "epoch": 0.8213437546971292, "flos": 18770543982720.0, "grad_norm": 9.301822166249279, "language_loss": 0.81717086, "learning_rate": 3.254820804029075e-07, "loss": 0.83215296, "num_input_tokens_seen": 294648655, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.2565918, "step": 13661, "time_per_iteration": 2.6207244396209717 }, { "auxiliary_loss_clip": 0.01279097, "auxiliary_loss_mlp": 0.00251919, "balance_loss_clip": 1.05228782, "balance_loss_mlp": 0.22558588, "epoch": 0.8214038779497971, "flos": 19682279925120.0, "grad_norm": 12.792296415836011, "language_loss": 0.82918072, "learning_rate": 3.252691519437143e-07, "loss": 0.84449089, "num_input_tokens_seen": 294666915, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.2635498, "step": 13662, "time_per_iteration": 2.689063549041748 }, { "auxiliary_loss_clip": 0.01108874, "auxiliary_loss_mlp": 0.00155879, "balance_loss_clip": 0.95874131, "balance_loss_mlp": 0.146008, "epoch": 0.8214640012024651, "flos": 71602969697280.0, "grad_norm": 0.7408562411072988, "language_loss": 0.5355463, "learning_rate": 3.250562869904825e-07, "loss": 0.54819381, "num_input_tokens_seen": 294731545, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.09863281, "step": 13663, "time_per_iteration": 3.314476251602173 }, { "auxiliary_loss_clip": 0.0125228, "auxiliary_loss_mlp": 0.00243212, "balance_loss_clip": 1.03496218, "balance_loss_mlp": 0.21904835, "epoch": 0.821524124455133, "flos": 14757203531520.0, "grad_norm": 29.046762703257798, "language_loss": 0.74119705, "learning_rate": 3.248434855512838e-07, "loss": 0.75615197, "num_input_tokens_seen": 294748745, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24169922, "step": 13664, "time_per_iteration": 2.6688344478607178 }, { "auxiliary_loss_clip": 0.01227981, "auxiliary_loss_mlp": 0.00235818, "balance_loss_clip": 1.01918674, "balance_loss_mlp": 0.21283421, "epoch": 0.821584247707801, "flos": 25082274965760.0, "grad_norm": 5.613330639938815, "language_loss": 0.81126535, "learning_rate": 3.246307476341881e-07, "loss": 0.8259033, "num_input_tokens_seen": 294768955, "router_z_loss_clip": 2.08886719, "router_z_loss_mlp": 0.22998047, "step": 13665, "time_per_iteration": 2.6938154697418213 }, { "auxiliary_loss_clip": 0.01240915, "auxiliary_loss_mlp": 0.0022119, "balance_loss_clip": 1.02450442, "balance_loss_mlp": 0.19694272, "epoch": 0.8216443709604689, "flos": 36830701710720.0, "grad_norm": 19.312257949647538, "language_loss": 0.75910032, "learning_rate": 3.2441807324726256e-07, "loss": 0.77372134, "num_input_tokens_seen": 294789250, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.24243164, "step": 13666, "time_per_iteration": 2.773989677429199 }, { "auxiliary_loss_clip": 0.01249972, "auxiliary_loss_mlp": 0.00212411, "balance_loss_clip": 1.03317082, "balance_loss_mlp": 0.18883166, "epoch": 0.821704494213137, "flos": 25081808088960.0, "grad_norm": 2.3392429262338355, "language_loss": 0.84126705, "learning_rate": 3.2420546239857174e-07, "loss": 0.85589087, "num_input_tokens_seen": 294809760, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.2355957, "step": 13667, "time_per_iteration": 2.7081947326660156 }, { "auxiliary_loss_clip": 0.01250573, "auxiliary_loss_mlp": 0.00220989, "balance_loss_clip": 1.03033924, "balance_loss_mlp": 0.19720653, "epoch": 0.8217646174658049, "flos": 14356117290240.0, "grad_norm": 32.91499290911049, "language_loss": 0.84639972, "learning_rate": 3.239929150961773e-07, "loss": 0.86111534, "num_input_tokens_seen": 294826495, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.2376709, "step": 13668, "time_per_iteration": 2.6379787921905518 }, { "auxiliary_loss_clip": 0.01247541, "auxiliary_loss_mlp": 0.00222743, "balance_loss_clip": 1.03325963, "balance_loss_mlp": 0.19836485, "epoch": 0.8218247407184729, "flos": 22090557139200.0, "grad_norm": 66.67600582677461, "language_loss": 0.83786201, "learning_rate": 3.2378043134813984e-07, "loss": 0.85256481, "num_input_tokens_seen": 294845370, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.24389648, "step": 13669, "time_per_iteration": 2.7121329307556152 }, { "auxiliary_loss_clip": 0.01235252, "auxiliary_loss_mlp": 0.00204454, "balance_loss_clip": 1.02056801, "balance_loss_mlp": 0.18193524, "epoch": 0.8218848639711408, "flos": 16764035368320.0, "grad_norm": 10.91392769205187, "language_loss": 0.84687203, "learning_rate": 3.235680111625161e-07, "loss": 0.86126906, "num_input_tokens_seen": 294863740, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.22521973, "step": 13670, "time_per_iteration": 2.6542582511901855 }, { "auxiliary_loss_clip": 0.01265687, "auxiliary_loss_mlp": 0.00245999, "balance_loss_clip": 1.04445469, "balance_loss_mlp": 0.22017856, "epoch": 0.8219449872238088, "flos": 25994801007360.0, "grad_norm": 440.46525290282216, "language_loss": 0.82288915, "learning_rate": 3.2335565454736123e-07, "loss": 0.83800602, "num_input_tokens_seen": 294882815, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25830078, "step": 13671, "time_per_iteration": 2.68607759475708 }, { "auxiliary_loss_clip": 0.01288915, "auxiliary_loss_mlp": 0.0023902, "balance_loss_clip": 1.05405521, "balance_loss_mlp": 0.2109459, "epoch": 0.8220051104764767, "flos": 20778094091520.0, "grad_norm": 38.35309260010062, "language_loss": 0.84440565, "learning_rate": 3.23143361510728e-07, "loss": 0.859685, "num_input_tokens_seen": 294901985, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.28063965, "step": 13672, "time_per_iteration": 2.6699037551879883 }, { "auxiliary_loss_clip": 0.0124324, "auxiliary_loss_mlp": 0.00252, "balance_loss_clip": 1.02960837, "balance_loss_mlp": 0.22704962, "epoch": 0.8220652337291448, "flos": 14574849160320.0, "grad_norm": 18.142793899282335, "language_loss": 0.81932831, "learning_rate": 3.2293113206066733e-07, "loss": 0.83428061, "num_input_tokens_seen": 294919705, "router_z_loss_clip": 2.13574219, "router_z_loss_mlp": 0.24951172, "step": 13673, "time_per_iteration": 2.7024781703948975 }, { "auxiliary_loss_clip": 0.01256177, "auxiliary_loss_mlp": 0.00232818, "balance_loss_clip": 1.03212619, "balance_loss_mlp": 0.20828438, "epoch": 0.8221253569818128, "flos": 23805866194560.0, "grad_norm": 72.21136026046727, "language_loss": 0.87313193, "learning_rate": 3.227189662052254e-07, "loss": 0.88802189, "num_input_tokens_seen": 294939900, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.24523926, "step": 13674, "time_per_iteration": 2.6698179244995117 }, { "auxiliary_loss_clip": 0.01263377, "auxiliary_loss_mlp": 0.00253738, "balance_loss_clip": 1.03981853, "balance_loss_mlp": 0.22835883, "epoch": 0.8221854802344807, "flos": 21288241002240.0, "grad_norm": 5.878062521689124, "language_loss": 0.79801971, "learning_rate": 3.225068639524484e-07, "loss": 0.81319082, "num_input_tokens_seen": 294959110, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25390625, "step": 13675, "time_per_iteration": 2.6710855960845947 }, { "auxiliary_loss_clip": 0.01234729, "auxiliary_loss_mlp": 0.00231161, "balance_loss_clip": 1.02227378, "balance_loss_mlp": 0.20801061, "epoch": 0.8222456034871487, "flos": 20956785275520.0, "grad_norm": 6.541662131526932, "language_loss": 0.80523026, "learning_rate": 3.2229482531037965e-07, "loss": 0.81988919, "num_input_tokens_seen": 294978660, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.23156738, "step": 13676, "time_per_iteration": 2.7572646141052246 }, { "auxiliary_loss_clip": 0.01229733, "auxiliary_loss_mlp": 0.00242074, "balance_loss_clip": 1.02137756, "balance_loss_mlp": 0.21922135, "epoch": 0.8223057267398166, "flos": 21397517153280.0, "grad_norm": 25.893425971811478, "language_loss": 0.87010145, "learning_rate": 3.2208285028705893e-07, "loss": 0.88481957, "num_input_tokens_seen": 294998075, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.2286377, "step": 13677, "time_per_iteration": 2.6480190753936768 }, { "auxiliary_loss_clip": 0.01246649, "auxiliary_loss_mlp": 0.0024106, "balance_loss_clip": 1.03006721, "balance_loss_mlp": 0.21681239, "epoch": 0.8223658499924846, "flos": 15268212368640.0, "grad_norm": 172.844779936462, "language_loss": 0.81316245, "learning_rate": 3.218709388905245e-07, "loss": 0.82803953, "num_input_tokens_seen": 295015950, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24243164, "step": 13678, "time_per_iteration": 2.678593635559082 }, { "auxiliary_loss_clip": 0.01249737, "auxiliary_loss_mlp": 0.00238603, "balance_loss_clip": 1.03460026, "balance_loss_mlp": 0.21358114, "epoch": 0.8224259732451525, "flos": 31249537447680.0, "grad_norm": 34.378082543398804, "language_loss": 0.80116951, "learning_rate": 3.216590911288133e-07, "loss": 0.81605291, "num_input_tokens_seen": 295036800, "router_z_loss_clip": 2.15332031, "router_z_loss_mlp": 0.25036621, "step": 13679, "time_per_iteration": 2.7520902156829834 }, { "auxiliary_loss_clip": 0.01229325, "auxiliary_loss_mlp": 0.00213411, "balance_loss_clip": 1.01534271, "balance_loss_mlp": 0.18999822, "epoch": 0.8224860964978206, "flos": 21574628138880.0, "grad_norm": 4.042398476127678, "language_loss": 0.7843374, "learning_rate": 3.214473070099564e-07, "loss": 0.79876477, "num_input_tokens_seen": 295055300, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.23413086, "step": 13680, "time_per_iteration": 2.720616340637207 }, { "auxiliary_loss_clip": 0.01247513, "auxiliary_loss_mlp": 0.00234018, "balance_loss_clip": 1.03880715, "balance_loss_mlp": 0.2103077, "epoch": 0.8225462197504885, "flos": 25483217552640.0, "grad_norm": 47.33663552588526, "language_loss": 0.67661369, "learning_rate": 3.21235586541986e-07, "loss": 0.69142902, "num_input_tokens_seen": 295076420, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.23693848, "step": 13681, "time_per_iteration": 2.686305284500122 }, { "auxiliary_loss_clip": 0.01243938, "auxiliary_loss_mlp": 0.00240993, "balance_loss_clip": 1.02640605, "balance_loss_mlp": 0.21648324, "epoch": 0.8226063430031565, "flos": 39385458587520.0, "grad_norm": 13.530218311318416, "language_loss": 0.77126098, "learning_rate": 3.2102392973293047e-07, "loss": 0.78611028, "num_input_tokens_seen": 295100540, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24499512, "step": 13682, "time_per_iteration": 2.858214855194092 }, { "auxiliary_loss_clip": 0.01250528, "auxiliary_loss_mlp": 0.00235185, "balance_loss_clip": 1.03342938, "balance_loss_mlp": 0.20962654, "epoch": 0.8226664662558244, "flos": 22815269942400.0, "grad_norm": 6.374335967324102, "language_loss": 0.87066352, "learning_rate": 3.20812336590816e-07, "loss": 0.8855207, "num_input_tokens_seen": 295120180, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.25549316, "step": 13683, "time_per_iteration": 2.6456854343414307 }, { "auxiliary_loss_clip": 0.01237029, "auxiliary_loss_mlp": 0.0022887, "balance_loss_clip": 1.02489209, "balance_loss_mlp": 0.20567146, "epoch": 0.8227265895084924, "flos": 25665607837440.0, "grad_norm": 54.25876249103956, "language_loss": 0.95874703, "learning_rate": 3.206008071236661e-07, "loss": 0.97340602, "num_input_tokens_seen": 295138530, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.23193359, "step": 13684, "time_per_iteration": 2.737603187561035 }, { "auxiliary_loss_clip": 0.01228565, "auxiliary_loss_mlp": 0.00213356, "balance_loss_clip": 1.01550329, "balance_loss_mlp": 0.18901309, "epoch": 0.8227867127611603, "flos": 26179274280960.0, "grad_norm": 71.91474888039888, "language_loss": 0.84901667, "learning_rate": 3.2038934133950157e-07, "loss": 0.86343592, "num_input_tokens_seen": 295160260, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.24365234, "step": 13685, "time_per_iteration": 2.7387423515319824 }, { "auxiliary_loss_clip": 0.01245198, "auxiliary_loss_mlp": 0.0021491, "balance_loss_clip": 1.02883542, "balance_loss_mlp": 0.19067426, "epoch": 0.8228468360138284, "flos": 22018053536640.0, "grad_norm": 9.175162194788308, "language_loss": 0.75161052, "learning_rate": 3.2017793924634194e-07, "loss": 0.76621151, "num_input_tokens_seen": 295177055, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.24243164, "step": 13686, "time_per_iteration": 2.6856203079223633 }, { "auxiliary_loss_clip": 0.01252212, "auxiliary_loss_mlp": 0.00212692, "balance_loss_clip": 1.03255177, "balance_loss_mlp": 0.18710992, "epoch": 0.8229069592664963, "flos": 14903359971840.0, "grad_norm": 6.618273250106911, "language_loss": 0.8738755, "learning_rate": 3.1996660085220263e-07, "loss": 0.88852453, "num_input_tokens_seen": 295193870, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.25622559, "step": 13687, "time_per_iteration": 2.642263412475586 }, { "auxiliary_loss_clip": 0.01250415, "auxiliary_loss_mlp": 0.0022075, "balance_loss_clip": 1.03233647, "balance_loss_mlp": 0.19613338, "epoch": 0.8229670825191643, "flos": 15669478177920.0, "grad_norm": 8.116875019613335, "language_loss": 0.80769032, "learning_rate": 3.1975532616509825e-07, "loss": 0.82240194, "num_input_tokens_seen": 295211040, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24597168, "step": 13688, "time_per_iteration": 2.706721305847168 }, { "auxiliary_loss_clip": 0.01237136, "auxiliary_loss_mlp": 0.00221888, "balance_loss_clip": 1.02440405, "balance_loss_mlp": 0.19907084, "epoch": 0.8230272057718323, "flos": 23183498217600.0, "grad_norm": 11.290484975133644, "language_loss": 0.79901791, "learning_rate": 3.1954411519304025e-07, "loss": 0.81360817, "num_input_tokens_seen": 295231300, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.22802734, "step": 13689, "time_per_iteration": 2.7404541969299316 }, { "auxiliary_loss_clip": 0.01257332, "auxiliary_loss_mlp": 0.0022619, "balance_loss_clip": 1.03241658, "balance_loss_mlp": 0.1988195, "epoch": 0.8230873290245002, "flos": 21032413361280.0, "grad_norm": 2.713952788341214, "language_loss": 0.75348747, "learning_rate": 3.1933296794403887e-07, "loss": 0.76832271, "num_input_tokens_seen": 295251045, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.27416992, "step": 13690, "time_per_iteration": 2.7195749282836914 }, { "auxiliary_loss_clip": 0.01242649, "auxiliary_loss_mlp": 0.002309, "balance_loss_clip": 1.02938831, "balance_loss_mlp": 0.20780903, "epoch": 0.8231474522771682, "flos": 21250139650560.0, "grad_norm": 70.44654865624261, "language_loss": 0.92159998, "learning_rate": 3.191218844260988e-07, "loss": 0.93633544, "num_input_tokens_seen": 295270225, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23083496, "step": 13691, "time_per_iteration": 4.133214235305786 }, { "auxiliary_loss_clip": 0.01253323, "auxiliary_loss_mlp": 0.00233414, "balance_loss_clip": 1.03353584, "balance_loss_mlp": 0.2098701, "epoch": 0.8232075755298361, "flos": 23842028211840.0, "grad_norm": 188.86806102349396, "language_loss": 0.83057982, "learning_rate": 3.189108646472252e-07, "loss": 0.84544718, "num_input_tokens_seen": 295288950, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.2355957, "step": 13692, "time_per_iteration": 4.138348817825317 }, { "auxiliary_loss_clip": 0.01256276, "auxiliary_loss_mlp": 0.00219079, "balance_loss_clip": 1.03657186, "balance_loss_mlp": 0.19503444, "epoch": 0.8232676987825042, "flos": 21653955325440.0, "grad_norm": 1.703064595473075, "language_loss": 0.76884782, "learning_rate": 3.186999086154205e-07, "loss": 0.78360134, "num_input_tokens_seen": 295309405, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24047852, "step": 13693, "time_per_iteration": 2.6602659225463867 }, { "auxiliary_loss_clip": 0.01235106, "auxiliary_loss_mlp": 0.00227539, "balance_loss_clip": 1.02553749, "balance_loss_mlp": 0.20618893, "epoch": 0.8233278220351721, "flos": 26322701287680.0, "grad_norm": 5.9562302222932, "language_loss": 0.88839418, "learning_rate": 3.1848901633868355e-07, "loss": 0.90302062, "num_input_tokens_seen": 295331115, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.21350098, "step": 13694, "time_per_iteration": 2.747666358947754 }, { "auxiliary_loss_clip": 0.01239788, "auxiliary_loss_mlp": 0.00217655, "balance_loss_clip": 1.0233674, "balance_loss_mlp": 0.19175132, "epoch": 0.8233879452878401, "flos": 21725812483200.0, "grad_norm": 5.869027643170198, "language_loss": 0.84043533, "learning_rate": 3.182781878250118e-07, "loss": 0.85500979, "num_input_tokens_seen": 295350495, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.2590332, "step": 13695, "time_per_iteration": 2.6517174243927 }, { "auxiliary_loss_clip": 0.01220973, "auxiliary_loss_mlp": 0.0020508, "balance_loss_clip": 1.01205659, "balance_loss_mlp": 0.18319336, "epoch": 0.823448068540508, "flos": 20557746109440.0, "grad_norm": 5.090813151301775, "language_loss": 0.87449217, "learning_rate": 3.1806742308239985e-07, "loss": 0.8887527, "num_input_tokens_seen": 295368225, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.21875, "step": 13696, "time_per_iteration": 2.6309125423431396 }, { "auxiliary_loss_clip": 0.01105385, "auxiliary_loss_mlp": 0.00110521, "balance_loss_clip": 0.95935684, "balance_loss_mlp": 0.10322586, "epoch": 0.823508191793176, "flos": 67273688194560.0, "grad_norm": 0.7262349095254194, "language_loss": 0.63090885, "learning_rate": 3.178567221188393e-07, "loss": 0.64306796, "num_input_tokens_seen": 295430035, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.07275391, "step": 13697, "time_per_iteration": 4.700345754623413 }, { "auxiliary_loss_clip": 0.01237537, "auxiliary_loss_mlp": 0.00217337, "balance_loss_clip": 1.02631581, "balance_loss_mlp": 0.19574869, "epoch": 0.8235683150458439, "flos": 17928402641280.0, "grad_norm": 26.340340670157833, "language_loss": 0.79147303, "learning_rate": 3.1764608494232037e-07, "loss": 0.80602169, "num_input_tokens_seen": 295447765, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.21569824, "step": 13698, "time_per_iteration": 2.605940818786621 }, { "auxiliary_loss_clip": 0.01239521, "auxiliary_loss_mlp": 0.002269, "balance_loss_clip": 1.01893389, "balance_loss_mlp": 0.20248568, "epoch": 0.823628438298512, "flos": 18916089891840.0, "grad_norm": 3.23361335005492, "language_loss": 0.80313373, "learning_rate": 3.174355115608305e-07, "loss": 0.81779796, "num_input_tokens_seen": 295464810, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.24389648, "step": 13699, "time_per_iteration": 2.7057089805603027 }, { "auxiliary_loss_clip": 0.01230184, "auxiliary_loss_mlp": 0.002096, "balance_loss_clip": 1.01932991, "balance_loss_mlp": 0.18745068, "epoch": 0.8236885615511799, "flos": 18696460181760.0, "grad_norm": 92.54941802529576, "language_loss": 0.90949273, "learning_rate": 3.1722500198235526e-07, "loss": 0.92389059, "num_input_tokens_seen": 295482605, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.22167969, "step": 13700, "time_per_iteration": 2.659273386001587 }, { "auxiliary_loss_clip": 0.01240607, "auxiliary_loss_mlp": 0.00220151, "balance_loss_clip": 1.0226512, "balance_loss_mlp": 0.19561726, "epoch": 0.8237486848038479, "flos": 23695009845120.0, "grad_norm": 63.76990812076491, "language_loss": 0.80563623, "learning_rate": 3.170145562148763e-07, "loss": 0.82024384, "num_input_tokens_seen": 295503780, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.2454834, "step": 13701, "time_per_iteration": 4.130049705505371 }, { "auxiliary_loss_clip": 0.01252803, "auxiliary_loss_mlp": 0.00236022, "balance_loss_clip": 1.03082907, "balance_loss_mlp": 0.21122669, "epoch": 0.8238088080565159, "flos": 23441301106560.0, "grad_norm": 45.3421641611917, "language_loss": 0.80697507, "learning_rate": 3.1680417426637384e-07, "loss": 0.82186329, "num_input_tokens_seen": 295522035, "router_z_loss_clip": 2.21582031, "router_z_loss_mlp": 0.2479248, "step": 13702, "time_per_iteration": 2.6792609691619873 }, { "auxiliary_loss_clip": 0.01255957, "auxiliary_loss_mlp": 0.00223299, "balance_loss_clip": 1.03338742, "balance_loss_mlp": 0.19701307, "epoch": 0.8238689313091838, "flos": 22746537267840.0, "grad_norm": 98.87493402274612, "language_loss": 0.8288964, "learning_rate": 3.1659385614482603e-07, "loss": 0.84368896, "num_input_tokens_seen": 295541190, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26269531, "step": 13703, "time_per_iteration": 2.6331515312194824 }, { "auxiliary_loss_clip": 0.01260561, "auxiliary_loss_mlp": 0.00230415, "balance_loss_clip": 1.038872, "balance_loss_mlp": 0.20386674, "epoch": 0.8239290545618518, "flos": 25630092264960.0, "grad_norm": 12.134572155476082, "language_loss": 0.79572046, "learning_rate": 3.1638360185820755e-07, "loss": 0.8106302, "num_input_tokens_seen": 295558860, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26501465, "step": 13704, "time_per_iteration": 2.7313072681427 }, { "auxiliary_loss_clip": 0.01247891, "auxiliary_loss_mlp": 0.00227987, "balance_loss_clip": 1.03171611, "balance_loss_mlp": 0.20433626, "epoch": 0.8239891778145197, "flos": 26026473824640.0, "grad_norm": 32.560862376204454, "language_loss": 0.71583915, "learning_rate": 3.161734114144916e-07, "loss": 0.73059791, "num_input_tokens_seen": 295578155, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.23669434, "step": 13705, "time_per_iteration": 2.739715337753296 }, { "auxiliary_loss_clip": 0.0124168, "auxiliary_loss_mlp": 0.00254802, "balance_loss_clip": 1.0227921, "balance_loss_mlp": 0.22955298, "epoch": 0.8240493010671878, "flos": 21833257040640.0, "grad_norm": 13.816152728300471, "language_loss": 0.78797925, "learning_rate": 3.1596328482164915e-07, "loss": 0.80294406, "num_input_tokens_seen": 295599170, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25256348, "step": 13706, "time_per_iteration": 2.724320650100708 }, { "auxiliary_loss_clip": 0.01250745, "auxiliary_loss_mlp": 0.0021616, "balance_loss_clip": 1.03473306, "balance_loss_mlp": 0.19157901, "epoch": 0.8241094243198557, "flos": 18551919853440.0, "grad_norm": 11.329205204278571, "language_loss": 0.78145373, "learning_rate": 3.157532220876475e-07, "loss": 0.79612279, "num_input_tokens_seen": 295617465, "router_z_loss_clip": 2.15917969, "router_z_loss_mlp": 0.24609375, "step": 13707, "time_per_iteration": 2.712522268295288 }, { "auxiliary_loss_clip": 0.01243916, "auxiliary_loss_mlp": 0.00235078, "balance_loss_clip": 1.02715433, "balance_loss_mlp": 0.2104378, "epoch": 0.8241695475725237, "flos": 25447163276160.0, "grad_norm": 9.378367201950107, "language_loss": 0.85314208, "learning_rate": 3.1554322322045226e-07, "loss": 0.86793208, "num_input_tokens_seen": 295634960, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24658203, "step": 13708, "time_per_iteration": 2.687114715576172 }, { "auxiliary_loss_clip": 0.01244743, "auxiliary_loss_mlp": 0.00212005, "balance_loss_clip": 1.0303781, "balance_loss_mlp": 0.18861581, "epoch": 0.8242296708251916, "flos": 18989670902400.0, "grad_norm": 13.98955378160717, "language_loss": 0.77171767, "learning_rate": 3.1533328822802664e-07, "loss": 0.78628516, "num_input_tokens_seen": 295652725, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.23400879, "step": 13709, "time_per_iteration": 2.6638786792755127 }, { "auxiliary_loss_clip": 0.01250175, "auxiliary_loss_mlp": 0.00234275, "balance_loss_clip": 1.03313017, "balance_loss_mlp": 0.21003985, "epoch": 0.8242897940778596, "flos": 22600883617920.0, "grad_norm": 27.04804398585057, "language_loss": 0.91761935, "learning_rate": 3.151234171183319e-07, "loss": 0.93246394, "num_input_tokens_seen": 295671195, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24206543, "step": 13710, "time_per_iteration": 2.7124404907226562 }, { "auxiliary_loss_clip": 0.01227089, "auxiliary_loss_mlp": 0.00213394, "balance_loss_clip": 1.01686203, "balance_loss_mlp": 0.19113779, "epoch": 0.8243499173305275, "flos": 21468153248640.0, "grad_norm": 99.74923823045009, "language_loss": 0.84715009, "learning_rate": 3.149136098993257e-07, "loss": 0.86155498, "num_input_tokens_seen": 295689130, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.22241211, "step": 13711, "time_per_iteration": 2.7122344970703125 }, { "auxiliary_loss_clip": 0.01241908, "auxiliary_loss_mlp": 0.00212587, "balance_loss_clip": 1.02648282, "balance_loss_mlp": 0.1877677, "epoch": 0.8244100405831956, "flos": 20010359773440.0, "grad_norm": 28.45550344578042, "language_loss": 0.73181266, "learning_rate": 3.1470386657896473e-07, "loss": 0.74635756, "num_input_tokens_seen": 295706385, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24816895, "step": 13712, "time_per_iteration": 2.632077693939209 }, { "auxiliary_loss_clip": 0.01237037, "auxiliary_loss_mlp": 0.00232018, "balance_loss_clip": 1.02369022, "balance_loss_mlp": 0.2094878, "epoch": 0.8244701638358635, "flos": 26430684549120.0, "grad_norm": 4.149625348004869, "language_loss": 0.81302315, "learning_rate": 3.14494187165202e-07, "loss": 0.82771373, "num_input_tokens_seen": 295727925, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.2253418, "step": 13713, "time_per_iteration": 2.730330228805542 }, { "auxiliary_loss_clip": 0.01254818, "auxiliary_loss_mlp": 0.00215109, "balance_loss_clip": 1.03582156, "balance_loss_mlp": 0.19237542, "epoch": 0.8245302870885315, "flos": 17640004343040.0, "grad_norm": 2.886085383691765, "language_loss": 0.89391631, "learning_rate": 3.1428457166598833e-07, "loss": 0.90861559, "num_input_tokens_seen": 295744420, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.22717285, "step": 13714, "time_per_iteration": 2.610612630844116 }, { "auxiliary_loss_clip": 0.01236079, "auxiliary_loss_mlp": 0.00235219, "balance_loss_clip": 1.02101314, "balance_loss_mlp": 0.21067351, "epoch": 0.8245904103411995, "flos": 26209510554240.0, "grad_norm": 68.65541869933067, "language_loss": 0.73131657, "learning_rate": 3.1407502008927235e-07, "loss": 0.7460295, "num_input_tokens_seen": 295765105, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24536133, "step": 13715, "time_per_iteration": 2.736536741256714 }, { "auxiliary_loss_clip": 0.01231519, "auxiliary_loss_mlp": 0.00234092, "balance_loss_clip": 1.01908624, "balance_loss_mlp": 0.20933272, "epoch": 0.8246505335938674, "flos": 24205084928640.0, "grad_norm": 5.250951576341245, "language_loss": 0.8265897, "learning_rate": 3.1386553244300086e-07, "loss": 0.84124577, "num_input_tokens_seen": 295784200, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.24755859, "step": 13716, "time_per_iteration": 2.648739814758301 }, { "auxiliary_loss_clip": 0.01120863, "auxiliary_loss_mlp": 0.00075684, "balance_loss_clip": 0.97442561, "balance_loss_mlp": 0.06862691, "epoch": 0.8247106568465354, "flos": 67092195749760.0, "grad_norm": 0.7110320997913081, "language_loss": 0.58483642, "learning_rate": 3.136561087351175e-07, "loss": 0.59680194, "num_input_tokens_seen": 295846555, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.07080078, "step": 13717, "time_per_iteration": 3.3186609745025635 }, { "auxiliary_loss_clip": 0.01235099, "auxiliary_loss_mlp": 0.00201219, "balance_loss_clip": 1.02276647, "balance_loss_mlp": 0.17840239, "epoch": 0.8247707800992033, "flos": 12568232805120.0, "grad_norm": 35.70377066259417, "language_loss": 0.91626281, "learning_rate": 3.1344674897356373e-07, "loss": 0.93062598, "num_input_tokens_seen": 295863425, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.22814941, "step": 13718, "time_per_iteration": 2.6424427032470703 }, { "auxiliary_loss_clip": 0.01233468, "auxiliary_loss_mlp": 0.00228118, "balance_loss_clip": 1.02066708, "balance_loss_mlp": 0.20379902, "epoch": 0.8248309033518714, "flos": 15923617879680.0, "grad_norm": 73.73984528235015, "language_loss": 0.75610912, "learning_rate": 3.132374531662778e-07, "loss": 0.77072489, "num_input_tokens_seen": 295880925, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.24328613, "step": 13719, "time_per_iteration": 2.733717203140259 }, { "auxiliary_loss_clip": 0.01243472, "auxiliary_loss_mlp": 0.0023118, "balance_loss_clip": 1.02771688, "balance_loss_mlp": 0.20752853, "epoch": 0.8248910266045393, "flos": 17564735393280.0, "grad_norm": 4.756571579354355, "language_loss": 0.80945337, "learning_rate": 3.13028221321197e-07, "loss": 0.82419991, "num_input_tokens_seen": 295898205, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.2364502, "step": 13720, "time_per_iteration": 2.6479713916778564 }, { "auxiliary_loss_clip": 0.01246253, "auxiliary_loss_mlp": 0.0023609, "balance_loss_clip": 1.02906942, "balance_loss_mlp": 0.21158043, "epoch": 0.8249511498572073, "flos": 28619655275520.0, "grad_norm": 20.576203577300436, "language_loss": 0.80852336, "learning_rate": 3.1281905344625467e-07, "loss": 0.82334685, "num_input_tokens_seen": 295918130, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24511719, "step": 13721, "time_per_iteration": 2.702624559402466 }, { "auxiliary_loss_clip": 0.01236102, "auxiliary_loss_mlp": 0.00211034, "balance_loss_clip": 1.02380729, "balance_loss_mlp": 0.18961197, "epoch": 0.8250112731098752, "flos": 25556583081600.0, "grad_norm": 108.77752603267722, "language_loss": 0.84982389, "learning_rate": 3.1260994954938305e-07, "loss": 0.86429524, "num_input_tokens_seen": 295937760, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.21398926, "step": 13722, "time_per_iteration": 2.799842119216919 }, { "auxiliary_loss_clip": 0.01228403, "auxiliary_loss_mlp": 0.00215937, "balance_loss_clip": 1.01953173, "balance_loss_mlp": 0.19319192, "epoch": 0.8250713963625432, "flos": 27746164339200.0, "grad_norm": 2.309025607945843, "language_loss": 0.71246827, "learning_rate": 3.1240090963851205e-07, "loss": 0.72691166, "num_input_tokens_seen": 295957585, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.2277832, "step": 13723, "time_per_iteration": 2.675764560699463 }, { "auxiliary_loss_clip": 0.01232261, "auxiliary_loss_mlp": 0.00224283, "balance_loss_clip": 1.01723194, "balance_loss_mlp": 0.20138261, "epoch": 0.8251315196152111, "flos": 21610610588160.0, "grad_norm": 6.623763668634026, "language_loss": 0.80538511, "learning_rate": 3.121919337215666e-07, "loss": 0.81995058, "num_input_tokens_seen": 295977135, "router_z_loss_clip": 2.15332031, "router_z_loss_mlp": 0.22912598, "step": 13724, "time_per_iteration": 2.66599702835083 }, { "auxiliary_loss_clip": 0.01241447, "auxiliary_loss_mlp": 0.00222984, "balance_loss_clip": 1.0249002, "balance_loss_mlp": 0.19784269, "epoch": 0.8251916428678792, "flos": 28579363194240.0, "grad_norm": 2.3863167282345517, "language_loss": 0.72928286, "learning_rate": 3.1198302180647253e-07, "loss": 0.74392718, "num_input_tokens_seen": 295996265, "router_z_loss_clip": 2.16894531, "router_z_loss_mlp": 0.2512207, "step": 13725, "time_per_iteration": 2.671313762664795 }, { "auxiliary_loss_clip": 0.01236678, "auxiliary_loss_mlp": 0.00237116, "balance_loss_clip": 1.02170467, "balance_loss_mlp": 0.21291672, "epoch": 0.8252517661205471, "flos": 23075191733760.0, "grad_norm": 15.060454260926559, "language_loss": 0.88056469, "learning_rate": 3.1177417390115125e-07, "loss": 0.89530265, "num_input_tokens_seen": 296014745, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24194336, "step": 13726, "time_per_iteration": 2.685744285583496 }, { "auxiliary_loss_clip": 0.01230692, "auxiliary_loss_mlp": 0.00223333, "balance_loss_clip": 1.02219033, "balance_loss_mlp": 0.2010649, "epoch": 0.8253118893732151, "flos": 31759576617600.0, "grad_norm": 9.879298219852705, "language_loss": 0.77446032, "learning_rate": 3.1156539001352286e-07, "loss": 0.78900057, "num_input_tokens_seen": 296036960, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.22277832, "step": 13727, "time_per_iteration": 2.725353956222534 }, { "auxiliary_loss_clip": 0.01262121, "auxiliary_loss_mlp": 0.00215735, "balance_loss_clip": 1.03982079, "balance_loss_mlp": 0.19129729, "epoch": 0.8253720126258831, "flos": 18296415434880.0, "grad_norm": 12.68690984681354, "language_loss": 0.73351455, "learning_rate": 3.113566701515036e-07, "loss": 0.74829304, "num_input_tokens_seen": 296056540, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.24450684, "step": 13728, "time_per_iteration": 2.654900312423706 }, { "auxiliary_loss_clip": 0.01271623, "auxiliary_loss_mlp": 0.00219986, "balance_loss_clip": 1.04398966, "balance_loss_mlp": 0.19365218, "epoch": 0.825432135878551, "flos": 26797332625920.0, "grad_norm": 25.478564419143616, "language_loss": 0.80749714, "learning_rate": 3.111480143230092e-07, "loss": 0.82241321, "num_input_tokens_seen": 296077950, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.26330566, "step": 13729, "time_per_iteration": 2.716139316558838 }, { "auxiliary_loss_clip": 0.0112564, "auxiliary_loss_mlp": 0.0010179, "balance_loss_clip": 0.97979748, "balance_loss_mlp": 0.0930166, "epoch": 0.825492259131219, "flos": 54219116217600.0, "grad_norm": 0.8720807073653413, "language_loss": 0.61714756, "learning_rate": 3.109394225359514e-07, "loss": 0.62942183, "num_input_tokens_seen": 296127060, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.08789062, "step": 13730, "time_per_iteration": 2.9684746265411377 }, { "auxiliary_loss_clip": 0.01240568, "auxiliary_loss_mlp": 0.00248936, "balance_loss_clip": 1.0288353, "balance_loss_mlp": 0.22424799, "epoch": 0.825552382383887, "flos": 43756145493120.0, "grad_norm": 12.075186337833602, "language_loss": 0.73284853, "learning_rate": 3.1073089479823945e-07, "loss": 0.74774361, "num_input_tokens_seen": 296147775, "router_z_loss_clip": 2.12011719, "router_z_loss_mlp": 0.24707031, "step": 13731, "time_per_iteration": 2.8733692169189453 }, { "auxiliary_loss_clip": 0.01273518, "auxiliary_loss_mlp": 0.00232276, "balance_loss_clip": 1.04332185, "balance_loss_mlp": 0.20593053, "epoch": 0.825612505636555, "flos": 12602814624000.0, "grad_norm": 5.1594736656851286, "language_loss": 0.78695595, "learning_rate": 3.105224311177812e-07, "loss": 0.80201387, "num_input_tokens_seen": 296163560, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.2635498, "step": 13732, "time_per_iteration": 2.6765167713165283 }, { "auxiliary_loss_clip": 0.01256289, "auxiliary_loss_mlp": 0.00228551, "balance_loss_clip": 1.0344367, "balance_loss_mlp": 0.20382693, "epoch": 0.8256726288892229, "flos": 17595618111360.0, "grad_norm": 167.93281129424227, "language_loss": 0.84550118, "learning_rate": 3.103140315024817e-07, "loss": 0.8603496, "num_input_tokens_seen": 296178730, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.24731445, "step": 13733, "time_per_iteration": 4.1236958503723145 }, { "auxiliary_loss_clip": 0.0123679, "auxiliary_loss_mlp": 0.00233739, "balance_loss_clip": 1.0230757, "balance_loss_mlp": 0.20964727, "epoch": 0.8257327521418909, "flos": 23805794367360.0, "grad_norm": 5.594116131344165, "language_loss": 0.86728245, "learning_rate": 3.1010569596024437e-07, "loss": 0.88198775, "num_input_tokens_seen": 296200175, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.2409668, "step": 13734, "time_per_iteration": 2.7720351219177246 }, { "auxiliary_loss_clip": 0.01236384, "auxiliary_loss_mlp": 0.00215903, "balance_loss_clip": 1.02265763, "balance_loss_mlp": 0.19482657, "epoch": 0.8257928753945588, "flos": 19281121856640.0, "grad_norm": 51.600220434275315, "language_loss": 0.9053669, "learning_rate": 3.098974244989676e-07, "loss": 0.91988981, "num_input_tokens_seen": 296219305, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.21081543, "step": 13735, "time_per_iteration": 4.0675859451293945 }, { "auxiliary_loss_clip": 0.01243093, "auxiliary_loss_mlp": 0.00254172, "balance_loss_clip": 1.02663863, "balance_loss_mlp": 0.22870913, "epoch": 0.8258529986472268, "flos": 18478841633280.0, "grad_norm": 8.567213070340127, "language_loss": 0.78533185, "learning_rate": 3.096892171265497e-07, "loss": 0.80030453, "num_input_tokens_seen": 296236945, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.25463867, "step": 13736, "time_per_iteration": 2.7923378944396973 }, { "auxiliary_loss_clip": 0.01135499, "auxiliary_loss_mlp": 0.00082045, "balance_loss_clip": 0.9878093, "balance_loss_mlp": 0.07412942, "epoch": 0.8259131218998947, "flos": 62137957512960.0, "grad_norm": 0.8361744366926249, "language_loss": 0.67033142, "learning_rate": 3.0948107385088665e-07, "loss": 0.68250686, "num_input_tokens_seen": 296294685, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.07910156, "step": 13737, "time_per_iteration": 3.2065865993499756 }, { "auxiliary_loss_clip": 0.01245297, "auxiliary_loss_mlp": 0.00206937, "balance_loss_clip": 1.03305912, "balance_loss_mlp": 0.18457344, "epoch": 0.8259732451525628, "flos": 22159038418560.0, "grad_norm": 3.863960758361799, "language_loss": 0.7635408, "learning_rate": 3.0927299467987e-07, "loss": 0.77806318, "num_input_tokens_seen": 296314790, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.22363281, "step": 13738, "time_per_iteration": 2.700362205505371 }, { "auxiliary_loss_clip": 0.01263875, "auxiliary_loss_mlp": 0.00234355, "balance_loss_clip": 1.04243517, "balance_loss_mlp": 0.20933321, "epoch": 0.8260333684052307, "flos": 38361645233280.0, "grad_norm": 8.493720784494672, "language_loss": 0.7580108, "learning_rate": 3.090649796213911e-07, "loss": 0.77299309, "num_input_tokens_seen": 296335355, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25024414, "step": 13739, "time_per_iteration": 4.382062196731567 }, { "auxiliary_loss_clip": 0.01135154, "auxiliary_loss_mlp": 0.00144387, "balance_loss_clip": 0.9886443, "balance_loss_mlp": 0.13527916, "epoch": 0.8260934916578987, "flos": 62185611882240.0, "grad_norm": 0.8663924147076932, "language_loss": 0.58351749, "learning_rate": 3.0885702868333853e-07, "loss": 0.59631288, "num_input_tokens_seen": 296399885, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.09130859, "step": 13740, "time_per_iteration": 3.189614772796631 }, { "auxiliary_loss_clip": 0.01277336, "auxiliary_loss_mlp": 0.00223443, "balance_loss_clip": 1.04884434, "balance_loss_mlp": 0.19677539, "epoch": 0.8261536149105667, "flos": 22565475786240.0, "grad_norm": 13.364197431927689, "language_loss": 0.854689, "learning_rate": 3.086491418735959e-07, "loss": 0.8696968, "num_input_tokens_seen": 296417660, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.26647949, "step": 13741, "time_per_iteration": 2.6721713542938232 }, { "auxiliary_loss_clip": 0.01248233, "auxiliary_loss_mlp": 0.00240956, "balance_loss_clip": 1.02797616, "balance_loss_mlp": 0.21505208, "epoch": 0.8262137381632346, "flos": 32525479342080.0, "grad_norm": 106.6984855606408, "language_loss": 0.70320237, "learning_rate": 3.0844131920004726e-07, "loss": 0.71809423, "num_input_tokens_seen": 296438255, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25927734, "step": 13742, "time_per_iteration": 2.7714455127716064 }, { "auxiliary_loss_clip": 0.012794, "auxiliary_loss_mlp": 0.00221907, "balance_loss_clip": 1.04593682, "balance_loss_mlp": 0.19586015, "epoch": 0.8262738614159026, "flos": 14136451666560.0, "grad_norm": 6.206290013136441, "language_loss": 0.85225707, "learning_rate": 3.0823356067057327e-07, "loss": 0.86727011, "num_input_tokens_seen": 296454485, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.26037598, "step": 13743, "time_per_iteration": 4.133354425430298 }, { "auxiliary_loss_clip": 0.01261048, "auxiliary_loss_mlp": 0.00219147, "balance_loss_clip": 1.04239273, "balance_loss_mlp": 0.1942682, "epoch": 0.8263339846685706, "flos": 19825347795840.0, "grad_norm": 74.9525593955595, "language_loss": 0.73916334, "learning_rate": 3.0802586629305283e-07, "loss": 0.75396532, "num_input_tokens_seen": 296473740, "router_z_loss_clip": 2.18652344, "router_z_loss_mlp": 0.24914551, "step": 13744, "time_per_iteration": 2.665152072906494 }, { "auxiliary_loss_clip": 0.01239497, "auxiliary_loss_mlp": 0.00223062, "balance_loss_clip": 1.02517974, "balance_loss_mlp": 0.19832581, "epoch": 0.8263941079212386, "flos": 22745962650240.0, "grad_norm": 28.219475610407027, "language_loss": 0.83585417, "learning_rate": 3.078182360753612e-07, "loss": 0.85047972, "num_input_tokens_seen": 296493355, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.24731445, "step": 13745, "time_per_iteration": 2.655303716659546 }, { "auxiliary_loss_clip": 0.01212838, "auxiliary_loss_mlp": 0.00205452, "balance_loss_clip": 1.0059278, "balance_loss_mlp": 0.18331502, "epoch": 0.8264542311739065, "flos": 20120641505280.0, "grad_norm": 41.39163883663468, "language_loss": 0.85554135, "learning_rate": 3.076106700253709e-07, "loss": 0.86972421, "num_input_tokens_seen": 296510520, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.22119141, "step": 13746, "time_per_iteration": 2.7531418800354004 }, { "auxiliary_loss_clip": 0.01268524, "auxiliary_loss_mlp": 0.00231303, "balance_loss_clip": 1.04368258, "balance_loss_mlp": 0.20636395, "epoch": 0.8265143544265745, "flos": 16837149502080.0, "grad_norm": 250.147131077403, "language_loss": 0.77533954, "learning_rate": 3.0740316815095415e-07, "loss": 0.7903378, "num_input_tokens_seen": 296528265, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.24926758, "step": 13747, "time_per_iteration": 2.795776128768921 }, { "auxiliary_loss_clip": 0.01246858, "auxiliary_loss_mlp": 0.00220894, "balance_loss_clip": 1.02465999, "balance_loss_mlp": 0.19500187, "epoch": 0.8265744776792424, "flos": 22018592240640.0, "grad_norm": 136.48325268213713, "language_loss": 0.82071501, "learning_rate": 3.0719573045997835e-07, "loss": 0.83539248, "num_input_tokens_seen": 296547810, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25866699, "step": 13748, "time_per_iteration": 2.711371421813965 }, { "auxiliary_loss_clip": 0.01240226, "auxiliary_loss_mlp": 0.00206766, "balance_loss_clip": 1.02271843, "balance_loss_mlp": 0.18245879, "epoch": 0.8266346009319104, "flos": 19244852098560.0, "grad_norm": 20.258204629541055, "language_loss": 0.71682024, "learning_rate": 3.069883569603102e-07, "loss": 0.7312901, "num_input_tokens_seen": 296565940, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.24304199, "step": 13749, "time_per_iteration": 2.7066643238067627 }, { "auxiliary_loss_clip": 0.01235561, "auxiliary_loss_mlp": 0.00232813, "balance_loss_clip": 1.0225836, "balance_loss_mlp": 0.20932877, "epoch": 0.8266947241845783, "flos": 24166768095360.0, "grad_norm": 8457.750649150728, "language_loss": 0.80598569, "learning_rate": 3.067810476598132e-07, "loss": 0.82066941, "num_input_tokens_seen": 296585090, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23461914, "step": 13750, "time_per_iteration": 2.661062240600586 }, { "auxiliary_loss_clip": 0.0126055, "auxiliary_loss_mlp": 0.00208175, "balance_loss_clip": 1.03888261, "balance_loss_mlp": 0.18259311, "epoch": 0.8267548474372464, "flos": 21105814803840.0, "grad_norm": 8.791103582183815, "language_loss": 0.74327219, "learning_rate": 3.065738025663496e-07, "loss": 0.75795943, "num_input_tokens_seen": 296604950, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25549316, "step": 13751, "time_per_iteration": 2.6904919147491455 }, { "auxiliary_loss_clip": 0.01230282, "auxiliary_loss_mlp": 0.00195705, "balance_loss_clip": 1.02074802, "balance_loss_mlp": 0.17287596, "epoch": 0.8268149706899143, "flos": 39968288668800.0, "grad_norm": 7.942913869885441, "language_loss": 0.68371159, "learning_rate": 3.0636662168777607e-07, "loss": 0.69797146, "num_input_tokens_seen": 296627780, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.22839355, "step": 13752, "time_per_iteration": 2.8289999961853027 }, { "auxiliary_loss_clip": 0.01134559, "auxiliary_loss_mlp": 0.00088488, "balance_loss_clip": 0.98859489, "balance_loss_mlp": 0.08047757, "epoch": 0.8268750939425823, "flos": 65782423244160.0, "grad_norm": 0.7630124809574861, "language_loss": 0.56689179, "learning_rate": 3.0615950503194986e-07, "loss": 0.57912225, "num_input_tokens_seen": 296683850, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.08007812, "step": 13753, "time_per_iteration": 3.1919877529144287 }, { "auxiliary_loss_clip": 0.0113349, "auxiliary_loss_mlp": 0.00077252, "balance_loss_clip": 0.98879313, "balance_loss_mlp": 0.07095728, "epoch": 0.8269352171952503, "flos": 52981455242880.0, "grad_norm": 0.6762874096634176, "language_loss": 0.54082799, "learning_rate": 3.0595245260672563e-07, "loss": 0.55293542, "num_input_tokens_seen": 296741420, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.06298828, "step": 13754, "time_per_iteration": 3.257965564727783 }, { "auxiliary_loss_clip": 0.01243352, "auxiliary_loss_mlp": 0.00219648, "balance_loss_clip": 1.02978277, "balance_loss_mlp": 0.1975587, "epoch": 0.8269953404479182, "flos": 23076125487360.0, "grad_norm": 3.581537670597949, "language_loss": 0.78144348, "learning_rate": 3.0574546441995354e-07, "loss": 0.7960735, "num_input_tokens_seen": 296759620, "router_z_loss_clip": 2.13574219, "router_z_loss_mlp": 0.2208252, "step": 13755, "time_per_iteration": 2.675516366958618 }, { "auxiliary_loss_clip": 0.01228141, "auxiliary_loss_mlp": 0.00209441, "balance_loss_clip": 1.01589537, "balance_loss_mlp": 0.18640991, "epoch": 0.8270554637005862, "flos": 14209996763520.0, "grad_norm": 5.466438829741084, "language_loss": 0.77077186, "learning_rate": 3.0553854047948324e-07, "loss": 0.78514767, "num_input_tokens_seen": 296777275, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.23034668, "step": 13756, "time_per_iteration": 2.6372978687286377 }, { "auxiliary_loss_clip": 0.01253849, "auxiliary_loss_mlp": 0.00226293, "balance_loss_clip": 1.03122413, "balance_loss_mlp": 0.200353, "epoch": 0.8271155869532542, "flos": 21762046327680.0, "grad_norm": 97.28484533715145, "language_loss": 0.83036524, "learning_rate": 3.053316807931623e-07, "loss": 0.84516662, "num_input_tokens_seen": 296796655, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25927734, "step": 13757, "time_per_iteration": 2.652130126953125 }, { "auxiliary_loss_clip": 0.01263736, "auxiliary_loss_mlp": 0.00205901, "balance_loss_clip": 1.04183459, "balance_loss_mlp": 0.18067625, "epoch": 0.8271757102059222, "flos": 15120475729920.0, "grad_norm": 136.89634758473167, "language_loss": 0.75142902, "learning_rate": 3.0512488536883283e-07, "loss": 0.76612532, "num_input_tokens_seen": 296813705, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25219727, "step": 13758, "time_per_iteration": 2.63655161857605 }, { "auxiliary_loss_clip": 0.01217864, "auxiliary_loss_mlp": 0.00204591, "balance_loss_clip": 1.01014304, "balance_loss_mlp": 0.18271625, "epoch": 0.8272358334585901, "flos": 24133730561280.0, "grad_norm": 6.956377693169685, "language_loss": 0.75452864, "learning_rate": 3.0491815421433775e-07, "loss": 0.76875317, "num_input_tokens_seen": 296833985, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.21875, "step": 13759, "time_per_iteration": 2.6333537101745605 }, { "auxiliary_loss_clip": 0.01246996, "auxiliary_loss_mlp": 0.00205426, "balance_loss_clip": 1.02925563, "balance_loss_mlp": 0.18064271, "epoch": 0.8272959567112581, "flos": 18990712396800.0, "grad_norm": 4.66962923167097, "language_loss": 0.75595659, "learning_rate": 3.047114873375161e-07, "loss": 0.77048081, "num_input_tokens_seen": 296850150, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24768066, "step": 13760, "time_per_iteration": 2.639218330383301 }, { "auxiliary_loss_clip": 0.01242132, "auxiliary_loss_mlp": 0.00200869, "balance_loss_clip": 1.02916789, "balance_loss_mlp": 0.17863688, "epoch": 0.827356079963926, "flos": 20631614428800.0, "grad_norm": 37.267361356554645, "language_loss": 0.86831021, "learning_rate": 3.0450488474620505e-07, "loss": 0.88274026, "num_input_tokens_seen": 296869585, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.22229004, "step": 13761, "time_per_iteration": 2.664045572280884 }, { "auxiliary_loss_clip": 0.01228514, "auxiliary_loss_mlp": 0.00214966, "balance_loss_clip": 1.01805639, "balance_loss_mlp": 0.19100484, "epoch": 0.827416203216594, "flos": 22416625825920.0, "grad_norm": 5.238561914498069, "language_loss": 0.77642775, "learning_rate": 3.042983464482387e-07, "loss": 0.79086256, "num_input_tokens_seen": 296887710, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.23950195, "step": 13762, "time_per_iteration": 2.6790876388549805 }, { "auxiliary_loss_clip": 0.0122878, "auxiliary_loss_mlp": 0.00206595, "balance_loss_clip": 1.01617765, "balance_loss_mlp": 0.18430313, "epoch": 0.827476326469262, "flos": 19026192055680.0, "grad_norm": 14.024822236878371, "language_loss": 0.78175646, "learning_rate": 3.0409187245144853e-07, "loss": 0.79611015, "num_input_tokens_seen": 296906265, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.22290039, "step": 13763, "time_per_iteration": 2.63822078704834 }, { "auxiliary_loss_clip": 0.01136945, "auxiliary_loss_mlp": 0.0008619, "balance_loss_clip": 0.98955786, "balance_loss_mlp": 0.07822638, "epoch": 0.82753644972193, "flos": 68500575089280.0, "grad_norm": 0.8200110864949555, "language_loss": 0.64361113, "learning_rate": 3.038854627636651e-07, "loss": 0.65584248, "num_input_tokens_seen": 296971290, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.07958984, "step": 13764, "time_per_iteration": 3.2747726440429688 }, { "auxiliary_loss_clip": 0.01251143, "auxiliary_loss_mlp": 0.00215161, "balance_loss_clip": 1.03639734, "balance_loss_mlp": 0.19191527, "epoch": 0.8275965729745979, "flos": 18405404277120.0, "grad_norm": 89.32998378167966, "language_loss": 0.88119102, "learning_rate": 3.0367911739271423e-07, "loss": 0.89585406, "num_input_tokens_seen": 296989060, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.2322998, "step": 13765, "time_per_iteration": 2.670809030532837 }, { "auxiliary_loss_clip": 0.01260283, "auxiliary_loss_mlp": 0.00243591, "balance_loss_clip": 1.03580236, "balance_loss_mlp": 0.21722168, "epoch": 0.8276566962272659, "flos": 28512067063680.0, "grad_norm": 9.07144145268564, "language_loss": 0.7236082, "learning_rate": 3.034728363464214e-07, "loss": 0.73864692, "num_input_tokens_seen": 297011300, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26391602, "step": 13766, "time_per_iteration": 2.6884818077087402 }, { "auxiliary_loss_clip": 0.01251381, "auxiliary_loss_mlp": 0.00228193, "balance_loss_clip": 1.0340941, "balance_loss_mlp": 0.20417231, "epoch": 0.8277168194799339, "flos": 20230240878720.0, "grad_norm": 3.7253424323374036, "language_loss": 0.91214973, "learning_rate": 3.03266619632609e-07, "loss": 0.92694545, "num_input_tokens_seen": 297030350, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.23999023, "step": 13767, "time_per_iteration": 2.645993232727051 }, { "auxiliary_loss_clip": 0.01260142, "auxiliary_loss_mlp": 0.0024018, "balance_loss_clip": 1.04393899, "balance_loss_mlp": 0.21582527, "epoch": 0.8277769427326018, "flos": 28476623318400.0, "grad_norm": 3.6185304549008523, "language_loss": 0.75436783, "learning_rate": 3.030604672590964e-07, "loss": 0.76937103, "num_input_tokens_seen": 297049710, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24365234, "step": 13768, "time_per_iteration": 2.701990842819214 }, { "auxiliary_loss_clip": 0.0123752, "auxiliary_loss_mlp": 0.00219366, "balance_loss_clip": 1.02300525, "balance_loss_mlp": 0.19614369, "epoch": 0.8278370659852698, "flos": 27197628768000.0, "grad_norm": 32.86982186399373, "language_loss": 0.82507348, "learning_rate": 3.028543792337006e-07, "loss": 0.83964229, "num_input_tokens_seen": 297070510, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.23205566, "step": 13769, "time_per_iteration": 2.7151389122009277 }, { "auxiliary_loss_clip": 0.01252721, "auxiliary_loss_mlp": 0.00224346, "balance_loss_clip": 1.03177619, "balance_loss_mlp": 0.19990838, "epoch": 0.8278971892379378, "flos": 37816126404480.0, "grad_norm": 475.8146310245161, "language_loss": 0.81802207, "learning_rate": 3.0264835556423675e-07, "loss": 0.83279276, "num_input_tokens_seen": 297092585, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24450684, "step": 13770, "time_per_iteration": 2.780045747756958 }, { "auxiliary_loss_clip": 0.01257034, "auxiliary_loss_mlp": 0.00230311, "balance_loss_clip": 1.03644466, "balance_loss_mlp": 0.20489573, "epoch": 0.8279573124906058, "flos": 22560160573440.0, "grad_norm": 6.622580544150541, "language_loss": 0.82319808, "learning_rate": 3.0244239625851785e-07, "loss": 0.83807153, "num_input_tokens_seen": 297110055, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25415039, "step": 13771, "time_per_iteration": 2.6750035285949707 }, { "auxiliary_loss_clip": 0.01255859, "auxiliary_loss_mlp": 0.00233195, "balance_loss_clip": 1.03467369, "balance_loss_mlp": 0.20895989, "epoch": 0.8280174357432737, "flos": 36064619418240.0, "grad_norm": 12.371922511913198, "language_loss": 0.78636062, "learning_rate": 3.0223650132435284e-07, "loss": 0.80125123, "num_input_tokens_seen": 297132170, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24230957, "step": 13772, "time_per_iteration": 2.7810404300689697 }, { "auxiliary_loss_clip": 0.01240538, "auxiliary_loss_mlp": 0.00212527, "balance_loss_clip": 1.02186298, "balance_loss_mlp": 0.18652776, "epoch": 0.8280775589959417, "flos": 22961067246720.0, "grad_norm": 38.98230412585889, "language_loss": 0.84209853, "learning_rate": 3.0203067076955035e-07, "loss": 0.85662913, "num_input_tokens_seen": 297149515, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.26000977, "step": 13773, "time_per_iteration": 2.6616311073303223 }, { "auxiliary_loss_clip": 0.01241788, "auxiliary_loss_mlp": 0.00215048, "balance_loss_clip": 1.02787054, "balance_loss_mlp": 0.19021642, "epoch": 0.8281376822486096, "flos": 26063282286720.0, "grad_norm": 26.76171681635544, "language_loss": 0.81892121, "learning_rate": 3.01824904601915e-07, "loss": 0.8334896, "num_input_tokens_seen": 297170320, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24865723, "step": 13774, "time_per_iteration": 2.7025716304779053 }, { "auxiliary_loss_clip": 0.01283138, "auxiliary_loss_mlp": 0.00239619, "balance_loss_clip": 1.05093694, "balance_loss_mlp": 0.2148833, "epoch": 0.8281978055012776, "flos": 20667776446080.0, "grad_norm": 212.2076913752154, "language_loss": 0.80505908, "learning_rate": 3.01619202829249e-07, "loss": 0.82028663, "num_input_tokens_seen": 297189935, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.24743652, "step": 13775, "time_per_iteration": 4.053435564041138 }, { "auxiliary_loss_clip": 0.01264522, "auxiliary_loss_mlp": 0.00232782, "balance_loss_clip": 1.04104233, "balance_loss_mlp": 0.20656797, "epoch": 0.8282579287539455, "flos": 29315281040640.0, "grad_norm": 188.62565141071192, "language_loss": 0.82478911, "learning_rate": 3.01413565459353e-07, "loss": 0.83976215, "num_input_tokens_seen": 297210885, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26220703, "step": 13776, "time_per_iteration": 2.7121739387512207 }, { "auxiliary_loss_clip": 0.01256664, "auxiliary_loss_mlp": 0.00248622, "balance_loss_clip": 1.03604829, "balance_loss_mlp": 0.22187141, "epoch": 0.8283180520066136, "flos": 15706178899200.0, "grad_norm": 4.625787613198145, "language_loss": 0.86910814, "learning_rate": 3.0120799250002483e-07, "loss": 0.884161, "num_input_tokens_seen": 297228500, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.26733398, "step": 13777, "time_per_iteration": 4.086107015609741 }, { "auxiliary_loss_clip": 0.01259472, "auxiliary_loss_mlp": 0.0021859, "balance_loss_clip": 1.04062665, "balance_loss_mlp": 0.19471279, "epoch": 0.8283781752592815, "flos": 24791470456320.0, "grad_norm": 4.9575915670400015, "language_loss": 0.90455806, "learning_rate": 3.010024839590604e-07, "loss": 0.9193387, "num_input_tokens_seen": 297249470, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.2388916, "step": 13778, "time_per_iteration": 2.6484200954437256 }, { "auxiliary_loss_clip": 0.01236732, "auxiliary_loss_mlp": 0.00224769, "balance_loss_clip": 1.02502775, "balance_loss_mlp": 0.20078456, "epoch": 0.8284382985119495, "flos": 18982811404800.0, "grad_norm": 11.578791714006718, "language_loss": 0.82244647, "learning_rate": 3.0079703984425187e-07, "loss": 0.83706146, "num_input_tokens_seen": 297265970, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.23999023, "step": 13779, "time_per_iteration": 2.7026782035827637 }, { "auxiliary_loss_clip": 0.01143771, "auxiliary_loss_mlp": 0.00085905, "balance_loss_clip": 0.99478519, "balance_loss_mlp": 0.07884748, "epoch": 0.8284984217646175, "flos": 61034460814080.0, "grad_norm": 0.9798142161237802, "language_loss": 0.55828029, "learning_rate": 3.0059166016338954e-07, "loss": 0.57057703, "num_input_tokens_seen": 297325525, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.07080078, "step": 13780, "time_per_iteration": 3.1780967712402344 }, { "auxiliary_loss_clip": 0.01246513, "auxiliary_loss_mlp": 0.00228055, "balance_loss_clip": 1.02917981, "balance_loss_mlp": 0.20485698, "epoch": 0.8285585450172854, "flos": 19714635100800.0, "grad_norm": 323.5546785607675, "language_loss": 0.87015879, "learning_rate": 3.0038634492426205e-07, "loss": 0.8849045, "num_input_tokens_seen": 297345025, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.23205566, "step": 13781, "time_per_iteration": 4.179501056671143 }, { "auxiliary_loss_clip": 0.01255693, "auxiliary_loss_mlp": 0.0023605, "balance_loss_clip": 1.03313041, "balance_loss_mlp": 0.2096334, "epoch": 0.8286186682699535, "flos": 21688896280320.0, "grad_norm": 94.09848572989968, "language_loss": 0.85934108, "learning_rate": 3.001810941346543e-07, "loss": 0.87425852, "num_input_tokens_seen": 297363570, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.26416016, "step": 13782, "time_per_iteration": 2.6848580837249756 }, { "auxiliary_loss_clip": 0.01233149, "auxiliary_loss_mlp": 0.00222206, "balance_loss_clip": 1.01688385, "balance_loss_mlp": 0.19751783, "epoch": 0.8286787915226214, "flos": 25775566346880.0, "grad_norm": 442.2845867364426, "language_loss": 0.83576179, "learning_rate": 2.9997590780234983e-07, "loss": 0.85031533, "num_input_tokens_seen": 297385385, "router_z_loss_clip": 2.16113281, "router_z_loss_mlp": 0.24719238, "step": 13783, "time_per_iteration": 2.6668589115142822 }, { "auxiliary_loss_clip": 0.01257395, "auxiliary_loss_mlp": 0.00232133, "balance_loss_clip": 1.03773832, "balance_loss_mlp": 0.20822006, "epoch": 0.8287389147752894, "flos": 21288348743040.0, "grad_norm": 13.899776357603388, "language_loss": 0.81127763, "learning_rate": 2.997707859351304e-07, "loss": 0.82617289, "num_input_tokens_seen": 297403950, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.23901367, "step": 13784, "time_per_iteration": 2.7319283485412598 }, { "auxiliary_loss_clip": 0.01244266, "auxiliary_loss_mlp": 0.00222873, "balance_loss_clip": 1.02401578, "balance_loss_mlp": 0.19705245, "epoch": 0.8287990380279573, "flos": 33544875323520.0, "grad_norm": 16.89689404276723, "language_loss": 0.78871685, "learning_rate": 2.99565728540772e-07, "loss": 0.80338824, "num_input_tokens_seen": 297424565, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.25830078, "step": 13785, "time_per_iteration": 2.7777044773101807 }, { "auxiliary_loss_clip": 0.01259473, "auxiliary_loss_mlp": 0.00218284, "balance_loss_clip": 1.03781331, "balance_loss_mlp": 0.19345284, "epoch": 0.8288591612806253, "flos": 22966346545920.0, "grad_norm": 17.233553605888478, "language_loss": 0.76175666, "learning_rate": 2.993607356270516e-07, "loss": 0.7765342, "num_input_tokens_seen": 297445180, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.24804688, "step": 13786, "time_per_iteration": 4.111645936965942 }, { "auxiliary_loss_clip": 0.01284486, "auxiliary_loss_mlp": 0.00246875, "balance_loss_clip": 1.05260921, "balance_loss_mlp": 0.22013602, "epoch": 0.8289192845332932, "flos": 18588979710720.0, "grad_norm": 4.60534145889195, "language_loss": 0.85015708, "learning_rate": 2.991558072017426e-07, "loss": 0.86547065, "num_input_tokens_seen": 297463790, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.26745605, "step": 13787, "time_per_iteration": 2.651737928390503 }, { "auxiliary_loss_clip": 0.01247581, "auxiliary_loss_mlp": 0.00248749, "balance_loss_clip": 1.03205323, "balance_loss_mlp": 0.22458486, "epoch": 0.8289794077859612, "flos": 15450423085440.0, "grad_norm": 94.98577729245056, "language_loss": 0.88310397, "learning_rate": 2.989509432726163e-07, "loss": 0.8980673, "num_input_tokens_seen": 297480100, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.24169922, "step": 13788, "time_per_iteration": 2.5857040882110596 }, { "auxiliary_loss_clip": 0.01228949, "auxiliary_loss_mlp": 0.00244201, "balance_loss_clip": 1.01715589, "balance_loss_mlp": 0.21847609, "epoch": 0.8290395310386292, "flos": 28877853214080.0, "grad_norm": 7.475433475822789, "language_loss": 0.77819824, "learning_rate": 2.9874614384744014e-07, "loss": 0.79292977, "num_input_tokens_seen": 297499890, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.25732422, "step": 13789, "time_per_iteration": 2.7031240463256836 }, { "auxiliary_loss_clip": 0.01237254, "auxiliary_loss_mlp": 0.00223346, "balance_loss_clip": 1.02214837, "balance_loss_mlp": 0.19915839, "epoch": 0.8290996542912972, "flos": 36576274700160.0, "grad_norm": 70.60422017996845, "language_loss": 0.74819934, "learning_rate": 2.985414089339813e-07, "loss": 0.7628054, "num_input_tokens_seen": 297521440, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24194336, "step": 13790, "time_per_iteration": 2.8739311695098877 }, { "auxiliary_loss_clip": 0.01245451, "auxiliary_loss_mlp": 0.00235216, "balance_loss_clip": 1.02744913, "balance_loss_mlp": 0.2095736, "epoch": 0.8291597775439651, "flos": 23623009032960.0, "grad_norm": 9.795337496227742, "language_loss": 0.83472574, "learning_rate": 2.9833673854000265e-07, "loss": 0.84953243, "num_input_tokens_seen": 297539920, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.25622559, "step": 13791, "time_per_iteration": 2.6970527172088623 }, { "auxiliary_loss_clip": 0.01231132, "auxiliary_loss_mlp": 0.00224889, "balance_loss_clip": 1.02288365, "balance_loss_mlp": 0.20271645, "epoch": 0.8292199007966331, "flos": 21397481239680.0, "grad_norm": 12.586594012563165, "language_loss": 0.75925064, "learning_rate": 2.981321326732651e-07, "loss": 0.77381086, "num_input_tokens_seen": 297560000, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.22167969, "step": 13792, "time_per_iteration": 2.7274858951568604 }, { "auxiliary_loss_clip": 0.01246715, "auxiliary_loss_mlp": 0.00236668, "balance_loss_clip": 1.02426505, "balance_loss_mlp": 0.21127626, "epoch": 0.829280024049301, "flos": 28767607395840.0, "grad_norm": 97.49957148964462, "language_loss": 0.7233448, "learning_rate": 2.9792759134152736e-07, "loss": 0.73817873, "num_input_tokens_seen": 297579300, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25427246, "step": 13793, "time_per_iteration": 2.8064236640930176 }, { "auxiliary_loss_clip": 0.01240316, "auxiliary_loss_mlp": 0.00240087, "balance_loss_clip": 1.0207932, "balance_loss_mlp": 0.21490952, "epoch": 0.829340147301969, "flos": 19938071652480.0, "grad_norm": 16.533549050206364, "language_loss": 0.75018191, "learning_rate": 2.977231145525461e-07, "loss": 0.76498592, "num_input_tokens_seen": 297598095, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25195312, "step": 13794, "time_per_iteration": 2.648599863052368 }, { "auxiliary_loss_clip": 0.01252489, "auxiliary_loss_mlp": 0.00231258, "balance_loss_clip": 1.03053331, "balance_loss_mlp": 0.20393501, "epoch": 0.829400270554637, "flos": 25228575060480.0, "grad_norm": 184.98972570930403, "language_loss": 0.76529086, "learning_rate": 2.975187023140757e-07, "loss": 0.78012836, "num_input_tokens_seen": 297615955, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.27319336, "step": 13795, "time_per_iteration": 2.6704583168029785 }, { "auxiliary_loss_clip": 0.0123443, "auxiliary_loss_mlp": 0.00220766, "balance_loss_clip": 1.02103209, "balance_loss_mlp": 0.19793773, "epoch": 0.829460393807305, "flos": 24463570176000.0, "grad_norm": 18.064916733766907, "language_loss": 0.72891682, "learning_rate": 2.973143546338661e-07, "loss": 0.74346876, "num_input_tokens_seen": 297636285, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.22802734, "step": 13796, "time_per_iteration": 2.7402515411376953 }, { "auxiliary_loss_clip": 0.01231135, "auxiliary_loss_mlp": 0.00217583, "balance_loss_clip": 1.01510262, "balance_loss_mlp": 0.19385993, "epoch": 0.829520517059973, "flos": 15122486891520.0, "grad_norm": 7.301935890668897, "language_loss": 0.78551757, "learning_rate": 2.971100715196666e-07, "loss": 0.80000478, "num_input_tokens_seen": 297653315, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.23693848, "step": 13797, "time_per_iteration": 2.6100783348083496 }, { "auxiliary_loss_clip": 0.01254978, "auxiliary_loss_mlp": 0.00212997, "balance_loss_clip": 1.03668964, "balance_loss_mlp": 0.19076477, "epoch": 0.8295806403126409, "flos": 21579979265280.0, "grad_norm": 897.2442972134277, "language_loss": 0.83165491, "learning_rate": 2.969058529792243e-07, "loss": 0.84633464, "num_input_tokens_seen": 297673480, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.22253418, "step": 13798, "time_per_iteration": 2.6724605560302734 }, { "auxiliary_loss_clip": 0.01221082, "auxiliary_loss_mlp": 0.00210501, "balance_loss_clip": 1.01250827, "balance_loss_mlp": 0.18866198, "epoch": 0.8296407635653089, "flos": 21726566668800.0, "grad_norm": 2.9183843256278994, "language_loss": 0.82121563, "learning_rate": 2.967016990202822e-07, "loss": 0.83553147, "num_input_tokens_seen": 297693250, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.21838379, "step": 13799, "time_per_iteration": 2.6565942764282227 }, { "auxiliary_loss_clip": 0.01238572, "auxiliary_loss_mlp": 0.00238203, "balance_loss_clip": 1.02502453, "balance_loss_mlp": 0.21487333, "epoch": 0.8297008868179768, "flos": 11181147252480.0, "grad_norm": 97.39358959105127, "language_loss": 0.77144474, "learning_rate": 2.9649760965058245e-07, "loss": 0.78621244, "num_input_tokens_seen": 297710975, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.2331543, "step": 13800, "time_per_iteration": 2.6554486751556396 }, { "auxiliary_loss_clip": 0.0125845, "auxiliary_loss_mlp": 0.00233744, "balance_loss_clip": 1.03791928, "balance_loss_mlp": 0.21035497, "epoch": 0.8297610100706448, "flos": 20664041431680.0, "grad_norm": 125.94898943565737, "language_loss": 0.85036457, "learning_rate": 2.9629358487786515e-07, "loss": 0.86528647, "num_input_tokens_seen": 297730860, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.23388672, "step": 13801, "time_per_iteration": 2.6669509410858154 }, { "auxiliary_loss_clip": 0.01237897, "auxiliary_loss_mlp": 0.00230582, "balance_loss_clip": 1.0238452, "balance_loss_mlp": 0.20885043, "epoch": 0.8298211333233128, "flos": 20376325491840.0, "grad_norm": 312.94258918611513, "language_loss": 0.81359339, "learning_rate": 2.9608962470986476e-07, "loss": 0.82827818, "num_input_tokens_seen": 297749765, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.2175293, "step": 13802, "time_per_iteration": 2.7154736518859863 }, { "auxiliary_loss_clip": 0.01243505, "auxiliary_loss_mlp": 0.00248267, "balance_loss_clip": 1.03043866, "balance_loss_mlp": 0.22412759, "epoch": 0.8298812565759808, "flos": 21508696725120.0, "grad_norm": 21.000970331903588, "language_loss": 0.80955434, "learning_rate": 2.9588572915431644e-07, "loss": 0.82447207, "num_input_tokens_seen": 297770380, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.24133301, "step": 13803, "time_per_iteration": 2.646883249282837 }, { "auxiliary_loss_clip": 0.01239857, "auxiliary_loss_mlp": 0.00228813, "balance_loss_clip": 1.02238083, "balance_loss_mlp": 0.20492311, "epoch": 0.8299413798286487, "flos": 22818681734400.0, "grad_norm": 6.107876303975471, "language_loss": 0.83322006, "learning_rate": 2.9568189821895215e-07, "loss": 0.84790677, "num_input_tokens_seen": 297789440, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.23876953, "step": 13804, "time_per_iteration": 2.6580870151519775 }, { "auxiliary_loss_clip": 0.01252406, "auxiliary_loss_mlp": 0.00236474, "balance_loss_clip": 1.03563797, "balance_loss_mlp": 0.21240589, "epoch": 0.8300015030813167, "flos": 29679199683840.0, "grad_norm": 4035.036957262985, "language_loss": 0.81072009, "learning_rate": 2.954781319115016e-07, "loss": 0.82560891, "num_input_tokens_seen": 297810425, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24084473, "step": 13805, "time_per_iteration": 2.7219719886779785 }, { "auxiliary_loss_clip": 0.01250442, "auxiliary_loss_mlp": 0.0023703, "balance_loss_clip": 1.02985334, "balance_loss_mlp": 0.21181759, "epoch": 0.8300616263339846, "flos": 19719483436800.0, "grad_norm": 9.063772738240289, "language_loss": 0.85948133, "learning_rate": 2.952744302396906e-07, "loss": 0.87435603, "num_input_tokens_seen": 297827680, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25195312, "step": 13806, "time_per_iteration": 2.678441286087036 }, { "auxiliary_loss_clip": 0.012555, "auxiliary_loss_mlp": 0.00235658, "balance_loss_clip": 1.03208649, "balance_loss_mlp": 0.20964643, "epoch": 0.8301217495866526, "flos": 19901945548800.0, "grad_norm": 64.40860713094932, "language_loss": 0.69716597, "learning_rate": 2.950707932112444e-07, "loss": 0.7120775, "num_input_tokens_seen": 297848005, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26025391, "step": 13807, "time_per_iteration": 2.6556942462921143 }, { "auxiliary_loss_clip": 0.01260434, "auxiliary_loss_mlp": 0.00217766, "balance_loss_clip": 1.04056144, "balance_loss_mlp": 0.19307749, "epoch": 0.8301818728393207, "flos": 19715784336000.0, "grad_norm": 144.61951363751692, "language_loss": 0.82023251, "learning_rate": 2.948672208338847e-07, "loss": 0.83501446, "num_input_tokens_seen": 297866730, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24707031, "step": 13808, "time_per_iteration": 2.662076234817505 }, { "auxiliary_loss_clip": 0.01258321, "auxiliary_loss_mlp": 0.00253403, "balance_loss_clip": 1.03631282, "balance_loss_mlp": 0.22739121, "epoch": 0.8302419960919886, "flos": 28293658416000.0, "grad_norm": 19.076714820311846, "language_loss": 0.76192862, "learning_rate": 2.9466371311533046e-07, "loss": 0.77704585, "num_input_tokens_seen": 297886390, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26013184, "step": 13809, "time_per_iteration": 2.715114116668701 }, { "auxiliary_loss_clip": 0.01247243, "auxiliary_loss_mlp": 0.00204683, "balance_loss_clip": 1.03241348, "balance_loss_mlp": 0.18273634, "epoch": 0.8303021193446566, "flos": 18223444955520.0, "grad_norm": 46.186096444210534, "language_loss": 0.83046895, "learning_rate": 2.9446027006329896e-07, "loss": 0.84498823, "num_input_tokens_seen": 297905110, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.21948242, "step": 13810, "time_per_iteration": 2.6643331050872803 }, { "auxiliary_loss_clip": 0.01237264, "auxiliary_loss_mlp": 0.00211861, "balance_loss_clip": 1.02535307, "balance_loss_mlp": 0.1893547, "epoch": 0.8303622425973245, "flos": 23111425578240.0, "grad_norm": 6.199573626404562, "language_loss": 0.89054924, "learning_rate": 2.94256891685505e-07, "loss": 0.9050405, "num_input_tokens_seen": 297925460, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.22521973, "step": 13811, "time_per_iteration": 2.71201753616333 }, { "auxiliary_loss_clip": 0.012571, "auxiliary_loss_mlp": 0.00228161, "balance_loss_clip": 1.03825879, "balance_loss_mlp": 0.20447424, "epoch": 0.8304223658499925, "flos": 19572860119680.0, "grad_norm": 10.428280280625035, "language_loss": 0.81417882, "learning_rate": 2.9405357798966156e-07, "loss": 0.82903147, "num_input_tokens_seen": 297941760, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.23669434, "step": 13812, "time_per_iteration": 2.6210925579071045 }, { "auxiliary_loss_clip": 0.01229768, "auxiliary_loss_mlp": 0.0021238, "balance_loss_clip": 1.02218461, "balance_loss_mlp": 0.18988499, "epoch": 0.8304824891026604, "flos": 24426115269120.0, "grad_norm": 257.95596626486974, "language_loss": 0.84575832, "learning_rate": 2.9385032898347664e-07, "loss": 0.8601799, "num_input_tokens_seen": 297959745, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.22485352, "step": 13813, "time_per_iteration": 2.6970741748809814 }, { "auxiliary_loss_clip": 0.01265045, "auxiliary_loss_mlp": 0.00244689, "balance_loss_clip": 1.04274559, "balance_loss_mlp": 0.21863002, "epoch": 0.8305426123553284, "flos": 22381792611840.0, "grad_norm": 14.818326722933298, "language_loss": 0.79712784, "learning_rate": 2.93647144674658e-07, "loss": 0.81222522, "num_input_tokens_seen": 297977665, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.26049805, "step": 13814, "time_per_iteration": 2.6786389350891113 }, { "auxiliary_loss_clip": 0.01290516, "auxiliary_loss_mlp": 0.00246012, "balance_loss_clip": 1.0536474, "balance_loss_mlp": 0.21896353, "epoch": 0.8306027356079964, "flos": 14903575453440.0, "grad_norm": 888.0188581491986, "language_loss": 0.78344643, "learning_rate": 2.9344402507091116e-07, "loss": 0.79881167, "num_input_tokens_seen": 297993525, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.27062988, "step": 13815, "time_per_iteration": 2.6335439682006836 }, { "auxiliary_loss_clip": 0.01257862, "auxiliary_loss_mlp": 0.00211582, "balance_loss_clip": 1.03760135, "balance_loss_mlp": 0.18645266, "epoch": 0.8306628588606644, "flos": 19644573623040.0, "grad_norm": 20.944854037308055, "language_loss": 0.85415787, "learning_rate": 2.9324097017993745e-07, "loss": 0.86885226, "num_input_tokens_seen": 298012920, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.2512207, "step": 13816, "time_per_iteration": 2.617687702178955 }, { "auxiliary_loss_clip": 0.01240626, "auxiliary_loss_mlp": 0.0024419, "balance_loss_clip": 1.02802896, "balance_loss_mlp": 0.21997905, "epoch": 0.8307229821133323, "flos": 24389737770240.0, "grad_norm": 7.831650968266981, "language_loss": 0.88076079, "learning_rate": 2.930379800094371e-07, "loss": 0.8956089, "num_input_tokens_seen": 298033310, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.2421875, "step": 13817, "time_per_iteration": 4.0785300731658936 }, { "auxiliary_loss_clip": 0.01243724, "auxiliary_loss_mlp": 0.00245404, "balance_loss_clip": 1.02863467, "balance_loss_mlp": 0.22020333, "epoch": 0.8307831053660003, "flos": 20996933702400.0, "grad_norm": 3.7922020437824355, "language_loss": 0.84956449, "learning_rate": 2.9283505456710875e-07, "loss": 0.86445582, "num_input_tokens_seen": 298053530, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.25195312, "step": 13818, "time_per_iteration": 2.6947429180145264 }, { "auxiliary_loss_clip": 0.01264067, "auxiliary_loss_mlp": 0.00230884, "balance_loss_clip": 1.04279721, "balance_loss_mlp": 0.2056717, "epoch": 0.8308432286186682, "flos": 21397301671680.0, "grad_norm": 6.3221500605381165, "language_loss": 0.89299858, "learning_rate": 2.926321938606453e-07, "loss": 0.90794814, "num_input_tokens_seen": 298069305, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.2520752, "step": 13819, "time_per_iteration": 4.141093969345093 }, { "auxiliary_loss_clip": 0.01137863, "auxiliary_loss_mlp": 0.00079354, "balance_loss_clip": 0.9876911, "balance_loss_mlp": 0.07124736, "epoch": 0.8309033518713362, "flos": 62533656714240.0, "grad_norm": 0.8030089267201549, "language_loss": 0.55680835, "learning_rate": 2.924293978977399e-07, "loss": 0.56898057, "num_input_tokens_seen": 298125830, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.08105469, "step": 13820, "time_per_iteration": 3.1619043350219727 }, { "auxiliary_loss_clip": 0.01253437, "auxiliary_loss_mlp": 0.00225351, "balance_loss_clip": 1.03535271, "balance_loss_mlp": 0.20113996, "epoch": 0.8309634751240043, "flos": 16979104051200.0, "grad_norm": 10.749788927244932, "language_loss": 0.77753973, "learning_rate": 2.922266666860831e-07, "loss": 0.79232764, "num_input_tokens_seen": 298142320, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.2421875, "step": 13821, "time_per_iteration": 2.6801838874816895 }, { "auxiliary_loss_clip": 0.01261663, "auxiliary_loss_mlp": 0.00245768, "balance_loss_clip": 1.03595054, "balance_loss_mlp": 0.22096057, "epoch": 0.8310235983766722, "flos": 22674464628480.0, "grad_norm": 2.9733022639774545, "language_loss": 0.77193981, "learning_rate": 2.920240002333625e-07, "loss": 0.78701413, "num_input_tokens_seen": 298161845, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.24804688, "step": 13822, "time_per_iteration": 2.693302631378174 }, { "auxiliary_loss_clip": 0.01238213, "auxiliary_loss_mlp": 0.00245006, "balance_loss_clip": 1.02748787, "balance_loss_mlp": 0.22079468, "epoch": 0.8310837216293402, "flos": 30811463176320.0, "grad_norm": 98.96779557857764, "language_loss": 0.69136167, "learning_rate": 2.918213985472631e-07, "loss": 0.70619392, "num_input_tokens_seen": 298184165, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.2421875, "step": 13823, "time_per_iteration": 4.348609685897827 }, { "auxiliary_loss_clip": 0.01118151, "auxiliary_loss_mlp": 0.0005821, "balance_loss_clip": 0.96950424, "balance_loss_mlp": 0.04977007, "epoch": 0.8311438448820081, "flos": 71276074997760.0, "grad_norm": 0.9693701903317519, "language_loss": 0.60529602, "learning_rate": 2.916188616354669e-07, "loss": 0.61705959, "num_input_tokens_seen": 298251720, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.08447266, "step": 13824, "time_per_iteration": 3.2749969959259033 }, { "auxiliary_loss_clip": 0.01231812, "auxiliary_loss_mlp": 0.00248031, "balance_loss_clip": 1.02023482, "balance_loss_mlp": 0.2245353, "epoch": 0.8312039681346761, "flos": 20887082933760.0, "grad_norm": 6.116696708296442, "language_loss": 0.81350416, "learning_rate": 2.914163895056552e-07, "loss": 0.82830256, "num_input_tokens_seen": 298271910, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.23498535, "step": 13825, "time_per_iteration": 2.640084981918335 }, { "auxiliary_loss_clip": 0.01262407, "auxiliary_loss_mlp": 0.00244593, "balance_loss_clip": 1.04014015, "balance_loss_mlp": 0.21985707, "epoch": 0.831264091387344, "flos": 17017528625280.0, "grad_norm": 56.322355776331285, "language_loss": 0.8763839, "learning_rate": 2.9121398216550486e-07, "loss": 0.89145386, "num_input_tokens_seen": 298288105, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.24743652, "step": 13826, "time_per_iteration": 2.6370654106140137 }, { "auxiliary_loss_clip": 0.01254225, "auxiliary_loss_mlp": 0.00238169, "balance_loss_clip": 1.03845787, "balance_loss_mlp": 0.21270634, "epoch": 0.831324214640012, "flos": 24419578993920.0, "grad_norm": 13.043725121205929, "language_loss": 0.75174606, "learning_rate": 2.910116396226914e-07, "loss": 0.76666999, "num_input_tokens_seen": 298307600, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.2545166, "step": 13827, "time_per_iteration": 2.6431050300598145 }, { "auxiliary_loss_clip": 0.01247462, "auxiliary_loss_mlp": 0.00231504, "balance_loss_clip": 1.02937961, "balance_loss_mlp": 0.20645835, "epoch": 0.83138433789268, "flos": 13545576938880.0, "grad_norm": 9.226943275741856, "language_loss": 0.81125385, "learning_rate": 2.9080936188488834e-07, "loss": 0.82604349, "num_input_tokens_seen": 298323055, "router_z_loss_clip": 2.18261719, "router_z_loss_mlp": 0.25048828, "step": 13828, "time_per_iteration": 4.006492614746094 }, { "auxiliary_loss_clip": 0.012479, "auxiliary_loss_mlp": 0.00239689, "balance_loss_clip": 1.03409505, "balance_loss_mlp": 0.21520318, "epoch": 0.831444461145348, "flos": 44492386561920.0, "grad_norm": 41.296919260227604, "language_loss": 0.74980307, "learning_rate": 2.906071489597657e-07, "loss": 0.76467896, "num_input_tokens_seen": 298346950, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24511719, "step": 13829, "time_per_iteration": 2.8562862873077393 }, { "auxiliary_loss_clip": 0.01261573, "auxiliary_loss_mlp": 0.0023478, "balance_loss_clip": 1.04317141, "balance_loss_mlp": 0.20905453, "epoch": 0.8315045843980159, "flos": 22705024124160.0, "grad_norm": 72.56369163897446, "language_loss": 0.90519452, "learning_rate": 2.9040500085499054e-07, "loss": 0.92015803, "num_input_tokens_seen": 298366315, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25732422, "step": 13830, "time_per_iteration": 2.623713493347168 }, { "auxiliary_loss_clip": 0.01252222, "auxiliary_loss_mlp": 0.00233893, "balance_loss_clip": 1.03937364, "balance_loss_mlp": 0.21057522, "epoch": 0.8315647076506839, "flos": 16873491087360.0, "grad_norm": 163.1740020647691, "language_loss": 0.82869506, "learning_rate": 2.9020291757822925e-07, "loss": 0.84355617, "num_input_tokens_seen": 298385185, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.2331543, "step": 13831, "time_per_iteration": 2.6503617763519287 }, { "auxiliary_loss_clip": 0.0125491, "auxiliary_loss_mlp": 0.00236162, "balance_loss_clip": 1.03866494, "balance_loss_mlp": 0.21278527, "epoch": 0.8316248309033518, "flos": 13808730954240.0, "grad_norm": 4.712171340617204, "language_loss": 0.80023301, "learning_rate": 2.9000089913714523e-07, "loss": 0.8151437, "num_input_tokens_seen": 298402335, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.23400879, "step": 13832, "time_per_iteration": 2.5967254638671875 }, { "auxiliary_loss_clip": 0.01262648, "auxiliary_loss_mlp": 0.00239962, "balance_loss_clip": 1.04241788, "balance_loss_mlp": 0.2158581, "epoch": 0.8316849541560198, "flos": 23512511819520.0, "grad_norm": 24.672247831249784, "language_loss": 0.91111326, "learning_rate": 2.897989455393979e-07, "loss": 0.92613935, "num_input_tokens_seen": 298423370, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.24084473, "step": 13833, "time_per_iteration": 2.6808390617370605 }, { "auxiliary_loss_clip": 0.01262378, "auxiliary_loss_mlp": 0.00229977, "balance_loss_clip": 1.04490662, "balance_loss_mlp": 0.20611136, "epoch": 0.8317450774086879, "flos": 23771356202880.0, "grad_norm": 296.2131734149485, "language_loss": 0.82004309, "learning_rate": 2.8959705679264625e-07, "loss": 0.83496666, "num_input_tokens_seen": 298444835, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.23864746, "step": 13834, "time_per_iteration": 2.7046937942504883 }, { "auxiliary_loss_clip": 0.01260389, "auxiliary_loss_mlp": 0.00227554, "balance_loss_clip": 1.04146039, "balance_loss_mlp": 0.2034376, "epoch": 0.8318052006613558, "flos": 16215535710720.0, "grad_norm": 23.7430650338012, "language_loss": 0.87938696, "learning_rate": 2.893952329045459e-07, "loss": 0.89426637, "num_input_tokens_seen": 298461845, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24133301, "step": 13835, "time_per_iteration": 2.716485023498535 }, { "auxiliary_loss_clip": 0.01258448, "auxiliary_loss_mlp": 0.00223151, "balance_loss_clip": 1.0358125, "balance_loss_mlp": 0.19699669, "epoch": 0.8318653239140238, "flos": 19974556892160.0, "grad_norm": 24.017322734328975, "language_loss": 0.89827585, "learning_rate": 2.8919347388274905e-07, "loss": 0.91309184, "num_input_tokens_seen": 298479095, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.26171875, "step": 13836, "time_per_iteration": 2.7292585372924805 }, { "auxiliary_loss_clip": 0.01244848, "auxiliary_loss_mlp": 0.00222658, "balance_loss_clip": 1.03304005, "balance_loss_mlp": 0.19848201, "epoch": 0.8319254471666917, "flos": 17704714694400.0, "grad_norm": 5.28082408937824, "language_loss": 0.86214459, "learning_rate": 2.8899177973490727e-07, "loss": 0.87681961, "num_input_tokens_seen": 298494475, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.24182129, "step": 13837, "time_per_iteration": 2.6462252140045166 }, { "auxiliary_loss_clip": 0.01260141, "auxiliary_loss_mlp": 0.00250577, "balance_loss_clip": 1.0381186, "balance_loss_mlp": 0.22539988, "epoch": 0.8319855704193597, "flos": 19536554448000.0, "grad_norm": 11.090970803105646, "language_loss": 0.9020195, "learning_rate": 2.887901504686685e-07, "loss": 0.91712666, "num_input_tokens_seen": 298513185, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25170898, "step": 13838, "time_per_iteration": 2.646807909011841 }, { "auxiliary_loss_clip": 0.01263165, "auxiliary_loss_mlp": 0.00244577, "balance_loss_clip": 1.04890537, "balance_loss_mlp": 0.22041366, "epoch": 0.8320456936720276, "flos": 21178067011200.0, "grad_norm": 6.811739518405201, "language_loss": 0.81996512, "learning_rate": 2.885885860916795e-07, "loss": 0.8350426, "num_input_tokens_seen": 298531885, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24169922, "step": 13839, "time_per_iteration": 2.77290940284729 }, { "auxiliary_loss_clip": 0.01257815, "auxiliary_loss_mlp": 0.00246635, "balance_loss_clip": 1.03885841, "balance_loss_mlp": 0.22187537, "epoch": 0.8321058169246957, "flos": 33250874503680.0, "grad_norm": 16.771998866196643, "language_loss": 0.74395764, "learning_rate": 2.8838708661158253e-07, "loss": 0.75900221, "num_input_tokens_seen": 298554905, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24731445, "step": 13840, "time_per_iteration": 2.839007616043091 }, { "auxiliary_loss_clip": 0.01259811, "auxiliary_loss_mlp": 0.00237555, "balance_loss_clip": 1.03717017, "balance_loss_mlp": 0.21173392, "epoch": 0.8321659401773636, "flos": 14208129256320.0, "grad_norm": 10.63976154055097, "language_loss": 0.85496867, "learning_rate": 2.8818565203601843e-07, "loss": 0.86994231, "num_input_tokens_seen": 298571185, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25854492, "step": 13841, "time_per_iteration": 2.6224141120910645 }, { "auxiliary_loss_clip": 0.01260438, "auxiliary_loss_mlp": 0.0023773, "balance_loss_clip": 1.04176474, "balance_loss_mlp": 0.21170625, "epoch": 0.8322260634300316, "flos": 15158253859200.0, "grad_norm": 6.565811622883925, "language_loss": 0.77105165, "learning_rate": 2.879842823726262e-07, "loss": 0.78603333, "num_input_tokens_seen": 298588505, "router_z_loss_clip": 2.18847656, "router_z_loss_mlp": 0.26037598, "step": 13842, "time_per_iteration": 2.6358580589294434 }, { "auxiliary_loss_clip": 0.0124941, "auxiliary_loss_mlp": 0.00220551, "balance_loss_clip": 1.0357275, "balance_loss_mlp": 0.19554067, "epoch": 0.8322861866826995, "flos": 25300827267840.0, "grad_norm": 19.97146697020617, "language_loss": 0.78260374, "learning_rate": 2.8778297762904124e-07, "loss": 0.79730332, "num_input_tokens_seen": 298609295, "router_z_loss_clip": 2.13574219, "router_z_loss_mlp": 0.25024414, "step": 13843, "time_per_iteration": 2.6698038578033447 }, { "auxiliary_loss_clip": 0.01241542, "auxiliary_loss_mlp": 0.00221936, "balance_loss_clip": 1.03088486, "balance_loss_mlp": 0.1994893, "epoch": 0.8323463099353675, "flos": 17019360218880.0, "grad_norm": 13.71192230217096, "language_loss": 0.85910898, "learning_rate": 2.875817378128975e-07, "loss": 0.87374377, "num_input_tokens_seen": 298625765, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.22436523, "step": 13844, "time_per_iteration": 2.6656699180603027 }, { "auxiliary_loss_clip": 0.01141178, "auxiliary_loss_mlp": 0.00106275, "balance_loss_clip": 0.99043518, "balance_loss_mlp": 0.09840752, "epoch": 0.8324064331880354, "flos": 55607889709440.0, "grad_norm": 0.7651262316004078, "language_loss": 0.5482384, "learning_rate": 2.8738056293182624e-07, "loss": 0.56071293, "num_input_tokens_seen": 298683005, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.07861328, "step": 13845, "time_per_iteration": 3.0707898139953613 }, { "auxiliary_loss_clip": 0.01261067, "auxiliary_loss_mlp": 0.00232365, "balance_loss_clip": 1.04176593, "balance_loss_mlp": 0.20841566, "epoch": 0.8324665564407034, "flos": 26138623063680.0, "grad_norm": 7.793613726941194, "language_loss": 0.82155013, "learning_rate": 2.871794529934555e-07, "loss": 0.83648443, "num_input_tokens_seen": 298703060, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.23950195, "step": 13846, "time_per_iteration": 2.7012441158294678 }, { "auxiliary_loss_clip": 0.0127525, "auxiliary_loss_mlp": 0.00231281, "balance_loss_clip": 1.04942322, "balance_loss_mlp": 0.20504341, "epoch": 0.8325266796933715, "flos": 22049187649920.0, "grad_norm": 24.363714453125773, "language_loss": 0.86209196, "learning_rate": 2.8697840800541115e-07, "loss": 0.87715721, "num_input_tokens_seen": 298721765, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.26257324, "step": 13847, "time_per_iteration": 2.68108868598938 }, { "auxiliary_loss_clip": 0.01240156, "auxiliary_loss_mlp": 0.00213072, "balance_loss_clip": 1.02685642, "balance_loss_mlp": 0.18894404, "epoch": 0.8325868029460394, "flos": 22816634659200.0, "grad_norm": 6.646408599914071, "language_loss": 0.81511569, "learning_rate": 2.867774279753175e-07, "loss": 0.82964802, "num_input_tokens_seen": 298740825, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24133301, "step": 13848, "time_per_iteration": 2.7149817943573 }, { "auxiliary_loss_clip": 0.01252564, "auxiliary_loss_mlp": 0.00219491, "balance_loss_clip": 1.03701997, "balance_loss_mlp": 0.19640031, "epoch": 0.8326469261987074, "flos": 14757454926720.0, "grad_norm": 24.19997684299332, "language_loss": 0.70498061, "learning_rate": 2.8657651291079554e-07, "loss": 0.71970117, "num_input_tokens_seen": 298758515, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.23071289, "step": 13849, "time_per_iteration": 2.626472234725952 }, { "auxiliary_loss_clip": 0.01245963, "auxiliary_loss_mlp": 0.0022354, "balance_loss_clip": 1.03249753, "balance_loss_mlp": 0.20124771, "epoch": 0.8327070494513753, "flos": 22926126291840.0, "grad_norm": 3.80235230795064, "language_loss": 0.90057361, "learning_rate": 2.863756628194638e-07, "loss": 0.91526866, "num_input_tokens_seen": 298776375, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.22277832, "step": 13850, "time_per_iteration": 2.7565460205078125 }, { "auxiliary_loss_clip": 0.01238535, "auxiliary_loss_mlp": 0.00227404, "balance_loss_clip": 1.02730107, "balance_loss_mlp": 0.20446862, "epoch": 0.8327671727040433, "flos": 20665334321280.0, "grad_norm": 8.60012012176768, "language_loss": 0.85319155, "learning_rate": 2.8617487770893877e-07, "loss": 0.8678509, "num_input_tokens_seen": 298795135, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.22961426, "step": 13851, "time_per_iteration": 2.7233338356018066 }, { "auxiliary_loss_clip": 0.01139699, "auxiliary_loss_mlp": 0.00137306, "balance_loss_clip": 0.9915055, "balance_loss_mlp": 0.12934296, "epoch": 0.8328272959567112, "flos": 56060760384000.0, "grad_norm": 0.9620154445664564, "language_loss": 0.55278707, "learning_rate": 2.859741575868344e-07, "loss": 0.56555712, "num_input_tokens_seen": 298855475, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.07958984, "step": 13852, "time_per_iteration": 3.133488178253174 }, { "auxiliary_loss_clip": 0.0123702, "auxiliary_loss_mlp": 0.00218012, "balance_loss_clip": 1.02560973, "balance_loss_mlp": 0.19628012, "epoch": 0.8328874192093793, "flos": 32303084284800.0, "grad_norm": 10.282039438913827, "language_loss": 0.74003565, "learning_rate": 2.8577350246076125e-07, "loss": 0.75458598, "num_input_tokens_seen": 298875875, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.21728516, "step": 13853, "time_per_iteration": 2.73909330368042 }, { "auxiliary_loss_clip": 0.01246707, "auxiliary_loss_mlp": 0.00237239, "balance_loss_clip": 1.02768183, "balance_loss_mlp": 0.21275368, "epoch": 0.8329475424620472, "flos": 23512691387520.0, "grad_norm": 110.96289802698962, "language_loss": 0.86351681, "learning_rate": 2.855729123383286e-07, "loss": 0.87835628, "num_input_tokens_seen": 298895950, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24487305, "step": 13854, "time_per_iteration": 2.637152671813965 }, { "auxiliary_loss_clip": 0.01149375, "auxiliary_loss_mlp": 0.00195855, "balance_loss_clip": 0.99815136, "balance_loss_mlp": 0.18736708, "epoch": 0.8330076657147152, "flos": 67840680378240.0, "grad_norm": 0.757006895129408, "language_loss": 0.57889485, "learning_rate": 2.8537238722714295e-07, "loss": 0.59234715, "num_input_tokens_seen": 298955770, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.08496094, "step": 13855, "time_per_iteration": 3.023240327835083 }, { "auxiliary_loss_clip": 0.01251637, "auxiliary_loss_mlp": 0.00228334, "balance_loss_clip": 1.03302813, "balance_loss_mlp": 0.20299006, "epoch": 0.8330677889673831, "flos": 22892801448960.0, "grad_norm": 309.77100567679184, "language_loss": 0.8242563, "learning_rate": 2.8517192713480853e-07, "loss": 0.83905602, "num_input_tokens_seen": 298976545, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25341797, "step": 13856, "time_per_iteration": 2.7564315795898438 }, { "auxiliary_loss_clip": 0.0125073, "auxiliary_loss_mlp": 0.00245995, "balance_loss_clip": 1.034464, "balance_loss_mlp": 0.22037646, "epoch": 0.8331279122200511, "flos": 27345042184320.0, "grad_norm": 9.537398100261386, "language_loss": 0.82104933, "learning_rate": 2.8497153206892677e-07, "loss": 0.83601665, "num_input_tokens_seen": 298996750, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.25622559, "step": 13857, "time_per_iteration": 2.7504165172576904 }, { "auxiliary_loss_clip": 0.01235697, "auxiliary_loss_mlp": 0.00212058, "balance_loss_clip": 1.0282675, "balance_loss_mlp": 0.18995686, "epoch": 0.833188035472719, "flos": 19938179393280.0, "grad_norm": 3.615291738024865, "language_loss": 0.79325855, "learning_rate": 2.847712020370958e-07, "loss": 0.80773616, "num_input_tokens_seen": 299014895, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.22106934, "step": 13858, "time_per_iteration": 2.6877708435058594 }, { "auxiliary_loss_clip": 0.01290425, "auxiliary_loss_mlp": 0.00250024, "balance_loss_clip": 1.06108475, "balance_loss_mlp": 0.22175951, "epoch": 0.833248158725387, "flos": 15232624968960.0, "grad_norm": 32.232577787365834, "language_loss": 0.86521351, "learning_rate": 2.8457093704691316e-07, "loss": 0.88061798, "num_input_tokens_seen": 299032855, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.28259277, "step": 13859, "time_per_iteration": 2.654614210128784 }, { "auxiliary_loss_clip": 0.01262294, "auxiliary_loss_mlp": 0.00237027, "balance_loss_clip": 1.04462814, "balance_loss_mlp": 0.21223134, "epoch": 0.8333082819780551, "flos": 24535535074560.0, "grad_norm": 4.441699606250712, "language_loss": 0.86301148, "learning_rate": 2.8437073710597205e-07, "loss": 0.87800473, "num_input_tokens_seen": 299052055, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24780273, "step": 13860, "time_per_iteration": 4.134536266326904 }, { "auxiliary_loss_clip": 0.01246247, "auxiliary_loss_mlp": 0.00245111, "balance_loss_clip": 1.03449273, "balance_loss_mlp": 0.22188941, "epoch": 0.833368405230723, "flos": 31467407391360.0, "grad_norm": 16.14362777445629, "language_loss": 0.88738424, "learning_rate": 2.841706022218644e-07, "loss": 0.90229785, "num_input_tokens_seen": 299075285, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.2322998, "step": 13861, "time_per_iteration": 4.2075395584106445 }, { "auxiliary_loss_clip": 0.01256041, "auxiliary_loss_mlp": 0.00235356, "balance_loss_clip": 1.03746939, "balance_loss_mlp": 0.20998821, "epoch": 0.833428528483391, "flos": 14902713527040.0, "grad_norm": 11.473391171840477, "language_loss": 0.87088877, "learning_rate": 2.839705324021806e-07, "loss": 0.88580275, "num_input_tokens_seen": 299092520, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25402832, "step": 13862, "time_per_iteration": 2.651503324508667 }, { "auxiliary_loss_clip": 0.01256325, "auxiliary_loss_mlp": 0.00258129, "balance_loss_clip": 1.03655362, "balance_loss_mlp": 0.23267823, "epoch": 0.8334886517360589, "flos": 22199833290240.0, "grad_norm": 282.1429552772105, "language_loss": 0.84314531, "learning_rate": 2.83770527654505e-07, "loss": 0.85828984, "num_input_tokens_seen": 299109450, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25463867, "step": 13863, "time_per_iteration": 2.718950033187866 }, { "auxiliary_loss_clip": 0.01249039, "auxiliary_loss_mlp": 0.0021998, "balance_loss_clip": 1.03782582, "balance_loss_mlp": 0.19747353, "epoch": 0.8335487749887269, "flos": 30372562892160.0, "grad_norm": 267.19248680580796, "language_loss": 0.8169986, "learning_rate": 2.835705879864232e-07, "loss": 0.83168876, "num_input_tokens_seen": 299129540, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.2253418, "step": 13864, "time_per_iteration": 2.7328476905822754 }, { "auxiliary_loss_clip": 0.01253752, "auxiliary_loss_mlp": 0.00227082, "balance_loss_clip": 1.03708994, "balance_loss_mlp": 0.20285875, "epoch": 0.8336088982413948, "flos": 24681152810880.0, "grad_norm": 2.652391819235367, "language_loss": 0.76489562, "learning_rate": 2.833707134055168e-07, "loss": 0.77970392, "num_input_tokens_seen": 299148670, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24243164, "step": 13865, "time_per_iteration": 2.7088258266448975 }, { "auxiliary_loss_clip": 0.01258892, "auxiliary_loss_mlp": 0.00235096, "balance_loss_clip": 1.04841781, "balance_loss_mlp": 0.2116951, "epoch": 0.8336690214940629, "flos": 38177207873280.0, "grad_norm": 3.591509157395617, "language_loss": 0.82437032, "learning_rate": 2.831709039193653e-07, "loss": 0.83931017, "num_input_tokens_seen": 299169330, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.23413086, "step": 13866, "time_per_iteration": 4.330663204193115 }, { "auxiliary_loss_clip": 0.01129108, "auxiliary_loss_mlp": 0.00077726, "balance_loss_clip": 0.98731381, "balance_loss_mlp": 0.07000114, "epoch": 0.8337291447467308, "flos": 55565119589760.0, "grad_norm": 0.8408839763035625, "language_loss": 0.61918813, "learning_rate": 2.8297115953554465e-07, "loss": 0.63125646, "num_input_tokens_seen": 299220980, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.07714844, "step": 13867, "time_per_iteration": 3.1018972396850586 }, { "auxiliary_loss_clip": 0.01263233, "auxiliary_loss_mlp": 0.00237365, "balance_loss_clip": 1.0476172, "balance_loss_mlp": 0.21286753, "epoch": 0.8337892679993988, "flos": 24133550993280.0, "grad_norm": 31.42192113352572, "language_loss": 0.79189903, "learning_rate": 2.827714802616301e-07, "loss": 0.80690503, "num_input_tokens_seen": 299240130, "router_z_loss_clip": 2.15917969, "router_z_loss_mlp": 0.24499512, "step": 13868, "time_per_iteration": 2.670517921447754 }, { "auxiliary_loss_clip": 0.01266506, "auxiliary_loss_mlp": 0.00227662, "balance_loss_clip": 1.04924393, "balance_loss_mlp": 0.20255625, "epoch": 0.8338493912520667, "flos": 28183915388160.0, "grad_norm": 2.6745417749301867, "language_loss": 0.86488926, "learning_rate": 2.8257186610519325e-07, "loss": 0.87983096, "num_input_tokens_seen": 299260705, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.2512207, "step": 13869, "time_per_iteration": 2.7124533653259277 }, { "auxiliary_loss_clip": 0.01268606, "auxiliary_loss_mlp": 0.002235, "balance_loss_clip": 1.04994631, "balance_loss_mlp": 0.20038566, "epoch": 0.8339095145047347, "flos": 22158356060160.0, "grad_norm": 961.539759478965, "language_loss": 0.88789672, "learning_rate": 2.823723170738028e-07, "loss": 0.90281779, "num_input_tokens_seen": 299278925, "router_z_loss_clip": 2.18847656, "router_z_loss_mlp": 0.2310791, "step": 13870, "time_per_iteration": 4.051081895828247 }, { "auxiliary_loss_clip": 0.01248756, "auxiliary_loss_mlp": 0.00223357, "balance_loss_clip": 1.0317229, "balance_loss_mlp": 0.19996831, "epoch": 0.8339696377574026, "flos": 17307112072320.0, "grad_norm": 94.67120819769241, "language_loss": 0.80142617, "learning_rate": 2.821728331750264e-07, "loss": 0.81614733, "num_input_tokens_seen": 299291580, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.23376465, "step": 13871, "time_per_iteration": 2.7701234817504883 }, { "auxiliary_loss_clip": 0.01240075, "auxiliary_loss_mlp": 0.00222306, "balance_loss_clip": 1.02707708, "balance_loss_mlp": 0.19902432, "epoch": 0.8340297610100706, "flos": 20668351063680.0, "grad_norm": 21.48627631839179, "language_loss": 0.7712326, "learning_rate": 2.8197341441642853e-07, "loss": 0.78585637, "num_input_tokens_seen": 299310385, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23278809, "step": 13872, "time_per_iteration": 2.6460704803466797 }, { "auxiliary_loss_clip": 0.01266975, "auxiliary_loss_mlp": 0.00233412, "balance_loss_clip": 1.04515982, "balance_loss_mlp": 0.20769854, "epoch": 0.8340898842627387, "flos": 20515442866560.0, "grad_norm": 44.73213539929616, "language_loss": 0.82164693, "learning_rate": 2.817740608055712e-07, "loss": 0.83665085, "num_input_tokens_seen": 299327660, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.25708008, "step": 13873, "time_per_iteration": 2.6297760009765625 }, { "auxiliary_loss_clip": 0.01283428, "auxiliary_loss_mlp": 0.00228477, "balance_loss_clip": 1.05421901, "balance_loss_mlp": 0.19886523, "epoch": 0.8341500075154066, "flos": 21425850005760.0, "grad_norm": 15.683187883502741, "language_loss": 0.84218472, "learning_rate": 2.81574772350013e-07, "loss": 0.8573038, "num_input_tokens_seen": 299343685, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.29614258, "step": 13874, "time_per_iteration": 2.6503989696502686 }, { "auxiliary_loss_clip": 0.01260701, "auxiliary_loss_mlp": 0.00239877, "balance_loss_clip": 1.04756773, "balance_loss_mlp": 0.2170721, "epoch": 0.8342101307680746, "flos": 22090988102400.0, "grad_norm": 65.10395329525184, "language_loss": 0.74930525, "learning_rate": 2.813755490573118e-07, "loss": 0.76431108, "num_input_tokens_seen": 299363305, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.22802734, "step": 13875, "time_per_iteration": 2.7033369541168213 }, { "auxiliary_loss_clip": 0.01246972, "auxiliary_loss_mlp": 0.0024061, "balance_loss_clip": 1.03336716, "balance_loss_mlp": 0.21664852, "epoch": 0.8342702540207425, "flos": 21871466133120.0, "grad_norm": 35.6247521691611, "language_loss": 0.86562002, "learning_rate": 2.8117639093502243e-07, "loss": 0.88049579, "num_input_tokens_seen": 299382630, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.23950195, "step": 13876, "time_per_iteration": 2.67905330657959 }, { "auxiliary_loss_clip": 0.01238116, "auxiliary_loss_mlp": 0.0020385, "balance_loss_clip": 1.02627707, "balance_loss_mlp": 0.18119985, "epoch": 0.8343303772734105, "flos": 22528487756160.0, "grad_norm": 178.4478056406995, "language_loss": 0.95420909, "learning_rate": 2.8097729799069615e-07, "loss": 0.96862876, "num_input_tokens_seen": 299402385, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.2265625, "step": 13877, "time_per_iteration": 2.6645212173461914 }, { "auxiliary_loss_clip": 0.01250246, "auxiliary_loss_mlp": 0.00235273, "balance_loss_clip": 1.03231502, "balance_loss_mlp": 0.21044208, "epoch": 0.8343905005260784, "flos": 14939773384320.0, "grad_norm": 13.320538036946756, "language_loss": 0.75795174, "learning_rate": 2.807782702318828e-07, "loss": 0.77280694, "num_input_tokens_seen": 299419820, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.24829102, "step": 13878, "time_per_iteration": 2.6764557361602783 }, { "auxiliary_loss_clip": 0.01271151, "auxiliary_loss_mlp": 0.00226007, "balance_loss_clip": 1.04528713, "balance_loss_mlp": 0.20026982, "epoch": 0.8344506237787465, "flos": 15012456554880.0, "grad_norm": 31.465089763112232, "language_loss": 0.87459135, "learning_rate": 2.805793076661309e-07, "loss": 0.88956296, "num_input_tokens_seen": 299436265, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.25720215, "step": 13879, "time_per_iteration": 2.687739372253418 }, { "auxiliary_loss_clip": 0.01248172, "auxiliary_loss_mlp": 0.00227844, "balance_loss_clip": 1.03312039, "balance_loss_mlp": 0.20277466, "epoch": 0.8345107470314144, "flos": 17560389847680.0, "grad_norm": 135.67830840457447, "language_loss": 0.89776331, "learning_rate": 2.803804103009828e-07, "loss": 0.91252351, "num_input_tokens_seen": 299451660, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.25085449, "step": 13880, "time_per_iteration": 2.6845197677612305 }, { "auxiliary_loss_clip": 0.01252255, "auxiliary_loss_mlp": 0.00218039, "balance_loss_clip": 1.03555536, "balance_loss_mlp": 0.19413757, "epoch": 0.8345708702840824, "flos": 25187277398400.0, "grad_norm": 15.31731083723965, "language_loss": 0.83332235, "learning_rate": 2.80181578143982e-07, "loss": 0.84802532, "num_input_tokens_seen": 299472070, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23913574, "step": 13881, "time_per_iteration": 2.679426670074463 }, { "auxiliary_loss_clip": 0.01244117, "auxiliary_loss_mlp": 0.00244759, "balance_loss_clip": 1.03107023, "balance_loss_mlp": 0.22057158, "epoch": 0.8346309935367503, "flos": 15083559527040.0, "grad_norm": 93.65799037542583, "language_loss": 0.87864429, "learning_rate": 2.7998281120266807e-07, "loss": 0.89353305, "num_input_tokens_seen": 299486725, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.24194336, "step": 13882, "time_per_iteration": 2.7177250385284424 }, { "auxiliary_loss_clip": 0.01249713, "auxiliary_loss_mlp": 0.0025268, "balance_loss_clip": 1.03407633, "balance_loss_mlp": 0.22688295, "epoch": 0.8346911167894183, "flos": 22930615491840.0, "grad_norm": 4.713681502623207, "language_loss": 0.89110625, "learning_rate": 2.79784109484579e-07, "loss": 0.90613019, "num_input_tokens_seen": 299505435, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.25817871, "step": 13883, "time_per_iteration": 2.6467316150665283 }, { "auxiliary_loss_clip": 0.01259921, "auxiliary_loss_mlp": 0.00239717, "balance_loss_clip": 1.04154325, "balance_loss_mlp": 0.21514761, "epoch": 0.8347512400420862, "flos": 20193037367040.0, "grad_norm": 12.04935402057782, "language_loss": 0.82619524, "learning_rate": 2.795854729972482e-07, "loss": 0.84119165, "num_input_tokens_seen": 299523555, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24523926, "step": 13884, "time_per_iteration": 2.6880276203155518 }, { "auxiliary_loss_clip": 0.01305213, "auxiliary_loss_mlp": 0.00245295, "balance_loss_clip": 1.06747818, "balance_loss_mlp": 0.21689951, "epoch": 0.8348113632947542, "flos": 25954832148480.0, "grad_norm": 10.799399774436171, "language_loss": 0.79262841, "learning_rate": 2.7938690174820913e-07, "loss": 0.80813348, "num_input_tokens_seen": 299541660, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.28381348, "step": 13885, "time_per_iteration": 2.843804121017456 }, { "auxiliary_loss_clip": 0.01249722, "auxiliary_loss_mlp": 0.00226991, "balance_loss_clip": 1.03582919, "balance_loss_mlp": 0.20335157, "epoch": 0.8348714865474223, "flos": 34204554552960.0, "grad_norm": 620.8634900510882, "language_loss": 0.76443416, "learning_rate": 2.791883957449912e-07, "loss": 0.77920127, "num_input_tokens_seen": 299562465, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.23632812, "step": 13886, "time_per_iteration": 2.8130409717559814 }, { "auxiliary_loss_clip": 0.01258431, "auxiliary_loss_mlp": 0.00207397, "balance_loss_clip": 1.04096532, "balance_loss_mlp": 0.1828759, "epoch": 0.8349316098000902, "flos": 24390132819840.0, "grad_norm": 87.53446254820078, "language_loss": 0.85654044, "learning_rate": 2.7898995499512134e-07, "loss": 0.87119871, "num_input_tokens_seen": 299582700, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.24536133, "step": 13887, "time_per_iteration": 2.7548434734344482 }, { "auxiliary_loss_clip": 0.01274019, "auxiliary_loss_mlp": 0.00233154, "balance_loss_clip": 1.05274653, "balance_loss_mlp": 0.20779788, "epoch": 0.8349917330527582, "flos": 23032744836480.0, "grad_norm": 106.57875416138464, "language_loss": 0.7918399, "learning_rate": 2.7879157950612467e-07, "loss": 0.80691165, "num_input_tokens_seen": 299600310, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25341797, "step": 13888, "time_per_iteration": 2.6814167499542236 }, { "auxiliary_loss_clip": 0.01278125, "auxiliary_loss_mlp": 0.0023903, "balance_loss_clip": 1.05050695, "balance_loss_mlp": 0.21398437, "epoch": 0.8350518563054261, "flos": 13625873792640.0, "grad_norm": 16.859763353888123, "language_loss": 0.77963853, "learning_rate": 2.785932692855244e-07, "loss": 0.79481012, "num_input_tokens_seen": 299617025, "router_z_loss_clip": 2.27246094, "router_z_loss_mlp": 0.25036621, "step": 13889, "time_per_iteration": 2.664193868637085 }, { "auxiliary_loss_clip": 0.01258472, "auxiliary_loss_mlp": 0.00220093, "balance_loss_clip": 1.03914452, "balance_loss_mlp": 0.19651377, "epoch": 0.8351119795580941, "flos": 21579799697280.0, "grad_norm": 9.659830748195038, "language_loss": 0.7722019, "learning_rate": 2.783950243408399e-07, "loss": 0.78698754, "num_input_tokens_seen": 299633050, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.23571777, "step": 13890, "time_per_iteration": 2.6464157104492188 }, { "auxiliary_loss_clip": 0.01276042, "auxiliary_loss_mlp": 0.00238988, "balance_loss_clip": 1.05443811, "balance_loss_mlp": 0.21434793, "epoch": 0.835172102810762, "flos": 20038297576320.0, "grad_norm": 22.692925171413645, "language_loss": 0.69700474, "learning_rate": 2.7819684467958817e-07, "loss": 0.71215498, "num_input_tokens_seen": 299646445, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.24621582, "step": 13891, "time_per_iteration": 2.6518824100494385 }, { "auxiliary_loss_clip": 0.01247365, "auxiliary_loss_mlp": 0.00226557, "balance_loss_clip": 1.03565359, "balance_loss_mlp": 0.20303681, "epoch": 0.8352322260634301, "flos": 25111577485440.0, "grad_norm": 11.072407850125444, "language_loss": 0.76764309, "learning_rate": 2.779987303092846e-07, "loss": 0.78238225, "num_input_tokens_seen": 299662665, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.23522949, "step": 13892, "time_per_iteration": 2.679222583770752 }, { "auxiliary_loss_clip": 0.01252581, "auxiliary_loss_mlp": 0.00216724, "balance_loss_clip": 1.03643095, "balance_loss_mlp": 0.19399107, "epoch": 0.835292349316098, "flos": 24863758577280.0, "grad_norm": 2.4501881940684442, "language_loss": 0.72604823, "learning_rate": 2.7780068123744207e-07, "loss": 0.74074125, "num_input_tokens_seen": 299683585, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.22741699, "step": 13893, "time_per_iteration": 2.703325033187866 }, { "auxiliary_loss_clip": 0.01259902, "auxiliary_loss_mlp": 0.00251394, "balance_loss_clip": 1.04173207, "balance_loss_mlp": 0.22424977, "epoch": 0.835352472568766, "flos": 19865568049920.0, "grad_norm": 3.5430756064374047, "language_loss": 0.87483215, "learning_rate": 2.7760269747156996e-07, "loss": 0.88994515, "num_input_tokens_seen": 299702680, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.27160645, "step": 13894, "time_per_iteration": 2.737489700317383 }, { "auxiliary_loss_clip": 0.01241356, "auxiliary_loss_mlp": 0.00219781, "balance_loss_clip": 1.03006768, "balance_loss_mlp": 0.19584361, "epoch": 0.8354125958214339, "flos": 22054754257920.0, "grad_norm": 159.0754439182516, "language_loss": 0.80475384, "learning_rate": 2.7740477901917625e-07, "loss": 0.8193652, "num_input_tokens_seen": 299721050, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.23937988, "step": 13895, "time_per_iteration": 2.688227415084839 }, { "auxiliary_loss_clip": 0.0126959, "auxiliary_loss_mlp": 0.00257711, "balance_loss_clip": 1.05105221, "balance_loss_mlp": 0.23090053, "epoch": 0.8354727190741019, "flos": 21397804462080.0, "grad_norm": 86.04088801196643, "language_loss": 0.8140502, "learning_rate": 2.772069258877667e-07, "loss": 0.82932323, "num_input_tokens_seen": 299738255, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.26843262, "step": 13896, "time_per_iteration": 2.6331627368927 }, { "auxiliary_loss_clip": 0.01252634, "auxiliary_loss_mlp": 0.00207199, "balance_loss_clip": 1.03647256, "balance_loss_mlp": 0.18497854, "epoch": 0.8355328423267698, "flos": 50840997834240.0, "grad_norm": 20.817570793137097, "language_loss": 0.67555076, "learning_rate": 2.770091380848423e-07, "loss": 0.69014907, "num_input_tokens_seen": 299761315, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.22241211, "step": 13897, "time_per_iteration": 2.9100496768951416 }, { "auxiliary_loss_clip": 0.01159732, "auxiliary_loss_mlp": 0.00148947, "balance_loss_clip": 1.01592469, "balance_loss_mlp": 0.13945788, "epoch": 0.8355929655794379, "flos": 65551052764800.0, "grad_norm": 0.7071299721380457, "language_loss": 0.56999946, "learning_rate": 2.7681141561790423e-07, "loss": 0.58308625, "num_input_tokens_seen": 299828735, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09472656, "step": 13898, "time_per_iteration": 3.202359437942505 }, { "auxiliary_loss_clip": 0.01269084, "auxiliary_loss_mlp": 0.00237732, "balance_loss_clip": 1.0478816, "balance_loss_mlp": 0.21170908, "epoch": 0.8356530888321058, "flos": 19170516902400.0, "grad_norm": 37.669097836609154, "language_loss": 0.89813882, "learning_rate": 2.7661375849444967e-07, "loss": 0.91320705, "num_input_tokens_seen": 299848395, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26000977, "step": 13899, "time_per_iteration": 2.6208600997924805 }, { "auxiliary_loss_clip": 0.01269404, "auxiliary_loss_mlp": 0.00232851, "balance_loss_clip": 1.05295885, "balance_loss_mlp": 0.20796055, "epoch": 0.8357132120847738, "flos": 44126672238720.0, "grad_norm": 4.028850471628516, "language_loss": 0.74851757, "learning_rate": 2.764161667219749e-07, "loss": 0.76354015, "num_input_tokens_seen": 299871665, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24890137, "step": 13900, "time_per_iteration": 2.8973681926727295 }, { "auxiliary_loss_clip": 0.01262192, "auxiliary_loss_mlp": 0.0024598, "balance_loss_clip": 1.04271197, "balance_loss_mlp": 0.22101757, "epoch": 0.8357733353374418, "flos": 24389701856640.0, "grad_norm": 9.11913424850274, "language_loss": 0.79403883, "learning_rate": 2.762186403079716e-07, "loss": 0.80912054, "num_input_tokens_seen": 299891960, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24975586, "step": 13901, "time_per_iteration": 2.694429874420166 }, { "auxiliary_loss_clip": 0.01274922, "auxiliary_loss_mlp": 0.00225132, "balance_loss_clip": 1.04883909, "balance_loss_mlp": 0.20022893, "epoch": 0.8358334585901097, "flos": 20916313626240.0, "grad_norm": 18.071865079606397, "language_loss": 0.88884592, "learning_rate": 2.7602117925992963e-07, "loss": 0.90384644, "num_input_tokens_seen": 299905070, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.24902344, "step": 13902, "time_per_iteration": 4.090671539306641 }, { "auxiliary_loss_clip": 0.01248042, "auxiliary_loss_mlp": 0.0022048, "balance_loss_clip": 1.03912222, "balance_loss_mlp": 0.19845045, "epoch": 0.8358935818427777, "flos": 19244169740160.0, "grad_norm": 52.9527669075588, "language_loss": 0.68714893, "learning_rate": 2.758237835853379e-07, "loss": 0.70183408, "num_input_tokens_seen": 299925130, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.22033691, "step": 13903, "time_per_iteration": 4.066463947296143 }, { "auxiliary_loss_clip": 0.01272468, "auxiliary_loss_mlp": 0.00214589, "balance_loss_clip": 1.05705845, "balance_loss_mlp": 0.19118802, "epoch": 0.8359537050954456, "flos": 24134053783680.0, "grad_norm": 23.195158359126104, "language_loss": 0.83453977, "learning_rate": 2.7562645329168054e-07, "loss": 0.84941041, "num_input_tokens_seen": 299943845, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.23413086, "step": 13904, "time_per_iteration": 2.690842628479004 }, { "auxiliary_loss_clip": 0.01256503, "auxiliary_loss_mlp": 0.00220832, "balance_loss_clip": 1.03979468, "balance_loss_mlp": 0.19690718, "epoch": 0.8360138283481137, "flos": 16180415187840.0, "grad_norm": 12.83640728736064, "language_loss": 0.79010296, "learning_rate": 2.7542918838644104e-07, "loss": 0.80487633, "num_input_tokens_seen": 299961620, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23950195, "step": 13905, "time_per_iteration": 2.6769862174987793 }, { "auxiliary_loss_clip": 0.01257222, "auxiliary_loss_mlp": 0.00241276, "balance_loss_clip": 1.042117, "balance_loss_mlp": 0.21648088, "epoch": 0.8360739516007816, "flos": 22198899536640.0, "grad_norm": 14.981439188764158, "language_loss": 0.73058736, "learning_rate": 2.752319888771e-07, "loss": 0.74557227, "num_input_tokens_seen": 299982170, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24829102, "step": 13906, "time_per_iteration": 2.725395441055298 }, { "auxiliary_loss_clip": 0.01242499, "auxiliary_loss_mlp": 0.00236044, "balance_loss_clip": 1.03205478, "balance_loss_mlp": 0.21190438, "epoch": 0.8361340748534496, "flos": 20923137210240.0, "grad_norm": 15.748675870859278, "language_loss": 0.8014397, "learning_rate": 2.7503485477113475e-07, "loss": 0.81622517, "num_input_tokens_seen": 300001330, "router_z_loss_clip": 2.10644531, "router_z_loss_mlp": 0.24121094, "step": 13907, "time_per_iteration": 2.672337293624878 }, { "auxiliary_loss_clip": 0.01269193, "auxiliary_loss_mlp": 0.00234724, "balance_loss_clip": 1.04534733, "balance_loss_mlp": 0.20769969, "epoch": 0.8361941981061175, "flos": 26173599932160.0, "grad_norm": 513.2316372148265, "language_loss": 0.83139789, "learning_rate": 2.7483778607602005e-07, "loss": 0.8464371, "num_input_tokens_seen": 300020645, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.27026367, "step": 13908, "time_per_iteration": 4.256725788116455 }, { "auxiliary_loss_clip": 0.01276215, "auxiliary_loss_mlp": 0.00225057, "balance_loss_clip": 1.05707824, "balance_loss_mlp": 0.19988018, "epoch": 0.8362543213587855, "flos": 24419363512320.0, "grad_norm": 16.24378684280321, "language_loss": 0.80263126, "learning_rate": 2.7464078279922964e-07, "loss": 0.817644, "num_input_tokens_seen": 300039945, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25170898, "step": 13909, "time_per_iteration": 2.7075142860412598 }, { "auxiliary_loss_clip": 0.01270751, "auxiliary_loss_mlp": 0.0023731, "balance_loss_clip": 1.04642332, "balance_loss_mlp": 0.21085739, "epoch": 0.8363144446114534, "flos": 17202396948480.0, "grad_norm": 5.58731252211104, "language_loss": 0.82637393, "learning_rate": 2.744438449482338e-07, "loss": 0.84145457, "num_input_tokens_seen": 300058260, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.2644043, "step": 13910, "time_per_iteration": 2.666461944580078 }, { "auxiliary_loss_clip": 0.01252514, "auxiliary_loss_mlp": 0.0022947, "balance_loss_clip": 1.0412991, "balance_loss_mlp": 0.20494872, "epoch": 0.8363745678641215, "flos": 19279398003840.0, "grad_norm": 8.918759493762801, "language_loss": 0.78302813, "learning_rate": 2.742469725305001e-07, "loss": 0.79784799, "num_input_tokens_seen": 300076720, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.24511719, "step": 13911, "time_per_iteration": 2.688788890838623 }, { "auxiliary_loss_clip": 0.01268634, "auxiliary_loss_mlp": 0.00216656, "balance_loss_clip": 1.04951394, "balance_loss_mlp": 0.1933153, "epoch": 0.8364346911167894, "flos": 11874869596800.0, "grad_norm": 7.762831398043462, "language_loss": 0.87745452, "learning_rate": 2.740501655534946e-07, "loss": 0.89230746, "num_input_tokens_seen": 300092950, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.23339844, "step": 13912, "time_per_iteration": 4.064759016036987 }, { "auxiliary_loss_clip": 0.01264924, "auxiliary_loss_mlp": 0.00225741, "balance_loss_clip": 1.04764915, "balance_loss_mlp": 0.2027452, "epoch": 0.8364948143694574, "flos": 20225212974720.0, "grad_norm": 25.522307630256012, "language_loss": 0.85650092, "learning_rate": 2.738534240246797e-07, "loss": 0.87140757, "num_input_tokens_seen": 300110950, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.23010254, "step": 13913, "time_per_iteration": 2.689507246017456 }, { "auxiliary_loss_clip": 0.01286693, "auxiliary_loss_mlp": 0.00245873, "balance_loss_clip": 1.05971551, "balance_loss_mlp": 0.21862128, "epoch": 0.8365549376221254, "flos": 21612909058560.0, "grad_norm": 29.225574226905263, "language_loss": 0.82212669, "learning_rate": 2.736567479515153e-07, "loss": 0.83745235, "num_input_tokens_seen": 300128705, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.27233887, "step": 13914, "time_per_iteration": 2.661478281021118 }, { "auxiliary_loss_clip": 0.01261056, "auxiliary_loss_mlp": 0.00202886, "balance_loss_clip": 1.04191732, "balance_loss_mlp": 0.17828095, "epoch": 0.8366150608747933, "flos": 23294210912640.0, "grad_norm": 15.104196534922421, "language_loss": 0.79016417, "learning_rate": 2.7346013734146025e-07, "loss": 0.80480361, "num_input_tokens_seen": 300148635, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24633789, "step": 13915, "time_per_iteration": 2.6597394943237305 }, { "auxiliary_loss_clip": 0.0125355, "auxiliary_loss_mlp": 0.00228887, "balance_loss_clip": 1.03569674, "balance_loss_mlp": 0.20353135, "epoch": 0.8366751841274613, "flos": 15267673664640.0, "grad_norm": 632.6852116784831, "language_loss": 0.81227958, "learning_rate": 2.7326359220197035e-07, "loss": 0.82710397, "num_input_tokens_seen": 300165490, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25341797, "step": 13916, "time_per_iteration": 2.659848690032959 }, { "auxiliary_loss_clip": 0.01266162, "auxiliary_loss_mlp": 0.00213863, "balance_loss_clip": 1.04410255, "balance_loss_mlp": 0.18938935, "epoch": 0.8367353073801292, "flos": 13224931205760.0, "grad_norm": 3.04085323022098, "language_loss": 0.85492694, "learning_rate": 2.7306711254049755e-07, "loss": 0.86972719, "num_input_tokens_seen": 300182130, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24487305, "step": 13917, "time_per_iteration": 2.6027884483337402 }, { "auxiliary_loss_clip": 0.01252289, "auxiliary_loss_mlp": 0.00215195, "balance_loss_clip": 1.03924417, "balance_loss_mlp": 0.19180582, "epoch": 0.8367954306327973, "flos": 24205084928640.0, "grad_norm": 6.1174381507574225, "language_loss": 0.85702944, "learning_rate": 2.728706983644933e-07, "loss": 0.87170434, "num_input_tokens_seen": 300203050, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23388672, "step": 13918, "time_per_iteration": 2.661808967590332 }, { "auxiliary_loss_clip": 0.01280333, "auxiliary_loss_mlp": 0.00215187, "balance_loss_clip": 1.06167126, "balance_loss_mlp": 0.18981965, "epoch": 0.8368555538854652, "flos": 24534744975360.0, "grad_norm": 2.2152554588916584, "language_loss": 0.7656858, "learning_rate": 2.7267434968140457e-07, "loss": 0.78064096, "num_input_tokens_seen": 300224380, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25366211, "step": 13919, "time_per_iteration": 2.6720550060272217 }, { "auxiliary_loss_clip": 0.01245836, "auxiliary_loss_mlp": 0.0019923, "balance_loss_clip": 1.03207779, "balance_loss_mlp": 0.17665219, "epoch": 0.8369156771381332, "flos": 20259363830400.0, "grad_norm": 1841.8518532644093, "language_loss": 0.82361603, "learning_rate": 2.7247806649867835e-07, "loss": 0.8380667, "num_input_tokens_seen": 300242915, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.22595215, "step": 13920, "time_per_iteration": 2.6577064990997314 }, { "auxiliary_loss_clip": 0.01262326, "auxiliary_loss_mlp": 0.00229384, "balance_loss_clip": 1.04381847, "balance_loss_mlp": 0.20338476, "epoch": 0.8369758003908011, "flos": 21835555511040.0, "grad_norm": 5.225476596093305, "language_loss": 0.78463978, "learning_rate": 2.722818488237566e-07, "loss": 0.79955685, "num_input_tokens_seen": 300261905, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.26013184, "step": 13921, "time_per_iteration": 2.6572532653808594 }, { "auxiliary_loss_clip": 0.01272105, "auxiliary_loss_mlp": 0.00227712, "balance_loss_clip": 1.04838717, "balance_loss_mlp": 0.20290503, "epoch": 0.8370359236434691, "flos": 21719312121600.0, "grad_norm": 10.886515060421619, "language_loss": 0.94580519, "learning_rate": 2.720856966640801e-07, "loss": 0.96080339, "num_input_tokens_seen": 300281145, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.24804688, "step": 13922, "time_per_iteration": 2.6863386631011963 }, { "auxiliary_loss_clip": 0.01251117, "auxiliary_loss_mlp": 0.00236419, "balance_loss_clip": 1.0387814, "balance_loss_mlp": 0.21262479, "epoch": 0.837096046896137, "flos": 23148880485120.0, "grad_norm": 6.744313947971173, "language_loss": 0.79015112, "learning_rate": 2.71889610027088e-07, "loss": 0.80502641, "num_input_tokens_seen": 300301610, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.23803711, "step": 13923, "time_per_iteration": 2.6720070838928223 }, { "auxiliary_loss_clip": 0.01261759, "auxiliary_loss_mlp": 0.00218053, "balance_loss_clip": 1.04367173, "balance_loss_mlp": 0.19326895, "epoch": 0.8371561701488051, "flos": 24492872695680.0, "grad_norm": 2.2910633016194133, "language_loss": 0.82147062, "learning_rate": 2.7169358892021433e-07, "loss": 0.83626878, "num_input_tokens_seen": 300319420, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.2479248, "step": 13924, "time_per_iteration": 2.712542772293091 }, { "auxiliary_loss_clip": 0.01257452, "auxiliary_loss_mlp": 0.00223915, "balance_loss_clip": 1.0416559, "balance_loss_mlp": 0.1975583, "epoch": 0.837216293401473, "flos": 29206723161600.0, "grad_norm": 80.36577754254857, "language_loss": 0.72673297, "learning_rate": 2.7149763335089293e-07, "loss": 0.74154663, "num_input_tokens_seen": 300341325, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.26330566, "step": 13925, "time_per_iteration": 2.7430009841918945 }, { "auxiliary_loss_clip": 0.01277761, "auxiliary_loss_mlp": 0.0023465, "balance_loss_clip": 1.05522656, "balance_loss_mlp": 0.20875771, "epoch": 0.837276416654141, "flos": 25265275781760.0, "grad_norm": 17.529108224919884, "language_loss": 0.82064283, "learning_rate": 2.713017433265543e-07, "loss": 0.83576691, "num_input_tokens_seen": 300361620, "router_z_loss_clip": 2.22753906, "router_z_loss_mlp": 0.25891113, "step": 13926, "time_per_iteration": 2.741399049758911 }, { "auxiliary_loss_clip": 0.01267672, "auxiliary_loss_mlp": 0.00225136, "balance_loss_clip": 1.0466584, "balance_loss_mlp": 0.19906479, "epoch": 0.837336539906809, "flos": 13882024656000.0, "grad_norm": 23.50575167140102, "language_loss": 0.80315131, "learning_rate": 2.711059188546274e-07, "loss": 0.81807941, "num_input_tokens_seen": 300378675, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26049805, "step": 13927, "time_per_iteration": 2.6098484992980957 }, { "auxiliary_loss_clip": 0.0117027, "auxiliary_loss_mlp": 0.00106876, "balance_loss_clip": 1.02789187, "balance_loss_mlp": 0.09900796, "epoch": 0.8373966631594769, "flos": 68870599044480.0, "grad_norm": 4.755317234961843, "language_loss": 0.57414454, "learning_rate": 2.7091015994253695e-07, "loss": 0.58691597, "num_input_tokens_seen": 300449740, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.07861328, "step": 13928, "time_per_iteration": 3.3357577323913574 }, { "auxiliary_loss_clip": 0.01276222, "auxiliary_loss_mlp": 0.00238733, "balance_loss_clip": 1.05475283, "balance_loss_mlp": 0.21288809, "epoch": 0.8374567864121449, "flos": 20448972748800.0, "grad_norm": 31.70877768193827, "language_loss": 0.7686702, "learning_rate": 2.707144665977068e-07, "loss": 0.78381974, "num_input_tokens_seen": 300470000, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.25854492, "step": 13929, "time_per_iteration": 2.6872711181640625 }, { "auxiliary_loss_clip": 0.01296682, "auxiliary_loss_mlp": 0.00236851, "balance_loss_clip": 1.07096171, "balance_loss_mlp": 0.20957586, "epoch": 0.8375169096648128, "flos": 41904197101440.0, "grad_norm": 25.659412138874394, "language_loss": 0.7586025, "learning_rate": 2.705188388275574e-07, "loss": 0.77393782, "num_input_tokens_seen": 300494975, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.27282715, "step": 13930, "time_per_iteration": 2.841099500656128 }, { "auxiliary_loss_clip": 0.01275952, "auxiliary_loss_mlp": 0.00226285, "balance_loss_clip": 1.05656004, "balance_loss_mlp": 0.20258623, "epoch": 0.8375770329174809, "flos": 20009354192640.0, "grad_norm": 99.13957253613962, "language_loss": 0.79536068, "learning_rate": 2.703232766395067e-07, "loss": 0.81038308, "num_input_tokens_seen": 300513175, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.23706055, "step": 13931, "time_per_iteration": 2.7806365489959717 }, { "auxiliary_loss_clip": 0.01274415, "auxiliary_loss_mlp": 0.00238313, "balance_loss_clip": 1.05061293, "balance_loss_mlp": 0.21268326, "epoch": 0.8376371561701488, "flos": 22783597125120.0, "grad_norm": 2.7933431953655234, "language_loss": 0.77998757, "learning_rate": 2.701277800409705e-07, "loss": 0.79511482, "num_input_tokens_seen": 300533770, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.25622559, "step": 13932, "time_per_iteration": 2.644869327545166 }, { "auxiliary_loss_clip": 0.01266481, "auxiliary_loss_mlp": 0.00217034, "balance_loss_clip": 1.05003142, "balance_loss_mlp": 0.19428851, "epoch": 0.8376972794228168, "flos": 23914459987200.0, "grad_norm": 144.291333514061, "language_loss": 0.75322503, "learning_rate": 2.699323490393628e-07, "loss": 0.76806015, "num_input_tokens_seen": 300552995, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.22753906, "step": 13933, "time_per_iteration": 2.729759931564331 }, { "auxiliary_loss_clip": 0.01253531, "auxiliary_loss_mlp": 0.00224049, "balance_loss_clip": 1.04054713, "balance_loss_mlp": 0.20166129, "epoch": 0.8377574026754847, "flos": 13734718980480.0, "grad_norm": 39.78640973605216, "language_loss": 0.84810811, "learning_rate": 2.697369836420933e-07, "loss": 0.86288393, "num_input_tokens_seen": 300570275, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.22387695, "step": 13934, "time_per_iteration": 2.5917294025421143 }, { "auxiliary_loss_clip": 0.01260149, "auxiliary_loss_mlp": 0.00240307, "balance_loss_clip": 1.04596412, "balance_loss_mlp": 0.21527344, "epoch": 0.8378175259281527, "flos": 21651333632640.0, "grad_norm": 7.748708248045925, "language_loss": 0.83090466, "learning_rate": 2.6954168385657115e-07, "loss": 0.84590924, "num_input_tokens_seen": 300590875, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.25024414, "step": 13935, "time_per_iteration": 2.679527997970581 }, { "auxiliary_loss_clip": 0.01269643, "auxiliary_loss_mlp": 0.00235445, "balance_loss_clip": 1.04846334, "balance_loss_mlp": 0.21044701, "epoch": 0.8378776491808206, "flos": 15448806973440.0, "grad_norm": 105.05106889364808, "language_loss": 0.63662493, "learning_rate": 2.6934644969020135e-07, "loss": 0.65167582, "num_input_tokens_seen": 300607490, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.24987793, "step": 13936, "time_per_iteration": 2.599281072616577 }, { "auxiliary_loss_clip": 0.01267872, "auxiliary_loss_mlp": 0.00229066, "balance_loss_clip": 1.05098498, "balance_loss_mlp": 0.20305455, "epoch": 0.8379377724334887, "flos": 14720395069440.0, "grad_norm": 71.15572203225985, "language_loss": 0.97416025, "learning_rate": 2.691512811503882e-07, "loss": 0.98912966, "num_input_tokens_seen": 300623635, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.26000977, "step": 13937, "time_per_iteration": 2.699110984802246 }, { "auxiliary_loss_clip": 0.01277433, "auxiliary_loss_mlp": 0.00224315, "balance_loss_clip": 1.05849957, "balance_loss_mlp": 0.1996147, "epoch": 0.8379978956861566, "flos": 24535247765760.0, "grad_norm": 14.004522642010809, "language_loss": 0.88583255, "learning_rate": 2.689561782445313e-07, "loss": 0.90085006, "num_input_tokens_seen": 300643835, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24707031, "step": 13938, "time_per_iteration": 2.6620261669158936 }, { "auxiliary_loss_clip": 0.01290076, "auxiliary_loss_mlp": 0.00227993, "balance_loss_clip": 1.06465292, "balance_loss_mlp": 0.20106338, "epoch": 0.8380580189388246, "flos": 18952611045120.0, "grad_norm": 250.0823117499765, "language_loss": 0.80014718, "learning_rate": 2.6876114098002965e-07, "loss": 0.81532788, "num_input_tokens_seen": 300662500, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.26953125, "step": 13939, "time_per_iteration": 2.6878933906555176 }, { "auxiliary_loss_clip": 0.01282742, "auxiliary_loss_mlp": 0.00250464, "balance_loss_clip": 1.059376, "balance_loss_mlp": 0.22397591, "epoch": 0.8381181421914926, "flos": 26540283922560.0, "grad_norm": 10.17151221752381, "language_loss": 0.85913253, "learning_rate": 2.6856616936428e-07, "loss": 0.87446457, "num_input_tokens_seen": 300681480, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26477051, "step": 13940, "time_per_iteration": 2.740241050720215 }, { "auxiliary_loss_clip": 0.01280946, "auxiliary_loss_mlp": 0.00226396, "balance_loss_clip": 1.05963922, "balance_loss_mlp": 0.20070647, "epoch": 0.8381782654441605, "flos": 23291481479040.0, "grad_norm": 11.110725506108894, "language_loss": 0.85168648, "learning_rate": 2.6837126340467374e-07, "loss": 0.8667599, "num_input_tokens_seen": 300699165, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.25695801, "step": 13941, "time_per_iteration": 2.6624722480773926 }, { "auxiliary_loss_clip": 0.01277193, "auxiliary_loss_mlp": 0.00237691, "balance_loss_clip": 1.05428839, "balance_loss_mlp": 0.20949784, "epoch": 0.8382383886968285, "flos": 26758800311040.0, "grad_norm": 77.09613246811952, "language_loss": 0.80998206, "learning_rate": 2.6817642310860276e-07, "loss": 0.82513088, "num_input_tokens_seen": 300714615, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.28210449, "step": 13942, "time_per_iteration": 2.696963310241699 }, { "auxiliary_loss_clip": 0.01279089, "auxiliary_loss_mlp": 0.00231718, "balance_loss_clip": 1.05181265, "balance_loss_mlp": 0.20391804, "epoch": 0.8382985119494964, "flos": 26104544035200.0, "grad_norm": 3.250924616566831, "language_loss": 0.88596189, "learning_rate": 2.679816484834554e-07, "loss": 0.90107, "num_input_tokens_seen": 300734860, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.27819824, "step": 13943, "time_per_iteration": 2.6827361583709717 }, { "auxiliary_loss_clip": 0.01278026, "auxiliary_loss_mlp": 0.00247836, "balance_loss_clip": 1.06103706, "balance_loss_mlp": 0.223744, "epoch": 0.8383586352021645, "flos": 16435129507200.0, "grad_norm": 20.53703365661822, "language_loss": 0.9302007, "learning_rate": 2.6778693953661766e-07, "loss": 0.94545925, "num_input_tokens_seen": 300752735, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.2409668, "step": 13944, "time_per_iteration": 4.021123647689819 }, { "auxiliary_loss_clip": 0.0119787, "auxiliary_loss_mlp": 0.00099874, "balance_loss_clip": 1.0530982, "balance_loss_mlp": 0.09172006, "epoch": 0.8384187584548324, "flos": 64195532288640.0, "grad_norm": 0.611124694016949, "language_loss": 0.49782342, "learning_rate": 2.6759229627547263e-07, "loss": 0.51080084, "num_input_tokens_seen": 300820760, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.08154297, "step": 13945, "time_per_iteration": 4.74060583114624 }, { "auxiliary_loss_clip": 0.01267386, "auxiliary_loss_mlp": 0.00237416, "balance_loss_clip": 1.04957461, "balance_loss_mlp": 0.21201268, "epoch": 0.8384788817075004, "flos": 22382905933440.0, "grad_norm": 5.358383429730548, "language_loss": 0.72720081, "learning_rate": 2.673977187074017e-07, "loss": 0.74224883, "num_input_tokens_seen": 300840025, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25402832, "step": 13946, "time_per_iteration": 2.635892868041992 }, { "auxiliary_loss_clip": 0.01275162, "auxiliary_loss_mlp": 0.00239381, "balance_loss_clip": 1.05226827, "balance_loss_mlp": 0.21293995, "epoch": 0.8385390049601683, "flos": 29496845312640.0, "grad_norm": 167.68311514220244, "language_loss": 0.7498821, "learning_rate": 2.672032068397829e-07, "loss": 0.76502758, "num_input_tokens_seen": 300860380, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.26452637, "step": 13947, "time_per_iteration": 2.8435580730438232 }, { "auxiliary_loss_clip": 0.01267617, "auxiliary_loss_mlp": 0.00222048, "balance_loss_clip": 1.04637957, "balance_loss_mlp": 0.19749051, "epoch": 0.8385991282128363, "flos": 32707797799680.0, "grad_norm": 3.902823358744054, "language_loss": 0.76863939, "learning_rate": 2.6700876067999176e-07, "loss": 0.78353596, "num_input_tokens_seen": 300881895, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.24584961, "step": 13948, "time_per_iteration": 2.7680132389068604 }, { "auxiliary_loss_clip": 0.01253004, "auxiliary_loss_mlp": 0.00220852, "balance_loss_clip": 1.04005146, "balance_loss_mlp": 0.19742718, "epoch": 0.8386592514655042, "flos": 25441022050560.0, "grad_norm": 11.961047084478684, "language_loss": 0.9141674, "learning_rate": 2.6681438023540194e-07, "loss": 0.92890584, "num_input_tokens_seen": 300901575, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.234375, "step": 13949, "time_per_iteration": 2.6776645183563232 }, { "auxiliary_loss_clip": 0.01264518, "auxiliary_loss_mlp": 0.00229168, "balance_loss_clip": 1.04746461, "balance_loss_mlp": 0.20382375, "epoch": 0.8387193747181723, "flos": 22015898720640.0, "grad_norm": 8.503177703267914, "language_loss": 0.77417529, "learning_rate": 2.66620065513385e-07, "loss": 0.78911209, "num_input_tokens_seen": 300919735, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25341797, "step": 13950, "time_per_iteration": 4.543719530105591 }, { "auxiliary_loss_clip": 0.01255278, "auxiliary_loss_mlp": 0.0021306, "balance_loss_clip": 1.04271841, "balance_loss_mlp": 0.18831176, "epoch": 0.8387794979708402, "flos": 18150223080960.0, "grad_norm": 19.55744785010543, "language_loss": 0.74438208, "learning_rate": 2.6642581652130913e-07, "loss": 0.75906545, "num_input_tokens_seen": 300939150, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.24755859, "step": 13951, "time_per_iteration": 2.6871209144592285 }, { "auxiliary_loss_clip": 0.0127688, "auxiliary_loss_mlp": 0.00234391, "balance_loss_clip": 1.05856633, "balance_loss_mlp": 0.20948833, "epoch": 0.8388396212235082, "flos": 25411216740480.0, "grad_norm": 29.690176796545643, "language_loss": 0.77554786, "learning_rate": 2.662316332665393e-07, "loss": 0.79066062, "num_input_tokens_seen": 300959730, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24914551, "step": 13952, "time_per_iteration": 2.6965112686157227 }, { "auxiliary_loss_clip": 0.01263246, "auxiliary_loss_mlp": 0.0022951, "balance_loss_clip": 1.05071235, "balance_loss_mlp": 0.2064072, "epoch": 0.8388997444761762, "flos": 22273055164800.0, "grad_norm": 11.512120387139356, "language_loss": 0.801395, "learning_rate": 2.6603751575643987e-07, "loss": 0.81632257, "num_input_tokens_seen": 300976120, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.23132324, "step": 13953, "time_per_iteration": 2.639967679977417 }, { "auxiliary_loss_clip": 0.0127762, "auxiliary_loss_mlp": 0.00242024, "balance_loss_clip": 1.05923414, "balance_loss_mlp": 0.21622682, "epoch": 0.8389598677288441, "flos": 19573219255680.0, "grad_norm": 8.46106643499843, "language_loss": 0.7669431, "learning_rate": 2.6584346399837176e-07, "loss": 0.78213954, "num_input_tokens_seen": 300995080, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25793457, "step": 13954, "time_per_iteration": 4.101582050323486 }, { "auxiliary_loss_clip": 0.01269302, "auxiliary_loss_mlp": 0.00222005, "balance_loss_clip": 1.05040956, "balance_loss_mlp": 0.1984973, "epoch": 0.8390199909815121, "flos": 17384715406080.0, "grad_norm": 7.769362956734998, "language_loss": 0.80384803, "learning_rate": 2.656494779996932e-07, "loss": 0.81876111, "num_input_tokens_seen": 301012920, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.23522949, "step": 13955, "time_per_iteration": 2.618305206298828 }, { "auxiliary_loss_clip": 0.01269775, "auxiliary_loss_mlp": 0.00220411, "balance_loss_clip": 1.0531739, "balance_loss_mlp": 0.19511506, "epoch": 0.83908011423418, "flos": 24639639667200.0, "grad_norm": 4.3014605308103615, "language_loss": 0.75612891, "learning_rate": 2.6545555776775995e-07, "loss": 0.77103078, "num_input_tokens_seen": 301028875, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.25280762, "step": 13956, "time_per_iteration": 2.6459875106811523 }, { "auxiliary_loss_clip": 0.01287014, "auxiliary_loss_mlp": 0.00247352, "balance_loss_clip": 1.05836701, "balance_loss_mlp": 0.21954064, "epoch": 0.8391402374868481, "flos": 24718356322560.0, "grad_norm": 2.437416335577208, "language_loss": 0.88572514, "learning_rate": 2.6526170330992667e-07, "loss": 0.90106881, "num_input_tokens_seen": 301050115, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.27832031, "step": 13957, "time_per_iteration": 2.677393674850464 }, { "auxiliary_loss_clip": 0.01180447, "auxiliary_loss_mlp": 0.00120704, "balance_loss_clip": 1.03565145, "balance_loss_mlp": 0.11393332, "epoch": 0.839200360739516, "flos": 56871695784960.0, "grad_norm": 0.8740684704022128, "language_loss": 0.52930307, "learning_rate": 2.6506791463354283e-07, "loss": 0.54231453, "num_input_tokens_seen": 301114155, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.06787109, "step": 13958, "time_per_iteration": 3.2448062896728516 }, { "auxiliary_loss_clip": 0.01262846, "auxiliary_loss_mlp": 0.00223601, "balance_loss_clip": 1.04599857, "balance_loss_mlp": 0.19990182, "epoch": 0.839260483992184, "flos": 18332792933760.0, "grad_norm": 67.6244181750782, "language_loss": 0.8284837, "learning_rate": 2.648741917459574e-07, "loss": 0.84334815, "num_input_tokens_seen": 301133150, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23693848, "step": 13959, "time_per_iteration": 2.6084694862365723 }, { "auxiliary_loss_clip": 0.01277079, "auxiliary_loss_mlp": 0.00216378, "balance_loss_clip": 1.06162381, "balance_loss_mlp": 0.19323955, "epoch": 0.8393206072448519, "flos": 27087921653760.0, "grad_norm": 2.247092613246207, "language_loss": 0.64732629, "learning_rate": 2.646805346545169e-07, "loss": 0.66226089, "num_input_tokens_seen": 301153600, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.23144531, "step": 13960, "time_per_iteration": 2.6975579261779785 }, { "auxiliary_loss_clip": 0.01185462, "auxiliary_loss_mlp": 0.00136759, "balance_loss_clip": 1.0398773, "balance_loss_mlp": 0.12836701, "epoch": 0.8393807304975199, "flos": 61521192057600.0, "grad_norm": 0.7625886597988187, "language_loss": 0.60222661, "learning_rate": 2.6448694336656397e-07, "loss": 0.61544883, "num_input_tokens_seen": 301214335, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.08398438, "step": 13961, "time_per_iteration": 3.2253010272979736 }, { "auxiliary_loss_clip": 0.01255966, "auxiliary_loss_mlp": 0.00212916, "balance_loss_clip": 1.04110706, "balance_loss_mlp": 0.18760775, "epoch": 0.8394408537501878, "flos": 14894848448640.0, "grad_norm": 5.782051190822292, "language_loss": 0.76425219, "learning_rate": 2.642934178894405e-07, "loss": 0.77894098, "num_input_tokens_seen": 301228960, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.25317383, "step": 13962, "time_per_iteration": 2.6460254192352295 }, { "auxiliary_loss_clip": 0.01266736, "auxiliary_loss_mlp": 0.00232463, "balance_loss_clip": 1.05093193, "balance_loss_mlp": 0.20879972, "epoch": 0.8395009770028559, "flos": 17412186332160.0, "grad_norm": 71.36521566754654, "language_loss": 0.84126723, "learning_rate": 2.640999582304841e-07, "loss": 0.85625923, "num_input_tokens_seen": 301245875, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.23681641, "step": 13963, "time_per_iteration": 2.6018378734588623 }, { "auxiliary_loss_clip": 0.01263098, "auxiliary_loss_mlp": 0.0023291, "balance_loss_clip": 1.04694939, "balance_loss_mlp": 0.2081501, "epoch": 0.8395611002555238, "flos": 27924747782400.0, "grad_norm": 81.07888821627522, "language_loss": 0.82646412, "learning_rate": 2.6390656439703173e-07, "loss": 0.84142423, "num_input_tokens_seen": 301265550, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.24743652, "step": 13964, "time_per_iteration": 2.7339107990264893 }, { "auxiliary_loss_clip": 0.01296086, "auxiliary_loss_mlp": 0.0024667, "balance_loss_clip": 1.0644325, "balance_loss_mlp": 0.21832225, "epoch": 0.8396212235081918, "flos": 11100922225920.0, "grad_norm": 16.370444358047873, "language_loss": 0.86728007, "learning_rate": 2.637132363964161e-07, "loss": 0.8827076, "num_input_tokens_seen": 301282035, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.28356934, "step": 13965, "time_per_iteration": 2.6139590740203857 }, { "auxiliary_loss_clip": 0.0126589, "auxiliary_loss_mlp": 0.00210391, "balance_loss_clip": 1.04746461, "balance_loss_mlp": 0.18685859, "epoch": 0.8396813467608598, "flos": 35735641729920.0, "grad_norm": 20.853476678434387, "language_loss": 0.73400128, "learning_rate": 2.635199742359684e-07, "loss": 0.74876416, "num_input_tokens_seen": 301305210, "router_z_loss_clip": 2.18652344, "router_z_loss_mlp": 0.23547363, "step": 13966, "time_per_iteration": 2.859145402908325 }, { "auxiliary_loss_clip": 0.01267226, "auxiliary_loss_mlp": 0.00214056, "balance_loss_clip": 1.05027223, "balance_loss_mlp": 0.18921243, "epoch": 0.8397414700135277, "flos": 26176724415360.0, "grad_norm": 3.501532706318258, "language_loss": 0.83554041, "learning_rate": 2.633267779230177e-07, "loss": 0.85035324, "num_input_tokens_seen": 301324885, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.24841309, "step": 13967, "time_per_iteration": 2.7299818992614746 }, { "auxiliary_loss_clip": 0.01270882, "auxiliary_loss_mlp": 0.00215889, "balance_loss_clip": 1.05152917, "balance_loss_mlp": 0.19143942, "epoch": 0.8398015932661957, "flos": 18333116156160.0, "grad_norm": 32.53385813177828, "language_loss": 0.91122562, "learning_rate": 2.6313364746488974e-07, "loss": 0.92609334, "num_input_tokens_seen": 301343070, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.24450684, "step": 13968, "time_per_iteration": 2.7362773418426514 }, { "auxiliary_loss_clip": 0.01280679, "auxiliary_loss_mlp": 0.0024018, "balance_loss_clip": 1.05338097, "balance_loss_mlp": 0.21404958, "epoch": 0.8398617165188637, "flos": 17379507934080.0, "grad_norm": 55.253414839769, "language_loss": 0.85783273, "learning_rate": 2.629405828689075e-07, "loss": 0.87304127, "num_input_tokens_seen": 301359280, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.26159668, "step": 13969, "time_per_iteration": 2.6690313816070557 }, { "auxiliary_loss_clip": 0.01279974, "auxiliary_loss_mlp": 0.00240589, "balance_loss_clip": 1.05761075, "balance_loss_mlp": 0.21414848, "epoch": 0.8399218397715317, "flos": 22929681738240.0, "grad_norm": 69.13581942101045, "language_loss": 0.88020313, "learning_rate": 2.627475841423923e-07, "loss": 0.89540875, "num_input_tokens_seen": 301376465, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26452637, "step": 13970, "time_per_iteration": 2.692054033279419 }, { "auxiliary_loss_clip": 0.01278556, "auxiliary_loss_mlp": 0.00230897, "balance_loss_clip": 1.0579927, "balance_loss_mlp": 0.20675747, "epoch": 0.8399819630241996, "flos": 23149562843520.0, "grad_norm": 65.91319592646211, "language_loss": 0.80723947, "learning_rate": 2.625546512926633e-07, "loss": 0.82233405, "num_input_tokens_seen": 301396000, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.24145508, "step": 13971, "time_per_iteration": 2.7439663410186768 }, { "auxiliary_loss_clip": 0.01272405, "auxiliary_loss_mlp": 0.00210293, "balance_loss_clip": 1.05148578, "balance_loss_mlp": 0.18568808, "epoch": 0.8400420862768676, "flos": 16397423205120.0, "grad_norm": 19.828319703721718, "language_loss": 0.8457849, "learning_rate": 2.623617843270358e-07, "loss": 0.8606118, "num_input_tokens_seen": 301413160, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24609375, "step": 13972, "time_per_iteration": 2.6380109786987305 }, { "auxiliary_loss_clip": 0.01257994, "auxiliary_loss_mlp": 0.00211719, "balance_loss_clip": 1.04328823, "balance_loss_mlp": 0.18840155, "epoch": 0.8401022095295355, "flos": 21287486816640.0, "grad_norm": 25.12813324193505, "language_loss": 0.7385999, "learning_rate": 2.6216898325282333e-07, "loss": 0.75329709, "num_input_tokens_seen": 301433325, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.2331543, "step": 13973, "time_per_iteration": 2.7582948207855225 }, { "auxiliary_loss_clip": 0.0126188, "auxiliary_loss_mlp": 0.00215751, "balance_loss_clip": 1.04536331, "balance_loss_mlp": 0.19181341, "epoch": 0.8401623327822035, "flos": 17311313963520.0, "grad_norm": 24.650643762721263, "language_loss": 0.86998272, "learning_rate": 2.619762480773382e-07, "loss": 0.88475895, "num_input_tokens_seen": 301450265, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.23913574, "step": 13974, "time_per_iteration": 2.632596254348755 }, { "auxiliary_loss_clip": 0.01265336, "auxiliary_loss_mlp": 0.00218124, "balance_loss_clip": 1.04441333, "balance_loss_mlp": 0.19208905, "epoch": 0.8402224560348714, "flos": 22236677665920.0, "grad_norm": 1527.4370311244913, "language_loss": 0.78664309, "learning_rate": 2.617835788078868e-07, "loss": 0.80147767, "num_input_tokens_seen": 301470760, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26037598, "step": 13975, "time_per_iteration": 2.703493118286133 }, { "auxiliary_loss_clip": 0.01282998, "auxiliary_loss_mlp": 0.00221357, "balance_loss_clip": 1.05842292, "balance_loss_mlp": 0.1963827, "epoch": 0.8402825792875395, "flos": 20229953569920.0, "grad_norm": 1.8136460375866832, "language_loss": 0.79276448, "learning_rate": 2.6159097545177645e-07, "loss": 0.80780804, "num_input_tokens_seen": 301489425, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.24975586, "step": 13976, "time_per_iteration": 2.720053195953369 }, { "auxiliary_loss_clip": 0.01268408, "auxiliary_loss_mlp": 0.00215979, "balance_loss_clip": 1.05076194, "balance_loss_mlp": 0.19087306, "epoch": 0.8403427025402074, "flos": 23289973107840.0, "grad_norm": 12.164063095766762, "language_loss": 0.80680668, "learning_rate": 2.61398438016311e-07, "loss": 0.82165056, "num_input_tokens_seen": 301508885, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.25097656, "step": 13977, "time_per_iteration": 2.874589443206787 }, { "auxiliary_loss_clip": 0.01258913, "auxiliary_loss_mlp": 0.00242669, "balance_loss_clip": 1.0417552, "balance_loss_mlp": 0.21571594, "epoch": 0.8404028257928754, "flos": 32675586278400.0, "grad_norm": 4.4853341926233306, "language_loss": 0.75114465, "learning_rate": 2.6120596650879043e-07, "loss": 0.76616049, "num_input_tokens_seen": 301533780, "router_z_loss_clip": 2.17089844, "router_z_loss_mlp": 0.26953125, "step": 13978, "time_per_iteration": 2.8083550930023193 }, { "auxiliary_loss_clip": 0.01250602, "auxiliary_loss_mlp": 0.00209808, "balance_loss_clip": 1.03376949, "balance_loss_mlp": 0.18292651, "epoch": 0.8404629490455434, "flos": 16180522928640.0, "grad_norm": 7.4489054513968735, "language_loss": 0.84363657, "learning_rate": 2.610135609365145e-07, "loss": 0.85824072, "num_input_tokens_seen": 301551775, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.26879883, "step": 13979, "time_per_iteration": 2.663464307785034 }, { "auxiliary_loss_clip": 0.01274324, "auxiliary_loss_mlp": 0.00226796, "balance_loss_clip": 1.0542618, "balance_loss_mlp": 0.20077229, "epoch": 0.8405230722982113, "flos": 15194451790080.0, "grad_norm": 4.101976717903875, "language_loss": 0.88438129, "learning_rate": 2.60821221306778e-07, "loss": 0.89939249, "num_input_tokens_seen": 301570495, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.26000977, "step": 13980, "time_per_iteration": 2.6902098655700684 }, { "auxiliary_loss_clip": 0.0127382, "auxiliary_loss_mlp": 0.00227364, "balance_loss_clip": 1.05822134, "balance_loss_mlp": 0.20385621, "epoch": 0.8405831955508793, "flos": 27812418975360.0, "grad_norm": 1501.9146508019476, "language_loss": 0.92055917, "learning_rate": 2.606289476268757e-07, "loss": 0.93557107, "num_input_tokens_seen": 301591705, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.23522949, "step": 13981, "time_per_iteration": 2.7011451721191406 }, { "auxiliary_loss_clip": 0.01278871, "auxiliary_loss_mlp": 0.0024547, "balance_loss_clip": 1.05939662, "balance_loss_mlp": 0.22144887, "epoch": 0.8406433188035473, "flos": 23769452782080.0, "grad_norm": 21943.214773252053, "language_loss": 0.75856405, "learning_rate": 2.6043673990409745e-07, "loss": 0.77380747, "num_input_tokens_seen": 301611670, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24023438, "step": 13982, "time_per_iteration": 2.72743558883667 }, { "auxiliary_loss_clip": 0.01276725, "auxiliary_loss_mlp": 0.0024094, "balance_loss_clip": 1.05604506, "balance_loss_mlp": 0.21502344, "epoch": 0.8407034420562153, "flos": 29205681667200.0, "grad_norm": 14.056379069046912, "language_loss": 0.76159024, "learning_rate": 2.602445981457324e-07, "loss": 0.7767669, "num_input_tokens_seen": 301632540, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25939941, "step": 13983, "time_per_iteration": 2.71828293800354 }, { "auxiliary_loss_clip": 0.01269153, "auxiliary_loss_mlp": 0.00223229, "balance_loss_clip": 1.04633582, "balance_loss_mlp": 0.19680071, "epoch": 0.8407635653088832, "flos": 26360084367360.0, "grad_norm": 152.6457426716722, "language_loss": 0.87017286, "learning_rate": 2.6005252235906684e-07, "loss": 0.88509667, "num_input_tokens_seen": 301651480, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.26416016, "step": 13984, "time_per_iteration": 2.7417914867401123 }, { "auxiliary_loss_clip": 0.01264244, "auxiliary_loss_mlp": 0.00212303, "balance_loss_clip": 1.04897952, "balance_loss_mlp": 0.18860447, "epoch": 0.8408236885615512, "flos": 21468799693440.0, "grad_norm": 6.592828898921302, "language_loss": 0.68671715, "learning_rate": 2.598605125513842e-07, "loss": 0.70148259, "num_input_tokens_seen": 301670010, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.23718262, "step": 13985, "time_per_iteration": 2.724541664123535 }, { "auxiliary_loss_clip": 0.01258745, "auxiliary_loss_mlp": 0.00232057, "balance_loss_clip": 1.04057014, "balance_loss_mlp": 0.20701112, "epoch": 0.8408838118142191, "flos": 22963724853120.0, "grad_norm": 14.571582443690344, "language_loss": 0.88594091, "learning_rate": 2.5966856872996467e-07, "loss": 0.90084898, "num_input_tokens_seen": 301689785, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.25061035, "step": 13986, "time_per_iteration": 4.095928192138672 }, { "auxiliary_loss_clip": 0.01251404, "auxiliary_loss_mlp": 0.00218152, "balance_loss_clip": 1.03704834, "balance_loss_mlp": 0.19340453, "epoch": 0.8409439350668871, "flos": 26800026145920.0, "grad_norm": 18.759014926764046, "language_loss": 0.74330288, "learning_rate": 2.5947669090208755e-07, "loss": 0.75799847, "num_input_tokens_seen": 301712225, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.24768066, "step": 13987, "time_per_iteration": 2.7979485988616943 }, { "auxiliary_loss_clip": 0.01260644, "auxiliary_loss_mlp": 0.00224605, "balance_loss_clip": 1.04833746, "balance_loss_mlp": 0.20151402, "epoch": 0.841004058319555, "flos": 26578672583040.0, "grad_norm": 8.080809504440852, "language_loss": 0.74491924, "learning_rate": 2.5928487907502906e-07, "loss": 0.7597717, "num_input_tokens_seen": 301730955, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.2310791, "step": 13988, "time_per_iteration": 4.183348178863525 }, { "auxiliary_loss_clip": 0.01285437, "auxiliary_loss_mlp": 0.00210245, "balance_loss_clip": 1.05954361, "balance_loss_mlp": 0.18292238, "epoch": 0.8410641815722231, "flos": 14501878680960.0, "grad_norm": 32.878146827716776, "language_loss": 0.89551914, "learning_rate": 2.590931332560622e-07, "loss": 0.91047597, "num_input_tokens_seen": 301746930, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.2734375, "step": 13989, "time_per_iteration": 2.762453317642212 }, { "auxiliary_loss_clip": 0.01263935, "auxiliary_loss_mlp": 0.00226763, "balance_loss_clip": 1.04351807, "balance_loss_mlp": 0.2014551, "epoch": 0.841124304824891, "flos": 29166682475520.0, "grad_norm": 34.10868208700546, "language_loss": 0.82225299, "learning_rate": 2.5890145345245826e-07, "loss": 0.83715993, "num_input_tokens_seen": 301766945, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.2532959, "step": 13990, "time_per_iteration": 2.8175392150878906 }, { "auxiliary_loss_clip": 0.01261456, "auxiliary_loss_mlp": 0.00225204, "balance_loss_clip": 1.04837775, "balance_loss_mlp": 0.19962144, "epoch": 0.841184428077559, "flos": 22412028885120.0, "grad_norm": 13.224706549165322, "language_loss": 0.86726564, "learning_rate": 2.5870983967148597e-07, "loss": 0.88213223, "num_input_tokens_seen": 301785460, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.2557373, "step": 13991, "time_per_iteration": 2.738900661468506 }, { "auxiliary_loss_clip": 0.01268327, "auxiliary_loss_mlp": 0.0022452, "balance_loss_clip": 1.04612124, "balance_loss_mlp": 0.19952157, "epoch": 0.841244551330227, "flos": 22962791099520.0, "grad_norm": 2.484890188185025, "language_loss": 0.79861104, "learning_rate": 2.585182919204105e-07, "loss": 0.81353951, "num_input_tokens_seen": 301804180, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.24987793, "step": 13992, "time_per_iteration": 5.041700839996338 }, { "auxiliary_loss_clip": 0.01267776, "auxiliary_loss_mlp": 0.0022857, "balance_loss_clip": 1.05032814, "balance_loss_mlp": 0.2033449, "epoch": 0.8413046745828949, "flos": 21032736583680.0, "grad_norm": 42.25221563334634, "language_loss": 0.8457284, "learning_rate": 2.583268102064959e-07, "loss": 0.86069191, "num_input_tokens_seen": 301823670, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.25256348, "step": 13993, "time_per_iteration": 2.719926118850708 }, { "auxiliary_loss_clip": 0.01291427, "auxiliary_loss_mlp": 0.00256216, "balance_loss_clip": 1.06071115, "balance_loss_mlp": 0.22858283, "epoch": 0.841364797835563, "flos": 27052082858880.0, "grad_norm": 6.302978340984394, "language_loss": 0.81181192, "learning_rate": 2.5813539453700393e-07, "loss": 0.82728839, "num_input_tokens_seen": 301845890, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.27636719, "step": 13994, "time_per_iteration": 2.7070443630218506 }, { "auxiliary_loss_clip": 0.01258157, "auxiliary_loss_mlp": 0.00218, "balance_loss_clip": 1.04442573, "balance_loss_mlp": 0.19365761, "epoch": 0.8414249210882309, "flos": 17895688329600.0, "grad_norm": 71.22266275041795, "language_loss": 0.6729182, "learning_rate": 2.5794404491919163e-07, "loss": 0.68767983, "num_input_tokens_seen": 301863985, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24365234, "step": 13995, "time_per_iteration": 2.670283079147339 }, { "auxiliary_loss_clip": 0.01272776, "auxiliary_loss_mlp": 0.00211611, "balance_loss_clip": 1.04984009, "balance_loss_mlp": 0.18580171, "epoch": 0.8414850443408989, "flos": 25441201618560.0, "grad_norm": 37.216884591099664, "language_loss": 0.78422993, "learning_rate": 2.577527613603163e-07, "loss": 0.79907382, "num_input_tokens_seen": 301882765, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25817871, "step": 13996, "time_per_iteration": 4.200998783111572 }, { "auxiliary_loss_clip": 0.01246438, "auxiliary_loss_mlp": 0.00209083, "balance_loss_clip": 1.03330624, "balance_loss_mlp": 0.18596821, "epoch": 0.8415451675935668, "flos": 23220055284480.0, "grad_norm": 5.72831959240996, "language_loss": 0.70877647, "learning_rate": 2.5756154386763017e-07, "loss": 0.72333169, "num_input_tokens_seen": 301902720, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.2310791, "step": 13997, "time_per_iteration": 2.6707754135131836 }, { "auxiliary_loss_clip": 0.01288686, "auxiliary_loss_mlp": 0.00231592, "balance_loss_clip": 1.05911422, "balance_loss_mlp": 0.20554501, "epoch": 0.8416052908462348, "flos": 18546496899840.0, "grad_norm": 25.318301090682894, "language_loss": 0.90128314, "learning_rate": 2.5737039244838565e-07, "loss": 0.91648591, "num_input_tokens_seen": 301921245, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.26037598, "step": 13998, "time_per_iteration": 2.733452558517456 }, { "auxiliary_loss_clip": 0.01252315, "auxiliary_loss_mlp": 0.0021404, "balance_loss_clip": 1.03460789, "balance_loss_mlp": 0.18758713, "epoch": 0.8416654140989027, "flos": 26105190480000.0, "grad_norm": 23.55003162153108, "language_loss": 0.88497126, "learning_rate": 2.5717930710982984e-07, "loss": 0.89963484, "num_input_tokens_seen": 301942320, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.2644043, "step": 13999, "time_per_iteration": 2.7387781143188477 }, { "auxiliary_loss_clip": 0.01271326, "auxiliary_loss_mlp": 0.00222276, "balance_loss_clip": 1.04920459, "balance_loss_mlp": 0.1974805, "epoch": 0.8417255373515707, "flos": 26433270328320.0, "grad_norm": 237.1274672528101, "language_loss": 0.78272688, "learning_rate": 2.569882878592096e-07, "loss": 0.79766291, "num_input_tokens_seen": 301963110, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.24816895, "step": 14000, "time_per_iteration": 2.764411211013794 }, { "auxiliary_loss_clip": 0.01266572, "auxiliary_loss_mlp": 0.00194331, "balance_loss_clip": 1.0473485, "balance_loss_mlp": 0.16883188, "epoch": 0.8417856606042387, "flos": 24717745791360.0, "grad_norm": 63.75137308499209, "language_loss": 0.86843073, "learning_rate": 2.5679733470376885e-07, "loss": 0.88303971, "num_input_tokens_seen": 301984915, "router_z_loss_clip": 2.19628906, "router_z_loss_mlp": 0.25488281, "step": 14001, "time_per_iteration": 2.7190444469451904 }, { "auxiliary_loss_clip": 0.0125592, "auxiliary_loss_mlp": 0.0026046, "balance_loss_clip": 1.03594685, "balance_loss_mlp": 0.23362592, "epoch": 0.8418457838569067, "flos": 20850849089280.0, "grad_norm": 10.680479492459558, "language_loss": 0.87011051, "learning_rate": 2.5660644765074703e-07, "loss": 0.88527429, "num_input_tokens_seen": 302004095, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26831055, "step": 14002, "time_per_iteration": 2.724395275115967 }, { "auxiliary_loss_clip": 0.01263291, "auxiliary_loss_mlp": 0.00225845, "balance_loss_clip": 1.04410505, "balance_loss_mlp": 0.20039354, "epoch": 0.8419059071095746, "flos": 28660629715200.0, "grad_norm": 35.072387256192116, "language_loss": 0.84267151, "learning_rate": 2.5641562670738334e-07, "loss": 0.8575629, "num_input_tokens_seen": 302027250, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.2545166, "step": 14003, "time_per_iteration": 2.794793128967285 }, { "auxiliary_loss_clip": 0.01272869, "auxiliary_loss_mlp": 0.00228872, "balance_loss_clip": 1.05268145, "balance_loss_mlp": 0.20235974, "epoch": 0.8419660303622426, "flos": 21653596189440.0, "grad_norm": 115.46181395103311, "language_loss": 0.73497665, "learning_rate": 2.5622487188091436e-07, "loss": 0.74999404, "num_input_tokens_seen": 302046950, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26501465, "step": 14004, "time_per_iteration": 2.7022056579589844 }, { "auxiliary_loss_clip": 0.01272238, "auxiliary_loss_mlp": 0.00230877, "balance_loss_clip": 1.052984, "balance_loss_mlp": 0.20381683, "epoch": 0.8420261536149106, "flos": 25301114576640.0, "grad_norm": 32.40092808493116, "language_loss": 0.84520221, "learning_rate": 2.560341831785724e-07, "loss": 0.86023343, "num_input_tokens_seen": 302065470, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.27026367, "step": 14005, "time_per_iteration": 2.7497127056121826 }, { "auxiliary_loss_clip": 0.01274589, "auxiliary_loss_mlp": 0.00225895, "balance_loss_clip": 1.0496254, "balance_loss_mlp": 0.19933489, "epoch": 0.8420862768675785, "flos": 18763397176320.0, "grad_norm": 7.164986285210702, "language_loss": 0.86648333, "learning_rate": 2.5584356060758906e-07, "loss": 0.8814882, "num_input_tokens_seen": 302083190, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.265625, "step": 14006, "time_per_iteration": 2.6687023639678955 }, { "auxiliary_loss_clip": 0.01270553, "auxiliary_loss_mlp": 0.00223435, "balance_loss_clip": 1.0454762, "balance_loss_mlp": 0.19813836, "epoch": 0.8421464001202466, "flos": 18328052338560.0, "grad_norm": 3.9317029902522704, "language_loss": 0.8465097, "learning_rate": 2.556530041751932e-07, "loss": 0.86144954, "num_input_tokens_seen": 302098820, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.25256348, "step": 14007, "time_per_iteration": 2.7602672576904297 }, { "auxiliary_loss_clip": 0.01265337, "auxiliary_loss_mlp": 0.00210624, "balance_loss_clip": 1.04715014, "balance_loss_mlp": 0.18694906, "epoch": 0.8422065233729145, "flos": 31537181560320.0, "grad_norm": 54.68118402480354, "language_loss": 0.71705461, "learning_rate": 2.554625138886102e-07, "loss": 0.73181415, "num_input_tokens_seen": 302117075, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.23681641, "step": 14008, "time_per_iteration": 2.765855550765991 }, { "auxiliary_loss_clip": 0.0116503, "auxiliary_loss_mlp": 0.00221126, "balance_loss_clip": 1.02045262, "balance_loss_mlp": 0.21244717, "epoch": 0.8422666466255825, "flos": 64298128510080.0, "grad_norm": 0.6956672199384136, "language_loss": 0.56064594, "learning_rate": 2.552720897550631e-07, "loss": 0.57450747, "num_input_tokens_seen": 302179735, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.08691406, "step": 14009, "time_per_iteration": 3.2657644748687744 }, { "auxiliary_loss_clip": 0.01255092, "auxiliary_loss_mlp": 0.00228425, "balance_loss_clip": 1.04145586, "balance_loss_mlp": 0.20444019, "epoch": 0.8423267698782504, "flos": 24316731377280.0, "grad_norm": 3.7245767969624217, "language_loss": 0.83635223, "learning_rate": 2.5508173178177304e-07, "loss": 0.85118735, "num_input_tokens_seen": 302202055, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.23974609, "step": 14010, "time_per_iteration": 2.847505569458008 }, { "auxiliary_loss_clip": 0.012913, "auxiliary_loss_mlp": 0.002161, "balance_loss_clip": 1.06277239, "balance_loss_mlp": 0.18799087, "epoch": 0.8423868931309184, "flos": 18296092212480.0, "grad_norm": 6.768011939093078, "language_loss": 0.81054318, "learning_rate": 2.548914399759592e-07, "loss": 0.82561719, "num_input_tokens_seen": 302221360, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.28100586, "step": 14011, "time_per_iteration": 2.71439528465271 }, { "auxiliary_loss_clip": 0.01280825, "auxiliary_loss_mlp": 0.00240754, "balance_loss_clip": 1.05578852, "balance_loss_mlp": 0.21591097, "epoch": 0.8424470163835863, "flos": 23550218121600.0, "grad_norm": 25.379258480558768, "language_loss": 0.92963314, "learning_rate": 2.5470121434483636e-07, "loss": 0.9448489, "num_input_tokens_seen": 302240715, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.24841309, "step": 14012, "time_per_iteration": 2.741568088531494 }, { "auxiliary_loss_clip": 0.01247332, "auxiliary_loss_mlp": 0.0021036, "balance_loss_clip": 1.03948021, "balance_loss_mlp": 0.18704297, "epoch": 0.8425071396362543, "flos": 23769488695680.0, "grad_norm": 44.77893567290176, "language_loss": 0.74933469, "learning_rate": 2.5451105489561884e-07, "loss": 0.7639116, "num_input_tokens_seen": 302260950, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.23339844, "step": 14013, "time_per_iteration": 2.783193349838257 }, { "auxiliary_loss_clip": 0.0129298, "auxiliary_loss_mlp": 0.00242759, "balance_loss_clip": 1.05698299, "balance_loss_mlp": 0.21574655, "epoch": 0.8425672628889223, "flos": 16178906816640.0, "grad_norm": 129.15117056879146, "language_loss": 0.88692003, "learning_rate": 2.5432096163551644e-07, "loss": 0.90227747, "num_input_tokens_seen": 302277500, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.27050781, "step": 14014, "time_per_iteration": 2.7176311016082764 }, { "auxiliary_loss_clip": 0.01283512, "auxiliary_loss_mlp": 0.00211696, "balance_loss_clip": 1.06075275, "balance_loss_mlp": 0.18629217, "epoch": 0.8426273861415903, "flos": 23149131880320.0, "grad_norm": 21.79858222166037, "language_loss": 0.73534817, "learning_rate": 2.5413093457173884e-07, "loss": 0.75030029, "num_input_tokens_seen": 302297930, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25415039, "step": 14015, "time_per_iteration": 2.696579694747925 }, { "auxiliary_loss_clip": 0.01278098, "auxiliary_loss_mlp": 0.00194859, "balance_loss_clip": 1.05542433, "balance_loss_mlp": 0.17026588, "epoch": 0.8426875093942582, "flos": 17457757712640.0, "grad_norm": 145.3584906614496, "language_loss": 0.85114706, "learning_rate": 2.5394097371149036e-07, "loss": 0.86587662, "num_input_tokens_seen": 302315735, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.24597168, "step": 14016, "time_per_iteration": 2.6907215118408203 }, { "auxiliary_loss_clip": 0.01269797, "auxiliary_loss_mlp": 0.00248541, "balance_loss_clip": 1.04662323, "balance_loss_mlp": 0.2210747, "epoch": 0.8427476326469262, "flos": 19640551299840.0, "grad_norm": 6.079571490263519, "language_loss": 0.86336768, "learning_rate": 2.5375107906197544e-07, "loss": 0.87855107, "num_input_tokens_seen": 302332790, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.27490234, "step": 14017, "time_per_iteration": 2.659961700439453 }, { "auxiliary_loss_clip": 0.01248289, "auxiliary_loss_mlp": 0.00247034, "balance_loss_clip": 1.03124976, "balance_loss_mlp": 0.2221193, "epoch": 0.8428077558995941, "flos": 11941160146560.0, "grad_norm": 15.22548712496516, "language_loss": 0.70648134, "learning_rate": 2.5356125063039525e-07, "loss": 0.72143459, "num_input_tokens_seen": 302346490, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24914551, "step": 14018, "time_per_iteration": 2.6481215953826904 }, { "auxiliary_loss_clip": 0.01253069, "auxiliary_loss_mlp": 0.0023965, "balance_loss_clip": 1.03636205, "balance_loss_mlp": 0.21638064, "epoch": 0.8428678791522621, "flos": 10451729767680.0, "grad_norm": 16.097535745195668, "language_loss": 0.87098992, "learning_rate": 2.5337148842394687e-07, "loss": 0.88591707, "num_input_tokens_seen": 302363235, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23291016, "step": 14019, "time_per_iteration": 2.694059371948242 }, { "auxiliary_loss_clip": 0.01261656, "auxiliary_loss_mlp": 0.0020344, "balance_loss_clip": 1.03983688, "balance_loss_mlp": 0.17621225, "epoch": 0.8429280024049302, "flos": 28767248259840.0, "grad_norm": 53.53796164333151, "language_loss": 0.87307113, "learning_rate": 2.531817924498265e-07, "loss": 0.88772213, "num_input_tokens_seen": 302383270, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.27233887, "step": 14020, "time_per_iteration": 2.73976469039917 }, { "auxiliary_loss_clip": 0.01258832, "auxiliary_loss_mlp": 0.00214249, "balance_loss_clip": 1.04087949, "balance_loss_mlp": 0.18971595, "epoch": 0.8429881256575981, "flos": 19537093152000.0, "grad_norm": 32.55733787938635, "language_loss": 0.80703974, "learning_rate": 2.5299216271522805e-07, "loss": 0.82177055, "num_input_tokens_seen": 302401355, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24560547, "step": 14021, "time_per_iteration": 2.725313186645508 }, { "auxiliary_loss_clip": 0.01271076, "auxiliary_loss_mlp": 0.00231743, "balance_loss_clip": 1.05303645, "balance_loss_mlp": 0.20617214, "epoch": 0.8430482489102661, "flos": 24790931752320.0, "grad_norm": 5.8127512413606395, "language_loss": 0.76984274, "learning_rate": 2.5280259922734125e-07, "loss": 0.78487092, "num_input_tokens_seen": 302419515, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.2557373, "step": 14022, "time_per_iteration": 3.1866438388824463 }, { "auxiliary_loss_clip": 0.01285278, "auxiliary_loss_mlp": 0.00240323, "balance_loss_clip": 1.05468154, "balance_loss_mlp": 0.2121893, "epoch": 0.843108372162934, "flos": 21544248211200.0, "grad_norm": 6.932843184152693, "language_loss": 0.80988622, "learning_rate": 2.526131019933553e-07, "loss": 0.8251422, "num_input_tokens_seen": 302438280, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.28125, "step": 14023, "time_per_iteration": 2.7496917247772217 }, { "auxiliary_loss_clip": 0.01265786, "auxiliary_loss_mlp": 0.00239138, "balance_loss_clip": 1.04953289, "balance_loss_mlp": 0.21398523, "epoch": 0.843168495415602, "flos": 24608792862720.0, "grad_norm": 8.93944562162873, "language_loss": 0.72238129, "learning_rate": 2.524236710204559e-07, "loss": 0.73743057, "num_input_tokens_seen": 302460860, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25170898, "step": 14024, "time_per_iteration": 2.7442147731781006 }, { "auxiliary_loss_clip": 0.01270138, "auxiliary_loss_mlp": 0.00222973, "balance_loss_clip": 1.04770613, "balance_loss_mlp": 0.1965677, "epoch": 0.8432286186682699, "flos": 15122738286720.0, "grad_norm": 3.1497919157267638, "language_loss": 0.8812058, "learning_rate": 2.522343063158261e-07, "loss": 0.89613682, "num_input_tokens_seen": 302476980, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.2644043, "step": 14025, "time_per_iteration": 2.7435178756713867 }, { "auxiliary_loss_clip": 0.0124802, "auxiliary_loss_mlp": 0.00195565, "balance_loss_clip": 1.03614008, "balance_loss_mlp": 0.17292677, "epoch": 0.843288741920938, "flos": 20301882554880.0, "grad_norm": 211.0821485460889, "language_loss": 0.83575284, "learning_rate": 2.5204500788664606e-07, "loss": 0.85018873, "num_input_tokens_seen": 302496380, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.22644043, "step": 14026, "time_per_iteration": 2.7158002853393555 }, { "auxiliary_loss_clip": 0.01262236, "auxiliary_loss_mlp": 0.00213162, "balance_loss_clip": 1.04261017, "balance_loss_mlp": 0.18929663, "epoch": 0.8433488651736059, "flos": 23332096782720.0, "grad_norm": 5.493664511309784, "language_loss": 0.88440001, "learning_rate": 2.518557757400945e-07, "loss": 0.89915395, "num_input_tokens_seen": 302516845, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.23852539, "step": 14027, "time_per_iteration": 2.7522008419036865 }, { "auxiliary_loss_clip": 0.01261958, "auxiliary_loss_mlp": 0.00221306, "balance_loss_clip": 1.04087305, "balance_loss_mlp": 0.19583064, "epoch": 0.8434089884262739, "flos": 39458105844480.0, "grad_norm": 6.248941638744556, "language_loss": 0.65307248, "learning_rate": 2.5166660988334754e-07, "loss": 0.66790509, "num_input_tokens_seen": 302538865, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25476074, "step": 14028, "time_per_iteration": 4.248548984527588 }, { "auxiliary_loss_clip": 0.0127096, "auxiliary_loss_mlp": 0.00233874, "balance_loss_clip": 1.05176139, "balance_loss_mlp": 0.2080178, "epoch": 0.8434691116789418, "flos": 23768842250880.0, "grad_norm": 15.446024729894063, "language_loss": 0.72728854, "learning_rate": 2.51477510323578e-07, "loss": 0.74233687, "num_input_tokens_seen": 302557970, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25842285, "step": 14029, "time_per_iteration": 2.703129529953003 }, { "auxiliary_loss_clip": 0.01263678, "auxiliary_loss_mlp": 0.00236292, "balance_loss_clip": 1.04966879, "balance_loss_mlp": 0.21335599, "epoch": 0.8435292349316098, "flos": 22671411972480.0, "grad_norm": 2.062747530672219, "language_loss": 0.80962425, "learning_rate": 2.51288477067956e-07, "loss": 0.82462394, "num_input_tokens_seen": 302578915, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.22949219, "step": 14030, "time_per_iteration": 4.138928413391113 }, { "auxiliary_loss_clip": 0.01255644, "auxiliary_loss_mlp": 0.00235156, "balance_loss_clip": 1.04119682, "balance_loss_mlp": 0.2116721, "epoch": 0.8435893581842777, "flos": 18843622202880.0, "grad_norm": 5.4298796130119875, "language_loss": 0.89683944, "learning_rate": 2.510995101236502e-07, "loss": 0.9117474, "num_input_tokens_seen": 302596300, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.23498535, "step": 14031, "time_per_iteration": 2.7077152729034424 }, { "auxiliary_loss_clip": 0.0127432, "auxiliary_loss_mlp": 0.00227373, "balance_loss_clip": 1.04908073, "balance_loss_mlp": 0.20279205, "epoch": 0.8436494814369457, "flos": 20704225772160.0, "grad_norm": 474.6650455922936, "language_loss": 0.91162241, "learning_rate": 2.509106094978266e-07, "loss": 0.92663932, "num_input_tokens_seen": 302614975, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.24597168, "step": 14032, "time_per_iteration": 2.74483060836792 }, { "auxiliary_loss_clip": 0.01267845, "auxiliary_loss_mlp": 0.0022677, "balance_loss_clip": 1.04891682, "balance_loss_mlp": 0.19999588, "epoch": 0.8437096046896138, "flos": 22674177319680.0, "grad_norm": 3.0641845685395266, "language_loss": 0.82002974, "learning_rate": 2.507217751976478e-07, "loss": 0.83497584, "num_input_tokens_seen": 302636415, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.26757812, "step": 14033, "time_per_iteration": 2.7558698654174805 }, { "auxiliary_loss_clip": 0.01251235, "auxiliary_loss_mlp": 0.00229682, "balance_loss_clip": 1.03886509, "balance_loss_mlp": 0.20496954, "epoch": 0.8437697279422817, "flos": 16180127879040.0, "grad_norm": 6.410283049050331, "language_loss": 0.91078633, "learning_rate": 2.505330072302743e-07, "loss": 0.92559552, "num_input_tokens_seen": 302653605, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.24719238, "step": 14034, "time_per_iteration": 2.6484436988830566 }, { "auxiliary_loss_clip": 0.01269006, "auxiliary_loss_mlp": 0.00221539, "balance_loss_clip": 1.04422498, "balance_loss_mlp": 0.19615975, "epoch": 0.8438298511949497, "flos": 28765847629440.0, "grad_norm": 20.125297607174176, "language_loss": 0.85310841, "learning_rate": 2.503443056028656e-07, "loss": 0.86801392, "num_input_tokens_seen": 302673965, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.25378418, "step": 14035, "time_per_iteration": 4.292035818099976 }, { "auxiliary_loss_clip": 0.01267692, "auxiliary_loss_mlp": 0.00244856, "balance_loss_clip": 1.04692876, "balance_loss_mlp": 0.22007269, "epoch": 0.8438899744476176, "flos": 33724284779520.0, "grad_norm": 3.8938413731683164, "language_loss": 0.78328383, "learning_rate": 2.501556703225751e-07, "loss": 0.79840934, "num_input_tokens_seen": 302695560, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.2479248, "step": 14036, "time_per_iteration": 2.8351244926452637 }, { "auxiliary_loss_clip": 0.01235348, "auxiliary_loss_mlp": 0.00208411, "balance_loss_clip": 1.02818418, "balance_loss_mlp": 0.18688136, "epoch": 0.8439500977002856, "flos": 25110787386240.0, "grad_norm": 24.710618631353306, "language_loss": 0.75454032, "learning_rate": 2.49967101396557e-07, "loss": 0.76897788, "num_input_tokens_seen": 302713480, "router_z_loss_clip": 2.07128906, "router_z_loss_mlp": 0.21533203, "step": 14037, "time_per_iteration": 2.823664426803589 }, { "auxiliary_loss_clip": 0.01253043, "auxiliary_loss_mlp": 0.0023648, "balance_loss_clip": 1.03765678, "balance_loss_mlp": 0.21233991, "epoch": 0.8440102209529535, "flos": 32850362880000.0, "grad_norm": 5.309624110040808, "language_loss": 0.75874865, "learning_rate": 2.4977859883196227e-07, "loss": 0.77364391, "num_input_tokens_seen": 302736860, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24145508, "step": 14038, "time_per_iteration": 4.3892295360565186 }, { "auxiliary_loss_clip": 0.01261415, "auxiliary_loss_mlp": 0.00217334, "balance_loss_clip": 1.04561388, "balance_loss_mlp": 0.19395688, "epoch": 0.8440703442056215, "flos": 23730202195200.0, "grad_norm": 23.784884832634344, "language_loss": 0.81205165, "learning_rate": 2.49590162635938e-07, "loss": 0.82683909, "num_input_tokens_seen": 302757745, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.23376465, "step": 14039, "time_per_iteration": 2.777498483657837 }, { "auxiliary_loss_clip": 0.01267261, "auxiliary_loss_mlp": 0.00238907, "balance_loss_clip": 1.04274011, "balance_loss_mlp": 0.21237113, "epoch": 0.8441304674582895, "flos": 20193719725440.0, "grad_norm": 78.13862815315439, "language_loss": 0.88687563, "learning_rate": 2.4940179281563046e-07, "loss": 0.90193737, "num_input_tokens_seen": 302774885, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26513672, "step": 14040, "time_per_iteration": 2.7399117946624756 }, { "auxiliary_loss_clip": 0.01274281, "auxiliary_loss_mlp": 0.0025049, "balance_loss_clip": 1.05075121, "balance_loss_mlp": 0.22385831, "epoch": 0.8441905907109575, "flos": 20219897761920.0, "grad_norm": 6.2609120064525285, "language_loss": 0.78004873, "learning_rate": 2.492134893781821e-07, "loss": 0.79529643, "num_input_tokens_seen": 302791035, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.26647949, "step": 14041, "time_per_iteration": 2.698157548904419 }, { "auxiliary_loss_clip": 0.01267118, "auxiliary_loss_mlp": 0.00241382, "balance_loss_clip": 1.04532278, "balance_loss_mlp": 0.21808863, "epoch": 0.8442507139636254, "flos": 13516453987200.0, "grad_norm": 21.477807277810186, "language_loss": 0.79660845, "learning_rate": 2.490252523307341e-07, "loss": 0.81169343, "num_input_tokens_seen": 302808650, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.23278809, "step": 14042, "time_per_iteration": 2.699533224105835 }, { "auxiliary_loss_clip": 0.01264277, "auxiliary_loss_mlp": 0.00211476, "balance_loss_clip": 1.04449964, "balance_loss_mlp": 0.18650174, "epoch": 0.8443108372162934, "flos": 18220212731520.0, "grad_norm": 47.14073108112217, "language_loss": 0.84691018, "learning_rate": 2.4883708168042373e-07, "loss": 0.86166769, "num_input_tokens_seen": 302824605, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24975586, "step": 14043, "time_per_iteration": 2.6643974781036377 }, { "auxiliary_loss_clip": 0.01247684, "auxiliary_loss_mlp": 0.00242272, "balance_loss_clip": 1.03593874, "balance_loss_mlp": 0.21896631, "epoch": 0.8443709604689613, "flos": 16105110324480.0, "grad_norm": 5.407239718842649, "language_loss": 0.80905783, "learning_rate": 2.486489774343865e-07, "loss": 0.82395738, "num_input_tokens_seen": 302840170, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.2331543, "step": 14044, "time_per_iteration": 2.6336328983306885 }, { "auxiliary_loss_clip": 0.01241704, "auxiliary_loss_mlp": 0.00228307, "balance_loss_clip": 1.02856016, "balance_loss_mlp": 0.20498976, "epoch": 0.8444310837216293, "flos": 18512130562560.0, "grad_norm": 9.393127127822908, "language_loss": 0.80821854, "learning_rate": 2.484609395997559e-07, "loss": 0.82291865, "num_input_tokens_seen": 302858320, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.23327637, "step": 14045, "time_per_iteration": 2.6563990116119385 }, { "auxiliary_loss_clip": 0.01252322, "auxiliary_loss_mlp": 0.00226113, "balance_loss_clip": 1.03426003, "balance_loss_mlp": 0.2003991, "epoch": 0.8444912069742974, "flos": 14939845211520.0, "grad_norm": 10.761057257524433, "language_loss": 0.85689843, "learning_rate": 2.4827296818366216e-07, "loss": 0.87168276, "num_input_tokens_seen": 302875255, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25732422, "step": 14046, "time_per_iteration": 2.665637969970703 }, { "auxiliary_loss_clip": 0.01266792, "auxiliary_loss_mlp": 0.00224128, "balance_loss_clip": 1.04430485, "balance_loss_mlp": 0.19861752, "epoch": 0.8445513302269653, "flos": 20120318282880.0, "grad_norm": 28.55373748882652, "language_loss": 0.85852003, "learning_rate": 2.4808506319323255e-07, "loss": 0.87342924, "num_input_tokens_seen": 302894690, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25512695, "step": 14047, "time_per_iteration": 2.659726858139038 }, { "auxiliary_loss_clip": 0.01271117, "auxiliary_loss_mlp": 0.00214199, "balance_loss_clip": 1.05016792, "balance_loss_mlp": 0.19096512, "epoch": 0.8446114534796333, "flos": 31170928533120.0, "grad_norm": 16.16873100854314, "language_loss": 0.78338915, "learning_rate": 2.478972246355935e-07, "loss": 0.79824233, "num_input_tokens_seen": 302912405, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.23254395, "step": 14048, "time_per_iteration": 2.756404161453247 }, { "auxiliary_loss_clip": 0.01264712, "auxiliary_loss_mlp": 0.00212213, "balance_loss_clip": 1.04532516, "balance_loss_mlp": 0.18730995, "epoch": 0.8446715767323012, "flos": 23948323534080.0, "grad_norm": 112.54317219372383, "language_loss": 0.81598675, "learning_rate": 2.477094525178667e-07, "loss": 0.83075595, "num_input_tokens_seen": 302932525, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24902344, "step": 14049, "time_per_iteration": 2.697051763534546 }, { "auxiliary_loss_clip": 0.01147784, "auxiliary_loss_mlp": 0.00221709, "balance_loss_clip": 1.00206423, "balance_loss_mlp": 0.21102758, "epoch": 0.8447316999849692, "flos": 67984897484160.0, "grad_norm": 1.4283108240590596, "language_loss": 0.5982374, "learning_rate": 2.475217468471729e-07, "loss": 0.61193234, "num_input_tokens_seen": 302991285, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.10693359, "step": 14050, "time_per_iteration": 3.156367540359497 }, { "auxiliary_loss_clip": 0.01261348, "auxiliary_loss_mlp": 0.00243198, "balance_loss_clip": 1.03849828, "balance_loss_mlp": 0.21893871, "epoch": 0.8447918232376371, "flos": 22418924296320.0, "grad_norm": 3.599125665390115, "language_loss": 0.81381518, "learning_rate": 2.473341076306303e-07, "loss": 0.82886064, "num_input_tokens_seen": 303009515, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.24243164, "step": 14051, "time_per_iteration": 2.705198287963867 }, { "auxiliary_loss_clip": 0.01261475, "auxiliary_loss_mlp": 0.00245951, "balance_loss_clip": 1.04097402, "balance_loss_mlp": 0.21945064, "epoch": 0.8448519464903052, "flos": 23694147918720.0, "grad_norm": 13.037807389236216, "language_loss": 0.82608008, "learning_rate": 2.471465348753547e-07, "loss": 0.84115434, "num_input_tokens_seen": 303026905, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26477051, "step": 14052, "time_per_iteration": 2.7819175720214844 }, { "auxiliary_loss_clip": 0.01247202, "auxiliary_loss_mlp": 0.002333, "balance_loss_clip": 1.03445256, "balance_loss_mlp": 0.20983969, "epoch": 0.8449120697429731, "flos": 13735904129280.0, "grad_norm": 673.0970583227521, "language_loss": 0.81142759, "learning_rate": 2.469590285884575e-07, "loss": 0.82623261, "num_input_tokens_seen": 303045245, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.23449707, "step": 14053, "time_per_iteration": 2.6766607761383057 }, { "auxiliary_loss_clip": 0.01255827, "auxiliary_loss_mlp": 0.00236165, "balance_loss_clip": 1.03815079, "balance_loss_mlp": 0.21090417, "epoch": 0.8449721929956411, "flos": 20886795624960.0, "grad_norm": 20.546676823314833, "language_loss": 0.8097654, "learning_rate": 2.467715887770494e-07, "loss": 0.82468534, "num_input_tokens_seen": 303065205, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.25256348, "step": 14054, "time_per_iteration": 2.6964504718780518 }, { "auxiliary_loss_clip": 0.01260598, "auxiliary_loss_mlp": 0.00228661, "balance_loss_clip": 1.03999007, "balance_loss_mlp": 0.20386508, "epoch": 0.845032316248309, "flos": 33216939129600.0, "grad_norm": 9.96260890670144, "language_loss": 0.84211373, "learning_rate": 2.4658421544823895e-07, "loss": 0.85700631, "num_input_tokens_seen": 303088250, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24804688, "step": 14055, "time_per_iteration": 2.803569793701172 }, { "auxiliary_loss_clip": 0.01247686, "auxiliary_loss_mlp": 0.00207616, "balance_loss_clip": 1.03410661, "balance_loss_mlp": 0.18527614, "epoch": 0.845092439500977, "flos": 23585230903680.0, "grad_norm": 12.517435762866613, "language_loss": 0.7859987, "learning_rate": 2.463969086091302e-07, "loss": 0.80055165, "num_input_tokens_seen": 303109280, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.22338867, "step": 14056, "time_per_iteration": 2.7259232997894287 }, { "auxiliary_loss_clip": 0.01262052, "auxiliary_loss_mlp": 0.00250526, "balance_loss_clip": 1.03932977, "balance_loss_mlp": 0.22439513, "epoch": 0.8451525627536449, "flos": 13333920048000.0, "grad_norm": 7.866318057504293, "language_loss": 0.79103327, "learning_rate": 2.4620966826682686e-07, "loss": 0.80615902, "num_input_tokens_seen": 303126075, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.26123047, "step": 14057, "time_per_iteration": 2.697777271270752 }, { "auxiliary_loss_clip": 0.01264352, "auxiliary_loss_mlp": 0.00230871, "balance_loss_clip": 1.0437963, "balance_loss_mlp": 0.2054916, "epoch": 0.8452126860063129, "flos": 27817985583360.0, "grad_norm": 23.358679683453353, "language_loss": 0.84614348, "learning_rate": 2.460224944284284e-07, "loss": 0.86109573, "num_input_tokens_seen": 303146920, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25415039, "step": 14058, "time_per_iteration": 2.702638864517212 }, { "auxiliary_loss_clip": 0.01257869, "auxiliary_loss_mlp": 0.00231224, "balance_loss_clip": 1.04061759, "balance_loss_mlp": 0.20832399, "epoch": 0.845272809258981, "flos": 27124694202240.0, "grad_norm": 122.40228373534026, "language_loss": 0.75649643, "learning_rate": 2.45835387101033e-07, "loss": 0.77138734, "num_input_tokens_seen": 303167885, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.22912598, "step": 14059, "time_per_iteration": 2.718773365020752 }, { "auxiliary_loss_clip": 0.0129206, "auxiliary_loss_mlp": 0.0025045, "balance_loss_clip": 1.05706143, "balance_loss_mlp": 0.21913409, "epoch": 0.8453329325116489, "flos": 18332577452160.0, "grad_norm": 213.13374059988325, "language_loss": 0.69253707, "learning_rate": 2.4564834629173516e-07, "loss": 0.70796216, "num_input_tokens_seen": 303185000, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.31335449, "step": 14060, "time_per_iteration": 2.663581609725952 }, { "auxiliary_loss_clip": 0.01259691, "auxiliary_loss_mlp": 0.00256876, "balance_loss_clip": 1.03801107, "balance_loss_mlp": 0.22803971, "epoch": 0.8453930557643169, "flos": 22675254727680.0, "grad_norm": 9.282157571589982, "language_loss": 0.84858519, "learning_rate": 2.454613720076277e-07, "loss": 0.86375087, "num_input_tokens_seen": 303205210, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.28833008, "step": 14061, "time_per_iteration": 2.6942708492279053 }, { "auxiliary_loss_clip": 0.01276604, "auxiliary_loss_mlp": 0.00258766, "balance_loss_clip": 1.04797077, "balance_loss_mlp": 0.23177691, "epoch": 0.8454531790169848, "flos": 22487261921280.0, "grad_norm": 10.55864614907716, "language_loss": 0.83522391, "learning_rate": 2.452744642558013e-07, "loss": 0.85057765, "num_input_tokens_seen": 303224655, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.27026367, "step": 14062, "time_per_iteration": 2.6752195358276367 }, { "auxiliary_loss_clip": 0.01140734, "auxiliary_loss_mlp": 0.00149485, "balance_loss_clip": 0.99171311, "balance_loss_mlp": 0.14209385, "epoch": 0.8455133022696528, "flos": 58277848481280.0, "grad_norm": 0.6361557136777004, "language_loss": 0.52120697, "learning_rate": 2.450876230433432e-07, "loss": 0.53410912, "num_input_tokens_seen": 303289645, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.07373047, "step": 14063, "time_per_iteration": 3.261197328567505 }, { "auxiliary_loss_clip": 0.01242818, "auxiliary_loss_mlp": 0.00238466, "balance_loss_clip": 1.03105962, "balance_loss_mlp": 0.21507749, "epoch": 0.8455734255223207, "flos": 21361283308800.0, "grad_norm": 21.005204697699202, "language_loss": 0.88202095, "learning_rate": 2.449008483773378e-07, "loss": 0.89683378, "num_input_tokens_seen": 303308350, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.23413086, "step": 14064, "time_per_iteration": 2.771409034729004 }, { "auxiliary_loss_clip": 0.01256762, "auxiliary_loss_mlp": 0.00226586, "balance_loss_clip": 1.0374769, "balance_loss_mlp": 0.2018625, "epoch": 0.8456335487749888, "flos": 20449260057600.0, "grad_norm": 6.59050265536563, "language_loss": 0.80290765, "learning_rate": 2.447141402648685e-07, "loss": 0.81774116, "num_input_tokens_seen": 303325230, "router_z_loss_clip": 2.19238281, "router_z_loss_mlp": 0.24755859, "step": 14065, "time_per_iteration": 2.719221591949463 }, { "auxiliary_loss_clip": 0.01231659, "auxiliary_loss_mlp": 0.0020354, "balance_loss_clip": 1.01737452, "balance_loss_mlp": 0.17975801, "epoch": 0.8456936720276567, "flos": 28840901097600.0, "grad_norm": 105.0441383144679, "language_loss": 0.82850051, "learning_rate": 2.445274987130146e-07, "loss": 0.84285253, "num_input_tokens_seen": 303345810, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.23803711, "step": 14066, "time_per_iteration": 2.8389899730682373 }, { "auxiliary_loss_clip": 0.01263968, "auxiliary_loss_mlp": 0.00222359, "balance_loss_clip": 1.04332972, "balance_loss_mlp": 0.19763537, "epoch": 0.8457537952803247, "flos": 22672884430080.0, "grad_norm": 184.4396474122943, "language_loss": 0.76552528, "learning_rate": 2.4434092372885363e-07, "loss": 0.78038859, "num_input_tokens_seen": 303365140, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.24731445, "step": 14067, "time_per_iteration": 2.8848531246185303 }, { "auxiliary_loss_clip": 0.01250397, "auxiliary_loss_mlp": 0.00228623, "balance_loss_clip": 1.03628683, "balance_loss_mlp": 0.2043401, "epoch": 0.8458139185329926, "flos": 33802929607680.0, "grad_norm": 385.1729473342786, "language_loss": 0.78192103, "learning_rate": 2.4415441531946144e-07, "loss": 0.79671121, "num_input_tokens_seen": 303386150, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.24267578, "step": 14068, "time_per_iteration": 2.8889315128326416 }, { "auxiliary_loss_clip": 0.01146942, "auxiliary_loss_mlp": 0.00161957, "balance_loss_clip": 0.99849856, "balance_loss_mlp": 0.15237299, "epoch": 0.8458740417856606, "flos": 70295929603200.0, "grad_norm": 0.677644693516346, "language_loss": 0.59635663, "learning_rate": 2.4396797349190976e-07, "loss": 0.60944557, "num_input_tokens_seen": 303453770, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.09570312, "step": 14069, "time_per_iteration": 3.2998690605163574 }, { "auxiliary_loss_clip": 0.01251007, "auxiliary_loss_mlp": 0.00210993, "balance_loss_clip": 1.03064847, "balance_loss_mlp": 0.18771149, "epoch": 0.8459341650383285, "flos": 24170862245760.0, "grad_norm": 24.726926691772483, "language_loss": 0.82489204, "learning_rate": 2.4378159825326804e-07, "loss": 0.83951205, "num_input_tokens_seen": 303474520, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.23291016, "step": 14070, "time_per_iteration": 2.741861581802368 }, { "auxiliary_loss_clip": 0.01250387, "auxiliary_loss_mlp": 0.00205635, "balance_loss_clip": 1.03539085, "balance_loss_mlp": 0.18144755, "epoch": 0.8459942882909965, "flos": 38181158369280.0, "grad_norm": 4.984591778324076, "language_loss": 0.71972829, "learning_rate": 2.435952896106039e-07, "loss": 0.73428845, "num_input_tokens_seen": 303497345, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24182129, "step": 14071, "time_per_iteration": 4.362313747406006 }, { "auxiliary_loss_clip": 0.01152296, "auxiliary_loss_mlp": 0.00120599, "balance_loss_clip": 1.00421214, "balance_loss_mlp": 0.11287411, "epoch": 0.8460544115436646, "flos": 64118252177280.0, "grad_norm": 0.7219923093408456, "language_loss": 0.60493946, "learning_rate": 2.4340904757098313e-07, "loss": 0.61766839, "num_input_tokens_seen": 303554890, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.07714844, "step": 14072, "time_per_iteration": 4.404930353164673 }, { "auxiliary_loss_clip": 0.01264627, "auxiliary_loss_mlp": 0.00229774, "balance_loss_clip": 1.03897214, "balance_loss_mlp": 0.20316616, "epoch": 0.8461145347963325, "flos": 24170826332160.0, "grad_norm": 2.907804182309453, "language_loss": 0.80631471, "learning_rate": 2.4322287214146664e-07, "loss": 0.82125866, "num_input_tokens_seen": 303574380, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.26635742, "step": 14073, "time_per_iteration": 2.6860878467559814 }, { "auxiliary_loss_clip": 0.01279891, "auxiliary_loss_mlp": 0.00230006, "balance_loss_clip": 1.05206299, "balance_loss_mlp": 0.20308848, "epoch": 0.8461746580490005, "flos": 34893787697280.0, "grad_norm": 81.81823270753848, "language_loss": 0.86622536, "learning_rate": 2.430367633291155e-07, "loss": 0.88132429, "num_input_tokens_seen": 303594910, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.26928711, "step": 14074, "time_per_iteration": 2.9131650924682617 }, { "auxiliary_loss_clip": 0.01253525, "auxiliary_loss_mlp": 0.0020974, "balance_loss_clip": 1.03345346, "balance_loss_mlp": 0.18558797, "epoch": 0.8462347813016684, "flos": 25557014044800.0, "grad_norm": 13.670863615595035, "language_loss": 0.83395565, "learning_rate": 2.4285072114098583e-07, "loss": 0.84858835, "num_input_tokens_seen": 303613520, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.24157715, "step": 14075, "time_per_iteration": 2.719479560852051 }, { "auxiliary_loss_clip": 0.01248128, "auxiliary_loss_mlp": 0.00253724, "balance_loss_clip": 1.03147554, "balance_loss_mlp": 0.22867835, "epoch": 0.8462949045543364, "flos": 21325336773120.0, "grad_norm": 443.37297686075846, "language_loss": 0.81333846, "learning_rate": 2.4266474558413355e-07, "loss": 0.82835698, "num_input_tokens_seen": 303631225, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.25036621, "step": 14076, "time_per_iteration": 2.7429816722869873 }, { "auxiliary_loss_clip": 0.01275695, "auxiliary_loss_mlp": 0.00240329, "balance_loss_clip": 1.04798913, "balance_loss_mlp": 0.21504499, "epoch": 0.8463550278070043, "flos": 22637440684800.0, "grad_norm": 117.19735803105011, "language_loss": 0.88294631, "learning_rate": 2.4247883666560945e-07, "loss": 0.89810658, "num_input_tokens_seen": 303649175, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.25280762, "step": 14077, "time_per_iteration": 4.145796060562134 }, { "auxiliary_loss_clip": 0.01269474, "auxiliary_loss_mlp": 0.00219273, "balance_loss_clip": 1.0459342, "balance_loss_mlp": 0.19615883, "epoch": 0.8464151510596724, "flos": 13005588804480.0, "grad_norm": 17.998876682724976, "language_loss": 0.87838531, "learning_rate": 2.422929943924643e-07, "loss": 0.89327276, "num_input_tokens_seen": 303665915, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.23132324, "step": 14078, "time_per_iteration": 2.6467270851135254 }, { "auxiliary_loss_clip": 0.01261952, "auxiliary_loss_mlp": 0.00221644, "balance_loss_clip": 1.0449152, "balance_loss_mlp": 0.19722964, "epoch": 0.8464752743123403, "flos": 15704921923200.0, "grad_norm": 4.322404547187289, "language_loss": 0.92045605, "learning_rate": 2.4210721877174565e-07, "loss": 0.93529201, "num_input_tokens_seen": 303679985, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24401855, "step": 14079, "time_per_iteration": 2.680760622024536 }, { "auxiliary_loss_clip": 0.01285628, "auxiliary_loss_mlp": 0.00220904, "balance_loss_clip": 1.0554359, "balance_loss_mlp": 0.19418889, "epoch": 0.8465353975650083, "flos": 21653955325440.0, "grad_norm": 6.906334873987103, "language_loss": 0.70808482, "learning_rate": 2.419215098104965e-07, "loss": 0.72315013, "num_input_tokens_seen": 303698470, "router_z_loss_clip": 2.30371094, "router_z_loss_mlp": 0.26721191, "step": 14080, "time_per_iteration": 4.069557189941406 }, { "auxiliary_loss_clip": 0.01283967, "auxiliary_loss_mlp": 0.00241176, "balance_loss_clip": 1.05689371, "balance_loss_mlp": 0.21503344, "epoch": 0.8465955208176762, "flos": 18515650095360.0, "grad_norm": 27.127250637039708, "language_loss": 0.76507938, "learning_rate": 2.4173586751576014e-07, "loss": 0.78033078, "num_input_tokens_seen": 303716415, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.26159668, "step": 14081, "time_per_iteration": 2.6432697772979736 }, { "auxiliary_loss_clip": 0.01261036, "auxiliary_loss_mlp": 0.00231412, "balance_loss_clip": 1.04142118, "balance_loss_mlp": 0.20752245, "epoch": 0.8466556440703442, "flos": 24200559815040.0, "grad_norm": 12.67107487481706, "language_loss": 0.81159133, "learning_rate": 2.41550291894576e-07, "loss": 0.82651579, "num_input_tokens_seen": 303734490, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.23864746, "step": 14082, "time_per_iteration": 2.809234619140625 }, { "auxiliary_loss_clip": 0.01251232, "auxiliary_loss_mlp": 0.00236419, "balance_loss_clip": 1.03237343, "balance_loss_mlp": 0.21198145, "epoch": 0.8467157673230121, "flos": 20375894528640.0, "grad_norm": 7.694821415779886, "language_loss": 0.82511455, "learning_rate": 2.413647829539809e-07, "loss": 0.83999109, "num_input_tokens_seen": 303752310, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24414062, "step": 14083, "time_per_iteration": 2.6410090923309326 }, { "auxiliary_loss_clip": 0.01267304, "auxiliary_loss_mlp": 0.00222399, "balance_loss_clip": 1.04659963, "balance_loss_mlp": 0.19644761, "epoch": 0.8467758905756801, "flos": 28473642489600.0, "grad_norm": 10.658477980105587, "language_loss": 0.73626757, "learning_rate": 2.411793407010092e-07, "loss": 0.75116467, "num_input_tokens_seen": 303776065, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.2598877, "step": 14084, "time_per_iteration": 2.7728354930877686 }, { "auxiliary_loss_clip": 0.01258596, "auxiliary_loss_mlp": 0.00215777, "balance_loss_clip": 1.04191828, "balance_loss_mlp": 0.19257893, "epoch": 0.8468360138283482, "flos": 11692551139200.0, "grad_norm": 23.416390631339503, "language_loss": 0.81210387, "learning_rate": 2.409939651426938e-07, "loss": 0.82684767, "num_input_tokens_seen": 303793500, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.23205566, "step": 14085, "time_per_iteration": 2.619007110595703 }, { "auxiliary_loss_clip": 0.0124512, "auxiliary_loss_mlp": 0.00217676, "balance_loss_clip": 1.02935553, "balance_loss_mlp": 0.19465648, "epoch": 0.8468961370810161, "flos": 24607859109120.0, "grad_norm": 651.5973739362198, "language_loss": 0.77483928, "learning_rate": 2.408086562860634e-07, "loss": 0.78946716, "num_input_tokens_seen": 303814835, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23022461, "step": 14086, "time_per_iteration": 2.7769453525543213 }, { "auxiliary_loss_clip": 0.01252688, "auxiliary_loss_mlp": 0.00232502, "balance_loss_clip": 1.03933918, "balance_loss_mlp": 0.20960158, "epoch": 0.8469562603336841, "flos": 19609812236160.0, "grad_norm": 5.244998139712387, "language_loss": 0.82359004, "learning_rate": 2.4062341413814445e-07, "loss": 0.83844191, "num_input_tokens_seen": 303834505, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.22912598, "step": 14087, "time_per_iteration": 2.7103874683380127 }, { "auxiliary_loss_clip": 0.01254748, "auxiliary_loss_mlp": 0.00191097, "balance_loss_clip": 1.03789663, "balance_loss_mlp": 0.1689598, "epoch": 0.847016383586352, "flos": 22638949056000.0, "grad_norm": 4.413488480277439, "language_loss": 0.80530077, "learning_rate": 2.4043823870596227e-07, "loss": 0.81975925, "num_input_tokens_seen": 303855050, "router_z_loss_clip": 2.16699219, "router_z_loss_mlp": 0.22155762, "step": 14088, "time_per_iteration": 2.743617057800293 }, { "auxiliary_loss_clip": 0.01269175, "auxiliary_loss_mlp": 0.00223899, "balance_loss_clip": 1.04665124, "balance_loss_mlp": 0.19837667, "epoch": 0.84707650683902, "flos": 20960161153920.0, "grad_norm": 52.729474088172445, "language_loss": 0.83298182, "learning_rate": 2.402531299965387e-07, "loss": 0.84791255, "num_input_tokens_seen": 303875635, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25549316, "step": 14089, "time_per_iteration": 2.6954169273376465 }, { "auxiliary_loss_clip": 0.01253194, "auxiliary_loss_mlp": 0.00236657, "balance_loss_clip": 1.04022348, "balance_loss_mlp": 0.21308948, "epoch": 0.8471366300916879, "flos": 24093007516800.0, "grad_norm": 8.236392737766359, "language_loss": 0.84101272, "learning_rate": 2.400680880168928e-07, "loss": 0.85591125, "num_input_tokens_seen": 303896750, "router_z_loss_clip": 2.13378906, "router_z_loss_mlp": 0.23547363, "step": 14090, "time_per_iteration": 2.6903371810913086 }, { "auxiliary_loss_clip": 0.01284905, "auxiliary_loss_mlp": 0.00247046, "balance_loss_clip": 1.0569706, "balance_loss_mlp": 0.22055751, "epoch": 0.847196753344356, "flos": 18332900674560.0, "grad_norm": 10.320729654247975, "language_loss": 0.86870825, "learning_rate": 2.3988311277404085e-07, "loss": 0.88402778, "num_input_tokens_seen": 303915435, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.26489258, "step": 14091, "time_per_iteration": 2.7105090618133545 }, { "auxiliary_loss_clip": 0.01149553, "auxiliary_loss_mlp": 0.0015225, "balance_loss_clip": 1.00085616, "balance_loss_mlp": 0.1440963, "epoch": 0.8472568765970239, "flos": 49567536956160.0, "grad_norm": 0.8071600436633227, "language_loss": 0.58941996, "learning_rate": 2.396982042749982e-07, "loss": 0.60243797, "num_input_tokens_seen": 303977245, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.08154297, "step": 14092, "time_per_iteration": 3.2294692993164062 }, { "auxiliary_loss_clip": 0.01251169, "auxiliary_loss_mlp": 0.00232854, "balance_loss_clip": 1.03750587, "balance_loss_mlp": 0.20866624, "epoch": 0.8473169998496919, "flos": 19279074781440.0, "grad_norm": 141.2053472442435, "language_loss": 0.78046072, "learning_rate": 2.395133625267756e-07, "loss": 0.79530096, "num_input_tokens_seen": 303996055, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.24157715, "step": 14093, "time_per_iteration": 2.751455307006836 }, { "auxiliary_loss_clip": 0.01248976, "auxiliary_loss_mlp": 0.00213087, "balance_loss_clip": 1.03222334, "balance_loss_mlp": 0.18781486, "epoch": 0.8473771231023598, "flos": 17675555829120.0, "grad_norm": 36.27473339707779, "language_loss": 0.91806626, "learning_rate": 2.3932858753638263e-07, "loss": 0.93268687, "num_input_tokens_seen": 304012205, "router_z_loss_clip": 2.16699219, "router_z_loss_mlp": 0.25256348, "step": 14094, "time_per_iteration": 2.655585765838623 }, { "auxiliary_loss_clip": 0.01234815, "auxiliary_loss_mlp": 0.00203717, "balance_loss_clip": 1.02606869, "balance_loss_mlp": 0.18149662, "epoch": 0.8474372463550278, "flos": 26359761144960.0, "grad_norm": 13.101543195701902, "language_loss": 0.78148043, "learning_rate": 2.3914387931082626e-07, "loss": 0.79586577, "num_input_tokens_seen": 304033475, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.2220459, "step": 14095, "time_per_iteration": 2.8285298347473145 }, { "auxiliary_loss_clip": 0.01257774, "auxiliary_loss_mlp": 0.00222226, "balance_loss_clip": 1.0393275, "balance_loss_mlp": 0.19826454, "epoch": 0.8474973696076957, "flos": 23402050519680.0, "grad_norm": 9.41505447614017, "language_loss": 0.88214773, "learning_rate": 2.3895923785711105e-07, "loss": 0.89694774, "num_input_tokens_seen": 304051845, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.23950195, "step": 14096, "time_per_iteration": 2.737312078475952 }, { "auxiliary_loss_clip": 0.01270966, "auxiliary_loss_mlp": 0.00224257, "balance_loss_clip": 1.0443604, "balance_loss_mlp": 0.19911599, "epoch": 0.8475574928603637, "flos": 25075666863360.0, "grad_norm": 2.6094696484212063, "language_loss": 0.85985255, "learning_rate": 2.387746631822374e-07, "loss": 0.87480474, "num_input_tokens_seen": 304069965, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.25158691, "step": 14097, "time_per_iteration": 2.8089165687561035 }, { "auxiliary_loss_clip": 0.01250043, "auxiliary_loss_mlp": 0.00218608, "balance_loss_clip": 1.03325486, "balance_loss_mlp": 0.1948491, "epoch": 0.8476176161130318, "flos": 19966691813760.0, "grad_norm": 4.260956208722383, "language_loss": 0.89849591, "learning_rate": 2.385901552932048e-07, "loss": 0.91318238, "num_input_tokens_seen": 304086805, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.23742676, "step": 14098, "time_per_iteration": 2.7213079929351807 }, { "auxiliary_loss_clip": 0.01246599, "auxiliary_loss_mlp": 0.00242739, "balance_loss_clip": 1.03473687, "balance_loss_mlp": 0.21858755, "epoch": 0.8476777393656997, "flos": 21285834791040.0, "grad_norm": 45.30672232737521, "language_loss": 0.79892719, "learning_rate": 2.3840571419701062e-07, "loss": 0.8138206, "num_input_tokens_seen": 304105865, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.24157715, "step": 14099, "time_per_iteration": 2.7016491889953613 }, { "auxiliary_loss_clip": 0.01270871, "auxiliary_loss_mlp": 0.00239829, "balance_loss_clip": 1.04989886, "balance_loss_mlp": 0.21368605, "epoch": 0.8477378626183677, "flos": 29971476650880.0, "grad_norm": 94.46462381579936, "language_loss": 0.71570277, "learning_rate": 2.3822133990064787e-07, "loss": 0.73080981, "num_input_tokens_seen": 304128300, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26147461, "step": 14100, "time_per_iteration": 2.7585854530334473 }, { "auxiliary_loss_clip": 0.01275299, "auxiliary_loss_mlp": 0.00235978, "balance_loss_clip": 1.05028582, "balance_loss_mlp": 0.20758194, "epoch": 0.8477979858710356, "flos": 24237727413120.0, "grad_norm": 15.33281906425421, "language_loss": 0.84967434, "learning_rate": 2.380370324111085e-07, "loss": 0.8647871, "num_input_tokens_seen": 304143695, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.28381348, "step": 14101, "time_per_iteration": 2.7331595420837402 }, { "auxiliary_loss_clip": 0.01255919, "auxiliary_loss_mlp": 0.00220749, "balance_loss_clip": 1.03790855, "balance_loss_mlp": 0.19557217, "epoch": 0.8478581091237036, "flos": 25593678852480.0, "grad_norm": 25.509507941137116, "language_loss": 0.78099775, "learning_rate": 2.3785279173538163e-07, "loss": 0.79576445, "num_input_tokens_seen": 304165800, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.25183105, "step": 14102, "time_per_iteration": 2.7837982177734375 }, { "auxiliary_loss_clip": 0.01266244, "auxiliary_loss_mlp": 0.00214421, "balance_loss_clip": 1.04443264, "balance_loss_mlp": 0.19044799, "epoch": 0.8479182323763715, "flos": 12057116227200.0, "grad_norm": 14.359700034406025, "language_loss": 0.92986453, "learning_rate": 2.3766861788045366e-07, "loss": 0.94467115, "num_input_tokens_seen": 304182910, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.23986816, "step": 14103, "time_per_iteration": 2.69295597076416 }, { "auxiliary_loss_clip": 0.01264984, "auxiliary_loss_mlp": 0.00228756, "balance_loss_clip": 1.04814458, "balance_loss_mlp": 0.20450863, "epoch": 0.8479783556290396, "flos": 21433391861760.0, "grad_norm": 24.58196023644449, "language_loss": 0.85956657, "learning_rate": 2.374845108533079e-07, "loss": 0.87450397, "num_input_tokens_seen": 304200175, "router_z_loss_clip": 2.16699219, "router_z_loss_mlp": 0.24243164, "step": 14104, "time_per_iteration": 2.7226693630218506 }, { "auxiliary_loss_clip": 0.01262404, "auxiliary_loss_mlp": 0.00260778, "balance_loss_clip": 1.0462532, "balance_loss_mlp": 0.23440938, "epoch": 0.8480384788817075, "flos": 19642634288640.0, "grad_norm": 2.264858473742614, "language_loss": 0.85331905, "learning_rate": 2.3730047066092607e-07, "loss": 0.86855078, "num_input_tokens_seen": 304217775, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.26391602, "step": 14105, "time_per_iteration": 2.6739699840545654 }, { "auxiliary_loss_clip": 0.01285318, "auxiliary_loss_mlp": 0.00232369, "balance_loss_clip": 1.05945766, "balance_loss_mlp": 0.20670286, "epoch": 0.8480986021343755, "flos": 22489201255680.0, "grad_norm": 3.955443851655256, "language_loss": 0.60108554, "learning_rate": 2.3711649731028749e-07, "loss": 0.61626244, "num_input_tokens_seen": 304235760, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.25683594, "step": 14106, "time_per_iteration": 2.694535255432129 }, { "auxiliary_loss_clip": 0.01262549, "auxiliary_loss_mlp": 0.00230923, "balance_loss_clip": 1.04189181, "balance_loss_mlp": 0.20579395, "epoch": 0.8481587253870434, "flos": 22090557139200.0, "grad_norm": 20.44814551509585, "language_loss": 0.85315531, "learning_rate": 2.3693259080836792e-07, "loss": 0.86809003, "num_input_tokens_seen": 304253985, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25146484, "step": 14107, "time_per_iteration": 2.750936985015869 }, { "auxiliary_loss_clip": 0.01256576, "auxiliary_loss_mlp": 0.00227948, "balance_loss_clip": 1.03652954, "balance_loss_mlp": 0.20314068, "epoch": 0.8482188486397114, "flos": 33582689366400.0, "grad_norm": 68.31997429341723, "language_loss": 0.79512757, "learning_rate": 2.3674875116214087e-07, "loss": 0.80997288, "num_input_tokens_seen": 304276785, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.24816895, "step": 14108, "time_per_iteration": 2.832409620285034 }, { "auxiliary_loss_clip": 0.01240837, "auxiliary_loss_mlp": 0.00222879, "balance_loss_clip": 1.02939689, "balance_loss_mlp": 0.20016935, "epoch": 0.8482789718923793, "flos": 20919402195840.0, "grad_norm": 18.644032999838654, "language_loss": 0.80032754, "learning_rate": 2.3656497837857836e-07, "loss": 0.81496471, "num_input_tokens_seen": 304296310, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.22705078, "step": 14109, "time_per_iteration": 2.812953472137451 }, { "auxiliary_loss_clip": 0.01256874, "auxiliary_loss_mlp": 0.00209659, "balance_loss_clip": 1.0404774, "balance_loss_mlp": 0.18598372, "epoch": 0.8483390951450474, "flos": 12896204912640.0, "grad_norm": 36.23868347492057, "language_loss": 0.84204096, "learning_rate": 2.3638127246464811e-07, "loss": 0.8567062, "num_input_tokens_seen": 304311715, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.23669434, "step": 14110, "time_per_iteration": 2.730633020401001 }, { "auxiliary_loss_clip": 0.01258243, "auxiliary_loss_mlp": 0.00211833, "balance_loss_clip": 1.04025102, "balance_loss_mlp": 0.18571442, "epoch": 0.8483992183977154, "flos": 25081628520960.0, "grad_norm": 343.00451382520555, "language_loss": 0.83997172, "learning_rate": 2.3619763342731658e-07, "loss": 0.85467249, "num_input_tokens_seen": 304331910, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.26135254, "step": 14111, "time_per_iteration": 2.7635891437530518 }, { "auxiliary_loss_clip": 0.01239634, "auxiliary_loss_mlp": 0.00229985, "balance_loss_clip": 1.0235877, "balance_loss_mlp": 0.20590471, "epoch": 0.8484593416503833, "flos": 25557445008000.0, "grad_norm": 31.4414191370526, "language_loss": 0.74891275, "learning_rate": 2.3601406127354772e-07, "loss": 0.76360893, "num_input_tokens_seen": 304351405, "router_z_loss_clip": 2.16308594, "router_z_loss_mlp": 0.24060059, "step": 14112, "time_per_iteration": 2.7937867641448975 }, { "auxiliary_loss_clip": 0.01250598, "auxiliary_loss_mlp": 0.00226829, "balance_loss_clip": 1.03441668, "balance_loss_mlp": 0.20441729, "epoch": 0.8485194649030513, "flos": 27198454780800.0, "grad_norm": 3.822794192762103, "language_loss": 0.81300449, "learning_rate": 2.3583055601030312e-07, "loss": 0.82777882, "num_input_tokens_seen": 304372935, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.22399902, "step": 14113, "time_per_iteration": 4.265378713607788 }, { "auxiliary_loss_clip": 0.01275761, "auxiliary_loss_mlp": 0.00227763, "balance_loss_clip": 1.05256355, "balance_loss_mlp": 0.20216875, "epoch": 0.8485795881557192, "flos": 24205910941440.0, "grad_norm": 12.176067959296192, "language_loss": 0.7483176, "learning_rate": 2.3564711764454003e-07, "loss": 0.76335287, "num_input_tokens_seen": 304393070, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.2557373, "step": 14114, "time_per_iteration": 4.134046316146851 }, { "auxiliary_loss_clip": 0.01257531, "auxiliary_loss_mlp": 0.00250931, "balance_loss_clip": 1.03940737, "balance_loss_mlp": 0.22382301, "epoch": 0.8486397114083872, "flos": 21141653598720.0, "grad_norm": 3.9953943630432582, "language_loss": 0.87336671, "learning_rate": 2.3546374618321495e-07, "loss": 0.8884514, "num_input_tokens_seen": 304411195, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.27111816, "step": 14115, "time_per_iteration": 2.757525682449341 }, { "auxiliary_loss_clip": 0.01258725, "auxiliary_loss_mlp": 0.00223367, "balance_loss_clip": 1.04056549, "balance_loss_mlp": 0.19826113, "epoch": 0.8486998346610551, "flos": 19974772373760.0, "grad_norm": 2.7301004416734154, "language_loss": 0.88763273, "learning_rate": 2.3528044163328187e-07, "loss": 0.90245366, "num_input_tokens_seen": 304429425, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25109863, "step": 14116, "time_per_iteration": 2.7475719451904297 }, { "auxiliary_loss_clip": 0.01254848, "auxiliary_loss_mlp": 0.00196711, "balance_loss_clip": 1.03672659, "balance_loss_mlp": 0.17246418, "epoch": 0.8487599579137232, "flos": 19792310261760.0, "grad_norm": 5.701828901919393, "language_loss": 0.75548327, "learning_rate": 2.3509720400169076e-07, "loss": 0.76999891, "num_input_tokens_seen": 304447460, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24279785, "step": 14117, "time_per_iteration": 2.7552623748779297 }, { "auxiliary_loss_clip": 0.01251233, "auxiliary_loss_mlp": 0.00220911, "balance_loss_clip": 1.03465474, "balance_loss_mlp": 0.19756992, "epoch": 0.8488200811663911, "flos": 26396030903040.0, "grad_norm": 3.5873541300646625, "language_loss": 0.74546659, "learning_rate": 2.3491403329539096e-07, "loss": 0.76018798, "num_input_tokens_seen": 304468230, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23339844, "step": 14118, "time_per_iteration": 2.767672538757324 }, { "auxiliary_loss_clip": 0.01244739, "auxiliary_loss_mlp": 0.00206844, "balance_loss_clip": 1.03162897, "balance_loss_mlp": 0.18529111, "epoch": 0.8488802044190591, "flos": 16359285939840.0, "grad_norm": 68.0383087711467, "language_loss": 0.80324948, "learning_rate": 2.3473092952132757e-07, "loss": 0.8177653, "num_input_tokens_seen": 304484860, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.21557617, "step": 14119, "time_per_iteration": 4.2154765129089355 }, { "auxiliary_loss_clip": 0.01263208, "auxiliary_loss_mlp": 0.00240461, "balance_loss_clip": 1.04622245, "balance_loss_mlp": 0.21536735, "epoch": 0.848940327671727, "flos": 19208869649280.0, "grad_norm": 287.1657993689893, "language_loss": 0.85462862, "learning_rate": 2.345478926864446e-07, "loss": 0.86966532, "num_input_tokens_seen": 304503575, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.25085449, "step": 14120, "time_per_iteration": 2.7213051319122314 }, { "auxiliary_loss_clip": 0.01264774, "auxiliary_loss_mlp": 0.00238491, "balance_loss_clip": 1.04411745, "balance_loss_mlp": 0.21298024, "epoch": 0.849000450924395, "flos": 21871178824320.0, "grad_norm": 101.1119628733246, "language_loss": 0.82321262, "learning_rate": 2.3436492279768227e-07, "loss": 0.83824527, "num_input_tokens_seen": 304525005, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25500488, "step": 14121, "time_per_iteration": 2.74642276763916 }, { "auxiliary_loss_clip": 0.01136366, "auxiliary_loss_mlp": 0.00087442, "balance_loss_clip": 0.99316621, "balance_loss_mlp": 0.08095723, "epoch": 0.8490605741770629, "flos": 71166475624320.0, "grad_norm": 0.7801563130309834, "language_loss": 0.59344292, "learning_rate": 2.3418201986197883e-07, "loss": 0.605681, "num_input_tokens_seen": 304585220, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.06494141, "step": 14122, "time_per_iteration": 4.655341625213623 }, { "auxiliary_loss_clip": 0.0125317, "auxiliary_loss_mlp": 0.00205015, "balance_loss_clip": 1.03498101, "balance_loss_mlp": 0.18072012, "epoch": 0.849120697429731, "flos": 24973357950720.0, "grad_norm": 10.537211010791724, "language_loss": 0.90467894, "learning_rate": 2.3399918388627048e-07, "loss": 0.91926074, "num_input_tokens_seen": 304604665, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24291992, "step": 14123, "time_per_iteration": 2.7931089401245117 }, { "auxiliary_loss_clip": 0.01234166, "auxiliary_loss_mlp": 0.00221183, "balance_loss_clip": 1.02519751, "balance_loss_mlp": 0.19729345, "epoch": 0.8491808206823989, "flos": 23032277959680.0, "grad_norm": 12.96170640249753, "language_loss": 0.90388352, "learning_rate": 2.3381641487749016e-07, "loss": 0.918437, "num_input_tokens_seen": 304620600, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.2388916, "step": 14124, "time_per_iteration": 2.7274348735809326 }, { "auxiliary_loss_clip": 0.01261295, "auxiliary_loss_mlp": 0.00217126, "balance_loss_clip": 1.04186189, "balance_loss_mlp": 0.19398715, "epoch": 0.8492409439350669, "flos": 23878549365120.0, "grad_norm": 16.56102732774694, "language_loss": 0.80170667, "learning_rate": 2.3363371284256805e-07, "loss": 0.81649089, "num_input_tokens_seen": 304639540, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.23132324, "step": 14125, "time_per_iteration": 2.7850608825683594 }, { "auxiliary_loss_clip": 0.01289088, "auxiliary_loss_mlp": 0.00238259, "balance_loss_clip": 1.05904818, "balance_loss_mlp": 0.21063834, "epoch": 0.8493010671877349, "flos": 22419893963520.0, "grad_norm": 181.77009602047218, "language_loss": 0.81040549, "learning_rate": 2.3345107778843288e-07, "loss": 0.82567894, "num_input_tokens_seen": 304660595, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.27612305, "step": 14126, "time_per_iteration": 2.7433271408081055 }, { "auxiliary_loss_clip": 0.01233414, "auxiliary_loss_mlp": 0.00221314, "balance_loss_clip": 1.02325749, "balance_loss_mlp": 0.19779383, "epoch": 0.8493611904404028, "flos": 17529435302400.0, "grad_norm": 87.36279460721846, "language_loss": 0.75485253, "learning_rate": 2.3326850972200928e-07, "loss": 0.76939976, "num_input_tokens_seen": 304679580, "router_z_loss_clip": 2.10058594, "router_z_loss_mlp": 0.23498535, "step": 14127, "time_per_iteration": 2.842017650604248 }, { "auxiliary_loss_clip": 0.01271351, "auxiliary_loss_mlp": 0.00212525, "balance_loss_clip": 1.04503644, "balance_loss_mlp": 0.18533294, "epoch": 0.8494213136930708, "flos": 19462937523840.0, "grad_norm": 6.515291597690145, "language_loss": 0.8004117, "learning_rate": 2.330860086502211e-07, "loss": 0.8152504, "num_input_tokens_seen": 304698385, "router_z_loss_clip": 2.26660156, "router_z_loss_mlp": 0.27209473, "step": 14128, "time_per_iteration": 2.680050849914551 }, { "auxiliary_loss_clip": 0.0124726, "auxiliary_loss_mlp": 0.00226506, "balance_loss_clip": 1.03300488, "balance_loss_mlp": 0.20097157, "epoch": 0.8494814369457387, "flos": 18770292587520.0, "grad_norm": 6.078610346995285, "language_loss": 0.85391974, "learning_rate": 2.3290357457998855e-07, "loss": 0.86865735, "num_input_tokens_seen": 304715430, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.25549316, "step": 14129, "time_per_iteration": 2.7344493865966797 }, { "auxiliary_loss_clip": 0.01251618, "auxiliary_loss_mlp": 0.00211252, "balance_loss_clip": 1.03616929, "balance_loss_mlp": 0.18776768, "epoch": 0.8495415601984068, "flos": 23331486251520.0, "grad_norm": 6.749478107763598, "language_loss": 0.73935407, "learning_rate": 2.3272120751823031e-07, "loss": 0.75398278, "num_input_tokens_seen": 304734345, "router_z_loss_clip": 2.15722656, "router_z_loss_mlp": 0.23498535, "step": 14130, "time_per_iteration": 2.7283401489257812 }, { "auxiliary_loss_clip": 0.01244213, "auxiliary_loss_mlp": 0.00219866, "balance_loss_clip": 1.03160143, "balance_loss_mlp": 0.19572575, "epoch": 0.8496016834510747, "flos": 26612859352320.0, "grad_norm": 20.954668869267543, "language_loss": 0.78296667, "learning_rate": 2.3253890747186e-07, "loss": 0.79760742, "num_input_tokens_seen": 304755030, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.24145508, "step": 14131, "time_per_iteration": 2.805588960647583 }, { "auxiliary_loss_clip": 0.01240878, "auxiliary_loss_mlp": 0.00219402, "balance_loss_clip": 1.0303576, "balance_loss_mlp": 0.19542943, "epoch": 0.8496618067037427, "flos": 25480380378240.0, "grad_norm": 18.793284172914905, "language_loss": 0.76539636, "learning_rate": 2.3235667444779162e-07, "loss": 0.77999914, "num_input_tokens_seen": 304774320, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.23999023, "step": 14132, "time_per_iteration": 2.79803204536438 }, { "auxiliary_loss_clip": 0.01248447, "auxiliary_loss_mlp": 0.00203569, "balance_loss_clip": 1.03594398, "balance_loss_mlp": 0.17934605, "epoch": 0.8497219299564106, "flos": 25374587846400.0, "grad_norm": 4.604222589321168, "language_loss": 0.77048945, "learning_rate": 2.3217450845293564e-07, "loss": 0.78500962, "num_input_tokens_seen": 304795355, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.2421875, "step": 14133, "time_per_iteration": 2.7623770236968994 }, { "auxiliary_loss_clip": 0.01152714, "auxiliary_loss_mlp": 0.00140858, "balance_loss_clip": 1.0050354, "balance_loss_mlp": 0.13423008, "epoch": 0.8497820532090786, "flos": 67780279658880.0, "grad_norm": 0.7232400725600282, "language_loss": 0.5688439, "learning_rate": 2.3199240949419918e-07, "loss": 0.5817796, "num_input_tokens_seen": 304863915, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.06640625, "step": 14134, "time_per_iteration": 3.3380932807922363 }, { "auxiliary_loss_clip": 0.01275764, "auxiliary_loss_mlp": 0.00205, "balance_loss_clip": 1.05127311, "balance_loss_mlp": 0.18056196, "epoch": 0.8498421764617465, "flos": 23440546920960.0, "grad_norm": 1321.4126025728785, "language_loss": 0.87890685, "learning_rate": 2.3181037757848787e-07, "loss": 0.89371443, "num_input_tokens_seen": 304881555, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.24401855, "step": 14135, "time_per_iteration": 2.8014698028564453 }, { "auxiliary_loss_clip": 0.01278478, "auxiliary_loss_mlp": 0.00224012, "balance_loss_clip": 1.05362344, "balance_loss_mlp": 0.1973331, "epoch": 0.8499022997144146, "flos": 17712615686400.0, "grad_norm": 11.641768875716627, "language_loss": 0.75155783, "learning_rate": 2.316284127127044e-07, "loss": 0.76658273, "num_input_tokens_seen": 304898760, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.2668457, "step": 14136, "time_per_iteration": 2.726651906967163 }, { "auxiliary_loss_clip": 0.01272199, "auxiliary_loss_mlp": 0.00232084, "balance_loss_clip": 1.05070877, "balance_loss_mlp": 0.20712194, "epoch": 0.8499624229670825, "flos": 18588512833920.0, "grad_norm": 2.869216246269516, "language_loss": 0.90049547, "learning_rate": 2.3144651490374835e-07, "loss": 0.91553825, "num_input_tokens_seen": 304915465, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.24938965, "step": 14137, "time_per_iteration": 2.8696117401123047 }, { "auxiliary_loss_clip": 0.01241545, "auxiliary_loss_mlp": 0.00239165, "balance_loss_clip": 1.03206205, "balance_loss_mlp": 0.21564505, "epoch": 0.8500225462197505, "flos": 24345854328960.0, "grad_norm": 17.93503697433748, "language_loss": 0.86302823, "learning_rate": 2.3126468415851773e-07, "loss": 0.87783533, "num_input_tokens_seen": 304933190, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.23535156, "step": 14138, "time_per_iteration": 2.8463594913482666 }, { "auxiliary_loss_clip": 0.01253133, "auxiliary_loss_mlp": 0.00222783, "balance_loss_clip": 1.03627801, "balance_loss_mlp": 0.19783264, "epoch": 0.8500826694724185, "flos": 16545518979840.0, "grad_norm": 4.956328391128377, "language_loss": 0.71652687, "learning_rate": 2.310829204839073e-07, "loss": 0.73128605, "num_input_tokens_seen": 304951110, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24951172, "step": 14139, "time_per_iteration": 2.8149728775024414 }, { "auxiliary_loss_clip": 0.01259505, "auxiliary_loss_mlp": 0.00213819, "balance_loss_clip": 1.0448662, "balance_loss_mlp": 0.19002488, "epoch": 0.8501427927250864, "flos": 16289404030080.0, "grad_norm": 23.224531579259896, "language_loss": 0.78402573, "learning_rate": 2.3090122388681043e-07, "loss": 0.79875892, "num_input_tokens_seen": 304969095, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.23791504, "step": 14140, "time_per_iteration": 2.6956145763397217 }, { "auxiliary_loss_clip": 0.01271501, "auxiliary_loss_mlp": 0.00228264, "balance_loss_clip": 1.04758298, "balance_loss_mlp": 0.20210969, "epoch": 0.8502029159777544, "flos": 26687912820480.0, "grad_norm": 422682.61986657494, "language_loss": 0.73715216, "learning_rate": 2.3071959437411648e-07, "loss": 0.75214988, "num_input_tokens_seen": 304989315, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26171875, "step": 14141, "time_per_iteration": 2.8010926246643066 }, { "auxiliary_loss_clip": 0.01244884, "auxiliary_loss_mlp": 0.0020312, "balance_loss_clip": 1.03163457, "balance_loss_mlp": 0.18091121, "epoch": 0.8502630392304223, "flos": 35590778179200.0, "grad_norm": 6.317127456306902, "language_loss": 0.79366952, "learning_rate": 2.3053803195271214e-07, "loss": 0.80814958, "num_input_tokens_seen": 305011020, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.2220459, "step": 14142, "time_per_iteration": 2.8930771350860596 }, { "auxiliary_loss_clip": 0.01249127, "auxiliary_loss_mlp": 0.00220752, "balance_loss_clip": 1.03442669, "balance_loss_mlp": 0.19645722, "epoch": 0.8503231624830904, "flos": 21649466125440.0, "grad_norm": 2.9688085940762234, "language_loss": 0.72382146, "learning_rate": 2.3035653662948375e-07, "loss": 0.73852026, "num_input_tokens_seen": 305033550, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.24267578, "step": 14143, "time_per_iteration": 2.785146713256836 }, { "auxiliary_loss_clip": 0.01258572, "auxiliary_loss_mlp": 0.00217581, "balance_loss_clip": 1.03711879, "balance_loss_mlp": 0.1923086, "epoch": 0.8503832857357583, "flos": 22417451838720.0, "grad_norm": 26.37499376342691, "language_loss": 0.77764839, "learning_rate": 2.3017510841131216e-07, "loss": 0.7924099, "num_input_tokens_seen": 305052885, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25268555, "step": 14144, "time_per_iteration": 2.730778455734253 }, { "auxiliary_loss_clip": 0.01256321, "auxiliary_loss_mlp": 0.00216549, "balance_loss_clip": 1.04071748, "balance_loss_mlp": 0.19121677, "epoch": 0.8504434089884263, "flos": 18697968552960.0, "grad_norm": 9.68333736707444, "language_loss": 0.75621581, "learning_rate": 2.299937473050777e-07, "loss": 0.77094448, "num_input_tokens_seen": 305071995, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.25341797, "step": 14145, "time_per_iteration": 2.758537769317627 }, { "auxiliary_loss_clip": 0.01246672, "auxiliary_loss_mlp": 0.00221849, "balance_loss_clip": 1.03219318, "balance_loss_mlp": 0.19713709, "epoch": 0.8505035322410942, "flos": 20007989475840.0, "grad_norm": 3.6430930492698974, "language_loss": 0.91224205, "learning_rate": 2.2981245331765842e-07, "loss": 0.92692727, "num_input_tokens_seen": 305090190, "router_z_loss_clip": 2.14550781, "router_z_loss_mlp": 0.24707031, "step": 14146, "time_per_iteration": 2.7445316314697266 }, { "auxiliary_loss_clip": 0.0124437, "auxiliary_loss_mlp": 0.00203949, "balance_loss_clip": 1.02754736, "balance_loss_mlp": 0.18000022, "epoch": 0.8505636554937622, "flos": 20812173120000.0, "grad_norm": 7.266980494620317, "language_loss": 0.91117001, "learning_rate": 2.2963122645592814e-07, "loss": 0.92565322, "num_input_tokens_seen": 305109355, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23962402, "step": 14147, "time_per_iteration": 2.7156193256378174 }, { "auxiliary_loss_clip": 0.01258753, "auxiliary_loss_mlp": 0.00216077, "balance_loss_clip": 1.03892171, "balance_loss_mlp": 0.19000548, "epoch": 0.8506237787464301, "flos": 14174445277440.0, "grad_norm": 137.41608493811913, "language_loss": 0.96643162, "learning_rate": 2.2945006672675894e-07, "loss": 0.98117995, "num_input_tokens_seen": 305124165, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.26037598, "step": 14148, "time_per_iteration": 2.7488412857055664 }, { "auxiliary_loss_clip": 0.01256812, "auxiliary_loss_mlp": 0.00220813, "balance_loss_clip": 1.04044628, "balance_loss_mlp": 0.19734097, "epoch": 0.8506839019990982, "flos": 23258372117760.0, "grad_norm": 28.918373224598636, "language_loss": 0.81515455, "learning_rate": 2.292689741370204e-07, "loss": 0.82993078, "num_input_tokens_seen": 305143940, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.23498535, "step": 14149, "time_per_iteration": 2.715012788772583 }, { "auxiliary_loss_clip": 0.01257824, "auxiliary_loss_mlp": 0.00219584, "balance_loss_clip": 1.04107273, "balance_loss_mlp": 0.19559897, "epoch": 0.8507440252517661, "flos": 23659206963840.0, "grad_norm": 2.4495845945974124, "language_loss": 0.85488224, "learning_rate": 2.290879486935804e-07, "loss": 0.86965638, "num_input_tokens_seen": 305163505, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23999023, "step": 14150, "time_per_iteration": 2.79972767829895 }, { "auxiliary_loss_clip": 0.01246294, "auxiliary_loss_mlp": 0.00214921, "balance_loss_clip": 1.03439963, "balance_loss_mlp": 0.19206865, "epoch": 0.8508041485044341, "flos": 18661339658880.0, "grad_norm": 39.54143597893323, "language_loss": 0.80160046, "learning_rate": 2.2890699040330231e-07, "loss": 0.81621259, "num_input_tokens_seen": 305182325, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.22851562, "step": 14151, "time_per_iteration": 2.75498104095459 }, { "auxiliary_loss_clip": 0.01145488, "auxiliary_loss_mlp": 0.00145734, "balance_loss_clip": 1.00044394, "balance_loss_mlp": 0.13724671, "epoch": 0.8508642717571021, "flos": 52510918055040.0, "grad_norm": 0.8465537791046179, "language_loss": 0.58914316, "learning_rate": 2.2872609927304909e-07, "loss": 0.60205531, "num_input_tokens_seen": 305230775, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.08496094, "step": 14152, "time_per_iteration": 3.049924373626709 }, { "auxiliary_loss_clip": 0.0114835, "auxiliary_loss_mlp": 0.00157564, "balance_loss_clip": 1.00429809, "balance_loss_mlp": 0.14850378, "epoch": 0.85092439500977, "flos": 69297145050240.0, "grad_norm": 0.6832510463338531, "language_loss": 0.59631598, "learning_rate": 2.285452753096797e-07, "loss": 0.60937512, "num_input_tokens_seen": 305296000, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09082031, "step": 14153, "time_per_iteration": 3.218358278274536 }, { "auxiliary_loss_clip": 0.01239757, "auxiliary_loss_mlp": 0.00251728, "balance_loss_clip": 1.02858853, "balance_loss_mlp": 0.22694454, "epoch": 0.850984518262438, "flos": 24389737770240.0, "grad_norm": 4.144986153690338, "language_loss": 0.86882555, "learning_rate": 2.2836451852005067e-07, "loss": 0.88374037, "num_input_tokens_seen": 305314705, "router_z_loss_clip": 2.11035156, "router_z_loss_mlp": 0.24768066, "step": 14154, "time_per_iteration": 2.809767961502075 }, { "auxiliary_loss_clip": 0.01227748, "auxiliary_loss_mlp": 0.00209223, "balance_loss_clip": 1.02105343, "balance_loss_mlp": 0.18576288, "epoch": 0.851044641515106, "flos": 23294821443840.0, "grad_norm": 3.0287336926344937, "language_loss": 0.85319293, "learning_rate": 2.281838289110165e-07, "loss": 0.86756271, "num_input_tokens_seen": 305333870, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.23449707, "step": 14155, "time_per_iteration": 4.153446197509766 }, { "auxiliary_loss_clip": 0.01258539, "auxiliary_loss_mlp": 0.00216193, "balance_loss_clip": 1.03894281, "balance_loss_mlp": 0.19143328, "epoch": 0.851104764767774, "flos": 22050085489920.0, "grad_norm": 243.16628309335073, "language_loss": 0.78534478, "learning_rate": 2.2800320648942904e-07, "loss": 0.8000921, "num_input_tokens_seen": 305352780, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24768066, "step": 14156, "time_per_iteration": 4.191095590591431 }, { "auxiliary_loss_clip": 0.01236125, "auxiliary_loss_mlp": 0.00220166, "balance_loss_clip": 1.02624655, "balance_loss_mlp": 0.1943213, "epoch": 0.8511648880204419, "flos": 20704728562560.0, "grad_norm": 3.593363054292012, "language_loss": 0.8346017, "learning_rate": 2.278226512621386e-07, "loss": 0.84916461, "num_input_tokens_seen": 305371370, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.25830078, "step": 14157, "time_per_iteration": 2.715684175491333 }, { "auxiliary_loss_clip": 0.01229575, "auxiliary_loss_mlp": 0.00205617, "balance_loss_clip": 1.01995468, "balance_loss_mlp": 0.18123919, "epoch": 0.8512250112731099, "flos": 24024669891840.0, "grad_norm": 7.890857554098454, "language_loss": 0.88135487, "learning_rate": 2.2764216323598995e-07, "loss": 0.89570683, "num_input_tokens_seen": 305387955, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.24365234, "step": 14158, "time_per_iteration": 2.8615832328796387 }, { "auxiliary_loss_clip": 0.01255845, "auxiliary_loss_mlp": 0.00225706, "balance_loss_clip": 1.0383265, "balance_loss_mlp": 0.19982539, "epoch": 0.8512851345257778, "flos": 22015467757440.0, "grad_norm": 10.897707131527621, "language_loss": 0.88103527, "learning_rate": 2.27461742417828e-07, "loss": 0.89585078, "num_input_tokens_seen": 305406285, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25854492, "step": 14159, "time_per_iteration": 2.7339324951171875 }, { "auxiliary_loss_clip": 0.01250811, "auxiliary_loss_mlp": 0.00210282, "balance_loss_clip": 1.0354948, "balance_loss_mlp": 0.18648778, "epoch": 0.8513452577784458, "flos": 14830209924480.0, "grad_norm": 6.123466150954596, "language_loss": 0.80008125, "learning_rate": 2.2728138881449488e-07, "loss": 0.81469214, "num_input_tokens_seen": 305424500, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23779297, "step": 14160, "time_per_iteration": 2.7723586559295654 }, { "auxiliary_loss_clip": 0.01280752, "auxiliary_loss_mlp": 0.00258613, "balance_loss_clip": 1.05445862, "balance_loss_mlp": 0.23213705, "epoch": 0.8514053810311137, "flos": 33035662166400.0, "grad_norm": 6.725414165892664, "language_loss": 0.79317331, "learning_rate": 2.2710110243282866e-07, "loss": 0.80856693, "num_input_tokens_seen": 305442990, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.26464844, "step": 14161, "time_per_iteration": 4.26505184173584 }, { "auxiliary_loss_clip": 0.01259131, "auxiliary_loss_mlp": 0.00206078, "balance_loss_clip": 1.03965068, "balance_loss_mlp": 0.18302271, "epoch": 0.8514655042837818, "flos": 27564456412800.0, "grad_norm": 86.94211923624658, "language_loss": 0.88269043, "learning_rate": 2.2692088327966653e-07, "loss": 0.89734256, "num_input_tokens_seen": 305463065, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.23059082, "step": 14162, "time_per_iteration": 2.7833611965179443 }, { "auxiliary_loss_clip": 0.0125095, "auxiliary_loss_mlp": 0.00234886, "balance_loss_clip": 1.03492117, "balance_loss_mlp": 0.21130598, "epoch": 0.8515256275364497, "flos": 35556052705920.0, "grad_norm": 2.575726209116466, "language_loss": 0.82904845, "learning_rate": 2.2674073136184235e-07, "loss": 0.84390676, "num_input_tokens_seen": 305489070, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.23596191, "step": 14163, "time_per_iteration": 2.9315714836120605 }, { "auxiliary_loss_clip": 0.01129777, "auxiliary_loss_mlp": 0.00154082, "balance_loss_clip": 0.98360658, "balance_loss_mlp": 0.14592844, "epoch": 0.8515857507891177, "flos": 70207372621440.0, "grad_norm": 0.7091481400333814, "language_loss": 0.54074597, "learning_rate": 2.2656064668618735e-07, "loss": 0.55358452, "num_input_tokens_seen": 305551490, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.08154297, "step": 14164, "time_per_iteration": 3.2960622310638428 }, { "auxiliary_loss_clip": 0.01255076, "auxiliary_loss_mlp": 0.0020716, "balance_loss_clip": 1.03193331, "balance_loss_mlp": 0.18112427, "epoch": 0.8516458740417857, "flos": 22675290641280.0, "grad_norm": 4.18781060008492, "language_loss": 0.82237101, "learning_rate": 2.2638062925953005e-07, "loss": 0.83699334, "num_input_tokens_seen": 305570535, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26025391, "step": 14165, "time_per_iteration": 4.171326637268066 }, { "auxiliary_loss_clip": 0.01233282, "auxiliary_loss_mlp": 0.0020967, "balance_loss_clip": 1.02059472, "balance_loss_mlp": 0.1856972, "epoch": 0.8517059972944536, "flos": 22747435107840.0, "grad_norm": 13.844316007808242, "language_loss": 0.75686961, "learning_rate": 2.26200679088697e-07, "loss": 0.77129912, "num_input_tokens_seen": 305590800, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.23974609, "step": 14166, "time_per_iteration": 2.7469704151153564 }, { "auxiliary_loss_clip": 0.01241911, "auxiliary_loss_mlp": 0.00217067, "balance_loss_clip": 1.02496386, "balance_loss_mlp": 0.19272463, "epoch": 0.8517661205471216, "flos": 21689147675520.0, "grad_norm": 34.60444735854965, "language_loss": 0.81544453, "learning_rate": 2.260207961805125e-07, "loss": 0.83003432, "num_input_tokens_seen": 305609495, "router_z_loss_clip": 2.16894531, "router_z_loss_mlp": 0.24316406, "step": 14167, "time_per_iteration": 2.7881689071655273 }, { "auxiliary_loss_clip": 0.01252032, "auxiliary_loss_mlp": 0.00225565, "balance_loss_clip": 1.03736687, "balance_loss_mlp": 0.20257001, "epoch": 0.8518262437997896, "flos": 25374839241600.0, "grad_norm": 83.19535898597242, "language_loss": 0.88236332, "learning_rate": 2.258409805417969e-07, "loss": 0.89713925, "num_input_tokens_seen": 305629420, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.23010254, "step": 14168, "time_per_iteration": 2.7600932121276855 }, { "auxiliary_loss_clip": 0.01236427, "auxiliary_loss_mlp": 0.00201441, "balance_loss_clip": 1.0263294, "balance_loss_mlp": 0.17817168, "epoch": 0.8518863670524576, "flos": 27235406897280.0, "grad_norm": 1823.1925775208308, "language_loss": 0.87161303, "learning_rate": 2.2566123217936893e-07, "loss": 0.88599169, "num_input_tokens_seen": 305649835, "router_z_loss_clip": 2.10058594, "router_z_loss_mlp": 0.23278809, "step": 14169, "time_per_iteration": 2.786895275115967 }, { "auxiliary_loss_clip": 0.01244102, "auxiliary_loss_mlp": 0.00224189, "balance_loss_clip": 1.03056037, "balance_loss_mlp": 0.19978639, "epoch": 0.8519464903051255, "flos": 20959514709120.0, "grad_norm": 1051.1813453464063, "language_loss": 0.75335133, "learning_rate": 2.254815511000452e-07, "loss": 0.76803422, "num_input_tokens_seen": 305668840, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24438477, "step": 14170, "time_per_iteration": 2.6893014907836914 }, { "auxiliary_loss_clip": 0.01241256, "auxiliary_loss_mlp": 0.00221648, "balance_loss_clip": 1.02853346, "balance_loss_mlp": 0.19858147, "epoch": 0.8520066135577935, "flos": 18441745862400.0, "grad_norm": 29.92199104197921, "language_loss": 0.97138882, "learning_rate": 2.253019373106384e-07, "loss": 0.98601782, "num_input_tokens_seen": 305686955, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.23059082, "step": 14171, "time_per_iteration": 2.7783195972442627 }, { "auxiliary_loss_clip": 0.0126444, "auxiliary_loss_mlp": 0.00224358, "balance_loss_clip": 1.04521179, "balance_loss_mlp": 0.19885918, "epoch": 0.8520667368104614, "flos": 29130233149440.0, "grad_norm": 209.04452275394402, "language_loss": 0.63514304, "learning_rate": 2.2512239081796003e-07, "loss": 0.65003109, "num_input_tokens_seen": 305706290, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25537109, "step": 14172, "time_per_iteration": 2.8143856525421143 }, { "auxiliary_loss_clip": 0.01237391, "auxiliary_loss_mlp": 0.00236081, "balance_loss_clip": 1.02709079, "balance_loss_mlp": 0.21253714, "epoch": 0.8521268600631294, "flos": 16034366488320.0, "grad_norm": 12.651143309092951, "language_loss": 0.78006637, "learning_rate": 2.2494291162881862e-07, "loss": 0.79480106, "num_input_tokens_seen": 305723835, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23547363, "step": 14173, "time_per_iteration": 2.6988277435302734 }, { "auxiliary_loss_clip": 0.01254228, "auxiliary_loss_mlp": 0.00224707, "balance_loss_clip": 1.03449416, "balance_loss_mlp": 0.19868377, "epoch": 0.8521869833157973, "flos": 22454870832000.0, "grad_norm": 11.174476138820927, "language_loss": 0.86767799, "learning_rate": 2.247634997500205e-07, "loss": 0.88246727, "num_input_tokens_seen": 305741655, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.26025391, "step": 14174, "time_per_iteration": 2.7365152835845947 }, { "auxiliary_loss_clip": 0.01273239, "auxiliary_loss_mlp": 0.00213262, "balance_loss_clip": 1.0491271, "balance_loss_mlp": 0.1892415, "epoch": 0.8522471065684654, "flos": 24972029147520.0, "grad_norm": 6.958771907138971, "language_loss": 0.90573955, "learning_rate": 2.245841551883676e-07, "loss": 0.92060453, "num_input_tokens_seen": 305761890, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.24023438, "step": 14175, "time_per_iteration": 2.8263590335845947 }, { "auxiliary_loss_clip": 0.01267034, "auxiliary_loss_mlp": 0.00220182, "balance_loss_clip": 1.04449284, "balance_loss_mlp": 0.19539826, "epoch": 0.8523072298211333, "flos": 17710604524800.0, "grad_norm": 453.03026976741785, "language_loss": 0.76684785, "learning_rate": 2.2440487795066153e-07, "loss": 0.78171992, "num_input_tokens_seen": 305779190, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.24780273, "step": 14176, "time_per_iteration": 2.7904422283172607 }, { "auxiliary_loss_clip": 0.01255723, "auxiliary_loss_mlp": 0.00205423, "balance_loss_clip": 1.04191041, "balance_loss_mlp": 0.18147346, "epoch": 0.8523673530738013, "flos": 25446193608960.0, "grad_norm": 5.515385238069052, "language_loss": 0.85774612, "learning_rate": 2.2422566804370068e-07, "loss": 0.87235761, "num_input_tokens_seen": 305799870, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.23962402, "step": 14177, "time_per_iteration": 2.7860147953033447 }, { "auxiliary_loss_clip": 0.01240433, "auxiliary_loss_mlp": 0.00196464, "balance_loss_clip": 1.02781177, "balance_loss_mlp": 0.17345662, "epoch": 0.8524274763264693, "flos": 31429593348480.0, "grad_norm": 2.0381909218283703, "language_loss": 0.82174653, "learning_rate": 2.2404652547428026e-07, "loss": 0.83611548, "num_input_tokens_seen": 305819695, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.23010254, "step": 14178, "time_per_iteration": 2.793318748474121 }, { "auxiliary_loss_clip": 0.01251478, "auxiliary_loss_mlp": 0.00211358, "balance_loss_clip": 1.03298807, "balance_loss_mlp": 0.18705107, "epoch": 0.8524875995791372, "flos": 17712651600000.0, "grad_norm": 13.658459694090839, "language_loss": 0.83910435, "learning_rate": 2.238674502491935e-07, "loss": 0.85373271, "num_input_tokens_seen": 305837270, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24353027, "step": 14179, "time_per_iteration": 2.7665469646453857 }, { "auxiliary_loss_clip": 0.01230958, "auxiliary_loss_mlp": 0.00195981, "balance_loss_clip": 1.0220437, "balance_loss_mlp": 0.17358187, "epoch": 0.8525477228318052, "flos": 21687316081920.0, "grad_norm": 47.22761672787492, "language_loss": 0.91867322, "learning_rate": 2.2368844237523165e-07, "loss": 0.93294257, "num_input_tokens_seen": 305855250, "router_z_loss_clip": 2.08691406, "router_z_loss_mlp": 0.22412109, "step": 14180, "time_per_iteration": 2.8783514499664307 }, { "auxiliary_loss_clip": 0.01241492, "auxiliary_loss_mlp": 0.00230801, "balance_loss_clip": 1.02845502, "balance_loss_mlp": 0.20602965, "epoch": 0.8526078460844732, "flos": 24827057856000.0, "grad_norm": 904.0512974710341, "language_loss": 0.72388911, "learning_rate": 2.235095018591815e-07, "loss": 0.738612, "num_input_tokens_seen": 305875660, "router_z_loss_clip": 2.13183594, "router_z_loss_mlp": 0.24755859, "step": 14181, "time_per_iteration": 2.7740259170532227 }, { "auxiliary_loss_clip": 0.01223399, "auxiliary_loss_mlp": 0.00207869, "balance_loss_clip": 1.01396203, "balance_loss_mlp": 0.18593405, "epoch": 0.8526679693371412, "flos": 13516418073600.0, "grad_norm": 134.33695646772333, "language_loss": 0.79605019, "learning_rate": 2.2333062870782894e-07, "loss": 0.81036282, "num_input_tokens_seen": 305892415, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.21936035, "step": 14182, "time_per_iteration": 2.826763153076172 }, { "auxiliary_loss_clip": 0.0123744, "auxiliary_loss_mlp": 0.00215363, "balance_loss_clip": 1.02801037, "balance_loss_mlp": 0.19191477, "epoch": 0.8527280925898091, "flos": 23514092017920.0, "grad_norm": 2.267210944939354, "language_loss": 0.77431792, "learning_rate": 2.2315182292795697e-07, "loss": 0.78884596, "num_input_tokens_seen": 305912665, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.23474121, "step": 14183, "time_per_iteration": 2.770207166671753 }, { "auxiliary_loss_clip": 0.01240955, "auxiliary_loss_mlp": 0.00190677, "balance_loss_clip": 1.02794707, "balance_loss_mlp": 0.1674436, "epoch": 0.8527882158424771, "flos": 20303031790080.0, "grad_norm": 5.980100954137653, "language_loss": 0.83422756, "learning_rate": 2.2297308452634644e-07, "loss": 0.84854388, "num_input_tokens_seen": 305931515, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.2322998, "step": 14184, "time_per_iteration": 2.731628656387329 }, { "auxiliary_loss_clip": 0.01238322, "auxiliary_loss_mlp": 0.00206753, "balance_loss_clip": 1.02404821, "balance_loss_mlp": 0.18372178, "epoch": 0.852848339095145, "flos": 17202504689280.0, "grad_norm": 150.94206999242556, "language_loss": 0.83370531, "learning_rate": 2.2279441350977457e-07, "loss": 0.84815598, "num_input_tokens_seen": 305949965, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.23022461, "step": 14185, "time_per_iteration": 2.7558999061584473 }, { "auxiliary_loss_clip": 0.01254934, "auxiliary_loss_mlp": 0.00202255, "balance_loss_clip": 1.03302288, "balance_loss_mlp": 0.17681536, "epoch": 0.852908462347813, "flos": 18368990864640.0, "grad_norm": 4.761952652462049, "language_loss": 0.8812983, "learning_rate": 2.2261580988501637e-07, "loss": 0.89587021, "num_input_tokens_seen": 305967820, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.25476074, "step": 14186, "time_per_iteration": 2.6657252311706543 }, { "auxiliary_loss_clip": 0.01256535, "auxiliary_loss_mlp": 0.00206505, "balance_loss_clip": 1.03513074, "balance_loss_mlp": 0.18232939, "epoch": 0.8529685856004809, "flos": 18624890332800.0, "grad_norm": 36.93822021850735, "language_loss": 0.70882916, "learning_rate": 2.224372736588449e-07, "loss": 0.72345954, "num_input_tokens_seen": 305985505, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.24169922, "step": 14187, "time_per_iteration": 2.7156872749328613 }, { "auxiliary_loss_clip": 0.01259433, "auxiliary_loss_mlp": 0.00224786, "balance_loss_clip": 1.03800917, "balance_loss_mlp": 0.19904888, "epoch": 0.853028708853149, "flos": 29607665748480.0, "grad_norm": 3.41686128624052, "language_loss": 0.83694756, "learning_rate": 2.2225880483803005e-07, "loss": 0.85178977, "num_input_tokens_seen": 306005220, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25744629, "step": 14188, "time_per_iteration": 2.7682626247406006 }, { "auxiliary_loss_clip": 0.01247338, "auxiliary_loss_mlp": 0.00241969, "balance_loss_clip": 1.03028679, "balance_loss_mlp": 0.21585047, "epoch": 0.8530888321058169, "flos": 26353153042560.0, "grad_norm": 9.325566898454634, "language_loss": 0.85525542, "learning_rate": 2.2208040342933932e-07, "loss": 0.87014854, "num_input_tokens_seen": 306023785, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.26135254, "step": 14189, "time_per_iteration": 2.786848783493042 }, { "auxiliary_loss_clip": 0.01235753, "auxiliary_loss_mlp": 0.00191638, "balance_loss_clip": 1.02354801, "balance_loss_mlp": 0.16760573, "epoch": 0.8531489553584849, "flos": 20521979141760.0, "grad_norm": 16.54709832786347, "language_loss": 0.8872025, "learning_rate": 2.2190206943953793e-07, "loss": 0.90147638, "num_input_tokens_seen": 306041600, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.24023438, "step": 14190, "time_per_iteration": 2.7175333499908447 }, { "auxiliary_loss_clip": 0.01246729, "auxiliary_loss_mlp": 0.00217318, "balance_loss_clip": 1.0326997, "balance_loss_mlp": 0.19272476, "epoch": 0.8532090786111529, "flos": 20704297599360.0, "grad_norm": 14.624858553508252, "language_loss": 0.8688097, "learning_rate": 2.2172380287538894e-07, "loss": 0.88345015, "num_input_tokens_seen": 306060345, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.24584961, "step": 14191, "time_per_iteration": 2.699312925338745 }, { "auxiliary_loss_clip": 0.01236335, "auxiliary_loss_mlp": 0.00192488, "balance_loss_clip": 1.02334034, "balance_loss_mlp": 0.1693258, "epoch": 0.8532692018638208, "flos": 19828903242240.0, "grad_norm": 92.29535001274854, "language_loss": 0.78834581, "learning_rate": 2.2154560374365073e-07, "loss": 0.80263406, "num_input_tokens_seen": 306078285, "router_z_loss_clip": 2.12792969, "router_z_loss_mlp": 0.23168945, "step": 14192, "time_per_iteration": 2.664381504058838 }, { "auxiliary_loss_clip": 0.012741, "auxiliary_loss_mlp": 0.00236348, "balance_loss_clip": 1.04508138, "balance_loss_mlp": 0.20751174, "epoch": 0.8533293251164888, "flos": 20996790048000.0, "grad_norm": 45.9291340734518, "language_loss": 0.73302686, "learning_rate": 2.2136747205108164e-07, "loss": 0.74813133, "num_input_tokens_seen": 306093760, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.28833008, "step": 14193, "time_per_iteration": 2.713536500930786 }, { "auxiliary_loss_clip": 0.01240803, "auxiliary_loss_mlp": 0.00207579, "balance_loss_clip": 1.02741015, "balance_loss_mlp": 0.18512008, "epoch": 0.8533894483691568, "flos": 22419606654720.0, "grad_norm": 48.19269509271729, "language_loss": 0.85302424, "learning_rate": 2.211894078044365e-07, "loss": 0.86750805, "num_input_tokens_seen": 306112595, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.2244873, "step": 14194, "time_per_iteration": 2.7124104499816895 }, { "auxiliary_loss_clip": 0.01244693, "auxiliary_loss_mlp": 0.00215583, "balance_loss_clip": 1.03163457, "balance_loss_mlp": 0.19186017, "epoch": 0.8534495716218248, "flos": 21616536332160.0, "grad_norm": 7.6247856326891785, "language_loss": 0.794029, "learning_rate": 2.2101141101046705e-07, "loss": 0.80863172, "num_input_tokens_seen": 306131800, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.23730469, "step": 14195, "time_per_iteration": 2.7437386512756348 }, { "auxiliary_loss_clip": 0.01244265, "auxiliary_loss_mlp": 0.00216609, "balance_loss_clip": 1.02499604, "balance_loss_mlp": 0.19182587, "epoch": 0.8535096948744927, "flos": 22346277039360.0, "grad_norm": 11.18822885220872, "language_loss": 0.95375526, "learning_rate": 2.2083348167592343e-07, "loss": 0.96836388, "num_input_tokens_seen": 306150590, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24780273, "step": 14196, "time_per_iteration": 2.7912211418151855 }, { "auxiliary_loss_clip": 0.01133135, "auxiliary_loss_mlp": 0.00129823, "balance_loss_clip": 0.98284769, "balance_loss_mlp": 0.12262288, "epoch": 0.8535698181271607, "flos": 52762507891200.0, "grad_norm": 0.7383048099582208, "language_loss": 0.54241753, "learning_rate": 2.2065561980755243e-07, "loss": 0.55504704, "num_input_tokens_seen": 306205850, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.07177734, "step": 14197, "time_per_iteration": 4.51965069770813 }, { "auxiliary_loss_clip": 0.01239716, "auxiliary_loss_mlp": 0.00208712, "balance_loss_clip": 1.02559638, "balance_loss_mlp": 0.18310626, "epoch": 0.8536299413798286, "flos": 19062892776960.0, "grad_norm": 426.1396147951634, "language_loss": 0.87044412, "learning_rate": 2.2047782541209826e-07, "loss": 0.88492841, "num_input_tokens_seen": 306225220, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.25622559, "step": 14198, "time_per_iteration": 4.123647212982178 }, { "auxiliary_loss_clip": 0.012326, "auxiliary_loss_mlp": 0.00214106, "balance_loss_clip": 1.02599359, "balance_loss_mlp": 0.19242185, "epoch": 0.8536900646324966, "flos": 49344743871360.0, "grad_norm": 93.84193705560455, "language_loss": 0.75962377, "learning_rate": 2.203000984963035e-07, "loss": 0.77409077, "num_input_tokens_seen": 306249865, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.2166748, "step": 14199, "time_per_iteration": 2.9536290168762207 }, { "auxiliary_loss_clip": 0.01219949, "auxiliary_loss_mlp": 0.0019128, "balance_loss_clip": 1.01662898, "balance_loss_mlp": 0.17004904, "epoch": 0.8537501878851645, "flos": 21762333636480.0, "grad_norm": 3.5701672379447653, "language_loss": 0.92954105, "learning_rate": 2.201224390669072e-07, "loss": 0.94365335, "num_input_tokens_seen": 306270215, "router_z_loss_clip": 2.03027344, "router_z_loss_mlp": 0.21228027, "step": 14200, "time_per_iteration": 2.7563672065734863 }, { "auxiliary_loss_clip": 0.01240328, "auxiliary_loss_mlp": 0.00218898, "balance_loss_clip": 1.02623713, "balance_loss_mlp": 0.1941622, "epoch": 0.8538103111378326, "flos": 22269176496000.0, "grad_norm": 4.714167957645843, "language_loss": 0.85537696, "learning_rate": 2.1994484713064666e-07, "loss": 0.86996919, "num_input_tokens_seen": 306288960, "router_z_loss_clip": 2.14355469, "router_z_loss_mlp": 0.24731445, "step": 14201, "time_per_iteration": 2.8623204231262207 }, { "auxiliary_loss_clip": 0.01234749, "auxiliary_loss_mlp": 0.00212696, "balance_loss_clip": 1.02659726, "balance_loss_mlp": 0.19068956, "epoch": 0.8538704343905005, "flos": 20303929630080.0, "grad_norm": 13.516607827578337, "language_loss": 0.75938499, "learning_rate": 2.19767322694256e-07, "loss": 0.7738595, "num_input_tokens_seen": 306308735, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.2199707, "step": 14202, "time_per_iteration": 2.6796255111694336 }, { "auxiliary_loss_clip": 0.01254674, "auxiliary_loss_mlp": 0.00186243, "balance_loss_clip": 1.03566587, "balance_loss_mlp": 0.16373622, "epoch": 0.8539305576431685, "flos": 24755164784640.0, "grad_norm": 5.023878378164333, "language_loss": 0.89638662, "learning_rate": 2.195898657644666e-07, "loss": 0.91079581, "num_input_tokens_seen": 306329015, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.22509766, "step": 14203, "time_per_iteration": 4.127129316329956 }, { "auxiliary_loss_clip": 0.0125949, "auxiliary_loss_mlp": 0.00232702, "balance_loss_clip": 1.03650784, "balance_loss_mlp": 0.20630893, "epoch": 0.8539906808958365, "flos": 26687625511680.0, "grad_norm": 65.8060890687957, "language_loss": 0.7682395, "learning_rate": 2.1941247634800808e-07, "loss": 0.78316146, "num_input_tokens_seen": 306349085, "router_z_loss_clip": 2.22949219, "router_z_loss_mlp": 0.26379395, "step": 14204, "time_per_iteration": 2.824389696121216 }, { "auxiliary_loss_clip": 0.01249102, "auxiliary_loss_mlp": 0.00229699, "balance_loss_clip": 1.03178847, "balance_loss_mlp": 0.20598796, "epoch": 0.8540508041485044, "flos": 13365521038080.0, "grad_norm": 6.436336562745407, "language_loss": 0.72891641, "learning_rate": 2.1923515445160667e-07, "loss": 0.74370438, "num_input_tokens_seen": 306365385, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.23718262, "step": 14205, "time_per_iteration": 2.6321768760681152 }, { "auxiliary_loss_clip": 0.01251596, "auxiliary_loss_mlp": 0.00217161, "balance_loss_clip": 1.03855133, "balance_loss_mlp": 0.1929739, "epoch": 0.8541109274011724, "flos": 32780876019840.0, "grad_norm": 4.21767213452589, "language_loss": 0.80382228, "learning_rate": 2.1905790008198655e-07, "loss": 0.81850988, "num_input_tokens_seen": 306384585, "router_z_loss_clip": 2.13378906, "router_z_loss_mlp": 0.24182129, "step": 14206, "time_per_iteration": 2.7675914764404297 }, { "auxiliary_loss_clip": 0.01260044, "auxiliary_loss_mlp": 0.00224687, "balance_loss_clip": 1.03719652, "balance_loss_mlp": 0.19896166, "epoch": 0.8541710506538404, "flos": 17639286071040.0, "grad_norm": 54.9764765995017, "language_loss": 0.88888097, "learning_rate": 2.1888071324586987e-07, "loss": 0.90372825, "num_input_tokens_seen": 306401565, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25756836, "step": 14207, "time_per_iteration": 4.073173761367798 }, { "auxiliary_loss_clip": 0.01252617, "auxiliary_loss_mlp": 0.00229623, "balance_loss_clip": 1.03595424, "balance_loss_mlp": 0.20303899, "epoch": 0.8542311739065084, "flos": 20263062931200.0, "grad_norm": 12.718075233346875, "language_loss": 0.91574287, "learning_rate": 2.1870359394997485e-07, "loss": 0.93056524, "num_input_tokens_seen": 306419995, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.26586914, "step": 14208, "time_per_iteration": 2.648850679397583 }, { "auxiliary_loss_clip": 0.01230692, "auxiliary_loss_mlp": 0.00211794, "balance_loss_clip": 1.02388048, "balance_loss_mlp": 0.18851198, "epoch": 0.8542912971591763, "flos": 17785657992960.0, "grad_norm": 14.432771171216665, "language_loss": 0.77405137, "learning_rate": 2.1852654220101785e-07, "loss": 0.78847623, "num_input_tokens_seen": 306439240, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.23303223, "step": 14209, "time_per_iteration": 2.6812827587127686 }, { "auxiliary_loss_clip": 0.01232195, "auxiliary_loss_mlp": 0.0019257, "balance_loss_clip": 1.02178574, "balance_loss_mlp": 0.1709339, "epoch": 0.8543514204118443, "flos": 26979507429120.0, "grad_norm": 3.5673983151773903, "language_loss": 0.76126301, "learning_rate": 2.1834955800571287e-07, "loss": 0.77551067, "num_input_tokens_seen": 306458425, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.21643066, "step": 14210, "time_per_iteration": 2.706014633178711 }, { "auxiliary_loss_clip": 0.01253471, "auxiliary_loss_mlp": 0.00225906, "balance_loss_clip": 1.03278923, "balance_loss_mlp": 0.19968036, "epoch": 0.8544115436645122, "flos": 24024598064640.0, "grad_norm": 29.596345502021187, "language_loss": 0.76811945, "learning_rate": 2.1817264137077141e-07, "loss": 0.78291321, "num_input_tokens_seen": 306477210, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.26220703, "step": 14211, "time_per_iteration": 2.7154407501220703 }, { "auxiliary_loss_clip": 0.01252576, "auxiliary_loss_mlp": 0.00214072, "balance_loss_clip": 1.03380501, "balance_loss_mlp": 0.19087401, "epoch": 0.8544716669171802, "flos": 16617986668800.0, "grad_norm": 3.813895657578689, "language_loss": 0.92968673, "learning_rate": 2.1799579230290166e-07, "loss": 0.94435322, "num_input_tokens_seen": 306495820, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.23193359, "step": 14212, "time_per_iteration": 2.65909743309021 }, { "auxiliary_loss_clip": 0.01246094, "auxiliary_loss_mlp": 0.00209979, "balance_loss_clip": 1.02941751, "balance_loss_mlp": 0.18533805, "epoch": 0.8545317901698481, "flos": 40005779489280.0, "grad_norm": 25.032864588699223, "language_loss": 0.77062875, "learning_rate": 2.178190108088105e-07, "loss": 0.78518945, "num_input_tokens_seen": 306516420, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24621582, "step": 14213, "time_per_iteration": 2.8303985595703125 }, { "auxiliary_loss_clip": 0.01232528, "auxiliary_loss_mlp": 0.00202601, "balance_loss_clip": 1.02179742, "balance_loss_mlp": 0.17831811, "epoch": 0.8545919134225162, "flos": 19902520166400.0, "grad_norm": 150.61969479948013, "language_loss": 0.8651402, "learning_rate": 2.1764229689520098e-07, "loss": 0.87949145, "num_input_tokens_seen": 306534785, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.24316406, "step": 14214, "time_per_iteration": 2.667465925216675 }, { "auxiliary_loss_clip": 0.01277145, "auxiliary_loss_mlp": 0.00206996, "balance_loss_clip": 1.04636216, "balance_loss_mlp": 0.18028107, "epoch": 0.8546520366751841, "flos": 18952970181120.0, "grad_norm": 11.804320066745388, "language_loss": 0.79449391, "learning_rate": 2.1746565056877397e-07, "loss": 0.80933535, "num_input_tokens_seen": 306552440, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.26733398, "step": 14215, "time_per_iteration": 2.644312858581543 }, { "auxiliary_loss_clip": 0.01226209, "auxiliary_loss_mlp": 0.00206477, "balance_loss_clip": 1.01645863, "balance_loss_mlp": 0.1832906, "epoch": 0.8547121599278521, "flos": 35621445415680.0, "grad_norm": 118.47439639825717, "language_loss": 0.70666015, "learning_rate": 2.172890718362279e-07, "loss": 0.72098702, "num_input_tokens_seen": 306573600, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.23205566, "step": 14216, "time_per_iteration": 2.809391736984253 }, { "auxiliary_loss_clip": 0.0125998, "auxiliary_loss_mlp": 0.00225415, "balance_loss_clip": 1.03471088, "balance_loss_mlp": 0.19972566, "epoch": 0.8547722831805201, "flos": 16910048154240.0, "grad_norm": 21.00571313300758, "language_loss": 0.75093275, "learning_rate": 2.17112560704259e-07, "loss": 0.76578665, "num_input_tokens_seen": 306592840, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.25720215, "step": 14217, "time_per_iteration": 2.7738723754882812 }, { "auxiliary_loss_clip": 0.0124733, "auxiliary_loss_mlp": 0.00208726, "balance_loss_clip": 1.0378468, "balance_loss_mlp": 0.18551588, "epoch": 0.854832406433188, "flos": 23002616304000.0, "grad_norm": 22.947881151272544, "language_loss": 0.72966808, "learning_rate": 2.1693611717956072e-07, "loss": 0.7442286, "num_input_tokens_seen": 306613210, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.23217773, "step": 14218, "time_per_iteration": 2.8099868297576904 }, { "auxiliary_loss_clip": 0.01248666, "auxiliary_loss_mlp": 0.00219945, "balance_loss_clip": 1.02973199, "balance_loss_mlp": 0.19476837, "epoch": 0.854892529685856, "flos": 20412595249920.0, "grad_norm": 4.40077728852622, "language_loss": 0.7817682, "learning_rate": 2.167597412688238e-07, "loss": 0.79645431, "num_input_tokens_seen": 306631620, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25170898, "step": 14219, "time_per_iteration": 2.696798086166382 }, { "auxiliary_loss_clip": 0.01265443, "auxiliary_loss_mlp": 0.00218855, "balance_loss_clip": 1.04214346, "balance_loss_mlp": 0.19253349, "epoch": 0.854952652938524, "flos": 16398716094720.0, "grad_norm": 13.047788904656594, "language_loss": 0.81010783, "learning_rate": 2.1658343297873549e-07, "loss": 0.82495081, "num_input_tokens_seen": 306646695, "router_z_loss_clip": 2.23339844, "router_z_loss_mlp": 0.26318359, "step": 14220, "time_per_iteration": 2.6991024017333984 }, { "auxiliary_loss_clip": 0.01239899, "auxiliary_loss_mlp": 0.00214999, "balance_loss_clip": 1.02666926, "balance_loss_mlp": 0.19047767, "epoch": 0.855012776191192, "flos": 21178677542400.0, "grad_norm": 139.84482742079985, "language_loss": 0.78693753, "learning_rate": 2.164071923159827e-07, "loss": 0.80148649, "num_input_tokens_seen": 306665465, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24536133, "step": 14221, "time_per_iteration": 2.7621543407440186 }, { "auxiliary_loss_clip": 0.01272582, "auxiliary_loss_mlp": 0.00210878, "balance_loss_clip": 1.04956591, "balance_loss_mlp": 0.18529549, "epoch": 0.8550728994438599, "flos": 26140993361280.0, "grad_norm": 3.9214346050682143, "language_loss": 0.70486623, "learning_rate": 2.1623101928724763e-07, "loss": 0.71970081, "num_input_tokens_seen": 306685950, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.25585938, "step": 14222, "time_per_iteration": 2.784186363220215 }, { "auxiliary_loss_clip": 0.01223894, "auxiliary_loss_mlp": 0.00198918, "balance_loss_clip": 1.0126735, "balance_loss_mlp": 0.1750527, "epoch": 0.8551330226965279, "flos": 22786793435520.0, "grad_norm": 31.995705457320025, "language_loss": 0.88908088, "learning_rate": 2.1605491389921093e-07, "loss": 0.90330899, "num_input_tokens_seen": 306705740, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.23876953, "step": 14223, "time_per_iteration": 2.75124192237854 }, { "auxiliary_loss_clip": 0.0125259, "auxiliary_loss_mlp": 0.00213652, "balance_loss_clip": 1.03844619, "balance_loss_mlp": 0.1898818, "epoch": 0.8551931459491958, "flos": 22419032037120.0, "grad_norm": 48.36460215251908, "language_loss": 0.82145512, "learning_rate": 2.158788761585515e-07, "loss": 0.83611757, "num_input_tokens_seen": 306725065, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.2376709, "step": 14224, "time_per_iteration": 2.7249348163604736 }, { "auxiliary_loss_clip": 0.01241366, "auxiliary_loss_mlp": 0.0020376, "balance_loss_clip": 1.02866447, "balance_loss_mlp": 0.17983516, "epoch": 0.8552532692018638, "flos": 19573183342080.0, "grad_norm": 45.07361865871097, "language_loss": 0.84686255, "learning_rate": 2.1570290607194307e-07, "loss": 0.86131382, "num_input_tokens_seen": 306743630, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.23950195, "step": 14225, "time_per_iteration": 2.725727081298828 }, { "auxiliary_loss_clip": 0.01244314, "auxiliary_loss_mlp": 0.00226987, "balance_loss_clip": 1.03027296, "balance_loss_mlp": 0.20343086, "epoch": 0.8553133924545318, "flos": 26432767537920.0, "grad_norm": 16.08362745847981, "language_loss": 0.85277921, "learning_rate": 2.1552700364605925e-07, "loss": 0.8674922, "num_input_tokens_seen": 306763105, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.23535156, "step": 14226, "time_per_iteration": 2.9028964042663574 }, { "auxiliary_loss_clip": 0.01262046, "auxiliary_loss_mlp": 0.00217729, "balance_loss_clip": 1.0395658, "balance_loss_mlp": 0.19335033, "epoch": 0.8553735157071998, "flos": 16362446336640.0, "grad_norm": 24.65284595382217, "language_loss": 0.63698936, "learning_rate": 2.153511688875702e-07, "loss": 0.65178716, "num_input_tokens_seen": 306779875, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.24377441, "step": 14227, "time_per_iteration": 2.6965513229370117 }, { "auxiliary_loss_clip": 0.0122455, "auxiliary_loss_mlp": 0.00211632, "balance_loss_clip": 1.01162446, "balance_loss_mlp": 0.18770671, "epoch": 0.8554336389598677, "flos": 20887334328960.0, "grad_norm": 10.423763194389622, "language_loss": 0.75225562, "learning_rate": 2.151754018031442e-07, "loss": 0.76661742, "num_input_tokens_seen": 306800015, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.23925781, "step": 14228, "time_per_iteration": 2.6728408336639404 }, { "auxiliary_loss_clip": 0.01255689, "auxiliary_loss_mlp": 0.00211058, "balance_loss_clip": 1.03510046, "balance_loss_mlp": 0.1854037, "epoch": 0.8554937622125357, "flos": 21284721469440.0, "grad_norm": 6.898935780436777, "language_loss": 0.82907015, "learning_rate": 2.1499970239944542e-07, "loss": 0.84373766, "num_input_tokens_seen": 306814160, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25646973, "step": 14229, "time_per_iteration": 2.674943685531616 }, { "auxiliary_loss_clip": 0.01245156, "auxiliary_loss_mlp": 0.00212763, "balance_loss_clip": 1.03038049, "balance_loss_mlp": 0.18964782, "epoch": 0.8555538854652037, "flos": 22413178120320.0, "grad_norm": 26.720097611356532, "language_loss": 0.79197478, "learning_rate": 2.1482407068313724e-07, "loss": 0.80655402, "num_input_tokens_seen": 306833310, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.23120117, "step": 14230, "time_per_iteration": 2.656473159790039 }, { "auxiliary_loss_clip": 0.01255848, "auxiliary_loss_mlp": 0.0021881, "balance_loss_clip": 1.04243016, "balance_loss_mlp": 0.19534962, "epoch": 0.8556140087178716, "flos": 20193719725440.0, "grad_norm": 4.436299622163843, "language_loss": 0.89884806, "learning_rate": 2.1464850666087897e-07, "loss": 0.91359466, "num_input_tokens_seen": 306851345, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.23449707, "step": 14231, "time_per_iteration": 2.6727588176727295 }, { "auxiliary_loss_clip": 0.01270307, "auxiliary_loss_mlp": 0.00224638, "balance_loss_clip": 1.04724705, "balance_loss_mlp": 0.19857889, "epoch": 0.8556741319705397, "flos": 22638123043200.0, "grad_norm": 9.644639556269928, "language_loss": 0.77335703, "learning_rate": 2.1447301033932796e-07, "loss": 0.78830647, "num_input_tokens_seen": 306871040, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26062012, "step": 14232, "time_per_iteration": 2.658348321914673 }, { "auxiliary_loss_clip": 0.01258288, "auxiliary_loss_mlp": 0.00195865, "balance_loss_clip": 1.03983426, "balance_loss_mlp": 0.17154679, "epoch": 0.8557342552232076, "flos": 23549320281600.0, "grad_norm": 7.897309040130791, "language_loss": 0.74132168, "learning_rate": 2.1429758172513955e-07, "loss": 0.75586319, "num_input_tokens_seen": 306891625, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24304199, "step": 14233, "time_per_iteration": 2.756861448287964 }, { "auxiliary_loss_clip": 0.01245443, "auxiliary_loss_mlp": 0.00207841, "balance_loss_clip": 1.02933812, "balance_loss_mlp": 0.18348616, "epoch": 0.8557943784758756, "flos": 19609884063360.0, "grad_norm": 22.157818050144954, "language_loss": 0.84035808, "learning_rate": 2.1412222082496556e-07, "loss": 0.85489088, "num_input_tokens_seen": 306910020, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24353027, "step": 14234, "time_per_iteration": 2.6609466075897217 }, { "auxiliary_loss_clip": 0.01145936, "auxiliary_loss_mlp": 0.00167452, "balance_loss_clip": 0.99543345, "balance_loss_mlp": 0.15886906, "epoch": 0.8558545017285435, "flos": 70641891446400.0, "grad_norm": 0.8681460424089202, "language_loss": 0.57574439, "learning_rate": 2.1394692764545684e-07, "loss": 0.58887827, "num_input_tokens_seen": 306969505, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.0859375, "step": 14235, "time_per_iteration": 3.1162047386169434 }, { "auxiliary_loss_clip": 0.01142364, "auxiliary_loss_mlp": 0.00176964, "balance_loss_clip": 0.99363172, "balance_loss_mlp": 0.16852359, "epoch": 0.8559146249812115, "flos": 56649983086080.0, "grad_norm": 0.7739741556005995, "language_loss": 0.5603987, "learning_rate": 2.1377170219325858e-07, "loss": 0.57359201, "num_input_tokens_seen": 307027710, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.08447266, "step": 14236, "time_per_iteration": 3.032259702682495 }, { "auxiliary_loss_clip": 0.01253851, "auxiliary_loss_mlp": 0.00228232, "balance_loss_clip": 1.03674555, "balance_loss_mlp": 0.20288828, "epoch": 0.8559747482338794, "flos": 22888240421760.0, "grad_norm": 42.736410455997586, "language_loss": 0.7899183, "learning_rate": 2.1359654447501673e-07, "loss": 0.80473906, "num_input_tokens_seen": 307045515, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25341797, "step": 14237, "time_per_iteration": 2.6695728302001953 }, { "auxiliary_loss_clip": 0.01221004, "auxiliary_loss_mlp": 0.00205229, "balance_loss_clip": 1.01238441, "balance_loss_mlp": 0.1825431, "epoch": 0.8560348714865474, "flos": 22601925112320.0, "grad_norm": 21.689396647985976, "language_loss": 0.71575499, "learning_rate": 2.1342145449737314e-07, "loss": 0.7300173, "num_input_tokens_seen": 307064470, "router_z_loss_clip": 2.08691406, "router_z_loss_mlp": 0.22680664, "step": 14238, "time_per_iteration": 2.7927229404449463 }, { "auxiliary_loss_clip": 0.01212926, "auxiliary_loss_mlp": 0.00194772, "balance_loss_clip": 1.00898623, "balance_loss_mlp": 0.17438744, "epoch": 0.8560949947392154, "flos": 17931455297280.0, "grad_norm": 332.58174176644667, "language_loss": 0.75380278, "learning_rate": 2.1324643226696648e-07, "loss": 0.76787972, "num_input_tokens_seen": 307083900, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.20361328, "step": 14239, "time_per_iteration": 4.12511134147644 }, { "auxiliary_loss_clip": 0.01267578, "auxiliary_loss_mlp": 0.00235068, "balance_loss_clip": 1.04420662, "balance_loss_mlp": 0.20878235, "epoch": 0.8561551179918834, "flos": 31026208636800.0, "grad_norm": 13.16943897769031, "language_loss": 0.75299382, "learning_rate": 2.1307147779043455e-07, "loss": 0.76802027, "num_input_tokens_seen": 307104590, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.26269531, "step": 14240, "time_per_iteration": 2.7529990673065186 }, { "auxiliary_loss_clip": 0.01253142, "auxiliary_loss_mlp": 0.00197938, "balance_loss_clip": 1.03523231, "balance_loss_mlp": 0.17353548, "epoch": 0.8562152412445513, "flos": 30665198995200.0, "grad_norm": 2.160219111091949, "language_loss": 0.7285167, "learning_rate": 2.1289659107441182e-07, "loss": 0.74302745, "num_input_tokens_seen": 307125580, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.24389648, "step": 14241, "time_per_iteration": 4.186947584152222 }, { "auxiliary_loss_clip": 0.01265034, "auxiliary_loss_mlp": 0.00225168, "balance_loss_clip": 1.03962266, "balance_loss_mlp": 0.19691503, "epoch": 0.8562753644972193, "flos": 31576144838400.0, "grad_norm": 5.472323926969982, "language_loss": 0.80293143, "learning_rate": 2.1272177212552855e-07, "loss": 0.81783342, "num_input_tokens_seen": 307147625, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.2824707, "step": 14242, "time_per_iteration": 2.7701523303985596 }, { "auxiliary_loss_clip": 0.01253444, "auxiliary_loss_mlp": 0.00201492, "balance_loss_clip": 1.0357058, "balance_loss_mlp": 0.17668432, "epoch": 0.8563354877498872, "flos": 26213640618240.0, "grad_norm": 130.20637617507347, "language_loss": 0.86712551, "learning_rate": 2.1254702095041498e-07, "loss": 0.88167489, "num_input_tokens_seen": 307164665, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.24780273, "step": 14243, "time_per_iteration": 2.7446744441986084 }, { "auxiliary_loss_clip": 0.01252421, "auxiliary_loss_mlp": 0.00218071, "balance_loss_clip": 1.03692508, "balance_loss_mlp": 0.19456294, "epoch": 0.8563956110025552, "flos": 24134341092480.0, "grad_norm": 2.2189827980436974, "language_loss": 0.76102471, "learning_rate": 2.123723375556974e-07, "loss": 0.77572966, "num_input_tokens_seen": 307182530, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23498535, "step": 14244, "time_per_iteration": 2.7434844970703125 }, { "auxiliary_loss_clip": 0.0114069, "auxiliary_loss_mlp": 0.00149915, "balance_loss_clip": 0.99311423, "balance_loss_mlp": 0.14176062, "epoch": 0.8564557342552233, "flos": 56271986311680.0, "grad_norm": 0.8095457134742159, "language_loss": 0.57555372, "learning_rate": 2.1219772194800046e-07, "loss": 0.58845973, "num_input_tokens_seen": 307241240, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.08154297, "step": 14245, "time_per_iteration": 4.5041303634643555 }, { "auxiliary_loss_clip": 0.01269832, "auxiliary_loss_mlp": 0.00214895, "balance_loss_clip": 1.04705715, "balance_loss_mlp": 0.19008774, "epoch": 0.8565158575078912, "flos": 23440618748160.0, "grad_norm": 478.312023433773, "language_loss": 0.87507021, "learning_rate": 2.1202317413394488e-07, "loss": 0.88991749, "num_input_tokens_seen": 307261485, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.24841309, "step": 14246, "time_per_iteration": 2.8442184925079346 }, { "auxiliary_loss_clip": 0.01238404, "auxiliary_loss_mlp": 0.0019973, "balance_loss_clip": 1.02711785, "balance_loss_mlp": 0.17735487, "epoch": 0.8565759807605592, "flos": 20375930442240.0, "grad_norm": 20.99201303334203, "language_loss": 0.90182281, "learning_rate": 2.1184869412014938e-07, "loss": 0.91620415, "num_input_tokens_seen": 307279160, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.22363281, "step": 14247, "time_per_iteration": 2.6937243938446045 }, { "auxiliary_loss_clip": 0.01262438, "auxiliary_loss_mlp": 0.00239529, "balance_loss_clip": 1.04131222, "balance_loss_mlp": 0.21165764, "epoch": 0.8566361040132271, "flos": 18807101049600.0, "grad_norm": 247.94705937489417, "language_loss": 0.86517954, "learning_rate": 2.1167428191323112e-07, "loss": 0.88019919, "num_input_tokens_seen": 307297920, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.27844238, "step": 14248, "time_per_iteration": 2.642778158187866 }, { "auxiliary_loss_clip": 0.01253245, "auxiliary_loss_mlp": 0.00254475, "balance_loss_clip": 1.03356254, "balance_loss_mlp": 0.22799878, "epoch": 0.8566962272658951, "flos": 24535355506560.0, "grad_norm": 13.409694701844776, "language_loss": 0.86520642, "learning_rate": 2.1149993751980278e-07, "loss": 0.88028359, "num_input_tokens_seen": 307318320, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.26477051, "step": 14249, "time_per_iteration": 4.104541063308716 }, { "auxiliary_loss_clip": 0.01247946, "auxiliary_loss_mlp": 0.00225099, "balance_loss_clip": 1.03042459, "balance_loss_mlp": 0.20125695, "epoch": 0.856756350518563, "flos": 23178506227200.0, "grad_norm": 63.172656568682534, "language_loss": 0.87011147, "learning_rate": 2.1132566094647597e-07, "loss": 0.8848418, "num_input_tokens_seen": 307336720, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.23876953, "step": 14250, "time_per_iteration": 2.69720196723938 }, { "auxiliary_loss_clip": 0.01243152, "auxiliary_loss_mlp": 0.00207087, "balance_loss_clip": 1.03123665, "balance_loss_mlp": 0.1847589, "epoch": 0.856816473771231, "flos": 20808581760000.0, "grad_norm": 14.101776121304919, "language_loss": 0.86379099, "learning_rate": 2.1115145219985942e-07, "loss": 0.8782934, "num_input_tokens_seen": 307354120, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.22314453, "step": 14251, "time_per_iteration": 2.6781182289123535 }, { "auxiliary_loss_clip": 0.01238898, "auxiliary_loss_mlp": 0.00215566, "balance_loss_clip": 1.02551973, "balance_loss_mlp": 0.19239151, "epoch": 0.856876597023899, "flos": 20228157889920.0, "grad_norm": 252.13311251672474, "language_loss": 0.70115763, "learning_rate": 2.1097731128656005e-07, "loss": 0.7157023, "num_input_tokens_seen": 307373165, "router_z_loss_clip": 2.13378906, "router_z_loss_mlp": 0.23193359, "step": 14252, "time_per_iteration": 2.7052817344665527 }, { "auxiliary_loss_clip": 0.01263105, "auxiliary_loss_mlp": 0.00219233, "balance_loss_clip": 1.03997445, "balance_loss_mlp": 0.19278002, "epoch": 0.856936720276567, "flos": 18296128126080.0, "grad_norm": 21.217400493091702, "language_loss": 0.82216579, "learning_rate": 2.1080323821317924e-07, "loss": 0.83698916, "num_input_tokens_seen": 307391000, "router_z_loss_clip": 2.23339844, "router_z_loss_mlp": 0.26428223, "step": 14253, "time_per_iteration": 2.669016122817993 }, { "auxiliary_loss_clip": 0.01137357, "auxiliary_loss_mlp": 0.00154473, "balance_loss_clip": 0.99082667, "balance_loss_mlp": 0.14665264, "epoch": 0.8569968435292349, "flos": 69878394933120.0, "grad_norm": 0.7945412955892998, "language_loss": 0.5825845, "learning_rate": 2.1062923298631907e-07, "loss": 0.59550279, "num_input_tokens_seen": 307452865, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.078125, "step": 14254, "time_per_iteration": 3.2103559970855713 }, { "auxiliary_loss_clip": 0.01239695, "auxiliary_loss_mlp": 0.00227129, "balance_loss_clip": 1.02735806, "balance_loss_mlp": 0.20192836, "epoch": 0.8570569667819029, "flos": 25848572739840.0, "grad_norm": 126.12193401530179, "language_loss": 0.89209735, "learning_rate": 2.1045529561257825e-07, "loss": 0.90676558, "num_input_tokens_seen": 307471940, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.25195312, "step": 14255, "time_per_iteration": 2.672116279602051 }, { "auxiliary_loss_clip": 0.01225835, "auxiliary_loss_mlp": 0.00200686, "balance_loss_clip": 1.01705182, "balance_loss_mlp": 0.17895389, "epoch": 0.8571170900345708, "flos": 23257115141760.0, "grad_norm": 451.84424446651053, "language_loss": 0.75935209, "learning_rate": 2.1028142609855126e-07, "loss": 0.77361727, "num_input_tokens_seen": 307488745, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.21740723, "step": 14256, "time_per_iteration": 2.682065963745117 }, { "auxiliary_loss_clip": 0.01255972, "auxiliary_loss_mlp": 0.0021348, "balance_loss_clip": 1.04047441, "balance_loss_mlp": 0.18967399, "epoch": 0.8571772132872388, "flos": 18917670090240.0, "grad_norm": 17.451434839614016, "language_loss": 0.76373816, "learning_rate": 2.1010762445083218e-07, "loss": 0.77843261, "num_input_tokens_seen": 307506855, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23791504, "step": 14257, "time_per_iteration": 2.6956496238708496 }, { "auxiliary_loss_clip": 0.01246871, "auxiliary_loss_mlp": 0.0021101, "balance_loss_clip": 1.03511131, "balance_loss_mlp": 0.18607193, "epoch": 0.8572373365399069, "flos": 33250120318080.0, "grad_norm": 421.0781086393169, "language_loss": 0.86409974, "learning_rate": 2.0993389067601197e-07, "loss": 0.87867856, "num_input_tokens_seen": 307526115, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.24951172, "step": 14258, "time_per_iteration": 2.7596445083618164 }, { "auxiliary_loss_clip": 0.01250371, "auxiliary_loss_mlp": 0.00209502, "balance_loss_clip": 1.0356133, "balance_loss_mlp": 0.18444464, "epoch": 0.8572974597925748, "flos": 23327535755520.0, "grad_norm": 7.332504997617393, "language_loss": 0.76492095, "learning_rate": 2.0976022478067735e-07, "loss": 0.77951974, "num_input_tokens_seen": 307545230, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.25085449, "step": 14259, "time_per_iteration": 2.6724648475646973 }, { "auxiliary_loss_clip": 0.01253252, "auxiliary_loss_mlp": 0.00214059, "balance_loss_clip": 1.03006542, "balance_loss_mlp": 0.18833363, "epoch": 0.8573575830452428, "flos": 24535858296960.0, "grad_norm": 14.316764368645496, "language_loss": 0.85032153, "learning_rate": 2.0958662677141437e-07, "loss": 0.86499465, "num_input_tokens_seen": 307564900, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25708008, "step": 14260, "time_per_iteration": 2.702571392059326 }, { "auxiliary_loss_clip": 0.0126376, "auxiliary_loss_mlp": 0.00218801, "balance_loss_clip": 1.04155612, "balance_loss_mlp": 0.19252746, "epoch": 0.8574177062979107, "flos": 24165403378560.0, "grad_norm": 107.37341959075485, "language_loss": 0.83875191, "learning_rate": 2.09413096654806e-07, "loss": 0.85357749, "num_input_tokens_seen": 307583500, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.26293945, "step": 14261, "time_per_iteration": 2.7587006092071533 }, { "auxiliary_loss_clip": 0.01259759, "auxiliary_loss_mlp": 0.00219734, "balance_loss_clip": 1.03430367, "balance_loss_mlp": 0.19357976, "epoch": 0.8574778295505787, "flos": 17930737025280.0, "grad_norm": 12.959787545880534, "language_loss": 0.87616968, "learning_rate": 2.0923963443743276e-07, "loss": 0.89096463, "num_input_tokens_seen": 307601430, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.26171875, "step": 14262, "time_per_iteration": 2.688798427581787 }, { "auxiliary_loss_clip": 0.01250473, "auxiliary_loss_mlp": 0.00233687, "balance_loss_clip": 1.03125215, "balance_loss_mlp": 0.21004814, "epoch": 0.8575379528032466, "flos": 21580697537280.0, "grad_norm": 10.794874046347948, "language_loss": 0.76255536, "learning_rate": 2.0906624012587203e-07, "loss": 0.77739692, "num_input_tokens_seen": 307621495, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.23632812, "step": 14263, "time_per_iteration": 2.7800662517547607 }, { "auxiliary_loss_clip": 0.01252886, "auxiliary_loss_mlp": 0.00236508, "balance_loss_clip": 1.0336957, "balance_loss_mlp": 0.2101268, "epoch": 0.8575980760559146, "flos": 21761579450880.0, "grad_norm": 6.782057349313423, "language_loss": 0.83983827, "learning_rate": 2.088929137266986e-07, "loss": 0.85473228, "num_input_tokens_seen": 307640840, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.26403809, "step": 14264, "time_per_iteration": 2.7383668422698975 }, { "auxiliary_loss_clip": 0.01261393, "auxiliary_loss_mlp": 0.00217883, "balance_loss_clip": 1.04293895, "balance_loss_mlp": 0.19414882, "epoch": 0.8576581993085826, "flos": 34386442047360.0, "grad_norm": 4.629990956394929, "language_loss": 0.75873566, "learning_rate": 2.0871965524648582e-07, "loss": 0.77352846, "num_input_tokens_seen": 307663820, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.23754883, "step": 14265, "time_per_iteration": 2.8090732097625732 }, { "auxiliary_loss_clip": 0.01246011, "auxiliary_loss_mlp": 0.0020993, "balance_loss_clip": 1.0319587, "balance_loss_mlp": 0.18441892, "epoch": 0.8577183225612506, "flos": 23222497409280.0, "grad_norm": 2.307621639587455, "language_loss": 0.75131786, "learning_rate": 2.085464646918027e-07, "loss": 0.76587725, "num_input_tokens_seen": 307682385, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.25500488, "step": 14266, "time_per_iteration": 2.694284200668335 }, { "auxiliary_loss_clip": 0.01252363, "auxiliary_loss_mlp": 0.0023714, "balance_loss_clip": 1.03744626, "balance_loss_mlp": 0.21315494, "epoch": 0.8577784458139185, "flos": 28804164462720.0, "grad_norm": 1296.5274129180618, "language_loss": 0.81742656, "learning_rate": 2.0837334206921731e-07, "loss": 0.83232164, "num_input_tokens_seen": 307704680, "router_z_loss_clip": 2.14550781, "router_z_loss_mlp": 0.24023438, "step": 14267, "time_per_iteration": 2.7416114807128906 }, { "auxiliary_loss_clip": 0.01230744, "auxiliary_loss_mlp": 0.00206518, "balance_loss_clip": 1.01973212, "balance_loss_mlp": 0.18280689, "epoch": 0.8578385690665865, "flos": 19755573626880.0, "grad_norm": 3.278480720812314, "language_loss": 0.92467439, "learning_rate": 2.082002873852946e-07, "loss": 0.93904698, "num_input_tokens_seen": 307723245, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.23718262, "step": 14268, "time_per_iteration": 2.696329116821289 }, { "auxiliary_loss_clip": 0.01282432, "auxiliary_loss_mlp": 0.00231512, "balance_loss_clip": 1.05616486, "balance_loss_mlp": 0.20529763, "epoch": 0.8578986923192544, "flos": 20704082117760.0, "grad_norm": 66.76546193887017, "language_loss": 0.83218187, "learning_rate": 2.0802730064659667e-07, "loss": 0.84732127, "num_input_tokens_seen": 307742510, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.26208496, "step": 14269, "time_per_iteration": 2.6659796237945557 }, { "auxiliary_loss_clip": 0.01247504, "auxiliary_loss_mlp": 0.00221455, "balance_loss_clip": 1.03133237, "balance_loss_mlp": 0.19682601, "epoch": 0.8579588155719224, "flos": 36101715189120.0, "grad_norm": 841.940393210129, "language_loss": 0.74384427, "learning_rate": 2.0785438185968252e-07, "loss": 0.7585339, "num_input_tokens_seen": 307766030, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24584961, "step": 14270, "time_per_iteration": 2.8010268211364746 }, { "auxiliary_loss_clip": 0.01233801, "auxiliary_loss_mlp": 0.00206677, "balance_loss_clip": 1.02399266, "balance_loss_mlp": 0.18426615, "epoch": 0.8580189388245905, "flos": 22853479034880.0, "grad_norm": 2.319321775058704, "language_loss": 0.8055557, "learning_rate": 2.0768153103110997e-07, "loss": 0.81996047, "num_input_tokens_seen": 307785800, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.22412109, "step": 14271, "time_per_iteration": 2.637087345123291 }, { "auxiliary_loss_clip": 0.01148827, "auxiliary_loss_mlp": 0.00111957, "balance_loss_clip": 1.00114214, "balance_loss_mlp": 0.10470901, "epoch": 0.8580790620772584, "flos": 69642104290560.0, "grad_norm": 0.7535652686782242, "language_loss": 0.57494354, "learning_rate": 2.0750874816743358e-07, "loss": 0.58755136, "num_input_tokens_seen": 307850995, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.07226562, "step": 14272, "time_per_iteration": 3.2118399143218994 }, { "auxiliary_loss_clip": 0.012597, "auxiliary_loss_mlp": 0.00223491, "balance_loss_clip": 1.039554, "balance_loss_mlp": 0.19737247, "epoch": 0.8581391853299264, "flos": 13334243270400.0, "grad_norm": 3.08451155555398, "language_loss": 0.83491278, "learning_rate": 2.0733603327520499e-07, "loss": 0.84974468, "num_input_tokens_seen": 307868585, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26098633, "step": 14273, "time_per_iteration": 2.6116373538970947 }, { "auxiliary_loss_clip": 0.01237436, "auxiliary_loss_mlp": 0.00209423, "balance_loss_clip": 1.02327704, "balance_loss_mlp": 0.18566465, "epoch": 0.8581993085825943, "flos": 19645651031040.0, "grad_norm": 5.913474749737479, "language_loss": 0.90088665, "learning_rate": 2.0716338636097385e-07, "loss": 0.91535527, "num_input_tokens_seen": 307886820, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.23779297, "step": 14274, "time_per_iteration": 2.6973061561584473 }, { "auxiliary_loss_clip": 0.0114684, "auxiliary_loss_mlp": 0.0015676, "balance_loss_clip": 1.00051332, "balance_loss_mlp": 0.14841536, "epoch": 0.8582594318352623, "flos": 55825077294720.0, "grad_norm": 0.7874807662382023, "language_loss": 0.60385394, "learning_rate": 2.0699080743128672e-07, "loss": 0.61688995, "num_input_tokens_seen": 307944020, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.08349609, "step": 14275, "time_per_iteration": 3.207456588745117 }, { "auxiliary_loss_clip": 0.012544, "auxiliary_loss_mlp": 0.00224171, "balance_loss_clip": 1.03397512, "balance_loss_mlp": 0.198553, "epoch": 0.8583195550879302, "flos": 24279563779200.0, "grad_norm": 330.7119002521883, "language_loss": 0.71332109, "learning_rate": 2.0681829649268768e-07, "loss": 0.7281068, "num_input_tokens_seen": 307961055, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25634766, "step": 14276, "time_per_iteration": 2.693603992462158 }, { "auxiliary_loss_clip": 0.01234012, "auxiliary_loss_mlp": 0.00218095, "balance_loss_clip": 1.02466607, "balance_loss_mlp": 0.19560069, "epoch": 0.8583796783405983, "flos": 13444129952640.0, "grad_norm": 12.364494643425802, "language_loss": 0.86362749, "learning_rate": 2.0664585355171838e-07, "loss": 0.87814862, "num_input_tokens_seen": 307978690, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.22497559, "step": 14277, "time_per_iteration": 2.6557676792144775 }, { "auxiliary_loss_clip": 0.01252497, "auxiliary_loss_mlp": 0.00206613, "balance_loss_clip": 1.03161979, "balance_loss_mlp": 0.18238945, "epoch": 0.8584398015932662, "flos": 16180271533440.0, "grad_norm": 4.15342012842555, "language_loss": 0.910402, "learning_rate": 2.0647347861491803e-07, "loss": 0.92499304, "num_input_tokens_seen": 307995870, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.2421875, "step": 14278, "time_per_iteration": 2.6382579803466797 }, { "auxiliary_loss_clip": 0.01264746, "auxiliary_loss_mlp": 0.00237536, "balance_loss_clip": 1.04039919, "balance_loss_mlp": 0.21244246, "epoch": 0.8584999248459342, "flos": 17450431338240.0, "grad_norm": 3.7236580033652964, "language_loss": 0.85325491, "learning_rate": 2.0630117168882366e-07, "loss": 0.86827773, "num_input_tokens_seen": 308013645, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.25073242, "step": 14279, "time_per_iteration": 2.6680517196655273 }, { "auxiliary_loss_clip": 0.01243442, "auxiliary_loss_mlp": 0.00208452, "balance_loss_clip": 1.02872276, "balance_loss_mlp": 0.18377607, "epoch": 0.8585600480986021, "flos": 23441013797760.0, "grad_norm": 9.197209967371144, "language_loss": 0.7663424, "learning_rate": 2.0612893277996845e-07, "loss": 0.78086138, "num_input_tokens_seen": 308032490, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.24682617, "step": 14280, "time_per_iteration": 2.6727077960968018 }, { "auxiliary_loss_clip": 0.01235198, "auxiliary_loss_mlp": 0.00210174, "balance_loss_clip": 1.02190399, "balance_loss_mlp": 0.18791756, "epoch": 0.8586201713512701, "flos": 19937927998080.0, "grad_norm": 276.3227323059585, "language_loss": 0.73023939, "learning_rate": 2.0595676189488343e-07, "loss": 0.74469304, "num_input_tokens_seen": 308052110, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.22241211, "step": 14281, "time_per_iteration": 4.073981761932373 }, { "auxiliary_loss_clip": 0.01232453, "auxiliary_loss_mlp": 0.0022749, "balance_loss_clip": 1.02371848, "balance_loss_mlp": 0.20482847, "epoch": 0.858680294603938, "flos": 15304769435520.0, "grad_norm": 28.45370534995835, "language_loss": 0.80572695, "learning_rate": 2.0578465904009845e-07, "loss": 0.82032645, "num_input_tokens_seen": 308070660, "router_z_loss_clip": 2.08691406, "router_z_loss_mlp": 0.2265625, "step": 14282, "time_per_iteration": 2.677732467651367 }, { "auxiliary_loss_clip": 0.01238792, "auxiliary_loss_mlp": 0.00216136, "balance_loss_clip": 1.02645159, "balance_loss_mlp": 0.19260404, "epoch": 0.858740417856606, "flos": 22711237176960.0, "grad_norm": 12.532271124313478, "language_loss": 0.84757268, "learning_rate": 2.0561262422213832e-07, "loss": 0.86212194, "num_input_tokens_seen": 308089520, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.23535156, "step": 14283, "time_per_iteration": 4.044692754745483 }, { "auxiliary_loss_clip": 0.01251512, "auxiliary_loss_mlp": 0.00218314, "balance_loss_clip": 1.03472495, "balance_loss_mlp": 0.19405511, "epoch": 0.8588005411092741, "flos": 34054303962240.0, "grad_norm": 19.20677327188584, "language_loss": 0.67822373, "learning_rate": 2.0544065744752736e-07, "loss": 0.692922, "num_input_tokens_seen": 308111545, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24267578, "step": 14284, "time_per_iteration": 2.7587804794311523 }, { "auxiliary_loss_clip": 0.01232667, "auxiliary_loss_mlp": 0.00206928, "balance_loss_clip": 1.02169275, "balance_loss_mlp": 0.18382561, "epoch": 0.858860664361942, "flos": 28913584268160.0, "grad_norm": 2273.7161764439893, "language_loss": 0.83787256, "learning_rate": 2.0526875872278749e-07, "loss": 0.85226852, "num_input_tokens_seen": 308129690, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.23095703, "step": 14285, "time_per_iteration": 2.7597591876983643 }, { "auxiliary_loss_clip": 0.01261176, "auxiliary_loss_mlp": 0.00219527, "balance_loss_clip": 1.0393815, "balance_loss_mlp": 0.19467151, "epoch": 0.85892078761461, "flos": 19792525743360.0, "grad_norm": 728.0003018366128, "language_loss": 0.82124317, "learning_rate": 2.0509692805443524e-07, "loss": 0.83605021, "num_input_tokens_seen": 308147410, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.2487793, "step": 14286, "time_per_iteration": 2.645179510116577 }, { "auxiliary_loss_clip": 0.01144544, "auxiliary_loss_mlp": 0.00200483, "balance_loss_clip": 0.99846327, "balance_loss_mlp": 0.19104135, "epoch": 0.8589809108672779, "flos": 67106630039040.0, "grad_norm": 0.7486414605097893, "language_loss": 0.48384076, "learning_rate": 2.0492516544898718e-07, "loss": 0.49729103, "num_input_tokens_seen": 308204875, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.09423828, "step": 14287, "time_per_iteration": 3.1338818073272705 }, { "auxiliary_loss_clip": 0.0125582, "auxiliary_loss_mlp": 0.00210887, "balance_loss_clip": 1.03612638, "balance_loss_mlp": 0.18772511, "epoch": 0.8590410341199459, "flos": 29716259541120.0, "grad_norm": 56.51074552107692, "language_loss": 0.86784911, "learning_rate": 2.0475347091295704e-07, "loss": 0.88251615, "num_input_tokens_seen": 308225690, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.23181152, "step": 14288, "time_per_iteration": 4.218093156814575 }, { "auxiliary_loss_clip": 0.01251037, "auxiliary_loss_mlp": 0.00229823, "balance_loss_clip": 1.03458655, "balance_loss_mlp": 0.204539, "epoch": 0.8591011573726138, "flos": 23987430466560.0, "grad_norm": 9.228872453372185, "language_loss": 0.86378682, "learning_rate": 2.045818444528553e-07, "loss": 0.87859547, "num_input_tokens_seen": 308245255, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.25256348, "step": 14289, "time_per_iteration": 2.697650194168091 }, { "auxiliary_loss_clip": 0.01254786, "auxiliary_loss_mlp": 0.0022271, "balance_loss_clip": 1.0379324, "balance_loss_mlp": 0.19880879, "epoch": 0.8591612806252819, "flos": 14428656806400.0, "grad_norm": 2.9407045378278576, "language_loss": 0.73513216, "learning_rate": 2.0441028607518973e-07, "loss": 0.74990714, "num_input_tokens_seen": 308261755, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.23913574, "step": 14290, "time_per_iteration": 2.6409811973571777 }, { "auxiliary_loss_clip": 0.01266541, "auxiliary_loss_mlp": 0.002315, "balance_loss_clip": 1.04234648, "balance_loss_mlp": 0.2053691, "epoch": 0.8592214038779498, "flos": 31577150419200.0, "grad_norm": 3.9677036981674134, "language_loss": 0.63132137, "learning_rate": 2.0423879578646642e-07, "loss": 0.64630181, "num_input_tokens_seen": 308285145, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26135254, "step": 14291, "time_per_iteration": 4.1673712730407715 }, { "auxiliary_loss_clip": 0.0125107, "auxiliary_loss_mlp": 0.00235166, "balance_loss_clip": 1.03076088, "balance_loss_mlp": 0.20990574, "epoch": 0.8592815271306178, "flos": 17457290835840.0, "grad_norm": 13.157478427225579, "language_loss": 0.81299424, "learning_rate": 2.0406737359318792e-07, "loss": 0.82785654, "num_input_tokens_seen": 308304130, "router_z_loss_clip": 2.20214844, "router_z_loss_mlp": 0.25268555, "step": 14292, "time_per_iteration": 2.738149642944336 }, { "auxiliary_loss_clip": 0.01233486, "auxiliary_loss_mlp": 0.00230597, "balance_loss_clip": 1.02081954, "balance_loss_mlp": 0.20756546, "epoch": 0.8593416503832857, "flos": 25411360394880.0, "grad_norm": 2.457561689828223, "language_loss": 0.77970266, "learning_rate": 2.038960195018542e-07, "loss": 0.79434353, "num_input_tokens_seen": 308324670, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.23034668, "step": 14293, "time_per_iteration": 2.6700170040130615 }, { "auxiliary_loss_clip": 0.01226072, "auxiliary_loss_mlp": 0.00204637, "balance_loss_clip": 1.01532793, "balance_loss_mlp": 0.18091398, "epoch": 0.8594017736359537, "flos": 20996646393600.0, "grad_norm": 6.172904225191042, "language_loss": 0.77314413, "learning_rate": 2.0372473351896358e-07, "loss": 0.78745127, "num_input_tokens_seen": 308344215, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.23742676, "step": 14294, "time_per_iteration": 2.682424306869507 }, { "auxiliary_loss_clip": 0.01240742, "auxiliary_loss_mlp": 0.00226973, "balance_loss_clip": 1.02667487, "balance_loss_mlp": 0.20319089, "epoch": 0.8594618968886216, "flos": 22091059929600.0, "grad_norm": 9.968371465590897, "language_loss": 0.83532596, "learning_rate": 2.0355351565101087e-07, "loss": 0.85000312, "num_input_tokens_seen": 308360520, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.23754883, "step": 14295, "time_per_iteration": 2.7045810222625732 }, { "auxiliary_loss_clip": 0.01268286, "auxiliary_loss_mlp": 0.00219409, "balance_loss_clip": 1.0413543, "balance_loss_mlp": 0.19323094, "epoch": 0.8595220201412896, "flos": 11656245467520.0, "grad_norm": 25.3900259771761, "language_loss": 0.79837954, "learning_rate": 2.0338236590448975e-07, "loss": 0.8132565, "num_input_tokens_seen": 308376865, "router_z_loss_clip": 2.26660156, "router_z_loss_mlp": 0.26147461, "step": 14296, "time_per_iteration": 2.6724467277526855 }, { "auxiliary_loss_clip": 0.01246636, "auxiliary_loss_mlp": 0.00210508, "balance_loss_clip": 1.02909946, "balance_loss_mlp": 0.18641546, "epoch": 0.8595821433939577, "flos": 25040366772480.0, "grad_norm": 48.242175239895644, "language_loss": 0.89327174, "learning_rate": 2.0321128428588842e-07, "loss": 0.90784317, "num_input_tokens_seen": 308395870, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.2409668, "step": 14297, "time_per_iteration": 2.7072982788085938 }, { "auxiliary_loss_clip": 0.01241408, "auxiliary_loss_mlp": 0.00229948, "balance_loss_clip": 1.02939034, "balance_loss_mlp": 0.20682183, "epoch": 0.8596422666466256, "flos": 28511528359680.0, "grad_norm": 6.735101612182346, "language_loss": 0.74938452, "learning_rate": 2.030402708016954e-07, "loss": 0.76409811, "num_input_tokens_seen": 308417250, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.23156738, "step": 14298, "time_per_iteration": 2.700176954269409 }, { "auxiliary_loss_clip": 0.0124909, "auxiliary_loss_mlp": 0.00184158, "balance_loss_clip": 1.02726173, "balance_loss_mlp": 0.1589098, "epoch": 0.8597023898992936, "flos": 13589137157760.0, "grad_norm": 379.52249149472163, "language_loss": 0.79026198, "learning_rate": 2.0286932545839576e-07, "loss": 0.80459452, "num_input_tokens_seen": 308434565, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.25280762, "step": 14299, "time_per_iteration": 2.6490731239318848 }, { "auxiliary_loss_clip": 0.0126339, "auxiliary_loss_mlp": 0.0023086, "balance_loss_clip": 1.04150224, "balance_loss_mlp": 0.20372796, "epoch": 0.8597625131519615, "flos": 32300821728000.0, "grad_norm": 26.704762055105466, "language_loss": 0.81316125, "learning_rate": 2.0269844826247096e-07, "loss": 0.82810378, "num_input_tokens_seen": 308450040, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.27124023, "step": 14300, "time_per_iteration": 2.7466633319854736 }, { "auxiliary_loss_clip": 0.01229364, "auxiliary_loss_mlp": 0.00233057, "balance_loss_clip": 1.01799047, "balance_loss_mlp": 0.21063358, "epoch": 0.8598226364046295, "flos": 28730367970560.0, "grad_norm": 13.746523944417254, "language_loss": 0.7713027, "learning_rate": 2.0252763922040116e-07, "loss": 0.78592694, "num_input_tokens_seen": 308470545, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.22424316, "step": 14301, "time_per_iteration": 2.7922587394714355 }, { "auxiliary_loss_clip": 0.01247407, "auxiliary_loss_mlp": 0.00234459, "balance_loss_clip": 1.03254986, "balance_loss_mlp": 0.21024749, "epoch": 0.8598827596572974, "flos": 21871825269120.0, "grad_norm": 23.47135442212001, "language_loss": 0.83736861, "learning_rate": 2.023568983386641e-07, "loss": 0.85218728, "num_input_tokens_seen": 308490020, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24206543, "step": 14302, "time_per_iteration": 2.666097640991211 }, { "auxiliary_loss_clip": 0.01241396, "auxiliary_loss_mlp": 0.00217925, "balance_loss_clip": 1.02721143, "balance_loss_mlp": 0.19338022, "epoch": 0.8599428829099655, "flos": 23767297966080.0, "grad_norm": 13.920340769344048, "language_loss": 0.91125941, "learning_rate": 2.02186225623733e-07, "loss": 0.9258526, "num_input_tokens_seen": 308509065, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.24536133, "step": 14303, "time_per_iteration": 2.7480568885803223 }, { "auxiliary_loss_clip": 0.01258624, "auxiliary_loss_mlp": 0.00224475, "balance_loss_clip": 1.03666973, "balance_loss_mlp": 0.20014438, "epoch": 0.8600030061626334, "flos": 16212770363520.0, "grad_norm": 218.36325184219, "language_loss": 0.84679443, "learning_rate": 2.0201562108208025e-07, "loss": 0.86162537, "num_input_tokens_seen": 308524725, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.2434082, "step": 14304, "time_per_iteration": 2.6200146675109863 }, { "auxiliary_loss_clip": 0.01256459, "auxiliary_loss_mlp": 0.00228898, "balance_loss_clip": 1.03210235, "balance_loss_mlp": 0.20181355, "epoch": 0.8600631294153014, "flos": 15669370437120.0, "grad_norm": 181.07559485472635, "language_loss": 0.70408297, "learning_rate": 2.0184508472017537e-07, "loss": 0.71893656, "num_input_tokens_seen": 308543525, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.27111816, "step": 14305, "time_per_iteration": 2.6677825450897217 }, { "auxiliary_loss_clip": 0.01265678, "auxiliary_loss_mlp": 0.00239869, "balance_loss_clip": 1.04019427, "balance_loss_mlp": 0.21183084, "epoch": 0.8601232526679693, "flos": 17493093717120.0, "grad_norm": 14.925509709869033, "language_loss": 0.93329149, "learning_rate": 2.0167461654448558e-07, "loss": 0.94834697, "num_input_tokens_seen": 308557995, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.28051758, "step": 14306, "time_per_iteration": 2.6223297119140625 }, { "auxiliary_loss_clip": 0.01233218, "auxiliary_loss_mlp": 0.00222665, "balance_loss_clip": 1.02083206, "balance_loss_mlp": 0.20034912, "epoch": 0.8601833759206373, "flos": 26985935963520.0, "grad_norm": 32.017748354199014, "language_loss": 0.77064097, "learning_rate": 2.01504216561474e-07, "loss": 0.78519976, "num_input_tokens_seen": 308582750, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.22314453, "step": 14307, "time_per_iteration": 2.8089101314544678 }, { "auxiliary_loss_clip": 0.01263983, "auxiliary_loss_mlp": 0.00224528, "balance_loss_clip": 1.03773952, "balance_loss_mlp": 0.19786115, "epoch": 0.8602434991733052, "flos": 25229760209280.0, "grad_norm": 19.06802311021845, "language_loss": 0.73759198, "learning_rate": 2.0133388477760316e-07, "loss": 0.75247711, "num_input_tokens_seen": 308603770, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26672363, "step": 14308, "time_per_iteration": 2.7378528118133545 }, { "auxiliary_loss_clip": 0.01153493, "auxiliary_loss_mlp": 0.00093069, "balance_loss_clip": 1.0069809, "balance_loss_mlp": 0.08591661, "epoch": 0.8603036224259732, "flos": 71015363107200.0, "grad_norm": 0.6010551850814191, "language_loss": 0.47220272, "learning_rate": 2.0116362119933172e-07, "loss": 0.48466831, "num_input_tokens_seen": 308667735, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.07128906, "step": 14309, "time_per_iteration": 3.253805637359619 }, { "auxiliary_loss_clip": 0.01267065, "auxiliary_loss_mlp": 0.00223409, "balance_loss_clip": 1.03951955, "balance_loss_mlp": 0.19856568, "epoch": 0.8603637456786413, "flos": 20300625578880.0, "grad_norm": 17.84586207525321, "language_loss": 0.77946472, "learning_rate": 2.0099342583311563e-07, "loss": 0.79436946, "num_input_tokens_seen": 308686300, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.24841309, "step": 14310, "time_per_iteration": 2.6889407634735107 }, { "auxiliary_loss_clip": 0.01238764, "auxiliary_loss_mlp": 0.00205525, "balance_loss_clip": 1.024894, "balance_loss_mlp": 0.18144467, "epoch": 0.8604238689313092, "flos": 21835842819840.0, "grad_norm": 1.8048999818184683, "language_loss": 0.85941613, "learning_rate": 2.0082329868540905e-07, "loss": 0.87385905, "num_input_tokens_seen": 308705825, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.2409668, "step": 14311, "time_per_iteration": 2.667060136795044 }, { "auxiliary_loss_clip": 0.01237735, "auxiliary_loss_mlp": 0.00216095, "balance_loss_clip": 1.02492154, "balance_loss_mlp": 0.19295655, "epoch": 0.8604839921839772, "flos": 18004210295040.0, "grad_norm": 387.2728827644302, "language_loss": 0.80917555, "learning_rate": 2.006532397626639e-07, "loss": 0.82371384, "num_input_tokens_seen": 308723340, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.23156738, "step": 14312, "time_per_iteration": 2.6780643463134766 }, { "auxiliary_loss_clip": 0.01245058, "auxiliary_loss_mlp": 0.00228344, "balance_loss_clip": 1.02727175, "balance_loss_mlp": 0.20364425, "epoch": 0.8605441154366451, "flos": 16252164604800.0, "grad_norm": 3.1805744663613695, "language_loss": 0.86688519, "learning_rate": 2.0048324907132797e-07, "loss": 0.88161922, "num_input_tokens_seen": 308741280, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24707031, "step": 14313, "time_per_iteration": 2.6578269004821777 }, { "auxiliary_loss_clip": 0.01235702, "auxiliary_loss_mlp": 0.00217008, "balance_loss_clip": 1.02257299, "balance_loss_mlp": 0.19332138, "epoch": 0.8606042386893131, "flos": 32267065921920.0, "grad_norm": 11.509375284897356, "language_loss": 0.79717314, "learning_rate": 2.003133266178474e-07, "loss": 0.81170034, "num_input_tokens_seen": 308762875, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.23681641, "step": 14314, "time_per_iteration": 2.7836146354675293 }, { "auxiliary_loss_clip": 0.01250945, "auxiliary_loss_mlp": 0.00216829, "balance_loss_clip": 1.03014445, "balance_loss_mlp": 0.19187877, "epoch": 0.860664361941981, "flos": 20229774001920.0, "grad_norm": 19.70023189289947, "language_loss": 0.77398324, "learning_rate": 2.001434724086657e-07, "loss": 0.788661, "num_input_tokens_seen": 308780315, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24975586, "step": 14315, "time_per_iteration": 2.604783058166504 }, { "auxiliary_loss_clip": 0.01248589, "auxiliary_loss_mlp": 0.00205525, "balance_loss_clip": 1.0315578, "balance_loss_mlp": 0.18125379, "epoch": 0.8607244851946491, "flos": 25191622944000.0, "grad_norm": 2.5755637093846078, "language_loss": 0.80716789, "learning_rate": 1.9997368645022418e-07, "loss": 0.82170904, "num_input_tokens_seen": 308799435, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24255371, "step": 14316, "time_per_iteration": 2.6834583282470703 }, { "auxiliary_loss_clip": 0.01258441, "auxiliary_loss_mlp": 0.00206838, "balance_loss_clip": 1.03599524, "balance_loss_mlp": 0.18251923, "epoch": 0.860784608447317, "flos": 20482082110080.0, "grad_norm": 6.771762125545195, "language_loss": 0.92403257, "learning_rate": 1.9980396874896056e-07, "loss": 0.9386853, "num_input_tokens_seen": 308817730, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.24328613, "step": 14317, "time_per_iteration": 2.63031268119812 }, { "auxiliary_loss_clip": 0.01228379, "auxiliary_loss_mlp": 0.00226613, "balance_loss_clip": 1.0168916, "balance_loss_mlp": 0.20302147, "epoch": 0.860844731699985, "flos": 50476037696640.0, "grad_norm": 251.04769574112913, "language_loss": 0.74866951, "learning_rate": 1.996343193113108e-07, "loss": 0.76321948, "num_input_tokens_seen": 308841735, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.23620605, "step": 14318, "time_per_iteration": 2.946009874343872 }, { "auxiliary_loss_clip": 0.01225821, "auxiliary_loss_mlp": 0.00207246, "balance_loss_clip": 1.01812184, "balance_loss_mlp": 0.18461998, "epoch": 0.8609048549526529, "flos": 41172768455040.0, "grad_norm": 39.1483966258169, "language_loss": 0.7813105, "learning_rate": 1.9946473814370911e-07, "loss": 0.79564118, "num_input_tokens_seen": 308865050, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.22631836, "step": 14319, "time_per_iteration": 2.8476593494415283 }, { "auxiliary_loss_clip": 0.01242921, "auxiliary_loss_mlp": 0.00226406, "balance_loss_clip": 1.02641153, "balance_loss_mlp": 0.20208696, "epoch": 0.8609649782053209, "flos": 23951196622080.0, "grad_norm": 9.820531759385522, "language_loss": 0.75517696, "learning_rate": 1.992952252525839e-07, "loss": 0.76987022, "num_input_tokens_seen": 308885375, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24316406, "step": 14320, "time_per_iteration": 2.6529202461242676 }, { "auxiliary_loss_clip": 0.0125914, "auxiliary_loss_mlp": 0.0022577, "balance_loss_clip": 1.03447497, "balance_loss_mlp": 0.19938931, "epoch": 0.8610251014579888, "flos": 23112574813440.0, "grad_norm": 9.90602648964567, "language_loss": 0.86779189, "learning_rate": 1.9912578064436446e-07, "loss": 0.88264096, "num_input_tokens_seen": 308904700, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.26391602, "step": 14321, "time_per_iteration": 2.689584970474243 }, { "auxiliary_loss_clip": 0.01236682, "auxiliary_loss_mlp": 0.00211729, "balance_loss_clip": 1.02270532, "balance_loss_mlp": 0.18698151, "epoch": 0.8610852247106568, "flos": 19426811420160.0, "grad_norm": 6.788019643222985, "language_loss": 0.77982271, "learning_rate": 1.9895640432547567e-07, "loss": 0.79430681, "num_input_tokens_seen": 308922985, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.24755859, "step": 14322, "time_per_iteration": 2.632833480834961 }, { "auxiliary_loss_clip": 0.01273274, "auxiliary_loss_mlp": 0.00235802, "balance_loss_clip": 1.04496181, "balance_loss_mlp": 0.20898037, "epoch": 0.8611453479633249, "flos": 19312076401920.0, "grad_norm": 28.22211880581904, "language_loss": 0.6758281, "learning_rate": 1.9878709630234102e-07, "loss": 0.6909188, "num_input_tokens_seen": 308940765, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.26806641, "step": 14323, "time_per_iteration": 4.142085552215576 }, { "auxiliary_loss_clip": 0.0123689, "auxiliary_loss_mlp": 0.00206269, "balance_loss_clip": 1.02532268, "balance_loss_mlp": 0.18187837, "epoch": 0.8612054712159928, "flos": 23253667436160.0, "grad_norm": 85.37818325092238, "language_loss": 0.81365889, "learning_rate": 1.986178565813801e-07, "loss": 0.82809055, "num_input_tokens_seen": 308960110, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.24401855, "step": 14324, "time_per_iteration": 2.714848518371582 }, { "auxiliary_loss_clip": 0.01235674, "auxiliary_loss_mlp": 0.00219148, "balance_loss_clip": 1.01920474, "balance_loss_mlp": 0.19481775, "epoch": 0.8612655944686608, "flos": 16028440744320.0, "grad_norm": 50.58293871152654, "language_loss": 0.77916586, "learning_rate": 1.9844868516901036e-07, "loss": 0.79371411, "num_input_tokens_seen": 308976665, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24328613, "step": 14325, "time_per_iteration": 4.117102146148682 }, { "auxiliary_loss_clip": 0.01258962, "auxiliary_loss_mlp": 0.00250357, "balance_loss_clip": 1.03451443, "balance_loss_mlp": 0.22401144, "epoch": 0.8613257177213287, "flos": 22492720788480.0, "grad_norm": 276.38481228771303, "language_loss": 0.728046, "learning_rate": 1.982795820716472e-07, "loss": 0.74313915, "num_input_tokens_seen": 308997015, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26342773, "step": 14326, "time_per_iteration": 2.7022030353546143 }, { "auxiliary_loss_clip": 0.01273889, "auxiliary_loss_mlp": 0.00218757, "balance_loss_clip": 1.04661286, "balance_loss_mlp": 0.19398537, "epoch": 0.8613858409739967, "flos": 17238056175360.0, "grad_norm": 41.06210846109242, "language_loss": 0.93708766, "learning_rate": 1.9811054729570253e-07, "loss": 0.95201409, "num_input_tokens_seen": 309015250, "router_z_loss_clip": 2.27441406, "router_z_loss_mlp": 0.24780273, "step": 14327, "time_per_iteration": 2.6531853675842285 }, { "auxiliary_loss_clip": 0.012223, "auxiliary_loss_mlp": 0.00204245, "balance_loss_clip": 1.01259267, "balance_loss_mlp": 0.18185705, "epoch": 0.8614459642266646, "flos": 22821123859200.0, "grad_norm": 5.6945903910028415, "language_loss": 0.83614099, "learning_rate": 1.9794158084758661e-07, "loss": 0.85040647, "num_input_tokens_seen": 309034140, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.22399902, "step": 14328, "time_per_iteration": 2.68983793258667 }, { "auxiliary_loss_clip": 0.01252826, "auxiliary_loss_mlp": 0.00218735, "balance_loss_clip": 1.03040707, "balance_loss_mlp": 0.19410643, "epoch": 0.8615060874793327, "flos": 26504301473280.0, "grad_norm": 379.6709337268802, "language_loss": 0.85471326, "learning_rate": 1.9777268273370673e-07, "loss": 0.86942887, "num_input_tokens_seen": 309055075, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.24633789, "step": 14329, "time_per_iteration": 2.714632749557495 }, { "auxiliary_loss_clip": 0.01245959, "auxiliary_loss_mlp": 0.00231886, "balance_loss_clip": 1.03169346, "balance_loss_mlp": 0.20893838, "epoch": 0.8615662107320006, "flos": 24061011477120.0, "grad_norm": 14.69481423834983, "language_loss": 0.85849476, "learning_rate": 1.9760385296046757e-07, "loss": 0.87327325, "num_input_tokens_seen": 309074650, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.22937012, "step": 14330, "time_per_iteration": 4.145712852478027 }, { "auxiliary_loss_clip": 0.01244551, "auxiliary_loss_mlp": 0.00245292, "balance_loss_clip": 1.02671695, "balance_loss_mlp": 0.22081783, "epoch": 0.8616263339846686, "flos": 24165044242560.0, "grad_norm": 4.039480731002817, "language_loss": 0.74224758, "learning_rate": 1.974350915342702e-07, "loss": 0.757146, "num_input_tokens_seen": 309094385, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24462891, "step": 14331, "time_per_iteration": 2.7027318477630615 }, { "auxiliary_loss_clip": 0.01238425, "auxiliary_loss_mlp": 0.0021436, "balance_loss_clip": 1.02265549, "balance_loss_mlp": 0.19031528, "epoch": 0.8616864572373365, "flos": 21724340025600.0, "grad_norm": 67.42012706042104, "language_loss": 0.82751942, "learning_rate": 1.9726639846151506e-07, "loss": 0.84204733, "num_input_tokens_seen": 309111815, "router_z_loss_clip": 2.15917969, "router_z_loss_mlp": 0.24047852, "step": 14332, "time_per_iteration": 2.735325574874878 }, { "auxiliary_loss_clip": 0.01246717, "auxiliary_loss_mlp": 0.00221991, "balance_loss_clip": 1.02774119, "balance_loss_mlp": 0.19674239, "epoch": 0.8617465804900045, "flos": 23766651521280.0, "grad_norm": 6.565313810215746, "language_loss": 0.75286162, "learning_rate": 1.9709777374859904e-07, "loss": 0.76754868, "num_input_tokens_seen": 309131385, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25256348, "step": 14333, "time_per_iteration": 4.040869951248169 }, { "auxiliary_loss_clip": 0.0127761, "auxiliary_loss_mlp": 0.00227043, "balance_loss_clip": 1.04987252, "balance_loss_mlp": 0.20183027, "epoch": 0.8618067037426724, "flos": 37703941251840.0, "grad_norm": 33.145796839878535, "language_loss": 0.72671592, "learning_rate": 1.969292174019157e-07, "loss": 0.7417624, "num_input_tokens_seen": 309155020, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.25231934, "step": 14334, "time_per_iteration": 2.856536388397217 }, { "auxiliary_loss_clip": 0.01272205, "auxiliary_loss_mlp": 0.00225183, "balance_loss_clip": 1.04761028, "balance_loss_mlp": 0.19936225, "epoch": 0.8618668269953405, "flos": 21471026336640.0, "grad_norm": 111.86925210857656, "language_loss": 0.78761065, "learning_rate": 1.967607294278577e-07, "loss": 0.80258453, "num_input_tokens_seen": 309172865, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.25805664, "step": 14335, "time_per_iteration": 2.632023811340332 }, { "auxiliary_loss_clip": 0.01241716, "auxiliary_loss_mlp": 0.00218707, "balance_loss_clip": 1.02734387, "balance_loss_mlp": 0.19494833, "epoch": 0.8619269502480085, "flos": 22232691256320.0, "grad_norm": 19.85125057850752, "language_loss": 0.88200456, "learning_rate": 1.965923098328135e-07, "loss": 0.89660877, "num_input_tokens_seen": 309193575, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.2376709, "step": 14336, "time_per_iteration": 2.673985481262207 }, { "auxiliary_loss_clip": 0.01262036, "auxiliary_loss_mlp": 0.00231699, "balance_loss_clip": 1.03660357, "balance_loss_mlp": 0.20571114, "epoch": 0.8619870735006764, "flos": 22710626645760.0, "grad_norm": 9.65960412518409, "language_loss": 0.75280297, "learning_rate": 1.9642395862316907e-07, "loss": 0.76774037, "num_input_tokens_seen": 309212680, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.26000977, "step": 14337, "time_per_iteration": 2.807074546813965 }, { "auxiliary_loss_clip": 0.01230098, "auxiliary_loss_mlp": 0.00230485, "balance_loss_clip": 1.01722264, "balance_loss_mlp": 0.20622629, "epoch": 0.8620471967533444, "flos": 37520293991040.0, "grad_norm": 38.4651219976966, "language_loss": 0.72933251, "learning_rate": 1.962556758053089e-07, "loss": 0.74393833, "num_input_tokens_seen": 309234485, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.24255371, "step": 14338, "time_per_iteration": 2.8973278999328613 }, { "auxiliary_loss_clip": 0.01252958, "auxiliary_loss_mlp": 0.00238902, "balance_loss_clip": 1.03430283, "balance_loss_mlp": 0.212974, "epoch": 0.8621073200060123, "flos": 19682459493120.0, "grad_norm": 37.21203645997719, "language_loss": 0.70217675, "learning_rate": 1.9608746138561448e-07, "loss": 0.71709538, "num_input_tokens_seen": 309253630, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25927734, "step": 14339, "time_per_iteration": 2.6548831462860107 }, { "auxiliary_loss_clip": 0.01237745, "auxiliary_loss_mlp": 0.00234094, "balance_loss_clip": 1.02421546, "balance_loss_mlp": 0.20870277, "epoch": 0.8621674432586803, "flos": 14536855549440.0, "grad_norm": 35.120360982631944, "language_loss": 0.71843028, "learning_rate": 1.9591931537046458e-07, "loss": 0.73314863, "num_input_tokens_seen": 309270950, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.25427246, "step": 14340, "time_per_iteration": 2.617051362991333 }, { "auxiliary_loss_clip": 0.01237481, "auxiliary_loss_mlp": 0.00217822, "balance_loss_clip": 1.02617717, "balance_loss_mlp": 0.19468337, "epoch": 0.8622275665113482, "flos": 20740100480640.0, "grad_norm": 18.820973438108375, "language_loss": 0.85916257, "learning_rate": 1.9575123776623493e-07, "loss": 0.87371558, "num_input_tokens_seen": 309288780, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.23156738, "step": 14341, "time_per_iteration": 2.637075901031494 }, { "auxiliary_loss_clip": 0.01222583, "auxiliary_loss_mlp": 0.00200893, "balance_loss_clip": 1.01464963, "balance_loss_mlp": 0.18012658, "epoch": 0.8622876897640163, "flos": 24715914197760.0, "grad_norm": 236.92999977855038, "language_loss": 0.78968704, "learning_rate": 1.9558322857929887e-07, "loss": 0.80392182, "num_input_tokens_seen": 309310875, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.20776367, "step": 14342, "time_per_iteration": 2.684673309326172 }, { "auxiliary_loss_clip": 0.01257241, "auxiliary_loss_mlp": 0.0021048, "balance_loss_clip": 1.0328083, "balance_loss_mlp": 0.18464765, "epoch": 0.8623478130166842, "flos": 17457362663040.0, "grad_norm": 6293.630149970105, "language_loss": 0.79391992, "learning_rate": 1.95415287816028e-07, "loss": 0.80859715, "num_input_tokens_seen": 309329900, "router_z_loss_clip": 2.24316406, "router_z_loss_mlp": 0.25854492, "step": 14343, "time_per_iteration": 2.6458966732025146 }, { "auxiliary_loss_clip": 0.01249332, "auxiliary_loss_mlp": 0.00236264, "balance_loss_clip": 1.02634406, "balance_loss_mlp": 0.21049069, "epoch": 0.8624079362693522, "flos": 18109176814080.0, "grad_norm": 77.95940398322014, "language_loss": 0.76330376, "learning_rate": 1.9524741548278967e-07, "loss": 0.77815974, "num_input_tokens_seen": 309347870, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25769043, "step": 14344, "time_per_iteration": 2.6135497093200684 }, { "auxiliary_loss_clip": 0.01240198, "auxiliary_loss_mlp": 0.00208787, "balance_loss_clip": 1.02272487, "balance_loss_mlp": 0.18539776, "epoch": 0.8624680595220201, "flos": 30666455971200.0, "grad_norm": 18.00560328856228, "language_loss": 0.87214923, "learning_rate": 1.9507961158595054e-07, "loss": 0.88663912, "num_input_tokens_seen": 309371695, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.23400879, "step": 14345, "time_per_iteration": 2.77447509765625 }, { "auxiliary_loss_clip": 0.01259151, "auxiliary_loss_mlp": 0.00217177, "balance_loss_clip": 1.03877342, "balance_loss_mlp": 0.18938884, "epoch": 0.8625281827746881, "flos": 37998588516480.0, "grad_norm": 179.58916523535865, "language_loss": 0.62478083, "learning_rate": 1.9491187613187355e-07, "loss": 0.63954413, "num_input_tokens_seen": 309394645, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.27770996, "step": 14346, "time_per_iteration": 2.7872304916381836 }, { "auxiliary_loss_clip": 0.01232899, "auxiliary_loss_mlp": 0.0021131, "balance_loss_clip": 1.01698112, "balance_loss_mlp": 0.18625259, "epoch": 0.862588306027356, "flos": 26249730808320.0, "grad_norm": 37.43958634301877, "language_loss": 0.83957326, "learning_rate": 1.9474420912691913e-07, "loss": 0.85401535, "num_input_tokens_seen": 309413170, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.25061035, "step": 14347, "time_per_iteration": 2.6904377937316895 }, { "auxiliary_loss_clip": 0.01266959, "auxiliary_loss_mlp": 0.00229098, "balance_loss_clip": 1.04513276, "balance_loss_mlp": 0.20339648, "epoch": 0.862648429280024, "flos": 25878809013120.0, "grad_norm": 122.14639244932326, "language_loss": 0.87871295, "learning_rate": 1.945766105774449e-07, "loss": 0.89367354, "num_input_tokens_seen": 309431315, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.25708008, "step": 14348, "time_per_iteration": 2.649545192718506 }, { "auxiliary_loss_clip": 0.01224494, "auxiliary_loss_mlp": 0.00215761, "balance_loss_clip": 1.01605952, "balance_loss_mlp": 0.1923601, "epoch": 0.862708552532692, "flos": 37816413713280.0, "grad_norm": 3.0570185635311615, "language_loss": 0.73612404, "learning_rate": 1.9440908048980665e-07, "loss": 0.75052655, "num_input_tokens_seen": 309453020, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.23413086, "step": 14349, "time_per_iteration": 2.79958438873291 }, { "auxiliary_loss_clip": 0.01240793, "auxiliary_loss_mlp": 0.00219494, "balance_loss_clip": 1.02562046, "balance_loss_mlp": 0.19522266, "epoch": 0.86276867578536, "flos": 19091800247040.0, "grad_norm": 107.14628554680039, "language_loss": 0.80353284, "learning_rate": 1.942416188703573e-07, "loss": 0.81813562, "num_input_tokens_seen": 309469780, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24291992, "step": 14350, "time_per_iteration": 2.681988000869751 }, { "auxiliary_loss_clip": 0.01248175, "auxiliary_loss_mlp": 0.00227534, "balance_loss_clip": 1.03102517, "balance_loss_mlp": 0.20208269, "epoch": 0.862828799038028, "flos": 22164281804160.0, "grad_norm": 59.396901337585575, "language_loss": 0.85445797, "learning_rate": 1.9407422572544618e-07, "loss": 0.86921501, "num_input_tokens_seen": 309489610, "router_z_loss_clip": 2.17285156, "router_z_loss_mlp": 0.25476074, "step": 14351, "time_per_iteration": 2.7239506244659424 }, { "auxiliary_loss_clip": 0.01237947, "auxiliary_loss_mlp": 0.00216097, "balance_loss_clip": 1.02265263, "balance_loss_mlp": 0.19163534, "epoch": 0.8628889222906959, "flos": 23145576433920.0, "grad_norm": 4.773737585955737, "language_loss": 0.91561306, "learning_rate": 1.9390690106142204e-07, "loss": 0.93015355, "num_input_tokens_seen": 309508295, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.24475098, "step": 14352, "time_per_iteration": 2.7037534713745117 }, { "auxiliary_loss_clip": 0.01146844, "auxiliary_loss_mlp": 0.00108772, "balance_loss_clip": 1.00298798, "balance_loss_mlp": 0.10042701, "epoch": 0.8629490455433639, "flos": 57817762151040.0, "grad_norm": 0.7958328391907492, "language_loss": 0.60930324, "learning_rate": 1.9373964488462913e-07, "loss": 0.62185943, "num_input_tokens_seen": 309567960, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.08349609, "step": 14353, "time_per_iteration": 3.1632461547851562 }, { "auxiliary_loss_clip": 0.01247492, "auxiliary_loss_mlp": 0.00199716, "balance_loss_clip": 1.0292995, "balance_loss_mlp": 0.17679223, "epoch": 0.8630091687960318, "flos": 15919667383680.0, "grad_norm": 6.630218251854488, "language_loss": 0.88305432, "learning_rate": 1.9357245720140948e-07, "loss": 0.89752638, "num_input_tokens_seen": 309586050, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.22924805, "step": 14354, "time_per_iteration": 2.7023520469665527 }, { "auxiliary_loss_clip": 0.01252252, "auxiliary_loss_mlp": 0.00243267, "balance_loss_clip": 1.03060102, "balance_loss_mlp": 0.2168379, "epoch": 0.8630692920486999, "flos": 17961691570560.0, "grad_norm": 33.88813979553913, "language_loss": 0.93429881, "learning_rate": 1.934053380181031e-07, "loss": 0.94925404, "num_input_tokens_seen": 309602910, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.26452637, "step": 14355, "time_per_iteration": 2.6136744022369385 }, { "auxiliary_loss_clip": 0.01242973, "auxiliary_loss_mlp": 0.00225865, "balance_loss_clip": 1.02763867, "balance_loss_mlp": 0.20092668, "epoch": 0.8631294153013678, "flos": 22455158140800.0, "grad_norm": 8.856742713015937, "language_loss": 0.68566531, "learning_rate": 1.9323828734104763e-07, "loss": 0.70035374, "num_input_tokens_seen": 309621175, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24963379, "step": 14356, "time_per_iteration": 2.6748008728027344 }, { "auxiliary_loss_clip": 0.01250772, "auxiliary_loss_mlp": 0.00219202, "balance_loss_clip": 1.03030515, "balance_loss_mlp": 0.19493124, "epoch": 0.8631895385540358, "flos": 16837005847680.0, "grad_norm": 168.52669492376054, "language_loss": 0.8197459, "learning_rate": 1.9307130517657756e-07, "loss": 0.83444566, "num_input_tokens_seen": 309639395, "router_z_loss_clip": 2.20214844, "router_z_loss_mlp": 0.24316406, "step": 14357, "time_per_iteration": 2.6073689460754395 }, { "auxiliary_loss_clip": 0.01245239, "auxiliary_loss_mlp": 0.00206098, "balance_loss_clip": 1.03124833, "balance_loss_mlp": 0.18297181, "epoch": 0.8632496618067037, "flos": 18697214367360.0, "grad_norm": 317.4374917359018, "language_loss": 0.86893272, "learning_rate": 1.9290439153102468e-07, "loss": 0.88344616, "num_input_tokens_seen": 309657265, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.23120117, "step": 14358, "time_per_iteration": 2.614860773086548 }, { "auxiliary_loss_clip": 0.01246348, "auxiliary_loss_mlp": 0.00221088, "balance_loss_clip": 1.02995276, "balance_loss_mlp": 0.19630449, "epoch": 0.8633097850593717, "flos": 24279922915200.0, "grad_norm": 1.9281351862610359, "language_loss": 0.81084478, "learning_rate": 1.9273754641071816e-07, "loss": 0.82551914, "num_input_tokens_seen": 309678610, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24829102, "step": 14359, "time_per_iteration": 2.6473844051361084 }, { "auxiliary_loss_clip": 0.01245017, "auxiliary_loss_mlp": 0.00222493, "balance_loss_clip": 1.03252745, "balance_loss_mlp": 0.19907999, "epoch": 0.8633699083120396, "flos": 21178569801600.0, "grad_norm": 4.3550136305399185, "language_loss": 0.80530798, "learning_rate": 1.9257076982198517e-07, "loss": 0.81998312, "num_input_tokens_seen": 309697710, "router_z_loss_clip": 2.12011719, "router_z_loss_mlp": 0.234375, "step": 14360, "time_per_iteration": 2.632931709289551 }, { "auxiliary_loss_clip": 0.01271001, "auxiliary_loss_mlp": 0.00204415, "balance_loss_clip": 1.04647851, "balance_loss_mlp": 0.17721179, "epoch": 0.8634300315647077, "flos": 19244888012160.0, "grad_norm": 35.57493504981243, "language_loss": 0.84705859, "learning_rate": 1.9240406177114953e-07, "loss": 0.86181277, "num_input_tokens_seen": 309715985, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.27197266, "step": 14361, "time_per_iteration": 2.647881269454956 }, { "auxiliary_loss_clip": 0.01147759, "auxiliary_loss_mlp": 0.00158342, "balance_loss_clip": 1.00676966, "balance_loss_mlp": 0.14975844, "epoch": 0.8634901548173756, "flos": 66195648282240.0, "grad_norm": 0.9326061620710311, "language_loss": 0.57640785, "learning_rate": 1.922374222645329e-07, "loss": 0.58946884, "num_input_tokens_seen": 309779930, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.0859375, "step": 14362, "time_per_iteration": 3.147634506225586 }, { "auxiliary_loss_clip": 0.01287004, "auxiliary_loss_mlp": 0.00229038, "balance_loss_clip": 1.05525565, "balance_loss_mlp": 0.20142871, "epoch": 0.8635502780700436, "flos": 24789531121920.0, "grad_norm": 34.015331800517096, "language_loss": 0.86901867, "learning_rate": 1.9207085130845524e-07, "loss": 0.88417906, "num_input_tokens_seen": 309800580, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.27624512, "step": 14363, "time_per_iteration": 2.7080166339874268 }, { "auxiliary_loss_clip": 0.01256848, "auxiliary_loss_mlp": 0.00227319, "balance_loss_clip": 1.03181982, "balance_loss_mlp": 0.20177273, "epoch": 0.8636104013227116, "flos": 25189970918400.0, "grad_norm": 154.8147800357241, "language_loss": 0.83278322, "learning_rate": 1.9190434890923112e-07, "loss": 0.8476249, "num_input_tokens_seen": 309821725, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.25537109, "step": 14364, "time_per_iteration": 2.66694974899292 }, { "auxiliary_loss_clip": 0.01248369, "auxiliary_loss_mlp": 0.00228952, "balance_loss_clip": 1.02545726, "balance_loss_mlp": 0.20369177, "epoch": 0.8636705245753795, "flos": 23878441624320.0, "grad_norm": 12.115761788400455, "language_loss": 0.79367566, "learning_rate": 1.917379150731755e-07, "loss": 0.80844879, "num_input_tokens_seen": 309841565, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25231934, "step": 14365, "time_per_iteration": 4.091534376144409 }, { "auxiliary_loss_clip": 0.01252736, "auxiliary_loss_mlp": 0.00220843, "balance_loss_clip": 1.03029251, "balance_loss_mlp": 0.1928165, "epoch": 0.8637306478280475, "flos": 23110455911040.0, "grad_norm": 5.848553572346952, "language_loss": 0.80688179, "learning_rate": 1.915715498065993e-07, "loss": 0.82161748, "num_input_tokens_seen": 309858635, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.27990723, "step": 14366, "time_per_iteration": 2.686221122741699 }, { "auxiliary_loss_clip": 0.01221728, "auxiliary_loss_mlp": 0.00213732, "balance_loss_clip": 1.01367617, "balance_loss_mlp": 0.19079632, "epoch": 0.8637907710807154, "flos": 21906802137600.0, "grad_norm": 56.70632066217224, "language_loss": 0.88949651, "learning_rate": 1.9140525311581146e-07, "loss": 0.90385103, "num_input_tokens_seen": 309877885, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.22961426, "step": 14367, "time_per_iteration": 4.041132211685181 }, { "auxiliary_loss_clip": 0.01272835, "auxiliary_loss_mlp": 0.00225237, "balance_loss_clip": 1.0456388, "balance_loss_mlp": 0.19860587, "epoch": 0.8638508943333835, "flos": 23580526222080.0, "grad_norm": 12.398817603443801, "language_loss": 0.68545729, "learning_rate": 1.9123902500711743e-07, "loss": 0.70043802, "num_input_tokens_seen": 309893140, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.26623535, "step": 14368, "time_per_iteration": 2.685426712036133 }, { "auxiliary_loss_clip": 0.01252904, "auxiliary_loss_mlp": 0.00210983, "balance_loss_clip": 1.03583288, "balance_loss_mlp": 0.18700959, "epoch": 0.8639110175860514, "flos": 25775853655680.0, "grad_norm": 13.654067418643026, "language_loss": 0.83020103, "learning_rate": 1.91072865486821e-07, "loss": 0.84483993, "num_input_tokens_seen": 309914175, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.23986816, "step": 14369, "time_per_iteration": 2.6966726779937744 }, { "auxiliary_loss_clip": 0.01244145, "auxiliary_loss_mlp": 0.00226549, "balance_loss_clip": 1.02698576, "balance_loss_mlp": 0.19888015, "epoch": 0.8639711408387194, "flos": 23369443948800.0, "grad_norm": 32.181832929791405, "language_loss": 0.7200315, "learning_rate": 1.9090677456122294e-07, "loss": 0.73473847, "num_input_tokens_seen": 309932395, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.27661133, "step": 14370, "time_per_iteration": 2.6568968296051025 }, { "auxiliary_loss_clip": 0.01253268, "auxiliary_loss_mlp": 0.00222785, "balance_loss_clip": 1.03275692, "balance_loss_mlp": 0.19778684, "epoch": 0.8640312640913873, "flos": 22127221946880.0, "grad_norm": 13.265204141229706, "language_loss": 0.71840596, "learning_rate": 1.907407522366209e-07, "loss": 0.73316646, "num_input_tokens_seen": 309951720, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25024414, "step": 14371, "time_per_iteration": 2.6812589168548584 }, { "auxiliary_loss_clip": 0.01149475, "auxiliary_loss_mlp": 0.00103631, "balance_loss_clip": 1.0086782, "balance_loss_mlp": 0.09657407, "epoch": 0.8640913873440553, "flos": 57571735944960.0, "grad_norm": 0.8590224396306199, "language_loss": 0.56119263, "learning_rate": 1.905747985193107e-07, "loss": 0.57372367, "num_input_tokens_seen": 310006120, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.07080078, "step": 14372, "time_per_iteration": 4.544673681259155 }, { "auxiliary_loss_clip": 0.01236562, "auxiliary_loss_mlp": 0.00210337, "balance_loss_clip": 1.02008569, "balance_loss_mlp": 0.18579221, "epoch": 0.8641515105967232, "flos": 23987430466560.0, "grad_norm": 3.376889187154304, "language_loss": 0.8695749, "learning_rate": 1.9040891341558597e-07, "loss": 0.88404387, "num_input_tokens_seen": 310026740, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24536133, "step": 14373, "time_per_iteration": 2.7180488109588623 }, { "auxiliary_loss_clip": 0.0124318, "auxiliary_loss_mlp": 0.00211091, "balance_loss_clip": 1.02477527, "balance_loss_mlp": 0.18710628, "epoch": 0.8642116338493913, "flos": 19062749122560.0, "grad_norm": 5.543940126072593, "language_loss": 0.7110002, "learning_rate": 1.9024309693173656e-07, "loss": 0.7255429, "num_input_tokens_seen": 310044135, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.23962402, "step": 14374, "time_per_iteration": 2.628518581390381 }, { "auxiliary_loss_clip": 0.01242312, "auxiliary_loss_mlp": 0.002086, "balance_loss_clip": 1.02750063, "balance_loss_mlp": 0.18394795, "epoch": 0.8642717571020592, "flos": 18254148105600.0, "grad_norm": 6.601886478847673, "language_loss": 0.84035861, "learning_rate": 1.9007734907404993e-07, "loss": 0.85486776, "num_input_tokens_seen": 310061560, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24645996, "step": 14375, "time_per_iteration": 4.065406799316406 }, { "auxiliary_loss_clip": 0.01248851, "auxiliary_loss_mlp": 0.00219008, "balance_loss_clip": 1.02858782, "balance_loss_mlp": 0.19154185, "epoch": 0.8643318803547272, "flos": 57663270777600.0, "grad_norm": 205.30196786310208, "language_loss": 0.6869123, "learning_rate": 1.899116698488117e-07, "loss": 0.7015909, "num_input_tokens_seen": 310087310, "router_z_loss_clip": 2.20410156, "router_z_loss_mlp": 0.27453613, "step": 14376, "time_per_iteration": 3.010939598083496 }, { "auxiliary_loss_clip": 0.01237049, "auxiliary_loss_mlp": 0.00214506, "balance_loss_clip": 1.02083826, "balance_loss_mlp": 0.19074759, "epoch": 0.8643920036073952, "flos": 19609524927360.0, "grad_norm": 24.677814633856915, "language_loss": 0.72927237, "learning_rate": 1.8974605926230457e-07, "loss": 0.74378788, "num_input_tokens_seen": 310106260, "router_z_loss_clip": 2.16113281, "router_z_loss_mlp": 0.23742676, "step": 14377, "time_per_iteration": 2.6152167320251465 }, { "auxiliary_loss_clip": 0.0122932, "auxiliary_loss_mlp": 0.00230271, "balance_loss_clip": 1.01664686, "balance_loss_mlp": 0.20713274, "epoch": 0.8644521268600631, "flos": 20850346298880.0, "grad_norm": 29.18133767689712, "language_loss": 0.77176362, "learning_rate": 1.8958051732080804e-07, "loss": 0.78635955, "num_input_tokens_seen": 310125305, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.23144531, "step": 14378, "time_per_iteration": 2.6468923091888428 }, { "auxiliary_loss_clip": 0.01154119, "auxiliary_loss_mlp": 0.00105537, "balance_loss_clip": 1.01197362, "balance_loss_mlp": 0.09795562, "epoch": 0.8645122501127311, "flos": 66719550101760.0, "grad_norm": 0.8294144576564058, "language_loss": 0.59613699, "learning_rate": 1.894150440305995e-07, "loss": 0.60873348, "num_input_tokens_seen": 310189270, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.07568359, "step": 14379, "time_per_iteration": 3.152498722076416 }, { "auxiliary_loss_clip": 0.01225399, "auxiliary_loss_mlp": 0.00215313, "balance_loss_clip": 1.01280916, "balance_loss_mlp": 0.19176956, "epoch": 0.864572373365399, "flos": 21690009601920.0, "grad_norm": 6.588715925072324, "language_loss": 0.821769, "learning_rate": 1.8924963939795478e-07, "loss": 0.83617616, "num_input_tokens_seen": 310208395, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.2355957, "step": 14380, "time_per_iteration": 2.687246561050415 }, { "auxiliary_loss_clip": 0.01273462, "auxiliary_loss_mlp": 0.00241655, "balance_loss_clip": 1.04668736, "balance_loss_mlp": 0.21534538, "epoch": 0.8646324966180671, "flos": 20266402896000.0, "grad_norm": 2.641306064148575, "language_loss": 0.83977842, "learning_rate": 1.8908430342914473e-07, "loss": 0.85492969, "num_input_tokens_seen": 310227415, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.26306152, "step": 14381, "time_per_iteration": 2.711496114730835 }, { "auxiliary_loss_clip": 0.01238861, "auxiliary_loss_mlp": 0.00215655, "balance_loss_clip": 1.0222789, "balance_loss_mlp": 0.19090761, "epoch": 0.864692619870735, "flos": 11946188050560.0, "grad_norm": 18.57091060437416, "language_loss": 0.93912113, "learning_rate": 1.8891903613043892e-07, "loss": 0.95366633, "num_input_tokens_seen": 310242625, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24743652, "step": 14382, "time_per_iteration": 2.5789554119110107 }, { "auxiliary_loss_clip": 0.01252127, "auxiliary_loss_mlp": 0.00208841, "balance_loss_clip": 1.03431284, "balance_loss_mlp": 0.18563122, "epoch": 0.864752743123403, "flos": 21470703114240.0, "grad_norm": 4.963112963675815, "language_loss": 0.83482778, "learning_rate": 1.8875383750810504e-07, "loss": 0.84943748, "num_input_tokens_seen": 310260585, "router_z_loss_clip": 2.18457031, "router_z_loss_mlp": 0.23217773, "step": 14383, "time_per_iteration": 2.7833220958709717 }, { "auxiliary_loss_clip": 0.01237835, "auxiliary_loss_mlp": 0.00226563, "balance_loss_clip": 1.02435386, "balance_loss_mlp": 0.20412821, "epoch": 0.8648128663760709, "flos": 19530018172800.0, "grad_norm": 108.81259752096503, "language_loss": 0.90615225, "learning_rate": 1.8858870756840738e-07, "loss": 0.92079622, "num_input_tokens_seen": 310277210, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.22436523, "step": 14384, "time_per_iteration": 2.700066089630127 }, { "auxiliary_loss_clip": 0.01243933, "auxiliary_loss_mlp": 0.00208449, "balance_loss_clip": 1.02140188, "balance_loss_mlp": 0.18472642, "epoch": 0.8648729896287389, "flos": 21287953693440.0, "grad_norm": 11.03662883926231, "language_loss": 0.87645662, "learning_rate": 1.884236463176072e-07, "loss": 0.89098036, "num_input_tokens_seen": 310296610, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.23742676, "step": 14385, "time_per_iteration": 2.696552276611328 }, { "auxiliary_loss_clip": 0.012517, "auxiliary_loss_mlp": 0.00235551, "balance_loss_clip": 1.03415573, "balance_loss_mlp": 0.21071957, "epoch": 0.8649331128814068, "flos": 24604483230720.0, "grad_norm": 8.216199750306064, "language_loss": 0.81087625, "learning_rate": 1.8825865376196437e-07, "loss": 0.8257488, "num_input_tokens_seen": 310316830, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.24853516, "step": 14386, "time_per_iteration": 2.651942014694214 }, { "auxiliary_loss_clip": 0.01239124, "auxiliary_loss_mlp": 0.00195709, "balance_loss_clip": 1.02496767, "balance_loss_mlp": 0.17364338, "epoch": 0.8649932361340749, "flos": 15377811742080.0, "grad_norm": 12.942378939306655, "language_loss": 0.92440283, "learning_rate": 1.8809372990773476e-07, "loss": 0.9387511, "num_input_tokens_seen": 310334355, "router_z_loss_clip": 2.14355469, "router_z_loss_mlp": 0.22070312, "step": 14387, "time_per_iteration": 2.6702756881713867 }, { "auxiliary_loss_clip": 0.01240542, "auxiliary_loss_mlp": 0.00207042, "balance_loss_clip": 1.02479422, "balance_loss_mlp": 0.18334323, "epoch": 0.8650533593867428, "flos": 19901227276800.0, "grad_norm": 20.909125636602628, "language_loss": 0.77960038, "learning_rate": 1.8792887476117224e-07, "loss": 0.7940762, "num_input_tokens_seen": 310352900, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.23681641, "step": 14388, "time_per_iteration": 2.6260225772857666 }, { "auxiliary_loss_clip": 0.01233868, "auxiliary_loss_mlp": 0.00216564, "balance_loss_clip": 1.02431917, "balance_loss_mlp": 0.19383046, "epoch": 0.8651134826394108, "flos": 25626931868160.0, "grad_norm": 7.5446668959443395, "language_loss": 0.95925349, "learning_rate": 1.877640883285283e-07, "loss": 0.97375786, "num_input_tokens_seen": 310372855, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.22741699, "step": 14389, "time_per_iteration": 2.7499094009399414 }, { "auxiliary_loss_clip": 0.01234203, "auxiliary_loss_mlp": 0.00214515, "balance_loss_clip": 1.02179801, "balance_loss_mlp": 0.19157892, "epoch": 0.8651736058920788, "flos": 18734525619840.0, "grad_norm": 122.43791970226535, "language_loss": 0.79264939, "learning_rate": 1.8759937061605212e-07, "loss": 0.80713665, "num_input_tokens_seen": 310391595, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.22937012, "step": 14390, "time_per_iteration": 2.671577215194702 }, { "auxiliary_loss_clip": 0.01233919, "auxiliary_loss_mlp": 0.00211421, "balance_loss_clip": 1.02254355, "balance_loss_mlp": 0.18842587, "epoch": 0.8652337291447467, "flos": 20776765288320.0, "grad_norm": 5.258311608757628, "language_loss": 0.87733889, "learning_rate": 1.8743472162998941e-07, "loss": 0.8917923, "num_input_tokens_seen": 310410090, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.22998047, "step": 14391, "time_per_iteration": 2.6889514923095703 }, { "auxiliary_loss_clip": 0.01140267, "auxiliary_loss_mlp": 0.00117403, "balance_loss_clip": 1.00013411, "balance_loss_mlp": 0.11153827, "epoch": 0.8652938523974147, "flos": 64227887464320.0, "grad_norm": 0.7747449395480951, "language_loss": 0.6727401, "learning_rate": 1.8727014137658337e-07, "loss": 0.6853168, "num_input_tokens_seen": 310470055, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.05859375, "step": 14392, "time_per_iteration": 3.097878932952881 }, { "auxiliary_loss_clip": 0.01260194, "auxiliary_loss_mlp": 0.00238431, "balance_loss_clip": 1.03528833, "balance_loss_mlp": 0.21071491, "epoch": 0.8653539756500827, "flos": 18040587793920.0, "grad_norm": 18.329437619150884, "language_loss": 0.85503328, "learning_rate": 1.8710562986207523e-07, "loss": 0.87001956, "num_input_tokens_seen": 310487665, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.27734375, "step": 14393, "time_per_iteration": 2.656834602355957 }, { "auxiliary_loss_clip": 0.01245595, "auxiliary_loss_mlp": 0.00223805, "balance_loss_clip": 1.02764022, "balance_loss_mlp": 0.19822286, "epoch": 0.8654140989027507, "flos": 17382416935680.0, "grad_norm": 21.978875759767707, "language_loss": 0.82920671, "learning_rate": 1.8694118709270357e-07, "loss": 0.84390068, "num_input_tokens_seen": 310506130, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25598145, "step": 14394, "time_per_iteration": 2.6428256034851074 }, { "auxiliary_loss_clip": 0.01239165, "auxiliary_loss_mlp": 0.00213702, "balance_loss_clip": 1.02353001, "balance_loss_mlp": 0.18914476, "epoch": 0.8654742221554186, "flos": 53284862448000.0, "grad_norm": 8.883806201888433, "language_loss": 0.72564995, "learning_rate": 1.867768130747036e-07, "loss": 0.74017859, "num_input_tokens_seen": 310532445, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24560547, "step": 14395, "time_per_iteration": 2.9636669158935547 }, { "auxiliary_loss_clip": 0.01253879, "auxiliary_loss_mlp": 0.00235001, "balance_loss_clip": 1.04208183, "balance_loss_mlp": 0.21104044, "epoch": 0.8655343454080866, "flos": 23914711382400.0, "grad_norm": 14.085665904854377, "language_loss": 0.76858211, "learning_rate": 1.8661250781430838e-07, "loss": 0.78347093, "num_input_tokens_seen": 310552300, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.23937988, "step": 14396, "time_per_iteration": 2.774465322494507 }, { "auxiliary_loss_clip": 0.01249969, "auxiliary_loss_mlp": 0.00219636, "balance_loss_clip": 1.02960896, "balance_loss_mlp": 0.19667596, "epoch": 0.8655944686607545, "flos": 24097209408000.0, "grad_norm": 38.363896738058294, "language_loss": 0.78758061, "learning_rate": 1.8644827131774954e-07, "loss": 0.80227661, "num_input_tokens_seen": 310572710, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.22973633, "step": 14397, "time_per_iteration": 2.6789660453796387 }, { "auxiliary_loss_clip": 0.01233836, "auxiliary_loss_mlp": 0.00228449, "balance_loss_clip": 1.01672173, "balance_loss_mlp": 0.20322424, "epoch": 0.8656545919134225, "flos": 23112718467840.0, "grad_norm": 7.862851656968512, "language_loss": 0.72673047, "learning_rate": 1.86284103591253e-07, "loss": 0.74135333, "num_input_tokens_seen": 310592460, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25244141, "step": 14398, "time_per_iteration": 2.703394651412964 }, { "auxiliary_loss_clip": 0.01239356, "auxiliary_loss_mlp": 0.00201403, "balance_loss_clip": 1.01894426, "balance_loss_mlp": 0.17798999, "epoch": 0.8657147151660904, "flos": 21141761339520.0, "grad_norm": 292.5650207115117, "language_loss": 0.8509922, "learning_rate": 1.8612000464104517e-07, "loss": 0.86539984, "num_input_tokens_seen": 310609375, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.23425293, "step": 14399, "time_per_iteration": 2.6535515785217285 }, { "auxiliary_loss_clip": 0.01231446, "auxiliary_loss_mlp": 0.00208243, "balance_loss_clip": 1.01775622, "balance_loss_mlp": 0.1857129, "epoch": 0.8657748384187585, "flos": 16289439943680.0, "grad_norm": 16.96858046722188, "language_loss": 0.99793309, "learning_rate": 1.8595597447334855e-07, "loss": 1.01233006, "num_input_tokens_seen": 310627405, "router_z_loss_clip": 2.13574219, "router_z_loss_mlp": 0.22546387, "step": 14400, "time_per_iteration": 2.6258764266967773 }, { "auxiliary_loss_clip": 0.01229325, "auxiliary_loss_mlp": 0.00236553, "balance_loss_clip": 1.01848197, "balance_loss_mlp": 0.21088734, "epoch": 0.8658349616714264, "flos": 30843890179200.0, "grad_norm": 34.879371449753165, "language_loss": 0.73249573, "learning_rate": 1.8579201309438353e-07, "loss": 0.74715453, "num_input_tokens_seen": 310649945, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.25671387, "step": 14401, "time_per_iteration": 2.754730701446533 }, { "auxiliary_loss_clip": 0.01266294, "auxiliary_loss_mlp": 0.00218836, "balance_loss_clip": 1.04048908, "balance_loss_mlp": 0.19163197, "epoch": 0.8658950849240944, "flos": 18952862440320.0, "grad_norm": 5.126452328879962, "language_loss": 0.84613448, "learning_rate": 1.8562812051036714e-07, "loss": 0.86098576, "num_input_tokens_seen": 310668285, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.2722168, "step": 14402, "time_per_iteration": 2.666607141494751 }, { "auxiliary_loss_clip": 0.01228121, "auxiliary_loss_mlp": 0.00203018, "balance_loss_clip": 1.01738036, "balance_loss_mlp": 0.18110704, "epoch": 0.8659552081767624, "flos": 23364344217600.0, "grad_norm": 8.192125364689412, "language_loss": 0.82053989, "learning_rate": 1.8546429672751397e-07, "loss": 0.83485126, "num_input_tokens_seen": 310687015, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.21899414, "step": 14403, "time_per_iteration": 2.6747066974639893 }, { "auxiliary_loss_clip": 0.01251817, "auxiliary_loss_mlp": 0.00226781, "balance_loss_clip": 1.03836632, "balance_loss_mlp": 0.2016044, "epoch": 0.8660153314294303, "flos": 23841992298240.0, "grad_norm": 86.69113300028728, "language_loss": 0.8095206, "learning_rate": 1.853005417520368e-07, "loss": 0.82430661, "num_input_tokens_seen": 310707580, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.25183105, "step": 14404, "time_per_iteration": 2.71597957611084 }, { "auxiliary_loss_clip": 0.01243052, "auxiliary_loss_mlp": 0.00212941, "balance_loss_clip": 1.02468002, "balance_loss_mlp": 0.18800277, "epoch": 0.8660754546820983, "flos": 23112467072640.0, "grad_norm": 19.89051435043802, "language_loss": 0.79810834, "learning_rate": 1.851368555901447e-07, "loss": 0.8126682, "num_input_tokens_seen": 310727300, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24975586, "step": 14405, "time_per_iteration": 2.6641123294830322 }, { "auxiliary_loss_clip": 0.01268758, "auxiliary_loss_mlp": 0.00227738, "balance_loss_clip": 1.04121065, "balance_loss_mlp": 0.20048696, "epoch": 0.8661355779347663, "flos": 14391991998720.0, "grad_norm": 16.150614698109077, "language_loss": 0.76327878, "learning_rate": 1.8497323824804467e-07, "loss": 0.77824372, "num_input_tokens_seen": 310744935, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.2722168, "step": 14406, "time_per_iteration": 2.620616912841797 }, { "auxiliary_loss_clip": 0.01243063, "auxiliary_loss_mlp": 0.00227335, "balance_loss_clip": 1.02711117, "balance_loss_mlp": 0.20411265, "epoch": 0.8661957011874343, "flos": 21870137329920.0, "grad_norm": 4.359759604987049, "language_loss": 0.89192414, "learning_rate": 1.8480968973194177e-07, "loss": 0.90662813, "num_input_tokens_seen": 310765085, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.23205566, "step": 14407, "time_per_iteration": 4.053891658782959 }, { "auxiliary_loss_clip": 0.01252002, "auxiliary_loss_mlp": 0.00216195, "balance_loss_clip": 1.03314495, "balance_loss_mlp": 0.19290137, "epoch": 0.8662558244401022, "flos": 21835160461440.0, "grad_norm": 63.02038596845861, "language_loss": 0.80491132, "learning_rate": 1.8464621004803748e-07, "loss": 0.81959331, "num_input_tokens_seen": 310783260, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.23303223, "step": 14408, "time_per_iteration": 2.6535091400146484 }, { "auxiliary_loss_clip": 0.01222907, "auxiliary_loss_mlp": 0.00190276, "balance_loss_clip": 1.01665354, "balance_loss_mlp": 0.16794786, "epoch": 0.8663159476927702, "flos": 17384104874880.0, "grad_norm": 40.455557470668964, "language_loss": 0.86307043, "learning_rate": 1.844827992025304e-07, "loss": 0.87720227, "num_input_tokens_seen": 310801970, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.2232666, "step": 14409, "time_per_iteration": 4.104370594024658 }, { "auxiliary_loss_clip": 0.01264265, "auxiliary_loss_mlp": 0.00222431, "balance_loss_clip": 1.04012597, "balance_loss_mlp": 0.1964554, "epoch": 0.8663760709454381, "flos": 22747722416640.0, "grad_norm": 6.367403859302036, "language_loss": 0.87569445, "learning_rate": 1.8431945720161757e-07, "loss": 0.89056146, "num_input_tokens_seen": 310822070, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.25952148, "step": 14410, "time_per_iteration": 2.7467525005340576 }, { "auxiliary_loss_clip": 0.01246463, "auxiliary_loss_mlp": 0.00211102, "balance_loss_clip": 1.02793646, "balance_loss_mlp": 0.18826175, "epoch": 0.8664361941981061, "flos": 17376850327680.0, "grad_norm": 12.454712624612602, "language_loss": 0.86189008, "learning_rate": 1.8415618405149315e-07, "loss": 0.87646574, "num_input_tokens_seen": 310838355, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.22827148, "step": 14411, "time_per_iteration": 2.644090414047241 }, { "auxiliary_loss_clip": 0.01220528, "auxiliary_loss_mlp": 0.00222443, "balance_loss_clip": 1.01042342, "balance_loss_mlp": 0.19973385, "epoch": 0.866496317450774, "flos": 16034438315520.0, "grad_norm": 10.227516009729678, "language_loss": 0.81355405, "learning_rate": 1.8399297975834794e-07, "loss": 0.8279838, "num_input_tokens_seen": 310856055, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.22741699, "step": 14412, "time_per_iteration": 2.6722054481506348 }, { "auxiliary_loss_clip": 0.01243708, "auxiliary_loss_mlp": 0.00201883, "balance_loss_clip": 1.02777004, "balance_loss_mlp": 0.17895931, "epoch": 0.8665564407034421, "flos": 20814830726400.0, "grad_norm": 30.21029381658901, "language_loss": 0.77283454, "learning_rate": 1.83829844328371e-07, "loss": 0.78729039, "num_input_tokens_seen": 310876695, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.22937012, "step": 14413, "time_per_iteration": 2.671900510787964 }, { "auxiliary_loss_clip": 0.01242969, "auxiliary_loss_mlp": 0.0022233, "balance_loss_clip": 1.02377892, "balance_loss_mlp": 0.19934618, "epoch": 0.86661656395611, "flos": 15815167741440.0, "grad_norm": 21.145821436005885, "language_loss": 0.73288125, "learning_rate": 1.8366677776774874e-07, "loss": 0.74753428, "num_input_tokens_seen": 310893880, "router_z_loss_clip": 2.19238281, "router_z_loss_mlp": 0.22973633, "step": 14414, "time_per_iteration": 2.6783015727996826 }, { "auxiliary_loss_clip": 0.01236498, "auxiliary_loss_mlp": 0.00216513, "balance_loss_clip": 1.02131832, "balance_loss_mlp": 0.1914556, "epoch": 0.866676687208778, "flos": 23036910814080.0, "grad_norm": 37.670684273387245, "language_loss": 0.70843768, "learning_rate": 1.8350378008266377e-07, "loss": 0.72296774, "num_input_tokens_seen": 310914145, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.25036621, "step": 14415, "time_per_iteration": 4.13609766960144 }, { "auxiliary_loss_clip": 0.01128294, "auxiliary_loss_mlp": 0.00078283, "balance_loss_clip": 0.99091297, "balance_loss_mlp": 0.07198897, "epoch": 0.866736810461446, "flos": 63802275212160.0, "grad_norm": 0.787172171129337, "language_loss": 0.59926486, "learning_rate": 1.8334085127929754e-07, "loss": 0.61133063, "num_input_tokens_seen": 310972825, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06298828, "step": 14416, "time_per_iteration": 3.2247490882873535 }, { "auxiliary_loss_clip": 0.01251512, "auxiliary_loss_mlp": 0.00220719, "balance_loss_clip": 1.03084087, "balance_loss_mlp": 0.19510069, "epoch": 0.8667969337141139, "flos": 20449367798400.0, "grad_norm": 980.640818664037, "language_loss": 0.8297472, "learning_rate": 1.831779913638285e-07, "loss": 0.84446955, "num_input_tokens_seen": 310992050, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25646973, "step": 14417, "time_per_iteration": 2.6243443489074707 }, { "auxiliary_loss_clip": 0.01241865, "auxiliary_loss_mlp": 0.00215285, "balance_loss_clip": 1.02233267, "balance_loss_mlp": 0.19073938, "epoch": 0.866857056966782, "flos": 21653703930240.0, "grad_norm": 5.087363103491599, "language_loss": 0.83104503, "learning_rate": 1.830152003424319e-07, "loss": 0.84561652, "num_input_tokens_seen": 311011105, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.2454834, "step": 14418, "time_per_iteration": 4.033541917800903 }, { "auxiliary_loss_clip": 0.01217321, "auxiliary_loss_mlp": 0.00189642, "balance_loss_clip": 1.00653458, "balance_loss_mlp": 0.1660144, "epoch": 0.8669171802194499, "flos": 22852832590080.0, "grad_norm": 6.681436647483804, "language_loss": 0.75300062, "learning_rate": 1.8285247822128126e-07, "loss": 0.76707023, "num_input_tokens_seen": 311032080, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.2364502, "step": 14419, "time_per_iteration": 2.6571156978607178 }, { "auxiliary_loss_clip": 0.01234311, "auxiliary_loss_mlp": 0.00204928, "balance_loss_clip": 1.02071548, "balance_loss_mlp": 0.18029936, "epoch": 0.8669773034721179, "flos": 18734166483840.0, "grad_norm": 11.676271653736977, "language_loss": 0.86335301, "learning_rate": 1.826898250065465e-07, "loss": 0.87774539, "num_input_tokens_seen": 311049735, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.24633789, "step": 14420, "time_per_iteration": 2.6447904109954834 }, { "auxiliary_loss_clip": 0.01232858, "auxiliary_loss_mlp": 0.00211396, "balance_loss_clip": 1.01879215, "balance_loss_mlp": 0.18750647, "epoch": 0.8670374267247858, "flos": 18916018064640.0, "grad_norm": 140.98921641469622, "language_loss": 0.89397126, "learning_rate": 1.8252724070439586e-07, "loss": 0.90841377, "num_input_tokens_seen": 311067675, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.2388916, "step": 14421, "time_per_iteration": 2.696129083633423 }, { "auxiliary_loss_clip": 0.01131538, "auxiliary_loss_mlp": 0.00151688, "balance_loss_clip": 0.99352878, "balance_loss_mlp": 0.14420199, "epoch": 0.8670975499774538, "flos": 48814527214080.0, "grad_norm": 0.7276496043649967, "language_loss": 0.48243839, "learning_rate": 1.823647253209941e-07, "loss": 0.49527067, "num_input_tokens_seen": 311126605, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07470703, "step": 14422, "time_per_iteration": 3.156510829925537 }, { "auxiliary_loss_clip": 0.01243428, "auxiliary_loss_mlp": 0.00211733, "balance_loss_clip": 1.02738857, "balance_loss_mlp": 0.18859471, "epoch": 0.8671576732301217, "flos": 26136145025280.0, "grad_norm": 2.1205543439032493, "language_loss": 0.82059848, "learning_rate": 1.8220227886250417e-07, "loss": 0.83515012, "num_input_tokens_seen": 311147325, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.23132324, "step": 14423, "time_per_iteration": 2.6857030391693115 }, { "auxiliary_loss_clip": 0.01217235, "auxiliary_loss_mlp": 0.00212977, "balance_loss_clip": 1.00788581, "balance_loss_mlp": 0.18944532, "epoch": 0.8672177964827897, "flos": 18367446579840.0, "grad_norm": 4.721070032725136, "language_loss": 0.82072103, "learning_rate": 1.8203990133508684e-07, "loss": 0.83502316, "num_input_tokens_seen": 311165385, "router_z_loss_clip": 2.09277344, "router_z_loss_mlp": 0.23535156, "step": 14424, "time_per_iteration": 2.626976251602173 }, { "auxiliary_loss_clip": 0.01212536, "auxiliary_loss_mlp": 0.00196343, "balance_loss_clip": 1.00832975, "balance_loss_mlp": 0.17351469, "epoch": 0.8672779197354576, "flos": 28545355992960.0, "grad_norm": 655.0083146338258, "language_loss": 0.77178931, "learning_rate": 1.8187759274489767e-07, "loss": 0.78587812, "num_input_tokens_seen": 311185860, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.22827148, "step": 14425, "time_per_iteration": 2.7270257472991943 }, { "auxiliary_loss_clip": 0.01254177, "auxiliary_loss_mlp": 0.0023119, "balance_loss_clip": 1.03431845, "balance_loss_mlp": 0.20631056, "epoch": 0.8673380429881257, "flos": 22382474970240.0, "grad_norm": 16.336243606853476, "language_loss": 0.75594604, "learning_rate": 1.817153530980926e-07, "loss": 0.7707997, "num_input_tokens_seen": 311205810, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.24865723, "step": 14426, "time_per_iteration": 2.6846866607666016 }, { "auxiliary_loss_clip": 0.01263695, "auxiliary_loss_mlp": 0.00224122, "balance_loss_clip": 1.03689492, "balance_loss_mlp": 0.19646585, "epoch": 0.8673981662407936, "flos": 20996430912000.0, "grad_norm": 24.370743604756942, "language_loss": 0.79621732, "learning_rate": 1.815531824008234e-07, "loss": 0.81109554, "num_input_tokens_seen": 311226080, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.27648926, "step": 14427, "time_per_iteration": 2.692060947418213 }, { "auxiliary_loss_clip": 0.01225461, "auxiliary_loss_mlp": 0.00219228, "balance_loss_clip": 1.0140028, "balance_loss_mlp": 0.19564791, "epoch": 0.8674582894934616, "flos": 24426797627520.0, "grad_norm": 166.29935808867512, "language_loss": 0.76380277, "learning_rate": 1.8139108065924004e-07, "loss": 0.77824962, "num_input_tokens_seen": 311246380, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.2355957, "step": 14428, "time_per_iteration": 2.7054030895233154 }, { "auxiliary_loss_clip": 0.01237441, "auxiliary_loss_mlp": 0.00215559, "balance_loss_clip": 1.02201843, "balance_loss_mlp": 0.19220544, "epoch": 0.8675184127461296, "flos": 20737514701440.0, "grad_norm": 4.269687509827742, "language_loss": 0.78764737, "learning_rate": 1.812290478794889e-07, "loss": 0.80217737, "num_input_tokens_seen": 311266465, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.23352051, "step": 14429, "time_per_iteration": 2.8552002906799316 }, { "auxiliary_loss_clip": 0.01238557, "auxiliary_loss_mlp": 0.00198632, "balance_loss_clip": 1.02138162, "balance_loss_mlp": 0.17501622, "epoch": 0.8675785359987975, "flos": 19135647774720.0, "grad_norm": 4.204298931177658, "language_loss": 0.77968502, "learning_rate": 1.810670840677151e-07, "loss": 0.79405689, "num_input_tokens_seen": 311285075, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.23608398, "step": 14430, "time_per_iteration": 2.6675286293029785 }, { "auxiliary_loss_clip": 0.01244113, "auxiliary_loss_mlp": 0.00222239, "balance_loss_clip": 1.02326143, "balance_loss_mlp": 0.19808714, "epoch": 0.8676386592514655, "flos": 22710662559360.0, "grad_norm": 71.64296434698727, "language_loss": 0.76824564, "learning_rate": 1.8090518923005948e-07, "loss": 0.78290915, "num_input_tokens_seen": 311303230, "router_z_loss_clip": 2.20800781, "router_z_loss_mlp": 0.24157715, "step": 14431, "time_per_iteration": 2.6766357421875 }, { "auxiliary_loss_clip": 0.01257566, "auxiliary_loss_mlp": 0.00205884, "balance_loss_clip": 1.0330106, "balance_loss_mlp": 0.1805045, "epoch": 0.8676987825041335, "flos": 14209853109120.0, "grad_norm": 29.788487520721386, "language_loss": 0.75343692, "learning_rate": 1.8074336337266116e-07, "loss": 0.76807141, "num_input_tokens_seen": 311318070, "router_z_loss_clip": 2.24316406, "router_z_loss_mlp": 0.25390625, "step": 14432, "time_per_iteration": 2.6552889347076416 }, { "auxiliary_loss_clip": 0.01252492, "auxiliary_loss_mlp": 0.00209199, "balance_loss_clip": 1.03458095, "balance_loss_mlp": 0.185166, "epoch": 0.8677589057568015, "flos": 13589927256960.0, "grad_norm": 70.93752956941222, "language_loss": 0.8876788, "learning_rate": 1.8058160650165656e-07, "loss": 0.90229571, "num_input_tokens_seen": 311334885, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24035645, "step": 14433, "time_per_iteration": 2.6424641609191895 }, { "auxiliary_loss_clip": 0.01134133, "auxiliary_loss_mlp": 0.00089919, "balance_loss_clip": 0.99428189, "balance_loss_mlp": 0.08398274, "epoch": 0.8678190290094694, "flos": 68933657370240.0, "grad_norm": 0.7024979810915418, "language_loss": 0.57549173, "learning_rate": 1.804199186231805e-07, "loss": 0.58773226, "num_input_tokens_seen": 311399780, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.05932617, "step": 14434, "time_per_iteration": 3.229625940322876 }, { "auxiliary_loss_clip": 0.01221965, "auxiliary_loss_mlp": 0.0020923, "balance_loss_clip": 1.01252651, "balance_loss_mlp": 0.18646134, "epoch": 0.8678791522621374, "flos": 32557726776960.0, "grad_norm": 17.720751389474906, "language_loss": 0.85944796, "learning_rate": 1.802582997433628e-07, "loss": 0.87375993, "num_input_tokens_seen": 311419610, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.22766113, "step": 14435, "time_per_iteration": 2.747859239578247 }, { "auxiliary_loss_clip": 0.01249511, "auxiliary_loss_mlp": 0.00222805, "balance_loss_clip": 1.02756238, "balance_loss_mlp": 0.19584012, "epoch": 0.8679392755148053, "flos": 35042637657600.0, "grad_norm": 8.90087199043531, "language_loss": 0.72887003, "learning_rate": 1.8009674986833322e-07, "loss": 0.74359322, "num_input_tokens_seen": 311440045, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.26965332, "step": 14436, "time_per_iteration": 2.859226703643799 }, { "auxiliary_loss_clip": 0.01257131, "auxiliary_loss_mlp": 0.00206988, "balance_loss_clip": 1.03548217, "balance_loss_mlp": 0.18059464, "epoch": 0.8679993987674733, "flos": 18552494471040.0, "grad_norm": 5.72616863708048, "language_loss": 0.77667844, "learning_rate": 1.7993526900421706e-07, "loss": 0.79131961, "num_input_tokens_seen": 311456660, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26416016, "step": 14437, "time_per_iteration": 2.6287810802459717 }, { "auxiliary_loss_clip": 0.01252688, "auxiliary_loss_mlp": 0.00224476, "balance_loss_clip": 1.03579092, "balance_loss_mlp": 0.20113534, "epoch": 0.8680595220201412, "flos": 27454390162560.0, "grad_norm": 5.427303325734982, "language_loss": 0.87289059, "learning_rate": 1.797738571571381e-07, "loss": 0.88766229, "num_input_tokens_seen": 311475460, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.23327637, "step": 14438, "time_per_iteration": 2.772761821746826 }, { "auxiliary_loss_clip": 0.01226867, "auxiliary_loss_mlp": 0.00208268, "balance_loss_clip": 1.01828671, "balance_loss_mlp": 0.18443799, "epoch": 0.8681196452728093, "flos": 19208797822080.0, "grad_norm": 26.92967881897053, "language_loss": 0.75124097, "learning_rate": 1.7961251433321656e-07, "loss": 0.76559234, "num_input_tokens_seen": 311494575, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.23815918, "step": 14439, "time_per_iteration": 2.660846471786499 }, { "auxiliary_loss_clip": 0.0122355, "auxiliary_loss_mlp": 0.00214087, "balance_loss_clip": 1.01394606, "balance_loss_mlp": 0.19081748, "epoch": 0.8681797685254772, "flos": 37560442417920.0, "grad_norm": 48.79252300773218, "language_loss": 0.72423911, "learning_rate": 1.7945124053857085e-07, "loss": 0.73861539, "num_input_tokens_seen": 311515805, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.23254395, "step": 14440, "time_per_iteration": 2.941610097885132 }, { "auxiliary_loss_clip": 0.0123467, "auxiliary_loss_mlp": 0.00214835, "balance_loss_clip": 1.02068806, "balance_loss_mlp": 0.18947887, "epoch": 0.8682398917781452, "flos": 23289937194240.0, "grad_norm": 77.91177420800938, "language_loss": 0.73050857, "learning_rate": 1.7929003577931722e-07, "loss": 0.74500358, "num_input_tokens_seen": 311536000, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.25354004, "step": 14441, "time_per_iteration": 2.808337688446045 }, { "auxiliary_loss_clip": 0.01224636, "auxiliary_loss_mlp": 0.00204069, "balance_loss_clip": 1.0165, "balance_loss_mlp": 0.18038228, "epoch": 0.8683000150308132, "flos": 21872794936320.0, "grad_norm": 45.48090989881931, "language_loss": 0.74201071, "learning_rate": 1.7912890006156722e-07, "loss": 0.75629783, "num_input_tokens_seen": 311556220, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.23669434, "step": 14442, "time_per_iteration": 2.7357115745544434 }, { "auxiliary_loss_clip": 0.01250109, "auxiliary_loss_mlp": 0.00231354, "balance_loss_clip": 1.02687919, "balance_loss_mlp": 0.20577207, "epoch": 0.8683601382834811, "flos": 14647209108480.0, "grad_norm": 4.047999401479111, "language_loss": 0.79764593, "learning_rate": 1.7896783339143195e-07, "loss": 0.81246054, "num_input_tokens_seen": 311572530, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.25585938, "step": 14443, "time_per_iteration": 2.6591262817382812 }, { "auxiliary_loss_clip": 0.01251148, "auxiliary_loss_mlp": 0.00214168, "balance_loss_clip": 1.03292799, "balance_loss_mlp": 0.19055283, "epoch": 0.8684202615361492, "flos": 26359904799360.0, "grad_norm": 2.2218030562216695, "language_loss": 0.8985728, "learning_rate": 1.7880683577501877e-07, "loss": 0.91322601, "num_input_tokens_seen": 311591105, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.23620605, "step": 14444, "time_per_iteration": 2.698276996612549 }, { "auxiliary_loss_clip": 0.01251323, "auxiliary_loss_mlp": 0.00217001, "balance_loss_clip": 1.02832842, "balance_loss_mlp": 0.19020227, "epoch": 0.8684803847888171, "flos": 20704010290560.0, "grad_norm": 12.313134326894172, "language_loss": 0.85530615, "learning_rate": 1.7864590721843342e-07, "loss": 0.8699894, "num_input_tokens_seen": 311608350, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26782227, "step": 14445, "time_per_iteration": 2.662186622619629 }, { "auxiliary_loss_clip": 0.01257648, "auxiliary_loss_mlp": 0.00206508, "balance_loss_clip": 1.04279423, "balance_loss_mlp": 0.18228433, "epoch": 0.8685405080414851, "flos": 22638123043200.0, "grad_norm": 36.1010968906973, "language_loss": 0.76787734, "learning_rate": 1.7848504772777728e-07, "loss": 0.78251892, "num_input_tokens_seen": 311626380, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.24206543, "step": 14446, "time_per_iteration": 2.7264111042022705 }, { "auxiliary_loss_clip": 0.01240262, "auxiliary_loss_mlp": 0.00229536, "balance_loss_clip": 1.02345335, "balance_loss_mlp": 0.20625478, "epoch": 0.868600631294153, "flos": 24822065865600.0, "grad_norm": 18.59204443926526, "language_loss": 0.88548744, "learning_rate": 1.7832425730915102e-07, "loss": 0.90018547, "num_input_tokens_seen": 311644345, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23291016, "step": 14447, "time_per_iteration": 2.7392091751098633 }, { "auxiliary_loss_clip": 0.01238199, "auxiliary_loss_mlp": 0.00208946, "balance_loss_clip": 1.02486157, "balance_loss_mlp": 0.18493719, "epoch": 0.868660754546821, "flos": 25113983696640.0, "grad_norm": 218.7240625937966, "language_loss": 0.79137343, "learning_rate": 1.781635359686515e-07, "loss": 0.8058449, "num_input_tokens_seen": 311663340, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.2401123, "step": 14448, "time_per_iteration": 2.681224822998047 }, { "auxiliary_loss_clip": 0.01244249, "auxiliary_loss_mlp": 0.00208597, "balance_loss_clip": 1.02916217, "balance_loss_mlp": 0.18527961, "epoch": 0.8687208777994889, "flos": 12677832178560.0, "grad_norm": 5.216532892274374, "language_loss": 0.87833142, "learning_rate": 1.7800288371237303e-07, "loss": 0.89285988, "num_input_tokens_seen": 311679860, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.23327637, "step": 14449, "time_per_iteration": 4.085373163223267 }, { "auxiliary_loss_clip": 0.01136429, "auxiliary_loss_mlp": 0.00105901, "balance_loss_clip": 0.9980613, "balance_loss_mlp": 0.09970205, "epoch": 0.8687810010521569, "flos": 65617235573760.0, "grad_norm": 0.8330900777375345, "language_loss": 0.59758663, "learning_rate": 1.7784230054640758e-07, "loss": 0.61000991, "num_input_tokens_seen": 311738135, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.06176758, "step": 14450, "time_per_iteration": 3.105616807937622 }, { "auxiliary_loss_clip": 0.01243383, "auxiliary_loss_mlp": 0.00217641, "balance_loss_clip": 1.02726805, "balance_loss_mlp": 0.19264235, "epoch": 0.8688411243048249, "flos": 24244012293120.0, "grad_norm": 37.30710260819788, "language_loss": 0.83444375, "learning_rate": 1.7768178647684517e-07, "loss": 0.84905398, "num_input_tokens_seen": 311756975, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25012207, "step": 14451, "time_per_iteration": 4.143481731414795 }, { "auxiliary_loss_clip": 0.01252032, "auxiliary_loss_mlp": 0.0022744, "balance_loss_clip": 1.03320909, "balance_loss_mlp": 0.20186959, "epoch": 0.8689012475574929, "flos": 18221828843520.0, "grad_norm": 15.213254317624477, "language_loss": 0.79059625, "learning_rate": 1.7752134150977205e-07, "loss": 0.80539101, "num_input_tokens_seen": 311771830, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25610352, "step": 14452, "time_per_iteration": 2.6442818641662598 }, { "auxiliary_loss_clip": 0.01272597, "auxiliary_loss_mlp": 0.00234817, "balance_loss_clip": 1.04509568, "balance_loss_mlp": 0.20747015, "epoch": 0.8689613708101608, "flos": 19646728439040.0, "grad_norm": 7.578078017777437, "language_loss": 0.79736006, "learning_rate": 1.7736096565127201e-07, "loss": 0.81243414, "num_input_tokens_seen": 311790130, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.27331543, "step": 14453, "time_per_iteration": 2.708402156829834 }, { "auxiliary_loss_clip": 0.0123231, "auxiliary_loss_mlp": 0.00199639, "balance_loss_clip": 1.01972747, "balance_loss_mlp": 0.1762263, "epoch": 0.8690214940628288, "flos": 11728749070080.0, "grad_norm": 10.049987043127816, "language_loss": 0.83503371, "learning_rate": 1.7720065890742664e-07, "loss": 0.84935319, "num_input_tokens_seen": 311808360, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.23388672, "step": 14454, "time_per_iteration": 2.642094373703003 }, { "auxiliary_loss_clip": 0.01263672, "auxiliary_loss_mlp": 0.00198473, "balance_loss_clip": 1.04338551, "balance_loss_mlp": 0.1756202, "epoch": 0.8690816173154968, "flos": 34936450076160.0, "grad_norm": 65.83516551867336, "language_loss": 0.66331506, "learning_rate": 1.7704042128431552e-07, "loss": 0.67793649, "num_input_tokens_seen": 311831325, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.22827148, "step": 14455, "time_per_iteration": 2.880167245864868 }, { "auxiliary_loss_clip": 0.01236798, "auxiliary_loss_mlp": 0.00211579, "balance_loss_clip": 1.02215004, "balance_loss_mlp": 0.18879855, "epoch": 0.8691417405681647, "flos": 11614804151040.0, "grad_norm": 138.72649612675872, "language_loss": 0.91189021, "learning_rate": 1.7688025278801378e-07, "loss": 0.92637396, "num_input_tokens_seen": 311848090, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.22753906, "step": 14456, "time_per_iteration": 2.593683958053589 }, { "auxiliary_loss_clip": 0.01271912, "auxiliary_loss_mlp": 0.00214227, "balance_loss_clip": 1.04475009, "balance_loss_mlp": 0.1868448, "epoch": 0.8692018638208328, "flos": 24608038677120.0, "grad_norm": 15.095050253067795, "language_loss": 0.8905015, "learning_rate": 1.7672015342459568e-07, "loss": 0.90536284, "num_input_tokens_seen": 311867855, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.27368164, "step": 14457, "time_per_iteration": 4.158725738525391 }, { "auxiliary_loss_clip": 0.01229659, "auxiliary_loss_mlp": 0.00209541, "balance_loss_clip": 1.02386093, "balance_loss_mlp": 0.18667704, "epoch": 0.8692619870735007, "flos": 25995124229760.0, "grad_norm": 26.58316767470622, "language_loss": 0.85039681, "learning_rate": 1.765601232001328e-07, "loss": 0.86478883, "num_input_tokens_seen": 311888675, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.2286377, "step": 14458, "time_per_iteration": 2.7336814403533936 }, { "auxiliary_loss_clip": 0.0126188, "auxiliary_loss_mlp": 0.00220396, "balance_loss_clip": 1.04235756, "balance_loss_mlp": 0.19478981, "epoch": 0.8693221103261687, "flos": 18041808856320.0, "grad_norm": 11.062678181025008, "language_loss": 0.78712213, "learning_rate": 1.7640016212069187e-07, "loss": 0.80194485, "num_input_tokens_seen": 311907310, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25646973, "step": 14459, "time_per_iteration": 2.6156773567199707 }, { "auxiliary_loss_clip": 0.01214763, "auxiliary_loss_mlp": 0.00220933, "balance_loss_clip": 1.0098691, "balance_loss_mlp": 0.19797295, "epoch": 0.8693822335788366, "flos": 27492347859840.0, "grad_norm": 110.63369094961008, "language_loss": 0.79828227, "learning_rate": 1.762402701923398e-07, "loss": 0.81263924, "num_input_tokens_seen": 311929635, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.22973633, "step": 14460, "time_per_iteration": 4.112328052520752 }, { "auxiliary_loss_clip": 0.01254736, "auxiliary_loss_mlp": 0.00228546, "balance_loss_clip": 1.0295577, "balance_loss_mlp": 0.20226039, "epoch": 0.8694423568315046, "flos": 24097712198400.0, "grad_norm": 12.034693757717, "language_loss": 0.73447168, "learning_rate": 1.7608044742113947e-07, "loss": 0.74930453, "num_input_tokens_seen": 311948800, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.26293945, "step": 14461, "time_per_iteration": 2.6623528003692627 }, { "auxiliary_loss_clip": 0.01237624, "auxiliary_loss_mlp": 0.00217633, "balance_loss_clip": 1.01984239, "balance_loss_mlp": 0.19189554, "epoch": 0.8695024800841725, "flos": 18362131367040.0, "grad_norm": 27.844154832878786, "language_loss": 0.91448706, "learning_rate": 1.7592069381315123e-07, "loss": 0.9290396, "num_input_tokens_seen": 311964090, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25769043, "step": 14462, "time_per_iteration": 2.604078769683838 }, { "auxiliary_loss_clip": 0.01266346, "auxiliary_loss_mlp": 0.00215595, "balance_loss_clip": 1.04371762, "balance_loss_mlp": 0.19050106, "epoch": 0.8695626033368405, "flos": 14027750133120.0, "grad_norm": 16.422594601264986, "language_loss": 0.74908972, "learning_rate": 1.757610093744335e-07, "loss": 0.7639091, "num_input_tokens_seen": 311981460, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25109863, "step": 14463, "time_per_iteration": 2.6005260944366455 }, { "auxiliary_loss_clip": 0.01256872, "auxiliary_loss_mlp": 0.00233202, "balance_loss_clip": 1.03645873, "balance_loss_mlp": 0.20578426, "epoch": 0.8696227265895085, "flos": 16836862193280.0, "grad_norm": 46.14021836808044, "language_loss": 0.76520407, "learning_rate": 1.7560139411104058e-07, "loss": 0.78010488, "num_input_tokens_seen": 312000115, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.27429199, "step": 14464, "time_per_iteration": 2.5898923873901367 }, { "auxiliary_loss_clip": 0.01271989, "auxiliary_loss_mlp": 0.00200165, "balance_loss_clip": 1.04126239, "balance_loss_mlp": 0.17329511, "epoch": 0.8696828498421765, "flos": 21799070271360.0, "grad_norm": 50.39975482149876, "language_loss": 0.73274297, "learning_rate": 1.7544184802902607e-07, "loss": 0.74746454, "num_input_tokens_seen": 312020770, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.26843262, "step": 14465, "time_per_iteration": 2.6800997257232666 }, { "auxiliary_loss_clip": 0.01225815, "auxiliary_loss_mlp": 0.00217368, "balance_loss_clip": 1.01467586, "balance_loss_mlp": 0.1928224, "epoch": 0.8697429730948444, "flos": 22894812610560.0, "grad_norm": 3.410334224477989, "language_loss": 0.89551115, "learning_rate": 1.7528237113443934e-07, "loss": 0.90994298, "num_input_tokens_seen": 312041870, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.2454834, "step": 14466, "time_per_iteration": 2.670989990234375 }, { "auxiliary_loss_clip": 0.01272732, "auxiliary_loss_mlp": 0.00233061, "balance_loss_clip": 1.04244208, "balance_loss_mlp": 0.20611998, "epoch": 0.8698030963475124, "flos": 24717458482560.0, "grad_norm": 21.046532184981153, "language_loss": 0.73686492, "learning_rate": 1.7512296343332779e-07, "loss": 0.75192285, "num_input_tokens_seen": 312058210, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.26965332, "step": 14467, "time_per_iteration": 2.7799267768859863 }, { "auxiliary_loss_clip": 0.01231399, "auxiliary_loss_mlp": 0.00204426, "balance_loss_clip": 1.02191401, "balance_loss_mlp": 0.18033394, "epoch": 0.8698632196001803, "flos": 28442221067520.0, "grad_norm": 22.081772116788926, "language_loss": 0.74699962, "learning_rate": 1.7496362493173655e-07, "loss": 0.7613579, "num_input_tokens_seen": 312082665, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.24084473, "step": 14468, "time_per_iteration": 2.777477502822876 }, { "auxiliary_loss_clip": 0.01226509, "auxiliary_loss_mlp": 0.00201817, "balance_loss_clip": 1.01625562, "balance_loss_mlp": 0.17861933, "epoch": 0.8699233428528483, "flos": 27636457224960.0, "grad_norm": 311.412655219317, "language_loss": 0.77070272, "learning_rate": 1.7480435563570773e-07, "loss": 0.7849859, "num_input_tokens_seen": 312101960, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23205566, "step": 14469, "time_per_iteration": 2.705764055252075 }, { "auxiliary_loss_clip": 0.01228708, "auxiliary_loss_mlp": 0.00179351, "balance_loss_clip": 1.01698804, "balance_loss_mlp": 0.15517579, "epoch": 0.8699834661055164, "flos": 20045659864320.0, "grad_norm": 12.120257713655684, "language_loss": 0.9197787, "learning_rate": 1.7464515555128024e-07, "loss": 0.93385923, "num_input_tokens_seen": 312117125, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.24169922, "step": 14470, "time_per_iteration": 2.6190690994262695 }, { "auxiliary_loss_clip": 0.0123979, "auxiliary_loss_mlp": 0.00208896, "balance_loss_clip": 1.02494907, "balance_loss_mlp": 0.1839934, "epoch": 0.8700435893581843, "flos": 23732787974400.0, "grad_norm": 78.3740547772297, "language_loss": 0.79336393, "learning_rate": 1.7448602468449148e-07, "loss": 0.80785084, "num_input_tokens_seen": 312135775, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24902344, "step": 14471, "time_per_iteration": 2.671668529510498 }, { "auxiliary_loss_clip": 0.01239632, "auxiliary_loss_mlp": 0.00208555, "balance_loss_clip": 1.02742243, "balance_loss_mlp": 0.18560722, "epoch": 0.8701037126108523, "flos": 23548422441600.0, "grad_norm": 17.623798832514534, "language_loss": 0.84735155, "learning_rate": 1.7432696304137573e-07, "loss": 0.86183345, "num_input_tokens_seen": 312156070, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.22949219, "step": 14472, "time_per_iteration": 2.673552989959717 }, { "auxiliary_loss_clip": 0.01247144, "auxiliary_loss_mlp": 0.0020041, "balance_loss_clip": 1.02669406, "balance_loss_mlp": 0.17746247, "epoch": 0.8701638358635202, "flos": 18843442634880.0, "grad_norm": 3.2535886430242464, "language_loss": 0.82449841, "learning_rate": 1.741679706279644e-07, "loss": 0.838974, "num_input_tokens_seen": 312174380, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.22973633, "step": 14473, "time_per_iteration": 2.6134870052337646 }, { "auxiliary_loss_clip": 0.01244219, "auxiliary_loss_mlp": 0.00199802, "balance_loss_clip": 1.02539849, "balance_loss_mlp": 0.17535236, "epoch": 0.8702239591161882, "flos": 27928339142400.0, "grad_norm": 54.33032493520876, "language_loss": 0.78827953, "learning_rate": 1.7400904745028644e-07, "loss": 0.80271971, "num_input_tokens_seen": 312195130, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.2442627, "step": 14474, "time_per_iteration": 2.7181267738342285 }, { "auxiliary_loss_clip": 0.01263635, "auxiliary_loss_mlp": 0.00224834, "balance_loss_clip": 1.03855753, "balance_loss_mlp": 0.1987514, "epoch": 0.8702840823688561, "flos": 17233997938560.0, "grad_norm": 957.754079875734, "language_loss": 0.79931957, "learning_rate": 1.7385019351436925e-07, "loss": 0.81420428, "num_input_tokens_seen": 312212300, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.2611084, "step": 14475, "time_per_iteration": 2.7456400394439697 }, { "auxiliary_loss_clip": 0.01253454, "auxiliary_loss_mlp": 0.0021785, "balance_loss_clip": 1.02880073, "balance_loss_mlp": 0.19145715, "epoch": 0.8703442056215241, "flos": 19427565605760.0, "grad_norm": 27.707442234599654, "language_loss": 0.86105067, "learning_rate": 1.736914088262349e-07, "loss": 0.87576365, "num_input_tokens_seen": 312231735, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.26403809, "step": 14476, "time_per_iteration": 2.7972958087921143 }, { "auxiliary_loss_clip": 0.01240347, "auxiliary_loss_mlp": 0.0021517, "balance_loss_clip": 1.02869904, "balance_loss_mlp": 0.19125688, "epoch": 0.8704043288741921, "flos": 22273845264000.0, "grad_norm": 14.411463146296816, "language_loss": 0.79428113, "learning_rate": 1.7353269339190525e-07, "loss": 0.80883634, "num_input_tokens_seen": 312253060, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.23925781, "step": 14477, "time_per_iteration": 2.6834914684295654 }, { "auxiliary_loss_clip": 0.01253155, "auxiliary_loss_mlp": 0.00199069, "balance_loss_clip": 1.03177059, "balance_loss_mlp": 0.17372511, "epoch": 0.8704644521268601, "flos": 16648725732480.0, "grad_norm": 63.436235071279214, "language_loss": 0.6904639, "learning_rate": 1.7337404721739946e-07, "loss": 0.7049861, "num_input_tokens_seen": 312269460, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.25354004, "step": 14478, "time_per_iteration": 2.7032220363616943 }, { "auxiliary_loss_clip": 0.0124302, "auxiliary_loss_mlp": 0.00208214, "balance_loss_clip": 1.02921748, "balance_loss_mlp": 0.18407427, "epoch": 0.870524575379528, "flos": 24280210224000.0, "grad_norm": 83.51673532397983, "language_loss": 0.80450118, "learning_rate": 1.732154703087323e-07, "loss": 0.81901348, "num_input_tokens_seen": 312289830, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.24133301, "step": 14479, "time_per_iteration": 2.7134392261505127 }, { "auxiliary_loss_clip": 0.01247509, "auxiliary_loss_mlp": 0.00221546, "balance_loss_clip": 1.02786589, "balance_loss_mlp": 0.19539195, "epoch": 0.870584698632196, "flos": 28768684803840.0, "grad_norm": 5.945497997819187, "language_loss": 0.80040467, "learning_rate": 1.7305696267191805e-07, "loss": 0.81509519, "num_input_tokens_seen": 312311320, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.26171875, "step": 14480, "time_per_iteration": 2.7847492694854736 }, { "auxiliary_loss_clip": 0.01262746, "auxiliary_loss_mlp": 0.00216319, "balance_loss_clip": 1.03735423, "balance_loss_mlp": 0.18907958, "epoch": 0.8706448218848639, "flos": 32449635774720.0, "grad_norm": 7.4329716029140736, "language_loss": 0.77999079, "learning_rate": 1.728985243129666e-07, "loss": 0.79478145, "num_input_tokens_seen": 312332095, "router_z_loss_clip": 2.25292969, "router_z_loss_mlp": 0.27233887, "step": 14481, "time_per_iteration": 2.713491678237915 }, { "auxiliary_loss_clip": 0.01228056, "auxiliary_loss_mlp": 0.00193971, "balance_loss_clip": 1.0169729, "balance_loss_mlp": 0.17216748, "epoch": 0.8707049451375319, "flos": 22748009725440.0, "grad_norm": 16.10345414681128, "language_loss": 0.83947051, "learning_rate": 1.7274015523788643e-07, "loss": 0.85369086, "num_input_tokens_seen": 312351225, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.21801758, "step": 14482, "time_per_iteration": 2.695855140686035 }, { "auxiliary_loss_clip": 0.01248239, "auxiliary_loss_mlp": 0.0021345, "balance_loss_clip": 1.03366458, "balance_loss_mlp": 0.18852319, "epoch": 0.8707650683902, "flos": 15851976203520.0, "grad_norm": 177.12846440245409, "language_loss": 0.84467399, "learning_rate": 1.7258185545268234e-07, "loss": 0.85929084, "num_input_tokens_seen": 312369730, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24951172, "step": 14483, "time_per_iteration": 2.6260478496551514 }, { "auxiliary_loss_clip": 0.01265195, "auxiliary_loss_mlp": 0.00222933, "balance_loss_clip": 1.04477072, "balance_loss_mlp": 0.19595626, "epoch": 0.8708251916428679, "flos": 16468131127680.0, "grad_norm": 30.611077869350186, "language_loss": 0.71669275, "learning_rate": 1.7242362496335749e-07, "loss": 0.73157406, "num_input_tokens_seen": 312386780, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.26965332, "step": 14484, "time_per_iteration": 2.6987297534942627 }, { "auxiliary_loss_clip": 0.01264318, "auxiliary_loss_mlp": 0.00201224, "balance_loss_clip": 1.0437218, "balance_loss_mlp": 0.17728674, "epoch": 0.8708853148955359, "flos": 15377847655680.0, "grad_norm": 117.77642804006734, "language_loss": 0.80282366, "learning_rate": 1.7226546377591222e-07, "loss": 0.81747907, "num_input_tokens_seen": 312404875, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.23937988, "step": 14485, "time_per_iteration": 2.6970574855804443 }, { "auxiliary_loss_clip": 0.01241223, "auxiliary_loss_mlp": 0.00220512, "balance_loss_clip": 1.02571332, "balance_loss_mlp": 0.19473846, "epoch": 0.8709454381482038, "flos": 30551325903360.0, "grad_norm": 3.04471261074546, "language_loss": 0.6957981, "learning_rate": 1.7210737189634373e-07, "loss": 0.71041542, "num_input_tokens_seen": 312425280, "router_z_loss_clip": 2.15917969, "router_z_loss_mlp": 0.25744629, "step": 14486, "time_per_iteration": 2.7306582927703857 }, { "auxiliary_loss_clip": 0.01271254, "auxiliary_loss_mlp": 0.00221297, "balance_loss_clip": 1.0481956, "balance_loss_mlp": 0.19578664, "epoch": 0.8710055614008718, "flos": 22601422321920.0, "grad_norm": 60.62939393813725, "language_loss": 0.7203474, "learning_rate": 1.7194934933064653e-07, "loss": 0.73527288, "num_input_tokens_seen": 312443835, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25512695, "step": 14487, "time_per_iteration": 2.651566743850708 }, { "auxiliary_loss_clip": 0.01248176, "auxiliary_loss_mlp": 0.00211615, "balance_loss_clip": 1.03054225, "balance_loss_mlp": 0.18854778, "epoch": 0.8710656846535397, "flos": 18443146492800.0, "grad_norm": 175.7333165485703, "language_loss": 0.76993775, "learning_rate": 1.7179139608481318e-07, "loss": 0.78453577, "num_input_tokens_seen": 312460830, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.23046875, "step": 14488, "time_per_iteration": 2.603893518447876 }, { "auxiliary_loss_clip": 0.01270253, "auxiliary_loss_mlp": 0.00218098, "balance_loss_clip": 1.04663801, "balance_loss_mlp": 0.19269493, "epoch": 0.8711258079062077, "flos": 16503862181760.0, "grad_norm": 4.901576352529993, "language_loss": 0.92841756, "learning_rate": 1.716335121648338e-07, "loss": 0.94330114, "num_input_tokens_seen": 312477575, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.25390625, "step": 14489, "time_per_iteration": 2.6193275451660156 }, { "auxiliary_loss_clip": 0.01280146, "auxiliary_loss_mlp": 0.00220063, "balance_loss_clip": 1.05003071, "balance_loss_mlp": 0.19428957, "epoch": 0.8711859311588757, "flos": 15663336952320.0, "grad_norm": 35.41614737443463, "language_loss": 0.85983276, "learning_rate": 1.7147569757669445e-07, "loss": 0.87483484, "num_input_tokens_seen": 312492140, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.25756836, "step": 14490, "time_per_iteration": 2.6201045513153076 }, { "auxiliary_loss_clip": 0.01255727, "auxiliary_loss_mlp": 0.00224123, "balance_loss_clip": 1.03357077, "balance_loss_mlp": 0.19818312, "epoch": 0.8712460544115437, "flos": 15557544420480.0, "grad_norm": 51.918846993825625, "language_loss": 0.8554275, "learning_rate": 1.7131795232638012e-07, "loss": 0.87022603, "num_input_tokens_seen": 312508400, "router_z_loss_clip": 2.22167969, "router_z_loss_mlp": 0.25952148, "step": 14491, "time_per_iteration": 2.6245675086975098 }, { "auxiliary_loss_clip": 0.01238034, "auxiliary_loss_mlp": 0.00201111, "balance_loss_clip": 1.02710462, "balance_loss_mlp": 0.17718577, "epoch": 0.8713061776642116, "flos": 16763568491520.0, "grad_norm": 5.789398285383175, "language_loss": 0.73789054, "learning_rate": 1.711602764198723e-07, "loss": 0.7522819, "num_input_tokens_seen": 312525915, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.23937988, "step": 14492, "time_per_iteration": 4.085930824279785 }, { "auxiliary_loss_clip": 0.01231492, "auxiliary_loss_mlp": 0.00190984, "balance_loss_clip": 1.01867545, "balance_loss_mlp": 0.16842982, "epoch": 0.8713663009168796, "flos": 24279887001600.0, "grad_norm": 23.8760136361393, "language_loss": 0.77669191, "learning_rate": 1.7100266986314992e-07, "loss": 0.79091668, "num_input_tokens_seen": 312544735, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.22546387, "step": 14493, "time_per_iteration": 4.167388439178467 }, { "auxiliary_loss_clip": 0.01256785, "auxiliary_loss_mlp": 0.00200736, "balance_loss_clip": 1.03577781, "balance_loss_mlp": 0.17411667, "epoch": 0.8714264241695475, "flos": 23795594904960.0, "grad_norm": 23.48311690135246, "language_loss": 0.97903329, "learning_rate": 1.7084513266218936e-07, "loss": 0.99360847, "num_input_tokens_seen": 312557910, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26635742, "step": 14494, "time_per_iteration": 2.6229453086853027 }, { "auxiliary_loss_clip": 0.01230192, "auxiliary_loss_mlp": 0.00192327, "balance_loss_clip": 1.02203214, "balance_loss_mlp": 0.16965359, "epoch": 0.8714865474222155, "flos": 37997942071680.0, "grad_norm": 3.7509431244749916, "language_loss": 0.66597086, "learning_rate": 1.7068766482296514e-07, "loss": 0.68019605, "num_input_tokens_seen": 312580360, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.2265625, "step": 14495, "time_per_iteration": 2.777500629425049 }, { "auxiliary_loss_clip": 0.0125138, "auxiliary_loss_mlp": 0.00200682, "balance_loss_clip": 1.03066242, "balance_loss_mlp": 0.17701942, "epoch": 0.8715466706748836, "flos": 22455696844800.0, "grad_norm": 77.24618230417248, "language_loss": 0.9146384, "learning_rate": 1.7053026635144762e-07, "loss": 0.92915899, "num_input_tokens_seen": 312597550, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.23669434, "step": 14496, "time_per_iteration": 2.6776983737945557 }, { "auxiliary_loss_clip": 0.01256233, "auxiliary_loss_mlp": 0.00216165, "balance_loss_clip": 1.03795922, "balance_loss_mlp": 0.19022501, "epoch": 0.8716067939275515, "flos": 21215126868480.0, "grad_norm": 125.42606296148755, "language_loss": 0.86040181, "learning_rate": 1.7037293725360624e-07, "loss": 0.87512583, "num_input_tokens_seen": 312616435, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.25964355, "step": 14497, "time_per_iteration": 2.686739444732666 }, { "auxiliary_loss_clip": 0.01257207, "auxiliary_loss_mlp": 0.00197405, "balance_loss_clip": 1.03715181, "balance_loss_mlp": 0.17108358, "epoch": 0.8716669171802195, "flos": 22997732054400.0, "grad_norm": 101.82940045582598, "language_loss": 0.76165682, "learning_rate": 1.70215677535406e-07, "loss": 0.77620292, "num_input_tokens_seen": 312632770, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26342773, "step": 14498, "time_per_iteration": 2.6759791374206543 }, { "auxiliary_loss_clip": 0.01262519, "auxiliary_loss_mlp": 0.00206095, "balance_loss_clip": 1.04067349, "balance_loss_mlp": 0.17992896, "epoch": 0.8717270404328874, "flos": 29784058462080.0, "grad_norm": 6.6126582225960915, "language_loss": 0.65094918, "learning_rate": 1.700584872028108e-07, "loss": 0.66563535, "num_input_tokens_seen": 312651900, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.26184082, "step": 14499, "time_per_iteration": 4.149197578430176 }, { "auxiliary_loss_clip": 0.01258395, "auxiliary_loss_mlp": 0.00200321, "balance_loss_clip": 1.03567219, "balance_loss_mlp": 0.17492931, "epoch": 0.8717871636855554, "flos": 22018125363840.0, "grad_norm": 7.065797763527555, "language_loss": 0.90485406, "learning_rate": 1.6990136626178097e-07, "loss": 0.91944128, "num_input_tokens_seen": 312671380, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25415039, "step": 14500, "time_per_iteration": 2.6655802726745605 }, { "auxiliary_loss_clip": 0.01244913, "auxiliary_loss_mlp": 0.00210672, "balance_loss_clip": 1.02843976, "balance_loss_mlp": 0.18692577, "epoch": 0.8718472869382233, "flos": 16654256426880.0, "grad_norm": 9.121639187295402, "language_loss": 0.83436751, "learning_rate": 1.6974431471827466e-07, "loss": 0.84892333, "num_input_tokens_seen": 312689215, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.23742676, "step": 14501, "time_per_iteration": 2.626265048980713 }, { "auxiliary_loss_clip": 0.01261501, "auxiliary_loss_mlp": 0.00215915, "balance_loss_clip": 1.03455353, "balance_loss_mlp": 0.18880659, "epoch": 0.8719074101908914, "flos": 19495328613120.0, "grad_norm": 302.4525894446234, "language_loss": 0.749596, "learning_rate": 1.695873325782482e-07, "loss": 0.76437008, "num_input_tokens_seen": 312706400, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.27124023, "step": 14502, "time_per_iteration": 4.041804075241089 }, { "auxiliary_loss_clip": 0.01246184, "auxiliary_loss_mlp": 0.00200023, "balance_loss_clip": 1.03113723, "balance_loss_mlp": 0.17615743, "epoch": 0.8719675334435593, "flos": 33070890430080.0, "grad_norm": 37.583777503916146, "language_loss": 0.74528849, "learning_rate": 1.6943041984765262e-07, "loss": 0.75975055, "num_input_tokens_seen": 312727985, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.2388916, "step": 14503, "time_per_iteration": 2.7424938678741455 }, { "auxiliary_loss_clip": 0.01234114, "auxiliary_loss_mlp": 0.00205079, "balance_loss_clip": 1.02095342, "balance_loss_mlp": 0.18018788, "epoch": 0.8720276566962273, "flos": 13626268842240.0, "grad_norm": 19706.802810613754, "language_loss": 0.77401066, "learning_rate": 1.6927357653243912e-07, "loss": 0.78840256, "num_input_tokens_seen": 312745025, "router_z_loss_clip": 2.13183594, "router_z_loss_mlp": 0.2487793, "step": 14504, "time_per_iteration": 2.665323257446289 }, { "auxiliary_loss_clip": 0.01242403, "auxiliary_loss_mlp": 0.002203, "balance_loss_clip": 1.0238682, "balance_loss_mlp": 0.19409731, "epoch": 0.8720877799488952, "flos": 23514163845120.0, "grad_norm": 69.77825874882038, "language_loss": 0.78929836, "learning_rate": 1.691168026385552e-07, "loss": 0.8039254, "num_input_tokens_seen": 312764170, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.26196289, "step": 14505, "time_per_iteration": 2.650418758392334 }, { "auxiliary_loss_clip": 0.01244384, "auxiliary_loss_mlp": 0.00197278, "balance_loss_clip": 1.03169751, "balance_loss_mlp": 0.17294714, "epoch": 0.8721479032015632, "flos": 20814148368000.0, "grad_norm": 15.284107560560482, "language_loss": 0.84737837, "learning_rate": 1.6896009817194545e-07, "loss": 0.86179507, "num_input_tokens_seen": 312783830, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.24328613, "step": 14506, "time_per_iteration": 2.746166706085205 }, { "auxiliary_loss_clip": 0.01260336, "auxiliary_loss_mlp": 0.00206585, "balance_loss_clip": 1.03819811, "balance_loss_mlp": 0.18244514, "epoch": 0.8722080264542311, "flos": 19463655795840.0, "grad_norm": 174.46938398317207, "language_loss": 0.84305286, "learning_rate": 1.6880346313855221e-07, "loss": 0.8577221, "num_input_tokens_seen": 312802015, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.24133301, "step": 14507, "time_per_iteration": 2.628892183303833 }, { "auxiliary_loss_clip": 0.01283119, "auxiliary_loss_mlp": 0.00234862, "balance_loss_clip": 1.04877019, "balance_loss_mlp": 0.20702681, "epoch": 0.8722681497068991, "flos": 21761866759680.0, "grad_norm": 14.746888390435984, "language_loss": 0.82811856, "learning_rate": 1.686468975443156e-07, "loss": 0.84329832, "num_input_tokens_seen": 312820650, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.27832031, "step": 14508, "time_per_iteration": 2.6429219245910645 }, { "auxiliary_loss_clip": 0.01272709, "auxiliary_loss_mlp": 0.00214786, "balance_loss_clip": 1.04505205, "balance_loss_mlp": 0.18820238, "epoch": 0.8723282729595672, "flos": 28877134942080.0, "grad_norm": 14.309454550351468, "language_loss": 0.75320399, "learning_rate": 1.6849040139517202e-07, "loss": 0.76807892, "num_input_tokens_seen": 312841310, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.26574707, "step": 14509, "time_per_iteration": 2.673492670059204 }, { "auxiliary_loss_clip": 0.01253163, "auxiliary_loss_mlp": 0.00229361, "balance_loss_clip": 1.04009438, "balance_loss_mlp": 0.20509028, "epoch": 0.8723883962122351, "flos": 26469145036800.0, "grad_norm": 24.423605678364538, "language_loss": 0.66766095, "learning_rate": 1.683339746970558e-07, "loss": 0.68248618, "num_input_tokens_seen": 312862100, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24267578, "step": 14510, "time_per_iteration": 2.67177152633667 }, { "auxiliary_loss_clip": 0.01291977, "auxiliary_loss_mlp": 0.00231419, "balance_loss_clip": 1.05663514, "balance_loss_mlp": 0.20208187, "epoch": 0.8724485194649031, "flos": 20521476351360.0, "grad_norm": 1825.2609354814567, "language_loss": 0.78407907, "learning_rate": 1.6817761745589865e-07, "loss": 0.79931307, "num_input_tokens_seen": 312880220, "router_z_loss_clip": 2.35449219, "router_z_loss_mlp": 0.29345703, "step": 14511, "time_per_iteration": 2.6482410430908203 }, { "auxiliary_loss_clip": 0.01237272, "auxiliary_loss_mlp": 0.00188479, "balance_loss_clip": 1.02357984, "balance_loss_mlp": 0.165472, "epoch": 0.872508642717571, "flos": 24353360271360.0, "grad_norm": 73.75506718892468, "language_loss": 0.89352709, "learning_rate": 1.6802132967763027e-07, "loss": 0.90778458, "num_input_tokens_seen": 312900765, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.22998047, "step": 14512, "time_per_iteration": 2.6537091732025146 }, { "auxiliary_loss_clip": 0.01158281, "auxiliary_loss_mlp": 0.00077728, "balance_loss_clip": 1.01876462, "balance_loss_mlp": 0.07038489, "epoch": 0.872568765970239, "flos": 61410012485760.0, "grad_norm": 0.7800547381198412, "language_loss": 0.58200127, "learning_rate": 1.6786511136817617e-07, "loss": 0.59436136, "num_input_tokens_seen": 312955840, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.07324219, "step": 14513, "time_per_iteration": 3.0596351623535156 }, { "auxiliary_loss_clip": 0.01245737, "auxiliary_loss_mlp": 0.00213228, "balance_loss_clip": 1.02959204, "balance_loss_mlp": 0.18595287, "epoch": 0.8726288892229069, "flos": 22598046443520.0, "grad_norm": 17.079614424013812, "language_loss": 0.82390845, "learning_rate": 1.6770896253346112e-07, "loss": 0.83849806, "num_input_tokens_seen": 312973565, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.27294922, "step": 14514, "time_per_iteration": 2.694761276245117 }, { "auxiliary_loss_clip": 0.01273473, "auxiliary_loss_mlp": 0.00238421, "balance_loss_clip": 1.04231715, "balance_loss_mlp": 0.21170656, "epoch": 0.872689012475575, "flos": 25885201633920.0, "grad_norm": 13.839089380117997, "language_loss": 0.747379, "learning_rate": 1.675528831794055e-07, "loss": 0.7624979, "num_input_tokens_seen": 312994660, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.26696777, "step": 14515, "time_per_iteration": 2.692723035812378 }, { "auxiliary_loss_clip": 0.0125394, "auxiliary_loss_mlp": 0.00211249, "balance_loss_clip": 1.03028607, "balance_loss_mlp": 0.18430801, "epoch": 0.8727491357282429, "flos": 21506721477120.0, "grad_norm": 9.442429702474277, "language_loss": 0.86964959, "learning_rate": 1.6739687331192842e-07, "loss": 0.88430148, "num_input_tokens_seen": 313009860, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.26940918, "step": 14516, "time_per_iteration": 2.6394665241241455 }, { "auxiliary_loss_clip": 0.01255419, "auxiliary_loss_mlp": 0.00233192, "balance_loss_clip": 1.03203619, "balance_loss_mlp": 0.20646548, "epoch": 0.8728092589809109, "flos": 19207504932480.0, "grad_norm": 43.78887780087233, "language_loss": 0.8524487, "learning_rate": 1.672409329369453e-07, "loss": 0.86733484, "num_input_tokens_seen": 313027025, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26733398, "step": 14517, "time_per_iteration": 2.7066662311553955 }, { "auxiliary_loss_clip": 0.0122895, "auxiliary_loss_mlp": 0.00200712, "balance_loss_clip": 1.0175705, "balance_loss_mlp": 0.17691831, "epoch": 0.8728693822335788, "flos": 20595308757120.0, "grad_norm": 6.4835541553170835, "language_loss": 0.80909604, "learning_rate": 1.6708506206036966e-07, "loss": 0.82339263, "num_input_tokens_seen": 313046830, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.23803711, "step": 14518, "time_per_iteration": 2.6162405014038086 }, { "auxiliary_loss_clip": 0.01217015, "auxiliary_loss_mlp": 0.00191458, "balance_loss_clip": 1.00917768, "balance_loss_mlp": 0.16733032, "epoch": 0.8729295054862468, "flos": 21728613744000.0, "grad_norm": 4.465150732347615, "language_loss": 0.795259, "learning_rate": 1.6692926068811275e-07, "loss": 0.8093437, "num_input_tokens_seen": 313067715, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.24157715, "step": 14519, "time_per_iteration": 2.746877670288086 }, { "auxiliary_loss_clip": 0.01259853, "auxiliary_loss_mlp": 0.00219701, "balance_loss_clip": 1.03870106, "balance_loss_mlp": 0.19339153, "epoch": 0.8729896287389147, "flos": 17673436926720.0, "grad_norm": 14.016767935769975, "language_loss": 0.90666008, "learning_rate": 1.6677352882608142e-07, "loss": 0.92145568, "num_input_tokens_seen": 313082305, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.26293945, "step": 14520, "time_per_iteration": 2.574380874633789 }, { "auxiliary_loss_clip": 0.01262586, "auxiliary_loss_mlp": 0.00225774, "balance_loss_clip": 1.03580523, "balance_loss_mlp": 0.19970301, "epoch": 0.8730497519915827, "flos": 24571804832640.0, "grad_norm": 7.980171804249051, "language_loss": 0.89436966, "learning_rate": 1.666178664801816e-07, "loss": 0.90925336, "num_input_tokens_seen": 313101190, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26086426, "step": 14521, "time_per_iteration": 2.693101167678833 }, { "auxiliary_loss_clip": 0.01254818, "auxiliary_loss_mlp": 0.00197077, "balance_loss_clip": 1.02795672, "balance_loss_mlp": 0.17185271, "epoch": 0.8731098752442508, "flos": 13443734903040.0, "grad_norm": 25.438994952392257, "language_loss": 0.88901007, "learning_rate": 1.6646227365631616e-07, "loss": 0.90352905, "num_input_tokens_seen": 313118965, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.25219727, "step": 14522, "time_per_iteration": 2.822897434234619 }, { "auxiliary_loss_clip": 0.01243142, "auxiliary_loss_mlp": 0.00202754, "balance_loss_clip": 1.02571726, "balance_loss_mlp": 0.18089069, "epoch": 0.8731699984969187, "flos": 23474446381440.0, "grad_norm": 10.617856775214662, "language_loss": 0.82096308, "learning_rate": 1.66306750360385e-07, "loss": 0.83542204, "num_input_tokens_seen": 313139280, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.21850586, "step": 14523, "time_per_iteration": 2.732360601425171 }, { "auxiliary_loss_clip": 0.01241012, "auxiliary_loss_mlp": 0.0021995, "balance_loss_clip": 1.02290118, "balance_loss_mlp": 0.19546488, "epoch": 0.8732301217495867, "flos": 17712651600000.0, "grad_norm": 34.88794815190918, "language_loss": 0.86957246, "learning_rate": 1.6615129659828542e-07, "loss": 0.8841821, "num_input_tokens_seen": 313156655, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24487305, "step": 14524, "time_per_iteration": 2.6596908569335938 }, { "auxiliary_loss_clip": 0.01234402, "auxiliary_loss_mlp": 0.00214026, "balance_loss_clip": 1.02287292, "balance_loss_mlp": 0.1898739, "epoch": 0.8732902450022546, "flos": 22054359208320.0, "grad_norm": 58.39445124055214, "language_loss": 0.87794304, "learning_rate": 1.6599591237591272e-07, "loss": 0.89242733, "num_input_tokens_seen": 313174050, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.24133301, "step": 14525, "time_per_iteration": 2.6477370262145996 }, { "auxiliary_loss_clip": 0.01255455, "auxiliary_loss_mlp": 0.00215169, "balance_loss_clip": 1.03321958, "balance_loss_mlp": 0.18964586, "epoch": 0.8733503682549226, "flos": 22272983337600.0, "grad_norm": 5.90751090232, "language_loss": 0.77355409, "learning_rate": 1.6584059769915902e-07, "loss": 0.78826028, "num_input_tokens_seen": 313192765, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25500488, "step": 14526, "time_per_iteration": 2.689669132232666 }, { "auxiliary_loss_clip": 0.01283104, "auxiliary_loss_mlp": 0.00230433, "balance_loss_clip": 1.05151498, "balance_loss_mlp": 0.2028597, "epoch": 0.8734104915075905, "flos": 23364344217600.0, "grad_norm": 6.226968744488414, "language_loss": 0.70248789, "learning_rate": 1.6568535257391326e-07, "loss": 0.71762323, "num_input_tokens_seen": 313210925, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.27575684, "step": 14527, "time_per_iteration": 2.650578737258911 }, { "auxiliary_loss_clip": 0.01281303, "auxiliary_loss_mlp": 0.00244042, "balance_loss_clip": 1.0441283, "balance_loss_mlp": 0.21242815, "epoch": 0.8734706147602586, "flos": 17712292464000.0, "grad_norm": 26.76379282293426, "language_loss": 0.78376186, "learning_rate": 1.6553017700606265e-07, "loss": 0.79901534, "num_input_tokens_seen": 313228250, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.31604004, "step": 14528, "time_per_iteration": 2.6415998935699463 }, { "auxiliary_loss_clip": 0.01254414, "auxiliary_loss_mlp": 0.0020593, "balance_loss_clip": 1.03336692, "balance_loss_mlp": 0.18149218, "epoch": 0.8735307380129265, "flos": 22049367217920.0, "grad_norm": 15.305147364567022, "language_loss": 0.98546195, "learning_rate": 1.6537507100149205e-07, "loss": 1.00006545, "num_input_tokens_seen": 313247880, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24450684, "step": 14529, "time_per_iteration": 2.7355544567108154 }, { "auxiliary_loss_clip": 0.01237975, "auxiliary_loss_mlp": 0.00190719, "balance_loss_clip": 1.02263701, "balance_loss_mlp": 0.16711578, "epoch": 0.8735908612655945, "flos": 25338425829120.0, "grad_norm": 167.28797000664858, "language_loss": 0.91196448, "learning_rate": 1.6522003456608258e-07, "loss": 0.92625141, "num_input_tokens_seen": 313266790, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.23596191, "step": 14530, "time_per_iteration": 2.725600481033325 }, { "auxiliary_loss_clip": 0.01246012, "auxiliary_loss_mlp": 0.00227354, "balance_loss_clip": 1.02715135, "balance_loss_mlp": 0.20257026, "epoch": 0.8736509845182624, "flos": 21540908246400.0, "grad_norm": 11.526513833678468, "language_loss": 0.79919672, "learning_rate": 1.650650677057128e-07, "loss": 0.81393033, "num_input_tokens_seen": 313286805, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24755859, "step": 14531, "time_per_iteration": 2.64861798286438 }, { "auxiliary_loss_clip": 0.01237747, "auxiliary_loss_mlp": 0.00213005, "balance_loss_clip": 1.02162838, "balance_loss_mlp": 0.18804277, "epoch": 0.8737111077709304, "flos": 22017227523840.0, "grad_norm": 38.98710614497059, "language_loss": 0.71583307, "learning_rate": 1.6491017042625966e-07, "loss": 0.7303406, "num_input_tokens_seen": 313305415, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.24951172, "step": 14532, "time_per_iteration": 2.648535966873169 }, { "auxiliary_loss_clip": 0.01154884, "auxiliary_loss_mlp": 0.00098904, "balance_loss_clip": 1.0167079, "balance_loss_mlp": 0.09113135, "epoch": 0.8737712310235983, "flos": 70066315912320.0, "grad_norm": 0.788467110515332, "language_loss": 0.57423538, "learning_rate": 1.6475534273359704e-07, "loss": 0.58677322, "num_input_tokens_seen": 313369940, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.07763672, "step": 14533, "time_per_iteration": 3.254159450531006 }, { "auxiliary_loss_clip": 0.0123717, "auxiliary_loss_mlp": 0.0019981, "balance_loss_clip": 1.02262223, "balance_loss_mlp": 0.1731783, "epoch": 0.8738313542762663, "flos": 28658331244800.0, "grad_norm": 2.126952454035014, "language_loss": 0.83186692, "learning_rate": 1.646005846335954e-07, "loss": 0.84623671, "num_input_tokens_seen": 313390965, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.26647949, "step": 14534, "time_per_iteration": 4.099276781082153 }, { "auxiliary_loss_clip": 0.01259685, "auxiliary_loss_mlp": 0.00224599, "balance_loss_clip": 1.04108405, "balance_loss_mlp": 0.20030379, "epoch": 0.8738914775289344, "flos": 22346384780160.0, "grad_norm": 7.962579669046744, "language_loss": 0.83105052, "learning_rate": 1.6444589613212357e-07, "loss": 0.84589332, "num_input_tokens_seen": 313409680, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24291992, "step": 14535, "time_per_iteration": 2.7131760120391846 }, { "auxiliary_loss_clip": 0.01253249, "auxiliary_loss_mlp": 0.00212112, "balance_loss_clip": 1.03022504, "balance_loss_mlp": 0.18687564, "epoch": 0.8739516007816023, "flos": 31759648444800.0, "grad_norm": 39.762676088623955, "language_loss": 0.81876302, "learning_rate": 1.64291277235048e-07, "loss": 0.83341664, "num_input_tokens_seen": 313431335, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25256348, "step": 14536, "time_per_iteration": 4.143074035644531 }, { "auxiliary_loss_clip": 0.0123962, "auxiliary_loss_mlp": 0.00210999, "balance_loss_clip": 1.02379513, "balance_loss_mlp": 0.18535727, "epoch": 0.8740117240342703, "flos": 21211715076480.0, "grad_norm": 1739.1905529959822, "language_loss": 0.72566676, "learning_rate": 1.641367279482304e-07, "loss": 0.74017298, "num_input_tokens_seen": 313449225, "router_z_loss_clip": 2.15917969, "router_z_loss_mlp": 0.2565918, "step": 14537, "time_per_iteration": 2.6269357204437256 }, { "auxiliary_loss_clip": 0.01245392, "auxiliary_loss_mlp": 0.00217275, "balance_loss_clip": 1.02797532, "balance_loss_mlp": 0.19307575, "epoch": 0.8740718472869382, "flos": 25186666867200.0, "grad_norm": 11.882555518556183, "language_loss": 0.65900123, "learning_rate": 1.6398224827753216e-07, "loss": 0.67362785, "num_input_tokens_seen": 313467715, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.24194336, "step": 14538, "time_per_iteration": 2.683023691177368 }, { "auxiliary_loss_clip": 0.01248221, "auxiliary_loss_mlp": 0.00202654, "balance_loss_clip": 1.03168225, "balance_loss_mlp": 0.17865711, "epoch": 0.8741319705396062, "flos": 19500931134720.0, "grad_norm": 6.56253320767131, "language_loss": 0.76495147, "learning_rate": 1.6382783822881142e-07, "loss": 0.77946019, "num_input_tokens_seen": 313486805, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.2401123, "step": 14539, "time_per_iteration": 2.61826229095459 }, { "auxiliary_loss_clip": 0.0125679, "auxiliary_loss_mlp": 0.00195659, "balance_loss_clip": 1.03174794, "balance_loss_mlp": 0.16974315, "epoch": 0.8741920937922741, "flos": 14100900180480.0, "grad_norm": 20.521007293217092, "language_loss": 0.83824599, "learning_rate": 1.6367349780792262e-07, "loss": 0.85277045, "num_input_tokens_seen": 313504880, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.25915527, "step": 14540, "time_per_iteration": 2.66563081741333 }, { "auxiliary_loss_clip": 0.01237492, "auxiliary_loss_mlp": 0.0021962, "balance_loss_clip": 1.02221298, "balance_loss_mlp": 0.19366789, "epoch": 0.8742522170449422, "flos": 27709858667520.0, "grad_norm": 22.78457974400633, "language_loss": 0.86261803, "learning_rate": 1.635192270207193e-07, "loss": 0.87718916, "num_input_tokens_seen": 313524995, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.25927734, "step": 14541, "time_per_iteration": 4.265751123428345 }, { "auxiliary_loss_clip": 0.01276732, "auxiliary_loss_mlp": 0.00217734, "balance_loss_clip": 1.04537249, "balance_loss_mlp": 0.18836093, "epoch": 0.8743123402976101, "flos": 21142587352320.0, "grad_norm": 41.83784164916678, "language_loss": 0.79256535, "learning_rate": 1.6336502587305035e-07, "loss": 0.80751002, "num_input_tokens_seen": 313541740, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.29370117, "step": 14542, "time_per_iteration": 2.731696367263794 }, { "auxiliary_loss_clip": 0.01158004, "auxiliary_loss_mlp": 0.00096783, "balance_loss_clip": 1.01898479, "balance_loss_mlp": 0.08944009, "epoch": 0.8743724635502781, "flos": 60870024351360.0, "grad_norm": 0.7788095516614272, "language_loss": 0.54224908, "learning_rate": 1.632108943707642e-07, "loss": 0.55479705, "num_input_tokens_seen": 313593445, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.07324219, "step": 14543, "time_per_iteration": 3.0078442096710205 }, { "auxiliary_loss_clip": 0.0126975, "auxiliary_loss_mlp": 0.00211818, "balance_loss_clip": 1.04630899, "balance_loss_mlp": 0.18630758, "epoch": 0.874432586802946, "flos": 28109292883200.0, "grad_norm": 22.360090352621437, "language_loss": 0.79928833, "learning_rate": 1.6305683251970458e-07, "loss": 0.81410396, "num_input_tokens_seen": 313615640, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25500488, "step": 14544, "time_per_iteration": 2.6958084106445312 }, { "auxiliary_loss_clip": 0.01236003, "auxiliary_loss_mlp": 0.0020293, "balance_loss_clip": 1.02481341, "balance_loss_mlp": 0.17931432, "epoch": 0.874492710055614, "flos": 23550289948800.0, "grad_norm": 24.054184562291052, "language_loss": 0.82829952, "learning_rate": 1.62902840325714e-07, "loss": 0.84268892, "num_input_tokens_seen": 313635550, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.23608398, "step": 14545, "time_per_iteration": 4.152297019958496 }, { "auxiliary_loss_clip": 0.01248083, "auxiliary_loss_mlp": 0.0019814, "balance_loss_clip": 1.02830172, "balance_loss_mlp": 0.17193739, "epoch": 0.8745528333082819, "flos": 40915647924480.0, "grad_norm": 8.113883812030293, "language_loss": 0.73732299, "learning_rate": 1.6274891779463217e-07, "loss": 0.75178522, "num_input_tokens_seen": 313659275, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.26171875, "step": 14546, "time_per_iteration": 2.893653392791748 }, { "auxiliary_loss_clip": 0.01250944, "auxiliary_loss_mlp": 0.00205026, "balance_loss_clip": 1.03161621, "balance_loss_mlp": 0.17864524, "epoch": 0.87461295656095, "flos": 23622901292160.0, "grad_norm": 25.10941401131916, "language_loss": 0.80051172, "learning_rate": 1.6259506493229536e-07, "loss": 0.8150714, "num_input_tokens_seen": 313680595, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26391602, "step": 14547, "time_per_iteration": 2.648446798324585 }, { "auxiliary_loss_clip": 0.01276999, "auxiliary_loss_mlp": 0.00216257, "balance_loss_clip": 1.04413199, "balance_loss_mlp": 0.1888984, "epoch": 0.874673079813618, "flos": 38794116983040.0, "grad_norm": 35.801354675897024, "language_loss": 0.81123096, "learning_rate": 1.6244128174453752e-07, "loss": 0.82616353, "num_input_tokens_seen": 313699730, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.27355957, "step": 14548, "time_per_iteration": 2.8567323684692383 }, { "auxiliary_loss_clip": 0.01259365, "auxiliary_loss_mlp": 0.00212792, "balance_loss_clip": 1.0340265, "balance_loss_mlp": 0.18610096, "epoch": 0.8747332030662859, "flos": 23696159080320.0, "grad_norm": 13.24738804444235, "language_loss": 0.81571257, "learning_rate": 1.6228756823719093e-07, "loss": 0.83043408, "num_input_tokens_seen": 313720090, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.2668457, "step": 14549, "time_per_iteration": 2.724985361099243 }, { "auxiliary_loss_clip": 0.01270388, "auxiliary_loss_mlp": 0.00215038, "balance_loss_clip": 1.03869975, "balance_loss_mlp": 0.18729818, "epoch": 0.8747933263189539, "flos": 24462456854400.0, "grad_norm": 5.675299623368861, "language_loss": 0.93509483, "learning_rate": 1.6213392441608352e-07, "loss": 0.94994903, "num_input_tokens_seen": 313736795, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.27783203, "step": 14550, "time_per_iteration": 2.760155200958252 }, { "auxiliary_loss_clip": 0.0123687, "auxiliary_loss_mlp": 0.00221149, "balance_loss_clip": 1.02078044, "balance_loss_mlp": 0.19470805, "epoch": 0.8748534495716218, "flos": 13809161917440.0, "grad_norm": 12.440311560659946, "language_loss": 0.80690396, "learning_rate": 1.6198035028704183e-07, "loss": 0.82148415, "num_input_tokens_seen": 313754820, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.2644043, "step": 14551, "time_per_iteration": 2.6184983253479004 }, { "auxiliary_loss_clip": 0.01232713, "auxiliary_loss_mlp": 0.00187335, "balance_loss_clip": 1.02174067, "balance_loss_mlp": 0.16367176, "epoch": 0.8749135728242898, "flos": 29862092759040.0, "grad_norm": 161.2816906188908, "language_loss": 0.72438318, "learning_rate": 1.6182684585588934e-07, "loss": 0.73858368, "num_input_tokens_seen": 313775830, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.23669434, "step": 14552, "time_per_iteration": 2.708862066268921 }, { "auxiliary_loss_clip": 0.01250628, "auxiliary_loss_mlp": 0.00222459, "balance_loss_clip": 1.03080821, "balance_loss_mlp": 0.19609033, "epoch": 0.8749736960769577, "flos": 24133479166080.0, "grad_norm": 44.40525694049383, "language_loss": 0.8864572, "learning_rate": 1.616734111284479e-07, "loss": 0.90118808, "num_input_tokens_seen": 313795745, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.2635498, "step": 14553, "time_per_iteration": 2.7477195262908936 }, { "auxiliary_loss_clip": 0.01239815, "auxiliary_loss_mlp": 0.00208795, "balance_loss_clip": 1.02292216, "balance_loss_mlp": 0.18380827, "epoch": 0.8750338193296258, "flos": 17202540602880.0, "grad_norm": 28.107059691979547, "language_loss": 0.78392988, "learning_rate": 1.6152004611053416e-07, "loss": 0.79841602, "num_input_tokens_seen": 313813895, "router_z_loss_clip": 2.17089844, "router_z_loss_mlp": 0.25, "step": 14554, "time_per_iteration": 2.7716212272644043 }, { "auxiliary_loss_clip": 0.01224264, "auxiliary_loss_mlp": 0.00205097, "balance_loss_clip": 1.01183426, "balance_loss_mlp": 0.18238741, "epoch": 0.8750939425822937, "flos": 23733218937600.0, "grad_norm": 63.364867669872574, "language_loss": 0.90828866, "learning_rate": 1.6136675080796457e-07, "loss": 0.92258227, "num_input_tokens_seen": 313834225, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.22705078, "step": 14555, "time_per_iteration": 2.7751495838165283 }, { "auxiliary_loss_clip": 0.01242299, "auxiliary_loss_mlp": 0.00201622, "balance_loss_clip": 1.02488625, "balance_loss_mlp": 0.177542, "epoch": 0.8751540658349617, "flos": 26541684552960.0, "grad_norm": 2.7408883011704677, "language_loss": 0.76455456, "learning_rate": 1.6121352522655252e-07, "loss": 0.77899379, "num_input_tokens_seen": 313854430, "router_z_loss_clip": 2.17480469, "router_z_loss_mlp": 0.24084473, "step": 14556, "time_per_iteration": 2.7581534385681152 }, { "auxiliary_loss_clip": 0.01252456, "auxiliary_loss_mlp": 0.00221452, "balance_loss_clip": 1.03227758, "balance_loss_mlp": 0.19322361, "epoch": 0.8752141890876296, "flos": 19386806647680.0, "grad_norm": 4.609049320203973, "language_loss": 0.85478264, "learning_rate": 1.6106036937210732e-07, "loss": 0.86952168, "num_input_tokens_seen": 313871600, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.28234863, "step": 14557, "time_per_iteration": 2.723848581314087 }, { "auxiliary_loss_clip": 0.01238499, "auxiliary_loss_mlp": 0.00216866, "balance_loss_clip": 1.02340174, "balance_loss_mlp": 0.19283327, "epoch": 0.8752743123402976, "flos": 25374408278400.0, "grad_norm": 47.59507182795253, "language_loss": 0.89762759, "learning_rate": 1.6090728325043767e-07, "loss": 0.91218114, "num_input_tokens_seen": 313891570, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24047852, "step": 14558, "time_per_iteration": 2.662726879119873 }, { "auxiliary_loss_clip": 0.01158558, "auxiliary_loss_mlp": 0.00108438, "balance_loss_clip": 1.01760697, "balance_loss_mlp": 0.10195326, "epoch": 0.8753344355929655, "flos": 59952398578560.0, "grad_norm": 0.7967228675909831, "language_loss": 0.55482024, "learning_rate": 1.6075426686734784e-07, "loss": 0.56749022, "num_input_tokens_seen": 313951290, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.06494141, "step": 14559, "time_per_iteration": 3.1683766841888428 }, { "auxiliary_loss_clip": 0.01243195, "auxiliary_loss_mlp": 0.002033, "balance_loss_clip": 1.02580166, "balance_loss_mlp": 0.17736025, "epoch": 0.8753945588456336, "flos": 17894646835200.0, "grad_norm": 96.81975940087634, "language_loss": 0.73204255, "learning_rate": 1.606013202286407e-07, "loss": 0.74650753, "num_input_tokens_seen": 313968645, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.2590332, "step": 14560, "time_per_iteration": 2.632150411605835 }, { "auxiliary_loss_clip": 0.01244731, "auxiliary_loss_mlp": 0.00208678, "balance_loss_clip": 1.03031039, "balance_loss_mlp": 0.18557553, "epoch": 0.8754546820983016, "flos": 30914885410560.0, "grad_norm": 10.280182517602949, "language_loss": 0.8667134, "learning_rate": 1.6044844334011541e-07, "loss": 0.88124758, "num_input_tokens_seen": 313987580, "router_z_loss_clip": 2.13769531, "router_z_loss_mlp": 0.2310791, "step": 14561, "time_per_iteration": 2.726731300354004 }, { "auxiliary_loss_clip": 0.01249084, "auxiliary_loss_mlp": 0.00213954, "balance_loss_clip": 1.02642417, "balance_loss_mlp": 0.18781181, "epoch": 0.8755148053509695, "flos": 20631075724800.0, "grad_norm": 47.106853704488444, "language_loss": 0.88041127, "learning_rate": 1.6029563620756982e-07, "loss": 0.89504158, "num_input_tokens_seen": 314004460, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26123047, "step": 14562, "time_per_iteration": 2.6602189540863037 }, { "auxiliary_loss_clip": 0.01218267, "auxiliary_loss_mlp": 0.00200956, "balance_loss_clip": 1.00942087, "balance_loss_mlp": 0.17675611, "epoch": 0.8755749286036375, "flos": 34969739005440.0, "grad_norm": 48.44945553023005, "language_loss": 0.77327377, "learning_rate": 1.601428988367981e-07, "loss": 0.78746599, "num_input_tokens_seen": 314026855, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.2421875, "step": 14563, "time_per_iteration": 2.749040126800537 }, { "auxiliary_loss_clip": 0.01260546, "auxiliary_loss_mlp": 0.00214275, "balance_loss_clip": 1.03411889, "balance_loss_mlp": 0.18745241, "epoch": 0.8756350518563054, "flos": 18186456925440.0, "grad_norm": 11.609146757677005, "language_loss": 0.74719626, "learning_rate": 1.5999023123359235e-07, "loss": 0.76194441, "num_input_tokens_seen": 314042830, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.26831055, "step": 14564, "time_per_iteration": 2.6852827072143555 }, { "auxiliary_loss_clip": 0.01228518, "auxiliary_loss_mlp": 0.00190885, "balance_loss_clip": 1.01553297, "balance_loss_mlp": 0.16782945, "epoch": 0.8756951751089734, "flos": 20084012611200.0, "grad_norm": 2.600381599361483, "language_loss": 0.78095287, "learning_rate": 1.598376334037408e-07, "loss": 0.79514694, "num_input_tokens_seen": 314062225, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23071289, "step": 14565, "time_per_iteration": 2.675233840942383 }, { "auxiliary_loss_clip": 0.0128842, "auxiliary_loss_mlp": 0.00219811, "balance_loss_clip": 1.04987371, "balance_loss_mlp": 0.1905808, "epoch": 0.8757552983616413, "flos": 27525241739520.0, "grad_norm": 1035.8602754126143, "language_loss": 0.86270118, "learning_rate": 1.5968510535303102e-07, "loss": 0.87778342, "num_input_tokens_seen": 314082325, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.2923584, "step": 14566, "time_per_iteration": 2.716942548751831 }, { "auxiliary_loss_clip": 0.01249248, "auxiliary_loss_mlp": 0.00206207, "balance_loss_clip": 1.03227925, "balance_loss_mlp": 0.1827701, "epoch": 0.8758154216143094, "flos": 18073014796800.0, "grad_norm": 6.743684987493111, "language_loss": 0.80332142, "learning_rate": 1.5953264708724624e-07, "loss": 0.81787592, "num_input_tokens_seen": 314100310, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.23449707, "step": 14567, "time_per_iteration": 2.668977975845337 }, { "auxiliary_loss_clip": 0.01238216, "auxiliary_loss_mlp": 0.00196824, "balance_loss_clip": 1.0240705, "balance_loss_mlp": 0.17297, "epoch": 0.8758755448669773, "flos": 25045681985280.0, "grad_norm": 9.093998061303829, "language_loss": 0.81467044, "learning_rate": 1.5938025861216776e-07, "loss": 0.8290208, "num_input_tokens_seen": 314121330, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.23840332, "step": 14568, "time_per_iteration": 2.895484209060669 }, { "auxiliary_loss_clip": 0.01246797, "auxiliary_loss_mlp": 0.00206794, "balance_loss_clip": 1.03487349, "balance_loss_mlp": 0.18390587, "epoch": 0.8759356681196453, "flos": 22856818999680.0, "grad_norm": 139.57934808100515, "language_loss": 0.94490796, "learning_rate": 1.5922793993357475e-07, "loss": 0.95944393, "num_input_tokens_seen": 314139875, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.22875977, "step": 14569, "time_per_iteration": 2.6754565238952637 }, { "auxiliary_loss_clip": 0.01238891, "auxiliary_loss_mlp": 0.00220344, "balance_loss_clip": 1.02370846, "balance_loss_mlp": 0.19563225, "epoch": 0.8759957913723132, "flos": 21032521102080.0, "grad_norm": 2.3891493480551627, "language_loss": 0.81959653, "learning_rate": 1.5907569105724284e-07, "loss": 0.83418894, "num_input_tokens_seen": 314157850, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24731445, "step": 14570, "time_per_iteration": 2.7163448333740234 }, { "auxiliary_loss_clip": 0.01259379, "auxiliary_loss_mlp": 0.00220069, "balance_loss_clip": 1.03570819, "balance_loss_mlp": 0.19598868, "epoch": 0.8760559146249812, "flos": 20010467514240.0, "grad_norm": 151.43724567602834, "language_loss": 0.76096004, "learning_rate": 1.5892351198894472e-07, "loss": 0.77575445, "num_input_tokens_seen": 314176720, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.24084473, "step": 14571, "time_per_iteration": 2.657125949859619 }, { "auxiliary_loss_clip": 0.01247677, "auxiliary_loss_mlp": 0.00214267, "balance_loss_clip": 1.03034234, "balance_loss_mlp": 0.19041277, "epoch": 0.8761160378776491, "flos": 19974161842560.0, "grad_norm": 56.95716268715254, "language_loss": 0.72477996, "learning_rate": 1.5877140273445156e-07, "loss": 0.73939943, "num_input_tokens_seen": 314196645, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.2388916, "step": 14572, "time_per_iteration": 2.6856110095977783 }, { "auxiliary_loss_clip": 0.0122924, "auxiliary_loss_mlp": 0.00198088, "balance_loss_clip": 1.01513267, "balance_loss_mlp": 0.17381679, "epoch": 0.8761761611303172, "flos": 28804415857920.0, "grad_norm": 3.2689841689013, "language_loss": 0.82700497, "learning_rate": 1.5861936329953162e-07, "loss": 0.8412782, "num_input_tokens_seen": 314217430, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24267578, "step": 14573, "time_per_iteration": 2.790480136871338 }, { "auxiliary_loss_clip": 0.01220241, "auxiliary_loss_mlp": 0.00190009, "balance_loss_clip": 1.01223993, "balance_loss_mlp": 0.16747804, "epoch": 0.8762362843829851, "flos": 18332505624960.0, "grad_norm": 10.668956481498908, "language_loss": 0.81072527, "learning_rate": 1.5846739368994966e-07, "loss": 0.82482779, "num_input_tokens_seen": 314235310, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.2253418, "step": 14574, "time_per_iteration": 2.6432807445526123 }, { "auxiliary_loss_clip": 0.01250759, "auxiliary_loss_mlp": 0.00190161, "balance_loss_clip": 1.03475654, "balance_loss_mlp": 0.16631949, "epoch": 0.8762964076356531, "flos": 15779149378560.0, "grad_norm": 2.708278683630191, "language_loss": 0.82937771, "learning_rate": 1.5831549391146903e-07, "loss": 0.8437869, "num_input_tokens_seen": 314252355, "router_z_loss_clip": 2.16113281, "router_z_loss_mlp": 0.23852539, "step": 14575, "time_per_iteration": 2.649977207183838 }, { "auxiliary_loss_clip": 0.01220707, "auxiliary_loss_mlp": 0.00221574, "balance_loss_clip": 1.01296175, "balance_loss_mlp": 0.1984714, "epoch": 0.8763565308883211, "flos": 33176754789120.0, "grad_norm": 27.33318817678827, "language_loss": 0.72395641, "learning_rate": 1.5816366396984916e-07, "loss": 0.73837924, "num_input_tokens_seen": 314272755, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.23095703, "step": 14576, "time_per_iteration": 4.193576335906982 }, { "auxiliary_loss_clip": 0.01231114, "auxiliary_loss_mlp": 0.00218554, "balance_loss_clip": 1.02066541, "balance_loss_mlp": 0.19448574, "epoch": 0.876416654140989, "flos": 15888102307200.0, "grad_norm": 66.09588055863574, "language_loss": 0.74392718, "learning_rate": 1.5801190387084806e-07, "loss": 0.75842381, "num_input_tokens_seen": 314291365, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.24084473, "step": 14577, "time_per_iteration": 2.7047746181488037 }, { "auxiliary_loss_clip": 0.0128187, "auxiliary_loss_mlp": 0.00213177, "balance_loss_clip": 1.05122542, "balance_loss_mlp": 0.18585438, "epoch": 0.876476777393657, "flos": 25885237547520.0, "grad_norm": 23.319994319975244, "language_loss": 0.80586708, "learning_rate": 1.5786021362021962e-07, "loss": 0.82081747, "num_input_tokens_seen": 314310075, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.27319336, "step": 14578, "time_per_iteration": 4.164779424667358 }, { "auxiliary_loss_clip": 0.01269376, "auxiliary_loss_mlp": 0.00204452, "balance_loss_clip": 1.04363847, "balance_loss_mlp": 0.1783925, "epoch": 0.876536900646325, "flos": 13589675861760.0, "grad_norm": 3.795488010824798, "language_loss": 0.80867249, "learning_rate": 1.5770859322371676e-07, "loss": 0.82341075, "num_input_tokens_seen": 314325695, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26086426, "step": 14579, "time_per_iteration": 2.5782058238983154 }, { "auxiliary_loss_clip": 0.01226835, "auxiliary_loss_mlp": 0.00191798, "balance_loss_clip": 1.01539087, "balance_loss_mlp": 0.1687783, "epoch": 0.876597023898993, "flos": 12203344494720.0, "grad_norm": 26.455425653127794, "language_loss": 0.78176737, "learning_rate": 1.5755704268708912e-07, "loss": 0.79595375, "num_input_tokens_seen": 314343605, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.23022461, "step": 14580, "time_per_iteration": 2.6377501487731934 }, { "auxiliary_loss_clip": 0.01239171, "auxiliary_loss_mlp": 0.00212826, "balance_loss_clip": 1.02970707, "balance_loss_mlp": 0.18909146, "epoch": 0.8766571471516609, "flos": 25336773803520.0, "grad_norm": 4.996658510276659, "language_loss": 0.73672557, "learning_rate": 1.5740556201608256e-07, "loss": 0.75124556, "num_input_tokens_seen": 314364275, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.23742676, "step": 14581, "time_per_iteration": 2.67145037651062 }, { "auxiliary_loss_clip": 0.0122522, "auxiliary_loss_mlp": 0.00201621, "balance_loss_clip": 1.0114522, "balance_loss_mlp": 0.17830411, "epoch": 0.8767172704043289, "flos": 30113287545600.0, "grad_norm": 4.456378252789214, "language_loss": 0.78447974, "learning_rate": 1.572541512164416e-07, "loss": 0.79874814, "num_input_tokens_seen": 314385140, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.23303223, "step": 14582, "time_per_iteration": 2.727855920791626 }, { "auxiliary_loss_clip": 0.01235644, "auxiliary_loss_mlp": 0.00220275, "balance_loss_clip": 1.01924264, "balance_loss_mlp": 0.19578928, "epoch": 0.8767773936569968, "flos": 19281157770240.0, "grad_norm": 166.28602104886014, "language_loss": 0.76763874, "learning_rate": 1.5710281029390826e-07, "loss": 0.78219795, "num_input_tokens_seen": 314403715, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24499512, "step": 14583, "time_per_iteration": 4.070705890655518 }, { "auxiliary_loss_clip": 0.01263964, "auxiliary_loss_mlp": 0.00197656, "balance_loss_clip": 1.04085279, "balance_loss_mlp": 0.17150171, "epoch": 0.8768375169096648, "flos": 21247230648960.0, "grad_norm": 9.996074619429974, "language_loss": 0.84345907, "learning_rate": 1.5695153925422067e-07, "loss": 0.85807526, "num_input_tokens_seen": 314421880, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26171875, "step": 14584, "time_per_iteration": 2.6554510593414307 }, { "auxiliary_loss_clip": 0.01249325, "auxiliary_loss_mlp": 0.00197331, "balance_loss_clip": 1.03239441, "balance_loss_mlp": 0.172571, "epoch": 0.8768976401623327, "flos": 23295539715840.0, "grad_norm": 5.222666518443242, "language_loss": 0.79361904, "learning_rate": 1.5680033810311555e-07, "loss": 0.80808556, "num_input_tokens_seen": 314441585, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24755859, "step": 14585, "time_per_iteration": 2.6986677646636963 }, { "auxiliary_loss_clip": 0.01239485, "auxiliary_loss_mlp": 0.00217825, "balance_loss_clip": 1.02522087, "balance_loss_mlp": 0.19273192, "epoch": 0.8769577634150008, "flos": 21361247395200.0, "grad_norm": 171.40198163867146, "language_loss": 0.85352206, "learning_rate": 1.5664920684632654e-07, "loss": 0.86809516, "num_input_tokens_seen": 314459020, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.25085449, "step": 14586, "time_per_iteration": 2.7267696857452393 }, { "auxiliary_loss_clip": 0.01237449, "auxiliary_loss_mlp": 0.00193574, "balance_loss_clip": 1.02314425, "balance_loss_mlp": 0.16859969, "epoch": 0.8770178866676687, "flos": 23514056104320.0, "grad_norm": 2.183645230567387, "language_loss": 0.86796963, "learning_rate": 1.564981454895844e-07, "loss": 0.88227987, "num_input_tokens_seen": 314478935, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.24975586, "step": 14587, "time_per_iteration": 4.106330871582031 }, { "auxiliary_loss_clip": 0.01256939, "auxiliary_loss_mlp": 0.00221559, "balance_loss_clip": 1.03581858, "balance_loss_mlp": 0.19549988, "epoch": 0.8770780099203367, "flos": 19719052473600.0, "grad_norm": 16.303160895652475, "language_loss": 0.82239532, "learning_rate": 1.5634715403861697e-07, "loss": 0.83718026, "num_input_tokens_seen": 314497635, "router_z_loss_clip": 2.21386719, "router_z_loss_mlp": 0.26086426, "step": 14588, "time_per_iteration": 2.7327075004577637 }, { "auxiliary_loss_clip": 0.01229503, "auxiliary_loss_mlp": 0.00192659, "balance_loss_clip": 1.01723611, "balance_loss_mlp": 0.16743411, "epoch": 0.8771381331730047, "flos": 21395901041280.0, "grad_norm": 2.603843820372525, "language_loss": 0.75424004, "learning_rate": 1.5619623249915016e-07, "loss": 0.7684617, "num_input_tokens_seen": 314515445, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.25231934, "step": 14589, "time_per_iteration": 2.6294307708740234 }, { "auxiliary_loss_clip": 0.01260357, "auxiliary_loss_mlp": 0.00233976, "balance_loss_clip": 1.03969049, "balance_loss_mlp": 0.20896566, "epoch": 0.8771982564256726, "flos": 20261770041600.0, "grad_norm": 13.307063405463575, "language_loss": 0.8075614, "learning_rate": 1.5604538087690732e-07, "loss": 0.82250476, "num_input_tokens_seen": 314533040, "router_z_loss_clip": 2.20605469, "router_z_loss_mlp": 0.25036621, "step": 14590, "time_per_iteration": 2.681377410888672 }, { "auxiliary_loss_clip": 0.01277666, "auxiliary_loss_mlp": 0.00224263, "balance_loss_clip": 1.04114151, "balance_loss_mlp": 0.19579543, "epoch": 0.8772583796783406, "flos": 12489372495360.0, "grad_norm": 7.080004984949966, "language_loss": 0.83791763, "learning_rate": 1.558945991776086e-07, "loss": 0.85293692, "num_input_tokens_seen": 314548280, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.2845459, "step": 14591, "time_per_iteration": 2.5977182388305664 }, { "auxiliary_loss_clip": 0.01242974, "auxiliary_loss_mlp": 0.00213938, "balance_loss_clip": 1.03008902, "balance_loss_mlp": 0.19076365, "epoch": 0.8773185029310085, "flos": 15921103927680.0, "grad_norm": 5.458893668530817, "language_loss": 0.86886346, "learning_rate": 1.5574388740697096e-07, "loss": 0.88343257, "num_input_tokens_seen": 314565345, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23168945, "step": 14592, "time_per_iteration": 2.6402688026428223 }, { "auxiliary_loss_clip": 0.0122842, "auxiliary_loss_mlp": 0.00202196, "balance_loss_clip": 1.01724088, "balance_loss_mlp": 0.17862871, "epoch": 0.8773786261836766, "flos": 21504530747520.0, "grad_norm": 156.58574978599952, "language_loss": 0.89689583, "learning_rate": 1.5559324557071052e-07, "loss": 0.91120195, "num_input_tokens_seen": 314584190, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.23583984, "step": 14593, "time_per_iteration": 2.6624386310577393 }, { "auxiliary_loss_clip": 0.01236862, "auxiliary_loss_mlp": 0.00175975, "balance_loss_clip": 1.02327061, "balance_loss_mlp": 0.15271729, "epoch": 0.8774387494363445, "flos": 26761493831040.0, "grad_norm": 859.4081847641895, "language_loss": 0.82837224, "learning_rate": 1.5544267367453845e-07, "loss": 0.84250063, "num_input_tokens_seen": 314605625, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.23266602, "step": 14594, "time_per_iteration": 2.7501978874206543 }, { "auxiliary_loss_clip": 0.01239648, "auxiliary_loss_mlp": 0.0021344, "balance_loss_clip": 1.02192259, "balance_loss_mlp": 0.18940759, "epoch": 0.8774988726890125, "flos": 18478841633280.0, "grad_norm": 12.577624378309755, "language_loss": 0.84011662, "learning_rate": 1.552921717241651e-07, "loss": 0.85464746, "num_input_tokens_seen": 314622630, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.2401123, "step": 14595, "time_per_iteration": 2.602811336517334 }, { "auxiliary_loss_clip": 0.01246394, "auxiliary_loss_mlp": 0.0023349, "balance_loss_clip": 1.02932334, "balance_loss_mlp": 0.20856312, "epoch": 0.8775589959416804, "flos": 24426366664320.0, "grad_norm": 5.629836949590199, "language_loss": 0.79592705, "learning_rate": 1.5514173972529743e-07, "loss": 0.81072581, "num_input_tokens_seen": 314642460, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24951172, "step": 14596, "time_per_iteration": 2.6414475440979004 }, { "auxiliary_loss_clip": 0.01242781, "auxiliary_loss_mlp": 0.00207717, "balance_loss_clip": 1.0283078, "balance_loss_mlp": 0.18267064, "epoch": 0.8776191191943484, "flos": 23440151871360.0, "grad_norm": 2.386298678785432, "language_loss": 0.91946256, "learning_rate": 1.5499137768364067e-07, "loss": 0.93396759, "num_input_tokens_seen": 314659875, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.25048828, "step": 14597, "time_per_iteration": 2.717808246612549 }, { "auxiliary_loss_clip": 0.01248149, "auxiliary_loss_mlp": 0.00212457, "balance_loss_clip": 1.03022242, "balance_loss_mlp": 0.18727967, "epoch": 0.8776792424470163, "flos": 26830872950400.0, "grad_norm": 12.567023401236579, "language_loss": 0.78799808, "learning_rate": 1.5484108560489494e-07, "loss": 0.8026042, "num_input_tokens_seen": 314680260, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25170898, "step": 14598, "time_per_iteration": 2.699855327606201 }, { "auxiliary_loss_clip": 0.0124858, "auxiliary_loss_mlp": 0.00177655, "balance_loss_clip": 1.02958703, "balance_loss_mlp": 0.15358624, "epoch": 0.8777393656996844, "flos": 15626169354240.0, "grad_norm": 2.1616581632719405, "language_loss": 0.86978734, "learning_rate": 1.5469086349476036e-07, "loss": 0.88404959, "num_input_tokens_seen": 314696260, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24047852, "step": 14599, "time_per_iteration": 2.621403932571411 }, { "auxiliary_loss_clip": 0.01258508, "auxiliary_loss_mlp": 0.00218631, "balance_loss_clip": 1.03654301, "balance_loss_mlp": 0.19350141, "epoch": 0.8777994889523523, "flos": 18879999701760.0, "grad_norm": 2.8994859335993333, "language_loss": 0.79882121, "learning_rate": 1.545407113589332e-07, "loss": 0.81359261, "num_input_tokens_seen": 314714215, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.2512207, "step": 14600, "time_per_iteration": 2.6168739795684814 }, { "auxiliary_loss_clip": 0.01262535, "auxiliary_loss_mlp": 0.00215028, "balance_loss_clip": 1.03650749, "balance_loss_mlp": 0.19006555, "epoch": 0.8778596122050203, "flos": 48826516400640.0, "grad_norm": 2.5222066752054064, "language_loss": 0.7676267, "learning_rate": 1.543906292031072e-07, "loss": 0.78240234, "num_input_tokens_seen": 314735700, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.24938965, "step": 14601, "time_per_iteration": 3.027595043182373 }, { "auxiliary_loss_clip": 0.01264264, "auxiliary_loss_mlp": 0.00231191, "balance_loss_clip": 1.04037809, "balance_loss_mlp": 0.20454788, "epoch": 0.8779197354576883, "flos": 25660184883840.0, "grad_norm": 11.016274885561202, "language_loss": 0.81276774, "learning_rate": 1.542406170329733e-07, "loss": 0.82772237, "num_input_tokens_seen": 314753335, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.26623535, "step": 14602, "time_per_iteration": 2.689143657684326 }, { "auxiliary_loss_clip": 0.0123009, "auxiliary_loss_mlp": 0.00210538, "balance_loss_clip": 1.02100456, "balance_loss_mlp": 0.18915194, "epoch": 0.8779798587103562, "flos": 18843227153280.0, "grad_norm": 110.71171628474467, "language_loss": 0.77154112, "learning_rate": 1.5409067485422056e-07, "loss": 0.78594744, "num_input_tokens_seen": 314770800, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.21362305, "step": 14603, "time_per_iteration": 2.6671180725097656 }, { "auxiliary_loss_clip": 0.01142758, "auxiliary_loss_mlp": 0.00069364, "balance_loss_clip": 1.0040673, "balance_loss_mlp": 0.06116246, "epoch": 0.8780399819630242, "flos": 68613119377920.0, "grad_norm": 0.7349116157251885, "language_loss": 0.53582758, "learning_rate": 1.539408026725344e-07, "loss": 0.54794878, "num_input_tokens_seen": 314837275, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08203125, "step": 14604, "time_per_iteration": 3.190201759338379 }, { "auxiliary_loss_clip": 0.01145479, "auxiliary_loss_mlp": 0.00080339, "balance_loss_clip": 1.00840402, "balance_loss_mlp": 0.07337709, "epoch": 0.8781001052156922, "flos": 65734807766400.0, "grad_norm": 0.6889713135709393, "language_loss": 0.5784229, "learning_rate": 1.537910004935976e-07, "loss": 0.59068108, "num_input_tokens_seen": 314902220, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06982422, "step": 14605, "time_per_iteration": 3.1572859287261963 }, { "auxiliary_loss_clip": 0.01242357, "auxiliary_loss_mlp": 0.00208706, "balance_loss_clip": 1.0280875, "balance_loss_mlp": 0.18410078, "epoch": 0.8781602284683602, "flos": 22049654526720.0, "grad_norm": 20.24468496330013, "language_loss": 0.92179638, "learning_rate": 1.536412683230912e-07, "loss": 0.93630695, "num_input_tokens_seen": 314921645, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.24609375, "step": 14606, "time_per_iteration": 2.755272388458252 }, { "auxiliary_loss_clip": 0.01265823, "auxiliary_loss_mlp": 0.00228795, "balance_loss_clip": 1.04136038, "balance_loss_mlp": 0.20116207, "epoch": 0.8782203517210281, "flos": 17562939713280.0, "grad_norm": 227.28301414421742, "language_loss": 0.80491662, "learning_rate": 1.534916061666931e-07, "loss": 0.81986284, "num_input_tokens_seen": 314939390, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.27661133, "step": 14607, "time_per_iteration": 2.6171460151672363 }, { "auxiliary_loss_clip": 0.01231875, "auxiliary_loss_mlp": 0.0020044, "balance_loss_clip": 1.0227077, "balance_loss_mlp": 0.17646727, "epoch": 0.8782804749736961, "flos": 25520421064320.0, "grad_norm": 7.108441214532879, "language_loss": 0.79673266, "learning_rate": 1.533420140300785e-07, "loss": 0.81105578, "num_input_tokens_seen": 314959205, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.23974609, "step": 14608, "time_per_iteration": 2.7729554176330566 }, { "auxiliary_loss_clip": 0.01250675, "auxiliary_loss_mlp": 0.00229415, "balance_loss_clip": 1.03111148, "balance_loss_mlp": 0.20525165, "epoch": 0.878340598226364, "flos": 21798747048960.0, "grad_norm": 409.60870711053053, "language_loss": 0.96345317, "learning_rate": 1.5319249191891936e-07, "loss": 0.97825408, "num_input_tokens_seen": 314977485, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.24169922, "step": 14609, "time_per_iteration": 2.622631549835205 }, { "auxiliary_loss_clip": 0.01260979, "auxiliary_loss_mlp": 0.00217918, "balance_loss_clip": 1.03690839, "balance_loss_mlp": 0.19280057, "epoch": 0.878400721479032, "flos": 21102403011840.0, "grad_norm": 6.341992656864886, "language_loss": 0.77331185, "learning_rate": 1.5304303983888643e-07, "loss": 0.78810084, "num_input_tokens_seen": 314997830, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.2512207, "step": 14610, "time_per_iteration": 2.75335431098938 }, { "auxiliary_loss_clip": 0.01241751, "auxiliary_loss_mlp": 0.00207522, "balance_loss_clip": 1.0256238, "balance_loss_mlp": 0.18354857, "epoch": 0.8784608447316999, "flos": 20923532259840.0, "grad_norm": 11.121690678331516, "language_loss": 0.88660067, "learning_rate": 1.5289365779564612e-07, "loss": 0.90109342, "num_input_tokens_seen": 315016480, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.23986816, "step": 14611, "time_per_iteration": 2.6261003017425537 }, { "auxiliary_loss_clip": 0.01251774, "auxiliary_loss_mlp": 0.00209429, "balance_loss_clip": 1.03082442, "balance_loss_mlp": 0.18580216, "epoch": 0.878520967984368, "flos": 23330660238720.0, "grad_norm": 11.023422556483977, "language_loss": 0.8358717, "learning_rate": 1.5274434579486338e-07, "loss": 0.85048378, "num_input_tokens_seen": 315036135, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.2364502, "step": 14612, "time_per_iteration": 2.707693576812744 }, { "auxiliary_loss_clip": 0.01243054, "auxiliary_loss_mlp": 0.00195835, "balance_loss_clip": 1.02655423, "balance_loss_mlp": 0.17274451, "epoch": 0.8785810912370359, "flos": 25518984520320.0, "grad_norm": 370.2296197082719, "language_loss": 0.78430372, "learning_rate": 1.525951038422002e-07, "loss": 0.79869258, "num_input_tokens_seen": 315057995, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.2310791, "step": 14613, "time_per_iteration": 2.6760265827178955 }, { "auxiliary_loss_clip": 0.01139881, "auxiliary_loss_mlp": 0.00137464, "balance_loss_clip": 1.0013231, "balance_loss_mlp": 0.12854719, "epoch": 0.8786412144897039, "flos": 61841047691520.0, "grad_norm": 0.9952612940458774, "language_loss": 0.6366232, "learning_rate": 1.5244593194331667e-07, "loss": 0.64939666, "num_input_tokens_seen": 315104010, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08935547, "step": 14614, "time_per_iteration": 3.011845827102661 }, { "auxiliary_loss_clip": 0.01138479, "auxiliary_loss_mlp": 0.0018543, "balance_loss_clip": 1.00004768, "balance_loss_mlp": 0.17694235, "epoch": 0.8787013377423719, "flos": 70989364638720.0, "grad_norm": 0.6747015530389926, "language_loss": 0.57317495, "learning_rate": 1.5229683010386762e-07, "loss": 0.5864141, "num_input_tokens_seen": 315174550, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08496094, "step": 14615, "time_per_iteration": 3.1996216773986816 }, { "auxiliary_loss_clip": 0.01229591, "auxiliary_loss_mlp": 0.00195566, "balance_loss_clip": 1.01358366, "balance_loss_mlp": 0.17072304, "epoch": 0.8787614609950398, "flos": 17347404153600.0, "grad_norm": 4.741661414565293, "language_loss": 0.83353543, "learning_rate": 1.5214779832950807e-07, "loss": 0.84778702, "num_input_tokens_seen": 315191825, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24841309, "step": 14616, "time_per_iteration": 2.616708517074585 }, { "auxiliary_loss_clip": 0.01140752, "auxiliary_loss_mlp": 0.00161044, "balance_loss_clip": 1.00127816, "balance_loss_mlp": 0.15236598, "epoch": 0.8788215842477078, "flos": 72511401588480.0, "grad_norm": 0.7813506904411884, "language_loss": 0.57107425, "learning_rate": 1.5199883662588953e-07, "loss": 0.5840922, "num_input_tokens_seen": 315255075, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08691406, "step": 14617, "time_per_iteration": 3.2486398220062256 }, { "auxiliary_loss_clip": 0.01239515, "auxiliary_loss_mlp": 0.00229066, "balance_loss_clip": 1.02190185, "balance_loss_mlp": 0.20386526, "epoch": 0.8788817075003758, "flos": 24827452905600.0, "grad_norm": 5.167972963115753, "language_loss": 0.90056866, "learning_rate": 1.5184994499865987e-07, "loss": 0.91525447, "num_input_tokens_seen": 315273995, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25183105, "step": 14618, "time_per_iteration": 4.082475662231445 }, { "auxiliary_loss_clip": 0.01237054, "auxiliary_loss_mlp": 0.00231362, "balance_loss_clip": 1.02541447, "balance_loss_mlp": 0.20619714, "epoch": 0.8789418307530438, "flos": 22638769488000.0, "grad_norm": 48.26608417631701, "language_loss": 0.76573575, "learning_rate": 1.5170112345346598e-07, "loss": 0.78041989, "num_input_tokens_seen": 315294485, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.25134277, "step": 14619, "time_per_iteration": 2.738611936569214 }, { "auxiliary_loss_clip": 0.01241426, "auxiliary_loss_mlp": 0.00202289, "balance_loss_clip": 1.0251925, "balance_loss_mlp": 0.17864934, "epoch": 0.8790019540057117, "flos": 19785738072960.0, "grad_norm": 135.51900608121278, "language_loss": 0.84347105, "learning_rate": 1.5155237199595016e-07, "loss": 0.85790819, "num_input_tokens_seen": 315310420, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.23657227, "step": 14620, "time_per_iteration": 4.061131000518799 }, { "auxiliary_loss_clip": 0.01253777, "auxiliary_loss_mlp": 0.00231344, "balance_loss_clip": 1.03716898, "balance_loss_mlp": 0.20507036, "epoch": 0.8790620772583797, "flos": 20229774001920.0, "grad_norm": 5.0138116011689124, "language_loss": 0.89048719, "learning_rate": 1.514036906317542e-07, "loss": 0.90533835, "num_input_tokens_seen": 315330110, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.26281738, "step": 14621, "time_per_iteration": 2.7099084854125977 }, { "auxiliary_loss_clip": 0.01267571, "auxiliary_loss_mlp": 0.00214538, "balance_loss_clip": 1.04112887, "balance_loss_mlp": 0.18899122, "epoch": 0.8791222005110476, "flos": 24130785646080.0, "grad_norm": 26.932774059664062, "language_loss": 0.7386089, "learning_rate": 1.5125507936651506e-07, "loss": 0.75343001, "num_input_tokens_seen": 315350080, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.25561523, "step": 14622, "time_per_iteration": 2.6987380981445312 }, { "auxiliary_loss_clip": 0.01251025, "auxiliary_loss_mlp": 0.00185973, "balance_loss_clip": 1.02710021, "balance_loss_mlp": 0.1589362, "epoch": 0.8791823237637156, "flos": 21614201948160.0, "grad_norm": 11.433020326348347, "language_loss": 0.8004688, "learning_rate": 1.511065382058687e-07, "loss": 0.81483883, "num_input_tokens_seen": 315366360, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.27026367, "step": 14623, "time_per_iteration": 2.6417529582977295 }, { "auxiliary_loss_clip": 0.01229896, "auxiliary_loss_mlp": 0.0020364, "balance_loss_clip": 1.01542604, "balance_loss_mlp": 0.18088251, "epoch": 0.8792424470163835, "flos": 24243401761920.0, "grad_norm": 3.7370957623426158, "language_loss": 0.86750436, "learning_rate": 1.5095806715544801e-07, "loss": 0.88183975, "num_input_tokens_seen": 315385890, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.22753906, "step": 14624, "time_per_iteration": 2.6745190620422363 }, { "auxiliary_loss_clip": 0.01246464, "auxiliary_loss_mlp": 0.00227951, "balance_loss_clip": 1.02765107, "balance_loss_mlp": 0.20181994, "epoch": 0.8793025702690516, "flos": 24893204751360.0, "grad_norm": 40.16130120357125, "language_loss": 0.88377905, "learning_rate": 1.5080966622088265e-07, "loss": 0.89852315, "num_input_tokens_seen": 315403400, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26135254, "step": 14625, "time_per_iteration": 4.113278865814209 }, { "auxiliary_loss_clip": 0.0122681, "auxiliary_loss_mlp": 0.00200583, "balance_loss_clip": 1.01100683, "balance_loss_mlp": 0.17541805, "epoch": 0.8793626935217195, "flos": 25373115388800.0, "grad_norm": 14.319227897277162, "language_loss": 0.80040765, "learning_rate": 1.5066133540779967e-07, "loss": 0.81468153, "num_input_tokens_seen": 315423670, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.25158691, "step": 14626, "time_per_iteration": 2.7406954765319824 }, { "auxiliary_loss_clip": 0.01239209, "auxiliary_loss_mlp": 0.00199145, "balance_loss_clip": 1.02145064, "balance_loss_mlp": 0.17288366, "epoch": 0.8794228167743875, "flos": 34678000742400.0, "grad_norm": 3.5238374791680114, "language_loss": 0.78820407, "learning_rate": 1.505130747218246e-07, "loss": 0.80258763, "num_input_tokens_seen": 315446265, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.26257324, "step": 14627, "time_per_iteration": 2.7868871688842773 }, { "auxiliary_loss_clip": 0.01244392, "auxiliary_loss_mlp": 0.00206259, "balance_loss_clip": 1.02619386, "balance_loss_mlp": 0.18074787, "epoch": 0.8794829400270555, "flos": 19464014931840.0, "grad_norm": 20.27071204848293, "language_loss": 0.78856128, "learning_rate": 1.5036488416857873e-07, "loss": 0.8030678, "num_input_tokens_seen": 315464655, "router_z_loss_clip": 2.18652344, "router_z_loss_mlp": 0.25488281, "step": 14628, "time_per_iteration": 2.674473285675049 }, { "auxiliary_loss_clip": 0.01241599, "auxiliary_loss_mlp": 0.00205399, "balance_loss_clip": 1.02439499, "balance_loss_mlp": 0.17987639, "epoch": 0.8795430632797234, "flos": 15231403906560.0, "grad_norm": 20.739233241421587, "language_loss": 0.813519, "learning_rate": 1.5021676375368175e-07, "loss": 0.82798904, "num_input_tokens_seen": 315481090, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25549316, "step": 14629, "time_per_iteration": 2.6836891174316406 }, { "auxiliary_loss_clip": 0.01228817, "auxiliary_loss_mlp": 0.00223782, "balance_loss_clip": 1.01707864, "balance_loss_mlp": 0.20048892, "epoch": 0.8796031865323914, "flos": 27744727795200.0, "grad_norm": 3.908048739034047, "language_loss": 0.75191039, "learning_rate": 1.5006871348275053e-07, "loss": 0.7664364, "num_input_tokens_seen": 315502010, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.2331543, "step": 14630, "time_per_iteration": 4.160262823104858 }, { "auxiliary_loss_clip": 0.01230264, "auxiliary_loss_mlp": 0.00205031, "balance_loss_clip": 1.01881266, "balance_loss_mlp": 0.18161789, "epoch": 0.8796633097850594, "flos": 31285412156160.0, "grad_norm": 29.687521108392104, "language_loss": 0.8043642, "learning_rate": 1.499207333613999e-07, "loss": 0.81871718, "num_input_tokens_seen": 315523040, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.23413086, "step": 14631, "time_per_iteration": 2.7756783962249756 }, { "auxiliary_loss_clip": 0.01237367, "auxiliary_loss_mlp": 0.00207757, "balance_loss_clip": 1.02112317, "balance_loss_mlp": 0.18262736, "epoch": 0.8797234330377274, "flos": 24243150366720.0, "grad_norm": 18.463355010957255, "language_loss": 0.75962138, "learning_rate": 1.4977282339523954e-07, "loss": 0.77407265, "num_input_tokens_seen": 315541865, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.2512207, "step": 14632, "time_per_iteration": 2.6979832649230957 }, { "auxiliary_loss_clip": 0.01251767, "auxiliary_loss_mlp": 0.00202145, "balance_loss_clip": 1.03452122, "balance_loss_mlp": 0.17850557, "epoch": 0.8797835562903953, "flos": 24167414540160.0, "grad_norm": 4.110969139897216, "language_loss": 0.75456822, "learning_rate": 1.4962498358987929e-07, "loss": 0.76910734, "num_input_tokens_seen": 315561470, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.2364502, "step": 14633, "time_per_iteration": 2.6748769283294678 }, { "auxiliary_loss_clip": 0.01241561, "auxiliary_loss_mlp": 0.00212968, "balance_loss_clip": 1.02167618, "balance_loss_mlp": 0.18899456, "epoch": 0.8798436795430633, "flos": 19284677303040.0, "grad_norm": 2.5114088293170442, "language_loss": 0.89020437, "learning_rate": 1.4947721395092528e-07, "loss": 0.90474963, "num_input_tokens_seen": 315583140, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.23986816, "step": 14634, "time_per_iteration": 2.7000691890716553 }, { "auxiliary_loss_clip": 0.01244361, "auxiliary_loss_mlp": 0.00213229, "balance_loss_clip": 1.02582145, "balance_loss_mlp": 0.18787315, "epoch": 0.8799038027957312, "flos": 28179390274560.0, "grad_norm": 99.5129776238023, "language_loss": 0.87621319, "learning_rate": 1.4932951448398056e-07, "loss": 0.89078903, "num_input_tokens_seen": 315601935, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25354004, "step": 14635, "time_per_iteration": 2.730013847351074 }, { "auxiliary_loss_clip": 0.0125314, "auxiliary_loss_mlp": 0.00205208, "balance_loss_clip": 1.03108573, "balance_loss_mlp": 0.17887495, "epoch": 0.8799639260483992, "flos": 24644703484800.0, "grad_norm": 13.087960564262703, "language_loss": 0.73733461, "learning_rate": 1.4918188519464648e-07, "loss": 0.75191808, "num_input_tokens_seen": 315619995, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26318359, "step": 14636, "time_per_iteration": 2.6831247806549072 }, { "auxiliary_loss_clip": 0.01230301, "auxiliary_loss_mlp": 0.00204146, "balance_loss_clip": 1.01495647, "balance_loss_mlp": 0.17927849, "epoch": 0.8800240493010671, "flos": 22200479735040.0, "grad_norm": 2.830732921815829, "language_loss": 0.77855396, "learning_rate": 1.4903432608852074e-07, "loss": 0.79289842, "num_input_tokens_seen": 315637895, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24853516, "step": 14637, "time_per_iteration": 2.6743836402893066 }, { "auxiliary_loss_clip": 0.01249995, "auxiliary_loss_mlp": 0.00200929, "balance_loss_clip": 1.03531623, "balance_loss_mlp": 0.17810075, "epoch": 0.8800841725537352, "flos": 14246086953600.0, "grad_norm": 13.028554837422135, "language_loss": 0.75146008, "learning_rate": 1.4888683717119843e-07, "loss": 0.76596934, "num_input_tokens_seen": 315655520, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.22814941, "step": 14638, "time_per_iteration": 2.624941110610962 }, { "auxiliary_loss_clip": 0.01241134, "auxiliary_loss_mlp": 0.00193256, "balance_loss_clip": 1.0269537, "balance_loss_mlp": 0.17066596, "epoch": 0.8801442958064031, "flos": 37415794348800.0, "grad_norm": 4.012448640835184, "language_loss": 0.65899128, "learning_rate": 1.4873941844827286e-07, "loss": 0.67333519, "num_input_tokens_seen": 315678955, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.22607422, "step": 14639, "time_per_iteration": 2.8232548236846924 }, { "auxiliary_loss_clip": 0.01235423, "auxiliary_loss_mlp": 0.00205233, "balance_loss_clip": 1.02245569, "balance_loss_mlp": 0.18140276, "epoch": 0.8802044190590711, "flos": 25047334010880.0, "grad_norm": 40.51349960442892, "language_loss": 0.81971192, "learning_rate": 1.4859206992533402e-07, "loss": 0.83411849, "num_input_tokens_seen": 315700360, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23840332, "step": 14640, "time_per_iteration": 2.690929651260376 }, { "auxiliary_loss_clip": 0.01239025, "auxiliary_loss_mlp": 0.00196386, "balance_loss_clip": 1.01909184, "balance_loss_mlp": 0.17160256, "epoch": 0.8802645423117391, "flos": 24133874215680.0, "grad_norm": 249.49906959825012, "language_loss": 0.77314454, "learning_rate": 1.4844479160796985e-07, "loss": 0.78749859, "num_input_tokens_seen": 315719270, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24743652, "step": 14641, "time_per_iteration": 2.695016622543335 }, { "auxiliary_loss_clip": 0.0126547, "auxiliary_loss_mlp": 0.00237034, "balance_loss_clip": 1.03764319, "balance_loss_mlp": 0.20950896, "epoch": 0.880324665564407, "flos": 17931203902080.0, "grad_norm": 15.462335048879238, "language_loss": 0.95374441, "learning_rate": 1.4829758350176457e-07, "loss": 0.96876955, "num_input_tokens_seen": 315737425, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.2755127, "step": 14642, "time_per_iteration": 2.6634738445281982 }, { "auxiliary_loss_clip": 0.01241527, "auxiliary_loss_mlp": 0.0020865, "balance_loss_clip": 1.02401817, "balance_loss_mlp": 0.18276951, "epoch": 0.880384788817075, "flos": 21287630471040.0, "grad_norm": 8.555387416619876, "language_loss": 0.85945487, "learning_rate": 1.4815044561230038e-07, "loss": 0.87395656, "num_input_tokens_seen": 315755725, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.25915527, "step": 14643, "time_per_iteration": 2.678297281265259 }, { "auxiliary_loss_clip": 0.01224129, "auxiliary_loss_mlp": 0.00218979, "balance_loss_clip": 1.00961804, "balance_loss_mlp": 0.19262151, "epoch": 0.880444912069743, "flos": 12458489777280.0, "grad_norm": 322.32196068194935, "language_loss": 0.80925012, "learning_rate": 1.4800337794515705e-07, "loss": 0.82368112, "num_input_tokens_seen": 315773835, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.2635498, "step": 14644, "time_per_iteration": 2.6891934871673584 }, { "auxiliary_loss_clip": 0.01254103, "auxiliary_loss_mlp": 0.00221241, "balance_loss_clip": 1.02959645, "balance_loss_mlp": 0.19536027, "epoch": 0.880505035322411, "flos": 13625945619840.0, "grad_norm": 13.299011058239252, "language_loss": 0.89939505, "learning_rate": 1.47856380505911e-07, "loss": 0.91414845, "num_input_tokens_seen": 315790615, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.25891113, "step": 14645, "time_per_iteration": 2.6585636138916016 }, { "auxiliary_loss_clip": 0.01234985, "auxiliary_loss_mlp": 0.00224473, "balance_loss_clip": 1.02243018, "balance_loss_mlp": 0.19937953, "epoch": 0.8805651585750789, "flos": 23183067254400.0, "grad_norm": 16.830985805200758, "language_loss": 0.71650374, "learning_rate": 1.477094533001364e-07, "loss": 0.73109829, "num_input_tokens_seen": 315811010, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.25061035, "step": 14646, "time_per_iteration": 2.652214288711548 }, { "auxiliary_loss_clip": 0.01261394, "auxiliary_loss_mlp": 0.00223313, "balance_loss_clip": 1.03624225, "balance_loss_mlp": 0.19752851, "epoch": 0.8806252818277469, "flos": 14903000835840.0, "grad_norm": 9.637728397416852, "language_loss": 0.88839787, "learning_rate": 1.475625963334055e-07, "loss": 0.90324497, "num_input_tokens_seen": 315828130, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.25817871, "step": 14647, "time_per_iteration": 2.5911874771118164 }, { "auxiliary_loss_clip": 0.01242461, "auxiliary_loss_mlp": 0.00211461, "balance_loss_clip": 1.02702856, "balance_loss_mlp": 0.18728489, "epoch": 0.8806854050804148, "flos": 17639178330240.0, "grad_norm": 3.055741766769449, "language_loss": 0.84167427, "learning_rate": 1.4741580961128652e-07, "loss": 0.85621351, "num_input_tokens_seen": 315844900, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24182129, "step": 14648, "time_per_iteration": 2.6600260734558105 }, { "auxiliary_loss_clip": 0.01235981, "auxiliary_loss_mlp": 0.0021762, "balance_loss_clip": 1.02156258, "balance_loss_mlp": 0.19374272, "epoch": 0.8807455283330828, "flos": 25332392344320.0, "grad_norm": 4.214813449353809, "language_loss": 0.72544032, "learning_rate": 1.4726909313934522e-07, "loss": 0.73997635, "num_input_tokens_seen": 315863745, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.23864746, "step": 14649, "time_per_iteration": 2.6548690795898438 }, { "auxiliary_loss_clip": 0.01240199, "auxiliary_loss_mlp": 0.00211031, "balance_loss_clip": 1.02259135, "balance_loss_mlp": 0.18583035, "epoch": 0.8808056515857507, "flos": 25265168040960.0, "grad_norm": 9.976858269062886, "language_loss": 0.68802983, "learning_rate": 1.4712244692314578e-07, "loss": 0.70254213, "num_input_tokens_seen": 315885765, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25195312, "step": 14650, "time_per_iteration": 2.700778007507324 }, { "auxiliary_loss_clip": 0.01236399, "auxiliary_loss_mlp": 0.00215135, "balance_loss_clip": 1.02431452, "balance_loss_mlp": 0.18939802, "epoch": 0.8808657748384188, "flos": 26578852151040.0, "grad_norm": 5.082270517712404, "language_loss": 0.78509367, "learning_rate": 1.4697587096824914e-07, "loss": 0.79960907, "num_input_tokens_seen": 315907340, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.25708008, "step": 14651, "time_per_iteration": 2.7909634113311768 }, { "auxiliary_loss_clip": 0.01257177, "auxiliary_loss_mlp": 0.00191376, "balance_loss_clip": 1.03491592, "balance_loss_mlp": 0.1666525, "epoch": 0.8809258980910867, "flos": 18661231918080.0, "grad_norm": 2.429150804684975, "language_loss": 0.78341973, "learning_rate": 1.4682936528021284e-07, "loss": 0.79790521, "num_input_tokens_seen": 315924935, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.24719238, "step": 14652, "time_per_iteration": 2.674785852432251 }, { "auxiliary_loss_clip": 0.01226887, "auxiliary_loss_mlp": 0.00192402, "balance_loss_clip": 1.0115875, "balance_loss_mlp": 0.1684055, "epoch": 0.8809860213437547, "flos": 19792274348160.0, "grad_norm": 478.45755585983363, "language_loss": 0.85242909, "learning_rate": 1.4668292986459286e-07, "loss": 0.86662197, "num_input_tokens_seen": 315943165, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.23999023, "step": 14653, "time_per_iteration": 2.6306660175323486 }, { "auxiliary_loss_clip": 0.01257958, "auxiliary_loss_mlp": 0.00209164, "balance_loss_clip": 1.0343883, "balance_loss_mlp": 0.18300985, "epoch": 0.8810461445964227, "flos": 17894467267200.0, "grad_norm": 152.27389710197167, "language_loss": 0.80716908, "learning_rate": 1.465365647269421e-07, "loss": 0.82184029, "num_input_tokens_seen": 315961340, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26135254, "step": 14654, "time_per_iteration": 2.670576810836792 }, { "auxiliary_loss_clip": 0.01232634, "auxiliary_loss_mlp": 0.0020995, "balance_loss_clip": 1.01438069, "balance_loss_mlp": 0.18297267, "epoch": 0.8811062678490906, "flos": 29163917128320.0, "grad_norm": 51.40218536582138, "language_loss": 0.79025245, "learning_rate": 1.4639026987281012e-07, "loss": 0.8046782, "num_input_tokens_seen": 315981335, "router_z_loss_clip": 2.18066406, "router_z_loss_mlp": 0.26989746, "step": 14655, "time_per_iteration": 2.7009658813476562 }, { "auxiliary_loss_clip": 0.01242798, "auxiliary_loss_mlp": 0.00212105, "balance_loss_clip": 1.0256449, "balance_loss_mlp": 0.18766747, "epoch": 0.8811663911017587, "flos": 20338834671360.0, "grad_norm": 18.21658981170416, "language_loss": 0.8823477, "learning_rate": 1.462440453077449e-07, "loss": 0.89689672, "num_input_tokens_seen": 316001325, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24438477, "step": 14656, "time_per_iteration": 2.6468594074249268 }, { "auxiliary_loss_clip": 0.01262175, "auxiliary_loss_mlp": 0.00203047, "balance_loss_clip": 1.03819251, "balance_loss_mlp": 0.17812021, "epoch": 0.8812265143544266, "flos": 25885704424320.0, "grad_norm": 4.431932756179847, "language_loss": 0.77183688, "learning_rate": 1.460978910372914e-07, "loss": 0.78648913, "num_input_tokens_seen": 316022540, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.24938965, "step": 14657, "time_per_iteration": 2.7759833335876465 }, { "auxiliary_loss_clip": 0.01247002, "auxiliary_loss_mlp": 0.00181672, "balance_loss_clip": 1.02782404, "balance_loss_mlp": 0.15699555, "epoch": 0.8812866376070946, "flos": 27195509865600.0, "grad_norm": 7.8039857556862335, "language_loss": 0.93415344, "learning_rate": 1.4595180706699207e-07, "loss": 0.94844019, "num_input_tokens_seen": 316037735, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24694824, "step": 14658, "time_per_iteration": 2.746023178100586 }, { "auxiliary_loss_clip": 0.01262232, "auxiliary_loss_mlp": 0.00210433, "balance_loss_clip": 1.0377934, "balance_loss_mlp": 0.18146518, "epoch": 0.8813467608597625, "flos": 23807194997760.0, "grad_norm": 81.5324246702588, "language_loss": 0.85448045, "learning_rate": 1.4580579340238554e-07, "loss": 0.86920714, "num_input_tokens_seen": 316058105, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.28979492, "step": 14659, "time_per_iteration": 2.734225273132324 }, { "auxiliary_loss_clip": 0.01233571, "auxiliary_loss_mlp": 0.00220373, "balance_loss_clip": 1.01897383, "balance_loss_mlp": 0.19607805, "epoch": 0.8814068841124305, "flos": 21105455667840.0, "grad_norm": 96.1752773942248, "language_loss": 0.69550681, "learning_rate": 1.4565985004900894e-07, "loss": 0.71004617, "num_input_tokens_seen": 316074415, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.24279785, "step": 14660, "time_per_iteration": 4.215841293334961 }, { "auxiliary_loss_clip": 0.01234385, "auxiliary_loss_mlp": 0.00199558, "balance_loss_clip": 1.0176928, "balance_loss_mlp": 0.17378452, "epoch": 0.8814670073650984, "flos": 24716991605760.0, "grad_norm": 11.68168917412, "language_loss": 0.86500096, "learning_rate": 1.455139770123972e-07, "loss": 0.87934041, "num_input_tokens_seen": 316094405, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.2578125, "step": 14661, "time_per_iteration": 2.7820184230804443 }, { "auxiliary_loss_clip": 0.01232175, "auxiliary_loss_mlp": 0.00210241, "balance_loss_clip": 1.01739562, "balance_loss_mlp": 0.18531425, "epoch": 0.8815271306177664, "flos": 22966274718720.0, "grad_norm": 18.992304143602148, "language_loss": 0.83568907, "learning_rate": 1.45368174298081e-07, "loss": 0.85011327, "num_input_tokens_seen": 316113390, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.24951172, "step": 14662, "time_per_iteration": 4.058739900588989 }, { "auxiliary_loss_clip": 0.01207693, "auxiliary_loss_mlp": 0.00186262, "balance_loss_clip": 1.00162315, "balance_loss_mlp": 0.16435108, "epoch": 0.8815872538704344, "flos": 19460064435840.0, "grad_norm": 133.0019995448286, "language_loss": 0.80989254, "learning_rate": 1.4522244191158929e-07, "loss": 0.82383215, "num_input_tokens_seen": 316131085, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.21911621, "step": 14663, "time_per_iteration": 2.584728479385376 }, { "auxiliary_loss_clip": 0.01240075, "auxiliary_loss_mlp": 0.00195035, "balance_loss_clip": 1.02184319, "balance_loss_mlp": 0.17068025, "epoch": 0.8816473771231024, "flos": 32156604622080.0, "grad_norm": 64.46199582649771, "language_loss": 0.77097166, "learning_rate": 1.450767798584489e-07, "loss": 0.78532273, "num_input_tokens_seen": 316151440, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24365234, "step": 14664, "time_per_iteration": 2.7726173400878906 }, { "auxiliary_loss_clip": 0.01221989, "auxiliary_loss_mlp": 0.00212993, "balance_loss_clip": 1.00846207, "balance_loss_mlp": 0.18913949, "epoch": 0.8817075003757703, "flos": 19682279925120.0, "grad_norm": 80.82836312143671, "language_loss": 0.87654269, "learning_rate": 1.449311881441828e-07, "loss": 0.89089251, "num_input_tokens_seen": 316170750, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.23876953, "step": 14665, "time_per_iteration": 2.610426902770996 }, { "auxiliary_loss_clip": 0.01245069, "auxiliary_loss_mlp": 0.00224831, "balance_loss_clip": 1.02904272, "balance_loss_mlp": 0.19811629, "epoch": 0.8817676236284383, "flos": 15668616251520.0, "grad_norm": 14.000775480360062, "language_loss": 0.71191186, "learning_rate": 1.447856667743117e-07, "loss": 0.7266109, "num_input_tokens_seen": 316187265, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.26708984, "step": 14666, "time_per_iteration": 2.6398582458496094 }, { "auxiliary_loss_clip": 0.0124949, "auxiliary_loss_mlp": 0.00214043, "balance_loss_clip": 1.02895355, "balance_loss_mlp": 0.18770933, "epoch": 0.8818277468811063, "flos": 17895185539200.0, "grad_norm": 4.151952001996691, "language_loss": 0.92142582, "learning_rate": 1.4464021575435403e-07, "loss": 0.93606114, "num_input_tokens_seen": 316206555, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.26342773, "step": 14667, "time_per_iteration": 2.6231319904327393 }, { "auxiliary_loss_clip": 0.0124303, "auxiliary_loss_mlp": 0.00223839, "balance_loss_clip": 1.02239716, "balance_loss_mlp": 0.19724298, "epoch": 0.8818878701337742, "flos": 18770508069120.0, "grad_norm": 59.92313416709527, "language_loss": 0.70968878, "learning_rate": 1.4449483508982563e-07, "loss": 0.72435749, "num_input_tokens_seen": 316225210, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.265625, "step": 14668, "time_per_iteration": 4.080662250518799 }, { "auxiliary_loss_clip": 0.01245054, "auxiliary_loss_mlp": 0.00203339, "balance_loss_clip": 1.02730906, "balance_loss_mlp": 0.17950867, "epoch": 0.8819479933864423, "flos": 17712292464000.0, "grad_norm": 262.2267646469065, "language_loss": 0.68045986, "learning_rate": 1.4434952478623918e-07, "loss": 0.69494379, "num_input_tokens_seen": 316242685, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.23828125, "step": 14669, "time_per_iteration": 2.657020330429077 }, { "auxiliary_loss_clip": 0.01240416, "auxiliary_loss_mlp": 0.00208692, "balance_loss_clip": 1.02762687, "balance_loss_mlp": 0.18425432, "epoch": 0.8820081166391102, "flos": 11728749070080.0, "grad_norm": 111.58850295884521, "language_loss": 0.81086338, "learning_rate": 1.442042848491043e-07, "loss": 0.82535446, "num_input_tokens_seen": 316260935, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.24438477, "step": 14670, "time_per_iteration": 2.6029529571533203 }, { "auxiliary_loss_clip": 0.01237135, "auxiliary_loss_mlp": 0.00212812, "balance_loss_clip": 1.01499295, "balance_loss_mlp": 0.18691954, "epoch": 0.8820682398917782, "flos": 27490372611840.0, "grad_norm": 4.770168390484394, "language_loss": 0.8418957, "learning_rate": 1.44059115283929e-07, "loss": 0.85639513, "num_input_tokens_seen": 316281190, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25891113, "step": 14671, "time_per_iteration": 2.719602584838867 }, { "auxiliary_loss_clip": 0.01246653, "auxiliary_loss_mlp": 0.00215362, "balance_loss_clip": 1.02510667, "balance_loss_mlp": 0.1897081, "epoch": 0.8821283631444461, "flos": 16873850223360.0, "grad_norm": 24.339113006566468, "language_loss": 0.94037211, "learning_rate": 1.43914016096218e-07, "loss": 0.95499223, "num_input_tokens_seen": 316297115, "router_z_loss_clip": 2.21386719, "router_z_loss_mlp": 0.25646973, "step": 14672, "time_per_iteration": 4.064741611480713 }, { "auxiliary_loss_clip": 0.01224112, "auxiliary_loss_mlp": 0.00189148, "balance_loss_clip": 1.01219428, "balance_loss_mlp": 0.16504414, "epoch": 0.8821884863971141, "flos": 24280964409600.0, "grad_norm": 7.5916824220498444, "language_loss": 0.81336719, "learning_rate": 1.4376898729147336e-07, "loss": 0.82749975, "num_input_tokens_seen": 316318235, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.2409668, "step": 14673, "time_per_iteration": 2.6906182765960693 }, { "auxiliary_loss_clip": 0.0112683, "auxiliary_loss_mlp": 0.00163249, "balance_loss_clip": 0.98742968, "balance_loss_mlp": 0.15352184, "epoch": 0.882248609649782, "flos": 59432342492160.0, "grad_norm": 0.8740379852358127, "language_loss": 0.48230854, "learning_rate": 1.4362402887519487e-07, "loss": 0.49520937, "num_input_tokens_seen": 316384705, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09716797, "step": 14674, "time_per_iteration": 3.2636678218841553 }, { "auxiliary_loss_clip": 0.01241598, "auxiliary_loss_mlp": 0.00200932, "balance_loss_clip": 1.02035117, "balance_loss_mlp": 0.17512369, "epoch": 0.88230873290245, "flos": 19937784343680.0, "grad_norm": 7.298368459313634, "language_loss": 0.86231601, "learning_rate": 1.4347914085287971e-07, "loss": 0.87674129, "num_input_tokens_seen": 316401165, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.2578125, "step": 14675, "time_per_iteration": 2.624028205871582 }, { "auxiliary_loss_clip": 0.01239347, "auxiliary_loss_mlp": 0.00201087, "balance_loss_clip": 1.02032685, "balance_loss_mlp": 0.17577839, "epoch": 0.882368856155118, "flos": 16362769559040.0, "grad_norm": 12.504372927636396, "language_loss": 0.88308454, "learning_rate": 1.4333432323002105e-07, "loss": 0.89748889, "num_input_tokens_seen": 316418780, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.2532959, "step": 14676, "time_per_iteration": 2.6801092624664307 }, { "auxiliary_loss_clip": 0.01135198, "auxiliary_loss_mlp": 0.00170786, "balance_loss_clip": 0.99346673, "balance_loss_mlp": 0.16158313, "epoch": 0.882428979407786, "flos": 70594563277440.0, "grad_norm": 0.6748812105169887, "language_loss": 0.54113507, "learning_rate": 1.431895760121109e-07, "loss": 0.55419493, "num_input_tokens_seen": 316482030, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09179688, "step": 14677, "time_per_iteration": 3.228390693664551 }, { "auxiliary_loss_clip": 0.01238542, "auxiliary_loss_mlp": 0.00201575, "balance_loss_clip": 1.02031446, "balance_loss_mlp": 0.17724393, "epoch": 0.8824891026604539, "flos": 18150294908160.0, "grad_norm": 12.425328324724813, "language_loss": 0.80194914, "learning_rate": 1.4304489920463847e-07, "loss": 0.81635034, "num_input_tokens_seen": 316499175, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24316406, "step": 14678, "time_per_iteration": 2.6402156352996826 }, { "auxiliary_loss_clip": 0.01250859, "auxiliary_loss_mlp": 0.00205493, "balance_loss_clip": 1.02816927, "balance_loss_mlp": 0.17964804, "epoch": 0.8825492259131219, "flos": 27232713377280.0, "grad_norm": 19.073233712294453, "language_loss": 0.7931664, "learning_rate": 1.4290029281308936e-07, "loss": 0.80772996, "num_input_tokens_seen": 316519495, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25842285, "step": 14679, "time_per_iteration": 2.716090202331543 }, { "auxiliary_loss_clip": 0.01226431, "auxiliary_loss_mlp": 0.00198438, "balance_loss_clip": 1.01674962, "balance_loss_mlp": 0.17471512, "epoch": 0.8826093491657898, "flos": 22274419881600.0, "grad_norm": 9.51942845860056, "language_loss": 0.70791435, "learning_rate": 1.4275575684294694e-07, "loss": 0.72216296, "num_input_tokens_seen": 316538180, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23742676, "step": 14680, "time_per_iteration": 2.670997142791748 }, { "auxiliary_loss_clip": 0.0124433, "auxiliary_loss_mlp": 0.00202585, "balance_loss_clip": 1.02935529, "balance_loss_mlp": 0.1778726, "epoch": 0.8826694724184578, "flos": 14204753377920.0, "grad_norm": 74.40897921032519, "language_loss": 0.85891134, "learning_rate": 1.4261129129969328e-07, "loss": 0.87338054, "num_input_tokens_seen": 316551750, "router_z_loss_clip": 2.15136719, "router_z_loss_mlp": 0.24731445, "step": 14681, "time_per_iteration": 2.6426925659179688 }, { "auxiliary_loss_clip": 0.0124049, "auxiliary_loss_mlp": 0.00224348, "balance_loss_clip": 1.02511823, "balance_loss_mlp": 0.19932608, "epoch": 0.8827295956711259, "flos": 20631686256000.0, "grad_norm": 103.90839054202644, "language_loss": 0.80049372, "learning_rate": 1.424668961888047e-07, "loss": 0.81514204, "num_input_tokens_seen": 316570680, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.25024414, "step": 14682, "time_per_iteration": 2.6768016815185547 }, { "auxiliary_loss_clip": 0.01258845, "auxiliary_loss_mlp": 0.00233551, "balance_loss_clip": 1.03264713, "balance_loss_mlp": 0.20745608, "epoch": 0.8827897189237938, "flos": 18513064316160.0, "grad_norm": 45.87329545389999, "language_loss": 0.83762318, "learning_rate": 1.4232257151575765e-07, "loss": 0.85254717, "num_input_tokens_seen": 316588635, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26074219, "step": 14683, "time_per_iteration": 2.715405225753784 }, { "auxiliary_loss_clip": 0.01250298, "auxiliary_loss_mlp": 0.00208005, "balance_loss_clip": 1.03025651, "balance_loss_mlp": 0.18408018, "epoch": 0.8828498421764618, "flos": 22747399194240.0, "grad_norm": 8.853996792302022, "language_loss": 0.74168873, "learning_rate": 1.4217831728602492e-07, "loss": 0.75627172, "num_input_tokens_seen": 316607550, "router_z_loss_clip": 2.19824219, "router_z_loss_mlp": 0.23937988, "step": 14684, "time_per_iteration": 2.627473831176758 }, { "auxiliary_loss_clip": 0.01226993, "auxiliary_loss_mlp": 0.00199811, "balance_loss_clip": 1.00826311, "balance_loss_mlp": 0.17524171, "epoch": 0.8829099654291297, "flos": 15012384727680.0, "grad_norm": 16.69880685740119, "language_loss": 0.80787444, "learning_rate": 1.4203413350507677e-07, "loss": 0.82214248, "num_input_tokens_seen": 316624460, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24609375, "step": 14685, "time_per_iteration": 2.672715902328491 }, { "auxiliary_loss_clip": 0.0126385, "auxiliary_loss_mlp": 0.00221116, "balance_loss_clip": 1.03707147, "balance_loss_mlp": 0.19435333, "epoch": 0.8829700886817977, "flos": 16720546976640.0, "grad_norm": 27.777383588340935, "language_loss": 0.81507003, "learning_rate": 1.418900201783806e-07, "loss": 0.82991976, "num_input_tokens_seen": 316640765, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.26745605, "step": 14686, "time_per_iteration": 2.591649293899536 }, { "auxiliary_loss_clip": 0.01240278, "auxiliary_loss_mlp": 0.00206779, "balance_loss_clip": 1.02564323, "balance_loss_mlp": 0.18379524, "epoch": 0.8830302119344656, "flos": 15263256291840.0, "grad_norm": 9.90285958477027, "language_loss": 0.71533227, "learning_rate": 1.417459773114007e-07, "loss": 0.72980285, "num_input_tokens_seen": 316656120, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.22998047, "step": 14687, "time_per_iteration": 2.696958065032959 }, { "auxiliary_loss_clip": 0.01238212, "auxiliary_loss_mlp": 0.00227476, "balance_loss_clip": 1.0217669, "balance_loss_mlp": 0.20307374, "epoch": 0.8830903351871336, "flos": 28617751854720.0, "grad_norm": 21.475176425463243, "language_loss": 0.760934, "learning_rate": 1.4160200490959984e-07, "loss": 0.7755909, "num_input_tokens_seen": 316676095, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24389648, "step": 14688, "time_per_iteration": 2.702655076980591 }, { "auxiliary_loss_clip": 0.012356, "auxiliary_loss_mlp": 0.00195222, "balance_loss_clip": 1.02405095, "balance_loss_mlp": 0.17291823, "epoch": 0.8831504584398016, "flos": 28001632844160.0, "grad_norm": 697.4893449547667, "language_loss": 0.73646438, "learning_rate": 1.4145810297843697e-07, "loss": 0.7507726, "num_input_tokens_seen": 316696235, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.22302246, "step": 14689, "time_per_iteration": 2.7074341773986816 }, { "auxiliary_loss_clip": 0.01232734, "auxiliary_loss_mlp": 0.00203823, "balance_loss_clip": 1.02487648, "balance_loss_mlp": 0.18104178, "epoch": 0.8832105816924696, "flos": 26579642250240.0, "grad_norm": 4.3972042108469696, "language_loss": 0.78604245, "learning_rate": 1.4131427152336905e-07, "loss": 0.80040801, "num_input_tokens_seen": 316719680, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.2277832, "step": 14690, "time_per_iteration": 2.7105960845947266 }, { "auxiliary_loss_clip": 0.01251003, "auxiliary_loss_mlp": 0.00207252, "balance_loss_clip": 1.03268993, "balance_loss_mlp": 0.1839347, "epoch": 0.8832707049451375, "flos": 24898771359360.0, "grad_norm": 59.277559105340515, "language_loss": 0.80029374, "learning_rate": 1.4117051054985018e-07, "loss": 0.81487632, "num_input_tokens_seen": 316739830, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.2331543, "step": 14691, "time_per_iteration": 2.669515371322632 }, { "auxiliary_loss_clip": 0.01279709, "auxiliary_loss_mlp": 0.00221412, "balance_loss_clip": 1.04881454, "balance_loss_mlp": 0.19451821, "epoch": 0.8833308281978055, "flos": 15451141357440.0, "grad_norm": 22.279993737572884, "language_loss": 0.60980463, "learning_rate": 1.4102682006333243e-07, "loss": 0.62481594, "num_input_tokens_seen": 316758105, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.26940918, "step": 14692, "time_per_iteration": 2.6290252208709717 }, { "auxiliary_loss_clip": 0.01267491, "auxiliary_loss_mlp": 0.00225231, "balance_loss_clip": 1.03846383, "balance_loss_mlp": 0.20037603, "epoch": 0.8833909514504734, "flos": 20301523418880.0, "grad_norm": 33.269456652717686, "language_loss": 0.7083593, "learning_rate": 1.4088320006926346e-07, "loss": 0.72328651, "num_input_tokens_seen": 316777455, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.24853516, "step": 14693, "time_per_iteration": 2.608477830886841 }, { "auxiliary_loss_clip": 0.01232883, "auxiliary_loss_mlp": 0.00194671, "balance_loss_clip": 1.02394485, "balance_loss_mlp": 0.17211697, "epoch": 0.8834510747031414, "flos": 20374027021440.0, "grad_norm": 6.924358597653154, "language_loss": 0.81907088, "learning_rate": 1.407396505730898e-07, "loss": 0.83334643, "num_input_tokens_seen": 316796300, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.22570801, "step": 14694, "time_per_iteration": 2.631570339202881 }, { "auxiliary_loss_clip": 0.01250181, "auxiliary_loss_mlp": 0.00215469, "balance_loss_clip": 1.02725613, "balance_loss_mlp": 0.19166242, "epoch": 0.8835111979558095, "flos": 29752026508800.0, "grad_norm": 79.62749844412413, "language_loss": 0.82377708, "learning_rate": 1.4059617158025527e-07, "loss": 0.8384335, "num_input_tokens_seen": 316819090, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.23840332, "step": 14695, "time_per_iteration": 2.7290844917297363 }, { "auxiliary_loss_clip": 0.01241772, "auxiliary_loss_mlp": 0.00200135, "balance_loss_clip": 1.02973473, "balance_loss_mlp": 0.17675805, "epoch": 0.8835713212084774, "flos": 24134556574080.0, "grad_norm": 8.86482051632961, "language_loss": 0.87528312, "learning_rate": 1.404527630961998e-07, "loss": 0.8897022, "num_input_tokens_seen": 316839250, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.23388672, "step": 14696, "time_per_iteration": 2.6965370178222656 }, { "auxiliary_loss_clip": 0.01252208, "auxiliary_loss_mlp": 0.00218437, "balance_loss_clip": 1.03018069, "balance_loss_mlp": 0.19314043, "epoch": 0.8836314444611454, "flos": 27672331933440.0, "grad_norm": 1.7616264259153829, "language_loss": 0.82806128, "learning_rate": 1.4030942512636236e-07, "loss": 0.84276772, "num_input_tokens_seen": 316861315, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25280762, "step": 14697, "time_per_iteration": 2.732086658477783 }, { "auxiliary_loss_clip": 0.01235921, "auxiliary_loss_mlp": 0.00227707, "balance_loss_clip": 1.02160466, "balance_loss_mlp": 0.20386487, "epoch": 0.8836915677138133, "flos": 16836969934080.0, "grad_norm": 223.8172143185505, "language_loss": 0.78797674, "learning_rate": 1.401661576761779e-07, "loss": 0.80261302, "num_input_tokens_seen": 316879325, "router_z_loss_clip": 2.14355469, "router_z_loss_mlp": 0.23864746, "step": 14698, "time_per_iteration": 2.7728383541107178 }, { "auxiliary_loss_clip": 0.01141814, "auxiliary_loss_mlp": 0.00113995, "balance_loss_clip": 1.00084865, "balance_loss_mlp": 0.10684266, "epoch": 0.8837516909664813, "flos": 69310540823040.0, "grad_norm": 0.7637945995492635, "language_loss": 0.53091764, "learning_rate": 1.4002296075107856e-07, "loss": 0.54347575, "num_input_tokens_seen": 316936425, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.07128906, "step": 14699, "time_per_iteration": 3.1652109622955322 }, { "auxiliary_loss_clip": 0.0123698, "auxiliary_loss_mlp": 0.00205419, "balance_loss_clip": 1.01772213, "balance_loss_mlp": 0.1811479, "epoch": 0.8838118142191492, "flos": 21324726241920.0, "grad_norm": 3.643324967016273, "language_loss": 0.86915839, "learning_rate": 1.3987983435649508e-07, "loss": 0.88358229, "num_input_tokens_seen": 316956360, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.24267578, "step": 14700, "time_per_iteration": 2.6609997749328613 }, { "auxiliary_loss_clip": 0.01235206, "auxiliary_loss_mlp": 0.00197099, "balance_loss_clip": 1.02005672, "balance_loss_mlp": 0.17409131, "epoch": 0.8838719374718172, "flos": 21470559459840.0, "grad_norm": 14.579911230540528, "language_loss": 0.81771207, "learning_rate": 1.3973677849785494e-07, "loss": 0.83203506, "num_input_tokens_seen": 316975295, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.23022461, "step": 14701, "time_per_iteration": 2.6913814544677734 }, { "auxiliary_loss_clip": 0.01266466, "auxiliary_loss_mlp": 0.00238951, "balance_loss_clip": 1.03674245, "balance_loss_mlp": 0.21167552, "epoch": 0.8839320607244852, "flos": 26468929555200.0, "grad_norm": 10.853367756405776, "language_loss": 0.7967189, "learning_rate": 1.3959379318058262e-07, "loss": 0.81177306, "num_input_tokens_seen": 316994520, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.27294922, "step": 14702, "time_per_iteration": 2.7025229930877686 }, { "auxiliary_loss_clip": 0.01260478, "auxiliary_loss_mlp": 0.00235837, "balance_loss_clip": 1.0404197, "balance_loss_mlp": 0.21174484, "epoch": 0.8839921839771532, "flos": 45222270923520.0, "grad_norm": 547.140188799923, "language_loss": 0.78151608, "learning_rate": 1.3945087841010006e-07, "loss": 0.79647923, "num_input_tokens_seen": 317018095, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.24121094, "step": 14703, "time_per_iteration": 4.360274314880371 }, { "auxiliary_loss_clip": 0.01220636, "auxiliary_loss_mlp": 0.0020428, "balance_loss_clip": 1.01174474, "balance_loss_mlp": 0.18148738, "epoch": 0.8840523072298211, "flos": 20006876154240.0, "grad_norm": 16.124461153745127, "language_loss": 0.74086428, "learning_rate": 1.3930803419182645e-07, "loss": 0.75511348, "num_input_tokens_seen": 317035755, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.22790527, "step": 14704, "time_per_iteration": 4.118948698043823 }, { "auxiliary_loss_clip": 0.01228666, "auxiliary_loss_mlp": 0.00202132, "balance_loss_clip": 1.01865542, "balance_loss_mlp": 0.18012556, "epoch": 0.8841124304824891, "flos": 24426007528320.0, "grad_norm": 59.81905263893535, "language_loss": 0.78129488, "learning_rate": 1.3916526053117905e-07, "loss": 0.79560292, "num_input_tokens_seen": 317055765, "router_z_loss_clip": 2.09863281, "router_z_loss_mlp": 0.22009277, "step": 14705, "time_per_iteration": 2.7672982215881348 }, { "auxiliary_loss_clip": 0.01224239, "auxiliary_loss_mlp": 0.00190829, "balance_loss_clip": 1.01501322, "balance_loss_mlp": 0.16794077, "epoch": 0.884172553735157, "flos": 31284622056960.0, "grad_norm": 33.00697817258948, "language_loss": 0.77858293, "learning_rate": 1.3902255743357104e-07, "loss": 0.79273367, "num_input_tokens_seen": 317077955, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.22900391, "step": 14706, "time_per_iteration": 2.7416415214538574 }, { "auxiliary_loss_clip": 0.0124017, "auxiliary_loss_mlp": 0.00191915, "balance_loss_clip": 1.02462041, "balance_loss_mlp": 0.16858621, "epoch": 0.884232676987825, "flos": 21391160446080.0, "grad_norm": 19.78912078166092, "language_loss": 0.8179155, "learning_rate": 1.3887992490441413e-07, "loss": 0.83223635, "num_input_tokens_seen": 317095825, "router_z_loss_clip": 2.15332031, "router_z_loss_mlp": 0.23327637, "step": 14707, "time_per_iteration": 2.6320641040802 }, { "auxiliary_loss_clip": 0.01141808, "auxiliary_loss_mlp": 0.00151849, "balance_loss_clip": 1.00118279, "balance_loss_mlp": 0.14317098, "epoch": 0.8842928002404931, "flos": 57911451799680.0, "grad_norm": 0.7882163729747542, "language_loss": 0.59675199, "learning_rate": 1.387373629491173e-07, "loss": 0.60968858, "num_input_tokens_seen": 317152875, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.08691406, "step": 14708, "time_per_iteration": 3.0161643028259277 }, { "auxiliary_loss_clip": 0.01204058, "auxiliary_loss_mlp": 0.00211459, "balance_loss_clip": 1.00029969, "balance_loss_mlp": 0.18805826, "epoch": 0.884352923493161, "flos": 41463896186880.0, "grad_norm": 21.255959003701594, "language_loss": 0.72973096, "learning_rate": 1.3859487157308625e-07, "loss": 0.74388611, "num_input_tokens_seen": 317176725, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.23400879, "step": 14709, "time_per_iteration": 2.8503310680389404 }, { "auxiliary_loss_clip": 0.01252168, "auxiliary_loss_mlp": 0.0023239, "balance_loss_clip": 1.0300523, "balance_loss_mlp": 0.20696232, "epoch": 0.884413046745829, "flos": 46541234332800.0, "grad_norm": 2.5397855808006473, "language_loss": 0.69634676, "learning_rate": 1.3845245078172373e-07, "loss": 0.71119225, "num_input_tokens_seen": 317206880, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25427246, "step": 14710, "time_per_iteration": 4.3776774406433105 }, { "auxiliary_loss_clip": 0.01238829, "auxiliary_loss_mlp": 0.00193162, "balance_loss_clip": 1.02528369, "balance_loss_mlp": 0.17027363, "epoch": 0.8844731699984969, "flos": 19135324552320.0, "grad_norm": 72.6991784042285, "language_loss": 0.72249603, "learning_rate": 1.38310100580431e-07, "loss": 0.73681587, "num_input_tokens_seen": 317224135, "router_z_loss_clip": 2.13574219, "router_z_loss_mlp": 0.22900391, "step": 14711, "time_per_iteration": 2.6472487449645996 }, { "auxiliary_loss_clip": 0.01258108, "auxiliary_loss_mlp": 0.00234661, "balance_loss_clip": 1.03471732, "balance_loss_mlp": 0.20812455, "epoch": 0.8845332932511649, "flos": 23260634674560.0, "grad_norm": 21.278099370276966, "language_loss": 0.83436543, "learning_rate": 1.38167820974606e-07, "loss": 0.84929311, "num_input_tokens_seen": 317244505, "router_z_loss_clip": 2.23535156, "router_z_loss_mlp": 0.26550293, "step": 14712, "time_per_iteration": 2.6864051818847656 }, { "auxiliary_loss_clip": 0.01254546, "auxiliary_loss_mlp": 0.00211633, "balance_loss_clip": 1.03211689, "balance_loss_mlp": 0.18659942, "epoch": 0.8845934165038328, "flos": 17564591738880.0, "grad_norm": 16.57193039971631, "language_loss": 0.91806209, "learning_rate": 1.3802561196964368e-07, "loss": 0.93272388, "num_input_tokens_seen": 317257830, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25036621, "step": 14713, "time_per_iteration": 2.6288020610809326 }, { "auxiliary_loss_clip": 0.01219512, "auxiliary_loss_mlp": 0.00190797, "balance_loss_clip": 1.00917578, "balance_loss_mlp": 0.16823025, "epoch": 0.8846535397565009, "flos": 27485739757440.0, "grad_norm": 4.30061428507834, "language_loss": 0.63034725, "learning_rate": 1.3788347357093688e-07, "loss": 0.64445037, "num_input_tokens_seen": 317278430, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.22570801, "step": 14714, "time_per_iteration": 2.7148754596710205 }, { "auxiliary_loss_clip": 0.01228403, "auxiliary_loss_mlp": 0.0020987, "balance_loss_clip": 1.01456141, "balance_loss_mlp": 0.18460976, "epoch": 0.8847136630091688, "flos": 28761430256640.0, "grad_norm": 594.9196128781076, "language_loss": 0.80749762, "learning_rate": 1.377414057838755e-07, "loss": 0.82188034, "num_input_tokens_seen": 317295970, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.25231934, "step": 14715, "time_per_iteration": 4.145519018173218 }, { "auxiliary_loss_clip": 0.01239681, "auxiliary_loss_mlp": 0.00225019, "balance_loss_clip": 1.02515769, "balance_loss_mlp": 0.20072442, "epoch": 0.8847737862618368, "flos": 23476924419840.0, "grad_norm": 260.8045641395967, "language_loss": 0.81236911, "learning_rate": 1.375994086138461e-07, "loss": 0.82701606, "num_input_tokens_seen": 317316185, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.24279785, "step": 14716, "time_per_iteration": 2.713517189025879 }, { "auxiliary_loss_clip": 0.01250119, "auxiliary_loss_mlp": 0.00217161, "balance_loss_clip": 1.03137553, "balance_loss_mlp": 0.19161427, "epoch": 0.8848339095145047, "flos": 18660872782080.0, "grad_norm": 241.98286960747376, "language_loss": 0.79654944, "learning_rate": 1.3745748206623397e-07, "loss": 0.81122231, "num_input_tokens_seen": 317333275, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25524902, "step": 14717, "time_per_iteration": 2.6731925010681152 }, { "auxiliary_loss_clip": 0.01224575, "auxiliary_loss_mlp": 0.00183498, "balance_loss_clip": 1.01904988, "balance_loss_mlp": 0.1611703, "epoch": 0.8848940327671727, "flos": 32270298145920.0, "grad_norm": 70.04279209807329, "language_loss": 0.79754728, "learning_rate": 1.373156261464208e-07, "loss": 0.81162798, "num_input_tokens_seen": 317351245, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.22338867, "step": 14718, "time_per_iteration": 2.725996494293213 }, { "auxiliary_loss_clip": 0.01245275, "auxiliary_loss_mlp": 0.00190232, "balance_loss_clip": 1.02174675, "balance_loss_mlp": 0.16392267, "epoch": 0.8849541560198406, "flos": 24021832717440.0, "grad_norm": 24.197588879023172, "language_loss": 0.85692811, "learning_rate": 1.3717384085978602e-07, "loss": 0.87128317, "num_input_tokens_seen": 317370740, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26318359, "step": 14719, "time_per_iteration": 2.718214988708496 }, { "auxiliary_loss_clip": 0.01246566, "auxiliary_loss_mlp": 0.00211736, "balance_loss_clip": 1.02533436, "balance_loss_mlp": 0.18821582, "epoch": 0.8850142792725086, "flos": 16873060124160.0, "grad_norm": 11.496224948543954, "language_loss": 0.78372121, "learning_rate": 1.3703212621170579e-07, "loss": 0.7983042, "num_input_tokens_seen": 317388370, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.23522949, "step": 14720, "time_per_iteration": 2.6510961055755615 }, { "auxiliary_loss_clip": 0.01250763, "auxiliary_loss_mlp": 0.00222327, "balance_loss_clip": 1.03087139, "balance_loss_mlp": 0.19558857, "epoch": 0.8850744025251767, "flos": 24024059360640.0, "grad_norm": 112.12904758839495, "language_loss": 0.91653514, "learning_rate": 1.3689048220755383e-07, "loss": 0.93126607, "num_input_tokens_seen": 317407390, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26757812, "step": 14721, "time_per_iteration": 2.6652679443359375 }, { "auxiliary_loss_clip": 0.01258796, "auxiliary_loss_mlp": 0.00207002, "balance_loss_clip": 1.03336728, "balance_loss_mlp": 0.17839147, "epoch": 0.8851345257778446, "flos": 47955575329920.0, "grad_norm": 8.63124123148216, "language_loss": 0.71990705, "learning_rate": 1.3674890885270186e-07, "loss": 0.73456502, "num_input_tokens_seen": 317430825, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.2857666, "step": 14722, "time_per_iteration": 2.8640894889831543 }, { "auxiliary_loss_clip": 0.01240439, "auxiliary_loss_mlp": 0.00182203, "balance_loss_clip": 1.02525413, "balance_loss_mlp": 0.15782472, "epoch": 0.8851946490305126, "flos": 36611000173440.0, "grad_norm": 24.651356149789887, "language_loss": 0.78260469, "learning_rate": 1.3660740615251754e-07, "loss": 0.79683113, "num_input_tokens_seen": 317451905, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24389648, "step": 14723, "time_per_iteration": 2.8277499675750732 }, { "auxiliary_loss_clip": 0.01233335, "auxiliary_loss_mlp": 0.00201011, "balance_loss_clip": 1.02071774, "balance_loss_mlp": 0.17795607, "epoch": 0.8852547722831805, "flos": 21544248211200.0, "grad_norm": 2.5383805255559153, "language_loss": 0.86220711, "learning_rate": 1.3646597411236703e-07, "loss": 0.87655056, "num_input_tokens_seen": 317470030, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.23059082, "step": 14724, "time_per_iteration": 2.646821975708008 }, { "auxiliary_loss_clip": 0.01136657, "auxiliary_loss_mlp": 0.00180114, "balance_loss_clip": 0.99567902, "balance_loss_mlp": 0.16995783, "epoch": 0.8853148955358485, "flos": 63059246472960.0, "grad_norm": 0.7909354004589548, "language_loss": 0.58287889, "learning_rate": 1.363246127376143e-07, "loss": 0.59604657, "num_input_tokens_seen": 317527460, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.1015625, "step": 14725, "time_per_iteration": 3.0173439979553223 }, { "auxiliary_loss_clip": 0.01269636, "auxiliary_loss_mlp": 0.00235458, "balance_loss_clip": 1.03883874, "balance_loss_mlp": 0.2071097, "epoch": 0.8853750187885164, "flos": 18149828031360.0, "grad_norm": 33.07999910585455, "language_loss": 0.80525249, "learning_rate": 1.3618332203361837e-07, "loss": 0.82030344, "num_input_tokens_seen": 317544070, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.28356934, "step": 14726, "time_per_iteration": 2.675887107849121 }, { "auxiliary_loss_clip": 0.01240907, "auxiliary_loss_mlp": 0.00207792, "balance_loss_clip": 1.02463865, "balance_loss_mlp": 0.18230531, "epoch": 0.8854351420411845, "flos": 39570542392320.0, "grad_norm": 338.1914952440052, "language_loss": 0.74775285, "learning_rate": 1.3604210200573785e-07, "loss": 0.76223981, "num_input_tokens_seen": 317570275, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.25524902, "step": 14727, "time_per_iteration": 2.8345322608947754 }, { "auxiliary_loss_clip": 0.0123739, "auxiliary_loss_mlp": 0.00199687, "balance_loss_clip": 1.02717912, "balance_loss_mlp": 0.17861083, "epoch": 0.8854952652938524, "flos": 23769309127680.0, "grad_norm": 15.928599101948274, "language_loss": 0.78632843, "learning_rate": 1.3590095265932733e-07, "loss": 0.80069917, "num_input_tokens_seen": 317590160, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.21069336, "step": 14728, "time_per_iteration": 2.668821096420288 }, { "auxiliary_loss_clip": 0.0123231, "auxiliary_loss_mlp": 0.00199257, "balance_loss_clip": 1.0167222, "balance_loss_mlp": 0.17527181, "epoch": 0.8855553885465204, "flos": 18290310122880.0, "grad_norm": 5.508606094082406, "language_loss": 0.77391303, "learning_rate": 1.3575987399973987e-07, "loss": 0.78822875, "num_input_tokens_seen": 317608340, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.2401123, "step": 14729, "time_per_iteration": 2.606114387512207 }, { "auxiliary_loss_clip": 0.01233458, "auxiliary_loss_mlp": 0.00190115, "balance_loss_clip": 1.02094615, "balance_loss_mlp": 0.16778669, "epoch": 0.8856155117991883, "flos": 36867402432000.0, "grad_norm": 6.012381092998263, "language_loss": 0.71389079, "learning_rate": 1.3561886603232453e-07, "loss": 0.72812647, "num_input_tokens_seen": 317629910, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.22338867, "step": 14730, "time_per_iteration": 2.7688257694244385 }, { "auxiliary_loss_clip": 0.01231604, "auxiliary_loss_mlp": 0.00197299, "balance_loss_clip": 1.02381968, "balance_loss_mlp": 0.17506675, "epoch": 0.8856756350518563, "flos": 22163886754560.0, "grad_norm": 473.99953936545285, "language_loss": 0.85778463, "learning_rate": 1.3547792876242904e-07, "loss": 0.87207365, "num_input_tokens_seen": 317650265, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.22229004, "step": 14731, "time_per_iteration": 2.6504909992218018 }, { "auxiliary_loss_clip": 0.0124342, "auxiliary_loss_mlp": 0.00217554, "balance_loss_clip": 1.02406108, "balance_loss_mlp": 0.191495, "epoch": 0.8857357583045242, "flos": 20740962407040.0, "grad_norm": 82.3850920338073, "language_loss": 0.91579926, "learning_rate": 1.3533706219539708e-07, "loss": 0.93040907, "num_input_tokens_seen": 317669045, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.26037598, "step": 14732, "time_per_iteration": 2.6966965198516846 }, { "auxiliary_loss_clip": 0.01138767, "auxiliary_loss_mlp": 0.00184638, "balance_loss_clip": 0.99782854, "balance_loss_mlp": 0.17481494, "epoch": 0.8857958815571922, "flos": 69892329409920.0, "grad_norm": 0.9239748013275888, "language_loss": 0.58870566, "learning_rate": 1.3519626633657045e-07, "loss": 0.60193968, "num_input_tokens_seen": 317728065, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09814453, "step": 14733, "time_per_iteration": 3.1644692420959473 }, { "auxiliary_loss_clip": 0.01236603, "auxiliary_loss_mlp": 0.00197861, "balance_loss_clip": 1.02468729, "balance_loss_mlp": 0.17373255, "epoch": 0.8858560048098603, "flos": 15121948187520.0, "grad_norm": 6.236357589746058, "language_loss": 0.76392972, "learning_rate": 1.3505554119128838e-07, "loss": 0.77827442, "num_input_tokens_seen": 317746120, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.2409668, "step": 14734, "time_per_iteration": 2.7121474742889404 }, { "auxiliary_loss_clip": 0.01237073, "auxiliary_loss_mlp": 0.00196984, "balance_loss_clip": 1.02955818, "balance_loss_mlp": 0.17506097, "epoch": 0.8859161280625282, "flos": 16611019430400.0, "grad_norm": 44.88065343617451, "language_loss": 0.82479, "learning_rate": 1.3491488676488682e-07, "loss": 0.83913052, "num_input_tokens_seen": 317762280, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.21923828, "step": 14735, "time_per_iteration": 2.6151435375213623 }, { "auxiliary_loss_clip": 0.01242018, "auxiliary_loss_mlp": 0.00213227, "balance_loss_clip": 1.02467108, "balance_loss_mlp": 0.18775165, "epoch": 0.8859762513151962, "flos": 18694484933760.0, "grad_norm": 7.157788703512122, "language_loss": 0.79107082, "learning_rate": 1.3477430306270066e-07, "loss": 0.80562329, "num_input_tokens_seen": 317780615, "router_z_loss_clip": 2.17285156, "router_z_loss_mlp": 0.25463867, "step": 14736, "time_per_iteration": 2.714839458465576 }, { "auxiliary_loss_clip": 0.01236339, "auxiliary_loss_mlp": 0.00201176, "balance_loss_clip": 1.02517939, "balance_loss_mlp": 0.17772718, "epoch": 0.8860363745678641, "flos": 19536877670400.0, "grad_norm": 256.4452416786309, "language_loss": 0.92459756, "learning_rate": 1.3463379009005892e-07, "loss": 0.93897271, "num_input_tokens_seen": 317798830, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.23449707, "step": 14737, "time_per_iteration": 2.6794421672821045 }, { "auxiliary_loss_clip": 0.01251855, "auxiliary_loss_mlp": 0.00219483, "balance_loss_clip": 1.0376693, "balance_loss_mlp": 0.1937104, "epoch": 0.8860964978205321, "flos": 35954912304000.0, "grad_norm": 30.746100708611035, "language_loss": 0.77321517, "learning_rate": 1.3449334785229093e-07, "loss": 0.78792858, "num_input_tokens_seen": 317819235, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.25769043, "step": 14738, "time_per_iteration": 2.7835981845855713 }, { "auxiliary_loss_clip": 0.01255107, "auxiliary_loss_mlp": 0.00214654, "balance_loss_clip": 1.02978778, "balance_loss_mlp": 0.18833217, "epoch": 0.8861566210732, "flos": 21212577002880.0, "grad_norm": 14.527332357269145, "language_loss": 0.83336455, "learning_rate": 1.343529763547222e-07, "loss": 0.84806216, "num_input_tokens_seen": 317836785, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.26330566, "step": 14739, "time_per_iteration": 2.6709787845611572 }, { "auxiliary_loss_clip": 0.01229832, "auxiliary_loss_mlp": 0.00209107, "balance_loss_clip": 1.0188508, "balance_loss_mlp": 0.18509871, "epoch": 0.886216744325868, "flos": 14609071843200.0, "grad_norm": 14.006959975435175, "language_loss": 0.93290395, "learning_rate": 1.3421267560267559e-07, "loss": 0.94729328, "num_input_tokens_seen": 317854225, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.23999023, "step": 14740, "time_per_iteration": 2.6033682823181152 }, { "auxiliary_loss_clip": 0.01232416, "auxiliary_loss_mlp": 0.0022007, "balance_loss_clip": 1.0190444, "balance_loss_mlp": 0.19571508, "epoch": 0.886276867578536, "flos": 26651643062400.0, "grad_norm": 37.813440093416524, "language_loss": 0.71888936, "learning_rate": 1.34072445601471e-07, "loss": 0.73341423, "num_input_tokens_seen": 317874865, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24353027, "step": 14741, "time_per_iteration": 2.6978201866149902 }, { "auxiliary_loss_clip": 0.01248034, "auxiliary_loss_mlp": 0.00217692, "balance_loss_clip": 1.03103518, "balance_loss_mlp": 0.19157293, "epoch": 0.886336990831204, "flos": 16764071281920.0, "grad_norm": 27.38537539454488, "language_loss": 0.82577485, "learning_rate": 1.3393228635642717e-07, "loss": 0.84043217, "num_input_tokens_seen": 317892830, "router_z_loss_clip": 2.16894531, "router_z_loss_mlp": 0.2611084, "step": 14742, "time_per_iteration": 2.628241777420044 }, { "auxiliary_loss_clip": 0.012238, "auxiliary_loss_mlp": 0.00209766, "balance_loss_clip": 1.01256871, "balance_loss_mlp": 0.18604338, "epoch": 0.8863971140838719, "flos": 25265275781760.0, "grad_norm": 27.7719153692272, "language_loss": 0.67324054, "learning_rate": 1.3379219787285733e-07, "loss": 0.68757623, "num_input_tokens_seen": 317911780, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.23718262, "step": 14743, "time_per_iteration": 2.860140562057495 }, { "auxiliary_loss_clip": 0.01257017, "auxiliary_loss_mlp": 0.00214389, "balance_loss_clip": 1.0342195, "balance_loss_mlp": 0.18661365, "epoch": 0.8864572373365399, "flos": 23404313076480.0, "grad_norm": 10.071489109421657, "language_loss": 0.67864847, "learning_rate": 1.3365218015607437e-07, "loss": 0.69336253, "num_input_tokens_seen": 317932855, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.27770996, "step": 14744, "time_per_iteration": 2.66629695892334 }, { "auxiliary_loss_clip": 0.01234161, "auxiliary_loss_mlp": 0.00218347, "balance_loss_clip": 1.01898837, "balance_loss_mlp": 0.19256178, "epoch": 0.8865173605892078, "flos": 18548759456640.0, "grad_norm": 6.586067973907986, "language_loss": 0.84048098, "learning_rate": 1.3351223321138762e-07, "loss": 0.8550061, "num_input_tokens_seen": 317952090, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.2578125, "step": 14745, "time_per_iteration": 4.181748390197754 }, { "auxiliary_loss_clip": 0.01234048, "auxiliary_loss_mlp": 0.00202109, "balance_loss_clip": 1.02069712, "balance_loss_mlp": 0.17838621, "epoch": 0.8865774838418758, "flos": 19025868833280.0, "grad_norm": 39.64515260491521, "language_loss": 0.8649857, "learning_rate": 1.3337235704410454e-07, "loss": 0.87934726, "num_input_tokens_seen": 317970370, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23718262, "step": 14746, "time_per_iteration": 4.153962850570679 }, { "auxiliary_loss_clip": 0.01258901, "auxiliary_loss_mlp": 0.00197276, "balance_loss_clip": 1.03134692, "balance_loss_mlp": 0.17121691, "epoch": 0.8866376070945439, "flos": 22163168482560.0, "grad_norm": 3.922959358141624, "language_loss": 0.85235119, "learning_rate": 1.3323255165952873e-07, "loss": 0.86691296, "num_input_tokens_seen": 317989125, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.26062012, "step": 14747, "time_per_iteration": 2.6865029335021973 }, { "auxiliary_loss_clip": 0.01228439, "auxiliary_loss_mlp": 0.00240471, "balance_loss_clip": 1.01500547, "balance_loss_mlp": 0.21579504, "epoch": 0.8866977303472118, "flos": 20704261685760.0, "grad_norm": 11.243385949710184, "language_loss": 0.90725338, "learning_rate": 1.3309281706296127e-07, "loss": 0.92194253, "num_input_tokens_seen": 318007820, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.24694824, "step": 14748, "time_per_iteration": 2.6466941833496094 }, { "auxiliary_loss_clip": 0.01248031, "auxiliary_loss_mlp": 0.00206547, "balance_loss_clip": 1.02928936, "balance_loss_mlp": 0.18178675, "epoch": 0.8867578535998798, "flos": 48794448533760.0, "grad_norm": 15.32179602818951, "language_loss": 0.83754945, "learning_rate": 1.3295315325970148e-07, "loss": 0.85209513, "num_input_tokens_seen": 318030435, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.2479248, "step": 14749, "time_per_iteration": 2.9730517864227295 }, { "auxiliary_loss_clip": 0.01252489, "auxiliary_loss_mlp": 0.00201429, "balance_loss_clip": 1.02972555, "balance_loss_mlp": 0.17476191, "epoch": 0.8868179768525477, "flos": 21105312013440.0, "grad_norm": 4.920100825621436, "language_loss": 0.79544407, "learning_rate": 1.328135602550451e-07, "loss": 0.80998325, "num_input_tokens_seen": 318049465, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.2668457, "step": 14750, "time_per_iteration": 2.65580153465271 }, { "auxiliary_loss_clip": 0.01237761, "auxiliary_loss_mlp": 0.00215572, "balance_loss_clip": 1.02317929, "balance_loss_mlp": 0.19224297, "epoch": 0.8868781001052157, "flos": 21830922656640.0, "grad_norm": 10.802415020196346, "language_loss": 0.6825816, "learning_rate": 1.3267403805428546e-07, "loss": 0.69711494, "num_input_tokens_seen": 318067760, "router_z_loss_clip": 2.14550781, "router_z_loss_mlp": 0.23339844, "step": 14751, "time_per_iteration": 2.773310899734497 }, { "auxiliary_loss_clip": 0.01251682, "auxiliary_loss_mlp": 0.00217684, "balance_loss_clip": 1.03348362, "balance_loss_mlp": 0.19247118, "epoch": 0.8869382233578836, "flos": 13516418073600.0, "grad_norm": 23.16617959842969, "language_loss": 0.90325141, "learning_rate": 1.3253458666271344e-07, "loss": 0.91794509, "num_input_tokens_seen": 318082785, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.2520752, "step": 14752, "time_per_iteration": 4.029150485992432 }, { "auxiliary_loss_clip": 0.01252162, "auxiliary_loss_mlp": 0.00223627, "balance_loss_clip": 1.02629507, "balance_loss_mlp": 0.19598264, "epoch": 0.8869983466105517, "flos": 22704988210560.0, "grad_norm": 17.728027801765545, "language_loss": 0.87223834, "learning_rate": 1.3239520608561793e-07, "loss": 0.88699621, "num_input_tokens_seen": 318101925, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.27648926, "step": 14753, "time_per_iteration": 2.6618664264678955 }, { "auxiliary_loss_clip": 0.01252566, "auxiliary_loss_mlp": 0.00213285, "balance_loss_clip": 1.03252077, "balance_loss_mlp": 0.18841785, "epoch": 0.8870584698632196, "flos": 15340751884800.0, "grad_norm": 214.27015660362002, "language_loss": 0.7496686, "learning_rate": 1.3225589632828248e-07, "loss": 0.76432705, "num_input_tokens_seen": 318119945, "router_z_loss_clip": 2.20019531, "router_z_loss_mlp": 0.2487793, "step": 14754, "time_per_iteration": 2.6466569900512695 }, { "auxiliary_loss_clip": 0.0124995, "auxiliary_loss_mlp": 0.00207426, "balance_loss_clip": 1.03139591, "balance_loss_mlp": 0.18300045, "epoch": 0.8871185931158876, "flos": 26615624699520.0, "grad_norm": 18.597687385218368, "language_loss": 0.84435606, "learning_rate": 1.3211665739599065e-07, "loss": 0.85892987, "num_input_tokens_seen": 318139685, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24438477, "step": 14755, "time_per_iteration": 2.6790030002593994 }, { "auxiliary_loss_clip": 0.01238464, "auxiliary_loss_mlp": 0.00216439, "balance_loss_clip": 1.02015734, "balance_loss_mlp": 0.18911585, "epoch": 0.8871787163685555, "flos": 21799034357760.0, "grad_norm": 8.841719774390247, "language_loss": 0.83539402, "learning_rate": 1.3197748929402262e-07, "loss": 0.8499431, "num_input_tokens_seen": 318160375, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.2734375, "step": 14756, "time_per_iteration": 2.707352876663208 }, { "auxiliary_loss_clip": 0.01244117, "auxiliary_loss_mlp": 0.00218655, "balance_loss_clip": 1.02428746, "balance_loss_mlp": 0.19382419, "epoch": 0.8872388396212235, "flos": 14902964922240.0, "grad_norm": 18.19898344030165, "language_loss": 0.85407734, "learning_rate": 1.3183839202765535e-07, "loss": 0.86870503, "num_input_tokens_seen": 318177995, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.24816895, "step": 14757, "time_per_iteration": 4.07853889465332 }, { "auxiliary_loss_clip": 0.01221029, "auxiliary_loss_mlp": 0.0020661, "balance_loss_clip": 1.00837159, "balance_loss_mlp": 0.18337612, "epoch": 0.8872989628738914, "flos": 26432157006720.0, "grad_norm": 6.982756854302053, "language_loss": 0.75985229, "learning_rate": 1.316993656021632e-07, "loss": 0.77412868, "num_input_tokens_seen": 318197030, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.23254395, "step": 14758, "time_per_iteration": 2.8234407901763916 }, { "auxiliary_loss_clip": 0.01260079, "auxiliary_loss_mlp": 0.00225863, "balance_loss_clip": 1.04188788, "balance_loss_mlp": 0.20001872, "epoch": 0.8873590861265594, "flos": 48142562555520.0, "grad_norm": 73.20494585873374, "language_loss": 0.76197946, "learning_rate": 1.3156041002281915e-07, "loss": 0.7768389, "num_input_tokens_seen": 318221780, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25817871, "step": 14759, "time_per_iteration": 2.9613053798675537 }, { "auxiliary_loss_clip": 0.01216966, "auxiliary_loss_mlp": 0.00211678, "balance_loss_clip": 1.00351095, "balance_loss_mlp": 0.18782456, "epoch": 0.8874192093792275, "flos": 18332972501760.0, "grad_norm": 4.934560025686785, "language_loss": 0.83832526, "learning_rate": 1.3142152529489092e-07, "loss": 0.85261172, "num_input_tokens_seen": 318239710, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23864746, "step": 14760, "time_per_iteration": 2.6534879207611084 }, { "auxiliary_loss_clip": 0.01250032, "auxiliary_loss_mlp": 0.00202303, "balance_loss_clip": 1.02884221, "balance_loss_mlp": 0.17654169, "epoch": 0.8874793326318954, "flos": 17894215872000.0, "grad_norm": 182.49121923634502, "language_loss": 0.87312514, "learning_rate": 1.3128271142364565e-07, "loss": 0.88764846, "num_input_tokens_seen": 318257425, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25769043, "step": 14761, "time_per_iteration": 2.623445749282837 }, { "auxiliary_loss_clip": 0.01224673, "auxiliary_loss_mlp": 0.00215614, "balance_loss_clip": 1.01293731, "balance_loss_mlp": 0.19242817, "epoch": 0.8875394558845634, "flos": 31102231772160.0, "grad_norm": 32.79207795417152, "language_loss": 0.71176195, "learning_rate": 1.3114396841434717e-07, "loss": 0.72616482, "num_input_tokens_seen": 318278485, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.23168945, "step": 14762, "time_per_iteration": 2.7377376556396484 }, { "auxiliary_loss_clip": 0.01237343, "auxiliary_loss_mlp": 0.00208586, "balance_loss_clip": 1.02290916, "balance_loss_mlp": 0.18427958, "epoch": 0.8875995791372313, "flos": 21142048648320.0, "grad_norm": 28.374094330900785, "language_loss": 0.74814999, "learning_rate": 1.3100529627225697e-07, "loss": 0.76260924, "num_input_tokens_seen": 318297560, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.2434082, "step": 14763, "time_per_iteration": 2.727959156036377 }, { "auxiliary_loss_clip": 0.01255143, "auxiliary_loss_mlp": 0.00204986, "balance_loss_clip": 1.03000903, "balance_loss_mlp": 0.17912953, "epoch": 0.8876597023898993, "flos": 17455136019840.0, "grad_norm": 115.8381253482719, "language_loss": 0.78874862, "learning_rate": 1.3086669500263335e-07, "loss": 0.80334997, "num_input_tokens_seen": 318313060, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.25866699, "step": 14764, "time_per_iteration": 2.6225690841674805 }, { "auxiliary_loss_clip": 0.01245192, "auxiliary_loss_mlp": 0.002189, "balance_loss_clip": 1.02342641, "balance_loss_mlp": 0.19454512, "epoch": 0.8877198256425672, "flos": 22707933125760.0, "grad_norm": 24.110255011175106, "language_loss": 0.76687437, "learning_rate": 1.3072816461073166e-07, "loss": 0.78151536, "num_input_tokens_seen": 318332030, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.24328613, "step": 14765, "time_per_iteration": 2.7009658813476562 }, { "auxiliary_loss_clip": 0.01226327, "auxiliary_loss_mlp": 0.00221086, "balance_loss_clip": 1.01518691, "balance_loss_mlp": 0.1960997, "epoch": 0.8877799488952353, "flos": 24535104111360.0, "grad_norm": 4.555663669142268, "language_loss": 0.84501046, "learning_rate": 1.3058970510180568e-07, "loss": 0.85948461, "num_input_tokens_seen": 318351090, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.24987793, "step": 14766, "time_per_iteration": 2.699216842651367 }, { "auxiliary_loss_clip": 0.01232446, "auxiliary_loss_mlp": 0.00209461, "balance_loss_clip": 1.01773584, "balance_loss_mlp": 0.18601286, "epoch": 0.8878400721479032, "flos": 20959191486720.0, "grad_norm": 79.07665593989941, "language_loss": 0.81851208, "learning_rate": 1.3045131648110496e-07, "loss": 0.83293122, "num_input_tokens_seen": 318372000, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.23474121, "step": 14767, "time_per_iteration": 2.748370409011841 }, { "auxiliary_loss_clip": 0.01225174, "auxiliary_loss_mlp": 0.00190151, "balance_loss_clip": 1.01352239, "balance_loss_mlp": 0.16716768, "epoch": 0.8879001954005712, "flos": 25295260659840.0, "grad_norm": 12.27886566546524, "language_loss": 0.79586577, "learning_rate": 1.303129987538778e-07, "loss": 0.81001902, "num_input_tokens_seen": 318391530, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.22961426, "step": 14768, "time_per_iteration": 2.6915369033813477 }, { "auxiliary_loss_clip": 0.01240405, "auxiliary_loss_mlp": 0.0019566, "balance_loss_clip": 1.02681005, "balance_loss_mlp": 0.17128217, "epoch": 0.8879603186532391, "flos": 23185329811200.0, "grad_norm": 11.141767622818953, "language_loss": 0.78676951, "learning_rate": 1.3017475192536932e-07, "loss": 0.80113024, "num_input_tokens_seen": 318410690, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24389648, "step": 14769, "time_per_iteration": 2.7202377319335938 }, { "auxiliary_loss_clip": 0.01218863, "auxiliary_loss_mlp": 0.00202504, "balance_loss_clip": 1.00854242, "balance_loss_mlp": 0.18000922, "epoch": 0.8880204419059071, "flos": 13655427707520.0, "grad_norm": 19989.14465877103, "language_loss": 0.74721152, "learning_rate": 1.3003657600082174e-07, "loss": 0.7614252, "num_input_tokens_seen": 318427380, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.22509766, "step": 14770, "time_per_iteration": 2.70035982131958 }, { "auxiliary_loss_clip": 0.01219162, "auxiliary_loss_mlp": 0.00221951, "balance_loss_clip": 1.00925684, "balance_loss_mlp": 0.19763236, "epoch": 0.888080565158575, "flos": 20631865824000.0, "grad_norm": 463.96450495673207, "language_loss": 0.72878861, "learning_rate": 1.2989847098547424e-07, "loss": 0.74319971, "num_input_tokens_seen": 318448530, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.24304199, "step": 14771, "time_per_iteration": 2.695709705352783 }, { "auxiliary_loss_clip": 0.01236276, "auxiliary_loss_mlp": 0.00227089, "balance_loss_clip": 1.02139473, "balance_loss_mlp": 0.20311588, "epoch": 0.888140688411243, "flos": 28620014411520.0, "grad_norm": 52.38675223208279, "language_loss": 0.87537813, "learning_rate": 1.2976043688456396e-07, "loss": 0.89001173, "num_input_tokens_seen": 318468655, "router_z_loss_clip": 2.15136719, "router_z_loss_mlp": 0.23986816, "step": 14772, "time_per_iteration": 2.738405466079712 }, { "auxiliary_loss_clip": 0.01216898, "auxiliary_loss_mlp": 0.00219376, "balance_loss_clip": 1.01016068, "balance_loss_mlp": 0.19607024, "epoch": 0.8882008116639111, "flos": 25520241496320.0, "grad_norm": 4.1739591762975685, "language_loss": 0.82001591, "learning_rate": 1.296224737033258e-07, "loss": 0.8343786, "num_input_tokens_seen": 318488740, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.23303223, "step": 14773, "time_per_iteration": 2.6990530490875244 }, { "auxiliary_loss_clip": 0.01237028, "auxiliary_loss_mlp": 0.0019712, "balance_loss_clip": 1.02510619, "balance_loss_mlp": 0.17442302, "epoch": 0.888260934916579, "flos": 27673696650240.0, "grad_norm": 681.7323883765392, "language_loss": 0.82029927, "learning_rate": 1.294845814469907e-07, "loss": 0.83464074, "num_input_tokens_seen": 318508810, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.2265625, "step": 14774, "time_per_iteration": 2.729085922241211 }, { "auxiliary_loss_clip": 0.01254208, "auxiliary_loss_mlp": 0.00225846, "balance_loss_clip": 1.03203201, "balance_loss_mlp": 0.19959661, "epoch": 0.888321058169247, "flos": 21611077464960.0, "grad_norm": 9.49057400519304, "language_loss": 0.84317327, "learning_rate": 1.2934676012078783e-07, "loss": 0.85797381, "num_input_tokens_seen": 318526860, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26245117, "step": 14775, "time_per_iteration": 2.6381781101226807 }, { "auxiliary_loss_clip": 0.01231492, "auxiliary_loss_mlp": 0.0019848, "balance_loss_clip": 1.01850867, "balance_loss_mlp": 0.17503199, "epoch": 0.8883811814219149, "flos": 18149109759360.0, "grad_norm": 19.47064955627887, "language_loss": 0.87001908, "learning_rate": 1.292090097299432e-07, "loss": 0.88431883, "num_input_tokens_seen": 318545180, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.23425293, "step": 14776, "time_per_iteration": 2.643582582473755 }, { "auxiliary_loss_clip": 0.0125934, "auxiliary_loss_mlp": 0.00214253, "balance_loss_clip": 1.03377032, "balance_loss_mlp": 0.1888133, "epoch": 0.8884413046745829, "flos": 28324648874880.0, "grad_norm": 14.786214627757134, "language_loss": 0.7939924, "learning_rate": 1.290713302796802e-07, "loss": 0.80872834, "num_input_tokens_seen": 318564350, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.25463867, "step": 14777, "time_per_iteration": 2.711566686630249 }, { "auxiliary_loss_clip": 0.01226045, "auxiliary_loss_mlp": 0.00201912, "balance_loss_clip": 1.01427567, "balance_loss_mlp": 0.17812979, "epoch": 0.8885014279272508, "flos": 15158756649600.0, "grad_norm": 6.87632635900233, "language_loss": 0.79748499, "learning_rate": 1.2893372177522e-07, "loss": 0.8117646, "num_input_tokens_seen": 318582275, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.23791504, "step": 14778, "time_per_iteration": 2.6354029178619385 }, { "auxiliary_loss_clip": 0.01239512, "auxiliary_loss_mlp": 0.00214121, "balance_loss_clip": 1.02278554, "balance_loss_mlp": 0.19027874, "epoch": 0.8885615511799189, "flos": 19099593498240.0, "grad_norm": 43.0590915770348, "language_loss": 0.83736879, "learning_rate": 1.287961842217804e-07, "loss": 0.85190517, "num_input_tokens_seen": 318601230, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23828125, "step": 14779, "time_per_iteration": 2.639014959335327 }, { "auxiliary_loss_clip": 0.01125381, "auxiliary_loss_mlp": 0.00158438, "balance_loss_clip": 0.98438537, "balance_loss_mlp": 0.15018876, "epoch": 0.8886216744325868, "flos": 51186567605760.0, "grad_norm": 0.8466552501689502, "language_loss": 0.55715346, "learning_rate": 1.2865871762457747e-07, "loss": 0.56999165, "num_input_tokens_seen": 318645595, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.08251953, "step": 14780, "time_per_iteration": 2.971055507659912 }, { "auxiliary_loss_clip": 0.0112768, "auxiliary_loss_mlp": 0.00080199, "balance_loss_clip": 0.98505974, "balance_loss_mlp": 0.07366651, "epoch": 0.8886817976852548, "flos": 61612981263360.0, "grad_norm": 0.7735029368922866, "language_loss": 0.61341631, "learning_rate": 1.2852132198882326e-07, "loss": 0.62549508, "num_input_tokens_seen": 318707850, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.06542969, "step": 14781, "time_per_iteration": 3.202974557876587 }, { "auxiliary_loss_clip": 0.01122802, "auxiliary_loss_mlp": 0.00179859, "balance_loss_clip": 0.98479784, "balance_loss_mlp": 0.17060831, "epoch": 0.8887419209379227, "flos": 60646946935680.0, "grad_norm": 0.7798984368725708, "language_loss": 0.57402527, "learning_rate": 1.2838399731972805e-07, "loss": 0.58705187, "num_input_tokens_seen": 318764915, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09228516, "step": 14782, "time_per_iteration": 2.9798190593719482 }, { "auxiliary_loss_clip": 0.01219296, "auxiliary_loss_mlp": 0.00222758, "balance_loss_clip": 1.01412058, "balance_loss_mlp": 0.19880924, "epoch": 0.8888020441905907, "flos": 29205861235200.0, "grad_norm": 28.947117193984937, "language_loss": 0.72636372, "learning_rate": 1.2824674362249922e-07, "loss": 0.74078423, "num_input_tokens_seen": 318785660, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.23950195, "step": 14783, "time_per_iteration": 2.800751209259033 }, { "auxiliary_loss_clip": 0.01256312, "auxiliary_loss_mlp": 0.0022295, "balance_loss_clip": 1.03535509, "balance_loss_mlp": 0.19878623, "epoch": 0.8888621674432586, "flos": 22162701605760.0, "grad_norm": 46.33082866041383, "language_loss": 0.84973061, "learning_rate": 1.281095609023415e-07, "loss": 0.86452323, "num_input_tokens_seen": 318806080, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24145508, "step": 14784, "time_per_iteration": 2.663804769515991 }, { "auxiliary_loss_clip": 0.01272192, "auxiliary_loss_mlp": 0.00219273, "balance_loss_clip": 1.04370308, "balance_loss_mlp": 0.19359508, "epoch": 0.8889222906959267, "flos": 27672834723840.0, "grad_norm": 185.7006610821539, "language_loss": 0.71530616, "learning_rate": 1.279724491644565e-07, "loss": 0.73022079, "num_input_tokens_seen": 318826445, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.2565918, "step": 14785, "time_per_iteration": 2.7417757511138916 }, { "auxiliary_loss_clip": 0.01250592, "auxiliary_loss_mlp": 0.00185635, "balance_loss_clip": 1.03232384, "balance_loss_mlp": 0.16049415, "epoch": 0.8889824139485947, "flos": 14168627274240.0, "grad_norm": 36.21589165396545, "language_loss": 0.72570443, "learning_rate": 1.278354084140445e-07, "loss": 0.74006677, "num_input_tokens_seen": 318843915, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25146484, "step": 14786, "time_per_iteration": 2.5951390266418457 }, { "auxiliary_loss_clip": 0.01251956, "auxiliary_loss_mlp": 0.00210768, "balance_loss_clip": 1.02966595, "balance_loss_mlp": 0.18529284, "epoch": 0.8890425372012626, "flos": 12853003829760.0, "grad_norm": 110.93755951110654, "language_loss": 0.94508791, "learning_rate": 1.276984386563009e-07, "loss": 0.95971519, "num_input_tokens_seen": 318859670, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25476074, "step": 14787, "time_per_iteration": 2.7349014282226562 }, { "auxiliary_loss_clip": 0.01259026, "auxiliary_loss_mlp": 0.00216646, "balance_loss_clip": 1.03612363, "balance_loss_mlp": 0.19038454, "epoch": 0.8891026604539306, "flos": 21689291329920.0, "grad_norm": 7.674388644363673, "language_loss": 0.8274104, "learning_rate": 1.2756153989642027e-07, "loss": 0.84216714, "num_input_tokens_seen": 318877855, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.26293945, "step": 14788, "time_per_iteration": 4.149017810821533 }, { "auxiliary_loss_clip": 0.01222195, "auxiliary_loss_mlp": 0.00204446, "balance_loss_clip": 1.00908387, "balance_loss_mlp": 0.18053274, "epoch": 0.8891627837065985, "flos": 21871430219520.0, "grad_norm": 226.03722063454896, "language_loss": 0.76243305, "learning_rate": 1.274247121395935e-07, "loss": 0.77669954, "num_input_tokens_seen": 318896045, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23913574, "step": 14789, "time_per_iteration": 4.26997971534729 }, { "auxiliary_loss_clip": 0.01247561, "auxiliary_loss_mlp": 0.00224437, "balance_loss_clip": 1.02744579, "balance_loss_mlp": 0.19879508, "epoch": 0.8892229069592665, "flos": 21580230660480.0, "grad_norm": 28.73973978282685, "language_loss": 0.78513843, "learning_rate": 1.2728795539100956e-07, "loss": 0.79985845, "num_input_tokens_seen": 318915515, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25622559, "step": 14790, "time_per_iteration": 2.650146007537842 }, { "auxiliary_loss_clip": 0.01233262, "auxiliary_loss_mlp": 0.00210409, "balance_loss_clip": 1.02166188, "balance_loss_mlp": 0.18606693, "epoch": 0.8892830302119344, "flos": 23075981832960.0, "grad_norm": 117.5096818322478, "language_loss": 0.79814732, "learning_rate": 1.2715126965585387e-07, "loss": 0.81258404, "num_input_tokens_seen": 318934305, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.24353027, "step": 14791, "time_per_iteration": 2.654984474182129 }, { "auxiliary_loss_clip": 0.01228967, "auxiliary_loss_mlp": 0.00212495, "balance_loss_clip": 1.0181551, "balance_loss_mlp": 0.18926108, "epoch": 0.8893431534646025, "flos": 23072139077760.0, "grad_norm": 2.918491780417806, "language_loss": 0.82131052, "learning_rate": 1.2701465493931008e-07, "loss": 0.83572513, "num_input_tokens_seen": 318953880, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.23242188, "step": 14792, "time_per_iteration": 2.65254545211792 }, { "auxiliary_loss_clip": 0.01258557, "auxiliary_loss_mlp": 0.00217442, "balance_loss_clip": 1.03378868, "balance_loss_mlp": 0.19262311, "epoch": 0.8894032767172704, "flos": 22454978572800.0, "grad_norm": 93.5160236475789, "language_loss": 0.76350605, "learning_rate": 1.2687811124655801e-07, "loss": 0.77826607, "num_input_tokens_seen": 318971395, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.24816895, "step": 14793, "time_per_iteration": 2.700469970703125 }, { "auxiliary_loss_clip": 0.01245714, "auxiliary_loss_mlp": 0.00213111, "balance_loss_clip": 1.02591848, "balance_loss_mlp": 0.18659928, "epoch": 0.8894633999699384, "flos": 25338246261120.0, "grad_norm": 9.87267713560225, "language_loss": 0.7969892, "learning_rate": 1.2674163858277552e-07, "loss": 0.81157744, "num_input_tokens_seen": 318990580, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26538086, "step": 14794, "time_per_iteration": 4.128849267959595 }, { "auxiliary_loss_clip": 0.01267707, "auxiliary_loss_mlp": 0.00221531, "balance_loss_clip": 1.04281092, "balance_loss_mlp": 0.19512579, "epoch": 0.8895235232226063, "flos": 20994096528000.0, "grad_norm": 19.064622345903363, "language_loss": 0.80703443, "learning_rate": 1.2660523695313785e-07, "loss": 0.82192683, "num_input_tokens_seen": 319010040, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.26416016, "step": 14795, "time_per_iteration": 2.678644895553589 }, { "auxiliary_loss_clip": 0.01124133, "auxiliary_loss_mlp": 0.00140264, "balance_loss_clip": 0.98415107, "balance_loss_mlp": 0.13330229, "epoch": 0.8895836464752743, "flos": 69732956764800.0, "grad_norm": 0.9491654958233726, "language_loss": 0.55576581, "learning_rate": 1.2646890636281727e-07, "loss": 0.56840974, "num_input_tokens_seen": 319063860, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.06982422, "step": 14796, "time_per_iteration": 3.1199796199798584 }, { "auxiliary_loss_clip": 0.0125792, "auxiliary_loss_mlp": 0.00216331, "balance_loss_clip": 1.03164339, "balance_loss_mlp": 0.18952067, "epoch": 0.8896437697279422, "flos": 23221815050880.0, "grad_norm": 24.179088519443376, "language_loss": 0.79388213, "learning_rate": 1.263326468169843e-07, "loss": 0.80862468, "num_input_tokens_seen": 319082335, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.26831055, "step": 14797, "time_per_iteration": 2.679567575454712 }, { "auxiliary_loss_clip": 0.01123606, "auxiliary_loss_mlp": 0.00099335, "balance_loss_clip": 0.98241985, "balance_loss_mlp": 0.09175347, "epoch": 0.8897038929806103, "flos": 70752711882240.0, "grad_norm": 0.7197303182924487, "language_loss": 0.57121325, "learning_rate": 1.2619645832080417e-07, "loss": 0.58344269, "num_input_tokens_seen": 319147075, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.07568359, "step": 14798, "time_per_iteration": 3.1688547134399414 }, { "auxiliary_loss_clip": 0.0124992, "auxiliary_loss_mlp": 0.00219303, "balance_loss_clip": 1.02669811, "balance_loss_mlp": 0.19540203, "epoch": 0.8897640162332782, "flos": 19245103493760.0, "grad_norm": 85.31840614827937, "language_loss": 0.86229837, "learning_rate": 1.2606034087944251e-07, "loss": 0.87699062, "num_input_tokens_seen": 319166630, "router_z_loss_clip": 2.23339844, "router_z_loss_mlp": 0.23913574, "step": 14799, "time_per_iteration": 2.6265149116516113 }, { "auxiliary_loss_clip": 0.01132845, "auxiliary_loss_mlp": 0.00118479, "balance_loss_clip": 0.99042368, "balance_loss_mlp": 0.11123097, "epoch": 0.8898241394859462, "flos": 41356275039360.0, "grad_norm": 0.8676691905921, "language_loss": 0.57487869, "learning_rate": 1.2592429449806053e-07, "loss": 0.58739191, "num_input_tokens_seen": 319221865, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.07226562, "step": 14800, "time_per_iteration": 4.4376749992370605 }, { "auxiliary_loss_clip": 0.01237079, "auxiliary_loss_mlp": 0.00218917, "balance_loss_clip": 1.02419186, "balance_loss_mlp": 0.19614792, "epoch": 0.8898842627386142, "flos": 18986295024000.0, "grad_norm": 9.551569550010948, "language_loss": 0.74067724, "learning_rate": 1.2578831918181698e-07, "loss": 0.75523722, "num_input_tokens_seen": 319240710, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.2277832, "step": 14801, "time_per_iteration": 2.6785690784454346 }, { "auxiliary_loss_clip": 0.0126925, "auxiliary_loss_mlp": 0.00227267, "balance_loss_clip": 1.03896523, "balance_loss_mlp": 0.20147052, "epoch": 0.8899443859912821, "flos": 13217173868160.0, "grad_norm": 9.24054606211364, "language_loss": 0.84493649, "learning_rate": 1.256524149358682e-07, "loss": 0.85990167, "num_input_tokens_seen": 319256495, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.25817871, "step": 14802, "time_per_iteration": 2.6918041706085205 }, { "auxiliary_loss_clip": 0.01247535, "auxiliary_loss_mlp": 0.00219812, "balance_loss_clip": 1.02889109, "balance_loss_mlp": 0.19481392, "epoch": 0.8900045092439501, "flos": 22674680110080.0, "grad_norm": 76.71293541428848, "language_loss": 0.80280274, "learning_rate": 1.2551658176536805e-07, "loss": 0.81747627, "num_input_tokens_seen": 319273620, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25, "step": 14803, "time_per_iteration": 2.6857666969299316 }, { "auxiliary_loss_clip": 0.01241617, "auxiliary_loss_mlp": 0.00218198, "balance_loss_clip": 1.02168894, "balance_loss_mlp": 0.1927114, "epoch": 0.890064632496618, "flos": 21141617685120.0, "grad_norm": 6.649836246285387, "language_loss": 0.81687403, "learning_rate": 1.2538081967546664e-07, "loss": 0.83147216, "num_input_tokens_seen": 319291720, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.25488281, "step": 14804, "time_per_iteration": 2.677724599838257 }, { "auxiliary_loss_clip": 0.01232283, "auxiliary_loss_mlp": 0.00213891, "balance_loss_clip": 1.01835513, "balance_loss_mlp": 0.19029987, "epoch": 0.8901247557492861, "flos": 23397058529280.0, "grad_norm": 166.05009596566248, "language_loss": 0.89395332, "learning_rate": 1.252451286713123e-07, "loss": 0.90841508, "num_input_tokens_seen": 319310380, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.23596191, "step": 14805, "time_per_iteration": 2.6983046531677246 }, { "auxiliary_loss_clip": 0.01268964, "auxiliary_loss_mlp": 0.00200886, "balance_loss_clip": 1.03953671, "balance_loss_mlp": 0.1737659, "epoch": 0.890184879001954, "flos": 29169591477120.0, "grad_norm": 51.99318907439467, "language_loss": 0.75778282, "learning_rate": 1.251095087580505e-07, "loss": 0.77248138, "num_input_tokens_seen": 319331765, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.27148438, "step": 14806, "time_per_iteration": 2.739058017730713 }, { "auxiliary_loss_clip": 0.01247624, "auxiliary_loss_mlp": 0.00205453, "balance_loss_clip": 1.027246, "balance_loss_mlp": 0.17911951, "epoch": 0.890245002254622, "flos": 14427830793600.0, "grad_norm": 68.02772651196445, "language_loss": 0.77944112, "learning_rate": 1.2497395994082438e-07, "loss": 0.7939719, "num_input_tokens_seen": 319349135, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.26330566, "step": 14807, "time_per_iteration": 2.7281761169433594 }, { "auxiliary_loss_clip": 0.01223408, "auxiliary_loss_mlp": 0.00202035, "balance_loss_clip": 1.01330471, "balance_loss_mlp": 0.1789327, "epoch": 0.8903051255072899, "flos": 22382187661440.0, "grad_norm": 7.658479076201395, "language_loss": 0.82582355, "learning_rate": 1.248384822247732e-07, "loss": 0.840078, "num_input_tokens_seen": 319368410, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.2310791, "step": 14808, "time_per_iteration": 2.7291417121887207 }, { "auxiliary_loss_clip": 0.01230668, "auxiliary_loss_mlp": 0.00218341, "balance_loss_clip": 1.01641273, "balance_loss_mlp": 0.19345005, "epoch": 0.8903652487599579, "flos": 20777375819520.0, "grad_norm": 2.072279906844782, "language_loss": 0.89640951, "learning_rate": 1.2470307561503513e-07, "loss": 0.91089964, "num_input_tokens_seen": 319387535, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.24890137, "step": 14809, "time_per_iteration": 2.716245412826538 }, { "auxiliary_loss_clip": 0.01240685, "auxiliary_loss_mlp": 0.00220367, "balance_loss_clip": 1.01976418, "balance_loss_mlp": 0.19430819, "epoch": 0.8904253720126258, "flos": 24424499157120.0, "grad_norm": 6.173717875626312, "language_loss": 0.77740586, "learning_rate": 1.2456774011674442e-07, "loss": 0.79201639, "num_input_tokens_seen": 319407210, "router_z_loss_clip": 2.20605469, "router_z_loss_mlp": 0.26086426, "step": 14810, "time_per_iteration": 2.8604414463043213 }, { "auxiliary_loss_clip": 0.01242735, "auxiliary_loss_mlp": 0.0020587, "balance_loss_clip": 1.01845574, "balance_loss_mlp": 0.18147957, "epoch": 0.8904854952652939, "flos": 19463871277440.0, "grad_norm": 8.62194124345209, "language_loss": 0.8095479, "learning_rate": 1.2443247573503257e-07, "loss": 0.82403392, "num_input_tokens_seen": 319425340, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.24377441, "step": 14811, "time_per_iteration": 2.626422643661499 }, { "auxiliary_loss_clip": 0.01254396, "auxiliary_loss_mlp": 0.0021714, "balance_loss_clip": 1.03248501, "balance_loss_mlp": 0.19308399, "epoch": 0.8905456185179618, "flos": 50800741666560.0, "grad_norm": 287.5271858341366, "language_loss": 0.73163366, "learning_rate": 1.2429728247502924e-07, "loss": 0.74634898, "num_input_tokens_seen": 319448150, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24072266, "step": 14812, "time_per_iteration": 2.955007314682007 }, { "auxiliary_loss_clip": 0.01239499, "auxiliary_loss_mlp": 0.00213644, "balance_loss_clip": 1.02467608, "balance_loss_mlp": 0.19061311, "epoch": 0.8906057417706298, "flos": 17784867893760.0, "grad_norm": 5.91270169639237, "language_loss": 0.76715583, "learning_rate": 1.24162160341861e-07, "loss": 0.78168726, "num_input_tokens_seen": 319466115, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.23059082, "step": 14813, "time_per_iteration": 2.608144998550415 }, { "auxiliary_loss_clip": 0.01280395, "auxiliary_loss_mlp": 0.00246547, "balance_loss_clip": 1.05211663, "balance_loss_mlp": 0.21774645, "epoch": 0.8906658650232978, "flos": 21944867575680.0, "grad_norm": 1610.5879510496973, "language_loss": 0.85230052, "learning_rate": 1.2402710934065198e-07, "loss": 0.86756992, "num_input_tokens_seen": 319485255, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.28808594, "step": 14814, "time_per_iteration": 2.6677498817443848 }, { "auxiliary_loss_clip": 0.01252038, "auxiliary_loss_mlp": 0.00238602, "balance_loss_clip": 1.02986407, "balance_loss_mlp": 0.2116611, "epoch": 0.8907259882759657, "flos": 21287810039040.0, "grad_norm": 30.926198115583258, "language_loss": 0.81736851, "learning_rate": 1.2389212947652229e-07, "loss": 0.83227491, "num_input_tokens_seen": 319501800, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.26965332, "step": 14815, "time_per_iteration": 2.6490190029144287 }, { "auxiliary_loss_clip": 0.01225417, "auxiliary_loss_mlp": 0.00183965, "balance_loss_clip": 1.01701987, "balance_loss_mlp": 0.16265076, "epoch": 0.8907861115286337, "flos": 20120426023680.0, "grad_norm": 9.781992650451725, "language_loss": 0.83325088, "learning_rate": 1.237572207545914e-07, "loss": 0.84734476, "num_input_tokens_seen": 319520415, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.21313477, "step": 14816, "time_per_iteration": 2.6856508255004883 }, { "auxiliary_loss_clip": 0.01254989, "auxiliary_loss_mlp": 0.0020033, "balance_loss_clip": 1.02934718, "balance_loss_mlp": 0.17537947, "epoch": 0.8908462347813016, "flos": 20084156265600.0, "grad_norm": 86.79647985338453, "language_loss": 0.85851753, "learning_rate": 1.2362238317997476e-07, "loss": 0.87307072, "num_input_tokens_seen": 319538410, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.24975586, "step": 14817, "time_per_iteration": 2.6623480319976807 }, { "auxiliary_loss_clip": 0.01114056, "auxiliary_loss_mlp": 0.0006722, "balance_loss_clip": 0.97441614, "balance_loss_mlp": 0.06130673, "epoch": 0.8909063580339697, "flos": 65503649790720.0, "grad_norm": 0.7265400415956129, "language_loss": 0.5625028, "learning_rate": 1.2348761675778517e-07, "loss": 0.57431555, "num_input_tokens_seen": 319602565, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.05908203, "step": 14818, "time_per_iteration": 3.201524019241333 }, { "auxiliary_loss_clip": 0.01244608, "auxiliary_loss_mlp": 0.00235171, "balance_loss_clip": 1.02713323, "balance_loss_mlp": 0.21037576, "epoch": 0.8909664812866376, "flos": 29863062426240.0, "grad_norm": 4.916925331198541, "language_loss": 0.72850049, "learning_rate": 1.2335292149313325e-07, "loss": 0.74329829, "num_input_tokens_seen": 319624645, "router_z_loss_clip": 2.17675781, "router_z_loss_mlp": 0.24816895, "step": 14819, "time_per_iteration": 2.7588136196136475 }, { "auxiliary_loss_clip": 0.01253094, "auxiliary_loss_mlp": 0.0021228, "balance_loss_clip": 1.03090167, "balance_loss_mlp": 0.18750793, "epoch": 0.8910266045393056, "flos": 25447127362560.0, "grad_norm": 5.18757025518573, "language_loss": 0.8735764, "learning_rate": 1.2321829739112731e-07, "loss": 0.88823009, "num_input_tokens_seen": 319644040, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.24768066, "step": 14820, "time_per_iteration": 2.701744556427002 }, { "auxiliary_loss_clip": 0.01257081, "auxiliary_loss_mlp": 0.00224542, "balance_loss_clip": 1.03552222, "balance_loss_mlp": 0.20130767, "epoch": 0.8910867277919735, "flos": 24499121662080.0, "grad_norm": 256.891619464761, "language_loss": 0.82331216, "learning_rate": 1.2308374445687087e-07, "loss": 0.83812833, "num_input_tokens_seen": 319663930, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.23217773, "step": 14821, "time_per_iteration": 2.818347454071045 }, { "auxiliary_loss_clip": 0.01120462, "auxiliary_loss_mlp": 0.00090126, "balance_loss_clip": 0.97976369, "balance_loss_mlp": 0.08230567, "epoch": 0.8911468510446415, "flos": 60688136856960.0, "grad_norm": 0.7873941092082194, "language_loss": 0.5886057, "learning_rate": 1.2294926269546712e-07, "loss": 0.60071158, "num_input_tokens_seen": 319721245, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.078125, "step": 14822, "time_per_iteration": 3.0373570919036865 }, { "auxiliary_loss_clip": 0.01237797, "auxiliary_loss_mlp": 0.0021959, "balance_loss_clip": 1.02145052, "balance_loss_mlp": 0.1937688, "epoch": 0.8912069742973094, "flos": 25337492075520.0, "grad_norm": 14.60635346939878, "language_loss": 0.7886734, "learning_rate": 1.2281485211201515e-07, "loss": 0.80324721, "num_input_tokens_seen": 319741200, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25830078, "step": 14823, "time_per_iteration": 2.750028371810913 }, { "auxiliary_loss_clip": 0.01232151, "auxiliary_loss_mlp": 0.00204127, "balance_loss_clip": 1.0218246, "balance_loss_mlp": 0.17936744, "epoch": 0.8912670975499775, "flos": 18223516782720.0, "grad_norm": 6.325837786881918, "language_loss": 0.78734505, "learning_rate": 1.2268051271161262e-07, "loss": 0.8017078, "num_input_tokens_seen": 319759265, "router_z_loss_clip": 2.10253906, "router_z_loss_mlp": 0.24743652, "step": 14824, "time_per_iteration": 2.658195972442627 }, { "auxiliary_loss_clip": 0.0125162, "auxiliary_loss_mlp": 0.00208858, "balance_loss_clip": 1.02940893, "balance_loss_mlp": 0.18370491, "epoch": 0.8913272208026454, "flos": 26504481041280.0, "grad_norm": 5.528170510190306, "language_loss": 0.79716051, "learning_rate": 1.2254624449935303e-07, "loss": 0.81176525, "num_input_tokens_seen": 319777560, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.2512207, "step": 14825, "time_per_iteration": 2.6786162853240967 }, { "auxiliary_loss_clip": 0.01231714, "auxiliary_loss_mlp": 0.00211441, "balance_loss_clip": 1.02007818, "balance_loss_mlp": 0.1872535, "epoch": 0.8913873440553134, "flos": 18802324540800.0, "grad_norm": 39.61555353336122, "language_loss": 0.7962687, "learning_rate": 1.2241204748032786e-07, "loss": 0.8107003, "num_input_tokens_seen": 319794125, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.24194336, "step": 14826, "time_per_iteration": 2.6381447315216064 }, { "auxiliary_loss_clip": 0.01244597, "auxiliary_loss_mlp": 0.00220881, "balance_loss_clip": 1.02837753, "balance_loss_mlp": 0.19643164, "epoch": 0.8914474673079814, "flos": 20884892204160.0, "grad_norm": 37.614104551475776, "language_loss": 0.83316326, "learning_rate": 1.2227792165962615e-07, "loss": 0.84781802, "num_input_tokens_seen": 319810310, "router_z_loss_clip": 2.15917969, "router_z_loss_mlp": 0.24450684, "step": 14827, "time_per_iteration": 2.644540786743164 }, { "auxiliary_loss_clip": 0.01234176, "auxiliary_loss_mlp": 0.00218427, "balance_loss_clip": 1.0165751, "balance_loss_mlp": 0.193977, "epoch": 0.8915075905606493, "flos": 20952439729920.0, "grad_norm": 12.613404511314892, "language_loss": 0.85591078, "learning_rate": 1.221438670423336e-07, "loss": 0.87043679, "num_input_tokens_seen": 319828505, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24487305, "step": 14828, "time_per_iteration": 2.6872520446777344 }, { "auxiliary_loss_clip": 0.01235416, "auxiliary_loss_mlp": 0.00231833, "balance_loss_clip": 1.02008867, "balance_loss_mlp": 0.2072404, "epoch": 0.8915677138133173, "flos": 23076305055360.0, "grad_norm": 12.50503033540821, "language_loss": 0.83322352, "learning_rate": 1.2200988363353392e-07, "loss": 0.84789598, "num_input_tokens_seen": 319848680, "router_z_loss_clip": 2.15332031, "router_z_loss_mlp": 0.24572754, "step": 14829, "time_per_iteration": 2.6504921913146973 }, { "auxiliary_loss_clip": 0.01249732, "auxiliary_loss_mlp": 0.00218052, "balance_loss_clip": 1.03211641, "balance_loss_mlp": 0.19266048, "epoch": 0.8916278370659853, "flos": 23440259612160.0, "grad_norm": 53.89431160423309, "language_loss": 0.90450025, "learning_rate": 1.2187597143830773e-07, "loss": 0.91917801, "num_input_tokens_seen": 319868835, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25402832, "step": 14830, "time_per_iteration": 4.0738701820373535 }, { "auxiliary_loss_clip": 0.01222595, "auxiliary_loss_mlp": 0.00203891, "balance_loss_clip": 1.01375258, "balance_loss_mlp": 0.17977522, "epoch": 0.8916879603186533, "flos": 25160488830720.0, "grad_norm": 6.3285340902086835, "language_loss": 0.82234514, "learning_rate": 1.2174213046173299e-07, "loss": 0.83661002, "num_input_tokens_seen": 319891585, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.24133301, "step": 14831, "time_per_iteration": 4.372467756271362 }, { "auxiliary_loss_clip": 0.01245391, "auxiliary_loss_mlp": 0.00225497, "balance_loss_clip": 1.02592611, "balance_loss_mlp": 0.19927144, "epoch": 0.8917480835713212, "flos": 20229845829120.0, "grad_norm": 12.360240526737428, "language_loss": 0.82681191, "learning_rate": 1.216083607088847e-07, "loss": 0.84152079, "num_input_tokens_seen": 319910315, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.26220703, "step": 14832, "time_per_iteration": 2.642322540283203 }, { "auxiliary_loss_clip": 0.0125456, "auxiliary_loss_mlp": 0.00221831, "balance_loss_clip": 1.02941644, "balance_loss_mlp": 0.19595017, "epoch": 0.8918082068239892, "flos": 26101922342400.0, "grad_norm": 5.8449961262459516, "language_loss": 0.7491858, "learning_rate": 1.214746621848355e-07, "loss": 0.76394975, "num_input_tokens_seen": 319932275, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.25891113, "step": 14833, "time_per_iteration": 2.805919647216797 }, { "auxiliary_loss_clip": 0.01274489, "auxiliary_loss_mlp": 0.0023033, "balance_loss_clip": 1.04482889, "balance_loss_mlp": 0.20332953, "epoch": 0.8918683300766571, "flos": 24831439315200.0, "grad_norm": 859.5092743737114, "language_loss": 0.81592774, "learning_rate": 1.2134103489465575e-07, "loss": 0.83097595, "num_input_tokens_seen": 319955335, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.26965332, "step": 14834, "time_per_iteration": 2.8268256187438965 }, { "auxiliary_loss_clip": 0.01240603, "auxiliary_loss_mlp": 0.00217363, "balance_loss_clip": 1.02228057, "balance_loss_mlp": 0.19141105, "epoch": 0.8919284533293251, "flos": 22305158945280.0, "grad_norm": 3.6783586374603554, "language_loss": 0.8695839, "learning_rate": 1.2120747884341188e-07, "loss": 0.88416362, "num_input_tokens_seen": 319973990, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25952148, "step": 14835, "time_per_iteration": 2.6570937633514404 }, { "auxiliary_loss_clip": 0.01219503, "auxiliary_loss_mlp": 0.0020631, "balance_loss_clip": 1.01050782, "balance_loss_mlp": 0.18306457, "epoch": 0.891988576581993, "flos": 30373532559360.0, "grad_norm": 9.260615786289174, "language_loss": 0.81403726, "learning_rate": 1.210739940361689e-07, "loss": 0.82829535, "num_input_tokens_seen": 319995555, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.23278809, "step": 14836, "time_per_iteration": 4.125575304031372 }, { "auxiliary_loss_clip": 0.01236228, "auxiliary_loss_mlp": 0.00221021, "balance_loss_clip": 1.02201164, "balance_loss_mlp": 0.19517647, "epoch": 0.8920486998346611, "flos": 15552947479680.0, "grad_norm": 20.781063359923706, "language_loss": 0.77457535, "learning_rate": 1.2094058047798838e-07, "loss": 0.78914785, "num_input_tokens_seen": 320012385, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.25854492, "step": 14837, "time_per_iteration": 2.6045784950256348 }, { "auxiliary_loss_clip": 0.01249348, "auxiliary_loss_mlp": 0.00214772, "balance_loss_clip": 1.02460027, "balance_loss_mlp": 0.18957132, "epoch": 0.892108823087329, "flos": 21214983214080.0, "grad_norm": 2.853782607972406, "language_loss": 0.76885271, "learning_rate": 1.2080723817392913e-07, "loss": 0.78349388, "num_input_tokens_seen": 320032390, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.25231934, "step": 14838, "time_per_iteration": 2.6806321144104004 }, { "auxiliary_loss_clip": 0.01228371, "auxiliary_loss_mlp": 0.00193679, "balance_loss_clip": 1.01714444, "balance_loss_mlp": 0.16983762, "epoch": 0.892168946339997, "flos": 21978982517760.0, "grad_norm": 4.336444467047846, "language_loss": 0.84749532, "learning_rate": 1.2067396712904777e-07, "loss": 0.86171591, "num_input_tokens_seen": 320052885, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.23852539, "step": 14839, "time_per_iteration": 2.657876968383789 }, { "auxiliary_loss_clip": 0.01125157, "auxiliary_loss_mlp": 0.00052874, "balance_loss_clip": 0.98203182, "balance_loss_mlp": 0.04624606, "epoch": 0.892229069592665, "flos": 67475289277440.0, "grad_norm": 0.6650822541864144, "language_loss": 0.48695818, "learning_rate": 1.205407673483978e-07, "loss": 0.49873853, "num_input_tokens_seen": 320113685, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.06640625, "step": 14840, "time_per_iteration": 3.128323793411255 }, { "auxiliary_loss_clip": 0.01277941, "auxiliary_loss_mlp": 0.00233937, "balance_loss_clip": 1.04344153, "balance_loss_mlp": 0.20568401, "epoch": 0.8922891928453329, "flos": 19459561645440.0, "grad_norm": 16.6828403638237, "language_loss": 0.76165986, "learning_rate": 1.2040763883703074e-07, "loss": 0.77677864, "num_input_tokens_seen": 320130810, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.28271484, "step": 14841, "time_per_iteration": 2.8260655403137207 }, { "auxiliary_loss_clip": 0.01229768, "auxiliary_loss_mlp": 0.00217183, "balance_loss_clip": 1.02122414, "balance_loss_mlp": 0.19273366, "epoch": 0.8923493160980009, "flos": 23367396873600.0, "grad_norm": 17.911179315408763, "language_loss": 0.75907832, "learning_rate": 1.2027458159999438e-07, "loss": 0.77354789, "num_input_tokens_seen": 320152170, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.24475098, "step": 14842, "time_per_iteration": 4.229801893234253 }, { "auxiliary_loss_clip": 0.01235799, "auxiliary_loss_mlp": 0.00184018, "balance_loss_clip": 1.02294803, "balance_loss_mlp": 0.16208351, "epoch": 0.8924094393506689, "flos": 26177047637760.0, "grad_norm": 3.8910229966174437, "language_loss": 0.88382542, "learning_rate": 1.2014159564233373e-07, "loss": 0.89802355, "num_input_tokens_seen": 320172360, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.21923828, "step": 14843, "time_per_iteration": 2.756709098815918 }, { "auxiliary_loss_clip": 0.01266228, "auxiliary_loss_mlp": 0.00215423, "balance_loss_clip": 1.03937185, "balance_loss_mlp": 0.1883032, "epoch": 0.8924695626033369, "flos": 22018520413440.0, "grad_norm": 149.62640504620046, "language_loss": 0.79110599, "learning_rate": 1.2000868096909257e-07, "loss": 0.80592251, "num_input_tokens_seen": 320192130, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.27111816, "step": 14844, "time_per_iteration": 2.6724138259887695 }, { "auxiliary_loss_clip": 0.01243136, "auxiliary_loss_mlp": 0.00210846, "balance_loss_clip": 1.02374411, "balance_loss_mlp": 0.1851801, "epoch": 0.8925296858560048, "flos": 14793940166400.0, "grad_norm": 5.397478653430712, "language_loss": 1.01091218, "learning_rate": 1.1987583758531038e-07, "loss": 1.02545202, "num_input_tokens_seen": 320207760, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.2565918, "step": 14845, "time_per_iteration": 2.658132791519165 }, { "auxiliary_loss_clip": 0.01227747, "auxiliary_loss_mlp": 0.00205687, "balance_loss_clip": 1.01657057, "balance_loss_mlp": 0.18141562, "epoch": 0.8925898091086728, "flos": 22346636175360.0, "grad_norm": 188.36342225498996, "language_loss": 0.79057074, "learning_rate": 1.1974306549602476e-07, "loss": 0.80490506, "num_input_tokens_seen": 320225325, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.24267578, "step": 14846, "time_per_iteration": 2.7409181594848633 }, { "auxiliary_loss_clip": 0.01266366, "auxiliary_loss_mlp": 0.00215062, "balance_loss_clip": 1.04028511, "balance_loss_mlp": 0.18795379, "epoch": 0.8926499323613407, "flos": 45806322067200.0, "grad_norm": 10.500963278811344, "language_loss": 0.6361925, "learning_rate": 1.1961036470627094e-07, "loss": 0.65100682, "num_input_tokens_seen": 320247645, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.27099609, "step": 14847, "time_per_iteration": 2.8560523986816406 }, { "auxiliary_loss_clip": 0.01233721, "auxiliary_loss_mlp": 0.00200799, "balance_loss_clip": 1.02059031, "balance_loss_mlp": 0.17657575, "epoch": 0.8927100556140087, "flos": 22127042378880.0, "grad_norm": 20.721331744980745, "language_loss": 0.86188722, "learning_rate": 1.1947773522108052e-07, "loss": 0.87623239, "num_input_tokens_seen": 320266005, "router_z_loss_clip": 2.13378906, "router_z_loss_mlp": 0.24230957, "step": 14848, "time_per_iteration": 2.7141618728637695 }, { "auxiliary_loss_clip": 0.01241322, "auxiliary_loss_mlp": 0.0021771, "balance_loss_clip": 1.02187753, "balance_loss_mlp": 0.19298638, "epoch": 0.8927701788666766, "flos": 28330143655680.0, "grad_norm": 380.81301208904455, "language_loss": 0.77658772, "learning_rate": 1.1934517704548251e-07, "loss": 0.79117799, "num_input_tokens_seen": 320285555, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24731445, "step": 14849, "time_per_iteration": 2.6829285621643066 }, { "auxiliary_loss_clip": 0.01268587, "auxiliary_loss_mlp": 0.00224293, "balance_loss_clip": 1.0420289, "balance_loss_mlp": 0.1989013, "epoch": 0.8928303021193447, "flos": 25294973351040.0, "grad_norm": 199.94475898237332, "language_loss": 0.87671149, "learning_rate": 1.1921269018450364e-07, "loss": 0.89164025, "num_input_tokens_seen": 320305395, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.25378418, "step": 14850, "time_per_iteration": 2.7260923385620117 }, { "auxiliary_loss_clip": 0.01235606, "auxiliary_loss_mlp": 0.00188173, "balance_loss_clip": 1.01994824, "balance_loss_mlp": 0.16431889, "epoch": 0.8928904253720126, "flos": 22236713579520.0, "grad_norm": 33.21290279012587, "language_loss": 0.82818496, "learning_rate": 1.1908027464316872e-07, "loss": 0.84242278, "num_input_tokens_seen": 320324220, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23876953, "step": 14851, "time_per_iteration": 2.6568989753723145 }, { "auxiliary_loss_clip": 0.01251309, "auxiliary_loss_mlp": 0.00207923, "balance_loss_clip": 1.02861559, "balance_loss_mlp": 0.18231666, "epoch": 0.8929505486246806, "flos": 27092374940160.0, "grad_norm": 9.972798701593826, "language_loss": 0.86626792, "learning_rate": 1.1894793042649775e-07, "loss": 0.88086027, "num_input_tokens_seen": 320347195, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25585938, "step": 14852, "time_per_iteration": 2.766901969909668 }, { "auxiliary_loss_clip": 0.01240627, "auxiliary_loss_mlp": 0.0021307, "balance_loss_clip": 1.02639961, "balance_loss_mlp": 0.18990776, "epoch": 0.8930106718773486, "flos": 23039352938880.0, "grad_norm": 20.829924590031997, "language_loss": 0.74958408, "learning_rate": 1.1881565753951006e-07, "loss": 0.76412106, "num_input_tokens_seen": 320366850, "router_z_loss_clip": 2.14550781, "router_z_loss_mlp": 0.23144531, "step": 14853, "time_per_iteration": 2.696295976638794 }, { "auxiliary_loss_clip": 0.0124547, "auxiliary_loss_mlp": 0.00223209, "balance_loss_clip": 1.02889109, "balance_loss_mlp": 0.1992119, "epoch": 0.8930707951300165, "flos": 35626652887680.0, "grad_norm": 47.832109320132965, "language_loss": 0.75434983, "learning_rate": 1.1868345598722118e-07, "loss": 0.76903659, "num_input_tokens_seen": 320388895, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.23986816, "step": 14854, "time_per_iteration": 2.826183795928955 }, { "auxiliary_loss_clip": 0.01222914, "auxiliary_loss_mlp": 0.00180828, "balance_loss_clip": 1.01212811, "balance_loss_mlp": 0.1580828, "epoch": 0.8931309183826845, "flos": 23039891642880.0, "grad_norm": 10.466801041761865, "language_loss": 0.82697409, "learning_rate": 1.1855132577464399e-07, "loss": 0.84101152, "num_input_tokens_seen": 320408520, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.22741699, "step": 14855, "time_per_iteration": 2.6699206829071045 }, { "auxiliary_loss_clip": 0.01232462, "auxiliary_loss_mlp": 0.00210859, "balance_loss_clip": 1.01725149, "balance_loss_mlp": 0.18497908, "epoch": 0.8931910416353525, "flos": 26504624695680.0, "grad_norm": 52.56467186414891, "language_loss": 0.71928483, "learning_rate": 1.1841926690678893e-07, "loss": 0.73371804, "num_input_tokens_seen": 320427400, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.25854492, "step": 14856, "time_per_iteration": 2.6590194702148438 }, { "auxiliary_loss_clip": 0.01230556, "auxiliary_loss_mlp": 0.0020583, "balance_loss_clip": 1.01644158, "balance_loss_mlp": 0.18033111, "epoch": 0.8932511648880205, "flos": 24973609345920.0, "grad_norm": 8.92687435828467, "language_loss": 0.74370265, "learning_rate": 1.1828727938866378e-07, "loss": 0.75806653, "num_input_tokens_seen": 320447570, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.25512695, "step": 14857, "time_per_iteration": 2.696143627166748 }, { "auxiliary_loss_clip": 0.01263935, "auxiliary_loss_mlp": 0.00215307, "balance_loss_clip": 1.0350225, "balance_loss_mlp": 0.18929586, "epoch": 0.8933112881406884, "flos": 24460733001600.0, "grad_norm": 1852.4414127373195, "language_loss": 0.83326125, "learning_rate": 1.1815536322527408e-07, "loss": 0.84805357, "num_input_tokens_seen": 320464405, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.2598877, "step": 14858, "time_per_iteration": 2.6553304195404053 }, { "auxiliary_loss_clip": 0.0125149, "auxiliary_loss_mlp": 0.00210086, "balance_loss_clip": 1.0307008, "balance_loss_mlp": 0.18515918, "epoch": 0.8933714113933564, "flos": 28293083798400.0, "grad_norm": 12.088713937699753, "language_loss": 0.77089417, "learning_rate": 1.1802351842162139e-07, "loss": 0.78550994, "num_input_tokens_seen": 320485525, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24926758, "step": 14859, "time_per_iteration": 2.7873921394348145 }, { "auxiliary_loss_clip": 0.01234914, "auxiliary_loss_mlp": 0.00187413, "balance_loss_clip": 1.02383947, "balance_loss_mlp": 0.16361919, "epoch": 0.8934315346460243, "flos": 21434864319360.0, "grad_norm": 9.229172856493989, "language_loss": 0.82898581, "learning_rate": 1.1789174498270526e-07, "loss": 0.84320909, "num_input_tokens_seen": 320506725, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.23791504, "step": 14860, "time_per_iteration": 2.694922685623169 }, { "auxiliary_loss_clip": 0.01259093, "auxiliary_loss_mlp": 0.00190816, "balance_loss_clip": 1.0393374, "balance_loss_mlp": 0.16569835, "epoch": 0.8934916578986923, "flos": 23769596436480.0, "grad_norm": 4.263036557437016, "language_loss": 0.67248487, "learning_rate": 1.1776004291352303e-07, "loss": 0.68698394, "num_input_tokens_seen": 320525425, "router_z_loss_clip": 2.19433594, "router_z_loss_mlp": 0.25134277, "step": 14861, "time_per_iteration": 2.7063214778900146 }, { "auxiliary_loss_clip": 0.01225369, "auxiliary_loss_mlp": 0.001877, "balance_loss_clip": 1.01869524, "balance_loss_mlp": 0.16440681, "epoch": 0.8935517811513602, "flos": 18916161719040.0, "grad_norm": 46.444723478929205, "language_loss": 0.7330786, "learning_rate": 1.176284122190685e-07, "loss": 0.74720931, "num_input_tokens_seen": 320543010, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.23303223, "step": 14862, "time_per_iteration": 2.657008409500122 }, { "auxiliary_loss_clip": 0.01244888, "auxiliary_loss_mlp": 0.00205301, "balance_loss_clip": 1.02915537, "balance_loss_mlp": 0.18046945, "epoch": 0.8936119044040283, "flos": 24061370613120.0, "grad_norm": 34.46431282728762, "language_loss": 0.85547006, "learning_rate": 1.1749685290433298e-07, "loss": 0.86997199, "num_input_tokens_seen": 320562180, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24829102, "step": 14863, "time_per_iteration": 2.6638290882110596 }, { "auxiliary_loss_clip": 0.01232708, "auxiliary_loss_mlp": 0.00201736, "balance_loss_clip": 1.01962495, "balance_loss_mlp": 0.1766305, "epoch": 0.8936720276566962, "flos": 21324079797120.0, "grad_norm": 200.2983161733313, "language_loss": 0.78342396, "learning_rate": 1.1736536497430627e-07, "loss": 0.79776841, "num_input_tokens_seen": 320580395, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.25085449, "step": 14864, "time_per_iteration": 2.6464576721191406 }, { "auxiliary_loss_clip": 0.01273099, "auxiliary_loss_mlp": 0.00226766, "balance_loss_clip": 1.04695022, "balance_loss_mlp": 0.19986048, "epoch": 0.8937321509093642, "flos": 18406122549120.0, "grad_norm": 19.154182163904455, "language_loss": 0.86349857, "learning_rate": 1.1723394843397283e-07, "loss": 0.87849724, "num_input_tokens_seen": 320599505, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26879883, "step": 14865, "time_per_iteration": 2.6811389923095703 }, { "auxiliary_loss_clip": 0.0122777, "auxiliary_loss_mlp": 0.00232439, "balance_loss_clip": 1.01905251, "balance_loss_mlp": 0.20857351, "epoch": 0.8937922741620322, "flos": 22054754257920.0, "grad_norm": 2.5048396708619736, "language_loss": 0.78339553, "learning_rate": 1.1710260328831668e-07, "loss": 0.79799759, "num_input_tokens_seen": 320619825, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.23864746, "step": 14866, "time_per_iteration": 2.835833787918091 }, { "auxiliary_loss_clip": 0.01254616, "auxiliary_loss_mlp": 0.00224335, "balance_loss_clip": 1.03524685, "balance_loss_mlp": 0.19788255, "epoch": 0.8938523974147001, "flos": 25664386775040.0, "grad_norm": 34.75473691416527, "language_loss": 0.90462148, "learning_rate": 1.1697132954231869e-07, "loss": 0.91941094, "num_input_tokens_seen": 320638515, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.26428223, "step": 14867, "time_per_iteration": 2.687352418899536 }, { "auxiliary_loss_clip": 0.01238342, "auxiliary_loss_mlp": 0.00213028, "balance_loss_clip": 1.02169335, "balance_loss_mlp": 0.18863782, "epoch": 0.8939125206673681, "flos": 25742852035200.0, "grad_norm": 15.499879308896707, "language_loss": 0.87227082, "learning_rate": 1.168401272009567e-07, "loss": 0.88678455, "num_input_tokens_seen": 320659430, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24389648, "step": 14868, "time_per_iteration": 2.7114531993865967 }, { "auxiliary_loss_clip": 0.01259891, "auxiliary_loss_mlp": 0.00213249, "balance_loss_clip": 1.03573346, "balance_loss_mlp": 0.18716551, "epoch": 0.8939726439200361, "flos": 27344503480320.0, "grad_norm": 2.2801984206606853, "language_loss": 0.84265888, "learning_rate": 1.167089962692056e-07, "loss": 0.85739028, "num_input_tokens_seen": 320679295, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26062012, "step": 14869, "time_per_iteration": 2.67999005317688 }, { "auxiliary_loss_clip": 0.01232773, "auxiliary_loss_mlp": 0.00220049, "balance_loss_clip": 1.01740527, "balance_loss_mlp": 0.19567063, "epoch": 0.8940327671727041, "flos": 20338834671360.0, "grad_norm": 24.125237453787392, "language_loss": 0.74091721, "learning_rate": 1.1657793675203853e-07, "loss": 0.75544548, "num_input_tokens_seen": 320697535, "router_z_loss_clip": 2.15722656, "router_z_loss_mlp": 0.24365234, "step": 14870, "time_per_iteration": 2.6744472980499268 }, { "auxiliary_loss_clip": 0.01116429, "auxiliary_loss_mlp": 0.00059624, "balance_loss_clip": 0.97637308, "balance_loss_mlp": 0.05304325, "epoch": 0.894092890425372, "flos": 58410573235200.0, "grad_norm": 0.8051386583939307, "language_loss": 0.55274737, "learning_rate": 1.1644694865442461e-07, "loss": 0.56450796, "num_input_tokens_seen": 320758635, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.06591797, "step": 14871, "time_per_iteration": 3.223461151123047 }, { "auxiliary_loss_clip": 0.01235516, "auxiliary_loss_mlp": 0.00211304, "balance_loss_clip": 1.02201581, "balance_loss_mlp": 0.18829665, "epoch": 0.89415301367804, "flos": 19829657427840.0, "grad_norm": 151.41287997382028, "language_loss": 0.84048122, "learning_rate": 1.16316031981331e-07, "loss": 0.85494941, "num_input_tokens_seen": 320777175, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.22998047, "step": 14872, "time_per_iteration": 4.028707981109619 }, { "auxiliary_loss_clip": 0.01230811, "auxiliary_loss_mlp": 0.00209454, "balance_loss_clip": 1.02060008, "balance_loss_mlp": 0.18667261, "epoch": 0.8942131369307079, "flos": 25775781828480.0, "grad_norm": 4.684741295558105, "language_loss": 0.74516928, "learning_rate": 1.1618518673772215e-07, "loss": 0.75957191, "num_input_tokens_seen": 320797670, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.22766113, "step": 14873, "time_per_iteration": 4.170468807220459 }, { "auxiliary_loss_clip": 0.01232001, "auxiliary_loss_mlp": 0.00228012, "balance_loss_clip": 1.0217371, "balance_loss_mlp": 0.20270348, "epoch": 0.8942732601833759, "flos": 23149024139520.0, "grad_norm": 41.66496865154406, "language_loss": 0.67082536, "learning_rate": 1.1605441292856033e-07, "loss": 0.68542552, "num_input_tokens_seen": 320817410, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.25317383, "step": 14874, "time_per_iteration": 2.6883206367492676 }, { "auxiliary_loss_clip": 0.01248819, "auxiliary_loss_mlp": 0.00195387, "balance_loss_clip": 1.02937627, "balance_loss_mlp": 0.17026934, "epoch": 0.8943333834360438, "flos": 27855548231040.0, "grad_norm": 39.66718117383615, "language_loss": 0.83637321, "learning_rate": 1.1592371055880356e-07, "loss": 0.8508153, "num_input_tokens_seen": 320836745, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25134277, "step": 14875, "time_per_iteration": 2.725451946258545 }, { "auxiliary_loss_clip": 0.01279977, "auxiliary_loss_mlp": 0.00225378, "balance_loss_clip": 1.04881978, "balance_loss_mlp": 0.19531363, "epoch": 0.8943935066887119, "flos": 22163958581760.0, "grad_norm": 10.350736933167141, "language_loss": 0.87279665, "learning_rate": 1.1579307963340857e-07, "loss": 0.88785022, "num_input_tokens_seen": 320853305, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.30053711, "step": 14876, "time_per_iteration": 2.7310967445373535 }, { "auxiliary_loss_clip": 0.01243767, "auxiliary_loss_mlp": 0.0021449, "balance_loss_clip": 1.02492011, "balance_loss_mlp": 0.1903621, "epoch": 0.8944536299413798, "flos": 21470056669440.0, "grad_norm": 9.484179460297234, "language_loss": 0.89338028, "learning_rate": 1.156625201573287e-07, "loss": 0.90796286, "num_input_tokens_seen": 320872885, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24145508, "step": 14877, "time_per_iteration": 2.6130852699279785 }, { "auxiliary_loss_clip": 0.01239861, "auxiliary_loss_mlp": 0.00201193, "balance_loss_clip": 1.02054262, "balance_loss_mlp": 0.17642151, "epoch": 0.8945137531940478, "flos": 17748777703680.0, "grad_norm": 25.41839400662891, "language_loss": 0.85382843, "learning_rate": 1.155320321355151e-07, "loss": 0.86823905, "num_input_tokens_seen": 320889755, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.2479248, "step": 14878, "time_per_iteration": 2.6247568130493164 }, { "auxiliary_loss_clip": 0.01262022, "auxiliary_loss_mlp": 0.00206134, "balance_loss_clip": 1.03823876, "balance_loss_mlp": 0.17887044, "epoch": 0.8945738764467158, "flos": 21142264129920.0, "grad_norm": 9.674992373795435, "language_loss": 0.85473847, "learning_rate": 1.1540161557291539e-07, "loss": 0.86941999, "num_input_tokens_seen": 320907860, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.27246094, "step": 14879, "time_per_iteration": 4.158797740936279 }, { "auxiliary_loss_clip": 0.01254494, "auxiliary_loss_mlp": 0.00237151, "balance_loss_clip": 1.03289831, "balance_loss_mlp": 0.20923197, "epoch": 0.8946339996993837, "flos": 14903000835840.0, "grad_norm": 5.243175669435129, "language_loss": 0.84220946, "learning_rate": 1.1527127047447538e-07, "loss": 0.85712588, "num_input_tokens_seen": 320925825, "router_z_loss_clip": 2.21777344, "router_z_loss_mlp": 0.27929688, "step": 14880, "time_per_iteration": 2.60955548286438 }, { "auxiliary_loss_clip": 0.01224773, "auxiliary_loss_mlp": 0.00209524, "balance_loss_clip": 1.01426387, "balance_loss_mlp": 0.18503842, "epoch": 0.8946941229520518, "flos": 27382173868800.0, "grad_norm": 2.2514669889732124, "language_loss": 0.89800489, "learning_rate": 1.1514099684513822e-07, "loss": 0.91234791, "num_input_tokens_seen": 320946165, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.24511719, "step": 14881, "time_per_iteration": 2.72633695602417 }, { "auxiliary_loss_clip": 0.01220277, "auxiliary_loss_mlp": 0.00195777, "balance_loss_clip": 1.00721765, "balance_loss_mlp": 0.17049289, "epoch": 0.8947542462047197, "flos": 31796277338880.0, "grad_norm": 1739.1108916410462, "language_loss": 0.74675477, "learning_rate": 1.1501079468984287e-07, "loss": 0.76091528, "num_input_tokens_seen": 320969330, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.25268555, "step": 14882, "time_per_iteration": 2.724710464477539 }, { "auxiliary_loss_clip": 0.01267393, "auxiliary_loss_mlp": 0.00217465, "balance_loss_clip": 1.04332852, "balance_loss_mlp": 0.19140592, "epoch": 0.8948143694573877, "flos": 20883599314560.0, "grad_norm": 21.766555815258275, "language_loss": 0.85760981, "learning_rate": 1.1488066401352691e-07, "loss": 0.87245834, "num_input_tokens_seen": 320985055, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.26062012, "step": 14883, "time_per_iteration": 2.62906551361084 }, { "auxiliary_loss_clip": 0.01232122, "auxiliary_loss_mlp": 0.00217335, "balance_loss_clip": 1.01946998, "balance_loss_mlp": 0.19251531, "epoch": 0.8948744927100556, "flos": 28215552291840.0, "grad_norm": 15.458897611550313, "language_loss": 0.79834986, "learning_rate": 1.147506048211253e-07, "loss": 0.81284446, "num_input_tokens_seen": 321004720, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.24816895, "step": 14884, "time_per_iteration": 4.165149450302124 }, { "auxiliary_loss_clip": 0.01228291, "auxiliary_loss_mlp": 0.00206157, "balance_loss_clip": 1.01442385, "balance_loss_mlp": 0.18249437, "epoch": 0.8949346159627236, "flos": 21902672073600.0, "grad_norm": 6.1570117006665885, "language_loss": 0.82196987, "learning_rate": 1.1462061711756987e-07, "loss": 0.83631438, "num_input_tokens_seen": 321022350, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.23657227, "step": 14885, "time_per_iteration": 2.658698558807373 }, { "auxiliary_loss_clip": 0.0125706, "auxiliary_loss_mlp": 0.00204819, "balance_loss_clip": 1.03502202, "balance_loss_mlp": 0.17923693, "epoch": 0.8949947392153915, "flos": 21359128492800.0, "grad_norm": 1294.972478455038, "language_loss": 0.89970279, "learning_rate": 1.1449070090778911e-07, "loss": 0.91432154, "num_input_tokens_seen": 321040450, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.2557373, "step": 14886, "time_per_iteration": 2.748791217803955 }, { "auxiliary_loss_clip": 0.01247392, "auxiliary_loss_mlp": 0.00216869, "balance_loss_clip": 1.02884293, "balance_loss_mlp": 0.19203812, "epoch": 0.8950548624680595, "flos": 52445342799360.0, "grad_norm": 6.998018999586399, "language_loss": 0.72703046, "learning_rate": 1.1436085619671043e-07, "loss": 0.74167305, "num_input_tokens_seen": 321063970, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24804688, "step": 14887, "time_per_iteration": 2.9542315006256104 }, { "auxiliary_loss_clip": 0.01267134, "auxiliary_loss_mlp": 0.00204785, "balance_loss_clip": 1.03998756, "balance_loss_mlp": 0.17795102, "epoch": 0.8951149857207275, "flos": 20121323863680.0, "grad_norm": 6.0896160092350975, "language_loss": 0.69890332, "learning_rate": 1.1423108298925698e-07, "loss": 0.71362251, "num_input_tokens_seen": 321083840, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.26843262, "step": 14888, "time_per_iteration": 2.694272518157959 }, { "auxiliary_loss_clip": 0.01260187, "auxiliary_loss_mlp": 0.00226161, "balance_loss_clip": 1.03654337, "balance_loss_mlp": 0.19991133, "epoch": 0.8951751089733955, "flos": 29862631463040.0, "grad_norm": 268.725308443291, "language_loss": 0.78572226, "learning_rate": 1.1410138129034952e-07, "loss": 0.80058575, "num_input_tokens_seen": 321104165, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.26281738, "step": 14889, "time_per_iteration": 2.716782808303833 }, { "auxiliary_loss_clip": 0.01243646, "auxiliary_loss_mlp": 0.00183294, "balance_loss_clip": 1.02345359, "balance_loss_mlp": 0.15792647, "epoch": 0.8952352322260634, "flos": 15262789415040.0, "grad_norm": 127.51551791418738, "language_loss": 0.8166256, "learning_rate": 1.1397175110490676e-07, "loss": 0.83089495, "num_input_tokens_seen": 321117290, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.25378418, "step": 14890, "time_per_iteration": 2.6683952808380127 }, { "auxiliary_loss_clip": 0.01230793, "auxiliary_loss_mlp": 0.00200441, "balance_loss_clip": 1.01752269, "balance_loss_mlp": 0.1752163, "epoch": 0.8952953554787314, "flos": 26798338206720.0, "grad_norm": 12.86940924997561, "language_loss": 0.83239025, "learning_rate": 1.1384219243784454e-07, "loss": 0.84670264, "num_input_tokens_seen": 321137115, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.25219727, "step": 14891, "time_per_iteration": 2.7002155780792236 }, { "auxiliary_loss_clip": 0.0125159, "auxiliary_loss_mlp": 0.0023655, "balance_loss_clip": 1.02668715, "balance_loss_mlp": 0.21091953, "epoch": 0.8953554787313994, "flos": 14137205852160.0, "grad_norm": 3.6486318434882565, "language_loss": 0.84806776, "learning_rate": 1.1371270529407517e-07, "loss": 0.86294913, "num_input_tokens_seen": 321154490, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.25610352, "step": 14892, "time_per_iteration": 2.6455674171447754 }, { "auxiliary_loss_clip": 0.01231531, "auxiliary_loss_mlp": 0.00224347, "balance_loss_clip": 1.01788735, "balance_loss_mlp": 0.19959939, "epoch": 0.8954156019840673, "flos": 25703314139520.0, "grad_norm": 4.292466524690189, "language_loss": 0.86547476, "learning_rate": 1.1358328967850895e-07, "loss": 0.88003349, "num_input_tokens_seen": 321175625, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24768066, "step": 14893, "time_per_iteration": 2.6705162525177 }, { "auxiliary_loss_clip": 0.01223411, "auxiliary_loss_mlp": 0.00207649, "balance_loss_clip": 1.0131948, "balance_loss_mlp": 0.1838069, "epoch": 0.8954757252367354, "flos": 21907987286400.0, "grad_norm": 211.8540200969957, "language_loss": 0.81782115, "learning_rate": 1.1345394559605348e-07, "loss": 0.83213174, "num_input_tokens_seen": 321193895, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23852539, "step": 14894, "time_per_iteration": 2.67779278755188 }, { "auxiliary_loss_clip": 0.01261211, "auxiliary_loss_mlp": 0.0023601, "balance_loss_clip": 1.03628671, "balance_loss_mlp": 0.21077336, "epoch": 0.8955358484894033, "flos": 12970396454400.0, "grad_norm": 155.07973440325128, "language_loss": 0.75956595, "learning_rate": 1.1332467305161352e-07, "loss": 0.77453816, "num_input_tokens_seen": 321211610, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.25244141, "step": 14895, "time_per_iteration": 2.645392656326294 }, { "auxiliary_loss_clip": 0.01280652, "auxiliary_loss_mlp": 0.00218988, "balance_loss_clip": 1.05055094, "balance_loss_mlp": 0.19017556, "epoch": 0.8955959717420713, "flos": 17273966797440.0, "grad_norm": 31.027033728606206, "language_loss": 0.75903201, "learning_rate": 1.1319547205009094e-07, "loss": 0.77402842, "num_input_tokens_seen": 321229805, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.28796387, "step": 14896, "time_per_iteration": 2.665987491607666 }, { "auxiliary_loss_clip": 0.01244541, "auxiliary_loss_mlp": 0.00207774, "balance_loss_clip": 1.02637005, "balance_loss_mlp": 0.18208435, "epoch": 0.8956560949947392, "flos": 14793868339200.0, "grad_norm": 205.63545778838653, "language_loss": 0.83122611, "learning_rate": 1.1306634259638492e-07, "loss": 0.84574932, "num_input_tokens_seen": 321247165, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25720215, "step": 14897, "time_per_iteration": 2.671221971511841 }, { "auxiliary_loss_clip": 0.01118846, "auxiliary_loss_mlp": 0.0007545, "balance_loss_clip": 0.97980624, "balance_loss_mlp": 0.06906012, "epoch": 0.8957162182474072, "flos": 63607817957760.0, "grad_norm": 0.9296164641937676, "language_loss": 0.54259098, "learning_rate": 1.129372846953931e-07, "loss": 0.5545339, "num_input_tokens_seen": 321308425, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.06396484, "step": 14898, "time_per_iteration": 3.2240471839904785 }, { "auxiliary_loss_clip": 0.01250511, "auxiliary_loss_mlp": 0.00212549, "balance_loss_clip": 1.02825022, "balance_loss_mlp": 0.18564339, "epoch": 0.8957763415000751, "flos": 25009843190400.0, "grad_norm": 32.094575480965304, "language_loss": 0.7954216, "learning_rate": 1.12808298352008e-07, "loss": 0.81005216, "num_input_tokens_seen": 321329295, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.26916504, "step": 14899, "time_per_iteration": 2.6835107803344727 }, { "auxiliary_loss_clip": 0.01258776, "auxiliary_loss_mlp": 0.00213995, "balance_loss_clip": 1.03573573, "balance_loss_mlp": 0.18875885, "epoch": 0.8958364647527431, "flos": 19828615933440.0, "grad_norm": 3.318614135482341, "language_loss": 0.82058334, "learning_rate": 1.1267938357112106e-07, "loss": 0.83531106, "num_input_tokens_seen": 321347580, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25256348, "step": 14900, "time_per_iteration": 2.624495267868042 }, { "auxiliary_loss_clip": 0.01116554, "auxiliary_loss_mlp": 0.00069945, "balance_loss_clip": 0.97715807, "balance_loss_mlp": 0.06379421, "epoch": 0.895896588005411, "flos": 65537190115200.0, "grad_norm": 0.7633707096228676, "language_loss": 0.61177325, "learning_rate": 1.1255054035762124e-07, "loss": 0.62363827, "num_input_tokens_seen": 321407820, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.0612793, "step": 14901, "time_per_iteration": 3.126039981842041 }, { "auxiliary_loss_clip": 0.01259957, "auxiliary_loss_mlp": 0.00233895, "balance_loss_clip": 1.0341475, "balance_loss_mlp": 0.20440264, "epoch": 0.8959567112580791, "flos": 25591021246080.0, "grad_norm": 5.77365035114095, "language_loss": 0.79208589, "learning_rate": 1.1242176871639441e-07, "loss": 0.80702442, "num_input_tokens_seen": 321426745, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.29455566, "step": 14902, "time_per_iteration": 2.684617280960083 }, { "auxiliary_loss_clip": 0.01247109, "auxiliary_loss_mlp": 0.00204835, "balance_loss_clip": 1.03010988, "balance_loss_mlp": 0.18095696, "epoch": 0.896016834510747, "flos": 24201780877440.0, "grad_norm": 9252.533347940654, "language_loss": 0.85250258, "learning_rate": 1.1229306865232313e-07, "loss": 0.86702204, "num_input_tokens_seen": 321446165, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.23852539, "step": 14903, "time_per_iteration": 2.7374775409698486 }, { "auxiliary_loss_clip": 0.01264072, "auxiliary_loss_mlp": 0.00203088, "balance_loss_clip": 1.03139746, "balance_loss_mlp": 0.17671871, "epoch": 0.896076957763415, "flos": 23075945919360.0, "grad_norm": 3.6554740042741214, "language_loss": 0.81586432, "learning_rate": 1.121644401702877e-07, "loss": 0.83053589, "num_input_tokens_seen": 321465285, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.26367188, "step": 14904, "time_per_iteration": 2.6761982440948486 }, { "auxiliary_loss_clip": 0.01248171, "auxiliary_loss_mlp": 0.00205812, "balance_loss_clip": 1.02551126, "balance_loss_mlp": 0.18001559, "epoch": 0.8961370810160829, "flos": 22236605838720.0, "grad_norm": 4.704703435507026, "language_loss": 0.85276836, "learning_rate": 1.12035883275166e-07, "loss": 0.86730814, "num_input_tokens_seen": 321483670, "router_z_loss_clip": 2.22949219, "router_z_loss_mlp": 0.25817871, "step": 14905, "time_per_iteration": 2.6656103134155273 }, { "auxiliary_loss_clip": 0.01218326, "auxiliary_loss_mlp": 0.00191646, "balance_loss_clip": 1.0063138, "balance_loss_mlp": 0.16631427, "epoch": 0.8961972042687509, "flos": 23072318645760.0, "grad_norm": 7.276421528706706, "language_loss": 0.85066879, "learning_rate": 1.1190739797183279e-07, "loss": 0.86476856, "num_input_tokens_seen": 321501190, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.25305176, "step": 14906, "time_per_iteration": 2.6711812019348145 }, { "auxiliary_loss_clip": 0.01234832, "auxiliary_loss_mlp": 0.00216141, "balance_loss_clip": 1.01929009, "balance_loss_mlp": 0.19032043, "epoch": 0.896257327521419, "flos": 18185882307840.0, "grad_norm": 6.8631218638820295, "language_loss": 0.81299973, "learning_rate": 1.1177898426515996e-07, "loss": 0.8275094, "num_input_tokens_seen": 321518540, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.25817871, "step": 14907, "time_per_iteration": 2.6302411556243896 }, { "auxiliary_loss_clip": 0.01249484, "auxiliary_loss_mlp": 0.00208817, "balance_loss_clip": 1.03366256, "balance_loss_mlp": 0.18394998, "epoch": 0.8963174507740869, "flos": 17895472848000.0, "grad_norm": 349.6875715332795, "language_loss": 0.89244008, "learning_rate": 1.1165064216001785e-07, "loss": 0.90702307, "num_input_tokens_seen": 321536555, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.24853516, "step": 14908, "time_per_iteration": 2.702270269393921 }, { "auxiliary_loss_clip": 0.01253661, "auxiliary_loss_mlp": 0.00207236, "balance_loss_clip": 1.02725101, "balance_loss_mlp": 0.18222591, "epoch": 0.8963775740267549, "flos": 21032269706880.0, "grad_norm": 73.121583651027, "language_loss": 0.79264671, "learning_rate": 1.1152237166127232e-07, "loss": 0.80725563, "num_input_tokens_seen": 321557655, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.25024414, "step": 14909, "time_per_iteration": 2.742488384246826 }, { "auxiliary_loss_clip": 0.01247296, "auxiliary_loss_mlp": 0.00235666, "balance_loss_clip": 1.03345108, "balance_loss_mlp": 0.20833114, "epoch": 0.8964376972794228, "flos": 23179619548800.0, "grad_norm": 2.38879957370183, "language_loss": 0.81014156, "learning_rate": 1.113941727737877e-07, "loss": 0.8249712, "num_input_tokens_seen": 321576160, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.2734375, "step": 14910, "time_per_iteration": 2.745143175125122 }, { "auxiliary_loss_clip": 0.01237415, "auxiliary_loss_mlp": 0.00208708, "balance_loss_clip": 1.02577353, "balance_loss_mlp": 0.18562931, "epoch": 0.8964978205320908, "flos": 24972998814720.0, "grad_norm": 48.08724786895627, "language_loss": 0.7113111, "learning_rate": 1.1126604550242502e-07, "loss": 0.72577238, "num_input_tokens_seen": 321596205, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.23083496, "step": 14911, "time_per_iteration": 2.6836836338043213 }, { "auxiliary_loss_clip": 0.01235407, "auxiliary_loss_mlp": 0.00200513, "balance_loss_clip": 1.02227128, "balance_loss_mlp": 0.1759437, "epoch": 0.8965579437847587, "flos": 19172025273600.0, "grad_norm": 239.4121125826751, "language_loss": 0.82412565, "learning_rate": 1.111379898520437e-07, "loss": 0.83848488, "num_input_tokens_seen": 321614800, "router_z_loss_clip": 2.13378906, "router_z_loss_mlp": 0.2454834, "step": 14912, "time_per_iteration": 2.6898725032806396 }, { "auxiliary_loss_clip": 0.01247873, "auxiliary_loss_mlp": 0.00209591, "balance_loss_clip": 1.0268116, "balance_loss_mlp": 0.1839253, "epoch": 0.8966180670374267, "flos": 24276690691200.0, "grad_norm": 39.85399785911896, "language_loss": 0.88258862, "learning_rate": 1.1101000582749876e-07, "loss": 0.89716327, "num_input_tokens_seen": 321633445, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.2565918, "step": 14913, "time_per_iteration": 2.6727206707000732 }, { "auxiliary_loss_clip": 0.01254826, "auxiliary_loss_mlp": 0.00206993, "balance_loss_clip": 1.03061688, "balance_loss_mlp": 0.1821855, "epoch": 0.8966781902900947, "flos": 13553190622080.0, "grad_norm": 677.1273277128881, "language_loss": 0.7282145, "learning_rate": 1.1088209343364407e-07, "loss": 0.74283266, "num_input_tokens_seen": 321650890, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.24804688, "step": 14914, "time_per_iteration": 4.046157360076904 }, { "auxiliary_loss_clip": 0.01129885, "auxiliary_loss_mlp": 0.00104202, "balance_loss_clip": 0.98933768, "balance_loss_mlp": 0.09733513, "epoch": 0.8967383135427627, "flos": 65066114223360.0, "grad_norm": 0.6910881396979742, "language_loss": 0.54442871, "learning_rate": 1.1075425267532956e-07, "loss": 0.55676961, "num_input_tokens_seen": 321710960, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.06884766, "step": 14915, "time_per_iteration": 4.5932536125183105 }, { "auxiliary_loss_clip": 0.01240723, "auxiliary_loss_mlp": 0.00193091, "balance_loss_clip": 1.0265224, "balance_loss_mlp": 0.1694157, "epoch": 0.8967984367954306, "flos": 29713027317120.0, "grad_norm": 11.047714427941733, "language_loss": 0.76387024, "learning_rate": 1.1062648355740289e-07, "loss": 0.77820837, "num_input_tokens_seen": 321733290, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.23657227, "step": 14916, "time_per_iteration": 2.7729239463806152 }, { "auxiliary_loss_clip": 0.0124317, "auxiliary_loss_mlp": 0.0021983, "balance_loss_clip": 1.0253278, "balance_loss_mlp": 0.19421148, "epoch": 0.8968585600480986, "flos": 25702488126720.0, "grad_norm": 77.62803926768352, "language_loss": 0.82934314, "learning_rate": 1.1049878608470931e-07, "loss": 0.84397316, "num_input_tokens_seen": 321753120, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.25610352, "step": 14917, "time_per_iteration": 2.7189583778381348 }, { "auxiliary_loss_clip": 0.01273607, "auxiliary_loss_mlp": 0.00214446, "balance_loss_clip": 1.04546523, "balance_loss_mlp": 0.18880436, "epoch": 0.8969186833007665, "flos": 30044698525440.0, "grad_norm": 17.299562947711497, "language_loss": 0.78603637, "learning_rate": 1.1037116026209137e-07, "loss": 0.80091691, "num_input_tokens_seen": 321772840, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.25671387, "step": 14918, "time_per_iteration": 2.738684892654419 }, { "auxiliary_loss_clip": 0.01235112, "auxiliary_loss_mlp": 0.0019697, "balance_loss_clip": 1.01912069, "balance_loss_mlp": 0.17263922, "epoch": 0.8969788065534345, "flos": 22818143030400.0, "grad_norm": 5.021255303331505, "language_loss": 0.91791582, "learning_rate": 1.102436060943881e-07, "loss": 0.93223661, "num_input_tokens_seen": 321791020, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.24304199, "step": 14919, "time_per_iteration": 2.7326104640960693 }, { "auxiliary_loss_clip": 0.01248667, "auxiliary_loss_mlp": 0.00215958, "balance_loss_clip": 1.02693725, "balance_loss_mlp": 0.19017296, "epoch": 0.8970389298061026, "flos": 13261488272640.0, "grad_norm": 268.3894563915832, "language_loss": 0.83353704, "learning_rate": 1.1011612358643696e-07, "loss": 0.84818327, "num_input_tokens_seen": 321810075, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25756836, "step": 14920, "time_per_iteration": 2.663797616958618 }, { "auxiliary_loss_clip": 0.01280855, "auxiliary_loss_mlp": 0.00219723, "balance_loss_clip": 1.05180907, "balance_loss_mlp": 0.19268647, "epoch": 0.8970990530587705, "flos": 10266071345280.0, "grad_norm": 65.26910504402719, "language_loss": 1.02947497, "learning_rate": 1.0998871274307164e-07, "loss": 1.0444808, "num_input_tokens_seen": 321822635, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.27050781, "step": 14921, "time_per_iteration": 4.0310893058776855 }, { "auxiliary_loss_clip": 0.01244157, "auxiliary_loss_mlp": 0.00219192, "balance_loss_clip": 1.02621126, "balance_loss_mlp": 0.19396758, "epoch": 0.8971591763114385, "flos": 20302708567680.0, "grad_norm": 56.202299483909805, "language_loss": 0.82127792, "learning_rate": 1.0986137356912384e-07, "loss": 0.83591139, "num_input_tokens_seen": 321841130, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.25268555, "step": 14922, "time_per_iteration": 2.670257091522217 }, { "auxiliary_loss_clip": 0.01224959, "auxiliary_loss_mlp": 0.00202533, "balance_loss_clip": 1.01024175, "balance_loss_mlp": 0.17739178, "epoch": 0.8972192995641064, "flos": 23257043314560.0, "grad_norm": 32.60584575256128, "language_loss": 0.79709589, "learning_rate": 1.097341060694219e-07, "loss": 0.81137085, "num_input_tokens_seen": 321859855, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.25146484, "step": 14923, "time_per_iteration": 2.7233080863952637 }, { "auxiliary_loss_clip": 0.01244624, "auxiliary_loss_mlp": 0.00230823, "balance_loss_clip": 1.02233529, "balance_loss_mlp": 0.20514517, "epoch": 0.8972794228167744, "flos": 18369601395840.0, "grad_norm": 5.062318966090202, "language_loss": 0.8253721, "learning_rate": 1.0960691024879221e-07, "loss": 0.84012657, "num_input_tokens_seen": 321877990, "router_z_loss_clip": 2.22363281, "router_z_loss_mlp": 0.25695801, "step": 14924, "time_per_iteration": 2.6173830032348633 }, { "auxiliary_loss_clip": 0.01233371, "auxiliary_loss_mlp": 0.00207756, "balance_loss_clip": 1.01943994, "balance_loss_mlp": 0.1838906, "epoch": 0.8973395460694423, "flos": 23952058548480.0, "grad_norm": 1.868105357906014, "language_loss": 0.78902251, "learning_rate": 1.0947978611205844e-07, "loss": 0.80343378, "num_input_tokens_seen": 321898120, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.2388916, "step": 14925, "time_per_iteration": 2.787172317504883 }, { "auxiliary_loss_clip": 0.01247305, "auxiliary_loss_mlp": 0.00206331, "balance_loss_clip": 1.03084016, "balance_loss_mlp": 0.1815477, "epoch": 0.8973996693221103, "flos": 24970843998720.0, "grad_norm": 6.751678155647724, "language_loss": 0.89512944, "learning_rate": 1.0935273366404008e-07, "loss": 0.90966576, "num_input_tokens_seen": 321918140, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24768066, "step": 14926, "time_per_iteration": 4.121737241744995 }, { "auxiliary_loss_clip": 0.01247271, "auxiliary_loss_mlp": 0.00204423, "balance_loss_clip": 1.02998042, "balance_loss_mlp": 0.17962779, "epoch": 0.8974597925747783, "flos": 25738937452800.0, "grad_norm": 3.621688418629379, "language_loss": 0.83336234, "learning_rate": 1.092257529095555e-07, "loss": 0.84787929, "num_input_tokens_seen": 321938580, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.2479248, "step": 14927, "time_per_iteration": 2.6818904876708984 }, { "auxiliary_loss_clip": 0.01240776, "auxiliary_loss_mlp": 0.00199696, "balance_loss_clip": 1.02268779, "balance_loss_mlp": 0.17554426, "epoch": 0.8975199158274463, "flos": 38071918131840.0, "grad_norm": 17.366552440849855, "language_loss": 0.74233413, "learning_rate": 1.0909884385341994e-07, "loss": 0.75673878, "num_input_tokens_seen": 321961135, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24145508, "step": 14928, "time_per_iteration": 2.834700107574463 }, { "auxiliary_loss_clip": 0.01243823, "auxiliary_loss_mlp": 0.00211023, "balance_loss_clip": 1.02016735, "balance_loss_mlp": 0.18370037, "epoch": 0.8975800390801142, "flos": 25411683617280.0, "grad_norm": 92.9321280913213, "language_loss": 0.78432691, "learning_rate": 1.0897200650044602e-07, "loss": 0.79887539, "num_input_tokens_seen": 321980945, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.27282715, "step": 14929, "time_per_iteration": 2.724102258682251 }, { "auxiliary_loss_clip": 0.0125347, "auxiliary_loss_mlp": 0.00203917, "balance_loss_clip": 1.03059876, "balance_loss_mlp": 0.17934775, "epoch": 0.8976401623327822, "flos": 21759604202880.0, "grad_norm": 18.223368721477254, "language_loss": 0.75322521, "learning_rate": 1.0884524085544256e-07, "loss": 0.76779908, "num_input_tokens_seen": 322000350, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.24560547, "step": 14930, "time_per_iteration": 2.7955880165100098 }, { "auxiliary_loss_clip": 0.01233709, "auxiliary_loss_mlp": 0.00210492, "balance_loss_clip": 1.02059436, "balance_loss_mlp": 0.18601838, "epoch": 0.8977002855854501, "flos": 13845323934720.0, "grad_norm": 13.858817171283862, "language_loss": 0.84161973, "learning_rate": 1.0871854692321769e-07, "loss": 0.85606176, "num_input_tokens_seen": 322018980, "router_z_loss_clip": 2.13183594, "router_z_loss_mlp": 0.24475098, "step": 14931, "time_per_iteration": 2.649012565612793 }, { "auxiliary_loss_clip": 0.01239969, "auxiliary_loss_mlp": 0.00239012, "balance_loss_clip": 1.02622283, "balance_loss_mlp": 0.21487188, "epoch": 0.8977604088381181, "flos": 19427529692160.0, "grad_norm": 14.433160476951599, "language_loss": 0.70812166, "learning_rate": 1.0859192470857492e-07, "loss": 0.72291148, "num_input_tokens_seen": 322037675, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24121094, "step": 14932, "time_per_iteration": 2.6979901790618896 }, { "auxiliary_loss_clip": 0.01226469, "auxiliary_loss_mlp": 0.00200339, "balance_loss_clip": 1.01715732, "balance_loss_mlp": 0.17622329, "epoch": 0.8978205320907862, "flos": 22742083981440.0, "grad_norm": 36.215801962970815, "language_loss": 0.79690289, "learning_rate": 1.0846537421631552e-07, "loss": 0.81117094, "num_input_tokens_seen": 322055130, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.2409668, "step": 14933, "time_per_iteration": 2.884436845779419 }, { "auxiliary_loss_clip": 0.01250491, "auxiliary_loss_mlp": 0.00239473, "balance_loss_clip": 1.03080475, "balance_loss_mlp": 0.21321163, "epoch": 0.8978806553434541, "flos": 21360529123200.0, "grad_norm": 43.79648961753964, "language_loss": 0.8114295, "learning_rate": 1.0833889545123898e-07, "loss": 0.82632917, "num_input_tokens_seen": 322074850, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26269531, "step": 14934, "time_per_iteration": 2.7523908615112305 }, { "auxiliary_loss_clip": 0.01248054, "auxiliary_loss_mlp": 0.0020361, "balance_loss_clip": 1.02975976, "balance_loss_mlp": 0.17858809, "epoch": 0.8979407785961221, "flos": 20924178704640.0, "grad_norm": 172.50392258456108, "language_loss": 0.69241166, "learning_rate": 1.0821248841814123e-07, "loss": 0.70692837, "num_input_tokens_seen": 322093315, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25048828, "step": 14935, "time_per_iteration": 2.7395992279052734 }, { "auxiliary_loss_clip": 0.01247748, "auxiliary_loss_mlp": 0.002071, "balance_loss_clip": 1.03126669, "balance_loss_mlp": 0.18226869, "epoch": 0.89800090184879, "flos": 25228934196480.0, "grad_norm": 12.280423408814, "language_loss": 0.86964774, "learning_rate": 1.0808615312181512e-07, "loss": 0.88419622, "num_input_tokens_seen": 322112555, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.24853516, "step": 14936, "time_per_iteration": 2.722834587097168 }, { "auxiliary_loss_clip": 0.01235056, "auxiliary_loss_mlp": 0.00216379, "balance_loss_clip": 1.0201261, "balance_loss_mlp": 0.19161928, "epoch": 0.898061025101458, "flos": 22562674525440.0, "grad_norm": 87.77459027009446, "language_loss": 0.81259215, "learning_rate": 1.0795988956705193e-07, "loss": 0.82710648, "num_input_tokens_seen": 322130440, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.24755859, "step": 14937, "time_per_iteration": 2.725440263748169 }, { "auxiliary_loss_clip": 0.01137467, "auxiliary_loss_mlp": 0.00084904, "balance_loss_clip": 0.9967854, "balance_loss_mlp": 0.07636867, "epoch": 0.8981211483541259, "flos": 56192551384320.0, "grad_norm": 0.8211870962886032, "language_loss": 0.62918413, "learning_rate": 1.0783369775863915e-07, "loss": 0.64140785, "num_input_tokens_seen": 322187295, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.08544922, "step": 14938, "time_per_iteration": 3.0692741870880127 }, { "auxiliary_loss_clip": 0.01231197, "auxiliary_loss_mlp": 0.00213722, "balance_loss_clip": 1.01905966, "balance_loss_mlp": 0.18993935, "epoch": 0.898181271606794, "flos": 16392718523520.0, "grad_norm": 1281.2972670911786, "language_loss": 0.87422413, "learning_rate": 1.0770757770136251e-07, "loss": 0.88867331, "num_input_tokens_seen": 322202965, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.23754883, "step": 14939, "time_per_iteration": 2.7186875343322754 }, { "auxiliary_loss_clip": 0.01146299, "auxiliary_loss_mlp": 0.00091835, "balance_loss_clip": 1.00304806, "balance_loss_mlp": 0.08396759, "epoch": 0.8982413948594619, "flos": 63440259989760.0, "grad_norm": 0.700137245070063, "language_loss": 0.52256083, "learning_rate": 1.0758152940000375e-07, "loss": 0.53494215, "num_input_tokens_seen": 322269490, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.07861328, "step": 14940, "time_per_iteration": 3.26021671295166 }, { "auxiliary_loss_clip": 0.01260193, "auxiliary_loss_mlp": 0.00228671, "balance_loss_clip": 1.03309739, "balance_loss_mlp": 0.19975102, "epoch": 0.8983015181121299, "flos": 21835340029440.0, "grad_norm": 6.46360948043896, "language_loss": 0.88220465, "learning_rate": 1.0745555285934327e-07, "loss": 0.8970933, "num_input_tokens_seen": 322288060, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.28930664, "step": 14941, "time_per_iteration": 2.6849465370178223 }, { "auxiliary_loss_clip": 0.01251025, "auxiliary_loss_mlp": 0.00207093, "balance_loss_clip": 1.03518128, "balance_loss_mlp": 0.18209529, "epoch": 0.8983616413647978, "flos": 28949961767040.0, "grad_norm": 20.65981664697821, "language_loss": 0.81697428, "learning_rate": 1.0732964808415834e-07, "loss": 0.83155549, "num_input_tokens_seen": 322307930, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.25, "step": 14942, "time_per_iteration": 2.7346599102020264 }, { "auxiliary_loss_clip": 0.01253703, "auxiliary_loss_mlp": 0.00225828, "balance_loss_clip": 1.03357244, "balance_loss_mlp": 0.19895832, "epoch": 0.8984217646174658, "flos": 17785083375360.0, "grad_norm": 13.98619710078044, "language_loss": 0.91064227, "learning_rate": 1.0720381507922205e-07, "loss": 0.92543763, "num_input_tokens_seen": 322326155, "router_z_loss_clip": 2.19824219, "router_z_loss_mlp": 0.26879883, "step": 14943, "time_per_iteration": 2.6412899494171143 }, { "auxiliary_loss_clip": 0.01258633, "auxiliary_loss_mlp": 0.00229126, "balance_loss_clip": 1.03419089, "balance_loss_mlp": 0.20243502, "epoch": 0.8984818878701337, "flos": 23404528558080.0, "grad_norm": 324.571002444918, "language_loss": 0.78422594, "learning_rate": 1.0707805384930701e-07, "loss": 0.7991035, "num_input_tokens_seen": 322345850, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.26721191, "step": 14944, "time_per_iteration": 2.7357442378997803 }, { "auxiliary_loss_clip": 0.01252062, "auxiliary_loss_mlp": 0.00231042, "balance_loss_clip": 1.02920866, "balance_loss_mlp": 0.20404083, "epoch": 0.8985420111228017, "flos": 22346061557760.0, "grad_norm": 63.189301355003245, "language_loss": 0.86581004, "learning_rate": 1.0695236439918187e-07, "loss": 0.88064104, "num_input_tokens_seen": 322364715, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.27001953, "step": 14945, "time_per_iteration": 2.6971487998962402 }, { "auxiliary_loss_clip": 0.01276143, "auxiliary_loss_mlp": 0.00227949, "balance_loss_clip": 1.04336965, "balance_loss_mlp": 0.19902879, "epoch": 0.8986021343754698, "flos": 21392776558080.0, "grad_norm": 50.090817721132986, "language_loss": 0.86000299, "learning_rate": 1.0682674673361302e-07, "loss": 0.87504399, "num_input_tokens_seen": 322383570, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.28955078, "step": 14946, "time_per_iteration": 2.736772298812866 }, { "auxiliary_loss_clip": 0.01240341, "auxiliary_loss_mlp": 0.00217429, "balance_loss_clip": 1.02447557, "balance_loss_mlp": 0.1928007, "epoch": 0.8986622576281377, "flos": 21325372686720.0, "grad_norm": 14.776985114773739, "language_loss": 0.7191866, "learning_rate": 1.0670120085736334e-07, "loss": 0.73376429, "num_input_tokens_seen": 322401375, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24633789, "step": 14947, "time_per_iteration": 2.681806802749634 }, { "auxiliary_loss_clip": 0.01244099, "auxiliary_loss_mlp": 0.00206692, "balance_loss_clip": 1.02717495, "balance_loss_mlp": 0.1804423, "epoch": 0.8987223808808057, "flos": 23988292392960.0, "grad_norm": 73.39639464959784, "language_loss": 0.79218835, "learning_rate": 1.0657572677519411e-07, "loss": 0.8066963, "num_input_tokens_seen": 322421890, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.26257324, "step": 14948, "time_per_iteration": 2.7832841873168945 }, { "auxiliary_loss_clip": 0.012472, "auxiliary_loss_mlp": 0.00222209, "balance_loss_clip": 1.02591908, "balance_loss_mlp": 0.19519581, "epoch": 0.8987825041334736, "flos": 41500956044160.0, "grad_norm": 40.060983661418824, "language_loss": 0.82363653, "learning_rate": 1.0645032449186309e-07, "loss": 0.83833063, "num_input_tokens_seen": 322445730, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26989746, "step": 14949, "time_per_iteration": 2.83032488822937 }, { "auxiliary_loss_clip": 0.01273991, "auxiliary_loss_mlp": 0.00252956, "balance_loss_clip": 1.04165447, "balance_loss_mlp": 0.22293895, "epoch": 0.8988426273861416, "flos": 27564276844800.0, "grad_norm": 167.59344068171558, "language_loss": 0.8353703, "learning_rate": 1.0632499401212513e-07, "loss": 0.85063982, "num_input_tokens_seen": 322464595, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.30029297, "step": 14950, "time_per_iteration": 2.7130472660064697 }, { "auxiliary_loss_clip": 0.01249907, "auxiliary_loss_mlp": 0.00203447, "balance_loss_clip": 1.03350878, "balance_loss_mlp": 0.17852066, "epoch": 0.8989027506388095, "flos": 17092653920640.0, "grad_norm": 23.051235351621226, "language_loss": 0.76185572, "learning_rate": 1.0619973534073334e-07, "loss": 0.7763893, "num_input_tokens_seen": 322483305, "router_z_loss_clip": 2.15917969, "router_z_loss_mlp": 0.24926758, "step": 14951, "time_per_iteration": 2.67946195602417 }, { "auxiliary_loss_clip": 0.01263989, "auxiliary_loss_mlp": 0.00222952, "balance_loss_clip": 1.03887558, "balance_loss_mlp": 0.19598684, "epoch": 0.8989628738914776, "flos": 20555124416640.0, "grad_norm": 8.361169087087884, "language_loss": 0.8240701, "learning_rate": 1.0607454848243769e-07, "loss": 0.83893949, "num_input_tokens_seen": 322501905, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.26977539, "step": 14952, "time_per_iteration": 2.707854747772217 }, { "auxiliary_loss_clip": 0.01243847, "auxiliary_loss_mlp": 0.00215298, "balance_loss_clip": 1.02521133, "balance_loss_mlp": 0.18750995, "epoch": 0.8990229971441455, "flos": 16251087196800.0, "grad_norm": 41.63146965420085, "language_loss": 0.69290012, "learning_rate": 1.0594943344198481e-07, "loss": 0.70749158, "num_input_tokens_seen": 322518135, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.2779541, "step": 14953, "time_per_iteration": 2.7253971099853516 }, { "auxiliary_loss_clip": 0.01247771, "auxiliary_loss_mlp": 0.00222013, "balance_loss_clip": 1.02739906, "balance_loss_mlp": 0.19662173, "epoch": 0.8990831203968135, "flos": 21981316901760.0, "grad_norm": 6.80418576062488, "language_loss": 0.90877491, "learning_rate": 1.0582439022411915e-07, "loss": 0.92347276, "num_input_tokens_seen": 322537905, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.25366211, "step": 14954, "time_per_iteration": 2.728855848312378 }, { "auxiliary_loss_clip": 0.01242923, "auxiliary_loss_mlp": 0.00210529, "balance_loss_clip": 1.02753043, "balance_loss_mlp": 0.18729469, "epoch": 0.8991432436494814, "flos": 27447171528960.0, "grad_norm": 51.0196201556688, "language_loss": 0.69048452, "learning_rate": 1.0569941883358224e-07, "loss": 0.70501906, "num_input_tokens_seen": 322557945, "router_z_loss_clip": 2.15722656, "router_z_loss_mlp": 0.23266602, "step": 14955, "time_per_iteration": 2.749107599258423 }, { "auxiliary_loss_clip": 0.0124407, "auxiliary_loss_mlp": 0.00207663, "balance_loss_clip": 1.03125906, "balance_loss_mlp": 0.18448859, "epoch": 0.8992033669021494, "flos": 21579835610880.0, "grad_norm": 1194.1623971174467, "language_loss": 0.62840426, "learning_rate": 1.0557451927511341e-07, "loss": 0.64292163, "num_input_tokens_seen": 322575765, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.23181152, "step": 14956, "time_per_iteration": 4.052279949188232 }, { "auxiliary_loss_clip": 0.01241129, "auxiliary_loss_mlp": 0.00219022, "balance_loss_clip": 1.02749491, "balance_loss_mlp": 0.19438177, "epoch": 0.8992634901548173, "flos": 28584211530240.0, "grad_norm": 27.52854732823561, "language_loss": 0.87340331, "learning_rate": 1.0544969155344863e-07, "loss": 0.88800478, "num_input_tokens_seen": 322595665, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.24658203, "step": 14957, "time_per_iteration": 4.257422685623169 }, { "auxiliary_loss_clip": 0.01256411, "auxiliary_loss_mlp": 0.00230246, "balance_loss_clip": 1.03139627, "balance_loss_mlp": 0.20428193, "epoch": 0.8993236134074853, "flos": 19867435557120.0, "grad_norm": 2.861413126913298, "language_loss": 0.86749423, "learning_rate": 1.0532493567332123e-07, "loss": 0.88236082, "num_input_tokens_seen": 322614755, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.25952148, "step": 14958, "time_per_iteration": 2.7474870681762695 }, { "auxiliary_loss_clip": 0.01235628, "auxiliary_loss_mlp": 0.00201742, "balance_loss_clip": 1.02549732, "balance_loss_mlp": 0.17739901, "epoch": 0.8993837366601534, "flos": 19390649402880.0, "grad_norm": 874.5527326568764, "language_loss": 0.82959729, "learning_rate": 1.0520025163946277e-07, "loss": 0.84397095, "num_input_tokens_seen": 322633425, "router_z_loss_clip": 2.10058594, "router_z_loss_mlp": 0.24353027, "step": 14959, "time_per_iteration": 2.643650531768799 }, { "auxiliary_loss_clip": 0.01228531, "auxiliary_loss_mlp": 0.00211901, "balance_loss_clip": 1.01512825, "balance_loss_mlp": 0.18759465, "epoch": 0.8994438599128213, "flos": 18551740285440.0, "grad_norm": 33.49251383292649, "language_loss": 0.79342508, "learning_rate": 1.0507563945660015e-07, "loss": 0.80782938, "num_input_tokens_seen": 322652065, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.24304199, "step": 14960, "time_per_iteration": 2.8261499404907227 }, { "auxiliary_loss_clip": 0.01247549, "auxiliary_loss_mlp": 0.00214777, "balance_loss_clip": 1.02898049, "balance_loss_mlp": 0.19029126, "epoch": 0.8995039831654893, "flos": 24427587726720.0, "grad_norm": 34.942523289377846, "language_loss": 0.72275305, "learning_rate": 1.049510991294591e-07, "loss": 0.73737627, "num_input_tokens_seen": 322673275, "router_z_loss_clip": 2.18652344, "router_z_loss_mlp": 0.24499512, "step": 14961, "time_per_iteration": 2.770008087158203 }, { "auxiliary_loss_clip": 0.01223343, "auxiliary_loss_mlp": 0.00225923, "balance_loss_clip": 1.0132761, "balance_loss_mlp": 0.20262982, "epoch": 0.8995641064181572, "flos": 21251324799360.0, "grad_norm": 3.1274646550465204, "language_loss": 0.88090205, "learning_rate": 1.0482663066276254e-07, "loss": 0.89539468, "num_input_tokens_seen": 322693375, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.2331543, "step": 14962, "time_per_iteration": 2.6767005920410156 }, { "auxiliary_loss_clip": 0.01268258, "auxiliary_loss_mlp": 0.00217217, "balance_loss_clip": 1.04023874, "balance_loss_mlp": 0.19004926, "epoch": 0.8996242296708252, "flos": 23513661054720.0, "grad_norm": 3.047560081478307, "language_loss": 0.83801186, "learning_rate": 1.047022340612298e-07, "loss": 0.85286659, "num_input_tokens_seen": 322712615, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.27209473, "step": 14963, "time_per_iteration": 4.177760601043701 }, { "auxiliary_loss_clip": 0.01146478, "auxiliary_loss_mlp": 0.00100254, "balance_loss_clip": 1.00604069, "balance_loss_mlp": 0.0900974, "epoch": 0.8996843529234931, "flos": 62403230430720.0, "grad_norm": 0.7536273048986821, "language_loss": 0.56641686, "learning_rate": 1.0457790932957867e-07, "loss": 0.57888424, "num_input_tokens_seen": 322766855, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.1015625, "step": 14964, "time_per_iteration": 2.9920525550842285 }, { "auxiliary_loss_clip": 0.01293745, "auxiliary_loss_mlp": 0.00227775, "balance_loss_clip": 1.05464208, "balance_loss_mlp": 0.2003208, "epoch": 0.8997444761761612, "flos": 24236829573120.0, "grad_norm": 15.72876555013831, "language_loss": 0.82030475, "learning_rate": 1.0445365647252269e-07, "loss": 0.83551991, "num_input_tokens_seen": 322781130, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.27453613, "step": 14965, "time_per_iteration": 2.68679141998291 }, { "auxiliary_loss_clip": 0.01252561, "auxiliary_loss_mlp": 0.00195397, "balance_loss_clip": 1.03033555, "balance_loss_mlp": 0.17134012, "epoch": 0.8998045994288291, "flos": 21361103740800.0, "grad_norm": 435.17392338682373, "language_loss": 0.79898834, "learning_rate": 1.0432947549477433e-07, "loss": 0.81346798, "num_input_tokens_seen": 322800310, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.24060059, "step": 14966, "time_per_iteration": 2.6923253536224365 }, { "auxiliary_loss_clip": 0.01241951, "auxiliary_loss_mlp": 0.0022636, "balance_loss_clip": 1.02683759, "balance_loss_mlp": 0.19920461, "epoch": 0.8998647226814971, "flos": 28986159697920.0, "grad_norm": 11.711113219074862, "language_loss": 0.80172276, "learning_rate": 1.0420536640104205e-07, "loss": 0.81640589, "num_input_tokens_seen": 322820955, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.27172852, "step": 14967, "time_per_iteration": 2.715650796890259 }, { "auxiliary_loss_clip": 0.012261, "auxiliary_loss_mlp": 0.00206238, "balance_loss_clip": 1.01155901, "balance_loss_mlp": 0.18170491, "epoch": 0.899924845934165, "flos": 13625909706240.0, "grad_norm": 3.1484606151643733, "language_loss": 0.82622993, "learning_rate": 1.040813291960323e-07, "loss": 0.84055334, "num_input_tokens_seen": 322838780, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24536133, "step": 14968, "time_per_iteration": 3.988271474838257 }, { "auxiliary_loss_clip": 0.0123975, "auxiliary_loss_mlp": 0.00222064, "balance_loss_clip": 1.02459717, "balance_loss_mlp": 0.19545677, "epoch": 0.899984969186833, "flos": 20882629647360.0, "grad_norm": 34.791924160654126, "language_loss": 0.78474104, "learning_rate": 1.0395736388444864e-07, "loss": 0.7993592, "num_input_tokens_seen": 322856710, "router_z_loss_clip": 2.15136719, "router_z_loss_mlp": 0.26586914, "step": 14969, "time_per_iteration": 2.6075313091278076 }, { "auxiliary_loss_clip": 0.01248789, "auxiliary_loss_mlp": 0.00221236, "balance_loss_clip": 1.02465081, "balance_loss_mlp": 0.19524834, "epoch": 0.9000450924395009, "flos": 20921808407040.0, "grad_norm": 7.344813267680482, "language_loss": 0.86877692, "learning_rate": 1.0383347047099201e-07, "loss": 0.88347709, "num_input_tokens_seen": 322876070, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.26025391, "step": 14970, "time_per_iteration": 2.78420352935791 }, { "auxiliary_loss_clip": 0.01245624, "auxiliary_loss_mlp": 0.00193198, "balance_loss_clip": 1.02533197, "balance_loss_mlp": 0.16868848, "epoch": 0.900105215692169, "flos": 17165049782400.0, "grad_norm": 284.53999455683527, "language_loss": 0.8279804, "learning_rate": 1.0370964896035972e-07, "loss": 0.8423686, "num_input_tokens_seen": 322895095, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24523926, "step": 14971, "time_per_iteration": 2.737246513366699 }, { "auxiliary_loss_clip": 0.01256025, "auxiliary_loss_mlp": 0.00216024, "balance_loss_clip": 1.02922845, "balance_loss_mlp": 0.19040546, "epoch": 0.900165338944837, "flos": 19931930426880.0, "grad_norm": 89.09426343475955, "language_loss": 0.9284265, "learning_rate": 1.035858993572476e-07, "loss": 0.94314706, "num_input_tokens_seen": 322911845, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.25622559, "step": 14972, "time_per_iteration": 2.7411112785339355 }, { "auxiliary_loss_clip": 0.01252405, "auxiliary_loss_mlp": 0.00241899, "balance_loss_clip": 1.0286355, "balance_loss_mlp": 0.21561375, "epoch": 0.9002254621975049, "flos": 16107085572480.0, "grad_norm": 6.270421693920907, "language_loss": 0.90324378, "learning_rate": 1.0346222166634855e-07, "loss": 0.91818684, "num_input_tokens_seen": 322928170, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26293945, "step": 14973, "time_per_iteration": 2.720637559890747 }, { "auxiliary_loss_clip": 0.01236139, "auxiliary_loss_mlp": 0.00215326, "balance_loss_clip": 1.01683497, "balance_loss_mlp": 0.19036314, "epoch": 0.9002855854501729, "flos": 28476120528000.0, "grad_norm": 3.8680362197281375, "language_loss": 0.66735727, "learning_rate": 1.0333861589235193e-07, "loss": 0.68187189, "num_input_tokens_seen": 322948165, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.24987793, "step": 14974, "time_per_iteration": 2.7946276664733887 }, { "auxiliary_loss_clip": 0.01263999, "auxiliary_loss_mlp": 0.00216925, "balance_loss_clip": 1.0406996, "balance_loss_mlp": 0.19149731, "epoch": 0.9003457087028408, "flos": 25630307746560.0, "grad_norm": 3989.179351830267, "language_loss": 0.69692379, "learning_rate": 1.0321508203994489e-07, "loss": 0.71173298, "num_input_tokens_seen": 322968880, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25427246, "step": 14975, "time_per_iteration": 2.831637382507324 }, { "auxiliary_loss_clip": 0.01245793, "auxiliary_loss_mlp": 0.00192163, "balance_loss_clip": 1.02908397, "balance_loss_mlp": 0.16634245, "epoch": 0.9004058319555088, "flos": 24389414547840.0, "grad_norm": 95.32687079638082, "language_loss": 0.81172788, "learning_rate": 1.0309162011381257e-07, "loss": 0.82610744, "num_input_tokens_seen": 322989395, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.25805664, "step": 14976, "time_per_iteration": 2.7177388668060303 }, { "auxiliary_loss_clip": 0.01240682, "auxiliary_loss_mlp": 0.00214049, "balance_loss_clip": 1.0276649, "balance_loss_mlp": 0.18953922, "epoch": 0.9004659552081767, "flos": 29059345658880.0, "grad_norm": 70.3143908033979, "language_loss": 0.7604627, "learning_rate": 1.0296823011863565e-07, "loss": 0.77500999, "num_input_tokens_seen": 323009060, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.24487305, "step": 14977, "time_per_iteration": 2.7912211418151855 }, { "auxiliary_loss_clip": 0.01251024, "auxiliary_loss_mlp": 0.00214199, "balance_loss_clip": 1.03062081, "balance_loss_mlp": 0.18920061, "epoch": 0.9005260784608448, "flos": 16763855800320.0, "grad_norm": 6.3485233288736005, "language_loss": 0.80022693, "learning_rate": 1.0284491205909351e-07, "loss": 0.81487918, "num_input_tokens_seen": 323027530, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25012207, "step": 14978, "time_per_iteration": 2.6954894065856934 }, { "auxiliary_loss_clip": 0.01265534, "auxiliary_loss_mlp": 0.00209336, "balance_loss_clip": 1.03948212, "balance_loss_mlp": 0.18381356, "epoch": 0.9005862017135127, "flos": 20376002269440.0, "grad_norm": 3.1716216884038717, "language_loss": 0.88810337, "learning_rate": 1.0272166593986286e-07, "loss": 0.90285212, "num_input_tokens_seen": 323045370, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.25524902, "step": 14979, "time_per_iteration": 2.6934101581573486 }, { "auxiliary_loss_clip": 0.01135075, "auxiliary_loss_mlp": 0.00161343, "balance_loss_clip": 0.99371183, "balance_loss_mlp": 0.15123378, "epoch": 0.9006463249661807, "flos": 67580255796480.0, "grad_norm": 0.8520949452415648, "language_loss": 0.52137387, "learning_rate": 1.0259849176561642e-07, "loss": 0.53433812, "num_input_tokens_seen": 323105660, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.10107422, "step": 14980, "time_per_iteration": 3.2253005504608154 }, { "auxiliary_loss_clip": 0.01259691, "auxiliary_loss_mlp": 0.00215841, "balance_loss_clip": 1.03542447, "balance_loss_mlp": 0.18782638, "epoch": 0.9007064482188486, "flos": 28293335193600.0, "grad_norm": 401.9227631423909, "language_loss": 0.90689969, "learning_rate": 1.0247538954102553e-07, "loss": 0.92165506, "num_input_tokens_seen": 323126365, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.2800293, "step": 14981, "time_per_iteration": 2.77797794342041 }, { "auxiliary_loss_clip": 0.01235473, "auxiliary_loss_mlp": 0.00216084, "balance_loss_clip": 1.01986003, "balance_loss_mlp": 0.1907759, "epoch": 0.9007665714715166, "flos": 21616320850560.0, "grad_norm": 108.65820250838966, "language_loss": 0.885566, "learning_rate": 1.0235235927075758e-07, "loss": 0.90008163, "num_input_tokens_seen": 323145655, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.25305176, "step": 14982, "time_per_iteration": 2.7323219776153564 }, { "auxiliary_loss_clip": 0.0124407, "auxiliary_loss_mlp": 0.00209516, "balance_loss_clip": 1.02718997, "balance_loss_mlp": 0.18563813, "epoch": 0.9008266947241845, "flos": 26541864120960.0, "grad_norm": 3.4063121311916356, "language_loss": 0.7767309, "learning_rate": 1.0222940095947885e-07, "loss": 0.7912668, "num_input_tokens_seen": 323164540, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23876953, "step": 14983, "time_per_iteration": 2.701016664505005 }, { "auxiliary_loss_clip": 0.01235756, "auxiliary_loss_mlp": 0.00221991, "balance_loss_clip": 1.02026498, "balance_loss_mlp": 0.19675469, "epoch": 0.9008868179768525, "flos": 23110527738240.0, "grad_norm": 258.16302489021706, "language_loss": 0.80706918, "learning_rate": 1.0210651461185115e-07, "loss": 0.82164669, "num_input_tokens_seen": 323186960, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.25244141, "step": 14984, "time_per_iteration": 2.726374626159668 }, { "auxiliary_loss_clip": 0.01219603, "auxiliary_loss_mlp": 0.00198793, "balance_loss_clip": 1.01146698, "balance_loss_mlp": 0.17551103, "epoch": 0.9009469412295206, "flos": 19060809788160.0, "grad_norm": 7.621481403019943, "language_loss": 0.76561058, "learning_rate": 1.0198370023253456e-07, "loss": 0.77979451, "num_input_tokens_seen": 323206135, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.23278809, "step": 14985, "time_per_iteration": 2.6914021968841553 }, { "auxiliary_loss_clip": 0.01234466, "auxiliary_loss_mlp": 0.00215528, "balance_loss_clip": 1.01728356, "balance_loss_mlp": 0.19087572, "epoch": 0.9010070644821885, "flos": 23222281927680.0, "grad_norm": 12.885990101630478, "language_loss": 0.79401976, "learning_rate": 1.0186095782618643e-07, "loss": 0.80851972, "num_input_tokens_seen": 323225980, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24658203, "step": 14986, "time_per_iteration": 2.7460954189300537 }, { "auxiliary_loss_clip": 0.01228712, "auxiliary_loss_mlp": 0.00208977, "balance_loss_clip": 1.01319194, "balance_loss_mlp": 0.18501607, "epoch": 0.9010671877348565, "flos": 17384823146880.0, "grad_norm": 10.092715810862806, "language_loss": 0.83258355, "learning_rate": 1.0173828739746104e-07, "loss": 0.84696043, "num_input_tokens_seen": 323243700, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.23962402, "step": 14987, "time_per_iteration": 2.6339879035949707 }, { "auxiliary_loss_clip": 0.01240797, "auxiliary_loss_mlp": 0.00240234, "balance_loss_clip": 1.01660895, "balance_loss_mlp": 0.21269655, "epoch": 0.9011273109875244, "flos": 21908166854400.0, "grad_norm": 14.769897462053658, "language_loss": 0.83201468, "learning_rate": 1.0161568895100981e-07, "loss": 0.846825, "num_input_tokens_seen": 323261535, "router_z_loss_clip": 2.23535156, "router_z_loss_mlp": 0.27514648, "step": 14988, "time_per_iteration": 2.6589250564575195 }, { "auxiliary_loss_clip": 0.01254417, "auxiliary_loss_mlp": 0.00215225, "balance_loss_clip": 1.033499, "balance_loss_mlp": 0.19027415, "epoch": 0.9011874342401924, "flos": 24060831909120.0, "grad_norm": 209.30486048181731, "language_loss": 0.80368507, "learning_rate": 1.0149316249148188e-07, "loss": 0.81838149, "num_input_tokens_seen": 323281855, "router_z_loss_clip": 2.21386719, "router_z_loss_mlp": 0.24951172, "step": 14989, "time_per_iteration": 2.6847009658813477 }, { "auxiliary_loss_clip": 0.01247215, "auxiliary_loss_mlp": 0.00227391, "balance_loss_clip": 1.02619159, "balance_loss_mlp": 0.20053352, "epoch": 0.9012475574928603, "flos": 16758791982720.0, "grad_norm": 723.814252877338, "language_loss": 0.90443444, "learning_rate": 1.0137070802352376e-07, "loss": 0.91918051, "num_input_tokens_seen": 323299505, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26843262, "step": 14990, "time_per_iteration": 2.7019410133361816 }, { "auxiliary_loss_clip": 0.01257715, "auxiliary_loss_mlp": 0.00234428, "balance_loss_clip": 1.03443456, "balance_loss_mlp": 0.20759422, "epoch": 0.9013076807455284, "flos": 19971109186560.0, "grad_norm": 14.414215547132091, "language_loss": 0.86783975, "learning_rate": 1.0124832555177842e-07, "loss": 0.88276118, "num_input_tokens_seen": 323318365, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26818848, "step": 14991, "time_per_iteration": 2.7014145851135254 }, { "auxiliary_loss_clip": 0.01144953, "auxiliary_loss_mlp": 0.00099595, "balance_loss_clip": 1.00031066, "balance_loss_mlp": 0.09063063, "epoch": 0.9013678039981963, "flos": 65180274624000.0, "grad_norm": 1.3993841061653867, "language_loss": 0.59377682, "learning_rate": 1.0112601508088726e-07, "loss": 0.60622227, "num_input_tokens_seen": 323371835, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.08984375, "step": 14992, "time_per_iteration": 3.072889566421509 }, { "auxiliary_loss_clip": 0.01233509, "auxiliary_loss_mlp": 0.00201345, "balance_loss_clip": 1.01843739, "balance_loss_mlp": 0.1773358, "epoch": 0.9014279272508643, "flos": 20521224956160.0, "grad_norm": 23.281941032500054, "language_loss": 0.90760767, "learning_rate": 1.0100377661548764e-07, "loss": 0.92195618, "num_input_tokens_seen": 323388495, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.23999023, "step": 14993, "time_per_iteration": 2.6695525646209717 }, { "auxiliary_loss_clip": 0.01244423, "auxiliary_loss_mlp": 0.00219307, "balance_loss_clip": 1.02775025, "balance_loss_mlp": 0.19445223, "epoch": 0.9014880505035322, "flos": 17309051406720.0, "grad_norm": 3.8018175954967703, "language_loss": 0.82005048, "learning_rate": 1.0088161016021502e-07, "loss": 0.83468777, "num_input_tokens_seen": 323405280, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24841309, "step": 14994, "time_per_iteration": 2.6932692527770996 }, { "auxiliary_loss_clip": 0.01233382, "auxiliary_loss_mlp": 0.00214092, "balance_loss_clip": 1.02075565, "balance_loss_mlp": 0.18979757, "epoch": 0.9015481737562002, "flos": 28402862739840.0, "grad_norm": 3.97973985741406, "language_loss": 0.73189175, "learning_rate": 1.0075951571970187e-07, "loss": 0.74636656, "num_input_tokens_seen": 323425310, "router_z_loss_clip": 2.12597656, "router_z_loss_mlp": 0.24267578, "step": 14995, "time_per_iteration": 2.763733148574829 }, { "auxiliary_loss_clip": 0.01239586, "auxiliary_loss_mlp": 0.00213411, "balance_loss_clip": 1.01792383, "balance_loss_mlp": 0.18823402, "epoch": 0.9016082970088681, "flos": 29752672953600.0, "grad_norm": 18.84629864485391, "language_loss": 0.75751364, "learning_rate": 1.0063749329857873e-07, "loss": 0.77204359, "num_input_tokens_seen": 323447805, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25195312, "step": 14996, "time_per_iteration": 2.741243839263916 }, { "auxiliary_loss_clip": 0.01235753, "auxiliary_loss_mlp": 0.00188115, "balance_loss_clip": 1.02156007, "balance_loss_mlp": 0.16463102, "epoch": 0.9016684202615362, "flos": 23513230091520.0, "grad_norm": 10.1944999174036, "language_loss": 0.75761247, "learning_rate": 1.0051554290147168e-07, "loss": 0.77185112, "num_input_tokens_seen": 323467150, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.23486328, "step": 14997, "time_per_iteration": 2.7784781455993652 }, { "auxiliary_loss_clip": 0.01251361, "auxiliary_loss_mlp": 0.00225079, "balance_loss_clip": 1.03141117, "balance_loss_mlp": 0.19938916, "epoch": 0.9017285435142042, "flos": 16979247705600.0, "grad_norm": 24.4270080199845, "language_loss": 0.84484929, "learning_rate": 1.0039366453300613e-07, "loss": 0.85961366, "num_input_tokens_seen": 323484250, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.25695801, "step": 14998, "time_per_iteration": 2.6779465675354004 }, { "auxiliary_loss_clip": 0.01238647, "auxiliary_loss_mlp": 0.00215491, "balance_loss_clip": 1.0206393, "balance_loss_mlp": 0.19009905, "epoch": 0.9017886667668721, "flos": 21393351175680.0, "grad_norm": 16.63805499006334, "language_loss": 0.83103526, "learning_rate": 1.0027185819780281e-07, "loss": 0.84557664, "num_input_tokens_seen": 323502910, "router_z_loss_clip": 2.18261719, "router_z_loss_mlp": 0.25378418, "step": 14999, "time_per_iteration": 5.574976921081543 }, { "auxiliary_loss_clip": 0.01258981, "auxiliary_loss_mlp": 0.00222543, "balance_loss_clip": 1.03729272, "balance_loss_mlp": 0.19518444, "epoch": 0.9018487900195401, "flos": 20996574566400.0, "grad_norm": 10.038773970461397, "language_loss": 0.84175283, "learning_rate": 1.0015012390048117e-07, "loss": 0.85656804, "num_input_tokens_seen": 323521820, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.27331543, "step": 15000, "time_per_iteration": 2.642618179321289 }, { "auxiliary_loss_clip": 0.01236722, "auxiliary_loss_mlp": 0.00212158, "balance_loss_clip": 1.02321672, "balance_loss_mlp": 0.18719539, "epoch": 0.901908913272208, "flos": 53358443458560.0, "grad_norm": 58.16988022019603, "language_loss": 0.89850819, "learning_rate": 1.0002846164565704e-07, "loss": 0.91299701, "num_input_tokens_seen": 323543200, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24951172, "step": 15001, "time_per_iteration": 2.9734010696411133 }, { "auxiliary_loss_clip": 0.01232716, "auxiliary_loss_mlp": 0.00201111, "balance_loss_clip": 1.02037251, "balance_loss_mlp": 0.17892662, "epoch": 0.901969036524876, "flos": 22089838867200.0, "grad_norm": 422.18531102711256, "language_loss": 0.84160447, "learning_rate": 9.990687143794407e-08, "loss": 0.85594279, "num_input_tokens_seen": 323563075, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.22167969, "step": 15002, "time_per_iteration": 2.6874022483825684 }, { "auxiliary_loss_clip": 0.01242063, "auxiliary_loss_mlp": 0.00194303, "balance_loss_clip": 1.02581549, "balance_loss_mlp": 0.16949505, "epoch": 0.9020291597775439, "flos": 23835025059840.0, "grad_norm": 20.664040522650563, "language_loss": 0.77737236, "learning_rate": 9.978535328195347e-08, "loss": 0.79173607, "num_input_tokens_seen": 323579065, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24816895, "step": 15003, "time_per_iteration": 2.6824169158935547 }, { "auxiliary_loss_clip": 0.0123673, "auxiliary_loss_mlp": 0.00228464, "balance_loss_clip": 1.02109659, "balance_loss_mlp": 0.20456275, "epoch": 0.902089283030212, "flos": 18326005263360.0, "grad_norm": 108.4412815014333, "language_loss": 0.92965311, "learning_rate": 9.9663907182292e-08, "loss": 0.94430506, "num_input_tokens_seen": 323594835, "router_z_loss_clip": 2.15527344, "router_z_loss_mlp": 0.23901367, "step": 15004, "time_per_iteration": 2.734975814819336 }, { "auxiliary_loss_clip": 0.01241901, "auxiliary_loss_mlp": 0.00199595, "balance_loss_clip": 1.02341461, "balance_loss_mlp": 0.17454875, "epoch": 0.9021494062828799, "flos": 24170359455360.0, "grad_norm": 47.6089388395283, "language_loss": 0.8227073, "learning_rate": 9.954253314356575e-08, "loss": 0.8371222, "num_input_tokens_seen": 323611475, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25048828, "step": 15005, "time_per_iteration": 4.190165758132935 }, { "auxiliary_loss_clip": 0.01251231, "auxiliary_loss_mlp": 0.00228183, "balance_loss_clip": 1.02950597, "balance_loss_mlp": 0.20021638, "epoch": 0.9022095295355479, "flos": 21616859554560.0, "grad_norm": 5.438884616713143, "language_loss": 0.81397831, "learning_rate": 9.942123117037748e-08, "loss": 0.82877243, "num_input_tokens_seen": 323629730, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.2800293, "step": 15006, "time_per_iteration": 2.6893153190612793 }, { "auxiliary_loss_clip": 0.01259467, "auxiliary_loss_mlp": 0.00226935, "balance_loss_clip": 1.0371778, "balance_loss_mlp": 0.20203194, "epoch": 0.9022696527882158, "flos": 18726229578240.0, "grad_norm": 57.048777572370646, "language_loss": 0.92812419, "learning_rate": 9.930000126732618e-08, "loss": 0.94298822, "num_input_tokens_seen": 323646000, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.24902344, "step": 15007, "time_per_iteration": 2.6015121936798096 }, { "auxiliary_loss_clip": 0.01239834, "auxiliary_loss_mlp": 0.00211302, "balance_loss_clip": 1.02301955, "balance_loss_mlp": 0.18611346, "epoch": 0.9023297760408838, "flos": 26761206522240.0, "grad_norm": 5933.435773313606, "language_loss": 0.84724581, "learning_rate": 9.917884343900928e-08, "loss": 0.86175716, "num_input_tokens_seen": 323667250, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.2520752, "step": 15008, "time_per_iteration": 2.7353920936584473 }, { "auxiliary_loss_clip": 0.01216398, "auxiliary_loss_mlp": 0.00202642, "balance_loss_clip": 1.00735974, "balance_loss_mlp": 0.1786336, "epoch": 0.9023898992935517, "flos": 20522553759360.0, "grad_norm": 21.35877510816422, "language_loss": 0.81319511, "learning_rate": 9.905775769002156e-08, "loss": 0.82738554, "num_input_tokens_seen": 323687150, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.2401123, "step": 15009, "time_per_iteration": 2.6589205265045166 }, { "auxiliary_loss_clip": 0.0125335, "auxiliary_loss_mlp": 0.00189278, "balance_loss_clip": 1.03505707, "balance_loss_mlp": 0.16246751, "epoch": 0.9024500225462198, "flos": 17456644391040.0, "grad_norm": 12.598554133260299, "language_loss": 0.82041001, "learning_rate": 9.893674402495399e-08, "loss": 0.83483636, "num_input_tokens_seen": 323703660, "router_z_loss_clip": 2.18066406, "router_z_loss_mlp": 0.26818848, "step": 15010, "time_per_iteration": 4.252178907394409 }, { "auxiliary_loss_clip": 0.01250605, "auxiliary_loss_mlp": 0.00201241, "balance_loss_clip": 1.03203559, "balance_loss_mlp": 0.17599216, "epoch": 0.9025101457988878, "flos": 20813609664000.0, "grad_norm": 26.24036777057758, "language_loss": 0.83770376, "learning_rate": 9.881580244839538e-08, "loss": 0.8522222, "num_input_tokens_seen": 323722060, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25219727, "step": 15011, "time_per_iteration": 2.6820366382598877 }, { "auxiliary_loss_clip": 0.01254051, "auxiliary_loss_mlp": 0.00220471, "balance_loss_clip": 1.03369093, "balance_loss_mlp": 0.19622347, "epoch": 0.9025702690515557, "flos": 19026371623680.0, "grad_norm": 44.21652215068953, "language_loss": 0.8581804, "learning_rate": 9.869493296493204e-08, "loss": 0.8729257, "num_input_tokens_seen": 323740645, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.2421875, "step": 15012, "time_per_iteration": 2.624124526977539 }, { "auxiliary_loss_clip": 0.01226091, "auxiliary_loss_mlp": 0.00196769, "balance_loss_clip": 1.01225019, "balance_loss_mlp": 0.17168745, "epoch": 0.9026303923042237, "flos": 19682818629120.0, "grad_norm": 38.560705028470515, "language_loss": 0.75973207, "learning_rate": 9.857413557914763e-08, "loss": 0.77396065, "num_input_tokens_seen": 323758905, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.25085449, "step": 15013, "time_per_iteration": 2.6967415809631348 }, { "auxiliary_loss_clip": 0.01228064, "auxiliary_loss_mlp": 0.00203292, "balance_loss_clip": 1.01747966, "balance_loss_mlp": 0.18027228, "epoch": 0.9026905155568916, "flos": 24608110504320.0, "grad_norm": 32.44093781396152, "language_loss": 0.79160237, "learning_rate": 9.845341029562249e-08, "loss": 0.80591589, "num_input_tokens_seen": 323780595, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.23022461, "step": 15014, "time_per_iteration": 2.715789556503296 }, { "auxiliary_loss_clip": 0.01242167, "auxiliary_loss_mlp": 0.00212673, "balance_loss_clip": 1.02445698, "balance_loss_mlp": 0.18694755, "epoch": 0.9027506388095596, "flos": 20521799573760.0, "grad_norm": 327.26455813075864, "language_loss": 0.79709738, "learning_rate": 9.833275711893474e-08, "loss": 0.81164581, "num_input_tokens_seen": 323798160, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25708008, "step": 15015, "time_per_iteration": 2.6807641983032227 }, { "auxiliary_loss_clip": 0.01257758, "auxiliary_loss_mlp": 0.00222392, "balance_loss_clip": 1.03500414, "balance_loss_mlp": 0.19670205, "epoch": 0.9028107620622275, "flos": 22784494965120.0, "grad_norm": 132.9685644941984, "language_loss": 0.77976662, "learning_rate": 9.821217605365895e-08, "loss": 0.79456812, "num_input_tokens_seen": 323816810, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25671387, "step": 15016, "time_per_iteration": 2.7219746112823486 }, { "auxiliary_loss_clip": 0.01234741, "auxiliary_loss_mlp": 0.00203063, "balance_loss_clip": 1.01997948, "balance_loss_mlp": 0.17945912, "epoch": 0.9028708853148956, "flos": 25410534382080.0, "grad_norm": 38918.254669354974, "language_loss": 0.8149693, "learning_rate": 9.809166710436855e-08, "loss": 0.82934725, "num_input_tokens_seen": 323836900, "router_z_loss_clip": 2.14941406, "router_z_loss_mlp": 0.23608398, "step": 15017, "time_per_iteration": 2.7588536739349365 }, { "auxiliary_loss_clip": 0.0125365, "auxiliary_loss_mlp": 0.00211258, "balance_loss_clip": 1.03704882, "balance_loss_mlp": 0.18655832, "epoch": 0.9029310085675635, "flos": 21871322478720.0, "grad_norm": 7.485255498032207, "language_loss": 0.75926203, "learning_rate": 9.797123027563237e-08, "loss": 0.77391112, "num_input_tokens_seen": 323855325, "router_z_loss_clip": 2.16894531, "router_z_loss_mlp": 0.24719238, "step": 15018, "time_per_iteration": 2.642148017883301 }, { "auxiliary_loss_clip": 0.01261323, "auxiliary_loss_mlp": 0.001934, "balance_loss_clip": 1.03631151, "balance_loss_mlp": 0.16767484, "epoch": 0.9029911318202315, "flos": 26214394803840.0, "grad_norm": 3.429631674040516, "language_loss": 0.76770735, "learning_rate": 9.785086557201782e-08, "loss": 0.78225452, "num_input_tokens_seen": 323875650, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.25769043, "step": 15019, "time_per_iteration": 2.750429391860962 }, { "auxiliary_loss_clip": 0.01218463, "auxiliary_loss_mlp": 0.00217933, "balance_loss_clip": 1.00944829, "balance_loss_mlp": 0.19463921, "epoch": 0.9030512550728994, "flos": 15961360095360.0, "grad_norm": 6.985222440715066, "language_loss": 0.80988622, "learning_rate": 9.773057299808951e-08, "loss": 0.82425022, "num_input_tokens_seen": 323892920, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.23291016, "step": 15020, "time_per_iteration": 2.6244869232177734 }, { "auxiliary_loss_clip": 0.01250545, "auxiliary_loss_mlp": 0.00198223, "balance_loss_clip": 1.0289669, "balance_loss_mlp": 0.17392772, "epoch": 0.9031113783255674, "flos": 23987610034560.0, "grad_norm": 58.35458515489077, "language_loss": 0.8183198, "learning_rate": 9.7610352558408e-08, "loss": 0.83280742, "num_input_tokens_seen": 323913835, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.24291992, "step": 15021, "time_per_iteration": 2.704535484313965 }, { "auxiliary_loss_clip": 0.01269444, "auxiliary_loss_mlp": 0.00216036, "balance_loss_clip": 1.03860939, "balance_loss_mlp": 0.18908289, "epoch": 0.9031715015782353, "flos": 22237216369920.0, "grad_norm": 7.958720477811071, "language_loss": 0.85382843, "learning_rate": 9.749020425753251e-08, "loss": 0.86868322, "num_input_tokens_seen": 323933440, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.26940918, "step": 15022, "time_per_iteration": 2.7219390869140625 }, { "auxiliary_loss_clip": 0.01222853, "auxiliary_loss_mlp": 0.00205274, "balance_loss_clip": 1.01196778, "balance_loss_mlp": 0.18065687, "epoch": 0.9032316248309034, "flos": 26323168164480.0, "grad_norm": 37.15080853215362, "language_loss": 0.80887413, "learning_rate": 9.737012810001943e-08, "loss": 0.82315534, "num_input_tokens_seen": 323954090, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.24597168, "step": 15023, "time_per_iteration": 2.7446141242980957 }, { "auxiliary_loss_clip": 0.01244111, "auxiliary_loss_mlp": 0.00211184, "balance_loss_clip": 1.02728534, "balance_loss_mlp": 0.18630549, "epoch": 0.9032917480835713, "flos": 22636686499200.0, "grad_norm": 16.161374012163932, "language_loss": 0.8959893, "learning_rate": 9.725012409042155e-08, "loss": 0.91054225, "num_input_tokens_seen": 323974040, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.2487793, "step": 15024, "time_per_iteration": 2.8138136863708496 }, { "auxiliary_loss_clip": 0.01240403, "auxiliary_loss_mlp": 0.00213474, "balance_loss_clip": 1.02427244, "balance_loss_mlp": 0.18977487, "epoch": 0.9033518713362393, "flos": 23878764846720.0, "grad_norm": 17.408762510992197, "language_loss": 0.77023172, "learning_rate": 9.713019223328966e-08, "loss": 0.78477049, "num_input_tokens_seen": 323996125, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.23706055, "step": 15025, "time_per_iteration": 2.708176612854004 }, { "auxiliary_loss_clip": 0.01237489, "auxiliary_loss_mlp": 0.0021092, "balance_loss_clip": 1.02087045, "balance_loss_mlp": 0.18708968, "epoch": 0.9034119945889073, "flos": 26905279973760.0, "grad_norm": 40.5976459823537, "language_loss": 0.84079719, "learning_rate": 9.70103325331717e-08, "loss": 0.85528123, "num_input_tokens_seen": 324017645, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.23828125, "step": 15026, "time_per_iteration": 2.74302077293396 }, { "auxiliary_loss_clip": 0.01257144, "auxiliary_loss_mlp": 0.00224993, "balance_loss_clip": 1.03505433, "balance_loss_mlp": 0.2002926, "epoch": 0.9034721178415752, "flos": 20850166730880.0, "grad_norm": 120.89512557012259, "language_loss": 0.79067469, "learning_rate": 9.68905449946129e-08, "loss": 0.8054961, "num_input_tokens_seen": 324036875, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24694824, "step": 15027, "time_per_iteration": 2.6471056938171387 }, { "auxiliary_loss_clip": 0.01219451, "auxiliary_loss_mlp": 0.00204295, "balance_loss_clip": 1.0079695, "balance_loss_mlp": 0.18038186, "epoch": 0.9035322410942432, "flos": 22234307368320.0, "grad_norm": 11.176804098902096, "language_loss": 0.8193146, "learning_rate": 9.677082962215477e-08, "loss": 0.83355206, "num_input_tokens_seen": 324057045, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.23937988, "step": 15028, "time_per_iteration": 2.743190050125122 }, { "auxiliary_loss_clip": 0.01242065, "auxiliary_loss_mlp": 0.00199508, "balance_loss_clip": 1.02302802, "balance_loss_mlp": 0.17361566, "epoch": 0.9035923643469111, "flos": 25923410726400.0, "grad_norm": 19.43433257442062, "language_loss": 0.76673383, "learning_rate": 9.665118642033765e-08, "loss": 0.78114951, "num_input_tokens_seen": 324079735, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25891113, "step": 15029, "time_per_iteration": 2.6872169971466064 }, { "auxiliary_loss_clip": 0.0125927, "auxiliary_loss_mlp": 0.00212059, "balance_loss_clip": 1.03443682, "balance_loss_mlp": 0.18722773, "epoch": 0.9036524875995792, "flos": 20339804338560.0, "grad_norm": 7.602857031576354, "language_loss": 0.8199864, "learning_rate": 9.653161539369858e-08, "loss": 0.83469975, "num_input_tokens_seen": 324097785, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.24816895, "step": 15030, "time_per_iteration": 2.647810459136963 }, { "auxiliary_loss_clip": 0.01245594, "auxiliary_loss_mlp": 0.00231025, "balance_loss_clip": 1.02689338, "balance_loss_mlp": 0.20395218, "epoch": 0.9037126108522471, "flos": 40114624677120.0, "grad_norm": 4.16667491061631, "language_loss": 0.74191982, "learning_rate": 9.641211654677151e-08, "loss": 0.75668597, "num_input_tokens_seen": 324121625, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.27062988, "step": 15031, "time_per_iteration": 2.8130578994750977 }, { "auxiliary_loss_clip": 0.01217254, "auxiliary_loss_mlp": 0.00211663, "balance_loss_clip": 1.00680375, "balance_loss_mlp": 0.18799964, "epoch": 0.9037727341049151, "flos": 23332024955520.0, "grad_norm": 3.4867909371867034, "language_loss": 0.84336489, "learning_rate": 9.629268988408723e-08, "loss": 0.85765409, "num_input_tokens_seen": 324142535, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.23657227, "step": 15032, "time_per_iteration": 2.694162368774414 }, { "auxiliary_loss_clip": 0.01249032, "auxiliary_loss_mlp": 0.00214095, "balance_loss_clip": 1.03249586, "balance_loss_mlp": 0.18863192, "epoch": 0.903832857357583, "flos": 12822659815680.0, "grad_norm": 21.771482162021986, "language_loss": 0.83669615, "learning_rate": 9.617333541017502e-08, "loss": 0.85132742, "num_input_tokens_seen": 324159610, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.25439453, "step": 15033, "time_per_iteration": 2.6575756072998047 }, { "auxiliary_loss_clip": 0.01247066, "auxiliary_loss_mlp": 0.00229222, "balance_loss_clip": 1.02635205, "balance_loss_mlp": 0.20169654, "epoch": 0.903892980610251, "flos": 25703026830720.0, "grad_norm": 8.055241139084632, "language_loss": 0.80451047, "learning_rate": 9.605405312956105e-08, "loss": 0.81927329, "num_input_tokens_seen": 324182510, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.27575684, "step": 15034, "time_per_iteration": 2.7057321071624756 }, { "auxiliary_loss_clip": 0.01255427, "auxiliary_loss_mlp": 0.00228398, "balance_loss_clip": 1.0361774, "balance_loss_mlp": 0.2039123, "epoch": 0.9039531038629189, "flos": 14684089397760.0, "grad_norm": 25.403456083190346, "language_loss": 0.74443537, "learning_rate": 9.593484304676791e-08, "loss": 0.75927353, "num_input_tokens_seen": 324200555, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24499512, "step": 15035, "time_per_iteration": 2.663966417312622 }, { "auxiliary_loss_clip": 0.01252194, "auxiliary_loss_mlp": 0.00230476, "balance_loss_clip": 1.03388143, "balance_loss_mlp": 0.2047269, "epoch": 0.904013227115587, "flos": 24024921287040.0, "grad_norm": 3.8192492047623503, "language_loss": 0.71859443, "learning_rate": 9.581570516631643e-08, "loss": 0.73342109, "num_input_tokens_seen": 324220255, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25793457, "step": 15036, "time_per_iteration": 2.6970818042755127 }, { "auxiliary_loss_clip": 0.01219838, "auxiliary_loss_mlp": 0.00199582, "balance_loss_clip": 1.0113219, "balance_loss_mlp": 0.17655092, "epoch": 0.9040733503682549, "flos": 22856459863680.0, "grad_norm": 6.558841982692202, "language_loss": 0.89793628, "learning_rate": 9.569663949272455e-08, "loss": 0.91213048, "num_input_tokens_seen": 324237855, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.23046875, "step": 15037, "time_per_iteration": 2.746861219406128 }, { "auxiliary_loss_clip": 0.01255162, "auxiliary_loss_mlp": 0.00204625, "balance_loss_clip": 1.03036976, "balance_loss_mlp": 0.17845826, "epoch": 0.9041334736209229, "flos": 19974951941760.0, "grad_norm": 6.364061276610379, "language_loss": 0.75059867, "learning_rate": 9.557764603050667e-08, "loss": 0.76519644, "num_input_tokens_seen": 324257050, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.26196289, "step": 15038, "time_per_iteration": 2.6521472930908203 }, { "auxiliary_loss_clip": 0.01241138, "auxiliary_loss_mlp": 0.00231446, "balance_loss_clip": 1.02302015, "balance_loss_mlp": 0.20567343, "epoch": 0.9041935968735909, "flos": 17530548624000.0, "grad_norm": 53.80233959122208, "language_loss": 0.86583424, "learning_rate": 9.545872478417494e-08, "loss": 0.88056004, "num_input_tokens_seen": 324275510, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25769043, "step": 15039, "time_per_iteration": 2.6551008224487305 }, { "auxiliary_loss_clip": 0.01247572, "auxiliary_loss_mlp": 0.00203026, "balance_loss_clip": 1.03472018, "balance_loss_mlp": 0.17913681, "epoch": 0.9042537201262588, "flos": 22780149419520.0, "grad_norm": 4.940922162979636, "language_loss": 0.77555156, "learning_rate": 9.533987575823977e-08, "loss": 0.79005754, "num_input_tokens_seen": 324295150, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23901367, "step": 15040, "time_per_iteration": 2.6549835205078125 }, { "auxiliary_loss_clip": 0.01242728, "auxiliary_loss_mlp": 0.00209928, "balance_loss_clip": 1.0311501, "balance_loss_mlp": 0.18464354, "epoch": 0.9043138433789268, "flos": 20595416497920.0, "grad_norm": 5.373407049316351, "language_loss": 0.75432664, "learning_rate": 9.522109895720709e-08, "loss": 0.76885319, "num_input_tokens_seen": 324313855, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.25317383, "step": 15041, "time_per_iteration": 5.611579179763794 }, { "auxiliary_loss_clip": 0.0124051, "auxiliary_loss_mlp": 0.0020961, "balance_loss_clip": 1.01960647, "balance_loss_mlp": 0.1833483, "epoch": 0.9043739666315948, "flos": 32962978995840.0, "grad_norm": 25.58658441269707, "language_loss": 0.68277156, "learning_rate": 9.510239438558155e-08, "loss": 0.69727272, "num_input_tokens_seen": 324338465, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26245117, "step": 15042, "time_per_iteration": 2.772920608520508 }, { "auxiliary_loss_clip": 0.01125352, "auxiliary_loss_mlp": 0.00067865, "balance_loss_clip": 0.98505402, "balance_loss_mlp": 0.0613323, "epoch": 0.9044340898842628, "flos": 67296418525440.0, "grad_norm": 0.7606041998341159, "language_loss": 0.56024939, "learning_rate": 9.498376204786351e-08, "loss": 0.57218158, "num_input_tokens_seen": 324398740, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.06542969, "step": 15043, "time_per_iteration": 3.145738124847412 }, { "auxiliary_loss_clip": 0.01250768, "auxiliary_loss_mlp": 0.00207375, "balance_loss_clip": 1.03080821, "balance_loss_mlp": 0.18136342, "epoch": 0.9044942131369307, "flos": 17713154390400.0, "grad_norm": 69.28521764241187, "language_loss": 0.76745999, "learning_rate": 9.486520194855274e-08, "loss": 0.78204143, "num_input_tokens_seen": 324417335, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26013184, "step": 15044, "time_per_iteration": 2.679252862930298 }, { "auxiliary_loss_clip": 0.0123927, "auxiliary_loss_mlp": 0.00207372, "balance_loss_clip": 1.02295411, "balance_loss_mlp": 0.18379208, "epoch": 0.9045543363895987, "flos": 17820563034240.0, "grad_norm": 54.56605930279235, "language_loss": 0.80326009, "learning_rate": 9.474671409214407e-08, "loss": 0.81772649, "num_input_tokens_seen": 324433240, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.23583984, "step": 15045, "time_per_iteration": 2.637418746948242 }, { "auxiliary_loss_clip": 0.01257307, "auxiliary_loss_mlp": 0.00223462, "balance_loss_clip": 1.03708231, "balance_loss_mlp": 0.19861889, "epoch": 0.9046144596422666, "flos": 21872723109120.0, "grad_norm": 49.42887324531474, "language_loss": 0.74995816, "learning_rate": 9.462829848313081e-08, "loss": 0.76476586, "num_input_tokens_seen": 324452675, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.24816895, "step": 15046, "time_per_iteration": 2.6340174674987793 }, { "auxiliary_loss_clip": 0.01268102, "auxiliary_loss_mlp": 0.00226371, "balance_loss_clip": 1.04299045, "balance_loss_mlp": 0.20121777, "epoch": 0.9046745828949346, "flos": 17672646827520.0, "grad_norm": 3.8820145326455275, "language_loss": 0.73854256, "learning_rate": 9.450995512600379e-08, "loss": 0.75348723, "num_input_tokens_seen": 324467865, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.25170898, "step": 15047, "time_per_iteration": 4.136888265609741 }, { "auxiliary_loss_clip": 0.01246118, "auxiliary_loss_mlp": 0.00216091, "balance_loss_clip": 1.03217435, "balance_loss_mlp": 0.19199833, "epoch": 0.9047347061476025, "flos": 25702559953920.0, "grad_norm": 6.788652520151275, "language_loss": 0.77219164, "learning_rate": 9.439168402525032e-08, "loss": 0.78681374, "num_input_tokens_seen": 324490430, "router_z_loss_clip": 2.13769531, "router_z_loss_mlp": 0.24121094, "step": 15048, "time_per_iteration": 2.764289140701294 }, { "auxiliary_loss_clip": 0.01241472, "auxiliary_loss_mlp": 0.00222862, "balance_loss_clip": 1.02097404, "balance_loss_mlp": 0.19657679, "epoch": 0.9047948294002706, "flos": 15158146118400.0, "grad_norm": 99.25524063793468, "language_loss": 0.84381485, "learning_rate": 9.427348518535483e-08, "loss": 0.85845816, "num_input_tokens_seen": 324506620, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.26281738, "step": 15049, "time_per_iteration": 2.644822835922241 }, { "auxiliary_loss_clip": 0.01224864, "auxiliary_loss_mlp": 0.0021946, "balance_loss_clip": 1.01253474, "balance_loss_mlp": 0.19496228, "epoch": 0.9048549526529385, "flos": 21872292145920.0, "grad_norm": 21.77818935330412, "language_loss": 0.82271332, "learning_rate": 9.415535861079993e-08, "loss": 0.83715653, "num_input_tokens_seen": 324525505, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.24487305, "step": 15050, "time_per_iteration": 2.699599027633667 }, { "auxiliary_loss_clip": 0.01251714, "auxiliary_loss_mlp": 0.00238938, "balance_loss_clip": 1.0348134, "balance_loss_mlp": 0.21322486, "epoch": 0.9049150759056065, "flos": 23546626761600.0, "grad_norm": 2.20513850135261, "language_loss": 0.88811105, "learning_rate": 9.403730430606472e-08, "loss": 0.90301758, "num_input_tokens_seen": 324544415, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25695801, "step": 15051, "time_per_iteration": 2.639695167541504 }, { "auxiliary_loss_clip": 0.01247381, "auxiliary_loss_mlp": 0.00209568, "balance_loss_clip": 1.03042948, "balance_loss_mlp": 0.18384305, "epoch": 0.9049751991582745, "flos": 19645902426240.0, "grad_norm": 7.698691086967648, "language_loss": 0.98279965, "learning_rate": 9.391932227562582e-08, "loss": 0.99736911, "num_input_tokens_seen": 324562555, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.25708008, "step": 15052, "time_per_iteration": 2.6397194862365723 }, { "auxiliary_loss_clip": 0.0125922, "auxiliary_loss_mlp": 0.00206401, "balance_loss_clip": 1.03406215, "balance_loss_mlp": 0.17987646, "epoch": 0.9050353224109424, "flos": 15596220389760.0, "grad_norm": 328.56707369052475, "language_loss": 0.84397388, "learning_rate": 9.380141252395724e-08, "loss": 0.85863012, "num_input_tokens_seen": 324580865, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.26538086, "step": 15053, "time_per_iteration": 4.010976076126099 }, { "auxiliary_loss_clip": 0.01240055, "auxiliary_loss_mlp": 0.00221796, "balance_loss_clip": 1.02529359, "balance_loss_mlp": 0.19558194, "epoch": 0.9050954456636104, "flos": 28183592165760.0, "grad_norm": 17.40690448539482, "language_loss": 0.80424678, "learning_rate": 9.368357505553049e-08, "loss": 0.8188653, "num_input_tokens_seen": 324600665, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.26196289, "step": 15054, "time_per_iteration": 2.680037021636963 }, { "auxiliary_loss_clip": 0.01231497, "auxiliary_loss_mlp": 0.00211519, "balance_loss_clip": 1.0173924, "balance_loss_mlp": 0.18867886, "epoch": 0.9051555689162784, "flos": 25731611078400.0, "grad_norm": 11.452298958362615, "language_loss": 0.8891806, "learning_rate": 9.356580987481333e-08, "loss": 0.90361077, "num_input_tokens_seen": 324618145, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.22839355, "step": 15055, "time_per_iteration": 2.691591262817383 }, { "auxiliary_loss_clip": 0.01220137, "auxiliary_loss_mlp": 0.00216767, "balance_loss_clip": 1.01067305, "balance_loss_mlp": 0.19216223, "epoch": 0.9052156921689464, "flos": 23257258796160.0, "grad_norm": 2.4553447549121947, "language_loss": 0.90707922, "learning_rate": 9.344811698627176e-08, "loss": 0.92144823, "num_input_tokens_seen": 324638165, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.24572754, "step": 15056, "time_per_iteration": 2.6667087078094482 }, { "auxiliary_loss_clip": 0.01257281, "auxiliary_loss_mlp": 0.00214001, "balance_loss_clip": 1.0360564, "balance_loss_mlp": 0.18902668, "epoch": 0.9052758154216143, "flos": 29564285097600.0, "grad_norm": 60.87470699828128, "language_loss": 0.80745453, "learning_rate": 9.333049639436863e-08, "loss": 0.8221674, "num_input_tokens_seen": 324658560, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.25, "step": 15057, "time_per_iteration": 2.7902650833129883 }, { "auxiliary_loss_clip": 0.0122259, "auxiliary_loss_mlp": 0.00185063, "balance_loss_clip": 1.01541901, "balance_loss_mlp": 0.16186459, "epoch": 0.9053359386742823, "flos": 22127688823680.0, "grad_norm": 19.676869144073425, "language_loss": 0.87628412, "learning_rate": 9.321294810356418e-08, "loss": 0.89036071, "num_input_tokens_seen": 324679185, "router_z_loss_clip": 2.07128906, "router_z_loss_mlp": 0.23168945, "step": 15058, "time_per_iteration": 2.6526734828948975 }, { "auxiliary_loss_clip": 0.01132894, "auxiliary_loss_mlp": 0.00113673, "balance_loss_clip": 0.9926368, "balance_loss_mlp": 0.10618699, "epoch": 0.9053960619269502, "flos": 67090112760960.0, "grad_norm": 0.6630899415322499, "language_loss": 0.51111734, "learning_rate": 9.309547211831592e-08, "loss": 0.52358294, "num_input_tokens_seen": 324744830, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.07470703, "step": 15059, "time_per_iteration": 3.2797129154205322 }, { "auxiliary_loss_clip": 0.01253599, "auxiliary_loss_mlp": 0.00224636, "balance_loss_clip": 1.03144813, "balance_loss_mlp": 0.19962636, "epoch": 0.9054561851796182, "flos": 15815419136640.0, "grad_norm": 14.721323178055718, "language_loss": 0.75801444, "learning_rate": 9.297806844307831e-08, "loss": 0.77279687, "num_input_tokens_seen": 324762905, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.25036621, "step": 15060, "time_per_iteration": 2.6126139163970947 }, { "auxiliary_loss_clip": 0.01255966, "auxiliary_loss_mlp": 0.0020405, "balance_loss_clip": 1.03423333, "balance_loss_mlp": 0.1783964, "epoch": 0.9055163084322861, "flos": 17566997950080.0, "grad_norm": 7.252747886340646, "language_loss": 0.75526571, "learning_rate": 9.286073708230357e-08, "loss": 0.76986593, "num_input_tokens_seen": 324781905, "router_z_loss_clip": 2.21777344, "router_z_loss_mlp": 0.25671387, "step": 15061, "time_per_iteration": 2.6981401443481445 }, { "auxiliary_loss_clip": 0.01257317, "auxiliary_loss_mlp": 0.00213082, "balance_loss_clip": 1.03380799, "balance_loss_mlp": 0.18578282, "epoch": 0.9055764316849542, "flos": 17639573379840.0, "grad_norm": 19.80352392666029, "language_loss": 0.7981838, "learning_rate": 9.274347804044058e-08, "loss": 0.81288785, "num_input_tokens_seen": 324799260, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.2734375, "step": 15062, "time_per_iteration": 2.7697815895080566 }, { "auxiliary_loss_clip": 0.01234704, "auxiliary_loss_mlp": 0.00220701, "balance_loss_clip": 1.0193646, "balance_loss_mlp": 0.19592966, "epoch": 0.9056365549376221, "flos": 20120856986880.0, "grad_norm": 3.258175765459346, "language_loss": 0.79177296, "learning_rate": 9.2626291321936e-08, "loss": 0.80632704, "num_input_tokens_seen": 324817800, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.24780273, "step": 15063, "time_per_iteration": 2.682821273803711 }, { "auxiliary_loss_clip": 0.01235163, "auxiliary_loss_mlp": 0.00212702, "balance_loss_clip": 1.01971662, "balance_loss_mlp": 0.18882427, "epoch": 0.9056966781902901, "flos": 27598786836480.0, "grad_norm": 16.767583688336437, "language_loss": 0.78474969, "learning_rate": 9.250917693123406e-08, "loss": 0.79922831, "num_input_tokens_seen": 324838445, "router_z_loss_clip": 2.15527344, "router_z_loss_mlp": 0.23901367, "step": 15064, "time_per_iteration": 2.7116708755493164 }, { "auxiliary_loss_clip": 0.01228542, "auxiliary_loss_mlp": 0.00225534, "balance_loss_clip": 1.015185, "balance_loss_mlp": 0.20066693, "epoch": 0.9057568014429581, "flos": 25920106675200.0, "grad_norm": 17.407795647337224, "language_loss": 0.8083747, "learning_rate": 9.23921348727752e-08, "loss": 0.82291543, "num_input_tokens_seen": 324859895, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24865723, "step": 15065, "time_per_iteration": 2.724809169769287 }, { "auxiliary_loss_clip": 0.01242824, "auxiliary_loss_mlp": 0.00236926, "balance_loss_clip": 1.02507246, "balance_loss_mlp": 0.21247645, "epoch": 0.905816924695626, "flos": 22930364096640.0, "grad_norm": 2.41019544858577, "language_loss": 0.71044892, "learning_rate": 9.227516515099743e-08, "loss": 0.72524643, "num_input_tokens_seen": 324879580, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.24438477, "step": 15066, "time_per_iteration": 2.713212728500366 }, { "auxiliary_loss_clip": 0.01256911, "auxiliary_loss_mlp": 0.0022578, "balance_loss_clip": 1.02863157, "balance_loss_mlp": 0.19962578, "epoch": 0.905877047948294, "flos": 22157422306560.0, "grad_norm": 41.314273779982535, "language_loss": 0.89548862, "learning_rate": 9.215826777033675e-08, "loss": 0.91031557, "num_input_tokens_seen": 324898950, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.26147461, "step": 15067, "time_per_iteration": 2.6788675785064697 }, { "auxiliary_loss_clip": 0.01260513, "auxiliary_loss_mlp": 0.00236981, "balance_loss_clip": 1.03641963, "balance_loss_mlp": 0.20783424, "epoch": 0.905937171200962, "flos": 15304805349120.0, "grad_norm": 7.757862125018227, "language_loss": 0.79646313, "learning_rate": 9.204144273522563e-08, "loss": 0.81143808, "num_input_tokens_seen": 324917455, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.29125977, "step": 15068, "time_per_iteration": 2.695805788040161 }, { "auxiliary_loss_clip": 0.01214816, "auxiliary_loss_mlp": 0.00220227, "balance_loss_clip": 1.00533056, "balance_loss_mlp": 0.19578908, "epoch": 0.90599729445363, "flos": 19462973437440.0, "grad_norm": 366.18637109233424, "language_loss": 0.91774505, "learning_rate": 9.19246900500943e-08, "loss": 0.93209553, "num_input_tokens_seen": 324934495, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.24462891, "step": 15069, "time_per_iteration": 2.6887691020965576 }, { "auxiliary_loss_clip": 0.01263863, "auxiliary_loss_mlp": 0.00217759, "balance_loss_clip": 1.03680921, "balance_loss_mlp": 0.19162877, "epoch": 0.9060574177062979, "flos": 23732967542400.0, "grad_norm": 9.096928066843194, "language_loss": 0.69115496, "learning_rate": 9.180800971936987e-08, "loss": 0.70597118, "num_input_tokens_seen": 324953230, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.26147461, "step": 15070, "time_per_iteration": 2.8041584491729736 }, { "auxiliary_loss_clip": 0.0125159, "auxiliary_loss_mlp": 0.00193013, "balance_loss_clip": 1.0315845, "balance_loss_mlp": 0.16749041, "epoch": 0.9061175409589659, "flos": 17311134395520.0, "grad_norm": 13.74168586258266, "language_loss": 0.90748107, "learning_rate": 9.169140174747724e-08, "loss": 0.92192709, "num_input_tokens_seen": 324969880, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25537109, "step": 15071, "time_per_iteration": 2.639390707015991 }, { "auxiliary_loss_clip": 0.01251893, "auxiliary_loss_mlp": 0.00209283, "balance_loss_clip": 1.03058577, "balance_loss_mlp": 0.18414164, "epoch": 0.9061776642116338, "flos": 17778439359360.0, "grad_norm": 1345.976684688048, "language_loss": 0.71626198, "learning_rate": 9.157486613883758e-08, "loss": 0.7308737, "num_input_tokens_seen": 324987005, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25158691, "step": 15072, "time_per_iteration": 2.632375717163086 }, { "auxiliary_loss_clip": 0.01236947, "auxiliary_loss_mlp": 0.00224533, "balance_loss_clip": 1.02012348, "balance_loss_mlp": 0.19846226, "epoch": 0.9062377874643018, "flos": 42777688037760.0, "grad_norm": 27.184692014518347, "language_loss": 0.80857313, "learning_rate": 9.145840289787021e-08, "loss": 0.82318789, "num_input_tokens_seen": 325010700, "router_z_loss_clip": 2.16699219, "router_z_loss_mlp": 0.26074219, "step": 15073, "time_per_iteration": 2.827106475830078 }, { "auxiliary_loss_clip": 0.01235424, "auxiliary_loss_mlp": 0.00200039, "balance_loss_clip": 1.02393186, "balance_loss_mlp": 0.17563653, "epoch": 0.9062979107169697, "flos": 16361620323840.0, "grad_norm": 30.36749580305017, "language_loss": 0.89951783, "learning_rate": 9.134201202899161e-08, "loss": 0.91387248, "num_input_tokens_seen": 325028760, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.24389648, "step": 15074, "time_per_iteration": 2.6496243476867676 }, { "auxiliary_loss_clip": 0.01119971, "auxiliary_loss_mlp": 0.00086248, "balance_loss_clip": 0.98014736, "balance_loss_mlp": 0.07938162, "epoch": 0.9063580339696378, "flos": 69313988528640.0, "grad_norm": 0.7857841252478698, "language_loss": 0.5131793, "learning_rate": 9.122569353661513e-08, "loss": 0.52524149, "num_input_tokens_seen": 325093545, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.06884766, "step": 15075, "time_per_iteration": 3.248792886734009 }, { "auxiliary_loss_clip": 0.01127077, "auxiliary_loss_mlp": 0.00121023, "balance_loss_clip": 0.98719192, "balance_loss_mlp": 0.11429983, "epoch": 0.9064181572223057, "flos": 58794747148800.0, "grad_norm": 0.9170012894173143, "language_loss": 0.61106682, "learning_rate": 9.11094474251517e-08, "loss": 0.62354779, "num_input_tokens_seen": 325152295, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.06738281, "step": 15076, "time_per_iteration": 3.055506706237793 }, { "auxiliary_loss_clip": 0.01247597, "auxiliary_loss_mlp": 0.00205407, "balance_loss_clip": 1.02706838, "balance_loss_mlp": 0.1799562, "epoch": 0.9064782804749737, "flos": 21762692772480.0, "grad_norm": 15.225410818512412, "language_loss": 0.89323127, "learning_rate": 9.09932736990091e-08, "loss": 0.90776134, "num_input_tokens_seen": 325169705, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25439453, "step": 15077, "time_per_iteration": 2.7565386295318604 }, { "auxiliary_loss_clip": 0.01227635, "auxiliary_loss_mlp": 0.00210026, "balance_loss_clip": 1.02176189, "balance_loss_mlp": 0.18590955, "epoch": 0.9065384037276417, "flos": 21397373498880.0, "grad_norm": 28.884494557437772, "language_loss": 0.91270387, "learning_rate": 9.08771723625934e-08, "loss": 0.92708045, "num_input_tokens_seen": 325189175, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.24108887, "step": 15078, "time_per_iteration": 2.643357038497925 }, { "auxiliary_loss_clip": 0.01215996, "auxiliary_loss_mlp": 0.00199724, "balance_loss_clip": 1.007303, "balance_loss_mlp": 0.17697909, "epoch": 0.9065985269803096, "flos": 38283646849920.0, "grad_norm": 59.57053548544907, "language_loss": 0.70754468, "learning_rate": 9.076114342030617e-08, "loss": 0.72170186, "num_input_tokens_seen": 325211020, "router_z_loss_clip": 2.08691406, "router_z_loss_mlp": 0.22753906, "step": 15079, "time_per_iteration": 2.8747503757476807 }, { "auxiliary_loss_clip": 0.01236994, "auxiliary_loss_mlp": 0.00209585, "balance_loss_clip": 1.02166367, "balance_loss_mlp": 0.18631576, "epoch": 0.9066586502329776, "flos": 44818562989440.0, "grad_norm": 33.734746487688554, "language_loss": 0.75382996, "learning_rate": 9.064518687654765e-08, "loss": 0.76829576, "num_input_tokens_seen": 325236970, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.23254395, "step": 15080, "time_per_iteration": 2.864539384841919 }, { "auxiliary_loss_clip": 0.01264189, "auxiliary_loss_mlp": 0.00232653, "balance_loss_clip": 1.03954911, "balance_loss_mlp": 0.2062602, "epoch": 0.9067187734856456, "flos": 18623992492800.0, "grad_norm": 3.624187685730049, "language_loss": 0.79542333, "learning_rate": 9.052930273571547e-08, "loss": 0.81039178, "num_input_tokens_seen": 325252670, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.26379395, "step": 15081, "time_per_iteration": 2.6636626720428467 }, { "auxiliary_loss_clip": 0.01233527, "auxiliary_loss_mlp": 0.00200619, "balance_loss_clip": 1.01616502, "balance_loss_mlp": 0.1762287, "epoch": 0.9067788967383136, "flos": 22747578762240.0, "grad_norm": 2.188990414311764, "language_loss": 0.83381546, "learning_rate": 9.04134910022032e-08, "loss": 0.84815693, "num_input_tokens_seen": 325273860, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.24377441, "step": 15082, "time_per_iteration": 2.6981866359710693 }, { "auxiliary_loss_clip": 0.01246443, "auxiliary_loss_mlp": 0.0022132, "balance_loss_clip": 1.03265393, "balance_loss_mlp": 0.19596386, "epoch": 0.9068390199909815, "flos": 27670787648640.0, "grad_norm": 129.76593258799082, "language_loss": 0.85626936, "learning_rate": 9.029775168040266e-08, "loss": 0.870947, "num_input_tokens_seen": 325294140, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.25341797, "step": 15083, "time_per_iteration": 4.116838455200195 }, { "auxiliary_loss_clip": 0.01230221, "auxiliary_loss_mlp": 0.00194108, "balance_loss_clip": 1.02190971, "balance_loss_mlp": 0.17065978, "epoch": 0.9068991432436495, "flos": 24244012293120.0, "grad_norm": 20.50254076558301, "language_loss": 0.76427549, "learning_rate": 9.01820847747028e-08, "loss": 0.77851874, "num_input_tokens_seen": 325313130, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.234375, "step": 15084, "time_per_iteration": 4.092199087142944 }, { "auxiliary_loss_clip": 0.0123983, "auxiliary_loss_mlp": 0.00217054, "balance_loss_clip": 1.02413511, "balance_loss_mlp": 0.19265193, "epoch": 0.9069592664963174, "flos": 28033305661440.0, "grad_norm": 5.315620677173247, "language_loss": 0.75092357, "learning_rate": 9.006649028948965e-08, "loss": 0.76549244, "num_input_tokens_seen": 325334880, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.24401855, "step": 15085, "time_per_iteration": 2.7476320266723633 }, { "auxiliary_loss_clip": 0.01136052, "auxiliary_loss_mlp": 0.0013217, "balance_loss_clip": 0.99547577, "balance_loss_mlp": 0.12353907, "epoch": 0.9070193897489854, "flos": 68778414789120.0, "grad_norm": 0.767234597043579, "language_loss": 0.60632211, "learning_rate": 8.995096822914638e-08, "loss": 0.61900431, "num_input_tokens_seen": 325394175, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.08642578, "step": 15086, "time_per_iteration": 3.200314998626709 }, { "auxiliary_loss_clip": 0.01228494, "auxiliary_loss_mlp": 0.0021478, "balance_loss_clip": 1.01424241, "balance_loss_mlp": 0.18998429, "epoch": 0.9070795130016533, "flos": 23441624328960.0, "grad_norm": 15.25925326289114, "language_loss": 0.79128355, "learning_rate": 8.983551859805416e-08, "loss": 0.80571628, "num_input_tokens_seen": 325415020, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.2479248, "step": 15087, "time_per_iteration": 2.7514586448669434 }, { "auxiliary_loss_clip": 0.01238176, "auxiliary_loss_mlp": 0.00224636, "balance_loss_clip": 1.022367, "balance_loss_mlp": 0.19871947, "epoch": 0.9071396362543214, "flos": 18916413114240.0, "grad_norm": 6.249442292165258, "language_loss": 0.86346328, "learning_rate": 8.972014140059058e-08, "loss": 0.87809145, "num_input_tokens_seen": 325433595, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.25939941, "step": 15088, "time_per_iteration": 2.612811326980591 }, { "auxiliary_loss_clip": 0.01233095, "auxiliary_loss_mlp": 0.00209611, "balance_loss_clip": 1.02050114, "balance_loss_mlp": 0.18648484, "epoch": 0.9071997595069893, "flos": 25228646887680.0, "grad_norm": 40.50116486826753, "language_loss": 0.80907714, "learning_rate": 8.960483664113038e-08, "loss": 0.82350421, "num_input_tokens_seen": 325451605, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.23144531, "step": 15089, "time_per_iteration": 4.1578288078308105 }, { "auxiliary_loss_clip": 0.01221737, "auxiliary_loss_mlp": 0.00196197, "balance_loss_clip": 1.01267004, "balance_loss_mlp": 0.17285571, "epoch": 0.9072598827596573, "flos": 24346608514560.0, "grad_norm": 29.44201264311004, "language_loss": 0.83462071, "learning_rate": 8.948960432404628e-08, "loss": 0.84880006, "num_input_tokens_seen": 325470645, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.23327637, "step": 15090, "time_per_iteration": 2.7164530754089355 }, { "auxiliary_loss_clip": 0.01252495, "auxiliary_loss_mlp": 0.00204724, "balance_loss_clip": 1.03330922, "balance_loss_mlp": 0.17731753, "epoch": 0.9073200060123253, "flos": 22674967418880.0, "grad_norm": 3.429803787378421, "language_loss": 0.87925315, "learning_rate": 8.93744444537079e-08, "loss": 0.89382529, "num_input_tokens_seen": 325488070, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.27416992, "step": 15091, "time_per_iteration": 2.6655592918395996 }, { "auxiliary_loss_clip": 0.01208229, "auxiliary_loss_mlp": 0.00191232, "balance_loss_clip": 1.00533557, "balance_loss_mlp": 0.1676878, "epoch": 0.9073801292649932, "flos": 23695476721920.0, "grad_norm": 2.8121111096515468, "language_loss": 0.91888523, "learning_rate": 8.925935703448217e-08, "loss": 0.93287981, "num_input_tokens_seen": 325509285, "router_z_loss_clip": 2.02929688, "router_z_loss_mlp": 0.2355957, "step": 15092, "time_per_iteration": 2.735358715057373 }, { "auxiliary_loss_clip": 0.01246462, "auxiliary_loss_mlp": 0.00214864, "balance_loss_clip": 1.03124654, "balance_loss_mlp": 0.18900782, "epoch": 0.9074402525176612, "flos": 25375413859200.0, "grad_norm": 3.159380672791971, "language_loss": 0.86188495, "learning_rate": 8.914434207073296e-08, "loss": 0.87649822, "num_input_tokens_seen": 325529360, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.25878906, "step": 15093, "time_per_iteration": 2.712634325027466 }, { "auxiliary_loss_clip": 0.01133426, "auxiliary_loss_mlp": 0.00138811, "balance_loss_clip": 0.99418342, "balance_loss_mlp": 0.12960845, "epoch": 0.9075003757703292, "flos": 67649024384640.0, "grad_norm": 157.92540449499535, "language_loss": 0.56773734, "learning_rate": 8.902939956682188e-08, "loss": 0.58045971, "num_input_tokens_seen": 325583565, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09179688, "step": 15094, "time_per_iteration": 3.0915162563323975 }, { "auxiliary_loss_clip": 0.01250555, "auxiliary_loss_mlp": 0.00246789, "balance_loss_clip": 1.03101373, "balance_loss_mlp": 0.2213856, "epoch": 0.9075604990229972, "flos": 22453649769600.0, "grad_norm": 9.978801788903164, "language_loss": 0.79981124, "learning_rate": 8.891452952710742e-08, "loss": 0.81478465, "num_input_tokens_seen": 325603690, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25390625, "step": 15095, "time_per_iteration": 4.0524492263793945 }, { "auxiliary_loss_clip": 0.0123619, "auxiliary_loss_mlp": 0.00213183, "balance_loss_clip": 1.01988697, "balance_loss_mlp": 0.18769637, "epoch": 0.9076206222756651, "flos": 19536662188800.0, "grad_norm": 4.072849385024415, "language_loss": 0.81629556, "learning_rate": 8.879973195594526e-08, "loss": 0.83078927, "num_input_tokens_seen": 325622255, "router_z_loss_clip": 2.16308594, "router_z_loss_mlp": 0.25512695, "step": 15096, "time_per_iteration": 2.6207001209259033 }, { "auxiliary_loss_clip": 0.01243863, "auxiliary_loss_mlp": 0.00225437, "balance_loss_clip": 1.02224326, "balance_loss_mlp": 0.19993815, "epoch": 0.9076807455283331, "flos": 30116914819200.0, "grad_norm": 2.465318536860071, "language_loss": 0.64549816, "learning_rate": 8.868500685768898e-08, "loss": 0.66019118, "num_input_tokens_seen": 325640165, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25524902, "step": 15097, "time_per_iteration": 2.7904675006866455 }, { "auxiliary_loss_clip": 0.01225939, "auxiliary_loss_mlp": 0.00201376, "balance_loss_clip": 1.01281285, "balance_loss_mlp": 0.17826165, "epoch": 0.907740868781001, "flos": 18697537589760.0, "grad_norm": 2.4900984502435937, "language_loss": 0.8769545, "learning_rate": 8.857035423668935e-08, "loss": 0.89122766, "num_input_tokens_seen": 325659455, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.2310791, "step": 15098, "time_per_iteration": 2.6595869064331055 }, { "auxiliary_loss_clip": 0.01242544, "auxiliary_loss_mlp": 0.00226007, "balance_loss_clip": 1.02072239, "balance_loss_mlp": 0.20136656, "epoch": 0.907800992033669, "flos": 22638805401600.0, "grad_norm": 15.397503075972372, "language_loss": 0.75125802, "learning_rate": 8.845577409729266e-08, "loss": 0.76594341, "num_input_tokens_seen": 325678095, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.24645996, "step": 15099, "time_per_iteration": 2.7422943115234375 }, { "auxiliary_loss_clip": 0.01252487, "auxiliary_loss_mlp": 0.00203974, "balance_loss_clip": 1.0276978, "balance_loss_mlp": 0.17717618, "epoch": 0.907861115286337, "flos": 21287666384640.0, "grad_norm": 47.06191861049389, "language_loss": 0.80273789, "learning_rate": 8.834126644384477e-08, "loss": 0.81730247, "num_input_tokens_seen": 325695825, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.26818848, "step": 15100, "time_per_iteration": 2.757944345474243 }, { "auxiliary_loss_clip": 0.01124307, "auxiliary_loss_mlp": 0.00129092, "balance_loss_clip": 0.98522198, "balance_loss_mlp": 0.12084267, "epoch": 0.907921238539005, "flos": 69739493040000.0, "grad_norm": 0.6748972980382554, "language_loss": 0.52677315, "learning_rate": 8.822683128068775e-08, "loss": 0.53930712, "num_input_tokens_seen": 325764515, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08251953, "step": 15101, "time_per_iteration": 3.2404656410217285 }, { "auxiliary_loss_clip": 0.01240007, "auxiliary_loss_mlp": 0.00202785, "balance_loss_clip": 1.02215886, "balance_loss_mlp": 0.17851359, "epoch": 0.9079813617916729, "flos": 23477391296640.0, "grad_norm": 18.37181079044459, "language_loss": 0.77482307, "learning_rate": 8.811246861216081e-08, "loss": 0.78925097, "num_input_tokens_seen": 325783235, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24291992, "step": 15102, "time_per_iteration": 2.705183982849121 }, { "auxiliary_loss_clip": 0.01227949, "auxiliary_loss_mlp": 0.0021583, "balance_loss_clip": 1.01556945, "balance_loss_mlp": 0.19089113, "epoch": 0.9080414850443409, "flos": 22929933133440.0, "grad_norm": 7.597860036340499, "language_loss": 0.85398698, "learning_rate": 8.799817844260049e-08, "loss": 0.86842477, "num_input_tokens_seen": 325800195, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.24951172, "step": 15103, "time_per_iteration": 2.6312315464019775 }, { "auxiliary_loss_clip": 0.01237917, "auxiliary_loss_mlp": 0.00220827, "balance_loss_clip": 1.01806092, "balance_loss_mlp": 0.19597185, "epoch": 0.9081016082970089, "flos": 26177083551360.0, "grad_norm": 5868.071002686604, "language_loss": 0.83157855, "learning_rate": 8.78839607763413e-08, "loss": 0.84616601, "num_input_tokens_seen": 325820215, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.24865723, "step": 15104, "time_per_iteration": 2.7698302268981934 }, { "auxiliary_loss_clip": 0.01223447, "auxiliary_loss_mlp": 0.00202653, "balance_loss_clip": 1.01040947, "balance_loss_mlp": 0.17807201, "epoch": 0.9081617315496768, "flos": 24462169545600.0, "grad_norm": 7.242070262012642, "language_loss": 0.85065055, "learning_rate": 8.77698156177138e-08, "loss": 0.86491156, "num_input_tokens_seen": 325838415, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.2454834, "step": 15105, "time_per_iteration": 2.6815779209136963 }, { "auxiliary_loss_clip": 0.0124397, "auxiliary_loss_mlp": 0.00200823, "balance_loss_clip": 1.0253098, "balance_loss_mlp": 0.17577687, "epoch": 0.9082218548023449, "flos": 24746868743040.0, "grad_norm": 711.1571996051864, "language_loss": 0.80861306, "learning_rate": 8.765574297104628e-08, "loss": 0.82306099, "num_input_tokens_seen": 325855580, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25036621, "step": 15106, "time_per_iteration": 2.695061683654785 }, { "auxiliary_loss_clip": 0.0126261, "auxiliary_loss_mlp": 0.00222301, "balance_loss_clip": 1.03856325, "balance_loss_mlp": 0.19696906, "epoch": 0.9082819780550128, "flos": 24421302846720.0, "grad_norm": 2.5213840991520535, "language_loss": 0.87746572, "learning_rate": 8.754174284066462e-08, "loss": 0.89231485, "num_input_tokens_seen": 325874890, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.25341797, "step": 15107, "time_per_iteration": 2.7746129035949707 }, { "auxiliary_loss_clip": 0.0111161, "auxiliary_loss_mlp": 0.00106895, "balance_loss_clip": 0.9742924, "balance_loss_mlp": 0.09878902, "epoch": 0.9083421013076808, "flos": 59609704872960.0, "grad_norm": 0.8049611643896945, "language_loss": 0.59566903, "learning_rate": 8.742781523089205e-08, "loss": 0.60785407, "num_input_tokens_seen": 325935835, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08105469, "step": 15108, "time_per_iteration": 3.1508679389953613 }, { "auxiliary_loss_clip": 0.01236532, "auxiliary_loss_mlp": 0.0020167, "balance_loss_clip": 1.01838923, "balance_loss_mlp": 0.17701723, "epoch": 0.9084022245603487, "flos": 33620216100480.0, "grad_norm": 4.804797469814861, "language_loss": 0.82869053, "learning_rate": 8.73139601460482e-08, "loss": 0.84307259, "num_input_tokens_seen": 325958035, "router_z_loss_clip": 2.18066406, "router_z_loss_mlp": 0.24621582, "step": 15109, "time_per_iteration": 2.7531652450561523 }, { "auxiliary_loss_clip": 0.01226428, "auxiliary_loss_mlp": 0.00204566, "balance_loss_clip": 1.01422763, "balance_loss_mlp": 0.17971104, "epoch": 0.9084623478130167, "flos": 24971705925120.0, "grad_norm": 75.78143263442574, "language_loss": 0.76817852, "learning_rate": 8.720017759045073e-08, "loss": 0.78248847, "num_input_tokens_seen": 325979870, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.24853516, "step": 15110, "time_per_iteration": 2.6808688640594482 }, { "auxiliary_loss_clip": 0.01213939, "auxiliary_loss_mlp": 0.00204852, "balance_loss_clip": 1.00954294, "balance_loss_mlp": 0.18208304, "epoch": 0.9085224710656846, "flos": 31461804869760.0, "grad_norm": 4.603348557740948, "language_loss": 0.76388788, "learning_rate": 8.708646756841421e-08, "loss": 0.7780757, "num_input_tokens_seen": 325998245, "router_z_loss_clip": 2.04394531, "router_z_loss_mlp": 0.22790527, "step": 15111, "time_per_iteration": 2.792809009552002 }, { "auxiliary_loss_clip": 0.01110996, "auxiliary_loss_mlp": 0.00113014, "balance_loss_clip": 0.97311807, "balance_loss_mlp": 0.10519421, "epoch": 0.9085825943183526, "flos": 64917012867840.0, "grad_norm": 0.7158046271429627, "language_loss": 0.50811899, "learning_rate": 8.697283008425026e-08, "loss": 0.52035904, "num_input_tokens_seen": 326061770, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.078125, "step": 15112, "time_per_iteration": 3.18013596534729 }, { "auxiliary_loss_clip": 0.01233814, "auxiliary_loss_mlp": 0.0020634, "balance_loss_clip": 1.0171442, "balance_loss_mlp": 0.18124676, "epoch": 0.9086427175710206, "flos": 18953221576320.0, "grad_norm": 177.9269583961014, "language_loss": 0.79181123, "learning_rate": 8.685926514226837e-08, "loss": 0.80621284, "num_input_tokens_seen": 326080945, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.25109863, "step": 15113, "time_per_iteration": 2.6208336353302 }, { "auxiliary_loss_clip": 0.01224803, "auxiliary_loss_mlp": 0.00196854, "balance_loss_clip": 1.01194012, "balance_loss_mlp": 0.17360786, "epoch": 0.9087028408236886, "flos": 34014873807360.0, "grad_norm": 24.350724078749924, "language_loss": 0.86321765, "learning_rate": 8.674577274677508e-08, "loss": 0.87743413, "num_input_tokens_seen": 326100630, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.2322998, "step": 15114, "time_per_iteration": 2.733224630355835 }, { "auxiliary_loss_clip": 0.01260497, "auxiliary_loss_mlp": 0.00238578, "balance_loss_clip": 1.03590631, "balance_loss_mlp": 0.20930003, "epoch": 0.9087629640763565, "flos": 21944580266880.0, "grad_norm": 5.080671438722505, "language_loss": 0.81702667, "learning_rate": 8.663235290207405e-08, "loss": 0.83201742, "num_input_tokens_seen": 326120145, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.29248047, "step": 15115, "time_per_iteration": 2.751502752304077 }, { "auxiliary_loss_clip": 0.01253233, "auxiliary_loss_mlp": 0.00232364, "balance_loss_clip": 1.0299145, "balance_loss_mlp": 0.20712715, "epoch": 0.9088230873290245, "flos": 21762908254080.0, "grad_norm": 17.315585459801266, "language_loss": 0.7295385, "learning_rate": 8.651900561246561e-08, "loss": 0.74439442, "num_input_tokens_seen": 326140715, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.25219727, "step": 15116, "time_per_iteration": 2.750324249267578 }, { "auxiliary_loss_clip": 0.01231621, "auxiliary_loss_mlp": 0.00227707, "balance_loss_clip": 1.01902962, "balance_loss_mlp": 0.20230371, "epoch": 0.9088832105816925, "flos": 21541267382400.0, "grad_norm": 6.086512907286031, "language_loss": 0.76762772, "learning_rate": 8.640573088224812e-08, "loss": 0.78222102, "num_input_tokens_seen": 326159130, "router_z_loss_clip": 2.12597656, "router_z_loss_mlp": 0.25402832, "step": 15117, "time_per_iteration": 2.627537250518799 }, { "auxiliary_loss_clip": 0.0122826, "auxiliary_loss_mlp": 0.00225797, "balance_loss_clip": 1.01467538, "balance_loss_mlp": 0.20101309, "epoch": 0.9089433338343604, "flos": 25996704428160.0, "grad_norm": 277.3562179158369, "language_loss": 0.81654751, "learning_rate": 8.629252871571745e-08, "loss": 0.83108807, "num_input_tokens_seen": 326181375, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.24804688, "step": 15118, "time_per_iteration": 2.714554786682129 }, { "auxiliary_loss_clip": 0.0125389, "auxiliary_loss_mlp": 0.00246262, "balance_loss_clip": 1.02963173, "balance_loss_mlp": 0.21725804, "epoch": 0.9090034570870285, "flos": 21178426147200.0, "grad_norm": 28.39081463998199, "language_loss": 0.81093049, "learning_rate": 8.617939911716554e-08, "loss": 0.82593197, "num_input_tokens_seen": 326199740, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.28991699, "step": 15119, "time_per_iteration": 2.727130889892578 }, { "auxiliary_loss_clip": 0.01277118, "auxiliary_loss_mlp": 0.00222221, "balance_loss_clip": 1.04419303, "balance_loss_mlp": 0.19541082, "epoch": 0.9090635803396964, "flos": 16141811045760.0, "grad_norm": 7.983057635328793, "language_loss": 0.8344717, "learning_rate": 8.60663420908827e-08, "loss": 0.84946513, "num_input_tokens_seen": 326214350, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.26818848, "step": 15120, "time_per_iteration": 2.6598029136657715 }, { "auxiliary_loss_clip": 0.01246671, "auxiliary_loss_mlp": 0.00208842, "balance_loss_clip": 1.02900279, "balance_loss_mlp": 0.1849764, "epoch": 0.9091237035923644, "flos": 20591537829120.0, "grad_norm": 21.751956817441865, "language_loss": 0.75376219, "learning_rate": 8.595335764115596e-08, "loss": 0.76831734, "num_input_tokens_seen": 326234580, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.23864746, "step": 15121, "time_per_iteration": 2.7122533321380615 }, { "auxiliary_loss_clip": 0.01232927, "auxiliary_loss_mlp": 0.00229382, "balance_loss_clip": 1.01855206, "balance_loss_mlp": 0.20096242, "epoch": 0.9091838268450323, "flos": 52227760164480.0, "grad_norm": 13.785364555830267, "language_loss": 0.79699314, "learning_rate": 8.58404457722699e-08, "loss": 0.8116163, "num_input_tokens_seen": 326259080, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.28417969, "step": 15122, "time_per_iteration": 2.9996819496154785 }, { "auxiliary_loss_clip": 0.01226085, "auxiliary_loss_mlp": 0.00210293, "balance_loss_clip": 1.01330447, "balance_loss_mlp": 0.18608175, "epoch": 0.9092439500977003, "flos": 20559613616640.0, "grad_norm": 14.825985257247167, "language_loss": 0.79905462, "learning_rate": 8.572760648850575e-08, "loss": 0.81341839, "num_input_tokens_seen": 326280175, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.24194336, "step": 15123, "time_per_iteration": 2.66717267036438 }, { "auxiliary_loss_clip": 0.01216738, "auxiliary_loss_mlp": 0.00210263, "balance_loss_clip": 1.01142848, "balance_loss_mlp": 0.18664718, "epoch": 0.9093040733503682, "flos": 28617859595520.0, "grad_norm": 5.8845346947950485, "language_loss": 0.84050941, "learning_rate": 8.561483979414253e-08, "loss": 0.85477942, "num_input_tokens_seen": 326297990, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.23632812, "step": 15124, "time_per_iteration": 2.734279155731201 }, { "auxiliary_loss_clip": 0.01234298, "auxiliary_loss_mlp": 0.00209191, "balance_loss_clip": 1.02115321, "balance_loss_mlp": 0.18389477, "epoch": 0.9093641966030362, "flos": 23440187784960.0, "grad_norm": 16.523058782115832, "language_loss": 0.81799531, "learning_rate": 8.55021456934566e-08, "loss": 0.83243024, "num_input_tokens_seen": 326316735, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.25305176, "step": 15125, "time_per_iteration": 4.1393773555755615 }, { "auxiliary_loss_clip": 0.01231239, "auxiliary_loss_mlp": 0.00220865, "balance_loss_clip": 1.02083254, "balance_loss_mlp": 0.19698757, "epoch": 0.9094243198557042, "flos": 16800197385600.0, "grad_norm": 39.37313926393241, "language_loss": 0.85384774, "learning_rate": 8.538952419072143e-08, "loss": 0.86836874, "num_input_tokens_seen": 326334370, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.23876953, "step": 15126, "time_per_iteration": 4.127735376358032 }, { "auxiliary_loss_clip": 0.01233311, "auxiliary_loss_mlp": 0.00227436, "balance_loss_clip": 1.02269173, "balance_loss_mlp": 0.2022118, "epoch": 0.9094844431083722, "flos": 24273278899200.0, "grad_norm": 30.433627003449057, "language_loss": 0.82546598, "learning_rate": 8.527697529020694e-08, "loss": 0.84007347, "num_input_tokens_seen": 326353435, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.25231934, "step": 15127, "time_per_iteration": 2.6523330211639404 }, { "auxiliary_loss_clip": 0.01245982, "auxiliary_loss_mlp": 0.00220862, "balance_loss_clip": 1.02838814, "balance_loss_mlp": 0.19606636, "epoch": 0.9095445663610401, "flos": 21944652094080.0, "grad_norm": 40.97874751747872, "language_loss": 0.7123149, "learning_rate": 8.516449899618173e-08, "loss": 0.72698331, "num_input_tokens_seen": 326371810, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.24780273, "step": 15128, "time_per_iteration": 2.7147955894470215 }, { "auxiliary_loss_clip": 0.01225352, "auxiliary_loss_mlp": 0.00238594, "balance_loss_clip": 1.01307487, "balance_loss_mlp": 0.21454987, "epoch": 0.9096046896137081, "flos": 19792848965760.0, "grad_norm": 8.084603448831395, "language_loss": 0.83631045, "learning_rate": 8.505209531291013e-08, "loss": 0.85094988, "num_input_tokens_seen": 326391380, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.24060059, "step": 15129, "time_per_iteration": 2.729275941848755 }, { "auxiliary_loss_clip": 0.01250309, "auxiliary_loss_mlp": 0.00227397, "balance_loss_clip": 1.02889276, "balance_loss_mlp": 0.20154038, "epoch": 0.909664812866376, "flos": 22638087129600.0, "grad_norm": 179.25023300608527, "language_loss": 0.92864645, "learning_rate": 8.49397642446552e-08, "loss": 0.94342351, "num_input_tokens_seen": 326408800, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25891113, "step": 15130, "time_per_iteration": 2.7799782752990723 }, { "auxiliary_loss_clip": 0.01254229, "auxiliary_loss_mlp": 0.00217971, "balance_loss_clip": 1.03194284, "balance_loss_mlp": 0.19124451, "epoch": 0.909724936119044, "flos": 39852153020160.0, "grad_norm": 16.980449542495236, "language_loss": 0.84180367, "learning_rate": 8.482750579567644e-08, "loss": 0.85652566, "num_input_tokens_seen": 326431565, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.26708984, "step": 15131, "time_per_iteration": 2.851686954498291 }, { "auxiliary_loss_clip": 0.01240352, "auxiliary_loss_mlp": 0.00224074, "balance_loss_clip": 1.02298355, "balance_loss_mlp": 0.1986113, "epoch": 0.9097850593717121, "flos": 35071616954880.0, "grad_norm": 20.160192051177717, "language_loss": 0.67898977, "learning_rate": 8.471531997023085e-08, "loss": 0.69363403, "num_input_tokens_seen": 326451715, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25476074, "step": 15132, "time_per_iteration": 4.156470537185669 }, { "auxiliary_loss_clip": 0.01232612, "auxiliary_loss_mlp": 0.00180529, "balance_loss_clip": 1.02338982, "balance_loss_mlp": 0.15718766, "epoch": 0.90984518262438, "flos": 23367468700800.0, "grad_norm": 11.62064872941279, "language_loss": 0.88703758, "learning_rate": 8.460320677257193e-08, "loss": 0.901169, "num_input_tokens_seen": 326470855, "router_z_loss_clip": 2.09277344, "router_z_loss_mlp": 0.23327637, "step": 15133, "time_per_iteration": 2.673035144805908 }, { "auxiliary_loss_clip": 0.01229971, "auxiliary_loss_mlp": 0.00221458, "balance_loss_clip": 1.01463473, "balance_loss_mlp": 0.19531512, "epoch": 0.909905305877048, "flos": 27523302405120.0, "grad_norm": 4.662234059870096, "language_loss": 0.82193661, "learning_rate": 8.449116620695118e-08, "loss": 0.83645087, "num_input_tokens_seen": 326490480, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.26184082, "step": 15134, "time_per_iteration": 2.7051069736480713 }, { "auxiliary_loss_clip": 0.0127149, "auxiliary_loss_mlp": 0.0021566, "balance_loss_clip": 1.04379392, "balance_loss_mlp": 0.18825416, "epoch": 0.9099654291297159, "flos": 24347865490560.0, "grad_norm": 4.135031815618909, "language_loss": 0.80658638, "learning_rate": 8.437919827761786e-08, "loss": 0.82145792, "num_input_tokens_seen": 326509445, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.27429199, "step": 15135, "time_per_iteration": 2.739424228668213 }, { "auxiliary_loss_clip": 0.01226047, "auxiliary_loss_mlp": 0.00207471, "balance_loss_clip": 1.01436102, "balance_loss_mlp": 0.18409391, "epoch": 0.9100255523823839, "flos": 21215234609280.0, "grad_norm": 2.285088322402059, "language_loss": 0.77890015, "learning_rate": 8.426730298881702e-08, "loss": 0.7932353, "num_input_tokens_seen": 326528380, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.23376465, "step": 15136, "time_per_iteration": 2.6302878856658936 }, { "auxiliary_loss_clip": 0.01093167, "auxiliary_loss_mlp": 0.00084483, "balance_loss_clip": 0.95630264, "balance_loss_mlp": 0.07742613, "epoch": 0.9100856756350518, "flos": 46052276446080.0, "grad_norm": 0.8046831985286951, "language_loss": 0.58657587, "learning_rate": 8.415548034479214e-08, "loss": 0.59835237, "num_input_tokens_seen": 326576940, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.07080078, "step": 15137, "time_per_iteration": 4.2811033725738525 }, { "auxiliary_loss_clip": 0.01237998, "auxiliary_loss_mlp": 0.0022448, "balance_loss_clip": 1.01955235, "balance_loss_mlp": 0.19863513, "epoch": 0.9101457988877198, "flos": 20229917656320.0, "grad_norm": 13.151123540772781, "language_loss": 0.89351696, "learning_rate": 8.40437303497834e-08, "loss": 0.90814173, "num_input_tokens_seen": 326596100, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25817871, "step": 15138, "time_per_iteration": 2.9926295280456543 }, { "auxiliary_loss_clip": 0.01226286, "auxiliary_loss_mlp": 0.00232432, "balance_loss_clip": 1.01699555, "balance_loss_mlp": 0.20841157, "epoch": 0.9102059221403878, "flos": 26615157822720.0, "grad_norm": 4.762353008110366, "language_loss": 0.86079192, "learning_rate": 8.39320530080283e-08, "loss": 0.87537909, "num_input_tokens_seen": 326615700, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.24047852, "step": 15139, "time_per_iteration": 2.784212589263916 }, { "auxiliary_loss_clip": 0.01239484, "auxiliary_loss_mlp": 0.00218255, "balance_loss_clip": 1.0214417, "balance_loss_mlp": 0.19394815, "epoch": 0.9102660453930558, "flos": 21908561904000.0, "grad_norm": 3.658266142626434, "language_loss": 0.82913721, "learning_rate": 8.382044832376167e-08, "loss": 0.84371459, "num_input_tokens_seen": 326635905, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24328613, "step": 15140, "time_per_iteration": 2.8381917476654053 }, { "auxiliary_loss_clip": 0.01234533, "auxiliary_loss_mlp": 0.00201101, "balance_loss_clip": 1.02145123, "balance_loss_mlp": 0.17809312, "epoch": 0.9103261686457237, "flos": 36176660916480.0, "grad_norm": 235.10382414086962, "language_loss": 0.74030578, "learning_rate": 8.370891630121569e-08, "loss": 0.75466216, "num_input_tokens_seen": 326661855, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.2298584, "step": 15141, "time_per_iteration": 2.9178528785705566 }, { "auxiliary_loss_clip": 0.01241723, "auxiliary_loss_mlp": 0.00220705, "balance_loss_clip": 1.02146423, "balance_loss_mlp": 0.19426477, "epoch": 0.9103862918983917, "flos": 23878549365120.0, "grad_norm": 73.88859369565351, "language_loss": 0.8326655, "learning_rate": 8.359745694462005e-08, "loss": 0.8472898, "num_input_tokens_seen": 326679320, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.26464844, "step": 15142, "time_per_iteration": 2.892758846282959 }, { "auxiliary_loss_clip": 0.01224074, "auxiliary_loss_mlp": 0.00208755, "balance_loss_clip": 1.01092994, "balance_loss_mlp": 0.18599752, "epoch": 0.9104464151510596, "flos": 14939521989120.0, "grad_norm": 13.582807217951162, "language_loss": 0.71899247, "learning_rate": 8.348607025820076e-08, "loss": 0.73332077, "num_input_tokens_seen": 326698110, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.22753906, "step": 15143, "time_per_iteration": 2.8068759441375732 }, { "auxiliary_loss_clip": 0.01240837, "auxiliary_loss_mlp": 0.00213262, "balance_loss_clip": 1.02172256, "balance_loss_mlp": 0.18543839, "epoch": 0.9105065384037276, "flos": 33655803500160.0, "grad_norm": 10.196176277840399, "language_loss": 0.7037164, "learning_rate": 8.337475624618152e-08, "loss": 0.71825743, "num_input_tokens_seen": 326718370, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.27819824, "step": 15144, "time_per_iteration": 2.914198637008667 }, { "auxiliary_loss_clip": 0.01212203, "auxiliary_loss_mlp": 0.00198976, "balance_loss_clip": 1.00518608, "balance_loss_mlp": 0.17500326, "epoch": 0.9105666616563957, "flos": 24316695463680.0, "grad_norm": 53.53801923569034, "language_loss": 0.78150618, "learning_rate": 8.326351491278382e-08, "loss": 0.79561794, "num_input_tokens_seen": 326738445, "router_z_loss_clip": 2.07128906, "router_z_loss_mlp": 0.23950195, "step": 15145, "time_per_iteration": 2.7549173831939697 }, { "auxiliary_loss_clip": 0.01204832, "auxiliary_loss_mlp": 0.00210952, "balance_loss_clip": 0.9999218, "balance_loss_mlp": 0.18770599, "epoch": 0.9106267849090636, "flos": 29971692132480.0, "grad_norm": 2.3018574022160094, "language_loss": 0.78138876, "learning_rate": 8.315234626222545e-08, "loss": 0.79554659, "num_input_tokens_seen": 326758855, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.23242188, "step": 15146, "time_per_iteration": 2.7828433513641357 }, { "auxiliary_loss_clip": 0.01238532, "auxiliary_loss_mlp": 0.00210142, "balance_loss_clip": 1.02327538, "balance_loss_mlp": 0.18527508, "epoch": 0.9106869081617316, "flos": 25337743470720.0, "grad_norm": 200.22926889246204, "language_loss": 0.82619298, "learning_rate": 8.304125029872233e-08, "loss": 0.84067976, "num_input_tokens_seen": 326777140, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.2487793, "step": 15147, "time_per_iteration": 2.7467541694641113 }, { "auxiliary_loss_clip": 0.01245322, "auxiliary_loss_mlp": 0.0020246, "balance_loss_clip": 1.02495563, "balance_loss_mlp": 0.17739055, "epoch": 0.9107470314143995, "flos": 18187031543040.0, "grad_norm": 4.997713145314226, "language_loss": 0.88533688, "learning_rate": 8.293022702648711e-08, "loss": 0.89981467, "num_input_tokens_seen": 326794070, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25061035, "step": 15148, "time_per_iteration": 2.669952392578125 }, { "auxiliary_loss_clip": 0.01233273, "auxiliary_loss_mlp": 0.00223649, "balance_loss_clip": 1.01673818, "balance_loss_mlp": 0.19978349, "epoch": 0.9108071546670675, "flos": 23550828652800.0, "grad_norm": 411.2638205817862, "language_loss": 0.76953816, "learning_rate": 8.281927644972996e-08, "loss": 0.78410739, "num_input_tokens_seen": 326814695, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.23876953, "step": 15149, "time_per_iteration": 2.7897896766662598 }, { "auxiliary_loss_clip": 0.01237365, "auxiliary_loss_mlp": 0.00211159, "balance_loss_clip": 1.01795673, "balance_loss_mlp": 0.18297818, "epoch": 0.9108672779197354, "flos": 25630307746560.0, "grad_norm": 29.0189455799177, "language_loss": 0.71968496, "learning_rate": 8.270839857265776e-08, "loss": 0.7341702, "num_input_tokens_seen": 326835295, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.28222656, "step": 15150, "time_per_iteration": 2.790698766708374 }, { "auxiliary_loss_clip": 0.01240637, "auxiliary_loss_mlp": 0.00224068, "balance_loss_clip": 1.02360678, "balance_loss_mlp": 0.19846165, "epoch": 0.9109274011724035, "flos": 22339094319360.0, "grad_norm": 4.404218003408579, "language_loss": 0.81157684, "learning_rate": 8.259759339947514e-08, "loss": 0.82622385, "num_input_tokens_seen": 326853350, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25585938, "step": 15151, "time_per_iteration": 2.655329465866089 }, { "auxiliary_loss_clip": 0.01211022, "auxiliary_loss_mlp": 0.00210151, "balance_loss_clip": 1.00517368, "balance_loss_mlp": 0.18709597, "epoch": 0.9109875244250714, "flos": 26688200129280.0, "grad_norm": 1775.8284102356038, "language_loss": 0.71813691, "learning_rate": 8.248686093438429e-08, "loss": 0.73234868, "num_input_tokens_seen": 326873425, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.23046875, "step": 15152, "time_per_iteration": 2.7658543586730957 }, { "auxiliary_loss_clip": 0.0124139, "auxiliary_loss_mlp": 0.00217355, "balance_loss_clip": 1.01996422, "balance_loss_mlp": 0.19221354, "epoch": 0.9110476476777394, "flos": 22930112701440.0, "grad_norm": 57.10333627716773, "language_loss": 0.80128002, "learning_rate": 8.23762011815834e-08, "loss": 0.81586748, "num_input_tokens_seen": 326893455, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25146484, "step": 15153, "time_per_iteration": 2.7493207454681396 }, { "auxiliary_loss_clip": 0.01243548, "auxiliary_loss_mlp": 0.00221812, "balance_loss_clip": 1.02788627, "balance_loss_mlp": 0.19724312, "epoch": 0.9111077709304073, "flos": 13472857854720.0, "grad_norm": 4.699183421207225, "language_loss": 0.79303461, "learning_rate": 8.226561414526956e-08, "loss": 0.80768824, "num_input_tokens_seen": 326910210, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24584961, "step": 15154, "time_per_iteration": 2.6382365226745605 }, { "auxiliary_loss_clip": 0.01234451, "auxiliary_loss_mlp": 0.00215111, "balance_loss_clip": 1.0237031, "balance_loss_mlp": 0.19022022, "epoch": 0.9111678941830753, "flos": 20850561780480.0, "grad_norm": 3.2425487081174853, "language_loss": 0.91590154, "learning_rate": 8.215509982963564e-08, "loss": 0.93039721, "num_input_tokens_seen": 326929350, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.2487793, "step": 15155, "time_per_iteration": 2.6384549140930176 }, { "auxiliary_loss_clip": 0.01236431, "auxiliary_loss_mlp": 0.00218317, "balance_loss_clip": 1.0227834, "balance_loss_mlp": 0.19455847, "epoch": 0.9112280174357432, "flos": 19682244011520.0, "grad_norm": 43.41043724460105, "language_loss": 0.66947699, "learning_rate": 8.204465823887252e-08, "loss": 0.68402445, "num_input_tokens_seen": 326949060, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.23742676, "step": 15156, "time_per_iteration": 2.6806352138519287 }, { "auxiliary_loss_clip": 0.01241259, "auxiliary_loss_mlp": 0.00208573, "balance_loss_clip": 1.02099764, "balance_loss_mlp": 0.1825735, "epoch": 0.9112881406884112, "flos": 25447163276160.0, "grad_norm": 700.4247347549093, "language_loss": 0.81445742, "learning_rate": 8.193428937716796e-08, "loss": 0.82895577, "num_input_tokens_seen": 326968950, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.26013184, "step": 15157, "time_per_iteration": 2.670891761779785 }, { "auxiliary_loss_clip": 0.01215914, "auxiliary_loss_mlp": 0.00186535, "balance_loss_clip": 1.00579393, "balance_loss_mlp": 0.1640761, "epoch": 0.9113482639410793, "flos": 33066975847680.0, "grad_norm": 3.632969599038402, "language_loss": 0.68521476, "learning_rate": 8.182399324870747e-08, "loss": 0.69923925, "num_input_tokens_seen": 326989455, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.22460938, "step": 15158, "time_per_iteration": 2.784550189971924 }, { "auxiliary_loss_clip": 0.01221766, "auxiliary_loss_mlp": 0.00220605, "balance_loss_clip": 1.01060426, "balance_loss_mlp": 0.19570239, "epoch": 0.9114083871937472, "flos": 21835591424640.0, "grad_norm": 2543.343420082134, "language_loss": 0.75065589, "learning_rate": 8.171376985767375e-08, "loss": 0.76507962, "num_input_tokens_seen": 327009640, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.24890137, "step": 15159, "time_per_iteration": 2.6260087490081787 }, { "auxiliary_loss_clip": 0.01225778, "auxiliary_loss_mlp": 0.00210869, "balance_loss_clip": 1.01403475, "balance_loss_mlp": 0.18565577, "epoch": 0.9114685104464152, "flos": 27088999061760.0, "grad_norm": 2.4633408166723356, "language_loss": 0.85357761, "learning_rate": 8.160361920824588e-08, "loss": 0.86794412, "num_input_tokens_seen": 327027690, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.25231934, "step": 15160, "time_per_iteration": 2.6778383255004883 }, { "auxiliary_loss_clip": 0.01265714, "auxiliary_loss_mlp": 0.00234041, "balance_loss_clip": 1.04277742, "balance_loss_mlp": 0.20487064, "epoch": 0.9115286336990831, "flos": 17967042696960.0, "grad_norm": 244.95996680243576, "language_loss": 0.78334773, "learning_rate": 8.149354130460073e-08, "loss": 0.79834521, "num_input_tokens_seen": 327045915, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.29187012, "step": 15161, "time_per_iteration": 2.819289445877075 }, { "auxiliary_loss_clip": 0.01242967, "auxiliary_loss_mlp": 0.00213949, "balance_loss_clip": 1.01964033, "balance_loss_mlp": 0.18790133, "epoch": 0.9115887569517511, "flos": 22929861306240.0, "grad_norm": 15.838985480114276, "language_loss": 0.85780823, "learning_rate": 8.138353615091321e-08, "loss": 0.8723774, "num_input_tokens_seen": 327066355, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.26037598, "step": 15162, "time_per_iteration": 2.7309370040893555 }, { "auxiliary_loss_clip": 0.01224224, "auxiliary_loss_mlp": 0.00233217, "balance_loss_clip": 1.01448941, "balance_loss_mlp": 0.2092796, "epoch": 0.911648880204419, "flos": 23988436047360.0, "grad_norm": 26.511323279638745, "language_loss": 0.73819649, "learning_rate": 8.127360375135395e-08, "loss": 0.7527709, "num_input_tokens_seen": 327086735, "router_z_loss_clip": 2.10058594, "router_z_loss_mlp": 0.23925781, "step": 15163, "time_per_iteration": 2.7200591564178467 }, { "auxiliary_loss_clip": 0.01251838, "auxiliary_loss_mlp": 0.00242752, "balance_loss_clip": 1.02505422, "balance_loss_mlp": 0.21620445, "epoch": 0.911709003457087, "flos": 17055306754560.0, "grad_norm": 368.0479644577485, "language_loss": 0.81335402, "learning_rate": 8.116374411009186e-08, "loss": 0.82829988, "num_input_tokens_seen": 327104035, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.26574707, "step": 15164, "time_per_iteration": 2.6807126998901367 }, { "auxiliary_loss_clip": 0.01226789, "auxiliary_loss_mlp": 0.00199334, "balance_loss_clip": 1.01723647, "balance_loss_mlp": 0.17582585, "epoch": 0.911769126709755, "flos": 21653344794240.0, "grad_norm": 22.141503100476793, "language_loss": 0.82143152, "learning_rate": 8.105395723129315e-08, "loss": 0.83569276, "num_input_tokens_seen": 327124370, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23510742, "step": 15165, "time_per_iteration": 2.6981706619262695 }, { "auxiliary_loss_clip": 0.01248853, "auxiliary_loss_mlp": 0.00211277, "balance_loss_clip": 1.02917433, "balance_loss_mlp": 0.18599312, "epoch": 0.911829249962423, "flos": 24790321221120.0, "grad_norm": 3.601440051723143, "language_loss": 0.82748806, "learning_rate": 8.094424311912074e-08, "loss": 0.84208935, "num_input_tokens_seen": 327140915, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25292969, "step": 15166, "time_per_iteration": 2.6600568294525146 }, { "auxiliary_loss_clip": 0.01263865, "auxiliary_loss_mlp": 0.00226583, "balance_loss_clip": 1.03252041, "balance_loss_mlp": 0.19923644, "epoch": 0.9118893732150909, "flos": 20959406968320.0, "grad_norm": 3.6419729031099175, "language_loss": 0.82391262, "learning_rate": 8.083460177773482e-08, "loss": 0.83881712, "num_input_tokens_seen": 327158940, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.27355957, "step": 15167, "time_per_iteration": 4.059603691101074 }, { "auxiliary_loss_clip": 0.01086792, "auxiliary_loss_mlp": 0.00074483, "balance_loss_clip": 0.95079195, "balance_loss_mlp": 0.06795068, "epoch": 0.9119494964677589, "flos": 67917385872000.0, "grad_norm": 0.7555371186955577, "language_loss": 0.64838064, "learning_rate": 8.072503321129298e-08, "loss": 0.65999341, "num_input_tokens_seen": 327217450, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06542969, "step": 15168, "time_per_iteration": 4.550852298736572 }, { "auxiliary_loss_clip": 0.0123159, "auxiliary_loss_mlp": 0.00210662, "balance_loss_clip": 1.02025867, "balance_loss_mlp": 0.18728471, "epoch": 0.9120096197204268, "flos": 18551524803840.0, "grad_norm": 11.23093551234543, "language_loss": 0.85729623, "learning_rate": 8.061553742395033e-08, "loss": 0.87171876, "num_input_tokens_seen": 327233905, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.23376465, "step": 15169, "time_per_iteration": 2.6620635986328125 }, { "auxiliary_loss_clip": 0.01234909, "auxiliary_loss_mlp": 0.00213502, "balance_loss_clip": 1.01660764, "balance_loss_mlp": 0.18782394, "epoch": 0.9120697429730948, "flos": 19025725178880.0, "grad_norm": 75.19426090336482, "language_loss": 0.89947945, "learning_rate": 8.05061144198591e-08, "loss": 0.91396362, "num_input_tokens_seen": 327252430, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25695801, "step": 15170, "time_per_iteration": 2.6526594161987305 }, { "auxiliary_loss_clip": 0.01239277, "auxiliary_loss_mlp": 0.00213506, "balance_loss_clip": 1.01981449, "balance_loss_mlp": 0.18722011, "epoch": 0.9121298662257629, "flos": 17163685065600.0, "grad_norm": 219.02244106901773, "language_loss": 0.88929844, "learning_rate": 8.039676420316799e-08, "loss": 0.90382624, "num_input_tokens_seen": 327269215, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.26269531, "step": 15171, "time_per_iteration": 2.6428802013397217 }, { "auxiliary_loss_clip": 0.01229491, "auxiliary_loss_mlp": 0.00224893, "balance_loss_clip": 1.01352739, "balance_loss_mlp": 0.2013019, "epoch": 0.9121899894784308, "flos": 19682710888320.0, "grad_norm": 70.6049377733483, "language_loss": 0.72601545, "learning_rate": 8.02874867780241e-08, "loss": 0.74055934, "num_input_tokens_seen": 327290320, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.23596191, "step": 15172, "time_per_iteration": 2.6725051403045654 }, { "auxiliary_loss_clip": 0.01243418, "auxiliary_loss_mlp": 0.00239439, "balance_loss_clip": 1.02744496, "balance_loss_mlp": 0.21324843, "epoch": 0.9122501127310988, "flos": 22235743912320.0, "grad_norm": 132.2154748583167, "language_loss": 0.82434773, "learning_rate": 8.017828214857103e-08, "loss": 0.8391763, "num_input_tokens_seen": 327310150, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.26184082, "step": 15173, "time_per_iteration": 2.69443678855896 }, { "auxiliary_loss_clip": 0.01256779, "auxiliary_loss_mlp": 0.00221783, "balance_loss_clip": 1.03077793, "balance_loss_mlp": 0.19428153, "epoch": 0.9123102359837667, "flos": 15957122290560.0, "grad_norm": 61.906340312503744, "language_loss": 0.75600952, "learning_rate": 8.00691503189499e-08, "loss": 0.77079517, "num_input_tokens_seen": 327326660, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.27526855, "step": 15174, "time_per_iteration": 4.156372785568237 }, { "auxiliary_loss_clip": 0.01227495, "auxiliary_loss_mlp": 0.00191075, "balance_loss_clip": 1.01310587, "balance_loss_mlp": 0.16655341, "epoch": 0.9123703592364347, "flos": 25155784149120.0, "grad_norm": 2.9696271677495165, "language_loss": 0.83573443, "learning_rate": 7.996009129329894e-08, "loss": 0.84992015, "num_input_tokens_seen": 327346700, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.24536133, "step": 15175, "time_per_iteration": 2.6867010593414307 }, { "auxiliary_loss_clip": 0.01089776, "auxiliary_loss_mlp": 0.00068723, "balance_loss_clip": 0.95233476, "balance_loss_mlp": 0.06266695, "epoch": 0.9124304824891026, "flos": 60801650812800.0, "grad_norm": 0.9574020081588935, "language_loss": 0.57630992, "learning_rate": 7.985110507575421e-08, "loss": 0.58789492, "num_input_tokens_seen": 327403050, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06054688, "step": 15176, "time_per_iteration": 3.2126123905181885 }, { "auxiliary_loss_clip": 0.01235554, "auxiliary_loss_mlp": 0.00210934, "balance_loss_clip": 1.01904273, "balance_loss_mlp": 0.18431483, "epoch": 0.9124906057417707, "flos": 18150941352960.0, "grad_norm": 1127.9117966502836, "language_loss": 0.76152539, "learning_rate": 7.97421916704475e-08, "loss": 0.77599025, "num_input_tokens_seen": 327422225, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.26635742, "step": 15177, "time_per_iteration": 2.6383533477783203 }, { "auxiliary_loss_clip": 0.01229141, "auxiliary_loss_mlp": 0.00203188, "balance_loss_clip": 1.01862979, "balance_loss_mlp": 0.17934623, "epoch": 0.9125507289944386, "flos": 11686769049600.0, "grad_norm": 5.130844342602503, "language_loss": 0.8923496, "learning_rate": 7.963335108150926e-08, "loss": 0.90667284, "num_input_tokens_seen": 327437025, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.23852539, "step": 15178, "time_per_iteration": 2.6197571754455566 }, { "auxiliary_loss_clip": 0.01229213, "auxiliary_loss_mlp": 0.00248999, "balance_loss_clip": 1.01261961, "balance_loss_mlp": 0.22314274, "epoch": 0.9126108522471066, "flos": 17748813617280.0, "grad_norm": 12.750744013220011, "language_loss": 0.86715716, "learning_rate": 7.952458331306711e-08, "loss": 0.88193929, "num_input_tokens_seen": 327453915, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.25830078, "step": 15179, "time_per_iteration": 4.0271079540252686 }, { "auxiliary_loss_clip": 0.01217052, "auxiliary_loss_mlp": 0.00179149, "balance_loss_clip": 1.00913072, "balance_loss_mlp": 0.15509218, "epoch": 0.9126709754997745, "flos": 27635738952960.0, "grad_norm": 449.8073812144304, "language_loss": 0.74556196, "learning_rate": 7.941588836924507e-08, "loss": 0.75952399, "num_input_tokens_seen": 327474415, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.24047852, "step": 15180, "time_per_iteration": 2.7400448322296143 }, { "auxiliary_loss_clip": 0.01215482, "auxiliary_loss_mlp": 0.00200521, "balance_loss_clip": 1.00917506, "balance_loss_mlp": 0.1771203, "epoch": 0.9127310987524425, "flos": 15924982596480.0, "grad_norm": 6.166886355935559, "language_loss": 0.8377676, "learning_rate": 7.930726625416495e-08, "loss": 0.85192764, "num_input_tokens_seen": 327492750, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.23388672, "step": 15181, "time_per_iteration": 2.6591107845306396 }, { "auxiliary_loss_clip": 0.01250395, "auxiliary_loss_mlp": 0.00207713, "balance_loss_clip": 1.02755022, "balance_loss_mlp": 0.18104537, "epoch": 0.9127912220051104, "flos": 21536885923200.0, "grad_norm": 16.176513667530834, "language_loss": 0.83381116, "learning_rate": 7.919871697194614e-08, "loss": 0.84839219, "num_input_tokens_seen": 327509470, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26660156, "step": 15182, "time_per_iteration": 2.668879985809326 }, { "auxiliary_loss_clip": 0.01247845, "auxiliary_loss_mlp": 0.00219359, "balance_loss_clip": 1.02350307, "balance_loss_mlp": 0.19341953, "epoch": 0.9128513452577784, "flos": 24063561342720.0, "grad_norm": 2.6778235083100212, "language_loss": 0.83988869, "learning_rate": 7.909024052670421e-08, "loss": 0.85456073, "num_input_tokens_seen": 327530520, "router_z_loss_clip": 2.24511719, "router_z_loss_mlp": 0.25939941, "step": 15183, "time_per_iteration": 2.705538511276245 }, { "auxiliary_loss_clip": 0.01259019, "auxiliary_loss_mlp": 0.0022853, "balance_loss_clip": 1.03534579, "balance_loss_mlp": 0.20338893, "epoch": 0.9129114685104465, "flos": 16216469464320.0, "grad_norm": 8.317495280846062, "language_loss": 0.8492372, "learning_rate": 7.898183692255256e-08, "loss": 0.86411273, "num_input_tokens_seen": 327546960, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.25146484, "step": 15184, "time_per_iteration": 2.634359359741211 }, { "auxiliary_loss_clip": 0.01227068, "auxiliary_loss_mlp": 0.00195445, "balance_loss_clip": 1.01634467, "balance_loss_mlp": 0.17341486, "epoch": 0.9129715917631144, "flos": 19384364522880.0, "grad_norm": 7.524071213030906, "language_loss": 0.83471233, "learning_rate": 7.887350616360233e-08, "loss": 0.84893751, "num_input_tokens_seen": 327564830, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.22033691, "step": 15185, "time_per_iteration": 2.7252070903778076 }, { "auxiliary_loss_clip": 0.01230929, "auxiliary_loss_mlp": 0.00211495, "balance_loss_clip": 1.01518011, "balance_loss_mlp": 0.18588866, "epoch": 0.9130317150157824, "flos": 20590460421120.0, "grad_norm": 4.870786995984541, "language_loss": 0.78439516, "learning_rate": 7.876524825396158e-08, "loss": 0.79881936, "num_input_tokens_seen": 327583675, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.25634766, "step": 15186, "time_per_iteration": 2.657087564468384 }, { "auxiliary_loss_clip": 0.01245716, "auxiliary_loss_mlp": 0.00212773, "balance_loss_clip": 1.02639318, "balance_loss_mlp": 0.18831134, "epoch": 0.9130918382684503, "flos": 20189230525440.0, "grad_norm": 19.45421687575443, "language_loss": 0.86118162, "learning_rate": 7.865706319773502e-08, "loss": 0.87576652, "num_input_tokens_seen": 327602280, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24475098, "step": 15187, "time_per_iteration": 2.6904025077819824 }, { "auxiliary_loss_clip": 0.01244566, "auxiliary_loss_mlp": 0.00214419, "balance_loss_clip": 1.02381456, "balance_loss_mlp": 0.18942064, "epoch": 0.9131519615211183, "flos": 25556870390400.0, "grad_norm": 561.3572068252946, "language_loss": 0.73943448, "learning_rate": 7.854895099902515e-08, "loss": 0.75402439, "num_input_tokens_seen": 327623515, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25024414, "step": 15188, "time_per_iteration": 2.680144786834717 }, { "auxiliary_loss_clip": 0.01228962, "auxiliary_loss_mlp": 0.00224844, "balance_loss_clip": 1.0164156, "balance_loss_mlp": 0.19998923, "epoch": 0.9132120847737862, "flos": 17931563038080.0, "grad_norm": 2.3920337343922795, "language_loss": 0.85091245, "learning_rate": 7.844091166193157e-08, "loss": 0.86545062, "num_input_tokens_seen": 327642875, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.24853516, "step": 15189, "time_per_iteration": 2.670252799987793 }, { "auxiliary_loss_clip": 0.01226291, "auxiliary_loss_mlp": 0.00207947, "balance_loss_clip": 1.01066208, "balance_loss_mlp": 0.18425986, "epoch": 0.9132722080264543, "flos": 20047635112320.0, "grad_norm": 51.18450463493105, "language_loss": 0.83647001, "learning_rate": 7.8332945190551e-08, "loss": 0.85081238, "num_input_tokens_seen": 327662450, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23693848, "step": 15190, "time_per_iteration": 2.7182059288024902 }, { "auxiliary_loss_clip": 0.01083239, "auxiliary_loss_mlp": 0.00091578, "balance_loss_clip": 0.94682646, "balance_loss_mlp": 0.0839009, "epoch": 0.9133323312791222, "flos": 70439967141120.0, "grad_norm": 0.7027294052999026, "language_loss": 0.56410134, "learning_rate": 7.822505158897797e-08, "loss": 0.57584947, "num_input_tokens_seen": 327723845, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.07666016, "step": 15191, "time_per_iteration": 3.1930105686187744 }, { "auxiliary_loss_clip": 0.01233394, "auxiliary_loss_mlp": 0.00200004, "balance_loss_clip": 1.02109194, "balance_loss_mlp": 0.17491055, "epoch": 0.9133924545317902, "flos": 25483792170240.0, "grad_norm": 5.216472784138341, "language_loss": 0.80534714, "learning_rate": 7.81172308613034e-08, "loss": 0.81968111, "num_input_tokens_seen": 327742590, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.25097656, "step": 15192, "time_per_iteration": 2.768113613128662 }, { "auxiliary_loss_clip": 0.01230177, "auxiliary_loss_mlp": 0.00207486, "balance_loss_clip": 1.01518404, "balance_loss_mlp": 0.18372795, "epoch": 0.9134525777844581, "flos": 39930690107520.0, "grad_norm": 424.1350886672366, "language_loss": 0.77126062, "learning_rate": 7.800948301161647e-08, "loss": 0.78563726, "num_input_tokens_seen": 327764350, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.23779297, "step": 15193, "time_per_iteration": 2.8360421657562256 }, { "auxiliary_loss_clip": 0.01235898, "auxiliary_loss_mlp": 0.0021955, "balance_loss_clip": 1.02089691, "balance_loss_mlp": 0.19500509, "epoch": 0.9135127010371261, "flos": 20886723797760.0, "grad_norm": 2229.675209293125, "language_loss": 0.80308104, "learning_rate": 7.790180804400215e-08, "loss": 0.81763542, "num_input_tokens_seen": 327783120, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.2454834, "step": 15194, "time_per_iteration": 2.6483874320983887 }, { "auxiliary_loss_clip": 0.01244318, "auxiliary_loss_mlp": 0.00236826, "balance_loss_clip": 1.02529311, "balance_loss_mlp": 0.20955065, "epoch": 0.913572824289794, "flos": 20813250528000.0, "grad_norm": 20.92280030603179, "language_loss": 0.72246176, "learning_rate": 7.779420596254383e-08, "loss": 0.73727322, "num_input_tokens_seen": 327801960, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.27282715, "step": 15195, "time_per_iteration": 2.6423444747924805 }, { "auxiliary_loss_clip": 0.01235802, "auxiliary_loss_mlp": 0.00224938, "balance_loss_clip": 1.01794624, "balance_loss_mlp": 0.19871241, "epoch": 0.913632947542462, "flos": 25703278225920.0, "grad_norm": 10.094544826301764, "language_loss": 0.79457068, "learning_rate": 7.768667677132201e-08, "loss": 0.80917799, "num_input_tokens_seen": 327823795, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.26269531, "step": 15196, "time_per_iteration": 2.7720067501068115 }, { "auxiliary_loss_clip": 0.01236305, "auxiliary_loss_mlp": 0.00211033, "balance_loss_clip": 1.02253544, "balance_loss_mlp": 0.1874655, "epoch": 0.9136930707951301, "flos": 26286216048000.0, "grad_norm": 2.5689518077882143, "language_loss": 0.7730189, "learning_rate": 7.757922047441411e-08, "loss": 0.78749228, "num_input_tokens_seen": 327845175, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.23571777, "step": 15197, "time_per_iteration": 2.6641390323638916 }, { "auxiliary_loss_clip": 0.01244292, "auxiliary_loss_mlp": 0.00221147, "balance_loss_clip": 1.02214587, "balance_loss_mlp": 0.19465879, "epoch": 0.913753194047798, "flos": 22091885942400.0, "grad_norm": 28.933515667311795, "language_loss": 0.85417068, "learning_rate": 7.747183707589489e-08, "loss": 0.86882508, "num_input_tokens_seen": 327863150, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.26501465, "step": 15198, "time_per_iteration": 2.769416093826294 }, { "auxiliary_loss_clip": 0.01234326, "auxiliary_loss_mlp": 0.00203303, "balance_loss_clip": 1.02150559, "balance_loss_mlp": 0.17687465, "epoch": 0.913813317300466, "flos": 23587206151680.0, "grad_norm": 49.53372887669639, "language_loss": 0.74102563, "learning_rate": 7.736452657983616e-08, "loss": 0.75540197, "num_input_tokens_seen": 327883445, "router_z_loss_clip": 2.12792969, "router_z_loss_mlp": 0.2644043, "step": 15199, "time_per_iteration": 2.7137839794158936 }, { "auxiliary_loss_clip": 0.01249806, "auxiliary_loss_mlp": 0.00215566, "balance_loss_clip": 1.03012824, "balance_loss_mlp": 0.18949476, "epoch": 0.9138734405531339, "flos": 28876452583680.0, "grad_norm": 50.33344386873857, "language_loss": 0.74128026, "learning_rate": 7.725728899030714e-08, "loss": 0.755934, "num_input_tokens_seen": 327905745, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.26074219, "step": 15200, "time_per_iteration": 2.726574182510376 }, { "auxiliary_loss_clip": 0.01231855, "auxiliary_loss_mlp": 0.00216258, "balance_loss_clip": 1.01907432, "balance_loss_mlp": 0.19178393, "epoch": 0.9139335638058019, "flos": 22821087945600.0, "grad_norm": 6.275974848505211, "language_loss": 0.79504609, "learning_rate": 7.715012431137435e-08, "loss": 0.80952722, "num_input_tokens_seen": 327925435, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.24475098, "step": 15201, "time_per_iteration": 2.650866746902466 }, { "auxiliary_loss_clip": 0.01226285, "auxiliary_loss_mlp": 0.002421, "balance_loss_clip": 1.01515508, "balance_loss_mlp": 0.21699433, "epoch": 0.9139936870584698, "flos": 18004174381440.0, "grad_norm": 22.370980122587003, "language_loss": 0.78725278, "learning_rate": 7.704303254710165e-08, "loss": 0.80193663, "num_input_tokens_seen": 327944145, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.25134277, "step": 15202, "time_per_iteration": 2.6384048461914062 }, { "auxiliary_loss_clip": 0.01236251, "auxiliary_loss_mlp": 0.00211777, "balance_loss_clip": 1.0210743, "balance_loss_mlp": 0.18438303, "epoch": 0.9140538103111379, "flos": 15813767111040.0, "grad_norm": 67.20164936036782, "language_loss": 0.79440361, "learning_rate": 7.693601370155001e-08, "loss": 0.80888391, "num_input_tokens_seen": 327960565, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.27416992, "step": 15203, "time_per_iteration": 2.649212121963501 }, { "auxiliary_loss_clip": 0.01234347, "auxiliary_loss_mlp": 0.00211462, "balance_loss_clip": 1.022717, "balance_loss_mlp": 0.18674992, "epoch": 0.9141139335638058, "flos": 23987035416960.0, "grad_norm": 118.10637971284312, "language_loss": 0.77199733, "learning_rate": 7.682906777877751e-08, "loss": 0.78645545, "num_input_tokens_seen": 327981180, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.24694824, "step": 15204, "time_per_iteration": 2.743765354156494 }, { "auxiliary_loss_clip": 0.01237347, "auxiliary_loss_mlp": 0.00242253, "balance_loss_clip": 1.01805544, "balance_loss_mlp": 0.21553841, "epoch": 0.9141740568164738, "flos": 24024418496640.0, "grad_norm": 11.599380362575976, "language_loss": 0.701024, "learning_rate": 7.672219478283915e-08, "loss": 0.71581995, "num_input_tokens_seen": 328001500, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.26721191, "step": 15205, "time_per_iteration": 2.7192745208740234 }, { "auxiliary_loss_clip": 0.01230902, "auxiliary_loss_mlp": 0.00204756, "balance_loss_clip": 1.01944852, "balance_loss_mlp": 0.18013912, "epoch": 0.9142341800691417, "flos": 27018291139200.0, "grad_norm": 59.03333719654073, "language_loss": 0.89449704, "learning_rate": 7.661539471778811e-08, "loss": 0.90885359, "num_input_tokens_seen": 328023025, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.24609375, "step": 15206, "time_per_iteration": 2.7348172664642334 }, { "auxiliary_loss_clip": 0.01227972, "auxiliary_loss_mlp": 0.00217726, "balance_loss_clip": 1.01562572, "balance_loss_mlp": 0.19318049, "epoch": 0.9142943033218097, "flos": 20412487509120.0, "grad_norm": 20.402348763291076, "language_loss": 0.8304742, "learning_rate": 7.650866758767382e-08, "loss": 0.84493124, "num_input_tokens_seen": 328041410, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.24536133, "step": 15207, "time_per_iteration": 2.921351671218872 }, { "auxiliary_loss_clip": 0.01237938, "auxiliary_loss_mlp": 0.00217112, "balance_loss_clip": 1.01989019, "balance_loss_mlp": 0.19408101, "epoch": 0.9143544265744776, "flos": 19755322231680.0, "grad_norm": 4.525769776494969, "language_loss": 0.81761432, "learning_rate": 7.640201339654373e-08, "loss": 0.83216488, "num_input_tokens_seen": 328060495, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.23059082, "step": 15208, "time_per_iteration": 2.7250730991363525 }, { "auxiliary_loss_clip": 0.01230924, "auxiliary_loss_mlp": 0.00215395, "balance_loss_clip": 1.02181101, "balance_loss_mlp": 0.1923995, "epoch": 0.9144145498271457, "flos": 17165444832000.0, "grad_norm": 5.290281620066998, "language_loss": 0.94196707, "learning_rate": 7.629543214844237e-08, "loss": 0.9564302, "num_input_tokens_seen": 328076905, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.23010254, "step": 15209, "time_per_iteration": 4.105029106140137 }, { "auxiliary_loss_clip": 0.01248252, "auxiliary_loss_mlp": 0.00233749, "balance_loss_clip": 1.02507555, "balance_loss_mlp": 0.20609234, "epoch": 0.9144746730798137, "flos": 23726072131200.0, "grad_norm": 17.456356301018843, "language_loss": 0.84735978, "learning_rate": 7.618892384741093e-08, "loss": 0.86217976, "num_input_tokens_seen": 328096960, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.2767334, "step": 15210, "time_per_iteration": 4.1807167530059814 }, { "auxiliary_loss_clip": 0.01227831, "auxiliary_loss_mlp": 0.00210856, "balance_loss_clip": 1.01666069, "balance_loss_mlp": 0.18805096, "epoch": 0.9145347963324816, "flos": 25847854467840.0, "grad_norm": 8.697062765935094, "language_loss": 0.84546208, "learning_rate": 7.6082488497488e-08, "loss": 0.85984898, "num_input_tokens_seen": 328115445, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.22814941, "step": 15211, "time_per_iteration": 2.7157561779022217 }, { "auxiliary_loss_clip": 0.01229772, "auxiliary_loss_mlp": 0.00218953, "balance_loss_clip": 1.01388025, "balance_loss_mlp": 0.19499183, "epoch": 0.9145949195851496, "flos": 19242769109760.0, "grad_norm": 7.7657828846739045, "language_loss": 0.90300107, "learning_rate": 7.597612610270986e-08, "loss": 0.91748834, "num_input_tokens_seen": 328133965, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.23962402, "step": 15212, "time_per_iteration": 2.685877561569214 }, { "auxiliary_loss_clip": 0.01235772, "auxiliary_loss_mlp": 0.00212989, "balance_loss_clip": 1.02221012, "balance_loss_mlp": 0.18746585, "epoch": 0.9146550428378175, "flos": 18296379521280.0, "grad_norm": 9.87575385844662, "language_loss": 0.89776421, "learning_rate": 7.586983666711022e-08, "loss": 0.91225183, "num_input_tokens_seen": 328151520, "router_z_loss_clip": 2.13378906, "router_z_loss_mlp": 0.25524902, "step": 15213, "time_per_iteration": 2.666491985321045 }, { "auxiliary_loss_clip": 0.0123718, "auxiliary_loss_mlp": 0.00205856, "balance_loss_clip": 1.01946259, "balance_loss_mlp": 0.18072656, "epoch": 0.9147151660904855, "flos": 20084264006400.0, "grad_norm": 7.330480119635453, "language_loss": 0.82511938, "learning_rate": 7.576362019471894e-08, "loss": 0.83954972, "num_input_tokens_seen": 328171275, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.25109863, "step": 15214, "time_per_iteration": 2.6956350803375244 }, { "auxiliary_loss_clip": 0.01256634, "auxiliary_loss_mlp": 0.0021571, "balance_loss_clip": 1.03601694, "balance_loss_mlp": 0.19078302, "epoch": 0.9147752893431534, "flos": 24389127239040.0, "grad_norm": 14.629216149161605, "language_loss": 0.72229058, "learning_rate": 7.565747668956413e-08, "loss": 0.737014, "num_input_tokens_seen": 328192115, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24938965, "step": 15215, "time_per_iteration": 2.7841944694519043 }, { "auxiliary_loss_clip": 0.01252862, "auxiliary_loss_mlp": 0.00214201, "balance_loss_clip": 1.03127599, "balance_loss_mlp": 0.18736693, "epoch": 0.9148354125958215, "flos": 18150402648960.0, "grad_norm": 43.99972806887092, "language_loss": 0.86701262, "learning_rate": 7.555140615567058e-08, "loss": 0.88168323, "num_input_tokens_seen": 328208990, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.26855469, "step": 15216, "time_per_iteration": 4.031267404556274 }, { "auxiliary_loss_clip": 0.01228105, "auxiliary_loss_mlp": 0.00228824, "balance_loss_clip": 1.01361799, "balance_loss_mlp": 0.20334873, "epoch": 0.9148955358484894, "flos": 23367540528000.0, "grad_norm": 20.142432060383065, "language_loss": 0.76259696, "learning_rate": 7.544540859706062e-08, "loss": 0.77716625, "num_input_tokens_seen": 328227840, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.25476074, "step": 15217, "time_per_iteration": 2.7114360332489014 }, { "auxiliary_loss_clip": 0.01247707, "auxiliary_loss_mlp": 0.0020675, "balance_loss_clip": 1.02893353, "balance_loss_mlp": 0.18231168, "epoch": 0.9149556591011574, "flos": 18076498416000.0, "grad_norm": 13.50187486799799, "language_loss": 0.88517094, "learning_rate": 7.533948401775347e-08, "loss": 0.89971554, "num_input_tokens_seen": 328246250, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24450684, "step": 15218, "time_per_iteration": 2.648017644882202 }, { "auxiliary_loss_clip": 0.01083006, "auxiliary_loss_mlp": 0.00070228, "balance_loss_clip": 0.94717634, "balance_loss_mlp": 0.06293241, "epoch": 0.9150157823538253, "flos": 54586374825600.0, "grad_norm": 0.8209146784577512, "language_loss": 0.58300436, "learning_rate": 7.523363242176595e-08, "loss": 0.59453666, "num_input_tokens_seen": 328303625, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.07275391, "step": 15219, "time_per_iteration": 3.110989570617676 }, { "auxiliary_loss_clip": 0.01222978, "auxiliary_loss_mlp": 0.0021889, "balance_loss_clip": 1.01032019, "balance_loss_mlp": 0.19534624, "epoch": 0.9150759056064933, "flos": 17893102550400.0, "grad_norm": 14.45182193405094, "language_loss": 0.8638345, "learning_rate": 7.512785381311216e-08, "loss": 0.87825316, "num_input_tokens_seen": 328322135, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.23522949, "step": 15220, "time_per_iteration": 2.630849838256836 }, { "auxiliary_loss_clip": 0.01254282, "auxiliary_loss_mlp": 0.00226496, "balance_loss_clip": 1.02812433, "balance_loss_mlp": 0.20091371, "epoch": 0.9151360288591612, "flos": 18073517587200.0, "grad_norm": 12.200188971529789, "language_loss": 0.74002695, "learning_rate": 7.50221481958031e-08, "loss": 0.75483477, "num_input_tokens_seen": 328340750, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.25585938, "step": 15221, "time_per_iteration": 4.133982181549072 }, { "auxiliary_loss_clip": 0.01233092, "auxiliary_loss_mlp": 0.00214712, "balance_loss_clip": 1.01648331, "balance_loss_mlp": 0.19005959, "epoch": 0.9151961521118293, "flos": 19354523299200.0, "grad_norm": 378.26645009248045, "language_loss": 0.94844764, "learning_rate": 7.491651557384692e-08, "loss": 0.96292567, "num_input_tokens_seen": 328359995, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.24645996, "step": 15222, "time_per_iteration": 2.6632165908813477 }, { "auxiliary_loss_clip": 0.01077301, "auxiliary_loss_mlp": 0.00062585, "balance_loss_clip": 0.94213009, "balance_loss_mlp": 0.05605267, "epoch": 0.9152562753644973, "flos": 72146621018880.0, "grad_norm": 0.7051164639632026, "language_loss": 0.48844492, "learning_rate": 7.481095595124953e-08, "loss": 0.49984378, "num_input_tokens_seen": 328426865, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.06542969, "step": 15223, "time_per_iteration": 3.184323787689209 }, { "auxiliary_loss_clip": 0.0124005, "auxiliary_loss_mlp": 0.00222751, "balance_loss_clip": 1.02590489, "balance_loss_mlp": 0.19801475, "epoch": 0.9153163986171652, "flos": 20777016683520.0, "grad_norm": 24.520946316790237, "language_loss": 0.81810486, "learning_rate": 7.470546933201349e-08, "loss": 0.83273292, "num_input_tokens_seen": 328445970, "router_z_loss_clip": 2.13964844, "router_z_loss_mlp": 0.24743652, "step": 15224, "time_per_iteration": 2.6698310375213623 }, { "auxiliary_loss_clip": 0.0124129, "auxiliary_loss_mlp": 0.00215056, "balance_loss_clip": 1.0219512, "balance_loss_mlp": 0.18981916, "epoch": 0.9153765218698332, "flos": 23040107124480.0, "grad_norm": 15.44462375530467, "language_loss": 0.89967287, "learning_rate": 7.460005572013895e-08, "loss": 0.91423631, "num_input_tokens_seen": 328464585, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25244141, "step": 15225, "time_per_iteration": 2.7057507038116455 }, { "auxiliary_loss_clip": 0.01249935, "auxiliary_loss_mlp": 0.00213159, "balance_loss_clip": 1.03072762, "balance_loss_mlp": 0.18829212, "epoch": 0.9154366451225011, "flos": 28990900293120.0, "grad_norm": 692.0240411450426, "language_loss": 0.76450008, "learning_rate": 7.44947151196238e-08, "loss": 0.77913105, "num_input_tokens_seen": 328490155, "router_z_loss_clip": 2.18847656, "router_z_loss_mlp": 0.24890137, "step": 15226, "time_per_iteration": 2.7581610679626465 }, { "auxiliary_loss_clip": 0.01242394, "auxiliary_loss_mlp": 0.00212402, "balance_loss_clip": 1.02766395, "balance_loss_mlp": 0.18771356, "epoch": 0.9154967683751691, "flos": 22309504490880.0, "grad_norm": 11.297041058300382, "language_loss": 0.84784067, "learning_rate": 7.43894475344613e-08, "loss": 0.86238861, "num_input_tokens_seen": 328508275, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24658203, "step": 15227, "time_per_iteration": 2.676530122756958 }, { "auxiliary_loss_clip": 0.01234126, "auxiliary_loss_mlp": 0.00214145, "balance_loss_clip": 1.01830733, "balance_loss_mlp": 0.18994585, "epoch": 0.915556891627837, "flos": 24571481610240.0, "grad_norm": 4.0706522416841215, "language_loss": 0.81174254, "learning_rate": 7.428425296864404e-08, "loss": 0.82622522, "num_input_tokens_seen": 328529425, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.24206543, "step": 15228, "time_per_iteration": 2.7147719860076904 }, { "auxiliary_loss_clip": 0.01222631, "auxiliary_loss_mlp": 0.00216629, "balance_loss_clip": 1.00890386, "balance_loss_mlp": 0.19141592, "epoch": 0.9156170148805051, "flos": 22164676853760.0, "grad_norm": 39.24322981409838, "language_loss": 0.78504753, "learning_rate": 7.417913142616106e-08, "loss": 0.79944015, "num_input_tokens_seen": 328550200, "router_z_loss_clip": 2.13769531, "router_z_loss_mlp": 0.25219727, "step": 15229, "time_per_iteration": 2.691774845123291 }, { "auxiliary_loss_clip": 0.01234605, "auxiliary_loss_mlp": 0.0019957, "balance_loss_clip": 1.01610923, "balance_loss_mlp": 0.17541781, "epoch": 0.915677138133173, "flos": 20920659171840.0, "grad_norm": 25.828114866928644, "language_loss": 0.9096486, "learning_rate": 7.407408291099848e-08, "loss": 0.92399037, "num_input_tokens_seen": 328568540, "router_z_loss_clip": 2.18457031, "router_z_loss_mlp": 0.24133301, "step": 15230, "time_per_iteration": 2.651468276977539 }, { "auxiliary_loss_clip": 0.01220622, "auxiliary_loss_mlp": 0.00216028, "balance_loss_clip": 1.00998187, "balance_loss_mlp": 0.19154264, "epoch": 0.915737261385841, "flos": 24345136056960.0, "grad_norm": 82.04429104477137, "language_loss": 0.90565991, "learning_rate": 7.396910742713957e-08, "loss": 0.92002636, "num_input_tokens_seen": 328587300, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.24511719, "step": 15231, "time_per_iteration": 2.682678461074829 }, { "auxiliary_loss_clip": 0.01216622, "auxiliary_loss_mlp": 0.00222345, "balance_loss_clip": 1.00579381, "balance_loss_mlp": 0.19868165, "epoch": 0.9157973846385089, "flos": 26761386090240.0, "grad_norm": 1.973290299637632, "language_loss": 0.78802264, "learning_rate": 7.386420497856516e-08, "loss": 0.80241227, "num_input_tokens_seen": 328610055, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.23681641, "step": 15232, "time_per_iteration": 2.7297003269195557 }, { "auxiliary_loss_clip": 0.01247151, "auxiliary_loss_mlp": 0.00224966, "balance_loss_clip": 1.02766979, "balance_loss_mlp": 0.19912139, "epoch": 0.9158575078911769, "flos": 18478733892480.0, "grad_norm": 33.873154415169026, "language_loss": 0.79134047, "learning_rate": 7.375937556925338e-08, "loss": 0.80606163, "num_input_tokens_seen": 328626815, "router_z_loss_clip": 2.19433594, "router_z_loss_mlp": 0.25854492, "step": 15233, "time_per_iteration": 2.6318297386169434 }, { "auxiliary_loss_clip": 0.01254195, "auxiliary_loss_mlp": 0.00223757, "balance_loss_clip": 1.03160906, "balance_loss_mlp": 0.19713759, "epoch": 0.9159176311438448, "flos": 21798926616960.0, "grad_norm": 3.6578985422361194, "language_loss": 0.76668859, "learning_rate": 7.365461920317861e-08, "loss": 0.78146803, "num_input_tokens_seen": 328643995, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26623535, "step": 15234, "time_per_iteration": 2.6570727825164795 }, { "auxiliary_loss_clip": 0.01254788, "auxiliary_loss_mlp": 0.00239875, "balance_loss_clip": 1.03397107, "balance_loss_mlp": 0.21348169, "epoch": 0.9159777543965129, "flos": 24783749032320.0, "grad_norm": 12.859115663110945, "language_loss": 0.95915759, "learning_rate": 7.354993588431391e-08, "loss": 0.97410429, "num_input_tokens_seen": 328659565, "router_z_loss_clip": 2.20800781, "router_z_loss_mlp": 0.26403809, "step": 15235, "time_per_iteration": 2.757220983505249 }, { "auxiliary_loss_clip": 0.01241548, "auxiliary_loss_mlp": 0.00213908, "balance_loss_clip": 1.02796006, "balance_loss_mlp": 0.18768154, "epoch": 0.9160378776491809, "flos": 26868758820480.0, "grad_norm": 3.0306463752634074, "language_loss": 0.85290563, "learning_rate": 7.344532561662853e-08, "loss": 0.86746019, "num_input_tokens_seen": 328679045, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.26245117, "step": 15236, "time_per_iteration": 2.802663803100586 }, { "auxiliary_loss_clip": 0.01080787, "auxiliary_loss_mlp": 0.00087023, "balance_loss_clip": 0.94271892, "balance_loss_mlp": 0.08044279, "epoch": 0.9160980009018488, "flos": 70578222589440.0, "grad_norm": 0.6488012212840875, "language_loss": 0.61121917, "learning_rate": 7.334078840409019e-08, "loss": 0.62289727, "num_input_tokens_seen": 328744565, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06591797, "step": 15237, "time_per_iteration": 3.10310697555542 }, { "auxiliary_loss_clip": 0.01263112, "auxiliary_loss_mlp": 0.00235205, "balance_loss_clip": 1.03639233, "balance_loss_mlp": 0.20872837, "epoch": 0.9161581241545168, "flos": 16289332202880.0, "grad_norm": 708.5518001103608, "language_loss": 0.8266868, "learning_rate": 7.323632425066151e-08, "loss": 0.84166998, "num_input_tokens_seen": 328762455, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.26501465, "step": 15238, "time_per_iteration": 2.6327779293060303 }, { "auxiliary_loss_clip": 0.0122918, "auxiliary_loss_mlp": 0.00221395, "balance_loss_clip": 1.01660609, "balance_loss_mlp": 0.1977323, "epoch": 0.9162182474071847, "flos": 18438154502400.0, "grad_norm": 58.219639065668666, "language_loss": 0.82888085, "learning_rate": 7.313193316030464e-08, "loss": 0.84338653, "num_input_tokens_seen": 328780320, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.23693848, "step": 15239, "time_per_iteration": 2.727893590927124 }, { "auxiliary_loss_clip": 0.01259558, "auxiliary_loss_mlp": 0.00208556, "balance_loss_clip": 1.03452539, "balance_loss_mlp": 0.18228218, "epoch": 0.9162783706598527, "flos": 19167248764800.0, "grad_norm": 3.2536242414954337, "language_loss": 0.74819469, "learning_rate": 7.302761513697819e-08, "loss": 0.7628758, "num_input_tokens_seen": 328797570, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.26269531, "step": 15240, "time_per_iteration": 2.646620273590088 }, { "auxiliary_loss_clip": 0.01240761, "auxiliary_loss_mlp": 0.00215278, "balance_loss_clip": 1.02403069, "balance_loss_mlp": 0.1911615, "epoch": 0.9163384939125206, "flos": 20412990299520.0, "grad_norm": 26.679201270703295, "language_loss": 0.81102395, "learning_rate": 7.292337018463746e-08, "loss": 0.82558429, "num_input_tokens_seen": 328814075, "router_z_loss_clip": 2.16894531, "router_z_loss_mlp": 0.24133301, "step": 15241, "time_per_iteration": 2.614020824432373 }, { "auxiliary_loss_clip": 0.01270738, "auxiliary_loss_mlp": 0.00235004, "balance_loss_clip": 1.04144418, "balance_loss_mlp": 0.20901629, "epoch": 0.9163986171651887, "flos": 19645902426240.0, "grad_norm": 5.735059735816699, "language_loss": 0.84074992, "learning_rate": 7.281919830723549e-08, "loss": 0.8558073, "num_input_tokens_seen": 328831990, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.26013184, "step": 15242, "time_per_iteration": 2.662379503250122 }, { "auxiliary_loss_clip": 0.01228829, "auxiliary_loss_mlp": 0.00209476, "balance_loss_clip": 1.01622343, "balance_loss_mlp": 0.18519349, "epoch": 0.9164587404178566, "flos": 12823054865280.0, "grad_norm": 6.330474422754946, "language_loss": 0.89370787, "learning_rate": 7.271509950872334e-08, "loss": 0.90809095, "num_input_tokens_seen": 328849105, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.24291992, "step": 15243, "time_per_iteration": 2.62955379486084 }, { "auxiliary_loss_clip": 0.01255269, "auxiliary_loss_mlp": 0.00223216, "balance_loss_clip": 1.03392112, "balance_loss_mlp": 0.19908835, "epoch": 0.9165188636705246, "flos": 22309396750080.0, "grad_norm": 390.9932462753688, "language_loss": 0.88470531, "learning_rate": 7.261107379304721e-08, "loss": 0.89949024, "num_input_tokens_seen": 328866810, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.24133301, "step": 15244, "time_per_iteration": 2.709573745727539 }, { "auxiliary_loss_clip": 0.01256705, "auxiliary_loss_mlp": 0.00233347, "balance_loss_clip": 1.02790213, "balance_loss_mlp": 0.20666757, "epoch": 0.9165789869231925, "flos": 18223337214720.0, "grad_norm": 6.10810663887689, "language_loss": 0.83797425, "learning_rate": 7.250712116415214e-08, "loss": 0.85287476, "num_input_tokens_seen": 328885325, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.2668457, "step": 15245, "time_per_iteration": 2.7091357707977295 }, { "auxiliary_loss_clip": 0.01228069, "auxiliary_loss_mlp": 0.00211796, "balance_loss_clip": 1.01694226, "balance_loss_mlp": 0.18763183, "epoch": 0.9166391101758605, "flos": 13691553811200.0, "grad_norm": 3.449491965752427, "language_loss": 0.84198064, "learning_rate": 7.240324162598033e-08, "loss": 0.85637927, "num_input_tokens_seen": 328902655, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.24133301, "step": 15246, "time_per_iteration": 2.5992815494537354 }, { "auxiliary_loss_clip": 0.01236224, "auxiliary_loss_mlp": 0.0024139, "balance_loss_clip": 1.02011013, "balance_loss_mlp": 0.21375775, "epoch": 0.9166992334285284, "flos": 17346793622400.0, "grad_norm": 73.58223654399899, "language_loss": 0.85180962, "learning_rate": 7.229943518247106e-08, "loss": 0.86658573, "num_input_tokens_seen": 328918440, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.27648926, "step": 15247, "time_per_iteration": 2.649966239929199 }, { "auxiliary_loss_clip": 0.01243553, "auxiliary_loss_mlp": 0.00221301, "balance_loss_clip": 1.02482259, "balance_loss_mlp": 0.19661269, "epoch": 0.9167593566811965, "flos": 23731135948800.0, "grad_norm": 7.946335431247394, "language_loss": 0.85047311, "learning_rate": 7.219570183756052e-08, "loss": 0.8651216, "num_input_tokens_seen": 328938055, "router_z_loss_clip": 2.18847656, "router_z_loss_mlp": 0.24658203, "step": 15248, "time_per_iteration": 2.6834778785705566 }, { "auxiliary_loss_clip": 0.01255173, "auxiliary_loss_mlp": 0.00210652, "balance_loss_clip": 1.03327823, "balance_loss_mlp": 0.185225, "epoch": 0.9168194799338644, "flos": 27818201064960.0, "grad_norm": 13.496206006993104, "language_loss": 0.83480394, "learning_rate": 7.209204159518178e-08, "loss": 0.84946227, "num_input_tokens_seen": 328957895, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25415039, "step": 15249, "time_per_iteration": 2.7630374431610107 }, { "auxiliary_loss_clip": 0.01247851, "auxiliary_loss_mlp": 0.00243682, "balance_loss_clip": 1.02380109, "balance_loss_mlp": 0.21737224, "epoch": 0.9168796031865324, "flos": 21717552355200.0, "grad_norm": 5.921877204822616, "language_loss": 0.88501585, "learning_rate": 7.198845445926616e-08, "loss": 0.89993119, "num_input_tokens_seen": 328971365, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.26306152, "step": 15250, "time_per_iteration": 2.708756446838379 }, { "auxiliary_loss_clip": 0.01235439, "auxiliary_loss_mlp": 0.00219121, "balance_loss_clip": 1.02224088, "balance_loss_mlp": 0.19357446, "epoch": 0.9169397264392004, "flos": 23404420817280.0, "grad_norm": 25.610293426937552, "language_loss": 0.83965361, "learning_rate": 7.188494043374138e-08, "loss": 0.85419929, "num_input_tokens_seen": 328990830, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.25524902, "step": 15251, "time_per_iteration": 4.174715042114258 }, { "auxiliary_loss_clip": 0.01242968, "auxiliary_loss_mlp": 0.0021346, "balance_loss_clip": 1.02825499, "balance_loss_mlp": 0.1888914, "epoch": 0.9169998496918683, "flos": 23950981140480.0, "grad_norm": 9.472703206867905, "language_loss": 0.91088712, "learning_rate": 7.178149952253298e-08, "loss": 0.9254514, "num_input_tokens_seen": 329008345, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.24572754, "step": 15252, "time_per_iteration": 4.159137010574341 }, { "auxiliary_loss_clip": 0.01246428, "auxiliary_loss_mlp": 0.00241633, "balance_loss_clip": 1.02583218, "balance_loss_mlp": 0.21584851, "epoch": 0.9170599729445363, "flos": 18332469711360.0, "grad_norm": 7.693590633640967, "language_loss": 0.84130275, "learning_rate": 7.167813172956316e-08, "loss": 0.85618329, "num_input_tokens_seen": 329027440, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25793457, "step": 15253, "time_per_iteration": 2.667346954345703 }, { "auxiliary_loss_clip": 0.01247579, "auxiliary_loss_mlp": 0.00213937, "balance_loss_clip": 1.02733028, "balance_loss_mlp": 0.18757981, "epoch": 0.9171200961972042, "flos": 22674859678080.0, "grad_norm": 40.724287550347, "language_loss": 0.81644267, "learning_rate": 7.157483705875256e-08, "loss": 0.83105785, "num_input_tokens_seen": 329046445, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.26342773, "step": 15254, "time_per_iteration": 2.6539664268493652 }, { "auxiliary_loss_clip": 0.01240535, "auxiliary_loss_mlp": 0.00208489, "balance_loss_clip": 1.02666926, "balance_loss_mlp": 0.18477848, "epoch": 0.9171802194498723, "flos": 26719298328960.0, "grad_norm": 4.163976068138394, "language_loss": 0.85309458, "learning_rate": 7.14716155140167e-08, "loss": 0.86758476, "num_input_tokens_seen": 329065555, "router_z_loss_clip": 2.13964844, "router_z_loss_mlp": 0.23730469, "step": 15255, "time_per_iteration": 2.7292609214782715 }, { "auxiliary_loss_clip": 0.01227298, "auxiliary_loss_mlp": 0.00224407, "balance_loss_clip": 1.01094568, "balance_loss_mlp": 0.19877703, "epoch": 0.9172403427025402, "flos": 37889240538240.0, "grad_norm": 21.579931478258025, "language_loss": 0.76632261, "learning_rate": 7.136846709927047e-08, "loss": 0.78083968, "num_input_tokens_seen": 329087515, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25634766, "step": 15256, "time_per_iteration": 2.831430435180664 }, { "auxiliary_loss_clip": 0.01237775, "auxiliary_loss_mlp": 0.00241754, "balance_loss_clip": 1.02435303, "balance_loss_mlp": 0.21642169, "epoch": 0.9173004659552082, "flos": 17055163100160.0, "grad_norm": 12.051486105726836, "language_loss": 0.89350551, "learning_rate": 7.126539181842561e-08, "loss": 0.90830076, "num_input_tokens_seen": 329106820, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.25354004, "step": 15257, "time_per_iteration": 2.7379651069641113 }, { "auxiliary_loss_clip": 0.0122592, "auxiliary_loss_mlp": 0.00214251, "balance_loss_clip": 1.01491857, "balance_loss_mlp": 0.19050473, "epoch": 0.9173605892078761, "flos": 22201593056640.0, "grad_norm": 8.887843174491286, "language_loss": 0.84089363, "learning_rate": 7.116238967539012e-08, "loss": 0.85529524, "num_input_tokens_seen": 329126515, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.23754883, "step": 15258, "time_per_iteration": 4.167887449264526 }, { "auxiliary_loss_clip": 0.01235626, "auxiliary_loss_mlp": 0.00214155, "balance_loss_clip": 1.01930416, "balance_loss_mlp": 0.18878764, "epoch": 0.9174207124605441, "flos": 16507776764160.0, "grad_norm": 14.929708630150316, "language_loss": 0.90036774, "learning_rate": 7.105946067406999e-08, "loss": 0.91486549, "num_input_tokens_seen": 329142660, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.25341797, "step": 15259, "time_per_iteration": 2.607780694961548 }, { "auxiliary_loss_clip": 0.01232582, "auxiliary_loss_mlp": 0.00210118, "balance_loss_clip": 1.02115643, "balance_loss_mlp": 0.18721783, "epoch": 0.917480835713212, "flos": 24535606901760.0, "grad_norm": 5.208437935640262, "language_loss": 0.82100642, "learning_rate": 7.095660481836895e-08, "loss": 0.83543348, "num_input_tokens_seen": 329162575, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.22900391, "step": 15260, "time_per_iteration": 2.7098939418792725 }, { "auxiliary_loss_clip": 0.01225415, "auxiliary_loss_mlp": 0.00205013, "balance_loss_clip": 1.01408434, "balance_loss_mlp": 0.18096897, "epoch": 0.9175409589658801, "flos": 20880726226560.0, "grad_norm": 175.564638717051, "language_loss": 0.68369496, "learning_rate": 7.085382211218637e-08, "loss": 0.69799924, "num_input_tokens_seen": 329182090, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.24072266, "step": 15261, "time_per_iteration": 2.658200740814209 }, { "auxiliary_loss_clip": 0.01227196, "auxiliary_loss_mlp": 0.00219122, "balance_loss_clip": 1.01669526, "balance_loss_mlp": 0.19396912, "epoch": 0.917601082218548, "flos": 14276035918080.0, "grad_norm": 14.00533231983276, "language_loss": 0.79871935, "learning_rate": 7.075111255942002e-08, "loss": 0.81318253, "num_input_tokens_seen": 329196535, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.25170898, "step": 15262, "time_per_iteration": 2.6448192596435547 }, { "auxiliary_loss_clip": 0.01242606, "auxiliary_loss_mlp": 0.00241145, "balance_loss_clip": 1.01693153, "balance_loss_mlp": 0.21398944, "epoch": 0.917661205471216, "flos": 19099234362240.0, "grad_norm": 6.465028146443321, "language_loss": 0.85869133, "learning_rate": 7.064847616396496e-08, "loss": 0.87352884, "num_input_tokens_seen": 329215135, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.27160645, "step": 15263, "time_per_iteration": 4.205925941467285 }, { "auxiliary_loss_clip": 0.01252377, "auxiliary_loss_mlp": 0.0020399, "balance_loss_clip": 1.03046036, "balance_loss_mlp": 0.17491525, "epoch": 0.917721328723884, "flos": 21106568989440.0, "grad_norm": 8.296636653531964, "language_loss": 0.8413012, "learning_rate": 7.054591292971324e-08, "loss": 0.85586488, "num_input_tokens_seen": 329235150, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.29077148, "step": 15264, "time_per_iteration": 2.6572518348693848 }, { "auxiliary_loss_clip": 0.01245573, "auxiliary_loss_mlp": 0.00229071, "balance_loss_clip": 1.03184533, "balance_loss_mlp": 0.20419209, "epoch": 0.9177814519765519, "flos": 21943215550080.0, "grad_norm": 3.7175220362405645, "language_loss": 0.88561678, "learning_rate": 7.044342286055394e-08, "loss": 0.90036321, "num_input_tokens_seen": 329254365, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.2487793, "step": 15265, "time_per_iteration": 2.6789212226867676 }, { "auxiliary_loss_clip": 0.01255804, "auxiliary_loss_mlp": 0.00241416, "balance_loss_clip": 1.03115511, "balance_loss_mlp": 0.21578641, "epoch": 0.9178415752292199, "flos": 24205982768640.0, "grad_norm": 25.974736321782235, "language_loss": 0.79649544, "learning_rate": 7.034100596037306e-08, "loss": 0.81146765, "num_input_tokens_seen": 329274385, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.25646973, "step": 15266, "time_per_iteration": 2.698477029800415 }, { "auxiliary_loss_clip": 0.01235142, "auxiliary_loss_mlp": 0.00207204, "balance_loss_clip": 1.02207375, "balance_loss_mlp": 0.18351689, "epoch": 0.9179016984818879, "flos": 20042068504320.0, "grad_norm": 3.8308938558620036, "language_loss": 0.84162819, "learning_rate": 7.023866223305486e-08, "loss": 0.85605174, "num_input_tokens_seen": 329292160, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.23693848, "step": 15267, "time_per_iteration": 2.695997714996338 }, { "auxiliary_loss_clip": 0.01078258, "auxiliary_loss_mlp": 0.00085925, "balance_loss_clip": 0.94492429, "balance_loss_mlp": 0.0790583, "epoch": 0.9179618217345559, "flos": 65555901100800.0, "grad_norm": 0.7221236347431655, "language_loss": 0.55181789, "learning_rate": 7.013639168247975e-08, "loss": 0.56345975, "num_input_tokens_seen": 329351870, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.06884766, "step": 15268, "time_per_iteration": 3.1861915588378906 }, { "auxiliary_loss_clip": 0.01249132, "auxiliary_loss_mlp": 0.00206615, "balance_loss_clip": 1.03358579, "balance_loss_mlp": 0.18317857, "epoch": 0.9180219449872238, "flos": 21324618501120.0, "grad_norm": 17.037016916240564, "language_loss": 0.8605454, "learning_rate": 7.0034194312526e-08, "loss": 0.87510288, "num_input_tokens_seen": 329370930, "router_z_loss_clip": 2.15332031, "router_z_loss_mlp": 0.234375, "step": 15269, "time_per_iteration": 2.7049005031585693 }, { "auxiliary_loss_clip": 0.01234565, "auxiliary_loss_mlp": 0.00215544, "balance_loss_clip": 1.02269387, "balance_loss_mlp": 0.19134471, "epoch": 0.9180820682398918, "flos": 41060008684800.0, "grad_norm": 64.37252766424696, "language_loss": 0.79542136, "learning_rate": 6.993207012706936e-08, "loss": 0.8099224, "num_input_tokens_seen": 329391275, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.24194336, "step": 15270, "time_per_iteration": 2.815192461013794 }, { "auxiliary_loss_clip": 0.01235036, "auxiliary_loss_mlp": 0.00219005, "balance_loss_clip": 1.02023232, "balance_loss_mlp": 0.19531783, "epoch": 0.9181421914925597, "flos": 28072915384320.0, "grad_norm": 6.440304225107744, "language_loss": 0.85127318, "learning_rate": 6.98300191299821e-08, "loss": 0.86581355, "num_input_tokens_seen": 329412775, "router_z_loss_clip": 2.14941406, "router_z_loss_mlp": 0.23693848, "step": 15271, "time_per_iteration": 2.7710206508636475 }, { "auxiliary_loss_clip": 0.01251315, "auxiliary_loss_mlp": 0.0021525, "balance_loss_clip": 1.02785635, "balance_loss_mlp": 0.18780807, "epoch": 0.9182023147452277, "flos": 29169411909120.0, "grad_norm": 83.26397951214456, "language_loss": 0.79940724, "learning_rate": 6.972804132513355e-08, "loss": 0.81407291, "num_input_tokens_seen": 329432440, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.27429199, "step": 15272, "time_per_iteration": 2.7304728031158447 }, { "auxiliary_loss_clip": 0.01240653, "auxiliary_loss_mlp": 0.00239493, "balance_loss_clip": 1.02312875, "balance_loss_mlp": 0.21407795, "epoch": 0.9182624379978956, "flos": 24060831909120.0, "grad_norm": 3.318749045153596, "language_loss": 0.80270207, "learning_rate": 6.962613671639105e-08, "loss": 0.81750357, "num_input_tokens_seen": 329450605, "router_z_loss_clip": 2.17285156, "router_z_loss_mlp": 0.25427246, "step": 15273, "time_per_iteration": 2.687993288040161 }, { "auxiliary_loss_clip": 0.01211769, "auxiliary_loss_mlp": 0.00193815, "balance_loss_clip": 1.00383496, "balance_loss_mlp": 0.17191647, "epoch": 0.9183225612505637, "flos": 23293528554240.0, "grad_norm": 4.59984661871367, "language_loss": 0.80897021, "learning_rate": 6.952430530761933e-08, "loss": 0.82302606, "num_input_tokens_seen": 329470550, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.21923828, "step": 15274, "time_per_iteration": 2.7292332649230957 }, { "auxiliary_loss_clip": 0.01235545, "auxiliary_loss_mlp": 0.00227701, "balance_loss_clip": 1.01863647, "balance_loss_mlp": 0.20275098, "epoch": 0.9183826845032316, "flos": 19609237618560.0, "grad_norm": 2.4407560737734366, "language_loss": 0.77363271, "learning_rate": 6.942254710267902e-08, "loss": 0.78826517, "num_input_tokens_seen": 329489765, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.24963379, "step": 15275, "time_per_iteration": 2.6649746894836426 }, { "auxiliary_loss_clip": 0.01240171, "auxiliary_loss_mlp": 0.00221093, "balance_loss_clip": 1.01936758, "balance_loss_mlp": 0.1953914, "epoch": 0.9184428077558996, "flos": 18479057114880.0, "grad_norm": 2.7821880033777435, "language_loss": 0.81497943, "learning_rate": 6.932086210542953e-08, "loss": 0.82959211, "num_input_tokens_seen": 329507040, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.25708008, "step": 15276, "time_per_iteration": 2.665553092956543 }, { "auxiliary_loss_clip": 0.01231385, "auxiliary_loss_mlp": 0.00229151, "balance_loss_clip": 1.01843989, "balance_loss_mlp": 0.20646539, "epoch": 0.9185029310085676, "flos": 20741034234240.0, "grad_norm": 28.1593548207596, "language_loss": 0.80210531, "learning_rate": 6.921925031972642e-08, "loss": 0.81671059, "num_input_tokens_seen": 329525540, "router_z_loss_clip": 2.12792969, "router_z_loss_mlp": 0.22692871, "step": 15277, "time_per_iteration": 2.6705400943756104 }, { "auxiliary_loss_clip": 0.01082335, "auxiliary_loss_mlp": 0.0009502, "balance_loss_clip": 0.94956112, "balance_loss_mlp": 0.08820107, "epoch": 0.9185630542612355, "flos": 68209231875840.0, "grad_norm": 0.7123128988714621, "language_loss": 0.58686674, "learning_rate": 6.91177117494226e-08, "loss": 0.59864032, "num_input_tokens_seen": 329592905, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.06835938, "step": 15278, "time_per_iteration": 3.315192222595215 }, { "auxiliary_loss_clip": 0.01209664, "auxiliary_loss_mlp": 0.00246944, "balance_loss_clip": 1.00413394, "balance_loss_mlp": 0.22255367, "epoch": 0.9186231775139035, "flos": 12239470598400.0, "grad_norm": 2.3475673876315373, "language_loss": 0.72408712, "learning_rate": 6.901624639836879e-08, "loss": 0.73865318, "num_input_tokens_seen": 329610150, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.24414062, "step": 15279, "time_per_iteration": 2.6280343532562256 }, { "auxiliary_loss_clip": 0.01087031, "auxiliary_loss_mlp": 0.00131632, "balance_loss_clip": 0.95293975, "balance_loss_mlp": 0.12304907, "epoch": 0.9186833007665715, "flos": 63939237770880.0, "grad_norm": 0.8372111121220113, "language_loss": 0.59556425, "learning_rate": 6.891485427041211e-08, "loss": 0.60775089, "num_input_tokens_seen": 329673650, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.0859375, "step": 15280, "time_per_iteration": 3.1077654361724854 }, { "auxiliary_loss_clip": 0.01242463, "auxiliary_loss_mlp": 0.0024838, "balance_loss_clip": 1.02319634, "balance_loss_mlp": 0.22301212, "epoch": 0.9187434240192395, "flos": 19974700546560.0, "grad_norm": 18.53952195789101, "language_loss": 0.78686625, "learning_rate": 6.881353536939815e-08, "loss": 0.80177468, "num_input_tokens_seen": 329692520, "router_z_loss_clip": 2.19042969, "router_z_loss_mlp": 0.25366211, "step": 15281, "time_per_iteration": 2.6522364616394043 }, { "auxiliary_loss_clip": 0.01242145, "auxiliary_loss_mlp": 0.00243939, "balance_loss_clip": 1.0224762, "balance_loss_mlp": 0.21698585, "epoch": 0.9188035472719074, "flos": 25227820874880.0, "grad_norm": 3.0651160219570617, "language_loss": 0.91297817, "learning_rate": 6.871228969916831e-08, "loss": 0.92783904, "num_input_tokens_seen": 329713750, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.26940918, "step": 15282, "time_per_iteration": 2.703188180923462 }, { "auxiliary_loss_clip": 0.01241946, "auxiliary_loss_mlp": 0.00222379, "balance_loss_clip": 1.02716625, "balance_loss_mlp": 0.19581893, "epoch": 0.9188636705245754, "flos": 18405547931520.0, "grad_norm": 16.772361842433988, "language_loss": 0.69731379, "learning_rate": 6.861111726356194e-08, "loss": 0.71195704, "num_input_tokens_seen": 329730960, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.26525879, "step": 15283, "time_per_iteration": 2.6577792167663574 }, { "auxiliary_loss_clip": 0.01251672, "auxiliary_loss_mlp": 0.00214511, "balance_loss_clip": 1.02844071, "balance_loss_mlp": 0.18896404, "epoch": 0.9189237937772433, "flos": 23769129559680.0, "grad_norm": 22.55970126777019, "language_loss": 0.74312842, "learning_rate": 6.851001806641554e-08, "loss": 0.75779027, "num_input_tokens_seen": 329750975, "router_z_loss_clip": 2.23339844, "router_z_loss_mlp": 0.25537109, "step": 15284, "time_per_iteration": 2.6397571563720703 }, { "auxiliary_loss_clip": 0.01242151, "auxiliary_loss_mlp": 0.00217894, "balance_loss_clip": 1.02446747, "balance_loss_mlp": 0.19315851, "epoch": 0.9189839170299113, "flos": 21214624078080.0, "grad_norm": 31.03367851559166, "language_loss": 0.8085686, "learning_rate": 6.840899211156292e-08, "loss": 0.82316905, "num_input_tokens_seen": 329769645, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24743652, "step": 15285, "time_per_iteration": 2.711258888244629 }, { "auxiliary_loss_clip": 0.01227408, "auxiliary_loss_mlp": 0.00212901, "balance_loss_clip": 1.01268458, "balance_loss_mlp": 0.18935709, "epoch": 0.9190440402825792, "flos": 16727370560640.0, "grad_norm": 9.692133386208775, "language_loss": 0.81612039, "learning_rate": 6.830803940283458e-08, "loss": 0.83052343, "num_input_tokens_seen": 329788185, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.23535156, "step": 15286, "time_per_iteration": 2.627272844314575 }, { "auxiliary_loss_clip": 0.01230945, "auxiliary_loss_mlp": 0.00217129, "balance_loss_clip": 1.01746297, "balance_loss_mlp": 0.1927273, "epoch": 0.9191041635352473, "flos": 23441193365760.0, "grad_norm": 5.371200508514221, "language_loss": 0.8087092, "learning_rate": 6.820715994405945e-08, "loss": 0.82318997, "num_input_tokens_seen": 329806780, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24401855, "step": 15287, "time_per_iteration": 2.753887414932251 }, { "auxiliary_loss_clip": 0.01256757, "auxiliary_loss_mlp": 0.00237274, "balance_loss_clip": 1.03458261, "balance_loss_mlp": 0.2108096, "epoch": 0.9191642867879152, "flos": 18807532012800.0, "grad_norm": 36.441656266204625, "language_loss": 0.76337284, "learning_rate": 6.810635373906226e-08, "loss": 0.7783131, "num_input_tokens_seen": 329826350, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.26477051, "step": 15288, "time_per_iteration": 2.6368422508239746 }, { "auxiliary_loss_clip": 0.01230417, "auxiliary_loss_mlp": 0.0024084, "balance_loss_clip": 1.01624584, "balance_loss_mlp": 0.21670061, "epoch": 0.9192244100405832, "flos": 32160950167680.0, "grad_norm": 15.822950260766937, "language_loss": 0.76872575, "learning_rate": 6.800562079166549e-08, "loss": 0.78343832, "num_input_tokens_seen": 329846160, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.24145508, "step": 15289, "time_per_iteration": 2.7910735607147217 }, { "auxiliary_loss_clip": 0.01238704, "auxiliary_loss_mlp": 0.00223942, "balance_loss_clip": 1.02350521, "balance_loss_mlp": 0.19890842, "epoch": 0.9192845332932512, "flos": 16357669827840.0, "grad_norm": 29.960995938785068, "language_loss": 0.83717263, "learning_rate": 6.790496110568921e-08, "loss": 0.85179913, "num_input_tokens_seen": 329862020, "router_z_loss_clip": 2.15332031, "router_z_loss_mlp": 0.25048828, "step": 15290, "time_per_iteration": 2.6840367317199707 }, { "auxiliary_loss_clip": 0.01224534, "auxiliary_loss_mlp": 0.00223018, "balance_loss_clip": 1.01384521, "balance_loss_mlp": 0.19894917, "epoch": 0.9193446565459191, "flos": 26614475464320.0, "grad_norm": 2.2201347891948604, "language_loss": 0.80299628, "learning_rate": 6.78043746849506e-08, "loss": 0.81747174, "num_input_tokens_seen": 329880185, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.24060059, "step": 15291, "time_per_iteration": 2.7109055519104004 }, { "auxiliary_loss_clip": 0.01235803, "auxiliary_loss_mlp": 0.00218325, "balance_loss_clip": 1.02102137, "balance_loss_mlp": 0.19450735, "epoch": 0.9194047797985871, "flos": 22492182084480.0, "grad_norm": 12.557589802992355, "language_loss": 0.76791626, "learning_rate": 6.770386153326346e-08, "loss": 0.78245753, "num_input_tokens_seen": 329900255, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.23815918, "step": 15292, "time_per_iteration": 2.6725354194641113 }, { "auxiliary_loss_clip": 0.01244618, "auxiliary_loss_mlp": 0.00227921, "balance_loss_clip": 1.0232625, "balance_loss_mlp": 0.20128971, "epoch": 0.9194649030512551, "flos": 25078791346560.0, "grad_norm": 21.49134076103688, "language_loss": 0.8009395, "learning_rate": 6.760342165443988e-08, "loss": 0.81566489, "num_input_tokens_seen": 329919095, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26635742, "step": 15293, "time_per_iteration": 4.148429870605469 }, { "auxiliary_loss_clip": 0.01238417, "auxiliary_loss_mlp": 0.0022747, "balance_loss_clip": 1.02547407, "balance_loss_mlp": 0.20223336, "epoch": 0.9195250263039231, "flos": 11911139354880.0, "grad_norm": 1902.7063289848104, "language_loss": 0.87338561, "learning_rate": 6.750305505228837e-08, "loss": 0.88804448, "num_input_tokens_seen": 329936505, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.25231934, "step": 15294, "time_per_iteration": 4.13311767578125 }, { "auxiliary_loss_clip": 0.01246392, "auxiliary_loss_mlp": 0.00247898, "balance_loss_clip": 1.02404284, "balance_loss_mlp": 0.22121876, "epoch": 0.919585149556591, "flos": 21834154880640.0, "grad_norm": 9.712696001186586, "language_loss": 0.84959567, "learning_rate": 6.74027617306141e-08, "loss": 0.86453861, "num_input_tokens_seen": 329956795, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.26696777, "step": 15295, "time_per_iteration": 2.73774790763855 }, { "auxiliary_loss_clip": 0.01226356, "auxiliary_loss_mlp": 0.00196547, "balance_loss_clip": 1.01674688, "balance_loss_mlp": 0.17431399, "epoch": 0.919645272809259, "flos": 28184059042560.0, "grad_norm": 20.270979338862993, "language_loss": 0.80451691, "learning_rate": 6.730254169322114e-08, "loss": 0.81874597, "num_input_tokens_seen": 329977195, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.22241211, "step": 15296, "time_per_iteration": 2.6866872310638428 }, { "auxiliary_loss_clip": 0.01234431, "auxiliary_loss_mlp": 0.002216, "balance_loss_clip": 1.01966202, "balance_loss_mlp": 0.19558904, "epoch": 0.9197053960619269, "flos": 18332828847360.0, "grad_norm": 98.63096330836115, "language_loss": 0.83136034, "learning_rate": 6.720239494390912e-08, "loss": 0.84592068, "num_input_tokens_seen": 329992095, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.26013184, "step": 15297, "time_per_iteration": 2.676393985748291 }, { "auxiliary_loss_clip": 0.01222983, "auxiliary_loss_mlp": 0.0020615, "balance_loss_clip": 1.00886679, "balance_loss_mlp": 0.18046071, "epoch": 0.9197655193145949, "flos": 28183448511360.0, "grad_norm": 53.31273874762472, "language_loss": 0.82930678, "learning_rate": 6.710232148647676e-08, "loss": 0.84359813, "num_input_tokens_seen": 330011490, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.25695801, "step": 15298, "time_per_iteration": 2.7990834712982178 }, { "auxiliary_loss_clip": 0.01255361, "auxiliary_loss_mlp": 0.00208199, "balance_loss_clip": 1.02774668, "balance_loss_mlp": 0.1813416, "epoch": 0.9198256425672628, "flos": 17306321973120.0, "grad_norm": 13.522842732772652, "language_loss": 0.90569818, "learning_rate": 6.70023213247175e-08, "loss": 0.92033386, "num_input_tokens_seen": 330027885, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.26855469, "step": 15299, "time_per_iteration": 2.7258474826812744 }, { "auxiliary_loss_clip": 0.01245524, "auxiliary_loss_mlp": 0.00224132, "balance_loss_clip": 1.02786469, "balance_loss_mlp": 0.19932503, "epoch": 0.9198857658199309, "flos": 17858520731520.0, "grad_norm": 6.321690960178277, "language_loss": 0.73426592, "learning_rate": 6.690239446242385e-08, "loss": 0.74896246, "num_input_tokens_seen": 330046230, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24829102, "step": 15300, "time_per_iteration": 4.154743671417236 }, { "auxiliary_loss_clip": 0.01211147, "auxiliary_loss_mlp": 0.00206231, "balance_loss_clip": 1.00759935, "balance_loss_mlp": 0.18434413, "epoch": 0.9199458890725988, "flos": 22127545169280.0, "grad_norm": 7.224584397551697, "language_loss": 0.76258111, "learning_rate": 6.680254090338545e-08, "loss": 0.77675486, "num_input_tokens_seen": 330065535, "router_z_loss_clip": 2.03320312, "router_z_loss_mlp": 0.21887207, "step": 15301, "time_per_iteration": 2.643026351928711 }, { "auxiliary_loss_clip": 0.01246734, "auxiliary_loss_mlp": 0.0023401, "balance_loss_clip": 1.02972639, "balance_loss_mlp": 0.20843919, "epoch": 0.9200060123252668, "flos": 16034043265920.0, "grad_norm": 45.836394793817696, "language_loss": 0.78099191, "learning_rate": 6.670276065138814e-08, "loss": 0.79579926, "num_input_tokens_seen": 330082920, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.25585938, "step": 15302, "time_per_iteration": 2.6429622173309326 }, { "auxiliary_loss_clip": 0.01247148, "auxiliary_loss_mlp": 0.00227276, "balance_loss_clip": 1.03035223, "balance_loss_mlp": 0.20181285, "epoch": 0.9200661355779348, "flos": 26864521015680.0, "grad_norm": 11.245016100516198, "language_loss": 0.84606034, "learning_rate": 6.660305371021579e-08, "loss": 0.86080456, "num_input_tokens_seen": 330101165, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25488281, "step": 15303, "time_per_iteration": 2.7643415927886963 }, { "auxiliary_loss_clip": 0.01240125, "auxiliary_loss_mlp": 0.00225508, "balance_loss_clip": 1.02451491, "balance_loss_mlp": 0.20099851, "epoch": 0.9201262588306027, "flos": 12786749193600.0, "grad_norm": 114.32055516751215, "language_loss": 0.9703477, "learning_rate": 6.650342008365006e-08, "loss": 0.98500407, "num_input_tokens_seen": 330118775, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24536133, "step": 15304, "time_per_iteration": 2.6702382564544678 }, { "auxiliary_loss_clip": 0.01255629, "auxiliary_loss_mlp": 0.00212582, "balance_loss_clip": 1.03220272, "balance_loss_mlp": 0.18539028, "epoch": 0.9201863820832707, "flos": 20631614428800.0, "grad_norm": 30.849733841774707, "language_loss": 0.88800526, "learning_rate": 6.64038597754677e-08, "loss": 0.90268731, "num_input_tokens_seen": 330135570, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.27197266, "step": 15305, "time_per_iteration": 4.0837318897247314 }, { "auxiliary_loss_clip": 0.01234456, "auxiliary_loss_mlp": 0.00218563, "balance_loss_clip": 1.01756334, "balance_loss_mlp": 0.19331464, "epoch": 0.9202465053359387, "flos": 26395815421440.0, "grad_norm": 17.749543791388774, "language_loss": 0.91627693, "learning_rate": 6.630437278944501e-08, "loss": 0.93080711, "num_input_tokens_seen": 330152840, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25256348, "step": 15306, "time_per_iteration": 2.685586452484131 }, { "auxiliary_loss_clip": 0.01237776, "auxiliary_loss_mlp": 0.00240746, "balance_loss_clip": 1.02097166, "balance_loss_mlp": 0.2146277, "epoch": 0.9203066285886067, "flos": 10488179093760.0, "grad_norm": 12.233686508978892, "language_loss": 0.79577363, "learning_rate": 6.62049591293541e-08, "loss": 0.81055892, "num_input_tokens_seen": 330168605, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.26135254, "step": 15307, "time_per_iteration": 2.641709566116333 }, { "auxiliary_loss_clip": 0.01253597, "auxiliary_loss_mlp": 0.00220229, "balance_loss_clip": 1.02958274, "balance_loss_mlp": 0.19366881, "epoch": 0.9203667518412746, "flos": 19390721230080.0, "grad_norm": 13.615322489437727, "language_loss": 0.86169922, "learning_rate": 6.610561879896526e-08, "loss": 0.87643749, "num_input_tokens_seen": 330186160, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.265625, "step": 15308, "time_per_iteration": 2.684983015060425 }, { "auxiliary_loss_clip": 0.01229162, "auxiliary_loss_mlp": 0.00207777, "balance_loss_clip": 1.0174129, "balance_loss_mlp": 0.18349406, "epoch": 0.9204268750939426, "flos": 15924982596480.0, "grad_norm": 15.952259017432539, "language_loss": 0.86467457, "learning_rate": 6.600635180204484e-08, "loss": 0.87904394, "num_input_tokens_seen": 330201780, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.24279785, "step": 15309, "time_per_iteration": 2.61946964263916 }, { "auxiliary_loss_clip": 0.01235756, "auxiliary_loss_mlp": 0.00223344, "balance_loss_clip": 1.01601887, "balance_loss_mlp": 0.19788124, "epoch": 0.9204869983466105, "flos": 16471758401280.0, "grad_norm": 12.246883344516585, "language_loss": 0.76735079, "learning_rate": 6.590715814235781e-08, "loss": 0.78194177, "num_input_tokens_seen": 330219165, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.25463867, "step": 15310, "time_per_iteration": 2.608581066131592 }, { "auxiliary_loss_clip": 0.01246457, "auxiliary_loss_mlp": 0.00233486, "balance_loss_clip": 1.0281775, "balance_loss_mlp": 0.20780876, "epoch": 0.9205471215992785, "flos": 21539220307200.0, "grad_norm": 56.82261477842547, "language_loss": 0.73029888, "learning_rate": 6.580803782366495e-08, "loss": 0.74509823, "num_input_tokens_seen": 330238975, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25671387, "step": 15311, "time_per_iteration": 2.73721981048584 }, { "auxiliary_loss_clip": 0.01232977, "auxiliary_loss_mlp": 0.00208217, "balance_loss_clip": 1.01618123, "balance_loss_mlp": 0.18362378, "epoch": 0.9206072448519464, "flos": 25005892694400.0, "grad_norm": 9.566399790910664, "language_loss": 0.82912457, "learning_rate": 6.570899084972503e-08, "loss": 0.8435365, "num_input_tokens_seen": 330259755, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24597168, "step": 15312, "time_per_iteration": 2.688659906387329 }, { "auxiliary_loss_clip": 0.01233338, "auxiliary_loss_mlp": 0.00223262, "balance_loss_clip": 1.02035475, "balance_loss_mlp": 0.19751312, "epoch": 0.9206673681046145, "flos": 20522661500160.0, "grad_norm": 7.760239934815944, "language_loss": 0.85198343, "learning_rate": 6.561001722429394e-08, "loss": 0.86654937, "num_input_tokens_seen": 330277660, "router_z_loss_clip": 2.12597656, "router_z_loss_mlp": 0.25744629, "step": 15313, "time_per_iteration": 2.7138595581054688 }, { "auxiliary_loss_clip": 0.01249434, "auxiliary_loss_mlp": 0.0021691, "balance_loss_clip": 1.02643085, "balance_loss_mlp": 0.1897537, "epoch": 0.9207274913572824, "flos": 20883455660160.0, "grad_norm": 50.95621584759037, "language_loss": 0.86005044, "learning_rate": 6.55111169511251e-08, "loss": 0.8747139, "num_input_tokens_seen": 330295455, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.27160645, "step": 15314, "time_per_iteration": 2.690624237060547 }, { "auxiliary_loss_clip": 0.0125752, "auxiliary_loss_mlp": 0.00223664, "balance_loss_clip": 1.03109622, "balance_loss_mlp": 0.19680631, "epoch": 0.9207876146099504, "flos": 22708256348160.0, "grad_norm": 42.618944020461235, "language_loss": 0.87133527, "learning_rate": 6.541229003396864e-08, "loss": 0.88614714, "num_input_tokens_seen": 330315310, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.26867676, "step": 15315, "time_per_iteration": 2.7019407749176025 }, { "auxiliary_loss_clip": 0.01242967, "auxiliary_loss_mlp": 0.00233222, "balance_loss_clip": 1.02232504, "balance_loss_mlp": 0.20858151, "epoch": 0.9208477378626184, "flos": 18507354053760.0, "grad_norm": 34.47360047615683, "language_loss": 0.83131719, "learning_rate": 6.531353647657156e-08, "loss": 0.84607911, "num_input_tokens_seen": 330333260, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.24609375, "step": 15316, "time_per_iteration": 2.6290252208709717 }, { "auxiliary_loss_clip": 0.01246044, "auxiliary_loss_mlp": 0.00241989, "balance_loss_clip": 1.02588439, "balance_loss_mlp": 0.21460652, "epoch": 0.9209078611152863, "flos": 22999635475200.0, "grad_norm": 20.460919845514542, "language_loss": 0.76775414, "learning_rate": 6.521485628267931e-08, "loss": 0.78263444, "num_input_tokens_seen": 330352465, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.27429199, "step": 15317, "time_per_iteration": 2.666034698486328 }, { "auxiliary_loss_clip": 0.01249252, "auxiliary_loss_mlp": 0.00234328, "balance_loss_clip": 1.03263068, "balance_loss_mlp": 0.21015222, "epoch": 0.9209679843679544, "flos": 24061514267520.0, "grad_norm": 2.7586696599966123, "language_loss": 0.90960515, "learning_rate": 6.511624945603378e-08, "loss": 0.92444086, "num_input_tokens_seen": 330372685, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.24182129, "step": 15318, "time_per_iteration": 2.756600856781006 }, { "auxiliary_loss_clip": 0.01228411, "auxiliary_loss_mlp": 0.00228529, "balance_loss_clip": 1.01308489, "balance_loss_mlp": 0.2042342, "epoch": 0.9210281076206223, "flos": 13553370190080.0, "grad_norm": 12.52983353865456, "language_loss": 0.94388306, "learning_rate": 6.501771600037354e-08, "loss": 0.95845246, "num_input_tokens_seen": 330388860, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.24291992, "step": 15319, "time_per_iteration": 2.636667251586914 }, { "auxiliary_loss_clip": 0.01090359, "auxiliary_loss_mlp": 0.00084861, "balance_loss_clip": 0.95447314, "balance_loss_mlp": 0.07694574, "epoch": 0.9210882308732903, "flos": 71426289674880.0, "grad_norm": 0.748546950482804, "language_loss": 0.55282009, "learning_rate": 6.491925591943559e-08, "loss": 0.56457233, "num_input_tokens_seen": 330448735, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.07910156, "step": 15320, "time_per_iteration": 3.188377618789673 }, { "auxiliary_loss_clip": 0.01258396, "auxiliary_loss_mlp": 0.00246512, "balance_loss_clip": 1.03000534, "balance_loss_mlp": 0.21718684, "epoch": 0.9211483541259582, "flos": 18509113820160.0, "grad_norm": 4.211996314704572, "language_loss": 0.75683224, "learning_rate": 6.482086921695384e-08, "loss": 0.77188134, "num_input_tokens_seen": 330465600, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.29333496, "step": 15321, "time_per_iteration": 2.683562994003296 }, { "auxiliary_loss_clip": 0.0121749, "auxiliary_loss_mlp": 0.00229636, "balance_loss_clip": 1.0129385, "balance_loss_mlp": 0.2058423, "epoch": 0.9212084773786262, "flos": 23258228463360.0, "grad_norm": 130.35455703233765, "language_loss": 0.76560378, "learning_rate": 6.47225558966582e-08, "loss": 0.78007507, "num_input_tokens_seen": 330485770, "router_z_loss_clip": 2.04589844, "router_z_loss_mlp": 0.23803711, "step": 15322, "time_per_iteration": 2.781521797180176 }, { "auxiliary_loss_clip": 0.01225399, "auxiliary_loss_mlp": 0.00216477, "balance_loss_clip": 1.01512003, "balance_loss_mlp": 0.19361255, "epoch": 0.9212686006312941, "flos": 16289511770880.0, "grad_norm": 152.0715259139222, "language_loss": 0.78138793, "learning_rate": 6.462431596227725e-08, "loss": 0.79580677, "num_input_tokens_seen": 330504255, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.22851562, "step": 15323, "time_per_iteration": 2.6777901649475098 }, { "auxiliary_loss_clip": 0.01246457, "auxiliary_loss_mlp": 0.00224137, "balance_loss_clip": 1.02817822, "balance_loss_mlp": 0.19855484, "epoch": 0.9213287238839621, "flos": 19785773986560.0, "grad_norm": 5.333550870987672, "language_loss": 0.83140063, "learning_rate": 6.452614941753597e-08, "loss": 0.84610659, "num_input_tokens_seen": 330520705, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25585938, "step": 15324, "time_per_iteration": 2.7237203121185303 }, { "auxiliary_loss_clip": 0.01236969, "auxiliary_loss_mlp": 0.00229991, "balance_loss_clip": 1.02253866, "balance_loss_mlp": 0.20523092, "epoch": 0.92138884713663, "flos": 21030402199680.0, "grad_norm": 9.42029925662829, "language_loss": 0.77130294, "learning_rate": 6.442805626615744e-08, "loss": 0.78597254, "num_input_tokens_seen": 330539245, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.24768066, "step": 15325, "time_per_iteration": 2.7086875438690186 }, { "auxiliary_loss_clip": 0.01221, "auxiliary_loss_mlp": 0.00222605, "balance_loss_clip": 1.00984001, "balance_loss_mlp": 0.19838132, "epoch": 0.9214489703892981, "flos": 28587264186240.0, "grad_norm": 45.13668281868633, "language_loss": 0.83145273, "learning_rate": 6.433003651186109e-08, "loss": 0.84588879, "num_input_tokens_seen": 330561815, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.24255371, "step": 15326, "time_per_iteration": 2.7338593006134033 }, { "auxiliary_loss_clip": 0.01239823, "auxiliary_loss_mlp": 0.00230698, "balance_loss_clip": 1.02160943, "balance_loss_mlp": 0.20573595, "epoch": 0.921509093641966, "flos": 16361476669440.0, "grad_norm": 19.407726215618105, "language_loss": 0.80849981, "learning_rate": 6.42320901583635e-08, "loss": 0.82320505, "num_input_tokens_seen": 330579760, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24963379, "step": 15327, "time_per_iteration": 2.6413962841033936 }, { "auxiliary_loss_clip": 0.01257244, "auxiliary_loss_mlp": 0.00209077, "balance_loss_clip": 1.03740966, "balance_loss_mlp": 0.18201697, "epoch": 0.921569216894634, "flos": 26830837036800.0, "grad_norm": 69.6432180867408, "language_loss": 0.84818256, "learning_rate": 6.413421720937906e-08, "loss": 0.86284572, "num_input_tokens_seen": 330598545, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.27050781, "step": 15328, "time_per_iteration": 2.6970250606536865 }, { "auxiliary_loss_clip": 0.01214644, "auxiliary_loss_mlp": 0.00231089, "balance_loss_clip": 1.00717187, "balance_loss_mlp": 0.207748, "epoch": 0.921629340147302, "flos": 24645134448000.0, "grad_norm": 3.764740678393423, "language_loss": 0.80438179, "learning_rate": 6.4036417668619e-08, "loss": 0.81883907, "num_input_tokens_seen": 330616700, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.23339844, "step": 15329, "time_per_iteration": 2.732074737548828 }, { "auxiliary_loss_clip": 0.01223836, "auxiliary_loss_mlp": 0.0022093, "balance_loss_clip": 1.01228261, "balance_loss_mlp": 0.19566919, "epoch": 0.9216894633999699, "flos": 15086504442240.0, "grad_norm": 12.741446299019758, "language_loss": 0.93771327, "learning_rate": 6.393869153979192e-08, "loss": 0.95216089, "num_input_tokens_seen": 330633355, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.25256348, "step": 15330, "time_per_iteration": 2.640230894088745 }, { "auxiliary_loss_clip": 0.01250877, "auxiliary_loss_mlp": 0.00222075, "balance_loss_clip": 1.02779758, "balance_loss_mlp": 0.19716024, "epoch": 0.921749586652638, "flos": 19204524103680.0, "grad_norm": 21.859772856401584, "language_loss": 0.89749742, "learning_rate": 6.384103882660397e-08, "loss": 0.91222697, "num_input_tokens_seen": 330651470, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.24902344, "step": 15331, "time_per_iteration": 2.645536422729492 }, { "auxiliary_loss_clip": 0.01240529, "auxiliary_loss_mlp": 0.00234356, "balance_loss_clip": 1.02154326, "balance_loss_mlp": 0.20832051, "epoch": 0.9218097099053059, "flos": 20522446018560.0, "grad_norm": 171.22508102793736, "language_loss": 0.82126319, "learning_rate": 6.374345953275794e-08, "loss": 0.83601207, "num_input_tokens_seen": 330669170, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26049805, "step": 15332, "time_per_iteration": 2.655120849609375 }, { "auxiliary_loss_clip": 0.0123554, "auxiliary_loss_mlp": 0.00207315, "balance_loss_clip": 1.02240002, "balance_loss_mlp": 0.18303214, "epoch": 0.9218698331579739, "flos": 17348625216000.0, "grad_norm": 6.678415478215065, "language_loss": 0.82528263, "learning_rate": 6.364595366195358e-08, "loss": 0.83971119, "num_input_tokens_seen": 330686635, "router_z_loss_clip": 2.13574219, "router_z_loss_mlp": 0.24304199, "step": 15333, "time_per_iteration": 2.6898014545440674 }, { "auxiliary_loss_clip": 0.01093429, "auxiliary_loss_mlp": 0.0007597, "balance_loss_clip": 0.95603228, "balance_loss_mlp": 0.06867441, "epoch": 0.9219299564106418, "flos": 61958332575360.0, "grad_norm": 0.7770724287987117, "language_loss": 0.52294946, "learning_rate": 6.354852121788879e-08, "loss": 0.53464347, "num_input_tokens_seen": 330749160, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07275391, "step": 15334, "time_per_iteration": 3.134171724319458 }, { "auxiliary_loss_clip": 0.01221937, "auxiliary_loss_mlp": 0.00216909, "balance_loss_clip": 1.01376915, "balance_loss_mlp": 0.19372304, "epoch": 0.9219900796633098, "flos": 15701761526400.0, "grad_norm": 7.491733649023814, "language_loss": 0.68898314, "learning_rate": 6.345116220425839e-08, "loss": 0.70337164, "num_input_tokens_seen": 330766840, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.23181152, "step": 15335, "time_per_iteration": 4.112954378128052 }, { "auxiliary_loss_clip": 0.01231001, "auxiliary_loss_mlp": 0.00198696, "balance_loss_clip": 1.01832783, "balance_loss_mlp": 0.17429426, "epoch": 0.9220502029159777, "flos": 24932670819840.0, "grad_norm": 10.71374001787533, "language_loss": 0.78857481, "learning_rate": 6.335387662475366e-08, "loss": 0.80287182, "num_input_tokens_seen": 330785585, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.24389648, "step": 15336, "time_per_iteration": 4.304477214813232 }, { "auxiliary_loss_clip": 0.0123668, "auxiliary_loss_mlp": 0.00213382, "balance_loss_clip": 1.02075315, "balance_loss_mlp": 0.18986225, "epoch": 0.9221103261686457, "flos": 15667215621120.0, "grad_norm": 2912.9963774508815, "language_loss": 0.79053593, "learning_rate": 6.325666448306433e-08, "loss": 0.80503654, "num_input_tokens_seen": 330800750, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.23547363, "step": 15337, "time_per_iteration": 2.742065668106079 }, { "auxiliary_loss_clip": 0.01088631, "auxiliary_loss_mlp": 0.00071211, "balance_loss_clip": 0.95347273, "balance_loss_mlp": 0.06386812, "epoch": 0.9221704494213137, "flos": 67516299630720.0, "grad_norm": 11293.29688537772, "language_loss": 0.64511508, "learning_rate": 6.31595257828763e-08, "loss": 0.65671349, "num_input_tokens_seen": 330863640, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.07324219, "step": 15338, "time_per_iteration": 3.108013391494751 }, { "auxiliary_loss_clip": 0.01228602, "auxiliary_loss_mlp": 0.00225208, "balance_loss_clip": 1.01621258, "balance_loss_mlp": 0.20108044, "epoch": 0.9222305726739817, "flos": 30226945155840.0, "grad_norm": 5.589196113420816, "language_loss": 0.76537651, "learning_rate": 6.306246052787289e-08, "loss": 0.77991462, "num_input_tokens_seen": 330884675, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.24108887, "step": 15339, "time_per_iteration": 2.757204532623291 }, { "auxiliary_loss_clip": 0.01244058, "auxiliary_loss_mlp": 0.0021511, "balance_loss_clip": 1.02414834, "balance_loss_mlp": 0.18965848, "epoch": 0.9222906959266496, "flos": 25337204766720.0, "grad_norm": 38.44551113492542, "language_loss": 0.80322164, "learning_rate": 6.296546872173513e-08, "loss": 0.81781328, "num_input_tokens_seen": 330904125, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25476074, "step": 15340, "time_per_iteration": 2.71545147895813 }, { "auxiliary_loss_clip": 0.0123725, "auxiliary_loss_mlp": 0.00220631, "balance_loss_clip": 1.02657199, "balance_loss_mlp": 0.19667016, "epoch": 0.9223508191793176, "flos": 27599864244480.0, "grad_norm": 489.91558296082377, "language_loss": 0.76353323, "learning_rate": 6.286855036814098e-08, "loss": 0.77811199, "num_input_tokens_seen": 330925140, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.23937988, "step": 15341, "time_per_iteration": 2.7345080375671387 }, { "auxiliary_loss_clip": 0.01210797, "auxiliary_loss_mlp": 0.00196656, "balance_loss_clip": 1.00626314, "balance_loss_mlp": 0.17505553, "epoch": 0.9224109424319856, "flos": 27307587277440.0, "grad_norm": 6.719607869016103, "language_loss": 0.75536454, "learning_rate": 6.277170547076571e-08, "loss": 0.7694391, "num_input_tokens_seen": 330946625, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.21594238, "step": 15342, "time_per_iteration": 4.145627975463867 }, { "auxiliary_loss_clip": 0.0124725, "auxiliary_loss_mlp": 0.00218181, "balance_loss_clip": 1.02620196, "balance_loss_mlp": 0.19438672, "epoch": 0.9224710656846535, "flos": 48208314401280.0, "grad_norm": 69.13730524489941, "language_loss": 0.77598196, "learning_rate": 6.26749340332815e-08, "loss": 0.7906363, "num_input_tokens_seen": 330967795, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.23803711, "step": 15343, "time_per_iteration": 2.9954402446746826 }, { "auxiliary_loss_clip": 0.01084438, "auxiliary_loss_mlp": 0.00085679, "balance_loss_clip": 0.94927239, "balance_loss_mlp": 0.07814451, "epoch": 0.9225311889373216, "flos": 66722171794560.0, "grad_norm": 0.901252464271142, "language_loss": 0.51237071, "learning_rate": 6.257823605935786e-08, "loss": 0.52407181, "num_input_tokens_seen": 331040850, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.07519531, "step": 15344, "time_per_iteration": 3.340118646621704 }, { "auxiliary_loss_clip": 0.01226288, "auxiliary_loss_mlp": 0.00228251, "balance_loss_clip": 1.02150035, "balance_loss_mlp": 0.20488602, "epoch": 0.9225913121899895, "flos": 22271295398400.0, "grad_norm": 143.06440907060596, "language_loss": 0.77120876, "learning_rate": 6.248161155266162e-08, "loss": 0.7857542, "num_input_tokens_seen": 331060595, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.23376465, "step": 15345, "time_per_iteration": 2.6779019832611084 }, { "auxiliary_loss_clip": 0.0124057, "auxiliary_loss_mlp": 0.00230562, "balance_loss_clip": 1.02710509, "balance_loss_mlp": 0.20601705, "epoch": 0.9226514354426575, "flos": 20082719721600.0, "grad_norm": 26.327256530466336, "language_loss": 0.85298455, "learning_rate": 6.238506051685677e-08, "loss": 0.86769587, "num_input_tokens_seen": 331080195, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.2454834, "step": 15346, "time_per_iteration": 2.689383029937744 }, { "auxiliary_loss_clip": 0.01253895, "auxiliary_loss_mlp": 0.00215322, "balance_loss_clip": 1.03329539, "balance_loss_mlp": 0.19006133, "epoch": 0.9227115586953254, "flos": 16070851728000.0, "grad_norm": 7.909285276172332, "language_loss": 0.84329021, "learning_rate": 6.228858295560457e-08, "loss": 0.8579824, "num_input_tokens_seen": 331097645, "router_z_loss_clip": 2.20410156, "router_z_loss_mlp": 0.25268555, "step": 15347, "time_per_iteration": 2.6222996711730957 }, { "auxiliary_loss_clip": 0.0121129, "auxiliary_loss_mlp": 0.00225162, "balance_loss_clip": 1.00563991, "balance_loss_mlp": 0.20195223, "epoch": 0.9227716819479934, "flos": 20446027833600.0, "grad_norm": 5.819183806436241, "language_loss": 0.81629491, "learning_rate": 6.219217887256367e-08, "loss": 0.83065945, "num_input_tokens_seen": 331116830, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.23193359, "step": 15348, "time_per_iteration": 4.045306205749512 }, { "auxiliary_loss_clip": 0.01256006, "auxiliary_loss_mlp": 0.00242209, "balance_loss_clip": 1.03048825, "balance_loss_mlp": 0.21495757, "epoch": 0.9228318052006613, "flos": 25007401065600.0, "grad_norm": 23.88420904454497, "language_loss": 0.76054108, "learning_rate": 6.209584827138959e-08, "loss": 0.77552325, "num_input_tokens_seen": 331137235, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.27246094, "step": 15349, "time_per_iteration": 2.738607406616211 }, { "auxiliary_loss_clip": 0.01245739, "auxiliary_loss_mlp": 0.00231603, "balance_loss_clip": 1.02329564, "balance_loss_mlp": 0.20557973, "epoch": 0.9228919284533293, "flos": 12677257560960.0, "grad_norm": 860.5284308247421, "language_loss": 0.97601151, "learning_rate": 6.199959115573495e-08, "loss": 0.990785, "num_input_tokens_seen": 331153155, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.26025391, "step": 15350, "time_per_iteration": 2.651982545852661 }, { "auxiliary_loss_clip": 0.01091124, "auxiliary_loss_mlp": 0.00084868, "balance_loss_clip": 0.95464194, "balance_loss_mlp": 0.07766781, "epoch": 0.9229520517059973, "flos": 69986162712960.0, "grad_norm": 0.7441270979252695, "language_loss": 0.59533286, "learning_rate": 6.190340752924994e-08, "loss": 0.60709274, "num_input_tokens_seen": 331214895, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.07177734, "step": 15351, "time_per_iteration": 3.1568312644958496 }, { "auxiliary_loss_clip": 0.01255853, "auxiliary_loss_mlp": 0.00216724, "balance_loss_clip": 1.0333153, "balance_loss_mlp": 0.1911414, "epoch": 0.9230121749586653, "flos": 14793832425600.0, "grad_norm": 15.278472013009086, "language_loss": 0.86329484, "learning_rate": 6.180729739558233e-08, "loss": 0.87802052, "num_input_tokens_seen": 331232185, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.2557373, "step": 15352, "time_per_iteration": 2.6653170585632324 }, { "auxiliary_loss_clip": 0.01258948, "auxiliary_loss_mlp": 0.00227479, "balance_loss_clip": 1.03398824, "balance_loss_mlp": 0.20106186, "epoch": 0.9230722982113332, "flos": 22967208472320.0, "grad_norm": 54.87972018831437, "language_loss": 0.70298755, "learning_rate": 6.171126075837585e-08, "loss": 0.71785188, "num_input_tokens_seen": 331251065, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.2644043, "step": 15353, "time_per_iteration": 2.6592440605163574 }, { "auxiliary_loss_clip": 0.01231864, "auxiliary_loss_mlp": 0.00229508, "balance_loss_clip": 1.01973927, "balance_loss_mlp": 0.20428352, "epoch": 0.9231324214640012, "flos": 18551452976640.0, "grad_norm": 72.7246187514909, "language_loss": 0.81879979, "learning_rate": 6.161529762127293e-08, "loss": 0.83341348, "num_input_tokens_seen": 331269110, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.25231934, "step": 15354, "time_per_iteration": 2.658181667327881 }, { "auxiliary_loss_clip": 0.01267133, "auxiliary_loss_mlp": 0.00236957, "balance_loss_clip": 1.0322578, "balance_loss_mlp": 0.20846568, "epoch": 0.9231925447166691, "flos": 22082727974400.0, "grad_norm": 2.3001549587716346, "language_loss": 0.76944131, "learning_rate": 6.1519407987912e-08, "loss": 0.78448224, "num_input_tokens_seen": 331286555, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.28479004, "step": 15355, "time_per_iteration": 2.666051149368286 }, { "auxiliary_loss_clip": 0.01228062, "auxiliary_loss_mlp": 0.00207969, "balance_loss_clip": 1.01752496, "balance_loss_mlp": 0.18407995, "epoch": 0.9232526679693371, "flos": 26541145848960.0, "grad_norm": 4.274135170336573, "language_loss": 0.82605147, "learning_rate": 6.142359186192947e-08, "loss": 0.84041178, "num_input_tokens_seen": 331307660, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.23901367, "step": 15356, "time_per_iteration": 2.7975668907165527 }, { "auxiliary_loss_clip": 0.01238243, "auxiliary_loss_mlp": 0.00211484, "balance_loss_clip": 1.02105856, "balance_loss_mlp": 0.18602064, "epoch": 0.9233127912220052, "flos": 14756664827520.0, "grad_norm": 34.17062730295717, "language_loss": 0.70029581, "learning_rate": 6.132784924695844e-08, "loss": 0.71479309, "num_input_tokens_seen": 331324885, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25463867, "step": 15357, "time_per_iteration": 2.603517770767212 }, { "auxiliary_loss_clip": 0.01261887, "auxiliary_loss_mlp": 0.00240655, "balance_loss_clip": 1.03290462, "balance_loss_mlp": 0.21175879, "epoch": 0.9233729144746731, "flos": 25261792162560.0, "grad_norm": 12.759676932185963, "language_loss": 0.76764011, "learning_rate": 6.123218014662956e-08, "loss": 0.78266549, "num_input_tokens_seen": 331345885, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.28881836, "step": 15358, "time_per_iteration": 2.7020230293273926 }, { "auxiliary_loss_clip": 0.01222182, "auxiliary_loss_mlp": 0.00221347, "balance_loss_clip": 1.00912523, "balance_loss_mlp": 0.1974102, "epoch": 0.9234330377273411, "flos": 27849837968640.0, "grad_norm": 109.05223959264094, "language_loss": 0.81372929, "learning_rate": 6.113658456457104e-08, "loss": 0.82816452, "num_input_tokens_seen": 331364320, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.23950195, "step": 15359, "time_per_iteration": 2.696103096008301 }, { "auxiliary_loss_clip": 0.01232412, "auxiliary_loss_mlp": 0.0023068, "balance_loss_clip": 1.02222824, "balance_loss_mlp": 0.20724364, "epoch": 0.923493160980009, "flos": 24608361899520.0, "grad_norm": 4.49111588569745, "language_loss": 0.73751003, "learning_rate": 6.104106250440732e-08, "loss": 0.75214094, "num_input_tokens_seen": 331384135, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23425293, "step": 15360, "time_per_iteration": 2.696033477783203 }, { "auxiliary_loss_clip": 0.01090349, "auxiliary_loss_mlp": 0.00073956, "balance_loss_clip": 0.95406818, "balance_loss_mlp": 0.06651688, "epoch": 0.923553284232677, "flos": 67700916558720.0, "grad_norm": 0.808797484907436, "language_loss": 0.54470754, "learning_rate": 6.094561396976083e-08, "loss": 0.55635059, "num_input_tokens_seen": 331440645, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.07421875, "step": 15361, "time_per_iteration": 3.1378085613250732 }, { "auxiliary_loss_clip": 0.0125153, "auxiliary_loss_mlp": 0.00214875, "balance_loss_clip": 1.02846515, "balance_loss_mlp": 0.18813637, "epoch": 0.9236134074853449, "flos": 18807244704000.0, "grad_norm": 5.657518793349361, "language_loss": 0.8110621, "learning_rate": 6.085023896425112e-08, "loss": 0.82572609, "num_input_tokens_seen": 331459580, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26757812, "step": 15362, "time_per_iteration": 2.7207212448120117 }, { "auxiliary_loss_clip": 0.01255381, "auxiliary_loss_mlp": 0.00228866, "balance_loss_clip": 1.02984977, "balance_loss_mlp": 0.20008841, "epoch": 0.923673530738013, "flos": 27782362270080.0, "grad_norm": 53.039070556140345, "language_loss": 0.83675373, "learning_rate": 6.075493749149463e-08, "loss": 0.85159612, "num_input_tokens_seen": 331481560, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.28796387, "step": 15363, "time_per_iteration": 2.8429746627807617 }, { "auxiliary_loss_clip": 0.0123037, "auxiliary_loss_mlp": 0.00206227, "balance_loss_clip": 1.02038455, "balance_loss_mlp": 0.18277867, "epoch": 0.9237336539906809, "flos": 26797117144320.0, "grad_norm": 110.31931657423164, "language_loss": 0.91319996, "learning_rate": 6.065970955510514e-08, "loss": 0.92756593, "num_input_tokens_seen": 331499090, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23449707, "step": 15364, "time_per_iteration": 2.729325771331787 }, { "auxiliary_loss_clip": 0.01246752, "auxiliary_loss_mlp": 0.0024959, "balance_loss_clip": 1.02877557, "balance_loss_mlp": 0.2247709, "epoch": 0.9237937772433489, "flos": 23587708942080.0, "grad_norm": 41.16634062103163, "language_loss": 0.7502054, "learning_rate": 6.056455515869419e-08, "loss": 0.76516879, "num_input_tokens_seen": 331519420, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24816895, "step": 15365, "time_per_iteration": 2.745678186416626 }, { "auxiliary_loss_clip": 0.01253513, "auxiliary_loss_mlp": 0.00225038, "balance_loss_clip": 1.03120744, "balance_loss_mlp": 0.20123202, "epoch": 0.9238539004960168, "flos": 26140562398080.0, "grad_norm": 5.5534525582082335, "language_loss": 0.72215492, "learning_rate": 6.046947430586913e-08, "loss": 0.7369405, "num_input_tokens_seen": 331538720, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.23803711, "step": 15366, "time_per_iteration": 2.682913303375244 }, { "auxiliary_loss_clip": 0.01228352, "auxiliary_loss_mlp": 0.0021597, "balance_loss_clip": 1.01487827, "balance_loss_mlp": 0.19223532, "epoch": 0.9239140237486848, "flos": 21068000760960.0, "grad_norm": 13.820170409697463, "language_loss": 0.81325698, "learning_rate": 6.037446700023619e-08, "loss": 0.8277002, "num_input_tokens_seen": 331558505, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.23742676, "step": 15367, "time_per_iteration": 2.7145352363586426 }, { "auxiliary_loss_clip": 0.01228119, "auxiliary_loss_mlp": 0.00222978, "balance_loss_clip": 1.01832902, "balance_loss_mlp": 0.20097163, "epoch": 0.9239741470013527, "flos": 24607930936320.0, "grad_norm": 4.106528577847107, "language_loss": 0.72543359, "learning_rate": 6.027953324539759e-08, "loss": 0.73994458, "num_input_tokens_seen": 331578440, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.2199707, "step": 15368, "time_per_iteration": 2.6830554008483887 }, { "auxiliary_loss_clip": 0.01244265, "auxiliary_loss_mlp": 0.00241331, "balance_loss_clip": 1.02521479, "balance_loss_mlp": 0.21723834, "epoch": 0.9240342702540207, "flos": 24718248581760.0, "grad_norm": 15.870974565261431, "language_loss": 0.83601928, "learning_rate": 6.018467304495401e-08, "loss": 0.8508752, "num_input_tokens_seen": 331598945, "router_z_loss_clip": 2.19238281, "router_z_loss_mlp": 0.24072266, "step": 15369, "time_per_iteration": 2.6846280097961426 }, { "auxiliary_loss_clip": 0.01255114, "auxiliary_loss_mlp": 0.00240793, "balance_loss_clip": 1.02987778, "balance_loss_mlp": 0.21296942, "epoch": 0.9240943935066888, "flos": 20849987162880.0, "grad_norm": 256.9684957686365, "language_loss": 0.86173916, "learning_rate": 6.008988640250145e-08, "loss": 0.87669826, "num_input_tokens_seen": 331616700, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.2779541, "step": 15370, "time_per_iteration": 2.6263363361358643 }, { "auxiliary_loss_clip": 0.01225801, "auxiliary_loss_mlp": 0.00204191, "balance_loss_clip": 1.01355028, "balance_loss_mlp": 0.18034896, "epoch": 0.9241545167593567, "flos": 24462313200000.0, "grad_norm": 7.458546260609154, "language_loss": 0.73180103, "learning_rate": 5.999517332163528e-08, "loss": 0.7461009, "num_input_tokens_seen": 331635625, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.23852539, "step": 15371, "time_per_iteration": 2.705002784729004 }, { "auxiliary_loss_clip": 0.01102119, "auxiliary_loss_mlp": 0.00070443, "balance_loss_clip": 0.96192944, "balance_loss_mlp": 0.06271869, "epoch": 0.9242146400120247, "flos": 61827259847040.0, "grad_norm": 0.7056982990241214, "language_loss": 0.57049716, "learning_rate": 5.99005338059464e-08, "loss": 0.58222276, "num_input_tokens_seen": 331698595, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.07714844, "step": 15372, "time_per_iteration": 3.112746477127075 }, { "auxiliary_loss_clip": 0.01218972, "auxiliary_loss_mlp": 0.00233313, "balance_loss_clip": 1.00784683, "balance_loss_mlp": 0.20984066, "epoch": 0.9242747632646926, "flos": 22048397550720.0, "grad_norm": 117.29596536092251, "language_loss": 0.78218961, "learning_rate": 5.98059678590237e-08, "loss": 0.79671246, "num_input_tokens_seen": 331717975, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.23461914, "step": 15373, "time_per_iteration": 2.6335248947143555 }, { "auxiliary_loss_clip": 0.0123575, "auxiliary_loss_mlp": 0.00204225, "balance_loss_clip": 1.02327001, "balance_loss_mlp": 0.17969128, "epoch": 0.9243348865173606, "flos": 18478338842880.0, "grad_norm": 8.292031415015979, "language_loss": 0.83609653, "learning_rate": 5.971147548445299e-08, "loss": 0.85049629, "num_input_tokens_seen": 331737220, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.24523926, "step": 15374, "time_per_iteration": 2.699777126312256 }, { "auxiliary_loss_clip": 0.01222035, "auxiliary_loss_mlp": 0.00204693, "balance_loss_clip": 1.01225567, "balance_loss_mlp": 0.18184046, "epoch": 0.9243950097700285, "flos": 23258767167360.0, "grad_norm": 30.512391122485294, "language_loss": 0.73964083, "learning_rate": 5.961705668581784e-08, "loss": 0.75390804, "num_input_tokens_seen": 331757300, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.2286377, "step": 15375, "time_per_iteration": 2.724558115005493 }, { "auxiliary_loss_clip": 0.01237317, "auxiliary_loss_mlp": 0.0022464, "balance_loss_clip": 1.02203512, "balance_loss_mlp": 0.19849706, "epoch": 0.9244551330226966, "flos": 29749081593600.0, "grad_norm": 11.098159840546234, "language_loss": 0.74095106, "learning_rate": 5.952271146669829e-08, "loss": 0.75557065, "num_input_tokens_seen": 331776995, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.26135254, "step": 15376, "time_per_iteration": 2.8236963748931885 }, { "auxiliary_loss_clip": 0.01092531, "auxiliary_loss_mlp": 0.00067225, "balance_loss_clip": 0.95526063, "balance_loss_mlp": 0.05992972, "epoch": 0.9245152562753645, "flos": 68864960609280.0, "grad_norm": 0.6430496470717083, "language_loss": 0.60542929, "learning_rate": 5.94284398306717e-08, "loss": 0.61702693, "num_input_tokens_seen": 331845015, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07275391, "step": 15377, "time_per_iteration": 4.672845840454102 }, { "auxiliary_loss_clip": 0.01235185, "auxiliary_loss_mlp": 0.00224418, "balance_loss_clip": 1.02219439, "balance_loss_mlp": 0.20043309, "epoch": 0.9245753795280325, "flos": 21579260993280.0, "grad_norm": 34.89366483582198, "language_loss": 0.80964327, "learning_rate": 5.933424178131341e-08, "loss": 0.82423931, "num_input_tokens_seen": 331862795, "router_z_loss_clip": 2.13183594, "router_z_loss_mlp": 0.23999023, "step": 15378, "time_per_iteration": 4.307880163192749 }, { "auxiliary_loss_clip": 0.01246127, "auxiliary_loss_mlp": 0.00237299, "balance_loss_clip": 1.02853, "balance_loss_mlp": 0.21081088, "epoch": 0.9246355027807004, "flos": 34496077334400.0, "grad_norm": 24.511777334342728, "language_loss": 0.70571208, "learning_rate": 5.924011732219503e-08, "loss": 0.72054631, "num_input_tokens_seen": 331882535, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.26489258, "step": 15379, "time_per_iteration": 2.8104381561279297 }, { "auxiliary_loss_clip": 0.01221433, "auxiliary_loss_mlp": 0.00221841, "balance_loss_clip": 1.01115608, "balance_loss_mlp": 0.19826122, "epoch": 0.9246956260333684, "flos": 15953854152960.0, "grad_norm": 6.138477461793837, "language_loss": 0.92177683, "learning_rate": 5.914606645688591e-08, "loss": 0.93620956, "num_input_tokens_seen": 331899335, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.23608398, "step": 15380, "time_per_iteration": 2.740817070007324 }, { "auxiliary_loss_clip": 0.01249843, "auxiliary_loss_mlp": 0.00239739, "balance_loss_clip": 1.02763748, "balance_loss_mlp": 0.21344116, "epoch": 0.9247557492860363, "flos": 23368366540800.0, "grad_norm": 20.383857026752406, "language_loss": 0.80592942, "learning_rate": 5.905208918895233e-08, "loss": 0.82082522, "num_input_tokens_seen": 331919030, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.26330566, "step": 15381, "time_per_iteration": 2.692462205886841 }, { "auxiliary_loss_clip": 0.01254667, "auxiliary_loss_mlp": 0.00231802, "balance_loss_clip": 1.03533721, "balance_loss_mlp": 0.20593357, "epoch": 0.9248158725387043, "flos": 23039855729280.0, "grad_norm": 23.510702157914203, "language_loss": 0.85819733, "learning_rate": 5.8958185521958524e-08, "loss": 0.87306201, "num_input_tokens_seen": 331936465, "router_z_loss_clip": 2.19433594, "router_z_loss_mlp": 0.25842285, "step": 15382, "time_per_iteration": 2.7246627807617188 }, { "auxiliary_loss_clip": 0.01249729, "auxiliary_loss_mlp": 0.00238122, "balance_loss_clip": 1.02864122, "balance_loss_mlp": 0.21122883, "epoch": 0.9248759957913724, "flos": 22522418357760.0, "grad_norm": 120.78709099181876, "language_loss": 0.81093585, "learning_rate": 5.886435545946455e-08, "loss": 0.82581437, "num_input_tokens_seen": 331954625, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.2689209, "step": 15383, "time_per_iteration": 2.6523597240448 }, { "auxiliary_loss_clip": 0.01235319, "auxiliary_loss_mlp": 0.00213477, "balance_loss_clip": 1.0232625, "balance_loss_mlp": 0.18899174, "epoch": 0.9249361190440403, "flos": 25447271016960.0, "grad_norm": 3.040619293590424, "language_loss": 0.82654673, "learning_rate": 5.8770599005028456e-08, "loss": 0.84103471, "num_input_tokens_seen": 331975865, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.24487305, "step": 15384, "time_per_iteration": 2.7744874954223633 }, { "auxiliary_loss_clip": 0.01227227, "auxiliary_loss_mlp": 0.00205788, "balance_loss_clip": 1.01126051, "balance_loss_mlp": 0.18117164, "epoch": 0.9249962422967083, "flos": 12378623886720.0, "grad_norm": 5.965523551749448, "language_loss": 0.78471905, "learning_rate": 5.8676916162206045e-08, "loss": 0.79904926, "num_input_tokens_seen": 331992760, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.24621582, "step": 15385, "time_per_iteration": 4.175187110900879 }, { "auxiliary_loss_clip": 0.01250287, "auxiliary_loss_mlp": 0.0023084, "balance_loss_clip": 1.03284109, "balance_loss_mlp": 0.20417309, "epoch": 0.9250563655493762, "flos": 22929430343040.0, "grad_norm": 47.04090976642357, "language_loss": 0.90319276, "learning_rate": 5.85833069345496e-08, "loss": 0.91800404, "num_input_tokens_seen": 332011890, "router_z_loss_clip": 2.17285156, "router_z_loss_mlp": 0.26672363, "step": 15386, "time_per_iteration": 2.662161111831665 }, { "auxiliary_loss_clip": 0.01228508, "auxiliary_loss_mlp": 0.00209124, "balance_loss_clip": 1.01728344, "balance_loss_mlp": 0.18563968, "epoch": 0.9251164888020442, "flos": 18478662065280.0, "grad_norm": 9.686475516662387, "language_loss": 0.84393728, "learning_rate": 5.8489771325608504e-08, "loss": 0.85831356, "num_input_tokens_seen": 332029485, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.23474121, "step": 15387, "time_per_iteration": 2.7189342975616455 }, { "auxiliary_loss_clip": 0.01229599, "auxiliary_loss_mlp": 0.00227088, "balance_loss_clip": 1.01562595, "balance_loss_mlp": 0.20145822, "epoch": 0.9251766120547121, "flos": 33037062796800.0, "grad_norm": 304.56368967306435, "language_loss": 0.757936, "learning_rate": 5.839630933893014e-08, "loss": 0.7725029, "num_input_tokens_seen": 332052970, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.25634766, "step": 15388, "time_per_iteration": 2.8978002071380615 }, { "auxiliary_loss_clip": 0.01252511, "auxiliary_loss_mlp": 0.00227099, "balance_loss_clip": 1.02769732, "balance_loss_mlp": 0.20179096, "epoch": 0.9252367353073802, "flos": 24387906176640.0, "grad_norm": 3.9951890913897277, "language_loss": 0.90933061, "learning_rate": 5.8302920978058115e-08, "loss": 0.92412674, "num_input_tokens_seen": 332070395, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.25305176, "step": 15389, "time_per_iteration": 2.655665397644043 }, { "auxiliary_loss_clip": 0.01278288, "auxiliary_loss_mlp": 0.00249882, "balance_loss_clip": 1.04782283, "balance_loss_mlp": 0.2224396, "epoch": 0.9252968585600481, "flos": 18916844077440.0, "grad_norm": 285.7073213668664, "language_loss": 0.86137009, "learning_rate": 5.820960624653381e-08, "loss": 0.87665182, "num_input_tokens_seen": 332090185, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.27429199, "step": 15390, "time_per_iteration": 4.13965630531311 }, { "auxiliary_loss_clip": 0.01253677, "auxiliary_loss_mlp": 0.00216289, "balance_loss_clip": 1.03088343, "balance_loss_mlp": 0.19071874, "epoch": 0.9253569818127161, "flos": 21725345606400.0, "grad_norm": 3.0044628113383665, "language_loss": 0.83562493, "learning_rate": 5.811636514789597e-08, "loss": 0.85032463, "num_input_tokens_seen": 332109050, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.2557373, "step": 15391, "time_per_iteration": 2.6316168308258057 }, { "auxiliary_loss_clip": 0.01273696, "auxiliary_loss_mlp": 0.00236449, "balance_loss_clip": 1.04480505, "balance_loss_mlp": 0.20849422, "epoch": 0.925417105065384, "flos": 34240357434240.0, "grad_norm": 99.47517312716941, "language_loss": 0.62251544, "learning_rate": 5.80231976856802e-08, "loss": 0.63761687, "num_input_tokens_seen": 332131180, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27929688, "step": 15392, "time_per_iteration": 2.804828405380249 }, { "auxiliary_loss_clip": 0.01240635, "auxiliary_loss_mlp": 0.00245686, "balance_loss_clip": 1.02355027, "balance_loss_mlp": 0.22044985, "epoch": 0.925477228318052, "flos": 25959536830080.0, "grad_norm": 8.858574184139616, "language_loss": 0.84770155, "learning_rate": 5.7930103863419454e-08, "loss": 0.8625648, "num_input_tokens_seen": 332149555, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.25244141, "step": 15393, "time_per_iteration": 2.6844425201416016 }, { "auxiliary_loss_clip": 0.01232339, "auxiliary_loss_mlp": 0.00217855, "balance_loss_clip": 1.01721978, "balance_loss_mlp": 0.19183135, "epoch": 0.9255373515707199, "flos": 11838240702720.0, "grad_norm": 3.426160207552794, "language_loss": 0.78342563, "learning_rate": 5.783708368464357e-08, "loss": 0.79792756, "num_input_tokens_seen": 332165830, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.26037598, "step": 15394, "time_per_iteration": 2.700195074081421 }, { "auxiliary_loss_clip": 0.01232808, "auxiliary_loss_mlp": 0.00235915, "balance_loss_clip": 1.02119303, "balance_loss_mlp": 0.21011797, "epoch": 0.925597474823388, "flos": 21434325615360.0, "grad_norm": 55.25787419169357, "language_loss": 0.80679655, "learning_rate": 5.7744137152879956e-08, "loss": 0.82148379, "num_input_tokens_seen": 332185130, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.25793457, "step": 15395, "time_per_iteration": 2.625373125076294 }, { "auxiliary_loss_clip": 0.0123418, "auxiliary_loss_mlp": 0.00226917, "balance_loss_clip": 1.01731181, "balance_loss_mlp": 0.20197842, "epoch": 0.925657598076056, "flos": 22857573185280.0, "grad_norm": 113.15015449971071, "language_loss": 0.80808151, "learning_rate": 5.7651264271653785e-08, "loss": 0.82269251, "num_input_tokens_seen": 332203695, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24963379, "step": 15396, "time_per_iteration": 2.696953296661377 }, { "auxiliary_loss_clip": 0.01246989, "auxiliary_loss_mlp": 0.00227104, "balance_loss_clip": 1.02816606, "balance_loss_mlp": 0.19902982, "epoch": 0.9257177213287239, "flos": 25704032411520.0, "grad_norm": 691.4068881012736, "language_loss": 0.9361918, "learning_rate": 5.755846504448603e-08, "loss": 0.95093274, "num_input_tokens_seen": 332224850, "router_z_loss_clip": 2.18847656, "router_z_loss_mlp": 0.28088379, "step": 15397, "time_per_iteration": 2.71514892578125 }, { "auxiliary_loss_clip": 0.01093142, "auxiliary_loss_mlp": 0.00075514, "balance_loss_clip": 0.9586888, "balance_loss_mlp": 0.06759874, "epoch": 0.9257778445813919, "flos": 59592933221760.0, "grad_norm": 0.7917543502888512, "language_loss": 0.54809833, "learning_rate": 5.746573947489586e-08, "loss": 0.55978489, "num_input_tokens_seen": 332278085, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.07910156, "step": 15398, "time_per_iteration": 3.0459885597229004 }, { "auxiliary_loss_clip": 0.01263374, "auxiliary_loss_mlp": 0.00248683, "balance_loss_clip": 1.0362134, "balance_loss_mlp": 0.21945331, "epoch": 0.9258379678340598, "flos": 27709427704320.0, "grad_norm": 56.72210130941292, "language_loss": 0.86134928, "learning_rate": 5.7373087566400025e-08, "loss": 0.87646985, "num_input_tokens_seen": 332297875, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.29248047, "step": 15399, "time_per_iteration": 2.772329092025757 }, { "auxiliary_loss_clip": 0.01214567, "auxiliary_loss_mlp": 0.00217432, "balance_loss_clip": 1.00759935, "balance_loss_mlp": 0.19373342, "epoch": 0.9258980910867278, "flos": 24863543095680.0, "grad_norm": 26.39246507986695, "language_loss": 0.84525532, "learning_rate": 5.7280509322510826e-08, "loss": 0.85957533, "num_input_tokens_seen": 332318500, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.23718262, "step": 15400, "time_per_iteration": 2.7075726985931396 }, { "auxiliary_loss_clip": 0.01094257, "auxiliary_loss_mlp": 0.00091956, "balance_loss_clip": 0.95918465, "balance_loss_mlp": 0.08394549, "epoch": 0.9259582143393957, "flos": 63134587249920.0, "grad_norm": 0.7105062275276695, "language_loss": 0.51001406, "learning_rate": 5.718800474673946e-08, "loss": 0.52187622, "num_input_tokens_seen": 332381980, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.08007812, "step": 15401, "time_per_iteration": 3.1205708980560303 }, { "auxiliary_loss_clip": 0.01218453, "auxiliary_loss_mlp": 0.00210331, "balance_loss_clip": 1.00835109, "balance_loss_mlp": 0.18616766, "epoch": 0.9260183375920638, "flos": 24127122458880.0, "grad_norm": 123.38493707269622, "language_loss": 0.87598801, "learning_rate": 5.709557384259378e-08, "loss": 0.89027584, "num_input_tokens_seen": 332399510, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.24145508, "step": 15402, "time_per_iteration": 2.6664953231811523 }, { "auxiliary_loss_clip": 0.01096122, "auxiliary_loss_mlp": 0.00050064, "balance_loss_clip": 0.95960307, "balance_loss_mlp": 0.04281577, "epoch": 0.9260784608447317, "flos": 63042872849280.0, "grad_norm": 0.7342002143172576, "language_loss": 0.50281346, "learning_rate": 5.700321661357876e-08, "loss": 0.51427537, "num_input_tokens_seen": 332459130, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.07226562, "step": 15403, "time_per_iteration": 3.2268285751342773 }, { "auxiliary_loss_clip": 0.01090815, "auxiliary_loss_mlp": 0.00062987, "balance_loss_clip": 0.95345581, "balance_loss_mlp": 0.05597761, "epoch": 0.9261385840973997, "flos": 70585979927040.0, "grad_norm": 0.6684257328871466, "language_loss": 0.58284312, "learning_rate": 5.69109330631965e-08, "loss": 0.59438115, "num_input_tokens_seen": 332526555, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.0703125, "step": 15404, "time_per_iteration": 3.1576764583587646 }, { "auxiliary_loss_clip": 0.01231953, "auxiliary_loss_mlp": 0.00224953, "balance_loss_clip": 1.01626897, "balance_loss_mlp": 0.1994426, "epoch": 0.9261987073500676, "flos": 20229917656320.0, "grad_norm": 12.797307021393562, "language_loss": 0.8090415, "learning_rate": 5.681872319494596e-08, "loss": 0.82361054, "num_input_tokens_seen": 332544005, "router_z_loss_clip": 2.16113281, "router_z_loss_mlp": 0.25500488, "step": 15405, "time_per_iteration": 2.688830852508545 }, { "auxiliary_loss_clip": 0.01246156, "auxiliary_loss_mlp": 0.00213156, "balance_loss_clip": 1.02895427, "balance_loss_mlp": 0.18791978, "epoch": 0.9262588306027356, "flos": 20954163582720.0, "grad_norm": 7.239018238409776, "language_loss": 0.78608048, "learning_rate": 5.672658701232458e-08, "loss": 0.80067366, "num_input_tokens_seen": 332563070, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25256348, "step": 15406, "time_per_iteration": 2.7194502353668213 }, { "auxiliary_loss_clip": 0.01239221, "auxiliary_loss_mlp": 0.00231666, "balance_loss_clip": 1.01987612, "balance_loss_mlp": 0.20559523, "epoch": 0.9263189538554035, "flos": 22158679282560.0, "grad_norm": 16.029974416878353, "language_loss": 0.85996878, "learning_rate": 5.663452451882555e-08, "loss": 0.87467766, "num_input_tokens_seen": 332579620, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26049805, "step": 15407, "time_per_iteration": 2.656231641769409 }, { "auxiliary_loss_clip": 0.01255842, "auxiliary_loss_mlp": 0.00249534, "balance_loss_clip": 1.02937126, "balance_loss_mlp": 0.22138873, "epoch": 0.9263790771080715, "flos": 18187211111040.0, "grad_norm": 20.81688336069759, "language_loss": 0.82317507, "learning_rate": 5.6542535717940096e-08, "loss": 0.83822882, "num_input_tokens_seen": 332597795, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.28161621, "step": 15408, "time_per_iteration": 2.758786678314209 }, { "auxiliary_loss_clip": 0.01219084, "auxiliary_loss_mlp": 0.00223272, "balance_loss_clip": 1.00960231, "balance_loss_mlp": 0.19971617, "epoch": 0.9264392003607396, "flos": 48178545004800.0, "grad_norm": 11.855922670118483, "language_loss": 0.75139117, "learning_rate": 5.645062061315675e-08, "loss": 0.76581472, "num_input_tokens_seen": 332620375, "router_z_loss_clip": 2.09277344, "router_z_loss_mlp": 0.2355957, "step": 15409, "time_per_iteration": 2.867767810821533 }, { "auxiliary_loss_clip": 0.01244262, "auxiliary_loss_mlp": 0.00224548, "balance_loss_clip": 1.02479768, "balance_loss_mlp": 0.1996925, "epoch": 0.9264993236134075, "flos": 26389458714240.0, "grad_norm": 4.338239355233278, "language_loss": 0.84387076, "learning_rate": 5.6358779207960506e-08, "loss": 0.85855889, "num_input_tokens_seen": 332639510, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.2487793, "step": 15410, "time_per_iteration": 2.7551653385162354 }, { "auxiliary_loss_clip": 0.01235532, "auxiliary_loss_mlp": 0.00233255, "balance_loss_clip": 1.02154255, "balance_loss_mlp": 0.20831636, "epoch": 0.9265594468660755, "flos": 20920084554240.0, "grad_norm": 31.344764043631727, "language_loss": 0.87671173, "learning_rate": 5.6267011505833905e-08, "loss": 0.89139962, "num_input_tokens_seen": 332658350, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24902344, "step": 15411, "time_per_iteration": 2.6602351665496826 }, { "auxiliary_loss_clip": 0.01256055, "auxiliary_loss_mlp": 0.00224014, "balance_loss_clip": 1.03661084, "balance_loss_mlp": 0.1997072, "epoch": 0.9266195701187434, "flos": 17525017929600.0, "grad_norm": 475.1065275754509, "language_loss": 0.81985337, "learning_rate": 5.617531751025728e-08, "loss": 0.83465403, "num_input_tokens_seen": 332676715, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24316406, "step": 15412, "time_per_iteration": 2.7413437366485596 }, { "auxiliary_loss_clip": 0.01221193, "auxiliary_loss_mlp": 0.0021906, "balance_loss_clip": 1.01097882, "balance_loss_mlp": 0.19465759, "epoch": 0.9266796933714114, "flos": 33688733293440.0, "grad_norm": 2.953455408945847, "language_loss": 0.74784946, "learning_rate": 5.6083697224707406e-08, "loss": 0.76225197, "num_input_tokens_seen": 332701470, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.24401855, "step": 15413, "time_per_iteration": 2.8012936115264893 }, { "auxiliary_loss_clip": 0.01245679, "auxiliary_loss_mlp": 0.002155, "balance_loss_clip": 1.02884781, "balance_loss_mlp": 0.1919919, "epoch": 0.9267398166240793, "flos": 18916520855040.0, "grad_norm": 251.37320504146476, "language_loss": 0.83851433, "learning_rate": 5.5992150652658167e-08, "loss": 0.85312605, "num_input_tokens_seen": 332719060, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23510742, "step": 15414, "time_per_iteration": 2.6309783458709717 }, { "auxiliary_loss_clip": 0.01218026, "auxiliary_loss_mlp": 0.0021318, "balance_loss_clip": 1.00928402, "balance_loss_mlp": 0.18933846, "epoch": 0.9267999398767474, "flos": 20478957626880.0, "grad_norm": 36.481733428654216, "language_loss": 0.8790074, "learning_rate": 5.59006777975819e-08, "loss": 0.89331949, "num_input_tokens_seen": 332736345, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.23840332, "step": 15415, "time_per_iteration": 2.655496835708618 }, { "auxiliary_loss_clip": 0.01240004, "auxiliary_loss_mlp": 0.0023148, "balance_loss_clip": 1.02087736, "balance_loss_mlp": 0.20624366, "epoch": 0.9268600631294153, "flos": 24789351553920.0, "grad_norm": 23.465515359641316, "language_loss": 0.62257993, "learning_rate": 5.580927866294671e-08, "loss": 0.63729483, "num_input_tokens_seen": 332756270, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25231934, "step": 15416, "time_per_iteration": 2.664907932281494 }, { "auxiliary_loss_clip": 0.01235416, "auxiliary_loss_mlp": 0.00211292, "balance_loss_clip": 1.02199161, "balance_loss_mlp": 0.18767682, "epoch": 0.9269201863820833, "flos": 18697178453760.0, "grad_norm": 540.6228057440904, "language_loss": 0.80163038, "learning_rate": 5.571795325221807e-08, "loss": 0.81609744, "num_input_tokens_seen": 332775185, "router_z_loss_clip": 2.13378906, "router_z_loss_mlp": 0.23608398, "step": 15417, "time_per_iteration": 2.6940248012542725 }, { "auxiliary_loss_clip": 0.01243053, "auxiliary_loss_mlp": 0.00212867, "balance_loss_clip": 1.02325141, "balance_loss_mlp": 0.18536542, "epoch": 0.9269803096347512, "flos": 20923999136640.0, "grad_norm": 96.38788273348412, "language_loss": 0.85124457, "learning_rate": 5.5626701568859624e-08, "loss": 0.86580372, "num_input_tokens_seen": 332794320, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.27490234, "step": 15418, "time_per_iteration": 2.6684722900390625 }, { "auxiliary_loss_clip": 0.01235837, "auxiliary_loss_mlp": 0.00236786, "balance_loss_clip": 1.01892781, "balance_loss_mlp": 0.21126322, "epoch": 0.9270404328874192, "flos": 28002710252160.0, "grad_norm": 99.37403375480774, "language_loss": 0.82889682, "learning_rate": 5.553552361633174e-08, "loss": 0.84362304, "num_input_tokens_seen": 332818095, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.25537109, "step": 15419, "time_per_iteration": 2.776309013366699 }, { "auxiliary_loss_clip": 0.0120391, "auxiliary_loss_mlp": 0.00208726, "balance_loss_clip": 0.9979459, "balance_loss_mlp": 0.18588509, "epoch": 0.9271005561400871, "flos": 25889870401920.0, "grad_norm": 33.83367208887662, "language_loss": 0.81639642, "learning_rate": 5.5444419398091636e-08, "loss": 0.83052278, "num_input_tokens_seen": 332839860, "router_z_loss_clip": 2.05761719, "router_z_loss_mlp": 0.22851562, "step": 15420, "time_per_iteration": 4.248643398284912 }, { "auxiliary_loss_clip": 0.01244144, "auxiliary_loss_mlp": 0.00218947, "balance_loss_clip": 1.02475071, "balance_loss_mlp": 0.19319771, "epoch": 0.9271606793927551, "flos": 27053914452480.0, "grad_norm": 32.38280326417553, "language_loss": 0.83703107, "learning_rate": 5.535338891759389e-08, "loss": 0.85166204, "num_input_tokens_seen": 332861155, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.2578125, "step": 15421, "time_per_iteration": 4.149538278579712 }, { "auxiliary_loss_clip": 0.01231068, "auxiliary_loss_mlp": 0.0023037, "balance_loss_clip": 1.01532614, "balance_loss_mlp": 0.20611057, "epoch": 0.9272208026454232, "flos": 26209869690240.0, "grad_norm": 9.87297963187902, "language_loss": 0.81245023, "learning_rate": 5.526243217829041e-08, "loss": 0.82706457, "num_input_tokens_seen": 332881110, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.24255371, "step": 15422, "time_per_iteration": 2.7476139068603516 }, { "auxiliary_loss_clip": 0.01248174, "auxiliary_loss_mlp": 0.00221279, "balance_loss_clip": 1.02582037, "balance_loss_mlp": 0.19610187, "epoch": 0.9272809258980911, "flos": 12458453863680.0, "grad_norm": 60.2545033265279, "language_loss": 0.9046604, "learning_rate": 5.517154918363065e-08, "loss": 0.91935492, "num_input_tokens_seen": 332899350, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25195312, "step": 15423, "time_per_iteration": 2.7256083488464355 }, { "auxiliary_loss_clip": 0.01246643, "auxiliary_loss_mlp": 0.00240787, "balance_loss_clip": 1.0281266, "balance_loss_mlp": 0.21763682, "epoch": 0.9273410491507591, "flos": 22856890826880.0, "grad_norm": 4.1344896653316505, "language_loss": 0.83677971, "learning_rate": 5.508073993706053e-08, "loss": 0.85165405, "num_input_tokens_seen": 332918105, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.23168945, "step": 15424, "time_per_iteration": 2.7141287326812744 }, { "auxiliary_loss_clip": 0.01091804, "auxiliary_loss_mlp": 0.00046536, "balance_loss_clip": 0.9562608, "balance_loss_mlp": 0.03986066, "epoch": 0.927401172403427, "flos": 47665384329600.0, "grad_norm": 1.5649922388032202, "language_loss": 0.59617722, "learning_rate": 5.499000444202351e-08, "loss": 0.60756063, "num_input_tokens_seen": 332969490, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06689453, "step": 15425, "time_per_iteration": 2.9562928676605225 }, { "auxiliary_loss_clip": 0.01233868, "auxiliary_loss_mlp": 0.00220378, "balance_loss_clip": 1.02046156, "balance_loss_mlp": 0.19613121, "epoch": 0.927461295656095, "flos": 29972374490880.0, "grad_norm": 29.46208262031019, "language_loss": 0.77396286, "learning_rate": 5.489934270196106e-08, "loss": 0.78850543, "num_input_tokens_seen": 332988805, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24243164, "step": 15426, "time_per_iteration": 2.7989273071289062 }, { "auxiliary_loss_clip": 0.01238984, "auxiliary_loss_mlp": 0.00225005, "balance_loss_clip": 1.02936614, "balance_loss_mlp": 0.20212841, "epoch": 0.9275214189087629, "flos": 20375427651840.0, "grad_norm": 35.783362862748156, "language_loss": 0.89252079, "learning_rate": 5.480875472030977e-08, "loss": 0.9071607, "num_input_tokens_seen": 333007960, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.2286377, "step": 15427, "time_per_iteration": 4.263451099395752 }, { "auxiliary_loss_clip": 0.01257041, "auxiliary_loss_mlp": 0.00237804, "balance_loss_clip": 1.03172207, "balance_loss_mlp": 0.21163712, "epoch": 0.927581542161431, "flos": 22383193242240.0, "grad_norm": 15.895171789224603, "language_loss": 0.83966625, "learning_rate": 5.471824050050555e-08, "loss": 0.85461462, "num_input_tokens_seen": 333026035, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.26147461, "step": 15428, "time_per_iteration": 2.6723058223724365 }, { "auxiliary_loss_clip": 0.01234385, "auxiliary_loss_mlp": 0.0022815, "balance_loss_clip": 1.01877129, "balance_loss_mlp": 0.20175725, "epoch": 0.9276416654140989, "flos": 23952453598080.0, "grad_norm": 10.683578051879332, "language_loss": 0.81623709, "learning_rate": 5.4627800045980555e-08, "loss": 0.8308624, "num_input_tokens_seen": 333045590, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.26403809, "step": 15429, "time_per_iteration": 2.6646621227264404 }, { "auxiliary_loss_clip": 0.01222625, "auxiliary_loss_mlp": 0.00199499, "balance_loss_clip": 1.01080656, "balance_loss_mlp": 0.17613395, "epoch": 0.9277017886667669, "flos": 13917719796480.0, "grad_norm": 12.693469777870273, "language_loss": 0.83456677, "learning_rate": 5.45374333601647e-08, "loss": 0.84878802, "num_input_tokens_seen": 333063355, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.23388672, "step": 15430, "time_per_iteration": 2.695611000061035 }, { "auxiliary_loss_clip": 0.01250393, "auxiliary_loss_mlp": 0.00209474, "balance_loss_clip": 1.03111744, "balance_loss_mlp": 0.18280694, "epoch": 0.9277619119194348, "flos": 35666478092160.0, "grad_norm": 112.84610767742186, "language_loss": 0.83005768, "learning_rate": 5.444714044648391e-08, "loss": 0.84465635, "num_input_tokens_seen": 333088045, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.26660156, "step": 15431, "time_per_iteration": 2.829538345336914 }, { "auxiliary_loss_clip": 0.01235576, "auxiliary_loss_mlp": 0.00231653, "balance_loss_clip": 1.02095723, "balance_loss_mlp": 0.20654801, "epoch": 0.9278220351721028, "flos": 23841238112640.0, "grad_norm": 10.578138407385245, "language_loss": 0.77570283, "learning_rate": 5.4356921308363e-08, "loss": 0.79037511, "num_input_tokens_seen": 333108005, "router_z_loss_clip": 2.14550781, "router_z_loss_mlp": 0.25134277, "step": 15432, "time_per_iteration": 4.078392028808594 }, { "auxiliary_loss_clip": 0.01242865, "auxiliary_loss_mlp": 0.00211934, "balance_loss_clip": 1.02430975, "balance_loss_mlp": 0.18769917, "epoch": 0.9278821584247707, "flos": 15228135768960.0, "grad_norm": 15.095740712043044, "language_loss": 0.91149449, "learning_rate": 5.4266775949222354e-08, "loss": 0.92604244, "num_input_tokens_seen": 333124335, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24243164, "step": 15433, "time_per_iteration": 2.737490653991699 }, { "auxiliary_loss_clip": 0.0122307, "auxiliary_loss_mlp": 0.00210827, "balance_loss_clip": 1.01346827, "balance_loss_mlp": 0.18697347, "epoch": 0.9279422816774388, "flos": 24681404206080.0, "grad_norm": 34.48764169784444, "language_loss": 0.76146126, "learning_rate": 5.417670437248056e-08, "loss": 0.77580017, "num_input_tokens_seen": 333143995, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.23840332, "step": 15434, "time_per_iteration": 2.6718480587005615 }, { "auxiliary_loss_clip": 0.01217391, "auxiliary_loss_mlp": 0.00207503, "balance_loss_clip": 1.01230395, "balance_loss_mlp": 0.1843403, "epoch": 0.9280024049301068, "flos": 19169188099200.0, "grad_norm": 13.365002507407802, "language_loss": 0.76086974, "learning_rate": 5.40867065815529e-08, "loss": 0.77511871, "num_input_tokens_seen": 333162805, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.23144531, "step": 15435, "time_per_iteration": 2.675955057144165 }, { "auxiliary_loss_clip": 0.01231558, "auxiliary_loss_mlp": 0.00226819, "balance_loss_clip": 1.01704216, "balance_loss_mlp": 0.19998461, "epoch": 0.9280625281827747, "flos": 11393701983360.0, "grad_norm": 7.897831613507709, "language_loss": 0.82580185, "learning_rate": 5.399678257985263e-08, "loss": 0.84038556, "num_input_tokens_seen": 333175770, "router_z_loss_clip": 2.14355469, "router_z_loss_mlp": 0.26843262, "step": 15436, "time_per_iteration": 2.6557416915893555 }, { "auxiliary_loss_clip": 0.01229081, "auxiliary_loss_mlp": 0.00226733, "balance_loss_clip": 1.01195335, "balance_loss_mlp": 0.20095977, "epoch": 0.9281226514354427, "flos": 24785616539520.0, "grad_norm": 3.410574679440539, "language_loss": 0.76510274, "learning_rate": 5.390693237078925e-08, "loss": 0.77966088, "num_input_tokens_seen": 333194775, "router_z_loss_clip": 2.16699219, "router_z_loss_mlp": 0.2578125, "step": 15437, "time_per_iteration": 2.7620935440063477 }, { "auxiliary_loss_clip": 0.01251424, "auxiliary_loss_mlp": 0.00232681, "balance_loss_clip": 1.02520299, "balance_loss_mlp": 0.20479837, "epoch": 0.9281827746881106, "flos": 15083128563840.0, "grad_norm": 64.48551733551943, "language_loss": 0.80897009, "learning_rate": 5.3817155957770254e-08, "loss": 0.82381111, "num_input_tokens_seen": 333208920, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.27893066, "step": 15438, "time_per_iteration": 2.634073257446289 }, { "auxiliary_loss_clip": 0.01253177, "auxiliary_loss_mlp": 0.00224176, "balance_loss_clip": 1.02737308, "balance_loss_mlp": 0.19738963, "epoch": 0.9282428979407786, "flos": 24135059364480.0, "grad_norm": 9.374278134354405, "language_loss": 0.71927619, "learning_rate": 5.3727453344199366e-08, "loss": 0.73404974, "num_input_tokens_seen": 333229350, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.26794434, "step": 15439, "time_per_iteration": 2.725388288497925 }, { "auxiliary_loss_clip": 0.01230895, "auxiliary_loss_mlp": 0.00227124, "balance_loss_clip": 1.01841593, "balance_loss_mlp": 0.20157781, "epoch": 0.9283030211934465, "flos": 24823215100800.0, "grad_norm": 9.778166603539537, "language_loss": 0.77077419, "learning_rate": 5.363782453347876e-08, "loss": 0.78535438, "num_input_tokens_seen": 333246125, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.2557373, "step": 15440, "time_per_iteration": 2.6651241779327393 }, { "auxiliary_loss_clip": 0.01237681, "auxiliary_loss_mlp": 0.00246698, "balance_loss_clip": 1.01726091, "balance_loss_mlp": 0.22144917, "epoch": 0.9283631444461146, "flos": 23981037845760.0, "grad_norm": 2.112013319051999, "language_loss": 0.8250463, "learning_rate": 5.354826952900682e-08, "loss": 0.83989012, "num_input_tokens_seen": 333263685, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25256348, "step": 15441, "time_per_iteration": 2.7204153537750244 }, { "auxiliary_loss_clip": 0.01214674, "auxiliary_loss_mlp": 0.00205929, "balance_loss_clip": 1.00779951, "balance_loss_mlp": 0.18298107, "epoch": 0.9284232676987825, "flos": 22784530878720.0, "grad_norm": 14.956388082679284, "language_loss": 0.70298028, "learning_rate": 5.345878833417949e-08, "loss": 0.71718633, "num_input_tokens_seen": 333282435, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.22961426, "step": 15442, "time_per_iteration": 2.7121894359588623 }, { "auxiliary_loss_clip": 0.01251747, "auxiliary_loss_mlp": 0.0022517, "balance_loss_clip": 1.02826154, "balance_loss_mlp": 0.19880095, "epoch": 0.9284833909514505, "flos": 19500500171520.0, "grad_norm": 12.623367352212918, "language_loss": 0.89953172, "learning_rate": 5.3369380952390295e-08, "loss": 0.91430092, "num_input_tokens_seen": 333300400, "router_z_loss_clip": 2.23535156, "router_z_loss_mlp": 0.26379395, "step": 15443, "time_per_iteration": 2.6618549823760986 }, { "auxiliary_loss_clip": 0.01252066, "auxiliary_loss_mlp": 0.00197737, "balance_loss_clip": 1.03493381, "balance_loss_mlp": 0.17428844, "epoch": 0.9285435142041184, "flos": 23185976256000.0, "grad_norm": 2.5320436532044543, "language_loss": 0.71918237, "learning_rate": 5.328004738702896e-08, "loss": 0.73368049, "num_input_tokens_seen": 333318980, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.234375, "step": 15444, "time_per_iteration": 2.697817802429199 }, { "auxiliary_loss_clip": 0.01233663, "auxiliary_loss_mlp": 0.00235936, "balance_loss_clip": 1.01830661, "balance_loss_mlp": 0.21099713, "epoch": 0.9286036374567864, "flos": 17675519915520.0, "grad_norm": 72.69278201598813, "language_loss": 0.79319853, "learning_rate": 5.3190787641483215e-08, "loss": 0.80789447, "num_input_tokens_seen": 333334135, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24951172, "step": 15445, "time_per_iteration": 2.62845778465271 }, { "auxiliary_loss_clip": 0.01244415, "auxiliary_loss_mlp": 0.00209982, "balance_loss_clip": 1.02491009, "balance_loss_mlp": 0.18298069, "epoch": 0.9286637607094543, "flos": 20886687884160.0, "grad_norm": 27.972911554193367, "language_loss": 0.77522588, "learning_rate": 5.3101601719138135e-08, "loss": 0.78976983, "num_input_tokens_seen": 333353325, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.26989746, "step": 15446, "time_per_iteration": 2.672994613647461 }, { "auxiliary_loss_clip": 0.01279289, "auxiliary_loss_mlp": 0.00262221, "balance_loss_clip": 1.04764438, "balance_loss_mlp": 0.23390892, "epoch": 0.9287238839621224, "flos": 19026012487680.0, "grad_norm": 60.488169106855096, "language_loss": 0.78522128, "learning_rate": 5.301248962337523e-08, "loss": 0.80063635, "num_input_tokens_seen": 333371110, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.28308105, "step": 15447, "time_per_iteration": 2.6597719192504883 }, { "auxiliary_loss_clip": 0.0120294, "auxiliary_loss_mlp": 0.00220234, "balance_loss_clip": 0.99938792, "balance_loss_mlp": 0.19638059, "epoch": 0.9287840072147904, "flos": 20557027837440.0, "grad_norm": 13.694989020778324, "language_loss": 0.79132259, "learning_rate": 5.292345135757403e-08, "loss": 0.80555433, "num_input_tokens_seen": 333391420, "router_z_loss_clip": 2.03515625, "router_z_loss_mlp": 0.23864746, "step": 15448, "time_per_iteration": 2.7015221118927 }, { "auxiliary_loss_clip": 0.01230049, "auxiliary_loss_mlp": 0.00221086, "balance_loss_clip": 1.01696599, "balance_loss_mlp": 0.19565871, "epoch": 0.9288441304674583, "flos": 21250822008960.0, "grad_norm": 29.05478418133339, "language_loss": 0.85479891, "learning_rate": 5.283448692511072e-08, "loss": 0.86931026, "num_input_tokens_seen": 333410365, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.25427246, "step": 15449, "time_per_iteration": 2.6801469326019287 }, { "auxiliary_loss_clip": 0.01243124, "auxiliary_loss_mlp": 0.00213154, "balance_loss_clip": 1.02463973, "balance_loss_mlp": 0.18748796, "epoch": 0.9289042537201263, "flos": 27669853895040.0, "grad_norm": 25.401955523269958, "language_loss": 0.77358103, "learning_rate": 5.27455963293586e-08, "loss": 0.78814381, "num_input_tokens_seen": 333430000, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25695801, "step": 15450, "time_per_iteration": 2.8047618865966797 }, { "auxiliary_loss_clip": 0.01242109, "auxiliary_loss_mlp": 0.002317, "balance_loss_clip": 1.02153742, "balance_loss_mlp": 0.20738147, "epoch": 0.9289643769727942, "flos": 19317750750720.0, "grad_norm": 4.416802477589287, "language_loss": 0.80745065, "learning_rate": 5.265677957368875e-08, "loss": 0.82218874, "num_input_tokens_seen": 333445800, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24328613, "step": 15451, "time_per_iteration": 2.623225212097168 }, { "auxiliary_loss_clip": 0.01248777, "auxiliary_loss_mlp": 0.00231782, "balance_loss_clip": 1.02945876, "balance_loss_mlp": 0.20695093, "epoch": 0.9290245002254622, "flos": 14058058233600.0, "grad_norm": 31.36450287562341, "language_loss": 0.83301497, "learning_rate": 5.25680366614687e-08, "loss": 0.84782052, "num_input_tokens_seen": 333461550, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24829102, "step": 15452, "time_per_iteration": 2.6763408184051514 }, { "auxiliary_loss_clip": 0.0124301, "auxiliary_loss_mlp": 0.00210139, "balance_loss_clip": 1.02449918, "balance_loss_mlp": 0.18460417, "epoch": 0.9290846234781301, "flos": 20047132321920.0, "grad_norm": 65.14658438178863, "language_loss": 0.80083323, "learning_rate": 5.2479367596064196e-08, "loss": 0.81536472, "num_input_tokens_seen": 333478835, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.25524902, "step": 15453, "time_per_iteration": 2.660738468170166 }, { "auxiliary_loss_clip": 0.01088241, "auxiliary_loss_mlp": 0.00065802, "balance_loss_clip": 0.95374775, "balance_loss_mlp": 0.05850658, "epoch": 0.9291447467307982, "flos": 61227514460160.0, "grad_norm": 1.230976824539382, "language_loss": 0.59923208, "learning_rate": 5.2390772380837226e-08, "loss": 0.61077261, "num_input_tokens_seen": 333535250, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.07275391, "step": 15454, "time_per_iteration": 3.0820119380950928 }, { "auxiliary_loss_clip": 0.01246608, "auxiliary_loss_mlp": 0.0022695, "balance_loss_clip": 1.02837026, "balance_loss_mlp": 0.20024714, "epoch": 0.9292048699834661, "flos": 20553328736640.0, "grad_norm": 66.97817840952578, "language_loss": 0.77798808, "learning_rate": 5.230225101914709e-08, "loss": 0.79272366, "num_input_tokens_seen": 333553805, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.26708984, "step": 15455, "time_per_iteration": 2.660629987716675 }, { "auxiliary_loss_clip": 0.01244018, "auxiliary_loss_mlp": 0.00246188, "balance_loss_clip": 1.02732718, "balance_loss_mlp": 0.22101054, "epoch": 0.9292649932361341, "flos": 23623655477760.0, "grad_norm": 87.6129157084128, "language_loss": 0.73047358, "learning_rate": 5.22138035143509e-08, "loss": 0.74537569, "num_input_tokens_seen": 333572800, "router_z_loss_clip": 2.16699219, "router_z_loss_mlp": 0.25158691, "step": 15456, "time_per_iteration": 2.661466598510742 }, { "auxiliary_loss_clip": 0.01249393, "auxiliary_loss_mlp": 0.00226714, "balance_loss_clip": 1.03147602, "balance_loss_mlp": 0.20058292, "epoch": 0.929325116488802, "flos": 15009942602880.0, "grad_norm": 4.9440371565594665, "language_loss": 0.78255892, "learning_rate": 5.2125429869802615e-08, "loss": 0.79732001, "num_input_tokens_seen": 333588520, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.26123047, "step": 15457, "time_per_iteration": 2.6429173946380615 }, { "auxiliary_loss_clip": 0.01248005, "auxiliary_loss_mlp": 0.00223556, "balance_loss_clip": 1.02876019, "balance_loss_mlp": 0.19743752, "epoch": 0.92938523974147, "flos": 17967365919360.0, "grad_norm": 12.03429764172215, "language_loss": 0.88776553, "learning_rate": 5.203713008885291e-08, "loss": 0.9024812, "num_input_tokens_seen": 333603435, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.26123047, "step": 15458, "time_per_iteration": 2.598708391189575 }, { "auxiliary_loss_clip": 0.01238741, "auxiliary_loss_mlp": 0.00221963, "balance_loss_clip": 1.02409399, "balance_loss_mlp": 0.19827592, "epoch": 0.9294453629941379, "flos": 23003047267200.0, "grad_norm": 15.499761655774588, "language_loss": 0.82309443, "learning_rate": 5.194890417485065e-08, "loss": 0.83770144, "num_input_tokens_seen": 333623305, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.2364502, "step": 15459, "time_per_iteration": 2.6708805561065674 }, { "auxiliary_loss_clip": 0.01248146, "auxiliary_loss_mlp": 0.00215847, "balance_loss_clip": 1.02908945, "balance_loss_mlp": 0.19113502, "epoch": 0.929505486246806, "flos": 17055234927360.0, "grad_norm": 6.340206885576306, "language_loss": 0.71513146, "learning_rate": 5.1860752131141384e-08, "loss": 0.72977138, "num_input_tokens_seen": 333641205, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.24682617, "step": 15460, "time_per_iteration": 2.6326968669891357 }, { "auxiliary_loss_clip": 0.01264171, "auxiliary_loss_mlp": 0.00218966, "balance_loss_clip": 1.03812444, "balance_loss_mlp": 0.19309726, "epoch": 0.9295656094994739, "flos": 27340409329920.0, "grad_norm": 13.685073338605157, "language_loss": 0.8783499, "learning_rate": 5.177267396106733e-08, "loss": 0.89318132, "num_input_tokens_seen": 333659615, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.25854492, "step": 15461, "time_per_iteration": 2.7172281742095947 }, { "auxiliary_loss_clip": 0.01225661, "auxiliary_loss_mlp": 0.00207019, "balance_loss_clip": 1.0134747, "balance_loss_mlp": 0.18367782, "epoch": 0.9296257327521419, "flos": 21470954509440.0, "grad_norm": 10.701968105336944, "language_loss": 0.86268675, "learning_rate": 5.168466966796869e-08, "loss": 0.87701356, "num_input_tokens_seen": 333678985, "router_z_loss_clip": 2.12011719, "router_z_loss_mlp": 0.23364258, "step": 15462, "time_per_iteration": 4.160700559616089 }, { "auxiliary_loss_clip": 0.01234077, "auxiliary_loss_mlp": 0.00237493, "balance_loss_clip": 1.02017164, "balance_loss_mlp": 0.21355619, "epoch": 0.9296858560048099, "flos": 16362661818240.0, "grad_norm": 15.135776101207872, "language_loss": 0.73867691, "learning_rate": 5.159673925518282e-08, "loss": 0.75339264, "num_input_tokens_seen": 333696410, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.23937988, "step": 15463, "time_per_iteration": 4.0837531089782715 }, { "auxiliary_loss_clip": 0.01218658, "auxiliary_loss_mlp": 0.00229536, "balance_loss_clip": 1.01146626, "balance_loss_mlp": 0.20688581, "epoch": 0.9297459792574778, "flos": 29858609139840.0, "grad_norm": 21.596395820870754, "language_loss": 0.76781756, "learning_rate": 5.15088827260437e-08, "loss": 0.78229952, "num_input_tokens_seen": 333716615, "router_z_loss_clip": 2.07128906, "router_z_loss_mlp": 0.22644043, "step": 15464, "time_per_iteration": 2.7974557876586914 }, { "auxiliary_loss_clip": 0.01251649, "auxiliary_loss_mlp": 0.00243067, "balance_loss_clip": 1.03019643, "balance_loss_mlp": 0.21805689, "epoch": 0.9298061025101458, "flos": 15924838942080.0, "grad_norm": 12.807057274972777, "language_loss": 0.8574459, "learning_rate": 5.1421100083883115e-08, "loss": 0.87239301, "num_input_tokens_seen": 333732800, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.25024414, "step": 15465, "time_per_iteration": 2.6604020595550537 }, { "auxiliary_loss_clip": 0.01096896, "auxiliary_loss_mlp": 0.00080994, "balance_loss_clip": 0.96154726, "balance_loss_mlp": 0.07403195, "epoch": 0.9298662257628137, "flos": 64096994304000.0, "grad_norm": 0.6907225337915476, "language_loss": 0.55872661, "learning_rate": 5.133339133202952e-08, "loss": 0.57050556, "num_input_tokens_seen": 333799300, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.06982422, "step": 15466, "time_per_iteration": 3.3077917098999023 }, { "auxiliary_loss_clip": 0.01243545, "auxiliary_loss_mlp": 0.00221354, "balance_loss_clip": 1.03032172, "balance_loss_mlp": 0.19649887, "epoch": 0.9299263490154818, "flos": 24280210224000.0, "grad_norm": 15.377848730370221, "language_loss": 0.80041087, "learning_rate": 5.1245756473809355e-08, "loss": 0.8150599, "num_input_tokens_seen": 333820360, "router_z_loss_clip": 2.13183594, "router_z_loss_mlp": 0.24853516, "step": 15467, "time_per_iteration": 2.7261979579925537 }, { "auxiliary_loss_clip": 0.01236818, "auxiliary_loss_mlp": 0.00224169, "balance_loss_clip": 1.02116442, "balance_loss_mlp": 0.19783539, "epoch": 0.9299864722681497, "flos": 23294354567040.0, "grad_norm": 11.802700606354401, "language_loss": 0.7858752, "learning_rate": 5.1158195512545076e-08, "loss": 0.80048507, "num_input_tokens_seen": 333840415, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.26330566, "step": 15468, "time_per_iteration": 2.711766242980957 }, { "auxiliary_loss_clip": 0.01244421, "auxiliary_loss_mlp": 0.00227721, "balance_loss_clip": 1.0255456, "balance_loss_mlp": 0.20077963, "epoch": 0.9300465955208177, "flos": 21395972868480.0, "grad_norm": 16.389506539490718, "language_loss": 0.84429395, "learning_rate": 5.107070845155737e-08, "loss": 0.85901535, "num_input_tokens_seen": 333859910, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.26977539, "step": 15469, "time_per_iteration": 2.6973044872283936 }, { "auxiliary_loss_clip": 0.01242412, "auxiliary_loss_mlp": 0.00223063, "balance_loss_clip": 1.02134085, "balance_loss_mlp": 0.19775447, "epoch": 0.9301067187734856, "flos": 24571445696640.0, "grad_norm": 36.74656877367761, "language_loss": 0.83713901, "learning_rate": 5.098329529416379e-08, "loss": 0.85179371, "num_input_tokens_seen": 333880495, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25292969, "step": 15470, "time_per_iteration": 4.127457618713379 }, { "auxiliary_loss_clip": 0.01228206, "auxiliary_loss_mlp": 0.0022244, "balance_loss_clip": 1.01867056, "balance_loss_mlp": 0.19887258, "epoch": 0.9301668420261536, "flos": 22196960202240.0, "grad_norm": 33.89218153413759, "language_loss": 0.82427436, "learning_rate": 5.089595604367902e-08, "loss": 0.83878082, "num_input_tokens_seen": 333897640, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.23583984, "step": 15471, "time_per_iteration": 2.7078909873962402 }, { "auxiliary_loss_clip": 0.01231167, "auxiliary_loss_mlp": 0.00230245, "balance_loss_clip": 1.0143801, "balance_loss_mlp": 0.20603326, "epoch": 0.9302269652788215, "flos": 17747628468480.0, "grad_norm": 17.24508435760109, "language_loss": 0.79655349, "learning_rate": 5.080869070341487e-08, "loss": 0.8111676, "num_input_tokens_seen": 333913670, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24206543, "step": 15472, "time_per_iteration": 2.670297145843506 }, { "auxiliary_loss_clip": 0.01229453, "auxiliary_loss_mlp": 0.00234498, "balance_loss_clip": 1.01732922, "balance_loss_mlp": 0.21145469, "epoch": 0.9302870885314896, "flos": 19390793057280.0, "grad_norm": 10.897087659480508, "language_loss": 0.94844586, "learning_rate": 5.0721499276680233e-08, "loss": 0.96308535, "num_input_tokens_seen": 333934105, "router_z_loss_clip": 2.12011719, "router_z_loss_mlp": 0.23059082, "step": 15473, "time_per_iteration": 2.6460723876953125 }, { "auxiliary_loss_clip": 0.01267102, "auxiliary_loss_mlp": 0.00223341, "balance_loss_clip": 1.03866553, "balance_loss_mlp": 0.19650725, "epoch": 0.9303472117841575, "flos": 21760286561280.0, "grad_norm": 5.783071209879749, "language_loss": 0.73836851, "learning_rate": 5.063438176678203e-08, "loss": 0.75327301, "num_input_tokens_seen": 333953635, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.26831055, "step": 15474, "time_per_iteration": 4.187880516052246 }, { "auxiliary_loss_clip": 0.0123614, "auxiliary_loss_mlp": 0.00220144, "balance_loss_clip": 1.02219272, "balance_loss_mlp": 0.19458561, "epoch": 0.9304073350368255, "flos": 19609740408960.0, "grad_norm": 14.115748776263876, "language_loss": 0.8240056, "learning_rate": 5.054733817702339e-08, "loss": 0.83856845, "num_input_tokens_seen": 333971825, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.25561523, "step": 15475, "time_per_iteration": 2.7446584701538086 }, { "auxiliary_loss_clip": 0.01235134, "auxiliary_loss_mlp": 0.00224152, "balance_loss_clip": 1.01704502, "balance_loss_mlp": 0.19958329, "epoch": 0.9304674582894935, "flos": 30441582875520.0, "grad_norm": 87.18662172199377, "language_loss": 0.73458278, "learning_rate": 5.0460368510704786e-08, "loss": 0.74917567, "num_input_tokens_seen": 333990120, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24572754, "step": 15476, "time_per_iteration": 2.732071876525879 }, { "auxiliary_loss_clip": 0.0125577, "auxiliary_loss_mlp": 0.00236201, "balance_loss_clip": 1.03362644, "balance_loss_mlp": 0.20973611, "epoch": 0.9305275815421614, "flos": 17785693906560.0, "grad_norm": 812.3675798868563, "language_loss": 0.79174459, "learning_rate": 5.0373472771124914e-08, "loss": 0.80666435, "num_input_tokens_seen": 334007970, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26477051, "step": 15477, "time_per_iteration": 2.660285234451294 }, { "auxiliary_loss_clip": 0.01224356, "auxiliary_loss_mlp": 0.00211395, "balance_loss_clip": 1.01292205, "balance_loss_mlp": 0.18676677, "epoch": 0.9305877047948294, "flos": 25298456970240.0, "grad_norm": 13.979079667178993, "language_loss": 0.65259892, "learning_rate": 5.0286650961578027e-08, "loss": 0.66695642, "num_input_tokens_seen": 334027120, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.24621582, "step": 15478, "time_per_iteration": 2.701500177383423 }, { "auxiliary_loss_clip": 0.01257167, "auxiliary_loss_mlp": 0.0022098, "balance_loss_clip": 1.03082728, "balance_loss_mlp": 0.1951827, "epoch": 0.9306478280474973, "flos": 16977236544000.0, "grad_norm": 14910.929320770738, "language_loss": 0.88611376, "learning_rate": 5.01999030853566e-08, "loss": 0.90089524, "num_input_tokens_seen": 334042785, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.25842285, "step": 15479, "time_per_iteration": 2.7905688285827637 }, { "auxiliary_loss_clip": 0.01231, "auxiliary_loss_mlp": 0.00213483, "balance_loss_clip": 1.01902533, "balance_loss_mlp": 0.18925937, "epoch": 0.9307079513001654, "flos": 35663353608960.0, "grad_norm": 13.445204572926379, "language_loss": 0.75940138, "learning_rate": 5.0113229145750445e-08, "loss": 0.77384621, "num_input_tokens_seen": 334063480, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.24230957, "step": 15480, "time_per_iteration": 2.794933795928955 }, { "auxiliary_loss_clip": 0.01254186, "auxiliary_loss_mlp": 0.00234092, "balance_loss_clip": 1.03166437, "balance_loss_mlp": 0.2080685, "epoch": 0.9307680745528333, "flos": 19208151377280.0, "grad_norm": 2.950933596683998, "language_loss": 0.76738226, "learning_rate": 5.002662914604583e-08, "loss": 0.78226507, "num_input_tokens_seen": 334082005, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.26013184, "step": 15481, "time_per_iteration": 2.689924955368042 }, { "auxiliary_loss_clip": 0.01212846, "auxiliary_loss_mlp": 0.00198854, "balance_loss_clip": 1.00586867, "balance_loss_mlp": 0.17579883, "epoch": 0.9308281978055013, "flos": 19062641381760.0, "grad_norm": 25.814423419619395, "language_loss": 0.82025772, "learning_rate": 4.994010308952701e-08, "loss": 0.83437467, "num_input_tokens_seen": 334101375, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.23059082, "step": 15482, "time_per_iteration": 2.673771619796753 }, { "auxiliary_loss_clip": 0.01227749, "auxiliary_loss_mlp": 0.00231244, "balance_loss_clip": 1.01626086, "balance_loss_mlp": 0.20742594, "epoch": 0.9308883210581692, "flos": 20521548178560.0, "grad_norm": 19.709567275406044, "language_loss": 0.87705886, "learning_rate": 4.985365097947469e-08, "loss": 0.89164883, "num_input_tokens_seen": 334119460, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.23815918, "step": 15483, "time_per_iteration": 2.6858410835266113 }, { "auxiliary_loss_clip": 0.01239762, "auxiliary_loss_mlp": 0.00217081, "balance_loss_clip": 1.01961625, "balance_loss_mlp": 0.19242826, "epoch": 0.9309484443108372, "flos": 13001422826880.0, "grad_norm": 21.0727337646115, "language_loss": 0.83137584, "learning_rate": 4.976727281916782e-08, "loss": 0.84594429, "num_input_tokens_seen": 334136065, "router_z_loss_clip": 2.20214844, "router_z_loss_mlp": 0.24658203, "step": 15484, "time_per_iteration": 2.6589295864105225 }, { "auxiliary_loss_clip": 0.01251489, "auxiliary_loss_mlp": 0.00242227, "balance_loss_clip": 1.03537869, "balance_loss_mlp": 0.21905278, "epoch": 0.9310085675635051, "flos": 12567765928320.0, "grad_norm": 19.440832302242182, "language_loss": 0.85495472, "learning_rate": 4.968096861188087e-08, "loss": 0.86989188, "num_input_tokens_seen": 334153690, "router_z_loss_clip": 2.15917969, "router_z_loss_mlp": 0.23193359, "step": 15485, "time_per_iteration": 2.677751302719116 }, { "auxiliary_loss_clip": 0.01249054, "auxiliary_loss_mlp": 0.00253241, "balance_loss_clip": 1.02576423, "balance_loss_mlp": 0.22483294, "epoch": 0.9310686908161732, "flos": 23477570864640.0, "grad_norm": 13.281361744337142, "language_loss": 0.87557727, "learning_rate": 4.959473836088723e-08, "loss": 0.8906002, "num_input_tokens_seen": 334171880, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.28442383, "step": 15486, "time_per_iteration": 2.8261735439300537 }, { "auxiliary_loss_clip": 0.01271222, "auxiliary_loss_mlp": 0.00229329, "balance_loss_clip": 1.04101694, "balance_loss_mlp": 0.20230439, "epoch": 0.9311288140688411, "flos": 24170287628160.0, "grad_norm": 752.9272591946858, "language_loss": 0.85245502, "learning_rate": 4.950858206945674e-08, "loss": 0.86746049, "num_input_tokens_seen": 334190005, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.27038574, "step": 15487, "time_per_iteration": 2.702105760574341 }, { "auxiliary_loss_clip": 0.0122979, "auxiliary_loss_mlp": 0.00219266, "balance_loss_clip": 1.01353359, "balance_loss_mlp": 0.19422054, "epoch": 0.9311889373215091, "flos": 35590203561600.0, "grad_norm": 18.321833798089116, "language_loss": 0.78016311, "learning_rate": 4.942249974085633e-08, "loss": 0.79465365, "num_input_tokens_seen": 334209545, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.25085449, "step": 15488, "time_per_iteration": 2.8490989208221436 }, { "auxiliary_loss_clip": 0.01233265, "auxiliary_loss_mlp": 0.00228283, "balance_loss_clip": 1.01833093, "balance_loss_mlp": 0.2032015, "epoch": 0.9312490605741771, "flos": 20230528187520.0, "grad_norm": 7.744922456808966, "language_loss": 0.82670254, "learning_rate": 4.933649137834983e-08, "loss": 0.84131801, "num_input_tokens_seen": 334228900, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.25085449, "step": 15489, "time_per_iteration": 2.6741156578063965 }, { "auxiliary_loss_clip": 0.01247513, "auxiliary_loss_mlp": 0.00215597, "balance_loss_clip": 1.02884591, "balance_loss_mlp": 0.19064625, "epoch": 0.931309183826845, "flos": 13950577762560.0, "grad_norm": 18.649659816719712, "language_loss": 0.90163684, "learning_rate": 4.925055698519931e-08, "loss": 0.91626799, "num_input_tokens_seen": 334245500, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24938965, "step": 15490, "time_per_iteration": 2.6627843379974365 }, { "auxiliary_loss_clip": 0.01255839, "auxiliary_loss_mlp": 0.00243519, "balance_loss_clip": 1.03119755, "balance_loss_mlp": 0.21680379, "epoch": 0.931369307079513, "flos": 20156731695360.0, "grad_norm": 39.22206894379515, "language_loss": 0.79995775, "learning_rate": 4.9164696564663264e-08, "loss": 0.81495136, "num_input_tokens_seen": 334264370, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.26745605, "step": 15491, "time_per_iteration": 2.632183313369751 }, { "auxiliary_loss_clip": 0.01216413, "auxiliary_loss_mlp": 0.0021113, "balance_loss_clip": 1.01107883, "balance_loss_mlp": 0.18809851, "epoch": 0.931429430332181, "flos": 25338569483520.0, "grad_norm": 91.45650051874382, "language_loss": 0.80665493, "learning_rate": 4.9078910119997096e-08, "loss": 0.82093036, "num_input_tokens_seen": 334283905, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.23034668, "step": 15492, "time_per_iteration": 2.687941312789917 }, { "auxiliary_loss_clip": 0.01091922, "auxiliary_loss_mlp": 0.00068038, "balance_loss_clip": 0.95552862, "balance_loss_mlp": 0.06026599, "epoch": 0.931489553584849, "flos": 71226193985280.0, "grad_norm": 0.6925480551560913, "language_loss": 0.52825314, "learning_rate": 4.899319765445442e-08, "loss": 0.53985274, "num_input_tokens_seen": 334339925, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.07763672, "step": 15493, "time_per_iteration": 3.053372621536255 }, { "auxiliary_loss_clip": 0.01231403, "auxiliary_loss_mlp": 0.00213889, "balance_loss_clip": 1.02271199, "balance_loss_mlp": 0.19159693, "epoch": 0.9315496768375169, "flos": 14643653662080.0, "grad_norm": 3.4132665830314646, "language_loss": 0.79786146, "learning_rate": 4.890755917128531e-08, "loss": 0.81231439, "num_input_tokens_seen": 334357225, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.22302246, "step": 15494, "time_per_iteration": 2.6551594734191895 }, { "auxiliary_loss_clip": 0.01244472, "auxiliary_loss_mlp": 0.00236924, "balance_loss_clip": 1.02683342, "balance_loss_mlp": 0.21171159, "epoch": 0.9316098000901849, "flos": 28329928174080.0, "grad_norm": 2.2123506609612127, "language_loss": 0.75733805, "learning_rate": 4.882199467373671e-08, "loss": 0.77215207, "num_input_tokens_seen": 334375945, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.2520752, "step": 15495, "time_per_iteration": 2.7256836891174316 }, { "auxiliary_loss_clip": 0.0122573, "auxiliary_loss_mlp": 0.00206538, "balance_loss_clip": 1.01742911, "balance_loss_mlp": 0.18387654, "epoch": 0.9316699233428528, "flos": 28512677594880.0, "grad_norm": 3.6845568476541923, "language_loss": 0.69156337, "learning_rate": 4.8736504165053815e-08, "loss": 0.70588607, "num_input_tokens_seen": 334395310, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.22668457, "step": 15496, "time_per_iteration": 2.8016929626464844 }, { "auxiliary_loss_clip": 0.01242709, "auxiliary_loss_mlp": 0.0021414, "balance_loss_clip": 1.02077127, "balance_loss_mlp": 0.18887925, "epoch": 0.9317300465955208, "flos": 33693402061440.0, "grad_norm": 9.005002246276026, "language_loss": 0.8419674, "learning_rate": 4.865108764847825e-08, "loss": 0.85653591, "num_input_tokens_seen": 334416965, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25256348, "step": 15497, "time_per_iteration": 2.7524819374084473 }, { "auxiliary_loss_clip": 0.01250715, "auxiliary_loss_mlp": 0.00236085, "balance_loss_clip": 1.03259695, "balance_loss_mlp": 0.21106258, "epoch": 0.9317901698481887, "flos": 23658237296640.0, "grad_norm": 15.880221765807132, "language_loss": 0.73605359, "learning_rate": 4.856574512724898e-08, "loss": 0.75092149, "num_input_tokens_seen": 334435620, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25012207, "step": 15498, "time_per_iteration": 2.646432876586914 }, { "auxiliary_loss_clip": 0.01229678, "auxiliary_loss_mlp": 0.00224613, "balance_loss_clip": 1.01724124, "balance_loss_mlp": 0.19966222, "epoch": 0.9318502931008568, "flos": 20960017499520.0, "grad_norm": 32.044361141111786, "language_loss": 0.8720597, "learning_rate": 4.8480476604602305e-08, "loss": 0.88660264, "num_input_tokens_seen": 334456210, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.24951172, "step": 15499, "time_per_iteration": 2.722696542739868 }, { "auxiliary_loss_clip": 0.01221112, "auxiliary_loss_mlp": 0.00237041, "balance_loss_clip": 1.01025677, "balance_loss_mlp": 0.21273404, "epoch": 0.9319104163535247, "flos": 23441049711360.0, "grad_norm": 9.168177780059693, "language_loss": 0.83027387, "learning_rate": 4.8395282083771196e-08, "loss": 0.84485543, "num_input_tokens_seen": 334475485, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.24304199, "step": 15500, "time_per_iteration": 2.718877077102661 }, { "auxiliary_loss_clip": 0.01220328, "auxiliary_loss_mlp": 0.00226831, "balance_loss_clip": 1.01140952, "balance_loss_mlp": 0.20279843, "epoch": 0.9319705396061927, "flos": 22347426274560.0, "grad_norm": 16.492030218114675, "language_loss": 0.80968034, "learning_rate": 4.8310161567987064e-08, "loss": 0.82415193, "num_input_tokens_seen": 334494740, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.24047852, "step": 15501, "time_per_iteration": 2.7366244792938232 }, { "auxiliary_loss_clip": 0.0124354, "auxiliary_loss_mlp": 0.00220032, "balance_loss_clip": 1.02674437, "balance_loss_mlp": 0.19539149, "epoch": 0.9320306628588607, "flos": 20993557824000.0, "grad_norm": 51.83051139109661, "language_loss": 0.75780427, "learning_rate": 4.822511506047666e-08, "loss": 0.77244002, "num_input_tokens_seen": 334511910, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24621582, "step": 15502, "time_per_iteration": 2.6186678409576416 }, { "auxiliary_loss_clip": 0.012387, "auxiliary_loss_mlp": 0.00230005, "balance_loss_clip": 1.02341819, "balance_loss_mlp": 0.20678331, "epoch": 0.9320907861115286, "flos": 24538300421760.0, "grad_norm": 51.65767327422452, "language_loss": 0.72489476, "learning_rate": 4.814014256446586e-08, "loss": 0.73958182, "num_input_tokens_seen": 334533150, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.23254395, "step": 15503, "time_per_iteration": 2.704935073852539 }, { "auxiliary_loss_clip": 0.01270001, "auxiliary_loss_mlp": 0.00221947, "balance_loss_clip": 1.04071879, "balance_loss_mlp": 0.19584017, "epoch": 0.9321509093641966, "flos": 19785414850560.0, "grad_norm": 9.618496916442938, "language_loss": 0.84545398, "learning_rate": 4.805524408317652e-08, "loss": 0.8603735, "num_input_tokens_seen": 334550940, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.26123047, "step": 15504, "time_per_iteration": 4.061170816421509 }, { "auxiliary_loss_clip": 0.01256987, "auxiliary_loss_mlp": 0.00243746, "balance_loss_clip": 1.03354073, "balance_loss_mlp": 0.21788917, "epoch": 0.9322110326168646, "flos": 24972675592320.0, "grad_norm": 5.905805331187033, "language_loss": 0.77954328, "learning_rate": 4.797041961982762e-08, "loss": 0.79455066, "num_input_tokens_seen": 334570935, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25878906, "step": 15505, "time_per_iteration": 4.122279167175293 }, { "auxiliary_loss_clip": 0.01233726, "auxiliary_loss_mlp": 0.00202062, "balance_loss_clip": 1.0185709, "balance_loss_mlp": 0.17701565, "epoch": 0.9322711558695326, "flos": 16143642639360.0, "grad_norm": 5.6433078891458655, "language_loss": 0.8498345, "learning_rate": 4.788566917763614e-08, "loss": 0.86419237, "num_input_tokens_seen": 334589315, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.25073242, "step": 15506, "time_per_iteration": 2.6875457763671875 }, { "auxiliary_loss_clip": 0.01225181, "auxiliary_loss_mlp": 0.00212231, "balance_loss_clip": 1.01530147, "balance_loss_mlp": 0.18775725, "epoch": 0.9323312791222005, "flos": 23732428838400.0, "grad_norm": 10.882925030968346, "language_loss": 0.91896451, "learning_rate": 4.780099275981597e-08, "loss": 0.93333858, "num_input_tokens_seen": 334608990, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.24450684, "step": 15507, "time_per_iteration": 2.7391045093536377 }, { "auxiliary_loss_clip": 0.01235301, "auxiliary_loss_mlp": 0.00222746, "balance_loss_clip": 1.01846838, "balance_loss_mlp": 0.19795053, "epoch": 0.9323914023748685, "flos": 20777914523520.0, "grad_norm": 18.52536809762023, "language_loss": 0.75559568, "learning_rate": 4.771639036957742e-08, "loss": 0.77017617, "num_input_tokens_seen": 334628655, "router_z_loss_clip": 2.17089844, "router_z_loss_mlp": 0.24768066, "step": 15508, "time_per_iteration": 2.654867649078369 }, { "auxiliary_loss_clip": 0.01230641, "auxiliary_loss_mlp": 0.00224501, "balance_loss_clip": 1.01589751, "balance_loss_mlp": 0.19927633, "epoch": 0.9324515256275364, "flos": 23915178259200.0, "grad_norm": 2.9002203082277833, "language_loss": 0.79565138, "learning_rate": 4.7631862010129033e-08, "loss": 0.81020278, "num_input_tokens_seen": 334648295, "router_z_loss_clip": 2.14550781, "router_z_loss_mlp": 0.25219727, "step": 15509, "time_per_iteration": 2.713151454925537 }, { "auxiliary_loss_clip": 0.01235252, "auxiliary_loss_mlp": 0.00207212, "balance_loss_clip": 1.02074277, "balance_loss_mlp": 0.18373968, "epoch": 0.9325116488802044, "flos": 18005215875840.0, "grad_norm": 12.066863616672919, "language_loss": 0.8277154, "learning_rate": 4.754740768467624e-08, "loss": 0.84214008, "num_input_tokens_seen": 334666280, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.23498535, "step": 15510, "time_per_iteration": 2.6322970390319824 }, { "auxiliary_loss_clip": 0.0125055, "auxiliary_loss_mlp": 0.00227269, "balance_loss_clip": 1.02875185, "balance_loss_mlp": 0.20218728, "epoch": 0.9325717721328723, "flos": 29021603443200.0, "grad_norm": 17.47599519994485, "language_loss": 0.7764684, "learning_rate": 4.746302739642161e-08, "loss": 0.79124653, "num_input_tokens_seen": 334688830, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25073242, "step": 15511, "time_per_iteration": 2.7939913272857666 }, { "auxiliary_loss_clip": 0.01230038, "auxiliary_loss_mlp": 0.00215744, "balance_loss_clip": 1.01755381, "balance_loss_mlp": 0.19349989, "epoch": 0.9326318953855404, "flos": 21646341642240.0, "grad_norm": 8.00560178063444, "language_loss": 0.84622926, "learning_rate": 4.737872114856412e-08, "loss": 0.86068714, "num_input_tokens_seen": 334705205, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.22241211, "step": 15512, "time_per_iteration": 4.22195839881897 }, { "auxiliary_loss_clip": 0.01248631, "auxiliary_loss_mlp": 0.00230279, "balance_loss_clip": 1.02797878, "balance_loss_mlp": 0.20592441, "epoch": 0.9326920186382083, "flos": 26065724411520.0, "grad_norm": 19.61232645726312, "language_loss": 0.8652485, "learning_rate": 4.7294488944301436e-08, "loss": 0.88003761, "num_input_tokens_seen": 334723830, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24365234, "step": 15513, "time_per_iteration": 2.6946003437042236 }, { "auxiliary_loss_clip": 0.01256047, "auxiliary_loss_mlp": 0.00225027, "balance_loss_clip": 1.03334332, "balance_loss_mlp": 0.19895561, "epoch": 0.9327521418908763, "flos": 12057116227200.0, "grad_norm": 31.560437466450825, "language_loss": 0.90314716, "learning_rate": 4.721033078682768e-08, "loss": 0.9179579, "num_input_tokens_seen": 334740825, "router_z_loss_clip": 2.22558594, "router_z_loss_mlp": 0.26086426, "step": 15514, "time_per_iteration": 2.641972064971924 }, { "auxiliary_loss_clip": 0.01245659, "auxiliary_loss_mlp": 0.00221297, "balance_loss_clip": 1.03095388, "balance_loss_mlp": 0.1964661, "epoch": 0.9328122651435443, "flos": 43834395271680.0, "grad_norm": 3.687854607931068, "language_loss": 0.77548623, "learning_rate": 4.7126246679333626e-08, "loss": 0.79015583, "num_input_tokens_seen": 334765825, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24804688, "step": 15515, "time_per_iteration": 2.8462512493133545 }, { "auxiliary_loss_clip": 0.01260853, "auxiliary_loss_mlp": 0.0022578, "balance_loss_clip": 1.0389564, "balance_loss_mlp": 0.19932747, "epoch": 0.9328723883962122, "flos": 15194954580480.0, "grad_norm": 4.204943427560678, "language_loss": 0.91693336, "learning_rate": 4.704223662500806e-08, "loss": 0.93179965, "num_input_tokens_seen": 334782680, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.26452637, "step": 15516, "time_per_iteration": 4.080990791320801 }, { "auxiliary_loss_clip": 0.01245114, "auxiliary_loss_mlp": 0.00231733, "balance_loss_clip": 1.02424598, "balance_loss_mlp": 0.20760551, "epoch": 0.9329325116488802, "flos": 20261770041600.0, "grad_norm": 11.559954983735405, "language_loss": 0.88071436, "learning_rate": 4.695830062703643e-08, "loss": 0.89548278, "num_input_tokens_seen": 334800160, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24121094, "step": 15517, "time_per_iteration": 2.7151999473571777 }, { "auxiliary_loss_clip": 0.01248165, "auxiliary_loss_mlp": 0.00231828, "balance_loss_clip": 1.02773428, "balance_loss_mlp": 0.20376575, "epoch": 0.9329926349015482, "flos": 13115008609920.0, "grad_norm": 61.327149960495774, "language_loss": 0.84014827, "learning_rate": 4.687443868860219e-08, "loss": 0.85494816, "num_input_tokens_seen": 334815840, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.28063965, "step": 15518, "time_per_iteration": 2.599527359008789 }, { "auxiliary_loss_clip": 0.01221871, "auxiliary_loss_mlp": 0.00214335, "balance_loss_clip": 1.01233196, "balance_loss_mlp": 0.19087471, "epoch": 0.9330527581542162, "flos": 23040250778880.0, "grad_norm": 12.27627608655398, "language_loss": 0.84600842, "learning_rate": 4.679065081288458e-08, "loss": 0.86037046, "num_input_tokens_seen": 334834735, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.23474121, "step": 15519, "time_per_iteration": 2.6852145195007324 }, { "auxiliary_loss_clip": 0.01232684, "auxiliary_loss_mlp": 0.00207922, "balance_loss_clip": 1.01802492, "balance_loss_mlp": 0.184247, "epoch": 0.9331128814068841, "flos": 15559627409280.0, "grad_norm": 13.316388415540754, "language_loss": 0.91839033, "learning_rate": 4.6706937003061275e-08, "loss": 0.93279648, "num_input_tokens_seen": 334853490, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.23693848, "step": 15520, "time_per_iteration": 2.606900930404663 }, { "auxiliary_loss_clip": 0.01224766, "auxiliary_loss_mlp": 0.00216111, "balance_loss_clip": 1.01268375, "balance_loss_mlp": 0.19100554, "epoch": 0.9331730046595521, "flos": 22271762275200.0, "grad_norm": 4.387365544225445, "language_loss": 0.83864427, "learning_rate": 4.6623297262306846e-08, "loss": 0.85305303, "num_input_tokens_seen": 334873675, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.25097656, "step": 15521, "time_per_iteration": 2.6989078521728516 }, { "auxiliary_loss_clip": 0.0125224, "auxiliary_loss_mlp": 0.00223274, "balance_loss_clip": 1.03495526, "balance_loss_mlp": 0.19961125, "epoch": 0.93323312791222, "flos": 15777641007360.0, "grad_norm": 26.049468875339286, "language_loss": 0.84981799, "learning_rate": 4.6539731593792545e-08, "loss": 0.86457312, "num_input_tokens_seen": 334890970, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.23669434, "step": 15522, "time_per_iteration": 2.6587812900543213 }, { "auxiliary_loss_clip": 0.01248878, "auxiliary_loss_mlp": 0.00232952, "balance_loss_clip": 1.02317739, "balance_loss_mlp": 0.20462805, "epoch": 0.933293251164888, "flos": 22010978557440.0, "grad_norm": 19.39291013424816, "language_loss": 0.73706931, "learning_rate": 4.6456240000687373e-08, "loss": 0.75188762, "num_input_tokens_seen": 334906635, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.2833252, "step": 15523, "time_per_iteration": 2.68953537940979 }, { "auxiliary_loss_clip": 0.01242004, "auxiliary_loss_mlp": 0.00236803, "balance_loss_clip": 1.02637148, "balance_loss_mlp": 0.20921823, "epoch": 0.933353374417556, "flos": 26031358074240.0, "grad_norm": 76.22521592717139, "language_loss": 0.76272827, "learning_rate": 4.63728224861577e-08, "loss": 0.77751637, "num_input_tokens_seen": 334926230, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.27575684, "step": 15524, "time_per_iteration": 2.7391645908355713 }, { "auxiliary_loss_clip": 0.01254464, "auxiliary_loss_mlp": 0.0025468, "balance_loss_clip": 1.03059983, "balance_loss_mlp": 0.22682112, "epoch": 0.933413497670224, "flos": 24900100162560.0, "grad_norm": 15.508547714207198, "language_loss": 0.80024397, "learning_rate": 4.628947905336589e-08, "loss": 0.81533539, "num_input_tokens_seen": 334946680, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.27844238, "step": 15525, "time_per_iteration": 2.8457634449005127 }, { "auxiliary_loss_clip": 0.01243214, "auxiliary_loss_mlp": 0.00231414, "balance_loss_clip": 1.02434111, "balance_loss_mlp": 0.20595071, "epoch": 0.9334736209228919, "flos": 23688689051520.0, "grad_norm": 15.468032369957692, "language_loss": 0.90071893, "learning_rate": 4.6206209705473175e-08, "loss": 0.91546524, "num_input_tokens_seen": 334964785, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.2545166, "step": 15526, "time_per_iteration": 2.716691732406616 }, { "auxiliary_loss_clip": 0.01233928, "auxiliary_loss_mlp": 0.00205292, "balance_loss_clip": 1.01618671, "balance_loss_mlp": 0.18174858, "epoch": 0.9335337441755599, "flos": 15377344865280.0, "grad_norm": 36.61411475946276, "language_loss": 0.77313459, "learning_rate": 4.61230144456366e-08, "loss": 0.78752685, "num_input_tokens_seen": 334982400, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.2355957, "step": 15527, "time_per_iteration": 2.6246745586395264 }, { "auxiliary_loss_clip": 0.01257442, "auxiliary_loss_mlp": 0.0022481, "balance_loss_clip": 1.02996945, "balance_loss_mlp": 0.1992994, "epoch": 0.9335938674282279, "flos": 16106726436480.0, "grad_norm": 59.26182965531216, "language_loss": 0.75865006, "learning_rate": 4.603989327701141e-08, "loss": 0.77347261, "num_input_tokens_seen": 334999685, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.25512695, "step": 15528, "time_per_iteration": 2.709789514541626 }, { "auxiliary_loss_clip": 0.0125258, "auxiliary_loss_mlp": 0.00223806, "balance_loss_clip": 1.02966237, "balance_loss_mlp": 0.19791359, "epoch": 0.9336539906808958, "flos": 18952898353920.0, "grad_norm": 49.28586282885602, "language_loss": 0.84202397, "learning_rate": 4.5956846202748867e-08, "loss": 0.8567878, "num_input_tokens_seen": 335019160, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25878906, "step": 15529, "time_per_iteration": 2.641139268875122 }, { "auxiliary_loss_clip": 0.01226716, "auxiliary_loss_mlp": 0.0021441, "balance_loss_clip": 1.0143379, "balance_loss_mlp": 0.18962659, "epoch": 0.9337141139335638, "flos": 18109104986880.0, "grad_norm": 5.2112185238403494, "language_loss": 0.69594067, "learning_rate": 4.5873873225998674e-08, "loss": 0.71035194, "num_input_tokens_seen": 335037350, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.24804688, "step": 15530, "time_per_iteration": 2.670599937438965 }, { "auxiliary_loss_clip": 0.01218967, "auxiliary_loss_mlp": 0.00208297, "balance_loss_clip": 1.00858235, "balance_loss_mlp": 0.18338257, "epoch": 0.9337742371862318, "flos": 17345716214400.0, "grad_norm": 14.006072876486176, "language_loss": 0.80056256, "learning_rate": 4.5790974349907194e-08, "loss": 0.81483519, "num_input_tokens_seen": 335056060, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.24902344, "step": 15531, "time_per_iteration": 2.6670663356781006 }, { "auxiliary_loss_clip": 0.01245819, "auxiliary_loss_mlp": 0.00240735, "balance_loss_clip": 1.02840185, "balance_loss_mlp": 0.21464014, "epoch": 0.9338343604388998, "flos": 29058986522880.0, "grad_norm": 350.9700647075944, "language_loss": 0.78348601, "learning_rate": 4.5708149577617925e-08, "loss": 0.79835153, "num_input_tokens_seen": 335075410, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.26098633, "step": 15532, "time_per_iteration": 2.7141005992889404 }, { "auxiliary_loss_clip": 0.0124342, "auxiliary_loss_mlp": 0.00202897, "balance_loss_clip": 1.02464247, "balance_loss_mlp": 0.1773148, "epoch": 0.9338944836915677, "flos": 18660908695680.0, "grad_norm": 2.4038024899809955, "language_loss": 0.79661453, "learning_rate": 4.5625398912271016e-08, "loss": 0.81107771, "num_input_tokens_seen": 335095190, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25598145, "step": 15533, "time_per_iteration": 2.6496822834014893 }, { "auxiliary_loss_clip": 0.01230257, "auxiliary_loss_mlp": 0.00216423, "balance_loss_clip": 1.02264214, "balance_loss_mlp": 0.19299865, "epoch": 0.9339546069442357, "flos": 16617735273600.0, "grad_norm": 14.54597768126562, "language_loss": 0.88832355, "learning_rate": 4.554272235700507e-08, "loss": 0.90279043, "num_input_tokens_seen": 335113825, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.23413086, "step": 15534, "time_per_iteration": 2.6481070518493652 }, { "auxiliary_loss_clip": 0.01210589, "auxiliary_loss_mlp": 0.00217563, "balance_loss_clip": 1.00770319, "balance_loss_mlp": 0.19670171, "epoch": 0.9340147301969036, "flos": 23693106424320.0, "grad_norm": 39.9084949307438, "language_loss": 0.81992388, "learning_rate": 4.546011991495513e-08, "loss": 0.83420539, "num_input_tokens_seen": 335136425, "router_z_loss_clip": 2.02832031, "router_z_loss_mlp": 0.20861816, "step": 15535, "time_per_iteration": 2.663069009780884 }, { "auxiliary_loss_clip": 0.0124136, "auxiliary_loss_mlp": 0.00227957, "balance_loss_clip": 1.02724993, "balance_loss_mlp": 0.20425861, "epoch": 0.9340748534495716, "flos": 28654452576000.0, "grad_norm": 4.519069718895035, "language_loss": 0.85826504, "learning_rate": 4.537759158925292e-08, "loss": 0.87295818, "num_input_tokens_seen": 335157925, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.23718262, "step": 15536, "time_per_iteration": 2.7499163150787354 }, { "auxiliary_loss_clip": 0.01239147, "auxiliary_loss_mlp": 0.00231833, "balance_loss_clip": 1.0244782, "balance_loss_mlp": 0.20658422, "epoch": 0.9341349767022396, "flos": 24899633285760.0, "grad_norm": 9.236322965020197, "language_loss": 0.87169814, "learning_rate": 4.5295137383028593e-08, "loss": 0.88640785, "num_input_tokens_seen": 335177840, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.25256348, "step": 15537, "time_per_iteration": 2.6641745567321777 }, { "auxiliary_loss_clip": 0.01247896, "auxiliary_loss_mlp": 0.00216095, "balance_loss_clip": 1.03049016, "balance_loss_mlp": 0.1923603, "epoch": 0.9341950999549076, "flos": 29059525226880.0, "grad_norm": 11.83637431301546, "language_loss": 0.8600992, "learning_rate": 4.5212757299408764e-08, "loss": 0.87473911, "num_input_tokens_seen": 335199470, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.23718262, "step": 15538, "time_per_iteration": 2.7360665798187256 }, { "auxiliary_loss_clip": 0.01223555, "auxiliary_loss_mlp": 0.00237166, "balance_loss_clip": 1.01377082, "balance_loss_mlp": 0.21185784, "epoch": 0.9342552232075755, "flos": 23587062497280.0, "grad_norm": 14.040589423277787, "language_loss": 0.80585945, "learning_rate": 4.513045134151672e-08, "loss": 0.82046664, "num_input_tokens_seen": 335218885, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.2532959, "step": 15539, "time_per_iteration": 2.6802611351013184 }, { "auxiliary_loss_clip": 0.01233453, "auxiliary_loss_mlp": 0.00224281, "balance_loss_clip": 1.0204258, "balance_loss_mlp": 0.2002241, "epoch": 0.9343153464602435, "flos": 36721389646080.0, "grad_norm": 3.909132210147867, "language_loss": 0.72396731, "learning_rate": 4.504821951247373e-08, "loss": 0.7385447, "num_input_tokens_seen": 335239485, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.24047852, "step": 15540, "time_per_iteration": 2.7938761711120605 }, { "auxiliary_loss_clip": 0.01243459, "auxiliary_loss_mlp": 0.00241075, "balance_loss_clip": 1.02693975, "balance_loss_mlp": 0.21596974, "epoch": 0.9343754697129115, "flos": 22236498097920.0, "grad_norm": 21.168169693624638, "language_loss": 0.80411768, "learning_rate": 4.496606181539864e-08, "loss": 0.81896305, "num_input_tokens_seen": 335258355, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.2512207, "step": 15541, "time_per_iteration": 2.632828712463379 }, { "auxiliary_loss_clip": 0.01242288, "auxiliary_loss_mlp": 0.00222963, "balance_loss_clip": 1.02550721, "balance_loss_mlp": 0.19798848, "epoch": 0.9344355929655794, "flos": 29710333797120.0, "grad_norm": 369.3265493594942, "language_loss": 0.76219708, "learning_rate": 4.4883978253406066e-08, "loss": 0.77684957, "num_input_tokens_seen": 335276835, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24975586, "step": 15542, "time_per_iteration": 2.7179527282714844 }, { "auxiliary_loss_clip": 0.01238268, "auxiliary_loss_mlp": 0.00233228, "balance_loss_clip": 1.02498233, "balance_loss_mlp": 0.20938663, "epoch": 0.9344957162182475, "flos": 18880394751360.0, "grad_norm": 24.40880566007105, "language_loss": 0.77448571, "learning_rate": 4.480196882960907e-08, "loss": 0.78920066, "num_input_tokens_seen": 335296220, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23840332, "step": 15543, "time_per_iteration": 2.6407833099365234 }, { "auxiliary_loss_clip": 0.01261871, "auxiliary_loss_mlp": 0.00217431, "balance_loss_clip": 1.03780186, "balance_loss_mlp": 0.19164571, "epoch": 0.9345558394709154, "flos": 27417761268480.0, "grad_norm": 7.796354761430544, "language_loss": 0.78977919, "learning_rate": 4.4720033547117394e-08, "loss": 0.80457217, "num_input_tokens_seen": 335316335, "router_z_loss_clip": 2.23925781, "router_z_loss_mlp": 0.2578125, "step": 15544, "time_per_iteration": 2.705824613571167 }, { "auxiliary_loss_clip": 0.01251484, "auxiliary_loss_mlp": 0.00228718, "balance_loss_clip": 1.03190482, "balance_loss_mlp": 0.20321959, "epoch": 0.9346159627235834, "flos": 20741285629440.0, "grad_norm": 3.8186101469634823, "language_loss": 0.86884946, "learning_rate": 4.463817240903789e-08, "loss": 0.88365149, "num_input_tokens_seen": 335335545, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25512695, "step": 15545, "time_per_iteration": 2.667757034301758 }, { "auxiliary_loss_clip": 0.01247583, "auxiliary_loss_mlp": 0.00224977, "balance_loss_clip": 1.0273304, "balance_loss_mlp": 0.2004915, "epoch": 0.9346760859762513, "flos": 21069221823360.0, "grad_norm": 3.57802594515728, "language_loss": 0.78580326, "learning_rate": 4.455638541847495e-08, "loss": 0.80052888, "num_input_tokens_seen": 335355350, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.24487305, "step": 15546, "time_per_iteration": 2.661632776260376 }, { "auxiliary_loss_clip": 0.01223394, "auxiliary_loss_mlp": 0.00224726, "balance_loss_clip": 1.01364005, "balance_loss_mlp": 0.20027612, "epoch": 0.9347362092289193, "flos": 29204927481600.0, "grad_norm": 3.088034252620135, "language_loss": 0.88482535, "learning_rate": 4.447467257852966e-08, "loss": 0.8993066, "num_input_tokens_seen": 335375160, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.24475098, "step": 15547, "time_per_iteration": 5.526913404464722 }, { "auxiliary_loss_clip": 0.01216998, "auxiliary_loss_mlp": 0.00222849, "balance_loss_clip": 1.01392639, "balance_loss_mlp": 0.2009626, "epoch": 0.9347963324815872, "flos": 19427350124160.0, "grad_norm": 9.51847517466302, "language_loss": 0.91040492, "learning_rate": 4.439303389230087e-08, "loss": 0.92480338, "num_input_tokens_seen": 335394080, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.21887207, "step": 15548, "time_per_iteration": 2.652658700942993 }, { "auxiliary_loss_clip": 0.01267922, "auxiliary_loss_mlp": 0.0023947, "balance_loss_clip": 1.03837454, "balance_loss_mlp": 0.21330363, "epoch": 0.9348564557342552, "flos": 36901840596480.0, "grad_norm": 7.947792243132131, "language_loss": 0.73651099, "learning_rate": 4.4311469362884326e-08, "loss": 0.75158489, "num_input_tokens_seen": 335414230, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.26171875, "step": 15549, "time_per_iteration": 2.8802950382232666 }, { "auxiliary_loss_clip": 0.01253469, "auxiliary_loss_mlp": 0.0023129, "balance_loss_clip": 1.03775668, "balance_loss_mlp": 0.20684013, "epoch": 0.9349165789869232, "flos": 21690117342720.0, "grad_norm": 6.7874475544550315, "language_loss": 0.87089157, "learning_rate": 4.4229978993372665e-08, "loss": 0.88573909, "num_input_tokens_seen": 335432890, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.24462891, "step": 15550, "time_per_iteration": 2.6804585456848145 }, { "auxiliary_loss_clip": 0.01233588, "auxiliary_loss_mlp": 0.00217591, "balance_loss_clip": 1.01946604, "balance_loss_mlp": 0.19353443, "epoch": 0.9349767022395912, "flos": 18844053166080.0, "grad_norm": 61.962439854561005, "language_loss": 0.82515919, "learning_rate": 4.4148562786856524e-08, "loss": 0.83967102, "num_input_tokens_seen": 335452085, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.24072266, "step": 15551, "time_per_iteration": 2.7184085845947266 }, { "auxiliary_loss_clip": 0.01218079, "auxiliary_loss_mlp": 0.00220913, "balance_loss_clip": 1.01544821, "balance_loss_mlp": 0.19945522, "epoch": 0.9350368254922591, "flos": 24973429777920.0, "grad_norm": 3.953087860862921, "language_loss": 0.80305195, "learning_rate": 4.406722074642255e-08, "loss": 0.81744182, "num_input_tokens_seen": 335472130, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.21459961, "step": 15552, "time_per_iteration": 2.757685422897339 }, { "auxiliary_loss_clip": 0.01246181, "auxiliary_loss_mlp": 0.00226797, "balance_loss_clip": 1.02892375, "balance_loss_mlp": 0.20157282, "epoch": 0.9350969487449271, "flos": 23070594792960.0, "grad_norm": 6.023764845713289, "language_loss": 0.85063016, "learning_rate": 4.3985952875155386e-08, "loss": 0.8653599, "num_input_tokens_seen": 335489970, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25256348, "step": 15553, "time_per_iteration": 2.681281805038452 }, { "auxiliary_loss_clip": 0.01246224, "auxiliary_loss_mlp": 0.00248471, "balance_loss_clip": 1.02575111, "balance_loss_mlp": 0.22058779, "epoch": 0.9351570719975951, "flos": 18625177641600.0, "grad_norm": 2.8859647275255322, "language_loss": 0.87742686, "learning_rate": 4.390475917613723e-08, "loss": 0.8923738, "num_input_tokens_seen": 335509125, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.27868652, "step": 15554, "time_per_iteration": 2.660900831222534 }, { "auxiliary_loss_clip": 0.01217031, "auxiliary_loss_mlp": 0.00236714, "balance_loss_clip": 1.01236379, "balance_loss_mlp": 0.21451716, "epoch": 0.935217195250263, "flos": 15888353702400.0, "grad_norm": 46.62174090583487, "language_loss": 0.76082867, "learning_rate": 4.382363965244695e-08, "loss": 0.77536613, "num_input_tokens_seen": 335525620, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.22192383, "step": 15555, "time_per_iteration": 4.031850337982178 }, { "auxiliary_loss_clip": 0.01247607, "auxiliary_loss_mlp": 0.00220453, "balance_loss_clip": 1.03314972, "balance_loss_mlp": 0.19469154, "epoch": 0.935277318502931, "flos": 24390312387840.0, "grad_norm": 2.957483627653401, "language_loss": 0.81505609, "learning_rate": 4.374259430715965e-08, "loss": 0.82973665, "num_input_tokens_seen": 335547565, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.25756836, "step": 15556, "time_per_iteration": 2.669058322906494 }, { "auxiliary_loss_clip": 0.0124056, "auxiliary_loss_mlp": 0.00220572, "balance_loss_clip": 1.02509272, "balance_loss_mlp": 0.19535917, "epoch": 0.935337441755599, "flos": 27600259294080.0, "grad_norm": 37.4779983918222, "language_loss": 0.82186794, "learning_rate": 4.366162314334953e-08, "loss": 0.83647931, "num_input_tokens_seen": 335570285, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.25219727, "step": 15557, "time_per_iteration": 2.736241340637207 }, { "auxiliary_loss_clip": 0.01254665, "auxiliary_loss_mlp": 0.00240419, "balance_loss_clip": 1.03342152, "balance_loss_mlp": 0.21348988, "epoch": 0.935397565008267, "flos": 20482872209280.0, "grad_norm": 15.154014205114041, "language_loss": 0.6955837, "learning_rate": 4.358072616408681e-08, "loss": 0.71053451, "num_input_tokens_seen": 335588600, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26940918, "step": 15558, "time_per_iteration": 3.998095750808716 }, { "auxiliary_loss_clip": 0.01247285, "auxiliary_loss_mlp": 0.00228277, "balance_loss_clip": 1.02841163, "balance_loss_mlp": 0.20385063, "epoch": 0.9354576882609349, "flos": 23654394541440.0, "grad_norm": 54.270410743296786, "language_loss": 0.80125457, "learning_rate": 4.34999033724388e-08, "loss": 0.81601024, "num_input_tokens_seen": 335606235, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.24438477, "step": 15559, "time_per_iteration": 2.625577926635742 }, { "auxiliary_loss_clip": 0.01229919, "auxiliary_loss_mlp": 0.00215407, "balance_loss_clip": 1.01653337, "balance_loss_mlp": 0.1925907, "epoch": 0.9355178115136029, "flos": 36684904406400.0, "grad_norm": 131.66655623220962, "language_loss": 0.71224546, "learning_rate": 4.341915477147062e-08, "loss": 0.72669876, "num_input_tokens_seen": 335628240, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.22802734, "step": 15560, "time_per_iteration": 2.8129944801330566 }, { "auxiliary_loss_clip": 0.01288174, "auxiliary_loss_mlp": 0.00258969, "balance_loss_clip": 1.05146217, "balance_loss_mlp": 0.22991754, "epoch": 0.9355779347662708, "flos": 14460401450880.0, "grad_norm": 102.84219749090408, "language_loss": 0.74849677, "learning_rate": 4.3338480364244034e-08, "loss": 0.76396823, "num_input_tokens_seen": 335643755, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.2902832, "step": 15561, "time_per_iteration": 2.570456027984619 }, { "auxiliary_loss_clip": 0.01225772, "auxiliary_loss_mlp": 0.00215949, "balance_loss_clip": 1.01322699, "balance_loss_mlp": 0.19046235, "epoch": 0.9356380580189388, "flos": 23185976256000.0, "grad_norm": 6.653802437921711, "language_loss": 0.83289671, "learning_rate": 4.325788015381859e-08, "loss": 0.84731388, "num_input_tokens_seen": 335665160, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.25500488, "step": 15562, "time_per_iteration": 2.683290719985962 }, { "auxiliary_loss_clip": 0.01097375, "auxiliary_loss_mlp": 0.00084832, "balance_loss_clip": 0.96184158, "balance_loss_mlp": 0.07701227, "epoch": 0.9356981812716068, "flos": 67471626090240.0, "grad_norm": 0.9204581917690923, "language_loss": 0.61278003, "learning_rate": 4.31773541432503e-08, "loss": 0.62460208, "num_input_tokens_seen": 335715240, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.078125, "step": 15563, "time_per_iteration": 3.068084955215454 }, { "auxiliary_loss_clip": 0.01229546, "auxiliary_loss_mlp": 0.0022949, "balance_loss_clip": 1.02093279, "balance_loss_mlp": 0.20445579, "epoch": 0.9357583045242748, "flos": 24681619687680.0, "grad_norm": 35.71172037073797, "language_loss": 0.85818642, "learning_rate": 4.3096902335592714e-08, "loss": 0.87277675, "num_input_tokens_seen": 335734970, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.25012207, "step": 15564, "time_per_iteration": 2.736680269241333 }, { "auxiliary_loss_clip": 0.01246226, "auxiliary_loss_mlp": 0.00226285, "balance_loss_clip": 1.02477789, "balance_loss_mlp": 0.19925982, "epoch": 0.9358184277769427, "flos": 19463727623040.0, "grad_norm": 40.68632055537719, "language_loss": 0.8832193, "learning_rate": 4.301652473389694e-08, "loss": 0.89794439, "num_input_tokens_seen": 335753435, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.27026367, "step": 15565, "time_per_iteration": 2.648409843444824 }, { "auxiliary_loss_clip": 0.01226955, "auxiliary_loss_mlp": 0.00223557, "balance_loss_clip": 1.01460111, "balance_loss_mlp": 0.199274, "epoch": 0.9358785510296107, "flos": 18916987731840.0, "grad_norm": 12.105627541937784, "language_loss": 0.81814492, "learning_rate": 4.2936221341210774e-08, "loss": 0.83265007, "num_input_tokens_seen": 335772105, "router_z_loss_clip": 2.12597656, "router_z_loss_mlp": 0.24267578, "step": 15566, "time_per_iteration": 2.6648855209350586 }, { "auxiliary_loss_clip": 0.0124249, "auxiliary_loss_mlp": 0.00248258, "balance_loss_clip": 1.02351069, "balance_loss_mlp": 0.22230583, "epoch": 0.9359386742822787, "flos": 23441265192960.0, "grad_norm": 10.502668450642062, "language_loss": 0.75556701, "learning_rate": 4.285599216057889e-08, "loss": 0.77047449, "num_input_tokens_seen": 335789125, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25952148, "step": 15567, "time_per_iteration": 2.675158977508545 }, { "auxiliary_loss_clip": 0.01239153, "auxiliary_loss_mlp": 0.00233083, "balance_loss_clip": 1.02288496, "balance_loss_mlp": 0.20826343, "epoch": 0.9359987975349466, "flos": 32744067557760.0, "grad_norm": 57.76794043675989, "language_loss": 0.6951375, "learning_rate": 4.277583719504418e-08, "loss": 0.70985979, "num_input_tokens_seen": 335810995, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.24804688, "step": 15568, "time_per_iteration": 2.7356882095336914 }, { "auxiliary_loss_clip": 0.01229397, "auxiliary_loss_mlp": 0.00224151, "balance_loss_clip": 1.01685834, "balance_loss_mlp": 0.19873545, "epoch": 0.9360589207876147, "flos": 22819651401600.0, "grad_norm": 91.7720085321227, "language_loss": 0.85990536, "learning_rate": 4.269575644764556e-08, "loss": 0.87444085, "num_input_tokens_seen": 335830580, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.25415039, "step": 15569, "time_per_iteration": 2.7031190395355225 }, { "auxiliary_loss_clip": 0.01257587, "auxiliary_loss_mlp": 0.00210591, "balance_loss_clip": 1.03071713, "balance_loss_mlp": 0.18341164, "epoch": 0.9361190440402826, "flos": 20885251340160.0, "grad_norm": 28.16502857516713, "language_loss": 0.8329919, "learning_rate": 4.261574992142014e-08, "loss": 0.84767365, "num_input_tokens_seen": 335846515, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.27185059, "step": 15570, "time_per_iteration": 2.6091902256011963 }, { "auxiliary_loss_clip": 0.01244664, "auxiliary_loss_mlp": 0.00241007, "balance_loss_clip": 1.02232325, "balance_loss_mlp": 0.21671197, "epoch": 0.9361791672929506, "flos": 19317822577920.0, "grad_norm": 2.6804914802671735, "language_loss": 0.8715207, "learning_rate": 4.2535817619401726e-08, "loss": 0.88637745, "num_input_tokens_seen": 335863350, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.24291992, "step": 15571, "time_per_iteration": 2.686521053314209 }, { "auxiliary_loss_clip": 0.01229389, "auxiliary_loss_mlp": 0.00233028, "balance_loss_clip": 1.014678, "balance_loss_mlp": 0.20880422, "epoch": 0.9362392905456185, "flos": 15158182032000.0, "grad_norm": 67.00881105714546, "language_loss": 0.83190626, "learning_rate": 4.2455959544621224e-08, "loss": 0.84653044, "num_input_tokens_seen": 335880510, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24206543, "step": 15572, "time_per_iteration": 2.729785203933716 }, { "auxiliary_loss_clip": 0.0121357, "auxiliary_loss_mlp": 0.00227544, "balance_loss_clip": 1.00843155, "balance_loss_mlp": 0.20288, "epoch": 0.9362994137982865, "flos": 22085888371200.0, "grad_norm": 81.56616342439204, "language_loss": 0.83900851, "learning_rate": 4.237617570010688e-08, "loss": 0.85341966, "num_input_tokens_seen": 335899440, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.24682617, "step": 15573, "time_per_iteration": 2.7123303413391113 }, { "auxiliary_loss_clip": 0.01235444, "auxiliary_loss_mlp": 0.0023282, "balance_loss_clip": 1.02573013, "balance_loss_mlp": 0.20970556, "epoch": 0.9363595370509544, "flos": 23512260424320.0, "grad_norm": 6.776627446798069, "language_loss": 0.80763531, "learning_rate": 4.2296466088884044e-08, "loss": 0.82231796, "num_input_tokens_seen": 335919540, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.23120117, "step": 15574, "time_per_iteration": 2.63759708404541 }, { "auxiliary_loss_clip": 0.01231699, "auxiliary_loss_mlp": 0.00229333, "balance_loss_clip": 1.01854086, "balance_loss_mlp": 0.20414406, "epoch": 0.9364196603036224, "flos": 27123473139840.0, "grad_norm": 2.9230057169037162, "language_loss": 0.77109325, "learning_rate": 4.221683071397564e-08, "loss": 0.78570354, "num_input_tokens_seen": 335939665, "router_z_loss_clip": 2.13378906, "router_z_loss_mlp": 0.25170898, "step": 15575, "time_per_iteration": 2.7454733848571777 }, { "auxiliary_loss_clip": 0.01226149, "auxiliary_loss_mlp": 0.00228041, "balance_loss_clip": 1.01702785, "balance_loss_mlp": 0.2038776, "epoch": 0.9364797835562904, "flos": 18479057114880.0, "grad_norm": 14.086532551305359, "language_loss": 0.73572528, "learning_rate": 4.2137269578401026e-08, "loss": 0.75026721, "num_input_tokens_seen": 335958580, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.24157715, "step": 15576, "time_per_iteration": 2.66841983795166 }, { "auxiliary_loss_clip": 0.01253144, "auxiliary_loss_mlp": 0.00258524, "balance_loss_clip": 1.03214765, "balance_loss_mlp": 0.23111805, "epoch": 0.9365399068089584, "flos": 13005552890880.0, "grad_norm": 6.22552093588715, "language_loss": 0.84477103, "learning_rate": 4.2057782685177566e-08, "loss": 0.85988772, "num_input_tokens_seen": 335974965, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.27416992, "step": 15577, "time_per_iteration": 2.6386115550994873 }, { "auxiliary_loss_clip": 0.01249341, "auxiliary_loss_mlp": 0.00231461, "balance_loss_clip": 1.02621555, "balance_loss_mlp": 0.20673685, "epoch": 0.9366000300616263, "flos": 25666433850240.0, "grad_norm": 53.30732995077369, "language_loss": 0.6215694, "learning_rate": 4.1978370037318855e-08, "loss": 0.63637745, "num_input_tokens_seen": 335996575, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.24755859, "step": 15578, "time_per_iteration": 2.7307257652282715 }, { "auxiliary_loss_clip": 0.01214792, "auxiliary_loss_mlp": 0.00218386, "balance_loss_clip": 1.00597119, "balance_loss_mlp": 0.19568864, "epoch": 0.9366601533142943, "flos": 21433355948160.0, "grad_norm": 167.97711694274545, "language_loss": 0.7594949, "learning_rate": 4.189903163783692e-08, "loss": 0.77382672, "num_input_tokens_seen": 336017265, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.22705078, "step": 15579, "time_per_iteration": 2.659555435180664 }, { "auxiliary_loss_clip": 0.01238792, "auxiliary_loss_mlp": 0.00212741, "balance_loss_clip": 1.02287889, "balance_loss_mlp": 0.18851727, "epoch": 0.9367202765669622, "flos": 24093222998400.0, "grad_norm": 48.5082654826905, "language_loss": 0.83760655, "learning_rate": 4.181976748973959e-08, "loss": 0.85212189, "num_input_tokens_seen": 336035905, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.2421875, "step": 15580, "time_per_iteration": 2.6931943893432617 }, { "auxiliary_loss_clip": 0.01247764, "auxiliary_loss_mlp": 0.00206262, "balance_loss_clip": 1.02630711, "balance_loss_mlp": 0.18124047, "epoch": 0.9367803998196302, "flos": 20888842700160.0, "grad_norm": 6.418798410295584, "language_loss": 0.75445402, "learning_rate": 4.1740577596033114e-08, "loss": 0.76899427, "num_input_tokens_seen": 336055585, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25024414, "step": 15581, "time_per_iteration": 2.6866257190704346 }, { "auxiliary_loss_clip": 0.01235759, "auxiliary_loss_mlp": 0.00234573, "balance_loss_clip": 1.02097654, "balance_loss_mlp": 0.21024235, "epoch": 0.9368405230722983, "flos": 22564362464640.0, "grad_norm": 13.165358457303302, "language_loss": 0.8197577, "learning_rate": 4.166146195972042e-08, "loss": 0.83446103, "num_input_tokens_seen": 336076695, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.2434082, "step": 15582, "time_per_iteration": 2.7942373752593994 }, { "auxiliary_loss_clip": 0.01231657, "auxiliary_loss_mlp": 0.00215656, "balance_loss_clip": 1.01513743, "balance_loss_mlp": 0.19044307, "epoch": 0.9369006463249662, "flos": 18880215183360.0, "grad_norm": 7.122201322974767, "language_loss": 0.82150388, "learning_rate": 4.1582420583800905e-08, "loss": 0.83597702, "num_input_tokens_seen": 336094740, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.25183105, "step": 15583, "time_per_iteration": 2.6281843185424805 }, { "auxiliary_loss_clip": 0.012542, "auxiliary_loss_mlp": 0.00242512, "balance_loss_clip": 1.03360689, "balance_loss_mlp": 0.21713229, "epoch": 0.9369607695776342, "flos": 26432516142720.0, "grad_norm": 12.27016678872365, "language_loss": 0.92929423, "learning_rate": 4.1503453471272376e-08, "loss": 0.94426131, "num_input_tokens_seen": 336113985, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25390625, "step": 15584, "time_per_iteration": 2.726494073867798 }, { "auxiliary_loss_clip": 0.01260023, "auxiliary_loss_mlp": 0.002265, "balance_loss_clip": 1.03512144, "balance_loss_mlp": 0.20022628, "epoch": 0.9370208928303021, "flos": 39567346081920.0, "grad_norm": 10.217554247871849, "language_loss": 0.7957384, "learning_rate": 4.1424560625129334e-08, "loss": 0.81060362, "num_input_tokens_seen": 336136395, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.26293945, "step": 15585, "time_per_iteration": 2.786827564239502 }, { "auxiliary_loss_clip": 0.01230696, "auxiliary_loss_mlp": 0.00208964, "balance_loss_clip": 1.02005827, "balance_loss_mlp": 0.18578991, "epoch": 0.9370810160829701, "flos": 22963114321920.0, "grad_norm": 11.745579038144808, "language_loss": 0.89168096, "learning_rate": 4.134574204836316e-08, "loss": 0.90607756, "num_input_tokens_seen": 336156345, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.23156738, "step": 15586, "time_per_iteration": 2.695124864578247 }, { "auxiliary_loss_clip": 0.01232509, "auxiliary_loss_mlp": 0.00233349, "balance_loss_clip": 1.01655364, "balance_loss_mlp": 0.20929249, "epoch": 0.937141139335638, "flos": 23075048079360.0, "grad_norm": 32.09513910952519, "language_loss": 0.83060992, "learning_rate": 4.126699774396258e-08, "loss": 0.84526849, "num_input_tokens_seen": 336176760, "router_z_loss_clip": 2.15917969, "router_z_loss_mlp": 0.24023438, "step": 15587, "time_per_iteration": 2.6536314487457275 }, { "auxiliary_loss_clip": 0.01259399, "auxiliary_loss_mlp": 0.00224455, "balance_loss_clip": 1.03489554, "balance_loss_mlp": 0.19746608, "epoch": 0.937201262588306, "flos": 16356664247040.0, "grad_norm": 29.025155728378124, "language_loss": 0.95954072, "learning_rate": 4.118832771491387e-08, "loss": 0.9743793, "num_input_tokens_seen": 336193285, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.26977539, "step": 15588, "time_per_iteration": 2.6686251163482666 }, { "auxiliary_loss_clip": 0.01216351, "auxiliary_loss_mlp": 0.00196638, "balance_loss_clip": 1.0103935, "balance_loss_mlp": 0.17444128, "epoch": 0.937261385840974, "flos": 20194078861440.0, "grad_norm": 9.909509425637324, "language_loss": 0.86493671, "learning_rate": 4.11097319642002e-08, "loss": 0.87906659, "num_input_tokens_seen": 336211425, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.22180176, "step": 15589, "time_per_iteration": 5.517268657684326 }, { "auxiliary_loss_clip": 0.01226828, "auxiliary_loss_mlp": 0.00217464, "balance_loss_clip": 1.01574111, "balance_loss_mlp": 0.19304956, "epoch": 0.937321509093642, "flos": 18295948558080.0, "grad_norm": 2.475140703648732, "language_loss": 0.86413586, "learning_rate": 4.103121049480163e-08, "loss": 0.87857878, "num_input_tokens_seen": 336230205, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.24414062, "step": 15590, "time_per_iteration": 2.684887409210205 }, { "auxiliary_loss_clip": 0.01267152, "auxiliary_loss_mlp": 0.00227692, "balance_loss_clip": 1.04278207, "balance_loss_mlp": 0.20002401, "epoch": 0.9373816323463099, "flos": 25884662929920.0, "grad_norm": 52.10301991218458, "language_loss": 0.7804935, "learning_rate": 4.095276330969577e-08, "loss": 0.79544193, "num_input_tokens_seen": 336252440, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.27697754, "step": 15591, "time_per_iteration": 2.696643829345703 }, { "auxiliary_loss_clip": 0.01273426, "auxiliary_loss_mlp": 0.00249396, "balance_loss_clip": 1.04427814, "balance_loss_mlp": 0.22129866, "epoch": 0.9374417555989779, "flos": 27198849830400.0, "grad_norm": 155.75189421282445, "language_loss": 0.66441083, "learning_rate": 4.0874390411857804e-08, "loss": 0.67963898, "num_input_tokens_seen": 336273845, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.28088379, "step": 15592, "time_per_iteration": 2.744729995727539 }, { "auxiliary_loss_clip": 0.01229341, "auxiliary_loss_mlp": 0.00209789, "balance_loss_clip": 1.01821697, "balance_loss_mlp": 0.18551832, "epoch": 0.9375018788516458, "flos": 23621249266560.0, "grad_norm": 17.407201122891987, "language_loss": 0.73709363, "learning_rate": 4.0796091804259136e-08, "loss": 0.75148493, "num_input_tokens_seen": 336292790, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.24267578, "step": 15593, "time_per_iteration": 2.6868793964385986 }, { "auxiliary_loss_clip": 0.01241029, "auxiliary_loss_mlp": 0.00216193, "balance_loss_clip": 1.02632666, "balance_loss_mlp": 0.19180259, "epoch": 0.9375620021043138, "flos": 22678774260480.0, "grad_norm": 5.59303225658966, "language_loss": 0.80763471, "learning_rate": 4.0717867489868715e-08, "loss": 0.82220697, "num_input_tokens_seen": 336312600, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.24389648, "step": 15594, "time_per_iteration": 2.7398180961608887 }, { "auxiliary_loss_clip": 0.01233022, "auxiliary_loss_mlp": 0.00232054, "balance_loss_clip": 1.02151453, "balance_loss_mlp": 0.20747307, "epoch": 0.9376221253569819, "flos": 27560254521600.0, "grad_norm": 44.027464868045485, "language_loss": 0.79140443, "learning_rate": 4.063971747165351e-08, "loss": 0.80605519, "num_input_tokens_seen": 336332770, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.24572754, "step": 15595, "time_per_iteration": 2.6829793453216553 }, { "auxiliary_loss_clip": 0.0124043, "auxiliary_loss_mlp": 0.00230295, "balance_loss_clip": 1.02151012, "balance_loss_mlp": 0.20441501, "epoch": 0.9376822486096498, "flos": 24129887806080.0, "grad_norm": 9.34100108349207, "language_loss": 0.81811535, "learning_rate": 4.056164175257626e-08, "loss": 0.83282256, "num_input_tokens_seen": 336351445, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.25878906, "step": 15596, "time_per_iteration": 2.71138334274292 }, { "auxiliary_loss_clip": 0.01247395, "auxiliary_loss_mlp": 0.00231901, "balance_loss_clip": 1.03117943, "balance_loss_mlp": 0.20712924, "epoch": 0.9377423718623178, "flos": 22784028088320.0, "grad_norm": 20.196752903894055, "language_loss": 0.8462072, "learning_rate": 4.0483640335597926e-08, "loss": 0.86100018, "num_input_tokens_seen": 336368690, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24780273, "step": 15597, "time_per_iteration": 4.131103515625 }, { "auxiliary_loss_clip": 0.01250066, "auxiliary_loss_mlp": 0.00236423, "balance_loss_clip": 1.02876329, "balance_loss_mlp": 0.21115083, "epoch": 0.9378024951149857, "flos": 19168900790400.0, "grad_norm": 13.748635552159744, "language_loss": 0.87158203, "learning_rate": 4.0405713223676363e-08, "loss": 0.88644695, "num_input_tokens_seen": 336388165, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.25292969, "step": 15598, "time_per_iteration": 2.6725594997406006 }, { "auxiliary_loss_clip": 0.01280679, "auxiliary_loss_mlp": 0.00237021, "balance_loss_clip": 1.04786325, "balance_loss_mlp": 0.20918593, "epoch": 0.9378626183676537, "flos": 23505508667520.0, "grad_norm": 4.911277721057267, "language_loss": 0.73466897, "learning_rate": 4.0327860419766994e-08, "loss": 0.74984598, "num_input_tokens_seen": 336406475, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.27819824, "step": 15599, "time_per_iteration": 2.6827194690704346 }, { "auxiliary_loss_clip": 0.01238463, "auxiliary_loss_mlp": 0.00209474, "balance_loss_clip": 1.02073383, "balance_loss_mlp": 0.18447554, "epoch": 0.9379227416203216, "flos": 18405655672320.0, "grad_norm": 443.5368653062814, "language_loss": 0.8340435, "learning_rate": 4.0250081926821e-08, "loss": 0.84852278, "num_input_tokens_seen": 336424690, "router_z_loss_clip": 2.17871094, "router_z_loss_mlp": 0.24975586, "step": 15600, "time_per_iteration": 2.6001124382019043 }, { "auxiliary_loss_clip": 0.01229441, "auxiliary_loss_mlp": 0.00218745, "balance_loss_clip": 1.02088976, "balance_loss_mlp": 0.19549896, "epoch": 0.9379828648729897, "flos": 17821855923840.0, "grad_norm": 3.4352255670744762, "language_loss": 0.79057831, "learning_rate": 4.0172377747788474e-08, "loss": 0.80506015, "num_input_tokens_seen": 336443055, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.23242188, "step": 15601, "time_per_iteration": 4.02301025390625 }, { "auxiliary_loss_clip": 0.01090783, "auxiliary_loss_mlp": 0.00101922, "balance_loss_clip": 0.95688087, "balance_loss_mlp": 0.09381577, "epoch": 0.9380429881256576, "flos": 68024399466240.0, "grad_norm": 0.7371018664960794, "language_loss": 0.57574141, "learning_rate": 4.009474788561573e-08, "loss": 0.58766854, "num_input_tokens_seen": 336510190, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.08105469, "step": 15602, "time_per_iteration": 3.3500747680664062 }, { "auxiliary_loss_clip": 0.0124523, "auxiliary_loss_mlp": 0.00244236, "balance_loss_clip": 1.02723718, "balance_loss_mlp": 0.22007236, "epoch": 0.9381031113783256, "flos": 20776980769920.0, "grad_norm": 7.549429443611423, "language_loss": 0.82028437, "learning_rate": 4.001719234324663e-08, "loss": 0.83517909, "num_input_tokens_seen": 336529250, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24169922, "step": 15603, "time_per_iteration": 2.671347141265869 }, { "auxiliary_loss_clip": 0.01216283, "auxiliary_loss_mlp": 0.00230527, "balance_loss_clip": 1.00939953, "balance_loss_mlp": 0.20663723, "epoch": 0.9381632346309935, "flos": 19025078734080.0, "grad_norm": 318.5546859022231, "language_loss": 0.81854486, "learning_rate": 3.993971112362171e-08, "loss": 0.833013, "num_input_tokens_seen": 336548530, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.23901367, "step": 15604, "time_per_iteration": 2.6916964054107666 }, { "auxiliary_loss_clip": 0.01250634, "auxiliary_loss_mlp": 0.00232621, "balance_loss_clip": 1.03052628, "balance_loss_mlp": 0.2067645, "epoch": 0.9382233578836615, "flos": 23513840622720.0, "grad_norm": 79.64482429123558, "language_loss": 0.75084627, "learning_rate": 3.9862304229679734e-08, "loss": 0.76567876, "num_input_tokens_seen": 336568510, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25842285, "step": 15605, "time_per_iteration": 2.6507906913757324 }, { "auxiliary_loss_clip": 0.0125165, "auxiliary_loss_mlp": 0.00242309, "balance_loss_clip": 1.03123748, "balance_loss_mlp": 0.21479563, "epoch": 0.9382834811363294, "flos": 43067882016000.0, "grad_norm": 14.286230486228872, "language_loss": 0.74980503, "learning_rate": 3.9784971664355683e-08, "loss": 0.76474464, "num_input_tokens_seen": 336592020, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.27526855, "step": 15606, "time_per_iteration": 2.892580986022949 }, { "auxiliary_loss_clip": 0.01233699, "auxiliary_loss_mlp": 0.00240415, "balance_loss_clip": 1.02205586, "balance_loss_mlp": 0.21684712, "epoch": 0.9383436043889974, "flos": 16436242828800.0, "grad_norm": 49.432217073185825, "language_loss": 0.84331143, "learning_rate": 3.970771343058166e-08, "loss": 0.85805255, "num_input_tokens_seen": 336610010, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.23583984, "step": 15607, "time_per_iteration": 2.6296467781066895 }, { "auxiliary_loss_clip": 0.01251161, "auxiliary_loss_mlp": 0.00222664, "balance_loss_clip": 1.03408122, "balance_loss_mlp": 0.19937068, "epoch": 0.9384037276416655, "flos": 20740603271040.0, "grad_norm": 5.86093155042477, "language_loss": 0.89507985, "learning_rate": 3.963052953128776e-08, "loss": 0.90981811, "num_input_tokens_seen": 336628520, "router_z_loss_clip": 2.16894531, "router_z_loss_mlp": 0.23303223, "step": 15608, "time_per_iteration": 2.7089502811431885 }, { "auxiliary_loss_clip": 0.01255307, "auxiliary_loss_mlp": 0.00216828, "balance_loss_clip": 1.04004169, "balance_loss_mlp": 0.19355837, "epoch": 0.9384638508943334, "flos": 19062677295360.0, "grad_norm": 5.202069831087432, "language_loss": 0.78269136, "learning_rate": 3.9553419969400536e-08, "loss": 0.79741275, "num_input_tokens_seen": 336647365, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.23278809, "step": 15609, "time_per_iteration": 2.6806883811950684 }, { "auxiliary_loss_clip": 0.01242111, "auxiliary_loss_mlp": 0.00234266, "balance_loss_clip": 1.02071404, "balance_loss_mlp": 0.20888655, "epoch": 0.9385239741470014, "flos": 23404887694080.0, "grad_norm": 6.457780001609047, "language_loss": 0.83411562, "learning_rate": 3.9476384747844316e-08, "loss": 0.84887934, "num_input_tokens_seen": 336667165, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25415039, "step": 15610, "time_per_iteration": 2.767110586166382 }, { "auxiliary_loss_clip": 0.01235812, "auxiliary_loss_mlp": 0.00235364, "balance_loss_clip": 1.02456987, "balance_loss_mlp": 0.21189138, "epoch": 0.9385840973996693, "flos": 12824742804480.0, "grad_norm": 13.079266632995255, "language_loss": 0.83590078, "learning_rate": 3.939942386953987e-08, "loss": 0.85061252, "num_input_tokens_seen": 336684130, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.23498535, "step": 15611, "time_per_iteration": 2.6698975563049316 }, { "auxiliary_loss_clip": 0.01247282, "auxiliary_loss_mlp": 0.00212267, "balance_loss_clip": 1.03052449, "balance_loss_mlp": 0.18831766, "epoch": 0.9386442206523373, "flos": 15486980152320.0, "grad_norm": 53.94276312771302, "language_loss": 0.74979019, "learning_rate": 3.9322537337405756e-08, "loss": 0.76438558, "num_input_tokens_seen": 336701520, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23950195, "step": 15612, "time_per_iteration": 2.6834936141967773 }, { "auxiliary_loss_clip": 0.01229521, "auxiliary_loss_mlp": 0.00216509, "balance_loss_clip": 1.01656771, "balance_loss_mlp": 0.19010389, "epoch": 0.9387043439050052, "flos": 21178821196800.0, "grad_norm": 16.945195546177292, "language_loss": 0.66490912, "learning_rate": 3.924572515435742e-08, "loss": 0.67936945, "num_input_tokens_seen": 336720675, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.26416016, "step": 15613, "time_per_iteration": 2.6714515686035156 }, { "auxiliary_loss_clip": 0.01241736, "auxiliary_loss_mlp": 0.00234087, "balance_loss_clip": 1.02465951, "balance_loss_mlp": 0.20869544, "epoch": 0.9387644671576733, "flos": 27668273696640.0, "grad_norm": 11.60002047952367, "language_loss": 0.78522861, "learning_rate": 3.916898732330764e-08, "loss": 0.79998684, "num_input_tokens_seen": 336741005, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25402832, "step": 15614, "time_per_iteration": 2.779069423675537 }, { "auxiliary_loss_clip": 0.01256194, "auxiliary_loss_mlp": 0.00239693, "balance_loss_clip": 1.0326736, "balance_loss_mlp": 0.21253729, "epoch": 0.9388245904103412, "flos": 18836331742080.0, "grad_norm": 15.474924624744057, "language_loss": 0.90762484, "learning_rate": 3.9092323847166544e-08, "loss": 0.9225837, "num_input_tokens_seen": 336757990, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.27160645, "step": 15615, "time_per_iteration": 2.6993496417999268 }, { "auxiliary_loss_clip": 0.01243446, "auxiliary_loss_mlp": 0.00223868, "balance_loss_clip": 1.02536201, "balance_loss_mlp": 0.19971621, "epoch": 0.9388847136630092, "flos": 25483828083840.0, "grad_norm": 12.284332679502512, "language_loss": 0.77541935, "learning_rate": 3.901573472884134e-08, "loss": 0.79009253, "num_input_tokens_seen": 336777705, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24145508, "step": 15616, "time_per_iteration": 2.699192523956299 }, { "auxiliary_loss_clip": 0.01243905, "auxiliary_loss_mlp": 0.00220704, "balance_loss_clip": 1.02896047, "balance_loss_mlp": 0.19777964, "epoch": 0.9389448369156771, "flos": 18734992496640.0, "grad_norm": 39.26288293255402, "language_loss": 0.74589604, "learning_rate": 3.89392199712355e-08, "loss": 0.76054209, "num_input_tokens_seen": 336798275, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.22937012, "step": 15617, "time_per_iteration": 2.6532461643218994 }, { "auxiliary_loss_clip": 0.01238896, "auxiliary_loss_mlp": 0.00220168, "balance_loss_clip": 1.02008963, "balance_loss_mlp": 0.19508649, "epoch": 0.9390049601683451, "flos": 21717839664000.0, "grad_norm": 10.006928979725021, "language_loss": 0.83380651, "learning_rate": 3.886277957725092e-08, "loss": 0.8483972, "num_input_tokens_seen": 336813835, "router_z_loss_clip": 2.19042969, "router_z_loss_mlp": 0.25073242, "step": 15618, "time_per_iteration": 2.7879798412323 }, { "auxiliary_loss_clip": 0.01246857, "auxiliary_loss_mlp": 0.00200213, "balance_loss_clip": 1.02245426, "balance_loss_mlp": 0.17451145, "epoch": 0.939065083421013, "flos": 19391224020480.0, "grad_norm": 10.80969959299753, "language_loss": 0.78913873, "learning_rate": 3.878641354978662e-08, "loss": 0.80360943, "num_input_tokens_seen": 336832210, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.25695801, "step": 15619, "time_per_iteration": 2.663057804107666 }, { "auxiliary_loss_clip": 0.01254989, "auxiliary_loss_mlp": 0.00233386, "balance_loss_clip": 1.03325677, "balance_loss_mlp": 0.20804235, "epoch": 0.939125206673681, "flos": 24681511946880.0, "grad_norm": 13.383182424451329, "language_loss": 0.87282836, "learning_rate": 3.8710121891737834e-08, "loss": 0.88771206, "num_input_tokens_seen": 336851380, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25354004, "step": 15620, "time_per_iteration": 2.734651565551758 }, { "auxiliary_loss_clip": 0.012471, "auxiliary_loss_mlp": 0.00235201, "balance_loss_clip": 1.02617967, "balance_loss_mlp": 0.20827135, "epoch": 0.9391853299263491, "flos": 16325961096960.0, "grad_norm": 13.772363416544232, "language_loss": 0.82441831, "learning_rate": 3.8633904605998025e-08, "loss": 0.83924127, "num_input_tokens_seen": 336868525, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26916504, "step": 15621, "time_per_iteration": 2.664625644683838 }, { "auxiliary_loss_clip": 0.01265256, "auxiliary_loss_mlp": 0.00219515, "balance_loss_clip": 1.03538752, "balance_loss_mlp": 0.19240654, "epoch": 0.939245453179017, "flos": 11655778590720.0, "grad_norm": 37.20460682349716, "language_loss": 0.80092055, "learning_rate": 3.855776169545688e-08, "loss": 0.81576824, "num_input_tokens_seen": 336886200, "router_z_loss_clip": 2.29785156, "router_z_loss_mlp": 0.27087402, "step": 15622, "time_per_iteration": 2.645827293395996 }, { "auxiliary_loss_clip": 0.01230015, "auxiliary_loss_mlp": 0.00213784, "balance_loss_clip": 1.01795435, "balance_loss_mlp": 0.19121777, "epoch": 0.939305576431685, "flos": 23148700917120.0, "grad_norm": 159.65286078472315, "language_loss": 0.80688196, "learning_rate": 3.848169316300209e-08, "loss": 0.82132006, "num_input_tokens_seen": 336905815, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.22570801, "step": 15623, "time_per_iteration": 2.6366541385650635 }, { "auxiliary_loss_clip": 0.01258747, "auxiliary_loss_mlp": 0.00229461, "balance_loss_clip": 1.03830314, "balance_loss_mlp": 0.20302007, "epoch": 0.9393656996843529, "flos": 33287790706560.0, "grad_norm": 4.426365744157299, "language_loss": 0.80840802, "learning_rate": 3.84056990115178e-08, "loss": 0.82329011, "num_input_tokens_seen": 336928460, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.2644043, "step": 15624, "time_per_iteration": 2.811030149459839 }, { "auxiliary_loss_clip": 0.01237323, "auxiliary_loss_mlp": 0.00210727, "balance_loss_clip": 1.02009726, "balance_loss_mlp": 0.18616965, "epoch": 0.9394258229370209, "flos": 21689434984320.0, "grad_norm": 34.411038116492236, "language_loss": 0.9703005, "learning_rate": 3.832977924388614e-08, "loss": 0.98478091, "num_input_tokens_seen": 336948320, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.24572754, "step": 15625, "time_per_iteration": 2.650332450866699 }, { "auxiliary_loss_clip": 0.01229424, "auxiliary_loss_mlp": 0.00213564, "balance_loss_clip": 1.01394463, "balance_loss_mlp": 0.18841124, "epoch": 0.9394859461896888, "flos": 23874203819520.0, "grad_norm": 28.30830936233703, "language_loss": 0.92614198, "learning_rate": 3.825393386298592e-08, "loss": 0.9405719, "num_input_tokens_seen": 336967670, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.25158691, "step": 15626, "time_per_iteration": 2.783263921737671 }, { "auxiliary_loss_clip": 0.01090569, "auxiliary_loss_mlp": 0.00096645, "balance_loss_clip": 0.95351541, "balance_loss_mlp": 0.08849104, "epoch": 0.9395460694423569, "flos": 61566116993280.0, "grad_norm": 2.2826832672892454, "language_loss": 0.55237955, "learning_rate": 3.8178162871693284e-08, "loss": 0.56425166, "num_input_tokens_seen": 337028395, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08154297, "step": 15627, "time_per_iteration": 3.1157634258270264 }, { "auxiliary_loss_clip": 0.01241674, "auxiliary_loss_mlp": 0.00218691, "balance_loss_clip": 1.02499735, "balance_loss_mlp": 0.1939434, "epoch": 0.9396061926950248, "flos": 20995712640000.0, "grad_norm": 3.597551364923694, "language_loss": 0.77822161, "learning_rate": 3.810246627288105e-08, "loss": 0.79282522, "num_input_tokens_seen": 337048150, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.24755859, "step": 15628, "time_per_iteration": 2.6912858486175537 }, { "auxiliary_loss_clip": 0.01233095, "auxiliary_loss_mlp": 0.00219292, "balance_loss_clip": 1.02182245, "balance_loss_mlp": 0.19666627, "epoch": 0.9396663159476928, "flos": 27487786832640.0, "grad_norm": 11.761129850436555, "language_loss": 0.81836796, "learning_rate": 3.8026844069420025e-08, "loss": 0.83289182, "num_input_tokens_seen": 337069315, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.22631836, "step": 15629, "time_per_iteration": 2.7018728256225586 }, { "auxiliary_loss_clip": 0.01230228, "auxiliary_loss_mlp": 0.00208662, "balance_loss_clip": 1.02030718, "balance_loss_mlp": 0.1860723, "epoch": 0.9397264392003607, "flos": 19427457864960.0, "grad_norm": 176.1296119216745, "language_loss": 0.80932754, "learning_rate": 3.795129626417748e-08, "loss": 0.82371646, "num_input_tokens_seen": 337087765, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.22583008, "step": 15630, "time_per_iteration": 2.6834423542022705 }, { "auxiliary_loss_clip": 0.01238088, "auxiliary_loss_mlp": 0.00215468, "balance_loss_clip": 1.02429032, "balance_loss_mlp": 0.19170983, "epoch": 0.9397865624530287, "flos": 18004820826240.0, "grad_norm": 10.15756685524353, "language_loss": 0.77395606, "learning_rate": 3.787582286001845e-08, "loss": 0.78849167, "num_input_tokens_seen": 337106265, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.23742676, "step": 15631, "time_per_iteration": 3.9837496280670166 }, { "auxiliary_loss_clip": 0.01231066, "auxiliary_loss_mlp": 0.00216114, "balance_loss_clip": 1.02007151, "balance_loss_mlp": 0.19380975, "epoch": 0.9398466857056966, "flos": 22564613859840.0, "grad_norm": 10.352757855643768, "language_loss": 0.80600321, "learning_rate": 3.7800423859805086e-08, "loss": 0.82047504, "num_input_tokens_seen": 337126090, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.22314453, "step": 15632, "time_per_iteration": 4.11297082901001 }, { "auxiliary_loss_clip": 0.01274906, "auxiliary_loss_mlp": 0.00223225, "balance_loss_clip": 1.04364145, "balance_loss_mlp": 0.19518733, "epoch": 0.9399068089583646, "flos": 24535678728960.0, "grad_norm": 4.163891407541728, "language_loss": 0.82799459, "learning_rate": 3.772509926639622e-08, "loss": 0.84297597, "num_input_tokens_seen": 337145655, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.28063965, "step": 15633, "time_per_iteration": 2.6861555576324463 }, { "auxiliary_loss_clip": 0.01250414, "auxiliary_loss_mlp": 0.00242885, "balance_loss_clip": 1.03059173, "balance_loss_mlp": 0.21636045, "epoch": 0.9399669322110327, "flos": 25630343660160.0, "grad_norm": 22.90618264421856, "language_loss": 0.79794091, "learning_rate": 3.764984908264823e-08, "loss": 0.81287396, "num_input_tokens_seen": 337164805, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26525879, "step": 15634, "time_per_iteration": 2.697537660598755 }, { "auxiliary_loss_clip": 0.01241188, "auxiliary_loss_mlp": 0.00227368, "balance_loss_clip": 1.02376604, "balance_loss_mlp": 0.20240557, "epoch": 0.9400270554637006, "flos": 17089385783040.0, "grad_norm": 27.88210834808814, "language_loss": 0.77165049, "learning_rate": 3.75746733114144e-08, "loss": 0.78633606, "num_input_tokens_seen": 337182280, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24987793, "step": 15635, "time_per_iteration": 2.638913631439209 }, { "auxiliary_loss_clip": 0.01216617, "auxiliary_loss_mlp": 0.0022427, "balance_loss_clip": 1.00301909, "balance_loss_mlp": 0.20040408, "epoch": 0.9400871787163686, "flos": 22055113393920.0, "grad_norm": 5.059159360943777, "language_loss": 0.80568051, "learning_rate": 3.7499571955545985e-08, "loss": 0.82008934, "num_input_tokens_seen": 337203495, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.2388916, "step": 15636, "time_per_iteration": 2.7413289546966553 }, { "auxiliary_loss_clip": 0.01247489, "auxiliary_loss_mlp": 0.00234239, "balance_loss_clip": 1.03246784, "balance_loss_mlp": 0.2095392, "epoch": 0.9401473019690365, "flos": 16982767238400.0, "grad_norm": 9.00614526166143, "language_loss": 0.92175972, "learning_rate": 3.7424545017890054e-08, "loss": 0.93657708, "num_input_tokens_seen": 337220435, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24743652, "step": 15637, "time_per_iteration": 2.5982284545898438 }, { "auxiliary_loss_clip": 0.0125005, "auxiliary_loss_mlp": 0.00246433, "balance_loss_clip": 1.0272119, "balance_loss_mlp": 0.21983761, "epoch": 0.9402074252217045, "flos": 19681956702720.0, "grad_norm": 7.283390182518419, "language_loss": 0.77404106, "learning_rate": 3.7349592501292325e-08, "loss": 0.78900588, "num_input_tokens_seen": 337238095, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26611328, "step": 15638, "time_per_iteration": 2.7237775325775146 }, { "auxiliary_loss_clip": 0.01224397, "auxiliary_loss_mlp": 0.00226458, "balance_loss_clip": 1.01642239, "balance_loss_mlp": 0.20302191, "epoch": 0.9402675484743724, "flos": 24754302858240.0, "grad_norm": 97.55517157476578, "language_loss": 0.909621, "learning_rate": 3.727471440859498e-08, "loss": 0.92412961, "num_input_tokens_seen": 337256645, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.234375, "step": 15639, "time_per_iteration": 2.6810500621795654 }, { "auxiliary_loss_clip": 0.01232658, "auxiliary_loss_mlp": 0.00228331, "balance_loss_clip": 1.01874518, "balance_loss_mlp": 0.20402375, "epoch": 0.9403276717270405, "flos": 25558630156800.0, "grad_norm": 20.55576841693115, "language_loss": 0.83593059, "learning_rate": 3.719991074263662e-08, "loss": 0.85054046, "num_input_tokens_seen": 337278360, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.24328613, "step": 15640, "time_per_iteration": 4.236801624298096 }, { "auxiliary_loss_clip": 0.01246773, "auxiliary_loss_mlp": 0.00237241, "balance_loss_clip": 1.02834892, "balance_loss_mlp": 0.21223059, "epoch": 0.9403877949797084, "flos": 26689852154880.0, "grad_norm": 2.687163274392545, "language_loss": 0.79833561, "learning_rate": 3.7125181506254544e-08, "loss": 0.8131758, "num_input_tokens_seen": 337302480, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25024414, "step": 15641, "time_per_iteration": 2.7950894832611084 }, { "auxiliary_loss_clip": 0.01265237, "auxiliary_loss_mlp": 0.00261548, "balance_loss_clip": 1.03668773, "balance_loss_mlp": 0.23112628, "epoch": 0.9404479182323764, "flos": 15011666455680.0, "grad_norm": 158.97083110046825, "language_loss": 0.923531, "learning_rate": 3.7050526702282256e-08, "loss": 0.9387989, "num_input_tokens_seen": 337316600, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.30407715, "step": 15642, "time_per_iteration": 2.636763334274292 }, { "auxiliary_loss_clip": 0.01239127, "auxiliary_loss_mlp": 0.00209139, "balance_loss_clip": 1.026124, "balance_loss_mlp": 0.18411711, "epoch": 0.9405080414850443, "flos": 24973573432320.0, "grad_norm": 15.072632171253085, "language_loss": 0.76460469, "learning_rate": 3.697594633355084e-08, "loss": 0.77908742, "num_input_tokens_seen": 337336895, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.25, "step": 15643, "time_per_iteration": 4.086908340454102 }, { "auxiliary_loss_clip": 0.0125455, "auxiliary_loss_mlp": 0.00232653, "balance_loss_clip": 1.03472161, "balance_loss_mlp": 0.20544964, "epoch": 0.9405681647377123, "flos": 20844743777280.0, "grad_norm": 27.530910808664522, "language_loss": 0.83704811, "learning_rate": 3.6901440402888226e-08, "loss": 0.85192013, "num_input_tokens_seen": 337355105, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.27209473, "step": 15644, "time_per_iteration": 2.7179343700408936 }, { "auxiliary_loss_clip": 0.0122738, "auxiliary_loss_mlp": 0.00215655, "balance_loss_clip": 1.01503801, "balance_loss_mlp": 0.19257598, "epoch": 0.9406282879903802, "flos": 23805578885760.0, "grad_norm": 3.591609952433629, "language_loss": 0.75482804, "learning_rate": 3.682700891311974e-08, "loss": 0.76925844, "num_input_tokens_seen": 337374905, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.23095703, "step": 15645, "time_per_iteration": 2.7667152881622314 }, { "auxiliary_loss_clip": 0.01228871, "auxiliary_loss_mlp": 0.00216694, "balance_loss_clip": 1.01516581, "balance_loss_mlp": 0.1922922, "epoch": 0.9406884112430483, "flos": 27674953626240.0, "grad_norm": 5.865773232753186, "language_loss": 0.75676346, "learning_rate": 3.6752651867067774e-08, "loss": 0.77121913, "num_input_tokens_seen": 337397130, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24389648, "step": 15646, "time_per_iteration": 2.8120265007019043 }, { "auxiliary_loss_clip": 0.01227737, "auxiliary_loss_mlp": 0.00226074, "balance_loss_clip": 1.01789224, "balance_loss_mlp": 0.20211318, "epoch": 0.9407485344957163, "flos": 23075048079360.0, "grad_norm": 18.661525358271636, "language_loss": 0.80407864, "learning_rate": 3.667836926755208e-08, "loss": 0.81861675, "num_input_tokens_seen": 337418660, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.23974609, "step": 15647, "time_per_iteration": 2.6581857204437256 }, { "auxiliary_loss_clip": 0.01095832, "auxiliary_loss_mlp": 0.00068686, "balance_loss_clip": 0.95981598, "balance_loss_mlp": 0.0615331, "epoch": 0.9408086577483842, "flos": 71014034304000.0, "grad_norm": 1.0260680415798746, "language_loss": 0.62633562, "learning_rate": 3.660416111738907e-08, "loss": 0.63798082, "num_input_tokens_seen": 337478055, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.07128906, "step": 15648, "time_per_iteration": 3.289170026779175 }, { "auxiliary_loss_clip": 0.01219973, "auxiliary_loss_mlp": 0.00212746, "balance_loss_clip": 1.01106191, "balance_loss_mlp": 0.19067986, "epoch": 0.9408687810010522, "flos": 23730956380800.0, "grad_norm": 8.370204705130892, "language_loss": 0.72940803, "learning_rate": 3.653002741939337e-08, "loss": 0.74373519, "num_input_tokens_seen": 337499405, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.2208252, "step": 15649, "time_per_iteration": 2.723191261291504 }, { "auxiliary_loss_clip": 0.01247651, "auxiliary_loss_mlp": 0.00209631, "balance_loss_clip": 1.0252775, "balance_loss_mlp": 0.18444172, "epoch": 0.9409289042537201, "flos": 18369314087040.0, "grad_norm": 7.189506878780832, "language_loss": 0.85659873, "learning_rate": 3.645596817637586e-08, "loss": 0.87117159, "num_input_tokens_seen": 337517195, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.2520752, "step": 15650, "time_per_iteration": 2.652817964553833 }, { "auxiliary_loss_clip": 0.01245194, "auxiliary_loss_mlp": 0.00215154, "balance_loss_clip": 1.02594757, "balance_loss_mlp": 0.19114533, "epoch": 0.9409890275063881, "flos": 23878333883520.0, "grad_norm": 147.28767515227554, "language_loss": 0.81290758, "learning_rate": 3.638198339114451e-08, "loss": 0.82751107, "num_input_tokens_seen": 337535245, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.24035645, "step": 15651, "time_per_iteration": 2.732922077178955 }, { "auxiliary_loss_clip": 0.01227989, "auxiliary_loss_mlp": 0.00213045, "balance_loss_clip": 1.0156666, "balance_loss_mlp": 0.18891658, "epoch": 0.941049150759056, "flos": 16545088016640.0, "grad_norm": 142.28307556640908, "language_loss": 0.80218947, "learning_rate": 3.630807306650507e-08, "loss": 0.81659979, "num_input_tokens_seen": 337553040, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.24121094, "step": 15652, "time_per_iteration": 2.6501452922821045 }, { "auxiliary_loss_clip": 0.01252789, "auxiliary_loss_mlp": 0.00227181, "balance_loss_clip": 1.02980638, "balance_loss_mlp": 0.19904782, "epoch": 0.9411092740117241, "flos": 25118401069440.0, "grad_norm": 5.349472972063294, "language_loss": 0.73626351, "learning_rate": 3.6234237205260645e-08, "loss": 0.75106329, "num_input_tokens_seen": 337574580, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.28173828, "step": 15653, "time_per_iteration": 2.7915754318237305 }, { "auxiliary_loss_clip": 0.01260927, "auxiliary_loss_mlp": 0.00235411, "balance_loss_clip": 1.03963065, "balance_loss_mlp": 0.2102699, "epoch": 0.941169397264392, "flos": 21142264129920.0, "grad_norm": 7.607088992613365, "language_loss": 0.87590325, "learning_rate": 3.6160475810210536e-08, "loss": 0.89086658, "num_input_tokens_seen": 337593010, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.25146484, "step": 15654, "time_per_iteration": 2.714036226272583 }, { "auxiliary_loss_clip": 0.01244065, "auxiliary_loss_mlp": 0.00238477, "balance_loss_clip": 1.02544415, "balance_loss_mlp": 0.21253729, "epoch": 0.94122952051706, "flos": 38508914995200.0, "grad_norm": 2.284868444639572, "language_loss": 0.75053543, "learning_rate": 3.6086788884152065e-08, "loss": 0.76536083, "num_input_tokens_seen": 337616170, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25952148, "step": 15655, "time_per_iteration": 2.8940610885620117 }, { "auxiliary_loss_clip": 0.01240458, "auxiliary_loss_mlp": 0.00234995, "balance_loss_clip": 1.02299833, "balance_loss_mlp": 0.20954387, "epoch": 0.9412896437697279, "flos": 18369206346240.0, "grad_norm": 1983.1586769552289, "language_loss": 0.81338745, "learning_rate": 3.601317642987944e-08, "loss": 0.82814199, "num_input_tokens_seen": 337635215, "router_z_loss_clip": 2.17480469, "router_z_loss_mlp": 0.25439453, "step": 15656, "time_per_iteration": 2.728567600250244 }, { "auxiliary_loss_clip": 0.01223967, "auxiliary_loss_mlp": 0.0020751, "balance_loss_clip": 1.0130322, "balance_loss_mlp": 0.1824642, "epoch": 0.9413497670223959, "flos": 25884950238720.0, "grad_norm": 1629.7223447638899, "language_loss": 0.86420512, "learning_rate": 3.593963845018377e-08, "loss": 0.87851983, "num_input_tokens_seen": 337654195, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.25048828, "step": 15657, "time_per_iteration": 2.8645620346069336 }, { "auxiliary_loss_clip": 0.01244838, "auxiliary_loss_mlp": 0.00213085, "balance_loss_clip": 1.02200747, "balance_loss_mlp": 0.18832564, "epoch": 0.9414098902750638, "flos": 16618309891200.0, "grad_norm": 5.645295654178704, "language_loss": 0.93124795, "learning_rate": 3.586617494785371e-08, "loss": 0.94582713, "num_input_tokens_seen": 337671810, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.24780273, "step": 15658, "time_per_iteration": 2.70286226272583 }, { "auxiliary_loss_clip": 0.01257021, "auxiliary_loss_mlp": 0.00226153, "balance_loss_clip": 1.0328474, "balance_loss_mlp": 0.19995122, "epoch": 0.9414700135277319, "flos": 18625033987200.0, "grad_norm": 255.95684633953167, "language_loss": 0.80776066, "learning_rate": 3.5792785925675254e-08, "loss": 0.82259238, "num_input_tokens_seen": 337689410, "router_z_loss_clip": 2.24121094, "router_z_loss_mlp": 0.26208496, "step": 15659, "time_per_iteration": 2.656669855117798 }, { "auxiliary_loss_clip": 0.01240213, "auxiliary_loss_mlp": 0.002079, "balance_loss_clip": 1.02769852, "balance_loss_mlp": 0.18352146, "epoch": 0.9415301367803999, "flos": 26280146649600.0, "grad_norm": 11.09209277650587, "language_loss": 0.8641237, "learning_rate": 3.571947138643172e-08, "loss": 0.87860483, "num_input_tokens_seen": 337709950, "router_z_loss_clip": 2.12597656, "router_z_loss_mlp": 0.24377441, "step": 15660, "time_per_iteration": 2.727726697921753 }, { "auxiliary_loss_clip": 0.01219394, "auxiliary_loss_mlp": 0.00214335, "balance_loss_clip": 1.00951374, "balance_loss_mlp": 0.1901716, "epoch": 0.9415902600330678, "flos": 23261388860160.0, "grad_norm": 22.883985712888748, "language_loss": 0.73730528, "learning_rate": 3.564623133290201e-08, "loss": 0.75164258, "num_input_tokens_seen": 337731320, "router_z_loss_clip": 2.10058594, "router_z_loss_mlp": 0.24169922, "step": 15661, "time_per_iteration": 2.7433485984802246 }, { "auxiliary_loss_clip": 0.01227765, "auxiliary_loss_mlp": 0.00212382, "balance_loss_clip": 1.01228809, "balance_loss_mlp": 0.18824178, "epoch": 0.9416503832857358, "flos": 14719138093440.0, "grad_norm": 5.525631363688128, "language_loss": 0.76446772, "learning_rate": 3.557306576786434e-08, "loss": 0.77886915, "num_input_tokens_seen": 337747720, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24145508, "step": 15662, "time_per_iteration": 2.7142772674560547 }, { "auxiliary_loss_clip": 0.01092847, "auxiliary_loss_mlp": 0.00078251, "balance_loss_clip": 0.95503354, "balance_loss_mlp": 0.0702879, "epoch": 0.9417105065384037, "flos": 70312698276480.0, "grad_norm": 0.7501407916938616, "language_loss": 0.58655727, "learning_rate": 3.5499974694092935e-08, "loss": 0.59826827, "num_input_tokens_seen": 337806930, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07958984, "step": 15663, "time_per_iteration": 3.2916345596313477 }, { "auxiliary_loss_clip": 0.01265942, "auxiliary_loss_mlp": 0.00244355, "balance_loss_clip": 1.03843009, "balance_loss_mlp": 0.21722302, "epoch": 0.9417706297910717, "flos": 34057895322240.0, "grad_norm": 3.549241326502264, "language_loss": 0.75702524, "learning_rate": 3.542695811435914e-08, "loss": 0.77212822, "num_input_tokens_seen": 337828100, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.27111816, "step": 15664, "time_per_iteration": 2.9771177768707275 }, { "auxiliary_loss_clip": 0.01240319, "auxiliary_loss_mlp": 0.00237972, "balance_loss_clip": 1.02715683, "balance_loss_mlp": 0.21237752, "epoch": 0.9418307530437396, "flos": 16471614746880.0, "grad_norm": 11.405582327334304, "language_loss": 0.83135545, "learning_rate": 3.535401603143207e-08, "loss": 0.84613836, "num_input_tokens_seen": 337844805, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.25585938, "step": 15665, "time_per_iteration": 2.663813829421997 }, { "auxiliary_loss_clip": 0.01224038, "auxiliary_loss_mlp": 0.00229821, "balance_loss_clip": 1.01609814, "balance_loss_mlp": 0.20588349, "epoch": 0.9418908762964077, "flos": 11253543114240.0, "grad_norm": 81.93067326715413, "language_loss": 0.71682018, "learning_rate": 3.528114844807773e-08, "loss": 0.73135877, "num_input_tokens_seen": 337860490, "router_z_loss_clip": 2.08300781, "router_z_loss_mlp": 0.23937988, "step": 15666, "time_per_iteration": 2.6352028846740723 }, { "auxiliary_loss_clip": 0.01250229, "auxiliary_loss_mlp": 0.00238396, "balance_loss_clip": 1.02857482, "balance_loss_mlp": 0.21208695, "epoch": 0.9419509995490756, "flos": 18438836860800.0, "grad_norm": 93.83440938262397, "language_loss": 0.86540508, "learning_rate": 3.520835536705902e-08, "loss": 0.8802914, "num_input_tokens_seen": 337878360, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26293945, "step": 15667, "time_per_iteration": 2.6307830810546875 }, { "auxiliary_loss_clip": 0.0121948, "auxiliary_loss_mlp": 0.00209995, "balance_loss_clip": 1.01027179, "balance_loss_mlp": 0.18635587, "epoch": 0.9420111228017436, "flos": 20737945664640.0, "grad_norm": 10.796567202950989, "language_loss": 0.82885945, "learning_rate": 3.5135636791136404e-08, "loss": 0.84315419, "num_input_tokens_seen": 337895635, "router_z_loss_clip": 2.09277344, "router_z_loss_mlp": 0.23669434, "step": 15668, "time_per_iteration": 2.6976938247680664 }, { "auxiliary_loss_clip": 0.01246405, "auxiliary_loss_mlp": 0.00226576, "balance_loss_clip": 1.02378416, "balance_loss_mlp": 0.20086288, "epoch": 0.9420712460544115, "flos": 21141940907520.0, "grad_norm": 688.5155290487963, "language_loss": 0.71732587, "learning_rate": 3.506299272306723e-08, "loss": 0.73205566, "num_input_tokens_seen": 337913940, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25744629, "step": 15669, "time_per_iteration": 2.6740849018096924 }, { "auxiliary_loss_clip": 0.01230405, "auxiliary_loss_mlp": 0.00222815, "balance_loss_clip": 1.0191896, "balance_loss_mlp": 0.1988782, "epoch": 0.9421313693070795, "flos": 15851760721920.0, "grad_norm": 29.01230211677344, "language_loss": 0.84367359, "learning_rate": 3.4990423165606406e-08, "loss": 0.8582058, "num_input_tokens_seen": 337932015, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.23937988, "step": 15670, "time_per_iteration": 2.63840651512146 }, { "auxiliary_loss_clip": 0.01265909, "auxiliary_loss_mlp": 0.00249883, "balance_loss_clip": 1.03801453, "balance_loss_mlp": 0.22264403, "epoch": 0.9421914925597474, "flos": 32415915882240.0, "grad_norm": 8.769852245690254, "language_loss": 0.72789866, "learning_rate": 3.491792812150574e-08, "loss": 0.7430566, "num_input_tokens_seen": 337953345, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.27246094, "step": 15671, "time_per_iteration": 2.737896680831909 }, { "auxiliary_loss_clip": 0.01232351, "auxiliary_loss_mlp": 0.00223913, "balance_loss_clip": 1.01753783, "balance_loss_mlp": 0.19918934, "epoch": 0.9422516158124155, "flos": 19718513769600.0, "grad_norm": 2.3917286994247577, "language_loss": 0.85724956, "learning_rate": 3.48455075935139e-08, "loss": 0.87181222, "num_input_tokens_seen": 337973685, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.24731445, "step": 15672, "time_per_iteration": 2.6532561779022217 }, { "auxiliary_loss_clip": 0.01249036, "auxiliary_loss_mlp": 0.00240271, "balance_loss_clip": 1.02469921, "balance_loss_mlp": 0.21307981, "epoch": 0.9423117390650835, "flos": 16253277926400.0, "grad_norm": 10.062525107733494, "language_loss": 0.82210726, "learning_rate": 3.47731615843776e-08, "loss": 0.83700037, "num_input_tokens_seen": 337989175, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.27209473, "step": 15673, "time_per_iteration": 4.03102707862854 }, { "auxiliary_loss_clip": 0.01233968, "auxiliary_loss_mlp": 0.00206832, "balance_loss_clip": 1.01470399, "balance_loss_mlp": 0.18157168, "epoch": 0.9423718623177514, "flos": 31796564647680.0, "grad_norm": 23.47217890119892, "language_loss": 0.76989591, "learning_rate": 3.470089009683974e-08, "loss": 0.78430396, "num_input_tokens_seen": 338011800, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25244141, "step": 15674, "time_per_iteration": 4.169728994369507 }, { "auxiliary_loss_clip": 0.01238294, "auxiliary_loss_mlp": 0.00220918, "balance_loss_clip": 1.02121854, "balance_loss_mlp": 0.19646837, "epoch": 0.9424319855704194, "flos": 23331809473920.0, "grad_norm": 37.91519345134589, "language_loss": 0.88790083, "learning_rate": 3.462869313364125e-08, "loss": 0.90249288, "num_input_tokens_seen": 338032120, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24450684, "step": 15675, "time_per_iteration": 2.6434576511383057 }, { "auxiliary_loss_clip": 0.01235955, "auxiliary_loss_mlp": 0.00233184, "balance_loss_clip": 1.02253628, "balance_loss_mlp": 0.20912716, "epoch": 0.9424921088230873, "flos": 20777627214720.0, "grad_norm": 115.8410655340488, "language_loss": 0.69207919, "learning_rate": 3.4556570697519494e-08, "loss": 0.70677054, "num_input_tokens_seen": 338051880, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24035645, "step": 15676, "time_per_iteration": 2.7256834506988525 }, { "auxiliary_loss_clip": 0.01246416, "auxiliary_loss_mlp": 0.00232017, "balance_loss_clip": 1.02499676, "balance_loss_mlp": 0.20602974, "epoch": 0.9425522320757553, "flos": 19026658932480.0, "grad_norm": 9.621141295280562, "language_loss": 0.74754894, "learning_rate": 3.448452279120984e-08, "loss": 0.76233327, "num_input_tokens_seen": 338069665, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.25964355, "step": 15677, "time_per_iteration": 2.6897754669189453 }, { "auxiliary_loss_clip": 0.0125112, "auxiliary_loss_mlp": 0.00227882, "balance_loss_clip": 1.03010416, "balance_loss_mlp": 0.20090494, "epoch": 0.9426123553284232, "flos": 25155353185920.0, "grad_norm": 11.644822652736954, "language_loss": 0.7399869, "learning_rate": 3.441254941744387e-08, "loss": 0.7547769, "num_input_tokens_seen": 338090490, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.26989746, "step": 15678, "time_per_iteration": 2.7791101932525635 }, { "auxiliary_loss_clip": 0.01233465, "auxiliary_loss_mlp": 0.00212267, "balance_loss_clip": 1.01925719, "balance_loss_mlp": 0.18621978, "epoch": 0.9426724785810913, "flos": 21179359900800.0, "grad_norm": 34.98105525144641, "language_loss": 0.83807099, "learning_rate": 3.434065057895097e-08, "loss": 0.85252827, "num_input_tokens_seen": 338109825, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.26037598, "step": 15679, "time_per_iteration": 2.700272798538208 }, { "auxiliary_loss_clip": 0.01243998, "auxiliary_loss_mlp": 0.00218446, "balance_loss_clip": 1.02684402, "balance_loss_mlp": 0.19281596, "epoch": 0.9427326018337592, "flos": 14756916222720.0, "grad_norm": 9.987865765847612, "language_loss": 0.87023199, "learning_rate": 3.426882627845762e-08, "loss": 0.8848564, "num_input_tokens_seen": 338125790, "router_z_loss_clip": 2.17285156, "router_z_loss_mlp": 0.25646973, "step": 15680, "time_per_iteration": 2.6354875564575195 }, { "auxiliary_loss_clip": 0.01241609, "auxiliary_loss_mlp": 0.00233103, "balance_loss_clip": 1.02758193, "balance_loss_mlp": 0.20883152, "epoch": 0.9427927250864272, "flos": 20923640000640.0, "grad_norm": 34791.49453295058, "language_loss": 0.82811165, "learning_rate": 3.419707651868742e-08, "loss": 0.84285873, "num_input_tokens_seen": 338145610, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.24291992, "step": 15681, "time_per_iteration": 2.6990833282470703 }, { "auxiliary_loss_clip": 0.01252683, "auxiliary_loss_mlp": 0.00219697, "balance_loss_clip": 1.03142881, "balance_loss_mlp": 0.19336328, "epoch": 0.9428528483390951, "flos": 19752520970880.0, "grad_norm": 27.788291523080776, "language_loss": 0.75010806, "learning_rate": 3.412540130236086e-08, "loss": 0.76483184, "num_input_tokens_seen": 338165960, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.26342773, "step": 15682, "time_per_iteration": 4.171631097793579 }, { "auxiliary_loss_clip": 0.01243218, "auxiliary_loss_mlp": 0.00231426, "balance_loss_clip": 1.02283573, "balance_loss_mlp": 0.20518845, "epoch": 0.9429129715917631, "flos": 24534996370560.0, "grad_norm": 39.45542646452124, "language_loss": 0.85814542, "learning_rate": 3.405380063219665e-08, "loss": 0.87289184, "num_input_tokens_seen": 338187215, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.2623291, "step": 15683, "time_per_iteration": 2.7601516246795654 }, { "auxiliary_loss_clip": 0.01259847, "auxiliary_loss_mlp": 0.00233324, "balance_loss_clip": 1.0363636, "balance_loss_mlp": 0.20736055, "epoch": 0.942973094844431, "flos": 17959824063360.0, "grad_norm": 22.70085955848421, "language_loss": 0.87751251, "learning_rate": 3.398227451090885e-08, "loss": 0.89244425, "num_input_tokens_seen": 338201825, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.25976562, "step": 15684, "time_per_iteration": 2.648496150970459 }, { "auxiliary_loss_clip": 0.01228428, "auxiliary_loss_mlp": 0.00225953, "balance_loss_clip": 1.01907372, "balance_loss_mlp": 0.20252812, "epoch": 0.9430332180970991, "flos": 26137689310080.0, "grad_norm": 138.5949563535763, "language_loss": 0.84160048, "learning_rate": 3.391082294121017e-08, "loss": 0.85614431, "num_input_tokens_seen": 338220865, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.234375, "step": 15685, "time_per_iteration": 4.080591440200806 }, { "auxiliary_loss_clip": 0.01212939, "auxiliary_loss_mlp": 0.00192793, "balance_loss_clip": 1.01077056, "balance_loss_mlp": 0.16839108, "epoch": 0.943093341349767, "flos": 23951376190080.0, "grad_norm": 156.00964018211724, "language_loss": 0.84126246, "learning_rate": 3.383944592581023e-08, "loss": 0.8553198, "num_input_tokens_seen": 338240160, "router_z_loss_clip": 2.02441406, "router_z_loss_mlp": 0.24377441, "step": 15686, "time_per_iteration": 2.6758170127868652 }, { "auxiliary_loss_clip": 0.01231645, "auxiliary_loss_mlp": 0.00231346, "balance_loss_clip": 1.02198625, "balance_loss_mlp": 0.20619301, "epoch": 0.943153464602435, "flos": 17968407413760.0, "grad_norm": 15.387021263275203, "language_loss": 0.88993758, "learning_rate": 3.376814346741575e-08, "loss": 0.90456754, "num_input_tokens_seen": 338259305, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.25195312, "step": 15687, "time_per_iteration": 2.639714241027832 }, { "auxiliary_loss_clip": 0.01252664, "auxiliary_loss_mlp": 0.00237813, "balance_loss_clip": 1.02946949, "balance_loss_mlp": 0.21091893, "epoch": 0.943213587855103, "flos": 14501519544960.0, "grad_norm": 52.46079674870258, "language_loss": 0.86028248, "learning_rate": 3.369691556873011e-08, "loss": 0.87518728, "num_input_tokens_seen": 338274950, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26904297, "step": 15688, "time_per_iteration": 2.6492702960968018 }, { "auxiliary_loss_clip": 0.0122704, "auxiliary_loss_mlp": 0.00200298, "balance_loss_clip": 1.01605868, "balance_loss_mlp": 0.17656311, "epoch": 0.9432737111077709, "flos": 28986411093120.0, "grad_norm": 56.278707544026695, "language_loss": 0.75295079, "learning_rate": 3.3625762232454504e-08, "loss": 0.76722413, "num_input_tokens_seen": 338295585, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.23718262, "step": 15689, "time_per_iteration": 2.813739538192749 }, { "auxiliary_loss_clip": 0.01206354, "auxiliary_loss_mlp": 0.00209702, "balance_loss_clip": 1.00180483, "balance_loss_mlp": 0.18801789, "epoch": 0.9433338343604389, "flos": 21609066303360.0, "grad_norm": 49.62747263622572, "language_loss": 0.87407422, "learning_rate": 3.35546834612872e-08, "loss": 0.88823485, "num_input_tokens_seen": 338314555, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.21691895, "step": 15690, "time_per_iteration": 2.659653902053833 }, { "auxiliary_loss_clip": 0.01233055, "auxiliary_loss_mlp": 0.00239025, "balance_loss_clip": 1.02285016, "balance_loss_mlp": 0.21469411, "epoch": 0.9433939576131068, "flos": 33182285483520.0, "grad_norm": 25.80031863183923, "language_loss": 0.67235577, "learning_rate": 3.348367925792317e-08, "loss": 0.68707657, "num_input_tokens_seen": 338336260, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.24328613, "step": 15691, "time_per_iteration": 2.7773056030273438 }, { "auxiliary_loss_clip": 0.01243276, "auxiliary_loss_mlp": 0.00217586, "balance_loss_clip": 1.02443421, "balance_loss_mlp": 0.19260009, "epoch": 0.9434540808657749, "flos": 20486391742080.0, "grad_norm": 26.147835712215763, "language_loss": 0.75869465, "learning_rate": 3.341274962505514e-08, "loss": 0.77330327, "num_input_tokens_seen": 338354680, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.25, "step": 15692, "time_per_iteration": 2.6313483715057373 }, { "auxiliary_loss_clip": 0.01246486, "auxiliary_loss_mlp": 0.00225613, "balance_loss_clip": 1.0254612, "balance_loss_mlp": 0.19941071, "epoch": 0.9435142041184428, "flos": 21542955321600.0, "grad_norm": 264.6690311241186, "language_loss": 0.83295, "learning_rate": 3.334189456537251e-08, "loss": 0.84767097, "num_input_tokens_seen": 338372490, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.2623291, "step": 15693, "time_per_iteration": 2.6765332221984863 }, { "auxiliary_loss_clip": 0.0124293, "auxiliary_loss_mlp": 0.00222085, "balance_loss_clip": 1.02768743, "balance_loss_mlp": 0.19793274, "epoch": 0.9435743273711108, "flos": 25009089004800.0, "grad_norm": 7.865532122250188, "language_loss": 0.80489045, "learning_rate": 3.327111408156291e-08, "loss": 0.81954062, "num_input_tokens_seen": 338390870, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.24145508, "step": 15694, "time_per_iteration": 2.675974130630493 }, { "auxiliary_loss_clip": 0.01091121, "auxiliary_loss_mlp": 0.0006953, "balance_loss_clip": 0.95436841, "balance_loss_mlp": 0.06252004, "epoch": 0.9436344506237787, "flos": 60158707320960.0, "grad_norm": 2.517113411525512, "language_loss": 0.49771762, "learning_rate": 3.3200408176309316e-08, "loss": 0.50932413, "num_input_tokens_seen": 338453075, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.0703125, "step": 15695, "time_per_iteration": 3.23744797706604 }, { "auxiliary_loss_clip": 0.01210919, "auxiliary_loss_mlp": 0.00204321, "balance_loss_clip": 1.00638103, "balance_loss_mlp": 0.18204062, "epoch": 0.9436945738764467, "flos": 22237252283520.0, "grad_norm": 65.58963433816983, "language_loss": 0.73341918, "learning_rate": 3.312977685229335e-08, "loss": 0.74757159, "num_input_tokens_seen": 338471770, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.22290039, "step": 15696, "time_per_iteration": 2.6998977661132812 }, { "auxiliary_loss_clip": 0.01237733, "auxiliary_loss_mlp": 0.00204366, "balance_loss_clip": 1.02534473, "balance_loss_mlp": 0.1807034, "epoch": 0.9437546971291146, "flos": 25045179194880.0, "grad_norm": 95.31960541045768, "language_loss": 0.75372344, "learning_rate": 3.305922011219353e-08, "loss": 0.76814437, "num_input_tokens_seen": 338492190, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.2364502, "step": 15697, "time_per_iteration": 2.728760004043579 }, { "auxiliary_loss_clip": 0.01090505, "auxiliary_loss_mlp": 0.00058958, "balance_loss_clip": 0.95396364, "balance_loss_mlp": 0.05180521, "epoch": 0.9438148203817827, "flos": 56790788400000.0, "grad_norm": 0.8212770561454543, "language_loss": 0.61694348, "learning_rate": 3.298873795868506e-08, "loss": 0.62843812, "num_input_tokens_seen": 338552560, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.07128906, "step": 15698, "time_per_iteration": 3.1211838722229004 }, { "auxiliary_loss_clip": 0.01244883, "auxiliary_loss_mlp": 0.00229928, "balance_loss_clip": 1.02251661, "balance_loss_mlp": 0.20111531, "epoch": 0.9438749436344506, "flos": 22346384780160.0, "grad_norm": 21.6318022610425, "language_loss": 0.77989185, "learning_rate": 3.291833039444092e-08, "loss": 0.79463995, "num_input_tokens_seen": 338571770, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.28845215, "step": 15699, "time_per_iteration": 2.6793012619018555 }, { "auxiliary_loss_clip": 0.01223978, "auxiliary_loss_mlp": 0.00229896, "balance_loss_clip": 1.01636434, "balance_loss_mlp": 0.20663871, "epoch": 0.9439350668871186, "flos": 13370800337280.0, "grad_norm": 25.727182048325727, "language_loss": 0.83786809, "learning_rate": 3.2847997422130734e-08, "loss": 0.85240686, "num_input_tokens_seen": 338587310, "router_z_loss_clip": 2.07714844, "router_z_loss_mlp": 0.23278809, "step": 15700, "time_per_iteration": 2.6991260051727295 }, { "auxiliary_loss_clip": 0.01232981, "auxiliary_loss_mlp": 0.00213098, "balance_loss_clip": 1.0251534, "balance_loss_mlp": 0.18844587, "epoch": 0.9439951901397866, "flos": 17785334770560.0, "grad_norm": 55.62835655494158, "language_loss": 0.79562187, "learning_rate": 3.2777739044421495e-08, "loss": 0.81008261, "num_input_tokens_seen": 338606235, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.24633789, "step": 15701, "time_per_iteration": 2.7022478580474854 }, { "auxiliary_loss_clip": 0.0125177, "auxiliary_loss_mlp": 0.00250024, "balance_loss_clip": 1.03014827, "balance_loss_mlp": 0.22472805, "epoch": 0.9440553133924545, "flos": 18879568738560.0, "grad_norm": 119.6071011945671, "language_loss": 0.86477602, "learning_rate": 3.2707555263977505e-08, "loss": 0.879794, "num_input_tokens_seen": 338624090, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.25305176, "step": 15702, "time_per_iteration": 2.698657274246216 }, { "auxiliary_loss_clip": 0.01268919, "auxiliary_loss_mlp": 0.00242156, "balance_loss_clip": 1.04316044, "balance_loss_mlp": 0.21484488, "epoch": 0.9441154366451225, "flos": 19572967860480.0, "grad_norm": 394.0047743146646, "language_loss": 0.7644282, "learning_rate": 3.2637446083460194e-08, "loss": 0.77953893, "num_input_tokens_seen": 338643695, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.2734375, "step": 15703, "time_per_iteration": 2.660036325454712 }, { "auxiliary_loss_clip": 0.01252622, "auxiliary_loss_mlp": 0.00244581, "balance_loss_clip": 1.02913916, "balance_loss_mlp": 0.21816452, "epoch": 0.9441755598977905, "flos": 30294995472000.0, "grad_norm": 77.58822439235817, "language_loss": 0.81658566, "learning_rate": 3.256741150552833e-08, "loss": 0.83155775, "num_input_tokens_seen": 338664725, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26428223, "step": 15704, "time_per_iteration": 2.755995273590088 }, { "auxiliary_loss_clip": 0.01229649, "auxiliary_loss_mlp": 0.00222651, "balance_loss_clip": 1.01328778, "balance_loss_mlp": 0.19767642, "epoch": 0.9442356831504585, "flos": 20667884186880.0, "grad_norm": 163.94844616316863, "language_loss": 0.85442102, "learning_rate": 3.2497451532837336e-08, "loss": 0.86894393, "num_input_tokens_seen": 338683990, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.24987793, "step": 15705, "time_per_iteration": 2.648963212966919 }, { "auxiliary_loss_clip": 0.01237604, "auxiliary_loss_mlp": 0.00203458, "balance_loss_clip": 1.02403951, "balance_loss_mlp": 0.1801286, "epoch": 0.9442958064031264, "flos": 16107265140480.0, "grad_norm": 183.21745023682266, "language_loss": 0.8626045, "learning_rate": 3.2427566168039986e-08, "loss": 0.87701511, "num_input_tokens_seen": 338702025, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.23352051, "step": 15706, "time_per_iteration": 2.681553602218628 }, { "auxiliary_loss_clip": 0.01213753, "auxiliary_loss_mlp": 0.00237072, "balance_loss_clip": 1.00709081, "balance_loss_mlp": 0.21380232, "epoch": 0.9443559296557944, "flos": 20447392550400.0, "grad_norm": 13.469468247638869, "language_loss": 0.75027323, "learning_rate": 3.23577554137866e-08, "loss": 0.76478148, "num_input_tokens_seen": 338720920, "router_z_loss_clip": 2.06933594, "router_z_loss_mlp": 0.23254395, "step": 15707, "time_per_iteration": 2.6461386680603027 }, { "auxiliary_loss_clip": 0.01227866, "auxiliary_loss_mlp": 0.00209166, "balance_loss_clip": 1.02189827, "balance_loss_mlp": 0.18617094, "epoch": 0.9444160529084623, "flos": 21610897896960.0, "grad_norm": 78.13475560854229, "language_loss": 0.76378375, "learning_rate": 3.22880192727244e-08, "loss": 0.77815408, "num_input_tokens_seen": 338739590, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.2298584, "step": 15708, "time_per_iteration": 2.670027017593384 }, { "auxiliary_loss_clip": 0.01242581, "auxiliary_loss_mlp": 0.00215688, "balance_loss_clip": 1.02978802, "balance_loss_mlp": 0.19222759, "epoch": 0.9444761761611303, "flos": 18441781776000.0, "grad_norm": 244.58267413296477, "language_loss": 0.80655605, "learning_rate": 3.221835774749748e-08, "loss": 0.82113874, "num_input_tokens_seen": 338757240, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.23461914, "step": 15709, "time_per_iteration": 2.6113405227661133 }, { "auxiliary_loss_clip": 0.01219952, "auxiliary_loss_mlp": 0.00216348, "balance_loss_clip": 1.01004136, "balance_loss_mlp": 0.19546258, "epoch": 0.9445362994137982, "flos": 20957144411520.0, "grad_norm": 1810.358016725857, "language_loss": 0.92839062, "learning_rate": 3.214877084074774e-08, "loss": 0.94275361, "num_input_tokens_seen": 338773750, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.2088623, "step": 15710, "time_per_iteration": 2.778749942779541 }, { "auxiliary_loss_clip": 0.01234657, "auxiliary_loss_mlp": 0.00238363, "balance_loss_clip": 1.0176537, "balance_loss_mlp": 0.21117173, "epoch": 0.9445964226664663, "flos": 20303283185280.0, "grad_norm": 9.97297737973938, "language_loss": 0.78154385, "learning_rate": 3.2079258555113956e-08, "loss": 0.79627407, "num_input_tokens_seen": 338792115, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.27197266, "step": 15711, "time_per_iteration": 2.7165844440460205 }, { "auxiliary_loss_clip": 0.01243434, "auxiliary_loss_mlp": 0.00221197, "balance_loss_clip": 1.02868664, "balance_loss_mlp": 0.19718787, "epoch": 0.9446565459191342, "flos": 26396030903040.0, "grad_norm": 15.056205502301392, "language_loss": 0.76234686, "learning_rate": 3.200982089323179e-08, "loss": 0.77699322, "num_input_tokens_seen": 338812480, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.2401123, "step": 15712, "time_per_iteration": 2.6959824562072754 }, { "auxiliary_loss_clip": 0.01261868, "auxiliary_loss_mlp": 0.00243505, "balance_loss_clip": 1.03405583, "balance_loss_mlp": 0.21423946, "epoch": 0.9447166691718022, "flos": 16544764794240.0, "grad_norm": 31.438380638228004, "language_loss": 0.79215163, "learning_rate": 3.1940457857734246e-08, "loss": 0.80720532, "num_input_tokens_seen": 338829105, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.29248047, "step": 15713, "time_per_iteration": 2.6425936222076416 }, { "auxiliary_loss_clip": 0.01234022, "auxiliary_loss_mlp": 0.00225362, "balance_loss_clip": 1.02211618, "balance_loss_mlp": 0.20260471, "epoch": 0.9447767924244702, "flos": 29164635400320.0, "grad_norm": 6.538795757938669, "language_loss": 0.8395859, "learning_rate": 3.187116945125212e-08, "loss": 0.8541798, "num_input_tokens_seen": 338850670, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.22766113, "step": 15714, "time_per_iteration": 2.711700677871704 }, { "auxiliary_loss_clip": 0.01256644, "auxiliary_loss_mlp": 0.00231958, "balance_loss_clip": 1.03074455, "balance_loss_mlp": 0.20589897, "epoch": 0.9448369156771381, "flos": 19274908803840.0, "grad_norm": 113.25434964580083, "language_loss": 0.75276303, "learning_rate": 3.1801955676412194e-08, "loss": 0.76764905, "num_input_tokens_seen": 338867795, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26074219, "step": 15715, "time_per_iteration": 4.067935466766357 }, { "auxiliary_loss_clip": 0.0123552, "auxiliary_loss_mlp": 0.00240659, "balance_loss_clip": 1.02324867, "balance_loss_mlp": 0.21543387, "epoch": 0.9448970389298061, "flos": 23841166285440.0, "grad_norm": 15.808150731392649, "language_loss": 0.82287467, "learning_rate": 3.173281653583948e-08, "loss": 0.83763647, "num_input_tokens_seen": 338887205, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.25244141, "step": 15716, "time_per_iteration": 2.6892614364624023 }, { "auxiliary_loss_clip": 0.01261524, "auxiliary_loss_mlp": 0.00228553, "balance_loss_clip": 1.03557158, "balance_loss_mlp": 0.20074144, "epoch": 0.944957162182474, "flos": 22382259488640.0, "grad_norm": 3.0024220774128745, "language_loss": 0.69863164, "learning_rate": 3.166375203215565e-08, "loss": 0.71353245, "num_input_tokens_seen": 338906130, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.27807617, "step": 15717, "time_per_iteration": 4.1310882568359375 }, { "auxiliary_loss_clip": 0.01236044, "auxiliary_loss_mlp": 0.00228448, "balance_loss_clip": 1.02387428, "balance_loss_mlp": 0.20468968, "epoch": 0.9450172854351421, "flos": 17383889393280.0, "grad_norm": 3.1921625785407772, "language_loss": 0.84888691, "learning_rate": 3.1594762167979514e-08, "loss": 0.86353183, "num_input_tokens_seen": 338923045, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.23779297, "step": 15718, "time_per_iteration": 2.672595262527466 }, { "auxiliary_loss_clip": 0.01087381, "auxiliary_loss_mlp": 0.00070449, "balance_loss_clip": 0.9510597, "balance_loss_mlp": 0.06343907, "epoch": 0.94507740868781, "flos": 68466352406400.0, "grad_norm": 0.7299572399325407, "language_loss": 0.57400268, "learning_rate": 3.152584694592719e-08, "loss": 0.58558095, "num_input_tokens_seen": 338987545, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.0703125, "step": 15719, "time_per_iteration": 3.190312147140503 }, { "auxiliary_loss_clip": 0.01237589, "auxiliary_loss_mlp": 0.00228009, "balance_loss_clip": 1.02214622, "balance_loss_mlp": 0.2030105, "epoch": 0.945137531940478, "flos": 21142479611520.0, "grad_norm": 4.700827870151682, "language_loss": 0.81297356, "learning_rate": 3.145700636861193e-08, "loss": 0.82762957, "num_input_tokens_seen": 339007830, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.24987793, "step": 15720, "time_per_iteration": 2.6379942893981934 }, { "auxiliary_loss_clip": 0.01216733, "auxiliary_loss_mlp": 0.00221522, "balance_loss_clip": 1.00888824, "balance_loss_mlp": 0.19820431, "epoch": 0.9451976551931459, "flos": 24533918962560.0, "grad_norm": 69.73470005751041, "language_loss": 0.78503847, "learning_rate": 3.138824043864452e-08, "loss": 0.79942101, "num_input_tokens_seen": 339028980, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.23327637, "step": 15721, "time_per_iteration": 2.7217938899993896 }, { "auxiliary_loss_clip": 0.01243563, "auxiliary_loss_mlp": 0.00256708, "balance_loss_clip": 1.02677286, "balance_loss_mlp": 0.23140022, "epoch": 0.9452577784458139, "flos": 23440582834560.0, "grad_norm": 8.265496637982888, "language_loss": 0.93670821, "learning_rate": 3.131954915863244e-08, "loss": 0.95171082, "num_input_tokens_seen": 339047950, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.2532959, "step": 15722, "time_per_iteration": 2.6794915199279785 }, { "auxiliary_loss_clip": 0.01090123, "auxiliary_loss_mlp": 0.00084107, "balance_loss_clip": 0.95304322, "balance_loss_mlp": 0.07695486, "epoch": 0.9453179016984818, "flos": 52017686449920.0, "grad_norm": 0.9419840470695992, "language_loss": 0.63580012, "learning_rate": 3.125093253118005e-08, "loss": 0.64754242, "num_input_tokens_seen": 339104535, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07128906, "step": 15723, "time_per_iteration": 3.0852644443511963 }, { "auxiliary_loss_clip": 0.01238377, "auxiliary_loss_mlp": 0.0021768, "balance_loss_clip": 1.02648282, "balance_loss_mlp": 0.19365975, "epoch": 0.9453780249511499, "flos": 13473001509120.0, "grad_norm": 10.189135454611666, "language_loss": 0.81973696, "learning_rate": 3.1182390558889715e-08, "loss": 0.83429748, "num_input_tokens_seen": 339122050, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.24023438, "step": 15724, "time_per_iteration": 2.620089054107666 }, { "auxiliary_loss_clip": 0.01221228, "auxiliary_loss_mlp": 0.0021731, "balance_loss_clip": 1.0114727, "balance_loss_mlp": 0.19371887, "epoch": 0.9454381482038178, "flos": 23258515772160.0, "grad_norm": 5.267125747039644, "language_loss": 0.93175691, "learning_rate": 3.111392324436024e-08, "loss": 0.94614232, "num_input_tokens_seen": 339138940, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23583984, "step": 15725, "time_per_iteration": 4.122671842575073 }, { "auxiliary_loss_clip": 0.01232633, "auxiliary_loss_mlp": 0.00218278, "balance_loss_clip": 1.02010369, "balance_loss_mlp": 0.19412634, "epoch": 0.9454982714564858, "flos": 19496621502720.0, "grad_norm": 22.0289124579766, "language_loss": 0.77926028, "learning_rate": 3.104553059018822e-08, "loss": 0.79376936, "num_input_tokens_seen": 339158245, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.24133301, "step": 15726, "time_per_iteration": 2.6906869411468506 }, { "auxiliary_loss_clip": 0.01228729, "auxiliary_loss_mlp": 0.00232868, "balance_loss_clip": 1.014153, "balance_loss_mlp": 0.20587894, "epoch": 0.9455583947091538, "flos": 23258120722560.0, "grad_norm": 8.573864251115046, "language_loss": 0.73305929, "learning_rate": 3.097721259896735e-08, "loss": 0.74767518, "num_input_tokens_seen": 339178200, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.26977539, "step": 15727, "time_per_iteration": 4.192553520202637 }, { "auxiliary_loss_clip": 0.01222463, "auxiliary_loss_mlp": 0.00247646, "balance_loss_clip": 1.01489723, "balance_loss_mlp": 0.22305307, "epoch": 0.9456185179618217, "flos": 17673041877120.0, "grad_norm": 53.16807866616446, "language_loss": 0.88463551, "learning_rate": 3.0908969273287566e-08, "loss": 0.89933658, "num_input_tokens_seen": 339193950, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.24584961, "step": 15728, "time_per_iteration": 2.6553094387054443 }, { "auxiliary_loss_clip": 0.01086142, "auxiliary_loss_mlp": 0.00087755, "balance_loss_clip": 0.95176065, "balance_loss_mlp": 0.07917231, "epoch": 0.9456786412144897, "flos": 61415040389760.0, "grad_norm": 0.7009839136374952, "language_loss": 0.577613, "learning_rate": 3.08408006157368e-08, "loss": 0.58935201, "num_input_tokens_seen": 339252330, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.0859375, "step": 15729, "time_per_iteration": 3.0930285453796387 }, { "auxiliary_loss_clip": 0.01232755, "auxiliary_loss_mlp": 0.0020865, "balance_loss_clip": 1.01551235, "balance_loss_mlp": 0.18294895, "epoch": 0.9457387644671577, "flos": 18588369179520.0, "grad_norm": 65.70178223015573, "language_loss": 0.86070645, "learning_rate": 3.077270662890052e-08, "loss": 0.87512052, "num_input_tokens_seen": 339270325, "router_z_loss_clip": 2.16894531, "router_z_loss_mlp": 0.25695801, "step": 15730, "time_per_iteration": 2.6084976196289062 }, { "auxiliary_loss_clip": 0.01236314, "auxiliary_loss_mlp": 0.00226189, "balance_loss_clip": 1.02001715, "balance_loss_mlp": 0.20040429, "epoch": 0.9457988877198257, "flos": 21108544237440.0, "grad_norm": 2.9301037077457672, "language_loss": 0.7307173, "learning_rate": 3.070468731536047e-08, "loss": 0.74534237, "num_input_tokens_seen": 339291980, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.25793457, "step": 15731, "time_per_iteration": 2.698867082595825 }, { "auxiliary_loss_clip": 0.01244389, "auxiliary_loss_mlp": 0.00246418, "balance_loss_clip": 1.02668273, "balance_loss_mlp": 0.22205195, "epoch": 0.9458590109724936, "flos": 26688379697280.0, "grad_norm": 7.795217950399212, "language_loss": 0.71666729, "learning_rate": 3.063674267769589e-08, "loss": 0.73157537, "num_input_tokens_seen": 339311795, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24401855, "step": 15732, "time_per_iteration": 2.6917245388031006 }, { "auxiliary_loss_clip": 0.01267152, "auxiliary_loss_mlp": 0.00241307, "balance_loss_clip": 1.04171801, "balance_loss_mlp": 0.21499777, "epoch": 0.9459191342251616, "flos": 18661591054080.0, "grad_norm": 14.405907350351852, "language_loss": 0.92965657, "learning_rate": 3.056887271848363e-08, "loss": 0.94474113, "num_input_tokens_seen": 339327745, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26318359, "step": 15733, "time_per_iteration": 2.6496026515960693 }, { "auxiliary_loss_clip": 0.01229095, "auxiliary_loss_mlp": 0.0023959, "balance_loss_clip": 1.02185869, "balance_loss_mlp": 0.21672592, "epoch": 0.9459792574778295, "flos": 23398459159680.0, "grad_norm": 8.288664939624198, "language_loss": 0.79992259, "learning_rate": 3.0501077440297173e-08, "loss": 0.81460941, "num_input_tokens_seen": 339346445, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.22888184, "step": 15734, "time_per_iteration": 2.6284732818603516 }, { "auxiliary_loss_clip": 0.01212266, "auxiliary_loss_mlp": 0.00197771, "balance_loss_clip": 1.00999486, "balance_loss_mlp": 0.17714766, "epoch": 0.9460393807304975, "flos": 24392969994240.0, "grad_norm": 21.57427035347676, "language_loss": 0.91685855, "learning_rate": 3.043335684570692e-08, "loss": 0.93095893, "num_input_tokens_seen": 339367945, "router_z_loss_clip": 2.02441406, "router_z_loss_mlp": 0.20629883, "step": 15735, "time_per_iteration": 2.712923049926758 }, { "auxiliary_loss_clip": 0.01254401, "auxiliary_loss_mlp": 0.00217316, "balance_loss_clip": 1.03567362, "balance_loss_mlp": 0.19441627, "epoch": 0.9460995039831654, "flos": 21939408708480.0, "grad_norm": 48.86565973331044, "language_loss": 0.79086906, "learning_rate": 3.036571093728102e-08, "loss": 0.80558622, "num_input_tokens_seen": 339386060, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.22900391, "step": 15736, "time_per_iteration": 2.632089853286743 }, { "auxiliary_loss_clip": 0.01088819, "auxiliary_loss_mlp": 0.0008576, "balance_loss_clip": 0.95198214, "balance_loss_mlp": 0.078178, "epoch": 0.9461596272358335, "flos": 70322466775680.0, "grad_norm": 0.8597885364064569, "language_loss": 0.64887071, "learning_rate": 3.029813971758499e-08, "loss": 0.66061652, "num_input_tokens_seen": 339446695, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07568359, "step": 15737, "time_per_iteration": 3.154442071914673 }, { "auxiliary_loss_clip": 0.01091013, "auxiliary_loss_mlp": 0.00068646, "balance_loss_clip": 0.95336968, "balance_loss_mlp": 0.06125479, "epoch": 0.9462197504885014, "flos": 58591242645120.0, "grad_norm": 0.774236130461802, "language_loss": 0.58061618, "learning_rate": 3.0230643189181225e-08, "loss": 0.5922128, "num_input_tokens_seen": 339510080, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07373047, "step": 15738, "time_per_iteration": 3.1332743167877197 }, { "auxiliary_loss_clip": 0.01217825, "auxiliary_loss_mlp": 0.00225028, "balance_loss_clip": 1.00871992, "balance_loss_mlp": 0.20204467, "epoch": 0.9462798737411694, "flos": 23433759250560.0, "grad_norm": 6.228703859454261, "language_loss": 0.8036027, "learning_rate": 3.016322135462834e-08, "loss": 0.81803119, "num_input_tokens_seen": 339529335, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.2298584, "step": 15739, "time_per_iteration": 2.7031428813934326 }, { "auxiliary_loss_clip": 0.01236207, "auxiliary_loss_mlp": 0.00246449, "balance_loss_clip": 1.02193117, "balance_loss_mlp": 0.22093853, "epoch": 0.9463399969938374, "flos": 25046077034880.0, "grad_norm": 9.390772681550105, "language_loss": 0.7414313, "learning_rate": 3.009587421648363e-08, "loss": 0.75625789, "num_input_tokens_seen": 339548820, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.25537109, "step": 15740, "time_per_iteration": 2.743234395980835 }, { "auxiliary_loss_clip": 0.01218703, "auxiliary_loss_mlp": 0.00201707, "balance_loss_clip": 1.01201117, "balance_loss_mlp": 0.17849684, "epoch": 0.9464001202465053, "flos": 24352606085760.0, "grad_norm": 20.35511196366351, "language_loss": 0.72139323, "learning_rate": 3.0028601777301045e-08, "loss": 0.73559737, "num_input_tokens_seen": 339566775, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.23205566, "step": 15741, "time_per_iteration": 2.6970322132110596 }, { "auxiliary_loss_clip": 0.01241691, "auxiliary_loss_mlp": 0.00246218, "balance_loss_clip": 1.02480066, "balance_loss_mlp": 0.22080255, "epoch": 0.9464602434991733, "flos": 17165444832000.0, "grad_norm": 17.96833780584023, "language_loss": 0.82542169, "learning_rate": 2.9961404039630987e-08, "loss": 0.8403008, "num_input_tokens_seen": 339581905, "router_z_loss_clip": 2.16699219, "router_z_loss_mlp": 0.25427246, "step": 15742, "time_per_iteration": 2.681450128555298 }, { "auxiliary_loss_clip": 0.01219229, "auxiliary_loss_mlp": 0.00213326, "balance_loss_clip": 1.01065385, "balance_loss_mlp": 0.1911885, "epoch": 0.9465203667518413, "flos": 19938107566080.0, "grad_norm": 4.826208602805238, "language_loss": 0.79283613, "learning_rate": 2.989428100602187e-08, "loss": 0.80716169, "num_input_tokens_seen": 339599870, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.22155762, "step": 15743, "time_per_iteration": 2.6483638286590576 }, { "auxiliary_loss_clip": 0.01252093, "auxiliary_loss_mlp": 0.00216228, "balance_loss_clip": 1.03150129, "balance_loss_mlp": 0.18945324, "epoch": 0.9465804900045093, "flos": 20120318282880.0, "grad_norm": 38.115111743554834, "language_loss": 0.87346917, "learning_rate": 2.982723267901943e-08, "loss": 0.88815236, "num_input_tokens_seen": 339620250, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.2677002, "step": 15744, "time_per_iteration": 2.7056326866149902 }, { "auxiliary_loss_clip": 0.01253355, "auxiliary_loss_mlp": 0.00242275, "balance_loss_clip": 1.03040123, "balance_loss_mlp": 0.21746796, "epoch": 0.9466406132571772, "flos": 23911622812800.0, "grad_norm": 12.972836540878014, "language_loss": 0.86229932, "learning_rate": 2.9760259061165417e-08, "loss": 0.87725568, "num_input_tokens_seen": 339639900, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.24816895, "step": 15745, "time_per_iteration": 2.664133310317993 }, { "auxiliary_loss_clip": 0.01247693, "auxiliary_loss_mlp": 0.00241633, "balance_loss_clip": 1.02964687, "balance_loss_mlp": 0.21680146, "epoch": 0.9467007365098452, "flos": 19933223316480.0, "grad_norm": 9.456357144756616, "language_loss": 0.76226425, "learning_rate": 2.9693360155000014e-08, "loss": 0.77715755, "num_input_tokens_seen": 339658970, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24853516, "step": 15746, "time_per_iteration": 2.6889138221740723 }, { "auxiliary_loss_clip": 0.01254136, "auxiliary_loss_mlp": 0.00221886, "balance_loss_clip": 1.03328156, "balance_loss_mlp": 0.19761544, "epoch": 0.9467608597625131, "flos": 19310496203520.0, "grad_norm": 12.576369917924946, "language_loss": 0.67153013, "learning_rate": 2.962653596305964e-08, "loss": 0.68629038, "num_input_tokens_seen": 339675600, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24267578, "step": 15747, "time_per_iteration": 2.745791435241699 }, { "auxiliary_loss_clip": 0.01087785, "auxiliary_loss_mlp": 0.00078588, "balance_loss_clip": 0.95149469, "balance_loss_mlp": 0.07095869, "epoch": 0.9468209830151811, "flos": 69630252802560.0, "grad_norm": 0.6825674353355761, "language_loss": 0.52817667, "learning_rate": 2.955978648787871e-08, "loss": 0.53984034, "num_input_tokens_seen": 339744505, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.07617188, "step": 15748, "time_per_iteration": 3.378309726715088 }, { "auxiliary_loss_clip": 0.01252578, "auxiliary_loss_mlp": 0.00223012, "balance_loss_clip": 1.03039503, "balance_loss_mlp": 0.19778773, "epoch": 0.946881106267849, "flos": 27016639113600.0, "grad_norm": 38.917986448243745, "language_loss": 0.74875224, "learning_rate": 2.9493111731988096e-08, "loss": 0.76350808, "num_input_tokens_seen": 339765810, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25219727, "step": 15749, "time_per_iteration": 2.728712558746338 }, { "auxiliary_loss_clip": 0.01238102, "auxiliary_loss_mlp": 0.00223402, "balance_loss_clip": 1.01977825, "balance_loss_mlp": 0.19741407, "epoch": 0.9469412295205171, "flos": 20190092451840.0, "grad_norm": 9.430849423535632, "language_loss": 0.85508168, "learning_rate": 2.942651169791621e-08, "loss": 0.86969674, "num_input_tokens_seen": 339784125, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.26000977, "step": 15750, "time_per_iteration": 2.6390621662139893 }, { "auxiliary_loss_clip": 0.01232143, "auxiliary_loss_mlp": 0.00211763, "balance_loss_clip": 1.02197218, "balance_loss_mlp": 0.18905368, "epoch": 0.947001352773185, "flos": 21324905809920.0, "grad_norm": 38.39850773149587, "language_loss": 0.74682248, "learning_rate": 2.9359986388188372e-08, "loss": 0.76126152, "num_input_tokens_seen": 339803450, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.22729492, "step": 15751, "time_per_iteration": 2.7050259113311768 }, { "auxiliary_loss_clip": 0.01254736, "auxiliary_loss_mlp": 0.00243836, "balance_loss_clip": 1.03303623, "balance_loss_mlp": 0.21877824, "epoch": 0.947061476025853, "flos": 21944041562880.0, "grad_norm": 149.4918828541659, "language_loss": 0.71580291, "learning_rate": 2.929353580532723e-08, "loss": 0.73078865, "num_input_tokens_seen": 339823215, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25048828, "step": 15752, "time_per_iteration": 2.6872706413269043 }, { "auxiliary_loss_clip": 0.01234937, "auxiliary_loss_mlp": 0.00220482, "balance_loss_clip": 1.02004671, "balance_loss_mlp": 0.19437516, "epoch": 0.947121599278521, "flos": 21394715892480.0, "grad_norm": 200.15320783420603, "language_loss": 0.780828, "learning_rate": 2.9227159951852764e-08, "loss": 0.79538214, "num_input_tokens_seen": 339842230, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.2611084, "step": 15753, "time_per_iteration": 2.7307941913604736 }, { "auxiliary_loss_clip": 0.01251588, "auxiliary_loss_mlp": 0.00220368, "balance_loss_clip": 1.02381897, "balance_loss_mlp": 0.19440463, "epoch": 0.9471817225311889, "flos": 23075730437760.0, "grad_norm": 609.5336300258261, "language_loss": 0.81315756, "learning_rate": 2.9160858830281855e-08, "loss": 0.8278771, "num_input_tokens_seen": 339861640, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.25952148, "step": 15754, "time_per_iteration": 2.7217957973480225 }, { "auxiliary_loss_clip": 0.01246131, "auxiliary_loss_mlp": 0.00211269, "balance_loss_clip": 1.02846432, "balance_loss_mlp": 0.18560307, "epoch": 0.947241845783857, "flos": 11910744305280.0, "grad_norm": 10.436702092921726, "language_loss": 0.89188612, "learning_rate": 2.9094632443129153e-08, "loss": 0.90646017, "num_input_tokens_seen": 339878210, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.25646973, "step": 15755, "time_per_iteration": 2.7593705654144287 }, { "auxiliary_loss_clip": 0.01282824, "auxiliary_loss_mlp": 0.00251221, "balance_loss_clip": 1.04882717, "balance_loss_mlp": 0.22273028, "epoch": 0.9473019690365249, "flos": 20740675098240.0, "grad_norm": 4.953453320097887, "language_loss": 0.85347211, "learning_rate": 2.9028480792904876e-08, "loss": 0.86881262, "num_input_tokens_seen": 339894255, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.28503418, "step": 15756, "time_per_iteration": 2.786654233932495 }, { "auxiliary_loss_clip": 0.01235052, "auxiliary_loss_mlp": 0.00241769, "balance_loss_clip": 1.02004814, "balance_loss_mlp": 0.21700966, "epoch": 0.9473620922891929, "flos": 17639896602240.0, "grad_norm": 4.183806560923486, "language_loss": 0.8446905, "learning_rate": 2.8962403882118347e-08, "loss": 0.85945874, "num_input_tokens_seen": 339912425, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24743652, "step": 15757, "time_per_iteration": 2.647167444229126 }, { "auxiliary_loss_clip": 0.01245534, "auxiliary_loss_mlp": 0.00228905, "balance_loss_clip": 1.02720034, "balance_loss_mlp": 0.20371626, "epoch": 0.9474222155418608, "flos": 23550002640000.0, "grad_norm": 48.00510411600294, "language_loss": 0.87263107, "learning_rate": 2.889640171327512e-08, "loss": 0.88737547, "num_input_tokens_seen": 339929635, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25195312, "step": 15758, "time_per_iteration": 4.057866334915161 }, { "auxiliary_loss_clip": 0.01231825, "auxiliary_loss_mlp": 0.0023367, "balance_loss_clip": 1.01764512, "balance_loss_mlp": 0.20799184, "epoch": 0.9474823387945288, "flos": 27089753247360.0, "grad_norm": 540.8298597907101, "language_loss": 0.78374803, "learning_rate": 2.8830474288877638e-08, "loss": 0.79840297, "num_input_tokens_seen": 339951200, "router_z_loss_clip": 2.14355469, "router_z_loss_mlp": 0.25683594, "step": 15759, "time_per_iteration": 4.179820775985718 }, { "auxiliary_loss_clip": 0.0123342, "auxiliary_loss_mlp": 0.00209597, "balance_loss_clip": 1.02932382, "balance_loss_mlp": 0.18916428, "epoch": 0.9475424620471967, "flos": 22966526113920.0, "grad_norm": 21.675174209074235, "language_loss": 0.83364701, "learning_rate": 2.8764621611426344e-08, "loss": 0.84807718, "num_input_tokens_seen": 339971820, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.20446777, "step": 15760, "time_per_iteration": 2.6980113983154297 }, { "auxiliary_loss_clip": 0.01230835, "auxiliary_loss_mlp": 0.00228127, "balance_loss_clip": 1.01790071, "balance_loss_mlp": 0.20401117, "epoch": 0.9476025852998647, "flos": 20047671025920.0, "grad_norm": 31.952381981582032, "language_loss": 0.80883062, "learning_rate": 2.8698843683418128e-08, "loss": 0.82342029, "num_input_tokens_seen": 339989420, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.24121094, "step": 15761, "time_per_iteration": 2.665698289871216 }, { "auxiliary_loss_clip": 0.01235686, "auxiliary_loss_mlp": 0.00227016, "balance_loss_clip": 1.02176118, "balance_loss_mlp": 0.20362671, "epoch": 0.9476627085525327, "flos": 14975468524800.0, "grad_norm": 28.185537648324015, "language_loss": 0.80800015, "learning_rate": 2.863314050734722e-08, "loss": 0.82262719, "num_input_tokens_seen": 340006690, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.23388672, "step": 15762, "time_per_iteration": 2.6210334300994873 }, { "auxiliary_loss_clip": 0.0125514, "auxiliary_loss_mlp": 0.0023915, "balance_loss_clip": 1.034464, "balance_loss_mlp": 0.21371059, "epoch": 0.9477228318052007, "flos": 18697788984960.0, "grad_norm": 11.080046364427545, "language_loss": 0.76733065, "learning_rate": 2.856751208570518e-08, "loss": 0.78227359, "num_input_tokens_seen": 340025480, "router_z_loss_clip": 2.20800781, "router_z_loss_mlp": 0.25463867, "step": 15763, "time_per_iteration": 2.7327163219451904 }, { "auxiliary_loss_clip": 0.01237916, "auxiliary_loss_mlp": 0.00218537, "balance_loss_clip": 1.02472913, "balance_loss_mlp": 0.19334812, "epoch": 0.9477829550578686, "flos": 23875065745920.0, "grad_norm": 2.3104922000517476, "language_loss": 0.769871, "learning_rate": 2.8501958420980466e-08, "loss": 0.78443551, "num_input_tokens_seen": 340043785, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.2520752, "step": 15764, "time_per_iteration": 2.725487232208252 }, { "auxiliary_loss_clip": 0.01213303, "auxiliary_loss_mlp": 0.00223323, "balance_loss_clip": 1.01144016, "balance_loss_mlp": 0.20087565, "epoch": 0.9478430783105366, "flos": 22562890007040.0, "grad_norm": 7.051341868670931, "language_loss": 0.76267946, "learning_rate": 2.8436479515659306e-08, "loss": 0.77704573, "num_input_tokens_seen": 340064360, "router_z_loss_clip": 2.01757812, "router_z_loss_mlp": 0.22460938, "step": 15765, "time_per_iteration": 2.7210450172424316 }, { "auxiliary_loss_clip": 0.01090633, "auxiliary_loss_mlp": 0.00080987, "balance_loss_clip": 0.95254612, "balance_loss_mlp": 0.07359651, "epoch": 0.9479032015632046, "flos": 60857885554560.0, "grad_norm": 0.7836456366488854, "language_loss": 0.57481772, "learning_rate": 2.8371075372224384e-08, "loss": 0.5865339, "num_input_tokens_seen": 340114425, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07373047, "step": 15766, "time_per_iteration": 2.984362840652466 }, { "auxiliary_loss_clip": 0.01225778, "auxiliary_loss_mlp": 0.00223566, "balance_loss_clip": 1.01168919, "balance_loss_mlp": 0.19931872, "epoch": 0.9479633248158725, "flos": 14683873916160.0, "grad_norm": 49.301189647079575, "language_loss": 0.83082867, "learning_rate": 2.8305745993155938e-08, "loss": 0.84532213, "num_input_tokens_seen": 340132200, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.2421875, "step": 15767, "time_per_iteration": 4.086865663528442 }, { "auxiliary_loss_clip": 0.01259603, "auxiliary_loss_mlp": 0.00233726, "balance_loss_clip": 1.03918386, "balance_loss_mlp": 0.20842943, "epoch": 0.9480234480685406, "flos": 20333878594560.0, "grad_norm": 264.75682910197935, "language_loss": 0.82018417, "learning_rate": 2.8240491380931096e-08, "loss": 0.83511746, "num_input_tokens_seen": 340149175, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25292969, "step": 15768, "time_per_iteration": 2.7303524017333984 }, { "auxiliary_loss_clip": 0.0109065, "auxiliary_loss_mlp": 0.00118778, "balance_loss_clip": 0.9520539, "balance_loss_mlp": 0.10957488, "epoch": 0.9480835713212085, "flos": 70293092428800.0, "grad_norm": 0.7330031000873575, "language_loss": 0.54329777, "learning_rate": 2.8175311538024326e-08, "loss": 0.55539203, "num_input_tokens_seen": 340208155, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09179688, "step": 15769, "time_per_iteration": 4.572947978973389 }, { "auxiliary_loss_clip": 0.0123529, "auxiliary_loss_mlp": 0.00225996, "balance_loss_clip": 1.01852429, "balance_loss_mlp": 0.20203494, "epoch": 0.9481436945738765, "flos": 25449749055360.0, "grad_norm": 6.904436941537562, "language_loss": 0.83348733, "learning_rate": 2.8110206466907428e-08, "loss": 0.84810019, "num_input_tokens_seen": 340229275, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.23937988, "step": 15770, "time_per_iteration": 2.7193915843963623 }, { "auxiliary_loss_clip": 0.01239983, "auxiliary_loss_mlp": 0.00215979, "balance_loss_clip": 1.02394617, "balance_loss_mlp": 0.18895395, "epoch": 0.9482038178265444, "flos": 26979902478720.0, "grad_norm": 41.590468030805695, "language_loss": 0.85442245, "learning_rate": 2.8045176170049313e-08, "loss": 0.86898208, "num_input_tokens_seen": 340248920, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.27001953, "step": 15771, "time_per_iteration": 2.723235845565796 }, { "auxiliary_loss_clip": 0.01230519, "auxiliary_loss_mlp": 0.00234975, "balance_loss_clip": 1.01606631, "balance_loss_mlp": 0.2099527, "epoch": 0.9482639410792124, "flos": 17785442511360.0, "grad_norm": 20.034405428830496, "language_loss": 0.77176154, "learning_rate": 2.7980220649915566e-08, "loss": 0.78641641, "num_input_tokens_seen": 340266775, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.25012207, "step": 15772, "time_per_iteration": 2.6266589164733887 }, { "auxiliary_loss_clip": 0.01230866, "auxiliary_loss_mlp": 0.00213337, "balance_loss_clip": 1.01743269, "balance_loss_mlp": 0.18908992, "epoch": 0.9483240643318803, "flos": 20996682307200.0, "grad_norm": 18.174757000235882, "language_loss": 0.81349337, "learning_rate": 2.7915339908969327e-08, "loss": 0.82793534, "num_input_tokens_seen": 340285295, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24255371, "step": 15773, "time_per_iteration": 2.6718454360961914 }, { "auxiliary_loss_clip": 0.01249343, "auxiliary_loss_mlp": 0.00204598, "balance_loss_clip": 1.0286994, "balance_loss_mlp": 0.17803809, "epoch": 0.9483841875845483, "flos": 20083294339200.0, "grad_norm": 3.6625026049680387, "language_loss": 0.74839157, "learning_rate": 2.7850533949671072e-08, "loss": 0.76293099, "num_input_tokens_seen": 340304265, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.265625, "step": 15774, "time_per_iteration": 2.7701330184936523 }, { "auxiliary_loss_clip": 0.01249747, "auxiliary_loss_mlp": 0.00217617, "balance_loss_clip": 1.03127599, "balance_loss_mlp": 0.19173664, "epoch": 0.9484443108372163, "flos": 20813645577600.0, "grad_norm": 3.885092001277878, "language_loss": 0.6703856, "learning_rate": 2.7785802774478396e-08, "loss": 0.68505919, "num_input_tokens_seen": 340323690, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.25915527, "step": 15775, "time_per_iteration": 2.66629695892334 }, { "auxiliary_loss_clip": 0.01243273, "auxiliary_loss_mlp": 0.00213421, "balance_loss_clip": 1.02426481, "balance_loss_mlp": 0.18898261, "epoch": 0.9485044340898843, "flos": 36429184506240.0, "grad_norm": 15154.756483698007, "language_loss": 0.67898792, "learning_rate": 2.772114638584555e-08, "loss": 0.69355488, "num_input_tokens_seen": 340345830, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24475098, "step": 15776, "time_per_iteration": 2.8397116661071777 }, { "auxiliary_loss_clip": 0.01247983, "auxiliary_loss_mlp": 0.00227118, "balance_loss_clip": 1.02523685, "balance_loss_mlp": 0.19962817, "epoch": 0.9485645573425522, "flos": 22602535643520.0, "grad_norm": 12.385746189089813, "language_loss": 0.8462075, "learning_rate": 2.765656478622458e-08, "loss": 0.86095852, "num_input_tokens_seen": 340365910, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.27502441, "step": 15777, "time_per_iteration": 2.672736406326294 }, { "auxiliary_loss_clip": 0.01267159, "auxiliary_loss_mlp": 0.00234097, "balance_loss_clip": 1.04010332, "balance_loss_mlp": 0.20642836, "epoch": 0.9486246805952202, "flos": 22017766227840.0, "grad_norm": 19.722782116912896, "language_loss": 0.8686105, "learning_rate": 2.759205797806441e-08, "loss": 0.88362312, "num_input_tokens_seen": 340383935, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.2767334, "step": 15778, "time_per_iteration": 2.6551849842071533 }, { "auxiliary_loss_clip": 0.01209231, "auxiliary_loss_mlp": 0.00219015, "balance_loss_clip": 1.00675309, "balance_loss_mlp": 0.19698542, "epoch": 0.9486848038478882, "flos": 16508674604160.0, "grad_norm": 4.436244724610979, "language_loss": 0.77652967, "learning_rate": 2.7527625963810865e-08, "loss": 0.79081213, "num_input_tokens_seen": 340402760, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.22045898, "step": 15779, "time_per_iteration": 2.6172664165496826 }, { "auxiliary_loss_clip": 0.01245489, "auxiliary_loss_mlp": 0.00249872, "balance_loss_clip": 1.02498829, "balance_loss_mlp": 0.2222158, "epoch": 0.9487449271005561, "flos": 19244385221760.0, "grad_norm": 6.107454060978959, "language_loss": 0.88772881, "learning_rate": 2.7463268745907542e-08, "loss": 0.90268242, "num_input_tokens_seen": 340422105, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.27661133, "step": 15780, "time_per_iteration": 2.6514127254486084 }, { "auxiliary_loss_clip": 0.01237805, "auxiliary_loss_mlp": 0.00222132, "balance_loss_clip": 1.02231085, "balance_loss_mlp": 0.19781324, "epoch": 0.9488050503532242, "flos": 21762692772480.0, "grad_norm": 3.6662047746337016, "language_loss": 0.73469532, "learning_rate": 2.7398986326794494e-08, "loss": 0.74929464, "num_input_tokens_seen": 340441160, "router_z_loss_clip": 2.15332031, "router_z_loss_mlp": 0.2434082, "step": 15781, "time_per_iteration": 2.645259141921997 }, { "auxiliary_loss_clip": 0.01234895, "auxiliary_loss_mlp": 0.00231383, "balance_loss_clip": 1.02092183, "balance_loss_mlp": 0.20482326, "epoch": 0.9488651736058921, "flos": 18368919037440.0, "grad_norm": 42.214615418477166, "language_loss": 0.88175082, "learning_rate": 2.733477870890999e-08, "loss": 0.89641368, "num_input_tokens_seen": 340458200, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.265625, "step": 15782, "time_per_iteration": 2.871408224105835 }, { "auxiliary_loss_clip": 0.01084597, "auxiliary_loss_mlp": 0.0004825, "balance_loss_clip": 0.94735539, "balance_loss_mlp": 0.04228922, "epoch": 0.9489252968585601, "flos": 70084057230720.0, "grad_norm": 1.5440127501073468, "language_loss": 0.5933212, "learning_rate": 2.7270645894688082e-08, "loss": 0.60464966, "num_input_tokens_seen": 340526420, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.05957031, "step": 15783, "time_per_iteration": 3.280482769012451 }, { "auxiliary_loss_clip": 0.01238698, "auxiliary_loss_mlp": 0.00219474, "balance_loss_clip": 1.02144945, "balance_loss_mlp": 0.19380793, "epoch": 0.948985420111228, "flos": 27855440490240.0, "grad_norm": 55.29551359691361, "language_loss": 0.79139811, "learning_rate": 2.720658788656105e-08, "loss": 0.80597979, "num_input_tokens_seen": 340546325, "router_z_loss_clip": 2.17480469, "router_z_loss_mlp": 0.25671387, "step": 15784, "time_per_iteration": 2.7334346771240234 }, { "auxiliary_loss_clip": 0.01234985, "auxiliary_loss_mlp": 0.0023144, "balance_loss_clip": 1.02104867, "balance_loss_mlp": 0.20681109, "epoch": 0.949045543363896, "flos": 24316049018880.0, "grad_norm": 5.669556360518885, "language_loss": 0.77159965, "learning_rate": 2.714260468695806e-08, "loss": 0.78626394, "num_input_tokens_seen": 340565145, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24621582, "step": 15785, "time_per_iteration": 2.728456974029541 }, { "auxiliary_loss_clip": 0.0124846, "auxiliary_loss_mlp": 0.00247464, "balance_loss_clip": 1.02793288, "balance_loss_mlp": 0.22145236, "epoch": 0.9491056666165639, "flos": 24241677909120.0, "grad_norm": 21.004689177963268, "language_loss": 0.82455122, "learning_rate": 2.707869629830495e-08, "loss": 0.83951044, "num_input_tokens_seen": 340585465, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.26025391, "step": 15786, "time_per_iteration": 2.703324556350708 }, { "auxiliary_loss_clip": 0.01231533, "auxiliary_loss_mlp": 0.00202256, "balance_loss_clip": 1.01870847, "balance_loss_mlp": 0.17744815, "epoch": 0.949165789869232, "flos": 24531261356160.0, "grad_norm": 6.7348755986581725, "language_loss": 0.86257833, "learning_rate": 2.7014862723025335e-08, "loss": 0.87691629, "num_input_tokens_seen": 340606010, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.24780273, "step": 15787, "time_per_iteration": 2.723062753677368 }, { "auxiliary_loss_clip": 0.012397, "auxiliary_loss_mlp": 0.00204989, "balance_loss_clip": 1.02552867, "balance_loss_mlp": 0.18065877, "epoch": 0.9492259131218999, "flos": 22235348862720.0, "grad_norm": 21.212599265297058, "language_loss": 0.82351696, "learning_rate": 2.6951103963540388e-08, "loss": 0.83796388, "num_input_tokens_seen": 340626135, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.2434082, "step": 15788, "time_per_iteration": 2.6704752445220947 }, { "auxiliary_loss_clip": 0.01257553, "auxiliary_loss_mlp": 0.00249951, "balance_loss_clip": 1.03421152, "balance_loss_mlp": 0.22153118, "epoch": 0.9492860363745679, "flos": 22966310632320.0, "grad_norm": 75.5700422038416, "language_loss": 0.80905122, "learning_rate": 2.6887420022266848e-08, "loss": 0.82412618, "num_input_tokens_seen": 340644870, "router_z_loss_clip": 2.22949219, "router_z_loss_mlp": 0.28430176, "step": 15789, "time_per_iteration": 2.6900794506073 }, { "auxiliary_loss_clip": 0.01250251, "auxiliary_loss_mlp": 0.00245906, "balance_loss_clip": 1.03248119, "balance_loss_mlp": 0.21951291, "epoch": 0.9493461596272358, "flos": 18370283754240.0, "grad_norm": 4.450523661736236, "language_loss": 0.83416915, "learning_rate": 2.682381090161989e-08, "loss": 0.84913075, "num_input_tokens_seen": 340663695, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.26379395, "step": 15790, "time_per_iteration": 2.6581103801727295 }, { "auxiliary_loss_clip": 0.01257591, "auxiliary_loss_mlp": 0.00220605, "balance_loss_clip": 1.03429842, "balance_loss_mlp": 0.19468907, "epoch": 0.9494062828799038, "flos": 20011724490240.0, "grad_norm": 5.2308508273767105, "language_loss": 0.88449275, "learning_rate": 2.6760276604012033e-08, "loss": 0.89927471, "num_input_tokens_seen": 340682970, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.25927734, "step": 15791, "time_per_iteration": 2.695176124572754 }, { "auxiliary_loss_clip": 0.01267563, "auxiliary_loss_mlp": 0.00243452, "balance_loss_clip": 1.04052901, "balance_loss_mlp": 0.21707103, "epoch": 0.9494664061325718, "flos": 27228583313280.0, "grad_norm": 70.8311351801471, "language_loss": 0.83533746, "learning_rate": 2.6696817131852234e-08, "loss": 0.85044765, "num_input_tokens_seen": 340702275, "router_z_loss_clip": 2.26855469, "router_z_loss_mlp": 0.26403809, "step": 15792, "time_per_iteration": 2.6934125423431396 }, { "auxiliary_loss_clip": 0.01247655, "auxiliary_loss_mlp": 0.00241071, "balance_loss_clip": 1.0255326, "balance_loss_mlp": 0.21396306, "epoch": 0.9495265293852397, "flos": 18369816877440.0, "grad_norm": 3.441796488048126, "language_loss": 0.86009514, "learning_rate": 2.663343248754679e-08, "loss": 0.87498236, "num_input_tokens_seen": 340719060, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.27111816, "step": 15793, "time_per_iteration": 2.8286287784576416 }, { "auxiliary_loss_clip": 0.01231329, "auxiliary_loss_mlp": 0.00226113, "balance_loss_clip": 1.01307726, "balance_loss_mlp": 0.19985139, "epoch": 0.9495866526379078, "flos": 23075766351360.0, "grad_norm": 2.3425633336079636, "language_loss": 0.85555297, "learning_rate": 2.6570122673499562e-08, "loss": 0.87012738, "num_input_tokens_seen": 340737815, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.26293945, "step": 15794, "time_per_iteration": 2.665341377258301 }, { "auxiliary_loss_clip": 0.01255144, "auxiliary_loss_mlp": 0.00232887, "balance_loss_clip": 1.03382874, "balance_loss_mlp": 0.2048724, "epoch": 0.9496467758905757, "flos": 17529902179200.0, "grad_norm": 11.99780752676853, "language_loss": 0.70593619, "learning_rate": 2.650688769211107e-08, "loss": 0.72081649, "num_input_tokens_seen": 340756035, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.28039551, "step": 15795, "time_per_iteration": 2.6450936794281006 }, { "auxiliary_loss_clip": 0.01241955, "auxiliary_loss_mlp": 0.00203781, "balance_loss_clip": 1.0286814, "balance_loss_mlp": 0.17782864, "epoch": 0.9497068991432437, "flos": 24133910129280.0, "grad_norm": 2.215066795912897, "language_loss": 0.89060235, "learning_rate": 2.644372754577895e-08, "loss": 0.9050597, "num_input_tokens_seen": 340775620, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.25964355, "step": 15796, "time_per_iteration": 2.7032852172851562 }, { "auxiliary_loss_clip": 0.01242385, "auxiliary_loss_mlp": 0.00226981, "balance_loss_clip": 1.0270977, "balance_loss_mlp": 0.20019495, "epoch": 0.9497670223959116, "flos": 20303319098880.0, "grad_norm": 162.80154829777493, "language_loss": 0.86418766, "learning_rate": 2.6380642236898398e-08, "loss": 0.87888134, "num_input_tokens_seen": 340794510, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.26794434, "step": 15797, "time_per_iteration": 2.6758038997650146 }, { "auxiliary_loss_clip": 0.01254165, "auxiliary_loss_mlp": 0.00232599, "balance_loss_clip": 1.03138411, "balance_loss_mlp": 0.20582479, "epoch": 0.9498271456485796, "flos": 13698916099200.0, "grad_norm": 188.7339248069587, "language_loss": 0.79422653, "learning_rate": 2.6317631767861727e-08, "loss": 0.80909419, "num_input_tokens_seen": 340812955, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.26782227, "step": 15798, "time_per_iteration": 2.691800594329834 }, { "auxiliary_loss_clip": 0.01257335, "auxiliary_loss_mlp": 0.00238321, "balance_loss_clip": 1.03290868, "balance_loss_mlp": 0.21165362, "epoch": 0.9498872689012475, "flos": 20814004713600.0, "grad_norm": 28.58240724760987, "language_loss": 0.84338123, "learning_rate": 2.6254696141058575e-08, "loss": 0.85833776, "num_input_tokens_seen": 340829200, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.26696777, "step": 15799, "time_per_iteration": 2.652902603149414 }, { "auxiliary_loss_clip": 0.01230441, "auxiliary_loss_mlp": 0.00219745, "balance_loss_clip": 1.02278316, "balance_loss_mlp": 0.19730966, "epoch": 0.9499473921539155, "flos": 21032700670080.0, "grad_norm": 8.430299119264365, "language_loss": 0.77819777, "learning_rate": 2.6191835358874814e-08, "loss": 0.79269964, "num_input_tokens_seen": 340848035, "router_z_loss_clip": 2.07714844, "router_z_loss_mlp": 0.22424316, "step": 15800, "time_per_iteration": 4.0798609256744385 }, { "auxiliary_loss_clip": 0.01233619, "auxiliary_loss_mlp": 0.00226236, "balance_loss_clip": 1.02303946, "balance_loss_mlp": 0.20087993, "epoch": 0.9500075154065835, "flos": 20998693468800.0, "grad_norm": 13.177532421379135, "language_loss": 0.77782845, "learning_rate": 2.6129049423694315e-08, "loss": 0.792427, "num_input_tokens_seen": 340870025, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.25366211, "step": 15801, "time_per_iteration": 4.2112648487091064 }, { "auxiliary_loss_clip": 0.01241503, "auxiliary_loss_mlp": 0.00221205, "balance_loss_clip": 1.02448666, "balance_loss_mlp": 0.19583774, "epoch": 0.9500676386592515, "flos": 25121956515840.0, "grad_norm": 2.906936185526266, "language_loss": 0.8666169, "learning_rate": 2.6066338337898508e-08, "loss": 0.88124394, "num_input_tokens_seen": 340892290, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.25390625, "step": 15802, "time_per_iteration": 2.7257747650146484 }, { "auxiliary_loss_clip": 0.01248146, "auxiliary_loss_mlp": 0.00213281, "balance_loss_clip": 1.02761185, "balance_loss_mlp": 0.18692344, "epoch": 0.9501277619119194, "flos": 27523625627520.0, "grad_norm": 91.95782266895279, "language_loss": 0.76250482, "learning_rate": 2.60037021038646e-08, "loss": 0.77711904, "num_input_tokens_seen": 340912260, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.26379395, "step": 15803, "time_per_iteration": 2.771003246307373 }, { "auxiliary_loss_clip": 0.01244187, "auxiliary_loss_mlp": 0.00236462, "balance_loss_clip": 1.02728593, "balance_loss_mlp": 0.2106652, "epoch": 0.9501878851645874, "flos": 20813968800000.0, "grad_norm": 38.62721439579709, "language_loss": 0.82368028, "learning_rate": 2.5941140723968247e-08, "loss": 0.83848679, "num_input_tokens_seen": 340928930, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.25805664, "step": 15804, "time_per_iteration": 2.6887571811676025 }, { "auxiliary_loss_clip": 0.01257666, "auxiliary_loss_mlp": 0.00242847, "balance_loss_clip": 1.03739059, "balance_loss_mlp": 0.21699041, "epoch": 0.9502480084172553, "flos": 18369385914240.0, "grad_norm": 27.974530026838927, "language_loss": 0.80130172, "learning_rate": 2.5878654200581775e-08, "loss": 0.81630683, "num_input_tokens_seen": 340946615, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.25842285, "step": 15805, "time_per_iteration": 2.723966360092163 }, { "auxiliary_loss_clip": 0.01251312, "auxiliary_loss_mlp": 0.00228244, "balance_loss_clip": 1.0321523, "balance_loss_mlp": 0.20392539, "epoch": 0.9503081316699233, "flos": 23549607590400.0, "grad_norm": 5.876291151439991, "language_loss": 0.86986649, "learning_rate": 2.5816242536074618e-08, "loss": 0.88466203, "num_input_tokens_seen": 340967545, "router_z_loss_clip": 2.18847656, "router_z_loss_mlp": 0.24304199, "step": 15806, "time_per_iteration": 2.7442362308502197 }, { "auxiliary_loss_clip": 0.01255911, "auxiliary_loss_mlp": 0.00212589, "balance_loss_clip": 1.03265548, "balance_loss_mlp": 0.18810326, "epoch": 0.9503682549225914, "flos": 18040444139520.0, "grad_norm": 13.598647980095388, "language_loss": 0.91444814, "learning_rate": 2.5753905732813108e-08, "loss": 0.92913318, "num_input_tokens_seen": 340984955, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.24511719, "step": 15807, "time_per_iteration": 2.665917158126831 }, { "auxiliary_loss_clip": 0.01230484, "auxiliary_loss_mlp": 0.00225407, "balance_loss_clip": 1.02362561, "balance_loss_mlp": 0.20231596, "epoch": 0.9504283781752593, "flos": 25886135387520.0, "grad_norm": 432.7997886520582, "language_loss": 0.8009311, "learning_rate": 2.5691643793161355e-08, "loss": 0.81549001, "num_input_tokens_seen": 341007300, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.2310791, "step": 15808, "time_per_iteration": 2.744380235671997 }, { "auxiliary_loss_clip": 0.01238264, "auxiliary_loss_mlp": 0.00231731, "balance_loss_clip": 1.02418756, "balance_loss_mlp": 0.20608936, "epoch": 0.9504885014279273, "flos": 22124025636480.0, "grad_norm": 54.90380081068397, "language_loss": 0.75802827, "learning_rate": 2.562945671948058e-08, "loss": 0.7727282, "num_input_tokens_seen": 341026695, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.25671387, "step": 15809, "time_per_iteration": 4.1642937660217285 }, { "auxiliary_loss_clip": 0.01236566, "auxiliary_loss_mlp": 0.00218693, "balance_loss_clip": 1.02084446, "balance_loss_mlp": 0.1942198, "epoch": 0.9505486246805952, "flos": 21615961714560.0, "grad_norm": 62.046279382403114, "language_loss": 0.83705997, "learning_rate": 2.5567344514128452e-08, "loss": 0.85161257, "num_input_tokens_seen": 341047080, "router_z_loss_clip": 2.15722656, "router_z_loss_mlp": 0.24487305, "step": 15810, "time_per_iteration": 2.7499430179595947 }, { "auxiliary_loss_clip": 0.01248478, "auxiliary_loss_mlp": 0.00206142, "balance_loss_clip": 1.02388275, "balance_loss_mlp": 0.18070242, "epoch": 0.9506087479332632, "flos": 22528236360960.0, "grad_norm": 19.51031163321085, "language_loss": 0.88401151, "learning_rate": 2.5505307179460643e-08, "loss": 0.89855772, "num_input_tokens_seen": 341067310, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.25427246, "step": 15811, "time_per_iteration": 4.119182109832764 }, { "auxiliary_loss_clip": 0.01236521, "auxiliary_loss_mlp": 0.00210729, "balance_loss_clip": 1.02386701, "balance_loss_mlp": 0.18710166, "epoch": 0.9506688711859311, "flos": 27527360641920.0, "grad_norm": 37.65993977337847, "language_loss": 0.7821309, "learning_rate": 2.5443344717829495e-08, "loss": 0.79660344, "num_input_tokens_seen": 341085110, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.23632812, "step": 15812, "time_per_iteration": 2.6937525272369385 }, { "auxiliary_loss_clip": 0.01259926, "auxiliary_loss_mlp": 0.00231235, "balance_loss_clip": 1.03974652, "balance_loss_mlp": 0.20568863, "epoch": 0.9507289944385992, "flos": 19865783531520.0, "grad_norm": 38.169982487320624, "language_loss": 0.7111901, "learning_rate": 2.538145713158446e-08, "loss": 0.72610164, "num_input_tokens_seen": 341103190, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.25524902, "step": 15813, "time_per_iteration": 2.711503028869629 }, { "auxiliary_loss_clip": 0.01249919, "auxiliary_loss_mlp": 0.00228486, "balance_loss_clip": 1.02822697, "balance_loss_mlp": 0.20350008, "epoch": 0.9507891176912671, "flos": 25193274969600.0, "grad_norm": 17.16236292377557, "language_loss": 0.76454437, "learning_rate": 2.5319644423072327e-08, "loss": 0.77932847, "num_input_tokens_seen": 341125695, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25, "step": 15814, "time_per_iteration": 2.7025868892669678 }, { "auxiliary_loss_clip": 0.01226529, "auxiliary_loss_mlp": 0.00230603, "balance_loss_clip": 1.01698017, "balance_loss_mlp": 0.20846581, "epoch": 0.9508492409439351, "flos": 24899561458560.0, "grad_norm": 65.49330742687987, "language_loss": 0.73092973, "learning_rate": 2.5257906594637445e-08, "loss": 0.74550104, "num_input_tokens_seen": 341143930, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.22143555, "step": 15815, "time_per_iteration": 2.714184045791626 }, { "auxiliary_loss_clip": 0.01235814, "auxiliary_loss_mlp": 0.00213862, "balance_loss_clip": 1.02276266, "balance_loss_mlp": 0.19133148, "epoch": 0.950909364196603, "flos": 29784094375680.0, "grad_norm": 4.768734191581832, "language_loss": 0.63186324, "learning_rate": 2.519624364862061e-08, "loss": 0.64635998, "num_input_tokens_seen": 341164280, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.22521973, "step": 15816, "time_per_iteration": 2.7738850116729736 }, { "auxiliary_loss_clip": 0.01255216, "auxiliary_loss_mlp": 0.00248135, "balance_loss_clip": 1.03621101, "balance_loss_mlp": 0.22311309, "epoch": 0.950969487449271, "flos": 24717781704960.0, "grad_norm": 63.51649822574304, "language_loss": 0.791291, "learning_rate": 2.513465558735994e-08, "loss": 0.80632454, "num_input_tokens_seen": 341183670, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25012207, "step": 15817, "time_per_iteration": 2.767761707305908 }, { "auxiliary_loss_clip": 0.01245371, "auxiliary_loss_mlp": 0.00233579, "balance_loss_clip": 1.02439094, "balance_loss_mlp": 0.20663744, "epoch": 0.9510296107019389, "flos": 13699167494400.0, "grad_norm": 13.631983546026476, "language_loss": 0.68452245, "learning_rate": 2.5073142413190918e-08, "loss": 0.69931197, "num_input_tokens_seen": 341201900, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26953125, "step": 15818, "time_per_iteration": 2.633450984954834 }, { "auxiliary_loss_clip": 0.01251864, "auxiliary_loss_mlp": 0.00233413, "balance_loss_clip": 1.03431439, "balance_loss_mlp": 0.20730636, "epoch": 0.9510897339546069, "flos": 17311852667520.0, "grad_norm": 16.665778810156606, "language_loss": 0.77738762, "learning_rate": 2.5011704128446552e-08, "loss": 0.79224038, "num_input_tokens_seen": 341218340, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.26135254, "step": 15819, "time_per_iteration": 2.612177610397339 }, { "auxiliary_loss_clip": 0.01254017, "auxiliary_loss_mlp": 0.00219544, "balance_loss_clip": 1.0339365, "balance_loss_mlp": 0.19448633, "epoch": 0.951149857207275, "flos": 14793940166400.0, "grad_norm": 69.09766270961251, "language_loss": 0.819929, "learning_rate": 2.49503407354561e-08, "loss": 0.83466464, "num_input_tokens_seen": 341235885, "router_z_loss_clip": 2.20019531, "router_z_loss_mlp": 0.25061035, "step": 15820, "time_per_iteration": 2.659472703933716 }, { "auxiliary_loss_clip": 0.01266789, "auxiliary_loss_mlp": 0.00234878, "balance_loss_clip": 1.04197502, "balance_loss_mlp": 0.20742384, "epoch": 0.9512099804599429, "flos": 19391152193280.0, "grad_norm": 16.6682541806507, "language_loss": 0.8548826, "learning_rate": 2.4889052236546804e-08, "loss": 0.86989927, "num_input_tokens_seen": 341255280, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.2746582, "step": 15821, "time_per_iteration": 2.611767530441284 }, { "auxiliary_loss_clip": 0.01229726, "auxiliary_loss_mlp": 0.00222343, "balance_loss_clip": 1.01658607, "balance_loss_mlp": 0.19724897, "epoch": 0.9512701037126109, "flos": 36757874885760.0, "grad_norm": 21.824817660886406, "language_loss": 0.7621069, "learning_rate": 2.4827838634042586e-08, "loss": 0.7766276, "num_input_tokens_seen": 341279055, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.25085449, "step": 15822, "time_per_iteration": 2.821018934249878 }, { "auxiliary_loss_clip": 0.01233222, "auxiliary_loss_mlp": 0.00211161, "balance_loss_clip": 1.01894116, "balance_loss_mlp": 0.18787916, "epoch": 0.9513302269652788, "flos": 22638266697600.0, "grad_norm": 5.441965348864809, "language_loss": 0.74043095, "learning_rate": 2.47666999302647e-08, "loss": 0.75487483, "num_input_tokens_seen": 341298560, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.23291016, "step": 15823, "time_per_iteration": 2.642061710357666 }, { "auxiliary_loss_clip": 0.01230477, "auxiliary_loss_mlp": 0.00234623, "balance_loss_clip": 1.01905215, "balance_loss_mlp": 0.20825371, "epoch": 0.9513903502179468, "flos": 22893232412160.0, "grad_norm": 214.19550912266715, "language_loss": 0.84060454, "learning_rate": 2.4705636127531292e-08, "loss": 0.8552556, "num_input_tokens_seen": 341316650, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.26367188, "step": 15824, "time_per_iteration": 2.685042381286621 }, { "auxiliary_loss_clip": 0.01257571, "auxiliary_loss_mlp": 0.00204022, "balance_loss_clip": 1.02976084, "balance_loss_mlp": 0.17783222, "epoch": 0.9514504734706147, "flos": 27928626451200.0, "grad_norm": 26.62635312743441, "language_loss": 0.85215592, "learning_rate": 2.4644647228158065e-08, "loss": 0.86677188, "num_input_tokens_seen": 341336185, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.26208496, "step": 15825, "time_per_iteration": 2.716860055923462 }, { "auxiliary_loss_clip": 0.01091333, "auxiliary_loss_mlp": 0.00075097, "balance_loss_clip": 0.95536971, "balance_loss_mlp": 0.06746782, "epoch": 0.9515105967232828, "flos": 67366767312000.0, "grad_norm": 0.7928457181296782, "language_loss": 0.52059829, "learning_rate": 2.458373323445806e-08, "loss": 0.53226256, "num_input_tokens_seen": 341395795, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.07617188, "step": 15826, "time_per_iteration": 3.1069536209106445 }, { "auxiliary_loss_clip": 0.01244294, "auxiliary_loss_mlp": 0.00216215, "balance_loss_clip": 1.02161682, "balance_loss_mlp": 0.19081165, "epoch": 0.9515707199759507, "flos": 25846525664640.0, "grad_norm": 120.43605357139815, "language_loss": 0.82052052, "learning_rate": 2.452289414874076e-08, "loss": 0.83512557, "num_input_tokens_seen": 341415675, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25390625, "step": 15827, "time_per_iteration": 2.74379825592041 }, { "auxiliary_loss_clip": 0.01237683, "auxiliary_loss_mlp": 0.00232185, "balance_loss_clip": 1.02304697, "balance_loss_mlp": 0.20823607, "epoch": 0.9516308432286187, "flos": 21828983322240.0, "grad_norm": 453.8631536641007, "language_loss": 0.82970583, "learning_rate": 2.4462129973313207e-08, "loss": 0.84440458, "num_input_tokens_seen": 341432990, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.23925781, "step": 15828, "time_per_iteration": 2.7452690601348877 }, { "auxiliary_loss_clip": 0.01220302, "auxiliary_loss_mlp": 0.00224835, "balance_loss_clip": 1.00999951, "balance_loss_mlp": 0.20008703, "epoch": 0.9516909664812866, "flos": 27269593666560.0, "grad_norm": 32.369517875903504, "language_loss": 0.7913242, "learning_rate": 2.440144071047978e-08, "loss": 0.80577558, "num_input_tokens_seen": 341454100, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.24731445, "step": 15829, "time_per_iteration": 2.729369640350342 }, { "auxiliary_loss_clip": 0.01239557, "auxiliary_loss_mlp": 0.00216856, "balance_loss_clip": 1.0223707, "balance_loss_mlp": 0.19388404, "epoch": 0.9517510897339546, "flos": 21215342350080.0, "grad_norm": 87.64141959212265, "language_loss": 0.69429004, "learning_rate": 2.4340826362541533e-08, "loss": 0.7088542, "num_input_tokens_seen": 341472955, "router_z_loss_clip": 2.17285156, "router_z_loss_mlp": 0.22973633, "step": 15830, "time_per_iteration": 2.649383068084717 }, { "auxiliary_loss_clip": 0.01248624, "auxiliary_loss_mlp": 0.00228171, "balance_loss_clip": 1.02809107, "balance_loss_mlp": 0.20237377, "epoch": 0.9518112129866225, "flos": 18733986915840.0, "grad_norm": 33.45550860641747, "language_loss": 0.81951559, "learning_rate": 2.428028693179729e-08, "loss": 0.83428359, "num_input_tokens_seen": 341490165, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.2578125, "step": 15831, "time_per_iteration": 2.696910858154297 }, { "auxiliary_loss_clip": 0.01221581, "auxiliary_loss_mlp": 0.00214675, "balance_loss_clip": 1.0099299, "balance_loss_mlp": 0.19169113, "epoch": 0.9518713362392905, "flos": 16763676232320.0, "grad_norm": 1766.2300499468508, "language_loss": 0.74046969, "learning_rate": 2.4219822420542545e-08, "loss": 0.75483221, "num_input_tokens_seen": 341508055, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.2298584, "step": 15832, "time_per_iteration": 2.718071222305298 }, { "auxiliary_loss_clip": 0.01221477, "auxiliary_loss_mlp": 0.00210261, "balance_loss_clip": 1.01167262, "balance_loss_mlp": 0.18700278, "epoch": 0.9519314594919586, "flos": 15230649720960.0, "grad_norm": 14.52903486153226, "language_loss": 0.86416197, "learning_rate": 2.4159432831070135e-08, "loss": 0.87847936, "num_input_tokens_seen": 341526155, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23266602, "step": 15833, "time_per_iteration": 2.6821751594543457 }, { "auxiliary_loss_clip": 0.01238676, "auxiliary_loss_mlp": 0.00227988, "balance_loss_clip": 1.02014637, "balance_loss_mlp": 0.20363332, "epoch": 0.9519915827446265, "flos": 19352943100800.0, "grad_norm": 7.299862758045733, "language_loss": 0.85776585, "learning_rate": 2.4099118165670007e-08, "loss": 0.87243253, "num_input_tokens_seen": 341540450, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.2434082, "step": 15834, "time_per_iteration": 2.6397647857666016 }, { "auxiliary_loss_clip": 0.01253608, "auxiliary_loss_mlp": 0.00221663, "balance_loss_clip": 1.02820647, "balance_loss_mlp": 0.19447155, "epoch": 0.9520517059972945, "flos": 22266303408000.0, "grad_norm": 31.505206182640507, "language_loss": 0.84827602, "learning_rate": 2.4038878426629216e-08, "loss": 0.86302871, "num_input_tokens_seen": 341557865, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.27172852, "step": 15835, "time_per_iteration": 2.6756808757781982 }, { "auxiliary_loss_clip": 0.01247187, "auxiliary_loss_mlp": 0.00235921, "balance_loss_clip": 1.02731669, "balance_loss_mlp": 0.21025509, "epoch": 0.9521118292499624, "flos": 14862313704960.0, "grad_norm": 149.68162431699727, "language_loss": 0.76962686, "learning_rate": 2.397871361623238e-08, "loss": 0.78445792, "num_input_tokens_seen": 341573890, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25671387, "step": 15836, "time_per_iteration": 2.608229398727417 }, { "auxiliary_loss_clip": 0.01245938, "auxiliary_loss_mlp": 0.00218451, "balance_loss_clip": 1.03066587, "balance_loss_mlp": 0.19327396, "epoch": 0.9521719525026304, "flos": 23508812718720.0, "grad_norm": 34.112026010186405, "language_loss": 0.76744372, "learning_rate": 2.391862373676057e-08, "loss": 0.78208756, "num_input_tokens_seen": 341593770, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.25183105, "step": 15837, "time_per_iteration": 2.683856725692749 }, { "auxiliary_loss_clip": 0.01233756, "auxiliary_loss_mlp": 0.0024266, "balance_loss_clip": 1.01159692, "balance_loss_mlp": 0.21540861, "epoch": 0.9522320757552983, "flos": 19714922409600.0, "grad_norm": 91.022437355982, "language_loss": 0.81142086, "learning_rate": 2.3858608790492617e-08, "loss": 0.82618505, "num_input_tokens_seen": 341612065, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.2722168, "step": 15838, "time_per_iteration": 2.6518428325653076 }, { "auxiliary_loss_clip": 0.0125046, "auxiliary_loss_mlp": 0.00226315, "balance_loss_clip": 1.02868783, "balance_loss_mlp": 0.20087577, "epoch": 0.9522921990079664, "flos": 25921291824000.0, "grad_norm": 38.86694199492652, "language_loss": 0.86085773, "learning_rate": 2.379866877970449e-08, "loss": 0.87562549, "num_input_tokens_seen": 341631365, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25427246, "step": 15839, "time_per_iteration": 2.7992429733276367 }, { "auxiliary_loss_clip": 0.01236438, "auxiliary_loss_mlp": 0.00208777, "balance_loss_clip": 1.01970875, "balance_loss_mlp": 0.18344527, "epoch": 0.9523523222606343, "flos": 19208115463680.0, "grad_norm": 3.7964539939132242, "language_loss": 0.87128985, "learning_rate": 2.3738803706668585e-08, "loss": 0.88574201, "num_input_tokens_seen": 341650300, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.2532959, "step": 15840, "time_per_iteration": 2.688999652862549 }, { "auxiliary_loss_clip": 0.01223907, "auxiliary_loss_mlp": 0.00227297, "balance_loss_clip": 1.01324701, "balance_loss_mlp": 0.20347893, "epoch": 0.9524124455133023, "flos": 20921269703040.0, "grad_norm": 5.581072108015814, "language_loss": 0.79206991, "learning_rate": 2.3679013573655314e-08, "loss": 0.80658197, "num_input_tokens_seen": 341667680, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.23803711, "step": 15841, "time_per_iteration": 2.6731691360473633 }, { "auxiliary_loss_clip": 0.0123923, "auxiliary_loss_mlp": 0.00228328, "balance_loss_clip": 1.02756381, "balance_loss_mlp": 0.20381817, "epoch": 0.9524725687659702, "flos": 18843550375680.0, "grad_norm": 5.3916060844006575, "language_loss": 0.8748824, "learning_rate": 2.3619298382931972e-08, "loss": 0.88955796, "num_input_tokens_seen": 341685760, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.24487305, "step": 15842, "time_per_iteration": 4.064564943313599 }, { "auxiliary_loss_clip": 0.01253513, "auxiliary_loss_mlp": 0.00220816, "balance_loss_clip": 1.03129387, "balance_loss_mlp": 0.19394675, "epoch": 0.9525326920186382, "flos": 22674680110080.0, "grad_norm": 4.697284272879988, "language_loss": 0.81310779, "learning_rate": 2.3559658136762973e-08, "loss": 0.82785112, "num_input_tokens_seen": 341705300, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.26904297, "step": 15843, "time_per_iteration": 2.647260904312134 }, { "auxiliary_loss_clip": 0.01249675, "auxiliary_loss_mlp": 0.0021396, "balance_loss_clip": 1.02622592, "balance_loss_mlp": 0.1886514, "epoch": 0.9525928152713061, "flos": 22086642556800.0, "grad_norm": 46.88213696738439, "language_loss": 0.84954143, "learning_rate": 2.3500092837409612e-08, "loss": 0.86417782, "num_input_tokens_seen": 341724565, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25305176, "step": 15844, "time_per_iteration": 4.122177362442017 }, { "auxiliary_loss_clip": 0.01251968, "auxiliary_loss_mlp": 0.00240796, "balance_loss_clip": 1.02378774, "balance_loss_mlp": 0.2146412, "epoch": 0.9526529385239741, "flos": 20704728562560.0, "grad_norm": 10.241567782462893, "language_loss": 0.82662791, "learning_rate": 2.3440602487130977e-08, "loss": 0.8415556, "num_input_tokens_seen": 341743605, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.26159668, "step": 15845, "time_per_iteration": 2.706728219985962 }, { "auxiliary_loss_clip": 0.01262623, "auxiliary_loss_mlp": 0.00217962, "balance_loss_clip": 1.03659177, "balance_loss_mlp": 0.19292837, "epoch": 0.9527130617766422, "flos": 23368043318400.0, "grad_norm": 61.06808207929735, "language_loss": 0.81801969, "learning_rate": 2.338118708818282e-08, "loss": 0.83282554, "num_input_tokens_seen": 341763475, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.25048828, "step": 15846, "time_per_iteration": 2.6651999950408936 }, { "auxiliary_loss_clip": 0.01226909, "auxiliary_loss_mlp": 0.00204859, "balance_loss_clip": 1.01405263, "balance_loss_mlp": 0.18137513, "epoch": 0.9527731850293101, "flos": 18985935888000.0, "grad_norm": 14.655008896201254, "language_loss": 0.85936385, "learning_rate": 2.3321846642817998e-08, "loss": 0.87368155, "num_input_tokens_seen": 341781265, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23474121, "step": 15847, "time_per_iteration": 2.6938254833221436 }, { "auxiliary_loss_clip": 0.01236661, "auxiliary_loss_mlp": 0.00214498, "balance_loss_clip": 1.02083135, "balance_loss_mlp": 0.18940419, "epoch": 0.9528333082819781, "flos": 19318038059520.0, "grad_norm": 12.248967435863854, "language_loss": 0.85911191, "learning_rate": 2.326258115328672e-08, "loss": 0.87362349, "num_input_tokens_seen": 341798825, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.25109863, "step": 15848, "time_per_iteration": 2.6351847648620605 }, { "auxiliary_loss_clip": 0.01265045, "auxiliary_loss_mlp": 0.00248797, "balance_loss_clip": 1.04193234, "balance_loss_mlp": 0.22062817, "epoch": 0.952893431534646, "flos": 23951340276480.0, "grad_norm": 14.14838595563198, "language_loss": 0.82728994, "learning_rate": 2.320339062183674e-08, "loss": 0.84242839, "num_input_tokens_seen": 341819480, "router_z_loss_clip": 2.23144531, "router_z_loss_mlp": 0.28161621, "step": 15849, "time_per_iteration": 2.6979191303253174 }, { "auxiliary_loss_clip": 0.01264244, "auxiliary_loss_mlp": 0.00258588, "balance_loss_clip": 1.03667545, "balance_loss_mlp": 0.22994165, "epoch": 0.952953554787314, "flos": 21030545854080.0, "grad_norm": 18.23138867739095, "language_loss": 0.81977415, "learning_rate": 2.314427505071226e-08, "loss": 0.83500254, "num_input_tokens_seen": 341838035, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.28637695, "step": 15850, "time_per_iteration": 2.6633379459381104 }, { "auxiliary_loss_clip": 0.01238741, "auxiliary_loss_mlp": 0.00237195, "balance_loss_clip": 1.02703977, "balance_loss_mlp": 0.21356785, "epoch": 0.9530136780399819, "flos": 22382870019840.0, "grad_norm": 27.608871880433526, "language_loss": 0.80675769, "learning_rate": 2.308523444215482e-08, "loss": 0.82151711, "num_input_tokens_seen": 341855895, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.23608398, "step": 15851, "time_per_iteration": 4.108676195144653 }, { "auxiliary_loss_clip": 0.01243376, "auxiliary_loss_mlp": 0.00213787, "balance_loss_clip": 1.02339792, "balance_loss_mlp": 0.18915869, "epoch": 0.95307380129265, "flos": 22159613036160.0, "grad_norm": 3.4991070710054166, "language_loss": 0.87152362, "learning_rate": 2.3026268798403525e-08, "loss": 0.88609529, "num_input_tokens_seen": 341875240, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.24621582, "step": 15852, "time_per_iteration": 2.705815076828003 }, { "auxiliary_loss_clip": 0.0124305, "auxiliary_loss_mlp": 0.00218618, "balance_loss_clip": 1.02584934, "balance_loss_mlp": 0.19367908, "epoch": 0.9531339245453179, "flos": 44022747214080.0, "grad_norm": 23.695794842561423, "language_loss": 0.67846072, "learning_rate": 2.2967378121694138e-08, "loss": 0.69307733, "num_input_tokens_seen": 341901020, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24938965, "step": 15853, "time_per_iteration": 4.358322381973267 }, { "auxiliary_loss_clip": 0.01214701, "auxiliary_loss_mlp": 0.00198441, "balance_loss_clip": 1.0070343, "balance_loss_mlp": 0.17525478, "epoch": 0.9531940477979859, "flos": 20266690204800.0, "grad_norm": 5.8159073689184995, "language_loss": 0.80295396, "learning_rate": 2.290856241425998e-08, "loss": 0.81708539, "num_input_tokens_seen": 341919365, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.23181152, "step": 15854, "time_per_iteration": 2.713099479675293 }, { "auxiliary_loss_clip": 0.01243084, "auxiliary_loss_mlp": 0.00211816, "balance_loss_clip": 1.02424479, "balance_loss_mlp": 0.18690088, "epoch": 0.9532541710506538, "flos": 25335732309120.0, "grad_norm": 2.1595460111099425, "language_loss": 0.75753617, "learning_rate": 2.284982167833127e-08, "loss": 0.77208513, "num_input_tokens_seen": 341939985, "router_z_loss_clip": 2.19042969, "router_z_loss_mlp": 0.24902344, "step": 15855, "time_per_iteration": 2.7499821186065674 }, { "auxiliary_loss_clip": 0.01233651, "auxiliary_loss_mlp": 0.00232343, "balance_loss_clip": 1.01551747, "balance_loss_mlp": 0.20670119, "epoch": 0.9533142943033218, "flos": 26469288691200.0, "grad_norm": 440.85673392297986, "language_loss": 0.84044361, "learning_rate": 2.279115591613556e-08, "loss": 0.85510361, "num_input_tokens_seen": 341959255, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25634766, "step": 15856, "time_per_iteration": 2.7109339237213135 }, { "auxiliary_loss_clip": 0.0123782, "auxiliary_loss_mlp": 0.00230438, "balance_loss_clip": 1.02240372, "balance_loss_mlp": 0.20596385, "epoch": 0.9533744175559897, "flos": 23656944407040.0, "grad_norm": 24.924025841971236, "language_loss": 0.85832417, "learning_rate": 2.2732565129897075e-08, "loss": 0.8730067, "num_input_tokens_seen": 341977205, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.24475098, "step": 15857, "time_per_iteration": 2.705632448196411 }, { "auxiliary_loss_clip": 0.01091418, "auxiliary_loss_mlp": 0.00072602, "balance_loss_clip": 0.95501828, "balance_loss_mlp": 0.06630792, "epoch": 0.9534345408086577, "flos": 61052055500160.0, "grad_norm": 0.6852831605729798, "language_loss": 0.61741602, "learning_rate": 2.267404932183803e-08, "loss": 0.62905627, "num_input_tokens_seen": 342038545, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.06298828, "step": 15858, "time_per_iteration": 3.152625322341919 }, { "auxiliary_loss_clip": 0.01223646, "auxiliary_loss_mlp": 0.0022611, "balance_loss_clip": 1.01015496, "balance_loss_mlp": 0.20055123, "epoch": 0.9534946640613258, "flos": 18951677291520.0, "grad_norm": 245.6938377183869, "language_loss": 0.63467133, "learning_rate": 2.2615608494177097e-08, "loss": 0.64916891, "num_input_tokens_seen": 342058195, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.2557373, "step": 15859, "time_per_iteration": 2.6484739780426025 }, { "auxiliary_loss_clip": 0.0123293, "auxiliary_loss_mlp": 0.00226585, "balance_loss_clip": 1.0207448, "balance_loss_mlp": 0.2032681, "epoch": 0.9535547873139937, "flos": 16654292340480.0, "grad_norm": 18.507550229277935, "language_loss": 0.90853316, "learning_rate": 2.2557242649130504e-08, "loss": 0.92312831, "num_input_tokens_seen": 342075025, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.23327637, "step": 15860, "time_per_iteration": 2.718653440475464 }, { "auxiliary_loss_clip": 0.01242793, "auxiliary_loss_mlp": 0.00219144, "balance_loss_clip": 1.02823436, "balance_loss_mlp": 0.19501579, "epoch": 0.9536149105666617, "flos": 20667776446080.0, "grad_norm": 15.936233713435355, "language_loss": 0.76070714, "learning_rate": 2.249895178891159e-08, "loss": 0.77532649, "num_input_tokens_seen": 342094595, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24108887, "step": 15861, "time_per_iteration": 2.641321897506714 }, { "auxiliary_loss_clip": 0.0126462, "auxiliary_loss_mlp": 0.00233473, "balance_loss_clip": 1.03959179, "balance_loss_mlp": 0.20789102, "epoch": 0.9536750338193296, "flos": 30700499086080.0, "grad_norm": 86.24497226355157, "language_loss": 0.73860776, "learning_rate": 2.244073591573037e-08, "loss": 0.75358868, "num_input_tokens_seen": 342115970, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.25585938, "step": 15862, "time_per_iteration": 2.736992597579956 }, { "auxiliary_loss_clip": 0.01221601, "auxiliary_loss_mlp": 0.00203956, "balance_loss_clip": 1.01649141, "balance_loss_mlp": 0.18286785, "epoch": 0.9537351570719976, "flos": 20405484357120.0, "grad_norm": 8.196515165240898, "language_loss": 0.75634778, "learning_rate": 2.238259503179485e-08, "loss": 0.77060342, "num_input_tokens_seen": 342134080, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.2109375, "step": 15863, "time_per_iteration": 2.640151262283325 }, { "auxiliary_loss_clip": 0.01238076, "auxiliary_loss_mlp": 0.00245316, "balance_loss_clip": 1.02377737, "balance_loss_mlp": 0.2198168, "epoch": 0.9537952803246655, "flos": 29929245235200.0, "grad_norm": 18.601700691740344, "language_loss": 0.85299253, "learning_rate": 2.2324529139309267e-08, "loss": 0.8678264, "num_input_tokens_seen": 342154725, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.25524902, "step": 15864, "time_per_iteration": 2.7463579177856445 }, { "auxiliary_loss_clip": 0.01229441, "auxiliary_loss_mlp": 0.00220741, "balance_loss_clip": 1.01817966, "balance_loss_mlp": 0.19633929, "epoch": 0.9538554035773336, "flos": 20521404524160.0, "grad_norm": 30.934684706518723, "language_loss": 0.71287978, "learning_rate": 2.226653824047586e-08, "loss": 0.72738159, "num_input_tokens_seen": 342172275, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.24401855, "step": 15865, "time_per_iteration": 2.636202812194824 }, { "auxiliary_loss_clip": 0.01244949, "auxiliary_loss_mlp": 0.00216886, "balance_loss_clip": 1.0236721, "balance_loss_mlp": 0.19334209, "epoch": 0.9539155268300015, "flos": 18406517598720.0, "grad_norm": 34.359420817967816, "language_loss": 0.7856338, "learning_rate": 2.2208622337493765e-08, "loss": 0.8002522, "num_input_tokens_seen": 342190880, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.23547363, "step": 15866, "time_per_iteration": 2.7195987701416016 }, { "auxiliary_loss_clip": 0.01251148, "auxiliary_loss_mlp": 0.00234279, "balance_loss_clip": 1.03313816, "balance_loss_mlp": 0.20881617, "epoch": 0.9539756500826695, "flos": 26213281482240.0, "grad_norm": 6.295607558188785, "language_loss": 0.94648206, "learning_rate": 2.215078143255855e-08, "loss": 0.96133637, "num_input_tokens_seen": 342208165, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25488281, "step": 15867, "time_per_iteration": 2.692237138748169 }, { "auxiliary_loss_clip": 0.01084833, "auxiliary_loss_mlp": 0.00085535, "balance_loss_clip": 0.94769204, "balance_loss_mlp": 0.07814423, "epoch": 0.9540357733353374, "flos": 68289097766400.0, "grad_norm": 0.7573580884529483, "language_loss": 0.61441362, "learning_rate": 2.2093015527864024e-08, "loss": 0.62611729, "num_input_tokens_seen": 342277110, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07373047, "step": 15868, "time_per_iteration": 3.2424678802490234 }, { "auxiliary_loss_clip": 0.01246118, "auxiliary_loss_mlp": 0.00227553, "balance_loss_clip": 1.02882481, "balance_loss_mlp": 0.20138651, "epoch": 0.9540958965880054, "flos": 21288276915840.0, "grad_norm": 25.27196440063528, "language_loss": 0.69949901, "learning_rate": 2.2035324625600425e-08, "loss": 0.71423566, "num_input_tokens_seen": 342294695, "router_z_loss_clip": 2.16894531, "router_z_loss_mlp": 0.26208496, "step": 15869, "time_per_iteration": 2.731685161590576 }, { "auxiliary_loss_clip": 0.01239651, "auxiliary_loss_mlp": 0.00222193, "balance_loss_clip": 1.02631593, "balance_loss_mlp": 0.19773081, "epoch": 0.9541560198406733, "flos": 19751407649280.0, "grad_norm": 23.23193031247423, "language_loss": 0.77223408, "learning_rate": 2.197770872795579e-08, "loss": 0.78685248, "num_input_tokens_seen": 342314970, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24462891, "step": 15870, "time_per_iteration": 2.6886661052703857 }, { "auxiliary_loss_clip": 0.01223564, "auxiliary_loss_mlp": 0.00219146, "balance_loss_clip": 1.0134989, "balance_loss_mlp": 0.19517308, "epoch": 0.9542161430933414, "flos": 24715626888960.0, "grad_norm": 10.392400797034151, "language_loss": 0.84210193, "learning_rate": 2.1920167837114368e-08, "loss": 0.856529, "num_input_tokens_seen": 342334255, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23974609, "step": 15871, "time_per_iteration": 2.801481246948242 }, { "auxiliary_loss_clip": 0.01249257, "auxiliary_loss_mlp": 0.00222733, "balance_loss_clip": 1.03103185, "balance_loss_mlp": 0.19644764, "epoch": 0.9542762663460094, "flos": 31065818359680.0, "grad_norm": 58.441071624515274, "language_loss": 0.67652518, "learning_rate": 2.1862701955258634e-08, "loss": 0.69124508, "num_input_tokens_seen": 342354730, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.26293945, "step": 15872, "time_per_iteration": 2.7740161418914795 }, { "auxiliary_loss_clip": 0.01251181, "auxiliary_loss_mlp": 0.00246494, "balance_loss_clip": 1.03285646, "balance_loss_mlp": 0.22036333, "epoch": 0.9543363895986773, "flos": 20776729374720.0, "grad_norm": 5.141404940545341, "language_loss": 0.80183136, "learning_rate": 2.1805311084567514e-08, "loss": 0.8168081, "num_input_tokens_seen": 342374565, "router_z_loss_clip": 2.18457031, "router_z_loss_mlp": 0.26171875, "step": 15873, "time_per_iteration": 2.7065377235412598 }, { "auxiliary_loss_clip": 0.0123988, "auxiliary_loss_mlp": 0.0023038, "balance_loss_clip": 1.02119684, "balance_loss_mlp": 0.20566808, "epoch": 0.9543965128513453, "flos": 24462744163200.0, "grad_norm": 93.19581214879203, "language_loss": 0.71859574, "learning_rate": 2.1747995227217265e-08, "loss": 0.7332983, "num_input_tokens_seen": 342394590, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24731445, "step": 15874, "time_per_iteration": 2.719271659851074 }, { "auxiliary_loss_clip": 0.01220718, "auxiliary_loss_mlp": 0.00193123, "balance_loss_clip": 1.01087427, "balance_loss_mlp": 0.16966246, "epoch": 0.9544566361040132, "flos": 15261532439040.0, "grad_norm": 26.911968920757936, "language_loss": 0.96972775, "learning_rate": 2.169075438538104e-08, "loss": 0.98386621, "num_input_tokens_seen": 342410445, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.23486328, "step": 15875, "time_per_iteration": 2.679400682449341 }, { "auxiliary_loss_clip": 0.01244825, "auxiliary_loss_mlp": 0.0021989, "balance_loss_clip": 1.02358747, "balance_loss_mlp": 0.19502321, "epoch": 0.9545167593566812, "flos": 25918777872000.0, "grad_norm": 25.495137398754864, "language_loss": 0.73820359, "learning_rate": 2.1633588561229765e-08, "loss": 0.75285077, "num_input_tokens_seen": 342430970, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.24865723, "step": 15876, "time_per_iteration": 2.705110549926758 }, { "auxiliary_loss_clip": 0.01271853, "auxiliary_loss_mlp": 0.00236146, "balance_loss_clip": 1.04285049, "balance_loss_mlp": 0.20919245, "epoch": 0.9545768826093491, "flos": 25628188844160.0, "grad_norm": 20.488919263616246, "language_loss": 0.80516779, "learning_rate": 2.1576497756931267e-08, "loss": 0.82024777, "num_input_tokens_seen": 342449505, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.26940918, "step": 15877, "time_per_iteration": 2.7232794761657715 }, { "auxiliary_loss_clip": 0.01261815, "auxiliary_loss_mlp": 0.0024166, "balance_loss_clip": 1.03770602, "balance_loss_mlp": 0.21408646, "epoch": 0.9546370058620172, "flos": 22491499726080.0, "grad_norm": 3.2663031324487535, "language_loss": 0.79137534, "learning_rate": 2.1519481974650035e-08, "loss": 0.80641007, "num_input_tokens_seen": 342470390, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.27612305, "step": 15878, "time_per_iteration": 2.666131019592285 }, { "auxiliary_loss_clip": 0.01235739, "auxiliary_loss_mlp": 0.00211912, "balance_loss_clip": 1.0204823, "balance_loss_mlp": 0.18704453, "epoch": 0.9546971291146851, "flos": 24609582961920.0, "grad_norm": 38.054938263771966, "language_loss": 0.74891102, "learning_rate": 2.1462541216548335e-08, "loss": 0.76338756, "num_input_tokens_seen": 342492560, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24890137, "step": 15879, "time_per_iteration": 2.737482786178589 }, { "auxiliary_loss_clip": 0.01223149, "auxiliary_loss_mlp": 0.00208683, "balance_loss_clip": 1.01309276, "balance_loss_mlp": 0.18524694, "epoch": 0.9547572523673531, "flos": 28657756627200.0, "grad_norm": 45.26124743866386, "language_loss": 0.92222697, "learning_rate": 2.1405675484785334e-08, "loss": 0.93654525, "num_input_tokens_seen": 342512315, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.234375, "step": 15880, "time_per_iteration": 2.7568304538726807 }, { "auxiliary_loss_clip": 0.01219021, "auxiliary_loss_mlp": 0.00205189, "balance_loss_clip": 1.0048039, "balance_loss_mlp": 0.18029803, "epoch": 0.954817375620021, "flos": 33802606385280.0, "grad_norm": 4.5791957522684825, "language_loss": 0.82277012, "learning_rate": 2.134888478151753e-08, "loss": 0.83701217, "num_input_tokens_seen": 342533060, "router_z_loss_clip": 2.14355469, "router_z_loss_mlp": 0.24890137, "step": 15881, "time_per_iteration": 2.737898826599121 }, { "auxiliary_loss_clip": 0.01241171, "auxiliary_loss_mlp": 0.00212549, "balance_loss_clip": 1.02163482, "balance_loss_mlp": 0.18750308, "epoch": 0.954877498872689, "flos": 14428225843200.0, "grad_norm": 6.658612232628167, "language_loss": 0.79098082, "learning_rate": 2.1292169108898083e-08, "loss": 0.80551809, "num_input_tokens_seen": 342550830, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25048828, "step": 15882, "time_per_iteration": 2.639922618865967 }, { "auxiliary_loss_clip": 0.01252194, "auxiliary_loss_mlp": 0.0021495, "balance_loss_clip": 1.03489816, "balance_loss_mlp": 0.19045229, "epoch": 0.9549376221253569, "flos": 59269447336320.0, "grad_norm": 12.996853146148107, "language_loss": 0.7463342, "learning_rate": 2.1235528469078168e-08, "loss": 0.76100564, "num_input_tokens_seen": 342575070, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.24487305, "step": 15883, "time_per_iteration": 3.019486665725708 }, { "auxiliary_loss_clip": 0.01265475, "auxiliary_loss_mlp": 0.00255203, "balance_loss_clip": 1.04186869, "balance_loss_mlp": 0.22826147, "epoch": 0.954997745378025, "flos": 17274397760640.0, "grad_norm": 9.83511828139019, "language_loss": 0.87885416, "learning_rate": 2.1178962864205175e-08, "loss": 0.89406085, "num_input_tokens_seen": 342592215, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26940918, "step": 15884, "time_per_iteration": 4.1777870655059814 }, { "auxiliary_loss_clip": 0.0126492, "auxiliary_loss_mlp": 0.00241198, "balance_loss_clip": 1.03959882, "balance_loss_mlp": 0.21487637, "epoch": 0.955057868630693, "flos": 13006378903680.0, "grad_norm": 21.07892828914089, "language_loss": 0.85587341, "learning_rate": 2.1122472296424054e-08, "loss": 0.87093461, "num_input_tokens_seen": 342610030, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.26330566, "step": 15885, "time_per_iteration": 2.7532761096954346 }, { "auxiliary_loss_clip": 0.01259575, "auxiliary_loss_mlp": 0.00231756, "balance_loss_clip": 1.03556466, "balance_loss_mlp": 0.20542276, "epoch": 0.9551179918833609, "flos": 22637692080000.0, "grad_norm": 6.197469903524882, "language_loss": 0.79985321, "learning_rate": 2.1066056767877317e-08, "loss": 0.81476653, "num_input_tokens_seen": 342626475, "router_z_loss_clip": 2.24121094, "router_z_loss_mlp": 0.26306152, "step": 15886, "time_per_iteration": 4.0955681800842285 }, { "auxiliary_loss_clip": 0.01256602, "auxiliary_loss_mlp": 0.00237615, "balance_loss_clip": 1.03632498, "balance_loss_mlp": 0.21167536, "epoch": 0.9551781151360289, "flos": 21542811667200.0, "grad_norm": 21.99549910262301, "language_loss": 0.82715786, "learning_rate": 2.1009716280703916e-08, "loss": 0.84209996, "num_input_tokens_seen": 342646645, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25939941, "step": 15887, "time_per_iteration": 2.696613311767578 }, { "auxiliary_loss_clip": 0.01224235, "auxiliary_loss_mlp": 0.0021052, "balance_loss_clip": 1.01441276, "balance_loss_mlp": 0.18813211, "epoch": 0.9552382383886968, "flos": 20702250524160.0, "grad_norm": 476.64486757951545, "language_loss": 0.66141391, "learning_rate": 2.0953450837040364e-08, "loss": 0.67576146, "num_input_tokens_seen": 342663615, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.22387695, "step": 15888, "time_per_iteration": 2.682880401611328 }, { "auxiliary_loss_clip": 0.01081951, "auxiliary_loss_mlp": 0.00084177, "balance_loss_clip": 0.94551998, "balance_loss_mlp": 0.07716745, "epoch": 0.9552983616413648, "flos": 67769792887680.0, "grad_norm": 0.6991587068506511, "language_loss": 0.57269555, "learning_rate": 2.0897260439020514e-08, "loss": 0.58435684, "num_input_tokens_seen": 342728275, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.0703125, "step": 15889, "time_per_iteration": 3.2431600093841553 }, { "auxiliary_loss_clip": 0.01247425, "auxiliary_loss_mlp": 0.00245909, "balance_loss_clip": 1.0262599, "balance_loss_mlp": 0.21980263, "epoch": 0.9553584848940327, "flos": 21579979265280.0, "grad_norm": 8.039679440392607, "language_loss": 0.74330539, "learning_rate": 2.084114508877466e-08, "loss": 0.75823873, "num_input_tokens_seen": 342748860, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.26123047, "step": 15890, "time_per_iteration": 2.729304313659668 }, { "auxiliary_loss_clip": 0.01235052, "auxiliary_loss_mlp": 0.0020429, "balance_loss_clip": 1.02020967, "balance_loss_mlp": 0.18125868, "epoch": 0.9554186081467008, "flos": 24208173498240.0, "grad_norm": 20.985417470580906, "language_loss": 0.81045008, "learning_rate": 2.0785104788430874e-08, "loss": 0.82484353, "num_input_tokens_seen": 342769705, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.23022461, "step": 15891, "time_per_iteration": 2.7491800785064697 }, { "auxiliary_loss_clip": 0.01234446, "auxiliary_loss_mlp": 0.00188142, "balance_loss_clip": 1.02403879, "balance_loss_mlp": 0.16515833, "epoch": 0.9554787313993687, "flos": 16251554073600.0, "grad_norm": 28.090973489665902, "language_loss": 0.8379274, "learning_rate": 2.072913954011435e-08, "loss": 0.8521533, "num_input_tokens_seen": 342787000, "router_z_loss_clip": 2.10644531, "router_z_loss_mlp": 0.2298584, "step": 15892, "time_per_iteration": 2.7048635482788086 }, { "auxiliary_loss_clip": 0.01239115, "auxiliary_loss_mlp": 0.00238278, "balance_loss_clip": 1.02312851, "balance_loss_mlp": 0.21264753, "epoch": 0.9555388546520367, "flos": 23404133508480.0, "grad_norm": 14.104615642358187, "language_loss": 0.77913976, "learning_rate": 2.0673249345947386e-08, "loss": 0.7939136, "num_input_tokens_seen": 342807795, "router_z_loss_clip": 2.16113281, "router_z_loss_mlp": 0.25646973, "step": 15893, "time_per_iteration": 2.7344796657562256 }, { "auxiliary_loss_clip": 0.01236045, "auxiliary_loss_mlp": 0.00224489, "balance_loss_clip": 1.02442586, "balance_loss_mlp": 0.19900244, "epoch": 0.9555989779047046, "flos": 14794047907200.0, "grad_norm": 12.166697816160342, "language_loss": 0.74041295, "learning_rate": 2.0617434208048955e-08, "loss": 0.75501829, "num_input_tokens_seen": 342825490, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.25476074, "step": 15894, "time_per_iteration": 4.1294496059417725 }, { "auxiliary_loss_clip": 0.01243219, "auxiliary_loss_mlp": 0.00233433, "balance_loss_clip": 1.02385354, "balance_loss_mlp": 0.20633629, "epoch": 0.9556591011573726, "flos": 22236749493120.0, "grad_norm": 12.616301325545763, "language_loss": 0.89595377, "learning_rate": 2.056169412853581e-08, "loss": 0.91072029, "num_input_tokens_seen": 342844965, "router_z_loss_clip": 2.19238281, "router_z_loss_mlp": 0.27075195, "step": 15895, "time_per_iteration": 2.766310691833496 }, { "auxiliary_loss_clip": 0.01250457, "auxiliary_loss_mlp": 0.00218546, "balance_loss_clip": 1.03313613, "balance_loss_mlp": 0.19457263, "epoch": 0.9557192244100405, "flos": 27855296835840.0, "grad_norm": 10.920498968007214, "language_loss": 0.78938699, "learning_rate": 2.0506029109521593e-08, "loss": 0.80407703, "num_input_tokens_seen": 342865915, "router_z_loss_clip": 2.17480469, "router_z_loss_mlp": 0.23986816, "step": 15896, "time_per_iteration": 4.172297477722168 }, { "auxiliary_loss_clip": 0.01233666, "auxiliary_loss_mlp": 0.0023288, "balance_loss_clip": 1.01953185, "balance_loss_mlp": 0.20875227, "epoch": 0.9557793476627086, "flos": 17602800831360.0, "grad_norm": 65.119240543668, "language_loss": 0.86861992, "learning_rate": 2.045043915311706e-08, "loss": 0.8832854, "num_input_tokens_seen": 342884000, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24145508, "step": 15897, "time_per_iteration": 2.641435146331787 }, { "auxiliary_loss_clip": 0.01246921, "auxiliary_loss_mlp": 0.00229887, "balance_loss_clip": 1.02592659, "balance_loss_mlp": 0.20538951, "epoch": 0.9558394709153766, "flos": 23875496709120.0, "grad_norm": 6.7600612071742106, "language_loss": 0.79458272, "learning_rate": 2.03949242614303e-08, "loss": 0.80935079, "num_input_tokens_seen": 342903095, "router_z_loss_clip": 2.21191406, "router_z_loss_mlp": 0.24487305, "step": 15898, "time_per_iteration": 2.6880149841308594 }, { "auxiliary_loss_clip": 0.01084355, "auxiliary_loss_mlp": 0.00059335, "balance_loss_clip": 0.94837749, "balance_loss_mlp": 0.05289748, "epoch": 0.9558995941680445, "flos": 53682001171200.0, "grad_norm": 0.8166010222731609, "language_loss": 0.51703405, "learning_rate": 2.033948443656652e-08, "loss": 0.52847099, "num_input_tokens_seen": 342958155, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06445312, "step": 15899, "time_per_iteration": 3.111462354660034 }, { "auxiliary_loss_clip": 0.01286395, "auxiliary_loss_mlp": 0.0022905, "balance_loss_clip": 1.05215681, "balance_loss_mlp": 0.20177488, "epoch": 0.9559597174207125, "flos": 13764488376960.0, "grad_norm": 35.826620685942984, "language_loss": 0.80884004, "learning_rate": 2.028411968062782e-08, "loss": 0.82399452, "num_input_tokens_seen": 342972500, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.27282715, "step": 15900, "time_per_iteration": 2.6648924350738525 }, { "auxiliary_loss_clip": 0.01245139, "auxiliary_loss_mlp": 0.0024248, "balance_loss_clip": 1.02468693, "balance_loss_mlp": 0.21599191, "epoch": 0.9560198406733804, "flos": 19936347799680.0, "grad_norm": 270.2633841270569, "language_loss": 0.90506506, "learning_rate": 2.0228829995713627e-08, "loss": 0.91994119, "num_input_tokens_seen": 342989035, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.26477051, "step": 15901, "time_per_iteration": 2.6180145740509033 }, { "auxiliary_loss_clip": 0.01086872, "auxiliary_loss_mlp": 0.00098638, "balance_loss_clip": 0.95079273, "balance_loss_mlp": 0.09086575, "epoch": 0.9560799639260484, "flos": 57289550699520.0, "grad_norm": 0.6986916228166169, "language_loss": 0.5348382, "learning_rate": 2.0173615383920485e-08, "loss": 0.54669333, "num_input_tokens_seen": 343051675, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.07763672, "step": 15902, "time_per_iteration": 3.218428611755371 }, { "auxiliary_loss_clip": 0.01223441, "auxiliary_loss_mlp": 0.0022353, "balance_loss_clip": 1.01533866, "balance_loss_mlp": 0.20036739, "epoch": 0.9561400871787163, "flos": 18917167299840.0, "grad_norm": 97.16681958650014, "language_loss": 0.91564459, "learning_rate": 2.01184758473425e-08, "loss": 0.93011433, "num_input_tokens_seen": 343068895, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.23144531, "step": 15903, "time_per_iteration": 2.730149984359741 }, { "auxiliary_loss_clip": 0.01228674, "auxiliary_loss_mlp": 0.00213939, "balance_loss_clip": 1.01597691, "balance_loss_mlp": 0.18945304, "epoch": 0.9562002104313844, "flos": 18038576632320.0, "grad_norm": 83.76818372810664, "language_loss": 0.88065505, "learning_rate": 2.0063411388070217e-08, "loss": 0.89508116, "num_input_tokens_seen": 343087115, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.24475098, "step": 15904, "time_per_iteration": 2.659135580062866 }, { "auxiliary_loss_clip": 0.01244994, "auxiliary_loss_mlp": 0.00239197, "balance_loss_clip": 1.026245, "balance_loss_mlp": 0.21376985, "epoch": 0.9562603336840523, "flos": 24717673964160.0, "grad_norm": 14.650404858455657, "language_loss": 0.70043761, "learning_rate": 2.0008422008191972e-08, "loss": 0.71527958, "num_input_tokens_seen": 343105575, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25402832, "step": 15905, "time_per_iteration": 2.723362684249878 }, { "auxiliary_loss_clip": 0.01222498, "auxiliary_loss_mlp": 0.00212819, "balance_loss_clip": 1.01297975, "balance_loss_mlp": 0.18964455, "epoch": 0.9563204569367203, "flos": 21177205084800.0, "grad_norm": 434.7777526391976, "language_loss": 0.78855145, "learning_rate": 1.995350770979254e-08, "loss": 0.80290455, "num_input_tokens_seen": 343123025, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.23193359, "step": 15906, "time_per_iteration": 2.6490638256073 }, { "auxiliary_loss_clip": 0.01244364, "auxiliary_loss_mlp": 0.00229822, "balance_loss_clip": 1.02605832, "balance_loss_mlp": 0.20562238, "epoch": 0.9563805801893882, "flos": 20229738088320.0, "grad_norm": 44.1766920936188, "language_loss": 0.80780828, "learning_rate": 1.9898668494954473e-08, "loss": 0.82255018, "num_input_tokens_seen": 343141625, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24182129, "step": 15907, "time_per_iteration": 2.686408042907715 }, { "auxiliary_loss_clip": 0.01224997, "auxiliary_loss_mlp": 0.00201581, "balance_loss_clip": 1.0126971, "balance_loss_mlp": 0.17835921, "epoch": 0.9564407034420562, "flos": 25411001258880.0, "grad_norm": 7.123229354793934, "language_loss": 0.80737293, "learning_rate": 1.9843904365757447e-08, "loss": 0.8216387, "num_input_tokens_seen": 343161300, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.23217773, "step": 15908, "time_per_iteration": 2.6864986419677734 }, { "auxiliary_loss_clip": 0.01244838, "auxiliary_loss_mlp": 0.00207321, "balance_loss_clip": 1.02783036, "balance_loss_mlp": 0.18139294, "epoch": 0.9565008266947241, "flos": 18623884752000.0, "grad_norm": 3.9155384182783446, "language_loss": 0.90314496, "learning_rate": 1.978921532427802e-08, "loss": 0.91766655, "num_input_tokens_seen": 343177815, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.25964355, "step": 15909, "time_per_iteration": 2.673283576965332 }, { "auxiliary_loss_clip": 0.01227449, "auxiliary_loss_mlp": 0.00237502, "balance_loss_clip": 1.01648045, "balance_loss_mlp": 0.21321906, "epoch": 0.9565609499473922, "flos": 24862142465280.0, "grad_norm": 823.1594420849292, "language_loss": 0.76585096, "learning_rate": 1.9734601372590086e-08, "loss": 0.78050041, "num_input_tokens_seen": 343198140, "router_z_loss_clip": 2.11035156, "router_z_loss_mlp": 0.24304199, "step": 15910, "time_per_iteration": 2.6603896617889404 }, { "auxiliary_loss_clip": 0.0125583, "auxiliary_loss_mlp": 0.00231482, "balance_loss_clip": 1.03454518, "balance_loss_mlp": 0.20598298, "epoch": 0.9566210732000601, "flos": 21798459740160.0, "grad_norm": 11.98535381646701, "language_loss": 0.83550346, "learning_rate": 1.968006251276444e-08, "loss": 0.85037655, "num_input_tokens_seen": 343218280, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.25476074, "step": 15911, "time_per_iteration": 2.673964500427246 }, { "auxiliary_loss_clip": 0.01230797, "auxiliary_loss_mlp": 0.00233122, "balance_loss_clip": 1.019081, "balance_loss_mlp": 0.20829055, "epoch": 0.9566811964527281, "flos": 18697609416960.0, "grad_norm": 65.8967945486838, "language_loss": 0.77356827, "learning_rate": 1.9625598746869198e-08, "loss": 0.78820753, "num_input_tokens_seen": 343236850, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.24841309, "step": 15912, "time_per_iteration": 2.7108404636383057 }, { "auxiliary_loss_clip": 0.01272181, "auxiliary_loss_mlp": 0.00220251, "balance_loss_clip": 1.04558802, "balance_loss_mlp": 0.19416848, "epoch": 0.9567413197053961, "flos": 13000632727680.0, "grad_norm": 14.548586147972852, "language_loss": 0.82852709, "learning_rate": 1.95712100769696e-08, "loss": 0.84345144, "num_input_tokens_seen": 343253065, "router_z_loss_clip": 2.26464844, "router_z_loss_mlp": 0.26098633, "step": 15913, "time_per_iteration": 2.8356926441192627 }, { "auxiliary_loss_clip": 0.01235255, "auxiliary_loss_mlp": 0.00214964, "balance_loss_clip": 1.02257252, "balance_loss_mlp": 0.19242191, "epoch": 0.956801442958064, "flos": 19719267955200.0, "grad_norm": 58.905304445457475, "language_loss": 0.81291282, "learning_rate": 1.9516896505128444e-08, "loss": 0.82741499, "num_input_tokens_seen": 343270330, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.22521973, "step": 15914, "time_per_iteration": 2.8147265911102295 }, { "auxiliary_loss_clip": 0.01241895, "auxiliary_loss_mlp": 0.0022546, "balance_loss_clip": 1.02374911, "balance_loss_mlp": 0.20024735, "epoch": 0.956861566210732, "flos": 18222834424320.0, "grad_norm": 12.858078953240724, "language_loss": 0.7499243, "learning_rate": 1.9462658033404965e-08, "loss": 0.76459789, "num_input_tokens_seen": 343289625, "router_z_loss_clip": 2.18066406, "router_z_loss_mlp": 0.25183105, "step": 15915, "time_per_iteration": 2.743666887283325 }, { "auxiliary_loss_clip": 0.0121754, "auxiliary_loss_mlp": 0.00221154, "balance_loss_clip": 1.01005113, "balance_loss_mlp": 0.1992196, "epoch": 0.9569216894634, "flos": 22196960202240.0, "grad_norm": 11.794568840850472, "language_loss": 0.722453, "learning_rate": 1.9408494663855967e-08, "loss": 0.73683995, "num_input_tokens_seen": 343309200, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.21948242, "step": 15916, "time_per_iteration": 2.768366813659668 }, { "auxiliary_loss_clip": 0.01237621, "auxiliary_loss_mlp": 0.00214565, "balance_loss_clip": 1.02704382, "balance_loss_mlp": 0.19105729, "epoch": 0.956981812716068, "flos": 21689291329920.0, "grad_norm": 28.93940060099328, "language_loss": 0.87089527, "learning_rate": 1.935440639853536e-08, "loss": 0.8854171, "num_input_tokens_seen": 343326270, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.23522949, "step": 15917, "time_per_iteration": 2.7304859161376953 }, { "auxiliary_loss_clip": 0.01234627, "auxiliary_loss_mlp": 0.00204015, "balance_loss_clip": 1.01789427, "balance_loss_mlp": 0.17814687, "epoch": 0.9570419359687359, "flos": 13990905757440.0, "grad_norm": 4.944997785652027, "language_loss": 0.83538622, "learning_rate": 1.9300393239494172e-08, "loss": 0.84977269, "num_input_tokens_seen": 343344430, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.25891113, "step": 15918, "time_per_iteration": 2.703498363494873 }, { "auxiliary_loss_clip": 0.01075093, "auxiliary_loss_mlp": 0.00056329, "balance_loss_clip": 0.93959868, "balance_loss_mlp": 0.04960566, "epoch": 0.9571020592214039, "flos": 65196938534400.0, "grad_norm": 0.6864530837422297, "language_loss": 0.52609986, "learning_rate": 1.924645518878032e-08, "loss": 0.53741407, "num_input_tokens_seen": 343416155, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06738281, "step": 15919, "time_per_iteration": 3.250814914703369 }, { "auxiliary_loss_clip": 0.01260112, "auxiliary_loss_mlp": 0.00250543, "balance_loss_clip": 1.03600049, "balance_loss_mlp": 0.22355425, "epoch": 0.9571621824740718, "flos": 17384068961280.0, "grad_norm": 11.674950824952175, "language_loss": 0.88344324, "learning_rate": 1.919259224843972e-08, "loss": 0.8985498, "num_input_tokens_seen": 343431715, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.26989746, "step": 15920, "time_per_iteration": 2.7001428604125977 }, { "auxiliary_loss_clip": 0.01247687, "auxiliary_loss_mlp": 0.0021763, "balance_loss_clip": 1.02853179, "balance_loss_mlp": 0.19248873, "epoch": 0.9572223057267398, "flos": 14538184352640.0, "grad_norm": 62.815352327567325, "language_loss": 0.89102972, "learning_rate": 1.9138804420514298e-08, "loss": 0.90568286, "num_input_tokens_seen": 343450425, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25170898, "step": 15921, "time_per_iteration": 2.6014506816864014 }, { "auxiliary_loss_clip": 0.0126305, "auxiliary_loss_mlp": 0.00250177, "balance_loss_clip": 1.03516173, "balance_loss_mlp": 0.22333059, "epoch": 0.9572824289794077, "flos": 33947793158400.0, "grad_norm": 3.5020670111838963, "language_loss": 0.6195761, "learning_rate": 1.9085091707044197e-08, "loss": 0.6347084, "num_input_tokens_seen": 343470445, "router_z_loss_clip": 2.27832031, "router_z_loss_mlp": 0.26867676, "step": 15922, "time_per_iteration": 2.7867624759674072 }, { "auxiliary_loss_clip": 0.01232018, "auxiliary_loss_mlp": 0.00228065, "balance_loss_clip": 1.01958013, "balance_loss_mlp": 0.20338902, "epoch": 0.9573425522320758, "flos": 18694915896960.0, "grad_norm": 221.7724046475925, "language_loss": 0.89966863, "learning_rate": 1.903145411006557e-08, "loss": 0.91426945, "num_input_tokens_seen": 343485200, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.2467041, "step": 15923, "time_per_iteration": 2.6207916736602783 }, { "auxiliary_loss_clip": 0.01237178, "auxiliary_loss_mlp": 0.00225679, "balance_loss_clip": 1.02546835, "balance_loss_mlp": 0.20102663, "epoch": 0.9574026754847437, "flos": 28510307297280.0, "grad_norm": 3.0565194644960925, "language_loss": 0.83020705, "learning_rate": 1.8977891631613008e-08, "loss": 0.84483564, "num_input_tokens_seen": 343505080, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.24658203, "step": 15924, "time_per_iteration": 2.722047805786133 }, { "auxiliary_loss_clip": 0.01240791, "auxiliary_loss_mlp": 0.00208001, "balance_loss_clip": 1.02231252, "balance_loss_mlp": 0.18347955, "epoch": 0.9574627987374117, "flos": 24352390604160.0, "grad_norm": 10.07935776101723, "language_loss": 0.93399519, "learning_rate": 1.892440427371711e-08, "loss": 0.94848311, "num_input_tokens_seen": 343523995, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24523926, "step": 15925, "time_per_iteration": 2.8003714084625244 }, { "auxiliary_loss_clip": 0.01239344, "auxiliary_loss_mlp": 0.00230738, "balance_loss_clip": 1.02159047, "balance_loss_mlp": 0.2069913, "epoch": 0.9575229219900797, "flos": 23510680225920.0, "grad_norm": 18.498261909115836, "language_loss": 0.85128474, "learning_rate": 1.8870992038406474e-08, "loss": 0.86598551, "num_input_tokens_seen": 343542015, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.23718262, "step": 15926, "time_per_iteration": 4.083993434906006 }, { "auxiliary_loss_clip": 0.01218041, "auxiliary_loss_mlp": 0.00223916, "balance_loss_clip": 1.01163673, "balance_loss_mlp": 0.20132574, "epoch": 0.9575830452427476, "flos": 22674823764480.0, "grad_norm": 9.913960155852877, "language_loss": 0.85872042, "learning_rate": 1.8817654927706373e-08, "loss": 0.87313998, "num_input_tokens_seen": 343561680, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.22595215, "step": 15927, "time_per_iteration": 2.6955087184906006 }, { "auxiliary_loss_clip": 0.01244503, "auxiliary_loss_mlp": 0.00208445, "balance_loss_clip": 1.02574754, "balance_loss_mlp": 0.1826123, "epoch": 0.9576431684954156, "flos": 30485250835200.0, "grad_norm": 6.365563804515498, "language_loss": 0.79112267, "learning_rate": 1.8764392943639183e-08, "loss": 0.80565214, "num_input_tokens_seen": 343585290, "router_z_loss_clip": 2.18847656, "router_z_loss_mlp": 0.25830078, "step": 15928, "time_per_iteration": 4.2189271450042725 }, { "auxiliary_loss_clip": 0.01257034, "auxiliary_loss_mlp": 0.00244113, "balance_loss_clip": 1.03594422, "balance_loss_mlp": 0.21927023, "epoch": 0.9577032917480836, "flos": 21687387909120.0, "grad_norm": 57.161611549720675, "language_loss": 0.89095914, "learning_rate": 1.871120608822485e-08, "loss": 0.90597057, "num_input_tokens_seen": 343604045, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.24853516, "step": 15929, "time_per_iteration": 2.695303201675415 }, { "auxiliary_loss_clip": 0.012664, "auxiliary_loss_mlp": 0.00234275, "balance_loss_clip": 1.03987908, "balance_loss_mlp": 0.20770347, "epoch": 0.9577634150007516, "flos": 29023147728000.0, "grad_norm": 10.340216679243799, "language_loss": 0.79556477, "learning_rate": 1.8658094363480202e-08, "loss": 0.81057143, "num_input_tokens_seen": 343626595, "router_z_loss_clip": 2.26269531, "router_z_loss_mlp": 0.26574707, "step": 15930, "time_per_iteration": 2.76374888420105 }, { "auxiliary_loss_clip": 0.0123089, "auxiliary_loss_mlp": 0.00217518, "balance_loss_clip": 1.01788306, "balance_loss_mlp": 0.19443919, "epoch": 0.9578235382534195, "flos": 19282235178240.0, "grad_norm": 6.731238866114594, "language_loss": 0.70711613, "learning_rate": 1.8605057771419185e-08, "loss": 0.72160023, "num_input_tokens_seen": 343646195, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.23083496, "step": 15931, "time_per_iteration": 2.640101432800293 }, { "auxiliary_loss_clip": 0.01233203, "auxiliary_loss_mlp": 0.00216386, "balance_loss_clip": 1.02476335, "balance_loss_mlp": 0.19327137, "epoch": 0.9578836615060875, "flos": 13699275235200.0, "grad_norm": 6.34569102565622, "language_loss": 0.78714442, "learning_rate": 1.8552096314052633e-08, "loss": 0.80164033, "num_input_tokens_seen": 343663665, "router_z_loss_clip": 2.08691406, "router_z_loss_mlp": 0.23132324, "step": 15932, "time_per_iteration": 2.675977945327759 }, { "auxiliary_loss_clip": 0.0124068, "auxiliary_loss_mlp": 0.00217293, "balance_loss_clip": 1.02264941, "balance_loss_mlp": 0.19217509, "epoch": 0.9579437847587554, "flos": 17054516655360.0, "grad_norm": 5.062825875338505, "language_loss": 0.83797127, "learning_rate": 1.849920999338961e-08, "loss": 0.85255098, "num_input_tokens_seen": 343682145, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.2512207, "step": 15933, "time_per_iteration": 2.6474924087524414 }, { "auxiliary_loss_clip": 0.01085492, "auxiliary_loss_mlp": 0.00091758, "balance_loss_clip": 0.9460727, "balance_loss_mlp": 0.08532095, "epoch": 0.9580039080114234, "flos": 60570887886720.0, "grad_norm": 0.771698629647379, "language_loss": 0.56679499, "learning_rate": 1.8446398811434948e-08, "loss": 0.5785675, "num_input_tokens_seen": 343744685, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.06445312, "step": 15934, "time_per_iteration": 3.2700588703155518 }, { "auxiliary_loss_clip": 0.01085803, "auxiliary_loss_mlp": 0.00076792, "balance_loss_clip": 0.94735265, "balance_loss_mlp": 0.06997342, "epoch": 0.9580640312640913, "flos": 66235365745920.0, "grad_norm": 0.9231450210099374, "language_loss": 0.65114504, "learning_rate": 1.8393662770191277e-08, "loss": 0.66277099, "num_input_tokens_seen": 343801835, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.06835938, "step": 15935, "time_per_iteration": 3.1240625381469727 }, { "auxiliary_loss_clip": 0.01084516, "auxiliary_loss_mlp": 0.00077481, "balance_loss_clip": 0.94810927, "balance_loss_mlp": 0.07061409, "epoch": 0.9581241545167594, "flos": 62218002971520.0, "grad_norm": 1.2682911897164928, "language_loss": 0.56021047, "learning_rate": 1.8341001871658546e-08, "loss": 0.57183039, "num_input_tokens_seen": 343861515, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06884766, "step": 15936, "time_per_iteration": 4.5569727420806885 }, { "auxiliary_loss_clip": 0.01235018, "auxiliary_loss_mlp": 0.00215317, "balance_loss_clip": 1.02064705, "balance_loss_mlp": 0.19002058, "epoch": 0.9581842777694273, "flos": 23768088065280.0, "grad_norm": 227.98601711354974, "language_loss": 0.84960777, "learning_rate": 1.8288416117833825e-08, "loss": 0.86411107, "num_input_tokens_seen": 343881240, "router_z_loss_clip": 2.14355469, "router_z_loss_mlp": 0.25317383, "step": 15937, "time_per_iteration": 2.685835361480713 }, { "auxiliary_loss_clip": 0.01235263, "auxiliary_loss_mlp": 0.00215358, "balance_loss_clip": 1.02149701, "balance_loss_mlp": 0.19171843, "epoch": 0.9582444010220953, "flos": 21213079793280.0, "grad_norm": 201.49208286132023, "language_loss": 0.75102943, "learning_rate": 1.8235905510710636e-08, "loss": 0.76553565, "num_input_tokens_seen": 343900885, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.23657227, "step": 15938, "time_per_iteration": 4.153298377990723 }, { "auxiliary_loss_clip": 0.0124105, "auxiliary_loss_mlp": 0.00208033, "balance_loss_clip": 1.02339959, "balance_loss_mlp": 0.18279657, "epoch": 0.9583045242747633, "flos": 23805147922560.0, "grad_norm": 29.778663762207774, "language_loss": 0.75155139, "learning_rate": 1.8183470052280712e-08, "loss": 0.76604223, "num_input_tokens_seen": 343918460, "router_z_loss_clip": 2.17675781, "router_z_loss_mlp": 0.25256348, "step": 15939, "time_per_iteration": 2.6777939796447754 }, { "auxiliary_loss_clip": 0.01236298, "auxiliary_loss_mlp": 0.00212226, "balance_loss_clip": 1.02033818, "balance_loss_mlp": 0.1882056, "epoch": 0.9583646475274312, "flos": 24131468004480.0, "grad_norm": 40.63321780662848, "language_loss": 0.813739, "learning_rate": 1.8131109744532025e-08, "loss": 0.82822424, "num_input_tokens_seen": 343938030, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.2401123, "step": 15940, "time_per_iteration": 2.747972011566162 }, { "auxiliary_loss_clip": 0.01244301, "auxiliary_loss_mlp": 0.00233206, "balance_loss_clip": 1.02958333, "balance_loss_mlp": 0.20869663, "epoch": 0.9584247707800992, "flos": 20886651970560.0, "grad_norm": 5.615152832245876, "language_loss": 0.78436553, "learning_rate": 1.8078824589450535e-08, "loss": 0.79914063, "num_input_tokens_seen": 343956635, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24499512, "step": 15941, "time_per_iteration": 2.691894292831421 }, { "auxiliary_loss_clip": 0.01237007, "auxiliary_loss_mlp": 0.00216618, "balance_loss_clip": 1.01822734, "balance_loss_mlp": 0.19092828, "epoch": 0.9584848940327672, "flos": 26067591918720.0, "grad_norm": 10.932385313947638, "language_loss": 0.80878943, "learning_rate": 1.8026614589018442e-08, "loss": 0.82332563, "num_input_tokens_seen": 343976625, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25695801, "step": 15942, "time_per_iteration": 2.7199442386627197 }, { "auxiliary_loss_clip": 0.01263565, "auxiliary_loss_mlp": 0.00248738, "balance_loss_clip": 1.03975105, "balance_loss_mlp": 0.221523, "epoch": 0.9585450172854352, "flos": 34492988764800.0, "grad_norm": 12.585601489018357, "language_loss": 0.77954161, "learning_rate": 1.797447974521571e-08, "loss": 0.79466462, "num_input_tokens_seen": 343997790, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.27197266, "step": 15943, "time_per_iteration": 2.797304391860962 }, { "auxiliary_loss_clip": 0.01241538, "auxiliary_loss_mlp": 0.00236064, "balance_loss_clip": 1.02659464, "balance_loss_mlp": 0.21234107, "epoch": 0.9586051405381031, "flos": 23110743219840.0, "grad_norm": 16.70943876026597, "language_loss": 0.77835703, "learning_rate": 1.792242006001965e-08, "loss": 0.79313302, "num_input_tokens_seen": 344016935, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.23730469, "step": 15944, "time_per_iteration": 2.6319422721862793 }, { "auxiliary_loss_clip": 0.01237677, "auxiliary_loss_mlp": 0.00219569, "balance_loss_clip": 1.02231467, "balance_loss_mlp": 0.1941779, "epoch": 0.9586652637907711, "flos": 19603994232960.0, "grad_norm": 41.47750343514497, "language_loss": 0.74929971, "learning_rate": 1.7870435535403795e-08, "loss": 0.76387215, "num_input_tokens_seen": 344035590, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.25415039, "step": 15945, "time_per_iteration": 2.7991485595703125 }, { "auxiliary_loss_clip": 0.01081458, "auxiliary_loss_mlp": 0.00051716, "balance_loss_clip": 0.9447782, "balance_loss_mlp": 0.04577913, "epoch": 0.958725387043439, "flos": 72073327317120.0, "grad_norm": 1.3892362992953164, "language_loss": 0.60752285, "learning_rate": 1.7818526173339678e-08, "loss": 0.61885464, "num_input_tokens_seen": 344100845, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.05932617, "step": 15946, "time_per_iteration": 3.212904930114746 }, { "auxiliary_loss_clip": 0.01218549, "auxiliary_loss_mlp": 0.00219589, "balance_loss_clip": 1.00850153, "balance_loss_mlp": 0.19491276, "epoch": 0.958785510296107, "flos": 28911932242560.0, "grad_norm": 27.749108004146336, "language_loss": 0.83144337, "learning_rate": 1.7766691975795723e-08, "loss": 0.84582472, "num_input_tokens_seen": 344121780, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.24682617, "step": 15947, "time_per_iteration": 2.7232885360717773 }, { "auxiliary_loss_clip": 0.01232539, "auxiliary_loss_mlp": 0.00223633, "balance_loss_clip": 1.01982069, "balance_loss_mlp": 0.19836034, "epoch": 0.958845633548775, "flos": 18477189607680.0, "grad_norm": 7.771897169768314, "language_loss": 0.80773497, "learning_rate": 1.771493294473747e-08, "loss": 0.82229668, "num_input_tokens_seen": 344140150, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.25292969, "step": 15948, "time_per_iteration": 2.6499760150909424 }, { "auxiliary_loss_clip": 0.01242454, "auxiliary_loss_mlp": 0.00219088, "balance_loss_clip": 1.02668405, "balance_loss_mlp": 0.19432794, "epoch": 0.958905756801443, "flos": 24206916522240.0, "grad_norm": 20.473381154955653, "language_loss": 0.87777555, "learning_rate": 1.7663249082127574e-08, "loss": 0.89239097, "num_input_tokens_seen": 344158200, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24768066, "step": 15949, "time_per_iteration": 2.6811866760253906 }, { "auxiliary_loss_clip": 0.01256278, "auxiliary_loss_mlp": 0.00235527, "balance_loss_clip": 1.03542233, "balance_loss_mlp": 0.210815, "epoch": 0.9589658800541109, "flos": 25007939769600.0, "grad_norm": 57.84139942192315, "language_loss": 0.75275302, "learning_rate": 1.761164038992602e-08, "loss": 0.76767105, "num_input_tokens_seen": 344174720, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.24731445, "step": 15950, "time_per_iteration": 2.779010534286499 }, { "auxiliary_loss_clip": 0.01237786, "auxiliary_loss_mlp": 0.00229319, "balance_loss_clip": 1.02109885, "balance_loss_mlp": 0.20466611, "epoch": 0.9590260033067789, "flos": 23514558894720.0, "grad_norm": 6.531131279492148, "language_loss": 0.9166795, "learning_rate": 1.7560106870089687e-08, "loss": 0.93135053, "num_input_tokens_seen": 344192580, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24609375, "step": 15951, "time_per_iteration": 2.689840316772461 }, { "auxiliary_loss_clip": 0.01273048, "auxiliary_loss_mlp": 0.0023996, "balance_loss_clip": 1.0399456, "balance_loss_mlp": 0.21287596, "epoch": 0.9590861265594469, "flos": 25520349237120.0, "grad_norm": 13.949562302174128, "language_loss": 0.91226876, "learning_rate": 1.7508648524572568e-08, "loss": 0.9273988, "num_input_tokens_seen": 344210345, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.27075195, "step": 15952, "time_per_iteration": 2.7003345489501953 }, { "auxiliary_loss_clip": 0.01246013, "auxiliary_loss_mlp": 0.00232762, "balance_loss_clip": 1.02843928, "balance_loss_mlp": 0.2077879, "epoch": 0.9591462498121148, "flos": 21179323987200.0, "grad_norm": 4.644159547058579, "language_loss": 0.76747394, "learning_rate": 1.7457265355326434e-08, "loss": 0.78226167, "num_input_tokens_seen": 344229540, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24975586, "step": 15953, "time_per_iteration": 2.69694185256958 }, { "auxiliary_loss_clip": 0.01241264, "auxiliary_loss_mlp": 0.00239911, "balance_loss_clip": 1.01898241, "balance_loss_mlp": 0.21406657, "epoch": 0.9592063730647828, "flos": 21723047136000.0, "grad_norm": 32.270364495961495, "language_loss": 0.69964105, "learning_rate": 1.7405957364299285e-08, "loss": 0.71445286, "num_input_tokens_seen": 344247830, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25842285, "step": 15954, "time_per_iteration": 2.7470951080322266 }, { "auxiliary_loss_clip": 0.01255116, "auxiliary_loss_mlp": 0.00240695, "balance_loss_clip": 1.03096199, "balance_loss_mlp": 0.21378908, "epoch": 0.9592664963174508, "flos": 29891395278720.0, "grad_norm": 15.190123707692134, "language_loss": 0.80602998, "learning_rate": 1.7354724553437117e-08, "loss": 0.82098812, "num_input_tokens_seen": 344267760, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26916504, "step": 15955, "time_per_iteration": 2.718113422393799 }, { "auxiliary_loss_clip": 0.01226891, "auxiliary_loss_mlp": 0.00227759, "balance_loss_clip": 1.01027513, "balance_loss_mlp": 0.20119914, "epoch": 0.9593266195701188, "flos": 17999613354240.0, "grad_norm": 3.0765382166215622, "language_loss": 0.71874583, "learning_rate": 1.7303566924682378e-08, "loss": 0.73329228, "num_input_tokens_seen": 344284905, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.26550293, "step": 15956, "time_per_iteration": 2.7358133792877197 }, { "auxiliary_loss_clip": 0.01236686, "auxiliary_loss_mlp": 0.00209601, "balance_loss_clip": 1.01601887, "balance_loss_mlp": 0.18288597, "epoch": 0.9593867428227867, "flos": 18838271076480.0, "grad_norm": 81.37440947153556, "language_loss": 0.69996786, "learning_rate": 1.725248447997507e-08, "loss": 0.71443063, "num_input_tokens_seen": 344302025, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.26696777, "step": 15957, "time_per_iteration": 2.6533398628234863 }, { "auxiliary_loss_clip": 0.01278504, "auxiliary_loss_mlp": 0.00224121, "balance_loss_clip": 1.04912722, "balance_loss_mlp": 0.19715557, "epoch": 0.9594468660754547, "flos": 29567050444800.0, "grad_norm": 75.96590726456219, "language_loss": 0.83091795, "learning_rate": 1.7201477221252314e-08, "loss": 0.84594423, "num_input_tokens_seen": 344321935, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.2701416, "step": 15958, "time_per_iteration": 2.7950828075408936 }, { "auxiliary_loss_clip": 0.01224881, "auxiliary_loss_mlp": 0.00219703, "balance_loss_clip": 1.01188004, "balance_loss_mlp": 0.19559871, "epoch": 0.9595069893281226, "flos": 20703256104960.0, "grad_norm": 7.730235145322037, "language_loss": 0.81198502, "learning_rate": 1.7150545150448116e-08, "loss": 0.8264308, "num_input_tokens_seen": 344340405, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.24133301, "step": 15959, "time_per_iteration": 2.735825777053833 }, { "auxiliary_loss_clip": 0.01241596, "auxiliary_loss_mlp": 0.00217817, "balance_loss_clip": 1.02264559, "balance_loss_mlp": 0.1921279, "epoch": 0.9595671125807906, "flos": 22453613856000.0, "grad_norm": 8.87321350925989, "language_loss": 0.77466559, "learning_rate": 1.7099688269493816e-08, "loss": 0.78925973, "num_input_tokens_seen": 344359925, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25695801, "step": 15960, "time_per_iteration": 2.700773239135742 }, { "auxiliary_loss_clip": 0.01230214, "auxiliary_loss_mlp": 0.00231906, "balance_loss_clip": 1.01732588, "balance_loss_mlp": 0.20570403, "epoch": 0.9596272358334585, "flos": 23915214172800.0, "grad_norm": 5.25175740173542, "language_loss": 0.8489219, "learning_rate": 1.7048906580318544e-08, "loss": 0.86354315, "num_input_tokens_seen": 344379100, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.26196289, "step": 15961, "time_per_iteration": 2.686474323272705 }, { "auxiliary_loss_clip": 0.01229335, "auxiliary_loss_mlp": 0.00230954, "balance_loss_clip": 1.01717007, "balance_loss_mlp": 0.20794667, "epoch": 0.9596873590861266, "flos": 17672539086720.0, "grad_norm": 74.53591552744773, "language_loss": 0.83661747, "learning_rate": 1.699820008484698e-08, "loss": 0.85122037, "num_input_tokens_seen": 344396895, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.23010254, "step": 15962, "time_per_iteration": 2.619694471359253 }, { "auxiliary_loss_clip": 0.0124226, "auxiliary_loss_mlp": 0.00266638, "balance_loss_clip": 1.02593005, "balance_loss_mlp": 0.23802751, "epoch": 0.9597474823387945, "flos": 25808532053760.0, "grad_norm": 7.391797460805174, "language_loss": 0.8199175, "learning_rate": 1.6947568785002698e-08, "loss": 0.83500648, "num_input_tokens_seen": 344415115, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.28601074, "step": 15963, "time_per_iteration": 2.6970748901367188 }, { "auxiliary_loss_clip": 0.01218121, "auxiliary_loss_mlp": 0.00203776, "balance_loss_clip": 1.01123083, "balance_loss_mlp": 0.18140058, "epoch": 0.9598076055914625, "flos": 23768519028480.0, "grad_norm": 24.28100492726069, "language_loss": 0.80507231, "learning_rate": 1.689701268270527e-08, "loss": 0.81929123, "num_input_tokens_seen": 344435185, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.22399902, "step": 15964, "time_per_iteration": 2.6624326705932617 }, { "auxiliary_loss_clip": 0.0107452, "auxiliary_loss_mlp": 0.00043124, "balance_loss_clip": 0.93739879, "balance_loss_mlp": 0.03733046, "epoch": 0.9598677288441305, "flos": 56515962464640.0, "grad_norm": 1.0605869786334077, "language_loss": 0.56964982, "learning_rate": 1.684653177987161e-08, "loss": 0.58082628, "num_input_tokens_seen": 344488950, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.05786133, "step": 15965, "time_per_iteration": 3.176532506942749 }, { "auxiliary_loss_clip": 0.01248922, "auxiliary_loss_mlp": 0.00218392, "balance_loss_clip": 1.0289762, "balance_loss_mlp": 0.19458576, "epoch": 0.9599278520967984, "flos": 22997480659200.0, "grad_norm": 3.522406266792881, "language_loss": 0.84665602, "learning_rate": 1.6796126078416627e-08, "loss": 0.8613292, "num_input_tokens_seen": 344506740, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.23828125, "step": 15966, "time_per_iteration": 2.6461853981018066 }, { "auxiliary_loss_clip": 0.01235602, "auxiliary_loss_mlp": 0.00227526, "balance_loss_clip": 1.02135539, "balance_loss_mlp": 0.20168176, "epoch": 0.9599879753494664, "flos": 23039676161280.0, "grad_norm": 12.85437816837059, "language_loss": 0.86602283, "learning_rate": 1.674579558025102e-08, "loss": 0.8806541, "num_input_tokens_seen": 344526670, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.25854492, "step": 15967, "time_per_iteration": 2.730241298675537 }, { "auxiliary_loss_clip": 0.01258545, "auxiliary_loss_mlp": 0.00227131, "balance_loss_clip": 1.03032541, "balance_loss_mlp": 0.20052361, "epoch": 0.9600480986021344, "flos": 16392287560320.0, "grad_norm": 31.311893056058967, "language_loss": 0.90425193, "learning_rate": 1.669554028728348e-08, "loss": 0.91910863, "num_input_tokens_seen": 344541995, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.26611328, "step": 15968, "time_per_iteration": 2.6620426177978516 }, { "auxiliary_loss_clip": 0.01260228, "auxiliary_loss_mlp": 0.00241758, "balance_loss_clip": 1.03422403, "balance_loss_mlp": 0.21599722, "epoch": 0.9601082218548024, "flos": 24276439296000.0, "grad_norm": 111.08449222666309, "language_loss": 0.787875, "learning_rate": 1.6645360201420044e-08, "loss": 0.80289483, "num_input_tokens_seen": 344559980, "router_z_loss_clip": 2.25683594, "router_z_loss_mlp": 0.25744629, "step": 15969, "time_per_iteration": 4.0765769481658936 }, { "auxiliary_loss_clip": 0.01242111, "auxiliary_loss_mlp": 0.00237599, "balance_loss_clip": 1.02597213, "balance_loss_mlp": 0.21289897, "epoch": 0.9601683451074703, "flos": 19609991804160.0, "grad_norm": 98.57274819820601, "language_loss": 0.88086843, "learning_rate": 1.6595255324563186e-08, "loss": 0.89566553, "num_input_tokens_seen": 344577765, "router_z_loss_clip": 2.16699219, "router_z_loss_mlp": 0.24719238, "step": 15970, "time_per_iteration": 4.077553987503052 }, { "auxiliary_loss_clip": 0.01242014, "auxiliary_loss_mlp": 0.0022549, "balance_loss_clip": 1.02610326, "balance_loss_mlp": 0.20094456, "epoch": 0.9602284683601383, "flos": 26651104358400.0, "grad_norm": 12.85000161317417, "language_loss": 0.83179367, "learning_rate": 1.654522565861316e-08, "loss": 0.84646869, "num_input_tokens_seen": 344597650, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.24536133, "step": 15971, "time_per_iteration": 2.7990846633911133 }, { "auxiliary_loss_clip": 0.01251964, "auxiliary_loss_mlp": 0.00253374, "balance_loss_clip": 1.02635455, "balance_loss_mlp": 0.22358373, "epoch": 0.9602885916128062, "flos": 15554096714880.0, "grad_norm": 20.811595878718666, "language_loss": 0.7506392, "learning_rate": 1.64952712054669e-08, "loss": 0.76569259, "num_input_tokens_seen": 344613580, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.29797363, "step": 15972, "time_per_iteration": 2.6459720134735107 }, { "auxiliary_loss_clip": 0.01233009, "auxiliary_loss_mlp": 0.00228846, "balance_loss_clip": 1.01955748, "balance_loss_mlp": 0.20474172, "epoch": 0.9603487148654742, "flos": 16502353810560.0, "grad_norm": 45.95099964452309, "language_loss": 0.83929384, "learning_rate": 1.644539196701844e-08, "loss": 0.85391235, "num_input_tokens_seen": 344626910, "router_z_loss_clip": 2.13574219, "router_z_loss_mlp": 0.2409668, "step": 15973, "time_per_iteration": 2.6219522953033447 }, { "auxiliary_loss_clip": 0.01225111, "auxiliary_loss_mlp": 0.00212336, "balance_loss_clip": 1.01701701, "balance_loss_mlp": 0.18913756, "epoch": 0.9604088381181421, "flos": 20845354308480.0, "grad_norm": 24.423503758393455, "language_loss": 0.75223374, "learning_rate": 1.639558794515983e-08, "loss": 0.76660824, "num_input_tokens_seen": 344644330, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.23193359, "step": 15974, "time_per_iteration": 2.6559197902679443 }, { "auxiliary_loss_clip": 0.0124353, "auxiliary_loss_mlp": 0.00209047, "balance_loss_clip": 1.0263015, "balance_loss_mlp": 0.18381086, "epoch": 0.9604689613708102, "flos": 19683105937920.0, "grad_norm": 567.6978911996267, "language_loss": 0.75696886, "learning_rate": 1.6345859141779105e-08, "loss": 0.77149469, "num_input_tokens_seen": 344663910, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.25219727, "step": 15975, "time_per_iteration": 2.708954095840454 }, { "auxiliary_loss_clip": 0.01220537, "auxiliary_loss_mlp": 0.00200794, "balance_loss_clip": 1.01334453, "balance_loss_mlp": 0.17702374, "epoch": 0.9605290846234781, "flos": 24097568544000.0, "grad_norm": 406.0729957797124, "language_loss": 0.65627599, "learning_rate": 1.6296205558762322e-08, "loss": 0.67048931, "num_input_tokens_seen": 344682320, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.2376709, "step": 15976, "time_per_iteration": 2.6741058826446533 }, { "auxiliary_loss_clip": 0.01218083, "auxiliary_loss_mlp": 0.0022483, "balance_loss_clip": 1.01001501, "balance_loss_mlp": 0.20005873, "epoch": 0.9605892078761461, "flos": 27122575299840.0, "grad_norm": 627.1399249780445, "language_loss": 0.74704051, "learning_rate": 1.624662719799219e-08, "loss": 0.76146966, "num_input_tokens_seen": 344701355, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.24780273, "step": 15977, "time_per_iteration": 2.7639200687408447 }, { "auxiliary_loss_clip": 0.01220298, "auxiliary_loss_mlp": 0.00209768, "balance_loss_clip": 1.00671709, "balance_loss_mlp": 0.18593821, "epoch": 0.9606493311288141, "flos": 14136918543360.0, "grad_norm": 5.203916626320149, "language_loss": 0.90661657, "learning_rate": 1.6197124061348766e-08, "loss": 0.92091721, "num_input_tokens_seen": 344717980, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.23840332, "step": 15978, "time_per_iteration": 4.174534320831299 }, { "auxiliary_loss_clip": 0.01268796, "auxiliary_loss_mlp": 0.00221151, "balance_loss_clip": 1.03908932, "balance_loss_mlp": 0.19364971, "epoch": 0.960709454381482, "flos": 15813336147840.0, "grad_norm": 663.7601205374656, "language_loss": 0.92558652, "learning_rate": 1.614769615070921e-08, "loss": 0.94048595, "num_input_tokens_seen": 344733480, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.27514648, "step": 15979, "time_per_iteration": 2.6592459678649902 }, { "auxiliary_loss_clip": 0.01255919, "auxiliary_loss_mlp": 0.00234165, "balance_loss_clip": 1.03442955, "balance_loss_mlp": 0.21012081, "epoch": 0.96076957763415, "flos": 22565403959040.0, "grad_norm": 9.062463266687596, "language_loss": 0.88011748, "learning_rate": 1.6098343467947805e-08, "loss": 0.89501834, "num_input_tokens_seen": 344752130, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.24060059, "step": 15980, "time_per_iteration": 4.120023488998413 }, { "auxiliary_loss_clip": 0.01250198, "auxiliary_loss_mlp": 0.00229712, "balance_loss_clip": 1.02711618, "balance_loss_mlp": 0.20304462, "epoch": 0.960829700886818, "flos": 24681260551680.0, "grad_norm": 223.5245389722119, "language_loss": 0.76244473, "learning_rate": 1.6049066014935942e-08, "loss": 0.77724385, "num_input_tokens_seen": 344771195, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.26672363, "step": 15981, "time_per_iteration": 2.741119623184204 }, { "auxiliary_loss_clip": 0.01226565, "auxiliary_loss_mlp": 0.00225621, "balance_loss_clip": 1.01539111, "balance_loss_mlp": 0.20280465, "epoch": 0.960889824139486, "flos": 26542223256960.0, "grad_norm": 22.682792721485242, "language_loss": 0.77175117, "learning_rate": 1.5999863793542344e-08, "loss": 0.78627306, "num_input_tokens_seen": 344793150, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.22827148, "step": 15982, "time_per_iteration": 2.705253839492798 }, { "auxiliary_loss_clip": 0.0107976, "auxiliary_loss_mlp": 0.00115943, "balance_loss_clip": 0.94331849, "balance_loss_mlp": 0.10821827, "epoch": 0.9609499473921539, "flos": 71114942586240.0, "grad_norm": 0.6669234889349531, "language_loss": 0.5283258, "learning_rate": 1.595073680563286e-08, "loss": 0.54028285, "num_input_tokens_seen": 344852855, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.07714844, "step": 15983, "time_per_iteration": 3.2680556774139404 }, { "auxiliary_loss_clip": 0.01240779, "auxiliary_loss_mlp": 0.00232799, "balance_loss_clip": 1.02168238, "balance_loss_mlp": 0.20566732, "epoch": 0.9610100706448219, "flos": 20552466810240.0, "grad_norm": 2127.070369428902, "language_loss": 0.77831501, "learning_rate": 1.5901685053070212e-08, "loss": 0.79305077, "num_input_tokens_seen": 344869830, "router_z_loss_clip": 2.19433594, "router_z_loss_mlp": 0.27124023, "step": 15984, "time_per_iteration": 2.6434197425842285 }, { "auxiliary_loss_clip": 0.01215898, "auxiliary_loss_mlp": 0.00229265, "balance_loss_clip": 1.01143622, "balance_loss_mlp": 0.20698512, "epoch": 0.9610701938974898, "flos": 14064199459200.0, "grad_norm": 91.73398460032372, "language_loss": 0.75889838, "learning_rate": 1.5852708537714477e-08, "loss": 0.77335006, "num_input_tokens_seen": 344888905, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.22277832, "step": 15985, "time_per_iteration": 2.701310634613037 }, { "auxiliary_loss_clip": 0.01243757, "auxiliary_loss_mlp": 0.00239023, "balance_loss_clip": 1.02986622, "balance_loss_mlp": 0.21547963, "epoch": 0.9611303171501578, "flos": 20229989483520.0, "grad_norm": 16.688619520016573, "language_loss": 0.86604214, "learning_rate": 1.580380726142283e-08, "loss": 0.88086998, "num_input_tokens_seen": 344907160, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.23535156, "step": 15986, "time_per_iteration": 2.647540330886841 }, { "auxiliary_loss_clip": 0.0125287, "auxiliary_loss_mlp": 0.00234686, "balance_loss_clip": 1.0329144, "balance_loss_mlp": 0.20971204, "epoch": 0.9611904404028258, "flos": 20951075013120.0, "grad_norm": 109.37862831168249, "language_loss": 0.72853833, "learning_rate": 1.5754981226049792e-08, "loss": 0.74341393, "num_input_tokens_seen": 344922400, "router_z_loss_clip": 2.20214844, "router_z_loss_mlp": 0.25, "step": 15987, "time_per_iteration": 2.6470534801483154 }, { "auxiliary_loss_clip": 0.01241391, "auxiliary_loss_mlp": 0.00213684, "balance_loss_clip": 1.02690446, "balance_loss_mlp": 0.18907873, "epoch": 0.9612505636554938, "flos": 24827740214400.0, "grad_norm": 30.380931107448582, "language_loss": 0.71649033, "learning_rate": 1.5706230433446544e-08, "loss": 0.73104101, "num_input_tokens_seen": 344941910, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.24597168, "step": 15988, "time_per_iteration": 2.699697494506836 }, { "auxiliary_loss_clip": 0.01227558, "auxiliary_loss_mlp": 0.00239966, "balance_loss_clip": 1.01680911, "balance_loss_mlp": 0.21666095, "epoch": 0.9613106869081617, "flos": 17164977955200.0, "grad_norm": 266.73201484310965, "language_loss": 0.8165729, "learning_rate": 1.5657554885462055e-08, "loss": 0.83124816, "num_input_tokens_seen": 344960020, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.23303223, "step": 15989, "time_per_iteration": 2.629465341567993 }, { "auxiliary_loss_clip": 0.01077481, "auxiliary_loss_mlp": 0.00064872, "balance_loss_clip": 0.93940878, "balance_loss_mlp": 0.05941209, "epoch": 0.9613708101608297, "flos": 61563818522880.0, "grad_norm": 0.799933855185931, "language_loss": 0.62860483, "learning_rate": 1.5608954583941737e-08, "loss": 0.64002836, "num_input_tokens_seen": 345018290, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.0546875, "step": 15990, "time_per_iteration": 3.054628372192383 }, { "auxiliary_loss_clip": 0.01238695, "auxiliary_loss_mlp": 0.00234281, "balance_loss_clip": 1.02034342, "balance_loss_mlp": 0.20879391, "epoch": 0.9614309334134977, "flos": 27417904922880.0, "grad_norm": 29.91382051003049, "language_loss": 0.87859643, "learning_rate": 1.5560429530729003e-08, "loss": 0.89332616, "num_input_tokens_seen": 345040235, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25439453, "step": 15991, "time_per_iteration": 2.731978178024292 }, { "auxiliary_loss_clip": 0.01244318, "auxiliary_loss_mlp": 0.00227816, "balance_loss_clip": 1.02296591, "balance_loss_mlp": 0.20275854, "epoch": 0.9614910566661656, "flos": 22819148611200.0, "grad_norm": 19.79734238262538, "language_loss": 0.96620643, "learning_rate": 1.5511979727663493e-08, "loss": 0.98092777, "num_input_tokens_seen": 345054540, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25073242, "step": 15992, "time_per_iteration": 2.725853681564331 }, { "auxiliary_loss_clip": 0.01241396, "auxiliary_loss_mlp": 0.00248356, "balance_loss_clip": 1.02346909, "balance_loss_mlp": 0.22389433, "epoch": 0.9615511799188337, "flos": 20667812359680.0, "grad_norm": 250.5092454152884, "language_loss": 0.81260806, "learning_rate": 1.5463605176582406e-08, "loss": 0.82750559, "num_input_tokens_seen": 345074035, "router_z_loss_clip": 2.18066406, "router_z_loss_mlp": 0.2442627, "step": 15993, "time_per_iteration": 2.6639461517333984 }, { "auxiliary_loss_clip": 0.01239158, "auxiliary_loss_mlp": 0.00233515, "balance_loss_clip": 1.02370954, "balance_loss_mlp": 0.20832609, "epoch": 0.9616113031715016, "flos": 33149212035840.0, "grad_norm": 68.48854326557878, "language_loss": 0.73996973, "learning_rate": 1.5415305879320716e-08, "loss": 0.75469649, "num_input_tokens_seen": 345099270, "router_z_loss_clip": 2.15527344, "router_z_loss_mlp": 0.2520752, "step": 15994, "time_per_iteration": 2.8319759368896484 }, { "auxiliary_loss_clip": 0.01224656, "auxiliary_loss_mlp": 0.00242739, "balance_loss_clip": 1.01208007, "balance_loss_mlp": 0.21914771, "epoch": 0.9616714264241696, "flos": 25009807276800.0, "grad_norm": 47.79394965923078, "language_loss": 0.89550966, "learning_rate": 1.5367081837709183e-08, "loss": 0.91018361, "num_input_tokens_seen": 345116975, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.23583984, "step": 15995, "time_per_iteration": 2.6702170372009277 }, { "auxiliary_loss_clip": 0.01246743, "auxiliary_loss_mlp": 0.00231196, "balance_loss_clip": 1.02606797, "balance_loss_mlp": 0.20541114, "epoch": 0.9617315496768375, "flos": 13547480359680.0, "grad_norm": 2.364739993183151, "language_loss": 0.84167957, "learning_rate": 1.5318933053576788e-08, "loss": 0.8564589, "num_input_tokens_seen": 345133645, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25793457, "step": 15996, "time_per_iteration": 2.7916643619537354 }, { "auxiliary_loss_clip": 0.0123787, "auxiliary_loss_mlp": 0.00224075, "balance_loss_clip": 1.01823974, "balance_loss_mlp": 0.19859987, "epoch": 0.9617916729295055, "flos": 11254512781440.0, "grad_norm": 157.89000651975698, "language_loss": 0.87579119, "learning_rate": 1.52708595287494e-08, "loss": 0.8904106, "num_input_tokens_seen": 345150740, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25439453, "step": 15997, "time_per_iteration": 2.6400372982025146 }, { "auxiliary_loss_clip": 0.01233563, "auxiliary_loss_mlp": 0.00225415, "balance_loss_clip": 1.02158356, "balance_loss_mlp": 0.20079809, "epoch": 0.9618517961821734, "flos": 22819723228800.0, "grad_norm": 3.108762394418708, "language_loss": 0.74650019, "learning_rate": 1.522286126505001e-08, "loss": 0.76108992, "num_input_tokens_seen": 345170365, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.24633789, "step": 15998, "time_per_iteration": 2.7263853549957275 }, { "auxiliary_loss_clip": 0.01234491, "auxiliary_loss_mlp": 0.00220357, "balance_loss_clip": 1.01914299, "balance_loss_mlp": 0.1954782, "epoch": 0.9619119194348414, "flos": 16617340224000.0, "grad_norm": 119.83350670735315, "language_loss": 0.80151176, "learning_rate": 1.5174938264298498e-08, "loss": 0.81606025, "num_input_tokens_seen": 345188930, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.24865723, "step": 15999, "time_per_iteration": 2.698713779449463 }, { "auxiliary_loss_clip": 0.01224555, "auxiliary_loss_mlp": 0.00208292, "balance_loss_clip": 1.01315427, "balance_loss_mlp": 0.18360405, "epoch": 0.9619720426875094, "flos": 24535140024960.0, "grad_norm": 7.668157566458209, "language_loss": 0.73685062, "learning_rate": 1.5127090528312514e-08, "loss": 0.7511791, "num_input_tokens_seen": 345209615, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.24694824, "step": 16000, "time_per_iteration": 2.704021453857422 }, { "auxiliary_loss_clip": 0.01238268, "auxiliary_loss_mlp": 0.00234363, "balance_loss_clip": 1.02433038, "balance_loss_mlp": 0.21072398, "epoch": 0.9620321659401774, "flos": 20632224960000.0, "grad_norm": 3.653878887377098, "language_loss": 0.80802357, "learning_rate": 1.5079318058905723e-08, "loss": 0.82274985, "num_input_tokens_seen": 345229175, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.23620605, "step": 16001, "time_per_iteration": 2.723879337310791 }, { "auxiliary_loss_clip": 0.01243041, "auxiliary_loss_mlp": 0.0023468, "balance_loss_clip": 1.02037251, "balance_loss_mlp": 0.20835811, "epoch": 0.9620922891928453, "flos": 18515290959360.0, "grad_norm": 10.436337070351978, "language_loss": 0.76543903, "learning_rate": 1.5031620857890447e-08, "loss": 0.78021622, "num_input_tokens_seen": 345247815, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26306152, "step": 16002, "time_per_iteration": 2.6429059505462646 }, { "auxiliary_loss_clip": 0.01236214, "auxiliary_loss_mlp": 0.00237863, "balance_loss_clip": 1.02395248, "balance_loss_mlp": 0.21120733, "epoch": 0.9621524124455133, "flos": 28767391914240.0, "grad_norm": 5.387286623586943, "language_loss": 0.71115541, "learning_rate": 1.4983998927074804e-08, "loss": 0.72589624, "num_input_tokens_seen": 345269935, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.26635742, "step": 16003, "time_per_iteration": 2.7399063110351562 }, { "auxiliary_loss_clip": 0.01257569, "auxiliary_loss_mlp": 0.00221566, "balance_loss_clip": 1.03325307, "balance_loss_mlp": 0.19697304, "epoch": 0.9622125356981813, "flos": 19098875226240.0, "grad_norm": 91.99047213810094, "language_loss": 0.86478066, "learning_rate": 1.493645226826512e-08, "loss": 0.87957203, "num_input_tokens_seen": 345288310, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.24597168, "step": 16004, "time_per_iteration": 2.761401891708374 }, { "auxiliary_loss_clip": 0.0124251, "auxiliary_loss_mlp": 0.00231512, "balance_loss_clip": 1.02787447, "balance_loss_mlp": 0.20505928, "epoch": 0.9622726589508492, "flos": 20302816308480.0, "grad_norm": 6.7286130954680505, "language_loss": 0.8936283, "learning_rate": 1.4888980883263958e-08, "loss": 0.90836853, "num_input_tokens_seen": 345306615, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.26477051, "step": 16005, "time_per_iteration": 2.823108434677124 }, { "auxiliary_loss_clip": 0.01242692, "auxiliary_loss_mlp": 0.00221077, "balance_loss_clip": 1.02632642, "balance_loss_mlp": 0.1956259, "epoch": 0.9623327822035173, "flos": 54929750889600.0, "grad_norm": 281.06047501978907, "language_loss": 0.74683022, "learning_rate": 1.4841584773871652e-08, "loss": 0.76146793, "num_input_tokens_seen": 345331935, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25463867, "step": 16006, "time_per_iteration": 2.952655553817749 }, { "auxiliary_loss_clip": 0.01215302, "auxiliary_loss_mlp": 0.00219938, "balance_loss_clip": 1.01034999, "balance_loss_mlp": 0.19707385, "epoch": 0.9623929054561852, "flos": 21759029585280.0, "grad_norm": 21.865208298661358, "language_loss": 0.83234167, "learning_rate": 1.479426394188521e-08, "loss": 0.84669399, "num_input_tokens_seen": 345351510, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.2286377, "step": 16007, "time_per_iteration": 2.6530075073242188 }, { "auxiliary_loss_clip": 0.01270499, "auxiliary_loss_mlp": 0.00242255, "balance_loss_clip": 1.04483199, "balance_loss_mlp": 0.2141096, "epoch": 0.9624530287088532, "flos": 17931563038080.0, "grad_norm": 27.51757825185796, "language_loss": 0.75370044, "learning_rate": 1.4747018389099198e-08, "loss": 0.76882803, "num_input_tokens_seen": 345367750, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.28186035, "step": 16008, "time_per_iteration": 2.627997875213623 }, { "auxiliary_loss_clip": 0.0125697, "auxiliary_loss_mlp": 0.00233993, "balance_loss_clip": 1.03220487, "balance_loss_mlp": 0.20911407, "epoch": 0.9625131519615211, "flos": 23253739263360.0, "grad_norm": 54.213821397284015, "language_loss": 0.84549916, "learning_rate": 1.469984811730529e-08, "loss": 0.86040878, "num_input_tokens_seen": 345384790, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.24890137, "step": 16009, "time_per_iteration": 2.656085252761841 }, { "auxiliary_loss_clip": 0.01238935, "auxiliary_loss_mlp": 0.00222662, "balance_loss_clip": 1.02529073, "balance_loss_mlp": 0.19673394, "epoch": 0.9625732752141891, "flos": 18916628595840.0, "grad_norm": 15.2611642599492, "language_loss": 0.84141439, "learning_rate": 1.4652753128292061e-08, "loss": 0.85603034, "num_input_tokens_seen": 345403390, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.25915527, "step": 16010, "time_per_iteration": 2.6238789558410645 }, { "auxiliary_loss_clip": 0.01264377, "auxiliary_loss_mlp": 0.00230664, "balance_loss_clip": 1.03553188, "balance_loss_mlp": 0.20379402, "epoch": 0.962633398466857, "flos": 16252918790400.0, "grad_norm": 29.545010267224107, "language_loss": 0.78018612, "learning_rate": 1.4605733423845635e-08, "loss": 0.79513651, "num_input_tokens_seen": 345418685, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.26831055, "step": 16011, "time_per_iteration": 4.0425310134887695 }, { "auxiliary_loss_clip": 0.01229991, "auxiliary_loss_mlp": 0.00230799, "balance_loss_clip": 1.01702547, "balance_loss_mlp": 0.20768446, "epoch": 0.962693521719525, "flos": 54197424403200.0, "grad_norm": 38.86128334567636, "language_loss": 0.77808642, "learning_rate": 1.4558789005748585e-08, "loss": 0.79269433, "num_input_tokens_seen": 345442380, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.23083496, "step": 16012, "time_per_iteration": 4.448180198669434 }, { "auxiliary_loss_clip": 0.01276264, "auxiliary_loss_mlp": 0.00227461, "balance_loss_clip": 1.04357457, "balance_loss_mlp": 0.19921988, "epoch": 0.962753644972193, "flos": 33105795471360.0, "grad_norm": 8.464249907310474, "language_loss": 0.8292551, "learning_rate": 1.4511919875781264e-08, "loss": 0.84429228, "num_input_tokens_seen": 345463815, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.28234863, "step": 16013, "time_per_iteration": 2.7786550521850586 }, { "auxiliary_loss_clip": 0.01245501, "auxiliary_loss_mlp": 0.00220325, "balance_loss_clip": 1.0302422, "balance_loss_mlp": 0.19475421, "epoch": 0.962813768224861, "flos": 42230660837760.0, "grad_norm": 12.255167723647027, "language_loss": 0.75574982, "learning_rate": 1.4465126035720698e-08, "loss": 0.77040809, "num_input_tokens_seen": 345484525, "router_z_loss_clip": 2.15332031, "router_z_loss_mlp": 0.25561523, "step": 16014, "time_per_iteration": 2.827484607696533 }, { "auxiliary_loss_clip": 0.01231364, "auxiliary_loss_mlp": 0.002251, "balance_loss_clip": 1.02143335, "balance_loss_mlp": 0.20136544, "epoch": 0.9628738914775289, "flos": 43944677003520.0, "grad_norm": 33.61944243503026, "language_loss": 0.79991311, "learning_rate": 1.4418407487341688e-08, "loss": 0.81447768, "num_input_tokens_seen": 345508295, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23730469, "step": 16015, "time_per_iteration": 2.8674893379211426 }, { "auxiliary_loss_clip": 0.01244533, "auxiliary_loss_mlp": 0.00245413, "balance_loss_clip": 1.02367663, "balance_loss_mlp": 0.21918671, "epoch": 0.9629340147301969, "flos": 15596184476160.0, "grad_norm": 59.693810804997355, "language_loss": 0.85606247, "learning_rate": 1.4371764232415707e-08, "loss": 0.8709619, "num_input_tokens_seen": 345525155, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.2623291, "step": 16016, "time_per_iteration": 2.6473381519317627 }, { "auxiliary_loss_clip": 0.01079679, "auxiliary_loss_mlp": 0.00061333, "balance_loss_clip": 0.94173801, "balance_loss_mlp": 0.0549906, "epoch": 0.9629941379828649, "flos": 62951011816320.0, "grad_norm": 0.79992087670393, "language_loss": 0.62327689, "learning_rate": 1.4325196272711337e-08, "loss": 0.63468701, "num_input_tokens_seen": 345578905, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06347656, "step": 16017, "time_per_iteration": 3.078737258911133 }, { "auxiliary_loss_clip": 0.01246887, "auxiliary_loss_mlp": 0.00234507, "balance_loss_clip": 1.02648914, "balance_loss_mlp": 0.21086776, "epoch": 0.9630542612355328, "flos": 29899116702720.0, "grad_norm": 53.02526087777699, "language_loss": 0.77861971, "learning_rate": 1.4278703609994502e-08, "loss": 0.79343367, "num_input_tokens_seen": 345598965, "router_z_loss_clip": 2.20605469, "router_z_loss_mlp": 0.23632812, "step": 16018, "time_per_iteration": 2.837155818939209 }, { "auxiliary_loss_clip": 0.01229788, "auxiliary_loss_mlp": 0.00208995, "balance_loss_clip": 1.0210197, "balance_loss_mlp": 0.18517706, "epoch": 0.9631143844882009, "flos": 17894575008000.0, "grad_norm": 3.869184366893518, "language_loss": 0.88202071, "learning_rate": 1.4232286246028457e-08, "loss": 0.8964085, "num_input_tokens_seen": 345617945, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.23815918, "step": 16019, "time_per_iteration": 2.639145851135254 }, { "auxiliary_loss_clip": 0.01218472, "auxiliary_loss_mlp": 0.00211638, "balance_loss_clip": 1.01169574, "balance_loss_mlp": 0.18919113, "epoch": 0.9631745077408688, "flos": 26139161767680.0, "grad_norm": 320.13845137603016, "language_loss": 0.77001917, "learning_rate": 1.4185944182572907e-08, "loss": 0.78432029, "num_input_tokens_seen": 345637920, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.2244873, "step": 16020, "time_per_iteration": 2.725978136062622 }, { "auxiliary_loss_clip": 0.01234489, "auxiliary_loss_mlp": 0.00230425, "balance_loss_clip": 1.02101707, "balance_loss_mlp": 0.20648748, "epoch": 0.9632346309935368, "flos": 24973645259520.0, "grad_norm": 7.270055386161854, "language_loss": 0.84972632, "learning_rate": 1.4139677421385331e-08, "loss": 0.86437547, "num_input_tokens_seen": 345656195, "router_z_loss_clip": 2.13574219, "router_z_loss_mlp": 0.23937988, "step": 16021, "time_per_iteration": 4.176476716995239 }, { "auxiliary_loss_clip": 0.01255277, "auxiliary_loss_mlp": 0.00237463, "balance_loss_clip": 1.03592038, "balance_loss_mlp": 0.20832866, "epoch": 0.9632947542462047, "flos": 23617226943360.0, "grad_norm": 3.0421000489921677, "language_loss": 0.73653698, "learning_rate": 1.4093485964220331e-08, "loss": 0.75146443, "num_input_tokens_seen": 345676700, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.29138184, "step": 16022, "time_per_iteration": 4.086475849151611 }, { "auxiliary_loss_clip": 0.01245801, "auxiliary_loss_mlp": 0.00221793, "balance_loss_clip": 1.0310024, "balance_loss_mlp": 0.19702142, "epoch": 0.9633548774988727, "flos": 26395599939840.0, "grad_norm": 98.90981022054248, "language_loss": 0.81558239, "learning_rate": 1.4047369812829168e-08, "loss": 0.83025837, "num_input_tokens_seen": 345696725, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.2479248, "step": 16023, "time_per_iteration": 2.735689640045166 }, { "auxiliary_loss_clip": 0.01234759, "auxiliary_loss_mlp": 0.00234188, "balance_loss_clip": 1.01863194, "balance_loss_mlp": 0.20982145, "epoch": 0.9634150007515406, "flos": 23767728929280.0, "grad_norm": 20.806852464153888, "language_loss": 0.88528061, "learning_rate": 1.4001328968960891e-08, "loss": 0.89997005, "num_input_tokens_seen": 345716245, "router_z_loss_clip": 2.16113281, "router_z_loss_mlp": 0.24353027, "step": 16024, "time_per_iteration": 2.7104156017303467 }, { "auxiliary_loss_clip": 0.01251302, "auxiliary_loss_mlp": 0.00232544, "balance_loss_clip": 1.02921748, "balance_loss_mlp": 0.20745055, "epoch": 0.9634751240042086, "flos": 24135346673280.0, "grad_norm": 250.80005476418194, "language_loss": 0.8673138, "learning_rate": 1.3955363434361212e-08, "loss": 0.88215232, "num_input_tokens_seen": 345739060, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.25097656, "step": 16025, "time_per_iteration": 2.7182412147521973 }, { "auxiliary_loss_clip": 0.01239826, "auxiliary_loss_mlp": 0.00233286, "balance_loss_clip": 1.02412081, "balance_loss_mlp": 0.20752476, "epoch": 0.9635352472568766, "flos": 24349086552960.0, "grad_norm": 30.952100972337007, "language_loss": 0.83572853, "learning_rate": 1.3909473210773181e-08, "loss": 0.85045969, "num_input_tokens_seen": 345758325, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.25756836, "step": 16026, "time_per_iteration": 2.7038135528564453 }, { "auxiliary_loss_clip": 0.01233223, "auxiliary_loss_mlp": 0.00215872, "balance_loss_clip": 1.01661646, "balance_loss_mlp": 0.19070736, "epoch": 0.9635953705095446, "flos": 23984772860160.0, "grad_norm": 35.289960653634324, "language_loss": 0.72313213, "learning_rate": 1.3863658299936965e-08, "loss": 0.73762304, "num_input_tokens_seen": 345778530, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.25170898, "step": 16027, "time_per_iteration": 2.6863901615142822 }, { "auxiliary_loss_clip": 0.01247694, "auxiliary_loss_mlp": 0.0022811, "balance_loss_clip": 1.02986646, "balance_loss_mlp": 0.20245585, "epoch": 0.9636554937622125, "flos": 19828436365440.0, "grad_norm": 23.35389592962768, "language_loss": 0.9500103, "learning_rate": 1.3817918703589837e-08, "loss": 0.96476829, "num_input_tokens_seen": 345796535, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.25646973, "step": 16028, "time_per_iteration": 2.674448251724243 }, { "auxiliary_loss_clip": 0.01080138, "auxiliary_loss_mlp": 0.00069902, "balance_loss_clip": 0.9434604, "balance_loss_mlp": 0.06391731, "epoch": 0.9637156170148805, "flos": 67435499986560.0, "grad_norm": 0.7004204566291904, "language_loss": 0.52196383, "learning_rate": 1.3772254423466412e-08, "loss": 0.53346419, "num_input_tokens_seen": 345859700, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.05981445, "step": 16029, "time_per_iteration": 3.1287684440612793 }, { "auxiliary_loss_clip": 0.01247187, "auxiliary_loss_mlp": 0.0024951, "balance_loss_clip": 1.02532721, "balance_loss_mlp": 0.22467905, "epoch": 0.9637757402675484, "flos": 20300912887680.0, "grad_norm": 733.3273720037168, "language_loss": 0.80289453, "learning_rate": 1.372666546129797e-08, "loss": 0.8178615, "num_input_tokens_seen": 345878760, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24841309, "step": 16030, "time_per_iteration": 2.726562738418579 }, { "auxiliary_loss_clip": 0.01233166, "auxiliary_loss_mlp": 0.00235527, "balance_loss_clip": 1.02386749, "balance_loss_mlp": 0.21150681, "epoch": 0.9638358635202164, "flos": 27234544970880.0, "grad_norm": 147.555808896212, "language_loss": 0.72499293, "learning_rate": 1.3681151818813575e-08, "loss": 0.73967993, "num_input_tokens_seen": 345900445, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.24023438, "step": 16031, "time_per_iteration": 2.6884586811065674 }, { "auxiliary_loss_clip": 0.01079563, "auxiliary_loss_mlp": 0.00063665, "balance_loss_clip": 0.94269633, "balance_loss_mlp": 0.05627393, "epoch": 0.9638959867728845, "flos": 70288998278400.0, "grad_norm": 0.8073286176547437, "language_loss": 0.59893489, "learning_rate": 1.3635713497738955e-08, "loss": 0.61036718, "num_input_tokens_seen": 345961020, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.07373047, "step": 16032, "time_per_iteration": 3.168043851852417 }, { "auxiliary_loss_clip": 0.01218073, "auxiliary_loss_mlp": 0.00203991, "balance_loss_clip": 1.01032019, "balance_loss_mlp": 0.18089983, "epoch": 0.9639561100255524, "flos": 25407517639680.0, "grad_norm": 47.19508905505992, "language_loss": 0.75502861, "learning_rate": 1.3590350499796954e-08, "loss": 0.7692492, "num_input_tokens_seen": 345980210, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.23083496, "step": 16033, "time_per_iteration": 2.64340877532959 }, { "auxiliary_loss_clip": 0.01224122, "auxiliary_loss_mlp": 0.00207235, "balance_loss_clip": 1.01228762, "balance_loss_mlp": 0.18036509, "epoch": 0.9640162332782204, "flos": 18113881495680.0, "grad_norm": 57.4227907816985, "language_loss": 0.74824029, "learning_rate": 1.3545062826707976e-08, "loss": 0.76255387, "num_input_tokens_seen": 345998280, "router_z_loss_clip": 2.12011719, "router_z_loss_mlp": 0.26843262, "step": 16034, "time_per_iteration": 2.6642842292785645 }, { "auxiliary_loss_clip": 0.01247581, "auxiliary_loss_mlp": 0.00218701, "balance_loss_clip": 1.02397656, "balance_loss_mlp": 0.19232035, "epoch": 0.9640763565308883, "flos": 23440295525760.0, "grad_norm": 122.12069369765945, "language_loss": 0.83964038, "learning_rate": 1.3499850480189313e-08, "loss": 0.85430324, "num_input_tokens_seen": 346015545, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26379395, "step": 16035, "time_per_iteration": 2.649488925933838 }, { "auxiliary_loss_clip": 0.01239232, "auxiliary_loss_mlp": 0.002166, "balance_loss_clip": 1.02888918, "balance_loss_mlp": 0.19172059, "epoch": 0.9641364797835563, "flos": 22419355259520.0, "grad_norm": 4.58348700199132, "language_loss": 0.90196854, "learning_rate": 1.3454713461955591e-08, "loss": 0.91652679, "num_input_tokens_seen": 346034055, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.2487793, "step": 16036, "time_per_iteration": 2.6674163341522217 }, { "auxiliary_loss_clip": 0.01234053, "auxiliary_loss_mlp": 0.00223416, "balance_loss_clip": 1.01938927, "balance_loss_mlp": 0.19829878, "epoch": 0.9641966030362242, "flos": 30622357048320.0, "grad_norm": 17.91679758735931, "language_loss": 0.76210952, "learning_rate": 1.340965177371789e-08, "loss": 0.77668417, "num_input_tokens_seen": 346054130, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.2512207, "step": 16037, "time_per_iteration": 2.7418813705444336 }, { "auxiliary_loss_clip": 0.01233621, "auxiliary_loss_mlp": 0.00230228, "balance_loss_clip": 1.02035666, "balance_loss_mlp": 0.2074708, "epoch": 0.9642567262888923, "flos": 20953122088320.0, "grad_norm": 14.065202096460201, "language_loss": 0.69478887, "learning_rate": 1.3364665417185506e-08, "loss": 0.70942736, "num_input_tokens_seen": 346072990, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.22741699, "step": 16038, "time_per_iteration": 2.659046173095703 }, { "auxiliary_loss_clip": 0.01248821, "auxiliary_loss_mlp": 0.00232834, "balance_loss_clip": 1.03330028, "balance_loss_mlp": 0.20801473, "epoch": 0.9643168495415602, "flos": 22639415932800.0, "grad_norm": 227.94115804067437, "language_loss": 0.78803706, "learning_rate": 1.3319754394064187e-08, "loss": 0.80285358, "num_input_tokens_seen": 346093745, "router_z_loss_clip": 2.15332031, "router_z_loss_mlp": 0.24829102, "step": 16039, "time_per_iteration": 2.7354507446289062 }, { "auxiliary_loss_clip": 0.01257119, "auxiliary_loss_mlp": 0.00216227, "balance_loss_clip": 1.03519535, "balance_loss_mlp": 0.19093126, "epoch": 0.9643769727942282, "flos": 20266259241600.0, "grad_norm": 15.870405697518342, "language_loss": 0.85004413, "learning_rate": 1.327491870605657e-08, "loss": 0.86477757, "num_input_tokens_seen": 346110115, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25317383, "step": 16040, "time_per_iteration": 2.6476831436157227 }, { "auxiliary_loss_clip": 0.0123178, "auxiliary_loss_mlp": 0.00222095, "balance_loss_clip": 1.01946509, "balance_loss_mlp": 0.19880167, "epoch": 0.9644370960468961, "flos": 13881845088000.0, "grad_norm": 159.44970579045307, "language_loss": 0.83261269, "learning_rate": 1.3230158354863296e-08, "loss": 0.8471514, "num_input_tokens_seen": 346127165, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.23266602, "step": 16041, "time_per_iteration": 2.662736654281616 }, { "auxiliary_loss_clip": 0.01225236, "auxiliary_loss_mlp": 0.0022784, "balance_loss_clip": 1.02179646, "balance_loss_mlp": 0.20429617, "epoch": 0.9644972192995641, "flos": 17238199829760.0, "grad_norm": 28.368779296668453, "language_loss": 0.80384845, "learning_rate": 1.3185473342181674e-08, "loss": 0.81837922, "num_input_tokens_seen": 346145950, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.23522949, "step": 16042, "time_per_iteration": 2.8859167098999023 }, { "auxiliary_loss_clip": 0.01244769, "auxiliary_loss_mlp": 0.00242585, "balance_loss_clip": 1.02647448, "balance_loss_mlp": 0.21836179, "epoch": 0.964557342552232, "flos": 23840340272640.0, "grad_norm": 690.989249903127, "language_loss": 0.87576449, "learning_rate": 1.3140863669705683e-08, "loss": 0.89063799, "num_input_tokens_seen": 346165005, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24230957, "step": 16043, "time_per_iteration": 2.8195996284484863 }, { "auxiliary_loss_clip": 0.01243456, "auxiliary_loss_mlp": 0.00231267, "balance_loss_clip": 1.0251565, "balance_loss_mlp": 0.20700774, "epoch": 0.9646174658049, "flos": 21653129312640.0, "grad_norm": 12.357221797274441, "language_loss": 0.79001093, "learning_rate": 1.3096329339127522e-08, "loss": 0.80475819, "num_input_tokens_seen": 346185095, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24255371, "step": 16044, "time_per_iteration": 2.6677896976470947 }, { "auxiliary_loss_clip": 0.01223471, "auxiliary_loss_mlp": 0.00212885, "balance_loss_clip": 1.0124011, "balance_loss_mlp": 0.18812475, "epoch": 0.9646775890575681, "flos": 17129570123520.0, "grad_norm": 9.234018698491953, "language_loss": 0.76792979, "learning_rate": 1.3051870352135397e-08, "loss": 0.78229338, "num_input_tokens_seen": 346202580, "router_z_loss_clip": 2.11035156, "router_z_loss_mlp": 0.24755859, "step": 16045, "time_per_iteration": 2.6754860877990723 }, { "auxiliary_loss_clip": 0.01243115, "auxiliary_loss_mlp": 0.00220753, "balance_loss_clip": 1.02417874, "balance_loss_mlp": 0.19693494, "epoch": 0.964737712310236, "flos": 13005732458880.0, "grad_norm": 19.70780348796743, "language_loss": 0.84962606, "learning_rate": 1.3007486710415737e-08, "loss": 0.86426473, "num_input_tokens_seen": 346219395, "router_z_loss_clip": 2.18652344, "router_z_loss_mlp": 0.23828125, "step": 16046, "time_per_iteration": 2.62587833404541 }, { "auxiliary_loss_clip": 0.01249644, "auxiliary_loss_mlp": 0.00238038, "balance_loss_clip": 1.02752352, "balance_loss_mlp": 0.21073928, "epoch": 0.964797835562904, "flos": 24279240556800.0, "grad_norm": 14.9231426429612, "language_loss": 0.7069878, "learning_rate": 1.2963178415651199e-08, "loss": 0.72186464, "num_input_tokens_seen": 346239715, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.27270508, "step": 16047, "time_per_iteration": 2.7213146686553955 }, { "auxiliary_loss_clip": 0.01230453, "auxiliary_loss_mlp": 0.00226928, "balance_loss_clip": 1.02066231, "balance_loss_mlp": 0.2032773, "epoch": 0.9648579588155719, "flos": 20522697413760.0, "grad_norm": 40.308074546095426, "language_loss": 0.78927666, "learning_rate": 1.2918945469521992e-08, "loss": 0.80385047, "num_input_tokens_seen": 346258500, "router_z_loss_clip": 2.10058594, "router_z_loss_mlp": 0.2364502, "step": 16048, "time_per_iteration": 2.66385817527771 }, { "auxiliary_loss_clip": 0.01229334, "auxiliary_loss_mlp": 0.00229304, "balance_loss_clip": 1.01680589, "balance_loss_mlp": 0.20461527, "epoch": 0.9649180820682399, "flos": 32154844855680.0, "grad_norm": 37.66494921235357, "language_loss": 0.70861268, "learning_rate": 1.2874787873705662e-08, "loss": 0.72319901, "num_input_tokens_seen": 346279110, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.24694824, "step": 16049, "time_per_iteration": 2.7230241298675537 }, { "auxiliary_loss_clip": 0.01239714, "auxiliary_loss_mlp": 0.00220159, "balance_loss_clip": 1.02529359, "balance_loss_mlp": 0.19417137, "epoch": 0.9649782053209078, "flos": 20522589672960.0, "grad_norm": 4.18536910132653, "language_loss": 0.78156394, "learning_rate": 1.2830705629876427e-08, "loss": 0.79616266, "num_input_tokens_seen": 346297860, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.2598877, "step": 16050, "time_per_iteration": 2.7811851501464844 }, { "auxiliary_loss_clip": 0.01262523, "auxiliary_loss_mlp": 0.00254795, "balance_loss_clip": 1.03670716, "balance_loss_mlp": 0.22710255, "epoch": 0.9650383285735759, "flos": 43067953843200.0, "grad_norm": 25.248348732429484, "language_loss": 0.7908054, "learning_rate": 1.278669873970606e-08, "loss": 0.8059786, "num_input_tokens_seen": 346319860, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.27685547, "step": 16051, "time_per_iteration": 2.8524117469787598 }, { "auxiliary_loss_clip": 0.010845, "auxiliary_loss_mlp": 0.00067519, "balance_loss_clip": 0.9487431, "balance_loss_mlp": 0.06122473, "epoch": 0.9650984518262438, "flos": 61748255882880.0, "grad_norm": 1.1735954464003318, "language_loss": 0.58413297, "learning_rate": 1.2742767204863004e-08, "loss": 0.59565318, "num_input_tokens_seen": 346379025, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06298828, "step": 16052, "time_per_iteration": 3.2335007190704346 }, { "auxiliary_loss_clip": 0.01252899, "auxiliary_loss_mlp": 0.00224525, "balance_loss_clip": 1.03534567, "balance_loss_mlp": 0.19945529, "epoch": 0.9651585750789118, "flos": 29789337761280.0, "grad_norm": 6.759906974662552, "language_loss": 0.8341375, "learning_rate": 1.2698911027013482e-08, "loss": 0.84891164, "num_input_tokens_seen": 346402250, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25085449, "step": 16053, "time_per_iteration": 4.137972593307495 }, { "auxiliary_loss_clip": 0.01242267, "auxiliary_loss_mlp": 0.00213632, "balance_loss_clip": 1.02542102, "balance_loss_mlp": 0.18928899, "epoch": 0.9652186983315797, "flos": 16873060124160.0, "grad_norm": 6.834424232153116, "language_loss": 0.77356136, "learning_rate": 1.2655130207820386e-08, "loss": 0.78812027, "num_input_tokens_seen": 346419555, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24365234, "step": 16054, "time_per_iteration": 2.59033203125 }, { "auxiliary_loss_clip": 0.01227844, "auxiliary_loss_mlp": 0.0023117, "balance_loss_clip": 1.01742721, "balance_loss_mlp": 0.20764959, "epoch": 0.9652788215842477, "flos": 31649761762560.0, "grad_norm": 1358.806664114813, "language_loss": 0.69074845, "learning_rate": 1.2611424748943944e-08, "loss": 0.7053386, "num_input_tokens_seen": 346441245, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.23535156, "step": 16055, "time_per_iteration": 4.151713848114014 }, { "auxiliary_loss_clip": 0.01239327, "auxiliary_loss_mlp": 0.00213715, "balance_loss_clip": 1.02547514, "balance_loss_mlp": 0.18816891, "epoch": 0.9653389448369156, "flos": 24754266944640.0, "grad_norm": 5.286538478779013, "language_loss": 0.83959264, "learning_rate": 1.2567794652041719e-08, "loss": 0.85412306, "num_input_tokens_seen": 346460065, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.25524902, "step": 16056, "time_per_iteration": 2.782090187072754 }, { "auxiliary_loss_clip": 0.0124927, "auxiliary_loss_mlp": 0.00247447, "balance_loss_clip": 1.0299046, "balance_loss_mlp": 0.22190058, "epoch": 0.9653990680895836, "flos": 20297249700480.0, "grad_norm": 106.34545396185382, "language_loss": 0.80572629, "learning_rate": 1.2524239918767498e-08, "loss": 0.82069343, "num_input_tokens_seen": 346478005, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25561523, "step": 16057, "time_per_iteration": 2.6287038326263428 }, { "auxiliary_loss_clip": 0.01239761, "auxiliary_loss_mlp": 0.00225339, "balance_loss_clip": 1.02648568, "balance_loss_mlp": 0.20136558, "epoch": 0.9654591913422517, "flos": 22528775064960.0, "grad_norm": 21.586023371900346, "language_loss": 0.78263527, "learning_rate": 1.2480760550773295e-08, "loss": 0.79728627, "num_input_tokens_seen": 346497575, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.23986816, "step": 16058, "time_per_iteration": 2.6838817596435547 }, { "auxiliary_loss_clip": 0.0123785, "auxiliary_loss_mlp": 0.00192192, "balance_loss_clip": 1.0279448, "balance_loss_mlp": 0.16918433, "epoch": 0.9655193145949196, "flos": 26763002202240.0, "grad_norm": 19.02626840679074, "language_loss": 0.82097745, "learning_rate": 1.2437356549708011e-08, "loss": 0.83527786, "num_input_tokens_seen": 346520000, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23010254, "step": 16059, "time_per_iteration": 2.745992422103882 }, { "auxiliary_loss_clip": 0.01252805, "auxiliary_loss_mlp": 0.00256187, "balance_loss_clip": 1.02906477, "balance_loss_mlp": 0.2282919, "epoch": 0.9655794378475876, "flos": 41970703132800.0, "grad_norm": 134.7781265006225, "language_loss": 0.80284786, "learning_rate": 1.239402791721722e-08, "loss": 0.81793773, "num_input_tokens_seen": 346541605, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.27905273, "step": 16060, "time_per_iteration": 2.85612154006958 }, { "auxiliary_loss_clip": 0.01228715, "auxiliary_loss_mlp": 0.00199633, "balance_loss_clip": 1.02012515, "balance_loss_mlp": 0.17695987, "epoch": 0.9656395611002555, "flos": 27709427704320.0, "grad_norm": 31.15301023410572, "language_loss": 0.83108926, "learning_rate": 1.2350774654944273e-08, "loss": 0.8453728, "num_input_tokens_seen": 346560955, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.22680664, "step": 16061, "time_per_iteration": 2.737530469894409 }, { "auxiliary_loss_clip": 0.01080239, "auxiliary_loss_mlp": 0.0008479, "balance_loss_clip": 0.94041443, "balance_loss_mlp": 0.07844776, "epoch": 0.9656996843529235, "flos": 68968562411520.0, "grad_norm": 0.7100571784626574, "language_loss": 0.63340652, "learning_rate": 1.2307596764528749e-08, "loss": 0.64505678, "num_input_tokens_seen": 346621615, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.06347656, "step": 16062, "time_per_iteration": 3.187025785446167 }, { "auxiliary_loss_clip": 0.01228986, "auxiliary_loss_mlp": 0.00221133, "balance_loss_clip": 1.01367259, "balance_loss_mlp": 0.19781616, "epoch": 0.9657598076055914, "flos": 20631327120000.0, "grad_norm": 25.919293353340915, "language_loss": 1.01887298, "learning_rate": 1.226449424760867e-08, "loss": 1.03337419, "num_input_tokens_seen": 346637460, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.23303223, "step": 16063, "time_per_iteration": 4.122468709945679 }, { "auxiliary_loss_clip": 0.01244846, "auxiliary_loss_mlp": 0.0022323, "balance_loss_clip": 1.02610791, "balance_loss_mlp": 0.19900671, "epoch": 0.9658199308582595, "flos": 20448577699200.0, "grad_norm": 68.34429802492816, "language_loss": 0.88882756, "learning_rate": 1.2221467105818062e-08, "loss": 0.90350831, "num_input_tokens_seen": 346655625, "router_z_loss_clip": 2.19042969, "router_z_loss_mlp": 0.24230957, "step": 16064, "time_per_iteration": 4.067443132400513 }, { "auxiliary_loss_clip": 0.0124312, "auxiliary_loss_mlp": 0.00211311, "balance_loss_clip": 1.03405166, "balance_loss_mlp": 0.18767229, "epoch": 0.9658800541109274, "flos": 24718033100160.0, "grad_norm": 8.291143834987027, "language_loss": 0.90097052, "learning_rate": 1.2178515340788731e-08, "loss": 0.91551483, "num_input_tokens_seen": 346675220, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.2364502, "step": 16065, "time_per_iteration": 2.678403854370117 }, { "auxiliary_loss_clip": 0.01240591, "auxiliary_loss_mlp": 0.00216324, "balance_loss_clip": 1.02351415, "balance_loss_mlp": 0.19031286, "epoch": 0.9659401773635954, "flos": 21610035970560.0, "grad_norm": 20.490582944723464, "language_loss": 0.74095845, "learning_rate": 1.2135638954149151e-08, "loss": 0.75552762, "num_input_tokens_seen": 346694710, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.2598877, "step": 16066, "time_per_iteration": 2.7183218002319336 }, { "auxiliary_loss_clip": 0.01237954, "auxiliary_loss_mlp": 0.00234252, "balance_loss_clip": 1.02466965, "balance_loss_mlp": 0.21010032, "epoch": 0.9660003006162633, "flos": 20301200196480.0, "grad_norm": 86.76660110778045, "language_loss": 0.86932957, "learning_rate": 1.209283794752558e-08, "loss": 0.88405162, "num_input_tokens_seen": 346712645, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24157715, "step": 16067, "time_per_iteration": 2.704571485519409 }, { "auxiliary_loss_clip": 0.01232128, "auxiliary_loss_mlp": 0.0021376, "balance_loss_clip": 1.0198586, "balance_loss_mlp": 0.19022846, "epoch": 0.9660604238689313, "flos": 24461954064000.0, "grad_norm": 32.15760322170617, "language_loss": 0.77083075, "learning_rate": 1.2050112322540496e-08, "loss": 0.78528959, "num_input_tokens_seen": 346732375, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.23547363, "step": 16068, "time_per_iteration": 2.7286553382873535 }, { "auxiliary_loss_clip": 0.01210744, "auxiliary_loss_mlp": 0.00209306, "balance_loss_clip": 1.00820589, "balance_loss_mlp": 0.18588185, "epoch": 0.9661205471215992, "flos": 19864023765120.0, "grad_norm": 6.737551698146475, "language_loss": 0.7539562, "learning_rate": 1.20074620808146e-08, "loss": 0.76815677, "num_input_tokens_seen": 346750430, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.23413086, "step": 16069, "time_per_iteration": 2.624368190765381 }, { "auxiliary_loss_clip": 0.01234567, "auxiliary_loss_mlp": 0.00211083, "balance_loss_clip": 1.02290881, "balance_loss_mlp": 0.18820685, "epoch": 0.9661806703742672, "flos": 20557889763840.0, "grad_norm": 3.8803610642127073, "language_loss": 0.95437944, "learning_rate": 1.1964887223964826e-08, "loss": 0.96883595, "num_input_tokens_seen": 346768455, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.22888184, "step": 16070, "time_per_iteration": 2.6665706634521484 }, { "auxiliary_loss_clip": 0.01254747, "auxiliary_loss_mlp": 0.00250542, "balance_loss_clip": 1.03235102, "balance_loss_mlp": 0.22398247, "epoch": 0.9662407936269353, "flos": 21430949736960.0, "grad_norm": 33.27540364110205, "language_loss": 0.85548556, "learning_rate": 1.1922387753605878e-08, "loss": 0.87053847, "num_input_tokens_seen": 346786530, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26574707, "step": 16071, "time_per_iteration": 2.6475629806518555 }, { "auxiliary_loss_clip": 0.01236508, "auxiliary_loss_mlp": 0.00203119, "balance_loss_clip": 1.02290416, "balance_loss_mlp": 0.17967048, "epoch": 0.9663009168796032, "flos": 14902893095040.0, "grad_norm": 13.31786272731078, "language_loss": 0.76811254, "learning_rate": 1.1879963671349137e-08, "loss": 0.78250885, "num_input_tokens_seen": 346804635, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.234375, "step": 16072, "time_per_iteration": 2.697726011276245 }, { "auxiliary_loss_clip": 0.01241605, "auxiliary_loss_mlp": 0.00215316, "balance_loss_clip": 1.02516127, "balance_loss_mlp": 0.18972206, "epoch": 0.9663610401322712, "flos": 24310877460480.0, "grad_norm": 12.83078097193414, "language_loss": 0.83839679, "learning_rate": 1.1837614978803534e-08, "loss": 0.85296595, "num_input_tokens_seen": 346823070, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.25610352, "step": 16073, "time_per_iteration": 2.7032251358032227 }, { "auxiliary_loss_clip": 0.01255378, "auxiliary_loss_mlp": 0.00236716, "balance_loss_clip": 1.03215456, "balance_loss_mlp": 0.21075241, "epoch": 0.9664211633849391, "flos": 17637849527040.0, "grad_norm": 128.12336563022203, "language_loss": 0.85517573, "learning_rate": 1.1795341677574677e-08, "loss": 0.87009668, "num_input_tokens_seen": 346841180, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.2598877, "step": 16074, "time_per_iteration": 2.6254944801330566 }, { "auxiliary_loss_clip": 0.0124155, "auxiliary_loss_mlp": 0.00233484, "balance_loss_clip": 1.02683616, "balance_loss_mlp": 0.20779479, "epoch": 0.9664812866376071, "flos": 29789409588480.0, "grad_norm": 8.868503954418076, "language_loss": 0.82085472, "learning_rate": 1.1753143769265728e-08, "loss": 0.83560503, "num_input_tokens_seen": 346864250, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.25671387, "step": 16075, "time_per_iteration": 2.7335364818573 }, { "auxiliary_loss_clip": 0.01235204, "auxiliary_loss_mlp": 0.00203798, "balance_loss_clip": 1.02386856, "balance_loss_mlp": 0.17940757, "epoch": 0.966541409890275, "flos": 14282320798080.0, "grad_norm": 320.97164014551845, "language_loss": 0.87752211, "learning_rate": 1.171102125547696e-08, "loss": 0.89191216, "num_input_tokens_seen": 346881955, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.24401855, "step": 16076, "time_per_iteration": 2.625866413116455 }, { "auxiliary_loss_clip": 0.01248496, "auxiliary_loss_mlp": 0.00250752, "balance_loss_clip": 1.02826619, "balance_loss_mlp": 0.22381061, "epoch": 0.9666015331429431, "flos": 19860432405120.0, "grad_norm": 8.025939785534883, "language_loss": 0.77308279, "learning_rate": 1.166897413780532e-08, "loss": 0.78807521, "num_input_tokens_seen": 346900445, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.26904297, "step": 16077, "time_per_iteration": 2.724830389022827 }, { "auxiliary_loss_clip": 0.0124553, "auxiliary_loss_mlp": 0.0022611, "balance_loss_clip": 1.02727222, "balance_loss_mlp": 0.20102805, "epoch": 0.966661656395611, "flos": 27125951178240.0, "grad_norm": 3.392560170517307, "language_loss": 0.68255758, "learning_rate": 1.1627002417845533e-08, "loss": 0.69727397, "num_input_tokens_seen": 346920135, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25109863, "step": 16078, "time_per_iteration": 2.6919450759887695 }, { "auxiliary_loss_clip": 0.01251089, "auxiliary_loss_mlp": 0.00233187, "balance_loss_clip": 1.0327239, "balance_loss_mlp": 0.20859386, "epoch": 0.966721779648279, "flos": 21508229848320.0, "grad_norm": 3.0807456211007618, "language_loss": 0.80859441, "learning_rate": 1.158510609718899e-08, "loss": 0.82343721, "num_input_tokens_seen": 346940450, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24621582, "step": 16079, "time_per_iteration": 2.6972713470458984 }, { "auxiliary_loss_clip": 0.01221111, "auxiliary_loss_mlp": 0.00213379, "balance_loss_clip": 1.01140475, "balance_loss_mlp": 0.1904674, "epoch": 0.9667819029009469, "flos": 23878118401920.0, "grad_norm": 17.879021800709342, "language_loss": 0.78431839, "learning_rate": 1.1543285177424644e-08, "loss": 0.79866326, "num_input_tokens_seen": 346960935, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.22924805, "step": 16080, "time_per_iteration": 2.7048211097717285 }, { "auxiliary_loss_clip": 0.01247863, "auxiliary_loss_mlp": 0.00212207, "balance_loss_clip": 1.02765965, "balance_loss_mlp": 0.18804336, "epoch": 0.9668420261536149, "flos": 21507224267520.0, "grad_norm": 6.474950784707148, "language_loss": 0.85457557, "learning_rate": 1.1501539660138115e-08, "loss": 0.86917627, "num_input_tokens_seen": 346980100, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24133301, "step": 16081, "time_per_iteration": 2.7167351245880127 }, { "auxiliary_loss_clip": 0.01229996, "auxiliary_loss_mlp": 0.0021256, "balance_loss_clip": 1.01990521, "balance_loss_mlp": 0.18970716, "epoch": 0.9669021494062828, "flos": 26687266375680.0, "grad_norm": 30.666187238825437, "language_loss": 0.74176311, "learning_rate": 1.145986954691236e-08, "loss": 0.75618863, "num_input_tokens_seen": 347001250, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.2286377, "step": 16082, "time_per_iteration": 2.699021100997925 }, { "auxiliary_loss_clip": 0.01217567, "auxiliary_loss_mlp": 0.00212382, "balance_loss_clip": 1.00807619, "balance_loss_mlp": 0.18985185, "epoch": 0.9669622726589508, "flos": 29825032901760.0, "grad_norm": 4.82468993756781, "language_loss": 0.83906853, "learning_rate": 1.141827483932789e-08, "loss": 0.85336804, "num_input_tokens_seen": 347022975, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.2253418, "step": 16083, "time_per_iteration": 2.7624294757843018 }, { "auxiliary_loss_clip": 0.01239618, "auxiliary_loss_mlp": 0.00232577, "balance_loss_clip": 1.02222252, "balance_loss_mlp": 0.20880625, "epoch": 0.9670223959116189, "flos": 22922499018240.0, "grad_norm": 6.545236104497162, "language_loss": 0.86622488, "learning_rate": 1.1376755538961669e-08, "loss": 0.88094687, "num_input_tokens_seen": 347038780, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.2376709, "step": 16084, "time_per_iteration": 2.6289479732513428 }, { "auxiliary_loss_clip": 0.0126003, "auxiliary_loss_mlp": 0.00241102, "balance_loss_clip": 1.03129029, "balance_loss_mlp": 0.21447098, "epoch": 0.9670825191642868, "flos": 18624495283200.0, "grad_norm": 107.25675004009703, "language_loss": 0.80954093, "learning_rate": 1.1335311647387991e-08, "loss": 0.82455224, "num_input_tokens_seen": 347056705, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.26635742, "step": 16085, "time_per_iteration": 2.749258041381836 }, { "auxiliary_loss_clip": 0.01260526, "auxiliary_loss_mlp": 0.00233739, "balance_loss_clip": 1.0360868, "balance_loss_mlp": 0.20689301, "epoch": 0.9671426424169548, "flos": 24497936513280.0, "grad_norm": 106.67068275163957, "language_loss": 0.78121781, "learning_rate": 1.1293943166178709e-08, "loss": 0.79616046, "num_input_tokens_seen": 347075710, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26855469, "step": 16086, "time_per_iteration": 2.697448492050171 }, { "auxiliary_loss_clip": 0.01253822, "auxiliary_loss_mlp": 0.00234794, "balance_loss_clip": 1.03109562, "balance_loss_mlp": 0.2087705, "epoch": 0.9672027656696227, "flos": 20371189847040.0, "grad_norm": 7.851238971414784, "language_loss": 0.86185652, "learning_rate": 1.125265009690235e-08, "loss": 0.87674272, "num_input_tokens_seen": 347092325, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.26037598, "step": 16087, "time_per_iteration": 2.722717761993408 }, { "auxiliary_loss_clip": 0.01241534, "auxiliary_loss_mlp": 0.00230275, "balance_loss_clip": 1.02307081, "balance_loss_mlp": 0.20424002, "epoch": 0.9672628889222907, "flos": 18880179269760.0, "grad_norm": 53.778709715209565, "language_loss": 0.83198428, "learning_rate": 1.1211432441124769e-08, "loss": 0.84670234, "num_input_tokens_seen": 347110595, "router_z_loss_clip": 2.18652344, "router_z_loss_mlp": 0.26062012, "step": 16088, "time_per_iteration": 2.7862236499786377 }, { "auxiliary_loss_clip": 0.01224409, "auxiliary_loss_mlp": 0.00212828, "balance_loss_clip": 1.01403594, "balance_loss_mlp": 0.18920034, "epoch": 0.9673230121749586, "flos": 28695247447680.0, "grad_norm": 9.188387246578614, "language_loss": 0.77402389, "learning_rate": 1.117029020040916e-08, "loss": 0.78839624, "num_input_tokens_seen": 347131625, "router_z_loss_clip": 2.10253906, "router_z_loss_mlp": 0.23632812, "step": 16089, "time_per_iteration": 2.737764835357666 }, { "auxiliary_loss_clip": 0.01251492, "auxiliary_loss_mlp": 0.00229818, "balance_loss_clip": 1.03609085, "balance_loss_mlp": 0.20448619, "epoch": 0.9673831354276267, "flos": 20484452407680.0, "grad_norm": 32.48472914990659, "language_loss": 0.84526157, "learning_rate": 1.1129223376315167e-08, "loss": 0.86007464, "num_input_tokens_seen": 347147910, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.25292969, "step": 16090, "time_per_iteration": 2.699033260345459 }, { "auxiliary_loss_clip": 0.01255913, "auxiliary_loss_mlp": 0.00219504, "balance_loss_clip": 1.03343391, "balance_loss_mlp": 0.19314706, "epoch": 0.9674432586802946, "flos": 26797548107520.0, "grad_norm": 11.695896630996101, "language_loss": 0.76526582, "learning_rate": 1.1088231970400653e-08, "loss": 0.78002, "num_input_tokens_seen": 347168805, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.26367188, "step": 16091, "time_per_iteration": 2.744002342224121 }, { "auxiliary_loss_clip": 0.01219191, "auxiliary_loss_mlp": 0.00209476, "balance_loss_clip": 1.01402569, "balance_loss_mlp": 0.18584868, "epoch": 0.9675033819329626, "flos": 22310941034880.0, "grad_norm": 4.95826007215197, "language_loss": 0.83764458, "learning_rate": 1.1047315984219484e-08, "loss": 0.85193121, "num_input_tokens_seen": 347189455, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.2364502, "step": 16092, "time_per_iteration": 2.706245183944702 }, { "auxiliary_loss_clip": 0.01234271, "auxiliary_loss_mlp": 0.00214687, "balance_loss_clip": 1.02002501, "balance_loss_mlp": 0.19082169, "epoch": 0.9675635051856305, "flos": 12675713276160.0, "grad_norm": 167.75898387761623, "language_loss": 0.86145008, "learning_rate": 1.1006475419323313e-08, "loss": 0.87593961, "num_input_tokens_seen": 347206030, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.2388916, "step": 16093, "time_per_iteration": 2.6254518032073975 }, { "auxiliary_loss_clip": 0.01236436, "auxiliary_loss_mlp": 0.00261138, "balance_loss_clip": 1.01974225, "balance_loss_mlp": 0.23493563, "epoch": 0.9676236284382985, "flos": 24608469640320.0, "grad_norm": 35.127874809093534, "language_loss": 0.76325339, "learning_rate": 1.096571027726112e-08, "loss": 0.77822918, "num_input_tokens_seen": 347226250, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.26196289, "step": 16094, "time_per_iteration": 2.791022300720215 }, { "auxiliary_loss_clip": 0.01243058, "auxiliary_loss_mlp": 0.00228812, "balance_loss_clip": 1.02477884, "balance_loss_mlp": 0.20303863, "epoch": 0.9676837516909664, "flos": 23367145478400.0, "grad_norm": 6.562757356260783, "language_loss": 0.83825397, "learning_rate": 1.0925020559578557e-08, "loss": 0.85297263, "num_input_tokens_seen": 347247350, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25805664, "step": 16095, "time_per_iteration": 4.2886643409729 }, { "auxiliary_loss_clip": 0.01254482, "auxiliary_loss_mlp": 0.00245084, "balance_loss_clip": 1.02969384, "balance_loss_mlp": 0.21876267, "epoch": 0.9677438749436345, "flos": 20486894532480.0, "grad_norm": 7.581739807520376, "language_loss": 0.80317825, "learning_rate": 1.0884406267818392e-08, "loss": 0.81817389, "num_input_tokens_seen": 347266870, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.26318359, "step": 16096, "time_per_iteration": 2.732398748397827 }, { "auxiliary_loss_clip": 0.0127083, "auxiliary_loss_mlp": 0.00239652, "balance_loss_clip": 1.04782891, "balance_loss_mlp": 0.21193582, "epoch": 0.9678039981963025, "flos": 47555889719040.0, "grad_norm": 27.65889983206515, "language_loss": 0.79284012, "learning_rate": 1.0843867403520946e-08, "loss": 0.80794495, "num_input_tokens_seen": 347290120, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.2767334, "step": 16097, "time_per_iteration": 4.2704455852508545 }, { "auxiliary_loss_clip": 0.01237039, "auxiliary_loss_mlp": 0.00236001, "balance_loss_clip": 1.0242666, "balance_loss_mlp": 0.21096659, "epoch": 0.9678641214489704, "flos": 25040474513280.0, "grad_norm": 9.429070093084126, "language_loss": 0.84187287, "learning_rate": 1.0803403968223434e-08, "loss": 0.85660326, "num_input_tokens_seen": 347308785, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.25048828, "step": 16098, "time_per_iteration": 2.705796241760254 }, { "auxiliary_loss_clip": 0.01217245, "auxiliary_loss_mlp": 0.00209617, "balance_loss_clip": 1.01264465, "balance_loss_mlp": 0.186777, "epoch": 0.9679242447016384, "flos": 19240937516160.0, "grad_norm": 7.4084666126506225, "language_loss": 0.95897555, "learning_rate": 1.0763015963459965e-08, "loss": 0.97324419, "num_input_tokens_seen": 347326375, "router_z_loss_clip": 2.04785156, "router_z_loss_mlp": 0.22839355, "step": 16099, "time_per_iteration": 2.642177104949951 }, { "auxiliary_loss_clip": 0.01240338, "auxiliary_loss_mlp": 0.00225028, "balance_loss_clip": 1.02286744, "balance_loss_mlp": 0.20070945, "epoch": 0.9679843679543063, "flos": 33254681345280.0, "grad_norm": 104.52143700240686, "language_loss": 0.74254668, "learning_rate": 1.0722703390762643e-08, "loss": 0.75720036, "num_input_tokens_seen": 347348250, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24304199, "step": 16100, "time_per_iteration": 2.781959056854248 }, { "auxiliary_loss_clip": 0.01235055, "auxiliary_loss_mlp": 0.00222138, "balance_loss_clip": 1.01937461, "balance_loss_mlp": 0.19686571, "epoch": 0.9680444912069743, "flos": 22783633038720.0, "grad_norm": 4.795172440562857, "language_loss": 0.81104398, "learning_rate": 1.0682466251659584e-08, "loss": 0.82561588, "num_input_tokens_seen": 347367400, "router_z_loss_clip": 2.15332031, "router_z_loss_mlp": 0.25280762, "step": 16101, "time_per_iteration": 2.674607038497925 }, { "auxiliary_loss_clip": 0.01245257, "auxiliary_loss_mlp": 0.00224702, "balance_loss_clip": 1.02607715, "balance_loss_mlp": 0.19894043, "epoch": 0.9681046144596422, "flos": 24024095274240.0, "grad_norm": 13.191644825324829, "language_loss": 0.82361829, "learning_rate": 1.0642304547676672e-08, "loss": 0.83831787, "num_input_tokens_seen": 347387600, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25769043, "step": 16102, "time_per_iteration": 2.809123992919922 }, { "auxiliary_loss_clip": 0.01259428, "auxiliary_loss_mlp": 0.00244189, "balance_loss_clip": 1.03445828, "balance_loss_mlp": 0.21689023, "epoch": 0.9681647377123103, "flos": 23441013797760.0, "grad_norm": 77.6135917296932, "language_loss": 0.87260389, "learning_rate": 1.0602218280337139e-08, "loss": 0.88764006, "num_input_tokens_seen": 347406915, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.27294922, "step": 16103, "time_per_iteration": 2.6587398052215576 }, { "auxiliary_loss_clip": 0.01250877, "auxiliary_loss_mlp": 0.0021419, "balance_loss_clip": 1.03190804, "balance_loss_mlp": 0.18923935, "epoch": 0.9682248609649782, "flos": 22675075159680.0, "grad_norm": 394.30577439037637, "language_loss": 0.89178312, "learning_rate": 1.0562207451160655e-08, "loss": 0.90643376, "num_input_tokens_seen": 347425140, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24951172, "step": 16104, "time_per_iteration": 2.6975202560424805 }, { "auxiliary_loss_clip": 0.01228999, "auxiliary_loss_mlp": 0.00225192, "balance_loss_clip": 1.01742816, "balance_loss_mlp": 0.20098066, "epoch": 0.9682849842176462, "flos": 24428413739520.0, "grad_norm": 7.476333050915163, "language_loss": 0.84878832, "learning_rate": 1.0522272061664672e-08, "loss": 0.86333019, "num_input_tokens_seen": 347446350, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.2421875, "step": 16105, "time_per_iteration": 4.2865309715271 }, { "auxiliary_loss_clip": 0.01074759, "auxiliary_loss_mlp": 0.00069345, "balance_loss_clip": 0.93851793, "balance_loss_mlp": 0.06286027, "epoch": 0.9683451074703141, "flos": 59995132784640.0, "grad_norm": 0.8254122886656305, "language_loss": 0.56179547, "learning_rate": 1.0482412113363536e-08, "loss": 0.57323647, "num_input_tokens_seen": 347510135, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06494141, "step": 16106, "time_per_iteration": 4.731706142425537 }, { "auxiliary_loss_clip": 0.0108216, "auxiliary_loss_mlp": 0.00072546, "balance_loss_clip": 0.94513786, "balance_loss_mlp": 0.06587067, "epoch": 0.9684052307229821, "flos": 52696145514240.0, "grad_norm": 0.852318865068155, "language_loss": 0.60928154, "learning_rate": 1.0442627607768707e-08, "loss": 0.62082863, "num_input_tokens_seen": 347562505, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.06689453, "step": 16107, "time_per_iteration": 3.042938470840454 }, { "auxiliary_loss_clip": 0.01276594, "auxiliary_loss_mlp": 0.00234989, "balance_loss_clip": 1.04686487, "balance_loss_mlp": 0.20752297, "epoch": 0.96846535397565, "flos": 22783848520320.0, "grad_norm": 73.80572342415742, "language_loss": 0.83229673, "learning_rate": 1.040291854638875e-08, "loss": 0.84741253, "num_input_tokens_seen": 347579150, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.2746582, "step": 16108, "time_per_iteration": 2.701876640319824 }, { "auxiliary_loss_clip": 0.01258148, "auxiliary_loss_mlp": 0.00230521, "balance_loss_clip": 1.03415668, "balance_loss_mlp": 0.20428297, "epoch": 0.968525477228318, "flos": 23323980309120.0, "grad_norm": 15.698614204420853, "language_loss": 0.68577373, "learning_rate": 1.0363284930729576e-08, "loss": 0.70066035, "num_input_tokens_seen": 347596705, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26196289, "step": 16109, "time_per_iteration": 2.6440136432647705 }, { "auxiliary_loss_clip": 0.01080268, "auxiliary_loss_mlp": 0.00087428, "balance_loss_clip": 0.94336665, "balance_loss_mlp": 0.0807521, "epoch": 0.9685856004809861, "flos": 67882947707520.0, "grad_norm": 0.6558160345838183, "language_loss": 0.53616893, "learning_rate": 1.0323726762294205e-08, "loss": 0.54784596, "num_input_tokens_seen": 347661870, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.06689453, "step": 16110, "time_per_iteration": 3.116230010986328 }, { "auxiliary_loss_clip": 0.01268428, "auxiliary_loss_mlp": 0.00235304, "balance_loss_clip": 1.04123223, "balance_loss_mlp": 0.20649149, "epoch": 0.968645723733654, "flos": 33947900899200.0, "grad_norm": 3.4842799127336725, "language_loss": 0.71407557, "learning_rate": 1.0284244042582325e-08, "loss": 0.72911292, "num_input_tokens_seen": 347684295, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.28833008, "step": 16111, "time_per_iteration": 2.806518793106079 }, { "auxiliary_loss_clip": 0.01218812, "auxiliary_loss_mlp": 0.00232599, "balance_loss_clip": 1.01250768, "balance_loss_mlp": 0.20985344, "epoch": 0.968705846986322, "flos": 18551488890240.0, "grad_norm": 17.356067521667402, "language_loss": 0.81794864, "learning_rate": 1.024483677309118e-08, "loss": 0.83246273, "num_input_tokens_seen": 347702585, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.22753906, "step": 16112, "time_per_iteration": 2.636869430541992 }, { "auxiliary_loss_clip": 0.01216439, "auxiliary_loss_mlp": 0.00206427, "balance_loss_clip": 1.00825775, "balance_loss_mlp": 0.18355042, "epoch": 0.9687659702389899, "flos": 17420913336960.0, "grad_norm": 211.05151314804283, "language_loss": 0.77653462, "learning_rate": 1.020550495531558e-08, "loss": 0.79076326, "num_input_tokens_seen": 347721810, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.2286377, "step": 16113, "time_per_iteration": 2.7376270294189453 }, { "auxiliary_loss_clip": 0.01078656, "auxiliary_loss_mlp": 0.00061109, "balance_loss_clip": 0.94231987, "balance_loss_mlp": 0.05495805, "epoch": 0.9688260934916579, "flos": 62047176865920.0, "grad_norm": 0.7907912796422909, "language_loss": 0.5589307, "learning_rate": 1.0166248590746329e-08, "loss": 0.57032835, "num_input_tokens_seen": 347782330, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06152344, "step": 16114, "time_per_iteration": 3.154376983642578 }, { "auxiliary_loss_clip": 0.012459, "auxiliary_loss_mlp": 0.00238793, "balance_loss_clip": 1.02996266, "balance_loss_mlp": 0.21313956, "epoch": 0.9688862167443258, "flos": 15076520461440.0, "grad_norm": 89.848044004011, "language_loss": 0.88673425, "learning_rate": 1.0127067680872458e-08, "loss": 0.90158117, "num_input_tokens_seen": 347794835, "router_z_loss_clip": 2.15917969, "router_z_loss_mlp": 0.2565918, "step": 16115, "time_per_iteration": 2.6489615440368652 }, { "auxiliary_loss_clip": 0.01225309, "auxiliary_loss_mlp": 0.00217477, "balance_loss_clip": 1.02159011, "balance_loss_mlp": 0.19356328, "epoch": 0.9689463399969939, "flos": 19938215306880.0, "grad_norm": 2.2789533367332893, "language_loss": 0.77319521, "learning_rate": 1.0087962227179448e-08, "loss": 0.78762311, "num_input_tokens_seen": 347814320, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.23913574, "step": 16116, "time_per_iteration": 2.7303414344787598 }, { "auxiliary_loss_clip": 0.01257576, "auxiliary_loss_mlp": 0.00214654, "balance_loss_clip": 1.03393865, "balance_loss_mlp": 0.19035889, "epoch": 0.9690064632496618, "flos": 19573039687680.0, "grad_norm": 56.387747263273624, "language_loss": 0.87247378, "learning_rate": 1.0048932231150553e-08, "loss": 0.88719606, "num_input_tokens_seen": 347832125, "router_z_loss_clip": 2.23535156, "router_z_loss_mlp": 0.24279785, "step": 16117, "time_per_iteration": 2.709587335586548 }, { "auxiliary_loss_clip": 0.01251106, "auxiliary_loss_mlp": 0.0021898, "balance_loss_clip": 1.02801251, "balance_loss_mlp": 0.19362424, "epoch": 0.9690665865023298, "flos": 21872292145920.0, "grad_norm": 5.122109792731385, "language_loss": 0.86402804, "learning_rate": 1.000997769426548e-08, "loss": 0.87872893, "num_input_tokens_seen": 347850765, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.25366211, "step": 16118, "time_per_iteration": 2.6755597591400146 }, { "auxiliary_loss_clip": 0.01267629, "auxiliary_loss_mlp": 0.00220592, "balance_loss_clip": 1.04355097, "balance_loss_mlp": 0.19561791, "epoch": 0.9691267097549977, "flos": 20994491577600.0, "grad_norm": 8.288463906834535, "language_loss": 0.84269589, "learning_rate": 9.971098618001272e-09, "loss": 0.85757816, "num_input_tokens_seen": 347870125, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.24975586, "step": 16119, "time_per_iteration": 2.6615943908691406 }, { "auxiliary_loss_clip": 0.01218188, "auxiliary_loss_mlp": 0.00204486, "balance_loss_clip": 1.01408279, "balance_loss_mlp": 0.18112087, "epoch": 0.9691868330076657, "flos": 24279132816000.0, "grad_norm": 2469.0820264052363, "language_loss": 0.82214129, "learning_rate": 9.932295003832747e-09, "loss": 0.83636808, "num_input_tokens_seen": 347890615, "router_z_loss_clip": 2.04101562, "router_z_loss_mlp": 0.23364258, "step": 16120, "time_per_iteration": 2.7635676860809326 }, { "auxiliary_loss_clip": 0.01254123, "auxiliary_loss_mlp": 0.00211269, "balance_loss_clip": 1.03321433, "balance_loss_mlp": 0.18660414, "epoch": 0.9692469562603336, "flos": 17675699483520.0, "grad_norm": 18.07260513677884, "language_loss": 0.7875793, "learning_rate": 9.89356685323095e-09, "loss": 0.80223322, "num_input_tokens_seen": 347908685, "router_z_loss_clip": 2.20996094, "router_z_loss_mlp": 0.24658203, "step": 16121, "time_per_iteration": 2.6621673107147217 }, { "auxiliary_loss_clip": 0.01233041, "auxiliary_loss_mlp": 0.00219676, "balance_loss_clip": 1.01579773, "balance_loss_mlp": 0.19576298, "epoch": 0.9693070795130017, "flos": 26834392483200.0, "grad_norm": 11.87878769602746, "language_loss": 0.78171158, "learning_rate": 9.854914167664486e-09, "loss": 0.79623878, "num_input_tokens_seen": 347926385, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.23913574, "step": 16122, "time_per_iteration": 2.7750794887542725 }, { "auxiliary_loss_clip": 0.01242132, "auxiliary_loss_mlp": 0.00224186, "balance_loss_clip": 1.02266312, "balance_loss_mlp": 0.19840136, "epoch": 0.9693672027656697, "flos": 18077288515200.0, "grad_norm": 13.529469625731599, "language_loss": 0.86865222, "learning_rate": 9.81633694859907e-09, "loss": 0.88331544, "num_input_tokens_seen": 347945290, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25756836, "step": 16123, "time_per_iteration": 2.6430623531341553 }, { "auxiliary_loss_clip": 0.01237606, "auxiliary_loss_mlp": 0.00237776, "balance_loss_clip": 1.02049685, "balance_loss_mlp": 0.21263444, "epoch": 0.9694273260183376, "flos": 21763015994880.0, "grad_norm": 10.248951420221792, "language_loss": 0.79751807, "learning_rate": 9.777835197497753e-09, "loss": 0.81227189, "num_input_tokens_seen": 347966330, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25146484, "step": 16124, "time_per_iteration": 2.7112903594970703 }, { "auxiliary_loss_clip": 0.01243099, "auxiliary_loss_mlp": 0.00228569, "balance_loss_clip": 1.03272748, "balance_loss_mlp": 0.20513274, "epoch": 0.9694874492710056, "flos": 24426115269120.0, "grad_norm": 7.377086402552472, "language_loss": 0.8091563, "learning_rate": 9.739408915820258e-09, "loss": 0.82387292, "num_input_tokens_seen": 347982590, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.23425293, "step": 16125, "time_per_iteration": 2.7148406505584717 }, { "auxiliary_loss_clip": 0.01080609, "auxiliary_loss_mlp": 0.00071942, "balance_loss_clip": 0.94335878, "balance_loss_mlp": 0.06607695, "epoch": 0.9695475725236735, "flos": 67650748237440.0, "grad_norm": 0.8720065858694876, "language_loss": 0.614043, "learning_rate": 9.70105810502364e-09, "loss": 0.62556851, "num_input_tokens_seen": 348043310, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.05859375, "step": 16126, "time_per_iteration": 3.194054365158081 }, { "auxiliary_loss_clip": 0.01223991, "auxiliary_loss_mlp": 0.00193215, "balance_loss_clip": 1.01540494, "balance_loss_mlp": 0.17110123, "epoch": 0.9696076957763415, "flos": 19129326981120.0, "grad_norm": 144.57010919841505, "language_loss": 0.82249486, "learning_rate": 9.662782766562738e-09, "loss": 0.83666688, "num_input_tokens_seen": 348062200, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.22119141, "step": 16127, "time_per_iteration": 2.6848373413085938 }, { "auxiliary_loss_clip": 0.01258241, "auxiliary_loss_mlp": 0.00225474, "balance_loss_clip": 1.03138983, "balance_loss_mlp": 0.19715011, "epoch": 0.9696678190290094, "flos": 15486836497920.0, "grad_norm": 34.14115045495221, "language_loss": 0.77493697, "learning_rate": 9.62458290188839e-09, "loss": 0.78977406, "num_input_tokens_seen": 348080685, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.2833252, "step": 16128, "time_per_iteration": 2.7133736610412598 }, { "auxiliary_loss_clip": 0.01243167, "auxiliary_loss_mlp": 0.00221751, "balance_loss_clip": 1.02712631, "balance_loss_mlp": 0.19814759, "epoch": 0.9697279422816775, "flos": 36208692869760.0, "grad_norm": 35.57161310597105, "language_loss": 0.70640939, "learning_rate": 9.586458512449213e-09, "loss": 0.72105861, "num_input_tokens_seen": 348102500, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.23620605, "step": 16129, "time_per_iteration": 2.811979055404663 }, { "auxiliary_loss_clip": 0.01264735, "auxiliary_loss_mlp": 0.00224374, "balance_loss_clip": 1.04146671, "balance_loss_mlp": 0.1979091, "epoch": 0.9697880655343454, "flos": 25484007651840.0, "grad_norm": 8.76165874049283, "language_loss": 0.7428351, "learning_rate": 9.548409599691166e-09, "loss": 0.75772619, "num_input_tokens_seen": 348122515, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26464844, "step": 16130, "time_per_iteration": 2.7834250926971436 }, { "auxiliary_loss_clip": 0.01260944, "auxiliary_loss_mlp": 0.00235311, "balance_loss_clip": 1.0343138, "balance_loss_mlp": 0.20722538, "epoch": 0.9698481887870134, "flos": 15333533251200.0, "grad_norm": 47.21777294756925, "language_loss": 0.82213926, "learning_rate": 9.510436165056867e-09, "loss": 0.83710182, "num_input_tokens_seen": 348138775, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.28076172, "step": 16131, "time_per_iteration": 2.692765951156616 }, { "auxiliary_loss_clip": 0.0125186, "auxiliary_loss_mlp": 0.00242891, "balance_loss_clip": 1.03289044, "balance_loss_mlp": 0.21677238, "epoch": 0.9699083120396813, "flos": 21982250655360.0, "grad_norm": 150.89689424080146, "language_loss": 0.84399819, "learning_rate": 9.472538209986058e-09, "loss": 0.85894573, "num_input_tokens_seen": 348157115, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26135254, "step": 16132, "time_per_iteration": 2.6768553256988525 }, { "auxiliary_loss_clip": 0.01251638, "auxiliary_loss_mlp": 0.00232319, "balance_loss_clip": 1.02947271, "balance_loss_mlp": 0.20468658, "epoch": 0.9699684352923493, "flos": 15664055224320.0, "grad_norm": 13.97366293206911, "language_loss": 0.87451875, "learning_rate": 9.434715735916477e-09, "loss": 0.88935828, "num_input_tokens_seen": 348173035, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.27624512, "step": 16133, "time_per_iteration": 2.7264420986175537 }, { "auxiliary_loss_clip": 0.01234135, "auxiliary_loss_mlp": 0.00210453, "balance_loss_clip": 1.02178681, "balance_loss_mlp": 0.1870876, "epoch": 0.9700285585450172, "flos": 21908382336000.0, "grad_norm": 181.75962266216848, "language_loss": 0.7301681, "learning_rate": 9.396968744281863e-09, "loss": 0.74461401, "num_input_tokens_seen": 348192960, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.23352051, "step": 16134, "time_per_iteration": 2.6613759994506836 }, { "auxiliary_loss_clip": 0.01239777, "auxiliary_loss_mlp": 0.00222467, "balance_loss_clip": 1.02300024, "balance_loss_mlp": 0.19681363, "epoch": 0.9700886817976853, "flos": 23914890950400.0, "grad_norm": 70.87027077835961, "language_loss": 0.90880895, "learning_rate": 9.359297236513519e-09, "loss": 0.92343134, "num_input_tokens_seen": 348212805, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.2565918, "step": 16135, "time_per_iteration": 2.685509204864502 }, { "auxiliary_loss_clip": 0.01253677, "auxiliary_loss_mlp": 0.00233731, "balance_loss_clip": 1.03474391, "balance_loss_mlp": 0.20639601, "epoch": 0.9701488050503532, "flos": 25447845634560.0, "grad_norm": 1177.6285166939435, "language_loss": 0.79777145, "learning_rate": 9.321701214040079e-09, "loss": 0.81264555, "num_input_tokens_seen": 348232900, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.27355957, "step": 16136, "time_per_iteration": 2.672851324081421 }, { "auxiliary_loss_clip": 0.01234455, "auxiliary_loss_mlp": 0.0021769, "balance_loss_clip": 1.01886761, "balance_loss_mlp": 0.19290632, "epoch": 0.9702089283030212, "flos": 20590855470720.0, "grad_norm": 192.44384770132842, "language_loss": 0.82551211, "learning_rate": 9.28418067828729e-09, "loss": 0.84003353, "num_input_tokens_seen": 348253065, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24768066, "step": 16137, "time_per_iteration": 4.075519323348999 }, { "auxiliary_loss_clip": 0.01078142, "auxiliary_loss_mlp": 0.00068368, "balance_loss_clip": 0.94221455, "balance_loss_mlp": 0.06221697, "epoch": 0.9702690515556892, "flos": 70651516291200.0, "grad_norm": 1.3490824488432207, "language_loss": 0.54268032, "learning_rate": 9.246735630678015e-09, "loss": 0.5541454, "num_input_tokens_seen": 348316075, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.0612793, "step": 16138, "time_per_iteration": 3.257861852645874 }, { "auxiliary_loss_clip": 0.01240679, "auxiliary_loss_mlp": 0.00231803, "balance_loss_clip": 1.02357888, "balance_loss_mlp": 0.20638794, "epoch": 0.9703291748083571, "flos": 35881439034240.0, "grad_norm": 3.1882888792733466, "language_loss": 0.78377271, "learning_rate": 9.209366072632007e-09, "loss": 0.79849756, "num_input_tokens_seen": 348337605, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25402832, "step": 16139, "time_per_iteration": 4.29990553855896 }, { "auxiliary_loss_clip": 0.01248983, "auxiliary_loss_mlp": 0.00238955, "balance_loss_clip": 1.03197443, "balance_loss_mlp": 0.21455257, "epoch": 0.9703892980610251, "flos": 24316479982080.0, "grad_norm": 18.37017949837744, "language_loss": 0.79655123, "learning_rate": 9.172072005566134e-09, "loss": 0.81143057, "num_input_tokens_seen": 348359430, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24401855, "step": 16140, "time_per_iteration": 2.8178303241729736 }, { "auxiliary_loss_clip": 0.01264005, "auxiliary_loss_mlp": 0.00243737, "balance_loss_clip": 1.03666019, "balance_loss_mlp": 0.21745145, "epoch": 0.970449421313693, "flos": 18003743418240.0, "grad_norm": 29.671142634656846, "language_loss": 0.78326499, "learning_rate": 9.13485343089504e-09, "loss": 0.79834247, "num_input_tokens_seen": 348377890, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.26269531, "step": 16141, "time_per_iteration": 2.720022439956665 }, { "auxiliary_loss_clip": 0.01235025, "auxiliary_loss_mlp": 0.00215588, "balance_loss_clip": 1.02289987, "balance_loss_mlp": 0.19302157, "epoch": 0.9705095445663611, "flos": 25337994865920.0, "grad_norm": 34.986783928083504, "language_loss": 0.77097392, "learning_rate": 9.097710350029597e-09, "loss": 0.78548002, "num_input_tokens_seen": 348396550, "router_z_loss_clip": 2.12011719, "router_z_loss_mlp": 0.22558594, "step": 16142, "time_per_iteration": 2.736443281173706 }, { "auxiliary_loss_clip": 0.01225813, "auxiliary_loss_mlp": 0.00220516, "balance_loss_clip": 1.01228178, "balance_loss_mlp": 0.19591075, "epoch": 0.970569667819029, "flos": 26833602384000.0, "grad_norm": 19.24213589651548, "language_loss": 0.63621467, "learning_rate": 9.060642764378457e-09, "loss": 0.65067792, "num_input_tokens_seen": 348417120, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.24609375, "step": 16143, "time_per_iteration": 2.770338773727417 }, { "auxiliary_loss_clip": 0.01241893, "auxiliary_loss_mlp": 0.00225787, "balance_loss_clip": 1.02787375, "balance_loss_mlp": 0.20148018, "epoch": 0.970629791071697, "flos": 25848644567040.0, "grad_norm": 3.6916547797739354, "language_loss": 0.78038508, "learning_rate": 9.023650675347382e-09, "loss": 0.79506189, "num_input_tokens_seen": 348437750, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.24291992, "step": 16144, "time_per_iteration": 2.726402759552002 }, { "auxiliary_loss_clip": 0.0123442, "auxiliary_loss_mlp": 0.00224021, "balance_loss_clip": 1.02225304, "balance_loss_mlp": 0.19938016, "epoch": 0.9706899143243649, "flos": 36540184510080.0, "grad_norm": 4.532033751261966, "language_loss": 0.78374505, "learning_rate": 8.986734084339253e-09, "loss": 0.79832947, "num_input_tokens_seen": 348460935, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.24633789, "step": 16145, "time_per_iteration": 2.8008430004119873 }, { "auxiliary_loss_clip": 0.01240872, "auxiliary_loss_mlp": 0.00219732, "balance_loss_clip": 1.02254915, "balance_loss_mlp": 0.19364899, "epoch": 0.9707500375770329, "flos": 12268234414080.0, "grad_norm": 91.54954527634001, "language_loss": 0.89441991, "learning_rate": 8.949892992753395e-09, "loss": 0.90902591, "num_input_tokens_seen": 348474480, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.26062012, "step": 16146, "time_per_iteration": 2.6258480548858643 }, { "auxiliary_loss_clip": 0.01080837, "auxiliary_loss_mlp": 0.00067316, "balance_loss_clip": 0.94352353, "balance_loss_mlp": 0.06073585, "epoch": 0.9708101608297008, "flos": 60853040196480.0, "grad_norm": 0.7432567410949653, "language_loss": 0.53896737, "learning_rate": 8.91312740198713e-09, "loss": 0.55044889, "num_input_tokens_seen": 348541220, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06591797, "step": 16147, "time_per_iteration": 4.592820167541504 }, { "auxiliary_loss_clip": 0.01247352, "auxiliary_loss_mlp": 0.00229051, "balance_loss_clip": 1.02795458, "balance_loss_mlp": 0.20281324, "epoch": 0.9708702840823689, "flos": 27124766029440.0, "grad_norm": 9.600658139443269, "language_loss": 0.70865583, "learning_rate": 8.876437313434682e-09, "loss": 0.72341985, "num_input_tokens_seen": 348559230, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.2623291, "step": 16148, "time_per_iteration": 4.1794867515563965 }, { "auxiliary_loss_clip": 0.01240603, "auxiliary_loss_mlp": 0.00239185, "balance_loss_clip": 1.02641475, "balance_loss_mlp": 0.21294671, "epoch": 0.9709304073350368, "flos": 20777699041920.0, "grad_norm": 205.34246988543683, "language_loss": 0.8147409, "learning_rate": 8.839822728487155e-09, "loss": 0.82953882, "num_input_tokens_seen": 348577850, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.26220703, "step": 16149, "time_per_iteration": 2.682129383087158 }, { "auxiliary_loss_clip": 0.01225253, "auxiliary_loss_mlp": 0.00236385, "balance_loss_clip": 1.01515365, "balance_loss_mlp": 0.21257904, "epoch": 0.9709905305877048, "flos": 41934541115520.0, "grad_norm": 64.46422473559372, "language_loss": 0.85759497, "learning_rate": 8.803283648533222e-09, "loss": 0.87221128, "num_input_tokens_seen": 348598345, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.23815918, "step": 16150, "time_per_iteration": 2.827500343322754 }, { "auxiliary_loss_clip": 0.01266345, "auxiliary_loss_mlp": 0.00260517, "balance_loss_clip": 1.04052186, "balance_loss_mlp": 0.23359931, "epoch": 0.9710506538403728, "flos": 17165588486400.0, "grad_norm": 37.19044690109298, "language_loss": 0.82765192, "learning_rate": 8.766820074958214e-09, "loss": 0.84292054, "num_input_tokens_seen": 348616300, "router_z_loss_clip": 2.25488281, "router_z_loss_mlp": 0.2689209, "step": 16151, "time_per_iteration": 2.69598650932312 }, { "auxiliary_loss_clip": 0.01237168, "auxiliary_loss_mlp": 0.00227746, "balance_loss_clip": 1.02139425, "balance_loss_mlp": 0.20287867, "epoch": 0.9711107770930407, "flos": 21173470070400.0, "grad_norm": 41.044402825625895, "language_loss": 0.81737089, "learning_rate": 8.730432009145027e-09, "loss": 0.83201998, "num_input_tokens_seen": 348633845, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24902344, "step": 16152, "time_per_iteration": 2.698352575302124 }, { "auxiliary_loss_clip": 0.01239556, "auxiliary_loss_mlp": 0.00210185, "balance_loss_clip": 1.02617252, "balance_loss_mlp": 0.18580648, "epoch": 0.9711709003457087, "flos": 22237072715520.0, "grad_norm": 17.490892697082142, "language_loss": 0.76405615, "learning_rate": 8.694119452473448e-09, "loss": 0.77855355, "num_input_tokens_seen": 348653070, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.24353027, "step": 16153, "time_per_iteration": 2.723904848098755 }, { "auxiliary_loss_clip": 0.01235683, "auxiliary_loss_mlp": 0.00232406, "balance_loss_clip": 1.02232003, "balance_loss_mlp": 0.20882687, "epoch": 0.9712310235983767, "flos": 26213856099840.0, "grad_norm": 10.236122073091812, "language_loss": 0.78827095, "learning_rate": 8.65788240632037e-09, "loss": 0.80295187, "num_input_tokens_seen": 348672145, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.23596191, "step": 16154, "time_per_iteration": 2.7038395404815674 }, { "auxiliary_loss_clip": 0.01254454, "auxiliary_loss_mlp": 0.00216314, "balance_loss_clip": 1.02963018, "balance_loss_mlp": 0.18943188, "epoch": 0.9712911468510447, "flos": 20668171495680.0, "grad_norm": 6.90298909560916, "language_loss": 0.89571005, "learning_rate": 8.621720872059812e-09, "loss": 0.9104178, "num_input_tokens_seen": 348690615, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.2689209, "step": 16155, "time_per_iteration": 2.6977810859680176 }, { "auxiliary_loss_clip": 0.0124701, "auxiliary_loss_mlp": 0.00222661, "balance_loss_clip": 1.02359617, "balance_loss_mlp": 0.19473039, "epoch": 0.9713512701037126, "flos": 13552903313280.0, "grad_norm": 15.777199612942702, "language_loss": 0.8013922, "learning_rate": 8.58563485106334e-09, "loss": 0.81608886, "num_input_tokens_seen": 348708665, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.27905273, "step": 16156, "time_per_iteration": 2.7109899520874023 }, { "auxiliary_loss_clip": 0.01249901, "auxiliary_loss_mlp": 0.00238973, "balance_loss_clip": 1.02684224, "balance_loss_mlp": 0.21300924, "epoch": 0.9714113933563806, "flos": 25848752307840.0, "grad_norm": 232.3388311567096, "language_loss": 1.01259995, "learning_rate": 8.54962434469919e-09, "loss": 1.02748871, "num_input_tokens_seen": 348726105, "router_z_loss_clip": 2.23144531, "router_z_loss_mlp": 0.25964355, "step": 16157, "time_per_iteration": 2.688114881515503 }, { "auxiliary_loss_clip": 0.01236267, "auxiliary_loss_mlp": 0.00223658, "balance_loss_clip": 1.02211106, "balance_loss_mlp": 0.19825462, "epoch": 0.9714715166090485, "flos": 12743081233920.0, "grad_norm": 9.730551903531165, "language_loss": 0.80624837, "learning_rate": 8.513689354332721e-09, "loss": 0.82084763, "num_input_tokens_seen": 348743360, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.25378418, "step": 16158, "time_per_iteration": 2.6883795261383057 }, { "auxiliary_loss_clip": 0.01234582, "auxiliary_loss_mlp": 0.0021457, "balance_loss_clip": 1.02396512, "balance_loss_mlp": 0.19158642, "epoch": 0.9715316398617165, "flos": 18405547931520.0, "grad_norm": 81.52111117810479, "language_loss": 0.6791296, "learning_rate": 8.477829881326836e-09, "loss": 0.6936211, "num_input_tokens_seen": 348759045, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.23010254, "step": 16159, "time_per_iteration": 2.6136252880096436 }, { "auxiliary_loss_clip": 0.01221756, "auxiliary_loss_mlp": 0.00215913, "balance_loss_clip": 1.01616788, "balance_loss_mlp": 0.19247648, "epoch": 0.9715917631143844, "flos": 28913799749760.0, "grad_norm": 33812.07552406592, "language_loss": 0.86918032, "learning_rate": 8.44204592704112e-09, "loss": 0.88355702, "num_input_tokens_seen": 348779910, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.234375, "step": 16160, "time_per_iteration": 2.7283012866973877 }, { "auxiliary_loss_clip": 0.0108412, "auxiliary_loss_mlp": 0.00079899, "balance_loss_clip": 0.94620144, "balance_loss_mlp": 0.07312768, "epoch": 0.9716518863670525, "flos": 65939712900480.0, "grad_norm": 0.7613475340704952, "language_loss": 0.53800535, "learning_rate": 8.406337492832704e-09, "loss": 0.54964554, "num_input_tokens_seen": 348838995, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06787109, "step": 16161, "time_per_iteration": 3.194183826446533 }, { "auxiliary_loss_clip": 0.01244118, "auxiliary_loss_mlp": 0.00196578, "balance_loss_clip": 1.02702737, "balance_loss_mlp": 0.17073366, "epoch": 0.9717120096197204, "flos": 17712759340800.0, "grad_norm": 6.982070291968716, "language_loss": 0.80206621, "learning_rate": 8.3707045800554e-09, "loss": 0.81647325, "num_input_tokens_seen": 348858090, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25854492, "step": 16162, "time_per_iteration": 2.671966791152954 }, { "auxiliary_loss_clip": 0.01220877, "auxiliary_loss_mlp": 0.00213328, "balance_loss_clip": 1.0107528, "balance_loss_mlp": 0.18868747, "epoch": 0.9717721328723884, "flos": 24463426521600.0, "grad_norm": 368.37018411811397, "language_loss": 0.86874235, "learning_rate": 8.335147190060787e-09, "loss": 0.88308436, "num_input_tokens_seen": 348877885, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.24645996, "step": 16163, "time_per_iteration": 2.6728408336639404 }, { "auxiliary_loss_clip": 0.0122648, "auxiliary_loss_mlp": 0.00226789, "balance_loss_clip": 1.01667655, "balance_loss_mlp": 0.20385271, "epoch": 0.9718322561250564, "flos": 20776477979520.0, "grad_norm": 51.6533614962847, "language_loss": 0.80211997, "learning_rate": 8.299665324196903e-09, "loss": 0.81665266, "num_input_tokens_seen": 348897720, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.22937012, "step": 16164, "time_per_iteration": 2.6946167945861816 }, { "auxiliary_loss_clip": 0.01255407, "auxiliary_loss_mlp": 0.00228748, "balance_loss_clip": 1.03621793, "balance_loss_mlp": 0.20212862, "epoch": 0.9718923793777243, "flos": 19025904746880.0, "grad_norm": 4.367374204647696, "language_loss": 0.93567157, "learning_rate": 8.264258983809114e-09, "loss": 0.95051312, "num_input_tokens_seen": 348915410, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26599121, "step": 16165, "time_per_iteration": 2.6646156311035156 }, { "auxiliary_loss_clip": 0.01234666, "auxiliary_loss_mlp": 0.00209051, "balance_loss_clip": 1.01787448, "balance_loss_mlp": 0.18710414, "epoch": 0.9719525026303923, "flos": 21871717528320.0, "grad_norm": 8.957334054696027, "language_loss": 0.86505729, "learning_rate": 8.228928170240345e-09, "loss": 0.87949443, "num_input_tokens_seen": 348934335, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.21948242, "step": 16166, "time_per_iteration": 2.6677448749542236 }, { "auxiliary_loss_clip": 0.01241484, "auxiliary_loss_mlp": 0.00210096, "balance_loss_clip": 1.02509177, "balance_loss_mlp": 0.1864689, "epoch": 0.9720126258830603, "flos": 14429303251200.0, "grad_norm": 2166.687489652843, "language_loss": 0.78464788, "learning_rate": 8.193672884830195e-09, "loss": 0.7991637, "num_input_tokens_seen": 348952405, "router_z_loss_clip": 2.16308594, "router_z_loss_mlp": 0.23632812, "step": 16167, "time_per_iteration": 2.6740047931671143 }, { "auxiliary_loss_clip": 0.01231765, "auxiliary_loss_mlp": 0.00247937, "balance_loss_clip": 1.01965153, "balance_loss_mlp": 0.22316484, "epoch": 0.9720727491357283, "flos": 26251167352320.0, "grad_norm": 7.151812035628794, "language_loss": 0.81795907, "learning_rate": 8.158493128915812e-09, "loss": 0.83275604, "num_input_tokens_seen": 348973580, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.2479248, "step": 16168, "time_per_iteration": 2.7491254806518555 }, { "auxiliary_loss_clip": 0.0124733, "auxiliary_loss_mlp": 0.00217115, "balance_loss_clip": 1.02703047, "balance_loss_mlp": 0.18980427, "epoch": 0.9721328723883962, "flos": 22674105492480.0, "grad_norm": 7.349375676582137, "language_loss": 0.85337591, "learning_rate": 8.123388903830797e-09, "loss": 0.8680203, "num_input_tokens_seen": 348992035, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.27319336, "step": 16169, "time_per_iteration": 2.7156646251678467 }, { "auxiliary_loss_clip": 0.01259349, "auxiliary_loss_mlp": 0.0024774, "balance_loss_clip": 1.03627825, "balance_loss_mlp": 0.22010753, "epoch": 0.9721929956410642, "flos": 28074172360320.0, "grad_norm": 10.223338073570817, "language_loss": 0.6727283, "learning_rate": 8.088360210906309e-09, "loss": 0.68779916, "num_input_tokens_seen": 349013160, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.27636719, "step": 16170, "time_per_iteration": 2.7602391242980957 }, { "auxiliary_loss_clip": 0.01236262, "auxiliary_loss_mlp": 0.00209421, "balance_loss_clip": 1.01808178, "balance_loss_mlp": 0.18512601, "epoch": 0.9722531188937321, "flos": 20996251344000.0, "grad_norm": 1972.6106664433446, "language_loss": 0.79689497, "learning_rate": 8.053407051471062e-09, "loss": 0.81135178, "num_input_tokens_seen": 349033485, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24316406, "step": 16171, "time_per_iteration": 2.7074804306030273 }, { "auxiliary_loss_clip": 0.01240606, "auxiliary_loss_mlp": 0.00214624, "balance_loss_clip": 1.01697755, "balance_loss_mlp": 0.18976936, "epoch": 0.9723132421464001, "flos": 16070600332800.0, "grad_norm": 38.61372859961805, "language_loss": 0.76917857, "learning_rate": 8.018529426850218e-09, "loss": 0.78373092, "num_input_tokens_seen": 349051705, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.2487793, "step": 16172, "time_per_iteration": 2.7399237155914307 }, { "auxiliary_loss_clip": 0.01232322, "auxiliary_loss_mlp": 0.00195862, "balance_loss_clip": 1.02010179, "balance_loss_mlp": 0.17054188, "epoch": 0.972373365399068, "flos": 27745769289600.0, "grad_norm": 8.016700891334938, "language_loss": 0.93527317, "learning_rate": 7.983727338366274e-09, "loss": 0.94955504, "num_input_tokens_seen": 349070825, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.25305176, "step": 16173, "time_per_iteration": 2.7563977241516113 }, { "auxiliary_loss_clip": 0.01258625, "auxiliary_loss_mlp": 0.00254419, "balance_loss_clip": 1.03185296, "balance_loss_mlp": 0.22721532, "epoch": 0.9724334886517361, "flos": 23002939526400.0, "grad_norm": 15.489471920503966, "language_loss": 0.75836974, "learning_rate": 7.949000787339289e-09, "loss": 0.7735002, "num_input_tokens_seen": 349089730, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.27185059, "step": 16174, "time_per_iteration": 2.657973527908325 }, { "auxiliary_loss_clip": 0.01237604, "auxiliary_loss_mlp": 0.00230455, "balance_loss_clip": 1.02613235, "balance_loss_mlp": 0.20467016, "epoch": 0.972493611904404, "flos": 25447055535360.0, "grad_norm": 24.64826117803069, "language_loss": 0.83066183, "learning_rate": 7.914349775085538e-09, "loss": 0.8453424, "num_input_tokens_seen": 349111315, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.25769043, "step": 16175, "time_per_iteration": 2.7292118072509766 }, { "auxiliary_loss_clip": 0.01238017, "auxiliary_loss_mlp": 0.00217031, "balance_loss_clip": 1.02247787, "balance_loss_mlp": 0.19169861, "epoch": 0.972553735157072, "flos": 16983054547200.0, "grad_norm": 72724.42511937155, "language_loss": 0.70742083, "learning_rate": 7.879774302919307e-09, "loss": 0.72197127, "num_input_tokens_seen": 349129495, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.25317383, "step": 16176, "time_per_iteration": 2.625964641571045 }, { "auxiliary_loss_clip": 0.01247369, "auxiliary_loss_mlp": 0.00211207, "balance_loss_clip": 1.03487349, "balance_loss_mlp": 0.1864592, "epoch": 0.97261385840974, "flos": 26104651776000.0, "grad_norm": 14.995685107146182, "language_loss": 0.82433259, "learning_rate": 7.845274372151545e-09, "loss": 0.83891833, "num_input_tokens_seen": 349148850, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.24755859, "step": 16177, "time_per_iteration": 2.685791015625 }, { "auxiliary_loss_clip": 0.01222909, "auxiliary_loss_mlp": 0.00211821, "balance_loss_clip": 1.01164556, "balance_loss_mlp": 0.18707332, "epoch": 0.9726739816624079, "flos": 25447881548160.0, "grad_norm": 60.14388732392215, "language_loss": 0.76493841, "learning_rate": 7.810849984090984e-09, "loss": 0.77928573, "num_input_tokens_seen": 349167620, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.24755859, "step": 16178, "time_per_iteration": 2.7299063205718994 }, { "auxiliary_loss_clip": 0.01255829, "auxiliary_loss_mlp": 0.00212962, "balance_loss_clip": 1.03445256, "balance_loss_mlp": 0.18720068, "epoch": 0.972734104915076, "flos": 29014923513600.0, "grad_norm": 154.81021133459683, "language_loss": 0.78787154, "learning_rate": 7.776501140042358e-09, "loss": 0.80255949, "num_input_tokens_seen": 349185845, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25744629, "step": 16179, "time_per_iteration": 2.7671890258789062 }, { "auxiliary_loss_clip": 0.01221544, "auxiliary_loss_mlp": 0.00239817, "balance_loss_clip": 1.01183665, "balance_loss_mlp": 0.21564104, "epoch": 0.9727942281677439, "flos": 23437637919360.0, "grad_norm": 1186.0420506668936, "language_loss": 0.83634329, "learning_rate": 7.742227841308624e-09, "loss": 0.85095692, "num_input_tokens_seen": 349204525, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.24169922, "step": 16180, "time_per_iteration": 4.115322828292847 }, { "auxiliary_loss_clip": 0.01259639, "auxiliary_loss_mlp": 0.00218757, "balance_loss_clip": 1.0353719, "balance_loss_mlp": 0.19241145, "epoch": 0.9728543514204119, "flos": 31724599749120.0, "grad_norm": 36.53420727656358, "language_loss": 0.8421303, "learning_rate": 7.708030089189188e-09, "loss": 0.85691428, "num_input_tokens_seen": 349228075, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26342773, "step": 16181, "time_per_iteration": 4.219792127609253 }, { "auxiliary_loss_clip": 0.01225926, "auxiliary_loss_mlp": 0.00214165, "balance_loss_clip": 1.01842844, "balance_loss_mlp": 0.19118121, "epoch": 0.9729144746730798, "flos": 16289368116480.0, "grad_norm": 6.0717880781844675, "language_loss": 0.72154367, "learning_rate": 7.67390788498079e-09, "loss": 0.73594463, "num_input_tokens_seen": 349246990, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.2298584, "step": 16182, "time_per_iteration": 2.625229597091675 }, { "auxiliary_loss_clip": 0.01243928, "auxiliary_loss_mlp": 0.00229666, "balance_loss_clip": 1.0245353, "balance_loss_mlp": 0.20506072, "epoch": 0.9729745979257478, "flos": 25041408266880.0, "grad_norm": 3.4798191984159295, "language_loss": 0.71040291, "learning_rate": 7.639861229977507e-09, "loss": 0.72513884, "num_input_tokens_seen": 349265890, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.24609375, "step": 16183, "time_per_iteration": 2.72348952293396 }, { "auxiliary_loss_clip": 0.01234486, "auxiliary_loss_mlp": 0.00215357, "balance_loss_clip": 1.02252817, "balance_loss_mlp": 0.19002491, "epoch": 0.9730347211784157, "flos": 22638733574400.0, "grad_norm": 41.992670404513156, "language_loss": 0.84214449, "learning_rate": 7.605890125470527e-09, "loss": 0.85664284, "num_input_tokens_seen": 349285275, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.25317383, "step": 16184, "time_per_iteration": 2.657150983810425 }, { "auxiliary_loss_clip": 0.01226817, "auxiliary_loss_mlp": 0.00218618, "balance_loss_clip": 1.01556838, "balance_loss_mlp": 0.19556344, "epoch": 0.9730948444310837, "flos": 10998613313280.0, "grad_norm": 333.62272018975904, "language_loss": 0.90088528, "learning_rate": 7.571994572747709e-09, "loss": 0.91533959, "num_input_tokens_seen": 349301515, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.23071289, "step": 16185, "time_per_iteration": 2.6689112186431885 }, { "auxiliary_loss_clip": 0.01236555, "auxiliary_loss_mlp": 0.00243423, "balance_loss_clip": 1.02673626, "balance_loss_mlp": 0.2191759, "epoch": 0.9731549676837516, "flos": 16799479113600.0, "grad_norm": 9.545509518770686, "language_loss": 0.86543292, "learning_rate": 7.538174573094469e-09, "loss": 0.88023269, "num_input_tokens_seen": 349319590, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.24243164, "step": 16186, "time_per_iteration": 2.842618227005005 }, { "auxiliary_loss_clip": 0.01225718, "auxiliary_loss_mlp": 0.00214563, "balance_loss_clip": 1.01651073, "balance_loss_mlp": 0.19058996, "epoch": 0.9732150909364197, "flos": 21141761339520.0, "grad_norm": 10.16871007870013, "language_loss": 0.73288918, "learning_rate": 7.504430127793337e-09, "loss": 0.74729204, "num_input_tokens_seen": 349339230, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.23974609, "step": 16187, "time_per_iteration": 2.6906025409698486 }, { "auxiliary_loss_clip": 0.01224174, "auxiliary_loss_mlp": 0.00228449, "balance_loss_clip": 1.01837254, "balance_loss_mlp": 0.205704, "epoch": 0.9732752141890876, "flos": 33727337435520.0, "grad_norm": 98.99851540636057, "language_loss": 0.86266941, "learning_rate": 7.47076123812418e-09, "loss": 0.87719566, "num_input_tokens_seen": 349361155, "router_z_loss_clip": 2.05761719, "router_z_loss_mlp": 0.22766113, "step": 16188, "time_per_iteration": 2.81126070022583 }, { "auxiliary_loss_clip": 0.01223556, "auxiliary_loss_mlp": 0.00224608, "balance_loss_clip": 1.01663637, "balance_loss_mlp": 0.20268494, "epoch": 0.9733353374417556, "flos": 23404384903680.0, "grad_norm": 3288.8542512170943, "language_loss": 0.85514325, "learning_rate": 7.437167905363084e-09, "loss": 0.86962485, "num_input_tokens_seen": 349379335, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.21899414, "step": 16189, "time_per_iteration": 4.122826337814331 }, { "auxiliary_loss_clip": 0.01239729, "auxiliary_loss_mlp": 0.00238461, "balance_loss_clip": 1.01984239, "balance_loss_mlp": 0.21089996, "epoch": 0.9733954606944236, "flos": 39165792963840.0, "grad_norm": 23.73755643716483, "language_loss": 0.63533264, "learning_rate": 7.403650130784367e-09, "loss": 0.65011454, "num_input_tokens_seen": 349401575, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.27587891, "step": 16190, "time_per_iteration": 4.2825987339019775 }, { "auxiliary_loss_clip": 0.01229352, "auxiliary_loss_mlp": 0.00220153, "balance_loss_clip": 1.01536942, "balance_loss_mlp": 0.19585842, "epoch": 0.9734555839470915, "flos": 21981819692160.0, "grad_norm": 36.96559132832664, "language_loss": 0.88826704, "learning_rate": 7.3702079156590105e-09, "loss": 0.90276206, "num_input_tokens_seen": 349420650, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24316406, "step": 16191, "time_per_iteration": 2.7092318534851074 }, { "auxiliary_loss_clip": 0.01231305, "auxiliary_loss_mlp": 0.00211123, "balance_loss_clip": 1.02100742, "balance_loss_mlp": 0.18719798, "epoch": 0.9735157071997596, "flos": 16575539771520.0, "grad_norm": 15.841699905906337, "language_loss": 0.88652003, "learning_rate": 7.336841261255111e-09, "loss": 0.90094435, "num_input_tokens_seen": 349436830, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.23937988, "step": 16192, "time_per_iteration": 2.668213367462158 }, { "auxiliary_loss_clip": 0.01248944, "auxiliary_loss_mlp": 0.00218819, "balance_loss_clip": 1.03405869, "balance_loss_mlp": 0.19352314, "epoch": 0.9735758304524275, "flos": 20223237726720.0, "grad_norm": 16.508139538438147, "language_loss": 0.82374287, "learning_rate": 7.303550168837658e-09, "loss": 0.83842051, "num_input_tokens_seen": 349454325, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.25292969, "step": 16193, "time_per_iteration": 2.711557149887085 }, { "auxiliary_loss_clip": 0.01221649, "auxiliary_loss_mlp": 0.00203723, "balance_loss_clip": 1.01299214, "balance_loss_mlp": 0.17972568, "epoch": 0.9736359537050955, "flos": 23653353047040.0, "grad_norm": 7.090746767104533, "language_loss": 0.89834678, "learning_rate": 7.270334639669417e-09, "loss": 0.91260052, "num_input_tokens_seen": 349470230, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.2401123, "step": 16194, "time_per_iteration": 2.6752498149871826 }, { "auxiliary_loss_clip": 0.01219762, "auxiliary_loss_mlp": 0.00210079, "balance_loss_clip": 1.0103004, "balance_loss_mlp": 0.18736999, "epoch": 0.9736960769577634, "flos": 15560202026880.0, "grad_norm": 43.90341564661956, "language_loss": 0.82619631, "learning_rate": 7.237194675009828e-09, "loss": 0.84049469, "num_input_tokens_seen": 349486250, "router_z_loss_clip": 2.09277344, "router_z_loss_mlp": 0.22717285, "step": 16195, "time_per_iteration": 2.7164461612701416 }, { "auxiliary_loss_clip": 0.01078079, "auxiliary_loss_mlp": 0.00075072, "balance_loss_clip": 0.94246942, "balance_loss_mlp": 0.06863426, "epoch": 0.9737562002104314, "flos": 65351783088000.0, "grad_norm": 0.7422950747260958, "language_loss": 0.52011031, "learning_rate": 7.204130276115439e-09, "loss": 0.53164184, "num_input_tokens_seen": 349545865, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06445312, "step": 16196, "time_per_iteration": 3.1543633937835693 }, { "auxiliary_loss_clip": 0.0124413, "auxiliary_loss_mlp": 0.00217083, "balance_loss_clip": 1.02611375, "balance_loss_mlp": 0.19071397, "epoch": 0.9738163234630993, "flos": 27196730928000.0, "grad_norm": 26.70620439471528, "language_loss": 0.84123582, "learning_rate": 7.171141444240136e-09, "loss": 0.85584795, "num_input_tokens_seen": 349566080, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.26379395, "step": 16197, "time_per_iteration": 2.717935562133789 }, { "auxiliary_loss_clip": 0.01248778, "auxiliary_loss_mlp": 0.00235586, "balance_loss_clip": 1.03062844, "balance_loss_mlp": 0.20946687, "epoch": 0.9738764467157673, "flos": 21069365477760.0, "grad_norm": 34.91683933718339, "language_loss": 0.74952787, "learning_rate": 7.13822818063492e-09, "loss": 0.76437151, "num_input_tokens_seen": 349585665, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.26123047, "step": 16198, "time_per_iteration": 2.654101610183716 }, { "auxiliary_loss_clip": 0.0123683, "auxiliary_loss_mlp": 0.00218029, "balance_loss_clip": 1.02176261, "balance_loss_mlp": 0.19232726, "epoch": 0.9739365699684353, "flos": 21361211481600.0, "grad_norm": 56.73073765595318, "language_loss": 0.86036587, "learning_rate": 7.10539048654768e-09, "loss": 0.87491441, "num_input_tokens_seen": 349605125, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.25683594, "step": 16199, "time_per_iteration": 2.7174887657165527 }, { "auxiliary_loss_clip": 0.01245056, "auxiliary_loss_mlp": 0.00227704, "balance_loss_clip": 1.02454495, "balance_loss_mlp": 0.20119202, "epoch": 0.9739966932211033, "flos": 21902061542400.0, "grad_norm": 2.890389247197782, "language_loss": 0.86548007, "learning_rate": 7.072628363223865e-09, "loss": 0.88020766, "num_input_tokens_seen": 349623360, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.26525879, "step": 16200, "time_per_iteration": 2.6927223205566406 }, { "auxiliary_loss_clip": 0.01260854, "auxiliary_loss_mlp": 0.00223019, "balance_loss_clip": 1.03597021, "balance_loss_mlp": 0.19467102, "epoch": 0.9740568164737712, "flos": 24827345164800.0, "grad_norm": 202.21681968890906, "language_loss": 0.79432976, "learning_rate": 7.039941811905592e-09, "loss": 0.80916852, "num_input_tokens_seen": 349644390, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.28356934, "step": 16201, "time_per_iteration": 2.756596565246582 }, { "auxiliary_loss_clip": 0.01233989, "auxiliary_loss_mlp": 0.00224309, "balance_loss_clip": 1.01973724, "balance_loss_mlp": 0.19929919, "epoch": 0.9741169397264392, "flos": 23623583650560.0, "grad_norm": 136.1084581379726, "language_loss": 0.78499126, "learning_rate": 7.0073308338325364e-09, "loss": 0.79957426, "num_input_tokens_seen": 349663200, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.25, "step": 16202, "time_per_iteration": 2.7647225856781006 }, { "auxiliary_loss_clip": 0.01245748, "auxiliary_loss_mlp": 0.00232941, "balance_loss_clip": 1.02516258, "balance_loss_mlp": 0.20635705, "epoch": 0.9741770629791072, "flos": 18841144164480.0, "grad_norm": 14.717247721433038, "language_loss": 0.80242538, "learning_rate": 6.974795430241265e-09, "loss": 0.81721228, "num_input_tokens_seen": 349681975, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.26611328, "step": 16203, "time_per_iteration": 2.6258127689361572 }, { "auxiliary_loss_clip": 0.01220691, "auxiliary_loss_mlp": 0.00212219, "balance_loss_clip": 1.01120138, "balance_loss_mlp": 0.18991506, "epoch": 0.9742371862317751, "flos": 22346241125760.0, "grad_norm": 118.90728882985239, "language_loss": 0.84442729, "learning_rate": 6.942335602365235e-09, "loss": 0.85875642, "num_input_tokens_seen": 349701185, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.22302246, "step": 16204, "time_per_iteration": 2.7884714603424072 }, { "auxiliary_loss_clip": 0.01239175, "auxiliary_loss_mlp": 0.00211778, "balance_loss_clip": 1.02524495, "balance_loss_mlp": 0.18843725, "epoch": 0.9742973094844432, "flos": 21762764599680.0, "grad_norm": 6.89950299509813, "language_loss": 0.88741112, "learning_rate": 6.909951351435905e-09, "loss": 0.90192062, "num_input_tokens_seen": 349720360, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.23352051, "step": 16205, "time_per_iteration": 2.694004774093628 }, { "auxiliary_loss_clip": 0.01223879, "auxiliary_loss_mlp": 0.00236659, "balance_loss_clip": 1.01132464, "balance_loss_mlp": 0.21324611, "epoch": 0.9743574327371111, "flos": 26248725227520.0, "grad_norm": 18.83700736305046, "language_loss": 0.81524849, "learning_rate": 6.87764267868074e-09, "loss": 0.82985389, "num_input_tokens_seen": 349741040, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.23425293, "step": 16206, "time_per_iteration": 2.796604871749878 }, { "auxiliary_loss_clip": 0.01235296, "auxiliary_loss_mlp": 0.00217832, "balance_loss_clip": 1.01692772, "balance_loss_mlp": 0.1927381, "epoch": 0.9744175559897791, "flos": 12349321367040.0, "grad_norm": 624.1739698970022, "language_loss": 0.94442916, "learning_rate": 6.8454095853252015e-09, "loss": 0.95896041, "num_input_tokens_seen": 349758895, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25134277, "step": 16207, "time_per_iteration": 2.6921350955963135 }, { "auxiliary_loss_clip": 0.01229222, "auxiliary_loss_mlp": 0.00222562, "balance_loss_clip": 1.01878762, "balance_loss_mlp": 0.19732541, "epoch": 0.974477679242447, "flos": 28397834835840.0, "grad_norm": 254.7772606763417, "language_loss": 0.76936287, "learning_rate": 6.813252072591425e-09, "loss": 0.78388071, "num_input_tokens_seen": 349779740, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.25244141, "step": 16208, "time_per_iteration": 2.767425537109375 }, { "auxiliary_loss_clip": 0.01219456, "auxiliary_loss_mlp": 0.00214422, "balance_loss_clip": 1.01535034, "balance_loss_mlp": 0.19233257, "epoch": 0.974537802495115, "flos": 17785370684160.0, "grad_norm": 108.3852404222588, "language_loss": 0.8151387, "learning_rate": 6.781170141698878e-09, "loss": 0.82947743, "num_input_tokens_seen": 349796820, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.22094727, "step": 16209, "time_per_iteration": 2.7001402378082275 }, { "auxiliary_loss_clip": 0.01221134, "auxiliary_loss_mlp": 0.00210737, "balance_loss_clip": 1.01225102, "balance_loss_mlp": 0.18767044, "epoch": 0.9745979257477829, "flos": 23842315520640.0, "grad_norm": 9.514206310568769, "language_loss": 0.88109934, "learning_rate": 6.749163793864144e-09, "loss": 0.89541805, "num_input_tokens_seen": 349816550, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.23059082, "step": 16210, "time_per_iteration": 2.7096471786499023 }, { "auxiliary_loss_clip": 0.0123871, "auxiliary_loss_mlp": 0.00231179, "balance_loss_clip": 1.01951218, "balance_loss_mlp": 0.20608559, "epoch": 0.9746580490004509, "flos": 27016172236800.0, "grad_norm": 13.173118903806367, "language_loss": 0.88108277, "learning_rate": 6.7172330303009176e-09, "loss": 0.8957817, "num_input_tokens_seen": 349834350, "router_z_loss_clip": 2.19433594, "router_z_loss_mlp": 0.25109863, "step": 16211, "time_per_iteration": 2.7420713901519775 }, { "auxiliary_loss_clip": 0.01253969, "auxiliary_loss_mlp": 0.00228304, "balance_loss_clip": 1.03026175, "balance_loss_mlp": 0.20253114, "epoch": 0.9747181722531189, "flos": 19792022952960.0, "grad_norm": 39.76931154596857, "language_loss": 0.89896613, "learning_rate": 6.685377852219787e-09, "loss": 0.9137888, "num_input_tokens_seen": 349853460, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.25793457, "step": 16212, "time_per_iteration": 2.75981068611145 }, { "auxiliary_loss_clip": 0.01223371, "auxiliary_loss_mlp": 0.00220139, "balance_loss_clip": 1.01234388, "balance_loss_mlp": 0.19567722, "epoch": 0.9747782955057869, "flos": 31430598929280.0, "grad_norm": 507.7371110739221, "language_loss": 0.86659813, "learning_rate": 6.653598260829118e-09, "loss": 0.88103318, "num_input_tokens_seen": 349874830, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.24462891, "step": 16213, "time_per_iteration": 2.8016834259033203 }, { "auxiliary_loss_clip": 0.01227509, "auxiliary_loss_mlp": 0.00225488, "balance_loss_clip": 1.01001608, "balance_loss_mlp": 0.19984564, "epoch": 0.9748384187584548, "flos": 15961288268160.0, "grad_norm": 8.754681251521369, "language_loss": 0.77145827, "learning_rate": 6.6218942573335044e-09, "loss": 0.78598821, "num_input_tokens_seen": 349893690, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25634766, "step": 16214, "time_per_iteration": 2.6834027767181396 }, { "auxiliary_loss_clip": 0.01253431, "auxiliary_loss_mlp": 0.00241599, "balance_loss_clip": 1.02692294, "balance_loss_mlp": 0.21280977, "epoch": 0.9748985420111228, "flos": 20558715776640.0, "grad_norm": 107.0960892995615, "language_loss": 0.86019433, "learning_rate": 6.5902658429355386e-09, "loss": 0.8751446, "num_input_tokens_seen": 349912480, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.28796387, "step": 16215, "time_per_iteration": 2.7155840396881104 }, { "auxiliary_loss_clip": 0.0122002, "auxiliary_loss_mlp": 0.00220182, "balance_loss_clip": 1.01131856, "balance_loss_mlp": 0.19575624, "epoch": 0.9749586652637908, "flos": 36721605127680.0, "grad_norm": 44.35629571278392, "language_loss": 0.75383413, "learning_rate": 6.558713018834483e-09, "loss": 0.76823616, "num_input_tokens_seen": 349932470, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.24438477, "step": 16216, "time_per_iteration": 2.8109610080718994 }, { "auxiliary_loss_clip": 0.01246105, "auxiliary_loss_mlp": 0.00234674, "balance_loss_clip": 1.02770948, "balance_loss_mlp": 0.20799449, "epoch": 0.9750187885164587, "flos": 10999223844480.0, "grad_norm": 36.24676825344804, "language_loss": 0.80790502, "learning_rate": 6.527235786226937e-09, "loss": 0.82271278, "num_input_tokens_seen": 349949060, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.26696777, "step": 16217, "time_per_iteration": 2.6724417209625244 }, { "auxiliary_loss_clip": 0.01231546, "auxiliary_loss_mlp": 0.00211608, "balance_loss_clip": 1.02012753, "balance_loss_mlp": 0.18730086, "epoch": 0.9750789117691268, "flos": 25739512070400.0, "grad_norm": 124.51398956129525, "language_loss": 0.84845591, "learning_rate": 6.495834146306167e-09, "loss": 0.8628875, "num_input_tokens_seen": 349968010, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.24316406, "step": 16218, "time_per_iteration": 2.7818009853363037 }, { "auxiliary_loss_clip": 0.01217979, "auxiliary_loss_mlp": 0.00220499, "balance_loss_clip": 1.00905657, "balance_loss_mlp": 0.19663364, "epoch": 0.9751390350217947, "flos": 13333955961600.0, "grad_norm": 22.242053712478572, "language_loss": 0.89091384, "learning_rate": 6.464508100263222e-09, "loss": 0.90529859, "num_input_tokens_seen": 349985270, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.23840332, "step": 16219, "time_per_iteration": 2.647954225540161 }, { "auxiliary_loss_clip": 0.01240323, "auxiliary_loss_mlp": 0.00233117, "balance_loss_clip": 1.02108002, "balance_loss_mlp": 0.2079404, "epoch": 0.9751991582744627, "flos": 22820621068800.0, "grad_norm": 7.2529682929922705, "language_loss": 0.87173021, "learning_rate": 6.433257649285817e-09, "loss": 0.8864646, "num_input_tokens_seen": 350003935, "router_z_loss_clip": 2.19238281, "router_z_loss_mlp": 0.25183105, "step": 16220, "time_per_iteration": 2.706794500350952 }, { "auxiliary_loss_clip": 0.01227577, "auxiliary_loss_mlp": 0.00216821, "balance_loss_clip": 1.0131408, "balance_loss_mlp": 0.1916679, "epoch": 0.9752592815271306, "flos": 19646189735040.0, "grad_norm": 44.00887711918814, "language_loss": 0.82914245, "learning_rate": 6.402082794559227e-09, "loss": 0.84358644, "num_input_tokens_seen": 350023595, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.25146484, "step": 16221, "time_per_iteration": 2.6257498264312744 }, { "auxiliary_loss_clip": 0.01227366, "auxiliary_loss_mlp": 0.00221809, "balance_loss_clip": 1.01502728, "balance_loss_mlp": 0.19691788, "epoch": 0.9753194047797986, "flos": 26690462686080.0, "grad_norm": 5.316208194262544, "language_loss": 0.72151834, "learning_rate": 6.370983537265395e-09, "loss": 0.73601007, "num_input_tokens_seen": 350045920, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.24890137, "step": 16222, "time_per_iteration": 4.162256240844727 }, { "auxiliary_loss_clip": 0.0122852, "auxiliary_loss_mlp": 0.00224673, "balance_loss_clip": 1.01826024, "balance_loss_mlp": 0.20034245, "epoch": 0.9753795280324665, "flos": 23221779137280.0, "grad_norm": 11.257583943360041, "language_loss": 0.96743888, "learning_rate": 6.3399598785836004e-09, "loss": 0.98197079, "num_input_tokens_seen": 350063925, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.24316406, "step": 16223, "time_per_iteration": 4.177023410797119 }, { "auxiliary_loss_clip": 0.01213111, "auxiliary_loss_mlp": 0.00213872, "balance_loss_clip": 1.00939155, "balance_loss_mlp": 0.18982738, "epoch": 0.9754396512851345, "flos": 19463835363840.0, "grad_norm": 3.3906571597513087, "language_loss": 0.81838667, "learning_rate": 6.309011819690457e-09, "loss": 0.8326565, "num_input_tokens_seen": 350080900, "router_z_loss_clip": 2.03613281, "router_z_loss_mlp": 0.24035645, "step": 16224, "time_per_iteration": 2.689598321914673 }, { "auxiliary_loss_clip": 0.01081995, "auxiliary_loss_mlp": 0.00095061, "balance_loss_clip": 0.94338739, "balance_loss_mlp": 0.08790831, "epoch": 0.9754997745378025, "flos": 68459313340800.0, "grad_norm": 0.822374471881492, "language_loss": 0.57825315, "learning_rate": 6.278139361759249e-09, "loss": 0.59002376, "num_input_tokens_seen": 350144550, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.07128906, "step": 16225, "time_per_iteration": 3.1108663082122803 }, { "auxiliary_loss_clip": 0.01239274, "auxiliary_loss_mlp": 0.00225892, "balance_loss_clip": 1.02353072, "balance_loss_mlp": 0.20236011, "epoch": 0.9755598977904705, "flos": 26395168976640.0, "grad_norm": 29.162433941785103, "language_loss": 0.75766957, "learning_rate": 6.247342505960818e-09, "loss": 0.77232122, "num_input_tokens_seen": 350164050, "router_z_loss_clip": 2.15917969, "router_z_loss_mlp": 0.23547363, "step": 16226, "time_per_iteration": 2.7377209663391113 }, { "auxiliary_loss_clip": 0.01226362, "auxiliary_loss_mlp": 0.00236603, "balance_loss_clip": 1.01267838, "balance_loss_mlp": 0.21026964, "epoch": 0.9756200210431384, "flos": 16617663446400.0, "grad_norm": 77.69331581242963, "language_loss": 0.89924324, "learning_rate": 6.216621253462894e-09, "loss": 0.91387284, "num_input_tokens_seen": 350181350, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.26330566, "step": 16227, "time_per_iteration": 2.6521334648132324 }, { "auxiliary_loss_clip": 0.01226087, "auxiliary_loss_mlp": 0.00216341, "balance_loss_clip": 1.01378274, "balance_loss_mlp": 0.19372651, "epoch": 0.9756801442958064, "flos": 23623044946560.0, "grad_norm": 48.88447484243357, "language_loss": 0.82017934, "learning_rate": 6.185975605430549e-09, "loss": 0.83460367, "num_input_tokens_seen": 350199765, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.22631836, "step": 16228, "time_per_iteration": 2.719388961791992 }, { "auxiliary_loss_clip": 0.01081534, "auxiliary_loss_mlp": 0.00100444, "balance_loss_clip": 0.94448125, "balance_loss_mlp": 0.09319642, "epoch": 0.9757402675484744, "flos": 61625799440640.0, "grad_norm": 0.8205117420542077, "language_loss": 0.54928732, "learning_rate": 6.155405563025962e-09, "loss": 0.5611071, "num_input_tokens_seen": 350256420, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07226562, "step": 16229, "time_per_iteration": 3.0592432022094727 }, { "auxiliary_loss_clip": 0.01244379, "auxiliary_loss_mlp": 0.00221963, "balance_loss_clip": 1.02703214, "balance_loss_mlp": 0.19707173, "epoch": 0.9758003908011423, "flos": 24058964401920.0, "grad_norm": 9.702327605002319, "language_loss": 0.81072885, "learning_rate": 6.124911127407984e-09, "loss": 0.82539225, "num_input_tokens_seen": 350276270, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.24890137, "step": 16230, "time_per_iteration": 2.7876923084259033 }, { "auxiliary_loss_clip": 0.01233081, "auxiliary_loss_mlp": 0.00213242, "balance_loss_clip": 1.02079844, "balance_loss_mlp": 0.18851823, "epoch": 0.9758605140538104, "flos": 17493093717120.0, "grad_norm": 27.675026655545082, "language_loss": 0.78647989, "learning_rate": 6.094492299733245e-09, "loss": 0.80094314, "num_input_tokens_seen": 350295000, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.24719238, "step": 16231, "time_per_iteration": 4.214065790176392 }, { "auxiliary_loss_clip": 0.01257066, "auxiliary_loss_mlp": 0.00227646, "balance_loss_clip": 1.03174448, "balance_loss_mlp": 0.2022664, "epoch": 0.9759206373064783, "flos": 24826950115200.0, "grad_norm": 464.5336924419608, "language_loss": 0.87547898, "learning_rate": 6.064149081155267e-09, "loss": 0.89032608, "num_input_tokens_seen": 350314980, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.25366211, "step": 16232, "time_per_iteration": 4.183711767196655 }, { "auxiliary_loss_clip": 0.0108685, "auxiliary_loss_mlp": 0.00079617, "balance_loss_clip": 0.94937658, "balance_loss_mlp": 0.07317932, "epoch": 0.9759807605591463, "flos": 68161182456960.0, "grad_norm": 0.7148727283411961, "language_loss": 0.53418088, "learning_rate": 6.033881472824465e-09, "loss": 0.54584551, "num_input_tokens_seen": 350371985, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06445312, "step": 16233, "time_per_iteration": 2.9971158504486084 }, { "auxiliary_loss_clip": 0.01223377, "auxiliary_loss_mlp": 0.00220019, "balance_loss_clip": 1.01064324, "balance_loss_mlp": 0.19435287, "epoch": 0.9760408838118142, "flos": 18989239939200.0, "grad_norm": 1559.686644095083, "language_loss": 0.79997528, "learning_rate": 6.003689475888807e-09, "loss": 0.81440926, "num_input_tokens_seen": 350390590, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.25671387, "step": 16234, "time_per_iteration": 2.667994499206543 }, { "auxiliary_loss_clip": 0.01255573, "auxiliary_loss_mlp": 0.00263549, "balance_loss_clip": 1.0342927, "balance_loss_mlp": 0.23621476, "epoch": 0.9761010070644822, "flos": 17125978763520.0, "grad_norm": 9117.643117787062, "language_loss": 0.88877207, "learning_rate": 5.973573091493156e-09, "loss": 0.90396321, "num_input_tokens_seen": 350403770, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.27294922, "step": 16235, "time_per_iteration": 2.6228573322296143 }, { "auxiliary_loss_clip": 0.01241167, "auxiliary_loss_mlp": 0.00225065, "balance_loss_clip": 1.02503586, "balance_loss_mlp": 0.19947119, "epoch": 0.9761611303171501, "flos": 22052599441920.0, "grad_norm": 42.85426679509969, "language_loss": 0.84784937, "learning_rate": 5.943532320779265e-09, "loss": 0.86251163, "num_input_tokens_seen": 350421870, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25610352, "step": 16236, "time_per_iteration": 2.6841397285461426 }, { "auxiliary_loss_clip": 0.01225012, "auxiliary_loss_mlp": 0.00207278, "balance_loss_clip": 1.01322007, "balance_loss_mlp": 0.1838291, "epoch": 0.9762212535698181, "flos": 21757521214080.0, "grad_norm": 2.7229030867723676, "language_loss": 0.8113538, "learning_rate": 5.913567164886446e-09, "loss": 0.82567668, "num_input_tokens_seen": 350440025, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.23461914, "step": 16237, "time_per_iteration": 2.647490978240967 }, { "auxiliary_loss_clip": 0.01232162, "auxiliary_loss_mlp": 0.00233487, "balance_loss_clip": 1.01799655, "balance_loss_mlp": 0.20665297, "epoch": 0.9762813768224861, "flos": 25921615046400.0, "grad_norm": 575.6691389052797, "language_loss": 0.81598473, "learning_rate": 5.8836776249509e-09, "loss": 0.83064127, "num_input_tokens_seen": 350459435, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.26867676, "step": 16238, "time_per_iteration": 2.737009286880493 }, { "auxiliary_loss_clip": 0.01235335, "auxiliary_loss_mlp": 0.00216492, "balance_loss_clip": 1.01904273, "balance_loss_mlp": 0.19167233, "epoch": 0.9763415000751541, "flos": 24051853509120.0, "grad_norm": 4.330286897739265, "language_loss": 0.91517144, "learning_rate": 5.8538637021063875e-09, "loss": 0.92968971, "num_input_tokens_seen": 350472655, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24804688, "step": 16239, "time_per_iteration": 2.633209228515625 }, { "auxiliary_loss_clip": 0.01255109, "auxiliary_loss_mlp": 0.00220956, "balance_loss_clip": 1.03179741, "balance_loss_mlp": 0.19525474, "epoch": 0.976401623327822, "flos": 17018677860480.0, "grad_norm": 298.2840742130606, "language_loss": 0.72634709, "learning_rate": 5.824125397483115e-09, "loss": 0.74110776, "num_input_tokens_seen": 350488160, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25732422, "step": 16240, "time_per_iteration": 2.6207425594329834 }, { "auxiliary_loss_clip": 0.01231404, "auxiliary_loss_mlp": 0.00240589, "balance_loss_clip": 1.02119827, "balance_loss_mlp": 0.21565083, "epoch": 0.97646174658049, "flos": 16106941918080.0, "grad_norm": 6.848011378138832, "language_loss": 0.90181112, "learning_rate": 5.7944627122088474e-09, "loss": 0.91653103, "num_input_tokens_seen": 350506065, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.24963379, "step": 16241, "time_per_iteration": 2.616647481918335 }, { "auxiliary_loss_clip": 0.01234329, "auxiliary_loss_mlp": 0.00238372, "balance_loss_clip": 1.02063894, "balance_loss_mlp": 0.21329011, "epoch": 0.9765218698331579, "flos": 21252725429760.0, "grad_norm": 14.502495673999231, "language_loss": 0.90238667, "learning_rate": 5.764875647408463e-09, "loss": 0.91711366, "num_input_tokens_seen": 350524495, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.25097656, "step": 16242, "time_per_iteration": 2.736884355545044 }, { "auxiliary_loss_clip": 0.01260368, "auxiliary_loss_mlp": 0.00232511, "balance_loss_clip": 1.03687882, "balance_loss_mlp": 0.20545033, "epoch": 0.9765819930858259, "flos": 18588045957120.0, "grad_norm": 73.41753821292835, "language_loss": 0.84038484, "learning_rate": 5.7353642042037294e-09, "loss": 0.85531366, "num_input_tokens_seen": 350544185, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.27062988, "step": 16243, "time_per_iteration": 2.6889610290527344 }, { "auxiliary_loss_clip": 0.01251989, "auxiliary_loss_mlp": 0.00241154, "balance_loss_clip": 1.03429818, "balance_loss_mlp": 0.21428385, "epoch": 0.976642116338494, "flos": 20266833859200.0, "grad_norm": 39.05945211415106, "language_loss": 0.77910221, "learning_rate": 5.705928383713754e-09, "loss": 0.79403359, "num_input_tokens_seen": 350562675, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.26879883, "step": 16244, "time_per_iteration": 2.7319159507751465 }, { "auxiliary_loss_clip": 0.01265688, "auxiliary_loss_mlp": 0.00238714, "balance_loss_clip": 1.0400219, "balance_loss_mlp": 0.21080747, "epoch": 0.9767022395911619, "flos": 25550477769600.0, "grad_norm": 20.51221854786558, "language_loss": 0.91079926, "learning_rate": 5.676568187055197e-09, "loss": 0.9258433, "num_input_tokens_seen": 350581535, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.27905273, "step": 16245, "time_per_iteration": 2.6892638206481934 }, { "auxiliary_loss_clip": 0.01211263, "auxiliary_loss_mlp": 0.0018415, "balance_loss_clip": 1.0044744, "balance_loss_mlp": 0.16211993, "epoch": 0.9767623628438299, "flos": 21762656858880.0, "grad_norm": 5.77747716387729, "language_loss": 0.84085852, "learning_rate": 5.647283615340726e-09, "loss": 0.85481262, "num_input_tokens_seen": 350601615, "router_z_loss_clip": 2.07128906, "router_z_loss_mlp": 0.22033691, "step": 16246, "time_per_iteration": 2.6546645164489746 }, { "auxiliary_loss_clip": 0.0120461, "auxiliary_loss_mlp": 0.00201288, "balance_loss_clip": 1.00142276, "balance_loss_mlp": 0.17849557, "epoch": 0.9768224860964978, "flos": 15851114277120.0, "grad_norm": 3.7837761873468234, "language_loss": 0.81544459, "learning_rate": 5.6180746696812275e-09, "loss": 0.8295036, "num_input_tokens_seen": 350619580, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.22790527, "step": 16247, "time_per_iteration": 2.6310787200927734 }, { "auxiliary_loss_clip": 0.01233321, "auxiliary_loss_mlp": 0.00220498, "balance_loss_clip": 1.01917517, "balance_loss_mlp": 0.19555925, "epoch": 0.9768826093491658, "flos": 25151151294720.0, "grad_norm": 124.7805352865306, "language_loss": 0.87772298, "learning_rate": 5.58894135118404e-09, "loss": 0.89226115, "num_input_tokens_seen": 350640015, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.24951172, "step": 16248, "time_per_iteration": 2.711103916168213 }, { "auxiliary_loss_clip": 0.0128193, "auxiliary_loss_mlp": 0.00237763, "balance_loss_clip": 1.04828215, "balance_loss_mlp": 0.21038045, "epoch": 0.9769427326018337, "flos": 22967028904320.0, "grad_norm": 9.741223619467215, "language_loss": 0.87144804, "learning_rate": 5.559883660954278e-09, "loss": 0.88664496, "num_input_tokens_seen": 350659155, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.27368164, "step": 16249, "time_per_iteration": 2.685798168182373 }, { "auxiliary_loss_clip": 0.0123218, "auxiliary_loss_mlp": 0.00232307, "balance_loss_clip": 1.02219558, "balance_loss_mlp": 0.20821477, "epoch": 0.9770028558545018, "flos": 15264297786240.0, "grad_norm": 3.463755993551968, "language_loss": 0.74200439, "learning_rate": 5.530901600093507e-09, "loss": 0.75664926, "num_input_tokens_seen": 350676615, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.24108887, "step": 16250, "time_per_iteration": 2.6292941570281982 }, { "auxiliary_loss_clip": 0.01085779, "auxiliary_loss_mlp": 0.00096035, "balance_loss_clip": 0.94753325, "balance_loss_mlp": 0.08955018, "epoch": 0.9770629791071697, "flos": 71450348808960.0, "grad_norm": 0.8068208371561941, "language_loss": 0.58551288, "learning_rate": 5.501995169700846e-09, "loss": 0.59733105, "num_input_tokens_seen": 350736805, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.06494141, "step": 16251, "time_per_iteration": 3.2168140411376953 }, { "auxiliary_loss_clip": 0.01231139, "auxiliary_loss_mlp": 0.00226459, "balance_loss_clip": 1.01919329, "balance_loss_mlp": 0.20209301, "epoch": 0.9771231023598377, "flos": 22412854897920.0, "grad_norm": 2.272999807730126, "language_loss": 0.84602702, "learning_rate": 5.473164370872307e-09, "loss": 0.86060297, "num_input_tokens_seen": 350753600, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.2434082, "step": 16252, "time_per_iteration": 2.643819570541382 }, { "auxiliary_loss_clip": 0.01239095, "auxiliary_loss_mlp": 0.00211959, "balance_loss_clip": 1.02488732, "balance_loss_mlp": 0.18653166, "epoch": 0.9771832256125056, "flos": 19025940660480.0, "grad_norm": 11.18882064298086, "language_loss": 0.74528778, "learning_rate": 5.444409204701461e-09, "loss": 0.75979829, "num_input_tokens_seen": 350771225, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.2545166, "step": 16253, "time_per_iteration": 2.674994707107544 }, { "auxiliary_loss_clip": 0.01257267, "auxiliary_loss_mlp": 0.00232338, "balance_loss_clip": 1.0374074, "balance_loss_mlp": 0.20613563, "epoch": 0.9772433488651736, "flos": 17822143232640.0, "grad_norm": 27.64473994552685, "language_loss": 0.87091553, "learning_rate": 5.415729672278324e-09, "loss": 0.88581157, "num_input_tokens_seen": 350789100, "router_z_loss_clip": 2.20019531, "router_z_loss_mlp": 0.26196289, "step": 16254, "time_per_iteration": 2.6136586666107178 }, { "auxiliary_loss_clip": 0.01247793, "auxiliary_loss_mlp": 0.00218928, "balance_loss_clip": 1.02479386, "balance_loss_mlp": 0.19303623, "epoch": 0.9773034721178415, "flos": 37629785623680.0, "grad_norm": 23.395900252812392, "language_loss": 0.73954725, "learning_rate": 5.387125774690471e-09, "loss": 0.75421453, "num_input_tokens_seen": 350811085, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25927734, "step": 16255, "time_per_iteration": 2.907097578048706 }, { "auxiliary_loss_clip": 0.01253963, "auxiliary_loss_mlp": 0.00236953, "balance_loss_clip": 1.02722728, "balance_loss_mlp": 0.20850942, "epoch": 0.9773635953705095, "flos": 20302457172480.0, "grad_norm": 60.67629590516847, "language_loss": 0.85057545, "learning_rate": 5.358597513023033e-09, "loss": 0.8654846, "num_input_tokens_seen": 350831065, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.28430176, "step": 16256, "time_per_iteration": 2.74334454536438 }, { "auxiliary_loss_clip": 0.01228621, "auxiliary_loss_mlp": 0.00221578, "balance_loss_clip": 1.0203706, "balance_loss_mlp": 0.19814169, "epoch": 0.9774237186231776, "flos": 22309253095680.0, "grad_norm": 1022.2956328169155, "language_loss": 0.84368372, "learning_rate": 5.330144888357369e-09, "loss": 0.85818577, "num_input_tokens_seen": 350849675, "router_z_loss_clip": 2.07714844, "router_z_loss_mlp": 0.23425293, "step": 16257, "time_per_iteration": 2.703503131866455 }, { "auxiliary_loss_clip": 0.01235395, "auxiliary_loss_mlp": 0.00204463, "balance_loss_clip": 1.01809728, "balance_loss_mlp": 0.17907117, "epoch": 0.9774838418758455, "flos": 24204905360640.0, "grad_norm": 27.15278134334487, "language_loss": 0.83392042, "learning_rate": 5.301767901772391e-09, "loss": 0.84831893, "num_input_tokens_seen": 350868955, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25390625, "step": 16258, "time_per_iteration": 2.7626097202301025 }, { "auxiliary_loss_clip": 0.01079858, "auxiliary_loss_mlp": 0.00064478, "balance_loss_clip": 0.94313622, "balance_loss_mlp": 0.05799286, "epoch": 0.9775439651285135, "flos": 66357139829760.0, "grad_norm": 0.6517648830383103, "language_loss": 0.58762157, "learning_rate": 5.273466554344353e-09, "loss": 0.59906495, "num_input_tokens_seen": 350935110, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.06494141, "step": 16259, "time_per_iteration": 3.2399446964263916 }, { "auxiliary_loss_clip": 0.01263439, "auxiliary_loss_mlp": 0.00256942, "balance_loss_clip": 1.03842437, "balance_loss_mlp": 0.23072787, "epoch": 0.9776040883811814, "flos": 22601565976320.0, "grad_norm": 7.060549281545783, "language_loss": 0.80907238, "learning_rate": 5.2452408471461705e-09, "loss": 0.82427621, "num_input_tokens_seen": 350953220, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.26208496, "step": 16260, "time_per_iteration": 2.736387014389038 }, { "auxiliary_loss_clip": 0.01250372, "auxiliary_loss_mlp": 0.00218553, "balance_loss_clip": 1.02846956, "balance_loss_mlp": 0.19334051, "epoch": 0.9776642116338494, "flos": 18442176825600.0, "grad_norm": 17.085578628646612, "language_loss": 0.86664057, "learning_rate": 5.2170907812485456e-09, "loss": 0.88132983, "num_input_tokens_seen": 350971915, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25219727, "step": 16261, "time_per_iteration": 2.6369025707244873 }, { "auxiliary_loss_clip": 0.01240701, "auxiliary_loss_mlp": 0.00223241, "balance_loss_clip": 1.02483225, "balance_loss_mlp": 0.19800401, "epoch": 0.9777243348865173, "flos": 22638446265600.0, "grad_norm": 9.168908657971407, "language_loss": 0.82087898, "learning_rate": 5.189016357718845e-09, "loss": 0.83551842, "num_input_tokens_seen": 350990470, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.25268555, "step": 16262, "time_per_iteration": 2.672481060028076 }, { "auxiliary_loss_clip": 0.01258662, "auxiliary_loss_mlp": 0.00226998, "balance_loss_clip": 1.03781867, "balance_loss_mlp": 0.20093869, "epoch": 0.9777844581391854, "flos": 31321394605440.0, "grad_norm": 20.39665158937016, "language_loss": 0.82785302, "learning_rate": 5.16101757762133e-09, "loss": 0.84270966, "num_input_tokens_seen": 351010755, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26037598, "step": 16263, "time_per_iteration": 2.7242703437805176 }, { "auxiliary_loss_clip": 0.01247152, "auxiliary_loss_mlp": 0.00213649, "balance_loss_clip": 1.02817035, "balance_loss_mlp": 0.18843587, "epoch": 0.9778445813918533, "flos": 23039101543680.0, "grad_norm": 91.8419861418273, "language_loss": 0.7637046, "learning_rate": 5.133094442018038e-09, "loss": 0.77831262, "num_input_tokens_seen": 351029965, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.2520752, "step": 16264, "time_per_iteration": 4.064895868301392 }, { "auxiliary_loss_clip": 0.01258742, "auxiliary_loss_mlp": 0.00251361, "balance_loss_clip": 1.03412986, "balance_loss_mlp": 0.22357304, "epoch": 0.9779047046445213, "flos": 17566351505280.0, "grad_norm": 6.957040622787914, "language_loss": 0.80186272, "learning_rate": 5.105246951967679e-09, "loss": 0.81696373, "num_input_tokens_seen": 351046205, "router_z_loss_clip": 2.24316406, "router_z_loss_mlp": 0.27770996, "step": 16265, "time_per_iteration": 4.115423679351807 }, { "auxiliary_loss_clip": 0.01231166, "auxiliary_loss_mlp": 0.00204429, "balance_loss_clip": 1.01733613, "balance_loss_mlp": 0.17914446, "epoch": 0.9779648278971892, "flos": 20741141975040.0, "grad_norm": 2.9394924362489663, "language_loss": 0.76859546, "learning_rate": 5.077475108526297e-09, "loss": 0.78295135, "num_input_tokens_seen": 351065390, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.25280762, "step": 16266, "time_per_iteration": 2.6692798137664795 }, { "auxiliary_loss_clip": 0.01220781, "auxiliary_loss_mlp": 0.00215853, "balance_loss_clip": 1.00975513, "balance_loss_mlp": 0.19236879, "epoch": 0.9780249511498572, "flos": 21026954494080.0, "grad_norm": 58.197109480576614, "language_loss": 0.92261618, "learning_rate": 5.049778912747049e-09, "loss": 0.93698251, "num_input_tokens_seen": 351084355, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.23486328, "step": 16267, "time_per_iteration": 2.64871883392334 }, { "auxiliary_loss_clip": 0.01252033, "auxiliary_loss_mlp": 0.00230112, "balance_loss_clip": 1.02947712, "balance_loss_mlp": 0.20483944, "epoch": 0.9780850744025251, "flos": 30774223751040.0, "grad_norm": 6.8907062284222285, "language_loss": 0.80825776, "learning_rate": 5.022158365679985e-09, "loss": 0.82307923, "num_input_tokens_seen": 351105870, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25292969, "step": 16268, "time_per_iteration": 2.786489486694336 }, { "auxiliary_loss_clip": 0.01244154, "auxiliary_loss_mlp": 0.00218888, "balance_loss_clip": 1.02761769, "balance_loss_mlp": 0.19428343, "epoch": 0.9781451976551931, "flos": 20302995876480.0, "grad_norm": 18.62926350093626, "language_loss": 0.81520212, "learning_rate": 4.994613468372711e-09, "loss": 0.82983261, "num_input_tokens_seen": 351124760, "router_z_loss_clip": 2.16699219, "router_z_loss_mlp": 0.24645996, "step": 16269, "time_per_iteration": 2.6774442195892334 }, { "auxiliary_loss_clip": 0.01240105, "auxiliary_loss_mlp": 0.00216283, "balance_loss_clip": 1.02399457, "balance_loss_mlp": 0.19172561, "epoch": 0.9782053209078612, "flos": 24316479982080.0, "grad_norm": 7.2838684262576106, "language_loss": 0.79586476, "learning_rate": 4.967144221869501e-09, "loss": 0.81042862, "num_input_tokens_seen": 351142820, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.24536133, "step": 16270, "time_per_iteration": 2.84726881980896 }, { "auxiliary_loss_clip": 0.01260886, "auxiliary_loss_mlp": 0.00226677, "balance_loss_clip": 1.0387826, "balance_loss_mlp": 0.20083275, "epoch": 0.9782654441605291, "flos": 32489425065600.0, "grad_norm": 5.576487829372038, "language_loss": 0.74707645, "learning_rate": 4.939750627212191e-09, "loss": 0.7619521, "num_input_tokens_seen": 351164805, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.25842285, "step": 16271, "time_per_iteration": 2.75707745552063 }, { "auxiliary_loss_clip": 0.01240706, "auxiliary_loss_mlp": 0.00226163, "balance_loss_clip": 1.02858293, "balance_loss_mlp": 0.20276181, "epoch": 0.9783255674131971, "flos": 26979076465920.0, "grad_norm": 11.932406703058483, "language_loss": 0.77447152, "learning_rate": 4.912432685439505e-09, "loss": 0.78914022, "num_input_tokens_seen": 351187005, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.23413086, "step": 16272, "time_per_iteration": 2.800302267074585 }, { "auxiliary_loss_clip": 0.01246189, "auxiliary_loss_mlp": 0.00219347, "balance_loss_clip": 1.0297308, "balance_loss_mlp": 0.19551691, "epoch": 0.978385690665865, "flos": 23112251591040.0, "grad_norm": 1.8542855234406044, "language_loss": 0.7302947, "learning_rate": 4.88519039758728e-09, "loss": 0.74495006, "num_input_tokens_seen": 351208450, "router_z_loss_clip": 2.16699219, "router_z_loss_mlp": 0.23864746, "step": 16273, "time_per_iteration": 4.140228748321533 }, { "auxiliary_loss_clip": 0.01235557, "auxiliary_loss_mlp": 0.00217906, "balance_loss_clip": 1.02208853, "balance_loss_mlp": 0.19361128, "epoch": 0.978445813918533, "flos": 25409672455680.0, "grad_norm": 2071.381482089908, "language_loss": 0.81077611, "learning_rate": 4.85802376468869e-09, "loss": 0.82531077, "num_input_tokens_seen": 351229585, "router_z_loss_clip": 2.13574219, "router_z_loss_mlp": 0.24328613, "step": 16274, "time_per_iteration": 4.232133150100708 }, { "auxiliary_loss_clip": 0.0123357, "auxiliary_loss_mlp": 0.00199903, "balance_loss_clip": 1.01934755, "balance_loss_mlp": 0.17752695, "epoch": 0.9785059371712009, "flos": 23550218121600.0, "grad_norm": 961.1306205425983, "language_loss": 0.83969748, "learning_rate": 4.830932787773579e-09, "loss": 0.85403222, "num_input_tokens_seen": 351249525, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.22363281, "step": 16275, "time_per_iteration": 2.7170255184173584 }, { "auxiliary_loss_clip": 0.01242723, "auxiliary_loss_mlp": 0.00226682, "balance_loss_clip": 1.02387118, "balance_loss_mlp": 0.19946614, "epoch": 0.978566060423869, "flos": 34351177870080.0, "grad_norm": 8.287289845731094, "language_loss": 0.77747613, "learning_rate": 4.803917467869567e-09, "loss": 0.79217011, "num_input_tokens_seen": 351272530, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.27246094, "step": 16276, "time_per_iteration": 2.916033983230591 }, { "auxiliary_loss_clip": 0.01215125, "auxiliary_loss_mlp": 0.00236035, "balance_loss_clip": 1.00388944, "balance_loss_mlp": 0.21159747, "epoch": 0.9786261836765369, "flos": 11618862387840.0, "grad_norm": 3.158904131297437, "language_loss": 0.94788188, "learning_rate": 4.776977806000726e-09, "loss": 0.96239352, "num_input_tokens_seen": 351288530, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.24450684, "step": 16277, "time_per_iteration": 2.685847282409668 }, { "auxiliary_loss_clip": 0.01219534, "auxiliary_loss_mlp": 0.00202781, "balance_loss_clip": 1.01411283, "balance_loss_mlp": 0.1793322, "epoch": 0.9786863069292049, "flos": 17420949250560.0, "grad_norm": 25.384552093134108, "language_loss": 0.76931733, "learning_rate": 4.7501138031891264e-09, "loss": 0.78354043, "num_input_tokens_seen": 351305890, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.23449707, "step": 16278, "time_per_iteration": 2.7139861583709717 }, { "auxiliary_loss_clip": 0.01243086, "auxiliary_loss_mlp": 0.00245174, "balance_loss_clip": 1.02673936, "balance_loss_mlp": 0.2199139, "epoch": 0.9787464301818728, "flos": 20844923345280.0, "grad_norm": 5.926996921124684, "language_loss": 0.89178962, "learning_rate": 4.723325460453065e-09, "loss": 0.90667224, "num_input_tokens_seen": 351325010, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.25268555, "step": 16279, "time_per_iteration": 2.658730983734131 }, { "auxiliary_loss_clip": 0.01241513, "auxiliary_loss_mlp": 0.00234864, "balance_loss_clip": 1.0250001, "balance_loss_mlp": 0.21099798, "epoch": 0.9788065534345408, "flos": 18222942165120.0, "grad_norm": 4.360775846687186, "language_loss": 0.85680628, "learning_rate": 4.696612778808395e-09, "loss": 0.87157011, "num_input_tokens_seen": 351343060, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.2388916, "step": 16280, "time_per_iteration": 2.702376365661621 }, { "auxiliary_loss_clip": 0.01231615, "auxiliary_loss_mlp": 0.00237014, "balance_loss_clip": 1.02069116, "balance_loss_mlp": 0.21213463, "epoch": 0.9788666766872087, "flos": 21578219498880.0, "grad_norm": 6.5799659530107775, "language_loss": 0.85148239, "learning_rate": 4.669975759268085e-09, "loss": 0.86616868, "num_input_tokens_seen": 351363260, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.24853516, "step": 16281, "time_per_iteration": 2.717907667160034 }, { "auxiliary_loss_clip": 0.0123204, "auxiliary_loss_mlp": 0.00219559, "balance_loss_clip": 1.01826501, "balance_loss_mlp": 0.19472784, "epoch": 0.9789267999398767, "flos": 24900495212160.0, "grad_norm": 8.59731671448413, "language_loss": 0.88432813, "learning_rate": 4.643414402842216e-09, "loss": 0.89884418, "num_input_tokens_seen": 351382610, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24829102, "step": 16282, "time_per_iteration": 2.7499947547912598 }, { "auxiliary_loss_clip": 0.01230885, "auxiliary_loss_mlp": 0.00221829, "balance_loss_clip": 1.01782894, "balance_loss_mlp": 0.19765362, "epoch": 0.9789869231925448, "flos": 19573111514880.0, "grad_norm": 54.3564905762967, "language_loss": 0.91296953, "learning_rate": 4.616928710538204e-09, "loss": 0.92749667, "num_input_tokens_seen": 351401075, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24169922, "step": 16283, "time_per_iteration": 2.6116442680358887 }, { "auxiliary_loss_clip": 0.01232789, "auxiliary_loss_mlp": 0.00214414, "balance_loss_clip": 1.02102172, "balance_loss_mlp": 0.19050071, "epoch": 0.9790470464452127, "flos": 16796641939200.0, "grad_norm": 4.968554677013541, "language_loss": 0.78013003, "learning_rate": 4.590518683360134e-09, "loss": 0.79460204, "num_input_tokens_seen": 351419275, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.2388916, "step": 16284, "time_per_iteration": 2.6286303997039795 }, { "auxiliary_loss_clip": 0.01221895, "auxiliary_loss_mlp": 0.00193142, "balance_loss_clip": 1.01139832, "balance_loss_mlp": 0.17181505, "epoch": 0.9791071696978807, "flos": 18369350000640.0, "grad_norm": 31.419694620312537, "language_loss": 0.71195686, "learning_rate": 4.56418432230965e-09, "loss": 0.72610724, "num_input_tokens_seen": 351437375, "router_z_loss_clip": 2.10644531, "router_z_loss_mlp": 0.21325684, "step": 16285, "time_per_iteration": 2.6587142944335938 }, { "auxiliary_loss_clip": 0.01243948, "auxiliary_loss_mlp": 0.00245761, "balance_loss_clip": 1.02971506, "balance_loss_mlp": 0.22032146, "epoch": 0.9791672929505486, "flos": 24170323541760.0, "grad_norm": 2.2798835602713234, "language_loss": 0.76264369, "learning_rate": 4.537925628385286e-09, "loss": 0.77754074, "num_input_tokens_seen": 351457810, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.25439453, "step": 16286, "time_per_iteration": 2.6801750659942627 }, { "auxiliary_loss_clip": 0.01207596, "auxiliary_loss_mlp": 0.00253681, "balance_loss_clip": 1.00208676, "balance_loss_mlp": 0.2286348, "epoch": 0.9792274162032166, "flos": 24354114456960.0, "grad_norm": 4.98261146570125, "language_loss": 0.65179592, "learning_rate": 4.511742602582691e-09, "loss": 0.66640872, "num_input_tokens_seen": 351478825, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.25036621, "step": 16287, "time_per_iteration": 2.7861382961273193 }, { "auxiliary_loss_clip": 0.01227015, "auxiliary_loss_mlp": 0.0022689, "balance_loss_clip": 1.01643372, "balance_loss_mlp": 0.20109317, "epoch": 0.9792875394558845, "flos": 26395779507840.0, "grad_norm": 144.08679208986095, "language_loss": 0.8872, "learning_rate": 4.485635245894626e-09, "loss": 0.901739, "num_input_tokens_seen": 351498785, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.25805664, "step": 16288, "time_per_iteration": 2.6793642044067383 }, { "auxiliary_loss_clip": 0.01240272, "auxiliary_loss_mlp": 0.00241142, "balance_loss_clip": 1.02415478, "balance_loss_mlp": 0.21566735, "epoch": 0.9793476627085526, "flos": 28148004766080.0, "grad_norm": 6.473004714924833, "language_loss": 0.78820592, "learning_rate": 4.459603559311631e-09, "loss": 0.80302006, "num_input_tokens_seen": 351520235, "router_z_loss_clip": 2.16308594, "router_z_loss_mlp": 0.25500488, "step": 16289, "time_per_iteration": 2.767590284347534 }, { "auxiliary_loss_clip": 0.01243047, "auxiliary_loss_mlp": 0.00217871, "balance_loss_clip": 1.0285238, "balance_loss_mlp": 0.19393378, "epoch": 0.9794077859612205, "flos": 16763927627520.0, "grad_norm": 136.7993264209483, "language_loss": 0.85195804, "learning_rate": 4.43364754382003e-09, "loss": 0.86656713, "num_input_tokens_seen": 351538900, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.23962402, "step": 16290, "time_per_iteration": 2.605208396911621 }, { "auxiliary_loss_clip": 0.01247117, "auxiliary_loss_mlp": 0.00233734, "balance_loss_clip": 1.02609658, "balance_loss_mlp": 0.20920098, "epoch": 0.9794679092138885, "flos": 19280834547840.0, "grad_norm": 7.332824560948242, "language_loss": 0.73815191, "learning_rate": 4.4077672004048105e-09, "loss": 0.75296038, "num_input_tokens_seen": 351558715, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24536133, "step": 16291, "time_per_iteration": 2.6688232421875 }, { "auxiliary_loss_clip": 0.01247259, "auxiliary_loss_mlp": 0.0020966, "balance_loss_clip": 1.02847242, "balance_loss_mlp": 0.1840423, "epoch": 0.9795280324665564, "flos": 32156640535680.0, "grad_norm": 12.046315721228824, "language_loss": 0.71307516, "learning_rate": 4.3819625300467456e-09, "loss": 0.72764426, "num_input_tokens_seen": 351578450, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25622559, "step": 16292, "time_per_iteration": 2.7112526893615723 }, { "auxiliary_loss_clip": 0.01245147, "auxiliary_loss_mlp": 0.00210536, "balance_loss_clip": 1.0248127, "balance_loss_mlp": 0.18512104, "epoch": 0.9795881557192244, "flos": 19060953442560.0, "grad_norm": 69.02848360732574, "language_loss": 0.8175298, "learning_rate": 4.356233533724829e-09, "loss": 0.83208668, "num_input_tokens_seen": 351597195, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.25439453, "step": 16293, "time_per_iteration": 2.7284018993377686 }, { "auxiliary_loss_clip": 0.01250461, "auxiliary_loss_mlp": 0.00226605, "balance_loss_clip": 1.0285604, "balance_loss_mlp": 0.19912696, "epoch": 0.9796482789718923, "flos": 28329928174080.0, "grad_norm": 21.383546625368126, "language_loss": 0.9059186, "learning_rate": 4.330580212414503e-09, "loss": 0.92068923, "num_input_tokens_seen": 351617460, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.27490234, "step": 16294, "time_per_iteration": 2.744008779525757 }, { "auxiliary_loss_clip": 0.01219376, "auxiliary_loss_mlp": 0.00214979, "balance_loss_clip": 1.01340175, "balance_loss_mlp": 0.19052938, "epoch": 0.9797084022245603, "flos": 17967976450560.0, "grad_norm": 87.98606562165072, "language_loss": 0.80753756, "learning_rate": 4.305002567088767e-09, "loss": 0.82188118, "num_input_tokens_seen": 351635900, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.24462891, "step": 16295, "time_per_iteration": 2.7190592288970947 }, { "auxiliary_loss_clip": 0.01254276, "auxiliary_loss_mlp": 0.0022086, "balance_loss_clip": 1.03292453, "balance_loss_mlp": 0.19563575, "epoch": 0.9797685254772284, "flos": 20266726118400.0, "grad_norm": 115.48868009921229, "language_loss": 0.88724929, "learning_rate": 4.2795005987170674e-09, "loss": 0.90200067, "num_input_tokens_seen": 351655400, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.25256348, "step": 16296, "time_per_iteration": 2.686901807785034 }, { "auxiliary_loss_clip": 0.01240046, "auxiliary_loss_mlp": 0.00215379, "balance_loss_clip": 1.02332687, "balance_loss_mlp": 0.19148964, "epoch": 0.9798286487298963, "flos": 26907147480960.0, "grad_norm": 9.175257161792059, "language_loss": 0.82807332, "learning_rate": 4.254074308266853e-09, "loss": 0.84262753, "num_input_tokens_seen": 351675505, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.2388916, "step": 16297, "time_per_iteration": 2.735041856765747 }, { "auxiliary_loss_clip": 0.01233555, "auxiliary_loss_mlp": 0.0022653, "balance_loss_clip": 1.0162859, "balance_loss_mlp": 0.20086384, "epoch": 0.9798887719825643, "flos": 27161071701120.0, "grad_norm": 21.97678809805317, "language_loss": 0.85470533, "learning_rate": 4.228723696702019e-09, "loss": 0.86930621, "num_input_tokens_seen": 351697920, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25646973, "step": 16298, "time_per_iteration": 2.8203988075256348 }, { "auxiliary_loss_clip": 0.01229653, "auxiliary_loss_mlp": 0.0020252, "balance_loss_clip": 1.0161252, "balance_loss_mlp": 0.17894034, "epoch": 0.9799488952352322, "flos": 20668422890880.0, "grad_norm": 839.7778042942178, "language_loss": 0.80217326, "learning_rate": 4.203448764984019e-09, "loss": 0.81649506, "num_input_tokens_seen": 351717615, "router_z_loss_clip": 2.13574219, "router_z_loss_mlp": 0.2355957, "step": 16299, "time_per_iteration": 2.688943862915039 }, { "auxiliary_loss_clip": 0.01242693, "auxiliary_loss_mlp": 0.00251464, "balance_loss_clip": 1.02400422, "balance_loss_mlp": 0.22595364, "epoch": 0.9800090184879002, "flos": 21981209160960.0, "grad_norm": 87.30744216215206, "language_loss": 0.96241319, "learning_rate": 4.178249514071419e-09, "loss": 0.97735476, "num_input_tokens_seen": 351735260, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25524902, "step": 16300, "time_per_iteration": 2.68994402885437 }, { "auxiliary_loss_clip": 0.01264652, "auxiliary_loss_mlp": 0.00248221, "balance_loss_clip": 1.03562045, "balance_loss_mlp": 0.22130369, "epoch": 0.9800691417405681, "flos": 21288420570240.0, "grad_norm": 13.587509752242063, "language_loss": 0.87498879, "learning_rate": 4.1531259449194555e-09, "loss": 0.89011753, "num_input_tokens_seen": 351755800, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.26916504, "step": 16301, "time_per_iteration": 2.708635091781616 }, { "auxiliary_loss_clip": 0.01246035, "auxiliary_loss_mlp": 0.0020576, "balance_loss_clip": 1.02700615, "balance_loss_mlp": 0.18098858, "epoch": 0.9801292649932362, "flos": 18439878355200.0, "grad_norm": 15.051102795327642, "language_loss": 0.83959901, "learning_rate": 4.128078058480921e-09, "loss": 0.85411698, "num_input_tokens_seen": 351774790, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.2479248, "step": 16302, "time_per_iteration": 2.7335031032562256 }, { "auxiliary_loss_clip": 0.01247061, "auxiliary_loss_mlp": 0.00221472, "balance_loss_clip": 1.02949739, "balance_loss_mlp": 0.19546065, "epoch": 0.9801893882459041, "flos": 25046364343680.0, "grad_norm": 42.40916283866196, "language_loss": 0.87303162, "learning_rate": 4.103105855705724e-09, "loss": 0.88771695, "num_input_tokens_seen": 351792855, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.25976562, "step": 16303, "time_per_iteration": 2.6880338191986084 }, { "auxiliary_loss_clip": 0.0124373, "auxiliary_loss_mlp": 0.00225144, "balance_loss_clip": 1.02299118, "balance_loss_mlp": 0.19965683, "epoch": 0.9802495114985721, "flos": 18511484117760.0, "grad_norm": 21.362129794492535, "language_loss": 0.94276816, "learning_rate": 4.078209337540883e-09, "loss": 0.95745689, "num_input_tokens_seen": 351811450, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25488281, "step": 16304, "time_per_iteration": 2.7117700576782227 }, { "auxiliary_loss_clip": 0.01211076, "auxiliary_loss_mlp": 0.00213844, "balance_loss_clip": 1.01137984, "balance_loss_mlp": 0.19165912, "epoch": 0.98030963475124, "flos": 21469841187840.0, "grad_norm": 7.513123927249419, "language_loss": 0.76263905, "learning_rate": 4.053388504930089e-09, "loss": 0.77688825, "num_input_tokens_seen": 351831960, "router_z_loss_clip": 1.99609375, "router_z_loss_mlp": 0.22192383, "step": 16305, "time_per_iteration": 2.718914747238159 }, { "auxiliary_loss_clip": 0.012406, "auxiliary_loss_mlp": 0.00219112, "balance_loss_clip": 1.0252192, "balance_loss_mlp": 0.19374451, "epoch": 0.980369758003908, "flos": 20412272027520.0, "grad_norm": 19.310188764008878, "language_loss": 0.81420398, "learning_rate": 4.028643358815032e-09, "loss": 0.82880116, "num_input_tokens_seen": 351851585, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.25390625, "step": 16306, "time_per_iteration": 4.041212320327759 }, { "auxiliary_loss_clip": 0.0121446, "auxiliary_loss_mlp": 0.00213713, "balance_loss_clip": 1.00596571, "balance_loss_mlp": 0.19091991, "epoch": 0.9804298812565759, "flos": 23399177431680.0, "grad_norm": 22.864723072124242, "language_loss": 0.79563648, "learning_rate": 4.00397390013385e-09, "loss": 0.80991817, "num_input_tokens_seen": 351871085, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.22802734, "step": 16307, "time_per_iteration": 2.7245934009552 }, { "auxiliary_loss_clip": 0.01208774, "auxiliary_loss_mlp": 0.00230744, "balance_loss_clip": 1.00564742, "balance_loss_mlp": 0.20781967, "epoch": 0.980490004509244, "flos": 23292666627840.0, "grad_norm": 9.04174134322885, "language_loss": 0.79470301, "learning_rate": 3.979380129822018e-09, "loss": 0.80909818, "num_input_tokens_seen": 351891775, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.22912598, "step": 16308, "time_per_iteration": 4.273146152496338 }, { "auxiliary_loss_clip": 0.01080238, "auxiliary_loss_mlp": 0.00062446, "balance_loss_clip": 0.94224513, "balance_loss_mlp": 0.05610429, "epoch": 0.980550127761912, "flos": 56051027798400.0, "grad_norm": 0.7371754450680106, "language_loss": 0.56852931, "learning_rate": 3.954862048811902e-09, "loss": 0.57995617, "num_input_tokens_seen": 351946770, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06347656, "step": 16309, "time_per_iteration": 3.0559093952178955 }, { "auxiliary_loss_clip": 0.01223614, "auxiliary_loss_mlp": 0.00192915, "balance_loss_clip": 1.01148129, "balance_loss_mlp": 0.16807216, "epoch": 0.9806102510145799, "flos": 25333290184320.0, "grad_norm": 54.04474305867808, "language_loss": 0.76143271, "learning_rate": 3.930419658033646e-09, "loss": 0.77559799, "num_input_tokens_seen": 351966155, "router_z_loss_clip": 2.12597656, "router_z_loss_mlp": 0.24841309, "step": 16310, "time_per_iteration": 2.7110953330993652 }, { "auxiliary_loss_clip": 0.01080591, "auxiliary_loss_mlp": 0.00054102, "balance_loss_clip": 0.94453526, "balance_loss_mlp": 0.04799844, "epoch": 0.9806703742672479, "flos": 67274837429760.0, "grad_norm": 0.8007131983654494, "language_loss": 0.54008162, "learning_rate": 3.906052958413841e-09, "loss": 0.55142856, "num_input_tokens_seen": 352031655, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06103516, "step": 16311, "time_per_iteration": 3.1736879348754883 }, { "auxiliary_loss_clip": 0.01236506, "auxiliary_loss_mlp": 0.00207719, "balance_loss_clip": 1.02348828, "balance_loss_mlp": 0.18295926, "epoch": 0.9807304975199158, "flos": 25228970110080.0, "grad_norm": 2208.536004938229, "language_loss": 0.86248791, "learning_rate": 3.881761950876638e-09, "loss": 0.87693012, "num_input_tokens_seen": 352051920, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24743652, "step": 16312, "time_per_iteration": 2.6949682235717773 }, { "auxiliary_loss_clip": 0.01215149, "auxiliary_loss_mlp": 0.00205405, "balance_loss_clip": 1.01005936, "balance_loss_mlp": 0.1827555, "epoch": 0.9807906207725838, "flos": 17456392995840.0, "grad_norm": 22.853361168194095, "language_loss": 0.72337198, "learning_rate": 3.8575466363430785e-09, "loss": 0.73757756, "num_input_tokens_seen": 352069315, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.22668457, "step": 16313, "time_per_iteration": 2.606459379196167 }, { "auxiliary_loss_clip": 0.01230707, "auxiliary_loss_mlp": 0.0021193, "balance_loss_clip": 1.0173384, "balance_loss_mlp": 0.1884577, "epoch": 0.9808507440252517, "flos": 21032413361280.0, "grad_norm": 45.61679953300054, "language_loss": 0.83077353, "learning_rate": 3.833407015731316e-09, "loss": 0.84519988, "num_input_tokens_seen": 352089480, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.23461914, "step": 16314, "time_per_iteration": 2.6594932079315186 }, { "auxiliary_loss_clip": 0.01082003, "auxiliary_loss_mlp": 0.00070798, "balance_loss_clip": 0.94410026, "balance_loss_mlp": 0.06417038, "epoch": 0.9809108672779198, "flos": 64044491598720.0, "grad_norm": 0.7043468409988751, "language_loss": 0.51217192, "learning_rate": 3.80934308995684e-09, "loss": 0.52369994, "num_input_tokens_seen": 352150000, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06640625, "step": 16315, "time_per_iteration": 3.164409637451172 }, { "auxiliary_loss_clip": 0.01241685, "auxiliary_loss_mlp": 0.00218571, "balance_loss_clip": 1.02328873, "balance_loss_mlp": 0.19409713, "epoch": 0.9809709905305877, "flos": 22780616296320.0, "grad_norm": 12.533499791220915, "language_loss": 0.75832176, "learning_rate": 3.785354859932033e-09, "loss": 0.7729243, "num_input_tokens_seen": 352170990, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24499512, "step": 16316, "time_per_iteration": 4.140840768814087 }, { "auxiliary_loss_clip": 0.01234327, "auxiliary_loss_mlp": 0.0022274, "balance_loss_clip": 1.01755857, "balance_loss_mlp": 0.19740823, "epoch": 0.9810311137832557, "flos": 37013415217920.0, "grad_norm": 12.389761143328121, "language_loss": 0.63903576, "learning_rate": 3.76144232656661e-09, "loss": 0.65360641, "num_input_tokens_seen": 352195335, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.25354004, "step": 16317, "time_per_iteration": 4.228930234909058 }, { "auxiliary_loss_clip": 0.01227885, "auxiliary_loss_mlp": 0.00219988, "balance_loss_clip": 1.01652622, "balance_loss_mlp": 0.19665906, "epoch": 0.9810912370359236, "flos": 18916305373440.0, "grad_norm": 146.41845677356477, "language_loss": 0.81286526, "learning_rate": 3.737605490767404e-09, "loss": 0.82734406, "num_input_tokens_seen": 352214170, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.2331543, "step": 16318, "time_per_iteration": 2.7606089115142822 }, { "auxiliary_loss_clip": 0.01235807, "auxiliary_loss_mlp": 0.00213653, "balance_loss_clip": 1.01957774, "balance_loss_mlp": 0.18811798, "epoch": 0.9811513602885916, "flos": 18441602208000.0, "grad_norm": 21.68365443455554, "language_loss": 0.90730274, "learning_rate": 3.7138443534383555e-09, "loss": 0.92179728, "num_input_tokens_seen": 352231470, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25524902, "step": 16319, "time_per_iteration": 2.6557388305664062 }, { "auxiliary_loss_clip": 0.01079175, "auxiliary_loss_mlp": 0.00078291, "balance_loss_clip": 0.94265497, "balance_loss_mlp": 0.07233032, "epoch": 0.9812114835412595, "flos": 68058945371520.0, "grad_norm": 0.7010632284204807, "language_loss": 0.52835375, "learning_rate": 3.6901589154803014e-09, "loss": 0.53992844, "num_input_tokens_seen": 352291770, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.05957031, "step": 16320, "time_per_iteration": 3.058997869491577 }, { "auxiliary_loss_clip": 0.01244882, "auxiliary_loss_mlp": 0.00237713, "balance_loss_clip": 1.02633214, "balance_loss_mlp": 0.21221446, "epoch": 0.9812716067939276, "flos": 25373007648000.0, "grad_norm": 30.879759936882248, "language_loss": 0.82124043, "learning_rate": 3.6665491777914116e-09, "loss": 0.83606637, "num_input_tokens_seen": 352310735, "router_z_loss_clip": 2.18457031, "router_z_loss_mlp": 0.25500488, "step": 16321, "time_per_iteration": 2.746926784515381 }, { "auxiliary_loss_clip": 0.01236353, "auxiliary_loss_mlp": 0.00196979, "balance_loss_clip": 1.02712727, "balance_loss_mlp": 0.17367369, "epoch": 0.9813317300465956, "flos": 22856818999680.0, "grad_norm": 19.959766840070778, "language_loss": 0.89244521, "learning_rate": 3.6430151412669698e-09, "loss": 0.90677851, "num_input_tokens_seen": 352329545, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.2331543, "step": 16322, "time_per_iteration": 2.8419382572174072 }, { "auxiliary_loss_clip": 0.01237849, "auxiliary_loss_mlp": 0.00229083, "balance_loss_clip": 1.01867974, "balance_loss_mlp": 0.20388207, "epoch": 0.9813918532992635, "flos": 23586954756480.0, "grad_norm": 18.700631654211502, "language_loss": 0.86929464, "learning_rate": 3.619556806799595e-09, "loss": 0.88396394, "num_input_tokens_seen": 352352080, "router_z_loss_clip": 2.19042969, "router_z_loss_mlp": 0.2520752, "step": 16323, "time_per_iteration": 2.7461953163146973 }, { "auxiliary_loss_clip": 0.01263506, "auxiliary_loss_mlp": 0.00221154, "balance_loss_clip": 1.03520489, "balance_loss_mlp": 0.1936402, "epoch": 0.9814519765519315, "flos": 19606328616960.0, "grad_norm": 14.959315904548063, "language_loss": 0.94089782, "learning_rate": 3.596174175278799e-09, "loss": 0.9557445, "num_input_tokens_seen": 352366455, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27539062, "step": 16324, "time_per_iteration": 2.6415398120880127 }, { "auxiliary_loss_clip": 0.01236732, "auxiliary_loss_mlp": 0.00222467, "balance_loss_clip": 1.02205646, "balance_loss_mlp": 0.19825545, "epoch": 0.9815120998045994, "flos": 33946284787200.0, "grad_norm": 14.29343369702156, "language_loss": 0.81294191, "learning_rate": 3.5728672475909827e-09, "loss": 0.8275339, "num_input_tokens_seen": 352386090, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24243164, "step": 16325, "time_per_iteration": 2.819620370864868 }, { "auxiliary_loss_clip": 0.01213865, "auxiliary_loss_mlp": 0.00212462, "balance_loss_clip": 1.00723839, "balance_loss_mlp": 0.18953855, "epoch": 0.9815722230572674, "flos": 20850023076480.0, "grad_norm": 7.202271795796581, "language_loss": 0.8270641, "learning_rate": 3.5496360246201063e-09, "loss": 0.84132737, "num_input_tokens_seen": 352404000, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.22912598, "step": 16326, "time_per_iteration": 2.61122989654541 }, { "auxiliary_loss_clip": 0.01263894, "auxiliary_loss_mlp": 0.0024433, "balance_loss_clip": 1.03792715, "balance_loss_mlp": 0.21905762, "epoch": 0.9816323463099353, "flos": 22894525301760.0, "grad_norm": 102.96195625050345, "language_loss": 0.76130801, "learning_rate": 3.5264805072470205e-09, "loss": 0.77639019, "num_input_tokens_seen": 352423540, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.25280762, "step": 16327, "time_per_iteration": 2.70525860786438 }, { "auxiliary_loss_clip": 0.01253112, "auxiliary_loss_mlp": 0.00232144, "balance_loss_clip": 1.02798676, "balance_loss_mlp": 0.20489277, "epoch": 0.9816924695626034, "flos": 31539444117120.0, "grad_norm": 7.154738555415265, "language_loss": 0.80570781, "learning_rate": 3.5034006963501337e-09, "loss": 0.82056034, "num_input_tokens_seen": 352445530, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.27246094, "step": 16328, "time_per_iteration": 2.746405839920044 }, { "auxiliary_loss_clip": 0.0126981, "auxiliary_loss_mlp": 0.00246829, "balance_loss_clip": 1.04041338, "balance_loss_mlp": 0.21916078, "epoch": 0.9817525928152713, "flos": 21506901045120.0, "grad_norm": 171.4038196051224, "language_loss": 0.89174187, "learning_rate": 3.4803965928040802e-09, "loss": 0.90690833, "num_input_tokens_seen": 352466325, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.2767334, "step": 16329, "time_per_iteration": 2.731083631515503 }, { "auxiliary_loss_clip": 0.01259267, "auxiliary_loss_mlp": 0.00241021, "balance_loss_clip": 1.03313494, "balance_loss_mlp": 0.21546285, "epoch": 0.9818127160679393, "flos": 25550513683200.0, "grad_norm": 35.431375575714895, "language_loss": 0.85334426, "learning_rate": 3.4574681974817168e-09, "loss": 0.86834717, "num_input_tokens_seen": 352485505, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.25598145, "step": 16330, "time_per_iteration": 2.691133737564087 }, { "auxiliary_loss_clip": 0.01296083, "auxiliary_loss_mlp": 0.00237256, "balance_loss_clip": 1.05521393, "balance_loss_mlp": 0.20726249, "epoch": 0.9818728393206072, "flos": 28803661672320.0, "grad_norm": 2.8907639717446214, "language_loss": 0.77325886, "learning_rate": 3.434615511252126e-09, "loss": 0.78859228, "num_input_tokens_seen": 352505360, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.29980469, "step": 16331, "time_per_iteration": 2.765388250350952 }, { "auxiliary_loss_clip": 0.01240375, "auxiliary_loss_mlp": 0.00227066, "balance_loss_clip": 1.02531087, "balance_loss_mlp": 0.20246109, "epoch": 0.9819329625732752, "flos": 23222246014080.0, "grad_norm": 21.506227201949756, "language_loss": 0.80609918, "learning_rate": 3.411838534981948e-09, "loss": 0.8207736, "num_input_tokens_seen": 352524035, "router_z_loss_clip": 2.14941406, "router_z_loss_mlp": 0.24609375, "step": 16332, "time_per_iteration": 2.7377254962921143 }, { "auxiliary_loss_clip": 0.01232305, "auxiliary_loss_mlp": 0.00223859, "balance_loss_clip": 1.0164001, "balance_loss_mlp": 0.19987422, "epoch": 0.9819930858259431, "flos": 17530440883200.0, "grad_norm": 8.407458066707925, "language_loss": 0.84010047, "learning_rate": 3.389137269534936e-09, "loss": 0.85466212, "num_input_tokens_seen": 352543210, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.23986816, "step": 16333, "time_per_iteration": 2.620948553085327 }, { "auxiliary_loss_clip": 0.01245366, "auxiliary_loss_mlp": 0.00214521, "balance_loss_clip": 1.02265418, "balance_loss_mlp": 0.18903369, "epoch": 0.9820532090786112, "flos": 12529915971840.0, "grad_norm": 5.518848978105254, "language_loss": 0.82783538, "learning_rate": 3.366511715771958e-09, "loss": 0.84243429, "num_input_tokens_seen": 352559770, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25524902, "step": 16334, "time_per_iteration": 2.6263182163238525 }, { "auxiliary_loss_clip": 0.01252203, "auxiliary_loss_mlp": 0.00220835, "balance_loss_clip": 1.03075576, "balance_loss_mlp": 0.1945134, "epoch": 0.9821133323312792, "flos": 18840174497280.0, "grad_norm": 4.876408395253795, "language_loss": 0.85732037, "learning_rate": 3.3439618745509934e-09, "loss": 0.87205076, "num_input_tokens_seen": 352577690, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26367188, "step": 16335, "time_per_iteration": 2.6166598796844482 }, { "auxiliary_loss_clip": 0.0126168, "auxiliary_loss_mlp": 0.00249664, "balance_loss_clip": 1.03634477, "balance_loss_mlp": 0.22168523, "epoch": 0.9821734555839471, "flos": 34824013528320.0, "grad_norm": 2.5759403515421435, "language_loss": 0.74387151, "learning_rate": 3.3214877467271362e-09, "loss": 0.75898492, "num_input_tokens_seen": 352598850, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.27990723, "step": 16336, "time_per_iteration": 2.7779881954193115 }, { "auxiliary_loss_clip": 0.01245986, "auxiliary_loss_mlp": 0.00249131, "balance_loss_clip": 1.01958132, "balance_loss_mlp": 0.22153422, "epoch": 0.9822335788366151, "flos": 17128169493120.0, "grad_norm": 14.197571501863143, "language_loss": 0.82151115, "learning_rate": 3.299089333152372e-09, "loss": 0.83646238, "num_input_tokens_seen": 352616130, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.27612305, "step": 16337, "time_per_iteration": 2.624941349029541 }, { "auxiliary_loss_clip": 0.01243806, "auxiliary_loss_mlp": 0.00217311, "balance_loss_clip": 1.02017057, "balance_loss_mlp": 0.19228876, "epoch": 0.982293702089283, "flos": 20813250528000.0, "grad_norm": 15.193677770722369, "language_loss": 0.81189036, "learning_rate": 3.2767666346764645e-09, "loss": 0.82650149, "num_input_tokens_seen": 352636885, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.25048828, "step": 16338, "time_per_iteration": 2.6886606216430664 }, { "auxiliary_loss_clip": 0.01251522, "auxiliary_loss_mlp": 0.00232878, "balance_loss_clip": 1.02984822, "balance_loss_mlp": 0.2077252, "epoch": 0.982353825341951, "flos": 24680829588480.0, "grad_norm": 12.210128205471857, "language_loss": 0.88270968, "learning_rate": 3.2545196521454045e-09, "loss": 0.89755368, "num_input_tokens_seen": 352657905, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25170898, "step": 16339, "time_per_iteration": 2.670863151550293 }, { "auxiliary_loss_clip": 0.0121986, "auxiliary_loss_mlp": 0.00224441, "balance_loss_clip": 1.0133723, "balance_loss_mlp": 0.20196998, "epoch": 0.982413948594619, "flos": 20850489953280.0, "grad_norm": 10.276968412153517, "language_loss": 0.69490111, "learning_rate": 3.232348386403405e-09, "loss": 0.70934415, "num_input_tokens_seen": 352676320, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.22473145, "step": 16340, "time_per_iteration": 2.6659348011016846 }, { "auxiliary_loss_clip": 0.01245047, "auxiliary_loss_mlp": 0.00208425, "balance_loss_clip": 1.02694941, "balance_loss_mlp": 0.18374825, "epoch": 0.982474071847287, "flos": 15377380778880.0, "grad_norm": 3.147864296772023, "language_loss": 0.95151114, "learning_rate": 3.2102528382904613e-09, "loss": 0.96604586, "num_input_tokens_seen": 352692665, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.24694824, "step": 16341, "time_per_iteration": 2.657302141189575 }, { "auxiliary_loss_clip": 0.01225258, "auxiliary_loss_mlp": 0.00217056, "balance_loss_clip": 1.01311612, "balance_loss_mlp": 0.19285697, "epoch": 0.9825341950999549, "flos": 23774732081280.0, "grad_norm": 15.50509299432167, "language_loss": 0.73085439, "learning_rate": 3.188233008645014e-09, "loss": 0.74527764, "num_input_tokens_seen": 352716130, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.24194336, "step": 16342, "time_per_iteration": 2.7434628009796143 }, { "auxiliary_loss_clip": 0.01244461, "auxiliary_loss_mlp": 0.00241829, "balance_loss_clip": 1.02554965, "balance_loss_mlp": 0.21605608, "epoch": 0.9825943183526229, "flos": 22746285872640.0, "grad_norm": 38.01637448040974, "language_loss": 0.83291149, "learning_rate": 3.16628889830195e-09, "loss": 0.84777439, "num_input_tokens_seen": 352734705, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.25769043, "step": 16343, "time_per_iteration": 2.646665573120117 }, { "auxiliary_loss_clip": 0.01219397, "auxiliary_loss_mlp": 0.00203439, "balance_loss_clip": 1.01299357, "balance_loss_mlp": 0.18134905, "epoch": 0.9826544416052908, "flos": 27709966408320.0, "grad_norm": 3.994293756621373, "language_loss": 0.82596111, "learning_rate": 3.1444205080932707e-09, "loss": 0.84018952, "num_input_tokens_seen": 352756225, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.22094727, "step": 16344, "time_per_iteration": 2.766458749771118 }, { "auxiliary_loss_clip": 0.01250234, "auxiliary_loss_mlp": 0.00219297, "balance_loss_clip": 1.03626585, "balance_loss_mlp": 0.19472775, "epoch": 0.9827145648579588, "flos": 26941657472640.0, "grad_norm": 39.130770738517015, "language_loss": 0.75858271, "learning_rate": 3.122627838848313e-09, "loss": 0.773278, "num_input_tokens_seen": 352776210, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.24560547, "step": 16345, "time_per_iteration": 2.762996196746826 }, { "auxiliary_loss_clip": 0.01212923, "auxiliary_loss_mlp": 0.00202, "balance_loss_clip": 1.00766456, "balance_loss_mlp": 0.1808878, "epoch": 0.9827746881106267, "flos": 21866545969920.0, "grad_norm": 5.967524072558988, "language_loss": 0.84543037, "learning_rate": 3.1009108913933045e-09, "loss": 0.85957962, "num_input_tokens_seen": 352795455, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.21118164, "step": 16346, "time_per_iteration": 2.715184450149536 }, { "auxiliary_loss_clip": 0.01274955, "auxiliary_loss_mlp": 0.00231706, "balance_loss_clip": 1.04665446, "balance_loss_mlp": 0.20503844, "epoch": 0.9828348113632948, "flos": 20850777262080.0, "grad_norm": 50.7969416860526, "language_loss": 0.84999108, "learning_rate": 3.079269666552031e-09, "loss": 0.86505765, "num_input_tokens_seen": 352812895, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.26660156, "step": 16347, "time_per_iteration": 2.6930296421051025 }, { "auxiliary_loss_clip": 0.01221812, "auxiliary_loss_mlp": 0.00207935, "balance_loss_clip": 1.01170981, "balance_loss_mlp": 0.18398547, "epoch": 0.9828949346159628, "flos": 34569227381760.0, "grad_norm": 8.486976570000698, "language_loss": 0.74149811, "learning_rate": 3.0577041651449474e-09, "loss": 0.7557956, "num_input_tokens_seen": 352835470, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.23950195, "step": 16348, "time_per_iteration": 4.204597473144531 }, { "auxiliary_loss_clip": 0.01243793, "auxiliary_loss_mlp": 0.00217941, "balance_loss_clip": 1.02624822, "balance_loss_mlp": 0.19313323, "epoch": 0.9829550578686307, "flos": 24457464864000.0, "grad_norm": 3.6578316268219684, "language_loss": 0.78743058, "learning_rate": 3.0362143879898437e-09, "loss": 0.80204791, "num_input_tokens_seen": 352854295, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24816895, "step": 16349, "time_per_iteration": 2.7074873447418213 }, { "auxiliary_loss_clip": 0.01215515, "auxiliary_loss_mlp": 0.00210159, "balance_loss_clip": 1.00628459, "balance_loss_mlp": 0.18656704, "epoch": 0.9830151811212987, "flos": 16910084067840.0, "grad_norm": 2.0708150125518796, "language_loss": 0.83197749, "learning_rate": 3.0148003359014018e-09, "loss": 0.8462342, "num_input_tokens_seen": 352869695, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.23583984, "step": 16350, "time_per_iteration": 4.074397563934326 }, { "auxiliary_loss_clip": 0.01245995, "auxiliary_loss_mlp": 0.00237886, "balance_loss_clip": 1.02479041, "balance_loss_mlp": 0.21173169, "epoch": 0.9830753043739666, "flos": 21288312829440.0, "grad_norm": 21.722559710679455, "language_loss": 0.91796279, "learning_rate": 2.9934620096920826e-09, "loss": 0.9328016, "num_input_tokens_seen": 352887430, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26147461, "step": 16351, "time_per_iteration": 2.7260775566101074 }, { "auxiliary_loss_clip": 0.01227039, "auxiliary_loss_mlp": 0.00203054, "balance_loss_clip": 1.01573586, "balance_loss_mlp": 0.17912897, "epoch": 0.9831354276266346, "flos": 31723522341120.0, "grad_norm": 2.5116152081205105, "language_loss": 0.74231958, "learning_rate": 2.972199410170795e-09, "loss": 0.75662053, "num_input_tokens_seen": 352907555, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.23937988, "step": 16352, "time_per_iteration": 2.7652695178985596 }, { "auxiliary_loss_clip": 0.01233004, "auxiliary_loss_mlp": 0.00200484, "balance_loss_clip": 1.01856577, "balance_loss_mlp": 0.17750092, "epoch": 0.9831955508793025, "flos": 21619050284160.0, "grad_norm": 9.688368469649223, "language_loss": 0.72848034, "learning_rate": 2.951012538143782e-09, "loss": 0.74281526, "num_input_tokens_seen": 352928670, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.2298584, "step": 16353, "time_per_iteration": 2.7003748416900635 }, { "auxiliary_loss_clip": 0.0121213, "auxiliary_loss_mlp": 0.00189307, "balance_loss_clip": 1.00498295, "balance_loss_mlp": 0.16740791, "epoch": 0.9832556741319706, "flos": 22968214053120.0, "grad_norm": 20.167362215565937, "language_loss": 0.80905867, "learning_rate": 2.9299013944144025e-09, "loss": 0.82307303, "num_input_tokens_seen": 352948345, "router_z_loss_clip": 2.07324219, "router_z_loss_mlp": 0.21899414, "step": 16354, "time_per_iteration": 2.7592215538024902 }, { "auxiliary_loss_clip": 0.01222032, "auxiliary_loss_mlp": 0.00205012, "balance_loss_clip": 1.01194692, "balance_loss_mlp": 0.18170664, "epoch": 0.9833157973846385, "flos": 21323900229120.0, "grad_norm": 3.6719376992198223, "language_loss": 0.86134505, "learning_rate": 2.9088659797835702e-09, "loss": 0.87561554, "num_input_tokens_seen": 352967250, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.23291016, "step": 16355, "time_per_iteration": 2.705031633377075 }, { "auxiliary_loss_clip": 0.01219229, "auxiliary_loss_mlp": 0.00218532, "balance_loss_clip": 1.01339531, "balance_loss_mlp": 0.1948213, "epoch": 0.9833759206373065, "flos": 21068719032960.0, "grad_norm": 6.4160101369193825, "language_loss": 0.79746854, "learning_rate": 2.8879062950484256e-09, "loss": 0.81184614, "num_input_tokens_seen": 352984725, "router_z_loss_clip": 2.05957031, "router_z_loss_mlp": 0.23730469, "step": 16356, "time_per_iteration": 2.6544947624206543 }, { "auxiliary_loss_clip": 0.01233073, "auxiliary_loss_mlp": 0.00232559, "balance_loss_clip": 1.02042949, "balance_loss_mlp": 0.20752501, "epoch": 0.9834360438899744, "flos": 18697322108160.0, "grad_norm": 16.536782786641233, "language_loss": 0.83779532, "learning_rate": 2.8670223410041104e-09, "loss": 0.85245162, "num_input_tokens_seen": 353003480, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.25024414, "step": 16357, "time_per_iteration": 2.7038164138793945 }, { "auxiliary_loss_clip": 0.01241755, "auxiliary_loss_mlp": 0.00248286, "balance_loss_clip": 1.02764702, "balance_loss_mlp": 0.22298992, "epoch": 0.9834961671426424, "flos": 21105240186240.0, "grad_norm": 235.81257248650513, "language_loss": 0.88934773, "learning_rate": 2.846214118442436e-09, "loss": 0.90424818, "num_input_tokens_seen": 353021425, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.25305176, "step": 16358, "time_per_iteration": 4.22212553024292 }, { "auxiliary_loss_clip": 0.01239966, "auxiliary_loss_mlp": 0.00219385, "balance_loss_clip": 1.02906156, "balance_loss_mlp": 0.19585294, "epoch": 0.9835562903953103, "flos": 26687625511680.0, "grad_norm": 5.440816572205234, "language_loss": 0.78122157, "learning_rate": 2.8254816281523263e-09, "loss": 0.79581505, "num_input_tokens_seen": 353039870, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.23535156, "step": 16359, "time_per_iteration": 4.180644989013672 }, { "auxiliary_loss_clip": 0.01228857, "auxiliary_loss_mlp": 0.00235393, "balance_loss_clip": 1.01674294, "balance_loss_mlp": 0.20985827, "epoch": 0.9836164136479784, "flos": 22090162089600.0, "grad_norm": 2.292414177773084, "language_loss": 0.75498801, "learning_rate": 2.804824870920264e-09, "loss": 0.76963055, "num_input_tokens_seen": 353059750, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.25549316, "step": 16360, "time_per_iteration": 2.7426106929779053 }, { "auxiliary_loss_clip": 0.01238765, "auxiliary_loss_mlp": 0.00217568, "balance_loss_clip": 1.02384329, "balance_loss_mlp": 0.19169971, "epoch": 0.9836765369006463, "flos": 23878405710720.0, "grad_norm": 6.291054791336306, "language_loss": 0.9167968, "learning_rate": 2.7842438475293996e-09, "loss": 0.93136013, "num_input_tokens_seen": 353079940, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.25891113, "step": 16361, "time_per_iteration": 2.860017776489258 }, { "auxiliary_loss_clip": 0.01231205, "auxiliary_loss_mlp": 0.00228539, "balance_loss_clip": 1.01747155, "balance_loss_mlp": 0.20475662, "epoch": 0.9837366601533143, "flos": 25845017293440.0, "grad_norm": 237.14876962525628, "language_loss": 0.83254361, "learning_rate": 2.76373855876022e-09, "loss": 0.84714115, "num_input_tokens_seen": 353099990, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.23791504, "step": 16362, "time_per_iteration": 2.75726056098938 }, { "auxiliary_loss_clip": 0.01224876, "auxiliary_loss_mlp": 0.00231932, "balance_loss_clip": 1.01480067, "balance_loss_mlp": 0.2071601, "epoch": 0.9837967834059823, "flos": 21358015171200.0, "grad_norm": 6.380756521277208, "language_loss": 0.80325198, "learning_rate": 2.7433090053901043e-09, "loss": 0.81782001, "num_input_tokens_seen": 353118710, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.24780273, "step": 16363, "time_per_iteration": 2.7395401000976562 }, { "auxiliary_loss_clip": 0.01220073, "auxiliary_loss_mlp": 0.00230602, "balance_loss_clip": 1.01297128, "balance_loss_mlp": 0.20687929, "epoch": 0.9838569066586502, "flos": 18515793749760.0, "grad_norm": 19.961129288573545, "language_loss": 0.70281357, "learning_rate": 2.7229551881937653e-09, "loss": 0.71732032, "num_input_tokens_seen": 353136415, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.23730469, "step": 16364, "time_per_iteration": 2.6733365058898926 }, { "auxiliary_loss_clip": 0.01227677, "auxiliary_loss_mlp": 0.00204121, "balance_loss_clip": 1.01393747, "balance_loss_mlp": 0.17994547, "epoch": 0.9839170299113182, "flos": 22452392793600.0, "grad_norm": 5.856562579590682, "language_loss": 0.83391511, "learning_rate": 2.702677107943252e-09, "loss": 0.8482331, "num_input_tokens_seen": 353154650, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24182129, "step": 16365, "time_per_iteration": 2.728226661682129 }, { "auxiliary_loss_clip": 0.01243391, "auxiliary_loss_mlp": 0.00211899, "balance_loss_clip": 1.02778256, "balance_loss_mlp": 0.18830763, "epoch": 0.9839771531639862, "flos": 27892320779520.0, "grad_norm": 15.086750701295855, "language_loss": 0.84841496, "learning_rate": 2.6824747654072832e-09, "loss": 0.86296785, "num_input_tokens_seen": 353174065, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23596191, "step": 16366, "time_per_iteration": 2.785456895828247 }, { "auxiliary_loss_clip": 0.01230742, "auxiliary_loss_mlp": 0.00205767, "balance_loss_clip": 1.01512122, "balance_loss_mlp": 0.1818534, "epoch": 0.9840372764166542, "flos": 28214510797440.0, "grad_norm": 3.725087710940824, "language_loss": 0.81667161, "learning_rate": 2.662348161352357e-09, "loss": 0.83103669, "num_input_tokens_seen": 353193560, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.23901367, "step": 16367, "time_per_iteration": 2.793383836746216 }, { "auxiliary_loss_clip": 0.01230698, "auxiliary_loss_mlp": 0.00236326, "balance_loss_clip": 1.01910424, "balance_loss_mlp": 0.2113757, "epoch": 0.9840973996693221, "flos": 23403989854080.0, "grad_norm": 27.607646312335376, "language_loss": 0.67451012, "learning_rate": 2.642297296540974e-09, "loss": 0.68918037, "num_input_tokens_seen": 353213525, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.24951172, "step": 16368, "time_per_iteration": 2.793736696243286 }, { "auxiliary_loss_clip": 0.01220354, "auxiliary_loss_mlp": 0.00210497, "balance_loss_clip": 1.01062846, "balance_loss_mlp": 0.18648843, "epoch": 0.9841575229219901, "flos": 21395865127680.0, "grad_norm": 212.85127187963755, "language_loss": 0.71559489, "learning_rate": 2.6223221717340816e-09, "loss": 0.7299034, "num_input_tokens_seen": 353234000, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.24023438, "step": 16369, "time_per_iteration": 2.7768707275390625 }, { "auxiliary_loss_clip": 0.01245788, "auxiliary_loss_mlp": 0.00238946, "balance_loss_clip": 1.02633893, "balance_loss_mlp": 0.21106303, "epoch": 0.984217646174658, "flos": 24464072966400.0, "grad_norm": 990.577907218728, "language_loss": 0.75038683, "learning_rate": 2.6024227876886295e-09, "loss": 0.76523423, "num_input_tokens_seen": 353254940, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.27893066, "step": 16370, "time_per_iteration": 2.7357654571533203 }, { "auxiliary_loss_clip": 0.01233245, "auxiliary_loss_mlp": 0.00221703, "balance_loss_clip": 1.01991487, "balance_loss_mlp": 0.19703819, "epoch": 0.984277769427326, "flos": 16435057680000.0, "grad_norm": 10.316440628278242, "language_loss": 0.82253224, "learning_rate": 2.582599145159792e-09, "loss": 0.83708173, "num_input_tokens_seen": 353272590, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.24682617, "step": 16371, "time_per_iteration": 2.6558477878570557 }, { "auxiliary_loss_clip": 0.01083638, "auxiliary_loss_mlp": 0.00057207, "balance_loss_clip": 0.94734263, "balance_loss_mlp": 0.05067455, "epoch": 0.9843378926799939, "flos": 64530615288960.0, "grad_norm": 0.7465845376841311, "language_loss": 0.64136517, "learning_rate": 2.562851244898745e-09, "loss": 0.65277362, "num_input_tokens_seen": 353334380, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06542969, "step": 16372, "time_per_iteration": 3.2055323123931885 }, { "auxiliary_loss_clip": 0.01231717, "auxiliary_loss_mlp": 0.00226396, "balance_loss_clip": 1.01934469, "balance_loss_mlp": 0.2007187, "epoch": 0.984398015932662, "flos": 17382811985280.0, "grad_norm": 25.796506074918124, "language_loss": 0.78770506, "learning_rate": 2.5431790876544456e-09, "loss": 0.80228615, "num_input_tokens_seen": 353351640, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.25683594, "step": 16373, "time_per_iteration": 2.6613762378692627 }, { "auxiliary_loss_clip": 0.01232633, "auxiliary_loss_mlp": 0.00231252, "balance_loss_clip": 1.01984429, "balance_loss_mlp": 0.20524061, "epoch": 0.9844581391853299, "flos": 23879088069120.0, "grad_norm": 7.962208605674262, "language_loss": 0.86982298, "learning_rate": 2.523582674173186e-09, "loss": 0.88446188, "num_input_tokens_seen": 353372555, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.26000977, "step": 16374, "time_per_iteration": 2.720186710357666 }, { "auxiliary_loss_clip": 0.01231442, "auxiliary_loss_mlp": 0.00218571, "balance_loss_clip": 1.01490617, "balance_loss_mlp": 0.19418055, "epoch": 0.9845182624379979, "flos": 19865352568320.0, "grad_norm": 16.18081962717638, "language_loss": 0.77196968, "learning_rate": 2.504062005197927e-09, "loss": 0.78646982, "num_input_tokens_seen": 353391385, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24389648, "step": 16375, "time_per_iteration": 2.638761520385742 }, { "auxiliary_loss_clip": 0.01233759, "auxiliary_loss_mlp": 0.00222921, "balance_loss_clip": 1.02156615, "balance_loss_mlp": 0.19637334, "epoch": 0.9845783856906659, "flos": 28254659224320.0, "grad_norm": 18.630572921708257, "language_loss": 0.88860404, "learning_rate": 2.484617081468521e-09, "loss": 0.90317082, "num_input_tokens_seen": 353411630, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.265625, "step": 16376, "time_per_iteration": 2.730957269668579 }, { "auxiliary_loss_clip": 0.01227615, "auxiliary_loss_mlp": 0.00204067, "balance_loss_clip": 1.0175668, "balance_loss_mlp": 0.18182291, "epoch": 0.9846385089433338, "flos": 28328383889280.0, "grad_norm": 8.375875121825002, "language_loss": 0.67728102, "learning_rate": 2.4652479037228224e-09, "loss": 0.69159788, "num_input_tokens_seen": 353432895, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.22253418, "step": 16377, "time_per_iteration": 2.67885684967041 }, { "auxiliary_loss_clip": 0.01243033, "auxiliary_loss_mlp": 0.00202314, "balance_loss_clip": 1.02223873, "balance_loss_mlp": 0.17829378, "epoch": 0.9846986321960018, "flos": 24316767290880.0, "grad_norm": 4.156157520967601, "language_loss": 0.8208456, "learning_rate": 2.445954472695133e-09, "loss": 0.83529902, "num_input_tokens_seen": 353454195, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24035645, "step": 16378, "time_per_iteration": 2.7189829349517822 }, { "auxiliary_loss_clip": 0.01235126, "auxiliary_loss_mlp": 0.00229236, "balance_loss_clip": 1.02129543, "balance_loss_mlp": 0.2026765, "epoch": 0.9847587554486698, "flos": 27271999877760.0, "grad_norm": 16.962534961668887, "language_loss": 0.79567063, "learning_rate": 2.426736789116868e-09, "loss": 0.81031424, "num_input_tokens_seen": 353475125, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.26550293, "step": 16379, "time_per_iteration": 2.693990468978882 }, { "auxiliary_loss_clip": 0.01261718, "auxiliary_loss_mlp": 0.00220586, "balance_loss_clip": 1.04067791, "balance_loss_mlp": 0.19384748, "epoch": 0.9848188787013378, "flos": 16542717719040.0, "grad_norm": 42.11610177677969, "language_loss": 0.80519331, "learning_rate": 2.407594853716999e-09, "loss": 0.82001626, "num_input_tokens_seen": 353493265, "router_z_loss_clip": 2.20800781, "router_z_loss_mlp": 0.26733398, "step": 16380, "time_per_iteration": 2.6727030277252197 }, { "auxiliary_loss_clip": 0.01253224, "auxiliary_loss_mlp": 0.00240441, "balance_loss_clip": 1.03147912, "balance_loss_mlp": 0.21369022, "epoch": 0.9848790019540057, "flos": 20193647898240.0, "grad_norm": 35.35147739979233, "language_loss": 0.86590654, "learning_rate": 2.38852866722139e-09, "loss": 0.88084316, "num_input_tokens_seen": 353511650, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.26745605, "step": 16381, "time_per_iteration": 2.6617817878723145 }, { "auxiliary_loss_clip": 0.01251137, "auxiliary_loss_mlp": 0.00207947, "balance_loss_clip": 1.02655399, "balance_loss_mlp": 0.18234076, "epoch": 0.9849391252066737, "flos": 28259723041920.0, "grad_norm": 4.414370858625201, "language_loss": 0.87670392, "learning_rate": 2.3695382303527965e-09, "loss": 0.89129472, "num_input_tokens_seen": 353534035, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.25610352, "step": 16382, "time_per_iteration": 2.7850043773651123 }, { "auxiliary_loss_clip": 0.01245893, "auxiliary_loss_mlp": 0.0024416, "balance_loss_clip": 1.026052, "balance_loss_mlp": 0.21736196, "epoch": 0.9849992484593416, "flos": 22454942659200.0, "grad_norm": 361.6144307366848, "language_loss": 0.83231592, "learning_rate": 2.3506235438315316e-09, "loss": 0.84721649, "num_input_tokens_seen": 353549950, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26831055, "step": 16383, "time_per_iteration": 2.699254274368286 }, { "auxiliary_loss_clip": 0.01239109, "auxiliary_loss_mlp": 0.00222757, "balance_loss_clip": 1.02482247, "balance_loss_mlp": 0.1987008, "epoch": 0.9850593717120096, "flos": 34497190656000.0, "grad_norm": 16.960925158402933, "language_loss": 0.73477328, "learning_rate": 2.3317846083750203e-09, "loss": 0.74939197, "num_input_tokens_seen": 353573745, "router_z_loss_clip": 2.14550781, "router_z_loss_mlp": 0.24072266, "step": 16384, "time_per_iteration": 2.8294875621795654 }, { "auxiliary_loss_clip": 0.0125733, "auxiliary_loss_mlp": 0.00239878, "balance_loss_clip": 1.03554738, "balance_loss_mlp": 0.21220919, "epoch": 0.9851194949646775, "flos": 38837282152320.0, "grad_norm": 3.8612851249219613, "language_loss": 0.79788548, "learning_rate": 2.313021424697359e-09, "loss": 0.81285757, "num_input_tokens_seen": 353595335, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.27685547, "step": 16385, "time_per_iteration": 2.8501267433166504 }, { "auxiliary_loss_clip": 0.01240387, "auxiliary_loss_mlp": 0.0023197, "balance_loss_clip": 1.02149534, "balance_loss_mlp": 0.20493332, "epoch": 0.9851796182173456, "flos": 17712436118400.0, "grad_norm": 8.788532031189666, "language_loss": 0.8995384, "learning_rate": 2.294333993509978e-09, "loss": 0.91426194, "num_input_tokens_seen": 353614270, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.27050781, "step": 16386, "time_per_iteration": 2.6563351154327393 }, { "auxiliary_loss_clip": 0.0124937, "auxiliary_loss_mlp": 0.00233123, "balance_loss_clip": 1.03072977, "balance_loss_mlp": 0.20614588, "epoch": 0.9852397414700135, "flos": 27454318335360.0, "grad_norm": 6.144629341693529, "language_loss": 0.76949036, "learning_rate": 2.2757223155216442e-09, "loss": 0.78431535, "num_input_tokens_seen": 353634900, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.26977539, "step": 16387, "time_per_iteration": 2.7453458309173584 }, { "auxiliary_loss_clip": 0.01229778, "auxiliary_loss_mlp": 0.00230379, "balance_loss_clip": 1.02083445, "balance_loss_mlp": 0.20572606, "epoch": 0.9852998647226815, "flos": 18296702743680.0, "grad_norm": 128.41994672436783, "language_loss": 0.81588531, "learning_rate": 2.257186391438237e-09, "loss": 0.83048689, "num_input_tokens_seen": 353652890, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.2467041, "step": 16388, "time_per_iteration": 2.62351655960083 }, { "auxiliary_loss_clip": 0.01232614, "auxiliary_loss_mlp": 0.00235573, "balance_loss_clip": 1.01907527, "balance_loss_mlp": 0.21088502, "epoch": 0.9853599879753495, "flos": 19642562461440.0, "grad_norm": 3.588088708279799, "language_loss": 0.88837665, "learning_rate": 2.238726221962528e-09, "loss": 0.90305853, "num_input_tokens_seen": 353671295, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24694824, "step": 16389, "time_per_iteration": 2.7184340953826904 }, { "auxiliary_loss_clip": 0.01231545, "auxiliary_loss_mlp": 0.00243551, "balance_loss_clip": 1.01797509, "balance_loss_mlp": 0.22056708, "epoch": 0.9854201112280174, "flos": 23841956384640.0, "grad_norm": 46.80467645764888, "language_loss": 0.7519896, "learning_rate": 2.2203418077946234e-09, "loss": 0.76674056, "num_input_tokens_seen": 353690560, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.22998047, "step": 16390, "time_per_iteration": 2.6916801929473877 }, { "auxiliary_loss_clip": 0.01259854, "auxiliary_loss_mlp": 0.00234681, "balance_loss_clip": 1.04290819, "balance_loss_mlp": 0.20982623, "epoch": 0.9854802344806854, "flos": 30080573233920.0, "grad_norm": 23.73618254374838, "language_loss": 0.84447026, "learning_rate": 2.2020331496312994e-09, "loss": 0.85941565, "num_input_tokens_seen": 353710660, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.24865723, "step": 16391, "time_per_iteration": 4.096127033233643 }, { "auxiliary_loss_clip": 0.01216039, "auxiliary_loss_mlp": 0.00203367, "balance_loss_clip": 1.00936091, "balance_loss_mlp": 0.18037142, "epoch": 0.9855403577333534, "flos": 21907412668800.0, "grad_norm": 53.810183722697545, "language_loss": 0.75471842, "learning_rate": 2.1838002481673333e-09, "loss": 0.76891249, "num_input_tokens_seen": 353730440, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.22998047, "step": 16392, "time_per_iteration": 4.153555631637573 }, { "auxiliary_loss_clip": 0.01268518, "auxiliary_loss_mlp": 0.00244004, "balance_loss_clip": 1.03850079, "balance_loss_mlp": 0.21652591, "epoch": 0.9856004809860214, "flos": 15413794191360.0, "grad_norm": 2.860824928032303, "language_loss": 0.69126254, "learning_rate": 2.1656431040937286e-09, "loss": 0.70638776, "num_input_tokens_seen": 353748360, "router_z_loss_clip": 2.30175781, "router_z_loss_mlp": 0.2746582, "step": 16393, "time_per_iteration": 2.720158338546753 }, { "auxiliary_loss_clip": 0.01259492, "auxiliary_loss_mlp": 0.00237797, "balance_loss_clip": 1.03164959, "balance_loss_mlp": 0.21164235, "epoch": 0.9856606042386893, "flos": 13653201064320.0, "grad_norm": 41.45401790234757, "language_loss": 0.93302929, "learning_rate": 2.1475617180990444e-09, "loss": 0.94800216, "num_input_tokens_seen": 353760880, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.26135254, "step": 16394, "time_per_iteration": 2.7025628089904785 }, { "auxiliary_loss_clip": 0.01242245, "auxiliary_loss_mlp": 0.00211469, "balance_loss_clip": 1.0251298, "balance_loss_mlp": 0.18519574, "epoch": 0.9857207274913573, "flos": 23479151063040.0, "grad_norm": 3.6969732109418696, "language_loss": 0.83568096, "learning_rate": 2.129556090869178e-09, "loss": 0.85021818, "num_input_tokens_seen": 353782255, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.26257324, "step": 16395, "time_per_iteration": 2.7305617332458496 }, { "auxiliary_loss_clip": 0.01240485, "auxiliary_loss_mlp": 0.00224957, "balance_loss_clip": 1.02886915, "balance_loss_mlp": 0.19964886, "epoch": 0.9857808507440252, "flos": 21065486808960.0, "grad_norm": 5.33168840537741, "language_loss": 0.81654978, "learning_rate": 2.1116262230866933e-09, "loss": 0.83120424, "num_input_tokens_seen": 353803580, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.25317383, "step": 16396, "time_per_iteration": 2.7122530937194824 }, { "auxiliary_loss_clip": 0.01219516, "auxiliary_loss_mlp": 0.00209233, "balance_loss_clip": 1.01153326, "balance_loss_mlp": 0.18697646, "epoch": 0.9858409739966932, "flos": 25301365971840.0, "grad_norm": 15.515059865084526, "language_loss": 0.7760787, "learning_rate": 2.0937721154317133e-09, "loss": 0.79036617, "num_input_tokens_seen": 353824200, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.22241211, "step": 16397, "time_per_iteration": 2.698887586593628 }, { "auxiliary_loss_clip": 0.01227811, "auxiliary_loss_mlp": 0.00203822, "balance_loss_clip": 1.0208863, "balance_loss_mlp": 0.18076703, "epoch": 0.9859010972493611, "flos": 20558751690240.0, "grad_norm": 86.60596968445357, "language_loss": 0.81496525, "learning_rate": 2.0759937685810304e-09, "loss": 0.82928157, "num_input_tokens_seen": 353843350, "router_z_loss_clip": 2.07128906, "router_z_loss_mlp": 0.23059082, "step": 16398, "time_per_iteration": 2.6342132091522217 }, { "auxiliary_loss_clip": 0.01225098, "auxiliary_loss_mlp": 0.00204061, "balance_loss_clip": 1.01513827, "balance_loss_mlp": 0.18013521, "epoch": 0.9859612205020292, "flos": 24754985216640.0, "grad_norm": 51.057099002963916, "language_loss": 0.78875601, "learning_rate": 2.058291183208771e-09, "loss": 0.80304754, "num_input_tokens_seen": 353864520, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23913574, "step": 16399, "time_per_iteration": 2.7026000022888184 }, { "auxiliary_loss_clip": 0.01227959, "auxiliary_loss_mlp": 0.00230461, "balance_loss_clip": 1.01551771, "balance_loss_mlp": 0.2075614, "epoch": 0.9860213437546971, "flos": 21105850717440.0, "grad_norm": 22.79957982057928, "language_loss": 0.6580013, "learning_rate": 2.0406643599863993e-09, "loss": 0.67258555, "num_input_tokens_seen": 353882240, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.22912598, "step": 16400, "time_per_iteration": 4.0594072341918945 }, { "auxiliary_loss_clip": 0.01278123, "auxiliary_loss_mlp": 0.00235307, "balance_loss_clip": 1.05025578, "balance_loss_mlp": 0.20844944, "epoch": 0.9860814670073651, "flos": 19136078737920.0, "grad_norm": 4.2772631477449, "language_loss": 0.89006186, "learning_rate": 2.023113299582491e-09, "loss": 0.90519607, "num_input_tokens_seen": 353901590, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.26831055, "step": 16401, "time_per_iteration": 4.06519889831543 }, { "auxiliary_loss_clip": 0.01245427, "auxiliary_loss_mlp": 0.00218789, "balance_loss_clip": 1.02665639, "balance_loss_mlp": 0.19292036, "epoch": 0.9861415902600331, "flos": 17237050594560.0, "grad_norm": 74.56161738468194, "language_loss": 0.88180673, "learning_rate": 2.005638002662069e-09, "loss": 0.89644891, "num_input_tokens_seen": 353918785, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25866699, "step": 16402, "time_per_iteration": 2.6880178451538086 }, { "auxiliary_loss_clip": 0.0124026, "auxiliary_loss_mlp": 0.00240636, "balance_loss_clip": 1.02151275, "balance_loss_mlp": 0.21431419, "epoch": 0.986201713512701, "flos": 27782577751680.0, "grad_norm": 4.3486179960691445, "language_loss": 0.80565351, "learning_rate": 1.9882384698881596e-09, "loss": 0.82046252, "num_input_tokens_seen": 353940390, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.26330566, "step": 16403, "time_per_iteration": 2.702977418899536 }, { "auxiliary_loss_clip": 0.01242139, "auxiliary_loss_mlp": 0.0021046, "balance_loss_clip": 1.02836275, "balance_loss_mlp": 0.18695208, "epoch": 0.986261836765369, "flos": 28730403884160.0, "grad_norm": 150.3131364017385, "language_loss": 0.81787896, "learning_rate": 1.9709147019204566e-09, "loss": 0.83240497, "num_input_tokens_seen": 353962180, "router_z_loss_clip": 2.13964844, "router_z_loss_mlp": 0.23535156, "step": 16404, "time_per_iteration": 2.7581045627593994 }, { "auxiliary_loss_clip": 0.01228209, "auxiliary_loss_mlp": 0.00229213, "balance_loss_clip": 1.01837957, "balance_loss_mlp": 0.20550179, "epoch": 0.986321960018037, "flos": 34313471568000.0, "grad_norm": 33.799213093090565, "language_loss": 0.76572967, "learning_rate": 1.953666699415768e-09, "loss": 0.7803039, "num_input_tokens_seen": 353984305, "router_z_loss_clip": 2.10058594, "router_z_loss_mlp": 0.23730469, "step": 16405, "time_per_iteration": 2.833843469619751 }, { "auxiliary_loss_clip": 0.01222728, "auxiliary_loss_mlp": 0.0021896, "balance_loss_clip": 1.01712441, "balance_loss_mlp": 0.19639409, "epoch": 0.986382083270705, "flos": 25189755436800.0, "grad_norm": 4.386479667536548, "language_loss": 0.76589203, "learning_rate": 1.93649446302846e-09, "loss": 0.7803089, "num_input_tokens_seen": 354004495, "router_z_loss_clip": 2.05761719, "router_z_loss_mlp": 0.22558594, "step": 16406, "time_per_iteration": 2.696772575378418 }, { "auxiliary_loss_clip": 0.01218092, "auxiliary_loss_mlp": 0.00235591, "balance_loss_clip": 1.00649679, "balance_loss_mlp": 0.21242872, "epoch": 0.9864422065233729, "flos": 11025904671360.0, "grad_norm": 3.5201019950977117, "language_loss": 0.85072708, "learning_rate": 1.9193979934095663e-09, "loss": 0.86526394, "num_input_tokens_seen": 354015985, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.23168945, "step": 16407, "time_per_iteration": 2.817225456237793 }, { "auxiliary_loss_clip": 0.01231006, "auxiliary_loss_mlp": 0.00249727, "balance_loss_clip": 1.01959491, "balance_loss_mlp": 0.22449014, "epoch": 0.9865023297760409, "flos": 16545590807040.0, "grad_norm": 17.363303659358063, "language_loss": 0.85850745, "learning_rate": 1.9023772912072357e-09, "loss": 0.87331486, "num_input_tokens_seen": 354033260, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.2520752, "step": 16408, "time_per_iteration": 2.6257805824279785 }, { "auxiliary_loss_clip": 0.01263574, "auxiliary_loss_mlp": 0.00239854, "balance_loss_clip": 1.03710938, "balance_loss_mlp": 0.21292505, "epoch": 0.9865624530287088, "flos": 18880179269760.0, "grad_norm": 3.6357402548718274, "language_loss": 0.77910113, "learning_rate": 1.8854323570669515e-09, "loss": 0.79413533, "num_input_tokens_seen": 354052825, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.26940918, "step": 16409, "time_per_iteration": 2.6606240272521973 }, { "auxiliary_loss_clip": 0.01078615, "auxiliary_loss_mlp": 0.00089556, "balance_loss_clip": 0.94291478, "balance_loss_mlp": 0.08288059, "epoch": 0.9866225762813768, "flos": 68887798680960.0, "grad_norm": 0.7769489494028117, "language_loss": 0.60205215, "learning_rate": 1.8685631916313118e-09, "loss": 0.61373389, "num_input_tokens_seen": 354113920, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06689453, "step": 16410, "time_per_iteration": 3.19671630859375 }, { "auxiliary_loss_clip": 0.01235728, "auxiliary_loss_mlp": 0.0022588, "balance_loss_clip": 1.02251267, "balance_loss_mlp": 0.20284829, "epoch": 0.9866826995340447, "flos": 29023111814400.0, "grad_norm": 12.511303536442396, "language_loss": 0.76365489, "learning_rate": 1.8517697955400258e-09, "loss": 0.77827096, "num_input_tokens_seen": 354134210, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.23046875, "step": 16411, "time_per_iteration": 2.695289134979248 }, { "auxiliary_loss_clip": 0.01082229, "auxiliary_loss_mlp": 0.00072971, "balance_loss_clip": 0.94648147, "balance_loss_mlp": 0.06529434, "epoch": 0.9867428227867128, "flos": 65376814867200.0, "grad_norm": 0.7093398609285745, "language_loss": 0.55780268, "learning_rate": 1.8350521694299182e-09, "loss": 0.56935471, "num_input_tokens_seen": 354198010, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.07666016, "step": 16412, "time_per_iteration": 3.2130746841430664 }, { "auxiliary_loss_clip": 0.01262183, "auxiliary_loss_mlp": 0.00241337, "balance_loss_clip": 1.03620267, "balance_loss_mlp": 0.21465787, "epoch": 0.9868029460393807, "flos": 26506312634880.0, "grad_norm": 2871.6392798962825, "language_loss": 0.79360855, "learning_rate": 1.818410313934926e-09, "loss": 0.80864382, "num_input_tokens_seen": 354220000, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26672363, "step": 16413, "time_per_iteration": 2.7395803928375244 }, { "auxiliary_loss_clip": 0.01231004, "auxiliary_loss_mlp": 0.00236123, "balance_loss_clip": 1.01649189, "balance_loss_mlp": 0.21130365, "epoch": 0.9868630692920487, "flos": 22967280299520.0, "grad_norm": 886.413056916961, "language_loss": 0.77427101, "learning_rate": 1.8018442296858782e-09, "loss": 0.78894228, "num_input_tokens_seen": 354240910, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.24829102, "step": 16414, "time_per_iteration": 2.8171451091766357 }, { "auxiliary_loss_clip": 0.0123239, "auxiliary_loss_mlp": 0.00208247, "balance_loss_clip": 1.01970506, "balance_loss_mlp": 0.18378516, "epoch": 0.9869231925447167, "flos": 19828687760640.0, "grad_norm": 38.4954820571449, "language_loss": 0.78934395, "learning_rate": 1.7853539173111608e-09, "loss": 0.80375028, "num_input_tokens_seen": 354259430, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.24475098, "step": 16415, "time_per_iteration": 2.683988332748413 }, { "auxiliary_loss_clip": 0.01211441, "auxiliary_loss_mlp": 0.00216509, "balance_loss_clip": 1.00612116, "balance_loss_mlp": 0.193955, "epoch": 0.9869833157973846, "flos": 20195228096640.0, "grad_norm": 67.10134157224223, "language_loss": 0.81099963, "learning_rate": 1.7689393774362737e-09, "loss": 0.82527912, "num_input_tokens_seen": 354279490, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.22558594, "step": 16416, "time_per_iteration": 2.6917765140533447 }, { "auxiliary_loss_clip": 0.0123036, "auxiliary_loss_mlp": 0.00241658, "balance_loss_clip": 1.0152303, "balance_loss_mlp": 0.21583733, "epoch": 0.9870434390500527, "flos": 16099507802880.0, "grad_norm": 59.439778349237834, "language_loss": 0.80651474, "learning_rate": 1.7526006106833858e-09, "loss": 0.82123494, "num_input_tokens_seen": 354295080, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.25830078, "step": 16417, "time_per_iteration": 2.597642660140991 }, { "auxiliary_loss_clip": 0.01245082, "auxiliary_loss_mlp": 0.002132, "balance_loss_clip": 1.02383161, "balance_loss_mlp": 0.18692668, "epoch": 0.9871035623027206, "flos": 21760753438080.0, "grad_norm": 69.19657778660932, "language_loss": 0.79488182, "learning_rate": 1.7363376176720013e-09, "loss": 0.80946457, "num_input_tokens_seen": 354314610, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26269531, "step": 16418, "time_per_iteration": 2.7293643951416016 }, { "auxiliary_loss_clip": 0.01079949, "auxiliary_loss_mlp": 0.00069093, "balance_loss_clip": 0.94287336, "balance_loss_mlp": 0.06222672, "epoch": 0.9871636855553886, "flos": 70219583245440.0, "grad_norm": 0.6379620772736818, "language_loss": 0.52984536, "learning_rate": 1.7201503990189603e-09, "loss": 0.54133576, "num_input_tokens_seen": 354383115, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06884766, "step": 16419, "time_per_iteration": 3.2431344985961914 }, { "auxiliary_loss_clip": 0.01247088, "auxiliary_loss_mlp": 0.00235328, "balance_loss_clip": 1.02875888, "balance_loss_mlp": 0.21053293, "epoch": 0.9872238088080565, "flos": 25045825639680.0, "grad_norm": 21.635862113780163, "language_loss": 0.85203528, "learning_rate": 1.7040389553382162e-09, "loss": 0.86685944, "num_input_tokens_seen": 354403115, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.2479248, "step": 16420, "time_per_iteration": 2.6969125270843506 }, { "auxiliary_loss_clip": 0.01240055, "auxiliary_loss_mlp": 0.00221995, "balance_loss_clip": 1.02738225, "balance_loss_mlp": 0.19810551, "epoch": 0.9872839320607245, "flos": 19465846525440.0, "grad_norm": 76.72788794156608, "language_loss": 0.77461231, "learning_rate": 1.6880032872403916e-09, "loss": 0.78923285, "num_input_tokens_seen": 354424520, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.2388916, "step": 16421, "time_per_iteration": 2.7131736278533936 }, { "auxiliary_loss_clip": 0.0125702, "auxiliary_loss_mlp": 0.00215244, "balance_loss_clip": 1.03198004, "balance_loss_mlp": 0.1907825, "epoch": 0.9873440553133924, "flos": 26942914448640.0, "grad_norm": 9.264761210599916, "language_loss": 0.91015172, "learning_rate": 1.6720433953338886e-09, "loss": 0.92487431, "num_input_tokens_seen": 354444800, "router_z_loss_clip": 2.24707031, "router_z_loss_mlp": 0.24462891, "step": 16422, "time_per_iteration": 2.7011077404022217 }, { "auxiliary_loss_clip": 0.01231584, "auxiliary_loss_mlp": 0.00227171, "balance_loss_clip": 1.02353024, "balance_loss_mlp": 0.20411548, "epoch": 0.9874041785660604, "flos": 19062210418560.0, "grad_norm": 852.4570303915461, "language_loss": 0.92928463, "learning_rate": 1.656159280223779e-09, "loss": 0.94387215, "num_input_tokens_seen": 354464590, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.23083496, "step": 16423, "time_per_iteration": 2.7133634090423584 }, { "auxiliary_loss_clip": 0.01230667, "auxiliary_loss_mlp": 0.0022557, "balance_loss_clip": 1.01859689, "balance_loss_mlp": 0.20250256, "epoch": 0.9874643018187284, "flos": 21105814803840.0, "grad_norm": 116.86908193710957, "language_loss": 0.7678808, "learning_rate": 1.6403509425122475e-09, "loss": 0.78244317, "num_input_tokens_seen": 354484145, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.23071289, "step": 16424, "time_per_iteration": 2.635361433029175 }, { "auxiliary_loss_clip": 0.01238906, "auxiliary_loss_mlp": 0.00215849, "balance_loss_clip": 1.02709293, "balance_loss_mlp": 0.19062476, "epoch": 0.9875244250713964, "flos": 24426043441920.0, "grad_norm": 2.9256027796776087, "language_loss": 0.87361169, "learning_rate": 1.6246183827990366e-09, "loss": 0.88815922, "num_input_tokens_seen": 354502475, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.25231934, "step": 16425, "time_per_iteration": 2.73378586769104 }, { "auxiliary_loss_clip": 0.01233957, "auxiliary_loss_mlp": 0.002115, "balance_loss_clip": 1.01599669, "balance_loss_mlp": 0.18560734, "epoch": 0.9875845483240643, "flos": 25117610970240.0, "grad_norm": 33.69641255530035, "language_loss": 0.87630737, "learning_rate": 1.6089616016803364e-09, "loss": 0.89076191, "num_input_tokens_seen": 354521855, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.25891113, "step": 16426, "time_per_iteration": 2.694972515106201 }, { "auxiliary_loss_clip": 0.01241599, "auxiliary_loss_mlp": 0.00225397, "balance_loss_clip": 1.0239352, "balance_loss_mlp": 0.19993374, "epoch": 0.9876446715767323, "flos": 16581788737920.0, "grad_norm": 43.298836983496976, "language_loss": 0.95846617, "learning_rate": 1.593380599750338e-09, "loss": 0.97313619, "num_input_tokens_seen": 354539535, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25463867, "step": 16427, "time_per_iteration": 2.683358907699585 }, { "auxiliary_loss_clip": 0.01235894, "auxiliary_loss_mlp": 0.0021388, "balance_loss_clip": 1.02198207, "balance_loss_mlp": 0.18979982, "epoch": 0.9877047948294003, "flos": 21616141282560.0, "grad_norm": 6.014965487043642, "language_loss": 0.76970154, "learning_rate": 1.577875377599458e-09, "loss": 0.7841993, "num_input_tokens_seen": 354557430, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.24060059, "step": 16428, "time_per_iteration": 2.6441667079925537 }, { "auxiliary_loss_clip": 0.01223781, "auxiliary_loss_mlp": 0.001929, "balance_loss_clip": 1.01361728, "balance_loss_mlp": 0.16984487, "epoch": 0.9877649180820682, "flos": 21178497974400.0, "grad_norm": 24.620375465063912, "language_loss": 0.89368927, "learning_rate": 1.5624459358158926e-09, "loss": 0.90785611, "num_input_tokens_seen": 354574735, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.23034668, "step": 16429, "time_per_iteration": 2.699779987335205 }, { "auxiliary_loss_clip": 0.012285, "auxiliary_loss_mlp": 0.00220794, "balance_loss_clip": 1.02004957, "balance_loss_mlp": 0.19732144, "epoch": 0.9878250413347363, "flos": 39749233576320.0, "grad_norm": 188.40185637165885, "language_loss": 0.69735992, "learning_rate": 1.5470922749845073e-09, "loss": 0.71185291, "num_input_tokens_seen": 354597050, "router_z_loss_clip": 2.08691406, "router_z_loss_mlp": 0.23449707, "step": 16430, "time_per_iteration": 2.812028646469116 }, { "auxiliary_loss_clip": 0.01226301, "auxiliary_loss_mlp": 0.00216124, "balance_loss_clip": 1.01383269, "balance_loss_mlp": 0.19165048, "epoch": 0.9878851645874042, "flos": 29425634599680.0, "grad_norm": 47.84433847425038, "language_loss": 0.78748453, "learning_rate": 1.531814395687725e-09, "loss": 0.80190873, "num_input_tokens_seen": 354619095, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.24487305, "step": 16431, "time_per_iteration": 2.7460155487060547 }, { "auxiliary_loss_clip": 0.01254292, "auxiliary_loss_mlp": 0.00220596, "balance_loss_clip": 1.03156209, "balance_loss_mlp": 0.19372663, "epoch": 0.9879452878400722, "flos": 15806261168640.0, "grad_norm": 41.87557923670202, "language_loss": 0.89964139, "learning_rate": 1.5166122985048602e-09, "loss": 0.91439033, "num_input_tokens_seen": 354633790, "router_z_loss_clip": 2.22558594, "router_z_loss_mlp": 0.2689209, "step": 16432, "time_per_iteration": 2.6498472690582275 }, { "auxiliary_loss_clip": 0.01222481, "auxiliary_loss_mlp": 0.00209478, "balance_loss_clip": 1.01651406, "balance_loss_mlp": 0.18668547, "epoch": 0.9880054110927401, "flos": 22233912318720.0, "grad_norm": 52.85548425442688, "language_loss": 0.86500263, "learning_rate": 1.5014859840123405e-09, "loss": 0.87932223, "num_input_tokens_seen": 354653180, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.22790527, "step": 16433, "time_per_iteration": 4.077907085418701 }, { "auxiliary_loss_clip": 0.01234405, "auxiliary_loss_mlp": 0.00212997, "balance_loss_clip": 1.02071023, "balance_loss_mlp": 0.1873197, "epoch": 0.9880655343454081, "flos": 28763836467840.0, "grad_norm": 13.755083144502663, "language_loss": 0.73020351, "learning_rate": 1.4864354527837075e-09, "loss": 0.74467754, "num_input_tokens_seen": 354669900, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.25671387, "step": 16434, "time_per_iteration": 4.135472059249878 }, { "auxiliary_loss_clip": 0.01253088, "auxiliary_loss_mlp": 0.00228419, "balance_loss_clip": 1.02988863, "balance_loss_mlp": 0.20146608, "epoch": 0.988125657598076, "flos": 32853379622400.0, "grad_norm": 13.061179887771765, "language_loss": 0.77151084, "learning_rate": 1.4714607053896154e-09, "loss": 0.78632593, "num_input_tokens_seen": 354693165, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26989746, "step": 16435, "time_per_iteration": 2.813199043273926 }, { "auxiliary_loss_clip": 0.01241254, "auxiliary_loss_mlp": 0.00238254, "balance_loss_clip": 1.0200386, "balance_loss_mlp": 0.20923892, "epoch": 0.988185780850744, "flos": 19390685316480.0, "grad_norm": 152.9427038754238, "language_loss": 0.85283339, "learning_rate": 1.4565617423980548e-09, "loss": 0.8676284, "num_input_tokens_seen": 354711915, "router_z_loss_clip": 2.21386719, "router_z_loss_mlp": 0.29016113, "step": 16436, "time_per_iteration": 2.6134495735168457 }, { "auxiliary_loss_clip": 0.01237217, "auxiliary_loss_mlp": 0.00219173, "balance_loss_clip": 1.02193248, "balance_loss_mlp": 0.19285193, "epoch": 0.988245904103412, "flos": 22528415928960.0, "grad_norm": 28.10490421508015, "language_loss": 0.82421231, "learning_rate": 1.4417385643741286e-09, "loss": 0.83877623, "num_input_tokens_seen": 354729135, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.26318359, "step": 16437, "time_per_iteration": 2.722653388977051 }, { "auxiliary_loss_clip": 0.0123424, "auxiliary_loss_mlp": 0.0022754, "balance_loss_clip": 1.02105427, "balance_loss_mlp": 0.20108792, "epoch": 0.98830602735608, "flos": 28659193171200.0, "grad_norm": 3.697763235972394, "language_loss": 0.67059362, "learning_rate": 1.4269911718796103e-09, "loss": 0.68521142, "num_input_tokens_seen": 354752530, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.26489258, "step": 16438, "time_per_iteration": 2.69144868850708 }, { "auxiliary_loss_clip": 0.01234342, "auxiliary_loss_mlp": 0.00225933, "balance_loss_clip": 1.01971495, "balance_loss_mlp": 0.20163862, "epoch": 0.9883661506087479, "flos": 20996035862400.0, "grad_norm": 236.27611688756699, "language_loss": 0.81839895, "learning_rate": 1.4123195654738295e-09, "loss": 0.83300167, "num_input_tokens_seen": 354771135, "router_z_loss_clip": 2.14941406, "router_z_loss_mlp": 0.24279785, "step": 16439, "time_per_iteration": 2.6881725788116455 }, { "auxiliary_loss_clip": 0.01232111, "auxiliary_loss_mlp": 0.00220299, "balance_loss_clip": 1.01648068, "balance_loss_mlp": 0.19600391, "epoch": 0.9884262738614159, "flos": 32706109860480.0, "grad_norm": 6.311669706479498, "language_loss": 0.68757361, "learning_rate": 1.3977237457134528e-09, "loss": 0.70209765, "num_input_tokens_seen": 354791800, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24291992, "step": 16440, "time_per_iteration": 2.751551866531372 }, { "auxiliary_loss_clip": 0.01232398, "auxiliary_loss_mlp": 0.00222098, "balance_loss_clip": 1.01759052, "balance_loss_mlp": 0.19866177, "epoch": 0.9884863971140839, "flos": 17564699479680.0, "grad_norm": 11.74374714482979, "language_loss": 0.85746014, "learning_rate": 1.3832037131513707e-09, "loss": 0.87200516, "num_input_tokens_seen": 354809200, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.234375, "step": 16441, "time_per_iteration": 2.665485382080078 }, { "auxiliary_loss_clip": 0.01234009, "auxiliary_loss_mlp": 0.00207924, "balance_loss_clip": 1.02320039, "balance_loss_mlp": 0.1841895, "epoch": 0.9885465203667518, "flos": 40552519380480.0, "grad_norm": 14.538058876408806, "language_loss": 0.76861089, "learning_rate": 1.3687594683386982e-09, "loss": 0.78303027, "num_input_tokens_seen": 354829945, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.23754883, "step": 16442, "time_per_iteration": 4.3051393032073975 }, { "auxiliary_loss_clip": 0.01223774, "auxiliary_loss_mlp": 0.00210615, "balance_loss_clip": 1.0099262, "balance_loss_mlp": 0.18615375, "epoch": 0.9886066436194199, "flos": 13807976768640.0, "grad_norm": 47.88662784574514, "language_loss": 0.83856696, "learning_rate": 1.3543910118227753e-09, "loss": 0.85291088, "num_input_tokens_seen": 354845055, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.24462891, "step": 16443, "time_per_iteration": 2.5895583629608154 }, { "auxiliary_loss_clip": 0.01238126, "auxiliary_loss_mlp": 0.00221743, "balance_loss_clip": 1.02330208, "balance_loss_mlp": 0.19686446, "epoch": 0.9886667668720878, "flos": 23325129544320.0, "grad_norm": 45.083914691830614, "language_loss": 0.81172395, "learning_rate": 1.3400983441487213e-09, "loss": 0.82632262, "num_input_tokens_seen": 354864680, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24865723, "step": 16444, "time_per_iteration": 4.11650013923645 }, { "auxiliary_loss_clip": 0.0123481, "auxiliary_loss_mlp": 0.00239805, "balance_loss_clip": 1.02340698, "balance_loss_mlp": 0.21528399, "epoch": 0.9887268901247558, "flos": 22706029704960.0, "grad_norm": 34.73207559201181, "language_loss": 0.74180526, "learning_rate": 1.325881465858547e-09, "loss": 0.75655138, "num_input_tokens_seen": 354885685, "router_z_loss_clip": 2.11035156, "router_z_loss_mlp": 0.24511719, "step": 16445, "time_per_iteration": 2.7107250690460205 }, { "auxiliary_loss_clip": 0.01248213, "auxiliary_loss_mlp": 0.00236603, "balance_loss_clip": 1.02836645, "balance_loss_mlp": 0.21321425, "epoch": 0.9887870133774237, "flos": 13041283944960.0, "grad_norm": 6916.711010310122, "language_loss": 0.70706242, "learning_rate": 1.311740377491155e-09, "loss": 0.7219106, "num_input_tokens_seen": 354901505, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.23388672, "step": 16446, "time_per_iteration": 2.671391725540161 }, { "auxiliary_loss_clip": 0.01227635, "auxiliary_loss_mlp": 0.00212078, "balance_loss_clip": 1.0138849, "balance_loss_mlp": 0.18825966, "epoch": 0.9888471366300917, "flos": 15158864390400.0, "grad_norm": 123.23157119517614, "language_loss": 0.80312574, "learning_rate": 1.297675079582783e-09, "loss": 0.81752288, "num_input_tokens_seen": 354920060, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.23828125, "step": 16447, "time_per_iteration": 2.714468240737915 }, { "auxiliary_loss_clip": 0.01242046, "auxiliary_loss_mlp": 0.00239836, "balance_loss_clip": 1.02512908, "balance_loss_mlp": 0.21380042, "epoch": 0.9889072598827596, "flos": 25118796119040.0, "grad_norm": 38.27350631737183, "language_loss": 0.91745949, "learning_rate": 1.2836855726667818e-09, "loss": 0.93227828, "num_input_tokens_seen": 354938690, "router_z_loss_clip": 2.17089844, "router_z_loss_mlp": 0.26037598, "step": 16448, "time_per_iteration": 2.7000789642333984 }, { "auxiliary_loss_clip": 0.01227777, "auxiliary_loss_mlp": 0.00218446, "balance_loss_clip": 1.01517379, "balance_loss_mlp": 0.19393663, "epoch": 0.9889673831354276, "flos": 16728663450240.0, "grad_norm": 16.606542102246024, "language_loss": 0.77397728, "learning_rate": 1.26977185727406e-09, "loss": 0.78843951, "num_input_tokens_seen": 354956955, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.24511719, "step": 16449, "time_per_iteration": 2.680272102355957 }, { "auxiliary_loss_clip": 0.01250077, "auxiliary_loss_mlp": 0.00223059, "balance_loss_clip": 1.0344398, "balance_loss_mlp": 0.19792983, "epoch": 0.9890275063880956, "flos": 35585175657600.0, "grad_norm": 13.3998368658885, "language_loss": 0.81919342, "learning_rate": 1.25593393393153e-09, "loss": 0.83392477, "num_input_tokens_seen": 354976800, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.25158691, "step": 16450, "time_per_iteration": 2.7828776836395264 }, { "auxiliary_loss_clip": 0.0126546, "auxiliary_loss_mlp": 0.00238025, "balance_loss_clip": 1.03279495, "balance_loss_mlp": 0.21134584, "epoch": 0.9890876296407636, "flos": 18952359649920.0, "grad_norm": 28.28922196961217, "language_loss": 0.86970431, "learning_rate": 1.242171803164549e-09, "loss": 0.88473916, "num_input_tokens_seen": 354996625, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.26696777, "step": 16451, "time_per_iteration": 2.679987668991089 }, { "auxiliary_loss_clip": 0.01244776, "auxiliary_loss_mlp": 0.00222186, "balance_loss_clip": 1.02300406, "balance_loss_mlp": 0.19655649, "epoch": 0.9891477528934315, "flos": 23769309127680.0, "grad_norm": 100.56224707016807, "language_loss": 0.82247907, "learning_rate": 1.2284854654946996e-09, "loss": 0.83714867, "num_input_tokens_seen": 355014535, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.2565918, "step": 16452, "time_per_iteration": 2.7512590885162354 }, { "auxiliary_loss_clip": 0.0122728, "auxiliary_loss_mlp": 0.0021122, "balance_loss_clip": 1.01880217, "balance_loss_mlp": 0.18859437, "epoch": 0.9892078761460995, "flos": 20772922533120.0, "grad_norm": 35.13283438718066, "language_loss": 0.80741739, "learning_rate": 1.2148749214409004e-09, "loss": 0.82180244, "num_input_tokens_seen": 355033280, "router_z_loss_clip": 2.08691406, "router_z_loss_mlp": 0.22631836, "step": 16453, "time_per_iteration": 2.748844623565674 }, { "auxiliary_loss_clip": 0.01241514, "auxiliary_loss_mlp": 0.00212283, "balance_loss_clip": 1.02402282, "balance_loss_mlp": 0.18795279, "epoch": 0.9892679993987675, "flos": 23367827836800.0, "grad_norm": 21.989045780661897, "language_loss": 0.81170386, "learning_rate": 1.2013401715191828e-09, "loss": 0.82624179, "num_input_tokens_seen": 355053320, "router_z_loss_clip": 2.17480469, "router_z_loss_mlp": 0.2434082, "step": 16454, "time_per_iteration": 2.6580522060394287 }, { "auxiliary_loss_clip": 0.01216087, "auxiliary_loss_mlp": 0.002161, "balance_loss_clip": 1.00642431, "balance_loss_mlp": 0.19274732, "epoch": 0.9893281226514354, "flos": 22705419173760.0, "grad_norm": 52.84502920929598, "language_loss": 0.82347512, "learning_rate": 1.1878812162433583e-09, "loss": 0.83779699, "num_input_tokens_seen": 355070230, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.23339844, "step": 16455, "time_per_iteration": 2.6646130084991455 }, { "auxiliary_loss_clip": 0.01212282, "auxiliary_loss_mlp": 0.00209325, "balance_loss_clip": 1.00597405, "balance_loss_mlp": 0.18756916, "epoch": 0.9893882459041035, "flos": 21796664060160.0, "grad_norm": 88.18578060183438, "language_loss": 0.71783686, "learning_rate": 1.1744980561230188e-09, "loss": 0.73205292, "num_input_tokens_seen": 355090125, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.21740723, "step": 16456, "time_per_iteration": 2.7494754791259766 }, { "auxiliary_loss_clip": 0.01247474, "auxiliary_loss_mlp": 0.00221394, "balance_loss_clip": 1.03281999, "balance_loss_mlp": 0.19762415, "epoch": 0.9894483691567714, "flos": 18113773754880.0, "grad_norm": 22.603881533748783, "language_loss": 0.81290811, "learning_rate": 1.161190691666203e-09, "loss": 0.82759678, "num_input_tokens_seen": 355107890, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.23779297, "step": 16457, "time_per_iteration": 2.620739459991455 }, { "auxiliary_loss_clip": 0.01247042, "auxiliary_loss_mlp": 0.00222712, "balance_loss_clip": 1.02409577, "balance_loss_mlp": 0.1970461, "epoch": 0.9895084924094394, "flos": 31211615664000.0, "grad_norm": 6.708450096746502, "language_loss": 0.77029181, "learning_rate": 1.1479591233773954e-09, "loss": 0.78498936, "num_input_tokens_seen": 355126340, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.25683594, "step": 16458, "time_per_iteration": 2.924142837524414 }, { "auxiliary_loss_clip": 0.0123023, "auxiliary_loss_mlp": 0.00227242, "balance_loss_clip": 1.01522422, "balance_loss_mlp": 0.20293482, "epoch": 0.9895686156621073, "flos": 19678042120320.0, "grad_norm": 176.14930563080875, "language_loss": 0.87276745, "learning_rate": 1.1348033517581956e-09, "loss": 0.8873421, "num_input_tokens_seen": 355144025, "router_z_loss_clip": 2.15136719, "router_z_loss_mlp": 0.24316406, "step": 16459, "time_per_iteration": 2.717639923095703 }, { "auxiliary_loss_clip": 0.01234877, "auxiliary_loss_mlp": 0.00250801, "balance_loss_clip": 1.01977384, "balance_loss_mlp": 0.22319236, "epoch": 0.9896287389147753, "flos": 23581675457280.0, "grad_norm": 3.3913475190075184, "language_loss": 0.82665551, "learning_rate": 1.1217233773075373e-09, "loss": 0.84151232, "num_input_tokens_seen": 355163125, "router_z_loss_clip": 2.14941406, "router_z_loss_mlp": 0.27600098, "step": 16460, "time_per_iteration": 2.8002874851226807 }, { "auxiliary_loss_clip": 0.0122507, "auxiliary_loss_mlp": 0.00216442, "balance_loss_clip": 1.01386929, "balance_loss_mlp": 0.19211158, "epoch": 0.9896888621674432, "flos": 29605331364480.0, "grad_norm": 8.137511663840248, "language_loss": 0.94816637, "learning_rate": 1.1087192005214685e-09, "loss": 0.96258152, "num_input_tokens_seen": 355184060, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.24316406, "step": 16461, "time_per_iteration": 2.7109920978546143 }, { "auxiliary_loss_clip": 0.01237076, "auxiliary_loss_mlp": 0.00231767, "balance_loss_clip": 1.02278352, "balance_loss_mlp": 0.20537391, "epoch": 0.9897489854201112, "flos": 23695045758720.0, "grad_norm": 2.3857857408952934, "language_loss": 0.71074742, "learning_rate": 1.09579082189315e-09, "loss": 0.72543585, "num_input_tokens_seen": 355204505, "router_z_loss_clip": 2.14550781, "router_z_loss_mlp": 0.26367188, "step": 16462, "time_per_iteration": 2.6969478130340576 }, { "auxiliary_loss_clip": 0.01223015, "auxiliary_loss_mlp": 0.00219424, "balance_loss_clip": 1.0134325, "balance_loss_mlp": 0.19480726, "epoch": 0.9898091086727792, "flos": 13225146687360.0, "grad_norm": 88.10375310957765, "language_loss": 0.82813895, "learning_rate": 1.0829382419126343e-09, "loss": 0.84256327, "num_input_tokens_seen": 355223055, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.24597168, "step": 16463, "time_per_iteration": 2.6140565872192383 }, { "auxiliary_loss_clip": 0.01232138, "auxiliary_loss_mlp": 0.00225734, "balance_loss_clip": 1.01764536, "balance_loss_mlp": 0.20029427, "epoch": 0.9898692319254472, "flos": 22930400010240.0, "grad_norm": 6.927904185855233, "language_loss": 0.79052961, "learning_rate": 1.0701614610675314e-09, "loss": 0.80510837, "num_input_tokens_seen": 355242000, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.25463867, "step": 16464, "time_per_iteration": 2.697918653488159 }, { "auxiliary_loss_clip": 0.01250219, "auxiliary_loss_mlp": 0.00205469, "balance_loss_clip": 1.02887106, "balance_loss_mlp": 0.18060169, "epoch": 0.9899293551781151, "flos": 12458346122880.0, "grad_norm": 13.578977714351863, "language_loss": 0.84849107, "learning_rate": 1.0574604798421204e-09, "loss": 0.8630479, "num_input_tokens_seen": 355260175, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.2487793, "step": 16465, "time_per_iteration": 2.6960396766662598 }, { "auxiliary_loss_clip": 0.01226297, "auxiliary_loss_mlp": 0.00224391, "balance_loss_clip": 1.01549768, "balance_loss_mlp": 0.20104951, "epoch": 0.9899894784307831, "flos": 26871129118080.0, "grad_norm": 8.72001707126764, "language_loss": 0.92993367, "learning_rate": 1.0448352987182386e-09, "loss": 0.94444048, "num_input_tokens_seen": 355281930, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.23339844, "step": 16466, "time_per_iteration": 2.6999242305755615 }, { "auxiliary_loss_clip": 0.01254704, "auxiliary_loss_mlp": 0.00228398, "balance_loss_clip": 1.03706193, "balance_loss_mlp": 0.20243391, "epoch": 0.990049601683451, "flos": 21542093395200.0, "grad_norm": 18.53942164226143, "language_loss": 0.80671507, "learning_rate": 1.0322859181743915e-09, "loss": 0.82154608, "num_input_tokens_seen": 355301555, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.25964355, "step": 16467, "time_per_iteration": 2.7069664001464844 }, { "auxiliary_loss_clip": 0.01225765, "auxiliary_loss_mlp": 0.00231475, "balance_loss_clip": 1.01404011, "balance_loss_mlp": 0.20751387, "epoch": 0.990109724936119, "flos": 28771809287040.0, "grad_norm": 137.13526532170113, "language_loss": 0.70592195, "learning_rate": 1.019812338686643e-09, "loss": 0.72049439, "num_input_tokens_seen": 355324925, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.23962402, "step": 16468, "time_per_iteration": 2.766195058822632 }, { "auxiliary_loss_clip": 0.01234179, "auxiliary_loss_mlp": 0.00219026, "balance_loss_clip": 1.01648903, "balance_loss_mlp": 0.19411097, "epoch": 0.9901698481887871, "flos": 29274270687360.0, "grad_norm": 41.8644551709993, "language_loss": 0.68848401, "learning_rate": 1.0074145607281704e-09, "loss": 0.70301604, "num_input_tokens_seen": 355343875, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.24890137, "step": 16469, "time_per_iteration": 2.823916435241699 }, { "auxiliary_loss_clip": 0.01237172, "auxiliary_loss_mlp": 0.00212871, "balance_loss_clip": 1.02054024, "balance_loss_mlp": 0.18907705, "epoch": 0.990229971441455, "flos": 15959025711360.0, "grad_norm": 2563.997388038879, "language_loss": 0.83910179, "learning_rate": 9.950925847685976e-10, "loss": 0.85360223, "num_input_tokens_seen": 355358835, "router_z_loss_clip": 2.16308594, "router_z_loss_mlp": 0.2376709, "step": 16470, "time_per_iteration": 2.6320197582244873 }, { "auxiliary_loss_clip": 0.01088751, "auxiliary_loss_mlp": 0.00106489, "balance_loss_clip": 0.95048898, "balance_loss_mlp": 0.09924134, "epoch": 0.990290094694123, "flos": 69780287911680.0, "grad_norm": 0.6575699287367043, "language_loss": 0.55110282, "learning_rate": 9.828464112755509e-10, "loss": 0.56305522, "num_input_tokens_seen": 355431225, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.07226562, "step": 16471, "time_per_iteration": 3.362274169921875 }, { "auxiliary_loss_clip": 0.01236642, "auxiliary_loss_mlp": 0.00237883, "balance_loss_clip": 1.01890516, "balance_loss_mlp": 0.21152577, "epoch": 0.9903502179467909, "flos": 16252451913600.0, "grad_norm": 36.594762281984714, "language_loss": 0.9517501, "learning_rate": 9.706760407131032e-10, "loss": 0.96649528, "num_input_tokens_seen": 355448250, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.26367188, "step": 16472, "time_per_iteration": 2.722848892211914 }, { "auxiliary_loss_clip": 0.01254624, "auxiliary_loss_mlp": 0.0022028, "balance_loss_clip": 1.03299952, "balance_loss_mlp": 0.19619986, "epoch": 0.9904103411994589, "flos": 21688393489920.0, "grad_norm": 63.37112388152848, "language_loss": 0.9490642, "learning_rate": 9.585814735431075e-10, "loss": 0.96381319, "num_input_tokens_seen": 355467040, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.2409668, "step": 16473, "time_per_iteration": 2.689098834991455 }, { "auxiliary_loss_clip": 0.01223879, "auxiliary_loss_mlp": 0.0020678, "balance_loss_clip": 1.01474881, "balance_loss_mlp": 0.18459526, "epoch": 0.9904704644521268, "flos": 25739440243200.0, "grad_norm": 12.987051515433265, "language_loss": 0.91381192, "learning_rate": 9.465627102240859e-10, "loss": 0.92811853, "num_input_tokens_seen": 355487825, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.22192383, "step": 16474, "time_per_iteration": 2.7350127696990967 }, { "auxiliary_loss_clip": 0.01216367, "auxiliary_loss_mlp": 0.0020451, "balance_loss_clip": 1.00859213, "balance_loss_mlp": 0.18194371, "epoch": 0.9905305877047949, "flos": 21908346422400.0, "grad_norm": 24.81956710580109, "language_loss": 0.83664644, "learning_rate": 9.346197512116738e-10, "loss": 0.85085523, "num_input_tokens_seen": 355507445, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.22558594, "step": 16475, "time_per_iteration": 4.0634543895721436 }, { "auxiliary_loss_clip": 0.01231604, "auxiliary_loss_mlp": 0.00227771, "balance_loss_clip": 1.01784658, "balance_loss_mlp": 0.20282093, "epoch": 0.9905907109574628, "flos": 21392417422080.0, "grad_norm": 18.237719639998318, "language_loss": 0.8119579, "learning_rate": 9.227525969588423e-10, "loss": 0.82655168, "num_input_tokens_seen": 355527205, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.24975586, "step": 16476, "time_per_iteration": 4.107767820358276 }, { "auxiliary_loss_clip": 0.01260448, "auxiliary_loss_mlp": 0.0022137, "balance_loss_clip": 1.03243864, "balance_loss_mlp": 0.19430988, "epoch": 0.9906508342101308, "flos": 20521620005760.0, "grad_norm": 10.135148583812596, "language_loss": 0.77246821, "learning_rate": 9.109612479154538e-10, "loss": 0.7872864, "num_input_tokens_seen": 355544740, "router_z_loss_clip": 2.27636719, "router_z_loss_mlp": 0.27050781, "step": 16477, "time_per_iteration": 2.6859357357025146 }, { "auxiliary_loss_clip": 0.01249481, "auxiliary_loss_mlp": 0.00246665, "balance_loss_clip": 1.02520919, "balance_loss_mlp": 0.21960446, "epoch": 0.9907109574627987, "flos": 21361211481600.0, "grad_norm": 44.224488453388396, "language_loss": 0.8175329, "learning_rate": 8.992457045289282e-10, "loss": 0.83249438, "num_input_tokens_seen": 355564385, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.27075195, "step": 16478, "time_per_iteration": 2.6302011013031006 }, { "auxiliary_loss_clip": 0.01236374, "auxiliary_loss_mlp": 0.00209432, "balance_loss_clip": 1.02226639, "balance_loss_mlp": 0.18594778, "epoch": 0.9907710807154667, "flos": 17338605321600.0, "grad_norm": 86.82168390300431, "language_loss": 0.9258827, "learning_rate": 8.876059672433545e-10, "loss": 0.94034076, "num_input_tokens_seen": 355579260, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.23486328, "step": 16479, "time_per_iteration": 2.672309160232544 }, { "auxiliary_loss_clip": 0.01243947, "auxiliary_loss_mlp": 0.00222094, "balance_loss_clip": 1.02821696, "balance_loss_mlp": 0.19834751, "epoch": 0.9908312039681346, "flos": 28621881918720.0, "grad_norm": 68.05892358093139, "language_loss": 0.74830759, "learning_rate": 8.760420364999355e-10, "loss": 0.76296806, "num_input_tokens_seen": 355599790, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.23754883, "step": 16480, "time_per_iteration": 2.7031054496765137 }, { "auxiliary_loss_clip": 0.01222342, "auxiliary_loss_mlp": 0.00233177, "balance_loss_clip": 1.01574147, "balance_loss_mlp": 0.20975247, "epoch": 0.9908913272208026, "flos": 35770654512000.0, "grad_norm": 6.3083010710251175, "language_loss": 0.79272914, "learning_rate": 8.645539127374313e-10, "loss": 0.80728436, "num_input_tokens_seen": 355620925, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.234375, "step": 16481, "time_per_iteration": 2.8135292530059814 }, { "auxiliary_loss_clip": 0.01232996, "auxiliary_loss_mlp": 0.00206936, "balance_loss_clip": 1.0225544, "balance_loss_mlp": 0.18308258, "epoch": 0.9909514504734707, "flos": 19902196944000.0, "grad_norm": 51.62188905973563, "language_loss": 0.86403012, "learning_rate": 8.531415963912713e-10, "loss": 0.87842953, "num_input_tokens_seen": 355639165, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.23840332, "step": 16482, "time_per_iteration": 2.6286721229553223 }, { "auxiliary_loss_clip": 0.01235073, "auxiliary_loss_mlp": 0.00202759, "balance_loss_clip": 1.01591098, "balance_loss_mlp": 0.17767732, "epoch": 0.9910115737261386, "flos": 20004793165440.0, "grad_norm": 6.24750070624695, "language_loss": 0.83844936, "learning_rate": 8.418050878944427e-10, "loss": 0.85282767, "num_input_tokens_seen": 355657320, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25073242, "step": 16483, "time_per_iteration": 2.671978712081909 }, { "auxiliary_loss_clip": 0.010865, "auxiliary_loss_mlp": 0.00122258, "balance_loss_clip": 0.94951642, "balance_loss_mlp": 0.1139606, "epoch": 0.9910716969788066, "flos": 70688432494080.0, "grad_norm": 0.702064029248106, "language_loss": 0.5328595, "learning_rate": 8.305443876768237e-10, "loss": 0.54494703, "num_input_tokens_seen": 355726370, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08300781, "step": 16484, "time_per_iteration": 4.759593725204468 }, { "auxiliary_loss_clip": 0.01216224, "auxiliary_loss_mlp": 0.0018326, "balance_loss_clip": 1.00791478, "balance_loss_mlp": 0.16034836, "epoch": 0.9911318202314745, "flos": 21434038306560.0, "grad_norm": 2.9789052154267885, "language_loss": 0.88739002, "learning_rate": 8.19359496165184e-10, "loss": 0.90138489, "num_input_tokens_seen": 355745840, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.22912598, "step": 16485, "time_per_iteration": 2.6591391563415527 }, { "auxiliary_loss_clip": 0.012157, "auxiliary_loss_mlp": 0.00208065, "balance_loss_clip": 1.00800359, "balance_loss_mlp": 0.18378167, "epoch": 0.9911919434841425, "flos": 19826820253440.0, "grad_norm": 1939.1946951997734, "language_loss": 0.8825385, "learning_rate": 8.082504137836288e-10, "loss": 0.89677614, "num_input_tokens_seen": 355763385, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.24291992, "step": 16486, "time_per_iteration": 4.165836572647095 }, { "auxiliary_loss_clip": 0.01253933, "auxiliary_loss_mlp": 0.00222119, "balance_loss_clip": 1.03351092, "balance_loss_mlp": 0.19865896, "epoch": 0.9912520667368104, "flos": 41719364691840.0, "grad_norm": 56.743499163267835, "language_loss": 0.727633, "learning_rate": 7.972171409538209e-10, "loss": 0.74239355, "num_input_tokens_seen": 355786075, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.234375, "step": 16487, "time_per_iteration": 2.8944039344787598 }, { "auxiliary_loss_clip": 0.01224933, "auxiliary_loss_mlp": 0.0022268, "balance_loss_clip": 1.01569772, "balance_loss_mlp": 0.19948146, "epoch": 0.9913121899894785, "flos": 23769668263680.0, "grad_norm": 6.567665056829688, "language_loss": 0.85840666, "learning_rate": 7.862596780936481e-10, "loss": 0.87288284, "num_input_tokens_seen": 355806295, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.23217773, "step": 16488, "time_per_iteration": 2.781287670135498 }, { "auxiliary_loss_clip": 0.01270747, "auxiliary_loss_mlp": 0.0023709, "balance_loss_clip": 1.04078507, "balance_loss_mlp": 0.21105474, "epoch": 0.9913723132421464, "flos": 23769668263680.0, "grad_norm": 51.27256282801621, "language_loss": 0.7835114, "learning_rate": 7.753780256190001e-10, "loss": 0.79858977, "num_input_tokens_seen": 355825730, "router_z_loss_clip": 2.29980469, "router_z_loss_mlp": 0.26037598, "step": 16489, "time_per_iteration": 2.789767265319824 }, { "auxiliary_loss_clip": 0.01088402, "auxiliary_loss_mlp": 0.00108, "balance_loss_clip": 0.94971979, "balance_loss_mlp": 0.1006094, "epoch": 0.9914324364948144, "flos": 71267419820160.0, "grad_norm": 0.5997531979115908, "language_loss": 0.51975977, "learning_rate": 7.645721839424357e-10, "loss": 0.53172386, "num_input_tokens_seen": 355891545, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.07373047, "step": 16490, "time_per_iteration": 3.305001974105835 }, { "auxiliary_loss_clip": 0.01262552, "auxiliary_loss_mlp": 0.00267257, "balance_loss_clip": 1.03977883, "balance_loss_mlp": 0.24000612, "epoch": 0.9914925597474823, "flos": 23695440808320.0, "grad_norm": 2.345374789350484, "language_loss": 0.81744695, "learning_rate": 7.538421534734052e-10, "loss": 0.83274496, "num_input_tokens_seen": 355909920, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.27233887, "step": 16491, "time_per_iteration": 2.6855058670043945 }, { "auxiliary_loss_clip": 0.01276683, "auxiliary_loss_mlp": 0.00233155, "balance_loss_clip": 1.04867268, "balance_loss_mlp": 0.20558232, "epoch": 0.9915526830001503, "flos": 13433822749440.0, "grad_norm": 39.87257292475765, "language_loss": 0.80002749, "learning_rate": 7.431879346191383e-10, "loss": 0.81512582, "num_input_tokens_seen": 355923130, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.27563477, "step": 16492, "time_per_iteration": 2.6651148796081543 }, { "auxiliary_loss_clip": 0.01223799, "auxiliary_loss_mlp": 0.00238253, "balance_loss_clip": 1.0115298, "balance_loss_mlp": 0.2121934, "epoch": 0.9916128062528182, "flos": 20740962407040.0, "grad_norm": 11.344732675848794, "language_loss": 0.77369201, "learning_rate": 7.326095277837563e-10, "loss": 0.78831255, "num_input_tokens_seen": 355941960, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.26098633, "step": 16493, "time_per_iteration": 2.6620824337005615 }, { "auxiliary_loss_clip": 0.01247373, "auxiliary_loss_mlp": 0.0021201, "balance_loss_clip": 1.02881646, "balance_loss_mlp": 0.18661799, "epoch": 0.9916729295054862, "flos": 22487082353280.0, "grad_norm": 30.812334382461028, "language_loss": 0.80193341, "learning_rate": 7.221069333678276e-10, "loss": 0.81652725, "num_input_tokens_seen": 355961640, "router_z_loss_clip": 2.18652344, "router_z_loss_mlp": 0.25402832, "step": 16494, "time_per_iteration": 2.742496967315674 }, { "auxiliary_loss_clip": 0.01248836, "auxiliary_loss_mlp": 0.00248078, "balance_loss_clip": 1.02494371, "balance_loss_mlp": 0.2219232, "epoch": 0.9917330527581543, "flos": 14792467708800.0, "grad_norm": 26.481006820147865, "language_loss": 0.7633431, "learning_rate": 7.116801517701443e-10, "loss": 0.77831221, "num_input_tokens_seen": 355977980, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26123047, "step": 16495, "time_per_iteration": 2.649563789367676 }, { "auxiliary_loss_clip": 0.01088736, "auxiliary_loss_mlp": 0.0010933, "balance_loss_clip": 0.95109546, "balance_loss_mlp": 0.10165288, "epoch": 0.9917931760108222, "flos": 59191595585280.0, "grad_norm": 0.6927210930794526, "language_loss": 0.52838516, "learning_rate": 7.013291833859458e-10, "loss": 0.54036587, "num_input_tokens_seen": 356042900, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.07666016, "step": 16496, "time_per_iteration": 3.286686420440674 }, { "auxiliary_loss_clip": 0.01243679, "auxiliary_loss_mlp": 0.00235548, "balance_loss_clip": 1.02640569, "balance_loss_mlp": 0.20933348, "epoch": 0.9918532992634902, "flos": 26761637485440.0, "grad_norm": 451.09802006460455, "language_loss": 0.79210699, "learning_rate": 6.91054028607585e-10, "loss": 0.80689925, "num_input_tokens_seen": 356063000, "router_z_loss_clip": 2.17089844, "router_z_loss_mlp": 0.2623291, "step": 16497, "time_per_iteration": 2.7016396522521973 }, { "auxiliary_loss_clip": 0.0128441, "auxiliary_loss_mlp": 0.00229553, "balance_loss_clip": 1.05094624, "balance_loss_mlp": 0.20237315, "epoch": 0.9919134225161581, "flos": 14975719920000.0, "grad_norm": 9.372951219711378, "language_loss": 0.92390704, "learning_rate": 6.808546878249721e-10, "loss": 0.93904674, "num_input_tokens_seen": 356078130, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.27185059, "step": 16498, "time_per_iteration": 2.765265703201294 }, { "auxiliary_loss_clip": 0.01250437, "auxiliary_loss_mlp": 0.00251857, "balance_loss_clip": 1.03353095, "balance_loss_mlp": 0.22532137, "epoch": 0.9919735457688261, "flos": 27818201064960.0, "grad_norm": 43.68345690115969, "language_loss": 0.74326146, "learning_rate": 6.707311614246869e-10, "loss": 0.75828439, "num_input_tokens_seen": 356101655, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.26538086, "step": 16499, "time_per_iteration": 2.740226984024048 }, { "auxiliary_loss_clip": 0.01245327, "auxiliary_loss_mlp": 0.0023696, "balance_loss_clip": 1.02677989, "balance_loss_mlp": 0.21135369, "epoch": 0.992033669021494, "flos": 22562782266240.0, "grad_norm": 1082.2475868752913, "language_loss": 0.89712691, "learning_rate": 6.606834497904223e-10, "loss": 0.91194975, "num_input_tokens_seen": 356121425, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.25610352, "step": 16500, "time_per_iteration": 2.649876117706299 }, { "auxiliary_loss_clip": 0.01225527, "auxiliary_loss_mlp": 0.0022061, "balance_loss_clip": 1.01183832, "balance_loss_mlp": 0.19699436, "epoch": 0.9920937922741621, "flos": 25374587846400.0, "grad_norm": 14.609997346629363, "language_loss": 0.91276085, "learning_rate": 6.507115533036511e-10, "loss": 0.92722219, "num_input_tokens_seen": 356140710, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.23596191, "step": 16501, "time_per_iteration": 2.700427293777466 }, { "auxiliary_loss_clip": 0.01238807, "auxiliary_loss_mlp": 0.00216541, "balance_loss_clip": 1.02098107, "balance_loss_mlp": 0.1937838, "epoch": 0.99215391552683, "flos": 22054466949120.0, "grad_norm": 19.84579360401703, "language_loss": 0.85513222, "learning_rate": 6.408154723420711e-10, "loss": 0.86968565, "num_input_tokens_seen": 356159835, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.22766113, "step": 16502, "time_per_iteration": 2.663682222366333 }, { "auxiliary_loss_clip": 0.01252836, "auxiliary_loss_mlp": 0.00223564, "balance_loss_clip": 1.03765345, "balance_loss_mlp": 0.1992806, "epoch": 0.992214038779498, "flos": 15413937845760.0, "grad_norm": 195.8174558279032, "language_loss": 0.83684552, "learning_rate": 6.309952072811597e-10, "loss": 0.85160953, "num_input_tokens_seen": 356177555, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24291992, "step": 16503, "time_per_iteration": 2.724957227706909 }, { "auxiliary_loss_clip": 0.01085114, "auxiliary_loss_mlp": 0.00071305, "balance_loss_clip": 0.94821107, "balance_loss_mlp": 0.06448591, "epoch": 0.9922741620321659, "flos": 62014498467840.0, "grad_norm": 0.6367992032610935, "language_loss": 0.54621208, "learning_rate": 6.212507584932858e-10, "loss": 0.55777633, "num_input_tokens_seen": 356244975, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06835938, "step": 16504, "time_per_iteration": 3.2451038360595703 }, { "auxiliary_loss_clip": 0.01233876, "auxiliary_loss_mlp": 0.00225205, "balance_loss_clip": 1.01755095, "balance_loss_mlp": 0.19955149, "epoch": 0.9923342852848339, "flos": 17165480745600.0, "grad_norm": 4.3926460890777985, "language_loss": 0.77783507, "learning_rate": 6.115821263481536e-10, "loss": 0.79242587, "num_input_tokens_seen": 356262605, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.2565918, "step": 16505, "time_per_iteration": 2.863426446914673 }, { "auxiliary_loss_clip": 0.01258116, "auxiliary_loss_mlp": 0.00244128, "balance_loss_clip": 1.03209043, "balance_loss_mlp": 0.21533938, "epoch": 0.9923944085375018, "flos": 23183210908800.0, "grad_norm": 10.568142206761422, "language_loss": 0.75122857, "learning_rate": 6.019893112119146e-10, "loss": 0.76625097, "num_input_tokens_seen": 356278935, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.28771973, "step": 16506, "time_per_iteration": 2.701571464538574 }, { "auxiliary_loss_clip": 0.01229873, "auxiliary_loss_mlp": 0.00208499, "balance_loss_clip": 1.0131371, "balance_loss_mlp": 0.18402497, "epoch": 0.9924545317901698, "flos": 20813861059200.0, "grad_norm": 69.61601861732785, "language_loss": 0.71377832, "learning_rate": 5.924723134487219e-10, "loss": 0.72816205, "num_input_tokens_seen": 356295675, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24450684, "step": 16507, "time_per_iteration": 2.669567108154297 }, { "auxiliary_loss_clip": 0.01242046, "auxiliary_loss_mlp": 0.00232625, "balance_loss_clip": 1.02399099, "balance_loss_mlp": 0.20630321, "epoch": 0.9925146550428379, "flos": 20083437993600.0, "grad_norm": 6.307447943007849, "language_loss": 0.81269348, "learning_rate": 5.830311334193983e-10, "loss": 0.82744014, "num_input_tokens_seen": 356312885, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.26306152, "step": 16508, "time_per_iteration": 2.677767276763916 }, { "auxiliary_loss_clip": 0.01235445, "auxiliary_loss_mlp": 0.00227328, "balance_loss_clip": 1.02018547, "balance_loss_mlp": 0.20314057, "epoch": 0.9925747782955058, "flos": 24973717086720.0, "grad_norm": 23.922033594928056, "language_loss": 0.75346953, "learning_rate": 5.736657714818793e-10, "loss": 0.76809728, "num_input_tokens_seen": 356334070, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24206543, "step": 16509, "time_per_iteration": 2.688413143157959 }, { "auxiliary_loss_clip": 0.01238567, "auxiliary_loss_mlp": 0.00244894, "balance_loss_clip": 1.02387702, "balance_loss_mlp": 0.21850082, "epoch": 0.9926349015481738, "flos": 60472526492160.0, "grad_norm": 25.99686019039775, "language_loss": 0.77343249, "learning_rate": 5.643762279912146e-10, "loss": 0.78826702, "num_input_tokens_seen": 356359410, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.26391602, "step": 16510, "time_per_iteration": 3.0585341453552246 }, { "auxiliary_loss_clip": 0.01271799, "auxiliary_loss_mlp": 0.00261881, "balance_loss_clip": 1.04779255, "balance_loss_mlp": 0.23445086, "epoch": 0.9926950248008417, "flos": 20741716592640.0, "grad_norm": 14.19674017905556, "language_loss": 0.91134918, "learning_rate": 5.551625032997886e-10, "loss": 0.92668605, "num_input_tokens_seen": 356378345, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.2746582, "step": 16511, "time_per_iteration": 2.6557037830352783 }, { "auxiliary_loss_clip": 0.01225801, "auxiliary_loss_mlp": 0.00243388, "balance_loss_clip": 1.01189685, "balance_loss_mlp": 0.21929535, "epoch": 0.9927551480535097, "flos": 24352965221760.0, "grad_norm": 21.78224778428894, "language_loss": 0.98380369, "learning_rate": 5.460245977570998e-10, "loss": 0.99849558, "num_input_tokens_seen": 356397345, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24084473, "step": 16512, "time_per_iteration": 2.7030177116394043 }, { "auxiliary_loss_clip": 0.01089314, "auxiliary_loss_mlp": 0.00078893, "balance_loss_clip": 0.95154464, "balance_loss_mlp": 0.07207419, "epoch": 0.9928152713061776, "flos": 71275572207360.0, "grad_norm": 0.665306866476308, "language_loss": 0.54098523, "learning_rate": 5.369625117095378e-10, "loss": 0.55266726, "num_input_tokens_seen": 356459160, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06835938, "step": 16513, "time_per_iteration": 3.246744155883789 }, { "auxiliary_loss_clip": 0.012396, "auxiliary_loss_mlp": 0.00220273, "balance_loss_clip": 1.02449322, "balance_loss_mlp": 0.1963481, "epoch": 0.9928753945588457, "flos": 57809499045120.0, "grad_norm": 15.610215247421506, "language_loss": 0.71279252, "learning_rate": 5.279762455006054e-10, "loss": 0.7273913, "num_input_tokens_seen": 356486405, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.23937988, "step": 16514, "time_per_iteration": 3.0181102752685547 }, { "auxiliary_loss_clip": 0.01250774, "auxiliary_loss_mlp": 0.00232304, "balance_loss_clip": 1.02942741, "balance_loss_mlp": 0.20631675, "epoch": 0.9929355178115136, "flos": 19568981450880.0, "grad_norm": 32.56298812599633, "language_loss": 0.82532811, "learning_rate": 5.190657994713632e-10, "loss": 0.84015894, "num_input_tokens_seen": 356502905, "router_z_loss_clip": 2.21386719, "router_z_loss_mlp": 0.25964355, "step": 16515, "time_per_iteration": 2.688469648361206 }, { "auxiliary_loss_clip": 0.01238723, "auxiliary_loss_mlp": 0.0020308, "balance_loss_clip": 1.02326679, "balance_loss_mlp": 0.1787971, "epoch": 0.9929956410641816, "flos": 22964658606720.0, "grad_norm": 5.067708829560748, "language_loss": 0.83268368, "learning_rate": 5.102311739593191e-10, "loss": 0.84710175, "num_input_tokens_seen": 356523830, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.24255371, "step": 16516, "time_per_iteration": 2.6610054969787598 }, { "auxiliary_loss_clip": 0.01236034, "auxiliary_loss_mlp": 0.00221713, "balance_loss_clip": 1.02134478, "balance_loss_mlp": 0.19725154, "epoch": 0.9930557643168495, "flos": 22566409539840.0, "grad_norm": 15.255650842330422, "language_loss": 0.83660555, "learning_rate": 5.014723692997602e-10, "loss": 0.851183, "num_input_tokens_seen": 356543965, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24450684, "step": 16517, "time_per_iteration": 4.103203058242798 }, { "auxiliary_loss_clip": 0.01252495, "auxiliary_loss_mlp": 0.00230877, "balance_loss_clip": 1.03038847, "balance_loss_mlp": 0.20306534, "epoch": 0.9931158875695175, "flos": 17201032231680.0, "grad_norm": 11.76157041536115, "language_loss": 0.78712457, "learning_rate": 4.927893858248655e-10, "loss": 0.80195826, "num_input_tokens_seen": 356561530, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.27807617, "step": 16518, "time_per_iteration": 2.6290438175201416 }, { "auxiliary_loss_clip": 0.01084361, "auxiliary_loss_mlp": 0.00065596, "balance_loss_clip": 0.94700444, "balance_loss_mlp": 0.05920672, "epoch": 0.9931760108221854, "flos": 63711204278400.0, "grad_norm": 0.7375659773293618, "language_loss": 0.52797472, "learning_rate": 4.84182223863483e-10, "loss": 0.53947437, "num_input_tokens_seen": 356616845, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06396484, "step": 16519, "time_per_iteration": 4.490699291229248 }, { "auxiliary_loss_clip": 0.01247899, "auxiliary_loss_mlp": 0.00227513, "balance_loss_clip": 1.0298419, "balance_loss_mlp": 0.20201382, "epoch": 0.9932361340748534, "flos": 15304805349120.0, "grad_norm": 3.346449943622986, "language_loss": 0.6741367, "learning_rate": 4.756508837426842e-10, "loss": 0.68889081, "num_input_tokens_seen": 356633560, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.25524902, "step": 16520, "time_per_iteration": 2.7774269580841064 }, { "auxiliary_loss_clip": 0.01236239, "auxiliary_loss_mlp": 0.00227908, "balance_loss_clip": 1.02196121, "balance_loss_mlp": 0.20313603, "epoch": 0.9932962573275215, "flos": 36064906727040.0, "grad_norm": 3.235038121903276, "language_loss": 0.70467424, "learning_rate": 4.671953657853223e-10, "loss": 0.71931571, "num_input_tokens_seen": 356657600, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.24780273, "step": 16521, "time_per_iteration": 2.8796026706695557 }, { "auxiliary_loss_clip": 0.01251033, "auxiliary_loss_mlp": 0.00222094, "balance_loss_clip": 1.03157949, "balance_loss_mlp": 0.19427007, "epoch": 0.9933563805801894, "flos": 21470523546240.0, "grad_norm": 268.6881163513795, "language_loss": 0.81401491, "learning_rate": 4.5881567031225145e-10, "loss": 0.8287462, "num_input_tokens_seen": 356675880, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.27832031, "step": 16522, "time_per_iteration": 2.63860821723938 }, { "auxiliary_loss_clip": 0.01217505, "auxiliary_loss_mlp": 0.00194987, "balance_loss_clip": 1.00954056, "balance_loss_mlp": 0.1734817, "epoch": 0.9934165038328574, "flos": 23986532626560.0, "grad_norm": 17.733554998542473, "language_loss": 0.78994232, "learning_rate": 4.5051179764143964e-10, "loss": 0.80406719, "num_input_tokens_seen": 356696000, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.21508789, "step": 16523, "time_per_iteration": 2.659036159515381 }, { "auxiliary_loss_clip": 0.01250369, "auxiliary_loss_mlp": 0.00228858, "balance_loss_clip": 1.03021049, "balance_loss_mlp": 0.20313291, "epoch": 0.9934766270855253, "flos": 21907807718400.0, "grad_norm": 413.7155240636684, "language_loss": 0.78849387, "learning_rate": 4.422837480875241e-10, "loss": 0.80328614, "num_input_tokens_seen": 356716845, "router_z_loss_clip": 2.20019531, "router_z_loss_mlp": 0.25732422, "step": 16524, "time_per_iteration": 2.661891460418701 }, { "auxiliary_loss_clip": 0.01241181, "auxiliary_loss_mlp": 0.00230202, "balance_loss_clip": 1.02653074, "balance_loss_mlp": 0.20600277, "epoch": 0.9935367503381933, "flos": 17129139160320.0, "grad_norm": 9.089336059557317, "language_loss": 0.87728471, "learning_rate": 4.341315219624775e-10, "loss": 0.89199853, "num_input_tokens_seen": 356732100, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.24182129, "step": 16525, "time_per_iteration": 2.655880928039551 }, { "auxiliary_loss_clip": 0.01229739, "auxiliary_loss_mlp": 0.00232114, "balance_loss_clip": 1.01518941, "balance_loss_mlp": 0.20957118, "epoch": 0.9935968735908612, "flos": 22346241125760.0, "grad_norm": 46.41809797221321, "language_loss": 0.84127688, "learning_rate": 4.2605511957582995e-10, "loss": 0.85589552, "num_input_tokens_seen": 356751480, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.22509766, "step": 16526, "time_per_iteration": 2.6889169216156006 }, { "auxiliary_loss_clip": 0.01212159, "auxiliary_loss_mlp": 0.00199854, "balance_loss_clip": 1.00658011, "balance_loss_mlp": 0.17714453, "epoch": 0.9936569968435293, "flos": 29460539640960.0, "grad_norm": 4.0628877623638875, "language_loss": 0.78654516, "learning_rate": 4.180545412333369e-10, "loss": 0.80066538, "num_input_tokens_seen": 356772650, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.22717285, "step": 16527, "time_per_iteration": 4.133192777633667 }, { "auxiliary_loss_clip": 0.01234067, "auxiliary_loss_mlp": 0.00218054, "balance_loss_clip": 1.01647925, "balance_loss_mlp": 0.19481966, "epoch": 0.9937171200961972, "flos": 16544046522240.0, "grad_norm": 21.51108404834354, "language_loss": 0.88168871, "learning_rate": 4.1012978723875547e-10, "loss": 0.89620996, "num_input_tokens_seen": 356788510, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.23242188, "step": 16528, "time_per_iteration": 4.021409749984741 }, { "auxiliary_loss_clip": 0.01241714, "auxiliary_loss_mlp": 0.00228304, "balance_loss_clip": 1.02434254, "balance_loss_mlp": 0.20096895, "epoch": 0.9937772433488652, "flos": 24390276474240.0, "grad_norm": 5.924734428495689, "language_loss": 0.80365825, "learning_rate": 4.022808578922898e-10, "loss": 0.81835842, "num_input_tokens_seen": 356809115, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.27307129, "step": 16529, "time_per_iteration": 2.7030584812164307 }, { "auxiliary_loss_clip": 0.01260959, "auxiliary_loss_mlp": 0.00245837, "balance_loss_clip": 1.03268123, "balance_loss_mlp": 0.21814454, "epoch": 0.9938373666015331, "flos": 15669909141120.0, "grad_norm": 28.876709035402538, "language_loss": 0.73407495, "learning_rate": 3.9450775349170186e-10, "loss": 0.74914289, "num_input_tokens_seen": 356826410, "router_z_loss_clip": 2.28222656, "router_z_loss_mlp": 0.27722168, "step": 16530, "time_per_iteration": 2.695976972579956 }, { "auxiliary_loss_clip": 0.01246542, "auxiliary_loss_mlp": 0.00229392, "balance_loss_clip": 1.0310322, "balance_loss_mlp": 0.20463207, "epoch": 0.9938974898542011, "flos": 19496190539520.0, "grad_norm": 16.352853902482398, "language_loss": 0.80081052, "learning_rate": 3.8681047433186676e-10, "loss": 0.81556988, "num_input_tokens_seen": 356844990, "router_z_loss_clip": 2.15527344, "router_z_loss_mlp": 0.24768066, "step": 16531, "time_per_iteration": 2.666436195373535 }, { "auxiliary_loss_clip": 0.01242317, "auxiliary_loss_mlp": 0.00220054, "balance_loss_clip": 1.02233791, "balance_loss_mlp": 0.19413838, "epoch": 0.993957613106869, "flos": 26906896085760.0, "grad_norm": 37.086312649465306, "language_loss": 0.80915499, "learning_rate": 3.791890207045512e-10, "loss": 0.82377875, "num_input_tokens_seen": 356866530, "router_z_loss_clip": 2.19824219, "router_z_loss_mlp": 0.25927734, "step": 16532, "time_per_iteration": 2.773681879043579 }, { "auxiliary_loss_clip": 0.01215938, "auxiliary_loss_mlp": 0.00235918, "balance_loss_clip": 1.01108873, "balance_loss_mlp": 0.21395996, "epoch": 0.994017736359537, "flos": 14939593816320.0, "grad_norm": 55.54423628640272, "language_loss": 0.79124629, "learning_rate": 3.7164339289885717e-10, "loss": 0.80576479, "num_input_tokens_seen": 356884660, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.21936035, "step": 16533, "time_per_iteration": 2.683507204055786 }, { "auxiliary_loss_clip": 0.01263787, "auxiliary_loss_mlp": 0.0025771, "balance_loss_clip": 1.04028702, "balance_loss_mlp": 0.22950517, "epoch": 0.9940778596122051, "flos": 15377883569280.0, "grad_norm": 34.96823256752456, "language_loss": 0.92600626, "learning_rate": 3.641735912007782e-10, "loss": 0.94122124, "num_input_tokens_seen": 356900895, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.28222656, "step": 16534, "time_per_iteration": 2.6230242252349854 }, { "auxiliary_loss_clip": 0.01207699, "auxiliary_loss_mlp": 0.00194396, "balance_loss_clip": 1.00696874, "balance_loss_mlp": 0.17149633, "epoch": 0.994137982864873, "flos": 25228108183680.0, "grad_norm": 6.1689319252438475, "language_loss": 0.73476595, "learning_rate": 3.567796158934211e-10, "loss": 0.74878693, "num_input_tokens_seen": 356920985, "router_z_loss_clip": 2.0078125, "router_z_loss_mlp": 0.22900391, "step": 16535, "time_per_iteration": 2.7016384601593018 }, { "auxiliary_loss_clip": 0.01238051, "auxiliary_loss_mlp": 0.00219705, "balance_loss_clip": 1.02570534, "balance_loss_mlp": 0.19520727, "epoch": 0.994198106117541, "flos": 18442140912000.0, "grad_norm": 249.1881609087331, "language_loss": 0.71938372, "learning_rate": 3.4946146725767235e-10, "loss": 0.73396122, "num_input_tokens_seen": 356939800, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.24475098, "step": 16536, "time_per_iteration": 2.6551599502563477 }, { "auxiliary_loss_clip": 0.01246292, "auxiliary_loss_mlp": 0.00229449, "balance_loss_clip": 1.03154373, "balance_loss_mlp": 0.20471302, "epoch": 0.9942582293702089, "flos": 16654112772480.0, "grad_norm": 7.446334202852814, "language_loss": 0.872841, "learning_rate": 3.4221914557064357e-10, "loss": 0.88759851, "num_input_tokens_seen": 356957780, "router_z_loss_clip": 2.14941406, "router_z_loss_mlp": 0.24743652, "step": 16537, "time_per_iteration": 2.6629555225372314 }, { "auxiliary_loss_clip": 0.01249932, "auxiliary_loss_mlp": 0.00241747, "balance_loss_clip": 1.02849364, "balance_loss_mlp": 0.2155807, "epoch": 0.9943183526228769, "flos": 21944580266880.0, "grad_norm": 736.9505408900905, "language_loss": 0.79420137, "learning_rate": 3.35052651107004e-10, "loss": 0.80911809, "num_input_tokens_seen": 356979185, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26184082, "step": 16538, "time_per_iteration": 2.6753618717193604 }, { "auxiliary_loss_clip": 0.01239173, "auxiliary_loss_mlp": 0.00221762, "balance_loss_clip": 1.02561069, "balance_loss_mlp": 0.19809903, "epoch": 0.9943784758755448, "flos": 23842566915840.0, "grad_norm": 43.8959737107112, "language_loss": 0.84435934, "learning_rate": 3.2796198413853614e-10, "loss": 0.85896868, "num_input_tokens_seen": 356997735, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.23657227, "step": 16539, "time_per_iteration": 2.7180936336517334 }, { "auxiliary_loss_clip": 0.01241723, "auxiliary_loss_mlp": 0.00223174, "balance_loss_clip": 1.02241075, "balance_loss_mlp": 0.1991173, "epoch": 0.9944385991282129, "flos": 21469984842240.0, "grad_norm": 12.887900147739076, "language_loss": 0.81786144, "learning_rate": 3.209471449341361e-10, "loss": 0.83251035, "num_input_tokens_seen": 357015660, "router_z_loss_clip": 2.19433594, "router_z_loss_mlp": 0.24060059, "step": 16540, "time_per_iteration": 2.630388021469116 }, { "auxiliary_loss_clip": 0.01220305, "auxiliary_loss_mlp": 0.00203473, "balance_loss_clip": 1.00966227, "balance_loss_mlp": 0.18088315, "epoch": 0.9944987223808808, "flos": 22927024131840.0, "grad_norm": 1.814587393317536, "language_loss": 0.83907974, "learning_rate": 3.140081337600353e-10, "loss": 0.8533175, "num_input_tokens_seen": 357034800, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.22607422, "step": 16541, "time_per_iteration": 2.687471866607666 }, { "auxiliary_loss_clip": 0.01231458, "auxiliary_loss_mlp": 0.0021425, "balance_loss_clip": 1.01880085, "balance_loss_mlp": 0.19032449, "epoch": 0.9945588456335488, "flos": 22383013674240.0, "grad_norm": 6736.877188587143, "language_loss": 0.85630476, "learning_rate": 3.0714495087891255e-10, "loss": 0.87076181, "num_input_tokens_seen": 357053785, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.23937988, "step": 16542, "time_per_iteration": 2.658353090286255 }, { "auxiliary_loss_clip": 0.01266786, "auxiliary_loss_mlp": 0.00246556, "balance_loss_clip": 1.03963876, "balance_loss_mlp": 0.21961501, "epoch": 0.9946189688862167, "flos": 21397517153280.0, "grad_norm": 18.05348106060521, "language_loss": 0.84559542, "learning_rate": 3.0035759655122615e-10, "loss": 0.86072886, "num_input_tokens_seen": 357072025, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26965332, "step": 16543, "time_per_iteration": 2.691941499710083 }, { "auxiliary_loss_clip": 0.01255551, "auxiliary_loss_mlp": 0.00224341, "balance_loss_clip": 1.03724766, "balance_loss_mlp": 0.19940251, "epoch": 0.9946790921388847, "flos": 12416545670400.0, "grad_norm": 34.53069055293938, "language_loss": 0.91206157, "learning_rate": 2.9364607103454785e-10, "loss": 0.92686045, "num_input_tokens_seen": 357086960, "router_z_loss_clip": 2.18261719, "router_z_loss_mlp": 0.24951172, "step": 16544, "time_per_iteration": 2.6720240116119385 }, { "auxiliary_loss_clip": 0.01228445, "auxiliary_loss_mlp": 0.00203651, "balance_loss_clip": 1.01948833, "balance_loss_mlp": 0.17930859, "epoch": 0.9947392153915526, "flos": 19058295836160.0, "grad_norm": 5.794105852599902, "language_loss": 0.86749327, "learning_rate": 2.870103745831187e-10, "loss": 0.88181412, "num_input_tokens_seen": 357105095, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.24353027, "step": 16545, "time_per_iteration": 2.696570634841919 }, { "auxiliary_loss_clip": 0.01252518, "auxiliary_loss_mlp": 0.00211528, "balance_loss_clip": 1.03010106, "balance_loss_mlp": 0.18885455, "epoch": 0.9947993386442207, "flos": 27308808339840.0, "grad_norm": 38.54969671364069, "language_loss": 0.79915905, "learning_rate": 2.8045050744873733e-10, "loss": 0.8137995, "num_input_tokens_seen": 357125065, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.22692871, "step": 16546, "time_per_iteration": 2.7621941566467285 }, { "auxiliary_loss_clip": 0.01227187, "auxiliary_loss_mlp": 0.00218297, "balance_loss_clip": 1.01581597, "balance_loss_mlp": 0.1925718, "epoch": 0.9948594618968887, "flos": 20806498771200.0, "grad_norm": 198.91047910188874, "language_loss": 0.84345233, "learning_rate": 2.739664698798716e-10, "loss": 0.85790718, "num_input_tokens_seen": 357141600, "router_z_loss_clip": 2.11035156, "router_z_loss_mlp": 0.25720215, "step": 16547, "time_per_iteration": 2.639575481414795 }, { "auxiliary_loss_clip": 0.01231192, "auxiliary_loss_mlp": 0.00232206, "balance_loss_clip": 1.01576996, "balance_loss_mlp": 0.2066716, "epoch": 0.9949195851495566, "flos": 23292953936640.0, "grad_norm": 43.90621936781188, "language_loss": 0.78009737, "learning_rate": 2.67558262122769e-10, "loss": 0.79473138, "num_input_tokens_seen": 357157880, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.25537109, "step": 16548, "time_per_iteration": 2.6841349601745605 }, { "auxiliary_loss_clip": 0.01240538, "auxiliary_loss_mlp": 0.00217103, "balance_loss_clip": 1.02895141, "balance_loss_mlp": 0.19366659, "epoch": 0.9949797084022246, "flos": 18515470527360.0, "grad_norm": 9.219354374492578, "language_loss": 0.85068393, "learning_rate": 2.6122588442012427e-10, "loss": 0.86526024, "num_input_tokens_seen": 357176705, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.234375, "step": 16549, "time_per_iteration": 2.6400370597839355 }, { "auxiliary_loss_clip": 0.01250084, "auxiliary_loss_mlp": 0.00233914, "balance_loss_clip": 1.03151083, "balance_loss_mlp": 0.20691293, "epoch": 0.9950398316548925, "flos": 30407719328640.0, "grad_norm": 118.4599220095371, "language_loss": 0.8165369, "learning_rate": 2.5496933701241177e-10, "loss": 0.83137685, "num_input_tokens_seen": 357197630, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.2701416, "step": 16550, "time_per_iteration": 2.7480826377868652 }, { "auxiliary_loss_clip": 0.01237946, "auxiliary_loss_mlp": 0.00217184, "balance_loss_clip": 1.02177799, "balance_loss_mlp": 0.19248411, "epoch": 0.9950999549075605, "flos": 19900868140800.0, "grad_norm": 6.063127787408688, "language_loss": 0.83576131, "learning_rate": 2.4878862013655297e-10, "loss": 0.85031259, "num_input_tokens_seen": 357215445, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24707031, "step": 16551, "time_per_iteration": 2.779247283935547 }, { "auxiliary_loss_clip": 0.01203267, "auxiliary_loss_mlp": 0.00211276, "balance_loss_clip": 1.00003695, "balance_loss_mlp": 0.18925828, "epoch": 0.9951600781602284, "flos": 17603555016960.0, "grad_norm": 354.01264893102916, "language_loss": 0.72995353, "learning_rate": 2.426837340270271e-10, "loss": 0.7440989, "num_input_tokens_seen": 357234285, "router_z_loss_clip": 2.03027344, "router_z_loss_mlp": 0.22033691, "step": 16552, "time_per_iteration": 2.7281394004821777 }, { "auxiliary_loss_clip": 0.01221615, "auxiliary_loss_mlp": 0.00239444, "balance_loss_clip": 1.00997329, "balance_loss_mlp": 0.21547058, "epoch": 0.9952202014128965, "flos": 28950715952640.0, "grad_norm": 5.600250806083065, "language_loss": 0.87305117, "learning_rate": 2.3665467891520465e-10, "loss": 0.88766181, "num_input_tokens_seen": 357257565, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.23950195, "step": 16553, "time_per_iteration": 2.7295591831207275 }, { "auxiliary_loss_clip": 0.01083027, "auxiliary_loss_mlp": 0.00076406, "balance_loss_clip": 0.94577932, "balance_loss_mlp": 0.06953974, "epoch": 0.9952803246655644, "flos": 70810386145920.0, "grad_norm": 0.9798022398788457, "language_loss": 0.56611049, "learning_rate": 2.3070145503001348e-10, "loss": 0.57770479, "num_input_tokens_seen": 357320205, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06884766, "step": 16554, "time_per_iteration": 3.282322883605957 }, { "auxiliary_loss_clip": 0.01238617, "auxiliary_loss_mlp": 0.00219026, "balance_loss_clip": 1.02302575, "balance_loss_mlp": 0.19431348, "epoch": 0.9953404479182324, "flos": 21799070271360.0, "grad_norm": 25.19005127189776, "language_loss": 0.82095063, "learning_rate": 2.24824062597051e-10, "loss": 0.83552706, "num_input_tokens_seen": 357340695, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.24731445, "step": 16555, "time_per_iteration": 2.692577600479126 }, { "auxiliary_loss_clip": 0.01249703, "auxiliary_loss_mlp": 0.00220412, "balance_loss_clip": 1.02843344, "balance_loss_mlp": 0.19393578, "epoch": 0.9954005711709003, "flos": 21937397546880.0, "grad_norm": 15.96413358545849, "language_loss": 0.92395854, "learning_rate": 2.1902250183902793e-10, "loss": 0.93865955, "num_input_tokens_seen": 357357505, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26477051, "step": 16556, "time_per_iteration": 2.685053825378418 }, { "auxiliary_loss_clip": 0.01226517, "auxiliary_loss_mlp": 0.00214344, "balance_loss_clip": 1.01271999, "balance_loss_mlp": 0.18783179, "epoch": 0.9954606944235683, "flos": 19354559212800.0, "grad_norm": 2.6647194237411083, "language_loss": 0.81331629, "learning_rate": 2.132967729762125e-10, "loss": 0.82772493, "num_input_tokens_seen": 357375395, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.26525879, "step": 16557, "time_per_iteration": 2.759101390838623 }, { "auxiliary_loss_clip": 0.01233433, "auxiliary_loss_mlp": 0.00235975, "balance_loss_clip": 1.02154195, "balance_loss_mlp": 0.21026182, "epoch": 0.9955208176762362, "flos": 30518611591680.0, "grad_norm": 19.381380807276656, "language_loss": 0.83515757, "learning_rate": 2.0764687622554233e-10, "loss": 0.84985161, "num_input_tokens_seen": 357397375, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.25683594, "step": 16558, "time_per_iteration": 2.75089168548584 }, { "auxiliary_loss_clip": 0.01233752, "auxiliary_loss_mlp": 0.0023462, "balance_loss_clip": 1.0191406, "balance_loss_mlp": 0.21036127, "epoch": 0.9955809409289043, "flos": 30008249199360.0, "grad_norm": 1.9791932274768513, "language_loss": 0.70890087, "learning_rate": 2.0207281180129044e-10, "loss": 0.72358453, "num_input_tokens_seen": 357418880, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.24279785, "step": 16559, "time_per_iteration": 4.188792943954468 }, { "auxiliary_loss_clip": 0.01241359, "auxiliary_loss_mlp": 0.00244616, "balance_loss_clip": 1.02277827, "balance_loss_mlp": 0.21776983, "epoch": 0.9956410641815723, "flos": 21543278544000.0, "grad_norm": 12.748680880236638, "language_loss": 0.81753409, "learning_rate": 1.965745799148433e-10, "loss": 0.83239383, "num_input_tokens_seen": 357438310, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26831055, "step": 16560, "time_per_iteration": 2.727598190307617 }, { "auxiliary_loss_clip": 0.0123281, "auxiliary_loss_mlp": 0.00232795, "balance_loss_clip": 1.01936352, "balance_loss_mlp": 0.20833334, "epoch": 0.9957011874342402, "flos": 21689470897920.0, "grad_norm": 18.231731519089898, "language_loss": 0.86671293, "learning_rate": 1.9115218077470073e-10, "loss": 0.88136899, "num_input_tokens_seen": 357457155, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.24475098, "step": 16561, "time_per_iteration": 4.2704758644104 }, { "auxiliary_loss_clip": 0.01229128, "auxiliary_loss_mlp": 0.00217329, "balance_loss_clip": 1.02071369, "balance_loss_mlp": 0.19279601, "epoch": 0.9957613106869082, "flos": 17702667619200.0, "grad_norm": 12.206746929082898, "language_loss": 0.72697496, "learning_rate": 1.8580561458647614e-10, "loss": 0.74143958, "num_input_tokens_seen": 357468060, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.24523926, "step": 16562, "time_per_iteration": 2.618220090866089 }, { "auxiliary_loss_clip": 0.0125648, "auxiliary_loss_mlp": 0.00228769, "balance_loss_clip": 1.03812885, "balance_loss_mlp": 0.20385432, "epoch": 0.9958214339395761, "flos": 30555994671360.0, "grad_norm": 70.4109955026384, "language_loss": 0.73124826, "learning_rate": 1.805348815528962e-10, "loss": 0.74610066, "num_input_tokens_seen": 357489665, "router_z_loss_clip": 2.18652344, "router_z_loss_mlp": 0.24890137, "step": 16563, "time_per_iteration": 2.7206223011016846 }, { "auxiliary_loss_clip": 0.01242874, "auxiliary_loss_mlp": 0.00222604, "balance_loss_clip": 1.02329481, "balance_loss_mlp": 0.19777274, "epoch": 0.9958815571922441, "flos": 24169174306560.0, "grad_norm": 2.6414132453819783, "language_loss": 0.70556778, "learning_rate": 1.7533998187380105e-10, "loss": 0.72022259, "num_input_tokens_seen": 357511975, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24816895, "step": 16564, "time_per_iteration": 2.738818407058716 }, { "auxiliary_loss_clip": 0.01241572, "auxiliary_loss_mlp": 0.00209061, "balance_loss_clip": 1.02289438, "balance_loss_mlp": 0.1845994, "epoch": 0.995941680444912, "flos": 15487016065920.0, "grad_norm": 6.667707215835814, "language_loss": 0.81976861, "learning_rate": 1.7022091574636633e-10, "loss": 0.83427495, "num_input_tokens_seen": 357529345, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24475098, "step": 16565, "time_per_iteration": 2.6399688720703125 }, { "auxiliary_loss_clip": 0.01229217, "auxiliary_loss_mlp": 0.00211343, "balance_loss_clip": 1.01464939, "balance_loss_mlp": 0.18715498, "epoch": 0.9960018036975801, "flos": 18621227145600.0, "grad_norm": 4.387997349070509, "language_loss": 0.86823797, "learning_rate": 1.6517768336443694e-10, "loss": 0.88264358, "num_input_tokens_seen": 357547615, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.24194336, "step": 16566, "time_per_iteration": 2.638319730758667 }, { "auxiliary_loss_clip": 0.01226626, "auxiliary_loss_mlp": 0.00216615, "balance_loss_clip": 1.01545262, "balance_loss_mlp": 0.19237974, "epoch": 0.996061926950248, "flos": 20084120352000.0, "grad_norm": 4.131045265756462, "language_loss": 0.77857691, "learning_rate": 1.6021028491941535e-10, "loss": 0.79300928, "num_input_tokens_seen": 357567380, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.24243164, "step": 16567, "time_per_iteration": 2.650552272796631 }, { "auxiliary_loss_clip": 0.01242411, "auxiliary_loss_mlp": 0.00215419, "balance_loss_clip": 1.02323961, "balance_loss_mlp": 0.19118418, "epoch": 0.996122050202916, "flos": 24347829576960.0, "grad_norm": 297.37611073071065, "language_loss": 0.88468057, "learning_rate": 1.5531872059959538e-10, "loss": 0.89925885, "num_input_tokens_seen": 357586435, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24230957, "step": 16568, "time_per_iteration": 2.7046265602111816 }, { "auxiliary_loss_clip": 0.01212038, "auxiliary_loss_mlp": 0.0021773, "balance_loss_clip": 1.00705624, "balance_loss_mlp": 0.19403136, "epoch": 0.9961821734555839, "flos": 24199302839040.0, "grad_norm": 18.69045864020417, "language_loss": 0.88015926, "learning_rate": 1.5050299059060634e-10, "loss": 0.89445698, "num_input_tokens_seen": 357604720, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.23706055, "step": 16569, "time_per_iteration": 4.096976280212402 }, { "auxiliary_loss_clip": 0.01225923, "auxiliary_loss_mlp": 0.00231325, "balance_loss_clip": 1.01978445, "balance_loss_mlp": 0.20782936, "epoch": 0.9962422967082519, "flos": 22633741584000.0, "grad_norm": 13.958920508058076, "language_loss": 0.76898801, "learning_rate": 1.457630950747468e-10, "loss": 0.78356051, "num_input_tokens_seen": 357622345, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.23522949, "step": 16570, "time_per_iteration": 4.087749481201172 }, { "auxiliary_loss_clip": 0.01215949, "auxiliary_loss_mlp": 0.00181654, "balance_loss_clip": 1.00635207, "balance_loss_mlp": 0.1579428, "epoch": 0.9963024199609198, "flos": 26396030903040.0, "grad_norm": 4.863374272225943, "language_loss": 0.82557249, "learning_rate": 1.4109903423209502e-10, "loss": 0.83954859, "num_input_tokens_seen": 357642710, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.23718262, "step": 16571, "time_per_iteration": 2.7431440353393555 }, { "auxiliary_loss_clip": 0.01241125, "auxiliary_loss_mlp": 0.00242046, "balance_loss_clip": 1.02764606, "balance_loss_mlp": 0.21680962, "epoch": 0.9963625432135879, "flos": 16581537342720.0, "grad_norm": 76.33164146272001, "language_loss": 0.86799371, "learning_rate": 1.3651080823939843e-10, "loss": 0.88282543, "num_input_tokens_seen": 357659870, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.25268555, "step": 16572, "time_per_iteration": 2.7001986503601074 }, { "auxiliary_loss_clip": 0.01235518, "auxiliary_loss_mlp": 0.00212113, "balance_loss_clip": 1.02112651, "balance_loss_mlp": 0.18684021, "epoch": 0.9964226664662559, "flos": 26468534505600.0, "grad_norm": 114.31999327130974, "language_loss": 0.78541636, "learning_rate": 1.3199841727074e-10, "loss": 0.79989266, "num_input_tokens_seen": 357677075, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.25305176, "step": 16573, "time_per_iteration": 2.706543207168579 }, { "auxiliary_loss_clip": 0.01272125, "auxiliary_loss_mlp": 0.00245024, "balance_loss_clip": 1.04142118, "balance_loss_mlp": 0.21593669, "epoch": 0.9964827897189238, "flos": 27448320764160.0, "grad_norm": 120.40077990328838, "language_loss": 0.71303147, "learning_rate": 1.275618614968721e-10, "loss": 0.72820294, "num_input_tokens_seen": 357696715, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.29089355, "step": 16574, "time_per_iteration": 2.774317502975464 }, { "auxiliary_loss_clip": 0.01270942, "auxiliary_loss_mlp": 0.00223624, "balance_loss_clip": 1.04610467, "balance_loss_mlp": 0.1961585, "epoch": 0.9965429129715918, "flos": 11721566350080.0, "grad_norm": 15.764739332690324, "language_loss": 0.87139094, "learning_rate": 1.2320114108654856e-10, "loss": 0.88633668, "num_input_tokens_seen": 357712345, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.2746582, "step": 16575, "time_per_iteration": 2.7576189041137695 }, { "auxiliary_loss_clip": 0.0123875, "auxiliary_loss_mlp": 0.00231847, "balance_loss_clip": 1.01826203, "balance_loss_mlp": 0.2065749, "epoch": 0.9966030362242597, "flos": 19756004590080.0, "grad_norm": 4.235594879055588, "language_loss": 0.80493522, "learning_rate": 1.1891625620474855e-10, "loss": 0.81964123, "num_input_tokens_seen": 357731815, "router_z_loss_clip": 2.20410156, "router_z_loss_mlp": 0.25268555, "step": 16576, "time_per_iteration": 2.7277677059173584 }, { "auxiliary_loss_clip": 0.01230544, "auxiliary_loss_mlp": 0.00208263, "balance_loss_clip": 1.02124691, "balance_loss_mlp": 0.18507686, "epoch": 0.9966631594769277, "flos": 23915178259200.0, "grad_norm": 61.6547062776753, "language_loss": 0.77934408, "learning_rate": 1.1470720701400871e-10, "loss": 0.79373217, "num_input_tokens_seen": 357751640, "router_z_loss_clip": 2.09277344, "router_z_loss_mlp": 0.23181152, "step": 16577, "time_per_iteration": 2.8478941917419434 }, { "auxiliary_loss_clip": 0.01246877, "auxiliary_loss_mlp": 0.00229938, "balance_loss_clip": 1.02783048, "balance_loss_mlp": 0.20426027, "epoch": 0.9967232827295956, "flos": 15559591495680.0, "grad_norm": 4.805930754237019, "language_loss": 0.876863, "learning_rate": 1.1057399367397912e-10, "loss": 0.89163107, "num_input_tokens_seen": 357769850, "router_z_loss_clip": 2.19042969, "router_z_loss_mlp": 0.25671387, "step": 16578, "time_per_iteration": 2.675095319747925 }, { "auxiliary_loss_clip": 0.0123887, "auxiliary_loss_mlp": 0.0022181, "balance_loss_clip": 1.01881897, "balance_loss_mlp": 0.19833767, "epoch": 0.9967834059822637, "flos": 20813035046400.0, "grad_norm": 11.271707333796156, "language_loss": 0.84350896, "learning_rate": 1.0651661634142328e-10, "loss": 0.85811579, "num_input_tokens_seen": 357789550, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.23486328, "step": 16579, "time_per_iteration": 2.7197325229644775 }, { "auxiliary_loss_clip": 0.01269765, "auxiliary_loss_mlp": 0.00245666, "balance_loss_clip": 1.04918456, "balance_loss_mlp": 0.21982156, "epoch": 0.9968435292349316, "flos": 36719234830080.0, "grad_norm": 12.863825556038911, "language_loss": 0.77734882, "learning_rate": 1.0253507516999604e-10, "loss": 0.79250312, "num_input_tokens_seen": 357809525, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25842285, "step": 16580, "time_per_iteration": 2.800299882888794 }, { "auxiliary_loss_clip": 0.01234298, "auxiliary_loss_mlp": 0.00234767, "balance_loss_clip": 1.01697159, "balance_loss_mlp": 0.21087721, "epoch": 0.9969036524875996, "flos": 26760919213440.0, "grad_norm": 224.80733546663268, "language_loss": 0.87307882, "learning_rate": 9.862937031113184e-11, "loss": 0.88776946, "num_input_tokens_seen": 357829795, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.2388916, "step": 16581, "time_per_iteration": 2.764221429824829 }, { "auxiliary_loss_clip": 0.0121551, "auxiliary_loss_mlp": 0.00209566, "balance_loss_clip": 1.01035118, "balance_loss_mlp": 0.18626085, "epoch": 0.9969637757402675, "flos": 24827237424000.0, "grad_norm": 11.497046465882892, "language_loss": 0.86911052, "learning_rate": 9.479950191249031e-11, "loss": 0.88336122, "num_input_tokens_seen": 357851655, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.23327637, "step": 16582, "time_per_iteration": 2.7055647373199463 }, { "auxiliary_loss_clip": 0.01226184, "auxiliary_loss_mlp": 0.00210405, "balance_loss_clip": 1.01907778, "balance_loss_mlp": 0.1874337, "epoch": 0.9970238989929355, "flos": 23038742407680.0, "grad_norm": 8.67845671192374, "language_loss": 0.68216878, "learning_rate": 9.104547011951069e-11, "loss": 0.69653469, "num_input_tokens_seen": 357871205, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.22949219, "step": 16583, "time_per_iteration": 2.7280802726745605 }, { "auxiliary_loss_clip": 0.0122813, "auxiliary_loss_mlp": 0.00218527, "balance_loss_clip": 1.01613939, "balance_loss_mlp": 0.19430397, "epoch": 0.9970840222456034, "flos": 25298816106240.0, "grad_norm": 28.895955705319665, "language_loss": 0.8458727, "learning_rate": 8.736727507452357e-11, "loss": 0.86033928, "num_input_tokens_seen": 357892145, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.2421875, "step": 16584, "time_per_iteration": 2.760244846343994 }, { "auxiliary_loss_clip": 0.01225375, "auxiliary_loss_mlp": 0.00214366, "balance_loss_clip": 1.01693368, "balance_loss_mlp": 0.19246739, "epoch": 0.9971441454982715, "flos": 21615602578560.0, "grad_norm": 85.73223505717876, "language_loss": 0.76127326, "learning_rate": 8.376491691697297e-11, "loss": 0.77567065, "num_input_tokens_seen": 357911205, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.21911621, "step": 16585, "time_per_iteration": 2.713378667831421 }, { "auxiliary_loss_clip": 0.01246552, "auxiliary_loss_mlp": 0.00218528, "balance_loss_clip": 1.02888942, "balance_loss_mlp": 0.19450778, "epoch": 0.9972042687509394, "flos": 14975612179200.0, "grad_norm": 50.40521665268173, "language_loss": 0.89362746, "learning_rate": 8.023839578363834e-11, "loss": 0.90827835, "num_input_tokens_seen": 357928190, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.23999023, "step": 16586, "time_per_iteration": 2.720348358154297 }, { "auxiliary_loss_clip": 0.01248909, "auxiliary_loss_mlp": 0.00218561, "balance_loss_clip": 1.02985525, "balance_loss_mlp": 0.19361043, "epoch": 0.9972643920036074, "flos": 25806664546560.0, "grad_norm": 40.11626285163719, "language_loss": 0.85713166, "learning_rate": 7.678771180796851e-11, "loss": 0.87180638, "num_input_tokens_seen": 357946985, "router_z_loss_clip": 2.19238281, "router_z_loss_mlp": 0.24963379, "step": 16587, "time_per_iteration": 2.72541880607605 }, { "auxiliary_loss_clip": 0.01236196, "auxiliary_loss_mlp": 0.00225766, "balance_loss_clip": 1.02574849, "balance_loss_mlp": 0.20012385, "epoch": 0.9973245152562754, "flos": 23326242865920.0, "grad_norm": 33.82044944937642, "language_loss": 0.79572654, "learning_rate": 7.341286512074773e-11, "loss": 0.81034619, "num_input_tokens_seen": 357966720, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.25634766, "step": 16588, "time_per_iteration": 2.738438129425049 }, { "auxiliary_loss_clip": 0.01255737, "auxiliary_loss_mlp": 0.00213172, "balance_loss_clip": 1.03069735, "balance_loss_mlp": 0.18725547, "epoch": 0.9973846385089433, "flos": 12166212810240.0, "grad_norm": 66.32120986831885, "language_loss": 0.93798906, "learning_rate": 7.011385585031781e-11, "loss": 0.95267808, "num_input_tokens_seen": 357981375, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.25939941, "step": 16589, "time_per_iteration": 2.6630773544311523 }, { "auxiliary_loss_clip": 0.01257722, "auxiliary_loss_mlp": 0.002453, "balance_loss_clip": 1.032493, "balance_loss_mlp": 0.21826319, "epoch": 0.9974447617616113, "flos": 20045157073920.0, "grad_norm": 105.15835151509867, "language_loss": 0.83371812, "learning_rate": 6.689068412168986e-11, "loss": 0.84874833, "num_input_tokens_seen": 358000290, "router_z_loss_clip": 2.25488281, "router_z_loss_mlp": 0.27038574, "step": 16590, "time_per_iteration": 2.871161699295044 }, { "auxiliary_loss_clip": 0.01239275, "auxiliary_loss_mlp": 0.00218805, "balance_loss_clip": 1.02329445, "balance_loss_mlp": 0.19421199, "epoch": 0.9975048850142793, "flos": 32014614159360.0, "grad_norm": 13.216788827906528, "language_loss": 0.71428025, "learning_rate": 6.374335005676634e-11, "loss": 0.72886097, "num_input_tokens_seen": 358022075, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.24597168, "step": 16591, "time_per_iteration": 2.7827186584472656 }, { "auxiliary_loss_clip": 0.01216157, "auxiliary_loss_mlp": 0.00242118, "balance_loss_clip": 1.00438094, "balance_loss_mlp": 0.21828815, "epoch": 0.9975650082669473, "flos": 36933728895360.0, "grad_norm": 43.13138871391549, "language_loss": 0.8043561, "learning_rate": 6.067185377522933e-11, "loss": 0.81893885, "num_input_tokens_seen": 358043940, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.23815918, "step": 16592, "time_per_iteration": 2.8508174419403076 }, { "auxiliary_loss_clip": 0.0125357, "auxiliary_loss_mlp": 0.00237318, "balance_loss_clip": 1.03837943, "balance_loss_mlp": 0.21173602, "epoch": 0.9976251315196152, "flos": 16472117537280.0, "grad_norm": 80.97008592739985, "language_loss": 0.9208622, "learning_rate": 5.767619539343016e-11, "loss": 0.93577111, "num_input_tokens_seen": 358062720, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.2557373, "step": 16593, "time_per_iteration": 2.690086841583252 }, { "auxiliary_loss_clip": 0.01227598, "auxiliary_loss_mlp": 0.0023413, "balance_loss_clip": 1.01545095, "balance_loss_mlp": 0.20882148, "epoch": 0.9976852547722832, "flos": 19646836179840.0, "grad_norm": 1.9778662728342842, "language_loss": 0.76737225, "learning_rate": 5.4756375024833656e-11, "loss": 0.78198951, "num_input_tokens_seen": 358081560, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.25317383, "step": 16594, "time_per_iteration": 2.7187623977661133 }, { "auxiliary_loss_clip": 0.01239083, "auxiliary_loss_mlp": 0.00220407, "balance_loss_clip": 1.02224064, "balance_loss_mlp": 0.19580173, "epoch": 0.9977453780249511, "flos": 20448434044800.0, "grad_norm": 34.84663259618439, "language_loss": 0.82754183, "learning_rate": 5.1912392780462113e-11, "loss": 0.84213674, "num_input_tokens_seen": 358099065, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.24597168, "step": 16595, "time_per_iteration": 2.684847831726074 }, { "auxiliary_loss_clip": 0.01085348, "auxiliary_loss_mlp": 0.00070982, "balance_loss_clip": 0.94971257, "balance_loss_mlp": 0.06440124, "epoch": 0.9978055012776191, "flos": 65455097581440.0, "grad_norm": 0.7721913134771462, "language_loss": 0.5953052, "learning_rate": 4.9144248768007156e-11, "loss": 0.60686851, "num_input_tokens_seen": 358156095, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06591797, "step": 16596, "time_per_iteration": 3.1451401710510254 }, { "auxiliary_loss_clip": 0.01234748, "auxiliary_loss_mlp": 0.00229013, "balance_loss_clip": 1.0217495, "balance_loss_mlp": 0.20407426, "epoch": 0.997865624530287, "flos": 20631506688000.0, "grad_norm": 91.71640841262044, "language_loss": 0.8545481, "learning_rate": 4.645194309227385e-11, "loss": 0.86918581, "num_input_tokens_seen": 358175230, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.24914551, "step": 16597, "time_per_iteration": 2.815448760986328 }, { "auxiliary_loss_clip": 0.01245414, "auxiliary_loss_mlp": 0.00249797, "balance_loss_clip": 1.02788365, "balance_loss_mlp": 0.22400045, "epoch": 0.9979257477829551, "flos": 29387102284800.0, "grad_norm": 18.26954040829393, "language_loss": 0.88949037, "learning_rate": 4.383547585562475e-11, "loss": 0.90444243, "num_input_tokens_seen": 358197075, "router_z_loss_clip": 2.17675781, "router_z_loss_mlp": 0.25793457, "step": 16598, "time_per_iteration": 2.745711088180542 }, { "auxiliary_loss_clip": 0.01272956, "auxiliary_loss_mlp": 0.00256252, "balance_loss_clip": 1.04042387, "balance_loss_mlp": 0.22795135, "epoch": 0.997985871035623, "flos": 22635070387200.0, "grad_norm": 416.70286109257705, "language_loss": 0.73960924, "learning_rate": 4.129484715709175e-11, "loss": 0.75490141, "num_input_tokens_seen": 358215925, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.28320312, "step": 16599, "time_per_iteration": 2.7535345554351807 }, { "auxiliary_loss_clip": 0.01085617, "auxiliary_loss_mlp": 0.00063743, "balance_loss_clip": 0.94962943, "balance_loss_mlp": 0.05725785, "epoch": 0.998045994288291, "flos": 61806968663040.0, "grad_norm": 0.8514978621142071, "language_loss": 0.61087346, "learning_rate": 3.8830057093264256e-11, "loss": 0.62236702, "num_input_tokens_seen": 358269035, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06494141, "step": 16600, "time_per_iteration": 3.010524034500122 }, { "auxiliary_loss_clip": 0.012317, "auxiliary_loss_mlp": 0.00230307, "balance_loss_clip": 1.01969457, "balance_loss_mlp": 0.20637017, "epoch": 0.998106117540959, "flos": 19245534456960.0, "grad_norm": 5.26331455980335, "language_loss": 0.83937508, "learning_rate": 3.644110575717896e-11, "loss": 0.85399514, "num_input_tokens_seen": 358287680, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.23925781, "step": 16601, "time_per_iteration": 2.7195663452148438 }, { "auxiliary_loss_clip": 0.01251561, "auxiliary_loss_mlp": 0.00235573, "balance_loss_clip": 1.0301671, "balance_loss_mlp": 0.20856048, "epoch": 0.9981662407936269, "flos": 21106209853440.0, "grad_norm": 43.03967984297676, "language_loss": 0.90042174, "learning_rate": 3.412799323987414e-11, "loss": 0.9152931, "num_input_tokens_seen": 358304080, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.2701416, "step": 16602, "time_per_iteration": 4.1708009243011475 }, { "auxiliary_loss_clip": 0.01254136, "auxiliary_loss_mlp": 0.00238026, "balance_loss_clip": 1.03659689, "balance_loss_mlp": 0.21433963, "epoch": 0.998226364046295, "flos": 24316839118080.0, "grad_norm": 189.34743735927208, "language_loss": 0.70251256, "learning_rate": 3.189071962883538e-11, "loss": 0.71743417, "num_input_tokens_seen": 358323670, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.23706055, "step": 16603, "time_per_iteration": 4.102585077285767 }, { "auxiliary_loss_clip": 0.01255078, "auxiliary_loss_mlp": 0.00223068, "balance_loss_clip": 1.03723168, "balance_loss_mlp": 0.19682986, "epoch": 0.9982864872989629, "flos": 23836389776640.0, "grad_norm": 2.772419191033544, "language_loss": 0.80179638, "learning_rate": 2.972928500866168e-11, "loss": 0.81657785, "num_input_tokens_seen": 358341980, "router_z_loss_clip": 2.18066406, "router_z_loss_mlp": 0.26245117, "step": 16604, "time_per_iteration": 2.6704256534576416 }, { "auxiliary_loss_clip": 0.01232769, "auxiliary_loss_mlp": 0.00233147, "balance_loss_clip": 1.0159657, "balance_loss_mlp": 0.20774341, "epoch": 0.9983466105516309, "flos": 18333116156160.0, "grad_norm": 104.84921817012014, "language_loss": 0.73264331, "learning_rate": 2.7643689461953613e-11, "loss": 0.74730247, "num_input_tokens_seen": 358360400, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.25415039, "step": 16605, "time_per_iteration": 2.6932003498077393 }, { "auxiliary_loss_clip": 0.01216271, "auxiliary_loss_mlp": 0.0021426, "balance_loss_clip": 1.01156545, "balance_loss_mlp": 0.1919795, "epoch": 0.9984067338042988, "flos": 17236763285760.0, "grad_norm": 8.275200020715154, "language_loss": 0.76782995, "learning_rate": 2.5633933067092938e-11, "loss": 0.78213525, "num_input_tokens_seen": 358378990, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.22277832, "step": 16606, "time_per_iteration": 2.658618211746216 }, { "auxiliary_loss_clip": 0.01224974, "auxiliary_loss_mlp": 0.00223893, "balance_loss_clip": 1.01523137, "balance_loss_mlp": 0.20055167, "epoch": 0.9984668570569668, "flos": 20667884186880.0, "grad_norm": 169.45549261058227, "language_loss": 0.90447199, "learning_rate": 2.370001590090709e-11, "loss": 0.91896069, "num_input_tokens_seen": 358395970, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23352051, "step": 16607, "time_per_iteration": 2.7428290843963623 }, { "auxiliary_loss_clip": 0.01243133, "auxiliary_loss_mlp": 0.00226519, "balance_loss_clip": 1.02162552, "balance_loss_mlp": 0.20048413, "epoch": 0.9985269803096347, "flos": 30262532555520.0, "grad_norm": 34.7064364922944, "language_loss": 0.74430275, "learning_rate": 2.184193803622669e-11, "loss": 0.75899935, "num_input_tokens_seen": 358417355, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26025391, "step": 16608, "time_per_iteration": 2.7291312217712402 }, { "auxiliary_loss_clip": 0.01244234, "auxiliary_loss_mlp": 0.0022159, "balance_loss_clip": 1.02608705, "balance_loss_mlp": 0.19603148, "epoch": 0.9985871035623027, "flos": 10560970005120.0, "grad_norm": 8.07253556070073, "language_loss": 0.91101086, "learning_rate": 2.0059699543883978e-11, "loss": 0.92566907, "num_input_tokens_seen": 358434345, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25561523, "step": 16609, "time_per_iteration": 2.6860036849975586 }, { "auxiliary_loss_clip": 0.0121751, "auxiliary_loss_mlp": 0.00216224, "balance_loss_clip": 1.00841737, "balance_loss_mlp": 0.19332406, "epoch": 0.9986472268149706, "flos": 16873455173760.0, "grad_norm": 108.56073159261118, "language_loss": 0.69237393, "learning_rate": 1.8353300491158462e-11, "loss": 0.70671129, "num_input_tokens_seen": 358452870, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.22888184, "step": 16610, "time_per_iteration": 2.6298043727874756 }, { "auxiliary_loss_clip": 0.01233943, "auxiliary_loss_mlp": 0.00208072, "balance_loss_clip": 1.01778078, "balance_loss_mlp": 0.18427771, "epoch": 0.9987073500676387, "flos": 22054538776320.0, "grad_norm": 8.024679860661672, "language_loss": 0.75596386, "learning_rate": 1.672274094288717e-11, "loss": 0.77038395, "num_input_tokens_seen": 358472210, "router_z_loss_clip": 2.15917969, "router_z_loss_mlp": 0.23791504, "step": 16611, "time_per_iteration": 4.084531307220459 }, { "auxiliary_loss_clip": 0.01240438, "auxiliary_loss_mlp": 0.00230101, "balance_loss_clip": 1.02536023, "balance_loss_mlp": 0.20591329, "epoch": 0.9987674733203066, "flos": 30482880537600.0, "grad_norm": 20.16336911341428, "language_loss": 0.77149248, "learning_rate": 1.5168020961020544e-11, "loss": 0.78619784, "num_input_tokens_seen": 358493840, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.24194336, "step": 16612, "time_per_iteration": 4.161989212036133 }, { "auxiliary_loss_clip": 0.01225455, "auxiliary_loss_mlp": 0.00220386, "balance_loss_clip": 1.01623535, "balance_loss_mlp": 0.19800995, "epoch": 0.9988275965729746, "flos": 27745230585600.0, "grad_norm": 126.13989831645686, "language_loss": 0.81432384, "learning_rate": 1.3689140604400407e-11, "loss": 0.8287822, "num_input_tokens_seen": 358515060, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.22375488, "step": 16613, "time_per_iteration": 2.78298020362854 }, { "auxiliary_loss_clip": 0.01237675, "auxiliary_loss_mlp": 0.00222662, "balance_loss_clip": 1.01965547, "balance_loss_mlp": 0.19826031, "epoch": 0.9988877198256426, "flos": 17524191916800.0, "grad_norm": 29.270666894562073, "language_loss": 0.80658853, "learning_rate": 1.2286099928981996e-11, "loss": 0.82119191, "num_input_tokens_seen": 358528200, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24389648, "step": 16614, "time_per_iteration": 2.662229299545288 }, { "auxiliary_loss_clip": 0.01234397, "auxiliary_loss_mlp": 0.00232046, "balance_loss_clip": 1.02368605, "balance_loss_mlp": 0.20736995, "epoch": 0.9989478430783105, "flos": 20996502739200.0, "grad_norm": 1572.1607684513924, "language_loss": 0.78420424, "learning_rate": 1.0958898988278065e-11, "loss": 0.79886866, "num_input_tokens_seen": 358548360, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.24694824, "step": 16615, "time_per_iteration": 2.689746618270874 }, { "auxiliary_loss_clip": 0.01254954, "auxiliary_loss_mlp": 0.00228871, "balance_loss_clip": 1.03128457, "balance_loss_mlp": 0.20380098, "epoch": 0.9990079663309785, "flos": 13370620769280.0, "grad_norm": 12.36423137567669, "language_loss": 0.89674729, "learning_rate": 9.70753783247069e-12, "loss": 0.91158557, "num_input_tokens_seen": 358566270, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.25061035, "step": 16616, "time_per_iteration": 2.692957639694214 }, { "auxiliary_loss_clip": 0.01239947, "auxiliary_loss_mlp": 0.00233181, "balance_loss_clip": 1.02516544, "balance_loss_mlp": 0.2090297, "epoch": 0.9990680895836465, "flos": 17310236555520.0, "grad_norm": 15.042175561482889, "language_loss": 0.91082335, "learning_rate": 8.532016508855378e-12, "loss": 0.92555463, "num_input_tokens_seen": 358584710, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.24169922, "step": 16617, "time_per_iteration": 2.797769784927368 }, { "auxiliary_loss_clip": 0.01246822, "auxiliary_loss_mlp": 0.00221173, "balance_loss_clip": 1.02932525, "balance_loss_mlp": 0.19621015, "epoch": 0.9991282128363145, "flos": 24207993930240.0, "grad_norm": 70.04045839005306, "language_loss": 0.85250551, "learning_rate": 7.43233506206309e-12, "loss": 0.86718547, "num_input_tokens_seen": 358606750, "router_z_loss_clip": 2.17285156, "router_z_loss_mlp": 0.24951172, "step": 16618, "time_per_iteration": 2.7307963371276855 }, { "auxiliary_loss_clip": 0.01228112, "auxiliary_loss_mlp": 0.0022525, "balance_loss_clip": 1.01792145, "balance_loss_mlp": 0.20024002, "epoch": 0.9991883360889824, "flos": 21175301664000.0, "grad_norm": 111.89641564932002, "language_loss": 0.81983757, "learning_rate": 6.408493534060255e-12, "loss": 0.83437121, "num_input_tokens_seen": 358624675, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.25012207, "step": 16619, "time_per_iteration": 2.7559618949890137 }, { "auxiliary_loss_clip": 0.0121823, "auxiliary_loss_mlp": 0.00197524, "balance_loss_clip": 1.01097941, "balance_loss_mlp": 0.17609021, "epoch": 0.9992484593416504, "flos": 19901155449600.0, "grad_norm": 6.359363516533516, "language_loss": 0.94724643, "learning_rate": 5.460491963260594e-12, "loss": 0.96140403, "num_input_tokens_seen": 358640715, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.21447754, "step": 16620, "time_per_iteration": 2.6698157787323 }, { "auxiliary_loss_clip": 0.0124508, "auxiliary_loss_mlp": 0.00243164, "balance_loss_clip": 1.02928853, "balance_loss_mlp": 0.21735501, "epoch": 0.9993085825943183, "flos": 24857832833280.0, "grad_norm": 7.648131820358017, "language_loss": 0.79182458, "learning_rate": 4.58833038607942e-12, "loss": 0.80670702, "num_input_tokens_seen": 358659630, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25830078, "step": 16621, "time_per_iteration": 2.720644235610962 }, { "auxiliary_loss_clip": 0.01084786, "auxiliary_loss_mlp": 0.00071425, "balance_loss_clip": 0.94855797, "balance_loss_mlp": 0.0649403, "epoch": 0.9993687058469863, "flos": 71284478780160.0, "grad_norm": 0.7292509272902782, "language_loss": 0.56009519, "learning_rate": 3.79200883515729e-12, "loss": 0.5716573, "num_input_tokens_seen": 358727840, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.06494141, "step": 16622, "time_per_iteration": 3.4020938873291016 }, { "auxiliary_loss_clip": 0.0124198, "auxiliary_loss_mlp": 0.00216208, "balance_loss_clip": 1.02532113, "balance_loss_mlp": 0.19138816, "epoch": 0.9994288290996542, "flos": 12199573566720.0, "grad_norm": 3.1985054719277186, "language_loss": 0.8121208, "learning_rate": 3.071527340914315e-12, "loss": 0.82670265, "num_input_tokens_seen": 358744125, "router_z_loss_clip": 2.16699219, "router_z_loss_mlp": 0.24816895, "step": 16623, "time_per_iteration": 2.680408000946045 }, { "auxiliary_loss_clip": 0.01247879, "auxiliary_loss_mlp": 0.00224933, "balance_loss_clip": 1.02670395, "balance_loss_mlp": 0.19812256, "epoch": 0.9994889523523223, "flos": 17889942153600.0, "grad_norm": 12.681448969547212, "language_loss": 0.8310861, "learning_rate": 2.4268859304399368e-12, "loss": 0.84581423, "num_input_tokens_seen": 358761420, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26782227, "step": 16624, "time_per_iteration": 2.6754543781280518 }, { "auxiliary_loss_clip": 0.01234609, "auxiliary_loss_mlp": 0.00247478, "balance_loss_clip": 1.02176785, "balance_loss_mlp": 0.22163385, "epoch": 0.9995490756049902, "flos": 26578888064640.0, "grad_norm": 30.221078176492927, "language_loss": 0.82475013, "learning_rate": 1.8580846286031514e-12, "loss": 0.839571, "num_input_tokens_seen": 358782600, "router_z_loss_clip": 2.12792969, "router_z_loss_mlp": 0.25878906, "step": 16625, "time_per_iteration": 2.6785974502563477 }, { "auxiliary_loss_clip": 0.01218161, "auxiliary_loss_mlp": 0.00203055, "balance_loss_clip": 1.00777519, "balance_loss_mlp": 0.17957032, "epoch": 0.9996091988576582, "flos": 22200048771840.0, "grad_norm": 267.96574208083814, "language_loss": 0.85752964, "learning_rate": 1.3651234567202408e-12, "loss": 0.87174177, "num_input_tokens_seen": 358801220, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.23474121, "step": 16626, "time_per_iteration": 2.764575958251953 }, { "auxiliary_loss_clip": 0.0125091, "auxiliary_loss_mlp": 0.00243048, "balance_loss_clip": 1.03341556, "balance_loss_mlp": 0.21742991, "epoch": 0.9996693221103262, "flos": 27373195468800.0, "grad_norm": 185.36679579226887, "language_loss": 0.87218666, "learning_rate": 9.480024334429515e-13, "loss": 0.88712621, "num_input_tokens_seen": 358819190, "router_z_loss_clip": 2.17675781, "router_z_loss_mlp": 0.25598145, "step": 16627, "time_per_iteration": 2.676591396331787 }, { "auxiliary_loss_clip": 0.01241842, "auxiliary_loss_mlp": 0.00240874, "balance_loss_clip": 1.02119279, "balance_loss_mlp": 0.21545884, "epoch": 0.9997294453629941, "flos": 26870410846080.0, "grad_norm": 29.223261145139087, "language_loss": 0.79282004, "learning_rate": 6.067215747584952e-13, "loss": 0.80764717, "num_input_tokens_seen": 358839850, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25427246, "step": 16628, "time_per_iteration": 2.822291851043701 }, { "auxiliary_loss_clip": 0.0123628, "auxiliary_loss_mlp": 0.00227987, "balance_loss_clip": 1.01825774, "balance_loss_mlp": 0.20133162, "epoch": 0.9997895686156621, "flos": 23476996247040.0, "grad_norm": 18.278218376381442, "language_loss": 0.81810993, "learning_rate": 3.4128089332341456e-13, "loss": 0.83275259, "num_input_tokens_seen": 358859805, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.26660156, "step": 16629, "time_per_iteration": 2.660444736480713 }, { "auxiliary_loss_clip": 0.0126585, "auxiliary_loss_mlp": 0.00245981, "balance_loss_clip": 1.03863335, "balance_loss_mlp": 0.22045828, "epoch": 0.9998496918683301, "flos": 20224961579520.0, "grad_norm": 31.24288768614072, "language_loss": 0.69713485, "learning_rate": 1.5168039935176126e-13, "loss": 0.71225315, "num_input_tokens_seen": 358877900, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.25537109, "step": 16630, "time_per_iteration": 2.6606853008270264 }, { "auxiliary_loss_clip": 0.01256557, "auxiliary_loss_mlp": 0.00236441, "balance_loss_clip": 1.03392315, "balance_loss_mlp": 0.2107513, "epoch": 0.9999098151209981, "flos": 21652913831040.0, "grad_norm": 8.989207531647557, "language_loss": 0.70419616, "learning_rate": 3.792010017100722e-14, "loss": 0.71912611, "num_input_tokens_seen": 358897285, "router_z_loss_clip": 2.22363281, "router_z_loss_mlp": 0.25683594, "step": 16631, "time_per_iteration": 2.629532814025879 }, { "auxiliary_loss_clip": 0.01215986, "auxiliary_loss_mlp": 0.00212989, "balance_loss_clip": 1.00726497, "balance_loss_mlp": 0.18977913, "epoch": 0.999969938373666, "flos": 11544599018880.0, "grad_norm": 59.75671724993049, "language_loss": 0.80082119, "learning_rate": 0.0, "loss": 0.81511098, "num_input_tokens_seen": 358911570, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.23193359, "step": 16632, "time_per_iteration": 2.624641180038452 } ], "logging_steps": 1.0, "max_steps": 16632, "num_input_tokens_seen": 358911570, "num_train_epochs": 1, "save_steps": 3328, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3992169073237033e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }